Package Bio :: Package Ndb
[hide private]
[frames] | [no frames]

Source Code for Package Bio.Ndb

  1  # Copyright 2002 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provided code to parse HTML files from NDB (DEPRECATED). 
  8   
  9  This module provides an HTML parser designed for the NDB website 
 10  http://ndbserver.rutgers.edu/ as it was circa 2002.  The site has since 
 11  been redesigned, breaking the parser.  Bio.Ndb is therefore deprecated, 
 12  and will be removed in a future release of Biopython. 
 13   
 14  Classes: 
 15  Record             Holds NDB sequence data. 
 16  NdbParser          Parses NDB sequence data into a Record object. 
 17   
 18  The algorithm is based on a state machine because the record has multiple  
 19  sections and the handling of tags varies depending on the section.   
 20  Citations have their own state machine. 
 21  """ 
 22  import warnings 
 23  warnings.warn("Bio.Ndb has been deprecated as the NDB website it used to"\ 
 24                " parse has been redesigned.", DeprecationWarning) 
 25   
 26  from types import * 
 27  import string 
 28  from Bio import File 
 29  from Bio import Index 
 30  from Bio.Crystal import Hetero 
 31  from Bio.Crystal import Chain 
 32  from Bio.Crystal import Crystal 
 33  from Bio.SeqFeature import Reference 
 34  import urllib 
 35  import sgmllib 
 36  from Bio.ParserSupport import * 
 37  from Bio.SeqFeature import Reference 
 38   
 39   
class Record(dict):
    """Hold the data parsed from one NDB page.

    A ``dict`` subclass whose constructor pre-populates the keys the parser
    fills in: 'Id', 'Features', 'Name', 'Sequence', 'Citation',
    'Space Group', 'Cell Constants', 'Crystallization Conditions',
    'Refinement' and 'Coordinates'.
    """

    def __init__(self):
        """Create a record with every expected key set to an empty value."""
        dict.__init__(self)
        self['Id'] = ''
        self['Features'] = ''
        self['Name'] = ''
        self['Sequence'] = Crystal({})
        self['Citation'] = Reference()
        self['Space Group'] = ''
        self['Cell Constants'] = {}
        self['Crystallization Conditions'] = []
        self['Refinement'] = ''
        self['Coordinates'] = ''

    def __str__(self):
        """Return a text dump of the record, one entry per key in sorted order.

        List values are written as a header line (the key) followed by one
        item per line.  Dictionary values are written as a header line
        followed by ``subkey : value`` lines in sorted subkey order.  Any
        other value is written as ``key: value``.
        """
        chunks = []
        # sorted() works whether keys() returns a list or a view object.
        for key in sorted(self.keys()):
            val = self[key]
            if isinstance(val, list):
                chunks.append('\n%s\n' % key)
                for item in val:
                    chunks.append('%s\n' % item)
            elif isinstance(val, dict):
                chunks.append('\n%s\n' % key)
                for subkey in sorted(val.keys()):
                    chunks.append('%s : %s\n' % (subkey, val[subkey]))
            else:
                chunks.append('%s: %s\n' % (key, val))
        return ''.join(chunks)
81
82 -def _parse_constants( text ):
83 items = text.split( '=' ) 84 constants = {} 85 key = '' 86 for i in range( 0, ( len( items ) - 1 ) ): 87 item = items[ i ] 88 item = item.strip() 89 separator = item.rfind( ' ' ) 90 if( separator < 0 ): 91 separator = 0 92 val = item[ :separator ] 93 val = val.strip() 94 if( key != '' ): 95 constants[ key ] = val 96 key = item[ separator: ] 97 key = key.strip() 98 constants[ key ] = items[ -1 ] 99 return constants
100 101 102 103 104
class NdbParser(sgmllib.SGMLParser):
    """Parses Ndb sequence data into a Record object.

    data available at: http://ndbserver.rutgers.edu/NDB/NDBATLAS/index.html

    The parser is a state machine.  ``_state`` names the section currently
    being read ('id', 'id_found', 'features', 'name', 'sequence',
    'citation', 'space', 'constants', 'crystallization', 'refinement',
    'coordinates') and is advanced by the text of each <h2> heading.
    ``_reference_state`` ('authors', 'title', 'journal', 'done') tracks the
    position inside the citation section.  Character data is accumulated in
    ``text`` and consumed by the tag handlers through ``_flush_text``.
    """

    # (keyword, state) pairs tried in order against the lower-cased text of
    # an <h2> heading; the first keyword found selects the next state.
    _SECTION_KEYWORDS = (
        ('feature', 'features'),
        ('name', 'name'),
        ('sequence', 'sequence'),
        ('citation', 'citation'),
        ('space', 'space'),
        ('constants', 'constants'),
        ('crystallization', 'crystallization'),
        ('refinement', 'refinement'),
        ('coordinates', 'coordinates'),
    )

    def reset(self):
        """Reset the SGML parser and start a fresh, empty Record."""
        sgmllib.SGMLParser.reset(self)
        self.ndb_dict = Record()
        self.text = ''
        self._space_group = ''
        self._state = 'id'
        self._reference_state = 'authors'
        self._current_reference = Reference()

    def parse(self, handle):
        """Parse the NDB page read from handle and return the Record."""
        self.reset()
        self.feed(handle)
        return self.ndb_dict

    def feed(self, handle):
        """feed(self, handle)

        Feed in ndb data for scanning.  handle is a file-like object
        containing ndb data; it is wrapped in a File.UndoHandle unless it
        already is one.

        Lines are read until the end of the file or until a line ending in
        '</HTML>' (that line is discarded).  Each line is stripped and the
        lines are joined with single spaces before being handed to the
        underlying SGML parser in one call.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        pieces = []
        while True:
            line = uhandle.readline()
            if not line:
                break
            line = line.strip()
            if line.endswith('</HTML>'):
                break
            # Each line is preceded by one space, so the text handed to the
            # SGML parser has the same layout as before.
            pieces.append(' ' + line)
        sgmllib.SGMLParser.feed(self, ''.join(pieces))

    def handle_data(self, newtext):
        """Append the stripped character data to the pending text."""
        self.text = self.text + newtext.strip()

    def start_h1(self, attrs):
        """Discard any text collected before the <h1> heading."""
        self._flush_text()

    def end_h1(self):
        """Take the record id from the <h1> heading while still looking for it."""
        text = self._flush_text()
        if self._state == 'id':
            self._store_id(text)

    def start_h2(self, attrs):
        """Store the text collected for the section that this heading ends."""
        text = self._flush_text()
        if self._state == 'features':
            self.ndb_dict['Features'] = text
        elif self._state == 'name':
            self.ndb_dict['Name'] = text
        elif self._state == 'sequence':
            pass
        elif self._state == 'citation':
            if self._reference_state == 'journal':
                self._current_reference.journal = text
            self.ndb_dict['Citation'] = self._current_reference
        elif self._state == 'space':
            self._space_group = self._space_group + text
            self.ndb_dict['Space Group'] = self._space_group
        elif self._state == 'constants':
            self.ndb_dict['Cell Constants'] = _parse_constants(text)
        elif self._state == 'crystallization':
            pass
        elif self._state == 'refinement':
            self.ndb_dict['Refinement'] = text
        elif self._state == 'coordinates':
            self.ndb_dict['Coordinates'] = text

    def end_h2(self):
        """Use the heading text to pick the section that follows.

        While the id has not been found, the only heading acted on is one
        containing 'id'; all other headings are ignored in that state.
        Afterwards the first matching entry of _SECTION_KEYWORDS selects the
        new state, and a heading matching none of them leaves it unchanged.
        """
        text = self._flush_text().lower()
        if self._state == 'id':
            if 'id' in text:
                self._store_id(text)
            return
        for keyword, state in self._SECTION_KEYWORDS:
            if keyword in text:
                self._state = state
                break

    def start_ul(self, attrs):
        """Discard pending text at the start of a sequence or crystallization list."""
        if self._state in ('sequence', 'crystallization'):
            self._flush_text()

    def end_ul(self):
        """Finish the last list item, or the journal part of the citation."""
        if self._state == 'sequence':
            self._parse_chain()
        elif self._state == 'crystallization':
            self._append_condition()
        elif self._state == 'citation':
            if self._reference_state == 'journal':
                self._current_reference.journal = self._flush_text()
                self._reference_state = 'done'

    def start_sub(self, attrs):
        """Add the text preceding a subscript to the space group."""
        if self._state == 'space':
            self._space_group = self._space_group + self._flush_text()

    def end_sub(self):
        """Add the subscript text, in parentheses, to the space group."""
        if self._state == 'space':
            self._space_group = self._space_group + '(%s) ' % self._flush_text()

    def start_li(self, attrs):
        """Store the text pending when a new list item opens."""
        if self._state == 'sequence':
            self._parse_chain()
        elif self._state == 'crystallization':
            self._append_condition()

    def end_li(self):
        """Store the text of the list item that just closed."""
        if self._state == 'sequence':
            self._parse_chain()
        elif self._state == 'crystallization':
            self._append_condition()

    def do_br(self, attrs):
        """Advance the citation from authors to title to journal on each <br>."""
        if self._state == 'citation':
            if self._reference_state == 'authors':
                self._current_reference.authors = self._flush_text()
                self._reference_state = 'title'
            elif self._reference_state == 'title':
                self._current_reference.title = self._flush_text()
                self._reference_state = 'journal'

    def start_i(self, attrs):
        """Ignore the start of italic text."""
        pass

    def end_i(self):
        """Close the citation title at the end of italic text.

        NOTE(review): nothing in this class ever sets _state to
        'references', so this branch cannot run as written.  It is left
        untouched because enabling it for 'citation' would cut a title short
        at its first italic phrase -- confirm the intent before changing it.
        """
        if self._state == 'references':
            if self._reference_state == 'title':
                text = self._flush_text()
                self._current_reference.title = text
                self._reference_state = 'journal'

    def _store_id(self, text):
        """Store the text between the first and second ':' as the record id.

        Does nothing when text contains no ':', so a heading without a colon
        leaves the parser still looking for the id instead of raising
        IndexError.
        """
        cols = text.split(':')
        if len(cols) < 2:
            return
        self.ndb_dict['Id'] = cols[1].upper()
        self._state = 'id_found'

    def _append_condition(self):
        """Move the pending text into 'Crystallization Conditions'.

        The pending text is always consumed, but empty text is not stored:
        the list handlers call this at both the start and the end of every
        item, so most calls have nothing to add.
        """
        text = self._flush_text()
        if text:
            self.ndb_dict['Crystallization Conditions'].append(text)

    def _parse_chain(self):
        """Store a pending 'chain <name>: <sequence>' line in the record.

        The pending text is always consumed.  It is stored under the chain
        name in the 'Sequence' entry only when it starts with 'chain'
        (case-insensitive) and has both a name and a ':' separator.
        """
        text = self._flush_text()
        if not text.lower().startswith('chain'):
            return
        fields = text.split(':')
        words = fields[0].split()
        if len(fields) < 2 or len(words) < 2:
            # Malformed chain line: nothing sensible to store.
            return
        self.ndb_dict['Sequence'][words[1]] = fields[1]

    def _flush_text(self):
        """Return the pending text, stripped, and clear the buffer."""
        text = self.text.strip()
        self.text = ''
        return text
if __name__ == '__main__':
    # Demo: parse the sample page 'PR0004.htm' from the current directory
    # and print the resulting record.
    handle = open('PR0004.htm')
    try:
        ndb_parser = NdbParser()
        # parse() wraps the handle in a File.UndoHandle itself.
        record = ndb_parser.parse(handle)
    finally:
        # Close the file even if parsing fails.
        handle.close()
    print(str(record))