1
2
3
4
5
6 """
7 This module provided code to parse HTML files from NDB (DEPRECATED).
8
9 This module provides an HTML parser designed for the NDB website
10 http://ndbserver.rutgers.edu/ as it was circa 2002. The site has since
11 been redesigned, breaking the parser. Bio.Ndb is therefore deprecated,
12 and will be removed in a future release of Biopython.
13
14 Classes:
15 Record Holds NDB sequence data.
16 NdbParser Parses NDB sequence data into a Record object.
17
18 The algorithm is based on a state machine because the record has multiple
19 sections and the handling of tags varies depending on the section.
20 Citations have their own state machine.
21 """
22 import warnings
23 warnings.warn("Bio.Ndb has been deprecated as the NDB website it used to"\
24 " parse has been redesigned.", DeprecationWarning)
25
26 from types import *
27 import string
28 from Bio import File
29 from Bio import Index
30 from Bio.Crystal import Hetero
31 from Bio.Crystal import Chain
32 from Bio.Crystal import Crystal
33 from Bio.SeqFeature import Reference
34 import urllib
35 import sgmllib
36 from Bio.ParserSupport import *
37 from Bio.SeqFeature import Reference
38
39
41
43 self[ 'Id' ] = ''
44 self[ 'Features' ] = ''
45 self[ 'Name' ] = ''
46 self[ 'Sequence' ] = Crystal( {} )
47 self[ 'Citation' ] = Reference()
48 self[ 'Space Group' ] = ''
49 self[ 'Cell Constants' ] = {}
50 self[ 'Crystallization Conditions' ] = []
51 self[ 'Refinement' ] = ''
52 self[ 'Coordinates' ] = ''
53
55 keys = self.keys()
56 keys.sort()
57 out = ''
58 for key in keys:
59 val = self[ key ]
60 if( type( val ) == type( [] ) ):
61 out = out + '\n%s\n' % key
62 for item in val:
63 out = out + '%s\n' % item
64
65 elif( type( val ) == type( {} ) ):
66 out = out + '\n%s\n' % key
67 subkeys = val.keys()
68 subkeys.sort()
69 for item in subkeys:
70 out = out + '%s : %s\n' % ( item, val[ item ] )
71 elif( isinstance( val, dict ) ):
72 out = out + '\n%s\n' % key
73 subkeys = val.keys()
74 subkeys.sort()
75 for item in subkeys:
76 out = out + '%s : %s\n' % ( item, val[ item ] )
77
78 else:
79 out = out + '%s: %s\n' % ( key, self[ key ] )
80 return out
81
100
101
102
103
104
106 """Parses Ndb sequence data into a Record object.
107 data available at: http://ndbserver.rutgers.edu/NDB/NDBATLAS/index.html
108 """
110 sgmllib.SGMLParser.reset( self )
111 self.ndb_dict = Record()
112 self.text = ''
113 self._space_group = ''
114 self._state = 'id'
115 self._reference_state = 'authors'
116 self._current_reference = Reference()
117
118 - def parse(self, handle):
122
def feed(self, handle):
    """Read NDB HTML from a handle and pass it to the SGML parser.

    handle is a file-like object containing NDB data.  It is wrapped in
    a File.UndoHandle unless it already is one.

    Lines are read one at a time and stripped of surrounding whitespace.
    Reading stops at end of input or at the first line that ends with
    '</HTML>'; that closing line itself is discarded.  Each kept line is
    prefixed with a single space and the concatenated text is handed to
    sgmllib.SGMLParser.feed in a single call.
    """
    if isinstance(handle, File.UndoHandle):
        uhandle = handle
    else:
        uhandle = File.UndoHandle(handle)

    # Collect the pieces and join once instead of growing a string with
    # repeated concatenation.
    pieces = []
    while True:
        line = uhandle.readline()
        if not line:
            break
        # str.strip() replaces the deprecated string.strip() function.
        line = line.strip()
        if line.endswith('</HTML>'):
            break
        # The leading space keeps the text of adjacent lines separated.
        pieces.append(' ' + line)

    sgmllib.SGMLParser.feed(self, ''.join(pieces))
146
147
149 newtext = string.strip( newtext )
150 self.text = self.text + newtext
151
154
156 text = self._flush_text()
157 if( self._state == 'id' ):
158 cols = text.split( ':' )
159 self.ndb_dict[ 'Id' ] = ( cols[ 1 ] ).upper()
160 self._state = 'id_found'
161
163 text = self._flush_text()
164 if( self._state == 'features' ):
165 self.ndb_dict[ 'Features' ] = text
166 elif( self._state == 'name' ):
167 self.ndb_dict[ 'Name' ] = text
168 elif( self._state == 'sequence' ):
169 pass
170 elif( self._state == 'citation' ):
171 if( self._reference_state == 'journal' ):
172 self._current_reference.journal = text
173 self.ndb_dict[ 'Citation' ] = self._current_reference
174 elif( self._state == 'space' ):
175 self._space_group = self._space_group + text
176 self.ndb_dict[ 'Space Group' ] = self._space_group
177 elif( self._state == 'constants' ):
178 self.ndb_dict[ 'Cell Constants' ] = _parse_constants( text )
179 elif( self._state == 'crystallization' ):
180 pass
181 elif( self._state == 'refinement' ):
182 self.ndb_dict[ 'Refinement' ] = text
183 elif( self._state == 'coordinates' ):
184 self.ndb_dict[ 'Coordinates' ] = text
185
187 text = self._flush_text()
188 text = text.lower()
189 if( self._state == 'id' ):
190 if( text.find( 'id' ) >= 0 ):
191 cols = text.split( ':' )
192 self.ndb_dict[ 'Id' ] = ( cols[ 1 ] ).upper()
193 self._state = 'id_found'
194 elif( text.find( 'feature' ) >= 0 ):
195 self._state = 'features'
196 elif( text.find( 'name' ) >= 0 ):
197 self._state = 'name'
198 elif( text.find( 'sequence' ) >= 0 ):
199 self._state = 'sequence'
200 elif( text.find( 'citation' ) >= 0 ):
201 self._state = 'citation'
202 elif( text.find( 'space' ) >= 0 ):
203 self._state = 'space'
204 elif( text.find( 'constants' ) >= 0 ):
205 self._state = 'constants'
206 elif( text.find( 'crystallization' ) >= 0 ):
207 self._state = 'crystallization'
208 elif( text.find( 'refinement' ) >= 0 ):
209 self._state = 'refinement'
210 elif( text.find( 'coordinates' ) >= 0 ):
211 self._state = 'coordinates'
212
213
215 if( self._state == 'sequence' ):
216 self._flush_text()
217
218 elif( self._state == 'crystallization' ):
219 self._flush_text()
220
222 if( self._state == 'sequence' ):
223 self._parse_chain()
224 elif( self._state == 'crystallization' ):
225 text = self._flush_text()
226 ( self.ndb_dict[ 'Crystallization Conditions' ] ).append( text )
227 elif( self._state == 'citation' ):
228 if( self._reference_state == 'journal' ):
229 self._current_reference.journal = self._flush_text()
230 self._reference_state = 'done'
231
233 if( self._state == 'space' ):
234 self._space_group = self._space_group + self._flush_text()
235
237 if( self._state == 'space' ):
238 self._space_group = self._space_group + '(%s) ' % self._flush_text()
239
241 if( self._state == 'sequence' ):
242 self._parse_chain()
243 elif( self._state == 'crystallization' ):
244 text = self._flush_text()
245 ( self.ndb_dict[ 'Crystallization Conditions' ] ).append( text )
246
248 if( self._state == 'sequence' ):
249 self._parse_chain()
250 elif( self._state == 'crystallization' ):
251 text = self._flush_text()
252 ( self.ndb_dict[ 'Crystallization Conditions' ] ).append( text )
253
def do_br(self, attrs):
    """Handle a <br> tag while a citation is being read.

    Outside the 'citation' state the tag is ignored.  Inside it, the
    buffered text is flushed into the current reference: as the authors
    when the reference state is 'authors' (which then advances to
    'title'), or as the title when it is 'title' (which then advances
    to 'journal').  Any other reference state leaves everything
    untouched.  attrs is not used.
    """
    if self._state != 'citation':
        return

    stage = self._reference_state
    if stage == 'authors':
        self._current_reference.authors = self._flush_text()
        self._reference_state = 'title'
    elif stage == 'title':
        self._current_reference.title = self._flush_text()
        self._reference_state = 'journal'
262
265
267 if( self._state == 'references' ):
268 if( self._reference_state == 'title' ):
269 text = self._flush_text()
270 self._current_reference.title = text
271 self._reference_state = 'journal'
272
273
283
284
285
def _flush_text(self):
    """Return the buffered text, stripped, and reset the buffer.

    Reads self.text, removes leading and trailing whitespace, sets
    self.text back to the empty string and returns the stripped text.
    """
    # str.strip() replaces the deprecated string.strip() function, and
    # strings are immutable so no defensive copy of the result is needed.
    text = self.text.strip()
    self.text = ''
    return text
290
291
if __name__ == '__main__':
    # Ad-hoc manual check: parse a local NDB page and print the record.
    handle = open('PR0004.htm')
    try:
        # The original also built a File.UndoHandle around the file but
        # never used it; the raw handle is what gets parsed, as before.
        ndb_parser = NdbParser()
        record = ndb_parser.parse(handle)
    finally:
        # Close the file even if parsing raises.
        handle.close()
    # Parenthesised form is valid as both a Python 2 print statement and
    # a Python 3 print call.
    print(str(record))
298