1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 __doc__="Parse the header of a PDB file."
25
26 import sys
27 import os,string,re
28 import urllib
29 import types
30
31
40
42
43
44 references=[]
45 actref=""
46 for l in inl:
47 if re.search("\AREMARK 1",l):
48 if re.search("\AREMARK 1 REFERENCE",l):
49 if actref!="":
50 actref=re.sub("\s\s+"," ",actref)
51 if actref!=" ":
52 references.append(actref)
53 actref=""
54 else:
55 actref+=string.lower(l[19:72])
56
57 if actref!="":
58 actref=re.sub("\s\s+"," ",actref)
59 if actref!=" ":
60 references.append(actref)
61 return references
62
63
64
81
82
84 """Chops lines ending with ' 1CSA 14' and the like."""
85 import re
86 return re.sub("\s\s\s\s+[\w]{4}.\s+\d*\Z","",line)
87
89 """Chops lines ending with ' 14-JUL-97 1CSA' and the like."""
90 import re
91 return re.sub("\s\s\s\s+.*\Z","",line)
92
94 """Makes A Lowercase String With Capitals."""
95 import string
96 l=string.lower(line)
97 s=""
98 i=0
99 nextCap=1
100 while i<len(l):
101 c=l[i]
102 if c>='a' and c<='z' and nextCap:
103 c=string.upper(c)
104 nextCap=0
105 elif c==' ' or c=='.' or c==',' or c==';' or c==':' or c=='\t' or\
106 c=='-' or c=='_':
107 nextCap=1
108 s+=c
109 i+=1
110 return s
111
def parse_pdb_header(file):
    """Return the header lines of a pdb file as a dictionary.

    Arguments:
     - file - either a path (string), which is opened for reading and
       closed again here, or an iterable of lines such as an open file
       handle, which is left open for the caller to close.

    Lines are collected until the first ATOM, HETATM or MODEL record and
    then handed to _parse_pdb_header_list, whose result is returned.

    Dictionary keys are: head, deposition_date, release_date,
    structure_method, resolution, structure_reference, journal_reference,
    author and compound.
    """
    # Record names fill the first six columns, so "ATOM" is padded with
    # two blanks; a shorter literal can never equal the six-character slice.
    coordinate_records = ('ATOM  ', 'HETATM', 'MODEL ')

    header = []
    opened_here = isinstance(file, str)
    if opened_here:
        handle = open(file, 'r')
    else:
        handle = file
    try:
        for line in handle:
            if line[0:6] in coordinate_records:
                break
            header.append(line)
    finally:
        # Only release what was acquired here, and do so even when
        # reading fails part-way through.
        if opened_here:
            handle.close()
    return _parse_pdb_header_list(header)
133
135
def _store_molecule_token(table, molid, last_key, text):
    """Store one COMPND/SOURCE specification line in *table*.

    ``text`` is either ``key: value`` or the continuation of the previous
    value.  A ``mol_id`` key opens a new molecule entry.  Returns the
    (molecule id, last key) pair to use for the next line.
    """
    tok = text.split(":")
    if len(tok) >= 2:
        ckey = tok[0]
        cval = re.sub(r"\A\s*", "", tok[1])
        if ckey == 'mol_id':
            table[cval] = {'misc': ''}
            return cval, "misc"
        table[molid][ckey] = cval
        return molid, ckey
    # No colon: continuation text is appended to the previous key.
    table[molid][last_key] += tok[0] + " "
    return molid, last_key


def _parse_pdb_header_list(header):
    """Build the header dictionary from a list of PDB header lines.

    Arguments:
     - header - list of lines preceding the coordinate section.

    Returns a dictionary that always contains the keys name, head,
    deposition_date, release_date, structure_method, resolution,
    structure_reference, journal_reference, author, compound and source;
    keywords and journal are added when KEYWDS / JRNL records are present.
    resolution is a float, or None when the REMARK 2 value is not a number.
    """
    # Defaults used when the corresponding record is absent.
    pdbh = {'name': "",
            'head': '',
            'deposition_date': "1909-01-08",
            'release_date': "1909-01-08",
            'structure_method': "unknown",
            'resolution': 0.0,
            'structure_reference': "unknown",
            'journal_reference': "unknown",
            'author': "",
            'compound': {'1': {'misc': ''}},
            'source': {'1': {'misc': ''}}}

    pdbh['structure_reference'] = _get_references(header)
    pdbh['journal_reference'] = _get_journal(header)
    comp_molid = "1"
    src_molid = "1"
    last_comp_key = "misc"
    last_src_key = "misc"

    date_pattern = r"\d\d-\w\w\w-\d\d"
    # Whitespace-tolerant: the remark number is column aligned in PDB files.
    resolution_pattern = r"REMARK\s+2\s+RESOLUTION."

    for hh in header:
        h = re.sub(r"[\s\n\r]*\Z", "", hh)        # strip trailing whitespace
        key = re.sub(r"\s.+\s*", "", h)           # record name
        tail = re.sub(r"\A\w+\s+\d*\s*", "", h)   # text after name/serial

        if key == "TITLE":
            # 'name' is pre-seeded with "", so every title line (including
            # the first) is appended after a blank, as before.
            pdbh['name'] += " " + _chop_end_codes(tail).lower()
        elif key == "HEADER":
            rr = re.search(date_pattern, tail)
            if rr is not None:
                pdbh['deposition_date'] = _format_date(_nice_case(rr.group()))
            pdbh['head'] = _chop_end_misc(tail).lower()
        elif key == "COMPND":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            # Look for an E.C. number in the COMPND record.
            rec = re.search(r'\d+\.\d+\.\d+\.\d+', tt)
            if rec:
                pdbh['compound'][comp_molid]['ec_number'] = rec.group()
                tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt)
            comp_molid, last_comp_key = _store_molecule_token(
                pdbh['compound'], comp_molid, last_comp_key, tt)
        elif key == "SOURCE":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            # SOURCE keeps its own molecule id so that a mol_id set by a
            # COMPND record cannot point at a missing 'source' entry.
            src_molid, last_src_key = _store_molecule_token(
                pdbh['source'], src_molid, last_src_key, tt)
        elif key == "KEYWDS":
            kwd = _chop_end_codes(tail).lower()
            if 'keywords' in pdbh:
                pdbh['keywords'] += " " + kwd
            else:
                pdbh['keywords'] = kwd
        elif key == "EXPDTA":
            expd = _chop_end_codes(tail)
            # Drop anything following a gap of seven or more blanks.
            expd = re.sub(r'\s\s\s\s\s\s\s.*\Z', '', expd)
            pdbh['structure_method'] = expd.lower()
        elif key == "CAVEAT":
            # Recognised but deliberately not stored.
            pass
        elif key == "REVDAT":
            rr = re.search(date_pattern, tail)
            if rr is not None:
                pdbh['release_date'] = _format_date(_nice_case(rr.group()))
        elif key == "JRNL":
            if 'journal' in pdbh:
                pdbh['journal'] += tail
            else:
                pdbh['journal'] = tail
        elif key == "AUTHOR":
            # 'author' is pre-seeded with "", so lines are concatenated.
            pdbh['author'] += _nice_case(_chop_end_codes(tail))
        elif key == "REMARK":
            if re.search(resolution_pattern, hh):
                r = _chop_end_codes(re.sub(resolution_pattern, '', hh))
                r = re.sub(r"\s+ANGSTROM.*", "", r)
                try:
                    pdbh['resolution'] = float(r)
                except ValueError:
                    # e.g. "NOT APPLICABLE": no numeric resolution given.
                    pdbh['resolution'] = None
        else:
            # Any other record type is ignored.
            pass

    if pdbh['structure_method'] == 'unknown':
        # Guard against None: ordering None against a float raises on
        # Python 3 (it was silently False on Python 2).
        resolution = pdbh['resolution']
        if resolution is not None and resolution > 0.0:
            pdbh['structure_method'] = 'x-ray diffraction'
    return pdbh
256
if __name__ == '__main__':
    # Reads a PDB file passed as argument, parses its header, extracts
    # some data and prints every entry of the resulting dictionary.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <pdb file>\n" % sys.argv[0])
        sys.exit(1)

    # Hand over the path: parse_pdb_header opens (and closes) string
    # arguments itself, so no handle is left dangling here.
    header_dict = parse_pdb_header(sys.argv[1])

    # Single-argument print(...) behaves the same on Python 2 and 3.
    for key in header_dict:
        print("-" * 40)
        print(key)
        print(header_dict[key])
271