Package Bio :: Package SCOP :: Module Raf
[hide private]
[frames] | no frames]

Source Code for Module Bio.SCOP.Raf

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Gavin E. Crooks 2001-10-10 
  7   
  8  """ASTRAL RAF (Rapid Access Format) Sequence Maps. 
  9   
 10  The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES 
 11  records (representing the sequence of the molecule used in an experiment) to  
 12  the ATOM records (representing the atoms experimentally observed).  
 13   
 14  This data is derived from the Protein Data Bank CIF files. Known errors in the 
 15  CIF files are corrected manually, with the original PDB file serving as the 
 16  final arbiter in case of discrepancies.  
 17   
 18  Residues are referenced by residue ID. This consists of a the PDB residue 
 19  sequence number (upto 4 digits) and an optional PDB  insertion code (an 
 20  ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1" 
 21   
 22  See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html 
 23   
 24  to_one_letter_code -- A mapping from the 3-letter amino acid codes found 
 25                          in PDB files to 1-letter codes.  The 3-letter codes 
 26                          include chemically modified residues. 
 27  """ 
 28   
 29  from copy import copy  
 30  from types import * 
 31   
 32  from Residues import Residues 
 33   
 34  # This table is taken from the RAF release notes, and includes the 
 35  # undocumented mapping "UNK" -> "X" 
 36  to_one_letter_code= { 
 37      'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M', 
 38      'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K', 
 39      'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H', 
 40      'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G', 
 41      '2AS':'D', '3AH':'H', '5HP':'E', 'ACL':'R', 'AIB':'A', 
 42      'ALM':'A', 'ALO':'T', 'ALY':'K', 'ARM':'R', 'ASA':'D', 
 43      'ASB':'D', 'ASK':'D', 'ASL':'D', 'ASQ':'D', 'AYA':'A', 
 44      'BCS':'C', 'BHD':'D', 'BMT':'T', 'BNN':'A', 'BUC':'C', 
 45      'BUG':'L', 'C5C':'C', 'C6C':'C', 'CCS':'C', 'CEA':'C', 
 46      'CHG':'A', 'CLE':'L', 'CME':'C', 'CSD':'A', 'CSO':'C', 
 47      'CSP':'C', 'CSS':'C', 'CSW':'C', 'CXM':'M', 'CY1':'C', 
 48      'CY3':'C', 'CYG':'C', 'CYM':'C', 'CYQ':'C', 'DAH':'F', 
 49      'DAL':'A', 'DAR':'R', 'DAS':'D', 'DCY':'C', 'DGL':'E', 
 50      'DGN':'Q', 'DHA':'A', 'DHI':'H', 'DIL':'I', 'DIV':'V', 
 51      'DLE':'L', 'DLY':'K', 'DNP':'A', 'DPN':'F', 'DPR':'P', 
 52      'DSN':'S', 'DSP':'D', 'DTH':'T', 'DTR':'W', 'DTY':'Y', 
 53      'DVA':'V', 'EFC':'C', 'FLA':'A', 'FME':'M', 'GGL':'E', 
 54      'GLZ':'G', 'GMA':'E', 'GSC':'G', 'HAC':'A', 'HAR':'R', 
 55      'HIC':'H', 'HIP':'H', 'HMR':'R', 'HPQ':'F', 'HTR':'W', 
 56      'HYP':'P', 'IIL':'I', 'IYR':'Y', 'KCX':'K', 'LLP':'K', 
 57      'LLY':'K', 'LTR':'W', 'LYM':'K', 'LYZ':'K', 'MAA':'A', 
 58      'MEN':'N', 'MHS':'H', 'MIS':'S', 'MLE':'L', 'MPQ':'G', 
 59      'MSA':'G', 'MSE':'M', 'MVA':'V', 'NEM':'H', 'NEP':'H', 
 60      'NLE':'L', 'NLN':'L', 'NLP':'L', 'NMC':'G', 'OAS':'S', 
 61      'OCS':'C', 'OMT':'M', 'PAQ':'Y', 'PCA':'E', 'PEC':'C', 
 62      'PHI':'F', 'PHL':'F', 'PR3':'C', 'PRR':'A', 'PTR':'Y', 
 63      'SAC':'S', 'SAR':'G', 'SCH':'C', 'SCS':'C', 'SCY':'C', 
 64      'SEL':'S', 'SEP':'S', 'SET':'S', 'SHC':'C', 'SHR':'K', 
 65      'SOC':'C', 'STY':'Y', 'SVA':'S', 'TIH':'A', 'TPL':'W', 
 66      'TPO':'T', 'TPQ':'A', 'TRG':'K', 'TRO':'W', 'TYB':'Y', 
 67      'TYQ':'Y', 'TYS':'Y', 'TYY':'Y', 'AGM':'R', 'GL3':'G', 
 68      'SMC':'C', 'ASX':'B', 'CGU':'E', 'CSX':'C', 'GLX':'Z', 
 69      'UNK':'X' 
 70      } 
 71   
 72   
73 -def normalize_letters(one_letter_code) :
74 """Convert RAF one-letter amino acid codes into IUPAC standard codes. 75 76 Letters are uppercased, and "." ("Unknown") is converted to "X". 77 """ 78 if one_letter_code == '.' : 79 return 'X' 80 else : 81 return one_letter_code.upper()
82
83 -class SeqMapIndex(dict):
84 """An RAF file index. 85 86 The RAF file itself is about 50 MB. This index provides rapid, random 87 access of RAF records without having to load the entire file into memory. 88 89 The index key is a concatenation of the PDB ID and chain ID. e.g 90 "2drcA", "155c_". RAF uses an underscore to indicate blank 91 chain IDs. 92 """ 93
94 - def __init__(self, filename) :
95 """ 96 Arguments: 97 98 filename -- The file to index 99 """ 100 dict.__init__(self) 101 self.filename = filename 102 103 f = open(self.filename) 104 try: 105 position = 0 106 while True: 107 line = f.readline() 108 if not line: break 109 key = line[0:5] 110 if key != None : 111 self[key]=position 112 position = f.tell() 113 finally : 114 f.close()
115
116 - def __getitem__(self, key) :
117 """ Return an item from the indexed file. """ 118 position = dict.__getitem__(self,key) 119 120 f = open(self.filename) 121 try: 122 f.seek(position) 123 line = f.readline() 124 record = SeqMap(line) 125 finally: 126 f.close() 127 return record
128 129
130 - def getSeqMap(self, residues) :
131 """Get the sequence map for a collection of residues. 132 133 residues -- A Residues instance, or a string that can be converted into 134 a Residues instance. 135 """ 136 if type(residues) == StringType : 137 residues = Residues(residues) 138 139 pdbid = residues.pdbid 140 frags = residues.fragments 141 if not frags: frags =(('_','',''),) # All residues of unnamed chain 142 143 seqMap = None 144 for frag in frags : 145 chainid = frag[0] 146 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_': 147 chainid = '_' 148 id = pdbid + chainid 149 150 151 sm = self[id] 152 153 #Cut out fragment of interest 154 start = 0 155 end = len(sm.res) 156 if frag[1] : start = int(sm.index(frag[1], chainid)) 157 if frag[2] : end = int(sm.index(frag[2], chainid)+1) 158 159 sm = sm[start:end] 160 161 if seqMap == None : 162 seqMap = sm 163 else : 164 seqMap += sm 165 166 return seqMap
167 168 169
170 -class SeqMap :
171 """An ASTRAL RAF (Rapid Access Format) Sequence Map. 172 173 This is a list like object; You can find the location of particular residues 174 with index(), slice this SeqMap into fragments, and glue fragments back 175 together with extend(). 176 177 pdbid -- The PDB 4 character ID 178 179 pdb_datestamp -- From the PDB file 180 181 version -- The RAF format version. e.g. 0.01 182 183 flags -- RAF flags. (See release notes for more information.) 184 185 res -- A list of Res objects, one for each residue in this sequence map 186 """ 187
188 - def __init__(self, line=None):
189 self.pdbid = '' 190 self.pdb_datestamp = '' 191 self.version = '' 192 self.flags = '' 193 self.res = [] 194 if line: 195 self._process(line)
196 197
198 - def _process(self, line):
199 """Parses a RAF record into a SeqMap object. 200 """ 201 header_len = 38 202 203 line = line.rstrip() # no trailing whitespace 204 205 if len(line)<header_len: 206 raise ValueError("Incomplete header: "+line) 207 208 self.pdbid = line[0:4] 209 chainid = line[4:5] 210 211 self.version = line[6:10] 212 213 #Raf format versions 0.01 and 0.02 are identical for practical purposes 214 if(self.version != "0.01" and self.version !="0.02") : 215 raise ValueError("Incompatible RAF version: "+self.version) 216 217 self.pdb_datestamp = line[14:20] 218 self.flags = line[21:27] 219 220 for i in range(header_len, len(line), 7) : 221 f = line[i : i+7] 222 if len(f)!=7: 223 raise ValueError("Corrupt Field: ("+f+")") 224 r = Res() 225 r.chainid = chainid 226 r.resid = f[0:5].strip() 227 r.atom = normalize_letters(f[5:6]) 228 r.seqres = normalize_letters(f[6:7]) 229 230 self.res.append(r)
231 232
233 - def index(self, resid, chainid="_") :
234 for i in range(0, len(self.res)) : 235 if self.res[i].resid == resid and self.res[i].chainid == chainid : 236 return i 237 raise KeyError("No such residue "+chainid+resid)
238
239 - def __getslice__(self, i, j) :
240 s = copy(self) 241 s.res = s.res[i:j] 242 return s
243
244 - def append(self, res) :
245 """Append another Res object onto the list of residue mappings.""" 246 self.res.append(res)
247
248 - def extend(self, other) :
249 """Append another SeqMap onto the end of self. 250 251 Both SeqMaps must have the same PDB ID, PDB datestamp and 252 RAF version. The RAF flags are erased if they are inconsistent. This 253 may happen when fragments are taken from different chains. 254 """ 255 if not isinstance(other, SeqMap): 256 raise TypeError("Can only extend a SeqMap with a SeqMap.") 257 if self.pdbid != other.pdbid : 258 raise TypeError("Cannot add fragments from different proteins") 259 if self.version != other.version : 260 raise TypeError("Incompatible rafs") 261 if self.pdb_datestamp != other.pdb_datestamp : 262 raise TypeError("Different pdb dates!") 263 if self.flags != other.flags : 264 self.flags = '' 265 self.res += other.res
266
267 - def __iadd__(self, other) :
268 self.extend(other) 269 return self
270
271 - def __add__(self, other) :
272 s = copy(self) 273 s.extend(other) 274 return s
275
276 - def getAtoms(self, pdb_handle, out_handle) :
277 """Extract all relevant ATOM and HETATOM records from a PDB file. 278 279 The PDB file is scanned for ATOM and HETATOM records. If the 280 chain ID, residue ID (seqNum and iCode), and residue type match 281 a residue in this sequence map, then the record is echoed to the 282 output handle. 283 284 This is typically used to find the coordinates of a domain, or other 285 residue subset. 286 287 pdb_handle -- A handle to the relevant PDB file. 288 289 out_handle -- All output is written to this file like object. 290 """ 291 #This code should be refactored when (if?) biopython gets a PDB parser 292 293 #The set of residues that I have to find records for. 294 resSet = {} 295 for r in self.res : 296 if r.atom=='X' : #Unknown residue type 297 continue 298 chainid = r.chainid 299 if chainid == '_': 300 chainid = ' ' 301 resid = r.resid 302 resSet[(chainid,resid)] = r 303 304 resFound = {} 305 for line in pdb_handle.xreadlines() : 306 if line.startswith("ATOM ") or line.startswith("HETATM") : 307 chainid = line[21:22] 308 resid = line[22:27].strip() 309 key = (chainid, resid) 310 if key in resSet: 311 res = resSet[key] 312 atom_aa = res.atom 313 resName = line[17:20] 314 if resName in to_one_letter_code : 315 if to_one_letter_code[resName] == atom_aa : 316 out_handle.write(line) 317 resFound[key] = res 318 319 if len(resSet) != len(resFound) : 320 #for k in resFound.keys(): 321 # del resSet[k] 322 #print resSet 323 324 raise RuntimeError('I could not find at least one ATOM or HETATM' \ 325 +' record for each and every residue in this sequence map.')
326 327 328
329 -class Res :
330 """ A single residue mapping from a RAF record. 331 332 chainid -- A single character chain ID. 333 334 resid -- The residue ID. 335 336 atom -- amino acid one-letter code from ATOM records. 337 338 seqres -- amino acid one-letter code from SEQRES records. 339 """
340 - def __init__(self) :
341 self.chainid = '' 342 self.resid = '' 343 self.atom = '' 344 self.seqres = ''
345 346
347 -def parse(handle):
348 """Iterates over a RAF file, returning a SeqMap object for each line 349 in the file. 350 351 Arguments: 352 353 handle -- file-like object. 354 """ 355 for line in handle: 356 yield SeqMap(line)
357 358
359 -class Iterator:
360 """Iterates over a RAF file. 361 """
362 - def __init__(self, handle, parser=None):
363 """Create an object that iterates over a RAF file. 364 365 handle -- file-like object. 366 367 parser -- an optional Parser object to change the results into 368 another form. If set to None, then the raw contents 369 of the file will be returned. 370 371 """ 372 import warnings 373 warnings.warn("Bio.SCOP.Raf.Iterator is deprecated. Please use Bio.SCOP.Raf.parse() instead.", DeprecationWarning) 374 375 from types import FileType, InstanceType 376 if type(handle) is not FileType and type(handle) is not InstanceType: 377 raise TypeError("I expected a file handle or file-like object") 378 self._handle = handle 379 self._parser = parser
380
381 - def next(self):
382 """Retrieve the next record.""" 383 while 1: 384 line = self._handle.readline() 385 if not line: return None 386 if line[0] !='#': break # Not a comment line 387 if self._parser is not None : 388 return self._parser.parse(line) 389 return line
390
391 - def __iter__(self):
392 return iter(self.next, None)
393 394
395 -class Parser:
396 - def __init__(self):
397 import warnings 398 warnings.warn("""Bio.SCOP.Raf.Parser is deprecated. 399 Instead of 400 401 parser = Raf.Parser() 402 record = parser.parse(entry) 403 404 please use 405 406 record = Raf.SeqMap(entry) 407 """, DeprecationWarning)
408 409
410 - def parse(self, line):
411 """Returns a SeqMap """ 412 return SeqMap(line)
413