Package Bio :: Package GFF
[hide private]
[frames] | no frames]

Source Code for Package Bio.GFF

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright 2002 by Michael Hoffman.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """ 
  9  Access to General Feature Format databases created with Bio::DB:GFF 
 10   
 11  based on documentation for Lincoln Stein's Perl Bio::DB::GFF 
 12   
 13  >>> import os 
 14  >>> import Bio.GFF 
 15  >>> PASSWORD = os.environ['MYSQLPASS'] 
 16  >>> DATABASE_GFF = 'wormbase_ws93' 
 17  >>> FASTADIR = '/local/wormbase_ws93/' 
 18  >>> gff = Bio.GFF.Connection(passwd=PASSWORD, db=DATABASE_GFF, fastadir=FASTADIR) 
 19   
 20  """ 
 21   
 22  __version__ = "$Revision: 1.9 $" 
 23  # $Source: /home/repository/biopython/biopython/Bio/GFF/__init__.py,v $ 
 24   
 25  import exceptions 
 26  import operator 
 27  import os.path 
 28  import sys 
 29  import types 
 30   
 31  from Bio import MissingExternalDependencyError 
 32   
 33  try: 
 34      import MySQLdb 
 35  except: 
 36      raise MissingExternalDependencyError("Install MySQLdb if you want to use Bio.GFF.") 
 37   
 38  from Bio.Alphabet import IUPAC 
 39  from Bio import DocSQL 
 40  from Bio.Seq import Seq 
 41  from Bio.SeqRecord import SeqRecord 
 42  from Bio import Translate 
 43   
 44  import binning 
 45  import easy 
 46  import GenericTools 
 47   
 48   
 49  DEFAULT_ALPHABET = IUPAC.unambiguous_dna 
 50  standard_translator = Translate.unambiguous_dna_by_id[1] 
 51   
52 -class Segment(object):
53 """ 54 this will only work for the simplest of easy.Location objects 55 """
56 - def __init__(self, gff, location=None):
57 self.gff = gff 58 if location is None: 59 self.seqname = None 60 self.coords = None 61 self.complement = None 62 else: 63 self.seqname = location.seqname 64 self.coords = [location.start(), location.end()] 65 self.complement = location.complement
66
67 - def features(self, *args, **keywds):
68 return FeatureQuery(self.seqname, self.coords, self.complement, connection=self.gff.db, *args, **keywds)
69
70 - def single_feature(self, *args, **keywds):
71 features = self.features(*args, **keywds) 72 all_features = GenericTools.all(features) 73 assert len(all_features) == 1 74 return all_features[0]
75
76 -class Connection(Segment):
77 """ 78 Connection to GFF database 79 80 also functions as whole segment 81 """ 82
83 - def __init__(self, *args, **keywds):
84 try: 85 RetrieveSeqname._dir = keywds['fastadir'] 86 del keywds['fastadir'] 87 except KeyError: 88 RetrieveSeqname._dir = '.' 89 self.db = MySQLdb.connect(*args, **keywds) 90 Segment.__init__(self, self)
91
92 - def segment(self, *args, **keywds):
93 return Segment(self, *args, **keywds)
94
95 -class RetrieveSeqname(GenericTools.Surrogate, SeqRecord):
96 """ 97 Singleton: contain records of loaded FASTA files 98 99 >>> RetrieveSeqname._dir = 'GFF' 100 >>> RetrieveSeqname._diagnostics = 1 101 >>> sys.stderr = sys.stdout # dump diagnostics to stdout so doctest can see them 102 >>> record1 = RetrieveSeqname('NC_001802.fna') 103 Loading GFF/NC_001802.fna... Done. 104 >>> record1.id 105 'gi|9629357|ref|NC_001802.1|' 106 >>> record2 = RetrieveSeqname('NC_001802.fna') 107 >>> record1.seq is record2.seq # should be same space in memory 108 1 109 """ 110 __records = {} 111 _diagnostics = 0 112
113 - def __init__(self, seqname):
114 try: 115 GenericTools.Surrogate.__init__(self, self.__records[seqname]) 116 except KeyError: 117 filename = os.path.join(self._dir, seqname) 118 if self._diagnostics: 119 sys.stderr.write("Loading %s..." % filename) 120 record = easy.fasta_single(filename) 121 self.__records[seqname] = record 122 GenericTools.Surrogate.__init__(self, self.__records[seqname]) 123 if self._diagnostics: 124 print >>sys.stderr, " Done."
125
126 -class Feature(object):
127 """ 128 strand may be: 129 +/0 = Watson 130 -/1 = Crick 131 132 I propose that we start calling these the Rosalind and Franklin strands 133 134 >>> RetrieveSeqname._dir = 'GFF' 135 >>> feature = Feature("NC_001802x.fna", 73, 78) # not having the x will interfere with the RetrieveSequence test 136 >>> feature.seq() 137 Seq('AATAAA', Alphabet()) 138 >>> print feature.location() 139 NC_001802x.fna:73..78 140 >>> from Bio import SeqIO 141 >>> record = feature.record() 142 >>> records = [record] 143 >>> SeqIO.write(records, sys.stdout, 'fasta') 144 > NC_001802x.fna:73..78 145 AATAAA 146 >>> feature2 = Feature(location=easy.LocationFromString("NC_001802x.fna:73..78")) 147 >>> writer.write(feature2.record()) 148 > NC_001802x.fna:73..78 149 AATAAA 150 >>> location3 = easy.LocationFromString("NC_001802x.fna:complement(73..78)") 151 >>> feature3 = Feature(location=location3) 152 >>> writer.write(feature3.record()) 153 > NC_001802x.fna:complement(73..78) 154 TTTATT 155 >>> location4 = easy.LocationFromString("NC_001802x.fna:336..1631") 156 >>> feature4 = Feature(location=location4, frame=0) 157 >>> feature4.frame 158 0 159 >>> feature4.translate()[:7] 160 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*')) 161 >>> feature4.frame = 6 # can't happen, but a useful demonstration 162 >>> feature4.translate()[:5] 163 Seq('ARASV', HasStopCodon(IUPACProtein(), '*')) 164 >>> feature4.frame = 1 165 >>> feature4.translate()[:5] 166 Seq('WVRER', HasStopCodon(IUPACProtein(), '*')) 167 >>> location5 = easy.LocationFromString("NC_001802lc.fna:336..1631") # lowercase data 168 >>> feature5 = Feature(location=location5, frame=0) 169 >>> feature5.translate()[:7] 170 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*')) 171 >>> location6 = easy.LocationFromString("NC_001802lc.fna:335..351") 172 >>> feature6 = Feature(location=location6, frame=1) 173 >>> feature6.translate() 174 Seq('MGARA', HasStopCodon(IUPACProtein(), '*')) 175 """
176 - def __init__(self, 177 seqname=None, 178 start=None, 179 end=None, 180 strand="+", 181 frame=None, 182 location=None, 183 alphabet=DEFAULT_ALPHABET):
184 if not isinstance(seqname, (types.StringType, types.NoneType)): 185 raise exceptions.TypeError("seqname needs to be string") 186 self.frame = frame 187 self.alphabet = alphabet 188 if location is None: 189 self.seqname = seqname 190 self.start = start 191 self.end = end 192 self.strand = strand 193 return 194 else: 195 self.seqname = location.seqname 196 self.start = location.start() + 1 197 self.end = location.end() + 1 198 self.strand = location.complement
199
200 - def __hash__(self):
201 return hash((self.seqname, 202 self.start, 203 self.end, 204 self.strand, 205 self.frame, 206 self.alphabet))
207
208 - def seq(self):
209 rec = RetrieveSeqname(self.seqname) 210 return easy.record_subseq(rec, self.location(), upper=1)
211
212 - def translate(self):
213 seq = self.seq() 214 try: 215 seq = Seq(seq.tostring()[self.frame:], self.alphabet) 216 except TypeError: 217 seq.alphabet = self.alphabet 218 try: 219 return standard_translator.translate(seq) 220 except AssertionError: 221 # if the feature was pickled then we have problems 222 import cPickle 223 if cPickle.dumps(seq.alphabet) == cPickle.dumps(DEFAULT_ALPHABET): 224 seq.alphabet = DEFAULT_ALPHABET 225 return standard_translator.translate(seq) 226 else: 227 raise
228
229 - def location(self):
230 return easy.LocationFromCoords(self.start-1, self.end-1, self.strand, seqname=self.seqname)
231
232 - def target_location(self):
233 if self.target_start <= self.target_end: 234 return easy.LocationFromCoords(self.target_start-1, self.target_end-1, 0, seqname=self.gname) 235 else: 236 # switch start and end and make it complement: 237 return easy.LocationFromCoords(self.target_end-1, self.target_start-1, 1, seqname=self.gname)
238
239 - def id(self):
240 try: 241 return "%s:%s" % (self.gclass, self.gname) 242 except AttributeError: 243 return ""
244
245 - def record(self):
246 return SeqRecord(self.seq(), self.id(), "", self.location())
247
248 - def xrange(self):
249 return xrange(self.start, self.end)
250
251 - def __str__(self):
252 return "Feature(%s)" % self.location()
253
254 -class FeatureQueryRow(DocSQL.QueryRow, Feature):
255 """ 256 row of FeatureQuery results 257 works like a Feature 258 """
259 - def __init__(self, *args, **keywds):
260 DocSQL.QueryRow.__init__(self, *args, **keywds) 261 try: 262 self.frame = int(self.frame) 263 except ValueError: 264 self.frame = '' 265 except TypeError: 266 self.frame = None 267 self.alphabet = DEFAULT_ALPHABET
268
269 -class FeatureQuery(DocSQL.Query):
270 """ 271 SELECT fdata.fref AS seqname, 272 ftype.fsource AS source, 273 ftype.fmethod AS feature, 274 fdata.fstart AS start, 275 fdata.fstop AS end, 276 fdata.fscore AS score, 277 fdata.fstrand AS strand, 278 fdata.fphase AS frame, 279 fdata.ftarget_start AS target_start, 280 fdata.ftarget_stop AS target_end, 281 fdata.gid, 282 fgroup.gclass, 283 fgroup.gname 284 FROM fdata, ftype, fgroup 285 WHERE fdata.ftypeid = ftype.ftypeid 286 AND fdata.gid = fgroup.gid 287 """ 288 289 row_class = FeatureQueryRow
290 - def __init__(self, 291 seqname=None, 292 coords=None, 293 complement=0, 294 method=None, 295 source=None, 296 object=None, # "class:name" 297 gid=None, 298 *args, 299 **keywds):
300 301 DocSQL.Query.__init__(self, *args, **keywds) 302 303 if seqname is not None: 304 self.statement += ' AND fref="%s"\n' % seqname 305 if coords is not None: 306 self.statement += " AND (%s)\n" % binning.query(coords[0], coords[1]) 307 if coords[0] is not None: 308 self.statement += (" AND fstop >= %s\n" % coords[0]) 309 if coords[1] is not None: 310 self.statement += (" AND fstart <= %s\n" % coords[1]) 311 if method is not None: 312 self.statement += ' AND fmethod LIKE "%s"\n' % method # LIKE allows SQL "%" wildcards 313 if source is not None: 314 self.statement += ' AND fsource LIKE "%s"\n' % source 315 if object is not None: 316 self.statement += ' AND %s\n' % object2fgroup_sql(object) 317 if gid is not None: 318 self.statement += ' AND fgroup.gid = %s\n' % gid 319 320 if complement: 321 self.statement += " ORDER BY 0-fstart, 0-fstop" 322 else: 323 self.statement += " ORDER BY fstart, fstop"
324
325 - def aggregate(self):
326 return FeatureAggregate(self)
327
328 -def object2fgroup_sql(object):
329 return 'gclass LIKE "%s" AND gname LIKE "%s"' % object_split(object)
330
331 -class FeatureAggregate(list, Feature):
332 """ 333 >>> feature1_1 = Feature(location=easy.LocationFromString("NC_001802x.fna:336..1631"), frame=0) # gag-pol 334 >>> feature1_2 = Feature(location=easy.LocationFromString("NC_001802x.fna:1631..4642"), frame=0) # slippage 335 >>> aggregate = FeatureAggregate([feature1_1, feature1_2]) 336 >>> print aggregate.location() 337 join(NC_001802x.fna:336..1631,NC_001802x.fna:1631..4642) 338 >>> xlate_str = aggregate.translate().tostring() 339 >>> xlate_str[:5], xlate_str[-5:] 340 ('MGARA', 'RQDED') 341 342 >>> location1 = easy.LocationFromString("NC_001802x.fna:complement(1..6)") 343 >>> location2 = easy.LocationFromString("NC_001802x.fna:complement(7..12)") 344 >>> feature2_1 = Feature(location=location1, frame=0) 345 >>> feature2_2 = Feature(location=location2, frame=0) 346 >>> aggregate2 = FeatureAggregate([feature2_1, feature2_2]) 347 >>> print aggregate2.location() 348 complement(join(NC_001802x.fna:1..6,NC_001802x.fna:7..12)) 349 >>> print aggregate2.translate() 350 Seq('TRET', HasStopCodon(IUPACProtein(), '*')) 351 >>> location1.reverse() 352 >>> location2.reverse() 353 >>> aggregate3 = FeatureAggregate([Feature(location=x, frame=0) for x in [location1, location2]]) 354 >>> print aggregate3.location() 355 join(NC_001802x.fna:1..6,NC_001802x.fna:7..12) 356 >>> print aggregate3.translate() 357 Seq('GLSG', HasStopCodon(IUPACProtein(), '*')) 358 >>> aggregate3[0].frame = 3 359 >>> print aggregate3.translate() 360 Seq('LSG', HasStopCodon(IUPACProtein(), '*')) 361 362 >>> aggregate4 = FeatureAggregate() 363 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:1..5"), frame=0)) 364 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:6..12"), frame=2)) 365 >>> aggregate4.seq() 366 Seq('GGTCTCTCTGGT', Alphabet()) 367 >>> aggregate4.translate() 368 Seq('GLSG', HasStopCodon(IUPACProtein(), '*')) 369 """
370 - def __init__(self, feature_query=None):
371 if feature_query is None: 372 list.__init__(self, []) 373 else: 374 list.__init__(self, map(lambda x: x, feature_query))
375
376 - def location(self):
377 loc = easy.LocationJoin(map(lambda x: x.location(), self)) 378 loc.reorient() 379 return loc
380
381 - def map(self, func):
382 mapped = map(func, self) 383 if self.location().complement: 384 mapped.reverse() 385 return reduce(operator.add, mapped)
386
387 - def seq(self):
388 return self.map(lambda x: x.seq())
389
390 - def translate(self):
391 attributes = ['alphabet', 'frame'] 392 index = [0, -1][self.location().complement] 393 for attribute in attributes: 394 self.__dict__[attribute] = self[index].__dict__[attribute] 395 translation = Feature.translate(self) 396 try: 397 assert translation.tostring().index("*") == len(translation) - 1 398 return translation[:translation.tostring().index("*")] 399 except ValueError: 400 return translation
401
402 -def object_split(object):
403 """ 404 >>> object_split("Sequence:F02E9.2a") 405 ('Sequence', 'F02E9.2a') 406 """ 407 return tuple(object.split(":"))
408
409 -def _test(*args, **keywds):
410 import doctest, sys 411 doctest.testmod(sys.modules[__name__], *args, **keywds)
412 413 if __name__ == "__main__": 414 if __debug__: 415 _test() 416