Package Bio :: Module SeqFeature
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqFeature

  1  """Represent a Sequence Feature holding info about a part of a sequence. 
  2   
  3  This is heavily modeled after the Biocorba SeqFeature objects, and 
  4  may be pretty biased towards GenBank stuff since I'm writing it 
  5  for the GenBank parser output... 
  6   
  7  What's here: 
  8   
  9  Base class to hold a Feature. 
 10  ---------------------------- 
 11  classes: 
 12  o SeqFeature 
 13   
 14  Hold information about a Reference. 
 15  ---------------------------------- 
 16   
 17  This is an attempt to create a General class to hold Reference type 
 18  information. 
 19   
 20  classes: 
 21  o Reference 
 22   
 23  Specify locations of a feature on a Sequence. 
 24  --------------------------------------------- 
 25   
 26  This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in 
 27  much the same way as Biocorba. This has the advantages of allowing us 
 28  to handle fuzzy stuff in case anyone needs it, and also be compatible 
 29  with Biocorba. 
 30   
 31  classes: 
 32  o FeatureLocation - Specify the start and end location of a feature. 
 33   
 34  o ExactPosition - Specify the position as being exact. 
 35  o WithinPosition - Specify a position occurring within some range. 
 36  o BetweenPosition - Specify a position occurring between a range. 
 37  o BeforePosition - Specify the position as being found before some base. 
 38  o AfterPosition - Specify the position as being found after some base. 
 39  """ 
 40   
class SeqFeature:
    """Represent a Sequence Feature on an object.

    Attributes:
    o location - the location of the feature on the sequence
    o type - the specified type of the feature (ie. CDS, exon, repeat...)
    o location_operator - a string specifying how this SeqFeature may
    be related to others. For example, in the example GenBank feature
    shown below, the location_operator would be "join"
    o strand - A value specifying on which strand (of a DNA sequence, for
    instance) the feature deals with. 1 indicates the plus strand, -1
    indicates the minus strand, 0 indicates both strands, and None indicates
    that strand doesn't apply (ie. for proteins) or is not known.
    o id - A string identifier for the feature.
    o ref - A reference to another sequence. This could be an accession
    number for some different sequence.
    o ref_db - A different database for the reference accession number.
    o qualifiers - A dictionary of qualifiers on the feature. These are
    analogous to the qualifiers from a GenBank feature table. The keys of
    the dictionary are qualifier names, the values are the qualifier
    values.
    o sub_features - Additional SeqFeatures which fall under this 'parent'
    feature. For instance, if we have something like:

    CDS    join(1..10,30..40,50..60)

    Then the top level feature would be a CDS from 1 to 60, and the sub
    features would be of 'CDS_join' type and would be from 1 to 10, 30 to
    40 and 50 to 60, respectively.
    """
    def __init__(self, location = None, type = '', location_operator = '',
                 strand = None, id = "<unknown id>",
                 qualifiers = None, sub_features = None,
                 ref = None, ref_db = None):
        """Initialize a SeqFeature on a Sequence.

        Every argument is stored on the attribute of the same name.

        qualifiers and sub_features default to None, which means "give this
        feature its own new empty dictionary / list".  When a dictionary or
        list is passed in, that very object is stored (it is not copied).
        """
        self.location = location

        self.type = type
        self.location_operator = location_operator
        self.strand = strand
        self.id = id
        # The defaults are None rather than {} and []: a mutable default is
        # built once when the function is defined and would then be shared
        # by every feature created without these arguments.
        if qualifiers is None:
            qualifiers = {}
        if sub_features is None:
            sub_features = []
        self.qualifiers = qualifiers
        self.sub_features = sub_features
        self.ref = ref
        self.ref_db = ref_db

    def __repr__(self):
        """A string representation of the record for debugging.

        The location is always shown; the other fields only appear when
        they carry information.
        """
        pieces = ["%s(%s" % (self.__class__, repr(self.location))]
        if self.type:
            pieces.append(", type=%s" % repr(self.type))
        if self.location_operator:
            pieces.append(", location_operator=%s"
                          % repr(self.location_operator))
        # A strand of 0 means "both strands", which is information worth
        # showing, so only None is treated as "nothing to report".
        if self.strand is not None:
            pieces.append(", strand=%s" % repr(self.strand))
        if self.id and self.id != "<unknown id>":
            pieces.append(", id=%s" % repr(self.id))
        if self.ref:
            pieces.append(", ref=%s" % repr(self.ref))
        if self.ref_db:
            pieces.append(", ref_db=%s" % repr(self.ref_db))
        pieces.append(")")
        return "".join(pieces)

    def __str__(self):
        """A readable summary of the feature intended to be printed to screen.

        Qualifiers are listed in sorted key order; sub-features (if any)
        follow, each rendered with its own str().
        """
        lines = ["type: %s\n" % self.type,
                 "location: %s\n" % self.location,
                 "ref: %s:%s\n" % (self.ref, self.ref_db),
                 "strand: %s\n" % self.strand,
                 "qualifiers: \n"]
        # list() first so this also works where keys() is not a list.
        qualifier_keys = list(self.qualifiers.keys())
        qualifier_keys.sort()
        for qual_key in qualifier_keys:
            lines.append("\tKey: %s, Value: %s\n"
                         % (qual_key, self.qualifiers[qual_key]))
        if self.sub_features:
            lines.append("Sub-Features\n")
            for sub_feature in self.sub_features:
                lines.append("%s\n" % sub_feature)

        return "".join(lines)
129 130 # --- References 131 132 # TODO -- Will this hold PubMed and Medline information decently?
class Reference:
    """Represent a Generic Reference object.

    Attributes:
    o location - A list of Location objects specifying regions of
    the sequence that the reference corresponds to. If no locations are
    specified, the entire sequence is assumed.
    o authors - A big old string, or a list split by author, of authors
    for the reference.
    o consrtm - Presumably the consortium behind the reference (the name
    suggests it; confirm against the parser that fills it in).
    o title - The title of the reference.
    o journal - Journal the reference was published in.
    o medline_id - A medline reference for the article.
    o pubmed_id - A pubmed reference for the article.
    o comment - A place to stick any comments about the reference.
    """
    def __init__(self):
        """Start with no locations and every text field empty."""
        self.location = []
        # all textual fields begin as empty strings
        self.authors = self.consrtm = self.title = ''
        self.journal = self.medline_id = ''
        self.pubmed_id = self.comment = ''

    def __str__(self):
        """Output an informative string for debugging.

        One "location:" line is written per stored location, followed by
        one line per field.  The consrtm line is left out when it is empty.
        """
        rows = ["location: %s\n" % place for place in self.location]
        rows.append("authors: %s\n" % self.authors)
        if self.consrtm:
            rows.append("consrtm: %s\n" % self.consrtm)
        rows.extend(["title: %s\n" % self.title,
                     "journal: %s\n" % self.journal,
                     "medline id: %s\n" % self.medline_id,
                     "pubmed id: %s\n" % self.pubmed_id,
                     "comment: %s\n" % self.comment])
        return "".join(rows)
174 175 # --- Handling feature locations 176
class FeatureLocation:
    """Specify the location of a feature along a sequence.

    Both ends of the location may be fuzzy, yet the ordinary non-fuzzy
    case stays easy to use.

    Read the ends through your_location.start and your_location.end;
    each returns the stored position object (an ExactPosition for a plain
    end, or whichever fuzzy position object was supplied).

    Start and end numbering follows Python's scheme, so a GenBank entry
    of 123..150 (one based counting) becomes a location of [122:150]
    (zero based counting).
    """
    def __init__(self, start, end):
        """Specify the start and end of a sequence feature.

        start and end say where the feature begins and ends.  Each may be
        any of the *Position objects inheriting from AbstractPosition, or
        a bare value (normally an integer).  Bare values are taken to be
        exact and are wrapped in an ExactPosition, which keeps non-fuzzy
        ends easy to specify.
        """
        if not isinstance(start, AbstractPosition):
            start = ExactPosition(start)
        self._start = start

        if not isinstance(end, AbstractPosition):
            end = ExactPosition(end)
        self._end = end

    def __str__(self):
        """Returns a representation of the location (with python counting).

        For the simple case this uses the python slicing syntax, [122:150]
        (zero based counting) which GenBank would call 123..150 (one based
        counting).
        """
        return "[%s:%s]" % (self._start, self._end)

    def __repr__(self):
        """A string representation of the location for debugging."""
        return "%s(%r,%r)" % (self.__class__, self.start, self.end)

    def __getattr__(self, attr):
        """Make it easy to get non-fuzzy starts and ends.

        Attribute lookup is intercepted here so that four computed names
        are available:

        o start, end - the stored (possibly fuzzy) position objects.
        o nofuzzy_start, nofuzzy_end - plain positions spanning the
        largest range the fuzzy ends allow.  So something like
        (10.20)..(30.40) gives 10 for the start and 40 for the end.

        The one special case is a location whose start and end compare
        equal and whose start is a BetweenPosition, like 2^3.  That really
        denotes an empty stretch, so nofuzzy_start and nofuzzy_end both
        return the bare position with no extension applied.  There is no
        general rule that covers this, hence the explicit check.

        Any other name raises AttributeError.
        """
        if attr == 'start':
            return self._start
        if attr == 'end':
            return self._end

        if attr == 'nofuzzy_start':
            single_between = ((self._start == self._end)
                              and isinstance(self._start, BetweenPosition))
            if single_between:
                return self._start.position
            first = self._start
            return min(first.position, first.position + first.extension)

        if attr == 'nofuzzy_end':
            single_between = ((self._start == self._end)
                              and isinstance(self._start, BetweenPosition))
            if single_between:
                return self._end.position
            last = self._end
            return max(last.position, last.position + last.extension)

        raise AttributeError("Cannot evaluate attribute %s." % attr)
266
class AbstractPosition:
    """Abstract base class representing a position.

    Attributes:
    o position - The position value itself.
    o extension - How far the position may extend (zero when there is no
    fuzziness).

    Positions compare and order by their position attribute only; the
    extension is ignored.  Equality is defined without a matching
    __hash__, so under Python 3 instances are not hashable.
    """
    def __init__(self, position, extension):
        """Store the position and its extension unchanged."""
        self.position = position
        self.extension = extension

    def __repr__(self):
        """String representation of the location for debugging."""
        return "%s(%s,%s)" \
               % (self.__class__, repr(self.position), repr(self.extension))

    def __cmp__(self, other):
        """A simple comparison function for positions.

        This is very simple-minded and just compares the position attribute
        of the features; extensions are not considered at all. This could
        potentially be expanded to try to take advantage of extensions.

        Returns -1, 0 or 1.  Kept for callers that use three-way
        comparison; Python 3 never calls it implicitly, which is why the
        rich comparison methods below exist as well.
        """
        assert isinstance(other, AbstractPosition), \
               "We can only do comparisons between Biopython Position objects."

        mine = self.position
        theirs = other.position
        # Portable spelling of cmp(mine, theirs); the cmp() builtin does
        # not exist under Python 3.
        return (mine > theirs) - (mine < theirs)

    # Rich comparisons.  Each one compares the position attribute only and
    # returns NotImplemented for anything that is not a position object, so
    # the interpreter can apply its normal fallback rules.

    def __eq__(self, other):
        """True when both positions have an equal position attribute."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position == other.position

    def __ne__(self, other):
        """True when the two position attributes differ."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position != other.position

    def __lt__(self, other):
        """True when this position lies before the other one."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position < other.position

    def __le__(self, other):
        """True when this position lies at or before the other one."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position <= other.position

    def __gt__(self, other):
        """True when this position lies after the other one."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position > other.position

    def __ge__(self, other):
        """True when this position lies at or after the other one."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position >= other.position
290
class ExactPosition(AbstractPosition):
    """Specify the specific position of a boundary.

    Arguments:
    o position - The position of the boundary.
    o extension - An optional argument which must be zero since an exact
    position has no extension.  It is accepted only so that every position
    type can be constructed with the same number of arguments.

    There is no fuzziness associated with this kind of position.
    """
    def __init__(self, position, extension = 0):
        """Store the position; a non-zero extension raises AttributeError."""
        has_extension = extension != 0
        if has_extension:
            raise AttributeError("Non-zero extension %s for exact position."
                                 % extension)
        # the stored extension is always the literal zero
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """String representation of the ExactPosition location for debugging."""
        # sanity check: an exact position must never carry an extension
        assert self.extension == 0
        return "%s(%r)" % (self.__class__, self.position)

    def __str__(self):
        """Return just the position, converted with str()."""
        position_text = str(self.position)
        return position_text
314
class WithinPosition(AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
    o position - The start position of the boundary
    o extension - The range to which the boundary can extend.

    This handles a position like ((1.4)..100), which says the start of
    the sequence is somewhere between 1 and 4.  With this class that is
    written as position 1 and extension 3.
    """
    def __init__(self, position, extension = 0):
        """Pass position and extension straight to AbstractPosition."""
        AbstractPosition.__init__(self, position, extension)

    def __str__(self):
        """Format as "(low.high)", where high is position + extension."""
        low = self.position
        high = low + self.extension
        return "(%s.%s)" % (low, high)
332
class BetweenPosition(AbstractPosition):
    """Specify the position of a boundary between two coordinates.

    Arguments:
    o position - The start position of the boundary.
    o extension - The range to the other position of a boundary.

    The coordinate being described lies between the two positions, which
    lets us handle a position like ((1^2)..100).  With this class that is
    written as position 1 and extension 1.
    """
    def __init__(self, position, extension = 0):
        """Pass position and extension straight to AbstractPosition."""
        AbstractPosition.__init__(self, position, extension)

    def __str__(self):
        """Format as "(low^high)", where high is position + extension."""
        low = self.position
        high = low + self.extension
        return "(%s^%s)" % (low, high)
350
class BeforePosition(AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
    o position - The upper boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.
    """
    def __init__(self, position, extension = 0):
        """Store the position; a non-zero extension raises AttributeError."""
        if extension != 0:
            # The message names this position type (it used to say "exact
            # position", which pointed at the wrong class).
            raise AttributeError("Non-zero extension %s for before position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """A string representation of the location for debugging."""
        # sanity check: this position type never carries an extension
        assert self.extension == 0
        return "%s(%s)" % (self.__class__, repr(self.position))

    def __str__(self):
        """Format as "<position"."""
        return "<%s" % self.position
376
class AfterPosition(AbstractPosition):
    """Specify a position where the actual location is found after it.

    Arguments:
    o position - The lower boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.
    """
    def __init__(self, position, extension = 0):
        """Store the position; a non-zero extension raises AttributeError."""
        if extension != 0:
            # The message names this position type (it used to say "exact
            # position", which pointed at the wrong class).
            raise AttributeError("Non-zero extension %s for after position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """A string representation of the location for debugging."""
        # sanity check: this position type never carries an extension
        assert self.extension == 0
        return "%s(%s)" % (self.__class__, repr(self.position))

    def __str__(self):
        """Format as ">position"."""
        return ">%s" % self.position
402
class OneOfPosition(AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. In our case
    the position of the "one-of" is set as the lowest choice, and the
    extension is the range to the highest choice.

    Attributes (in addition to position and extension):
    o position_choices - The position_list object given to the
    initializer, stored as is.
    """
    def __init__(self, position_list):
        """Initialize with a set of possible positions.

        position_list is a list of AbstractPosition derived objects,
        specifying possible locations.  It must hold at least one choice;
        an empty list raises ValueError, because there would be no lowest
        or highest choice to derive position and extension from.
        """
        # unique attribute for this type of positions
        self.position_choices = position_list
        # Collect every choice's position in a single pass, checking the
        # type of each choice on the way.
        candidate_positions = []
        for position_choice in self.position_choices:
            assert isinstance(position_choice, AbstractPosition), \
                   "Expected position objects, got %r" % position_choice
            candidate_positions.append(position_choice.position)
        if not candidate_positions:
            raise ValueError("OneOfPosition requires at least one "
                             "position choice")
        smallest = min(candidate_positions)
        largest = max(candidate_positions)
        # initialize with our definition of position and extension
        AbstractPosition.__init__(self, smallest, largest - smallest)

    def __repr__(self):
        """String representation of the OneOfPosition location for debugging."""
        return "%s(%s)" % (self.__class__, repr(self.position_choices))

    def __str__(self):
        """Format as "one-of(a,b,...)" using str() of every choice."""
        # join() puts commas only between choices, so the result stays
        # well-formed even if position_choices has been emptied.
        choices = [str(position) for position in self.position_choices]
        return "one-of(%s)" % ",".join(choices)
446
class PositionGap:
    """Simple class to hold information about a gap between positions.

    Attributes:
    o gap_size - Whatever value was handed to the initializer.
    """
    def __init__(self, gap_size):
        """Initialize with the object describing the size of the gap."""
        self.gap_size = gap_size

    def __repr__(self):
        """A string representation of the position gap for debugging."""
        return "%s(%r)" % (self.__class__, self.gap_size)

    def __str__(self):
        """Format as "gap(size)"."""
        return "gap(%s)" % self.gap_size
462