1 """Hold GenBank data in a straightforward format.
2
3 classes:
4 o Record - All of the information in a GenBank record.
5 o Reference - hold reference data for a record.
6 o Feature - Hold the information in a Feature Table.
7 o Qualifier - Qualifiers on a Feature.
8 """
9
10 import Bio.GenBank
11
def _wrapped_genbank(information, indent, wrap_space=1, split_char=" "):
    """Write a line of GenBank info that can wrap over multiple lines.

    This takes a line of information which can potentially wrap over
    multiple lines, and breaks it up with newlines and indentation so it
    fits properly into a GenBank record.

    Arguments:

    o information - The string holding the information we want
    wrapped in GenBank method.

    o indent - The indentation on the lines we are writing.

    o wrap_space - Whether or not to wrap only on spaces in the
    information.

    o split_char - A specific character to split the lines on. By default
    spaces are used.

    Returns a string in which the first line is not indented, every
    following line is prefixed with ``indent`` spaces, and every line
    (including the last) ends with a newline.  If there is nothing to
    write (empty input, or input made only of separators) a single
    newline is returned.
    """
    # NOTE(review): the ``def`` line was missing from the listing this was
    # rebuilt from; the name and parameter order follow the docstring, and
    # the ``wrap_space=1`` default is an assumption -- confirm with callers.
    info_length = Record.GB_LINE_LENGTH - indent

    if wrap_space:
        info_parts = information.split(split_char)
    else:
        # Hard wrap: cut the text into fixed-width slices.
        cur_pos = 0
        info_parts = []
        while cur_pos < len(information):
            info_parts.append(information[cur_pos: cur_pos + info_length])
            cur_pos += info_length

    # First get the information split up by output line.
    output_parts = []
    cur_part = ""
    for info_part in info_parts:
        # The "+ 1" leaves room for the separator that would join the parts.
        if len(cur_part) + 1 + len(info_part) > info_length:
            if cur_part:
                # A non-space separator is kept at the end of the line it
                # was split after, so no character of the input is lost.
                if split_char != " ":
                    cur_part += split_char
                output_parts.append(cur_part)
            cur_part = info_part
        elif cur_part == "":
            cur_part = info_part
        else:
            cur_part += split_char + info_part

    # Add the last bit of information to the output.
    if cur_part:
        output_parts.append(cur_part)

    # Nothing to write: previously this fell through to output_parts[0]
    # and raised IndexError.
    if not output_parts:
        return "\n"

    # First line unindented, continuation lines indented, all terminated.
    return ("\n" + " " * indent).join(output_parts) + "\n"
69
71 """Write out information with the specified indent.
72
73 Unlike _wrapped_genbank, this function makes no attempt to wrap
74 lines -- it assumes that the information already has newlines in the
75 appropriate places, and will add the specified indent to the start of
76 each line.
77 """
78
79 info_parts = information.split("\n")
80
81
82 output_info = info_parts[0] + "\n"
83 for info_part in info_parts[1:]:
84 output_info += " " * indent + info_part + "\n"
85
86 return output_info
87
class Record:
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
    o locus - The name specified after the LOCUS keyword in the GenBank
    record. This may be the accession number, or a clone id or something else.
    o size - The size of the record.
    o residue_type - The type of residues making up the sequence in this
    record. Normally something like RNA, DNA or PROTEIN, but may be as
    esoteric as 'ss-RNA circular'.
    o data_file_division - The division this record is stored under in
    GenBank (i.e. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
    o date - The date of submission of the record, in a form like '28-JUL-1998'
    o accession - list of all accession numbers for the sequence.
    o nid - Nucleotide identifier number.
    o pid - Protein identifier number.
    o version - The accession number + version (i.e. AB01234.2)
    o db_source - Information about the database the record came from
    o gi - The NCBI gi identifier for the record.
    o keywords - A list of keywords related to the record.
    o segment - If the record is one of a series, this is info about which
    segment this record is (something like '1 of 6').
    o source - The source of material where the sequence came from.
    o organism - The genus and species of the organism (i.e. 'Homo sapiens')
    o taxonomy - A listing of the taxonomic classification of the organism,
    starting general and getting more specific.
    o references - A list of Reference objects.
    o comment - Text with any kind of comment about the record.
    o features - A listing of Features making up the feature table.
    o base_counts - A string with the counts of bases for the sequence.
    o origin - A string specifying info about the origin of the sequence.
    o sequence - A string with the sequence itself.
    o contig - A string of location information for a CONTIG in a RefSeq file
    o projects - The genome sequencing project numbers
    (will be replaced by the dblink cross-references in 2009).
    o dblinks - The genome sequencing project number(s) and other links.
    (will replace the project information in 2009).
    """

    # NOTE(review): this class was rebuilt from a damaged listing in which
    # the ``class``/``def`` header lines were missing.  The method names
    # below are reconstructed from their docstrings, and the constructor,
    # __str__ and several other line writers are not present here.  Spacing
    # inside string literals is reproduced as found.  Confirm all of this
    # against the canonical file before merging.

    # Column layout of a GenBank flat file.
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # %-format templates that pad a keyword out to its column width.
    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    INTERNAL_FORMAT = (" " * GB_INTERNAL_INDENT + "%-" +
                       str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s")
    OTHER_INTERNAL_FORMAT = (" " * GB_OTHER_INTERNAL_INDENT + "%-" +
                             str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) +
                             "s")

    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = (" " * GB_FEATURE_INTERNAL_INDENT + "%-" +
                               str(GB_FEATURE_INDENT -
                                   GB_FEATURE_INTERNAL_INDENT) + "s")
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"

    def _locus_line(self):
        """Provide the output string for the LOCUS line.

        Reads locus, size, residue_type, data_file_division and date and
        returns the assembled line, terminated by a newline.
        """
        output = "LOCUS"
        output += " " * 7
        output += "%-9s" % self.locus
        output += " "
        output += "%7s" % self.size
        if "PROTEIN" in self.residue_type:
            output += " aa"
        else:
            output += " bp "

        # Circular types get one wide field, since they have long
        # residue type strings.
        if "circular" in self.residue_type:
            output += "%17s" % self.residue_type
        # Hyphenated residue types (such as 'ss-DNA') are right-aligned.
        elif "-" in self.residue_type:
            output += "%7s" % self.residue_type
            output += " " * 10
        else:
            output += " " * 3
            output += "%-4s" % self.residue_type
            output += " " * 10

        output += " " * 2
        output += "%3s" % self.data_file_division
        output += " " * 7
        output += "%11s" % self.date
        output += "\n"
        return output

    def _version_line(self):
        """Output for the VERSION line.

        Returns an empty string when no version is set.
        """
        if not self.version:
            return ""
        output = Record.BASE_FORMAT % "VERSION"
        output += self.version
        output += " GI:"
        output += "%s\n" % self.gi
        return output

    def _project_line(self):
        """Output for the PROJECT line.

        Returns an empty string when there are no projects.
        """
        output = ""
        if self.projects:
            output = Record.BASE_FORMAT % "PROJECT"
            output += "%s\n" % " ".join(self.projects)
        return output

    def _nid_line(self):
        """Output for the NID line. Use of NID is obsolete in GenBank files.
        """
        if not self.nid:
            return ""
        output = Record.BASE_FORMAT % "NID"
        output += "%s\n" % self.nid
        return output

    def _pid_line(self):
        """Output for PID line. Presumably, PID usage is also obsolete.
        """
        if not self.pid:
            return ""
        output = Record.BASE_FORMAT % "PID"
        output += "%s\n" % self.pid
        return output

    def _db_source_line(self):
        """Output for DBSOURCE line.
        """
        if not self.db_source:
            return ""
        output = Record.BASE_FORMAT % "DBSOURCE"
        output += "%s\n" % self.db_source
        return output

    def _features_line(self):
        """Output for the FEATURES line.

        Returns an empty string when the record has no features.
        """
        output = ""
        if self.features:
            output += Record.BASE_FEATURE_FORMAT % "FEATURES"
            output += "Location/Qualifiers\n"
        return output

    def _base_count_line(self):
        """Output for the BASE COUNT line with base information.

        Returns an empty string when base_counts is empty.
        """
        output = ""
        if self.base_counts:
            output += Record.BASE_FORMAT % "BASE COUNT "

            # Split the base counts into their individual parts, dropping
            # the empty strings that runs of spaces produce.
            count_parts = [part for part in self.base_counts.split(" ")
                           if part]

            # Standard case: an even number of tokens, read as
            # (count, base) pairs.
            if len(count_parts) % 2 == 0:
                pairs = zip(count_parts[0::2], count_parts[1::2])
                output += "".join("%7s %s" % pair for pair in pairs)
            # Tokens that do not pair up (counts run into the next base
            # letter): just output the raw information.
            else:
                output += self.base_counts
            output += "\n"
        return output

    def _sequence_line(self):
        """Output for all of the sequence.

        Each output line carries the 1-based position of its first residue
        followed by up to six space-separated blocks of ten lower-cased
        residues.  Returns an empty string when there is no sequence.
        """
        if not self.sequence:
            return ""

        seq_length = len(self.sequence)
        pieces = []
        cur_seq_pos = 0
        while cur_seq_pos < seq_length:
            pieces.append(Record.SEQUENCE_FORMAT % str(cur_seq_pos + 1))

            for section in range(6):
                start_pos = cur_seq_pos + section * 10
                end_pos = start_pos + 10
                seq_section = self.sequence[start_pos:end_pos]
                pieces.append(" %s" % seq_section.lower())

                # Stop once the sequence is used up.  ">=" rather than
                # ">": otherwise a sequence ending exactly on a block
                # boundary gets an extra empty block (a trailing space).
                if end_pos >= seq_length:
                    break

            pieces.append("\n")
            cur_seq_pos += 60
        return "".join(pieces)
464
474
class Reference:
    """Hold information from a GenBank reference.

    Attributes:
    o number - The number of the reference in the listing of references.
    o bases - The bases in the sequence the reference refers to.
    o authors - String with all of the authors.
    o consrtm - Consortium the authors belong to.
    o title - The title of the reference.
    o journal - Information about the journal where the reference appeared.
    o medline_id - The medline id for the reference.
    o pubmed_id - The pubmed_id for the reference.
    o remark - Free-form remarks about the reference.
    """

    # NOTE(review): rebuilt from a damaged listing in which the ``class`` and
    # ``def`` header lines were missing; the method name is reconstructed
    # from its docstring, and the constructor and remaining line writers are
    # not present here -- confirm against the canonical file.

    def _reference_line(self):
        """Output for REFERENCE lines.

        Writes the REFERENCE keyword followed by the reference number and,
        when set, the bases it refers to.  The line is always terminated by
        a newline, even when no number is set.
        """
        output = Record.BASE_FORMAT % "REFERENCE"
        if self.number:
            if self.bases:
                # Pad the number so the bases start in a fixed column.
                output += "%-3s" % self.number
                output += "%s" % self.bases
            else:
                output += "%s" % self.number

        output += "\n"
        return output
525
534
543
552
561
570
579
588
class Feature:
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:
    o key - The key name of the feature (i.e. source)
    o location - The string specifying the location of the feature.
    o qualifiers - A listing of Qualifier objects in the feature.
    """

    # NOTE(review): rebuilt from a damaged listing in which the ``class`` and
    # ``def`` header lines were missing; any further methods of this class
    # are not present here -- confirm against the canonical file.

    def __init__(self):
        """Start with an empty key, an empty location and no qualifiers."""
        self.key = ''
        self.location = ''
        # A fresh list per instance, so features never share qualifiers.
        self.qualifiers = []
601
619
class Qualifier:
    """Hold information about a qualifier in a GenBank feature.

    Attributes:
    o key - The key name of the qualifier (i.e. /organism=)
    o value - The value of the qualifier ("Dictyostelium discoideum").
    """

    # NOTE(review): only the docstring of this class survived in the listing
    # this was rebuilt from; its constructor and any methods are not present
    # here -- restore them from the canonical file.
630