1
2
3
4
5
6 """
7 Parser for PHD files output by PHRED and used by PHRAP and CONSED.
8
9 This module can be used used directly which will return Record objects
10 which should contain all the original data in the file.
11
12 Alternatively, using Bio.SeqIO with the "phd" format will call this module
13 internally. This will give SeqRecord objects for each contig sequence.
14 """
15
16 from Bio import Seq
17 from Bio.Alphabet import IUPAC
18
19 CKEYWORDS=['CHROMAT_FILE','ABI_THUMBPRINT','PHRED_VERSION','CALL_METHOD',\
20 'QUALITY_LEVELS','TIME','TRACE_ARRAY_MIN_INDEX','TRACE_ARRAY_MAX_INDEX',\
21 'TRIM','TRACE_PEAK_AREA_RATIO','CHEM','DYE']
22
24 """Hold information from a PHD file."""
26 self.file_name = ''
27 self.comments={}
28 for kw in CKEYWORDS:
29 self.comments[kw.lower()]=None
30 self.sites = []
31 self.seq = ''
32 self.seq_trimmed = ''
33
34
36 """Reads the next PHD record from the file, returning it as a Record object.
37
38 This function reads PHD file data line by line from the handle,
39 and returns a single Record object.
40 """
41 for line in handle:
42 if line.startswith("BEGIN_SEQUENCE"):
43 record = Record()
44 record.file_name = line[15:].rstrip()
45 break
46 else:
47 return
48
49 for line in handle:
50 if line.startswith("BEGIN_COMMENT"):
51 break
52 else:
53 raise ValueError("Failed to find BEGIN_COMMENT line")
54
55 for line in handle:
56 line = line.strip()
57 if not line:
58 continue
59 if line=="END_COMMENT":
60 break
61 keyword, value = line.split(":", 1)
62 keyword = keyword.lower()
63 value = value.strip()
64 if keyword in ('chromat_file',
65 'phred_version',
66 'call_method',
67 'chem',
68 'dye',
69 'time',
70 'basecaller_version',
71 'trace_processor_version'):
72 record.comments[keyword] = value
73 elif keyword in ('abi_thumbprint',
74 'quality_levels',
75 'trace_array_min_index',
76 'trace_array_max_index'):
77 record.comments[keyword] = int(value)
78 elif keyword=='trace_peak_area_ratio':
79 record.comments[keyword] = float(value)
80 elif keyword=='trim':
81 first, last, prob = value.split()
82 record.comments[keyword] = (int(first), int(last), float(prob))
83 else:
84 raise ValueError("Failed to find END_COMMENT line")
85
86 for line in handle:
87 if line.startswith('BEGIN_DNA'):
88 break
89 else:
90 raise ValueError("Failed to find BEGIN_DNA line")
91
92 for line in handle:
93 if line.startswith('END_DNA'):
94 break
95 else:
96 base, quality, location = line.split()
97 record.sites.append((base, quality, location))
98
99 for line in handle:
100 if line.startswith("END_SEQUENCE"):
101 break
102 else:
103 raise ValueError("Failed to find END_SEQUENCE line")
104
105 alphabet = IUPAC.IUPACAmbiguousDNA()
106 record.seq = Seq.Seq(''.join([n[0] for n in record.sites]), alphabet)
107 if record.comments['trim'] is not None:
108 first, last = record.comments['trim'][:2]
109 record.seq_trimmed = record.seq[first:last]
110
111 return record
112
114 """Iterates over a file returning multiple PHD records.
115
116 The data is read line by line from the handle. The handle can be a list
117 of lines, an open file, or similar; the only requirement is that we can
118 iterate over the handle to retrieve lines from it.
119
120 Typical usage:
121
122 records = parse(handle)
123 for record in records:
124 # do something with the record object
125 """
126 while True:
127 record = read(handle)
128 if not record:
129 return
130 yield record
131
132
133
134
135 from Bio import File
136 from Bio.ParserSupport import *
137
138
140 """Iterates over a file of multiple PHD records (DEPRECATED).
141
142 Methods:
143 next Return the next record from the stream, or None.
144 """
145
146 - def __init__(self, handle, parser=None):
147 """Create a new iterator.
148
149 Create a new iterator. handle is a file-like object. parser
150 is an optional Parser object to change the results into another form.
151 If set to None, then the raw contents of the file will be returned.
152 """
153 import warnings
154 warnings.warn("Bio.Sequencing.Phd.Iterator is deprecated. Please use Bio.Sequencing.Phd.parse(handle) instead of Bio.Sequencing.Phd.Iterator(handle, RecordParser())", DeprecationWarning)
155 self._uhandle = File.UndoHandle(handle)
156 self._parser = parser
157
159 lines = []
160 while 1:
161 line = self._uhandle.readline()
162 if not line:
163 break
164
165 if lines and line[:14] == 'BEGIN_SEQUENCE':
166 self._uhandle.saveline(line)
167 break
168 lines.append(line)
169
170 if not lines:
171 return None
172
173 data = ''.join(lines)
174 if self._parser is not None:
175 return self._parser.parse(File.StringHandle(data))
176 return data
177
179 """Iterate over the PHY file record by record."""
180 return iter(self.next, None)
181
183 """Parses PHD file data into a Record object (DEPRECATED)."""
185 import warnings
186 warnings.warn("Bio.Sequencing.Phd.RecordParser is deprecated. Please use Bio.Sequencing.Phd.read(handle) instead of Bio.Sequencing.Phd.RecordParser().parse(handle)", DeprecationWarning)
187 self._scanner = _Scanner()
188 self._consumer = _RecordConsumer()
189
190 - def parse(self, handle):
197
198
200 """Scans a PHD-formatted file (DEPRECATED).
201
202 Methods:
203 feed - Feed one PHD record.
204 """
205 - def feed(self, handle, consumer):
206 """Reads in PDH data from the handle for scanning.
207
208 Feed in PHD data for scanning. handle is a file-like object
209 containing PHD data. consumer is a Consumer object that will
210 receive events as the PHD data is scanned.
211 """
212 assert isinstance(handle, File.UndoHandle), \
213 "handle must be an UndoHandle"
214 if handle.peekline():
215 self._scan_record(handle, consumer)
216
222
225
241
243 while 1:
244 line = uhandle.readline()
245 if is_blank_line(line) or line == 'BEGIN_DNA\n':
246 continue
247 elif line == 'END_DNA\n':
248 break
249 consumer.read_dna(line)
250
251
253 """Consumer that converts a PHD record to a Record object (DEPRECATED)."""
256
260
267
270
273
276
279
282
283 - def time(self, line):
285
288
291
292 - def trim(self, line):
293 first, last, prob = line[5:-1].split()
294 self.data.comments['trim'] = (int(first), int(last), float(prob))
295
298
299 - def chem(self, line):
301
302 - def dye(self, line):
304
308
309 if __name__ == "__main__" :
310 print "Quick self test"
311
312 handle = open("../../Tests/Phd/phd1")
313 records = parse(handle)
314 for record in records:
315 print record.file_name, len(record.seq)
316 handle.close()
317 print "Done"
318