1 """Extract information from alignment objects.
2
3 In order to try and avoid huge alignment objects with tons of functions,
4 functions which return summary type information about alignments should
5 be put into classes in this module.
6
7 classes:
8 o SummaryInfo
9 o PSSM
10 """
11
12
13 import string
14 import math
15 import sys
16
17
18 try:
19 set = set
20 except NameError:
21 from sets import Set as set
22
23
24 from Bio import Alphabet
25 from Bio.Alphabet import IUPAC
26 from Bio.Seq import Seq
27 from Bio.SubsMat import FreqTable
28
29
30
31 Protein20Random = 0.05
32 Nucleotide4Random = 0.25
34 """Calculate summary info about the alignment.
35
36 This class should be used to caclculate information summarizing the
37 results of an alignment. This may either be straight consensus info
38 or more complicated things.
39 """
41 """Initialize with the alignment to calculate information on.
42 ic_vector attribute. A dictionary. Keys: column numbers. Values:
43 """
44 self.alignment = alignment
45 self.ic_vector = {}
46
47 - def dumb_consensus(self, threshold = .7, ambiguous = "X",
48 consensus_alpha = None, require_multiple = 0):
49 """Output a fast consensus sequence of the alignment.
50
51 This doesn't do anything fancy at all. It will just go through the
52 sequence residue by residue and count up the number of each type
53 of residue (ie. A or G or T or C for DNA) in all sequences in the
54 alignment. If the percentage of the most common residue type is
55 greater then the passed threshold, then we will add that residue type,
56 otherwise an ambiguous character will be added.
57
58 This could be made a lot fancier (ie. to take a substitution matrix
59 into account), but it just meant for a quick and dirty consensus.
60
61 Arguments:
62 o threshold - The threshold value that is required to add a particular
63 atom.
64 o ambiguous - The ambiguous character to be added when the threshold is
65 not reached.
66 o consensus_alpha - The alphabet to return for the consensus sequence.
67 If this is None, then we will try to guess the alphabet.
68 o require_multiple - If set as 1, this will require that more than
69 1 sequence be part of an alignment to put it in the consensus (ie.
70 not just 1 sequence and gaps).
71 """
72
73 consensus = ''
74
75
76 con_len = self.alignment.get_alignment_length()
77
78
79 for n in range(con_len):
80
81 atom_dict = {}
82 num_atoms = 0
83
84 for record in self.alignment._records:
85
86
87 if n < len(record.seq):
88 if record.seq[n] != '-' and record.seq[n] != '.':
89 if record.seq[n] not in atom_dict.keys():
90 atom_dict[record.seq[n]] = 1
91 else:
92 atom_dict[record.seq[n]] = \
93 atom_dict[record.seq[n]] + 1
94
95 num_atoms = num_atoms + 1
96
97 max_atoms = []
98 max_size = 0
99
100 for atom in atom_dict.keys():
101 if atom_dict[atom] > max_size:
102 max_atoms = [atom]
103 max_size = atom_dict[atom]
104 elif atom_dict[atom] == max_size:
105 max_atoms.append(atom)
106
107 if require_multiple and num_atoms == 1:
108 consensus = consensus + ambiguous
109 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
110 >= threshold):
111 consensus = consensus + max_atoms[0]
112 else:
113 consensus = consensus + ambiguous
114
115
116 if consensus_alpha is None:
117 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
118
119 return Seq(consensus, consensus_alpha)
120
121 - def gap_consensus(self, threshold = .7, ambiguous = "X",
122 consensus_alpha = None, require_multiple = 0):
123 """Same as dumb_consensus(), but allows gap on the output.
124
125 Things to do: Let the user define that with only one gap, the result
126 character in consensus is gap. Let the user select gap character, now
127 it takes the same is input.
128 """
129
130 consensus = ''
131
132
133 con_len = self.alignment.get_alignment_length()
134
135
136 for n in range(con_len):
137
138 atom_dict = {}
139 num_atoms = 0
140
141 for record in self.alignment._records:
142
143
144 if n < len(record.seq):
145 if record.seq[n] not in atom_dict.keys():
146 atom_dict[record.seq[n]] = 1
147 else:
148 atom_dict[record.seq[n]] = \
149 atom_dict[record.seq[n]] + 1
150
151 num_atoms = num_atoms + 1
152
153 max_atoms = []
154 max_size = 0
155
156 for atom in atom_dict.keys():
157 if atom_dict[atom] > max_size:
158 max_atoms = [atom]
159 max_size = atom_dict[atom]
160 elif atom_dict[atom] == max_size:
161 max_atoms.append(atom)
162
163 if require_multiple and num_atoms == 1:
164 consensus = consensus + ambiguous
165 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
166 >= threshold):
167 consensus = consensus + max_atoms[0]
168 else:
169 consensus = consensus + ambiguous
170
171
172 if consensus_alpha is None:
173
174 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
175
176 return Seq(consensus, consensus_alpha)
177
219
221 """Generate a replacement dictionary to plug into a substitution matrix
222
223 This should look at an alignment, and be able to generate the number
224 of substitutions of different residues for each other in the
225 aligned object.
226
227 Will then return a dictionary with this information:
228 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....}
229
230 This also treats weighted sequences. The following example shows how
231 we calculate the replacement dictionary. Given the following
232 multiple sequence alignments:
233
234 GTATC 0.5
235 AT--C 0.8
236 CTGTC 1.0
237
238 For the first column we have:
239 ('A', 'G') : 0.5 * 0.8 = 0.4
240 ('C', 'G') : 0.5 * 1.0 = 0.5
241 ('A', 'C') : 0.8 * 1.0 = 0.8
242
243 We then continue this for all of the columns in the alignment, summing
244 the information for each substitution in each column, until we end
245 up with the replacement dictionary.
246
247 Arguments:
248 o skip_chars - A list of characters to skip when creating the dictionary.
249 For instance, you might have Xs (screened stuff) or Ns, and not want
250 to include the ambiguity characters in the dictionary.
251 """
252
253 rep_dict, skip_items = self._get_base_replacements(skip_chars)
254
255
256 for rec_num1 in range(len(self.alignment._records)):
257
258
259 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)):
260
261
262 rep_dict = self._pair_replacement(
263 self.alignment._records[rec_num1].seq,
264 self.alignment._records[rec_num2].seq,
265 self.alignment._records[rec_num1].annotations.get('weight',1),
266 self.alignment._records[rec_num2].annotations.get('weight',1),
267 rep_dict, skip_items)
268
269 return rep_dict
270
271 - def _pair_replacement(self, seq1, seq2, weight1, weight2,
272 start_dict, ignore_chars):
273 """Compare two sequences and generate info on the replacements seen.
274
275 Arguments:
276 o seq1, seq2 - The two sequences to compare.
277 o weight1, weight2 - The relative weights of seq1 and seq2.
278 o start_dict - The dictionary containing the starting replacement
279 info that we will modify.
280 o ignore_chars - A list of characters to ignore when calculating
281 replacements (ie. '-').
282
283 Returns:
284 o A replacment dictionary which is modified from initial_dict with
285 the information from the sequence comparison.
286 """
287
288 for residue_num in range(len(seq1)):
289 residue1 = seq1[residue_num]
290 try:
291 residue2 = seq2[residue_num]
292
293
294 except IndexError:
295 return start_dict
296
297
298 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars):
299 try:
300
301
302 start_dict[(residue1, residue2)] = \
303 start_dict[(residue1, residue2)] + \
304 weight1 * weight2
305
306
307 except KeyError:
308 raise ValueError("Residues %s, %s not found in alphabet %s"
309 % (residue1, residue2,
310 self.alignment._alphabet))
311
312 return start_dict
313
314
316 """Returns a string containing the expected letters in the alignment."""
317 all_letters = self.alignment._alphabet.letters
318 if all_letters is None \
319 or (isinstance(self.alignment._alphabet, Alphabet.Gapped) \
320 and all_letters == self.alignment._alphabet.gap_char):
321
322
323
324 set_letters = set()
325 for record in self.alignment :
326
327
328 set_letters = set_letters.union(record.seq)
329 list_letters = list(set_letters)
330 list_letters.sort()
331 all_letters = "".join(list_letters)
332 return all_letters
333
335 """Get a zeroed dictonary of all possible letter combinations.
336
337 This looks at the type of alphabet and gets the letters for it.
338 It then creates a dictionary with all possible combinations of these
339 letters as keys (ie. ('A', 'G')) and sets the values as zero.
340
341 Returns:
342 o The base dictionary created
343 o A list of alphabet items to skip when filling the dictionary.Right
344 now the only thing I can imagine in this list is gap characters, but
345 maybe X's or something else might be useful later. This will also
346 include any characters that are specified to be skipped.
347 """
348 base_dictionary = {}
349 all_letters = self._get_all_letters()
350
351
352
353 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
354 skip_items.append(self.alignment._alphabet.gap_char)
355 all_letters = string.replace(all_letters,
356 self.alignment._alphabet.gap_char,
357 '')
358
359
360 for first_letter in all_letters:
361 for second_letter in all_letters:
362 if (first_letter not in skip_items and
363 second_letter not in skip_items):
364 base_dictionary[(first_letter, second_letter)] = 0
365
366 return base_dictionary, skip_items
367
368
371 """Create a position specific score matrix object for the alignment.
372
373 This creates a position specific score matrix (pssm) which is an
374 alternative method to look at a consensus sequence.
375
376 Arguments:
377 o chars_to_ignore - A listing of all characters not to include in
378 the pssm. If the alignment alphabet declares a gap character,
379 then it will be excluded automatically.
380 o axis_seq - An optional argument specifying the sequence to
381 put on the axis of the PSSM. This should be a Seq object. If nothing
382 is specified, the consensus sequence, calculated with default
383 parameters, will be used.
384
385 Returns:
386 o A PSSM (position specific score matrix) object.
387 """
388
389 all_letters = self._get_all_letters()
390 assert all_letters
391
392 if not isinstance(chars_to_ignore, list) :
393 raise TypeError("chars_to_ignore should be a list.")
394
395
396 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
397 chars_to_ignore.append(self.alignment._alphabet.gap_char)
398
399 for char in chars_to_ignore:
400 all_letters = string.replace(all_letters, char, '')
401
402 if axis_seq:
403 left_seq = axis_seq
404 assert len(axis_seq) == self.alignment.get_alignment_length()
405 else:
406 left_seq = self.dumb_consensus()
407
408 pssm_info = []
409
410 for residue_num in range(len(left_seq)):
411 score_dict = self._get_base_letters(all_letters)
412 for record in self.alignment._records:
413 try:
414 this_residue = record.seq[residue_num]
415
416
417 except IndexError:
418 this_residue = None
419
420 if this_residue and this_residue not in chars_to_ignore:
421 weight = record.annotations.get('weight', 1)
422 try:
423 score_dict[this_residue] += weight
424
425 except KeyError:
426 raise ValueError("Residue %s not found in alphabet %s"
427 % (this_residue,
428 self.alignment._alphabet))
429
430 pssm_info.append((left_seq[residue_num],
431 score_dict))
432
433
434 return PSSM(pssm_info)
435
437 """Create a zeroed dictionary with all of the specified letters.
438 """
439 base_info = {}
440 for letter in letters:
441 base_info[letter] = 0
442
443 return base_info
444
445 - def information_content(self, start = 0,
446 end = None,
447 e_freq_table = None, log_base = 2,
448 chars_to_ignore = []):
449 """Calculate the information content for each residue along an alignment.
450
451 Arguments:
452 o start, end - The starting an ending points to calculate the
453 information content. These points should be relative to the first
454 sequence in the alignment, starting at zero (ie. even if the 'real'
455 first position in the seq is 203 in the initial sequence, for
456 the info content, we need to use zero). This defaults to the entire
457 length of the first sequence.
458 o e_freq_table - A FreqTable object specifying the expected frequencies
459 for each letter in the alphabet we are using (e.g. {'G' : 0.4,
460 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
461 included, since these should not have expected frequencies.
462 o log_base - The base of the logathrim to use in calculating the
463 information content. This defaults to 2 so the info is in bits.
464 o chars_to_ignore - A listing of characterw which should be ignored
465 in calculating the info content.
466
467 Returns:
468 o A number representing the info content for the specified region.
469
470 Please see the Biopython manual for more information on how information
471 content is calculated.
472 """
473
474 if end is None:
475 end = len(self.alignment._records[0].seq)
476
477 if start < 0 or end > len(self.alignment._records[0].seq):
478 raise ValueError \
479 ("Start (%s) and end (%s) are not in the range %s to %s"
480 % (start, end, 0, len(self.alignment._records[0].seq)))
481
482 random_expected = None
483 if not e_freq_table:
484
485 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
486 if isinstance(base_alpha, Alphabet.ProteinAlphabet) :
487 random_expected = Protein20Random
488 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet) :
489 random_expected = Nucleotide4Random
490 else :
491 errstr = "Error in alphabet: not Nucleotide or Protein, "
492 errstr += "supply expected frequencies"
493 raise ValueError(errstr)
494 del base_alpha
495 elif not isinstance(e_freq_table, FreqTable.FreqTable) :
496 raise ValueError("e_freq_table should be a FreqTable object")
497
498
499
500 all_letters = self._get_all_letters()
501 for char in chars_to_ignore:
502 all_letters = string.replace(all_letters, char, '')
503
504 info_content = {}
505 for residue_num in range(start, end):
506 freq_dict = self._get_letter_freqs(residue_num,
507 self.alignment._records,
508 all_letters, chars_to_ignore)
509
510 column_score = self._get_column_info_content(freq_dict,
511 e_freq_table,
512 log_base,
513 random_expected)
514
515 info_content[residue_num] = column_score
516
517 total_info = 0
518 for column_info in info_content.values():
519 total_info = total_info + column_info
520
521 for i in info_content.keys():
522 self.ic_vector[i] = info_content[i]
523 return total_info
524
526 """Determine the frequency of specific letters in the alignment.
527
528 Arguments:
529 o residue_num - The number of the column we are getting frequencies
530 from.
531 o all_records - All of the SeqRecords in the alignment.
532 o letters - The letters we are interested in getting the frequency
533 for.
534 o to_ignore - Letters we are specifically supposed to ignore.
535
536 This will calculate the frequencies of each of the specified letters
537 in the alignment at the given frequency, and return this as a
538 dictionary where the keys are the letters and the values are the
539 frequencies.
540 """
541 freq_info = self._get_base_letters(letters)
542
543 total_count = 0
544
545 for record in all_records:
546 try:
547 if record.seq[residue_num] not in to_ignore:
548 weight = record.annotations.get('weight',1)
549 freq_info[record.seq[residue_num]] += weight
550 total_count += weight
551
552 except KeyError:
553 raise ValueError("Residue %s not found in alphabet %s"
554 % (record.seq[residue_num],
555 self.alignment._alphabet))
556
557 if total_count == 0 :
558
559 for letter in freq_info.keys():
560 assert freq_info[letter] == 0
561
562 else :
563
564 for letter in freq_info.keys():
565 freq_info[letter] = freq_info[letter] / total_count
566
567 return freq_info
568
569 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base,
570 random_expected):
571 """Calculate the information content for a column.
572
573 Arguments:
574 o obs_freq - The frequencies observed for each letter in the column.
575 o e_freq_table - An optional argument specifying the expected
576 frequencies for each letter. This is a SubsMat.FreqTable instance.
577 o log_base - The base of the logathrim to use in calculating the
578 info content.
579 """
580 try :
581 gap_char = self.alignment._alphabet.gap_char
582 except AttributeError :
583
584
585 gap_char = "-"
586
587 if e_freq_table:
588 if not isinstance(e_freq_table, FreqTable.FreqTable) :
589 raise ValueError("e_freq_table should be a FreqTable object")
590
591 for key in obs_freq.keys():
592 if (key != gap_char and key not in e_freq_table):
593 raise ValueError("Expected frequency letters %s" +
594 " do not match observed %s"
595 % (e_freq_table.keys(), obs_freq.keys() -
596 [gap_char]))
597
598 total_info = 0.0
599
600 for letter in obs_freq.keys():
601 inner_log = 0.0
602
603
604
605 if letter != gap_char:
606 if e_freq_table:
607 inner_log = obs_freq[letter] / e_freq_table[letter]
608 else:
609 inner_log = obs_freq[letter] / random_expected
610
611
612 if inner_log > 0:
613 letter_info = (obs_freq[letter] *
614 math.log(inner_log) / math.log(log_base))
615 total_info = total_info + letter_info
616 return total_info
617
620
622 """Represent a position specific score matrix.
623
624 This class is meant to make it easy to access the info within a PSSM
625 and also make it easy to print out the information in a nice table.
626
627 Let's say you had an alignment like this:
628 GTATC
629 AT--C
630 CTGTC
631
632 The position specific score matrix (when printed) looks like:
633
634 G A T C
635 G 1 1 0 1
636 T 0 0 3 0
637 A 1 1 0 0
638 T 0 0 2 0
639 C 0 0 0 3
640
641 You can access a single element of the PSSM using the following:
642
643 your_pssm[sequence_number][residue_count_name]
644
645 For instance, to get the 'T' residue for the second element in the
646 above alignment you would need to do:
647
648 your_pssm[1]['T']
649 """
651 """Initialize with pssm data to represent.
652
653 The pssm passed should be a list with the following structure:
654
655 list[0] - The letter of the residue being represented (for instance,
656 from the example above, the first few list[0]s would be GTAT...
657 list[1] - A dictionary with the letter substitutions and counts.
658 """
659 self.pssm = pssm
660
662 return self.pssm[pos][1]
663
665 out = " "
666 all_residues = self.pssm[0][1].keys()
667 all_residues.sort()
668
669
670 for res in all_residues:
671 out = out + " %s" % res
672 out = out + "\n"
673
674
675 for item in self.pssm:
676 out = out + "%s " % item[0]
677 for res in all_residues:
678 out = out + " %.1f" % item[1][res]
679
680 out = out + "\n"
681 return out
682
684 """Return the residue letter at the specified position.
685 """
686 return self.pssm[pos][0]
687
688
689 -def print_info_content(summary_info,fout=None,rep_record=0):
690 """ Three column output: position, aa in representative sequence,
691 ic_vector value"""
692 fout = fout or sys.stdout
693 if not summary_info.ic_vector:
694 summary_info.information_content()
695 rep_sequence = summary_info.alignment._records[rep_record].seq
696 positions = summary_info.ic_vector.keys()
697 positions.sort()
698 for pos in positions:
699 fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos],
700 summary_info.ic_vector[pos]))
701
702 if __name__ == "__main__" :
703 print "Quick test"
704 from Bio import AlignIO
705 from Bio.Align.Generic import Alignment
706
707 filename = "../../Tests/GFF/multi.fna"
708 format = "fasta"
709 expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25},
710 FreqTable.FREQ,
711 IUPAC.unambiguous_dna)
712
713 alignment = AlignIO.read(open(filename), format)
714 for record in alignment :
715 print record.seq.tostring()
716 print "="*alignment.get_alignment_length()
717
718 summary = SummaryInfo(alignment)
719 consensus = summary.dumb_consensus(ambiguous="N")
720 print consensus
721 consensus = summary.gap_consensus(ambiguous="N")
722 print consensus
723 print
724 print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
725 axis_seq=consensus)
726 print
727
728
729 print summary.information_content(e_freq_table=expected,
730 chars_to_ignore=['-'])
731 print
732 print "Trying a protein sequence with gaps and stops"
733
734 alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
735 a = Alignment(alpha)
736 a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
737 a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
738 a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
739 print a
740 print "="*a.get_alignment_length()
741
742 s = SummaryInfo(a)
743 c = s.dumb_consensus(ambiguous="X")
744 print c
745 c = s.gap_consensus(ambiguous="X")
746 print c
747 print
748 print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
749
750 print s.information_content(chars_to_ignore=['-', '*'])
751
752
753 print "Done"
754