Package Bio :: Package Data :: Module CodonTable
[hide private]
[frames | no frames]

Source Code for Module Bio.Data.CodonTable

  1  #TODO - Remove this work around once we drop python 2.3 support 
  2  try: 
  3     set = set 
  4  except NameError: 
  5     from sets import Set as set 
  6   
  7  import string 
  8  from Bio import Alphabet 
  9  from Bio.Alphabet import IUPAC 
 10  from Bio.Data import IUPACData 
 11   
 12  unambiguous_dna_by_name = {} 
 13  unambiguous_dna_by_id = {} 
 14  unambiguous_rna_by_name = {} 
 15  unambiguous_rna_by_id = {} 
 16  generic_by_name = {} # unambiguous DNA or RNA 
 17  generic_by_id = {} # unambiguous DNA or RNA 
 18  ambiguous_generic_by_name = {} # ambiguous DNA or RNA 
 19  ambiguous_generic_by_id = {} # ambiguous DNA or RNA  
 20   
 21  # standard IUPAC unambiguous codons 
 22  standard_dna_table = None 
 23  standard_rna_table = None 
 24   
 25  # In the future, the back_table could return a statistically 
 26  # appropriate distribution of codons, so do not cache the results of 
 27  # back_table lookups! 
 28   
class TranslationError(Exception):
    """Raised when a codon has no single valid translation.

    Used in this module for ambiguous codons that code for both
    amino acids and stop codons, or whose possible amino acids share
    no ambiguous protein letter.
    """
31
class CodonTable:
    """Base class holding one genetic code.

    Attributes:
     - nucleotide_alphabet, protein_alphabet - alphabets of the codons
       and of the translated residues.
     - forward_table - maps codon -> residue; only includes codons
       which actually code (stop codons are absent).
     - back_table - maps residue -> codon, for back translations.
     - start_codons, stop_codons - lists of codon strings.
    """
    nucleotide_alphabet = Alphabet.generic_nucleotide
    protein_alphabet = Alphabet.generic_protein

    forward_table = {}    # only includes codons which actually code
    back_table = {}       # for back translations
    start_codons = []
    stop_codons = []

    # Not always called from derived classes!
    def __init__(self, nucleotide_alphabet = nucleotide_alphabet,
                 protein_alphabet = protein_alphabet,
                 forward_table = forward_table, back_table = back_table,
                 start_codons = start_codons, stop_codons = stop_codons):
        """Store the alphabets, tables and codon lists on the instance.

        The defaults are the class-level containers above, so an
        instance built without arguments shares them with the class;
        do not mutate them in place.
        """
        # NOTE(review): body restored from how __str__ and
        # AmbiguousCodonTable use these attributes - confirm upstream.
        self.nucleotide_alphabet = nucleotide_alphabet
        self.protein_alphabet = protein_alphabet
        self.forward_table = forward_table
        self.back_table = back_table
        self.start_codons = start_codons
        self.stop_codons = stop_codons

    def __str__(self) :
        """Returns a simple text representation of the codon table

        e.g.
        >>> import Bio.Data.CodonTable
        >>> print Bio.Data.CodonTable.standard_dna_table
        >>> print Bio.Data.CodonTable.generic_by_id[1]

        Codons missing from forward_table (or which raise
        TranslationError) are shown as "?"; start codons carry an
        "(s)" suffix and stop codons are shown as "Stop".
        """
        # Only NCBI-style tables define id/names, so look them up
        # defensively instead of failing with AttributeError.
        table_id = getattr(self, "id", None)
        names = getattr(self, "names", None)

        if table_id :
            title = "Table %i" % table_id
        else :
            title = "Table ID unknown"
        if names :
            # names may contain None (a missing alternative name)
            title += " " + ", ".join([name for name in names if name])

        #Use the main four letters (and the conventional ordering)
        #even for ambiguous tables
        letters = self.nucleotide_alphabet.letters
        if isinstance(self.nucleotide_alphabet, Alphabet.DNAAlphabet) \
        or (letters is not None and "T" in letters) :
            letters = "TCAG"
        else :
            #Should be either RNA or generic nucleotides,
            #e.g. Bio.Data.CodonTable.generic_by_id[1]
            letters = "UCAG"

        #Build the table...  Every cell is 9 characters wide so that it
        #lines up with the "---------" segments of the rule lines.
        rule = "--+" + "+".join(["---------"] * len(letters)) + "+--"
        header = "  |" + "|".join(["  %s      " % c2 for c2 in letters]) + "|"
        lines = [title, "", header, rule]
        for c1 in letters :
            for c3 in letters :
                line = c1 + " |"
                for c2 in letters :
                    codon = c1+c2+c3
                    line = line + " %s" % codon
                    if codon in self.stop_codons :
                        line = line + " Stop|"
                        continue
                    try :
                        amino = self.forward_table[codon]
                    except (KeyError, TranslationError) :
                        amino = "?"
                    if codon in self.start_codons :
                        line = line + " %s(s)|" % amino
                    else :
                        line = line + " %s   |" % amino
                lines.append(line + " " + c3)
            lines.append(rule)
        return "\n".join(lines)
108
def make_back_table(table, default_stop_codon):
    """Build a residue -> codon mapping from a codon -> residue mapping.

    ONLY RETURNS A SINGLE CODON per residue: when several codons code
    for the same residue, the one that sorts last wins.  The codons are
    sorted first so that changes in the hash implementation cannot
    affect which codon is chosen.

    Arguments:
     - table - dictionary mapping codon -> residue.
     - default_stop_codon - codon stored under the key None.

    Returns a new dictionary; table is not modified.
    """
    back_table = {}
    # list(...) so this also works where keys() is not a sortable list
    codons = list(table.keys())
    codons.sort()
    for codon in codons:
        back_table[table[codon]] = codon
    back_table[None] = default_stop_codon
    return back_table
119 120
class NCBICodonTable(CodonTable):
    """Codon table for one NCBI genetic code (generic nucleotides).

    Besides the usual CodonTable attributes, instances carry the
    table's id and its list of names.
    """
    nucleotide_alphabet = Alphabet.generic_nucleotide
    protein_alphabet = IUPAC.protein

    def __init__(self, id, names, table, start_codons, stop_codons):
        """Store the table data and derive the back table.

        The first entry of stop_codons is used as the default stop
        codon of the back table, so stop_codons must not be empty.
        The base class __init__ is not called.
        """
        self.id, self.names = id, names
        self.forward_table = table
        self.back_table = make_back_table(table, stop_codons[0])
        self.start_codons, self.stop_codons = start_codons, stop_codons
132 133
class NCBICodonTableDNA(NCBICodonTable):
    """NCBI codon table whose codons use the unambiguous DNA alphabet."""
    nucleotide_alphabet = IUPAC.unambiguous_dna
136
class NCBICodonTableRNA(NCBICodonTable):
    """NCBI codon table whose codons use the unambiguous RNA alphabet."""
    nucleotide_alphabet = IUPAC.unambiguous_rna
139 140 141
def _rna_and_generic_codons(codons):
    """Return (rna_codons, generic_codons) for a list of DNA codons.

    rna_codons has every T replaced with U.  generic_codons holds each
    DNA codon followed by its RNA form (so a codon without any T
    appears twice, exactly as the tables have always been built).
    """
    rna_codons = []
    generic_codons = []
    for codon in codons:
        rna_codon = codon.replace("T", "U")
        generic_codons.append(codon)
        generic_codons.append(rna_codon)
        rna_codons.append(rna_codon)
    return rna_codons, generic_codons


def register_ncbi_table(name, alt_name, id,
                        table, start_codons, stop_codons):
    """Register one NCBI genetic code under its id and all of its names.

    Arguments:
     - name - the table name; several names may be given in one string
       separated by "; ".
     - alt_name - an alternative name (e.g. 'SGC0'), or None.
     - id - the NCBI table id; the table with id 1 also becomes
       standard_dna_table / standard_rna_table.
     - table - dictionary mapping DNA codon -> amino acid letter.
     - start_codons, stop_codons - lists of DNA codons.

    Builds a DNA table, an RNA table (T replaced with U) and a generic
    table (accepting both spellings) and stores them in the module
    level *_by_id and *_by_name dictionaries.
    """
    global standard_dna_table, standard_rna_table

    names = name.split("; ")

    # replace all T's with U's for the RNA tables
    rna_table = {}
    generic_table = {}
    for codon, amino in table.items():
        rna_codon = codon.replace("T", "U")
        generic_table[codon] = amino
        generic_table[rna_codon] = amino
        rna_table[rna_codon] = amino
    rna_start_codons, generic_start_codons = \
        _rna_and_generic_codons(start_codons)
    rna_stop_codons, generic_stop_codons = \
        _rna_and_generic_codons(stop_codons)

    # Each table gets its own names list.  alt_name is included even
    # when it is None; CodonTable.__str__ filters such entries out.
    dna = NCBICodonTableDNA(id, names + [alt_name], table, start_codons,
                            stop_codons)
    generic = NCBICodonTable(id, names + [alt_name], generic_table,
                             generic_start_codons, generic_stop_codons)
    rna = NCBICodonTableRNA(id, names + [alt_name], rna_table,
                            rna_start_codons, rna_stop_codons)

    if id == 1:
        standard_dna_table = dna
        standard_rna_table = rna

    unambiguous_dna_by_id[id] = dna
    unambiguous_rna_by_id[id] = rna
    generic_by_id[id] = generic

    if alt_name is not None:
        names.append(alt_name)

    for table_name in names:
        unambiguous_dna_by_name[table_name] = dna
        unambiguous_rna_by_name[table_name] = rna
        generic_by_name[table_name] = generic
### These tables created from the data file
###  ftp://ncbi.nlm.nih.gov/entrez/misc/data/gc.prt
### using the following:
###
### NOTE(review): several literals in the snippet below are shorter than
### the slice they are compared with (e.g. line[:6] == " name"), which
### suggests runs of spaces were collapsed when it was copied here;
### check them against gc.prt before reusing the snippet.
##import re
##for line in open("gc.prt").readlines():
##    if line[:2] == " {":
##        names = []
##        id = None
##        aa = None
##        start = None
##        bases = []
##    elif line[:6] == " name":
##        names.append(re.search('"([^"]*)"', line).group(1))
##    elif line[:8] == " name":
##        names.append(re.search('"(.*)$', line).group(1))
##    elif line == ' Mitochondrial; Mycoplasma; Spiroplasma" ,\n':
##        names[-1] = names[-1] + " Mitochondrial; Mycoplasma; Spiroplasma"
##    elif line[:4] == " id":
##        id = int(re.search('(\d+)', line).group(1))
##    elif line[:10] == " ncbieaa ":
##        aa = line[12:12+64]
##    elif line[:10] == " sncbieaa":
##        start = line[12:12+64]
##    elif line[:9] == " -- Base":
##        bases.append(line[12:12+64])
##    elif line[:2] == " }":
##        assert names != [] and id is not None and aa is not None
##        assert start is not None and bases != []
##        if len(names) == 1:
##            names.append(None)
##        print "register_ncbi_table(name = %s," % repr(names[0])
##        print " alt_name = %s, id = %d," % \
##              (repr(names[1]), id)
##        print " table = {"
##        s = " "
##        for i in range(64):
##            if aa[i] != "*":
##                t = " '%s%s%s': '%s'," % (bases[0][i], bases[1][i],
##                                          bases[2][i], aa[i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = " " + t
##                else:
##                    s = s + t
##        print s, "},"
##
##        s = " stop_codons = ["
##        for i in range(64):
##            if aa[i] == "*":
##                t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = " " + t
##                else:
##                    s = s + t
##        print s, "],"
##
##        s = " start_codons = ["
##        for i in range(64):
##            if start[i] == "M":
##                t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = " " + t
##                else:
##                    s = s + t
##        print s, "]"
##        print " )"
##    elif line[:2] == "--" or line == "\n" or line == "}\n" or \
##         line == 'Genetic-code-table ::= {\n':
##        pass
##    else:
##        raise Exception("Unparsed: " + repr(line))

# Each call below registers one genetic code (as DNA, RNA and generic
# tables).  The 'table' dictionaries list only coding codons; stop
# codons are given separately.  Ids 1-6 and 9-15 are registered here;
# there are no entries for ids 7 and 8.

# Table 1: the standard code.
register_ncbi_table(name = 'Standard',
                    alt_name = 'SGC0', id = 1,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
     'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
     'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
     'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
     'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
     'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
     'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
     'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
     'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'TGA', ],
                    start_codons = [ 'TTG', 'CTG', 'ATG', ]
                    )
# Table 2: TGA codes for W, ATA for M; AGA and AGG are stop codons.
register_ncbi_table(name = 'Vertebrate Mitochondrial',
                    alt_name = 'SGC1', id = 2,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
     'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
     'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
     'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
     'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
     'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'GTT': 'V',
     'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A',
     'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E',
     'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'AGA', 'AGG', ],
                    start_codons = [ 'ATT', 'ATC', 'ATA', 'ATG', 'GTG', ]
                    )
# Table 3: TGA codes for W, ATA for M and CTN for T.
register_ncbi_table(name = 'Yeast Mitochondrial',
                    alt_name = 'SGC2', id = 3,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'T',
     'CTC': 'T', 'CTA': 'T', 'CTG': 'T', 'CCT': 'P', 'CCC': 'P',
     'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
     'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
     'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
     'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
     'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
     'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
     'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )
# Table 4: TGA codes for W.
register_ncbi_table(name = 'Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma',
                    alt_name = 'SGC3', id = 4,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
     'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
     'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
     'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
     'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
     'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
     'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
     'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
     'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'TTA', 'TTG', 'CTG', 'ATT', 'ATC',
                                     'ATA', 'ATG', 'GTG', ]
                    )
# Table 5: TGA codes for W, ATA for M, AGA and AGG for S.
register_ncbi_table(name = 'Invertebrate Mitochondrial',
                    alt_name = 'SGC4', id = 5,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
     'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
     'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
     'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
     'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
     'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S',
     'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
     'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
     'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'TTG', 'ATT', 'ATC', 'ATA', 'ATG',
                                     'GTG', ]
                    )
# Table 6: TAA and TAG code for Q; TGA is the only stop codon.
register_ncbi_table(name = 'Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear',
                    alt_name = 'SGC5', id = 6,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TAA': 'Q', 'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P',
     'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H',
     'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R',
     'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N',
     'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S',
     'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V',
     'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G',
     'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TGA', ],
                    start_codons = [ 'ATG', ]
                    )
# Table 9: TGA codes for W, AAA for N, AGA and AGG for S.
register_ncbi_table(name = 'Echinoderm Mitochondrial',
                    alt_name = 'SGC8', id = 9,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
     'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
     'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
     'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
     'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
     'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S',
     'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
     'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
     'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )
# Table 10: TGA codes for C.
register_ncbi_table(name = 'Euplotid Nuclear',
                    alt_name = 'SGC9', id = 10,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W', 'CTT': 'L',
     'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
     'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
     'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
     'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
     'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
     'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
     'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
     'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )
# Table 11: same coding table as table 1, with more start codons.
register_ncbi_table(name = 'Bacterial',
                    alt_name = None, id = 11,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
     'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
     'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
     'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
     'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
     'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
     'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
     'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
     'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'TGA', ],
                    start_codons = [ 'TTG', 'CTG', 'ATT', 'ATC', 'ATA',
                                     'ATG', 'GTG', ]
                    )
# Table 12: CTG codes for S.
register_ncbi_table(name = 'Alternative Yeast Nuclear',
                    alt_name = None, id = 12,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
     'CTA': 'L', 'CTG': 'S', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
     'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
     'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
     'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
     'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
     'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
     'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
     'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
     'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
     'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'TGA', ],
                    start_codons = [ 'CTG', 'ATG', ]
                    )
# Table 13: TGA codes for W, ATA for M, AGA and AGG for G.
register_ncbi_table(name = 'Ascidian Mitochondrial',
                    alt_name = None, id = 13,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
     'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
     'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
     'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
     'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
     'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'G',
     'AGG': 'G', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
     'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
     'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )
# Table 14: TAA codes for Y, TGA for W, AAA for N, AGA and AGG for S;
# TAG is the only stop codon.
register_ncbi_table(name = 'Flatworm Mitochondrial',
                    alt_name = None, id = 14,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TAA': 'Y', 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
     'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P',
     'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H',
     'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R',
     'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
     'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N',
     'AAC': 'N', 'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S',
     'AGA': 'S', 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V',
     'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
     'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G',
     'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )
# Table 15: TAG codes for Q.
register_ncbi_table(name = 'Blepharisma Macronuclear',
                    alt_name = None, id = 15,
                    table = {
     'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
     'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
     'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L',
     'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
     'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
     'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
     'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
     'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
     'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
     'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
     'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
     'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
     'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TGA', ],
                    start_codons = [ 'ATG', ]
                    )

#########  Deal with ambiguous forward translations

class AmbiguousCodonTable(CodonTable):
    """Wrap an unambiguous codon table so it accepts ambiguous codons.

    The forward table becomes an AmbiguousForwardTable built on the
    wrapped table's forward table, the back table is reused as is, and
    the start and stop codon lists are extended with
    list_ambiguous_codons.  Attributes not found on this object (for
    example id and names of an NCBI table) are looked up on the
    wrapped table.
    """
    def __init__(self, codon_table,
                 ambiguous_nucleotide_alphabet,
                 ambiguous_nucleotide_values,
                 ambiguous_protein_alphabet,
                 ambiguous_protein_values):
        """Build the ambiguous view of codon_table.

        Arguments:
         - codon_table - the unambiguous table to wrap.
         - ambiguous_nucleotide_alphabet, ambiguous_protein_alphabet -
           alphabets stored on the new table.
         - ambiguous_nucleotide_values - maps each nucleotide letter to
           the string of unambiguous letters it stands for.
         - ambiguous_protein_values - the same, for protein letters.
        """
        CodonTable.__init__(self,
                            ambiguous_nucleotide_alphabet,
                            ambiguous_protein_alphabet,
                            AmbiguousForwardTable(codon_table.forward_table,
                                                  ambiguous_nucleotide_values,
                                                  ambiguous_protein_values),
                            codon_table.back_table,

                            # NOTE(review): an older comment here flagged
                            # these two arguments as wrong (XXX); it seems
                            # to predate the list_ambiguous_codons calls -
                            # confirm before removing this note.
                            list_ambiguous_codons(codon_table.start_codons, ambiguous_nucleotide_values),
                            list_ambiguous_codons(codon_table.stop_codons, ambiguous_nucleotide_values)
                            )
        self._codon_table = codon_table

    # Be sneaky and forward attribute lookups to the original table.
    # This lets us get the names, if the original table is an NCBI
    # table.
    def __getattr__(self, name):
        """Forward lookups that failed on this object to the wrapped table."""
        if name == "_codon_table":
            # Not set yet (e.g. instance created without __init__):
            # looking it up via self would recurse without end.
            raise AttributeError(name)
        return getattr(self._codon_table, name)
546
def list_possible_proteins(codon, forward_table, ambiguous_nucleotide_values):
    """List the amino acids an (ambiguous) codon could code for.

    Arguments:
     - codon - three letter string, possibly with ambiguous letters.
     - forward_table - maps unambiguous codon -> amino acid; codons
       missing from it are treated as stop codons.
     - ambiguous_nucleotide_values - maps each nucleotide letter to the
       string of unambiguous letters it stands for.

    Returns a list of the distinct amino acids (in no particular order).

    Raises KeyError if every expansion of the codon is a stop codon,
    and TranslationError if some expansions code and others are stops.
    """
    c1, c2, c3 = codon
    x1 = ambiguous_nucleotide_values[c1]
    x2 = ambiguous_nucleotide_values[c2]
    x3 = ambiguous_nucleotide_values[c3]
    possible = {}
    found_stop = False
    for y1 in x1:
        for y2 in x2:
            for y3 in x3:
                try:
                    amino = forward_table[y1+y2+y3]
                except KeyError:
                    # If tripping over a stop codon
                    found_stop = True
                else:
                    possible[amino] = 1
    if found_stop:
        if possible:
            raise TranslationError("ambiguous codon '%s' codes " % codon \
                                   + "for both proteins and stop codons")
        # This is a true stop codon - tell the caller about it
        raise KeyError(codon)
    # A real list, since callers index the result (e.g. possible[0])
    return list(possible.keys())
569
def list_ambiguous_codons(codons, ambiguous_nucleotide_values):
    """Extends a codon list to include all possible ambiguous codons.

    e.g. ['TAG', 'TAA'] -> ['TAG', 'TAA', 'TAR']
         ['UAG', 'UGA'] -> ['UAG', 'UGA', 'URA']

    Note that ['TAG', 'TGA'] -> ['TAG', 'TGA'], this does not add 'TRR'.
    Thus only two more codons are added in the following:

    e.g. ['TGA', 'TAA', 'TAG'] -> ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']

    Arguments:
     - codons - list of codon strings.
     - ambiguous_nucleotide_values - maps each nucleotide letter to the
       string of unambiguous letters it stands for, e.g. 'R' -> 'AG'.

    Returns a new (longer) list of codon strings: a copy of codons
    followed by the added ambiguous codons in sorted order.  An
    ambiguous codon is added only if every codon it expands to is
    already in codons.
    """
    known = set(codons)

    #For each codon position, collect the letters (ambiguous or not)
    #whose meanings are all seen at that position in the given codons.
    letters_by_position = []
    for position in range(3):
        seen = set([codon[position] for codon in codons])
        letters_by_position.append(
            [letter for (letter, meanings)
             in ambiguous_nucleotide_values.items()
             if seen.issuperset(set(meanings))])
    c1_list, c2_list, c3_list = letters_by_position

    #Note ambiguous_nucleotide_values['R'] = 'AG' (etc)
    #This will generate things like 'TRR' from ['TAG', 'TGA'], which
    #we don't want to include, so each candidate is checked below.
    candidates = set([c1+c2+c3 for c1 in c1_list
                      for c2 in c2_list for c3 in c3_list])
    candidates.difference_update(codons)

    #Sort, so that the order of the additions does not depend on the
    #iteration order of the set.
    candidates = list(candidates)
    candidates.sort()

    answer = codons[:] #copy
    for ambig_codon in candidates :
        wanted = True
        #e.g. 'TRR' -> 'TAA', 'TAG', 'TGA', 'TGG'
        for codon in [c1+c2+c3 \
                      for c1 in ambiguous_nucleotide_values[ambig_codon[0]] \
                      for c2 in ambiguous_nucleotide_values[ambig_codon[1]] \
                      for c3 in ambiguous_nucleotide_values[ambig_codon[2]]]:
            if codon not in known :
                #This ambiguous codon can code for a non-stop, exclude it!
                wanted = False
                break
        if wanted :
            answer.append(ambig_codon)
    return answer
# Import-time self-checks of list_ambiguous_codons: each entry is
# (input codons, ambiguity mapping, expected result).
for _codons, _values, _expected in [
        (['TGA', 'TAA'], IUPACData.ambiguous_dna_values,
         ['TGA', 'TAA', 'TRA']),
        (['TAG', 'TGA'], IUPACData.ambiguous_dna_values,
         ['TAG', 'TGA']),
        (['TAG', 'TAA'], IUPACData.ambiguous_dna_values,
         ['TAG', 'TAA', 'TAR']),
        (['UAG', 'UAA'], IUPACData.ambiguous_rna_values,
         ['UAG', 'UAA', 'UAR']),
        (['TGA', 'TAA', 'TAG'], IUPACData.ambiguous_dna_values,
         ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']),
        ]:
    assert list_ambiguous_codons(_codons, _values) == _expected
del _codons, _values, _expected

# Forward translation is "onto", that is, any given codon always maps
# to the same protein, or it doesn't map at all.  Thus, I can build
# off of an existing table to produce the ambiguous mappings.
#
# This handles the general case.  Perhaps it's overkill?
#  >>> t = CodonTable.ambiguous_dna_by_id[1]
#  >>> t.forward_table["AAT"]
#  'N'
#  >>> t.forward_table["GAT"]
#  'D'
#  >>> t.forward_table["RAT"]
#  'B'
#  >>> t.forward_table["YTA"]
#  'L'

638 -class AmbiguousForwardTable:
639 - def __init__(self, forward_table, ambiguous_nucleotide, ambiguous_protein):
640 self.forward_table = forward_table 641 642 self.ambiguous_nucleotide = ambiguous_nucleotide 643 self.ambiguous_protein = ambiguous_protein 644 645 inverted = {} 646 for name, val in ambiguous_protein.items(): 647 for c in val: 648 x = inverted.get(c, {}) 649 x[name] = 1 650 inverted[c] = x 651 for name, val in inverted.items(): 652 inverted[name] = val.keys() 653 self._inverted = inverted 654 655 self._cache = {}
656
657 - def get(self, codon, failobj = None):
658 try: 659 return self.__getitem__(codon) 660 except KeyError: 661 return failobj
662
663 - def __getitem__(self, codon):
664 try: 665 x = self._cache[codon] 666 except KeyError: 667 pass 668 else: 669 if x is TranslationError: 670 raise TranslationError(codon) # no unique translation 671 if x is KeyError: 672 raise KeyError(codon) # it's a stop codon 673 return x 674 try: 675 x = self.forward_table[codon] 676 self._cache[codon] = x 677 return x 678 except KeyError: 679 pass 680 681 # XXX Need to make part of this into a method which returns 682 # a list of all possible encodings for a codon! 683 try: 684 possible = list_possible_proteins(codon, 685 self.forward_table, 686 self.ambiguous_nucleotide) 687 except KeyError: 688 self._cache[codon] = KeyError 689 raise KeyError(codon) # stop codon 690 except TranslationError: 691 self._cache[codon] = TranslationError 692 raise TranslationError(codon) # does not code 693 assert len(possible) > 0, "unambiguous codons must code" 694 695 # Hah! Only one possible protein, so use it 696 if len(possible) == 1: 697 self._cache[codon] = possible[0] 698 return possible[0] 699 700 # See if there's an ambiguous protein encoding for the multiples. 701 # Find residues which exist in every coding set. 702 ambiguous_possible = {} 703 for amino in possible: 704 for term in self._inverted[amino]: 705 ambiguous_possible[term] = ambiguous_possible.get(term, 0) + 1 706 707 n = len(possible) 708 possible = [] 709 for amino, val in ambiguous_possible.items(): 710 if val == n: 711 possible.append(amino) 712 713 # No amino acid encoding for the results 714 if len(possible) == 0: 715 self._cache[codon] = TranslationError 716 raise TranslationError(codon) # no valid translation 717 718 # All of these are valid, so choose one 719 # To be unique, sort by smallet ambiguity then alphabetically 720 # Can get this if "X" encodes for everything. 721 def _sort(x, y, table = self.ambiguous_protein): 722 a = cmp(len(table[x]), len(table[y])) 723 if a == 0: 724 return cmp(x, y) 725 return a
726 possible.sort(_sort) 727 728 x = possible[0] 729 self._cache[codon] = x 730 return x
#Prepare the ambiguous tables for DNA, RNA and Generic (DNA or RNA)
#Each unambiguous table is wrapped in an AmbiguousCodonTable; the
#by-name and by-id dictionaries get separate wrapper objects.
ambiguous_dna_by_name = {}
for key, val in unambiguous_dna_by_name.items():
    ambiguous_dna_by_name[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_dna,
                                     IUPACData.ambiguous_dna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
ambiguous_dna_by_id = {}
for key, val in unambiguous_dna_by_id.items():
    ambiguous_dna_by_id[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_dna,
                                     IUPACData.ambiguous_dna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

ambiguous_rna_by_name = {}
for key, val in unambiguous_rna_by_name.items():
    ambiguous_rna_by_name[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_rna,
                                     IUPACData.ambiguous_rna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
ambiguous_rna_by_id = {}
for key, val in unambiguous_rna_by_id.items():
    ambiguous_rna_by_id[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_rna,
                                     IUPACData.ambiguous_rna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

#The following isn't very elegant, but seems to work nicely.
#Take a copy of the RNA ambiguity values (so IUPACData is not modified)
#and let "T" stand for "U", so both spellings are accepted.
_merged_values = dict(IUPACData.ambiguous_rna_values)
_merged_values["T"] = "U"

for key, val in generic_by_name.items():
    ambiguous_generic_by_name[key] = AmbiguousCodonTable(val,
                                     Alphabet.NucleotideAlphabet(),
                                     _merged_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

for key, val in generic_by_id.items():
    ambiguous_generic_by_id[key] = AmbiguousCodonTable(val,
                                     Alphabet.NucleotideAlphabet(),
                                     _merged_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
del _merged_values
del key, val

#Basic sanity test,
for n in ambiguous_generic_by_id :
    assert ambiguous_rna_by_id[n].forward_table["GUU"] == "V"
    assert ambiguous_rna_by_id[n].forward_table["GUN"] == "V"
    assert ambiguous_rna_by_id[n].forward_table["UUN"] == "X" #F or L
    #R = A or G, so URA = UAA or UGA / TRA = TAA or TGA = stop codons
    if "UAA" in unambiguous_rna_by_id[n].stop_codons \
    and "UGA" in unambiguous_rna_by_id[n].stop_codons :
        #Looking up a stop codon must raise KeyError (nothing is
        #printed; only the exception matters here).
        try :
            ambiguous_dna_by_id[n].forward_table["TRA"]
        except KeyError :
            pass
        else :
            assert False, "Should be a stop only"
        assert "URA" in ambiguous_generic_by_id[n].stop_codons
        assert "URA" in ambiguous_rna_by_id[n].stop_codons
        assert "TRA" in ambiguous_generic_by_id[n].stop_codons
        assert "TRA" in ambiguous_dna_by_id[n].stop_codons
del n
assert ambiguous_generic_by_id[1].stop_codons == ambiguous_generic_by_name["Standard"].stop_codons
assert ambiguous_generic_by_id[4].stop_codons == ambiguous_generic_by_name["SGC3"].stop_codons
assert ambiguous_generic_by_id[15].stop_codons == ambiguous_generic_by_name['Blepharisma Macronuclear'].stop_codons
