Package Bio :: Package Clustalw
[hide private]
[frames] | no frames]

Source Code for Package Bio.Clustalw

  1  # Clustalw modules 
  2   
  3  """ 
  4  A set of classes to interact with the multiple alignment command 
  5  line program clustalw.  
  6   
  7  Clustalw is the command line version of the graphical Clustalx  
  8  aligment program. 
  9   
 10  This requires clustalw available from: 
 11   
 12  ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/. 
 13   
 14  functions: 
 15  o read 
 16  o parse_file 
 17  o do_alignment 
 18   
 19  classes: 
 20  o ClustalAlignment 
 21  o MultipleAlignCL""" 
 22   
 23  # standard library 
 24  import os 
 25  import sys 
 26   
 27  # biopython 
 28  from Bio import Alphabet 
 29  from Bio.Alphabet import IUPAC 
 30  from Bio.Align.Generic import Alignment 
 31   
32 -def parse_file(file_name, alphabet = IUPAC.unambiguous_dna, debug_level = 0):
33 """Parse the given file into a clustal aligment object. 34 35 Arguments: 36 o file_name - The name of the file to parse. 37 o alphabet - The type of alphabet to use for the alignment sequences. 38 This should correspond to the type of information contained in the file. 39 Defaults to be unambiguous_dna sequence. 40 41 There is a deprecated optional argument debug_level which has no effect. 42 43 Since Biopython 1.46, this has called Bio.AlignIO internally. 44 """ 45 46 # Avoid code duplication by calling Bio.AlignIO to do this for us. 47 handle = open(file_name, 'r') 48 from Bio import AlignIO 49 generic_alignment = AlignIO.read(handle, "clustal") 50 handle.close() 51 52 #Force this generic alignment into a ClustalAlignment... nasty hack 53 if isinstance(alphabet, Alphabet.Gapped) : 54 alpha = alphabet 55 else : 56 alpha = Alphabet.Gapped(alphabet) 57 clustal_alignment = ClustalAlignment(alpha) 58 clustal_alignment._records = generic_alignment._records 59 for record in clustal_alignment._records : 60 record.seq.alphabet = alpha 61 62 try : 63 clustal_alignment._version = generic_alignment._version 64 except AttributeError : 65 #Missing the version, could be a 3rd party tool's output 66 pass 67 68 try : 69 clustal_alignment._star_info = generic_alignment._star_info 70 except AttributeError : 71 #Missing the consensus, again, this is not always present 72 pass 73 74 return clustal_alignment
75
76 -def do_alignment(command_line, alphabet=None):
77 """Perform an alignment with the given command line. 78 79 Arguments: 80 o command_line - A command line object that can give out 81 the command line we will input into clustalw. 82 o alphabet - the alphabet to use in the created alignment. If not 83 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for 84 dna and protein alignment respectively. 85 86 Returns: 87 o A clustal alignment object corresponding to the created alignment. 88 If the alignment type was not a clustal object, None is returned. 89 """ 90 try : 91 import subprocess 92 child_process = subprocess.Popen(str(command_line), 93 stdout=subprocess.PIPE, 94 stderr=subprocess.PIPE, 95 shell=(sys.platform!="win32") 96 ) 97 status = child_process.wait() 98 except ImportError : 99 #Fall back for python 2.3 100 run_clust = os.popen(str(command_line)) 101 status = run_clust.close() 102 103 104 # The exit status is the second byte of the termination status 105 # TODO - Check this holds on win32... 106 value = 0 107 if status: value = status / 256 108 # check the return value for errors, as on 1.81 the return value 109 # from Clustalw is actually helpful for figuring out errors 110 # 1 => bad command line option 111 if value == 1: 112 raise ValueError("Bad command line option in the command: %s" 113 % str(command_line)) 114 # 2 => can't open sequence file 115 elif value == 2: 116 raise IOError("Cannot open sequence file %s" 117 % command_line.sequence_file) 118 # 3 => wrong format in sequence file 119 elif value == 3: 120 raise IOError("Sequence file %s has an invalid format." 121 % command_line.sequence_file) 122 # 4 => sequence file only has one sequence 123 elif value == 4: 124 raise IOError("Sequence file %s has only one sequence present." 125 % command_line.sequence_file) 126 127 # if an output file was specified, we need to grab it 128 if command_line.output_file: 129 out_file = command_line.output_file 130 else: 131 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln' 132 133 # if we can't deal with the format, just return None 134 if command_line.output_type and command_line.output_type != 'CLUSTAL': 135 return None 136 # otherwise parse it into a ClustalAlignment object 137 else: 138 if not alphabet: 139 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[ 140 command_line.type == 'PROTEIN'] 141 142 # check if the outfile exists before parsing 143 if not(os.path.exists(out_file)): 144 raise IOError("Output .aln file %s not produced, commandline: %s" 145 % (out_file, command_line)) 146 147 return parse_file(out_file, alphabet)
148 149
150 -class ClustalAlignment(Alignment):
151 """Work with the clustal aligment format. 152 153 This format is the default output from clustal -- these files normally 154 have an extension of .aln. 155 """ 156 # the default version to use if one isn't set 157 DEFAULT_VERSION = '1.81' 158
159 - def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
160 Alignment.__init__(self, alphabet) 161 162 # represent all of those stars in the aln output format 163 self._star_info = '' 164 165 self._version = ''
166
167 - def __str__(self):
168 """Print out the alignment so it looks pretty. 169 170 The output produced from this should also be formatted in valid 171 clustal format. 172 """ 173 # Avoid code duplication by calling Bio.AlignIO to do this for us. 174 from Bio import AlignIO 175 from StringIO import StringIO 176 handle = StringIO() 177 AlignIO.write([self], handle, "clustal") 178 handle.seek(0) 179 return handle.read()
180
181 -def _escape_filename(filename) :
182 """Escape filenames with spaces (PRIVATE). 183 184 NOTE - This code is duplicated in Bio.Blast.NCBIStandalone, 185 scope for refactoring! 186 """ 187 if " " not in filename : 188 return filename 189 190 #We'll just quote it - works on Mac etc 191 if filename.startswith('"') and filename.endswith('"') : 192 #Its already quoted 193 return filename 194 else : 195 return '"%s"' % filename
196
197 -class MultipleAlignCL:
198 """Represent a clustalw multiple alignment command line. 199 200 This is meant to make it easy to code the command line options you 201 want to submit to clustalw. 202 203 Clustalw has a ton of options and things to do but this is set up to 204 represent a clustalw mutliple alignment. 205 206 Warning: I don't use all of these options personally, so if you find 207 one to be broken for any reason, please let us know! 208 """ 209 # set the valid options for different parameters 210 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA'] 211 OUTPUT_ORDER = ['INPUT', 'ALIGNED'] 212 OUTPUT_CASE = ['LOWER', 'UPPER'] 213 OUTPUT_SEQNOS = ['OFF', 'ON'] 214 RESIDUE_TYPES = ['PROTEIN', 'DNA'] 215 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID'] 216 DNA_MATRIX = ['IUB', 'CLUSTALW'] 217
218 - def __init__(self, sequence_file, command = 'clustalw'):
219 """Initialize some general parameters that can be set as attributes. 220 221 Arguments: 222 o sequence_file - The file to read the sequences for alignment from. 223 o command - The command used to run clustalw. This defaults to 224 just 'clustalw' (ie. assumes you have it on your path somewhere). 225 226 General attributes that can be set: 227 o is_quick - if set as 1, will use a fast algorithm to create 228 the alignment guide tree. 229 o allow_negative - allow negative values in the alignment matrix. 230 231 Multiple alignment attributes that can be set as attributes: 232 o gap_open_pen - Gap opening penalty 233 o gap_ext_pen - Gap extension penalty 234 o is_no_end_pen - A flag as to whether or not there should be a gap 235 separation penalty for the ends. 236 o gap_sep_range - The gap separation penalty range. 237 o is_no_pgap - A flag to turn off residue specific gaps 238 o is_no_hgap - A flag to turn off hydrophilic gaps 239 o h_gap_residues - A list of residues to count a hydrophilic 240 o max_div - A percent identity to use for delay (? - I don't undertand 241 this!) 242 o trans_weight - The weight to use for transitions 243 """ 244 self.sequence_file = sequence_file 245 self.command = command 246 247 self.is_quick = None 248 self.allow_negative = None 249 250 self.gap_open_pen = None 251 self.gap_ext_pen = None 252 self.is_no_end_pen = None 253 self.gap_sep_range = None 254 self.is_no_pgap = None 255 self.is_no_hgap = None 256 self.h_gap_residues = [] 257 self.max_div = None 258 self.trans_weight = None 259 260 # other attributes that should be set via various functions 261 # 1. output parameters 262 self.output_file = None 263 self.output_type = None 264 self.output_order = None 265 self.change_case = None 266 self.add_seqnos = None 267 268 # 2. a guide tree to use 269 self.guide_tree = None 270 self.new_tree = None 271 272 # 3. matrices 273 self.protein_matrix = None 274 self.dna_matrix = None 275 276 # 4. type of residues 277 self.type = None
278
279 - def __str__(self):
280 """Write out the command line as a string.""" 281 282 #On Linux with clustalw 1.83, you can do: 283 #clustalw input.faa 284 #clustalw /full/path/input.faa 285 #clustalw -INFILE=input.faa 286 #clustalw -INFILE=/full/path/input.faa 287 # 288 #Note these fail (using DOS style slashes): 289 # 290 #clustalw /INFILE=input.faa 291 #clustalw /INFILE=/full/path/input.faa 292 # 293 #On Windows XP with clustalw.exe 1.83, these work at 294 #the command prompt: 295 # 296 #clustalw.exe input.faa 297 #clustalw.exe /INFILE=input.faa 298 #clustalw.exe /INFILE="input.faa" 299 #clustalw.exe /INFILE="with space.faa" 300 #clustalw.exe /INFILE=C:\full\path\input.faa 301 #clustalw.exe /INFILE="C:\full path\with spaces.faa" 302 # 303 #Sadly these fail: 304 #clustalw.exe "input.faa" 305 #clustalw.exe "with space.faa" 306 #clustalw.exe C:\full\path\input.faa 307 #clustalw.exe "C:\full path\with spaces.faa" 308 # 309 #Testing today (using a different binary of clustalw.exe 1.83), 310 #using -INFILE as follows seems to work. However I had once noted: 311 #These also fail but a minus/dash does seem to 312 #work with other options (!): 313 #clustalw.exe -INFILE=input.faa 314 #clustalw.exe -INFILE=C:\full\path\input.faa 315 # 316 #Also these fail: 317 #clustalw.exe "/INFILE=input.faa" 318 #clustalw.exe "/INFILE=C:\full\path\input.faa" 319 # 320 #Thanks to Emanuel Hey for flagging this on the mailing list. 321 # 322 #In addition, both self.command and self.sequence_file 323 #may contain spaces, so should be quoted. But clustalw 324 #is fussy. 325 cline = _escape_filename(self.command) 326 cline += ' -INFILE=%s' % _escape_filename(self.sequence_file) 327 328 # general options 329 if self.type: 330 cline += " -TYPE=%s" % self.type 331 if self.is_quick == 1: 332 #Some versions of clustalw are case sensitive, 333 #and require -quicktree rather than -QUICKTREE 334 cline += " -quicktree" 335 if self.allow_negative == 1: 336 cline += " -NEGATIVE" 337 338 # output options 339 if self.output_file: 340 cline += " -OUTFILE=%s" % _escape_filename(self.output_file) 341 if self.output_type: 342 cline += " -OUTPUT=%s" % self.output_type 343 if self.output_order: 344 cline += " -OUTORDER=%s" % self.output_order 345 if self.change_case: 346 cline += " -CASE=%s" % self.change_case 347 if self.add_seqnos: 348 cline += " -SEQNOS=%s" % self.add_seqnos 349 if self.new_tree: 350 # clustal does not work if -align is written -ALIGN 351 cline += " -NEWTREE=%s -align" % _escape_filename(self.new_tree) 352 353 # multiple alignment options 354 if self.guide_tree: 355 cline += " -USETREE=%s" % _escape_filename(self.guide_tree) 356 if self.protein_matrix: 357 cline += " -MATRIX=%s" % self.protein_matrix 358 if self.dna_matrix: 359 cline += " -DNAMATRIX=%s" % self.dna_matrix 360 if self.gap_open_pen: 361 cline += " -GAPOPEN=%s" % self.gap_open_pen 362 if self.gap_ext_pen: 363 cline += " -GAPEXT=%s" % self.gap_ext_pen 364 if self.is_no_end_pen == 1: 365 cline += " -ENDGAPS" 366 if self.gap_sep_range: 367 cline += " -GAPDIST=%s" % self.gap_sep_range 368 if self.is_no_pgap == 1: 369 cline += " -NOPGAP" 370 if self.is_no_hgap == 1: 371 cline += " -NOHGAP" 372 if len(self.h_gap_residues) != 0: 373 # stick the list of residues together as one big list o' residues 374 residue_list = '' 375 for residue in self.h_gap_residues: 376 residue_list = residue_list + residue 377 cline += " -HGAPRESIDUES=%s" % residue_list 378 if self.max_div: 379 cline += " -MAXDIV=%s" % self.max_div 380 if self.trans_weight: 381 cline += " -TRANSWEIGHT=%s" % self.trans_weight 382 383 return cline
384
385 - def set_output(self, output_file, output_type = None, output_order = None, 386 change_case = None, add_seqnos = None):
387 """Set the output parameters for the command line. 388 """ 389 self.output_file = output_file 390 391 if output_type: 392 output_type = output_type.upper() 393 if output_type not in self.OUTPUT_TYPES: 394 raise ValueError("Invalid output type %s. Valid choices are %s" 395 % (output_type, self.OUTPUT_TYPES)) 396 else: 397 self.output_type = output_type 398 399 if output_order: 400 output_order = output_order.upper() 401 if output_order not in self.OUTPUT_ORDER: 402 raise ValueError("Invalid output order %s. Valid choices are %s" 403 % (output_order, self.OUTPUT_ORDER)) 404 else: 405 self.output_order = output_order 406 407 if change_case: 408 change_case = change_case.upper() 409 if output_type != "GDE": 410 raise ValueError("Change case only valid for GDE output.") 411 elif change_case not in self.CHANGE_CASE: 412 raise ValueError("Invalid change case %s. Valid choices are %s" 413 % (change_case, self.CHANGE_CASE)) 414 else: 415 self.change_case = change_case 416 417 if add_seqnos: 418 add_seqnos = add_seqnos.upper() 419 if output_type: 420 raise ValueError("Add SeqNos only valid for CLUSTAL output.") 421 elif add_seqnos not in self.OUTPUT_SEQNOS: 422 raise ValueError("Invalid seqnos option %s. Valid choices: %s" 423 % (add_seqnos, self.OUTPUT_SEQNOS)) 424 else: 425 self.add_seqnos = add_seqnos
426
427 - def set_guide_tree(self, tree_file):
428 """Provide a file to use as the guide tree for alignment. 429 430 Raises: 431 o IOError - If the tree_file doesn't exist.""" 432 if not(os.path.exists(tree_file)): 433 raise IOError("Could not find the guide tree file %s." % 434 tree_file) 435 else: 436 self.guide_tree = tree_file
437
438 - def set_new_guide_tree(self, tree_file):
439 """Set the name of the guide tree file generated in the alignment. 440 """ 441 self.new_tree = tree_file
442
443 - def set_protein_matrix(self, protein_matrix):
444 """Set the type of protein matrix to use. 445 446 Protein matrix can be either one of the defined types (blosum, pam, 447 gonnet or id) or a file with your own defined matrix. 448 """ 449 if protein_matrix.upper() in self.PROTEIN_MATRIX: 450 self.protein_matrix = protein_matrix.upper() 451 elif os.path.exists(protein_matrix): 452 self.protein_matrix = protein_matrix 453 else: 454 raise ValueError("Invalid matrix %s. Options are %s or a file." % 455 (protein_matrix.upper(), self.PROTEIN_MATRIX))
456
457 - def set_dna_matrix(self, dna_matrix):
458 """Set the type of DNA matrix to use. 459 460 The dna_matrix can either be one of the defined types (iub or clustalw) 461 or a file with the matrix to use.""" 462 if dna_matrix.upper() in self.DNA_MATRIX: 463 self.dna_matrix = dna_matrix.upper() 464 elif os.path.exists(dna_matrix): 465 self.dna_matrix = dna_matrix 466 else: 467 raise ValueError("Invalid matrix %s. Options are %s or a file." % 468 (dna_matrix, self.DNA_MATRIX))
469
470 - def set_type(self, residue_type):
471 """Set the type of residues within the file. 472 473 Clustal tries to guess whether the info is protein or DNA based on 474 the number of GATCs, but this can be wrong if you have a messed up 475 protein or DNA you are working with, so this allows you to set it 476 explicitly. 477 """ 478 residue_type = residue_type.upper() 479 if residue_type in self.RESIDUE_TYPES: 480 self.type = residue_type 481 else: 482 raise ValueError("Invalid residue type %s. Valid choices are %s" 483 % (residue_type, self.RESIDUE_TYPES))
484