Package nltk_lite :: Package contrib :: Package classifier :: Module featureselect
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.classifier.featureselect

  1  # Natural Language Toolkit - Feature Select 
  2  #  The command line entry point for feature selection 
  3  # 
  4  # Author: Sumukh Ghodke <sumukh dot ghodke at gmail dot com> 
  5  # 
  6  # URL: <http://nltk.sf.net> 
  7  # This software is distributed under GPL, for license information see LICENSE.TXT 
  8  from nltk_lite.contrib.classifier import split_ignore_space 
  9  from nltk_lite.contrib.classifier import format, cfile, commandline as cl 
 10  from nltk_lite.contrib.classifier.exceptions import invaliddataerror as inv 
 11   
 12  import sys 
 13   
 14  a_help = "Selects the feature selection algorithm                 " \ 
 15         + "Options: RNK for Ranking(Filter based Feature Selection)" \ 
 16         + "Default: RNK.                                           " 
 17          
 18  f_help = "Base name of attribute, klass, training, test and gold  " \ 
 19         + " files.                                                 " 
 20   
 21  t_help = "Base name of training file for feature selection.       " 
 22   
 23  T_help = "Base name of test file for feature selection.           " 
 24   
 25  g_help = "Base name of gold file for feature selection.           " 
 26   
 27  o_help = "Algorithm specific options                              " \ 
 28         + "For rank based feature selection the options should     " \ 
 29         + "include the method to calculate the rank:               " \ 
 30         + "  IG: for Information gain                              " \ 
 31         + "  GR: for Gain ratio                                    " \ 
 32         + "followed by a number which indicates the number of      " \ 
 33         + "attributes which should be chosen.                      " 
 34   
 35  OPTION_MAPPINGS = {'IG': 'information_gain', 'GR': 'gain_ratio'} 
 36   
 37  RANK='RNK' 
 38   
 39  ALGORITHM_MAPPINGS = {RANK:'by_rank'} 
 40   
class FeatureSelect(cl.CommandLineInterface):
    """Command line entry point for feature selection.

    Extends ``cl.CommandLineInterface`` with an ``-o/--options`` switch that
    carries the algorithm specific options (for rank based selection: the
    ranking method followed by the number of attributes to keep).
    """

    def __init__(self):
        """Register the supported algorithms, the help texts and ``-o``."""
        cl.CommandLineInterface.__init__(self, list(ALGORITHM_MAPPINGS), RANK, a_help, f_help, t_help, T_help, g_help)
        self.add_option("-o", "--options", dest="options", type="string", help=o_help)

    def execute(self):
        """Validate the parsed arguments, then select features and write files.

        The options string is split with ``split_ignore_space`` and stored in
        ``self.options``. For the rank algorithm the options must be a known
        ranking method followed by a positive integer; anything else is
        reported through ``self.error``.
        """
        cl.CommandLineInterface.execute(self)
        self.validate_basic_arguments_are_present()
        self.validate_files_arg_is_exclusive()
        if self.get_value('options') is None:
            self.required_arguments_not_present_error()
        self.options = split_ignore_space(self.get_value('options'))
        if self.algorithm == RANK and not self._rank_options_are_valid():
            self.error("Invalid options for Rank based feature selection. Options Found: " + str(self.options))
        self.select_features_and_write_to_file()

    def _rank_options_are_valid(self):
        """Return True when ``self.options`` is ``[method, count]`` with a
        method listed in ``OPTION_MAPPINGS`` and a positive integer count."""
        if len(self.options) != 2 or self.options[0] not in OPTION_MAPPINGS:
            return False
        try:
            # A non-numeric count must be reported as invalid options rather
            # than escaping as an uncaught ValueError.
            return int(self.options[1]) > 0
        except ValueError:
            return False

    def select_features_and_write_to_file(self):
        """Load the data sets, run the chosen algorithm and write the results.

        When a common base name was given (``self.files``), it is used for the
        training, test and gold paths alike and ``ignore_missing`` is passed
        as True to ``get_instances``. The names of the written files are
        printed one per line.
        """
        ignore_missing = False
        # NOTE: flagged by the original author as duplicate code and not tested.
        if self.files is not None:
            self.training_path, self.test_path, self.gold_path = [self.files] * 3
            ignore_missing = True
        training, attributes, klass, test, gold = self.get_instances(self.training_path, self.test_path, self.gold_path, ignore_missing)

        feature_sel = FeatureSelection(training, attributes, klass, test, gold, self.options)
        # Dispatch to the FeatureSelection method named for this algorithm.
        getattr(feature_sel, ALGORITHM_MAPPINGS[self.algorithm])()

        files_written = self.write_to_file(self.get_suffix(), training, attributes, klass, test, gold)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('The following files were created after feature selection...')
        for file_name in files_written:
            print(file_name)

    def get_suffix(self):
        """Return the suffix passed to ``write_to_file``.

        The suffix is ``-<algorithm>`` followed by ``_<option>`` for every
        entry of ``self.options`` (nothing is appended when options is None).
        """
        suffix = '-' + self.algorithm
        if self.options is None:
            return suffix
        return suffix + ''.join('_' + option for option in self.options)
79
class FeatureSelection:
    """Removes low ranked attributes from a training set and its companions.

    Holds the training instances, the attributes, the klass and the optional
    test and gold instances together with the algorithm options, and prunes
    all of them consistently.
    """

    def __init__(self, training, attributes, klass, test, gold, options):
        """Store the data sets and the algorithm specific options.

        ``test`` and ``gold`` may be None; ``options`` is the list of option
        strings, e.g. ``['IG', '3']`` for rank based selection.
        """
        self.training, self.attributes, self.klass, self.test, self.gold = training, attributes, klass, test, gold
        self.options = options

    def by_rank(self):
        """Keep only the highest ranked attributes and remove the others.

        ``self.options`` must be ``[method, count]`` where ``method`` is a key
        of ``OPTION_MAPPINGS`` and ``count`` is a positive integer.

        Raises:
            inv.InvalidDataError: if the attributes are continuous or the
                options are malformed.
        """
        if self.attributes.has_continuous_attributes():
            raise inv.InvalidDataError("Rank based feature selection cannot be performed on continuous attributes.")
        # Additional validation for when the class is not driven from the
        # command prompt.
        if not self._rank_options_are_valid():
            raise inv.InvalidDataError("Invalid options for Rank based feature selection.")
        rem_attributes = self.find_attributes_by_ranking(OPTION_MAPPINGS[self.options[0]], int(self.options[1]))
        self.remove(rem_attributes)

    def _rank_options_are_valid(self):
        """Return True when ``self.options`` is ``[method, count]`` with a
        method listed in ``OPTION_MAPPINGS`` and a positive integer count."""
        if self.options is None or len(self.options) != 2:
            return False
        if self.options[0] not in OPTION_MAPPINGS:
            return False
        try:
            # A malformed count is an invalid option, not an uncaught
            # ValueError/TypeError.
            return int(self.options[1]) > 0
        except (TypeError, ValueError):
            return False

    def find_attributes_by_ranking(self, method, number):
        """Return the attributes that fall outside the ``number`` best ranked.

        A decision stump is obtained for each attribute from
        ``self.attributes.empty_decision_stumps`` and updated with every
        training instance. Stumps are then ordered ascending by the value
        returned from their ``method`` method, and the attributes of all but
        the last ``number`` stumps are returned in that order. If ``number``
        is at least the number of stumps nothing is returned; if it is zero
        or negative every attribute is returned.
        """
        decision_stumps = self.attributes.empty_decision_stumps([], self.klass)
        for decision_stump in decision_stumps:
            for instance in self.training:
                decision_stump.update_count(instance)
        # key= evaluates the ranking method once per stump and works on both
        # Python 2 and 3 (cmp-style sorting is Python 2 only).
        decision_stumps.sort(key=lambda stump: getattr(stump, method)())

        # Counting from the front avoids the [:-0] pitfall, which would keep
        # everything when zero attributes are requested.
        cut = max(len(decision_stumps) - number, 0)
        return [stump.attribute for stump in decision_stumps[:cut]]

    def remove(self, attributes):
        """Remove ``attributes`` from the training, test and gold instances
        (the latter two only when present) and from the attribute list."""
        self.training.remove_attributes(attributes)
        if self.test is not None:
            self.test.remove_attributes(attributes)
        if self.gold is not None:
            self.gold.remove_attributes(attributes)
        self.attributes.remove_attributes(attributes)
if __name__ == "__main__":
    # Script entry point: build the command line interface and hand it every
    # argument that follows the program name.
    selector = FeatureSelect()
    selector.run(sys.argv[1:])