Package Bio :: Module LogisticRegression
[hide private]
[frames] | no frames]

Source Code for Module Bio.LogisticRegression

  1  #!/usr/bin/env python 
  2   
  3  """ 
  4  This module provides code for doing logistic regressions. 
  5   
  6   
  7  Classes: 
  8  LogisticRegression    Holds information for a LogisticRegression classifier. 
  9   
 10   
 11  Functions: 
 12  train        Train a new classifier. 
 13  calculate    Calculate the probabilities of each class, given an observation. 
 14  classify     Classify an observation into a class. 
 15  """ 
 16   
 17  #TODO - Remove this work around once we drop python 2.3 support 
 18  try: 
 19      set = set 
 20  except NameError: 
 21      from sets import Set as set 
 22   
 23  #from numpy import * 
 24  #from numpy.linalg import * 
 25  import numpy 
 26  import numpy.linalg 
 27   
28 -class LogisticRegression:
29 """Holds information necessary to do logistic regression 30 classification. 31 32 Members: 33 beta List of the weights for each dimension. 34 35 """
36 - def __init__(self):
37 """LogisticRegression()""" 38 self.beta = []
39
40 -def train(xs, ys, update_fn=None, typecode=None):
41 """train(xs, ys[, update_fn]) -> LogisticRegression 42 43 Train a logistic regression classifier on a training set. xs is a 44 list of observations and ys is a list of the class assignments, 45 which should be 0 or 1. xs and ys should contain the same number 46 of elements. update_fn is an optional callback function that 47 takes as parameters that iteration number and log likelihood. 48 49 """ 50 if len(xs) != len(ys): 51 raise ValueError("xs and ys should be the same length.") 52 classes = set(ys) 53 if classes != set([0, 1]): 54 raise ValueError("Classes should be 0's and 1's") 55 if typecode is None: 56 typecode = 'd' 57 58 # Dimensionality of the data is the dimensionality of the 59 # observations plus a constant dimension. 60 N, ndims = len(xs), len(xs[0]) + 1 61 if N==0 or ndims==1: 62 raise ValueError("No observations or observation of 0 dimension.") 63 64 # Make an X array, with a constant first dimension. 65 X = numpy.ones((N, ndims), typecode) 66 X[:, 1:] = xs 67 Xt = numpy.transpose(X) 68 y = numpy.asarray(ys, typecode) 69 70 # Initialize the beta parameter to 0. 71 beta = numpy.zeros(ndims, typecode) 72 73 MAX_ITERATIONS = 500 74 CONVERGE_THRESHOLD = 0.01 75 stepsize = 1.0 76 # Now iterate using Newton-Raphson until the log-likelihoods 77 # converge. 78 iter = 0 79 old_beta = old_llik = None 80 while iter < MAX_ITERATIONS: 81 # Calculate the probabilities. p = e^(beta X) / (1+e^(beta X)) 82 ebetaX = numpy.exp(numpy.dot(beta, Xt)) 83 p = ebetaX / (1+ebetaX) 84 85 # Find the log likelihood score and see if I've converged. 86 logp = y*numpy.log(p) + (1-y)*numpy.log(1-p) 87 llik = sum(logp) 88 if update_fn is not None: 89 update_fn(iter, llik) 90 # Check to see if the likelihood decreased. If it did, then 91 # restore the old beta parameters and half the step size. 92 if llik < old_llik: 93 stepsize = stepsize / 2.0 94 beta = old_beta 95 # If I've converged, then stop. 96 if old_llik is not None and numpy.fabs(llik-old_llik) <= CONVERGE_THRESHOLD: 97 break 98 old_llik, old_beta = llik, beta 99 iter += 1 100 101 W = numpy.identity(N) * p 102 Xtyp = numpy.dot(Xt, y-p) # Calculate the first derivative. 103 XtWX = numpy.dot(numpy.dot(Xt, W), X) # Calculate the second derivative. 104 #u, s, vt = singular_value_decomposition(XtWX) 105 #print "U", u 106 #print "S", s 107 delta = numpy.linalg.solve(XtWX, Xtyp) 108 if numpy.fabs(stepsize-1.0) > 0.001: 109 delta = delta * stepsize 110 beta = beta + delta # Update beta. 111 else: 112 raise RuntimeError("Didn't converge.") 113 114 lr = LogisticRegression() 115 lr.beta = map(float, beta) # Convert back to regular array. 116 return lr
117
118 -def calculate(lr, x):
119 """calculate(lr, x) -> list of probabilities 120 121 Calculate the probability for each class. lr is a 122 LogisticRegression object. x is the observed data. Returns a 123 list of the probability that it fits each class. 124 125 """ 126 # Insert a constant term for x. 127 x = numpy.asarray([1.0] + x) 128 # Calculate the probability. p = e^(beta X) / (1+e^(beta X)) 129 ebetaX = numpy.exp(numpy.dot(lr.beta, x)) 130 p = ebetaX / (1+ebetaX) 131 return [1-p, p]
132
133 -def classify(lr, x):
134 """classify(lr, x) -> 1 or 0 135 136 Classify an observation into a class. 137 138 """ 139 probs = calculate(lr, x) 140 if probs[0] > probs[1]: 141 return 0 142 return 1
143