Package Bio :: Package EUtils :: Module ReseekFile
[hide private]
[frames] | no frames]

Source Code for Module Bio.EUtils.ReseekFile

  1  """Wrap a file handle to allow seeks back to the beginning 
  2   
  3  Sometimes data coming from a socket or other input file handle isn't 
  4  what it was supposed to be.  For example, suppose you are reading from 
  5  a buggy server which is supposed to return an XML stream but can also 
  6  return an unformatted error message.  (This often happens because the 
  7  server doesn't handle incorrect input very well.) 
  8   
  9  A ReseekFile helps solve this problem.  It is a wrapper to the 
 10  original input stream but provides a buffer.  Read requests to the 
 11  ReseekFile get forwarded to the input stream, appended to a buffer, 
 12  then returned to the caller.  The buffer contains all the data read so 
 13  far. 
 14   
 15  The ReseekFile can be told to reseek to the start position.  The next 
 16  read request will come from the buffer, until the buffer has been 
 17  read, in which case it gets the data from the input stream.  This 
 18  newly read data is also appended to the buffer. 
 19   
 20  When buffering is no longer needed, use the 'nobuffer()' method.  This 
 21  tells the ReseekFile that once it has read from the buffer it should 
 22  throw the buffer away.  After nobuffer is called, the behaviour of 
 23  'seek' is no longer defined. 
 24   
 25  For example, suppose you have the server as above which returns either 
 26  an error message of the form: 
 27   
 28    ERROR: cannot do that 
 29   
 30  or an XML data stream, starting with "<?xml". 
 31   
 32    infile = urllib2.urlopen("http://somewhere/") 
 33    infile = ReseekFile.ReseekFile(infile) 
 34    s = infile.readline() 
 35    if s.startswith("ERROR:"): 
 36        raise Exception(s[:-1]) 
 37    infile.seek(0) 
 38    infile.nobuffer()   # Don't buffer the data 
 39     ... process the XML from infile ... 
 40   
 41   
 42  This module also implements 'prepare_input_source(source)' modeled on 
 43  xml.sax.saxutils.prepare_input_source.  This opens a URL and if the 
 44  input stream is not already seekable, wraps it in a ReseekFile. 
 45   
 46   
 47  NOTE: 
 48    Don't use bound methods for the ReseekFile.  When the buffer is 
 49  empty, the ReseekFile reassigns the input file's read/readlines/etc. 
 50  methods as instance variables.  This gives slightly better performance 
 51  at the cost of not allowing an infrequently used idiom. 
 52   
 53    Use tell() to get the beginning byte location.  ReseekFile will 
 54  attempt to get the real position from the wrapped file and use that as 
 55  the beginning location.  If the wrapped file does not support tell(), 
 56  ReseekFile.tell() will return 0. 
 57   
 58    readlines does not yet support a sizehint.  Want to 
 59  contribute an implementation? 
 60   
 61  The latest version of this code can be found at 
 62    http://www.dalkescientific.com/Python/ 
 63  """ 
 64  # Written in 2003 by Andrew Dalke, Dalke Scientific Software, LLC. 
 65  # This software has been released to the public domain.  No 
 66  # copyright is asserted. 
 67   
 68  from cStringIO import StringIO 
 69   
class ReseekFile:
    """Wrap a file handle to allow seeks back to the beginning.

    Takes a file handle in the constructor.  Everything read through the
    wrapper is copied into an in-memory buffer so it can be replayed after
    seeking back to the starting position.  Call nobuffer() once replay is
    no longer needed.

    See the module docstring for more documentation.
    """

    def __init__(self, file):
        """Wrap 'file', remembering its current position as the beginning.

        If the wrapped file has no usable tell(), the beginning is 0.
        """
        self.file = file
        # Holds a copy of everything read from 'file' while buffering.
        self.buffer_file = StringIO()
        # True (1) until data has been consumed since the last seek().
        self.at_beginning = 1
        try:
            self.beginning = file.tell()
        except (IOError, AttributeError):
            self.beginning = 0
        # Cleared by nobuffer(); new data is then no longer recorded.
        self._use_buffer = 1

    def seek(self, offset, whence=0):
        """Seek back to the beginning of the wrapped stream.

        Only supports whence == 0 and offset == the initial value of
        ReseekFile.tell() (which is usually 0, but not always).

        Raises TypeError for any other whence or offset.
        """
        if whence != 0:
            raise TypeError("Unexpected whence value of %s; expecting 0" %
                            (whence,))
        if offset != self.beginning:
            raise TypeError("Unexpected offset value of %r; expecting '%s'" %
                            (offset, self.beginning))
        # Rewind the replay buffer; the wrapped file itself is not moved.
        self.buffer_file.seek(0)
        self.at_beginning = 1

    def tell(self):
        """Return the current position, which is only known at the beginning.

        The initial position may not be 0 if the underlying input file
        supports tell and is not at position 0.

        Raises TypeError if data has been consumed since the last seek().
        """
        if not self.at_beginning:
            raise TypeError("ReseekFile cannot tell except at the beginning of file")
        return self.beginning

    def _read(self, size):
        """Read 'size' bytes (everything if negative), buffer first."""
        if size < 0:
            # Read to end of file: the rest of the buffer plus the rest
            # of the wrapped file.
            fresh = self.file.read()
            data = self.buffer_file.read() + fresh
            if self._use_buffer:
                self.buffer_file.write(fresh)
            return data
        if size == 0:
            return ""
        data = self.buffer_file.read(size)
        if len(data) < size:
            # The buffer ran out; get the remainder from the wrapped file.
            fresh = self.file.read(size - len(data))
            if self._use_buffer:
                self.buffer_file.write(fresh)
            return data + fresh
        return data

    def read(self, size=-1):
        """Read up to 'size' bytes from the file.

        Default is -1, which means to read to end of file.
        """
        data = self._read(size)
        if self.at_beginning and data:
            self.at_beginning = 0
        self._check_no_buffer()
        return data

    def readline(self):
        """Read a line from the file."""
        # Can we get it out of the buffer_file?
        line = self.buffer_file.readline()
        if line[-1:] != "\n":
            # No (or only part of it), so read the rest from the input file.
            rest = self.file.readline()
            # Append the new data to the buffer, if still buffering.
            if self._use_buffer:
                self.buffer_file.write(rest)
            line = line + rest

        # Consuming data moves us off the beginning, exactly as in read();
        # otherwise tell() would keep reporting the stale start offset.
        if line:
            self.at_beginning = 0

        # Checked on every path so a line served entirely from the buffer
        # can also trigger the switch to the wrapped file.
        self._check_no_buffer()
        return line

    def readlines(self):
        """Read all remaining lines from the file.

        Lines are split on "\\n" only; a sizehint is not supported.
        """
        text = self.read()
        lines = []
        start = 0
        end = text.find("\n")
        while end > -1:
            lines.append(text[start:end + 1])
            start = end + 1
            end = text.find("\n", start)
        if start < len(text):
            # Only get here if the last line doesn't have a newline.
            lines.append(text[start:])
        return lines

    def _check_no_buffer(self):
        """Drop the buffer once nobuffer() was called and it is exhausted.

        seek/tell/read/readline/readlines are then rebound on the instance
        to the wrapped file's own methods (seek and tell become None when
        the wrapped file lacks them), and the buffer is deleted.
        """
        if self._use_buffer:
            return
        if self.buffer_file.tell() != len(self.buffer_file.getvalue()):
            return
        # I'm doing this for the slightly better performance.
        self.seek = getattr(self.file, "seek", None)
        self.tell = getattr(self.file, "tell", None)
        self.read = self.file.read
        self.readline = self.file.readline
        self.readlines = self.file.readlines
        del self.buffer_file

    def nobuffer(self):
        """Tell the ReseekFile to stop using the buffer once it's exhausted."""
        self._use_buffer = 0
190
def prepare_input_source(source):
    """Given a URL, return an xml.sax.xmlreader.InputSource.

    Works like xml.sax.saxutils.prepare_input_source.  If the stream of
    the resulting InputSource does not support tell(), it is wrapped in a
    ReseekFile and installed as the byte stream (the character stream is
    cleared).

    To turn the buffer off if that happens, you'll need to do
    something like

       f = source.getCharacterStream()
       ...
       try:
          f.nobuffer()
       except AttributeError:
          pass

    or

       if isinstance(f, ReseekFile):
         f.nobuffer()

    """
    from xml.sax import saxutils
    source = saxutils.prepare_input_source(source)
    # Is this correct?  Don't know - don't have Unicode experience
    f = source.getCharacterStream() or source.getByteStream()
    try:
        f.tell()
    except (AttributeError, IOError):
        # Within this module 'ReseekFile' is the class itself, not the
        # module, so it must be called directly.
        f = ReseekFile(f)
        source.setByteStream(f)
        source.setCharacterStream(None)
    return source
225
226 -def test_reads(test_s, file, seek0):
227 assert file.read(2) == "Th" 228 assert file.read(3) == "is " 229 assert file.read(4) == "is a" 230 assert file.read(0) == "" 231 assert file.read(0) == "" 232 assert file.read(6) == " test." 233 file.seek(seek0) 234 assert file.read(2) == "Th" 235 assert file.read(3) == "is " 236 assert file.read(4) == "is a" 237 assert file.read(0) == "" 238 assert file.read(0) == "" 239 assert file.read(6) == " test." 240 assert file.read(1) == "\n" 241 assert file.read(5) == "12345" 242 assert file.read() == "67890\n" 243 file.seek(seek0) 244 assert file.read() == test_s 245 file.seek(seek0)
246 247
def _open_offset_reseek(s):
    """Return (rf, start) for a ReseekFile over a stream already at offset 1."""
    raw = StringIO("X" + s)
    raw.read(1)
    rf = ReseekFile(raw)
    return rf, rf.tell()


def test():
    """Self-test for ReseekFile; raises AssertionError on failure."""
    s = "This is a test.\n1234567890\n"
    raw = StringIO(s)
    # Test with a normal file
    x = raw.tell()
    test_reads(s, raw, x)
    test_reads(s, raw, x)

    # Test with a ReseekFile wrapper
    rf = ReseekFile(raw)
    y = rf.tell()
    rf.seek(y)
    test_reads(s, rf, y)
    assert rf.read() == s
    assert rf.read() == ""

    # Make sure the tell offset is correct (may not be 0)
    rf, y = _open_offset_reseek(s)
    test_reads(s, rf, y)
    rf.seek(y)
    test_reads(s, rf, y)
    assert rf.read() == s
    assert rf.read() == ""

    # Test the ability to turn off buffering and have changes
    # propagate correctly
    rf, y = _open_offset_reseek(s)
    assert y == 1
    rf.read(1000)
    rf.seek(y)
    rf.nobuffer()
    assert rf.tell() == y
    test_reads(s, rf, y)
    rf.seek(y)
    test_reads(s, rf, y)
    assert rf.read() == s
    assert rf.read() == ""

    # turn off buffering after partial reads
    rf, y = _open_offset_reseek(s)
    rf.read(5)
    rf.seek(y)
    rf.nobuffer()
    assert rf.read() == s

    rf, y = _open_offset_reseek(s)
    t = rf.read(5)
    rf.seek(y)
    rf.nobuffer()
    assert rf.read(5) == t

    rf, y = _open_offset_reseek(s)
    t = rf.read(5)
    assert t == s[:5]
    rf.seek(y)
    rf.nobuffer()
    assert rf.read(8) == s[:8]

    rf, y = _open_offset_reseek(s)
    t = rf.read(5)
    assert t == s[:5]
    rf.nobuffer()
    assert rf.read(8) == s[5:5+8]

    # Should only do this test on Unix systems
    import os
    infile = os.popen("echo HELLO_THERE")
    try:
        infile.read(1)
        rf = ReseekFile(infile)
        y = rf.tell()
        assert rf.read(1) == "E"
        assert rf.read(2) == "LL"
        rf.seek(y)
        assert rf.read(4) == "ELLO"
        rf.seek(y)
        assert rf.read(1) == "E"
        rf.nobuffer()
        assert rf.read(1) == "L"
        assert rf.read(4) == "LO_T"
        assert rf.read(4) == "HERE"
        # The buffer is gone now, so seek/tell go to the pipe itself,
        # which is expected to refuse them.
        try:
            rf.seek(y)
            raise AssertionError("Cannot seek here!")
        except IOError:
            pass
        try:
            rf.tell()
            raise AssertionError("Cannot tell here!")
        except IOError:
            pass
    finally:
        # Always release the pipe, even when an assertion fails.
        infile.close()

    # Check if readline/readlines works
    s = "This is line 1.\nAnd line 2.\nAnd now, page 3!"
    raw = StringIO(s)
    rf = ReseekFile(raw)
    rf.read(1)
    assert rf.readline() == "his is line 1.\n"
    rf.seek(0)
    assert rf.readline() == "This is line 1.\n"
    rf.read(2)
    assert rf.readline() == "d line 2.\n"
    rf.seek(0)
    assert rf.readlines() == ["This is line 1.\n",
                              "And line 2.\n",
                              "And now, page 3!"]

    rf.seek(0)
    rf.read(len(s))
    assert rf.readlines() == []
    rf.seek(0)

    # Now there is a final newline
    s = "This is line 1.\nAnd line 2.\nAnd now, page 3!\n"
    rf = ReseekFile(StringIO(s))
    rf.read(1)
    rf.seek(0)
    rf.nobuffer()
    assert rf.readlines() == ["This is line 1.\n",
                              "And line 2.\n",
                              "And now, page 3!\n"]
# Run the self-test when this module is executed as a script.
if __name__ == "__main__":
    test()