"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import errno
import struct
import sys
import time
import zlib

try:
    import __builtin__
except ImportError:
    # Python 3 renamed the module; keep the historical local name so the
    # rest of this file (which shadows open()) is unchanged.
    import builtins as __builtin__

__all__ = ["GzipFile", "open"]

# Bit masks for the flag byte of the gzip header.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Internal values stored in GzipFile.mode.
READ, WRITE = 1, 2

# Types accepted as a file name for the header.
try:
    _STRING_TYPES = (basestring,)
except NameError:
    _STRING_TYPES = (str, bytes)

# Stand-in for "no size limit" in readline()/readlines().
_NO_LIMIT = getattr(sys, "maxint", sys.maxsize)


def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    A negative value is interpreted as the two's-complement bit pattern
    of a 32-bit integer and shifted into the range 0 .. 2**32-1.
    """
    if i < 0:
        i += 1 << 32
    return i


def LOWU32(i):
    """Return the low-order 32 bits of an int, as a non-negative int."""
    return i & 0xFFFFFFFF


def write32(output, value):
    """Write value to output as a little-endian signed 32-bit integer."""
    output.write(struct.pack("<l", value))


def write32u(output, value):
    """Write value to output as a little-endian unsigned 32-bit integer."""
    output.write(struct.pack("<L", value))


def read32(input):
    """Read 4 bytes from input; return them as a little-endian signed int."""
    return struct.unpack("<l", input.read(4))[0]


def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)


class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None

    # Upper bound for a single read request issued to the underlying file.
    # read() doubles its request size on every iteration; without a cap the
    # request grows without limit on large files.
    max_read_chunk = 10 * 1024 * 1024

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError if the mode starts with anything other than
        'r', 'w' or 'a'.
        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename,
                                                        mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            # Some file objects expose a non-string name (for example an
            # integer descriptor); that cannot be stored in the header.
            if not isinstance(filename, _STRING_TYPES):
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            self.extrabuf = b""
            self.extrasize = 0
            self.filename = filename

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file that this constructor opened itself.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the write-side state and record the '.gz' file name."""
        # Pick the suffix literal that can be concatenated with filename.
        suffix = '.gz' if isinstance(filename, str) else b'.gz'
        if filename[-3:] != suffix:
            filename = filename + suffix
        self.filename = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the member header, including the original file name if any."""
        fileobj = self.fileobj
        fileobj.write(b'\037\213')             # magic header
        fileobj.write(b'\010')                 # compression method
        # self.filename always ends in '.gz' (see _init_write); the header
        # stores the name without that suffix.
        fname = self.filename[:-3]
        if not isinstance(fname, bytes):
            try:
                fname = fname.encode('latin-1')
            except UnicodeEncodeError:
                # Not representable in the header: omit the name.
                fname = b''
        flags = 0
        if fname:
            flags = FNAME
        fileobj.write(struct.pack("B", flags))
        write32u(fileobj, LOWU32(int(time.time())))   # modification time
        fileobj.write(b'\002')                 # extra flags byte
        fileobj.write(b'\377')                 # OS byte
        if fname:
            fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size for a new member."""
        self.crc = zlib.crc32(b"")
        self.size = 0

    def _read_gzip_header(self):
        """Consume one member header from the underlying file.

        Raises IOError if the magic number or compression method is wrong.
        """
        magic = self.fileobj.read(2)
        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        # Skip modtime (4 bytes), extraflag (1 byte) and os (1 byte).
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256 * ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the
            # filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress data and write it to the underlying file.

        Raises IOError(EBADF) on a read-mode object and ValueError on a
        closed one.
        """
        if self.mode != WRITE:
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes; all of them if size < 0.

        Raises IOError(EBADF) on a write-mode object.
        """
        if self.mode != READ:
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size is None or size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back in front of the pending uncompressed data."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read size compressed bytes and buffer what they decompress to.

        Raises EOFError when there is nothing more to read.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8
            # bytes)
            self.fileobj.seek(-len(self.decompress.unused_data) + 8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the buffer; update CRC and size."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer against the computed CRC and size.

        Raises IOError on a mismatch.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = U32(read32(self.fileobj))   # may exceed 2GB
        if U32(crc32) != U32(self.crc):
            raise IOError("CRC check failed")
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the gzip member (write mode) and release the file objects.

        Only a file opened by the constructor itself is closed.  Calling
        close() more than once is a no-op.
        """
        fileobj = getattr(self, 'fileobj', None)
        self.fileobj = None
        try:
            if fileobj is not None and self.mode == WRITE:
                fileobj.write(self.compress.flush())
                # zlib.crc32 may return a signed or an unsigned value;
                # masking writes the same 32-bit pattern either way.
                write32u(fileobj, LOWU32(self.crc))
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, LOWU32(self.size))
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def __del__(self):
        try:
            if (self.myfileobj is None and
                    self.fileobj is None):
                return
        except AttributeError:
            return
        self.close()

    def flush(self):
        """Flush the underlying file object."""
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        """Always return False."""
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset, whence=0):
        """Move to an uncompressed offset by reading or writing data.

        whence may be 0 (absolute, the default) or 1 (relative to the
        current position); seeking from the end raises ValueError.  In
        write mode the gap is filled with NUL bytes and a backward seek
        raises IOError; in read mode a backward seek rewinds and re-reads.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * b'\0')
            self.write((count % 1024) * b'\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        """Return the next line, holding at most size bytes if size >= 0."""
        if size is None or size < 0:
            size = _NO_LIMIT
        bufs = []
        readsize = min(100, size)    # Read from the file in small chunks
        while True:
            if size == 0:
                return b"".join(bufs)    # Return resulting line

            c = self.read(readsize)
            i = c.find(b'\n')
            # Cut the chunk at 'size' under two conditions: 1) there's no
            # newline, and the chunk is larger than size, or 2) there is a
            # newline, but the resulting line would be longer than 'size'.
            if i == -1 and len(c) > size:
                i = size - 1
            elif size <= i:
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])     # Add portion of last chunk
                self._unread(c[i + 1:])    # Push back rest of chunk
                return b''.join(bufs)      # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        """Return a list of lines, stopping once sizehint bytes were read."""
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = _NO_LIMIT
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == b"":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        """Write every item of the iterable L."""
        for line in L:
            self.write(line)

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; raise StopIteration at end of data."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next


def _test():
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    # Use the binary layer of the standard streams where one exists.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=stdin)
                g = stdout
            else:
                if arg[-3:] != ".gz":
                    sys.stdout.write("filename doesn't end in .gz: %s\n"
                                     % repr(arg))
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = stdin
                g = GzipFile(filename="", mode="wb", fileobj=stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        if g is not stdout:
            g.close()
        if f is not stdin:
            f.close()


if __name__ == '__main__':
    _test()

# Generated by PyXR 0.9.4