PyXR

c:\python24\lib \ gzip.py



0001 """Functions that read and write gzipped files.
0002 
0003 The user of the file doesn't have to worry about the compression,
0004 but random access is not allowed."""
0005 
0006 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
0007 
0008 import struct, sys, time
0009 import zlib
0010 import __builtin__
0011 
0012 __all__ = ["GzipFile","open"]
0013 
# Bit masks tested against (or written into) the flag byte of a gzip
# member header; see _read_gzip_header() and _write_gzip_header().
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record how the object was opened.
READ = 1
WRITE = 2
0017 
def U32(i):
    """Reinterpret a signed 32-bit value as the matching unsigned value.

    Non-negative arguments are returned untouched.  Negative arguments
    have 2**32 added, so the result may no longer fit in a 32-bit signed
    int (i.e. it can be >= 2GB).
    """
    if i >= 0:
        return i
    return i + (1 << 32)
0026 
def LOWU32(i):
    """Return the low-order 32 bits of i as a non-negative integer."""
    low32_mask = 0xFFFFFFFF
    return i & low32_mask
0030 
def write32(output, value):
    """Write value to output as a little-endian signed 32-bit integer."""
    packed = struct.pack("<l", value)
    output.write(packed)
0033 
def write32u(output, value):
    """Write value to output as a little-endian unsigned 32-bit integer."""
    # The "L" format emits the right bit pattern whether the caller
    # thinks of the value as signed or unsigned.
    packed = struct.pack("<L", value)
    output.write(packed)
0038 
def read32(input):
    """Read four bytes from input and decode them as a little-endian
    signed 32-bit integer."""
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
0041 
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip file; shorthand for GzipFile(filename, mode, compresslevel).

    filename must be supplied.  mode defaults to 'rb' and compresslevel
    defaults to 9.

    """
    gzfile = GzipFile(filename, mode, compresslevel)
    return gzfile
0050 
class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (stays None when the
    # caller supplied fileobj).  close() closes it.
    myfileobj = None

    # Upper bound on the number of compressed bytes read() asks _read() to
    # fetch in one go, so the doubling chunk size cannot grow without limit.
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Decompressed data that has been produced but not yet
            # handed to the caller, and its length.
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative window bits: raw deflate stream, because the gzip
            # header and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Position in the uncompressed stream, as reported by tell().
        self.offset = 0

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        """Return '<gzip ... 0x...>' built from the underlying file's repr."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the per-member write state (name, CRC, size, buffers)."""
        # self.filename always ends in '.gz'; _write_gzip_header() strips
        # the suffix again to obtain the name stored in the header.
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = self.filename[:-3]                 # drop the '.gz' suffix
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(time.time()))  # modification time
        self.fileobj.write('\002')                 # extra-flags byte
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')     # NUL-terminated name

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_gzip_header(self):
        """Read and validate a gzip member header, skipping optional fields.

        Raises IOError if the magic number or compression method is wrong.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        # Skip, in order: modtime (4 bytes), extraflag (1 byte), os (1 byte)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            # (length is a little-endian 16-bit value)
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC


    def write(self,data):
        """Compress data and write it to the underlying file object.

        Raises IOError(EBADF) if the object was opened for reading and
        ValueError if it has been closed.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size bytes of uncompressed data.

        A negative size (the default) reads everything up to end of file.
        Fewer than size bytes are returned only when the end of the data
        is reached.  Raises IOError(EBADF) on a write-only object.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        # Start with small requests and double them each round, but never
        # beyond max_read_chunk.
        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back onto the front of the pending-data buffer."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read size compressed bytes and append the decompressed result
        to the pending-data buffer.

        Raises EOFError when there is nothing more to read, and IOError
        (via _read_gzip_header / _read_eof) on a malformed member.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Account for freshly decompressed data: update the CRC and size
        and append it to the pending-data buffer."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the member trailer.

        Raises IOError if either value disagrees with what was computed
        while decompressing.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = U32(read32(self.fileobj))   # may exceed 2GB
        if U32(crc32) != U32(self.crc):
            raise IOError("CRC check failed")
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the gzip stream and release the underlying file.

        In write mode the compressor is flushed and the CRC/size trailer
        is written.  A file object opened by the constructor is closed; a
        caller-supplied fileobj is left open.  Calling close() again is a
        no-op.
        """
        # Detach the file object first so that a repeated close() (or the
        # one issued by __del__) never writes a second trailer and never
        # touches a None file object.
        fileobj = self.fileobj
        self.fileobj = None
        try:
            if fileobj is not None and self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, LOWU32(self.size))
        finally:
            # Close our own file even if writing the trailer failed.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def __del__(self):
        """Close the object on garbage collection if it is still open."""
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            # __init__ did not get far enough to set fileobj.
            return
        self.close()

    def flush(self):
        """Flush the underlying file object.

        Note that data still buffered inside the compressor is not
        forced out by this call.
        """
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        """Always return False."""
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset, whence=0):
        """Move to a position in the uncompressed stream.

        whence is 0 (default, offset is absolute) or 1 (offset is relative
        to the current position); seeking from the end is not supported
        and raises ValueError.

        In write mode only forward seeks are possible (the gap is filled
        with NUL bytes); a backward seek raises IOError.  In read mode a
        backward seek rewinds and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        """Read and return one line, including its trailing newline.

        At most size bytes are returned when size is non-negative.  An
        empty string is returned at end of file.
        """
        if size < 0: size = sys.maxint
        bufs = []
        readsize = min(100, size)    # Read from the file in small chunks
        while True:
            if size == 0:
                return "".join(bufs) # Return resulting line

            c = self.read(readsize)
            i = c.find('\n')
            # We set i=size-1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if i==-1 and len(c) > size: i=size-1
            elif size <= i: i = size -1

            if i >= 0 or c == '':
                bufs.append(c[:i+1])    # Add portion of last chunk
                self._unread(c[i+1:])   # Push back rest of chunk
                return ''.join(bufs)    # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        """Return a list of lines read from the file.

        Reading stops at end of file, or once the total length of the
        lines read reaches a positive sizehint.
        """
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        """Write each string in the iterable L, in order."""
        for line in L:
            self.write(line)

    def __iter__(self):
        """Return self; iteration yields lines via next()."""
        return self

    def next(self):
        """Return the next line, raising StopIteration at end of file."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration
0434 
0435 
0436 def _test():
0437     # Act like gzip; with -d, act like gunzip.
0438     # The input file is not deleted, however, nor are any other gzip
0439     # options or features supported.
0440     args = sys.argv[1:]
0441     decompress = args and args[0] == "-d"
0442     if decompress:
0443         args = args[1:]
0444     if not args:
0445         args = ["-"]
0446     for arg in args:
0447         if decompress:
0448             if arg == "-":
0449                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
0450                 g = sys.stdout
0451             else:
0452                 if arg[-3:] != ".gz":
0453                     print "filename doesn't end in .gz:", repr(arg)
0454                     continue
0455                 f = open(arg, "rb")
0456                 g = __builtin__.open(arg[:-3], "wb")
0457         else:
0458             if arg == "-":
0459                 f = sys.stdin
0460                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
0461             else:
0462                 f = __builtin__.open(arg, "rb")
0463                 g = open(arg + ".gz", "wb")
0464         while True:
0465             chunk = f.read(1024)
0466             if not chunk:
0467                 break
0468             g.write(chunk)
0469         if g is not sys.stdout:
0470             g.close()
0471         if f is not sys.stdin:
0472             f.close()
0473 
# Run the command-line driver when this module is executed as a script.
if __name__ == "__main__":
    _test()
0476 

Generated by PyXR 0.9.4
SourceForge.net Logo