PyXR

c:\python24\lib\codecs.py


0001 """ codecs -- Python Codec Registry, API and helpers.
0002 
0003 
0004 Written by Marc-Andre Lemburg (mal@lemburg.com).
0005 
0006 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
0007 
0008 """#"
0009 
0010 import __builtin__, sys
0011 
0012 ### Registry and builtin stateless codec functions
0013 
0014 try:
0015     from _codecs import *
0016 except ImportError, why:
0017     raise SystemError,\
0018           'Failed to load the builtin codecs: %s' % why
0019 
# Names exported by ``from codecs import *``.  backslashreplace_errors is
# listed together with the other predefined error handlers that this
# module binds near the end of the file.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
0027 
0028 ### Constants
0029 
0030 #
0031 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
0032 # and its possible byte string values
0033 # for UTF8/UTF16/UTF32 output and little/big endian machines
0034 #
0035 
0036 # UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16: one mark per byte order, plus the short historical aliases
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32: one mark per byte order
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native-endianness variants, chosen from the byte order of this machine
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE
BOM = BOM_UTF16

# Old broken names (don't use in new code): "32" refers to the UTF-16
# marks and "64" to the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
0072 
0073 
0074 ### Codec base classes (defining the API)
0075 
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument which
        selects the error handling scheme. These names are predefined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - skip the offending character and carry on
         'replace' - substitute a suitable replacement character;
                     the builtin Unicode codecs use the official
                     U+FFFD REPLACEMENT CHARACTER when decoding and
                     '?' when encoding.
         'xmlcharrefreplace' - substitute the appropriate XML
                               character reference (encoding only).
         'backslashreplace'  - substitute backslashed escape sequences
                               (encoding only).

        More names can be made available through register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input.

            Returns a tuple (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to work efficiently belong in the
            stream classes (StreamWriter/StreamReader).

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input.

            Returns a tuple (output object, length consumed).

            input must provide the bf_getreadbuf buffer slot; Python
            strings, buffer objects and memory mapped files are
            examples of such objects.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to work efficiently belong in the
            stream classes (StreamWriter/StreamReader).

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
0139 
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
0146 
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding. These names are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the offending character and carry on
             'replace'- substitute a suitable replacement character
             'xmlcharrefreplace' - substitute the appropriate XML
                                   character reference.
             'backslashreplace'  - substitute backslashed escape
                                   sequences (only for encoding).

            More names can be made available through register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the result to
            self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and hand the result to .write()
            in a single call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After the call the output should be in a clean state, so
            that fresh data can be appended without rescanning the
            whole stream to recover state.

            The base class keeps no state, so nothing is done here.

        """

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the writer to the
            underlying stream.
        """
        target = self.stream
        return getattr(target, name)
0206 
0207 ###
0208 
0209 class StreamReader(Codec):
0210 
0211     def __init__(self, stream, errors='strict'):
0212 
0213         """ Creates a StreamReader instance.
0214 
0215             stream must be a file-like object open for reading
0216             (binary) data.
0217 
0218             The StreamReader may use different error handling
0219             schemes by providing the errors keyword argument. These
0220             parameters are predefined:
0221 
0222              'strict' - raise a ValueError (or a subclass)
0223              'ignore' - ignore the character and continue with the next
0224              'replace'- replace with a suitable replacement character;
0225 
0226             The set of allowed parameter values can be extended via
0227             register_error.
0228         """
0229         self.stream = stream
0230         self.errors = errors
0231         self.bytebuffer = ""
0232         self.charbuffer = u""
0233 
0234     def decode(self, input, errors='strict'):
0235         raise NotImplementedError
0236 
0237     def read(self, size=-1, chars=-1):
0238 
0239         """ Decodes data from the stream self.stream and returns the
0240             resulting object.
0241 
0242             chars indicates the number of characters to read from the
0243             stream. read() will never return more than chars
0244             characters, but it might return less, if there are not enough
0245             characters available.
0246 
0247             size indicates the approximate maximum number of bytes to
0248             read from the stream for decoding purposes. The decoder
0249             can modify this setting as appropriate. The default value
0250             -1 indicates to read and decode as much as possible.  size
0251             is intended to prevent having to decode huge files in one
0252             step.
0253 
0254             The method should use a greedy read strategy meaning that
0255             it should read as much data as is allowed within the
0256             definition of the encoding and the given size, e.g.  if
0257             optional encoding endings or state markers are available
0258             on the stream, these should be read too.
0259 
0260         """
0261         # read until we get the required number of characters (if available)
0262         done = False
0263         while True:
0264             # can the request can be satisfied from the character buffer?
0265             if chars < 0:
0266                 if self.charbuffer:
0267                     done = True
0268             else:
0269                 if len(self.charbuffer) >= chars:
0270                     done = True
0271             if done:
0272                 if chars < 0:
0273                     result = self.charbuffer
0274                     self.charbuffer = u""
0275                     break
0276                 else:
0277                     result = self.charbuffer[:chars]
0278                     self.charbuffer = self.charbuffer[chars:]
0279                     break
0280             # we need more data
0281             if size < 0:
0282                 newdata = self.stream.read()
0283             else:
0284                 newdata = self.stream.read(size)
0285             data = self.bytebuffer + newdata
0286             object, decodedbytes = self.decode(data, self.errors)
0287             # keep undecoded bytes until the next call
0288             self.bytebuffer = data[decodedbytes:]
0289             # put new characters in the character buffer
0290             self.charbuffer += object
0291             # there was no data available
0292             if not newdata:
0293                 done = True
0294         return result
0295 
0296     def readline(self, size=None, keepends=True):
0297 
0298         """ Read one line from the input stream and return the
0299             decoded data.
0300 
0301             size, if given, is passed as size argument to the
0302             read() method.
0303 
0304         """
0305         if size is None:
0306             size = 10
0307         line = u""
0308         while True:
0309             data = self.read(size)
0310             line += data
0311             pos = line.find("\n")
0312             if pos>=0:
0313                 self.charbuffer = line[pos+1:] + self.charbuffer
0314                 if keepends:
0315                     line = line[:pos+1]
0316                 else:
0317                     line = line[:pos]
0318                 return line
0319             elif not data:
0320                 return line
0321             if size<8000:
0322                 size *= 2
0323 
0324     def readlines(self, sizehint=None, keepends=True):
0325 
0326         """ Read all lines available on the input stream
0327             and return them as list of lines.
0328 
0329             Line breaks are implemented using the codec's decoder
0330             method and are included in the list entries.
0331 
0332             sizehint, if given, is ignored since there is no efficient
0333             way to finding the true end-of-line.
0334 
0335         """
0336         data = self.read()
0337         return data.splitlines(keepends)
0338 
0339     def reset(self):
0340 
0341         """ Resets the codec buffers used for keeping state.
0342 
0343             Note that no stream repositioning should take place.
0344             This method is primarily intended to be able to recover
0345             from decoding errors.
0346 
0347         """
0348         pass
0349 
0350     def next(self):
0351 
0352         """ Return the next decoded line from the input stream."""
0353         line = self.readline()
0354         if line:
0355             return line
0356         raise StopIteration
0357 
0358     def __iter__(self):
0359         return self
0360 
0361     def __getattr__(self, name,
0362                     getattr=getattr):
0363 
0364         """ Inherit all other methods from the underlying stream.
0365         """
0366         return getattr(self.stream, name)
0367 
0368 ###
0369 
0370 class StreamReaderWriter:
0371 
0372     """ StreamReaderWriter instances allow wrapping streams which
0373         work in both read and write modes.
0374 
0375         The design is such that one can use the factory functions
0376         returned by the codec.lookup() function to construct the
0377         instance.
0378 
0379     """
0380     # Optional attributes set by the file wrappers below
0381     encoding = 'unknown'
0382 
0383     def __init__(self, stream, Reader, Writer, errors='strict'):
0384 
0385         """ Creates a StreamReaderWriter instance.
0386 
0387             stream must be a Stream-like object.
0388 
0389             Reader, Writer must be factory functions or classes
0390             providing the StreamReader, StreamWriter interface resp.
0391 
0392             Error handling is done in the same way as defined for the
0393             StreamWriter/Readers.
0394 
0395         """
0396         self.stream = stream
0397         self.reader = Reader(stream, errors)
0398         self.writer = Writer(stream, errors)
0399         self.errors = errors
0400 
0401     def read(self, size=-1):
0402 
0403         return self.reader.read(size)
0404 
0405     def readline(self, size=None):
0406 
0407         return self.reader.readline(size)
0408 
0409     def readlines(self, sizehint=None):
0410 
0411         return self.reader.readlines(sizehint)
0412 
0413     def next(self):
0414 
0415         """ Return the next decoded line from the input stream."""
0416         return self.reader.next()
0417 
0418     def __iter__(self):
0419         return self
0420 
0421     def write(self, data):
0422 
0423         return self.writer.write(data)
0424 
0425     def writelines(self, list):
0426 
0427         return self.writer.writelines(list)
0428 
0429     def reset(self):
0430 
0431         self.reader.reset()
0432         self.writer.reset()
0433 
0434     def __getattr__(self, name,
0435                     getattr=getattr):
0436 
0437         """ Inherit all other methods from the underlying stream.
0438         """
0439         return getattr(self.stream, name)
0440 
0441 ###
0442 
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read from the backend reader and return the data
            re-encoded with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line from the backend reader and return it
            re-encoded with the frontend encoder.

            size is only passed on to the reader if it is given.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything from the backend reader, re-encode it
            with the frontend encoder and return it split into lines
            (line ends included).

            sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encoder.
        """
        data = self.reader.next()
        # The reader yields intermediate (decoded) data; callers of the
        # recoder get frontend-encoded data, exactly as from readline().
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        # Iteration yields lines through next().
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the
            result through the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the strings in list, decode the result with the
            frontend decoder and write it through the backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and the backend writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
0548 
0549 ### Shortcuts
0550 
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is passed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the file is closed again before the exception is passed on.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        codec = lookup(encoding)
        # entries 2 and 3 of the lookup result are the stream reader
        # and stream writer factories
        srw = StreamReaderWriter(file, codec[2], codec[3], errors)
    except:
        # Don't leave the file open when no wrapper can be returned
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
0594 
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file so that it translates transparently between two
        encodings.

        Strings written to the wrapper are taken to be in
        data_encoding and end up in the original file as strings in
        file_encoding. The intermediate format will usually be
        Unicode but depends on the specified codecs.

        Strings read through the wrapper are read from the file
        using file_encoding and handed to the caller as strings in
        data_encoding.

        file_encoding defaults to data_encoding if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The wrapper carries two extra attributes, .data_encoding and
        .file_encoding, holding the values of the parameters of the
        same name for introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend: stateless functions of the data encoding
    to_data, from_data = lookup(data_encoding)[:2]
    # backend: stream factories of the file encoding
    make_reader, make_writer = lookup(file_encoding)[2:]
    recoder = StreamRecoder(file, to_data, from_data,
                            make_reader, make_writer, errors)
    # Add attributes to simplify introspection
    recoder.file_encoding = file_encoding
    recoder.data_encoding = data_encoding
    return recoder
0631 
0632 ### Helpers for codec lookup
0633 
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[0]
0643 
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[1]
0653 
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[2]
0663 
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[3]
0673 
0674 ### Helpers for charmap-based codecs
0675 
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both a key and the value stored under that key.

    """
    return dict((item, item) for item in rng)
0688 
def make_encoding_map(decoding_map):

    """ Build an encoding map by inverting a decoding map.

        A target that appears more than once in the decoding map
        cannot be inverted unambiguously; it is mapped to None
        (undefined mapping), which makes the charmap codec raise an
        exception when it meets that target during translation.

        cp875.py is one example: it decodes several characters to
        U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # second or later occurrence: mark the target as undefined
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
0709 
0710 ### error handlers
0711 
# Bind each predefined error handling callback to a module-level name;
# the callbacks are fetched from the registry by their scheme name.
strict_errors = lookup_error('strict')
ignore_errors = lookup_error('ignore')
replace_errors = lookup_error('replace')
xmlcharrefreplace_errors = lookup_error('xmlcharrefreplace')
backslashreplace_errors = lookup_error('backslashreplace')
0717 
# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is bound to 0, so the import below is never executed
# at runtime; it is only there to be seen by tools that scan the module.
# NOTE(review): presumably a name is tested instead of a literal 0 so
# that the import statement is not dropped when the module is compiled
# -- confirm before simplifying.
_false = 0
if _false:
    import encodings
0723 
0724 ### Tests
0725 
if __name__ == '__main__':

    # stdout: data written by the program is taken as Latin-1 and
    # reaches the real stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # stdin: data on the real stdin is read as Latin-1 and handed to
    # the program as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')
0733
Generated by PyXR 0.9.4