0001 """ codecs -- Python Codec Registry, API and helpers. 0002 0003 0004 Written by Marc-Andre Lemburg (mal@lemburg.com). 0005 0006 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 0007 0008 """#" 0009 0010 import __builtin__, sys 0011 0012 ### Registry and builtin stateless codec functions 0013 0014 try: 0015 from _codecs import * 0016 except ImportError, why: 0017 raise SystemError,\ 0018 'Failed to load the builtin codecs: %s' % why 0019 0020 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", 0021 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", 0022 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", 0023 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", 0024 "strict_errors", "ignore_errors", "replace_errors", 0025 "xmlcharrefreplace_errors", 0026 "register_error", "lookup_error"] 0027 0028 ### Constants 0029 0030 # 0031 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) 0032 # and its possible byte string values 0033 # for UTF8/UTF16/UTF32 output and little/big endian machines 0034 # 0035 0036 # UTF-8 0037 BOM_UTF8 = '\xef\xbb\xbf' 0038 0039 # UTF-16, little endian 0040 BOM_LE = BOM_UTF16_LE = '\xff\xfe' 0041 0042 # UTF-16, big endian 0043 BOM_BE = BOM_UTF16_BE = '\xfe\xff' 0044 0045 # UTF-32, little endian 0046 BOM_UTF32_LE = '\xff\xfe\x00\x00' 0047 0048 # UTF-32, big endian 0049 BOM_UTF32_BE = '\x00\x00\xfe\xff' 0050 0051 if sys.byteorder == 'little': 0052 0053 # UTF-16, native endianness 0054 BOM = BOM_UTF16 = BOM_UTF16_LE 0055 0056 # UTF-32, native endianness 0057 BOM_UTF32 = BOM_UTF32_LE 0058 0059 else: 0060 0061 # UTF-16, native endianness 0062 BOM = BOM_UTF16 = BOM_UTF16_BE 0063 0064 # UTF-32, native endianness 0065 BOM_UTF32 = BOM_UTF32_BE 0066 0067 # Old broken names (don't use in new code) 0068 BOM32_LE = BOM_UTF16_LE 0069 BOM32_BE = BOM_UTF16_BE 0070 BOM64_LE = BOM_UTF32_LE 0071 BOM64_BE = BOM_UTF32_BE 0072 0073 0074 ### Codec base classes (defining the API) 0075 0076 class Codec: 0077 0078 """ Defines the interface for stateless encoders/decoders. 0079 0080 The .encode()/.decode() methods may use different error 0081 handling schemes by providing the errors argument. These 0082 string values are predefined: 0083 0084 'strict' - raise a ValueError error (or a subclass) 0085 'ignore' - ignore the character and continue with the next 0086 'replace' - replace with a suitable replacement character; 0087 Python will use the official U+FFFD REPLACEMENT 0088 CHARACTER for the builtin Unicode codecs on 0089 decoding and '?' on encoding. 0090 'xmlcharrefreplace' - Replace with the appropriate XML 0091 character reference (only for encoding). 0092 'backslashreplace' - Replace with backslashed escape sequences 0093 (only for encoding). 0094 0095 The set of allowed values can be extended via register_error. 0096 0097 """ 0098 def encode(self, input, errors='strict'): 0099 0100 """ Encodes the object input and returns a tuple (output 0101 object, length consumed). 0102 0103 errors defines the error handling to apply. It defaults to 0104 'strict' handling. 0105 0106 The method may not store state in the Codec instance. Use 0107 StreamCodec for codecs which have to keep state in order to 0108 make encoding/decoding efficient. 0109 0110 The encoder must be able to handle zero length input and 0111 return an empty object of the output object type in this 0112 situation. 0113 0114 """ 0115 raise NotImplementedError 0116 0117 def decode(self, input, errors='strict'): 0118 0119 """ Decodes the object input and returns a tuple (output 0120 object, length consumed). 0121 0122 input must be an object which provides the bf_getreadbuf 0123 buffer slot. Python strings, buffer objects and memory 0124 mapped files are examples of objects providing this slot. 0125 0126 errors defines the error handling to apply. It defaults to 0127 'strict' handling. 0128 0129 The method may not store state in the Codec instance. Use 0130 StreamCodec for codecs which have to keep state in order to 0131 make encoding/decoding efficient. 0132 0133 The decoder must be able to handle zero length input and 0134 return an empty object of the output object type in this 0135 situation. 0136 0137 """ 0138 raise NotImplementedError 0139 0140 # 0141 # The StreamWriter and StreamReader class provide generic working 0142 # interfaces which can be used to implement new encoding submodules 0143 # very easily. See encodings/utf_8.py for an example on how this is 0144 # done. 0145 # 0146 0147 class StreamWriter(Codec): 0148 0149 def __init__(self, stream, errors='strict'): 0150 0151 """ Creates a StreamWriter instance. 0152 0153 stream must be a file-like object open for writing 0154 (binary) data. 0155 0156 The StreamWriter may use different error handling 0157 schemes by providing the errors keyword argument. These 0158 parameters are predefined: 0159 0160 'strict' - raise a ValueError (or a subclass) 0161 'ignore' - ignore the character and continue with the next 0162 'replace'- replace with a suitable replacement character 0163 'xmlcharrefreplace' - Replace with the appropriate XML 0164 character reference. 0165 'backslashreplace' - Replace with backslashed escape 0166 sequences (only for encoding). 0167 0168 The set of allowed parameter values can be extended via 0169 register_error. 0170 """ 0171 self.stream = stream 0172 self.errors = errors 0173 0174 def write(self, object): 0175 0176 """ Writes the object's contents encoded to self.stream. 0177 """ 0178 data, consumed = self.encode(object, self.errors) 0179 self.stream.write(data) 0180 0181 def writelines(self, list): 0182 0183 """ Writes the concatenated list of strings to the stream 0184 using .write(). 0185 """ 0186 self.write(''.join(list)) 0187 0188 def reset(self): 0189 0190 """ Flushes and resets the codec buffers used for keeping state. 0191 0192 Calling this method should ensure that the data on the 0193 output is put into a clean state, that allows appending 0194 of new fresh data without having to rescan the whole 0195 stream to recover state. 0196 0197 """ 0198 pass 0199 0200 def __getattr__(self, name, 0201 getattr=getattr): 0202 0203 """ Inherit all other methods from the underlying stream. 0204 """ 0205 return getattr(self.stream, name) 0206 0207 ### 0208 0209 class StreamReader(Codec): 0210 0211 def __init__(self, stream, errors='strict'): 0212 0213 """ Creates a StreamReader instance. 0214 0215 stream must be a file-like object open for reading 0216 (binary) data. 0217 0218 The StreamReader may use different error handling 0219 schemes by providing the errors keyword argument. These 0220 parameters are predefined: 0221 0222 'strict' - raise a ValueError (or a subclass) 0223 'ignore' - ignore the character and continue with the next 0224 'replace'- replace with a suitable replacement character; 0225 0226 The set of allowed parameter values can be extended via 0227 register_error. 0228 """ 0229 self.stream = stream 0230 self.errors = errors 0231 self.bytebuffer = "" 0232 self.charbuffer = u"" 0233 0234 def decode(self, input, errors='strict'): 0235 raise NotImplementedError 0236 0237 def read(self, size=-1, chars=-1): 0238 0239 """ Decodes data from the stream self.stream and returns the 0240 resulting object. 0241 0242 chars indicates the number of characters to read from the 0243 stream. read() will never return more than chars 0244 characters, but it might return less, if there are not enough 0245 characters available. 0246 0247 size indicates the approximate maximum number of bytes to 0248 read from the stream for decoding purposes. The decoder 0249 can modify this setting as appropriate. The default value 0250 -1 indicates to read and decode as much as possible. size 0251 is intended to prevent having to decode huge files in one 0252 step. 0253 0254 The method should use a greedy read strategy meaning that 0255 it should read as much data as is allowed within the 0256 definition of the encoding and the given size, e.g. if 0257 optional encoding endings or state markers are available 0258 on the stream, these should be read too. 0259 0260 """ 0261 # read until we get the required number of characters (if available) 0262 done = False 0263 while True: 0264 # can the request can be satisfied from the character buffer? 0265 if chars < 0: 0266 if self.charbuffer: 0267 done = True 0268 else: 0269 if len(self.charbuffer) >= chars: 0270 done = True 0271 if done: 0272 if chars < 0: 0273 result = self.charbuffer 0274 self.charbuffer = u"" 0275 break 0276 else: 0277 result = self.charbuffer[:chars] 0278 self.charbuffer = self.charbuffer[chars:] 0279 break 0280 # we need more data 0281 if size < 0: 0282 newdata = self.stream.read() 0283 else: 0284 newdata = self.stream.read(size) 0285 data = self.bytebuffer + newdata 0286 object, decodedbytes = self.decode(data, self.errors) 0287 # keep undecoded bytes until the next call 0288 self.bytebuffer = data[decodedbytes:] 0289 # put new characters in the character buffer 0290 self.charbuffer += object 0291 # there was no data available 0292 if not newdata: 0293 done = True 0294 return result 0295 0296 def readline(self, size=None, keepends=True): 0297 0298 """ Read one line from the input stream and return the 0299 decoded data. 0300 0301 size, if given, is passed as size argument to the 0302 read() method. 0303 0304 """ 0305 if size is None: 0306 size = 10 0307 line = u"" 0308 while True: 0309 data = self.read(size) 0310 line += data 0311 pos = line.find("\n") 0312 if pos>=0: 0313 self.charbuffer = line[pos+1:] + self.charbuffer 0314 if keepends: 0315 line = line[:pos+1] 0316 else: 0317 line = line[:pos] 0318 return line 0319 elif not data: 0320 return line 0321 if size<8000: 0322 size *= 2 0323 0324 def readlines(self, sizehint=None, keepends=True): 0325 0326 """ Read all lines available on the input stream 0327 and return them as list of lines. 0328 0329 Line breaks are implemented using the codec's decoder 0330 method and are included in the list entries. 0331 0332 sizehint, if given, is ignored since there is no efficient 0333 way to finding the true end-of-line. 0334 0335 """ 0336 data = self.read() 0337 return data.splitlines(keepends) 0338 0339 def reset(self): 0340 0341 """ Resets the codec buffers used for keeping state. 0342 0343 Note that no stream repositioning should take place. 0344 This method is primarily intended to be able to recover 0345 from decoding errors. 0346 0347 """ 0348 pass 0349 0350 def next(self): 0351 0352 """ Return the next decoded line from the input stream.""" 0353 line = self.readline() 0354 if line: 0355 return line 0356 raise StopIteration 0357 0358 def __iter__(self): 0359 return self 0360 0361 def __getattr__(self, name, 0362 getattr=getattr): 0363 0364 """ Inherit all other methods from the underlying stream. 0365 """ 0366 return getattr(self.stream, name) 0367 0368 ### 0369 0370 class StreamReaderWriter: 0371 0372 """ StreamReaderWriter instances allow wrapping streams which 0373 work in both read and write modes. 0374 0375 The design is such that one can use the factory functions 0376 returned by the codec.lookup() function to construct the 0377 instance. 0378 0379 """ 0380 # Optional attributes set by the file wrappers below 0381 encoding = 'unknown' 0382 0383 def __init__(self, stream, Reader, Writer, errors='strict'): 0384 0385 """ Creates a StreamReaderWriter instance. 0386 0387 stream must be a Stream-like object. 0388 0389 Reader, Writer must be factory functions or classes 0390 providing the StreamReader, StreamWriter interface resp. 0391 0392 Error handling is done in the same way as defined for the 0393 StreamWriter/Readers. 0394 0395 """ 0396 self.stream = stream 0397 self.reader = Reader(stream, errors) 0398 self.writer = Writer(stream, errors) 0399 self.errors = errors 0400 0401 def read(self, size=-1): 0402 0403 return self.reader.read(size) 0404 0405 def readline(self, size=None): 0406 0407 return self.reader.readline(size) 0408 0409 def readlines(self, sizehint=None): 0410 0411 return self.reader.readlines(sizehint) 0412 0413 def next(self): 0414 0415 """ Return the next decoded line from the input stream.""" 0416 return self.reader.next() 0417 0418 def __iter__(self): 0419 return self 0420 0421 def write(self, data): 0422 0423 return self.writer.write(data) 0424 0425 def writelines(self, list): 0426 0427 return self.writer.writelines(list) 0428 0429 def reset(self): 0430 0431 self.reader.reset() 0432 self.writer.reset() 0433 0434 def __getattr__(self, name, 0435 getattr=getattr): 0436 0437 """ Inherit all other methods from the underlying stream. 0438 """ 0439 return getattr(self.stream, name) 0440 0441 ### 0442 0443 class StreamRecoder: 0444 0445 """ StreamRecoder instances provide a frontend - backend 0446 view of encoding data. 0447 0448 They use the complete set of APIs returned by the 0449 codecs.lookup() function to implement their task. 0450 0451 Data written to the stream is first decoded into an 0452 intermediate format (which is dependent on the given codec 0453 combination) and then written to the stream using an instance 0454 of the provided Writer class. 0455 0456 In the other direction, data is read from the stream using a 0457 Reader instance and then return encoded data to the caller. 0458 0459 """ 0460 # Optional attributes set by the file wrappers below 0461 data_encoding = 'unknown' 0462 file_encoding = 'unknown' 0463 0464 def __init__(self, stream, encode, decode, Reader, Writer, 0465 errors='strict'): 0466 0467 """ Creates a StreamRecoder instance which implements a two-way 0468 conversion: encode and decode work on the frontend (the 0469 input to .read() and output of .write()) while 0470 Reader and Writer work on the backend (reading and 0471 writing to the stream). 0472 0473 You can use these objects to do transparent direct 0474 recodings from e.g. latin-1 to utf-8 and back. 0475 0476 stream must be a file-like object. 0477 0478 encode, decode must adhere to the Codec interface, Reader, 0479 Writer must be factory functions or classes providing the 0480 StreamReader, StreamWriter interface resp. 0481 0482 encode and decode are needed for the frontend translation, 0483 Reader and Writer for the backend translation. Unicode is 0484 used as intermediate encoding. 0485 0486 Error handling is done in the same way as defined for the 0487 StreamWriter/Readers. 0488 0489 """ 0490 self.stream = stream 0491 self.encode = encode 0492 self.decode = decode 0493 self.reader = Reader(stream, errors) 0494 self.writer = Writer(stream, errors) 0495 self.errors = errors 0496 0497 def read(self, size=-1): 0498 0499 data = self.reader.read(size) 0500 data, bytesencoded = self.encode(data, self.errors) 0501 return data 0502 0503 def readline(self, size=None): 0504 0505 if size is None: 0506 data = self.reader.readline() 0507 else: 0508 data = self.reader.readline(size) 0509 data, bytesencoded = self.encode(data, self.errors) 0510 return data 0511 0512 def readlines(self, sizehint=None): 0513 0514 data = self.reader.read() 0515 data, bytesencoded = self.encode(data, self.errors) 0516 return data.splitlines(1) 0517 0518 def next(self): 0519 0520 """ Return the next decoded line from the input stream.""" 0521 return self.reader.next() 0522 0523 def __iter__(self): 0524 return self 0525 0526 def write(self, data): 0527 0528 data, bytesdecoded = self.decode(data, self.errors) 0529 return self.writer.write(data) 0530 0531 def writelines(self, list): 0532 0533 data = ''.join(list) 0534 data, bytesdecoded = self.decode(data, self.errors) 0535 return self.writer.write(data) 0536 0537 def reset(self): 0538 0539 self.reader.reset() 0540 self.writer.reset() 0541 0542 def __getattr__(self, name, 0543 getattr=getattr): 0544 0545 """ Inherit all other methods from the underlying stream. 0546 """ 0547 return getattr(self.stream, name) 0548 0549 ### Shortcuts 0550 0551 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1): 0552 0553 """ Open an encoded file using the given mode and return 0554 a wrapped version providing transparent encoding/decoding. 0555 0556 Note: The wrapped version will only accept the object format 0557 defined by the codecs, i.e. Unicode objects for most builtin 0558 codecs. Output is also codec dependent and will usually by 0559 Unicode as well. 0560 0561 Files are always opened in binary mode, even if no binary mode 0562 was specified. This is done to avoid data loss due to encodings 0563 using 8-bit values. The default file mode is 'rb' meaning to 0564 open the file in binary read mode. 0565 0566 encoding specifies the encoding which is to be used for the 0567 file. 0568 0569 errors may be given to define the error handling. It defaults 0570 to 'strict' which causes ValueErrors to be raised in case an 0571 encoding error occurs. 0572 0573 buffering has the same meaning as for the builtin open() API. 0574 It defaults to line buffered. 0575 0576 The returned wrapped file object provides an extra attribute 0577 .encoding which allows querying the used encoding. This 0578 attribute is only available if an encoding was specified as 0579 parameter. 0580 0581 """ 0582 if encoding is not None and \ 0583 'b' not in mode: 0584 # Force opening of the file in binary mode 0585 mode = mode + 'b' 0586 file = __builtin__.open(filename, mode, buffering) 0587 if encoding is None: 0588 return file 0589 (e, d, sr, sw) = lookup(encoding) 0590 srw = StreamReaderWriter(file, sr, sw, errors) 0591 # Add attributes to simplify introspection 0592 srw.encoding = encoding 0593 return srw 0594 0595 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): 0596 0597 """ Return a wrapped version of file which provides transparent 0598 encoding translation. 0599 0600 Strings written to the wrapped file are interpreted according 0601 to the given data_encoding and then written to the original 0602 file as string using file_encoding. The intermediate encoding 0603 will usually be Unicode but depends on the specified codecs. 0604 0605 Strings are read from the file using file_encoding and then 0606 passed back to the caller as string using data_encoding. 0607 0608 If file_encoding is not given, it defaults to data_encoding. 0609 0610 errors may be given to define the error handling. It defaults 0611 to 'strict' which causes ValueErrors to be raised in case an 0612 encoding error occurs. 0613 0614 The returned wrapped file object provides two extra attributes 0615 .data_encoding and .file_encoding which reflect the given 0616 parameters of the same name. The attributes can be used for 0617 introspection by Python programs. 0618 0619 """ 0620 if file_encoding is None: 0621 file_encoding = data_encoding 0622 encode, decode = lookup(data_encoding)[:2] 0623 Reader, Writer = lookup(file_encoding)[2:] 0624 sr = StreamRecoder(file, 0625 encode, decode, Reader, Writer, 0626 errors) 0627 # Add attributes to simplify introspection 0628 sr.data_encoding = data_encoding 0629 sr.file_encoding = file_encoding 0630 return sr 0631 0632 ### Helpers for codec lookup 0633 0634 def getencoder(encoding): 0635 0636 """ Lookup up the codec for the given encoding and return 0637 its encoder function. 0638 0639 Raises a LookupError in case the encoding cannot be found. 0640 0641 """ 0642 return lookup(encoding)[0] 0643 0644 def getdecoder(encoding): 0645 0646 """ Lookup up the codec for the given encoding and return 0647 its decoder function. 0648 0649 Raises a LookupError in case the encoding cannot be found. 0650 0651 """ 0652 return lookup(encoding)[1] 0653 0654 def getreader(encoding): 0655 0656 """ Lookup up the codec for the given encoding and return 0657 its StreamReader class or factory function. 0658 0659 Raises a LookupError in case the encoding cannot be found. 0660 0661 """ 0662 return lookup(encoding)[2] 0663 0664 def getwriter(encoding): 0665 0666 """ Lookup up the codec for the given encoding and return 0667 its StreamWriter class or factory function. 0668 0669 Raises a LookupError in case the encoding cannot be found. 0670 0671 """ 0672 return lookup(encoding)[3] 0673 0674 ### Helpers for charmap-based codecs 0675 0676 def make_identity_dict(rng): 0677 0678 """ make_identity_dict(rng) -> dict 0679 0680 Return a dictionary where elements of the rng sequence are 0681 mapped to themselves. 0682 0683 """ 0684 res = {} 0685 for i in rng: 0686 res[i]=i 0687 return res 0688 0689 def make_encoding_map(decoding_map): 0690 0691 """ Creates an encoding map from a decoding map. 0692 0693 If a target mapping in the decoding map occurs multiple 0694 times, then that target is mapped to None (undefined mapping), 0695 causing an exception when encountered by the charmap codec 0696 during translation. 0697 0698 One example where this happens is cp875.py which decodes 0699 multiple character to \u001a. 0700 0701 """ 0702 m = {} 0703 for k,v in decoding_map.items(): 0704 if not v in m: 0705 m[v] = k 0706 else: 0707 m[v] = None 0708 return m 0709 0710 ### error handlers 0711 0712 strict_errors = lookup_error("strict") 0713 ignore_errors = lookup_error("ignore") 0714 replace_errors = lookup_error("replace") 0715 xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") 0716 backslashreplace_errors = lookup_error("backslashreplace") 0717 0718 # Tell modulefinder that using codecs probably needs the encodings 0719 # package 0720 _false = 0 0721 if _false: 0722 import encodings 0723 0724 ### Tests 0725 0726 if __name__ == '__main__': 0727 0728 # Make stdout translate Latin-1 output into UTF-8 output 0729 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') 0730 0731 # Have stdin translate Latin-1 input into UTF-8 input 0732 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') 0733
Generated by PyXR 0.9.4