"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=1) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles
read_mime_types(file) -- parse one file, return a dictionary or None
"""

import os
import posixpath
import urllib

__all__ = [
    "guess_type","guess_extension","guess_all_extensions",
    "add_type","read_mime_types","init"
]

knownfiles = [
    "/etc/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

inited = False


class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- copy of the module-level encodings_map
    suffix_map    -- copy of the module-level suffix_map
    types_map     -- pair of dicts (non-strict, strict): extension -> type
    types_map_inv -- pair of dicts (non-strict, strict): type -> [extensions]

    Both pairs are indexed directly with the `strict' flag, so index
    False/0 holds the non-standard entries and True/1 the standard ones.
    """

    def __init__(self, filenames=(), strict=True):
        """Seed the store from the module tables, then read `filenames'.

        Each name in `filenames' is passed to read() with the given
        `strict' flag.  init() is called first if the module has not
        been initialised yet.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = ({}, {}) # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in types_map.items():
            self.add_type(type, ext, True)
        for (ext, type) in common_types.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mime_type = url[:semi]
            else:
                mime_type = url[:comma]
            if '=' in mime_type or '/' not in mime_type:
                mime_type = 'text/plain'
            return mime_type, None      # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') and re-split.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        # Peel off an encoding suffix so the type is looked up on what
        # lies underneath it.
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        strict_map = self.types_map[True]
        if ext in strict_map:
            return strict_map[ext], encoding
        elif ext.lower() in strict_map:
            return strict_map[ext.lower()], encoding
        elif strict:
            return None, encoding
        lenient_map = self.types_map[False]
        if ext in lenient_map:
            return lenient_map[ext], encoding
        elif ext.lower() in lenient_map:
            return lenient_map[ext.lower()], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        An empty list is returned when nothing is known about `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Work on a copy: appending to the stored list would leak
        # non-standard extensions into the strict table and would hand
        # callers a live reference to internal state.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.

        Errors from open() or from parsing propagate to the caller; the
        file is closed in either case.
        """
        fp = open(filename)
        try:
            self.readfp(fp, strict)
        finally:
            fp.close()

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' only needs a readline() method.  Each line has the form
        "type ext1 ext2 ...", and a word starting with '#' begins a
        comment that runs to the end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # split() never yields empty strings, so word[0] is safe.
            for i, word in enumerate(words):
                if word[0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)


# The four functions below are bootstrap stubs.  init() rebinds each of
# these module-level names to the matching bound method of the shared
# MimeTypes instance, so the trailing call in each stub resolves to that
# method rather than recursing.

def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    init()
    return guess_type(url, strict)


def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', an empty
    list is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    init()
    return guess_all_extensions(type, strict)


def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    init()
    return guess_extension(type, strict)


def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    init()
    return add_type(type, ext, strict)


def init(files=None):
    """Build the shared MimeTypes database and publish it module-wide.

    `files' is a list of mime.types-style pathnames; when None, the
    module-level `knownfiles' list is used.  Names that are not
    existing regular files are skipped.

    Afterwards the module-level tables (encodings_map, suffix_map,
    types_map, common_types) refer to the new database's dictionaries
    and guess_type, guess_extension, guess_all_extensions and add_type
    are rebound to its methods.
    """
    global guess_all_extensions, guess_extension, guess_type
    global suffix_map, types_map, encodings_map, common_types
    global add_type, inited
    # Set the flag before constructing MimeTypes, whose __init__ calls
    # init() whenever the flag is still false.
    inited = True
    db = MimeTypes()
    if files is None:
        files = knownfiles
    for filename in files:
        if os.path.isfile(filename):
            # read() closes the file again, even if parsing fails.
            db.read(filename)
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    guess_all_extensions = db.guess_all_extensions
    guess_extension = db.guess_extension
    guess_type = db.guess_type
    add_type = db.add_type
    common_types = db.types_map[False]


def read_mime_types(file):
    """Parse one mime.types-style file given by pathname.

    Returns a dictionary mapping extensions (with leading dot) to MIME
    types: the standard table of a fresh MimeTypes instance after the
    file has been read into it.  Returns None if the file cannot be
    opened.
    """
    try:
        f = open(file)
    except IOError:
        return None
    try:
        db = MimeTypes()
        db.readfp(f, True)
    finally:
        f.close()
    return db.types_map[True]


suffix_map = {
    '.tgz': '.tar.gz',
    '.taz': '.tar.gz',
    '.tz': '.tar.gz',
    }

encodings_map = {
    '.gz': 'gzip',
    '.Z': 'compress',
    }

# Before adding new types, make sure they are either registered with IANA, at
# http://www.isi.edu/in-notes/iana/assignments/media-types
# or extensions, i.e. using the x- prefix

# If you add to these, please keep them sorted!
types_map = {
    '.a'      : 'application/octet-stream',
    '.ai'     : 'application/postscript',
    '.aif'    : 'audio/x-aiff',
    '.aifc'   : 'audio/x-aiff',
    '.aiff'   : 'audio/x-aiff',
    '.au'     : 'audio/basic',
    '.avi'    : 'video/x-msvideo',
    '.bat'    : 'text/plain',
    '.bcpio'  : 'application/x-bcpio',
    '.bin'    : 'application/octet-stream',
    '.bmp'    : 'image/x-ms-bmp',
    '.c'      : 'text/plain',
    # Duplicates :(  -- in a dict display the later entry wins
    '.cdf'    : 'application/x-cdf',
    '.cdf'    : 'application/x-netcdf',
    '.cpio'   : 'application/x-cpio',
    '.csh'    : 'application/x-csh',
    '.css'    : 'text/css',
    '.dll'    : 'application/octet-stream',
    '.doc'    : 'application/msword',
    '.dot'    : 'application/msword',
    '.dvi'    : 'application/x-dvi',
    '.eml'    : 'message/rfc822',
    '.eps'    : 'application/postscript',
    '.etx'    : 'text/x-setext',
    '.exe'    : 'application/octet-stream',
    '.gif'    : 'image/gif',
    '.gtar'   : 'application/x-gtar',
    '.h'      : 'text/plain',
    '.hdf'    : 'application/x-hdf',
    '.htm'    : 'text/html',
    '.html'   : 'text/html',
    '.ief'    : 'image/ief',
    '.jpe'    : 'image/jpeg',
    '.jpeg'   : 'image/jpeg',
    '.jpg'    : 'image/jpeg',
    '.js'     : 'application/x-javascript',
    '.ksh'    : 'text/plain',
    '.latex'  : 'application/x-latex',
    '.m1v'    : 'video/mpeg',
    '.man'    : 'application/x-troff-man',
    '.me'     : 'application/x-troff-me',
    '.mht'    : 'message/rfc822',
    '.mhtml'  : 'message/rfc822',
    '.mif'    : 'application/x-mif',
    '.mov'    : 'video/quicktime',
    '.movie'  : 'video/x-sgi-movie',
    '.mp2'    : 'audio/mpeg',
    '.mp3'    : 'audio/mpeg',
    '.mpa'    : 'video/mpeg',
    '.mpe'    : 'video/mpeg',
    '.mpeg'   : 'video/mpeg',
    '.mpg'    : 'video/mpeg',
    '.ms'     : 'application/x-troff-ms',
    '.nc'     : 'application/x-netcdf',
    '.nws'    : 'message/rfc822',
    '.o'      : 'application/octet-stream',
    '.obj'    : 'application/octet-stream',
    '.oda'    : 'application/oda',
    '.p12'    : 'application/x-pkcs12',
    '.p7c'    : 'application/pkcs7-mime',
    '.pbm'    : 'image/x-portable-bitmap',
    '.pdf'    : 'application/pdf',
    '.pfx'    : 'application/x-pkcs12',
    '.pgm'    : 'image/x-portable-graymap',
    '.pl'     : 'text/plain',
    '.png'    : 'image/png',
    '.pnm'    : 'image/x-portable-anymap',
    '.pot'    : 'application/vnd.ms-powerpoint',
    '.ppa'    : 'application/vnd.ms-powerpoint',
    '.ppm'    : 'image/x-portable-pixmap',
    '.pps'    : 'application/vnd.ms-powerpoint',
    '.ppt'    : 'application/vnd.ms-powerpoint',
    '.ps'     : 'application/postscript',
    '.pwz'    : 'application/vnd.ms-powerpoint',
    '.py'     : 'text/x-python',
    '.pyc'    : 'application/x-python-code',
    '.pyo'    : 'application/x-python-code',
    '.qt'     : 'video/quicktime',
    '.ra'     : 'audio/x-pn-realaudio',
    '.ram'    : 'application/x-pn-realaudio',
    '.ras'    : 'image/x-cmu-raster',
    '.rdf'    : 'application/xml',
    '.rgb'    : 'image/x-rgb',
    '.roff'   : 'application/x-troff',
    '.rtx'    : 'text/richtext',
    '.sgm'    : 'text/x-sgml',
    '.sgml'   : 'text/x-sgml',
    '.sh'     : 'application/x-sh',
    '.shar'   : 'application/x-shar',
    '.snd'    : 'audio/basic',
    '.so'     : 'application/octet-stream',
    '.src'    : 'application/x-wais-source',
    '.sv4cpio': 'application/x-sv4cpio',
    '.sv4crc' : 'application/x-sv4crc',
    '.swf'    : 'application/x-shockwave-flash',
    '.t'      : 'application/x-troff',
    '.tar'    : 'application/x-tar',
    '.tcl'    : 'application/x-tcl',
    '.tex'    : 'application/x-tex',
    '.texi'   : 'application/x-texinfo',
    '.texinfo': 'application/x-texinfo',
    '.tif'    : 'image/tiff',
    '.tiff'   : 'image/tiff',
    '.tr'     : 'application/x-troff',
    '.tsv'    : 'text/tab-separated-values',
    '.txt'    : 'text/plain',
    '.ustar'  : 'application/x-ustar',
    '.vcf'    : 'text/x-vcard',
    '.wav'    : 'audio/x-wav',
    '.wiz'    : 'application/msword',
    '.xbm'    : 'image/x-xbitmap',
    '.xlb'    : 'application/vnd.ms-excel',
    # Duplicates :(  -- in a dict display the later entry wins
    '.xls'    : 'application/excel',
    '.xls'    : 'application/vnd.ms-excel',
    '.xml'    : 'text/xml',
    '.xpm'    : 'image/x-xpixmap',
    '.xsl'    : 'application/xml',
    '.xwd'    : 'image/x-xwindowdump',
    '.zip'    : 'application/zip',
    }

# These are non-standard types, commonly found in the wild.  They will only
# match if strict=0 flag is given to the API methods.

# Please sort these too
common_types = {
    '.jpg' : 'image/jpg',
    '.mid' : 'audio/midi',
    '.midi': 'audio/midi',
    '.pct' : 'image/pict',
    '.pic' : 'image/pict',
    '.pict': 'image/pict',
    '.rtf' : 'application/rtf',
    '.xul' : 'text/xul',
    }


if __name__ == '__main__':
    import sys
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        """Print the usage text, then `msg' if given, and exit with `code'."""
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error:
        # sys.exc_info() rather than "except X, e" / "except X as e":
        # this spelling parses on every Python version.
        usage(1, sys.exc_info()[1])

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % gtype)
            else:
                print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % gtype)
            else:
                print('type: %s encoding: %s' % (guess, encoding))

# Source listing generated by PyXR 0.9.4