"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               'mms', '']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is emptied before the next result is stored.
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Clear the parse cache."""
    global _parse_cache
    _parse_cache = {}


def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    `scheme` is the scheme to assume when `url` does not carry one;
    `allow_fragments` false leaves any '#...' part inside the path/query."""
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return scheme, netloc, url, params, query, fragment


def _splitparams(url):
    """Split ';params' off the last '/'-separated segment of `url`.

    Return (path, params).  When the URL contains a '/', only a ';' at or
    after the last '/' counts; if there is none the URL is returned
    unchanged with empty params.  urlparse() only calls this when `url`
    contains a ';'."""
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]


def urlsplit(url, scheme='', allow_fragments=1):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Results are cached per (url, scheme, allow_fragments)."""
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            # 'http' is in every uses_* table consulted below, so the
            # membership tests are skipped on this path.
            scheme = 'http'
            url = url[i+1:]
            if url[:2] == '//':
                # netloc runs to the next '/', else to '#', else to the end.
                i = url.find('/', 2)
                if i < 0:
                    i = url.find('#')
                    if i < 0:
                        i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = scheme, netloc, url, query, fragment
            _parse_cache[key] = result
            return result
        # Text before the first ':' is a scheme only if every character
        # is a valid scheme character; otherwise the default is kept.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            # netloc runs to the next '/', else to the end of the string.
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = scheme, netloc, url, query, fragment
    _parse_cache[key] = result
    return result


def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    `components` is a 6-item sequence
    (scheme, netloc, url, params, query, fragment)."""
    scheme, netloc, url, params, query, fragment = components
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))


def urlunsplit(components):
    """Combine the parts of a split URL into a single URL string.

    `components` is a 5-item sequence
    (scheme, netloc, url, query, fragment); empty parts are omitted
    together with their delimiters."""
    scheme, netloc, url, query, fragment = components
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url


def urljoin(base, url, allow_fragments=1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.  If
    `url` has a different scheme than `base`, or a scheme that is not in
    uses_relative, `url` is returned unchanged."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808 section 4 step 5: an empty path inherits the base path;
        # params are inherited only if the reference has none, and the
        # query only if the reference has neither params nor a query.
        path = bpath
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: replace the last base segment with the reference path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly drop the first interior "<segment>/.." pair until none
    # is left; the final segment is handled separately below.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))


def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''


# Table read by test(): the first URL becomes the base; each following
# line is "<relative URL> = <expected joined URL>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""


def test():
    """Parse and join every URL of a test table, printing the results.

    The table is read from the file named by the first command-line
    argument ('-' means standard input), or from test_input when no
    argument is given.  The first URL read becomes the base for all the
    following ones.  A line of the form "<url> = <expected>" whose joined
    result differs from <expected> gets an extra 'EXPECTED' line."""
    import sys
    base = ''
    opened = None  # file object we opened ourselves and must close
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from StringIO import StringIO  # Python 2
        except ImportError:
            from io import StringIO  # Python 3
        fp = StringIO(test_input)
    try:
        # readline() keeps the line-at-a-time behaviour on interactive input
        for line in iter(fp.readline, ''):
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        if opened is not None:
            opened.close()


if __name__ == '__main__':
    test()

# Footer of the source listing this file was taken from:
# Generated by PyXR 0.9.4