#
# Secret Labs' Regular Expression Engine
#
# re-compatible interface for the sre matching engine
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# This version of the SRE library can be redistributed under CNRI's
# Python 1.6 license. For any other use, please contact Secret Labs
# AB (info@pythonware.com).
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#

r"""Support for regular expressions (RE).

This module provides regular expression matching operations similar to
those found in Perl. It supports both 8-bit and Unicode strings; both
the pattern and the strings being processed can contain null bytes and
characters outside the US ASCII range.

Regular expressions can contain both special and ordinary characters.
Most ordinary characters, like "A", "a", or "0", are the simplest
regular expressions; they simply match themselves. You can
concatenate ordinary characters, so last matches the string 'last'.

The special characters are:
    "."      Matches any character except a newline.
    "^"      Matches the start of the string.
    "$"      Matches the end of the string.
    "*"      Matches 0 or more (greedy) repetitions of the preceding RE.
             Greedy means that it will match as many repetitions as possible.
    "+"      Matches 1 or more (greedy) repetitions of the preceding RE.
    "?"      Matches 0 or 1 (greedy) of the preceding RE.
    *?,+?,?? Non-greedy versions of the previous three special characters.
    {m,n}    Matches from m to n repetitions of the preceding RE.
    {m,n}?   Non-greedy version of the above.
    "\\"      Either escapes special characters or signals a special sequence.
    []       Indicates a set of characters.
             A "^" as the first character indicates a complementing set.
    "|"      A|B, creates an RE that will match either A or B.
    (...)    Matches the RE inside the parentheses.
             The contents can be retrieved or matched later in the string.
    (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below).
    (?:...)  Non-grouping version of regular parentheses.
    (?P<name>...) The substring matched by the group is accessible by name.
    (?P=name)     Matches the text matched earlier by the group named name.
    (?#...)  A comment; ignored.
    (?=...)  Matches if ... matches next, but doesn't consume the string.
    (?!...)  Matches if ... doesn't match next.

The special sequences consist of "\\" and a character from the list
below. If the ordinary character is not on the list, then the
resulting RE will match the second character.
    \number  Matches the contents of the group of the same number.
    \A       Matches only at the start of the string.
    \Z       Matches only at the end of the string.
    \b       Matches the empty string, but only at the start or end of a word.
    \B       Matches the empty string, but not at the start or end of a word.
    \d       Matches any decimal digit; equivalent to the set [0-9].
    \D       Matches any non-digit character; equivalent to the set [^0-9].
    \s       Matches any whitespace character; equivalent to [ \t\n\r\f\v].
    \S       Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
    \w       Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
             With LOCALE, it will match the set [0-9_] plus characters defined
             as letters for the current locale.
    \W       Matches the complement of \w.
    \\       Matches a literal backslash.

This module exports the following functions:
    match    Match a regular expression pattern to the beginning of a string.
    search   Search a string for the presence of a pattern.
    sub      Substitute occurrences of a pattern found in a string.
    subn     Same as sub, but also return the number of substitutions made.
    split    Split a string by the occurrences of a pattern.
    findall  Find all occurrences of a pattern in a string.
    finditer Return an iterator over all matches of a pattern in a string
             (only defined when running on Python 2.2 or later).
    compile  Compile a pattern into a RegexObject.
    template Compile a template pattern into a pattern object.
    purge    Clear the regular expression cache.
    escape   Backslash all non-alphanumerics in a string.

Some of the functions in this module take flags as optional parameters:
    I  IGNORECASE  Perform case-insensitive matching.
    L  LOCALE      Make \w, \W, \b, \B, dependent on the current locale.
    M  MULTILINE   "^" matches the beginning of lines as well as the string.
                   "$" matches the end of lines as well as the string.
    S  DOTALL      "." matches any character at all, including the newline.
    X  VERBOSE     Ignore whitespace and comments for nicer looking RE's.
    U  UNICODE     Make \w, \W, \b, \B, dependent on the Unicode locale.

This module also defines an exception 'error'.

"""

import sys
import sre_compile
import sre_parse

# public symbols ("finditer" is appended further down when it is defined)
__all__ = [ "match", "search", "sub", "subn", "split", "findall",
    "compile", "purge", "template", "escape", "I", "L", "M", "S", "X",
    "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
    "UNICODE", "error" ]

__version__ = "2.2.1"

# flags (each has a one-letter alias and a long name)
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments

# sre extensions (experimental, don't rely on these)
T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking
DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation

# sre exception
error = sre_compile.error

# --------------------------------------------------------------------
# public interface

def match(pattern, string, flags=0):
    """Try to apply the pattern at the start of the string, returning
    a match object, or None if no match was found.

    pattern may be a pattern string or an already compiled pattern
    object; flags is only used when a string has to be compiled."""
    return _compile(pattern, flags).match(string)

def search(pattern, string, flags=0):
    """Scan through string looking for a match to the pattern, returning
    a match object, or None if no match was found.

    pattern may be a pattern string or an already compiled pattern
    object; flags is only used when a string has to be compiled."""
    return _compile(pattern, flags).search(string)

def sub(pattern, repl, string, count=0, flags=0):
    """Return the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in string by the
    replacement repl. repl can be either a string or a callable;
    if a callable, it's passed the match object and must return
    a replacement string to be used.

    count is handed to the compiled pattern's sub method unchanged.
    flags (default 0, the previous hard-coded value) is used when
    pattern is a string that has to be compiled."""
    return _compile(pattern, flags).sub(repl, string, count)

def subn(pattern, repl, string, count=0, flags=0):
    """Return a 2-tuple containing (new_string, number).
    new_string is the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in the source
    string by the replacement repl. number is the number of
    substitutions that were made. repl can be either a string or a
    callable; if a callable, it's passed the match object and must
    return a replacement string to be used.

    count is handed to the compiled pattern's subn method unchanged.
    flags (default 0, the previous hard-coded value) is used when
    pattern is a string that has to be compiled."""
    return _compile(pattern, flags).subn(repl, string, count)

def split(pattern, string, maxsplit=0, flags=0):
    """Split the source string by the occurrences of the pattern,
    returning a list containing the resulting substrings.

    maxsplit is handed to the compiled pattern's split method
    unchanged. flags (default 0, the previous hard-coded value) is
    used when pattern is a string that has to be compiled."""
    return _compile(pattern, flags).split(string, maxsplit)

def findall(pattern, string, flags=0):
    """Return a list of all non-overlapping matches in the string.

    If one or more groups are present in the pattern, return a
    list of groups; this will be a list of tuples if the pattern
    has more than one group.

    Empty matches are included in the result."""
    return _compile(pattern, flags).findall(string)

# finditer is only provided (and exported) on Python 2.2 and later
if sys.hexversion >= 0x02020000:
    __all__.append("finditer")
    def finditer(pattern, string, flags=0):
        """Return an iterator over all non-overlapping matches in the
        string. For each match, the iterator returns a match object.

        Empty matches are included in the result."""
        return _compile(pattern, flags).finditer(string)

def compile(pattern, flags=0):
    "Compile a regular expression pattern, returning a pattern object."
    return _compile(pattern, flags)

def purge():
    "Clear the regular expression cache"
    # both the compiled-pattern cache and the replacement-template cache
    _cache.clear()
    _cache_repl.clear()

def template(pattern, flags=0):
    "Compile a template pattern, returning a pattern object"
    # same as compile(), with the experimental TEMPLATE flag forced on
    return _compile(pattern, flags|T)

def escape(pattern):
    """Escape all non-alphanumeric characters in pattern.

    Every character outside a-z, A-Z and 0-9 is prefixed with a
    backslash; a NUL character is written as backslash followed by
    000. Returns the escaped string."""
    s = list(pattern)
    for i in range(len(pattern)):
        c = pattern[i]
        if not ("a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9"):
            if c == "\000":
                s[i] = "\\000"
            else:
                s[i] = "\\" + c
    # an empty slice of the input is used as the joiner, presumably so
    # that 8-bit and Unicode inputs each yield a result of their own type
    return pattern[:0].join(s)

# --------------------------------------------------------------------
# internals

_cache = {}        # (type(pattern), pattern, flags) -> compiled pattern
_cache_repl = {}   # (repl, pattern) -> parsed replacement template

_pattern_type = type(sre_compile.compile("", 0))

# each cache is emptied completely once it holds this many entries
_MAXCACHE = 100

def _compile(*key):
    # internal: compile pattern
    #
    # key is (pattern, flags). Returns a compiled pattern object, taken
    # from the cache when possible. An already compiled pattern is
    # returned as is (flags is not applied to it). Raises TypeError if
    # pattern is neither a string nor a compiled pattern; whatever
    # sre_compile.compile raises for an invalid expression propagates
    # to the caller unchanged.
    #
    # the type of the pattern is part of the cache key, so that equal
    # pattern objects of different types get separate cache entries
    cachekey = (type(key[0]),) + key
    p = _cache.get(cachekey)
    if p is not None:
        return p
    pattern, flags = key
    if isinstance(pattern, _pattern_type):
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    p = sre_compile.compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        # crude eviction policy: drop everything once the cache is full
        _cache.clear()
    _cache[cachekey] = p
    return p

def _compile_repl(*key):
    # internal: compile replacement pattern
    #
    # key is (repl, pattern). Returns the result of
    # sre_parse.parse_template, cached in _cache_repl. Errors raised by
    # parse_template propagate to the caller unchanged.
    p = _cache_repl.get(key)
    if p is not None:
        return p
    repl, pattern = key
    p = sre_parse.parse_template(repl, pattern)
    if len(_cache_repl) >= _MAXCACHE:
        # same eviction policy as _compile
        _cache_repl.clear()
    _cache_repl[key] = p
    return p

def _expand(pattern, match, template):
    # internal: match.expand implementation hook
    # (parses the template on every call; _cache_repl is not consulted)
    template = sre_parse.parse_template(template, pattern)
    return sre_parse.expand_template(template, match)

def _subx(pattern, template):
    # internal: pattern.sub/subn implementation helper
    #
    # Returns either the literal replacement itself, or a callable that
    # takes a match object and returns the expanded replacement.
    template = _compile_repl(template, pattern)
    if not template[0] and len(template[1]) == 1:
        # literal replacement
        return template[1][0]
    # the parsed template is bound as a default argument, so this does
    # not depend on nested scopes being available
    def filter(match, template=template):
        return sre_parse.expand_template(template, match)
    return filter

# register myself for pickling

import copy_reg

def _pickle(p):
    # a compiled pattern is pickled as a call to _compile with its
    # source pattern and flags, i.e. it is recompiled when unpickled
    return _compile, (p.pattern, p.flags)

copy_reg.pickle(_pattern_type, _pickle, _compile)

# --------------------------------------------------------------------
# experimental stuff (see python-dev discussions for details)

class Scanner:
    # Tokenizer built from a lexicon, i.e. a sequence of (phrase, action)
    # pairs. All phrases are combined into a single alternation in which
    # the i-th phrase (counting from 1) is wrapped in group number i.

    def __init__(self, lexicon, flags=0):
        # lexicon: sequence of (phrase, action) pairs; see scan() for
        #          how action is interpreted
        # flags:   handed to sre_parse.parse for every phrase
        from sre_constants import BRANCH, SUBPATTERN
        self.lexicon = lexicon
        # combine phrases into a compound pattern
        p = []
        s = sre_parse.Pattern()
        s.flags = flags
        for phrase, action in lexicon:
            p.append(sre_parse.SubPattern(s, [
                (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))),
                ]))
        # Group ids 1..len(p) were handed out above. Record the group
        # count while p is still the list of alternatives: once p is
        # rebound to the one-item BRANCH wrapper below, len(p) is always
        # 1, whatever the size of the lexicon. The extra 1 is for the
        # implicit group 0.
        # NOTE(review): assumes Pattern.groups counts group 0 too --
        # confirm against sre_parse.
        s.groups = len(p) + 1
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)

    def scan(self, string):
        # Repeatedly match the compound pattern against string.
        # Returns (results, remainder): the list of values produced by
        # the actions and the part of string that was not consumed.
        # Scanning stops at the first position where nothing matches,
        # or where only an empty match is possible.
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while 1:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # empty match: stop, since no progress was made
                break
            # lastindex is the group that matched, i.e. the 1-based
            # position of the phrase in the lexicon
            action = self.lexicon[m.lastindex-1][1]
            if callable(action):
                # expose the match object to the action via self.match
                self.match = m
                action = action(self, m.group())
            if action is not None:
                # a None action (or a callable returning None) drops
                # the token; anything else is collected
                append(action)
            i = j
        return result, string[i:]

# Generated by PyXR 0.9.4