PyXR

c:\python24\lib \ sre.py



0001 #
0002 # Secret Labs' Regular Expression Engine
0003 #
0004 # re-compatible interface for the sre matching engine
0005 #
0006 # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
0007 #
0008 # This version of the SRE library can be redistributed under CNRI's
0009 # Python 1.6 license.  For any other use, please contact Secret Labs
0010 # AB (info@pythonware.com).
0011 #
0012 # Portions of this engine have been developed in cooperation with
0013 # CNRI.  Hewlett-Packard provided funding for 1.6 integration and
0014 # other compatibility work.
0015 #
0016 
0017 r"""Support for regular expressions (RE).
0018 
0019 This module provides regular expression matching operations similar to
0020 those found in Perl.  It supports both 8-bit and Unicode strings; both
0021 the pattern and the strings being processed can contain null bytes and
0022 characters outside the US ASCII range.
0023 
0024 Regular expressions can contain both special and ordinary characters.
0025 Most ordinary characters, like "A", "a", or "0", are the simplest
0026 regular expressions; they simply match themselves.  You can
0027 concatenate ordinary characters, so last matches the string 'last'.
0028 
0029 The special characters are:
0030     "."      Matches any character except a newline.
0031     "^"      Matches the start of the string.
0032     "$"      Matches the end of the string.
0033     "*"      Matches 0 or more (greedy) repetitions of the preceding RE.
0034              Greedy means that it will match as many repetitions as possible.
0035     "+"      Matches 1 or more (greedy) repetitions of the preceding RE.
0036     "?"      Matches 0 or 1 (greedy) of the preceding RE.
0037     *?,+?,?? Non-greedy versions of the previous three special characters.
0038     {m,n}    Matches from m to n repetitions of the preceding RE.
0039     {m,n}?   Non-greedy version of the above.
0040     "\\"      Either escapes special characters or signals a special sequence.
0041     []       Indicates a set of characters.
0042              A "^" as the first character indicates a complementing set.
0043     "|"      A|B, creates an RE that will match either A or B.
0044     (...)    Matches the RE inside the parentheses.
0045              The contents can be retrieved or matched later in the string.
0046     (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below).
0047     (?:...)  Non-grouping version of regular parentheses.
0048     (?P<name>...) The substring matched by the group is accessible by name.
0049     (?P=name)     Matches the text matched earlier by the group named name.
0050     (?#...)  A comment; ignored.
0051     (?=...)  Matches if ... matches next, but doesn't consume the string.
0052     (?!...)  Matches if ... doesn't match next.
0053 
0054 The special sequences consist of "\\" and a character from the list
0055 below.  If the ordinary character is not on the list, then the
0056 resulting RE will match the second character.
0057     \number  Matches the contents of the group of the same number.
0058     \A       Matches only at the start of the string.
0059     \Z       Matches only at the end of the string.
0060     \b       Matches the empty string, but only at the start or end of a word.
0061     \B       Matches the empty string, but not at the start or end of a word.
0062     \d       Matches any decimal digit; equivalent to the set [0-9].
0063     \D       Matches any non-digit character; equivalent to the set [^0-9].
0064     \s       Matches any whitespace character; equivalent to [ \t\n\r\f\v].
0065     \S       Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
0066     \w       Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
0067              With LOCALE, it will match the set [0-9_] plus characters defined
0068              as letters for the current locale.
0069     \W       Matches the complement of \w.
0070     \\       Matches a literal backslash.
0071 
0072 This module exports the following functions:
0073     match    Match a regular expression pattern to the beginning of a string.
0074     search   Search a string for the presence of a pattern.
0075     sub      Substitute occurrences of a pattern found in a string.
0076     subn     Same as sub, but also return the number of substitutions made.
0077     split    Split a string by the occurrences of a pattern.
0078     findall  Find all occurrences of a pattern in a string.
0079     compile  Compile a pattern into a RegexObject.
0080     purge    Clear the regular expression cache.
0081     escape   Backslash all non-alphanumerics in a string.
0082 
0083 Some of the functions in this module take flags as optional parameters:
0084     I  IGNORECASE  Perform case-insensitive matching.
0085     L  LOCALE      Make \w, \W, \b, \B, dependent on the current locale.
0086     M  MULTILINE   "^" matches the beginning of lines as well as the string.
0087                    "$" matches the end of lines as well as the string.
0088     S  DOTALL      "." matches any character at all, including the newline.
0089     X  VERBOSE     Ignore whitespace and comments for nicer looking RE's.
0090     U  UNICODE     Make \w, \W, \b, \B, dependent on the Unicode locale.
0091 
0092 This module also defines an exception 'error'.
0093 
0094 """
0095 
0096 import sys
0097 import sre_compile
0098 import sre_parse
0099 
# public symbols
# Names exported by a star-import of this module.  "finditer" is appended
# further down, but only when the interpreter version check there passes.
# The experimental names defined below (T, TEMPLATE, DEBUG) are deliberately
# left out of this list.
__all__ = [ "match", "search", "sub", "subn", "split", "findall",
    "compile", "purge", "template", "escape", "I", "L", "M", "S", "X",
    "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
    "UNICODE", "error" ]

__version__ = "2.2.1"

# flags
# Each flag is published under a one-letter name and a long name; both are
# bound to the same constant taken from sre_compile.
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments

# sre extensions (experimental, don't rely on these)
# T is the flag that template() ORs into the caller's flags.
T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking
DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation

# sre exception
# Re-exported so callers can catch it without importing sre_compile.
error = sre_compile.error
0122 
0123 # --------------------------------------------------------------------
0124 # public interface
0125 
def match(pattern, string, flags=0):
    """Attempt to match the pattern at the beginning of the string.

    The pattern is compiled (or fetched from the cache) with the given
    flags.  Returns a match object on success, or None when the pattern
    does not match."""
    compiled = _compile(pattern, flags)
    return compiled.match(string)
0130 
def search(pattern, string, flags=0):
    """Look for a match to the pattern anywhere in the string.

    The pattern is compiled (or fetched from the cache) with the given
    flags.  Returns a match object on success, or None when no match
    was found."""
    compiled = _compile(pattern, flags)
    return compiled.search(string)
0135 
def sub(pattern, repl, string, count=0, flags=0):
    """Return the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in string by the
    replacement repl.  repl can be either a string or a callable;
    if a callable, it's passed the match object and must return
    a replacement string to be used.

    count is handed unchanged to the compiled pattern's sub() method.
    flags is used when compiling the pattern, mirroring match() and
    search(); it defaults to 0, which is what this function always
    used before the parameter existed, so existing calls behave the
    same."""
    return _compile(pattern, flags).sub(repl, string, count)
0143 
def subn(pattern, repl, string, count=0, flags=0):
    """Return a 2-tuple containing (new_string, number).
    new_string is the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in the source
    string by the replacement repl.  number is the number of
    substitutions that were made. repl can be either a string or a
    callable; if a callable, it's passed the match object and must
    return a replacement string to be used.

    count is handed unchanged to the compiled pattern's subn() method.
    flags is used when compiling the pattern, mirroring match() and
    search(); it defaults to 0, which is what this function always
    used before the parameter existed, so existing calls behave the
    same."""
    return _compile(pattern, flags).subn(repl, string, count)
0153 
def split(pattern, string, maxsplit=0, flags=0):
    """Split the source string by the occurrences of the pattern,
    returning a list containing the resulting substrings.

    maxsplit is handed unchanged to the compiled pattern's split()
    method.  flags is used when compiling the pattern, mirroring
    match() and search(); it defaults to 0, which is what this
    function always used before the parameter existed, so existing
    calls behave the same."""
    return _compile(pattern, flags).split(string, maxsplit)
0158 
def findall(pattern, string, flags=0):
    """Collect every non-overlapping match of the pattern in the string.

    The result is a list.  When the pattern contains one or more
    groups, the list holds the groups instead of the whole matches,
    as tuples if there is more than one group.

    Empty matches are part of the result."""
    compiled = _compile(pattern, flags)
    return compiled.findall(string)
0168 
# finditer is only defined and exported when sys.hexversion reports
# at least 0x02020000.
if sys.hexversion >= 0x02020000:
    def finditer(pattern, string, flags=0):
        """Iterate lazily over all non-overlapping matches in the string.

        Each item produced by the returned iterator is a match object.

        Empty matches are part of the result."""
        compiled = _compile(pattern, flags)
        return compiled.finditer(string)
    __all__.append("finditer")
0177 
def compile(pattern, flags=0):
    """Turn a regular expression pattern into a pattern object.

    Public front end for the cached internal compiler."""
    compiled = _compile(pattern, flags)
    return compiled
0181 
def purge():
    """Empty both module-level caches: the compiled-pattern cache and
    the parsed replacement-template cache."""
    for cache in (_cache, _cache_repl):
        cache.clear()
0186 
def template(pattern, flags=0):
    """Compile pattern as a template pattern and return the pattern object.

    The T (TEMPLATE) flag is OR-ed into whatever flags the caller gave."""
    template_flags = flags | T
    return _compile(pattern, template_flags)
0190 
def escape(pattern):
    """Return pattern with every character outside a-z, A-Z and 0-9
    preceded by a backslash.

    A NUL character is written as the four-character sequence \\000
    rather than a backslash followed by a raw NUL.  The result is
    joined with an empty slice of the input, so it has the same string
    type as pattern."""
    joiner = pattern[:0]
    pieces = []
    for ch in pattern:
        if "a" <= ch <= "z" or "A" <= ch <= "Z" or "0" <= ch <= "9":
            # plain ASCII alphanumeric: copied through untouched
            pieces.append(ch)
        elif ch == "\000":
            pieces.append("\\000")
        else:
            pieces.append("\\" + ch)
    return joiner.join(pieces)
0202 
0203 # --------------------------------------------------------------------
0204 # internals
0205 
# Compiled patterns, stored by _compile() under the key
# (type of pattern, pattern, flags).
_cache = {}
# Parsed replacement templates, stored by _compile_repl() under the key
# (repl, pattern).
_cache_repl = {}

# Concrete type of a compiled pattern object, obtained by compiling an
# empty pattern.  Used for isinstance checks and for pickle registration.
_pattern_type = type(sre_compile.compile("", 0))

# Once a cache already holds this many entries, it is cleared completely
# before the next entry is stored.
_MAXCACHE = 100
0212 
0213 def _compile(*key):
0214     # internal: compile pattern
0215     cachekey = (type(key[0]),) + key
0216     p = _cache.get(cachekey)
0217     if p is not None:
0218         return p
0219     pattern, flags = key
0220     if isinstance(pattern, _pattern_type):
0221         return pattern
0222     if not sre_compile.isstring(pattern):
0223         raise TypeError, "first argument must be string or compiled pattern"
0224     try:
0225         p = sre_compile.compile(pattern, flags)
0226     except error, v:
0227         raise error, v # invalid expression
0228     if len(_cache) >= _MAXCACHE:
0229         _cache.clear()
0230     _cache[cachekey] = p
0231     return p
0232 
0233 def _compile_repl(*key):
0234     # internal: compile replacement pattern
0235     p = _cache_repl.get(key)
0236     if p is not None:
0237         return p
0238     repl, pattern = key
0239     try:
0240         p = sre_parse.parse_template(repl, pattern)
0241     except error, v:
0242         raise error, v # invalid expression
0243     if len(_cache_repl) >= _MAXCACHE:
0244         _cache_repl.clear()
0245     _cache_repl[key] = p
0246     return p
0247 
def _expand(pattern, match, template):
    # internal: match.expand implementation hook
    #
    # The template is parsed on every call (the replacement cache is not
    # consulted here) and then expanded against the match.
    parsed = sre_parse.parse_template(template, pattern)
    return sre_parse.expand_template(parsed, match)
0252 
def _subx(pattern, template):
    # internal: pattern.sub/subn implementation helper
    #
    # Returns either a literal replacement or a callable that expands the
    # parsed template for a given match.
    compiled = _compile_repl(template, pattern)
    if compiled[0] or len(compiled[1]) != 1:
        # general case: expansion has to happen per match
        def filter(match, template=compiled):
            return sre_parse.expand_template(template, match)
        return filter
    # literal replacement
    return compiled[1][0]
0262 
0263 # register myself for pickling
0264 
0265 import copy_reg
0266 
def _pickle(p):
    # Reduce a compiled pattern to the callable and arguments that
    # rebuild it: _compile(pattern source, flags).
    rebuild_args = (p.pattern, p.flags)
    return _compile, rebuild_args

# Install _pickle as the reduction function for compiled pattern objects,
# with _compile as the constructor.
copy_reg.pickle(_pattern_type, _pickle, _compile)
0271 
0272 # --------------------------------------------------------------------
0273 # experimental stuff (see python-dev discussions for details)
0274 
class Scanner:
    """Experimental tokenizer built from a lexicon of (phrase, action) pairs.

    Each phrase is a regular expression.  All phrases are combined into
    one compiled pattern made of alternative branches, one numbered group
    per phrase, in lexicon order.

    An action is either a callable, invoked as action(scanner, text) for
    each piece of text its phrase matched, or any other object, which is
    used as the result for that match directly.  A result of None is
    dropped.
    """
    def __init__(self, lexicon, flags=0):
        """Build the compound pattern for lexicon.

        lexicon -- sequence of (phrase, action) pairs
        flags   -- flags used when parsing every phrase
        """
        from sre_constants import BRANCH, SUBPATTERN
        self.lexicon = lexicon
        # combine phrases into a compound pattern
        p = []
        s = sre_parse.Pattern()
        s.flags = flags
        for phrase, action in lexicon:
            # group numbers are 1-based and follow lexicon order, so that
            # scan() can map m.lastindex back to a lexicon entry
            p.append(sre_parse.SubPattern(s, [
                (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))),
                ]))
        # Record the group count while p is still the list of phrases.
        # Taking len(p) after p has been rebound to the BRANCH wrapper
        # below measures the wrapper (a single entry) instead of the
        # number of phrase groups.
        # NOTE(review): the +1 assumes sre_parse.Pattern.groups also counts
        # the implicit group 0 -- confirm against sre_parse.
        s.groups = len(p)+1
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)
    def scan(self, string):
        """Tokenize string from its start.

        Returns a tuple (results, remainder): the list of non-None action
        results, and the tail of string that was not consumed because no
        phrase matched there or the match was empty.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while 1:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # empty match: stop here instead of looping forever
                break
            # lastindex is the 1-based number of the phrase group that matched
            action = self.lexicon[m.lastindex-1][1]
            if callable(action):
                # expose the current match object to the action
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j
        return result, string[i:]
0310 

Generated by PyXR 0.9.4
SourceForge.net Logo