0001 # 0002 # Secret Labs' Regular Expression Engine 0003 # 0004 # convert template to internal format 0005 # 0006 # Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 0007 # 0008 # See the sre.py file for information on usage and redistribution. 0009 # 0010 0011 """Internal support module for sre""" 0012 0013 import _sre, sys 0014 0015 from sre_constants import * 0016 0017 assert _sre.MAGIC == MAGIC, "SRE module mismatch" 0018 0019 if _sre.CODESIZE == 2: 0020 MAXCODE = 65535 0021 else: 0022 MAXCODE = 0xFFFFFFFFL 0023 0024 def _identityfunction(x): 0025 return x 0026 0027 def _compile(code, pattern, flags): 0028 # internal: compile a (sub)pattern 0029 emit = code.append 0030 _len = len 0031 LITERAL_CODES = {LITERAL:1, NOT_LITERAL:1} 0032 REPEATING_CODES = {REPEAT:1, MIN_REPEAT:1, MAX_REPEAT:1} 0033 SUCCESS_CODES = {SUCCESS:1, FAILURE:1} 0034 ASSERT_CODES = {ASSERT:1, ASSERT_NOT:1} 0035 for op, av in pattern: 0036 if op in LITERAL_CODES: 0037 if flags & SRE_FLAG_IGNORECASE: 0038 emit(OPCODES[OP_IGNORE[op]]) 0039 emit(_sre.getlower(av, flags)) 0040 else: 0041 emit(OPCODES[op]) 0042 emit(av) 0043 elif op is IN: 0044 if flags & SRE_FLAG_IGNORECASE: 0045 emit(OPCODES[OP_IGNORE[op]]) 0046 def fixup(literal, flags=flags): 0047 return _sre.getlower(literal, flags) 0048 else: 0049 emit(OPCODES[op]) 0050 fixup = _identityfunction 0051 skip = _len(code); emit(0) 0052 _compile_charset(av, flags, code, fixup) 0053 code[skip] = _len(code) - skip 0054 elif op is ANY: 0055 if flags & SRE_FLAG_DOTALL: 0056 emit(OPCODES[ANY_ALL]) 0057 else: 0058 emit(OPCODES[ANY]) 0059 elif op in REPEATING_CODES: 0060 if flags & SRE_FLAG_TEMPLATE: 0061 raise error, "internal: unsupported template operator" 0062 emit(OPCODES[REPEAT]) 0063 skip = _len(code); emit(0) 0064 emit(av[0]) 0065 emit(av[1]) 0066 _compile(code, av[2], flags) 0067 emit(OPCODES[SUCCESS]) 0068 code[skip] = _len(code) - skip 0069 elif _simple(av) and op is not REPEAT: 0070 if op is MAX_REPEAT: 0071 
emit(OPCODES[REPEAT_ONE]) 0072 else: 0073 emit(OPCODES[MIN_REPEAT_ONE]) 0074 skip = _len(code); emit(0) 0075 emit(av[0]) 0076 emit(av[1]) 0077 _compile(code, av[2], flags) 0078 emit(OPCODES[SUCCESS]) 0079 code[skip] = _len(code) - skip 0080 else: 0081 emit(OPCODES[REPEAT]) 0082 skip = _len(code); emit(0) 0083 emit(av[0]) 0084 emit(av[1]) 0085 _compile(code, av[2], flags) 0086 code[skip] = _len(code) - skip 0087 if op is MAX_REPEAT: 0088 emit(OPCODES[MAX_UNTIL]) 0089 else: 0090 emit(OPCODES[MIN_UNTIL]) 0091 elif op is SUBPATTERN: 0092 if av[0]: 0093 emit(OPCODES[MARK]) 0094 emit((av[0]-1)*2) 0095 # _compile_info(code, av[1], flags) 0096 _compile(code, av[1], flags) 0097 if av[0]: 0098 emit(OPCODES[MARK]) 0099 emit((av[0]-1)*2+1) 0100 elif op in SUCCESS_CODES: 0101 emit(OPCODES[op]) 0102 elif op in ASSERT_CODES: 0103 emit(OPCODES[op]) 0104 skip = _len(code); emit(0) 0105 if av[0] >= 0: 0106 emit(0) # look ahead 0107 else: 0108 lo, hi = av[1].getwidth() 0109 if lo != hi: 0110 raise error, "look-behind requires fixed-width pattern" 0111 emit(lo) # look behind 0112 _compile(code, av[1], flags) 0113 emit(OPCODES[SUCCESS]) 0114 code[skip] = _len(code) - skip 0115 elif op is CALL: 0116 emit(OPCODES[op]) 0117 skip = _len(code); emit(0) 0118 _compile(code, av, flags) 0119 emit(OPCODES[SUCCESS]) 0120 code[skip] = _len(code) - skip 0121 elif op is AT: 0122 emit(OPCODES[op]) 0123 if flags & SRE_FLAG_MULTILINE: 0124 av = AT_MULTILINE.get(av, av) 0125 if flags & SRE_FLAG_LOCALE: 0126 av = AT_LOCALE.get(av, av) 0127 elif flags & SRE_FLAG_UNICODE: 0128 av = AT_UNICODE.get(av, av) 0129 emit(ATCODES[av]) 0130 elif op is BRANCH: 0131 emit(OPCODES[op]) 0132 tail = [] 0133 tailappend = tail.append 0134 for av in av[1]: 0135 skip = _len(code); emit(0) 0136 # _compile_info(code, av, flags) 0137 _compile(code, av, flags) 0138 emit(OPCODES[JUMP]) 0139 tailappend(_len(code)); emit(0) 0140 code[skip] = _len(code) - skip 0141 emit(0) # end of branch 0142 for tail in tail: 0143 code[tail] = 
_len(code) - tail 0144 elif op is CATEGORY: 0145 emit(OPCODES[op]) 0146 if flags & SRE_FLAG_LOCALE: 0147 av = CH_LOCALE[av] 0148 elif flags & SRE_FLAG_UNICODE: 0149 av = CH_UNICODE[av] 0150 emit(CHCODES[av]) 0151 elif op is GROUPREF: 0152 if flags & SRE_FLAG_IGNORECASE: 0153 emit(OPCODES[OP_IGNORE[op]]) 0154 else: 0155 emit(OPCODES[op]) 0156 emit(av-1) 0157 elif op is GROUPREF_EXISTS: 0158 emit(OPCODES[op]) 0159 emit((av[0]-1)*2) 0160 skipyes = _len(code); emit(0) 0161 _compile(code, av[1], flags) 0162 if av[2]: 0163 emit(OPCODES[JUMP]) 0164 skipno = _len(code); emit(0) 0165 code[skipyes] = _len(code) - skipyes + 1 0166 _compile(code, av[2], flags) 0167 code[skipno] = _len(code) - skipno 0168 else: 0169 code[skipyes] = _len(code) - skipyes + 1 0170 else: 0171 raise ValueError, ("unsupported operand type", op) 0172 0173 def _compile_charset(charset, flags, code, fixup=None): 0174 # compile charset subprogram 0175 emit = code.append 0176 if fixup is None: 0177 fixup = _identityfunction 0178 for op, av in _optimize_charset(charset, fixup): 0179 emit(OPCODES[op]) 0180 if op is NEGATE: 0181 pass 0182 elif op is LITERAL: 0183 emit(fixup(av)) 0184 elif op is RANGE: 0185 emit(fixup(av[0])) 0186 emit(fixup(av[1])) 0187 elif op is CHARSET: 0188 code.extend(av) 0189 elif op is BIGCHARSET: 0190 code.extend(av) 0191 elif op is CATEGORY: 0192 if flags & SRE_FLAG_LOCALE: 0193 emit(CHCODES[CH_LOCALE[av]]) 0194 elif flags & SRE_FLAG_UNICODE: 0195 emit(CHCODES[CH_UNICODE[av]]) 0196 else: 0197 emit(CHCODES[av]) 0198 else: 0199 raise error, "internal: unsupported set operator" 0200 emit(OPCODES[FAILURE]) 0201 0202 def _optimize_charset(charset, fixup): 0203 # internal: optimize character set 0204 out = [] 0205 outappend = out.append 0206 charmap = [0]*256 0207 try: 0208 for op, av in charset: 0209 if op is NEGATE: 0210 outappend((op, av)) 0211 elif op is LITERAL: 0212 charmap[fixup(av)] = 1 0213 elif op is RANGE: 0214 for i in range(fixup(av[0]), fixup(av[1])+1): 0215 charmap[i] = 1 
0216 elif op is CATEGORY: 0217 # XXX: could append to charmap tail 0218 return charset # cannot compress 0219 except IndexError: 0220 # character set contains unicode characters 0221 return _optimize_unicode(charset, fixup) 0222 # compress character map 0223 i = p = n = 0 0224 runs = [] 0225 runsappend = runs.append 0226 for c in charmap: 0227 if c: 0228 if n == 0: 0229 p = i 0230 n = n + 1 0231 elif n: 0232 runsappend((p, n)) 0233 n = 0 0234 i = i + 1 0235 if n: 0236 runsappend((p, n)) 0237 if len(runs) <= 2: 0238 # use literal/range 0239 for p, n in runs: 0240 if n == 1: 0241 outappend((LITERAL, p)) 0242 else: 0243 outappend((RANGE, (p, p+n-1))) 0244 if len(out) < len(charset): 0245 return out 0246 else: 0247 # use bitmap 0248 data = _mk_bitmap(charmap) 0249 outappend((CHARSET, data)) 0250 return out 0251 return charset 0252 0253 def _mk_bitmap(bits): 0254 data = [] 0255 dataappend = data.append 0256 if _sre.CODESIZE == 2: 0257 start = (1, 0) 0258 else: 0259 start = (1L, 0L) 0260 m, v = start 0261 for c in bits: 0262 if c: 0263 v = v + m 0264 m = m + m 0265 if m > MAXCODE: 0266 dataappend(v) 0267 m, v = start 0268 return data 0269 0270 # To represent a big charset, first a bitmap of all characters in the 0271 # set is constructed. Then, this bitmap is sliced into chunks of 256 0272 # characters, duplicate chunks are eliminitated, and each chunk is 0273 # given a number. In the compiled expression, the charset is 0274 # represented by a 16-bit word sequence, consisting of one word for 0275 # the number of different chunks, a sequence of 256 bytes (128 words) 0276 # of chunk numbers indexed by their original chunk position, and a 0277 # sequence of chunks (16 words each). 0278 0279 # Compression is normally good: in a typical charset, large ranges of 0280 # Unicode will be either completely excluded (e.g. if only cyrillic 0281 # letters are to be matched), or completely included (e.g. if large 0282 # subranges of Kanji match). 
These ranges will be represented by 0283 # chunks of all one-bits or all zero-bits. 0284 0285 # Matching can be also done efficiently: the more significant byte of 0286 # the Unicode character is an index into the chunk number, and the 0287 # less significant byte is a bit index in the chunk (just like the 0288 # CHARSET matching). 0289 0290 # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets 0291 # of the basic multilingual plane; an efficient representation 0292 # for all of UTF-16 has not yet been developed. This means, 0293 # in particular, that negated charsets cannot be represented as 0294 # bigcharsets. 0295 0296 def _optimize_unicode(charset, fixup): 0297 try: 0298 import array 0299 except ImportError: 0300 return charset 0301 charmap = [0]*65536 0302 negate = 0 0303 try: 0304 for op, av in charset: 0305 if op is NEGATE: 0306 negate = 1 0307 elif op is LITERAL: 0308 charmap[fixup(av)] = 1 0309 elif op is RANGE: 0310 for i in xrange(fixup(av[0]), fixup(av[1])+1): 0311 charmap[i] = 1 0312 elif op is CATEGORY: 0313 # XXX: could expand category 0314 return charset # cannot compress 0315 except IndexError: 0316 # non-BMP characters 0317 return charset 0318 if negate: 0319 if sys.maxunicode != 65535: 0320 # XXX: negation does not work with big charsets 0321 return charset 0322 for i in xrange(65536): 0323 charmap[i] = not charmap[i] 0324 comps = {} 0325 mapping = [0]*256 0326 block = 0 0327 data = [] 0328 for i in xrange(256): 0329 chunk = tuple(charmap[i*256:(i+1)*256]) 0330 new = comps.setdefault(chunk, block) 0331 mapping[i] = new 0332 if new == block: 0333 block = block + 1 0334 data = data + _mk_bitmap(chunk) 0335 header = [block] 0336 if _sre.CODESIZE == 2: 0337 code = 'H' 0338 else: 0339 code = 'I' 0340 # Convert block indices to byte array of 256 bytes 0341 mapping = array.array('b', mapping).tostring() 0342 # Convert byte array to word array 0343 mapping = array.array(code, mapping) 0344 assert mapping.itemsize == _sre.CODESIZE 0345 header 
= header + mapping.tolist() 0346 data[0:0] = header 0347 return [(BIGCHARSET, data)] 0348 0349 def _simple(av): 0350 # check if av is a "simple" operator 0351 lo, hi = av[2].getwidth() 0352 if lo == 0 and hi == MAXREPEAT: 0353 raise error, "nothing to repeat" 0354 return lo == hi == 1 and av[2][0][0] != SUBPATTERN 0355 0356 def _compile_info(code, pattern, flags): 0357 # internal: compile an info block. in the current version, 0358 # this contains min/max pattern width, and an optional literal 0359 # prefix or a character map 0360 lo, hi = pattern.getwidth() 0361 if lo == 0: 0362 return # not worth it 0363 # look for a literal prefix 0364 prefix = [] 0365 prefixappend = prefix.append 0366 prefix_skip = 0 0367 charset = [] # not used 0368 charsetappend = charset.append 0369 if not (flags & SRE_FLAG_IGNORECASE): 0370 # look for literal prefix 0371 for op, av in pattern.data: 0372 if op is LITERAL: 0373 if len(prefix) == prefix_skip: 0374 prefix_skip = prefix_skip + 1 0375 prefixappend(av) 0376 elif op is SUBPATTERN and len(av[1]) == 1: 0377 op, av = av[1][0] 0378 if op is LITERAL: 0379 prefixappend(av) 0380 else: 0381 break 0382 else: 0383 break 0384 # if no prefix, look for charset prefix 0385 if not prefix and pattern.data: 0386 op, av = pattern.data[0] 0387 if op is SUBPATTERN and av[1]: 0388 op, av = av[1][0] 0389 if op is LITERAL: 0390 charsetappend((op, av)) 0391 elif op is BRANCH: 0392 c = [] 0393 cappend = c.append 0394 for p in av[1]: 0395 if not p: 0396 break 0397 op, av = p[0] 0398 if op is LITERAL: 0399 cappend((op, av)) 0400 else: 0401 break 0402 else: 0403 charset = c 0404 elif op is BRANCH: 0405 c = [] 0406 cappend = c.append 0407 for p in av[1]: 0408 if not p: 0409 break 0410 op, av = p[0] 0411 if op is LITERAL: 0412 cappend((op, av)) 0413 else: 0414 break 0415 else: 0416 charset = c 0417 elif op is IN: 0418 charset = av 0419 ## if prefix: 0420 ## print "*** PREFIX", prefix, prefix_skip 0421 ## if charset: 0422 ## print "*** CHARSET", charset 0423 # 
add an info block 0424 emit = code.append 0425 emit(OPCODES[INFO]) 0426 skip = len(code); emit(0) 0427 # literal flag 0428 mask = 0 0429 if prefix: 0430 mask = SRE_INFO_PREFIX 0431 if len(prefix) == prefix_skip == len(pattern.data): 0432 mask = mask + SRE_INFO_LITERAL 0433 elif charset: 0434 mask = mask + SRE_INFO_CHARSET 0435 emit(mask) 0436 # pattern length 0437 if lo < MAXCODE: 0438 emit(lo) 0439 else: 0440 emit(MAXCODE) 0441 prefix = prefix[:MAXCODE] 0442 if hi < MAXCODE: 0443 emit(hi) 0444 else: 0445 emit(0) 0446 # add literal prefix 0447 if prefix: 0448 emit(len(prefix)) # length 0449 emit(prefix_skip) # skip 0450 code.extend(prefix) 0451 # generate overlap table 0452 table = [-1] + ([0]*len(prefix)) 0453 for i in xrange(len(prefix)): 0454 table[i+1] = table[i]+1 0455 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: 0456 table[i+1] = table[table[i+1]-1]+1 0457 code.extend(table[1:]) # don't store first entry 0458 elif charset: 0459 _compile_charset(charset, flags, code) 0460 code[skip] = len(code) - skip 0461 0462 try: 0463 unicode 0464 except NameError: 0465 STRING_TYPES = (type(""),) 0466 else: 0467 STRING_TYPES = (type(""), type(unicode(""))) 0468 0469 def isstring(obj): 0470 for tp in STRING_TYPES: 0471 if isinstance(obj, tp): 0472 return 1 0473 return 0 0474 0475 def _code(p, flags): 0476 0477 flags = p.pattern.flags | flags 0478 code = [] 0479 0480 # compile info block 0481 _compile_info(code, p, flags) 0482 0483 # compile the pattern 0484 _compile(code, p.data, flags) 0485 0486 code.append(OPCODES[SUCCESS]) 0487 0488 return code 0489 0490 def compile(p, flags=0): 0491 # internal: convert pattern list to internal format 0492 0493 if isstring(p): 0494 import sre_parse 0495 pattern = p 0496 p = sre_parse.parse(p, flags) 0497 else: 0498 pattern = None 0499 0500 code = _code(p, flags) 0501 0502 # print code 0503 0504 # XXX: <fl> get rid of this limitation! 
0505 if p.pattern.groups > 100: 0506 raise AssertionError( 0507 "sorry, but this version only supports 100 named groups" 0508 ) 0509 0510 # map in either direction 0511 groupindex = p.pattern.groupdict 0512 indexgroup = [None] * p.pattern.groups 0513 for k, i in groupindex.items(): 0514 indexgroup[i] = k 0515 0516 return _sre.compile( 0517 pattern, flags, code, 0518 p.pattern.groups-1, 0519 groupindex, indexgroup 0520 ) 0521
# Generated by PyXR 0.9.4