0001 # 0002 # Secret Labs' Regular Expression Engine 0003 # 0004 # convert template to internal format 0005 # 0006 # Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 0007 # 0008 # See the sre.py file for information on usage and redistribution. 0009 # 0010 0011 """Internal support module for sre""" 0012 0013 import _sre, sys 0014 0015 from sre_constants import * 0016 0017 assert _sre.MAGIC == MAGIC, "SRE module mismatch" 0018 0019 if _sre.CODESIZE == 2: 0020 MAXCODE = 65535 0021 else: 0022 MAXCODE = 0xFFFFFFFFL 0023 0024 def _identityfunction(x): 0025 return x 0026 0027 def _compile(code, pattern, flags): 0028 # internal: compile a (sub)pattern 0029 emit = code.append 0030 _len = len 0031 LITERAL_CODES = {LITERAL:1, NOT_LITERAL:1} 0032 REPEATING_CODES = {REPEAT:1, MIN_REPEAT:1, MAX_REPEAT:1} 0033 SUCCESS_CODES = {SUCCESS:1, FAILURE:1} 0034 ASSERT_CODES = {ASSERT:1, ASSERT_NOT:1} 0035 for op, av in pattern: 0036 if op in LITERAL_CODES: 0037 if flags & SRE_FLAG_IGNORECASE: 0038 emit(OPCODES[OP_IGNORE[op]]) 0039 emit(_sre.getlower(av, flags)) 0040 else: 0041 emit(OPCODES[op]) 0042 emit(av) 0043 elif op is IN: 0044 if flags & SRE_FLAG_IGNORECASE: 0045 emit(OPCODES[OP_IGNORE[op]]) 0046 def fixup(literal, flags=flags): 0047 return _sre.getlower(literal, flags) 0048 else: 0049 emit(OPCODES[op]) 0050 fixup = _identityfunction 0051 skip = _len(code); emit(0) 0052 _compile_charset(av, flags, code, fixup) 0053 code[skip] = _len(code) - skip 0054 elif op is ANY: 0055 if flags & SRE_FLAG_DOTALL: 0056 emit(OPCODES[ANY_ALL]) 0057 else: 0058 emit(OPCODES[ANY]) 0059 elif op in REPEATING_CODES: 0060 if flags & SRE_FLAG_TEMPLATE: 0061 raise error, "internal: unsupported template operator" 0062 emit(OPCODES[REPEAT]) 0063 skip = _len(code); emit(0) 0064 emit(av[0]) 0065 emit(av[1]) 0066 _compile(code, av[2], flags) 0067 emit(OPCODES[SUCCESS]) 0068 code[skip] = _len(code) - skip 0069 elif _simple(av) and op is not REPEAT: 0070 if op is MAX_REPEAT: 0071 
emit(OPCODES[REPEAT_ONE]) 0072 else: 0073 emit(OPCODES[MIN_REPEAT_ONE]) 0074 skip = _len(code); emit(0) 0075 emit(av[0]) 0076 emit(av[1]) 0077 _compile(code, av[2], flags) 0078 emit(OPCODES[SUCCESS]) 0079 code[skip] = _len(code) - skip 0080 else: 0081 emit(OPCODES[REPEAT]) 0082 skip = _len(code); emit(0) 0083 emit(av[0]) 0084 emit(av[1]) 0085 _compile(code, av[2], flags) 0086 code[skip] = _len(code) - skip 0087 if op is MAX_REPEAT: 0088 emit(OPCODES[MAX_UNTIL]) 0089 else: 0090 emit(OPCODES[MIN_UNTIL]) 0091 elif op is SUBPATTERN: 0092 if av[0]: 0093 emit(OPCODES[MARK]) 0094 emit((av[0]-1)*2) 0095 # _compile_info(code, av[1], flags) 0096 _compile(code, av[1], flags) 0097 if av[0]: 0098 emit(OPCODES[MARK]) 0099 emit((av[0]-1)*2+1) 0100 elif op in SUCCESS_CODES: 0101 emit(OPCODES[op]) 0102 elif op in ASSERT_CODES: 0103 emit(OPCODES[op]) 0104 skip = _len(code); emit(0) 0105 if av[0] >= 0: 0106 emit(0) # look ahead 0107 else: 0108 lo, hi = av[1].getwidth() 0109 if lo != hi: 0110 raise error, "look-behind requires fixed-width pattern" 0111 emit(lo) # look behind 0112 _compile(code, av[1], flags) 0113 emit(OPCODES[SUCCESS]) 0114 code[skip] = _len(code) - skip 0115 elif op is CALL: 0116 emit(OPCODES[op]) 0117 skip = _len(code); emit(0) 0118 _compile(code, av, flags) 0119 emit(OPCODES[SUCCESS]) 0120 code[skip] = _len(code) - skip 0121 elif op is AT: 0122 emit(OPCODES[op]) 0123 if flags & SRE_FLAG_MULTILINE: 0124 av = AT_MULTILINE.get(av, av) 0125 if flags & SRE_FLAG_LOCALE: 0126 av = AT_LOCALE.get(av, av) 0127 elif flags & SRE_FLAG_UNICODE: 0128 av = AT_UNICODE.get(av, av) 0129 emit(ATCODES[av]) 0130 elif op is BRANCH: 0131 emit(OPCODES[op]) 0132 tail = [] 0133 tailappend = tail.append 0134 for av in av[1]: 0135 skip = _len(code); emit(0) 0136 # _compile_info(code, av, flags) 0137 _compile(code, av, flags) 0138 emit(OPCODES[JUMP]) 0139 tailappend(_len(code)); emit(0) 0140 code[skip] = _len(code) - skip 0141 emit(0) # end of branch 0142 for tail in tail: 0143 code[tail] = 
_len(code) - tail 0144 elif op is CATEGORY: 0145 emit(OPCODES[op]) 0146 if flags & SRE_FLAG_LOCALE: 0147 av = CH_LOCALE[av] 0148 elif flags & SRE_FLAG_UNICODE: 0149 av = CH_UNICODE[av] 0150 emit(CHCODES[av]) 0151 elif op is GROUPREF: 0152 if flags & SRE_FLAG_IGNORECASE: 0153 emit(OPCODES[OP_IGNORE[op]]) 0154 else: 0155 emit(OPCODES[op]) 0156 emit(av-1) 0157 elif op is GROUPREF_EXISTS: 0158 emit(OPCODES[op]) 0159 emit((av[0]-1)*2) 0160 skipyes = _len(code); emit(0) 0161 _compile(code, av[1], flags) 0162 if av[2]: 0163 emit(OPCODES[JUMP]) 0164 skipno = _len(code); emit(0) 0165 code[skipyes] = _len(code) - skipyes + 1 0166 _compile(code, av[2], flags) 0167 code[skipno] = _len(code) - skipno 0168 else: 0169 code[skipyes] = _len(code) - skipyes + 1 0170 else: 0171 raise ValueError, ("unsupported operand type", op) 0172 0173 def _compile_charset(charset, flags, code, fixup=None): 0174 # compile charset subprogram 0175 emit = code.append 0176 if fixup is None: 0177 fixup = _identityfunction 0178 for op, av in _optimize_charset(charset, fixup): 0179 emit(OPCODES[op]) 0180 if op is NEGATE: 0181 pass 0182 elif op is LITERAL: 0183 emit(fixup(av)) 0184 elif op is RANGE: 0185 emit(fixup(av[0])) 0186 emit(fixup(av[1])) 0187 elif op is CHARSET: 0188 code.extend(av) 0189 elif op is BIGCHARSET: 0190 code.extend(av) 0191 elif op is CATEGORY: 0192 if flags & SRE_FLAG_LOCALE: 0193 emit(CHCODES[CH_LOCALE[av]]) 0194 elif flags & SRE_FLAG_UNICODE: 0195 emit(CHCODES[CH_UNICODE[av]]) 0196 else: 0197 emit(CHCODES[av]) 0198 else: 0199 raise error, "internal: unsupported set operator" 0200 emit(OPCODES[FAILURE]) 0201 0202 def _optimize_charset(charset, fixup): 0203 # internal: optimize character set 0204 out = [] 0205 outappend = out.append 0206 charmap = [0]*256 0207 try: 0208 for op, av in charset: 0209 if op is NEGATE: 0210 outappend((op, av)) 0211 elif op is LITERAL: 0212 charmap[fixup(av)] = 1 0213 elif op is RANGE: 0214 for i in range(fixup(av[0]), fixup(av[1])+1): 0215 charmap[i] = 1 
0216 elif op is CATEGORY: 0217 # XXX: could append to charmap tail 0218 return charset # cannot compress 0219 except IndexError: 0220 # character set contains unicode characters 0221 return _optimize_unicode(charset, fixup) 0222 # compress character map 0223 i = p = n = 0 0224 runs = [] 0225 runsappend = runs.append 0226 for c in charmap: 0227 if c: 0228 if n == 0: 0229 p = i 0230 n = n + 1 0231 elif n: 0232 runsappend((p, n)) 0233 n = 0 0234 i = i + 1 0235 if n: 0236 runsappend((p, n)) 0237 if len(runs) <= 2: 0238 # use literal/range 0239 for p, n in runs: 0240 if n == 1: 0241 outappend((LITERAL, p)) 0242 else: 0243 outappend((RANGE, (p, p+n-1))) 0244 if len(out) < len(charset): 0245 return out 0246 else: 0247 # use bitmap 0248 data = _mk_bitmap(charmap) 0249 outappend((CHARSET, data)) 0250 return out 0251 return charset 0252 0253 def _mk_bitmap(bits): 0254 data = [] 0255 dataappend = data.append 0256 if _sre.CODESIZE == 2: 0257 start = (1, 0) 0258 else: 0259 start = (1L, 0L) 0260 m, v = start 0261 for c in bits: 0262 if c: 0263 v = v + m 0264 m = m + m 0265 if m > MAXCODE: 0266 dataappend(v) 0267 m, v = start 0268 return data 0269 0270 # To represent a big charset, first a bitmap of all characters in the 0271 # set is constructed. Then, this bitmap is sliced into chunks of 256 0272 # characters, duplicate chunks are eliminitated, and each chunk is 0273 # given a number. In the compiled expression, the charset is 0274 # represented by a 16-bit word sequence, consisting of one word for 0275 # the number of different chunks, a sequence of 256 bytes (128 words) 0276 # of chunk numbers indexed by their original chunk position, and a 0277 # sequence of chunks (16 words each). 0278 0279 # Compression is normally good: in a typical charset, large ranges of 0280 # Unicode will be either completely excluded (e.g. if only cyrillic 0281 # letters are to be matched), or completely included (e.g. if large 0282 # subranges of Kanji match). 
These ranges will be represented by 0283 # chunks of all one-bits or all zero-bits. 0284 0285 # Matching can be also done efficiently: the more significant byte of 0286 # the Unicode character is an index into the chunk number, and the 0287 # less significant byte is a bit index in the chunk (just like the 0288 # CHARSET matching). 0289 0290 # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets 0291 # of the basic multilingual plane; an efficient representation 0292 # for all of UTF-16 has not yet been developed. This means, 0293 # in particular, that negated charsets cannot be represented as 0294 # bigcharsets. 0295 0296 def _optimize_unicode(charset, fixup): 0297 try: 0298 import array 0299 except ImportError: 0300 return charset 0301 charmap = [0]*65536 0302 negate = 0 0303 try: 0304 for op, av in charset: 0305 if op is NEGATE: 0306 negate = 1 0307 elif op is LITERAL: 0308 charmap[fixup(av)] = 1 0309 elif op is RANGE: 0310 for i in xrange(fixup(av[0]), fixup(av[1])+1): 0311 charmap[i] = 1 0312 elif op is CATEGORY: 0313 # XXX: could expand category 0314 return charset # cannot compress 0315 except IndexError: 0316 # non-BMP characters 0317 return charset 0318 if negate: 0319 if sys.maxunicode != 65535: 0320 # XXX: negation does not work with big charsets 0321 return charset 0322 for i in xrange(65536): 0323 charmap[i] = not charmap[i] 0324 comps = {} 0325 mapping = [0]*256 0326 block = 0 0327 data = [] 0328 for i in xrange(256): 0329 chunk = tuple(charmap[i*256:(i+1)*256]) 0330 new = comps.setdefault(chunk, block) 0331 mapping[i] = new 0332 if new == block: 0333 block = block + 1 0334 data = data + _mk_bitmap(chunk) 0335 header = [block] 0336 if _sre.CODESIZE == 2: 0337 code = 'H' 0338 else: 0339 code = 'I' 0340 # Convert block indices to byte array of 256 bytes 0341 mapping = array.array('b', mapping).tostring() 0342 # Convert byte array to word array 0343 mapping = array.array(code, mapping) 0344 assert mapping.itemsize == _sre.CODESIZE 0345 header 
= header + mapping.tolist() 0346 data[0:0] = header 0347 return [(BIGCHARSET, data)] 0348 0349 def _simple(av): 0350 # check if av is a "simple" operator 0351 lo, hi = av[2].getwidth() 0352 if lo == 0 and hi == MAXREPEAT: 0353 raise error, "nothing to repeat" 0354 return lo == hi == 1 and av[2][0][0] != SUBPATTERN 0355 0356 def _compile_info(code, pattern, flags): 0357 # internal: compile an info block. in the current version, 0358 # this contains min/max pattern width, and an optional literal 0359 # prefix or a character map 0360 lo, hi = pattern.getwidth() 0361 if lo == 0: 0362 return # not worth it 0363 # look for a literal prefix 0364 prefix = [] 0365 prefixappend = prefix.append 0366 prefix_skip = 0 0367 charset = [] # not used 0368 charsetappend = charset.append 0369 if not (flags & SRE_FLAG_IGNORECASE): 0370 # look for literal prefix 0371 for op, av in pattern.data: 0372 if op is LITERAL: 0373 if len(prefix) == prefix_skip: 0374 prefix_skip = prefix_skip + 1 0375 prefixappend(av) 0376 elif op is SUBPATTERN and len(av[1]) == 1: 0377 op, av = av[1][0] 0378 if op is LITERAL: 0379 prefixappend(av) 0380 else: 0381 break 0382 else: 0383 break 0384 # if no prefix, look for charset prefix 0385 if not prefix and pattern.data: 0386 op, av = pattern.data[0] 0387 if op is SUBPATTERN and av[1]: 0388 op, av = av[1][0] 0389 if op is LITERAL: 0390 charsetappend((op, av)) 0391 elif op is BRANCH: 0392 c = [] 0393 cappend = c.append 0394 for p in av[1]: 0395 if not p: 0396 break 0397 op, av = p[0] 0398 if op is LITERAL: 0399 cappend((op, av)) 0400 else: 0401 break 0402 else: 0403 charset = c 0404 elif op is BRANCH: 0405 c = [] 0406 cappend = c.append 0407 for p in av[1]: 0408 if not p: 0409 break 0410 op, av = p[0] 0411 if op is LITERAL: 0412 cappend((op, av)) 0413 else: 0414 break 0415 else: 0416 charset = c 0417 elif op is IN: 0418 charset = av 0419 ## if prefix: 0420 ## print "*** PREFIX", prefix, prefix_skip 0421 ## if charset: 0422 ## print "*** CHARSET", charset 0423 # 
add an info block 0424 emit = code.append 0425 emit(OPCODES[INFO]) 0426 skip = len(code); emit(0) 0427 # literal flag 0428 mask = 0 0429 if prefix: 0430 mask = SRE_INFO_PREFIX 0431 if len(prefix) == prefix_skip == len(pattern.data): 0432 mask = mask + SRE_INFO_LITERAL 0433 elif charset: 0434 mask = mask + SRE_INFO_CHARSET 0435 emit(mask) 0436 # pattern length 0437 if lo < MAXCODE: 0438 emit(lo) 0439 else: 0440 emit(MAXCODE) 0441 prefix = prefix[:MAXCODE] 0442 if hi < MAXCODE: 0443 emit(hi) 0444 else: 0445 emit(0) 0446 # add literal prefix 0447 if prefix: 0448 emit(len(prefix)) # length 0449 emit(prefix_skip) # skip 0450 code.extend(prefix) 0451 # generate overlap table 0452 table = [-1] + ([0]*len(prefix)) 0453 for i in xrange(len(prefix)): 0454 table[i+1] = table[i]+1 0455 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: 0456 table[i+1] = table[table[i+1]-1]+1 0457 code.extend(table[1:]) # don't store first entry 0458 elif charset: 0459 _compile_charset(charset, flags, code) 0460 code[skip] = len(code) - skip 0461 0462 try: 0463 unicode 0464 except NameError: 0465 STRING_TYPES = (type(""),) 0466 else: 0467 STRING_TYPES = (type(""), type(unicode(""))) 0468 0469 def isstring(obj): 0470 for tp in STRING_TYPES: 0471 if isinstance(obj, tp): 0472 return 1 0473 return 0 0474 0475 def _code(p, flags): 0476 0477 flags = p.pattern.flags | flags 0478 code = [] 0479 0480 # compile info block 0481 _compile_info(code, p, flags) 0482 0483 # compile the pattern 0484 _compile(code, p.data, flags) 0485 0486 code.append(OPCODES[SUCCESS]) 0487 0488 return code 0489 0490 def compile(p, flags=0): 0491 # internal: convert pattern list to internal format 0492 0493 if isstring(p): 0494 import sre_parse 0495 pattern = p 0496 p = sre_parse.parse(p, flags) 0497 else: 0498 pattern = None 0499 0500 code = _code(p, flags) 0501 0502 # print code 0503 0504 # XXX: <fl> get rid of this limitation! 
0505 if p.pattern.groups > 100: 0506 raise AssertionError( 0507 "sorry, but this version only supports 100 named groups" 0508 ) 0509 0510 # map in either direction 0511 groupindex = p.pattern.groupdict 0512 indexgroup = [None] * p.pattern.groups 0513 for k, i in groupindex.items(): 0514 indexgroup[i] = k 0515 0516 return _sre.compile( 0517 pattern, flags, code, 0518 p.pattern.groups-1, 0519 groupindex, indexgroup 0520 ) 0521
# Generated by PyXR 0.9.4