import re
import sys

# Reason last stmt is continued (or C_NONE if it's not).
C_NONE, C_BACKSLASH, C_STRING, C_BRACKET = range(4)

if 0:   # for throwaway debugging output
    def dump(*stuff):
        sys.__stdout__.write(" ".join(map(str, stuff)) + "\n")

# Find what looks like the start of a popular stmt.

_synchre = re.compile(r"""
    ^
    [ \t]*
    (?: if
    |   for
    |   while
    |   else
    |   def
    |   return
    |   assert
    |   break
    |   class
    |   continue
    |   elif
    |   try
    |   except
    |   raise
    |   import
    |   yield
    )
    \b
""", re.VERBOSE | re.MULTILINE).search

# Match blank line or non-indenting comment line.

_junkre = re.compile(r"""
    [ \t]*
    (?: \# \S .* )?
    \n
""", re.VERBOSE).match

# Match any flavor of string; the terminating quote is optional
# so that we're robust in the face of incomplete program text.

_match_stringre = re.compile(r"""
    \""" [^"\\]* (?:
                     (?: \\. | "(?!"") )
                     [^"\\]*
                 )*
    (?: \""" )?

|   " [^"\\\n]* (?: \\. [^"\\\n]* )* "?

|   ''' [^'\\]* (?:
                   (?: \\. | '(?!'') )
                   [^'\\]*
                )*
    (?: ''' )?

|   ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
""", re.VERBOSE | re.DOTALL).match

# Match a line that starts with something interesting;
# used to find the first item of a bracket structure.

_itemre = re.compile(r"""
    [ \t]*
    [^\s#\\]    # if we match, m.end()-1 is the interesting char
""", re.VERBOSE).match

# Match start of stmts that should be followed by a dedent.

_closere = re.compile(r"""
    \s*
    (?: return
    |   break
    |   continue
    |   raise
    |   pass
    )
    \b
""", re.VERBOSE).match

# Chew up non-special chars as quickly as possible.  If match is
# successful, m.end() less 1 is the index of the last boring char
# matched.  If match is unsuccessful, the string starts with an
# interesting char.

_chew_ordinaryre = re.compile(r"""
    [^[\](){}#'"\\]+
""", re.VERBOSE).match

# Substitute every character outside chr(0)..chr(126).  Used by
# Parser.set_str to replace, one for one, the characters the parse
# functions have no idea what to do with.

_squash_exoticre = re.compile(r"[^\x00-\x7e]").sub

# Match optional whitespace followed by a backslash; used to tell whether
# an "=" is the last interesting thing on a backslash-continued line.

_lone_backslashre = re.compile(r"\s*\\").match

# Build translation table to map uninteresting chars to "x", open
# brackets to "(", and close brackets to ")".

_tran = ['x'] * 256
for ch in "({[":
    _tran[ord(ch)] = '('
for ch in ")}]":
    _tran[ord(ch)] = ')'
for ch in "\"'\\\n#":
    _tran[ord(ch)] = ch
_tran = ''.join(_tran)
del ch

# The Python 2 unicode type, or None where the name "unicode" does not
# exist (in which case no object's type is ever UnicodeType).
try:
    UnicodeType = type(unicode(""))
except NameError:
    UnicodeType = None


class Parser:
    """Analyze the tail of a chunk of Python source text.

    Typical use: construct with the indent and tab widths, hand over text
    with set_str(), optionally trim it with find_good_parse_start() and
    set_lo(), then query the continuation type, indentation and bracket
    information of the last statement.  The analysis is done lazily in two
    stages (_study1 and _study2) and cached until set_str() is called
    again.
    """

    def __init__(self, indentwidth, tabwidth):
        """Remember the width of one indent level and of a tab stop."""
        self.indentwidth = indentwidth
        self.tabwidth = tabwidth

    def set_str(self, str):
        """Install the text to analyze and forget any earlier analysis.

        str must be empty or end with a newline (checked by assertion).
        Every character outside chr(0)..chr(126) is replaced by "x", one
        for one, so indices into the stored text still line up with the
        caller's text.
        """
        assert len(str) == 0 or str[-1] == '\n'
        # The parse functions have no idea what to do with exotic
        # characters, so replace them all with "x".  This is "safe" so
        # long as the only characters germane to parsing the structure of
        # Python are 7-bit ASCII.  It's *necessary* because _study1
        # translates through a 256-entry table: a character the table
        # does not cover would reach its scanning loop unchanged and trip
        # an assertion there.
        str = _squash_exoticre("x", str)
        if type(str) is UnicodeType:
            # Python 2 unicode object: str.translate() with a string table
            # only works on byte strings, and everything left is ASCII.
            str = str.encode("ascii")
        self.str = str
        self.study_level = 0

    def find_good_parse_start(self, use_ps1, is_char_in_string=None,
                              _synchre=_synchre):
        """Return the index of a good place to begin parsing, or None.

        The index is as close to the end of the text as possible and is
        the start of a line holding some popular stmt like "if" or "def".
        Return None if none found: the caller should pass more prior
        context then, if possible, or if not (the entire program text up
        until the point of interest has already been tried) pass 0 to
        set_lo.

        use_ps1 -- true for a shell window: the position just after the
            last prompt is returned and the prompt's final character is
            replaced by a newline in self.str.  The prompt is sys.ps1, or
            ">>> " when the interpreter defines no sys.ps1.
        is_char_in_string -- callable taking an index into the text and
            returning true if that character may be inside a string.
            The result is reliable iff when it says "no", it's absolutely
            guaranteed that the char is not in a string.  Without it,
            None is returned for file windows.

        Ack, hack: in the shell window this kills us, because there's
        no way to tell the differences between output, >>> etc and
        user input.  Indeed, IDLE's first output line makes the rest
        look like it's in an unclosed paren!:
        Python 1.5.2 (#0, Apr 13 1999, ...
        """
        text, pos = self.str, None
        if use_ps1:
            # shell window
            ps1 = '\n' + getattr(sys, 'ps1', '>>> ')
            i = text.rfind(ps1)
            if i >= 0:
                pos = i + len(ps1)
                # make it look like there's a newline instead
                # of ps1 at the start -- hacking here once avoids
                # repeated hackery later
                self.str = text[:pos-1] + '\n' + text[pos:]
            return pos

        # File window -- real work.
        if not is_char_in_string:
            # no clue -- make the caller pass everything
            return None

        # Peek back from the end for a good place to start,
        # but don't try too often; pos will be left None, or
        # bumped to a legitimate synch point.
        limit = len(text)
        for _ in range(5):
            i = text.rfind(":\n", 0, limit)
            if i < 0:
                break
            i = text.rfind('\n', 0, i) + 1  # start of colon line
            m = _synchre(text, i, limit)
            if m and not is_char_in_string(m.start()):
                pos = m.start()
                break
            limit = i
        if pos is None:
            # Nothing looks like a block-opener, or stuff does
            # but is_char_in_string keeps returning true; most likely
            # we're in or near a giant string, the colorizer hasn't
            # caught up enough to be helpful, or there simply *aren't*
            # any interesting stmts.  In any of these cases we're
            # going to have to parse the whole thing to be sure, so
            # give it one last try from the start, but stop wasting
            # time here regardless of the outcome.
            m = _synchre(text)
            if m and not is_char_in_string(m.start()):
                pos = m.start()
            return pos

        # Peeking back worked; look forward until _synchre no longer
        # matches.
        i = pos + 1
        while 1:
            m = _synchre(text, i)
            if not m:
                break
            s, i = m.span()
            if not is_char_in_string(s):
                pos = s
        return pos

    def set_lo(self, lo):
        """Throw away the start of the text, keeping self.str[lo:].

        Intended to be called with find_good_parse_start's result.  lo
        must be 0 or the index just past a newline (checked by assertion).
        """
        assert lo == 0 or self.str[lo-1] == '\n'
        if lo > 0:
            self.str = self.str[lo:]

    def _study1(self):
        """Find the non-continuation lines and the continuation status.

        As quickly as humanly possible <wink>, find the line numbers
        (0-based) of the non-continuation lines.
        Creates self.{goodlines, continuation}.  Does nothing if this
        stage has already run since the last set_str().
        """
        if self.study_level >= 1:
            return
        self.study_level = 1

        # Map all uninteresting characters to "x", all open brackets
        # to "(", all close brackets to ")", then collapse runs of
        # uninteresting characters.  This can cut the number of chars
        # by a factor of 10-40, and so greatly speed the following loop.
        text = self.str
        text = text.translate(_tran)
        text = text.replace('xxxxxxxx', 'x')
        text = text.replace('xxxx', 'x')
        text = text.replace('xx', 'x')
        text = text.replace('xx', 'x')
        text = text.replace('\nx', '\n')
        # note that replacing x\n with \n would be incorrect, because
        # x may be preceded by a backslash

        # March over the squashed version of the program, accumulating
        # the line numbers of non-continued stmts, and determining
        # whether & why the last stmt is a continuation.
        continuation = C_NONE
        level = lno = 0     # level is nesting level; lno is line number
        self.goodlines = goodlines = [0]
        push_good = goodlines.append
        i, n = 0, len(text)
        while i < n:
            ch = text[i]
            i += 1

            # cases are checked in decreasing order of frequency
            if ch == 'x':
                continue

            if ch == '\n':
                lno += 1
                if level == 0:
                    push_good(lno)
                # else we're in an unclosed bracket structure
                continue

            if ch == '(':
                level += 1
                continue

            if ch == ')':
                if level:
                    level -= 1
                # else the program is invalid, but we can't complain
                continue

            if ch == '"' or ch == "'":
                # consume the string
                quote = ch
                if text[i-1:i+2] == quote * 3:
                    quote = quote * 3
                w = len(quote) - 1  # extra quote chars beyond the first
                i += w
                while i < n:
                    ch = text[i]
                    i += 1

                    if ch == 'x':
                        continue

                    if text[i-1:i+w] == quote:
                        i += w
                        break

                    if ch == '\n':
                        lno += 1
                        if w == 0:
                            # unterminated single-quoted string
                            if level == 0:
                                push_good(lno)
                            break
                        continue

                    if ch == '\\':
                        assert i < n
                        if text[i] == '\n':
                            lno += 1
                        i += 1
                        continue

                    # else comment char or paren inside string

                else:
                    # didn't break out of the loop, so we're still
                    # inside a string
                    continuation = C_STRING
                continue    # with outer loop

            if ch == '#':
                # consume the comment
                i = text.find('\n', i)
                assert i >= 0
                continue

            assert ch == '\\'
            assert i < n
            if text[i] == '\n':
                lno += 1
                if i+1 == n:
                    continuation = C_BACKSLASH
            i += 1

        # The last stmt may be continued for all 3 reasons.
        # String continuation takes precedence over bracket
        # continuation, which beats backslash continuation.
        if continuation != C_STRING and level > 0:
            continuation = C_BRACKET
        self.continuation = continuation

        # Push the final line number as a sentinel value, regardless of
        # whether it's continued.
        assert (continuation == C_NONE) == (goodlines[-1] == lno)
        if goodlines[-1] != lno:
            push_good(lno)

    def get_continuation_type(self):
        """Return C_NONE, C_BACKSLASH, C_STRING or C_BRACKET.

        The value tells whether, and why, the last stmt is continued.
        """
        self._study1()
        return self.continuation

    def _study2(self):
        """Look at every character of the last interesting statement.

        study1 was sufficient to determine the continuation status,
        but doing more requires looking at every character.  study2
        does this for the last interesting statement in the block.
        Creates:
            self.stmt_start, stmt_end
                slice indices of last interesting stmt
            self.lastch
                last non-whitespace character before optional trailing
                comment
            self.lastopenbracketpos
                index of last open bracket in that stmt, or None if no
                bracket is left open
        Does nothing if this stage has already run since the last
        set_str().
        """
        if self.study_level >= 2:
            return
        self._study1()
        self.study_level = 2

        # Set p and q to slice indices of last interesting stmt.
        text, goodlines = self.str, self.goodlines
        i = len(goodlines) - 1
        p = len(text)   # index of newest line
        while i:
            assert p
            # p is the index of the stmt at line number goodlines[i].
            # Move p back to the stmt at line number goodlines[i-1].
            q = p
            for _ in range(goodlines[i-1], goodlines[i]):
                # tricky: sets p to 0 if no preceding newline
                p = text.rfind('\n', 0, p-1) + 1
            # The stmt text[p:q] isn't a continuation, but may be blank
            # or a non-indenting comment line.
            if _junkre(text, p):
                i -= 1
            else:
                break
        if i == 0:
            # nothing but junk!
            assert p == 0
            q = p
        self.stmt_start, self.stmt_end = p, q

        # Analyze this stmt, to find the last open bracket (if any)
        # and last interesting character (if any).
        lastch = ""
        stack = []  # stack of open bracket indices
        push_stack = stack.append
        while p < q:
            # suck up all except ()[]{}'"#\\
            m = _chew_ordinaryre(text, p, q)
            if m:
                # we skipped at least one boring char
                newp = m.end()
                # back up over totally boring whitespace
                i = newp - 1    # index of last boring char
                while i >= p and text[i] in " \t\n":
                    i -= 1
                if i >= p:
                    lastch = text[i]
                p = newp
                if p >= q:
                    break

            ch = text[p]

            if ch in "([{":
                push_stack(p)
                lastch = ch
                p += 1
                continue

            if ch in ")]}":
                if stack:
                    del stack[-1]
                lastch = ch
                p += 1
                continue

            if ch == '"' or ch == "'":
                # consume string
                # Note that study1 did this with a Python loop, but
                # we use a regexp here; the reason is speed in both
                # cases; the string may be huge, but study1 pre-squashed
                # strings to a couple of characters per line.  study1
                # also needed to keep track of newlines, and we don't
                # have to.
                lastch = ch
                p = _match_stringre(text, p, q).end()
                continue

            if ch == '#':
                # consume comment and trailing newline
                p = text.find('\n', p, q) + 1
                assert p > 0
                continue

            assert ch == '\\'
            p += 1      # beyond backslash
            assert p < q
            if text[p] != '\n':
                # the program is invalid, but can't complain
                lastch = ch + text[p]
            p += 1      # beyond escaped char

        # end while p < q:

        self.lastch = lastch
        # Always assign, so that a value left over from a previously
        # studied text can never leak into this one.
        if stack:
            self.lastopenbracketpos = stack[-1]
        else:
            self.lastopenbracketpos = None

    def compute_bracket_indent(self):
        """Return the number of spaces the next line should be indented.

        Assumes continuation is C_BRACKET (checked by assertion).  The
        result lines up with the first item after the last open bracket;
        if nothing interesting follows the bracket, it is the bracket
        line's indentation plus one indent level.
        """
        self._study2()
        assert self.continuation == C_BRACKET
        j = self.lastopenbracketpos
        text = self.str
        n = len(text)
        origi = i = text.rfind('\n', 0, j) + 1
        j += 1      # one beyond open bracket
        # find first list item; set i to start of its line
        while j < n:
            m = _itemre(text, j)
            if m:
                j = m.end() - 1     # index of first interesting char
                extra = 0
                break
            # this line is junk; advance to next line
            i = j = text.find('\n', j) + 1
        else:
            # nothing interesting follows the bracket;
            # reproduce the bracket line's indentation + a level
            j = i = origi
            while text[j] in " \t":
                j += 1
            extra = self.indentwidth
        return len(text[i:j].expandtabs(self.tabwidth)) + extra

    def get_num_lines_in_stmt(self):
        """Return the number of physical lines in the last stmt.

        This is so whether or not it's an interesting stmt!  Intended to
        be called when continuation is C_BACKSLASH.
        """
        self._study1()
        goodlines = self.goodlines
        return goodlines[-1] - goodlines[-2]

    def compute_backslash_indent(self):
        """Return the number of spaces the next line should be indented.

        Assumes continuation is C_BACKSLASH (checked by assertion), and
        that the new line is the first one following the initial line of
        the stmt.  If the initial line holds an assignment whose "=" is
        followed by something other than the backslash, the result is one
        column beyond the "="; otherwise it is one column beyond the
        first chunk of non-whitespace characters.
        """
        self._study2()
        assert self.continuation == C_BACKSLASH
        text = self.str
        i = self.stmt_start
        while text[i] in " \t":
            i += 1
        startpos = i

        # See whether the initial line starts an assignment stmt; i.e.,
        # look for an = operator
        endpos = text.find('\n', startpos) + 1
        found = level = 0
        while i < endpos:
            ch = text[i]
            if ch in "([{":
                level += 1
                i += 1
            elif ch in ")]}":
                if level:
                    level -= 1
                i += 1
            elif ch == '"' or ch == "'":
                i = _match_stringre(text, i, endpos).end()
            elif ch == '#':
                break
            elif (level == 0 and ch == '=' and
                  (i == 0 or text[i-1] not in "=<>!") and
                  text[i+1] != '='):
                found = 1
                break
            else:
                i += 1

        if found:
            # found a legit =, but it may be the last interesting
            # thing on the line
            i += 1      # move beyond the =
            found = _lone_backslashre(text, i, endpos) is None

        if not found:
            # oh well ... settle for moving beyond the first chunk
            # of non-whitespace chars
            i = startpos
            while text[i] not in " \t\n":
                i += 1

        return len(text[self.stmt_start:i].expandtabs(self.tabwidth)) + 1

    def get_base_indent_string(self):
        """Return the leading whitespace on the initial line of the last
        interesting stmt.
        """
        self._study2()
        i, n = self.stmt_start, self.stmt_end
        j = i
        text = self.str
        while j < n and text[j] in " \t":
            j += 1
        return text[i:j]

    def is_block_opener(self):
        """Did the last interesting stmt open a block?"""
        self._study2()
        return self.lastch == ':'

    def is_block_closer(self):
        """Did the last interesting stmt close a block?"""
        self._study2()
        return _closere(self.str, self.stmt_start) is not None

    # index of last open bracket ({[, or None if none
    lastopenbracketpos = None

    def get_last_open_bracket_pos(self):
        """Return the index of the last open bracket in the last
        interesting stmt, or None if there is none.
        """
        self._study2()
        return self.lastopenbracketpos
# Generated by PyXR 0.9.4