0001 """Tokenization help for Python programs. 0002 0003 generate_tokens(readline) is a generator that breaks a stream of 0004 text into Python tokens. It accepts a readline-like method which is called 0005 repeatedly to get the next line of input (or "" for EOF). It generates 0006 5-tuples with these members: 0007 0008 the token type (see token.py) 0009 the token (a string) 0010 the starting (row, column) indices of the token (a 2-tuple of ints) 0011 the ending (row, column) indices of the token (a 2-tuple of ints) 0012 the original line (string) 0013 0014 It is designed to match the working of the Python tokenizer exactly, except 0015 that it produces COMMENT tokens for comments and gives type OP for all 0016 operators 0017 0018 Older entry points 0019 tokenize_loop(readline, tokeneater) 0020 tokenize(readline, tokeneater=printtoken) 0021 are the same, except instead of generating tokens, tokeneater is a callback 0022 function to which the 5 fields described above are passed as 5 arguments, 0023 each time a new token is found.""" 0024 0025 __author__ = 'Ka-Ping Yee <ping@lfw.org>' 0026 __credits__ = \ 0027 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' 0028 0029 import string, re 0030 from token import * 0031 0032 import token 0033 __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", 0034 "generate_tokens", "NL"] 0035 del x 0036 del token 0037 0038 COMMENT = N_TOKENS 0039 tok_name[COMMENT] = 'COMMENT' 0040 NL = N_TOKENS + 1 0041 tok_name[NL] = 'NL' 0042 N_TOKENS += 2 0043 0044 def group(*choices): return '(' + '|'.join(choices) + ')' 0045 def any(*choices): return group(*choices) + '*' 0046 def maybe(*choices): return group(*choices) + '?' 0047 0048 Whitespace = r'[ \f\t]*' 0049 Comment = r'#[^\r\n]*' 0050 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 0051 Name = r'[a-zA-Z_]\w*' 0052 0053 Hexnumber = r'0[xX][\da-fA-F]*[lL]?' 0054 Octnumber = r'0[0-7]*[lL]?' 0055 Decnumber = r'[1-9]\d*[lL]?' 0056 Intnumber = group(Hexnumber, Octnumber, Decnumber) 0057 Exponent = r'[eE][-+]?\d+' 0058 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) 0059 Expfloat = r'\d+' + Exponent 0060 Floatnumber = group(Pointfloat, Expfloat) 0061 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') 0062 Number = group(Imagnumber, Floatnumber, Intnumber) 0063 0064 # Tail end of ' string. 0065 Single = r"[^'\\]*(?:\\.[^'\\]*)*'" 0066 # Tail end of " string. 0067 Double = r'[^"\\]*(?:\\.[^"\\]*)*"' 0068 # Tail end of ''' string. 0069 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" 0070 # Tail end of """ string. 0071 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' 0072 Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""') 0073 # Single-line ' or " string. 0074 String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", 0075 r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') 0076 0077 # Because of leftmost-then-longest match semantics, be sure to put the 0078 # longest operators first (e.g., if = came before ==, == would get 0079 # recognized as two instances of =). 0080 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", 0081 r"//=?", 0082 r"[+\-*/%&|^=<>]=?", 0083 r"~") 0084 0085 Bracket = '[][(){}]' 0086 Special = group(r'\r?\n', r'[:;.,`@]') 0087 Funny = group(Operator, Bracket, Special) 0088 0089 PlainToken = group(Number, Funny, String, Name) 0090 Token = Ignore + PlainToken 0091 0092 # First (or only) line of ' or " string. 0093 ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + 0094 group("'", r'\\\r?\n'), 0095 r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + 0096 group('"', r'\\\r?\n')) 0097 PseudoExtras = group(r'\\\r?\n', Comment, Triple) 0098 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) 0099 0100 tokenprog, pseudoprog, single3prog, double3prog = map( 0101 re.compile, (Token, PseudoToken, Single3, Double3)) 0102 endprogs = {"'": re.compile(Single), '"': re.compile(Double), 0103 "'''": single3prog, '"""': double3prog, 0104 "r'''": single3prog, 'r"""': double3prog, 0105 "u'''": single3prog, 'u"""': double3prog, 0106 "ur'''": single3prog, 'ur"""': double3prog, 0107 "R'''": single3prog, 'R"""': double3prog, 0108 "U'''": single3prog, 'U"""': double3prog, 0109 "uR'''": single3prog, 'uR"""': double3prog, 0110 "Ur'''": single3prog, 'Ur"""': double3prog, 0111 "UR'''": single3prog, 'UR"""': double3prog, 0112 'r': None, 'R': None, 'u': None, 'U': None} 0113 0114 triple_quoted = {} 0115 for t in ("'''", '"""', 0116 "r'''", 'r"""', "R'''", 'R"""', 0117 "u'''", 'u"""', "U'''", 'U"""', 0118 "ur'''", 'ur"""', "Ur'''", 'Ur"""', 0119 "uR'''", 'uR"""', "UR'''", 'UR"""'): 0120 triple_quoted[t] = t 0121 single_quoted = {} 0122 for t in ("'", '"', 0123 "r'", 'r"', "R'", 'R"', 0124 "u'", 'u"', "U'", 'U"', 0125 "ur'", 'ur"', "Ur'", 'Ur"', 0126 "uR'", 'uR"', "UR'", 'UR"' ): 0127 single_quoted[t] = t 0128 0129 tabsize = 8 0130 0131 class TokenError(Exception): pass 0132 0133 class StopTokenizing(Exception): pass 0134 0135 def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing 0136 print "%d,%d-%d,%d:\t%s\t%s" % \ 0137 (srow, scol, erow, ecol, tok_name[type], repr(token)) 0138 0139 def tokenize(readline, tokeneater=printtoken): 0140 """ 0141 The tokenize() function accepts two parameters: one representing the 0142 input stream, and one providing an output mechanism for tokenize(). 0143 0144 The first parameter, readline, must be a callable object which provides 0145 the same interface as the readline() method of built-in file objects. 0146 Each call to the function should return one line of input as a string. 0147 0148 The second parameter, tokeneater, must also be a callable object. It is 0149 called once for each token, with five arguments, corresponding to the 0150 tuples generated by generate_tokens(). 0151 """ 0152 try: 0153 tokenize_loop(readline, tokeneater) 0154 except StopTokenizing: 0155 pass 0156 0157 # backwards compatible interface 0158 def tokenize_loop(readline, tokeneater): 0159 for token_info in generate_tokens(readline): 0160 tokeneater(*token_info) 0161 0162 def generate_tokens(readline): 0163 """ 0164 The generate_tokens() generator requires one argment, readline, which 0165 must be a callable object which provides the same interface as the 0166 readline() method of built-in file objects. Each call to the function 0167 should return one line of input as a string. 0168 0169 The generator produces 5-tuples with these members: the token type; the 0170 token string; a 2-tuple (srow, scol) of ints specifying the row and 0171 column where the token begins in the source; a 2-tuple (erow, ecol) of 0172 ints specifying the row and column where the token ends in the source; 0173 and the line on which the token was found. The line passed is the 0174 logical line; continuation lines are included. 0175 """ 0176 lnum = parenlev = continued = 0 0177 namechars, numchars = string.ascii_letters + '_', '0123456789' 0178 contstr, needcont = '', 0 0179 contline = None 0180 indents = [0] 0181 0182 while 1: # loop over lines in stream 0183 line = readline() 0184 lnum = lnum + 1 0185 pos, max = 0, len(line) 0186 0187 if contstr: # continued string 0188 if not line: 0189 raise TokenError, ("EOF in multi-line string", strstart) 0190 endmatch = endprog.match(line) 0191 if endmatch: 0192 pos = end = endmatch.end(0) 0193 yield (STRING, contstr + line[:end], 0194 strstart, (lnum, end), contline + line) 0195 contstr, needcont = '', 0 0196 contline = None 0197 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': 0198 yield (ERRORTOKEN, contstr + line, 0199 strstart, (lnum, len(line)), contline) 0200 contstr = '' 0201 contline = None 0202 continue 0203 else: 0204 contstr = contstr + line 0205 contline = contline + line 0206 continue 0207 0208 elif parenlev == 0 and not continued: # new statement 0209 if not line: break 0210 column = 0 0211 while pos < max: # measure leading whitespace 0212 if line[pos] == ' ': column = column + 1 0213 elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize 0214 elif line[pos] == '\f': column = 0 0215 else: break 0216 pos = pos + 1 0217 if pos == max: break 0218 0219 if line[pos] in '#\r\n': # skip comments or blank lines 0220 yield ((NL, COMMENT)[line[pos] == '#'], line[pos:], 0221 (lnum, pos), (lnum, len(line)), line) 0222 continue 0223 0224 if column > indents[-1]: # count indents or dedents 0225 indents.append(column) 0226 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) 0227 while column < indents[-1]: 0228 indents = indents[:-1] 0229 yield (DEDENT, '', (lnum, pos), (lnum, pos), line) 0230 0231 else: # continued statement 0232 if not line: 0233 raise TokenError, ("EOF in multi-line statement", (lnum, 0)) 0234 continued = 0 0235 0236 while pos < max: 0237 pseudomatch = pseudoprog.match(line, pos) 0238 if pseudomatch: # scan for tokens 0239 start, end = pseudomatch.span(1) 0240 spos, epos, pos = (lnum, start), (lnum, end), end 0241 token, initial = line[start:end], line[start] 0242 0243 if initial in numchars or \ 0244 (initial == '.' and token != '.'): # ordinary number 0245 yield (NUMBER, token, spos, epos, line) 0246 elif initial in '\r\n': 0247 yield (parenlev > 0 and NL or NEWLINE, 0248 token, spos, epos, line) 0249 elif initial == '#': 0250 yield (COMMENT, token, spos, epos, line) 0251 elif token in triple_quoted: 0252 endprog = endprogs[token] 0253 endmatch = endprog.match(line, pos) 0254 if endmatch: # all on one line 0255 pos = endmatch.end(0) 0256 token = line[start:pos] 0257 yield (STRING, token, spos, (lnum, pos), line) 0258 else: 0259 strstart = (lnum, start) # multiple lines 0260 contstr = line[start:] 0261 contline = line 0262 break 0263 elif initial in single_quoted or \ 0264 token[:2] in single_quoted or \ 0265 token[:3] in single_quoted: 0266 if token[-1] == '\n': # continued string 0267 strstart = (lnum, start) 0268 endprog = (endprogs[initial] or endprogs[token[1]] or 0269 endprogs[token[2]]) 0270 contstr, needcont = line[start:], 1 0271 contline = line 0272 break 0273 else: # ordinary string 0274 yield (STRING, token, spos, epos, line) 0275 elif initial in namechars: # ordinary name 0276 yield (NAME, token, spos, epos, line) 0277 elif initial == '\\': # continued stmt 0278 continued = 1 0279 else: 0280 if initial in '([{': parenlev = parenlev + 1 0281 elif initial in ')]}': parenlev = parenlev - 1 0282 yield (OP, token, spos, epos, line) 0283 else: 0284 yield (ERRORTOKEN, line[pos], 0285 (lnum, pos), (lnum, pos+1), line) 0286 pos = pos + 1 0287 0288 for indent in indents[1:]: # pop remaining indent levels 0289 yield (DEDENT, '', (lnum, 0), (lnum, 0), '') 0290 yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') 0291 0292 if __name__ == '__main__': # testing 0293 import sys 0294 if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) 0295 else: tokenize(sys.stdin.readline) 0296
Generated by PyXR 0.9.4