PyXR

c:\python24\lib \ tokenize.py


0001 """Tokenization help for Python programs.
0002 
0003 generate_tokens(readline) is a generator that breaks a stream of
0004 text into Python tokens.  It accepts a readline-like method which is called
0005 repeatedly to get the next line of input (or "" for EOF).  It generates
0006 5-tuples with these members:
0007 
0008     the token type (see token.py)
0009     the token (a string)
0010     the starting (row, column) indices of the token (a 2-tuple of ints)
0011     the ending (row, column) indices of the token (a 2-tuple of ints)
0012     the original line (string)
0013 
0014 It is designed to match the working of the Python tokenizer exactly, except
0015 that it produces COMMENT tokens for comments and gives type OP for all
0016 operators
0017 
0018 Older entry points
0019     tokenize_loop(readline, tokeneater)
0020     tokenize(readline, tokeneater=printtoken)
0021 are the same, except instead of generating tokens, tokeneater is a callback
0022 function to which the 5 fields described above are passed as 5 arguments,
0023 each time a new token is found."""
0024 
0025 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
0026 __credits__ = \
0027     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
0028 
0029 import string, re
0030 from token import *
0031 
0032 import token
0033 __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
0034            "generate_tokens", "NL"]
0035 del x
0036 del token
0037 
0038 COMMENT = N_TOKENS
0039 tok_name[COMMENT] = 'COMMENT'
0040 NL = N_TOKENS + 1
0041 tok_name[NL] = 'NL'
0042 N_TOKENS += 2
0043 
0044 def group(*choices): return '(' + '|'.join(choices) + ')'
0045 def any(*choices): return group(*choices) + '*'
0046 def maybe(*choices): return group(*choices) + '?'
0047 
0048 Whitespace = r'[ \f\t]*'
0049 Comment = r'#[^\r\n]*'
0050 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
0051 Name = r'[a-zA-Z_]\w*'
0052 
0053 Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
0054 Octnumber = r'0[0-7]*[lL]?'
0055 Decnumber = r'[1-9]\d*[lL]?'
0056 Intnumber = group(Hexnumber, Octnumber, Decnumber)
0057 Exponent = r'[eE][-+]?\d+'
0058 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
0059 Expfloat = r'\d+' + Exponent
0060 Floatnumber = group(Pointfloat, Expfloat)
0061 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
0062 Number = group(Imagnumber, Floatnumber, Intnumber)
0063 
0064 # Tail end of ' string.
0065 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
0066 # Tail end of " string.
0067 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
0068 # Tail end of ''' string.
0069 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
0070 # Tail end of """ string.
0071 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
0072 Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
0073 # Single-line ' or " string.
0074 String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
0075                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
0076 
0077 # Because of leftmost-then-longest match semantics, be sure to put the
0078 # longest operators first (e.g., if = came before ==, == would get
0079 # recognized as two instances of =).
0080 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
0081                  r"//=?",
0082                  r"[+\-*/%&|^=<>]=?",
0083                  r"~")
0084 
0085 Bracket = '[][(){}]'
0086 Special = group(r'\r?\n', r'[:;.,`@]')
0087 Funny = group(Operator, Bracket, Special)
0088 
0089 PlainToken = group(Number, Funny, String, Name)
0090 Token = Ignore + PlainToken
0091 
0092 # First (or only) line of ' or " string.
0093 ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
0094                 group("'", r'\\\r?\n'),
0095                 r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
0096                 group('"', r'\\\r?\n'))
0097 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
0098 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
0099 
0100 tokenprog, pseudoprog, single3prog, double3prog = map(
0101     re.compile, (Token, PseudoToken, Single3, Double3))
0102 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
0103             "'''": single3prog, '"""': double3prog,
0104             "r'''": single3prog, 'r"""': double3prog,
0105             "u'''": single3prog, 'u"""': double3prog,
0106             "ur'''": single3prog, 'ur"""': double3prog,
0107             "R'''": single3prog, 'R"""': double3prog,
0108             "U'''": single3prog, 'U"""': double3prog,
0109             "uR'''": single3prog, 'uR"""': double3prog,
0110             "Ur'''": single3prog, 'Ur"""': double3prog,
0111             "UR'''": single3prog, 'UR"""': double3prog,
0112             'r': None, 'R': None, 'u': None, 'U': None}
0113 
0114 triple_quoted = {}
0115 for t in ("'''", '"""',
0116           "r'''", 'r"""', "R'''", 'R"""',
0117           "u'''", 'u"""', "U'''", 'U"""',
0118           "ur'''", 'ur"""', "Ur'''", 'Ur"""',
0119           "uR'''", 'uR"""', "UR'''", 'UR"""'):
0120     triple_quoted[t] = t
0121 single_quoted = {}
0122 for t in ("'", '"',
0123           "r'", 'r"', "R'", 'R"',
0124           "u'", 'u"', "U'", 'U"',
0125           "ur'", 'ur"', "Ur'", 'Ur"',
0126           "uR'", 'uR"', "UR'", 'UR"' ):
0127     single_quoted[t] = t
0128 
0129 tabsize = 8
0130 
0131 class TokenError(Exception): pass
0132 
0133 class StopTokenizing(Exception): pass
0134 
0135 def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
0136     print "%d,%d-%d,%d:\t%s\t%s" % \
0137         (srow, scol, erow, ecol, tok_name[type], repr(token))
0138 
0139 def tokenize(readline, tokeneater=printtoken):
0140     """
0141     The tokenize() function accepts two parameters: one representing the
0142     input stream, and one providing an output mechanism for tokenize().
0143 
0144     The first parameter, readline, must be a callable object which provides
0145     the same interface as the readline() method of built-in file objects.
0146     Each call to the function should return one line of input as a string.
0147 
0148     The second parameter, tokeneater, must also be a callable object. It is
0149     called once for each token, with five arguments, corresponding to the
0150     tuples generated by generate_tokens().
0151     """
0152     try:
0153         tokenize_loop(readline, tokeneater)
0154     except StopTokenizing:
0155         pass
0156 
0157 # backwards compatible interface
0158 def tokenize_loop(readline, tokeneater):
0159     for token_info in generate_tokens(readline):
0160         tokeneater(*token_info)
0161 
0162 def generate_tokens(readline):
0163     """
0164     The generate_tokens() generator requires one argment, readline, which
0165     must be a callable object which provides the same interface as the
0166     readline() method of built-in file objects. Each call to the function
0167     should return one line of input as a string.
0168 
0169     The generator produces 5-tuples with these members: the token type; the
0170     token string; a 2-tuple (srow, scol) of ints specifying the row and
0171     column where the token begins in the source; a 2-tuple (erow, ecol) of
0172     ints specifying the row and column where the token ends in the source;
0173     and the line on which the token was found. The line passed is the
0174     logical line; continuation lines are included.
0175     """
0176     lnum = parenlev = continued = 0
0177     namechars, numchars = string.ascii_letters + '_', '0123456789'
0178     contstr, needcont = '', 0
0179     contline = None
0180     indents = [0]
0181 
0182     while 1:                                   # loop over lines in stream
0183         line = readline()
0184         lnum = lnum + 1
0185         pos, max = 0, len(line)
0186 
0187         if contstr:                            # continued string
0188             if not line:
0189                 raise TokenError, ("EOF in multi-line string", strstart)
0190             endmatch = endprog.match(line)
0191             if endmatch:
0192                 pos = end = endmatch.end(0)
0193                 yield (STRING, contstr + line[:end],
0194                            strstart, (lnum, end), contline + line)
0195                 contstr, needcont = '', 0
0196                 contline = None
0197             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
0198                 yield (ERRORTOKEN, contstr + line,
0199                            strstart, (lnum, len(line)), contline)
0200                 contstr = ''
0201                 contline = None
0202                 continue
0203             else:
0204                 contstr = contstr + line
0205                 contline = contline + line
0206                 continue
0207 
0208         elif parenlev == 0 and not continued:  # new statement
0209             if not line: break
0210             column = 0
0211             while pos < max:                   # measure leading whitespace
0212                 if line[pos] == ' ': column = column + 1
0213                 elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
0214                 elif line[pos] == '\f': column = 0
0215                 else: break
0216                 pos = pos + 1
0217             if pos == max: break
0218 
0219             if line[pos] in '#\r\n':           # skip comments or blank lines
0220                 yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
0221                            (lnum, pos), (lnum, len(line)), line)
0222                 continue
0223 
0224             if column > indents[-1]:           # count indents or dedents
0225                 indents.append(column)
0226                 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
0227             while column < indents[-1]:
0228                 indents = indents[:-1]
0229                 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
0230 
0231         else:                                  # continued statement
0232             if not line:
0233                 raise TokenError, ("EOF in multi-line statement", (lnum, 0))
0234             continued = 0
0235 
0236         while pos < max:
0237             pseudomatch = pseudoprog.match(line, pos)
0238             if pseudomatch:                                # scan for tokens
0239                 start, end = pseudomatch.span(1)
0240                 spos, epos, pos = (lnum, start), (lnum, end), end
0241                 token, initial = line[start:end], line[start]
0242 
0243                 if initial in numchars or \
0244                    (initial == '.' and token != '.'):      # ordinary number
0245                     yield (NUMBER, token, spos, epos, line)
0246                 elif initial in '\r\n':
0247                     yield (parenlev > 0 and NL or NEWLINE,
0248                                token, spos, epos, line)
0249                 elif initial == '#':
0250                     yield (COMMENT, token, spos, epos, line)
0251                 elif token in triple_quoted:
0252                     endprog = endprogs[token]
0253                     endmatch = endprog.match(line, pos)
0254                     if endmatch:                           # all on one line
0255                         pos = endmatch.end(0)
0256                         token = line[start:pos]
0257                         yield (STRING, token, spos, (lnum, pos), line)
0258                     else:
0259                         strstart = (lnum, start)           # multiple lines
0260                         contstr = line[start:]
0261                         contline = line
0262                         break
0263                 elif initial in single_quoted or \
0264                     token[:2] in single_quoted or \
0265                     token[:3] in single_quoted:
0266                     if token[-1] == '\n':                  # continued string
0267                         strstart = (lnum, start)
0268                         endprog = (endprogs[initial] or endprogs[token[1]] or
0269                                    endprogs[token[2]])
0270                         contstr, needcont = line[start:], 1
0271                         contline = line
0272                         break
0273                     else:                                  # ordinary string
0274                         yield (STRING, token, spos, epos, line)
0275                 elif initial in namechars:                 # ordinary name
0276                     yield (NAME, token, spos, epos, line)
0277                 elif initial == '\\':                      # continued stmt
0278                     continued = 1
0279                 else:
0280                     if initial in '([{': parenlev = parenlev + 1
0281                     elif initial in ')]}': parenlev = parenlev - 1
0282                     yield (OP, token, spos, epos, line)
0283             else:
0284                 yield (ERRORTOKEN, line[pos],
0285                            (lnum, pos), (lnum, pos+1), line)
0286                 pos = pos + 1
0287 
0288     for indent in indents[1:]:                 # pop remaining indent levels
0289         yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
0290     yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
0291 
0292 if __name__ == '__main__':                     # testing
0293     import sys
0294     if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
0295     else: tokenize(sys.stdin.readline)
0296
Generated by PyXR 0.9.4