PyXR

c:\python24\lib \ HTMLParser.py



0001 """A parser for HTML and XHTML."""
0002 
0003 # This file is based on sgmllib.py, but the API is slightly different.
0004 
0005 # XXX There should be a way to distinguish between PCDATA (parsed
0006 # character data -- the normal case), RCDATA (replaceable character
0007 # data -- only char and entity references and end tags are special)
0008 # and CDATA (character data -- only end tags are special).
0009 
0010 
0011 import markupbase
0012 import re
0013 
0014 # Regular expressions used for parsing
0015 
# Where ordinary text stops: the start of markup or of a reference.
interesting_normal = re.compile('[&<]')
# Inside CDATA content only "</" (or a "<" at the very end of the
# buffer, which may still become "</") is interesting.
interesting_cdata = re.compile(r'<(/|\Z)')
# "&" followed by something that could start an entity or char reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the single character that
# terminates the reference (";" or anything else that cannot be part of
# it); the caller gives that character back when it is not ";".
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# One attribute: group 1 is the name, group 2 the optional "= value"
# part, group 3 the value including any surrounding quotes.  The bare
# (unquoted) value accepts exactly the characters that locatestarttagend
# accepts, so every tag that passes the whole-tag check can also be
# split into its attributes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\'">\s]*))?')

# Matches a start tag up to, but not including, its closing ">" or "/>".
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string: "\s" must reach the regex engine, not the string parser.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
0047 
0048 
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- description of the problem (must be non-empty)
        lineno -- line number of the error, or None when unknown
        offset -- zero-based column of the error, or None when unknown
    """

    def __init__(self, msg, position=(None, None)):
        """Create the error from a message and a (lineno, offset) pair."""
        assert msg
        # Hand the message to the base class so that ``args`` is
        # populated; without it repr() shows no message and copying or
        # pickling the exception cannot rebuild it.
        Exception.__init__(self, msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by the position, when known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # The stored offset is zero-based; report a one-based column.
            result = result + ", column %d" % (self.offset + 1)
        return result
0065 
0066 
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.

    After the start tag of an element listed in CDATA_CONTENT_ELEMENTS
    the parser is in CDATA mode: everything up to the matching end tag
    is passed to self.handle_data() and no other markup is recognized.
    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")

    # Lower-cased name of the CDATA element currently open, or None.
    # The class-level default keeps the attribute available even when a
    # subclass overrides reset() without calling this class's version.
    cdata_elem = None

    # Matches when everything from "&#" to the end of the buffer could
    # still grow into a numeric character reference.
    _charref_prefix = re.compile(r'&#(?:[0-9]*|[xX][0-9a-fA-F]*)\Z')

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for message at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem=None):
        """Enter CDATA mode.

        elem is the lower-cased name of the element whose end tag
        leaves the mode again.  Without it (the historical call form)
        any end tag leaves the mode.
        """
        self.interesting = interesting_cdata
        self.cdata_elem = elem

    def clear_cdata_mode(self):
        """Leave CDATA mode and go back to normal markup scanning."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            # self.interesting is re-read on every pass because a start
            # tag handled below may switch the parser into CDATA mode.
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # a "<" that starts nothing we know is plain text
                    self.handle_data("<")
                    k = i + 1
                else:
                    # lone "<" at the end of the buffer: wait for more
                    break
                if k < 0:
                    # construct is not complete yet
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the pattern swallowed the terminating character;
                    # give it back unless it was the ";"
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                if self._charref_prefix.match(rawdata, i):
                    # more input may still complete the reference
                    break
                # This can never become a character reference; treat
                # the "&#" as text instead of stalling on it.
                self.handle_data("&#")
                i = self.updatepos(i, i + 2)
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            # at EOF whatever is left over is passed on as text
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute given without "= value"
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                # references are replaced in quoted and bare values alike
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if j + 1 == len(rawdata):
                    # buffer boundary: the ">" may arrive with the next
                    # chunk of data
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        j = match.end()
        if self.cdata_elem is not None:
            # In CDATA content only the end tag of the open element is
            # markup; any other "</" (for instance inside a script
            # string) is ordinary text.
            name = tagfind.match(rawdata[i+2:j].lstrip())
            if not name or name.group().lower() != self.cdata_elem:
                self.handle_data("</")
                return i + 2
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            self.error("bad end tag: %r" % (rawdata[i:j],))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Report a declaration the parser does not know as an error."""
        self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        # Only the five predefined entities are replaced; anything else
        # is returned unchanged.
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&") # Must be last
        return s
0370 

Generated by PyXR 0.9.4
SourceForge.net Logo