"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


try:
    import markupbase
except ImportError:
    # Newer Python versions ship the same base class as _markupbase.
    import _markupbase as markupbase
import re

# Regular expressions used for parsing

# What goahead() scans for: in normal mode any '&' or '<'; in CDATA mode
# only something that can start an end tag (or '<' at the very end).
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume ONE terminating character (';' or
# anything else that cannot be part of the reference); goahead() gives the
# character back when it is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string: '\s' in a plain string literal is an invalid escape sequence.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error message.
        lineno: line number of the error, or None if unknown.
        offset: column offset of the error, or None if unknown.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message, followed by line and column when known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # The column is reported as offset + 1.
            result = result + ", column %d" % (self.offset + 1)
        return result


class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # After a start tag for one of these elements the parser switches to
    # CDATA mode (see parse_starttag / set_cdata_mode).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Make goahead() scan only for text that can start an end tag."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Make goahead() scan for '&' and '<' again (the normal mode)."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                # Everything up to the next interesting character is data.
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # A '<' that starts no known construct is plain data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer: wait for more.
                    break
                if k < 0:
                    # The construct is not complete yet.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the terminating character.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # The terminator was not ';': give it back.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # The terminator was not ';': give it back.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            # At EOF, whatever is still unparsed is delivered as data.
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        # The handler gets the text between '<?' and the closing '>'.
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without '=value'.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the quotes, then decode the entity references.
                # Unquoted values are passed through unchanged.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        tail = rawdata[k:endpos].strip()
        if tail not in (">", "/>"):
            # error() raises HTMLParseError at the current position.
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if tail.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            # error() raises, so match is never None below.
            self.error("bad end tag: %r" % (rawdata[i:j],))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        # Any end tag switches the parser back to normal mode.
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration by raising HTMLParseError."""
        self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return *s* with &lt; &gt; &apos; &quot; and &amp; decoded.

        Any other entity or character reference is left untouched.
        """
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        # Must be last, so that e.g. "&amp;lt;" decodes to "&lt;" and
        # not to "<".
        s = s.replace("&amp;", "&")
        return s
# Generated by PyXR 0.9.4