0001 """HTML 2.0 parser. 0002 0003 See the HTML 2.0 specification: 0004 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html 0005 """ 0006 0007 import sgmllib 0008 0009 from formatter import AS_IS 0010 0011 __all__ = ["HTMLParser", "HTMLParseError"] 0012 0013 0014 class HTMLParseError(sgmllib.SGMLParseError): 0015 """Error raised when an HTML document can't be parsed.""" 0016 0017 0018 class HTMLParser(sgmllib.SGMLParser): 0019 """This is the basic HTML parser class. 0020 0021 It supports all entity names required by the XHTML 1.0 Recommendation. 0022 It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2 0023 elements. 0024 0025 """ 0026 0027 from htmlentitydefs import entitydefs 0028 0029 def __init__(self, formatter, verbose=0): 0030 """Creates an instance of the HTMLParser class. 0031 0032 The formatter parameter is the formatter instance associated with 0033 the parser. 0034 0035 """ 0036 sgmllib.SGMLParser.__init__(self, verbose) 0037 self.formatter = formatter 0038 0039 def error(self, message): 0040 raise HTMLParseError(message) 0041 0042 def reset(self): 0043 sgmllib.SGMLParser.reset(self) 0044 self.savedata = None 0045 self.isindex = 0 0046 self.title = None 0047 self.base = None 0048 self.anchor = None 0049 self.anchorlist = [] 0050 self.nofill = 0 0051 self.list_stack = [] 0052 0053 # ------ Methods used internally; some may be overridden 0054 0055 # --- Formatter interface, taking care of 'savedata' mode; 0056 # shouldn't need to be overridden 0057 0058 def handle_data(self, data): 0059 if self.savedata is not None: 0060 self.savedata = self.savedata + data 0061 else: 0062 if self.nofill: 0063 self.formatter.add_literal_data(data) 0064 else: 0065 self.formatter.add_flowing_data(data) 0066 0067 # --- Hooks to save data; shouldn't need to be overridden 0068 0069 def save_bgn(self): 0070 """Begins saving character data in a buffer instead of sending it 0071 to the formatter object. 
0072 0073 Retrieve the stored data via the save_end() method. Use of the 0074 save_bgn() / save_end() pair may not be nested. 0075 0076 """ 0077 self.savedata = '' 0078 0079 def save_end(self): 0080 """Ends buffering character data and returns all data saved since 0081 the preceding call to the save_bgn() method. 0082 0083 If the nofill flag is false, whitespace is collapsed to single 0084 spaces. A call to this method without a preceding call to the 0085 save_bgn() method will raise a TypeError exception. 0086 0087 """ 0088 data = self.savedata 0089 self.savedata = None 0090 if not self.nofill: 0091 data = ' '.join(data.split()) 0092 return data 0093 0094 # --- Hooks for anchors; should probably be overridden 0095 0096 def anchor_bgn(self, href, name, type): 0097 """This method is called at the start of an anchor region. 0098 0099 The arguments correspond to the attributes of the <A> tag with 0100 the same names. The default implementation maintains a list of 0101 hyperlinks (defined by the HREF attribute for <A> tags) within 0102 the document. The list of hyperlinks is available as the data 0103 attribute anchorlist. 0104 0105 """ 0106 self.anchor = href 0107 if self.anchor: 0108 self.anchorlist.append(href) 0109 0110 def anchor_end(self): 0111 """This method is called at the end of an anchor region. 0112 0113 The default implementation adds a textual footnote marker using an 0114 index into the list of hyperlinks created by the anchor_bgn()method. 0115 0116 """ 0117 if self.anchor: 0118 self.handle_data("[%d]" % len(self.anchorlist)) 0119 self.anchor = None 0120 0121 # --- Hook for images; should probably be overridden 0122 0123 def handle_image(self, src, alt, *args): 0124 """This method is called to handle images. 0125 0126 The default implementation simply passes the alt value to the 0127 handle_data() method. 
0128 0129 """ 0130 self.handle_data(alt) 0131 0132 # --------- Top level elememts 0133 0134 def start_html(self, attrs): pass 0135 def end_html(self): pass 0136 0137 def start_head(self, attrs): pass 0138 def end_head(self): pass 0139 0140 def start_body(self, attrs): pass 0141 def end_body(self): pass 0142 0143 # ------ Head elements 0144 0145 def start_title(self, attrs): 0146 self.save_bgn() 0147 0148 def end_title(self): 0149 self.title = self.save_end() 0150 0151 def do_base(self, attrs): 0152 for a, v in attrs: 0153 if a == 'href': 0154 self.base = v 0155 0156 def do_isindex(self, attrs): 0157 self.isindex = 1 0158 0159 def do_link(self, attrs): 0160 pass 0161 0162 def do_meta(self, attrs): 0163 pass 0164 0165 def do_nextid(self, attrs): # Deprecated 0166 pass 0167 0168 # ------ Body elements 0169 0170 # --- Headings 0171 0172 def start_h1(self, attrs): 0173 self.formatter.end_paragraph(1) 0174 self.formatter.push_font(('h1', 0, 1, 0)) 0175 0176 def end_h1(self): 0177 self.formatter.end_paragraph(1) 0178 self.formatter.pop_font() 0179 0180 def start_h2(self, attrs): 0181 self.formatter.end_paragraph(1) 0182 self.formatter.push_font(('h2', 0, 1, 0)) 0183 0184 def end_h2(self): 0185 self.formatter.end_paragraph(1) 0186 self.formatter.pop_font() 0187 0188 def start_h3(self, attrs): 0189 self.formatter.end_paragraph(1) 0190 self.formatter.push_font(('h3', 0, 1, 0)) 0191 0192 def end_h3(self): 0193 self.formatter.end_paragraph(1) 0194 self.formatter.pop_font() 0195 0196 def start_h4(self, attrs): 0197 self.formatter.end_paragraph(1) 0198 self.formatter.push_font(('h4', 0, 1, 0)) 0199 0200 def end_h4(self): 0201 self.formatter.end_paragraph(1) 0202 self.formatter.pop_font() 0203 0204 def start_h5(self, attrs): 0205 self.formatter.end_paragraph(1) 0206 self.formatter.push_font(('h5', 0, 1, 0)) 0207 0208 def end_h5(self): 0209 self.formatter.end_paragraph(1) 0210 self.formatter.pop_font() 0211 0212 def start_h6(self, attrs): 0213 self.formatter.end_paragraph(1) 0214 
self.formatter.push_font(('h6', 0, 1, 0)) 0215 0216 def end_h6(self): 0217 self.formatter.end_paragraph(1) 0218 self.formatter.pop_font() 0219 0220 # --- Block Structuring Elements 0221 0222 def do_p(self, attrs): 0223 self.formatter.end_paragraph(1) 0224 0225 def start_pre(self, attrs): 0226 self.formatter.end_paragraph(1) 0227 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1)) 0228 self.nofill = self.nofill + 1 0229 0230 def end_pre(self): 0231 self.formatter.end_paragraph(1) 0232 self.formatter.pop_font() 0233 self.nofill = max(0, self.nofill - 1) 0234 0235 def start_xmp(self, attrs): 0236 self.start_pre(attrs) 0237 self.setliteral('xmp') # Tell SGML parser 0238 0239 def end_xmp(self): 0240 self.end_pre() 0241 0242 def start_listing(self, attrs): 0243 self.start_pre(attrs) 0244 self.setliteral('listing') # Tell SGML parser 0245 0246 def end_listing(self): 0247 self.end_pre() 0248 0249 def start_address(self, attrs): 0250 self.formatter.end_paragraph(0) 0251 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS)) 0252 0253 def end_address(self): 0254 self.formatter.end_paragraph(0) 0255 self.formatter.pop_font() 0256 0257 def start_blockquote(self, attrs): 0258 self.formatter.end_paragraph(1) 0259 self.formatter.push_margin('blockquote') 0260 0261 def end_blockquote(self): 0262 self.formatter.end_paragraph(1) 0263 self.formatter.pop_margin() 0264 0265 # --- List Elements 0266 0267 def start_ul(self, attrs): 0268 self.formatter.end_paragraph(not self.list_stack) 0269 self.formatter.push_margin('ul') 0270 self.list_stack.append(['ul', '*', 0]) 0271 0272 def end_ul(self): 0273 if self.list_stack: del self.list_stack[-1] 0274 self.formatter.end_paragraph(not self.list_stack) 0275 self.formatter.pop_margin() 0276 0277 def do_li(self, attrs): 0278 self.formatter.end_paragraph(0) 0279 if self.list_stack: 0280 [dummy, label, counter] = top = self.list_stack[-1] 0281 top[2] = counter = counter+1 0282 else: 0283 label, counter = '*', 0 0284 self.formatter.add_label_data(label, 
counter) 0285 0286 def start_ol(self, attrs): 0287 self.formatter.end_paragraph(not self.list_stack) 0288 self.formatter.push_margin('ol') 0289 label = '1.' 0290 for a, v in attrs: 0291 if a == 'type': 0292 if len(v) == 1: v = v + '.' 0293 label = v 0294 self.list_stack.append(['ol', label, 0]) 0295 0296 def end_ol(self): 0297 if self.list_stack: del self.list_stack[-1] 0298 self.formatter.end_paragraph(not self.list_stack) 0299 self.formatter.pop_margin() 0300 0301 def start_menu(self, attrs): 0302 self.start_ul(attrs) 0303 0304 def end_menu(self): 0305 self.end_ul() 0306 0307 def start_dir(self, attrs): 0308 self.start_ul(attrs) 0309 0310 def end_dir(self): 0311 self.end_ul() 0312 0313 def start_dl(self, attrs): 0314 self.formatter.end_paragraph(1) 0315 self.list_stack.append(['dl', '', 0]) 0316 0317 def end_dl(self): 0318 self.ddpop(1) 0319 if self.list_stack: del self.list_stack[-1] 0320 0321 def do_dt(self, attrs): 0322 self.ddpop() 0323 0324 def do_dd(self, attrs): 0325 self.ddpop() 0326 self.formatter.push_margin('dd') 0327 self.list_stack.append(['dd', '', 0]) 0328 0329 def ddpop(self, bl=0): 0330 self.formatter.end_paragraph(bl) 0331 if self.list_stack: 0332 if self.list_stack[-1][0] == 'dd': 0333 del self.list_stack[-1] 0334 self.formatter.pop_margin() 0335 0336 # --- Phrase Markup 0337 0338 # Idiomatic Elements 0339 0340 def start_cite(self, attrs): self.start_i(attrs) 0341 def end_cite(self): self.end_i() 0342 0343 def start_code(self, attrs): self.start_tt(attrs) 0344 def end_code(self): self.end_tt() 0345 0346 def start_em(self, attrs): self.start_i(attrs) 0347 def end_em(self): self.end_i() 0348 0349 def start_kbd(self, attrs): self.start_tt(attrs) 0350 def end_kbd(self): self.end_tt() 0351 0352 def start_samp(self, attrs): self.start_tt(attrs) 0353 def end_samp(self): self.end_tt() 0354 0355 def start_strong(self, attrs): self.start_b(attrs) 0356 def end_strong(self): self.end_b() 0357 0358 def start_var(self, attrs): self.start_i(attrs) 0359 def 
end_var(self): self.end_i() 0360 0361 # Typographic Elements 0362 0363 def start_i(self, attrs): 0364 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS)) 0365 def end_i(self): 0366 self.formatter.pop_font() 0367 0368 def start_b(self, attrs): 0369 self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS)) 0370 def end_b(self): 0371 self.formatter.pop_font() 0372 0373 def start_tt(self, attrs): 0374 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1)) 0375 def end_tt(self): 0376 self.formatter.pop_font() 0377 0378 def start_a(self, attrs): 0379 href = '' 0380 name = '' 0381 type = '' 0382 for attrname, value in attrs: 0383 value = value.strip() 0384 if attrname == 'href': 0385 href = value 0386 if attrname == 'name': 0387 name = value 0388 if attrname == 'type': 0389 type = value.lower() 0390 self.anchor_bgn(href, name, type) 0391 0392 def end_a(self): 0393 self.anchor_end() 0394 0395 # --- Line Break 0396 0397 def do_br(self, attrs): 0398 self.formatter.add_line_break() 0399 0400 # --- Horizontal Rule 0401 0402 def do_hr(self, attrs): 0403 self.formatter.add_hor_rule() 0404 0405 # --- Image 0406 0407 def do_img(self, attrs): 0408 align = '' 0409 alt = '(image)' 0410 ismap = '' 0411 src = '' 0412 width = 0 0413 height = 0 0414 for attrname, value in attrs: 0415 if attrname == 'align': 0416 align = value 0417 if attrname == 'alt': 0418 alt = value 0419 if attrname == 'ismap': 0420 ismap = value 0421 if attrname == 'src': 0422 src = value 0423 if attrname == 'width': 0424 try: width = int(value) 0425 except ValueError: pass 0426 if attrname == 'height': 0427 try: height = int(value) 0428 except ValueError: pass 0429 self.handle_image(src, alt, ismap, align, width, height) 0430 0431 # --- Really Old Unofficial Deprecated Stuff 0432 0433 def do_plaintext(self, attrs): 0434 self.start_pre(attrs) 0435 self.setnomoretags() # Tell SGML parser 0436 0437 # --- Unhandled tags 0438 0439 def unknown_starttag(self, tag, attrs): 0440 pass 0441 0442 def unknown_endtag(self, tag): 0443 pass 
def test(args=None):
    """Parse an HTML file and send the result to a formatter.

    args is a list of command-line style arguments; when it is empty or
    None, sys.argv[1:] is used instead.  A leading '-s' selects a
    NullFormatter (silent run); otherwise output is produced through an
    AbstractFormatter wrapping a DumbWriter.  The next argument names the
    input file; it defaults to 'test.html', and '-' means standard input.

    If the file cannot be opened, "<file> : <error>" is printed and the
    process exits with status 1.  The list passed in as args is never
    modified.

    """
    import sys
    import formatter

    if not args:
        args = sys.argv[1:]
    else:
        # Work on a copy: the '-s' flag is deleted below and the caller's
        # list must not change as a side effect.
        args = list(args)

    silent = bool(args) and args[0] == '-s'
    if silent:
        del args[0]

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            source = open(path, 'r')
        except IOError:
            # sys.exc_info() and an explicit write give the same output
            # as the former print statement while staying valid syntax
            # on every Python version.
            msg = sys.exc_info()[1]
            sys.stdout.write("%s : %s\n" % (path, msg))
            sys.exit(1)
        try:
            data = source.read()
        finally:
            # Close the file even when read() fails.
            source.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()


if __name__ == '__main__':
    test()
# Generated by PyXR 0.9.4