PyXR

c:\python24\lib\htmllib.py



0001 """HTML 2.0 parser.
0002 
0003 See the HTML 2.0 specification:
0004 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
0005 """
0006 
0007 import sgmllib
0008 
0009 from formatter import AS_IS
0010 
0011 __all__ = ["HTMLParser", "HTMLParseError"]
0012 
0013 
0014 class HTMLParseError(sgmllib.SGMLParseError):
0015     """Error raised when an HTML document can't be parsed."""
0016 
0017 
0018 class HTMLParser(sgmllib.SGMLParser):
0019     """This is the basic HTML parser class.
0020 
0021     It supports all entity names required by the XHTML 1.0 Recommendation.
0022     It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
0023     elements.
0024 
0025     """
0026 
0027     from htmlentitydefs import entitydefs
0028 
0029     def __init__(self, formatter, verbose=0):
0030         """Creates an instance of the HTMLParser class.
0031 
0032         The formatter parameter is the formatter instance associated with
0033         the parser.
0034 
0035         """
0036         sgmllib.SGMLParser.__init__(self, verbose)
0037         self.formatter = formatter
0038 
0039     def error(self, message):
0040         raise HTMLParseError(message)
0041 
0042     def reset(self):
0043         sgmllib.SGMLParser.reset(self)
0044         self.savedata = None
0045         self.isindex = 0
0046         self.title = None
0047         self.base = None
0048         self.anchor = None
0049         self.anchorlist = []
0050         self.nofill = 0
0051         self.list_stack = []
0052 
0053     # ------ Methods used internally; some may be overridden
0054 
0055     # --- Formatter interface, taking care of 'savedata' mode;
0056     # shouldn't need to be overridden
0057 
0058     def handle_data(self, data):
0059         if self.savedata is not None:
0060             self.savedata = self.savedata + data
0061         else:
0062             if self.nofill:
0063                 self.formatter.add_literal_data(data)
0064             else:
0065                 self.formatter.add_flowing_data(data)
0066 
0067     # --- Hooks to save data; shouldn't need to be overridden
0068 
0069     def save_bgn(self):
0070         """Begins saving character data in a buffer instead of sending it
0071         to the formatter object.
0072 
0073         Retrieve the stored data via the save_end() method.  Use of the
0074         save_bgn() / save_end() pair may not be nested.
0075 
0076         """
0077         self.savedata = ''
0078 
0079     def save_end(self):
0080         """Ends buffering character data and returns all data saved since
0081         the preceding call to the save_bgn() method.
0082 
0083         If the nofill flag is false, whitespace is collapsed to single
0084         spaces.  A call to this method without a preceding call to the
0085         save_bgn() method will raise a TypeError exception.
0086 
0087         """
0088         data = self.savedata
0089         self.savedata = None
0090         if not self.nofill:
0091             data = ' '.join(data.split())
0092         return data
0093 
0094     # --- Hooks for anchors; should probably be overridden
0095 
0096     def anchor_bgn(self, href, name, type):
0097         """This method is called at the start of an anchor region.
0098 
0099         The arguments correspond to the attributes of the <A> tag with
0100         the same names.  The default implementation maintains a list of
0101         hyperlinks (defined by the HREF attribute for <A> tags) within
0102         the document.  The list of hyperlinks is available as the data
0103         attribute anchorlist.
0104 
0105         """
0106         self.anchor = href
0107         if self.anchor:
0108             self.anchorlist.append(href)
0109 
0110     def anchor_end(self):
0111         """This method is called at the end of an anchor region.
0112 
0113         The default implementation adds a textual footnote marker using an
0114         index into the list of hyperlinks created by the anchor_bgn()method.
0115 
0116         """
0117         if self.anchor:
0118             self.handle_data("[%d]" % len(self.anchorlist))
0119             self.anchor = None
0120 
0121     # --- Hook for images; should probably be overridden
0122 
0123     def handle_image(self, src, alt, *args):
0124         """This method is called to handle images.
0125 
0126         The default implementation simply passes the alt value to the
0127         handle_data() method.
0128 
0129         """
0130         self.handle_data(alt)
0131 
0132     # --------- Top level elememts
0133 
0134     def start_html(self, attrs): pass
0135     def end_html(self): pass
0136 
0137     def start_head(self, attrs): pass
0138     def end_head(self): pass
0139 
0140     def start_body(self, attrs): pass
0141     def end_body(self): pass
0142 
0143     # ------ Head elements
0144 
0145     def start_title(self, attrs):
0146         self.save_bgn()
0147 
0148     def end_title(self):
0149         self.title = self.save_end()
0150 
0151     def do_base(self, attrs):
0152         for a, v in attrs:
0153             if a == 'href':
0154                 self.base = v
0155 
0156     def do_isindex(self, attrs):
0157         self.isindex = 1
0158 
0159     def do_link(self, attrs):
0160         pass
0161 
0162     def do_meta(self, attrs):
0163         pass
0164 
0165     def do_nextid(self, attrs): # Deprecated
0166         pass
0167 
0168     # ------ Body elements
0169 
0170     # --- Headings
0171 
0172     def start_h1(self, attrs):
0173         self.formatter.end_paragraph(1)
0174         self.formatter.push_font(('h1', 0, 1, 0))
0175 
0176     def end_h1(self):
0177         self.formatter.end_paragraph(1)
0178         self.formatter.pop_font()
0179 
0180     def start_h2(self, attrs):
0181         self.formatter.end_paragraph(1)
0182         self.formatter.push_font(('h2', 0, 1, 0))
0183 
0184     def end_h2(self):
0185         self.formatter.end_paragraph(1)
0186         self.formatter.pop_font()
0187 
0188     def start_h3(self, attrs):
0189         self.formatter.end_paragraph(1)
0190         self.formatter.push_font(('h3', 0, 1, 0))
0191 
0192     def end_h3(self):
0193         self.formatter.end_paragraph(1)
0194         self.formatter.pop_font()
0195 
0196     def start_h4(self, attrs):
0197         self.formatter.end_paragraph(1)
0198         self.formatter.push_font(('h4', 0, 1, 0))
0199 
0200     def end_h4(self):
0201         self.formatter.end_paragraph(1)
0202         self.formatter.pop_font()
0203 
0204     def start_h5(self, attrs):
0205         self.formatter.end_paragraph(1)
0206         self.formatter.push_font(('h5', 0, 1, 0))
0207 
0208     def end_h5(self):
0209         self.formatter.end_paragraph(1)
0210         self.formatter.pop_font()
0211 
0212     def start_h6(self, attrs):
0213         self.formatter.end_paragraph(1)
0214         self.formatter.push_font(('h6', 0, 1, 0))
0215 
0216     def end_h6(self):
0217         self.formatter.end_paragraph(1)
0218         self.formatter.pop_font()
0219 
0220     # --- Block Structuring Elements
0221 
0222     def do_p(self, attrs):
0223         self.formatter.end_paragraph(1)
0224 
0225     def start_pre(self, attrs):
0226         self.formatter.end_paragraph(1)
0227         self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
0228         self.nofill = self.nofill + 1
0229 
0230     def end_pre(self):
0231         self.formatter.end_paragraph(1)
0232         self.formatter.pop_font()
0233         self.nofill = max(0, self.nofill - 1)
0234 
0235     def start_xmp(self, attrs):
0236         self.start_pre(attrs)
0237         self.setliteral('xmp') # Tell SGML parser
0238 
0239     def end_xmp(self):
0240         self.end_pre()
0241 
0242     def start_listing(self, attrs):
0243         self.start_pre(attrs)
0244         self.setliteral('listing') # Tell SGML parser
0245 
0246     def end_listing(self):
0247         self.end_pre()
0248 
0249     def start_address(self, attrs):
0250         self.formatter.end_paragraph(0)
0251         self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
0252 
0253     def end_address(self):
0254         self.formatter.end_paragraph(0)
0255         self.formatter.pop_font()
0256 
0257     def start_blockquote(self, attrs):
0258         self.formatter.end_paragraph(1)
0259         self.formatter.push_margin('blockquote')
0260 
0261     def end_blockquote(self):
0262         self.formatter.end_paragraph(1)
0263         self.formatter.pop_margin()
0264 
0265     # --- List Elements
0266 
0267     def start_ul(self, attrs):
0268         self.formatter.end_paragraph(not self.list_stack)
0269         self.formatter.push_margin('ul')
0270         self.list_stack.append(['ul', '*', 0])
0271 
0272     def end_ul(self):
0273         if self.list_stack: del self.list_stack[-1]
0274         self.formatter.end_paragraph(not self.list_stack)
0275         self.formatter.pop_margin()
0276 
0277     def do_li(self, attrs):
0278         self.formatter.end_paragraph(0)
0279         if self.list_stack:
0280             [dummy, label, counter] = top = self.list_stack[-1]
0281             top[2] = counter = counter+1
0282         else:
0283             label, counter = '*', 0
0284         self.formatter.add_label_data(label, counter)
0285 
0286     def start_ol(self, attrs):
0287         self.formatter.end_paragraph(not self.list_stack)
0288         self.formatter.push_margin('ol')
0289         label = '1.'
0290         for a, v in attrs:
0291             if a == 'type':
0292                 if len(v) == 1: v = v + '.'
0293                 label = v
0294         self.list_stack.append(['ol', label, 0])
0295 
0296     def end_ol(self):
0297         if self.list_stack: del self.list_stack[-1]
0298         self.formatter.end_paragraph(not self.list_stack)
0299         self.formatter.pop_margin()
0300 
0301     def start_menu(self, attrs):
0302         self.start_ul(attrs)
0303 
0304     def end_menu(self):
0305         self.end_ul()
0306 
0307     def start_dir(self, attrs):
0308         self.start_ul(attrs)
0309 
0310     def end_dir(self):
0311         self.end_ul()
0312 
0313     def start_dl(self, attrs):
0314         self.formatter.end_paragraph(1)
0315         self.list_stack.append(['dl', '', 0])
0316 
0317     def end_dl(self):
0318         self.ddpop(1)
0319         if self.list_stack: del self.list_stack[-1]
0320 
0321     def do_dt(self, attrs):
0322         self.ddpop()
0323 
0324     def do_dd(self, attrs):
0325         self.ddpop()
0326         self.formatter.push_margin('dd')
0327         self.list_stack.append(['dd', '', 0])
0328 
0329     def ddpop(self, bl=0):
0330         self.formatter.end_paragraph(bl)
0331         if self.list_stack:
0332             if self.list_stack[-1][0] == 'dd':
0333                 del self.list_stack[-1]
0334                 self.formatter.pop_margin()
0335 
0336     # --- Phrase Markup
0337 
0338     # Idiomatic Elements
0339 
0340     def start_cite(self, attrs): self.start_i(attrs)
0341     def end_cite(self): self.end_i()
0342 
0343     def start_code(self, attrs): self.start_tt(attrs)
0344     def end_code(self): self.end_tt()
0345 
0346     def start_em(self, attrs): self.start_i(attrs)
0347     def end_em(self): self.end_i()
0348 
0349     def start_kbd(self, attrs): self.start_tt(attrs)
0350     def end_kbd(self): self.end_tt()
0351 
0352     def start_samp(self, attrs): self.start_tt(attrs)
0353     def end_samp(self): self.end_tt()
0354 
0355     def start_strong(self, attrs): self.start_b(attrs)
0356     def end_strong(self): self.end_b()
0357 
0358     def start_var(self, attrs): self.start_i(attrs)
0359     def end_var(self): self.end_i()
0360 
0361     # Typographic Elements
0362 
0363     def start_i(self, attrs):
0364         self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
0365     def end_i(self):
0366         self.formatter.pop_font()
0367 
0368     def start_b(self, attrs):
0369         self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
0370     def end_b(self):
0371         self.formatter.pop_font()
0372 
0373     def start_tt(self, attrs):
0374         self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
0375     def end_tt(self):
0376         self.formatter.pop_font()
0377 
0378     def start_a(self, attrs):
0379         href = ''
0380         name = ''
0381         type = ''
0382         for attrname, value in attrs:
0383             value = value.strip()
0384             if attrname == 'href':
0385                 href = value
0386             if attrname == 'name':
0387                 name = value
0388             if attrname == 'type':
0389                 type = value.lower()
0390         self.anchor_bgn(href, name, type)
0391 
0392     def end_a(self):
0393         self.anchor_end()
0394 
0395     # --- Line Break
0396 
0397     def do_br(self, attrs):
0398         self.formatter.add_line_break()
0399 
0400     # --- Horizontal Rule
0401 
0402     def do_hr(self, attrs):
0403         self.formatter.add_hor_rule()
0404 
0405     # --- Image
0406 
0407     def do_img(self, attrs):
0408         align = ''
0409         alt = '(image)'
0410         ismap = ''
0411         src = ''
0412         width = 0
0413         height = 0
0414         for attrname, value in attrs:
0415             if attrname == 'align':
0416                 align = value
0417             if attrname == 'alt':
0418                 alt = value
0419             if attrname == 'ismap':
0420                 ismap = value
0421             if attrname == 'src':
0422                 src = value
0423             if attrname == 'width':
0424                 try: width = int(value)
0425                 except ValueError: pass
0426             if attrname == 'height':
0427                 try: height = int(value)
0428                 except ValueError: pass
0429         self.handle_image(src, alt, ismap, align, width, height)
0430 
0431     # --- Really Old Unofficial Deprecated Stuff
0432 
0433     def do_plaintext(self, attrs):
0434         self.start_pre(attrs)
0435         self.setnomoretags() # Tell SGML parser
0436 
0437     # --- Unhandled tags
0438 
0439     def unknown_starttag(self, tag, attrs):
0440         pass
0441 
0442     def unknown_endtag(self, tag):
0443         pass
0444 
0445 
def test(args = None):
    """Command-line test driver: parse one HTML file with HTMLParser.

    args is a sequence of command-line words; when it is empty or None,
    sys.argv[1:] is used instead.  An optional leading '-s' selects silent
    mode, in which the document is parsed with a NullFormatter.  Otherwise
    an AbstractFormatter wrapping a DumbWriter is used.  The next word names
    the input file ('test.html' by default); '-' means standard input.

    If the file cannot be opened, a message is printed and the process
    exits with status 1.

    """
    import sys
    import formatter

    if not args:
        args = sys.argv[1:]
    else:
        # Work on a copy so that removing '-s' below never modifies the
        # caller's sequence.
        args = list(args)

    silent = bool(args) and args[0] == '-s'
    if silent:
        del args[0]

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            infile = open(filename, 'r')
        except IOError:
            # sys.exc_info() instead of "except IOError, msg" keeps this
            # block parseable by every Python version.
            msg = sys.exc_info()[1]
            sys.stdout.write("%s : %s\n" % (filename, msg))
            sys.exit(1)
        # Close the file even if reading it fails.
        try:
            data = infile.read()
        finally:
            infile.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()
0483 
0484 
0485 if __name__ == '__main__':
0486     test()
0487 

Generated by PyXR 0.9.4
SourceForge.net Logo