PyXR

C:\Python24\Lib\site-packages\pyxr \ htmlCrawler.py



0001 # Copyright 2003, Grant T. Olson, see License.txt for details
0002 
0003 from HTMLParser import HTMLParser
0004 import os
0005 from misc import useLibraryReference, libDirectory, libUrl
0006 
0007 
0008 
# Crawl state shared by every parser in this module.  The parsers rebind these
# through ``global`` statements whenever a module or class declaration is found.
currentModule = None
currentClass = None
0011 
class subdocumentParser23(HTMLParser):
    """
    Extract link info from Python 2.3 style library reference .html documents.

    This uses much bad voodoo.  Extraction is based on a lot of assumptions
    that may not hold true in later versions of python.

    Some assumptions:

        lib.html is the master document.  Any links contained within
            <ul class='ChildLinks'> are subdocuments for our purposes and will
            be crawled.
        If a filename starts with "module-" it's a module.
        If a filename ends with "-objects.html" it's documentation for a class.
        Modules, classes, functions, variables, methods and members can also be
            declared by a <tt> tag that is surrounded by an <a name=...> tag.
            The data surrounded by <tt></tt> is the name of the member,
            function, etc., and the <a name=...> points to the link.
        The above references cannot be nested.
        If a module is found (either via a "module-" file or a module tag), it
            is set as the current module.  It remains the active module until
            another module is found.
        If a class is found (via a class tag) it remains the current class
            until another class declaration is found or the active module is
            switched.
        If a member or method is found, it is applied to the current class.
        If a variable or function is found, it is applied to the current module.

    And since we're storing currentModule and currentClass as globals, this
    library obviously isn't threadsafe and classes aren't reentrant.

    handler is called as handler(scope, url) for every name found, where scope
    is a list of name parts (module, class, name) and url is libUrl + the
    subdocument url + '#' + the anchor name.
    """
    def __init__(self, handler, url):
        HTMLParser.__init__(self)
        # Holds at most one [tag, attrDict] entry for an enclosing <a name=...>
        # followed by one for the <tt> nested inside it.
        self.stack = []
        self.handler = handler
        self.url = url

    def attrs2dict(self, attrs):
        """Return the (name, value) attribute pairs as a dictionary."""
        return dict(attrs)

    def handle_starttag(self, tag, attrs):
        """Push named anchors, and <tt> tags that open directly inside one."""
        attrDict = self.attrs2dict(attrs)
        if tag == 'a' and 'name' in attrDict:
            self.stack.append([tag, attrDict])
        elif tag == 'tt' and self.stack and self.stack[-1][0] == 'a':
            self.stack.append([tag, attrDict])

    def handle_endtag(self, tag):
        """Pop the stack when the closing tag matches the entry on top."""
        if tag in ('a', 'tt') and self.stack and self.stack[-1][0] == tag:
            self.stack.pop()

    def handle_data(self, data):
        """
        Report text found inside <a name=...><tt ...> to the handler.

        Raises RuntimeError if the tag stack is not the expected [a, tt] shape.
        """
        global currentModule, currentClass
        depth = len(self.stack)
        if depth > 2:
            raise RuntimeError("Malformed stack")
        if depth != 2:
            return  # not inside an <a name=...><tt> pair

        a, tt = self.stack
        if tt[0] != 'tt' or a[0] != 'a':
            raise RuntimeError("Malformed stack")

        # A <tt> without a class attribute is treated as a variable.
        nameType = tt[1].get('class', 'variable')
        url = "%s%s#%s" % (libUrl, self.url, a[1]['name'])

        scope = []
        if nameType == "module":
            # A new module also resets the current class.
            currentModule = data
            currentClass = None
            scope.append(currentModule)
        elif nameType == 'class':
            currentClass = data
            if currentModule:
                scope.append(currentModule)
            scope.append(currentClass)
        else:
            if currentModule:
                scope.append(currentModule)
            # Only methods and members are qualified by the current class;
            # everything else hangs directly off the module.
            if nameType in ('method', 'member') and currentClass:
                scope.append(currentClass)
            scope.append(data)
        self.handler(scope, url)
0113 
class subdocumentParser24(HTMLParser):
    """
    Extract link info from Python 2.4 style library reference .html documents.

    The doc formats changed for 24 as expected.  Now we look for a tt tag with
    an id and class: the text inside the tag is the name being declared, the
    class attribute says what kind of name it is, and the id is the anchor.
    TODO: still don't catch variables.

    handler is called as handler(scope, url) for every name found, where scope
    is a list of name parts (module, class, name) and url is libUrl + the
    subdocument url + '#' + the tag id.  The current module and class are kept
    in the module-level globals currentModule and currentClass.
    """
    def __init__(self, handler, url):
        HTMLParser.__init__(self)
        # Holds at most one [tag, attrDict] entry: the <tt id=...> being read.
        self.stack = []
        self.handler = handler
        self.url = url

    def attrs2dict(self, attrs):
        """Return the (name, value) attribute pairs as a dictionary."""
        return dict(attrs)

    def handle_starttag(self, tag, attrs):
        """Push every <tt> tag that carries an id attribute."""
        attrDict = self.attrs2dict(attrs)
        if tag == 'tt' and 'id' in attrDict:
            self.stack.append([tag, attrDict])

    def handle_endtag(self, tag):
        """Pop the stack on </tt> when a <tt> entry is on top."""
        if tag == 'tt' and self.stack and self.stack[-1][0] == 'tt':
            self.stack.pop()

    def handle_data(self, data):
        """
        Report text found inside a <tt id=...> tag to the handler.

        Raises RuntimeError if more than one entry is stacked or the entry is
        not a tt tag.
        """
        global currentModule, currentClass
        depth = len(self.stack)
        if depth > 1:
            raise RuntimeError("Malformed stack")
        if depth == 0:
            return  # not inside a <tt id=...> tag

        tt = self.stack[-1]
        if tt[0] != 'tt':
            raise RuntimeError("Malformed stack: python24")

        attrDict = tt[1]
        # A <tt> without a class attribute is treated as a variable.
        nameType = attrDict.get('class', 'variable')
        url = "%s%s#%s" % (libUrl, self.url, attrDict['id'])

        scope = []
        if nameType == "module":
            # A new module also resets the current class.
            currentModule = data
            currentClass = None
            scope.append(currentModule)
        elif nameType == 'class':
            currentClass = data
            if currentModule:
                scope.append(currentModule)
            scope.append(currentClass)
        else:
            if currentModule:
                scope.append(currentModule)
            # Methods, members and functions are qualified by the current
            # class when there is one; any other kind hangs off the module.
            if nameType in ('method', 'member', 'function') and currentClass:
                scope.append(currentClass)
            scope.append(data)
        self.handler(scope, url)
0185 
# Parser class that libParser instantiates for each linked subdocument;
# currently bound to the parser for the 2.4 doc format.
subdocumentParser = subdocumentParser24
0187 
class libParser(HTMLParser):
    """
    Parser for the master document (lib.html).

    Every <a href=...> found inside a <ul class='ChildLinks'> is treated as a
    subdocument: the filename is classified as a module, a class or a module
    part and reported to handler(scope, url), then the file is read from
    libDirectory and fed through subdocumentParser with the same handler.

    Creating an instance resets the global currentModule to "__builtins__".
    """
    def __init__(self, handler):
        HTMLParser.__init__(self)
        # One entry per currently open <ul>: 1 if it is a ChildLinks list,
        # 0 otherwise.
        self.uls = []
        self.handler = handler
        global currentModule
        currentModule = "__builtins__"

    def attrs2dict(self, attrs):
        """Return the (name, value) attribute pairs as a dictionary."""
        return dict(attrs)

    def getModuleName(self, filename):
        """
        If a file is a module, return the module name,
        else return None
        """
        prefix = "module-"
        if filename.startswith(prefix):
            return filename[len(prefix):-5]  # strip module- and .html
        return None

    def getClassName(self, filename):
        """
        Return the object name if it's an object,
        else None
        """
        suffix = "-objects.html"
        if not filename.endswith(suffix):
            return None
        if filename.startswith("bltin-"):  # hack for file, ellipsis, etc
            return filename[len("bltin-"):-len(suffix)]
        return filename[:-len(suffix)]

    def getModulePartName(self, filename):
        "Last resort for things in multiple files like os"
        firstPart = filename.split('-')[0]
        if firstPart in ('bltin', 'module') or firstPart.endswith('.html'):
            return None
        return firstPart

    def handle_starttag(self, tag, attrs):
        """Track <ul> nesting and crawl links found inside ChildLinks lists."""
        global currentModule, currentClass
        if tag == 'ul':
            attrDict = self.attrs2dict(attrs)
            if attrDict.get('class') == 'ChildLinks':
                self.uls.append(1)
            else:
                self.uls.append(0)
            return

        if tag != 'a' or 1 not in self.uls:
            return  # only <a> tags inside "Child Links" are of interest

        attrDict = self.attrs2dict(attrs)
        if 'href' not in attrDict:
            return

        filename = attrDict['href']
        moduleName = self.getModuleName(filename)
        className = self.getClassName(filename)

        scope = None
        if moduleName:
            currentModule = moduleName
            currentClass = None  # resets at module level
            scope = [moduleName]
        elif className:
            currentClass = className
            scope = []
            if currentModule:
                scope.append(currentModule)
            scope.append(currentClass)
        else:
            modulePartName = self.getModulePartName(filename)
            if modulePartName and modulePartName == currentModule:
                scope = [currentModule]

        if scope is not None:
            self.handler(scope, "%s%s" % (libUrl, filename))

        # Close the subdocument explicitly; one file is opened per link, so
        # relying on garbage collection would leak handles during the crawl.
        subdocument = open(os.path.join(libDirectory, filename))
        try:
            contents = subdocument.read()
        finally:
            subdocument.close()
        hr = subdocumentParser(self.handler, filename)
        hr.feed(contents)

    def handle_endtag(self, tag):
        """Close the innermost <ul>; a stray </ul> with none open is ignored."""
        if tag == 'ul' and self.uls:
            self.uls.pop()
0274 
0275 
0276 
0277     
# Manual test driver: crawl the library reference and print every
# (scope, url) pair that the parsers report.
if __name__ == "__main__" and useLibraryReference:
    def handler(scope, url):
        print("%s %s" % (scope, url))

    # Close lib.html explicitly instead of leaving the handle to the
    # garbage collector.
    libFile = open(os.path.join(libDirectory, "lib.html"))
    try:
        f = libFile.read()
    finally:
        libFile.close()
    x = libParser(handler)
    x.feed(f)
0285 
0286     
0287 
0288 

Generated by PyXR 0.9.4
SourceForge.net Logo