# Copyright 2003, Grant T. Olson, see License.txt for details

try:
    from HTMLParser import HTMLParser  # Python 2 module name
except ImportError:
    from html.parser import HTMLParser  # Python 3 module name
import os
from misc import useLibraryReference, libDirectory, libUrl


# Shared parser state.  Every parser in this module reads and rebinds these
# two globals, so the scope discovered in one document carries over into the
# next document that gets parsed.
currentModule = None
currentClass = None


def _readFile(path):
    """Return the entire contents of the file at path.

    The handle is always closed, even if the read fails.
    """
    f = open(path)
    try:
        return f.read()
    finally:
        f.close()


class _nameParser(HTMLParser):
    """
    Behaviour shared by the per-document parsers.

    Subclasses decide which tags mark a name; this class turns a found name
    into a (scope, url) pair and passes it to the handler callback.
    """

    # Values of the <tt class=...> attribute whose names are nested under
    # the current class (when there is one) instead of directly under the
    # current module.
    classScopedTypes = ()

    def __init__(self, handler, url):
        """
        handler -- callable invoked as handler(scope, url) for each name
                   found; scope is a list of name components.
        url     -- file name of the document being parsed; it is combined
                   with libUrl and the anchor to build each link.
        """
        HTMLParser.__init__(self)
        self.stack = []
        self.handler = handler
        self.url = url

    def attrs2dict(self, attrs):
        """Convert HTMLParser's list of (key, value) pairs into a dict."""
        return dict(attrs)

    def _recordName(self, ttAttrs, nameRef, name):
        """
        Report one discovered name to the handler, updating the globals.

        ttAttrs -- attribute dict of the <tt> tag; its 'class' entry gives
                   the kind of name and defaults to 'variable' when absent.
        nameRef -- anchor within the document that the link points to.
        name    -- the text found inside the <tt> tag.
        """
        global currentModule, currentClass
        nameType = ttAttrs.get('class', 'variable')
        url = "%s%s#%s" % (libUrl, self.url, nameRef)

        if nameType == 'module':
            # A new module also ends the scope of the previous class.
            currentModule = name
            currentClass = None
            scope = [name]
        elif nameType == 'class':
            currentClass = name
            scope = []
            if currentModule:
                scope.append(currentModule)
            scope.append(name)
        else:
            scope = []
            if currentModule:
                scope.append(currentModule)
            if nameType in self.classScopedTypes and currentClass:
                scope.append(currentClass)
            scope.append(name)
        self.handler(scope, url)


class subdocumentParser23(_nameParser):
    """
    This uses much bad voodoo.

    It extracts link info from existing python library .html documents.
    Extraction is based on a lot of assumptions that may not hold true in
    later versions of python.

    Some assumptions:

      lib.html is the master document.  Any links contained within
        <ul class='ChildLinks'> are subdocuments for our purposes and will
        be crawled.
      If a filename starts with "module-" it's a module
      If a filename ends with "-objects.html" it's documentation for a class
      Modules, classes, functions, variables, methods and members can also
        be declared by a <tt> tag that is surrounded by an <a name=...> tag.
        The data surrounded by <tt></tt> is the name of the member,
        function, etc..., and the <a name=...> points to the link.
      The above references cannot be nested.
      If a module is found (either via a "module-" file or a module tag),
        it is set as the current module.  It remains the active module
        until another module is found.
      If a class is found (via a class tag) it remains the current class
        until another class declaration is found or the active module is
        switched
      If a member or method is found, it is applied to the current class
      If a variable or function is found, it is applied to the current
        module

    And since we're storing currentModule and currentClass as globals, this
    library obviously isn't threadsafe and classes aren't reentrant.
    """

    classScopedTypes = ('method', 'member')

    def handle_starttag(self, tag, attrs):
        """Push <a name=...> tags, and <tt> tags directly inside one."""
        attrDict = self.attrs2dict(attrs)
        if tag == 'a' and 'name' in attrDict:
            self.stack.append([tag, attrDict])
        elif tag == 'tt' and self.stack and self.stack[-1][0] == 'a':
            self.stack.append([tag, attrDict])

    def handle_endtag(self, tag):
        """Pop the stack when the tag on top of it is being closed."""
        if tag in ('a', 'tt') and self.stack and self.stack[-1][0] == tag:
            self.stack.pop()

    def handle_data(self, data):
        """
        Record data as a name when it sits inside <a name=...><tt>.

        Raises RuntimeError if the tag stack is not the expected [a, tt].
        """
        depth = len(self.stack)
        if depth > 2:
            raise RuntimeError("Malformed stack")
        if depth != 2:
            return
        anchor, tt = self.stack
        if tt[0] != 'tt' or anchor[0] != 'a':
            raise RuntimeError("Malformed stack")
        self._recordName(tt[1], anchor[1]['name'], data)


class subdocumentParser24(_nameParser):
    """
    The doc formats changed for 24 as expected.  Now we look for a tt tag
    with an id and class.
    TODO: still don't catch variables.
    """

    classScopedTypes = ('method', 'member', 'function')

    def handle_starttag(self, tag, attrs):
        """Push <tt> tags that carry an id attribute."""
        attrDict = self.attrs2dict(attrs)
        if tag == 'tt' and 'id' in attrDict:
            self.stack.append([tag, attrDict])

    def handle_endtag(self, tag):
        """Pop the stack when a tracked <tt> is being closed."""
        if tag == 'tt' and self.stack and self.stack[-1][0] == 'tt':
            self.stack.pop()

    def handle_data(self, data):
        """
        Record data as a name when it sits inside a tracked <tt id=...>.

        Raises RuntimeError if more than one tracked <tt> is open.
        """
        depth = len(self.stack)
        if depth > 1:
            raise RuntimeError("Malformed stack")
        if depth != 1:
            return
        tt = self.stack[-1]
        if tt[0] != 'tt':
            raise RuntimeError("Malformed stack: python24")
        self._recordName(tt[1], tt[1]['id'], data)


subdocumentParser = subdocumentParser24


class libParser(HTMLParser):
    """
    Parser for the master document (lib.html).

    Each link found inside a <ul class='ChildLinks'> list is reported to the
    handler when its file name identifies a module, a class or a part of
    the current module; the linked file is then read from libDirectory and
    fed to subdocumentParser.
    """

    def __init__(self, handler):
        """
        handler -- callable invoked as handler(scope, url) for each name
                   found, here and in every crawled subdocument.
        """
        HTMLParser.__init__(self)
        # One entry per open <ul>: 1 for a ChildLinks list, 0 otherwise.
        self.uls = []
        self.handler = handler
        global currentModule, currentClass
        currentModule = "__builtins__"
        # Start each run without a class left over from an earlier run,
        # matching the rule that a module switch resets the class.
        currentClass = None

    def attrs2dict(self, attrs):
        """Convert HTMLParser's list of (key, value) pairs into a dict."""
        return dict(attrs)

    def getModuleName(self, filename):
        """
        If a file is a module, return the module name,
        else return None
        """
        if filename.startswith("module-"):
            return filename[len("module-"):-5]  # strip module- and .html
        return None

    def getClassName(self, filename):
        """
        Returns the object name if it's an object,
        else None
        """
        suffix = "-objects.html"
        if not filename.endswith(suffix):
            return None
        if filename.startswith("bltin-"):  # hack for file, ellipsis, etc
            return filename[len("bltin-"):-len(suffix)]
        return filename[:-len(suffix)]

    def getModulePartName(self, filename):
        "Last resort for things in multiple files like os"
        prefix = filename.split('-')[0]
        if prefix in ('bltin', 'module') or prefix.endswith('.html'):
            return None
        return prefix

    def handle_starttag(self, tag, attrs):
        """Track <ul> nesting and process links inside ChildLinks lists."""
        global currentModule, currentClass
        if tag == 'ul':
            attrDict = self.attrs2dict(attrs)
            if attrDict.get('class') == 'ChildLinks':
                self.uls.append(1)
            else:
                self.uls.append(0)
            return
        if tag != 'a' or 1 not in self.uls:
            return

        # An <a> tag inside "Child Links".
        filename = self.attrs2dict(attrs).get('href')
        if not filename:
            # No href, or an empty one: there is no file to classify or
            # crawl.
            return

        url = "%s%s" % (libUrl, filename)
        moduleName = self.getModuleName(filename)
        className = self.getClassName(filename)
        if moduleName:
            currentModule = moduleName
            currentClass = None  # resets at module level
            self.handler([moduleName], url)
        elif className:
            currentClass = className
            scope = []
            if currentModule:
                scope.append(currentModule)
            scope.append(currentClass)
            self.handler(scope, url)
        else:
            modulePartName = self.getModulePartName(filename)
            if modulePartName and modulePartName == currentModule:
                self.handler([currentModule], url)

        # Crawl the linked subdocument for the names declared inside it.
        contents = _readFile(os.path.join(libDirectory, filename))
        hr = subdocumentParser(self.handler, filename)
        hr.feed(contents)

    def handle_endtag(self, tag):
        """Forget the innermost <ul>; a stray </ul> is ignored."""
        if tag == 'ul' and self.uls:
            self.uls.pop()


if __name__ == "__main__" and useLibraryReference:
    def handler(scope, url):
        print("%s %s" % (scope, url))

    f = _readFile(os.path.join(libDirectory, "lib.html"))
    x = libParser(handler)
    x.feed(f)

# (Listing footer: "Generated by PyXR 0.9.4")