# Scrape the MSDN Windows API index page and print each category name,
# followed by the tab-indented function names found on that category's page.
# Relies on `urlopen` and `re` already being imported by this file.

_INDEX_URL = 'http://msdn.microsoft.com/en-us/library/aa383686(VS.85).aspx'
_FUNCTIONS_HEADING = '<h2>Functions</h2>'


def _fetch(url):
    """Download *url* and return its body as text.

    The connection is closed even if reading fails.
    """
    page = urlopen(url)
    try:
        body = page.read()
    finally:
        page.close()
    # On Python 3 read() returns bytes, which the str regexes below cannot
    # search; on Python 2 the body is already a str and is left untouched.
    if not isinstance(body, str):
        body = body.decode('utf-8', 'replace')
    return body


def _category_links(index_html):
    """Return a list of (url, category_name) pairs found in *index_html*."""
    pairs = []
    for anchor in re.findall(r'href=".*?aspx">[a-zA-Z ()\-]+</a></li>', index_html):
        # The lazy `.*?` may start at an earlier href, so slice from the LAST
        # `href="` and the LAST quote to get this entry's own URL.
        url_start = anchor.rindex('href') + len('href="')
        url_end = anchor.rindex('"')
        # Skip the two characters `">` that separate the URL from the label.
        label = anchor[url_end + 2:anchor.rindex('</a>')]
        pairs.append((anchor[url_start:url_end], label))
    return pairs


def _function_names(page_html):
    """Return the function names listed in *page_html*.

    Bold names inside table cells are tried first. If there are none, the
    links inside the "Functions" section (up to the next <h2>) are used.
    Returns an empty list when neither form is present.
    """
    cells = re.findall(r'<b>[_a-zA-Z]+</b></a></td>', page_html)
    if cells:
        # Drop the leading '<b>' (3 characters) and everything from '</b>' on.
        return [cell[len('<b>'):cell.index('</b>')] for cell in cells]

    start = page_html.find(_FUNCTIONS_HEADING)
    if start == -1:
        # No "Functions" section: nothing to report. (Previously -1 was used
        # as a slice index, which only happened to produce no matches.)
        return []

    end = page_html.find('<h2>', start + len(_FUNCTIONS_HEADING))
    section = page_html[start:] if end == -1 else page_html[start:end]
    links = re.findall(
        r'http://msdn.microsoft.com/en-us/library/.*?[_a-zA-Z]+</a></b><p>',
        section)
    return [link[link.index('">') + 2:link.index('</a>')] for link in links]


# NOTE(review): c2s is never populated in this section; presumably meant to
# map category -> symbols. Confirm before relying on it.
c2s = {}

for subpage, category in _category_links(_fetch(_INDEX_URL)):
    print(category)
    # NOTE(review): assumes every category href is an absolute URL - confirm.
    for name in _function_names(_fetch(subpage)):
        print('\t%s' % name)
이렇게 하고 나서도 긁히지 않는 Category는 다음과 같다: Debug Help Library, Message and Message Queue, Multimedia Class Scheduler Service, Network Management, Performance Counters, Performance Logs and Alerts, PSAPI, Restart Manager, Static Control, Text Object Model, Thread Ordering Service, Thread Pool, Tool Help, Transactional NTFS (TxF), User Profiles, Wait Chain Traversal, Windows Error Reporting, Windows Networking.