Finally got the crawler working at home, typing the code in by hand.
As expected, self is a pain.
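(Just to show what I mean about self: every method has to declare it and go through it to reach instance state. A tiny made-up example, same Python 2 style as the crawler below.)

class counter:
    def __init__(self):
        self.count = 0           # instance state always lives on self

    def up(self):
        self.count += 1          # even inside the class it's self.count, not count
        return self.count

cnt = counter()
print cnt.up()   # 1
print cnt.up()   # 2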
Next, I want to take a look inside BeautifulSoup.
import urllib2
from BeautifulSoup import *
from urlparse import urljoin


class crawler:
    def addtoindex(self, url, soup):
        print 'Indexing %s' % url

    def crawl(self, pages, depth=2):
        # Breadth-first crawl: each pass of the outer loop goes one level deeper.
        for i in range(depth):
            newpages = set()
            for page in pages:
                print "page -->%s" % page
                try:
                    connect = urllib2.urlopen(page)
                except:
                    print "fault"
                    continue
                soup = BeautifulSoup(connect.read())
                self.addtoindex(page, soup)

                # Collect the links on this page as candidates for the next level.
                links = soup('a')
                for link in links:
                    if ('href' in dict(link.attrs)):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]  # drop the fragment part
                        if url[0:4] == "http":
                            newpages.add(url)
                            print "add-----------------------\n"
            pages = newpages


c = crawler()
c.crawl(['http://d.hatena.ne.jp/vestige/'])
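One place to start when digging into BeautifulSoup: the soup('a') call in the crawler is, as far as I can tell, just a shortcut for soup.findAll('a') (in BeautifulSoup 3 the tag object's __call__ hands its arguments to findAll). A quick self-contained check, assuming the same BeautifulSoup 3 / Python 2 setup as above; the HTML string is made up.

from BeautifulSoup import BeautifulSoup

html = '<a href="http://example.com/">one</a><a href="#top">two</a>'
soup = BeautifulSoup(html)

# Both forms should print the same two <a> tags.
print soup('a')
print soup.findAll('a')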