The function and class definitions

• The body of the program, which calls the functions and class

Listing 4-1: spider.py

# program: spider.py

# author: aahz

# date: June 2006

# description: start on command line with URL argument.

#              Finds pages within a web site.
###############################################

# These modules do most of the work.
import formatter
import htmllib
import sys
import urllib2
import urlparse
from cStringIO import StringIO

def log_stdout(msg):
    """Print msg to the screen."""
    # Parenthesized form prints the same text on Python 2 and also parses
    # on Python 3.
    print(msg)


def get_page(url, log):
    """Retrieve URL and return contents, log errors.

    url is the address to fetch and log is a callable that accepts one
    message string.  Returns the body of the page, or '' when the URL
    cannot be opened (the failure is reported through log).
    """
    try:
        page = urllib2.urlopen(url)
    except urllib2.URLError:
        log("Error retrieving: " + url)
        return ''
    try:
        body = page.read()
    finally:
        # Close the connection even if reading fails part-way through.
        page.close()
    return body


def find_links(html):
    """Return a list of links in html."""
    # We're using the parser just to collect the anchors; the formatted
    # text it produces is written to a throwaway StringIO and ignored.
    writer = formatter.DumbWriter(StringIO())
    f = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(f)
    parser.feed(html)
    parser.close()
    return parser.anchorlist


class Spider:
    """
    The heart of this program, finds all links within a web site.

    run() contains the main loop.
    process_page() retrieves each page and finds the links.
    """

    def __init__(self, startURL, log=None):
        """Set initial values.

        startURL is both the first page to fetch and the prefix a link
        must start with to count as part of the site.  log is a callable
        taking one message string; log_stdout is used when it is None.
        """
        self.URLs = set()
        self.URLs.add(startURL)
        self.include = startURL
        self._links_to_process = [startURL]
        if log is None:
            # Use log_stdout function if no log provided
            self.log = log_stdout
        else:
            self.log = log

    def run(self):
        """Process the list of URLs one at a time until none are left."""
        while self._links_to_process:
            url = self._links_to_process.pop()
            self.log("Retrieving: " + url)
            self.process_page(url)

    def url_in_site(self, link):
        """Check whether the link starts with the base URL."""
        return link.startswith(self.include)

    def process_page(self, url):
        """Retrieve the page at url and queue the new links found in it."""
        html = get_page(url, self.log)
        for link in find_links(html):
            # Handle relative links
            link = urlparse.urljoin(url, link)
            # Make sure this is a new URL within current site
            if link not in self.URLs and self.url_in_site(link):
                self.URLs.add(link)
                self._links_to_process.append(link)

if __name__ == '__main__':
    # This code runs when script is started from command line.
    # The first command-line argument is the URL to start from; it also
    # defines which links count as being inside the site.
    startURL = sys.argv[1]
    spider = Spider(startURL)
    spider.run()
    # Show every URL that was found, in sorted order.
    for URL in sorted(spider.URLs):
        print(URL)

Was this article helpful?

0 0

Post a comment