#! /usr/bin/python
# an example screen scraper for yanking the content out of an html wiki page
# (with apologies to the Python experts out there for this complete hack-job)
#
# NOTE(review): this text appears to have been through an HTML-tag stripper:
# the statement layout was collapsed and string literals that contained markup
# are missing pieces.  The layout below is reconstructed from the syntax, and
# every literal that looks damaged is carried over as found and flagged
# instead of being guessed at.

import urllib2
import re
import sys

# regex for getting the content of a wiki page
# NOTE(review): damaged literal.  Only these three fragments are visible; each
# join is a place where text is missing.  Put together they contain one "("
# and two ")", so re.compile() rejects them.  Restore the pattern from the
# original script before running.
pat = re.compile(
    r']*>(.*)'
    r'\s*'
    r']*>)'
)

# NOTE(review): apat, a2pat and cpat are used below but no definition of them
# is visible in this text -- presumably lost to the same damage; confirm
# against the original script.

# regex for removing id data from html tags
# A triple-quoted raw string lets the character classes hold both quote
# characters.  Writing '' inside a '...' literal does not escape a quote: it
# ends the literal and starts an adjacent one, which silently dropped the
# single quote from the classes so single-quoted ids were never matched.
rpat = re.compile(r'''\s?id=["'][^"']*["']''', re.IGNORECASE)

# regex for getting the src of an img tag
# NOTE(review): damaged literal.  As found it has no capture group, so the
# group(1) lookup in processImage() cannot succeed.
ipat = re.compile(r'^', re.IGNORECASE)

# images we've already retrieved (original src -> replacement tag text)
images = {}

# pages we've already scraped
pages = {}


# read the content of a url
def readurl(url):
    """Return the contents of a url as a string.

    url -- the address to fetch; passed straight to urllib2.urlopen().

    Errors raised while opening or reading the url are not caught here.
    """
    r = urllib2.urlopen(url)
    try:
        # read() with no size reads through to end-of-file, so the end of the
        # data no longer has to be inferred from the size of a chunk.
        return r.read()
    finally:
        # release the connection even if the read fails
        r.close()


def processAnchor(current, x, l, subpages):
    """Process an html anchor tag.

    Pages on the same server are scraped and added to the subpages list.
    Pages on another server are referenced: the anchor is replaced by its
    link text followed by the address in parentheses.

    current  -- the element being examined (presumably l[x]; confirm at the
                call site, which is not visible here)
    x        -- index of the element within l
    l        -- the list of page elements being walked
    subpages -- list that the results of scraping sub-pages are appended to

    Returns a (text, index, flag) tuple.  text is None when the anchor was
    dropped.  index is x advanced past whatever was consumed.  flag is True
    only when the anchor was dropped -- presumably the "force a loop
    'continue'" flag that scrape() keeps in cont.

    host and port are read as module-level names; no definition of them is
    visible in this text.
    """
    amat = apat.match(current)
    if not amat:
        # not an anchor: hand the element back untouched
        return (current, x, False)

    href = amat.group(1)
    if not href.startswith(('http://', 'https://')):
        # same-server link: scrape it unless that has been done already
        if href not in pages:
            subpages.append(scrape(host, port, href))
        # dump the anchor text and close tag
        # NOTE(review): nesting reconstructed.  The skip is applied to every
        # same-server link, including ones already scraped; the collapsed
        # text would also allow it to sit inside the "not in pages" test.
        return (None, x + 3, True)

    # link to another server: gather everything up to the closing tag
    parts = []
    x = x + 1
    # the bounds test stops an unterminated anchor from running off the list
    while x < len(l) and not a2pat.match(l[x]):
        (e, x, _) = processImage(l[x], x, l, subpages)
        parts.append(e)
        x = x + 1
    current = '%s (%s)' % (''.join(parts), href)
    return (current, x, False)


def processImage(current, x, l, subpages):
    """Process an html image tag.

    Downloads the image, saves it under a flattened file name and then
    replaces the tag with one generated for the saved file.  Elements that
    ipat does not match are returned unchanged.

    current  -- the element being examined
    x        -- index of the element; returned as received
    l        -- the list of page elements (not used here)
    subpages -- sub-page list (not used here)

    Returns (text, x, False).

    host and port are read as module-level names; no definition of them is
    visible in this text.
    """
    imat = ipat.match(current)
    # only image tags need any work
    if not imat:
        return (current, x, False)

    src = imat.group(1)
    if src in images:
        # already downloaded: reuse the tag generated the first time
        return (images[src], x, False)

    # read the image, rewrite the filename, then rewrite the image tag accordingly
    key = src
    if src.startswith(('http:', 'https:')):
        img = readurl(src)
        src = src.replace('http://', '').replace('https://', '')
    else:
        img = readurl('http://%s:%s%s' % (host, port, src))
    # NOTE(review): nesting reconstructed.  The slash rewrite is applied to
    # both kinds of src so that either one yields a flat file name.
    src = src.replace('/', '-')

    # binary mode: image data has to reach the disk byte-for-byte
    f = open(src, 'wb')
    try:
        f.write(img)
    finally:
        f.close()

    # NOTE(review): damaged literal.  The tag template that belongs between
    # the quotes is missing; with no placeholder in it this line raises
    # TypeError.  Restore the template from the original script.
    current = '' % src
    images[key] = current
    return (current, x, False)


def scrape(host, port, page):
    """ 'screenscrape' a page, handling sub-pages linked on the same server """
    # NOTE(review): this definition runs past the end of the visible text.
    # The statements below are only re-indented; none has been changed.
    s = readurl('http://%s:%s/%s' % (host, port, page))
    pages[page] = page
    subpages = []

    # match the content div in the wiki html
    mat = pat.search(s)
    if mat:
        content = mat.group(1)
        l = cpat.split(content)
        rtn = ''
        x = 0
        while x < len(l):
            # flag to force a loop 'continue'
            cont = False
            e = l[x]
            # are we processing an html tag?
            if e.startswith('<') and e.endswith('>'):
                # get rid of the id attribute
                e = rpat.sub('', e)
                # NOTE(review): the visible text stops here, part-way through
                # the statement "if e.startswith('".  The rest of this
                # function is not in view and is not reproduced.