Context Navigation

source: products/quintagroup.distrpoxy/trunk/quintagroup/distproxy/spider.py @ 1603

Last change on this file since 1603 was 1603, checked in by chervol, 15 years ago
switched from using spider, added BasicHTTP authentication
File size: 41.3 KB

Rev	Line
[1065]	1	#! /usr/bin/env python
	2
	3	## Copyright (c) 1999 - 2003 L. C. Rees. All rights reserved.
	4	## See COPYRIGHT file for license terms.
	5
	6	from __future__ import generators
	7
	8
	9	class Spider:
	10
	11	'''HTTP and FTP crawling, reporting, and checking'''
	12
	13	import os as _os
	14	import urllib as _ulib
	15	import urlparse as _uparse
	16	from os import path as _path
	17	from ftplib import FTP as _ftp
	18	from time import strftime as _formtime
	19	from time import localtime as _localtime
	20	from ftplib import error_perm as _ftperr
	21	from sgmllib import SGMLParseError as _sperror
	22	from robotparser import RobotFileParser as _rparser
	23	# Use threads if available
	24	try: from threading import Thread as _thread
	25	except ImportError: pass
    # Class-wide defaults: bad-directory and bad-file signatures, the cached
    # FTP session and the sgmlop parser class.  All start unset; methods of
    # this class assign the real values on the instance when they are needed.
    _bdsig, _bfsig, _session, _newparser = None, None, None, None
    # HTML tags with URLs (dict keys are used for membership tests)
    _urltags = {'a':1, 'img':1, 'link':1, 'script':1, 'iframe':1, 'object':1,
    'embed':1, 'area':1, 'frame':1, 'applet':1, 'input':1, 'base':1,
    'div':1, 'layer':1, 'ilayer':1, 'bgsound':1}
    # Supported protocols (URL schemes, in upper and lower case)
    _supported = {'HTTP':1, 'http':1, 'HTTPS':1, 'https':1, 'FTP':1, 'ftp':1}
    # HTML attributes with URLs
    _urlattrs = {'href':1, 'src':1, 'data':1}
	35
	36	def __init__(self, base=None, width=None, depth=None):
	37	'''Initializes a Spider instance and its base attributes
	38
	39	Arguments:
	40	base -- URL to crawl (default: None)
	41	width -- maximum resources to crawl (default: None)
	42	depth -- how deep in a hierarchy to crawl (default: None)'''
	43	if base: self.base = base
	44	else: self.base = None
	45	if width: self.width = width
	46	else: self.width = None
	47	if depth: self.depth = depth
	48	else: self.depth = None
	49
	50	def _ftpopen(self, base, name='anonymous', password=None, attempts=3):
	51	'''Returns FTP client session
	52
	53	Arguments:
	54	base -- FTP server URL
	55	name -- login name (default: 'anonymous')
	56	password -- login password (default: None)
	57	attempts -- number of login attempts to try (default: 3)'''
	58
	59	def ftpprompt(tries=0):
	60	'''Prompts for FTP username and password
	61
	62	Arguments:
	63	tries -- number of login attempts'''
	64	tries += tries
	65	try:
	66	self._name = raw_input('Enter login name: ')
	67	self._password = raw_input('Enter password: ')
	68	session = ftp(base, self._name, self._password)
	69	return session
	70	# If login attempt fails, retry login
	71	except ftperr:
	72	if attempts >= tries:
	73	session = ftpprompt(tries)
	74	return session
	75	# Too many login attempts? End program
	76	elif attempts <= tries:
	77	raise IOError, 'Permission denied.'
	78	import sys
	79	sys.exit(0)
	80
	81	# Assignments
	82	self._name, self._password, ftperr = name, password, self._ftperr
	83	su, ftp = self._uparse.urlsplit(base), self._ftp
	84	# Set URL, path, and strip 'ftp://' off
	85	base, path = su[1], '/'.join([su[2], ''])
	86	try: session = ftp(base, name, password)
	87	# Prompt for username, password if initial arguments are incorrect
	88	except ftperr: session = ftpprompt()
	89	# Change to remote path if it exits
	90	if path: session.cwd(path)
	91	return session
	92
	93	def ftpmirror(self, l, t=None, b=None, w=200, d=6, n='anonymous', p=None):
	94	'''Mirrors an FTP site on a local filesystem
	95
	96	Arguments:
	97	l -- local filesystem path (default: None)
	98	b -- FTP server URL (default: None)
	99	t -- number of download threads (default: None)
	100	w -- maximum amount of resources to crawl (default: 200)
	101	d -- depth in hierarchy to crawl (default: 6)
	102	n -- login username (default: 'anonymous')
	103	p -- login password (default: None)'''
	104	if b: self.ftpspider(b, w, d, n, p)
	105	return self._mirror((self.paths, self.urls), l, t)
	106
	107	def ftppaths(self, b=None, w=200, d=6, n='anonymous', p=None):
	108	'''Returns a list of FTP paths.
	109
	110	Arguments:
	111	b -- FTP server URL (default: None)
	112	w -- maximum amount of resources to crawl (default: 200)
	113	d -- depth in hierarchy to crawl (default: 6)
	114	n -- login username (default: 'anonymous')
	115	p -- login password (default: None)'''
	116
	117	def sortftp(rdir):
	118	'''Returns a list of entries marked as files or directories
	119
	120	Arguments:
	121	rdir -- remote directory list'''
	122	rlist = []
	123	rappend = rlist.append
	124	for rl in rdir:
	125	# Split remote file based on whitespace
	126	ri = rl.split()[-1]
	127	# Add tuple of remote item type, permissions & name to rlist
	128	if ri not in ('.', '..'): rappend((rl[0], rl[7], ri))
	129	return rlist
	130
	131	def visitftp():
	132	'''Extracts contents of an FTP directory'''
	133	wd = pwd()
	134	if wd[-1] != '/': wd = '/'.join([wd, ''])
	135	# Add present working directory to visited directories
	136	dirs[wd], rlist = None, []
	137	# Get list of current directory's contents
	138	retr('LIST -a', rlist.append)
	139	for url in sortftp(rlist):
	140	# Test if remote item is a file (indicated by '-')
	141	if url[0] == '-':
	142	# Resolve path of file
	143	purl = ''.join([wd, url[2]])
	144	# Ensure file list don't exceed max number of resources
	145	if len(files) >= width: return None
	146	# Add files to file dictionary
	147	elif purl not in files: files[purl] = None
	148	# Test if it's a directory ('d') and allows scanning ('-')
	149	elif url[0] == 'd':
	150	if url[1] != '-':
	151	# Resolve path of directory
	152	purl = ''.join([wd, url[2], '/'])
	153	# Ensure no recursion beyond depth allowed
	154	if len(purl.split('/')) >= depth: dirs[purl] = None
	155	# Visit directory if it hasn't been visited yet
	156	elif purl not in dirs:
	157	# Change to new directory
	158	cwd(purl)
	159	# Run 'visitftp' on new directory
	160	visitftp()
	161
	162	# Use classwide attributes if set
	163	if b: self.base = b
	164	else: b = self.base
	165	# Use classwide width if different from method default
	166	if self.width and w == 200: width = self.width
	167	else: width = w
	168	# Use classwide depth if different from method default
	169	if self.depth and d == 6: depth = self.depth + 1
	170	else: depth = d + 1
	171	# File and directory dicts
	172	files, dirs = {}, {}
	173	# Use existing FTP client session if present
	174	if self._session: ftp = self._session
	175	# Create new FTP client session if necessary
	176	else:
	177	ftp = self._ftpopen(b, n, p)
	178	self._session = ftp
	179	# Avoid outside namespace lookups
	180	cwd, pwd, retr = ftp.cwd, ftp.pwd, ftp.retrlines
	181	# Walk FTP site
	182	visitftp()
	183	# Make path list out of files' keys and return it
	184	self.paths = files.keys()
	185	self.paths.sort()
	186	return self.paths
	187
	188	def ftpspider(self, b=None, w=200, d=6, n='anonymous', p=None):
	189	'''Returns lists of URLs and paths plus a live FTP client session
	190
	191	Arguments:
	192	b -- FTP server URL (default: None)
	193	w -- maximum amount of resources to crawl (default: 200)
	194	d -- depth in hierarchy to crawl (default: 6)
	195	n -- login username (default: 'anonymous')
	196	p -- login password (default: None)'''
	197	if b: ftppaths(b, w, d, n, p)
	198	return self.paths, ftpurls(), self._session
	199
	200	def ftpurls(self, b=None, w=200, d=6, n='anonymous', p=None):
	201	'''Returns a list of FTP URLs
	202
	203	Arguments:
	204	b -- FTP server URL (default: None)
	205	w -- maximum amount of resources to crawl (default: 200)
	206	d -- depth in hierarchy to crawl (default: 6)
	207	n -- login username (default: 'anonymous')
	208	p -- login password (default: None)'''
	209	if b:
	210	ftppaths(b, w, d, n, p)
	211	# Get rid of trailing '/' in base if present before joining
	212	if b[-1] == '/': base = b[:-1]
	213	else:
	214	base = self.base
	215	# Get rid of trailing '/' in base if present before joining
	216	if base[-1] == '/': base = self.base[:-1]
	217	paths = self.paths
	218	# Add FTP URL
	219	self.urls = [''.join([base, i]) for i in paths]
	220	return self.urls
	221
	222	def _parserpick(self, old=None):
	223	'''Returns a class using the sgmllib parser or the sgmlop parser
	224
	225	Arguments:
	226	old -- use classic sgmllib SGMLParser'''
	227	# Assignments
	228	urltags, urlattrs = self._urltags, self._urlattrs
	229	# Lists for bad file and bad directory signatures
	230	self._bfsig, self._bdsig = [], []
	231	bfsig, bdsig = self._bfsig, self._bdsig
	232	# Use faster SGMLParser if available
	233	try:
	234	from sgmlop import SGMLParser as newparser
	235	self._newparser = newparser
	236	# If unavailable, use classic SGML parser
	237	except ImportError:
	238	from sgmllib import SGMLParser as oldparser
	239	old = 1
	240	# Classes using classic sgmllib SGML Parser
	241	if old:
	242	from sgmllib import SGMLParser as oldparser
	243	# Remove sgmlop parser if present
	244	self._newparser = None
	245	# UrlExtract class using classic parser
	246	class UrlExtract(oldparser):
	247	'''Extracts URLs from a SGMLish document'''
	248	def reset(self):
	249	'''Resets SGML parser and clears lists'''
	250	oldparser.reset(self)
	251	self.urls, self.text, self.badurl = [], [], None
	252	def handle_data(self, data):
	253	'''Handles non-markup data'''
	254	# Get first 5 lines of non-markup data
	255	if len(self.text) <= 5: self.text.append(data)
	256	# Compare signature of known bad URL to a new web page
	257	if self.text == bfsig: self.badurl = 1
	258	elif self.text == bdsig: self.badurl = 1
	259	def finish_starttag(self, tag, attrs):
	260	'''Extracts URL bearing tags'''
	261	if tag in urltags:
	262	# Get key, vale in attributes if they match
	263	url = [v for k, v in attrs if k in urlattrs]
	264	if url: self.urls.extend(url)
	265	# BadUrl class using classic parser
	266	class BadUrl(oldparser):
	267	'''Collects results of intentionally incorrect URLs'''
	268	def reset(self):
	269	'''Resets SGML parser and clears lists'''
	270	oldparser.reset(self)
	271	self.text = []
	272	def handle_data(self, data):
	273	'''Collects lines to profile bad URLs'''
	274	# Adds first 5 lines of non-markup data to text
	275	if len(self.text) <= 5: self.text.append(data)
	276	# If no old flag, use SGMLParser from sgmlop and related classes
	277	else:
	278	# UrlExtract class using sgmlop parser
	279	class UrlExtract:
	280	'''Extracts URLs from a SGMLish document'''
	281	def __init__(self):
	282	'''Resets SGML parser and clears lists'''
	283	self.urls, self.text, self.badurl = [], [], None
	284	def handle_data(self, data):
	285	'''Handles non-markup data'''
	286	# Get first 5 lines of non-markup data
	287	if len(self.text) <= 5: self.text.append(data)
	288	# Compare signature of known bad URL to a new web page
	289	if self.text == bfsig: self.badurl = 1
	290	elif self.text == bdsig: self.badurl = 1
	291	def finish_starttag(self, tag, attrs):
	292	'''Extracts URL bearing tags'''
	293	if tag in urltags:
	294	# Get key, vale in attributes if they match
	295	url = [v for k, v in attrs if k in urlattrs]
	296	if url: self.urls.extend(url)
	297	# BadUrl class using sgmlop parser
	298	class BadUrl:
	299	'''Collects results of intentionally incorrect URLs'''
	300	def __init__(self):
	301	'''Resets SGML parser and clears lists'''
	302	self.text = []
	303	def handle_data(self, data):
	304	'''Collects lines to profile not found responses'''
	305	# Adds first 5 lines of non-markup data to list 'text'
	306	if len(self.text) <= 5: self.text.append(data)
	307	# Make resulting classes available class wide
	308	self._UrlExtract, self._BadUrl = UrlExtract, BadUrl
	309
	310	def _webtest(self):
	311	'''Generates signatures for identifying bad URLs'''
	312
	313	def badurl(url):
	314	'''Returns first 5 lines of a bad URL
	315
	316	Arguments:
	317	url -- Bad URL to open and parse'''
	318	# Use different classes if faster SGML Parser is available
	319	if self._newparser:
	320	# sgmlop parser must have a handler passed to it
	321	parser, urlget = self._newparser(), BadUrl()
	322	# Pass handler (sgmlop cannot be subclassed)
	323	parser.register(urlget)
	324	parser.feed(urlopen(url).read())
	325	parser.close()
	326	# Use classic parser
	327	else:
	328	urlget = BadUrl()
	329	urlget.feed(urlopen(url).read())
	330	urlget.close()
	331	# Return singature of bad URL
	332	return urlget.text
	333
	334	# Make globals local
	335	base, urljoin = self.base, self._uparse.urljoin
	336	urlopen, BadUrl = self._ulib.urlopen, self._BadUrl
	337	# Generate random string of jibber
	338	from string import letters, digits
	339	from random import choice, randint
	340	jibber = ''.join([letters, digits])
	341	ru = ''.join([choice(jibber) for x in range(randint(1, 30))])
	342	# Builds signature of a bad URL for a file
	343	self._bfsig.extend(badurl(urljoin(base, '%s.html' % ru)))
	344	# Builds signature of a bad URL for a directory
	345	self._bdsig.extend(badurl(urljoin(base,'%s/' % ru)))
	346
	347	def _webparser(self, html):
	348	'''Parses HTML and returns bad URL indicator and extracted URLs
	349
	350	Arguments:
	351	html -- HTML data'''
	352	# Use different classes if faster SGML Parser is available
	353	if self._newparser:
	354	# Make instances of SGML parser and URL extracting handler
	355	parser, urlget = self._newparser(), self._UrlExtract()
	356	# Pass handler to parser
	357	parser.register(urlget)
	358	# Feed data to parser
	359	parser.feed(html)
	360	parser.close()
	361	# Return bad URL indicator and extracted URLs
	362	else:
	363	urlget = self._UrlExtract()
	364	urlget.feed(html)
	365	urlget.close()
	366	# Return badurl marker and list of child URLS
	367	return urlget.badurl, urlget.urls
	368
	369	def _webopen(self, base):
	370	'''Verifies URL and returns actual URL and extracted child URLs
	371
	372	Arguments:
	373	base -- tuple containing a URL and its referring URL'''
	374	# Assignments
	375	good, cbase = self._good, base[0]
	376	try:
	377	# If webspiders can access URL, open it
	378	if self._robot.can_fetch('*', cbase):
	379	url = self._ulib.urlopen(cbase)
	380	# Otherwise, mark as visited and abort
	381	else:
	382	self._visited[cbase] = 1
	383	return False
	384	# If HTTP error, log bad URL and abort
	385	except IOError:
	386	self._visited[cbase] = 1
	387	self.badurls.append((base[1], cbase))
	388	return False
	389	# Get real URL
	390	newbase = url.geturl()
	391	# Change URL if different from old URL
	392	if newbase != cbase: cbase, base = newbase, (newbase, base[1])
	393	# URLs with mimetype 'text/html" scanned for URLs
	394	if url.headers.type == 'text/html':
	395	# Feed parser
	396	contents = url.read()
	397	try: badurl, urls = self._webparser(contents)
	398	# Log URL if SGML parser can't parse it
	399	except self._sperror:
	400	self._visited[cbase], self.badhtm[cbase] = 1, 1
	401	return False
	402	url.close()
	403	# Return URL and extracted urls if it's good
	404	if not badurl: return cbase, urls
	405	# If the URL is bad (after BadUrl), stop processing and log URL
	406	else:
	407	self._visited[cbase] = 1
	408	self.badurls.append((base[1], cbase))
	409	return False
	410	# Return URL of non-HTML resources and empty list
	411	else:
	412	url.close()
	413	return cbase, []
	414
	415	def _genverify(self, urls, base):
	416	'''Verifies a list of full URL relative to a base URL
	417
	418	Arguments:
	419	urls -- list of raw URLs
	420	base -- referring URL'''
	421	# Assignments
	422	cache, visit, urlverify = self._cache, self._visited, self._urlverify
	423	# Strip file off base URL for joining
	424	newbase = base.replace(base.split('/')[-1], '')
	425	for url in urls:
	426	# Get resolved url and raw child URLs
	427	url, rawurls = urlverify(url, base, newbase)
	428	# Handle any child URLs
	429	if rawurls:
	430	newurls = {}
	431	# Eliminate duplicate URLs
	432	for rawurl in rawurls:
	433	# Eliminate known visited URLs
	434	if rawurl not in visit: newurls[rawurl] = 1
	435	# Put new URLs in cache if present
	436	if newurls: cache[url] = newurls
	437	# Yield new URL
	438	if url: yield url
	439
	440	def _multiverify(self, url, base):
	441	'''Verifies a full URL relative to a base URL
	442
	443	Arguments:
	444	url -- a raw URLs
	445	base -- referring URL'''
	446	# Assignments
	447	cache, visited = self._cache, self._visited
	448	# Strip file off base URL for joining
	449	newbase = base.replace(base.split('/')[-1], '')
	450	# Get resolved url and raw child URLs
	451	url, rawurls = self._urlverify(url, base, newbase)
	452	# Handle any child URLs
	453	if rawurls:
	454	# Eliminate known visited URLs and duplicates
	455	for rawurl in rawurls:
	456	# Put new URLs in cache if present
	457	if rawurl not in visited: cache[rawurl] = url
	458	# Put URL in list of good URLs
	459	if url: self._good[url] = 1
	460
    def _urlverify(self, url, base, newbase):
        '''Returns a full URL relative to a base URL

        Normalizes the raw URL, filters out URLs that were already visited,
        use an unsupported scheme or point outside the crawled site, then
        opens the URL.  Returns the tuple (verified URL, list of raw child
        URLs) on success and (0, 0) for every rejected URL.

        Arguments:
        url -- a raw URL
        base -- referring URL
        newbase -- temporary version of referring URL for joining'''
        # Assignments
        visited, webopen, other = self._visited, self._webopen, self.other
        # sb is the third '/'-separated piece of the base URL, i.e. the
        # host part of a 'scheme://host/...' URL
        sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
        outside, redirs, supported = self.outside, self.redirs, self._supported
        if url not in visited:
            # Remove whitespace from URL; the tuple assignment marks the
            # original spelling as visited before 'url' is rebound
            if url.find(' ') != -1:
                visited[url], url = 1, url.replace(' ', '')
                if url in visited: return 0, 0
            # Remove fragments i.e. 'http:foo/bar#frag'
            if url.find('#') != -1:
                visited[url], url = 1, urldefrag(url)[0]
                if url in visited: return 0, 0
            # Process full URLs i.e. 'http://foo/bar
            if url.find(':') != -1:
                urlseg = urlsplit(url)
                # Block non-FTP, HTTP URLs
                if urlseg[0] not in supported:
                    # Log as non-FTP/HTTP URL
                    other[url], visited[url] = 1, 1
                    return 0, 0
                # If URL is not in root domain, block it (sb is a string,
                # so this is a substring test against the base host)
                if urlseg[1] not in sb:
                    visited[url], outside[url] = 1, 1
                    return 0, 0
                # Block duplicate root URLs
                elif not urlseg[2] and urlseg[1] == sb:
                    visited[url] = 1
                    return 0, 0
            # Handle relative URLs i.e. ../foo/bar
            elif url.find(':') == -1:
                # Join root domain and relative URL
                visited[url], url = 1, urljoin(newbase, url)
                if url in visited: return 0, 0
            # Test URL by attempting to open it
            rurl = webopen((url, base))
            if rurl and rurl[0] not in visited:
                # Get actual URL and its raw child URLs
                turl, rawurls = rurl
                visited[url], visited[turl] = 1, 1
                # If URL resolved to a different URL, process it
                if turl != url:
                    urlseg = urlsplit(turl)
                    # If URL is not in root domain, block it
                    if urlseg[1] not in sb:
                        # Log as a redirect to an external URL
                        redirs[(url, turl)] = 1
                        return 0, 0
                    # Block duplicate root URLs
                    elif not urlseg[2] and urlseg[1] == sb: return 0, 0
                # If URL exceeds depth, don't process
                if len(turl.split('/')) >= depth: return 0, 0
                # Otherwise return URL
                else:
                    if rawurls: return turl, rawurls
                    else: return turl, []
            # URL could not be opened or resolved to a visited URL
            else: return 0,0
        # URL was already visited
        else: return 0, 0
	527
	528	def _onewalk(self):
	529	'''Yields good URLs from under a base URL'''
	530	# Assignments
	531	cache, genverify = self._cache, self._genverify
	532	# End processing if cache is empty
	533	while cache:
	534	# Fetch item from cache
	535	base, urls = cache.popitem()
	536	# If item has child URLs, process them and yield good URLs
	537	if urls:
	538	for url in genverify(urls, base): yield url
	539
	540	def _multiwalk(self, threads):
	541	'''Extracts good URLs from under a base URL
	542
	543	Arguments:
	544	threads -- number of threads to run'''
	545
	546	def urlthread(url, base):
	547	'''Spawns a thread containing a multiverify function
	548
	549	Arguments:
	550
	551	url -- URL to verify
	552	base -- referring URL'''
	553	# Create instance of Thread
	554	dthread = Thread(target=multiverify, args=(url, base))
	555	# Put in pool
	556	pool.append(dthread)
	557
	558	# Assignments
	559	pool, cache, multiverify = [], self._cache, self._multiverify
	560	Thread, width, good = self._thread, self.width, self._good
	561	# End processing if cache is empty
	562	while cache:
	563	# Process URLs as long as width not exceeded
	564	if len(good) <= width:
	565	# Fetch item from cache
	566	url, base = cache.popitem()
	567	# Make thread
	568	if url: urlthread(url, base)
	569	# Run threads once pool size is reached
	570	if len(pool) == threads or threads >= len(cache):
	571	# Start threads
	572	for thread in pool: thread.start()
	573	# Empty thread pool as threads complete
	574	while pool:
	575	for thread in pool:
	576	if not thread.isAlive(): pool.remove(thread)
	577	# End if width reached
	578	elif len(good) >= width: break
	579
	580	def weburls(self, base=None, width=200, depth=5, thread=None):
	581	'''Returns a list of web paths.
	582
	583	Arguments:
	584	base -- base web URL (default: None)
	585	width -- amount of resources to crawl (default: 200)
	586	depth -- depth in hierarchy to crawl (default: 5)
	587	thread -- number of threads to run (default: None)'''
	588	# Assignments
	589	self._visited, self._good, self._cache, self.badurls = {}, {}, {}, []
	590	self.redirs, self.outside, self.badhtm, self.other = {}, {}, {}, {}
	591	onewalk, good, self._robot = self._onewalk, self._good, self._rparser()
	592	uparse, robot, multiwalk = self._uparse, self._robot, self._multiwalk
	593	cache = self._cache
	594	# Assign width
	595	if self.width and width == 200: width = self.width
	596	else: self.width = width
	597	# sgmlop crashes Python after too many iterations
	598	if width > 5000: self._parserpick(1)
	599	else: self._parserpick()
	600	# Use global base if present
	601	if not base: base = self.base
	602	# Verify URL and get child URLs
	603	newbase, rawurls = self._webopen((base, ''))
	604	if newbase:
	605	# Change base URL if different
	606	if newbase != base: base = newbase
	607	# Ensure there's a trailing '/' in base URL
	608	if base[-1] != '/':
	609	url = list(uparse.urlsplit(base))
	610	url[1] = ''.join([url[1], '/'])
	611	base = uparse.urlunsplit(url)
	612	# Eliminate duplicates and put raw URLs in cache
	613	newurls = {}
	614	for rawurl in rawurls: newurls[rawurl] = 1
	615	if newurls:
	616	# Cache URLs individually if threads are desired
	617	if thread:
	618	for newurl in newurls: cache[newurl] = base
	619	# Cache in group if no threads
	620	else: cache[base] = newurls
	621	# Make base URL, get split, and put in verified URL list
	622	self.base, self._sb = base, base.split('/')
	623	self._visited[base], good[base] = 1, 1
	624	# If URL is bad, abort and raise error
	625	else: raise IOError, "URL is invalid"
	626	# Adjust dept to length of base URL
	627	if self.depth and depth == 6: self.depth += len(self._sb)
	628	else: self.depth = depth + len(self._sb)
	629	# Get robot limits
	630	robot.set_url(''.join([base, 'robots.txt']))
	631	robot.read()
	632	# Get signature of bad URL
	633	self._webtest()
	634	# Get good URLs as long as total width isn't exceeded
	635	try:
	636	# Multiwalk if threaded
	637	if thread: self._multiwalk(thread)
	638	# Otherwise, use single thread
	639	else:
	640	for item in onewalk():
	641	# Don't exceed maximum width
	642	if len(good) <= width: good[item] = 1
	643	elif len(good) >= width: break
	644	# If user interrupts crawl, return what's done
	645	except KeyboardInterrupt: pass
	646	# Get URLs, sort them, and return list
	647	self.urls = good.keys()
	648	self.urls.sort()
	649	return self.urls
	650
	651	def webpaths(self, b=None, w=200, d=5, t=None):
	652	'''Returns a list of web paths.
	653
	654	Arguments:
	655	b -- base web URL (default: None)
	656	w -- amount of resources to crawl (default: 200)
	657	d -- depth in hierarchy to crawl (default: 5)
	658	t -- number of threads (default: None)'''
	659
	660	def pathize():
	661	'''Strips base URL from full URLs to produce paths'''
	662	for url in urls:
	663	# Remove base URL from path list
	664	url = url.replace(self.base, '')
	665	# Add default name 'index.html' to root URLs and directories
	666	if not url: url = 'index.html'
	667	elif url[-1] == '/': url = ''.join([url, 'index.html'])
	668	# Verify removal of base URL and remove it if found
	669	if url.find(':') != -1: url = urlsplit(url)[2:][0]
	670	yield url
	671
	672	# Assignments
	673	urlsplit = self._uparse.urlsplit
	674	# Run weburls if base passed as an argument
	675	if b: self.weburls(b, w, d, t)
	676	# Strip off trailing resource or query from base URL
	677	if self.base[-1] != '/': self.base = '/'.join(self._sb[:-1])
	678	urls = self.urls
	679	# Return path list after stripping base URL
	680	self.paths = list(pathize())
	681	return self.paths
	682
	683	def webmirror(self, root=None, t=None, base=None, width=200, depth=5):
	684	'''Mirrors a website on a local filesystem
	685
	686	Arguments:
	687	root -- local filesystem path (default: None)
	688	t -- number of threads (default: None)
	689	base -- base web URL (default: None)
	690	width -- amount of resources to crawl (default: 200)
	691	depth -- depth in hierarchy to crawl (default: 5)'''
	692	if base: self.webspider(base, width, depth, t)
	693	return self._mirror((self.paths, self.urls), root, t)
	694
	695	def webspider(self, b=None, w=200, d=5, t=None):
	696	'''Returns two lists of child URLs and paths
	697
	698	Arguments:
	699	b -- base web URL (default: None)
	700	w -- amount of resources to crawl (default: 200)
	701	d -- depth in hierarchy to crawl (default: 5)
	702	t -- number of threads (default: None)'''
	703	if b: self.weburls(b, w, d, t)
	704	return self.webpaths(), self.urls
	705
	706	def badurlreport(self, f=None, b=None, w=200, d=5, t=None):
	707	'''Pretties up a list of bad URLs
	708
	709	Arguments:
	710	f -- output file for report (default: None)
	711	b -- base web URL (default: None)
	712	w -- amount of resources to crawl (default: 200)
	713	d -- depth in hierarchy to crawl (default: 5)
	714	t -- number of threads (default: None)'''
	715	if b: self.weburls(b, w, d, t)
	716	# Format report if information is available
	717	if self.badurls:
	718	# Number of bad URLs
	719	amount = str(len(self.badurls))
	720	header = '%s broken URLs under %s on %s:\n'
	721	# Print referring URL pointing to bad URL
	722	body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.badurls])
	723	report = self._formatreport(amount, header, body, f)
	724	# Return if just getting string
	725	if report: return report
	726
	727	def badhtmreport(self, f=None, b=None, w=200, d=5, t=None):
	728	'''Pretties up a list of unparsed HTML URLs
	729
	730	Arguments:
	731	f -- output file for report (default: None)
	732	b -- base web URL (default: None)
	733	w -- amount of resources to crawl (default: 200)
	734	d -- depth in hierarchy to crawl (default: 5)
	735	t -- number of threads (default: None)'''
	736	if b: self.weburls(b, w, d, t)
	737	# Format report if information is available
	738	if self.badhtm:
	739	amount = str(len(self.badhtm))
	740	header = '%s unparsable HTML URLs under %s on %s:\n'
	741	body = '\n'.join(self.badhtm)
	742	report = self._formatreport(amount, header, body, f)
	743	# Return if just getting string
	744	if report: return report
	745
	746	def redireport(self, f=None, b=None, w=200, d=5, t=None):
	747	'''Pretties up a list of URLs redirected to an external URL
	748
	749	Arguments:
	750	f -- output file for report (default: None)
	751	b -- base web URL (default: None)
	752	w -- amount of resources to crawl (default: 200)
	753	d -- depth in hierarchy to crawl (default: 5)
	754	t -- number of threads (default: None)'''
	755	if b: self.weburls(b, w, d, t)
	756	# Format report if information is available
	757	if self.redirs:
	758	amount = str(len(self.redirs))
	759	header = '%s redirects to external URLs under %s on %s:\n'
	760	# Print referring URL pointing to new URL
	761	body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.redirs])
	762	report = self._formatreport(amount, header, body, f)
	763	# Return if just getting string
	764	if report: return report
	765
	766	def outreport(self, f=None, b=None, w=200, d=5, t=None):
	767	'''Pretties up a list of outside URLs referenced under the base URL
	768
	769	Arguments:
	770	f -- output file for report (default: None)
	771	b -- base web URL (default: None)
	772	w -- amount of resources to crawl (default: 200)
	773	d -- depth in hierarchy to crawl (default: 5)
	774	t -- number of threads (default: None)'''
	775	if b: self.weburls(b, w, d, t)
	776	# Format report if information is available
	777	if self.outside:
	778	amount = str(len(self.outside))
	779	header = '%s links to external URLs under %s on %s:\n'
	780	body = '\n'.join(self.outside)
	781	report = self._formatreport(amount, header, body, f)
	782	# Return if just getting string
	783	if report: return report
	784
	785	def othereport(self, f=None, b=None, w=200, d=5, t=None):
	786	'''Pretties up a list of non-HTTP/FTP URLs
	787
	788	Arguments:
	789	f -- output file for report (default: None)
	790	b -- base web URL (default: None)
	791	w -- amount of resources to crawl (default: 200)
	792	d -- depth in hierarchy to crawl (default: 5)
	793	t -- number of threads (default: None)'''
	794	if b: self.weburls(b, w, d, t)
	795	# Format report if information is available
	796	if self.other:
	797	amount = str(len(self.other))
	798	header = '%s non-FTP/non-HTTP URLs under %s on %s:\n'
	799	body = '\n'.join(self.other)
	800	report = self._formatreport(amount, header, body, f)
	801	# Return if just getting string
	802	if report: return report
	803
	804	def urlreport(self, f=None, b=None, w=200, d=5, t=None):
	805	'''Pretties up a list of all URLs under a URL
	806
	807	Arguments:
	808	f -- output file for report (default: None)
	809	b -- base web URL (default: None)
	810	w -- amount of resources to crawl (default: 200)
	811	d -- depth in hierarchy to crawl (default: 5)
	812	t -- number of threads (default: None)'''
	813	if b: self.weburls(b, w, d, t)
	814	# Format report if information is available
	815	if self.urls:
	816	amount = str(len(self.urls))
	817	header = '%s verified URLs under %s on %s:\n'
	818	body = '\n'.join(self.urls)
	819	report = self._formatreport(amount, header, body, f)
	820	# Return if just getting string
	821	if report: return report
	822
	823	def webreport(self, f=None, b=None, w=200, d=5, t=None, *vargs):
	824	'''Pretties up a list of logged information under a URL
	825
	826	Arguments:
	827	f -- output file for report (default: None)
	828	b -- base web URL (default: None)
	829	w -- amount of resources to crawl (default: 200)
	830	d -- depth in hierarchy to crawl (default: 5)
	831	t -- number of threads (default: None)
	832	vargs -- report sections to include or exclude
	833	To override defaults:
	834	To include a section add 'badhtm', 'redirs', 'outside', or 'other'
	835	To exclude a section add 'badurls' or "urls"'''
	836	if b: self.weburls(b, w, d, t)
	837	# Defaults for report
	838	badurls, badhtm, redirs, urls, outside, other = 1, 0, 0, 1, 0, 0
	839	# Create compilation list
	840	compile = []
	841	# Override default report settings if argument is passed to vargs
	842	for arg in vargs:
	843	if arg == 'badurls': badurls = 0
	844	elif arg == 'badhtm': badhtm = 1
	845	elif arg == 'redirs': redirs = 1
	846	elif arg == 'urls': urls = 0
	847	elif arg == 'outside': outside = 1
	848	elif arg == 'other': other = 1
	849	# Compile report
	850	if badurls:
	851	badurls = self.badurlreport()
	852	if badurls: compile.append(badurls)
	853	if urls:
	854	urls = self.urlreport()
	855	if urls: compile.append(urls)
	856	if outside:
	857	outside = self.outreport()
	858	if outside: compile.append(outside)
	859	if redirs:
	860	redirs = self.redireport()
	861	if redirs: compile.append(redirs)
	862	if badhtm:
	863	badhtm = self.badhtmreport()
	864	if badhtm: compile.append(badhtm)
	865	if other:
	866	other = self.othereport()
	867	if other: compile.append(other)
	868	# Make report
	869	report = '\n\n'.join(compile)
	870	# Write to file if argument present
	871	if file: open(f, 'w').write(report)
	872	# Or return string
	873	else: return report
	874
	875	def _formatreport(self, amount, header, body, file=None):
	876	'''Generic prettifier with date/time stamper
	877
	878	Arguments:
	879	header -- title of report
	880	body -- body of report
	881	file -- output file for report (default: None)'''
	882	# Get current time
	883	localtime, strftime = self._localtime, self._formtime
	884	curtime = strftime('%A, %B %d, %Y at %I:%M %p', localtime())
	885	# Make section header
	886	header = header % (amount, self.base, curtime)
	887	# Add header to body
	888	report = '\n'.join([header, body])
	889	# Write to file if argument present
	890	if file: open(file, 'w').write(report)
	891	# Or return string
	892	else: return report
	893
	894	def _mirror(self, lists, root=None, threads=None):
	895	'''Mirrors a site on a local filesystem based on lists passed to it
	896
	897	Argument:
	898	lists -- lists of URLs and paths
	899	root -- local filesystem path (default: None)
	900	threads -- number of threads (default: None)'''
	901
	902	def download(url, np, op):
	903	'''Downloads files that need to be mirrored.'''
	904	# If ftp...
	905	if url[:3] == 'ftp':
	906	# Open local file
	907	local = open(np, 'wb')
	908	# Download using FTP session
	909	ftp = ftpopen(base, name, password)
	910	ftp.retrbinary('RETR %s' % op, local.write)
	911	ftp.close()
	912	# Close local file
	913	local.close()
	914	# Use normal urlretrieve if no FTP required
	915	else: ulib.urlretrieve(url, np)
	916
	917	def dlthread(url, np, op):
	918	'''Spawns a thread containing the download function'''
	919	# Create thread
	920	dthread = Thread(target=download, args=(url, np, op))
	921	# Add to thread pool
	922	pool.append(dthread)
	923
	924	# Extract path and URL lists
	925	paths, urls = lists
	926	# Avoid outside namespace lookups
	927	ulib, makedirs, sep = self._ulib, self._os.makedirs, self._os.sep
	928	normcase, split = self._path.normcase, self._path.split
	929	exists, isdir = self._path.exists, self._path.isdir
	930	ftpopen = self._ftpopen
	931	# Create local names for thread class and thread pool
	932	if threads: Thread, pool = self._thread, []
	933	# Localize name and password if exists
	934	try: base, name, password = self.base, self._name, self._password
	935	except AttributeError: pass
	936	# Change to directory if given...
	937	if root:
	938	if exists(root):
	939	if isdir(root): self._os.chdir(root)
	940	# Create root if it doesn't exist
	941	else:
	942	makedirs(root)
	943	self._os.chdir(root)
	944	# Otherwise use current directory
	945	else: root = self._os.getcwd()
	946	# Iterate over paths and download files
	947	for oldpath in paths:
	948	# Sync with the URL for oldpath
	949	url = urls[paths.index(oldpath)]
	950	# Create name of local copy
	951	newpath = normcase(oldpath).lstrip(sep)
	952	# Get directory name
	953	dirname = split(newpath)[0]
	954	# If the directory exists, download the file directly
	955	if exists(dirname):
	956	if isdir(dirname):
	957	if threads: dlthread(url, newpath, oldpath)
	958	else: download(url, newpath, oldpath)
	959	# Don't create local directory if path in root of remote URL
	960	elif not dirname:
	961	if threads: dlthread(url, newpath, oldpath)
	962	else: download(url, newpath, oldpath)
	963	# Make local directory if it doesn't exist, then dowload file
	964	else:
	965	makedirs(dirname)
	966	if threads: dlthread(url, newpath, oldpath)
	967	else: download(url, newpath, oldpath)
	968	# Run threads if they've hit the max number of threads allowed
	969	if threads:
	970	# Run if max threads or final thread reached
	971	if len(pool) == threads or paths[-1] == oldpath:
	972	# Start all threads
	973	for thread in pool: thread.start()
	974	# Clear the thread pool as they finish
	975	while pool:
	976	for thread in pool:
	977	if not thread.isAlive(): pool.remove(thread)
	978
	979
	980	# Instance of Spider enables exporting Spider's methods as standalone functions
	981	_inst = Spider()
	982	ftpurls = _inst.ftpurls
	983	weburls = _inst.weburls
	984	ftppaths = _inst.ftppaths
	985	webpaths = _inst.webpaths
	986	ftpmirror = _inst.ftpmirror
	987	ftpspider = _inst.ftpspider
	988	webmirror = _inst.webmirror
	989	webspider = _inst.webspider
	990	webreport = _inst.webreport
	991	urlreport = _inst.urlreport
	992	outreport = _inst.outreport
	993	redireport = _inst.redireport
	994	othereport = _inst.othereport
	995	badurlreport = _inst.badurlreport
	996	badhtmreport = _inst.badhtmreport

Note: See TracBrowser for help on using the repository browser.

Download in other formats: