#! /usr/bin/env python

## Copyright (c) 1999 - 2003 L. C. Rees. All rights reserved.
## See COPYRIGHT file for license terms.

'''Multithreaded crawling, reporting, and mirroring for Web and FTP.'''

from __future__ import generators

__name__ = 'spider'
__version__ = '0.5'
__author__ = 'L.C. Rees (xanimal@users.sf.net)'
__all__ = ['ftpurls', 'ftppaths', 'weburls', 'ftpmirror', 'ftpspider',
    'webpaths', 'webreport', 'webmirror', 'webspider', 'urlreport',
    'badurlreport', 'badhtmreport', 'redireport', 'outreport', 'othereport']


class Spider:

    '''HTTP and FTP crawling, reporting, and checking'''

    import os as _os
    import urllib as _ulib
    import urlparse as _uparse
    from os import path as _path
    from ftplib import FTP as _ftp
    from time import strftime as _formtime
    from time import localtime as _localtime
    from ftplib import error_perm as _ftperr
    from sgmllib import SGMLParseError as _sperror
    from robotparser import RobotFileParser as _rparser
    # Use threads if available
    try: from threading import Thread as _thread
    except ImportError: pass
    _bdsig, _bfsig, _session, _newparser = None, None, None, None
    # HTML tags with URLs
    _urltags = {'a':1, 'img':1, 'link':1, 'script':1, 'iframe':1, 'object':1,
        'embed':1, 'area':1, 'frame':1, 'applet':1, 'input':1, 'base':1,
        'div':1, 'layer':1, 'ilayer':1, 'bgsound':1}
    # Supported protocols
    _supported = {'HTTP':1, 'http':1, 'HTTPS':1, 'https':1, 'FTP':1, 'ftp':1}
    # HTML attributes with URLs
    _urlattrs = {'href':1, 'src':1, 'data':1}

    def __init__(self, base=None, width=None, depth=None):
        '''Initializes a Spider instance and its base attributes

        Arguments:
        base -- URL to crawl (default: None)
        width -- maximum resources to crawl (default: None)
        depth -- how deep in a hierarchy to crawl (default: None)'''
        if base: self.base = base
        else: self.base = None
        if width: self.width = width
        else: self.width = None
        if depth: self.depth = depth
        else: self.depth = None

    def _ftpopen(self, base, name='anonymous', password=None, attempts=3):
        '''Returns FTP client session

        Arguments:
        base -- FTP server URL
        name -- login name (default: 'anonymous')
        password -- login password (default: None)
        attempts -- number of login attempts to try (default: 3)'''

        def ftpprompt(tries=0):
            '''Prompts for FTP username and password

            Arguments:
            tries -- number of login attempts'''
            tries += 1
            try:
                self._name = raw_input('Enter login name: ')
                self._password = raw_input('Enter password: ')
                session = ftp(base, self._name, self._password)
                return session
            # If login attempt fails, retry until attempts are exhausted
            except ftperr:
                if tries < attempts:
                    session = ftpprompt(tries)
                    return session
                # Too many login attempts? End program
                else: raise IOError, 'Permission denied.'

        # Assignments
        self._name, self._password, ftperr = name, password, self._ftperr
        su, ftp = self._uparse.urlsplit(base), self._ftp
        # Set URL, path, and strip 'ftp://' off
        base, path = su[1], '/'.join([su[2], ''])
        try: session = ftp(base, name, password)
        # Prompt for username, password if initial arguments are incorrect
        except ftperr: session = ftpprompt()
        # Change to remote path if it exists
        if path: session.cwd(path)
        return session

    def ftpmirror(self, l, t=None, b=None, w=200, d=6, n='anonymous', p=None):
        '''Mirrors an FTP site on a local filesystem

        Arguments:
        l -- local filesystem path
        t -- number of download threads (default: None)
        b -- FTP server URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 6)
        n -- login username (default: 'anonymous')
        p -- login password (default: None)'''
        if b: self.ftpspider(b, w, d, n, p)
        return self._mirror((self.paths, self.urls), l, t)

    def ftppaths(self, b=None, w=200, d=6, n='anonymous', p=None):
        '''Returns a list of FTP paths.

        Arguments:
        b -- FTP server URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 6)
        n -- login username (default: 'anonymous')
        p -- login password (default: None)'''

        def sortftp(rdir):
            '''Returns a list of entries marked as files or directories

            Arguments:
            rdir -- remote directory list'''
            rlist = []
            rappend = rlist.append
            for rl in rdir:
                # Split remote file based on whitespace
                ri = rl.split()[-1]
                # Add tuple of remote item type, permissions & name to rlist
                if ri not in ('.', '..'): rappend((rl[0], rl[7], ri))
            return rlist

        def visitftp():
            '''Extracts contents of an FTP directory'''
            wd = pwd()
            if wd[-1] != '/': wd = '/'.join([wd, ''])
            # Add present working directory to visited directories
            dirs[wd], rlist = None, []
            # Get list of current directory's contents
            retr('LIST -a', rlist.append)
            for url in sortftp(rlist):
                # Test if remote item is a file (indicated by '-')
                if url[0] == '-':
                    # Resolve path of file
                    purl = ''.join([wd, url[2]])
                    # Ensure file list doesn't exceed max number of resources
                    if len(files) >= width: return None
                    # Add files to file dictionary
                    elif purl not in files: files[purl] = None
                # Test if it's a directory ('d') and allows scanning ('-')
                elif url[0] == 'd':
                    if url[1] != '-':
                        # Resolve path of directory
                        purl = ''.join([wd, url[2], '/'])
                        # Ensure no recursion beyond depth allowed
                        if len(purl.split('/')) >= depth: dirs[purl] = None
                        # Visit directory if it hasn't been visited yet
                        elif purl not in dirs:
                            # Change to new directory
                            cwd(purl)
                            # Run 'visitftp' on new directory
                            visitftp()

        # Use classwide attributes if set
        if b: self.base = b
        else: b = self.base
        # Use classwide width if different from method default
        if self.width and w == 200: width = self.width
        else: width = w
        # Use classwide depth if different from method default
        if self.depth and d == 6: depth = self.depth + 1
        else: depth = d + 1
        # File and directory dicts
        files, dirs = {}, {}
        # Use existing FTP client session if present
        if self._session: ftp = self._session
        # Create new FTP client session if necessary
        else:
            ftp = self._ftpopen(b, n, p)
            self._session = ftp
        # Avoid outside namespace lookups
        cwd, pwd, retr = ftp.cwd, ftp.pwd, ftp.retrlines
        # Walk FTP site
        visitftp()
        # Make path list out of files' keys and return it
        self.paths = files.keys()
        self.paths.sort()
        return self.paths

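    # Usage sketch (illustrative, not part of the original module): crawling an
    # anonymous FTP area with ftppaths. The host and limits below are
    # hypothetical placeholders.
    #
    #   from spider import Spider
    #   s = Spider()
    #   paths = s.ftppaths('ftp://ftp.example.org/pub/', w=50, d=3)
    #   for p in paths: print p
    #   s._session.quit()       # close the cached ftplib.FTP session when done
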
    def ftpspider(self, b=None, w=200, d=6, n='anonymous', p=None):
        '''Returns lists of URLs and paths plus a live FTP client session

        Arguments:
        b -- FTP server URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 6)
        n -- login username (default: 'anonymous')
        p -- login password (default: None)'''
        if b: self.ftppaths(b, w, d, n, p)
        return self.paths, self.ftpurls(), self._session

    def ftpurls(self, b=None, w=200, d=6, n='anonymous', p=None):
        '''Returns a list of FTP URLs

        Arguments:
        b -- FTP server URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 6)
        n -- login username (default: 'anonymous')
        p -- login password (default: None)'''
        if b:
            self.ftppaths(b, w, d, n, p)
            # Get rid of trailing '/' in base if present before joining
            if b[-1] == '/': base = b[:-1]
            else: base = b
        else:
            base = self.base
            # Get rid of trailing '/' in base if present before joining
            if base[-1] == '/': base = self.base[:-1]
        paths = self.paths
        # Add FTP URL
        self.urls = [''.join([base, i]) for i in paths]
        return self.urls

    def _parserpick(self, old=None):
        '''Returns a class using the sgmllib parser or the sgmlop parser

        Arguments:
        old -- use classic sgmllib SGMLParser'''
        # Assignments
        urltags, urlattrs = self._urltags, self._urlattrs
        # Lists for bad file and bad directory signatures
        self._bfsig, self._bdsig = [], []
        bfsig, bdsig = self._bfsig, self._bdsig
        # Use faster SGMLParser if available
        try:
            from sgmlop import SGMLParser as newparser
            self._newparser = newparser
        # If unavailable, use classic SGML parser
        except ImportError:
            from sgmllib import SGMLParser as oldparser
            old = 1
        # Classes using classic sgmllib SGML Parser
        if old:
            from sgmllib import SGMLParser as oldparser
            # Remove sgmlop parser if present
            self._newparser = None
            # UrlExtract class using classic parser
            class UrlExtract(oldparser):
                '''Extracts URLs from a SGMLish document'''
                def reset(self):
                    '''Resets SGML parser and clears lists'''
                    oldparser.reset(self)
                    self.urls, self.text, self.badurl = [], [], None
                def handle_data(self, data):
                    '''Handles non-markup data'''
                    # Get first 5 lines of non-markup data
                    if len(self.text) <= 5: self.text.append(data)
                    # Compare signature of known bad URL to a new web page
                    if self.text == bfsig: self.badurl = 1
                    elif self.text == bdsig: self.badurl = 1
                def finish_starttag(self, tag, attrs):
                    '''Extracts URL bearing tags'''
                    if tag in urltags:
                        # Get key, value in attributes if they match
                        url = [v for k, v in attrs if k in urlattrs]
                        if url: self.urls.extend(url)
            # BadUrl class using classic parser
            class BadUrl(oldparser):
                '''Collects results of intentionally incorrect URLs'''
                def reset(self):
                    '''Resets SGML parser and clears lists'''
                    oldparser.reset(self)
                    self.text = []
                def handle_data(self, data):
                    '''Collects lines to profile bad URLs'''
                    # Adds first 5 lines of non-markup data to text
                    if len(self.text) <= 5: self.text.append(data)
        # If no old flag, use SGMLParser from sgmlop and related classes
        else:
            # UrlExtract class using sgmlop parser
            class UrlExtract:
                '''Extracts URLs from a SGMLish document'''
                def __init__(self):
                    '''Resets SGML parser and clears lists'''
                    self.urls, self.text, self.badurl = [], [], None
                def handle_data(self, data):
                    '''Handles non-markup data'''
                    # Get first 5 lines of non-markup data
                    if len(self.text) <= 5: self.text.append(data)
                    # Compare signature of known bad URL to a new web page
                    if self.text == bfsig: self.badurl = 1
                    elif self.text == bdsig: self.badurl = 1
                def finish_starttag(self, tag, attrs):
                    '''Extracts URL bearing tags'''
                    if tag in urltags:
                        # Get key, value in attributes if they match
                        url = [v for k, v in attrs if k in urlattrs]
                        if url: self.urls.extend(url)
            # BadUrl class using sgmlop parser
            class BadUrl:
                '''Collects results of intentionally incorrect URLs'''
                def __init__(self):
                    '''Resets SGML parser and clears lists'''
                    self.text = []
                def handle_data(self, data):
                    '''Collects lines to profile not found responses'''
                    # Adds first 5 lines of non-markup data to list 'text'
                    if len(self.text) <= 5: self.text.append(data)
        # Make resulting classes available class wide
        self._UrlExtract, self._BadUrl = UrlExtract, BadUrl

    def _webtest(self):
        '''Generates signatures for identifying bad URLs'''

        def badurl(url):
            '''Returns first 5 lines of a bad URL

            Arguments:
            url -- Bad URL to open and parse'''
            # Use different classes if faster SGML Parser is available
            if self._newparser:
                # sgmlop parser must have a handler passed to it
                parser, urlget = self._newparser(), BadUrl()
                # Pass handler (sgmlop cannot be subclassed)
                parser.register(urlget)
                parser.feed(urlopen(url).read())
                parser.close()
            # Use classic parser
            else:
                urlget = BadUrl()
                urlget.feed(urlopen(url).read())
                urlget.close()
            # Return signature of bad URL
            return urlget.text

        # Make globals local
        base, urljoin = self.base, self._uparse.urljoin
        urlopen, BadUrl = self._ulib.urlopen, self._BadUrl
        # Generate random string of jibber
        from string import letters, digits
        from random import choice, randint
        jibber = ''.join([letters, digits])
        ru = ''.join([choice(jibber) for x in range(randint(1, 30))])
        # Builds signature of a bad URL for a file
        self._bfsig.extend(badurl(urljoin(base, '%s.html' % ru)))
        # Builds signature of a bad URL for a directory
        self._bdsig.extend(badurl(urljoin(base, '%s/' % ru)))

    def _webparser(self, html):
        '''Parses HTML and returns bad URL indicator and extracted URLs

        Arguments:
        html -- HTML data'''
        # Use different classes if faster SGML Parser is available
        if self._newparser:
            # Make instances of SGML parser and URL extracting handler
            parser, urlget = self._newparser(), self._UrlExtract()
            # Pass handler to parser
            parser.register(urlget)
            # Feed data to parser
            parser.feed(html)
            parser.close()
        # Use classic parser otherwise
        else:
            urlget = self._UrlExtract()
            urlget.feed(html)
            urlget.close()
        # Return bad URL indicator and list of child URLs
        return urlget.badurl, urlget.urls

    def _webopen(self, base):
        '''Verifies URL and returns actual URL and extracted child URLs

        Arguments:
        base -- tuple containing a URL and its referring URL'''
        # Assignments
        good, cbase = self._good, base[0]
        try:
            # If webspiders can access URL, open it
            if self._robot.can_fetch('*', cbase):
                url = self._ulib.urlopen(cbase)
            # Otherwise, mark as visited and abort
            else:
                self._visited[cbase] = 1
                return False
        # If HTTP error, log bad URL and abort
        except IOError:
            self._visited[cbase] = 1
            self.badurls.append((base[1], cbase))
            return False
        # Get real URL
        newbase = url.geturl()
        # Change URL if different from old URL
        if newbase != cbase: cbase, base = newbase, (newbase, base[1])
        # URLs with mimetype 'text/html' are scanned for child URLs
        if url.headers.type == 'text/html':
            # Feed parser
            contents = url.read()
            try: badurl, urls = self._webparser(contents)
            # Log URL if SGML parser can't parse it
            except self._sperror:
                self._visited[cbase], self.badhtm[cbase] = 1, 1
                return False
            url.close()
            # Return URL and extracted URLs if it's good
            if not badurl: return cbase, urls
            # If the URL is bad (after BadUrl), stop processing and log URL
            else:
                self._visited[cbase] = 1
                self.badurls.append((base[1], cbase))
                return False
        # Return URL of non-HTML resources and empty list
        else:
            url.close()
            return cbase, []

    def _genverify(self, urls, base):
        '''Verifies a list of full URLs relative to a base URL

        Arguments:
        urls -- list of raw URLs
        base -- referring URL'''
        # Assignments
        cache, visit, urlverify = self._cache, self._visited, self._urlverify
        # Strip file off base URL for joining
        newbase = base.replace(base.split('/')[-1], '')
        for url in urls:
            # Get resolved URL and raw child URLs
            url, rawurls = urlverify(url, base, newbase)
            # Handle any child URLs
            if rawurls:
                newurls = {}
                # Eliminate duplicate URLs
                for rawurl in rawurls:
                    # Eliminate known visited URLs
                    if rawurl not in visit: newurls[rawurl] = 1
                # Put new URLs in cache if present
                if newurls: cache[url] = newurls
            # Yield new URL
            if url: yield url

    def _multiverify(self, url, base):
        '''Verifies a full URL relative to a base URL

        Arguments:
        url -- a raw URL
        base -- referring URL'''
        # Assignments
        cache, visited = self._cache, self._visited
        # Strip file off base URL for joining
        newbase = base.replace(base.split('/')[-1], '')
        # Get resolved URL and raw child URLs
        url, rawurls = self._urlverify(url, base, newbase)
        # Handle any child URLs
        if rawurls:
            # Eliminate known visited URLs and duplicates
            for rawurl in rawurls:
                # Put new URLs in cache if present
                if rawurl not in visited: cache[rawurl] = url
        # Put URL in list of good URLs
        if url: self._good[url] = 1

    def _urlverify(self, url, base, newbase):
        '''Returns a full URL relative to a base URL

        Arguments:
        url -- a raw URL
        base -- referring URL
        newbase -- temporary version of referring URL for joining'''
        # Assignments
        visited, webopen, other = self._visited, self._webopen, self.other
        sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
        outside, redirs, supported = self.outside, self.redirs, self._supported
        if url not in visited:
            # Remove whitespace from URL
            if url.find(' ') != -1:
                visited[url], url = 1, url.replace(' ', '')
                if url in visited: return 0, 0
            # Remove fragments i.e. 'http:foo/bar#frag'
            if url.find('#') != -1:
                visited[url], url = 1, urldefrag(url)[0]
                if url in visited: return 0, 0
            # Process full URLs i.e. 'http://foo/bar'
            if url.find(':') != -1:
                urlseg = urlsplit(url)
                # Block non-FTP, HTTP URLs
                if urlseg[0] not in supported:
                    # Log as non-FTP/HTTP URL
                    other[url], visited[url] = 1, 1
                    return 0, 0
                # If URL is not in root domain, block it
                if urlseg[1] not in sb:
                    visited[url], outside[url] = 1, 1
                    return 0, 0
                # Block duplicate root URLs
                elif not urlseg[2] and urlseg[1] == sb:
                    visited[url] = 1
                    return 0, 0
            # Handle relative URLs i.e. ../foo/bar
            elif url.find(':') == -1:
                # Join root domain and relative URL
                visited[url], url = 1, urljoin(newbase, url)
                if url in visited: return 0, 0
            # Test URL by attempting to open it
            rurl = webopen((url, base))
            if rurl and rurl[0] not in visited:
                # Get URL
                turl, rawurls = rurl
                visited[url], visited[turl] = 1, 1
                # If URL resolved to a different URL, process it
                if turl != url:
                    urlseg = urlsplit(turl)
                    # If URL is not in root domain, block it
                    if urlseg[1] not in sb:
                        # Log as a redirected internal URL
                        redirs[(url, turl)] = 1
                        return 0, 0
                    # Block duplicate root URLs
                    elif not urlseg[2] and urlseg[1] == sb: return 0, 0
                # If URL exceeds depth, don't process
                if len(turl.split('/')) >= depth: return 0, 0
                # Otherwise return URL
                else:
                    if rawurls: return turl, rawurls
                    else: return turl, []
            else: return 0, 0
        else: return 0, 0

    def _onewalk(self):
        '''Yields good URLs from under a base URL'''
        # Assignments
        cache, genverify = self._cache, self._genverify
        # End processing if cache is empty
        while cache:
            # Fetch item from cache
            base, urls = cache.popitem()
            # If item has child URLs, process them and yield good URLs
            if urls:
                for url in genverify(urls, base): yield url

    def _multiwalk(self, threads):
        '''Extracts good URLs from under a base URL

        Arguments:
        threads -- number of threads to run'''

        def urlthread(url, base):
            '''Spawns a thread containing a multiverify function

            Arguments:
            url -- URL to verify
            base -- referring URL'''
            # Create instance of Thread
            dthread = Thread(target=multiverify, args=(url, base))
            # Put in pool
            pool.append(dthread)

        # Assignments
        pool, cache, multiverify = [], self._cache, self._multiverify
        Thread, width, good = self._thread, self.width, self._good
        # End processing if cache is empty
        while cache:
            # Process URLs as long as width not exceeded
            if len(good) <= width:
                # Fetch item from cache
                url, base = cache.popitem()
                # Make thread
                if url: urlthread(url, base)
                # Run threads once pool size is reached
                if len(pool) == threads or threads >= len(cache):
                    # Start threads
                    for thread in pool: thread.start()
                    # Empty thread pool as threads complete
                    while pool:
                        for thread in pool:
                            if not thread.isAlive(): pool.remove(thread)
            # End if width reached
            elif len(good) >= width: break

    def weburls(self, base=None, width=200, depth=5, thread=None):
        '''Returns a list of web URLs

        Arguments:
        base -- base web URL (default: None)
        width -- maximum number of resources to crawl (default: 200)
        depth -- depth in hierarchy to crawl (default: 5)
        thread -- number of threads to run (default: None)'''
        # Assignments
        self._visited, self._good, self._cache, self.badurls = {}, {}, {}, []
        self.redirs, self.outside, self.badhtm, self.other = {}, {}, {}, {}
        onewalk, good, self._robot = self._onewalk, self._good, self._rparser()
        uparse, robot, multiwalk = self._uparse, self._robot, self._multiwalk
        cache = self._cache
        # Assign width
        if self.width and width == 200: width = self.width
        else: self.width = width
        # sgmlop crashes Python after too many iterations
        if width > 5000: self._parserpick(1)
        else: self._parserpick()
        # Use global base if present
        if not base: base = self.base
        # Verify URL and get child URLs; _webopen returns False on failure
        result = self._webopen((base, ''))
        if result:
            newbase, rawurls = result
            # Change base URL if different
            if newbase != base: base = newbase
            # Ensure there's a trailing '/' in base URL
            if base[-1] != '/':
                url = list(uparse.urlsplit(base))
                url[1] = ''.join([url[1], '/'])
                base = uparse.urlunsplit(url)
            # Eliminate duplicates and put raw URLs in cache
            newurls = {}
            for rawurl in rawurls: newurls[rawurl] = 1
            if newurls:
                # Cache URLs individually if threads are desired
                if thread:
                    for newurl in newurls: cache[newurl] = base
                # Cache in group if no threads
                else: cache[base] = newurls
            # Make base URL, get split, and put in verified URL list
            self.base, self._sb = base, base.split('/')
            self._visited[base], good[base] = 1, 1
        # If URL is bad, abort and raise error
        else: raise IOError, "URL is invalid"
        # Adjust depth to length of base URL
        if self.depth and depth == 5: self.depth += len(self._sb)
        else: self.depth = depth + len(self._sb)
        # Get robot limits
        robot.set_url(''.join([base, 'robots.txt']))
        robot.read()
        # Get signature of bad URL
        self._webtest()
        # Get good URLs as long as total width isn't exceeded
        try:
            # Multiwalk if threaded
            if thread: self._multiwalk(thread)
            # Otherwise, use single thread
            else:
                for item in onewalk():
                    # Don't exceed maximum width
                    if len(good) <= width: good[item] = 1
                    elif len(good) >= width: break
        # If user interrupts crawl, return what's done
        except KeyboardInterrupt: pass
        # Get URLs, sort them, and return list
        self.urls = good.keys()
        self.urls.sort()
        return self.urls

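    # Usage sketch (illustrative): a small single-threaded crawl with weburls.
    # 'http://www.example.org/' is a placeholder base URL; width and depth are
    # kept small so the walk stays quick.
    #
    #   from spider import Spider
    #   s = Spider()
    #   urls = s.weburls('http://www.example.org/', width=25, depth=3)
    #   print '\n'.join(urls)
    #   print s.badurls           # (referring URL, broken URL) pairs seen so far
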
    def webpaths(self, b=None, w=200, d=5, t=None):
        '''Returns a list of web paths.

        Arguments:
        b -- base web URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 5)
        t -- number of threads (default: None)'''

        def pathize():
            '''Strips base URL from full URLs to produce paths'''
            for url in urls:
                # Remove base URL from path list
                url = url.replace(self.base, '')
                # Add default name 'index.html' to root URLs and directories
                if not url: url = 'index.html'
                elif url[-1] == '/': url = ''.join([url, 'index.html'])
                # Verify removal of base URL and remove it if found
                if url.find(':') != -1: url = urlsplit(url)[2:][0]
                yield url

        # Assignments
        urlsplit = self._uparse.urlsplit
        # Run weburls if base passed as an argument
        if b: self.weburls(b, w, d, t)
        # Strip off trailing resource or query from base URL
        if self.base[-1] != '/': self.base = '/'.join(self._sb[:-1])
        urls = self.urls
        # Return path list after stripping base URL
        self.paths = list(pathize())
        return self.paths

    def webmirror(self, root=None, t=None, base=None, width=200, depth=5):
        '''Mirrors a website on a local filesystem

        Arguments:
        root -- local filesystem path (default: None)
        t -- number of threads (default: None)
        base -- base web URL (default: None)
        width -- maximum number of resources to crawl (default: 200)
        depth -- depth in hierarchy to crawl (default: 5)'''
        if base: self.webspider(base, width, depth, t)
        return self._mirror((self.paths, self.urls), root, t)

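    # Usage sketch (illustrative): mirroring a site into a local directory.
    # The URL and target directory are hypothetical; pass t=N to download with
    # N threads when the threading module is available.
    #
    #   from spider import Spider
    #   s = Spider()
    #   s.webmirror(root='local_copy', base='http://www.example.org/',
    #       width=100, depth=4)
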
    def webspider(self, b=None, w=200, d=5, t=None):
        '''Returns two lists of child URLs and paths

        Arguments:
        b -- base web URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 5)
        t -- number of threads (default: None)'''
        if b: self.weburls(b, w, d, t)
        return self.webpaths(), self.urls

    def badurlreport(self, f=None, b=None, w=200, d=5, t=None):
        '''Pretties up a list of bad URLs

        Arguments:
        f -- output file for report (default: None)
        b -- base web URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 5)
        t -- number of threads (default: None)'''
        if b: self.weburls(b, w, d, t)
        # Format report if information is available
        if self.badurls:
            # Number of bad URLs
            amount = str(len(self.badurls))
            header = '%s broken URLs under %s on %s:\n'
            # Print referring URL pointing to bad URL
            body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.badurls])
            report = self._formatreport(amount, header, body, f)
            # Return if just getting string
            if report: return report

    def badhtmreport(self, f=None, b=None, w=200, d=5, t=None):
        '''Pretties up a list of unparsed HTML URLs

        Arguments:
        f -- output file for report (default: None)
        b -- base web URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 5)
        t -- number of threads (default: None)'''
        if b: self.weburls(b, w, d, t)
        # Format report if information is available
        if self.badhtm:
            amount = str(len(self.badhtm))
            header = '%s unparsable HTML URLs under %s on %s:\n'
            body = '\n'.join(self.badhtm)
            report = self._formatreport(amount, header, body, f)
            # Return if just getting string
            if report: return report

    def redireport(self, f=None, b=None, w=200, d=5, t=None):
        '''Pretties up a list of URLs redirected to an external URL

        Arguments:
        f -- output file for report (default: None)
        b -- base web URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 5)
        t -- number of threads (default: None)'''
        if b: self.weburls(b, w, d, t)
        # Format report if information is available
        if self.redirs:
            amount = str(len(self.redirs))
            header = '%s redirects to external URLs under %s on %s:\n'
            # Print referring URL pointing to new URL
            body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.redirs])
            report = self._formatreport(amount, header, body, f)
            # Return if just getting string
            if report: return report

    def outreport(self, f=None, b=None, w=200, d=5, t=None):
        '''Pretties up a list of outside URLs referenced under the base URL

        Arguments:
        f -- output file for report (default: None)
        b -- base web URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 5)
        t -- number of threads (default: None)'''
        if b: self.weburls(b, w, d, t)
        # Format report if information is available
        if self.outside:
            amount = str(len(self.outside))
            header = '%s links to external URLs under %s on %s:\n'
            body = '\n'.join(self.outside)
            report = self._formatreport(amount, header, body, f)
            # Return if just getting string
            if report: return report

    def othereport(self, f=None, b=None, w=200, d=5, t=None):
        '''Pretties up a list of non-HTTP/FTP URLs

        Arguments:
        f -- output file for report (default: None)
        b -- base web URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 5)
        t -- number of threads (default: None)'''
        if b: self.weburls(b, w, d, t)
        # Format report if information is available
        if self.other:
            amount = str(len(self.other))
            header = '%s non-FTP/non-HTTP URLs under %s on %s:\n'
            body = '\n'.join(self.other)
            report = self._formatreport(amount, header, body, f)
            # Return if just getting string
            if report: return report

    def urlreport(self, f=None, b=None, w=200, d=5, t=None):
        '''Pretties up a list of all URLs under a URL

        Arguments:
        f -- output file for report (default: None)
        b -- base web URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 5)
        t -- number of threads (default: None)'''
        if b: self.weburls(b, w, d, t)
        # Format report if information is available
        if self.urls:
            amount = str(len(self.urls))
            header = '%s verified URLs under %s on %s:\n'
            body = '\n'.join(self.urls)
            report = self._formatreport(amount, header, body, f)
            # Return if just getting string
            if report: return report

    def webreport(self, f=None, b=None, w=200, d=5, t=None, *vargs):
        '''Pretties up a list of logged information under a URL

        Arguments:
        f -- output file for report (default: None)
        b -- base web URL (default: None)
        w -- maximum number of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 5)
        t -- number of threads (default: None)
        vargs -- report sections to include or exclude
        To override defaults:
        To include a section add 'badhtm', 'redirs', 'outside', or 'other'
        To exclude a section add 'badurls' or "urls"'''
        if b: self.weburls(b, w, d, t)
        # Defaults for report
        badurls, badhtm, redirs, urls, outside, other = 1, 0, 0, 1, 0, 0
        # Create compilation list
        compile = []
        # Override default report settings if argument is passed to vargs
        for arg in vargs:
            if arg == 'badurls': badurls = 0
            elif arg == 'badhtm': badhtm = 1
            elif arg == 'redirs': redirs = 1
            elif arg == 'urls': urls = 0
            elif arg == 'outside': outside = 1
            elif arg == 'other': other = 1
        # Compile report
        if badurls:
            badurls = self.badurlreport()
            if badurls: compile.append(badurls)
        if urls:
            urls = self.urlreport()
            if urls: compile.append(urls)
        if outside:
            outside = self.outreport()
            if outside: compile.append(outside)
        if redirs:
            redirs = self.redireport()
            if redirs: compile.append(redirs)
        if badhtm:
            badhtm = self.badhtmreport()
            if badhtm: compile.append(badhtm)
        if other:
            other = self.othereport()
            if other: compile.append(other)
        # Make report
        report = '\n\n'.join(compile)
        # Write to file if argument present
        if f: open(f, 'w').write(report)
        # Or return string
        else: return report

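    # Usage sketch (illustrative): writing a combined report to disk. By default
    # webreport includes the broken URL and verified URL sections; names passed
    # in vargs toggle the others. The file name and URL are placeholders.
    #
    #   from spider import Spider
    #   s = Spider()
    #   s.webreport('report.txt', 'http://www.example.org/', 50, 3, None,
    #       'badhtm', 'outside')
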
    def _formatreport(self, amount, header, body, file=None):
        '''Generic prettifier with date/time stamper

        Arguments:
        amount -- number of items in the report
        header -- title of report
        body -- body of report
        file -- output file for report (default: None)'''
        # Get current time
        localtime, strftime = self._localtime, self._formtime
        curtime = strftime('%A, %B %d, %Y at %I:%M %p', localtime())
        # Make section header
        header = header % (amount, self.base, curtime)
        # Add header to body
        report = '\n'.join([header, body])
        # Write to file if argument present
        if file: open(file, 'w').write(report)
        # Or return string
        else: return report

    def _mirror(self, lists, root=None, threads=None):
        '''Mirrors a site on a local filesystem based on lists passed to it

        Arguments:
        lists -- lists of URLs and paths
        root -- local filesystem path (default: None)
        threads -- number of threads (default: None)'''

        def download(url, np, op):
            '''Downloads files that need to be mirrored.'''
            # If FTP...
            if url[:3] == 'ftp':
                # Open local file
                local = open(np, 'wb')
                # Download using FTP session
                ftp = ftpopen(base, name, password)
                ftp.retrbinary('RETR %s' % op, local.write)
                ftp.close()
                # Close local file
                local.close()
            # Use normal urlretrieve if no FTP required
            else: ulib.urlretrieve(url, np)

        def dlthread(url, np, op):
            '''Spawns a thread containing the download function'''
            # Create thread
            dthread = Thread(target=download, args=(url, np, op))
            # Add to thread pool
            pool.append(dthread)

        # Extract path and URL lists
        paths, urls = lists
        # Avoid outside namespace lookups
        ulib, makedirs, sep = self._ulib, self._os.makedirs, self._os.sep
        normcase, split = self._path.normcase, self._path.split
        exists, isdir = self._path.exists, self._path.isdir
        ftpopen = self._ftpopen
        # Create local names for thread class and thread pool
        if threads: Thread, pool = self._thread, []
        # Localize name and password if they exist
        try: base, name, password = self.base, self._name, self._password
        except AttributeError: pass
        # Change to directory if given...
        if root:
            if exists(root):
                if isdir(root): self._os.chdir(root)
            # Create root if it doesn't exist
            else:
                makedirs(root)
                self._os.chdir(root)
        # Otherwise use current directory
        else: root = self._os.getcwd()
        # Iterate over paths and download files
        for oldpath in paths:
            # Sync with the URL for oldpath
            url = urls[paths.index(oldpath)]
            # Create name of local copy
            newpath = normcase(oldpath).lstrip(sep)
            # Get directory name
            dirname = split(newpath)[0]
            # If the directory exists, download the file directly
            if exists(dirname):
                if isdir(dirname):
                    if threads: dlthread(url, newpath, oldpath)
                    else: download(url, newpath, oldpath)
            # Don't create local directory if path in root of remote URL
            elif not dirname:
                if threads: dlthread(url, newpath, oldpath)
                else: download(url, newpath, oldpath)
            # Make local directory if it doesn't exist, then download file
            else:
                makedirs(dirname)
                if threads: dlthread(url, newpath, oldpath)
                else: download(url, newpath, oldpath)
            # Run threads if they've hit the max number of threads allowed
            if threads:
                # Run if max threads or final thread reached
                if len(pool) == threads or paths[-1] == oldpath:
                    # Start all threads
                    for thread in pool: thread.start()
                    # Clear the thread pool as they finish
                    while pool:
                        for thread in pool:
                            if not thread.isAlive(): pool.remove(thread)


# Instance of Spider enables exporting Spider's methods as standalone functions
_inst = Spider()
ftpurls = _inst.ftpurls
weburls = _inst.weburls
ftppaths = _inst.ftppaths
webpaths = _inst.webpaths
ftpmirror = _inst.ftpmirror
ftpspider = _inst.ftpspider
webmirror = _inst.webmirror
webspider = _inst.webspider
webreport = _inst.webreport
urlreport = _inst.urlreport
outreport = _inst.outreport
redireport = _inst.redireport
othereport = _inst.othereport
badurlreport = _inst.badurlreport
badhtmreport = _inst.badhtmreport
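
# Usage sketch (illustrative): the module-level names above are bound methods of
# the shared _inst instance, so they can be called as plain functions. The URL
# below is a placeholder.
#
#   import spider
#   urls = spider.weburls('http://www.example.org/', 25, 3)
#   report = spider.webreport()       # string report for the crawl just done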