1 | #! /usr/bin/env python |
---|
2 | |
---|
3 | ## Copyright (c) 1999 - 2003 L. C. Rees. All rights reserved. |
---|
4 | ## See COPYRIGHT file for license terms. |
---|
5 | |
---|
6 | from __future__ import generators |
---|
7 | |
---|
8 | |
---|
class Spider:

    '''HTTP and FTP crawling, reporting, and checking'''

    # Class-level imports keep the module namespace clean; leading
    # underscores mark them as private implementation details
    import os as _os
    import urllib as _ulib
    import urlparse as _uparse
    from os import path as _path
    from ftplib import FTP as _ftp
    from time import strftime as _formtime
    from time import localtime as _localtime
    from ftplib import error_perm as _ftperr
    from sgmllib import SGMLParseError as _sperror
    from robotparser import RobotFileParser as _rparser
    # Use threads if available
    try: from threading import Thread as _thread
    except ImportError: pass
    # Shared defaults: bad-dir/bad-file signatures, FTP session, fast parser
    _bdsig, _bfsig, _session, _newparser = None, None, None, None
    # HTML tags with URLs (dict used as a set for O(1) membership tests)
    _urltags = {'a':1, 'img':1, 'link':1, 'script':1, 'iframe':1, 'object':1,
        'embed':1, 'area':1, 'frame':1, 'applet':1, 'input':1, 'base':1,
        'div':1, 'layer':1, 'ilayer':1, 'bgsound':1}
    # Supported protocols
    _supported = {'HTTP':1, 'http':1, 'HTTPS':1, 'https':1, 'FTP':1, 'ftp':1}
    # HTML attributes with URLs
    _urlattrs = {'href':1, 'src':1, 'data':1}
---|
35 | |
---|
36 | def __init__(self, base=None, width=None, depth=None): |
---|
37 | '''Initializes a Spider instance and its base attributes |
---|
38 | |
---|
39 | Arguments: |
---|
40 | base -- URL to crawl (default: None) |
---|
41 | width -- maximum resources to crawl (default: None) |
---|
42 | depth -- how deep in a hierarchy to crawl (default: None)''' |
---|
43 | if base: self.base = base |
---|
44 | else: self.base = None |
---|
45 | if width: self.width = width |
---|
46 | else: self.width = None |
---|
47 | if depth: self.depth = depth |
---|
48 | else: self.depth = None |
---|
49 | |
---|
50 | def _ftpopen(self, base, name='anonymous', password=None, attempts=3): |
---|
51 | '''Returns FTP client session |
---|
52 | |
---|
53 | Arguments: |
---|
54 | base -- FTP server URL |
---|
55 | name -- login name (default: 'anonymous') |
---|
56 | password -- login password (default: None) |
---|
57 | attempts -- number of login attempts to try (default: 3)''' |
---|
58 | |
---|
59 | def ftpprompt(tries=0): |
---|
60 | '''Prompts for FTP username and password |
---|
61 | |
---|
62 | Arguments: |
---|
63 | tries -- number of login attempts''' |
---|
64 | tries += tries |
---|
65 | try: |
---|
66 | self._name = raw_input('Enter login name: ') |
---|
67 | self._password = raw_input('Enter password: ') |
---|
68 | session = ftp(base, self._name, self._password) |
---|
69 | return session |
---|
70 | # If login attempt fails, retry login |
---|
71 | except ftperr: |
---|
72 | if attempts >= tries: |
---|
73 | session = ftpprompt(tries) |
---|
74 | return session |
---|
75 | # Too many login attempts? End program |
---|
76 | elif attempts <= tries: |
---|
77 | raise IOError, 'Permission denied.' |
---|
78 | import sys |
---|
79 | sys.exit(0) |
---|
80 | |
---|
81 | # Assignments |
---|
82 | self._name, self._password, ftperr = name, password, self._ftperr |
---|
83 | su, ftp = self._uparse.urlsplit(base), self._ftp |
---|
84 | # Set URL, path, and strip 'ftp://' off |
---|
85 | base, path = su[1], '/'.join([su[2], '']) |
---|
86 | try: session = ftp(base, name, password) |
---|
87 | # Prompt for username, password if initial arguments are incorrect |
---|
88 | except ftperr: session = ftpprompt() |
---|
89 | # Change to remote path if it exits |
---|
90 | if path: session.cwd(path) |
---|
91 | return session |
---|
92 | |
---|
93 | def ftpmirror(self, l, t=None, b=None, w=200, d=6, n='anonymous', p=None): |
---|
94 | '''Mirrors an FTP site on a local filesystem |
---|
95 | |
---|
96 | Arguments: |
---|
97 | l -- local filesystem path (default: None) |
---|
98 | b -- FTP server URL (default: None) |
---|
99 | t -- number of download threads (default: None) |
---|
100 | w -- maximum amount of resources to crawl (default: 200) |
---|
101 | d -- depth in hierarchy to crawl (default: 6) |
---|
102 | n -- login username (default: 'anonymous') |
---|
103 | p -- login password (default: None)''' |
---|
104 | if b: self.ftpspider(b, w, d, n, p) |
---|
105 | return self._mirror((self.paths, self.urls), l, t) |
---|
106 | |
---|
    def ftppaths(self, b=None, w=200, d=6, n='anonymous', p=None):
        '''Returns a sorted list of FTP paths found under a base URL.

        Arguments:
        b -- FTP server URL (default: None)
        w -- maximum amount of resources to crawl (default: 200)
        d -- depth in hierarchy to crawl (default: 6)
        n -- login username (default: 'anonymous')
        p -- login password (default: None)'''

        def sortftp(rdir):
            '''Returns a list of entries marked as files or directories

            Arguments:
            rdir -- remote directory listing (raw LIST output lines)'''
            rlist = []
            rappend = rlist.append
            for rl in rdir:
                # Last whitespace-separated field is the entry name
                ri = rl.split()[-1]
                # Add tuple of remote item type, permission char & name.
                # rl[0] is the type flag ('-' file, 'd' directory); rl[7]
                # is presumably a world-permission bit from the mode
                # string -- TODO confirm against actual LIST output
                if ri not in ('.', '..'): rappend((rl[0], rl[7], ri))
            return rlist

        def visitftp():
            '''Recursively extracts contents of an FTP directory'''
            wd = pwd()
            # Normalize working directory to end with '/'
            if wd[-1] != '/': wd = '/'.join([wd, ''])
            # Add present working directory to visited directories
            dirs[wd], rlist = None, []
            # Get list of current directory's contents ('-a' includes dotfiles)
            retr('LIST -a', rlist.append)
            for url in sortftp(rlist):
                # Test if remote item is a file (indicated by '-')
                if url[0] == '-':
                    # Resolve path of file
                    purl = ''.join([wd, url[2]])
                    # Stop once the file list reaches the width limit
                    if len(files) >= width: return None
                    # Add files to file dictionary (dict deduplicates)
                    elif purl not in files: files[purl] = None
                # Test if it's a directory ('d') and allows scanning ('-')
                elif url[0] == 'd':
                    if url[1] != '-':
                        # Resolve path of directory
                        purl = ''.join([wd, url[2], '/'])
                        # Beyond max depth: mark visited but don't descend
                        if len(purl.split('/')) >= depth: dirs[purl] = None
                        # Visit directory if it hasn't been visited yet
                        elif purl not in dirs:
                            # Change to new directory
                            cwd(purl)
                            # Recurse into the new directory
                            visitftp()

        # Use classwide attributes if set
        if b: self.base = b
        else: b = self.base
        # Use classwide width if different from method default
        if self.width and w == 200: width = self.width
        else: width = w
        # Use classwide depth if different from method default
        # (+1 compensates for the extra '' produced by the trailing slash)
        if self.depth and d == 6: depth = self.depth + 1
        else: depth = d + 1
        # File and directory dicts (used as sets)
        files, dirs = {}, {}
        # Use existing FTP client session if present
        if self._session: ftp = self._session
        # Create new FTP client session if necessary
        else:
            ftp = self._ftpopen(b, n, p)
            self._session = ftp
        # Avoid outside namespace lookups
        cwd, pwd, retr = ftp.cwd, ftp.pwd, ftp.retrlines
        # Walk FTP site
        visitftp()
        # Make sorted path list out of files' keys and return it
        self.paths = files.keys()
        self.paths.sort()
        return self.paths
---|
187 | |
---|
188 | def ftpspider(self, b=None, w=200, d=6, n='anonymous', p=None): |
---|
189 | '''Returns lists of URLs and paths plus a live FTP client session |
---|
190 | |
---|
191 | Arguments: |
---|
192 | b -- FTP server URL (default: None) |
---|
193 | w -- maximum amount of resources to crawl (default: 200) |
---|
194 | d -- depth in hierarchy to crawl (default: 6) |
---|
195 | n -- login username (default: 'anonymous') |
---|
196 | p -- login password (default: None)''' |
---|
197 | if b: ftppaths(b, w, d, n, p) |
---|
198 | return self.paths, ftpurls(), self._session |
---|
199 | |
---|
200 | def ftpurls(self, b=None, w=200, d=6, n='anonymous', p=None): |
---|
201 | '''Returns a list of FTP URLs |
---|
202 | |
---|
203 | Arguments: |
---|
204 | b -- FTP server URL (default: None) |
---|
205 | w -- maximum amount of resources to crawl (default: 200) |
---|
206 | d -- depth in hierarchy to crawl (default: 6) |
---|
207 | n -- login username (default: 'anonymous') |
---|
208 | p -- login password (default: None)''' |
---|
209 | if b: |
---|
210 | ftppaths(b, w, d, n, p) |
---|
211 | # Get rid of trailing '/' in base if present before joining |
---|
212 | if b[-1] == '/': base = b[:-1] |
---|
213 | else: |
---|
214 | base = self.base |
---|
215 | # Get rid of trailing '/' in base if present before joining |
---|
216 | if base[-1] == '/': base = self.base[:-1] |
---|
217 | paths = self.paths |
---|
218 | # Add FTP URL |
---|
219 | self.urls = [''.join([base, i]) for i in paths] |
---|
220 | return self.urls |
---|
221 | |
---|
    def _parserpick(self, old=None):
        '''Builds parser classes using either sgmllib or the faster sgmlop

        Stores the resulting classes in self._UrlExtract / self._BadUrl.
        The classes close over the bad-URL signature lists so that pages
        matching a known "not found" signature can be flagged.

        Arguments:
        old -- truthy forces the classic sgmllib SGMLParser (default: None)'''
        # Assignments
        urltags, urlattrs = self._urltags, self._urlattrs
        # Lists for bad file and bad directory signatures
        self._bfsig, self._bdsig = [], []
        bfsig, bdsig = self._bfsig, self._bdsig
        # Use faster SGMLParser if available
        try:
            from sgmlop import SGMLParser as newparser
            self._newparser = newparser
        # If unavailable, use classic SGML parser
        except ImportError:
            from sgmllib import SGMLParser as oldparser
            old = 1
        # Classes using classic sgmllib SGML Parser
        if old:
            from sgmllib import SGMLParser as oldparser
            # Remove sgmlop parser if present
            self._newparser = None
            # UrlExtract class using classic parser
            class UrlExtract(oldparser):
                '''Extracts URLs from a SGMLish document'''
                def reset(self):
                    '''Resets SGML parser and clears lists'''
                    oldparser.reset(self)
                    self.urls, self.text, self.badurl = [], [], None
                def handle_data(self, data):
                    '''Handles non-markup data'''
                    # Get first 5 lines of non-markup data
                    if len(self.text) <= 5: self.text.append(data)
                    # Compare signature of known bad URL to a new web page
                    if self.text == bfsig: self.badurl = 1
                    elif self.text == bdsig: self.badurl = 1
                def finish_starttag(self, tag, attrs):
                    '''Extracts URL bearing tags'''
                    if tag in urltags:
                        # Get value of attributes whose key carries a URL
                        url = [v for k, v in attrs if k in urlattrs]
                        if url: self.urls.extend(url)
            # BadUrl class using classic parser
            class BadUrl(oldparser):
                '''Collects results of intentionally incorrect URLs'''
                def reset(self):
                    '''Resets SGML parser and clears lists'''
                    oldparser.reset(self)
                    self.text = []
                def handle_data(self, data):
                    '''Collects lines to profile bad URLs'''
                    # Adds first 5 lines of non-markup data to text
                    if len(self.text) <= 5: self.text.append(data)
        # If no old flag, use SGMLParser from sgmlop and related classes
        else:
            # UrlExtract handler for sgmlop (not a parser subclass; sgmlop
            # parsers take a registered handler object instead)
            class UrlExtract:
                '''Extracts URLs from a SGMLish document'''
                def __init__(self):
                    '''Sets up empty result lists'''
                    self.urls, self.text, self.badurl = [], [], None
                def handle_data(self, data):
                    '''Handles non-markup data'''
                    # Get first 5 lines of non-markup data
                    if len(self.text) <= 5: self.text.append(data)
                    # Compare signature of known bad URL to a new web page
                    if self.text == bfsig: self.badurl = 1
                    elif self.text == bdsig: self.badurl = 1
                def finish_starttag(self, tag, attrs):
                    '''Extracts URL bearing tags'''
                    if tag in urltags:
                        # Get value of attributes whose key carries a URL
                        url = [v for k, v in attrs if k in urlattrs]
                        if url: self.urls.extend(url)
            # BadUrl handler for sgmlop
            class BadUrl:
                '''Collects results of intentionally incorrect URLs'''
                def __init__(self):
                    '''Sets up empty result list'''
                    self.text = []
                def handle_data(self, data):
                    '''Collects lines to profile not found responses'''
                    # Adds first 5 lines of non-markup data to list 'text'
                    if len(self.text) <= 5: self.text.append(data)
        # Make resulting classes available class wide
        self._UrlExtract, self._BadUrl = UrlExtract, BadUrl
---|
309 | |
---|
310 | def _webtest(self): |
---|
311 | '''Generates signatures for identifying bad URLs''' |
---|
312 | |
---|
313 | def badurl(url): |
---|
314 | '''Returns first 5 lines of a bad URL |
---|
315 | |
---|
316 | Arguments: |
---|
317 | url -- Bad URL to open and parse''' |
---|
318 | # Use different classes if faster SGML Parser is available |
---|
319 | if self._newparser: |
---|
320 | # sgmlop parser must have a handler passed to it |
---|
321 | parser, urlget = self._newparser(), BadUrl() |
---|
322 | # Pass handler (sgmlop cannot be subclassed) |
---|
323 | parser.register(urlget) |
---|
324 | parser.feed(urlopen(url).read()) |
---|
325 | parser.close() |
---|
326 | # Use classic parser |
---|
327 | else: |
---|
328 | urlget = BadUrl() |
---|
329 | urlget.feed(urlopen(url).read()) |
---|
330 | urlget.close() |
---|
331 | # Return singature of bad URL |
---|
332 | return urlget.text |
---|
333 | |
---|
334 | # Make globals local |
---|
335 | base, urljoin = self.base, self._uparse.urljoin |
---|
336 | urlopen, BadUrl = self._ulib.urlopen, self._BadUrl |
---|
337 | # Generate random string of jibber |
---|
338 | from string import letters, digits |
---|
339 | from random import choice, randint |
---|
340 | jibber = ''.join([letters, digits]) |
---|
341 | ru = ''.join([choice(jibber) for x in range(randint(1, 30))]) |
---|
342 | # Builds signature of a bad URL for a file |
---|
343 | self._bfsig.extend(badurl(urljoin(base, '%s.html' % ru))) |
---|
344 | # Builds signature of a bad URL for a directory |
---|
345 | self._bdsig.extend(badurl(urljoin(base,'%s/' % ru))) |
---|
346 | |
---|
347 | def _webparser(self, html): |
---|
348 | '''Parses HTML and returns bad URL indicator and extracted URLs |
---|
349 | |
---|
350 | Arguments: |
---|
351 | html -- HTML data''' |
---|
352 | # Use different classes if faster SGML Parser is available |
---|
353 | if self._newparser: |
---|
354 | # Make instances of SGML parser and URL extracting handler |
---|
355 | parser, urlget = self._newparser(), self._UrlExtract() |
---|
356 | # Pass handler to parser |
---|
357 | parser.register(urlget) |
---|
358 | # Feed data to parser |
---|
359 | parser.feed(html) |
---|
360 | parser.close() |
---|
361 | # Return bad URL indicator and extracted URLs |
---|
362 | else: |
---|
363 | urlget = self._UrlExtract() |
---|
364 | urlget.feed(html) |
---|
365 | urlget.close() |
---|
366 | # Return badurl marker and list of child URLS |
---|
367 | return urlget.badurl, urlget.urls |
---|
368 | |
---|
    def _webopen(self, base):
        '''Verifies a URL; returns (actual URL, child URLs) or False

        Side effects: marks the URL visited, and logs failures into
        self.badurls / self.badhtm as appropriate.

        Arguments:
        base -- tuple of (URL to open, referring URL)'''
        # Assignments (NOTE(review): 'good' is bound but never used here)
        good, cbase = self._good, base[0]
        try:
            # If webspiders can access URL (robots.txt allows it), open it
            if self._robot.can_fetch('*', cbase):
                url = self._ulib.urlopen(cbase)
            # Otherwise, mark as visited and abort
            else:
                self._visited[cbase] = 1
                return False
        # If HTTP error, log bad URL with its referrer and abort
        except IOError:
            self._visited[cbase] = 1
            self.badurls.append((base[1], cbase))
            return False
        # Get real URL (after any redirects)
        newbase = url.geturl()
        # Change URL if different from old URL
        if newbase != cbase: cbase, base = newbase, (newbase, base[1])
        # Only URLs with mimetype 'text/html' are scanned for child URLs
        if url.headers.type == 'text/html':
            # Feed parser
            contents = url.read()
            try: badurl, urls = self._webparser(contents)
            # Log URL if SGML parser can't parse it
            except self._sperror:
                self._visited[cbase], self.badhtm[cbase] = 1, 1
                return False
            url.close()
            # Return URL and extracted urls if it's good
            if not badurl: return cbase, urls
            # Page matched a bad-URL signature: log it and stop
            else:
                self._visited[cbase] = 1
                self.badurls.append((base[1], cbase))
                return False
        # Return URL of non-HTML resources and empty child list
        else:
            url.close()
            return cbase, []
---|
414 | |
---|
415 | def _genverify(self, urls, base): |
---|
416 | '''Verifies a list of full URL relative to a base URL |
---|
417 | |
---|
418 | Arguments: |
---|
419 | urls -- list of raw URLs |
---|
420 | base -- referring URL''' |
---|
421 | # Assignments |
---|
422 | cache, visit, urlverify = self._cache, self._visited, self._urlverify |
---|
423 | # Strip file off base URL for joining |
---|
424 | newbase = base.replace(base.split('/')[-1], '') |
---|
425 | for url in urls: |
---|
426 | # Get resolved url and raw child URLs |
---|
427 | url, rawurls = urlverify(url, base, newbase) |
---|
428 | # Handle any child URLs |
---|
429 | if rawurls: |
---|
430 | newurls = {} |
---|
431 | # Eliminate duplicate URLs |
---|
432 | for rawurl in rawurls: |
---|
433 | # Eliminate known visited URLs |
---|
434 | if rawurl not in visit: newurls[rawurl] = 1 |
---|
435 | # Put new URLs in cache if present |
---|
436 | if newurls: cache[url] = newurls |
---|
437 | # Yield new URL |
---|
438 | if url: yield url |
---|
439 | |
---|
440 | def _multiverify(self, url, base): |
---|
441 | '''Verifies a full URL relative to a base URL |
---|
442 | |
---|
443 | Arguments: |
---|
444 | url -- a raw URLs |
---|
445 | base -- referring URL''' |
---|
446 | # Assignments |
---|
447 | cache, visited = self._cache, self._visited |
---|
448 | # Strip file off base URL for joining |
---|
449 | newbase = base.replace(base.split('/')[-1], '') |
---|
450 | # Get resolved url and raw child URLs |
---|
451 | url, rawurls = self._urlverify(url, base, newbase) |
---|
452 | # Handle any child URLs |
---|
453 | if rawurls: |
---|
454 | # Eliminate known visited URLs and duplicates |
---|
455 | for rawurl in rawurls: |
---|
456 | # Put new URLs in cache if present |
---|
457 | if rawurl not in visited: cache[rawurl] = url |
---|
458 | # Put URL in list of good URLs |
---|
459 | if url: self._good[url] = 1 |
---|
460 | |
---|
    def _urlverify(self, url, base, newbase):
        '''Resolves and filters one URL; returns (URL, child URLs) or (0, 0)

        Arguments:
        url -- raw URL to verify
        base -- referring URL
        newbase -- referring URL stripped of its file part, for joining'''
        # Assignments
        visited, webopen, other = self._visited, self._webopen, self.other
        # sb is the base domain string (netloc) from the split base URL
        sb, depth, urljoin = self._sb[2], self.depth, self._uparse.urljoin
        urlsplit, urldefrag = self._uparse.urlsplit, self._uparse.urldefrag
        outside, redirs, supported = self.outside, self.redirs, self._supported
        if url not in visited:
            # Remove whitespace from URL
            if url.find(' ') != -1:
                visited[url], url = 1, url.replace(' ', '')
                # Cleaned form already seen: nothing more to do
                if url in visited: return 0, 0
            # Remove fragments i.e. 'http:foo/bar#frag'
            if url.find('#') != -1:
                visited[url], url = 1, urldefrag(url)[0]
                if url in visited: return 0, 0
            # Process full URLs i.e. 'http://foo/bar'
            if url.find(':') != -1:
                urlseg = urlsplit(url)
                # Block non-FTP, non-HTTP schemes
                if urlseg[0] not in supported:
                    # Log as non-FTP/HTTP URL
                    other[url], visited[url] = 1, 1
                    return 0, 0
                # If URL is not in root domain, block it
                # (NOTE(review): 'in' here is a substring test on sb)
                if urlseg[1] not in sb:
                    visited[url], outside[url] = 1, 1
                    return 0, 0
                # Block duplicate root URLs
                elif not urlseg[2] and urlseg[1] == sb:
                    visited[url] = 1
                    return 0, 0
            # Handle relative URLs i.e. ../foo/bar
            elif url.find(':') == -1:
                # Join root domain and relative URL
                visited[url], url = 1, urljoin(newbase, url)
                if url in visited: return 0, 0
            # Test URL by attempting to open it
            rurl = webopen((url, base))
            if rurl and rurl[0] not in visited:
                # Get resolved URL and its raw children
                turl, rawurls = rurl
                visited[url], visited[turl] = 1, 1
                # If URL resolved to a different URL, re-check it
                if turl != url:
                    urlseg = urlsplit(turl)
                    # If resolved URL left the root domain, block it
                    if urlseg[1] not in sb:
                        # Log as a redirected internal URL
                        redirs[(url, turl)] = 1
                        return 0, 0
                    # Block duplicate root URLs
                    elif not urlseg[2] and urlseg[1] == sb: return 0, 0
                # If URL exceeds depth, don't process
                if len(turl.split('/')) >= depth: return 0, 0
                # Otherwise return URL and any children
                else:
                    if rawurls: return turl, rawurls
                    else: return turl, []
            # webopen failed or resolved URL already visited
            else: return 0, 0
        # Already visited
        else: return 0, 0
---|
527 | |
---|
528 | def _onewalk(self): |
---|
529 | '''Yields good URLs from under a base URL''' |
---|
530 | # Assignments |
---|
531 | cache, genverify = self._cache, self._genverify |
---|
532 | # End processing if cache is empty |
---|
533 | while cache: |
---|
534 | # Fetch item from cache |
---|
535 | base, urls = cache.popitem() |
---|
536 | # If item has child URLs, process them and yield good URLs |
---|
537 | if urls: |
---|
538 | for url in genverify(urls, base): yield url |
---|
539 | |
---|
    def _multiwalk(self, threads):
        '''Extracts good URLs from under a base URL using worker threads

        Arguments:
        threads -- number of threads to run'''

        def urlthread(url, base):
            '''Queues a thread that runs _multiverify on one URL

            Arguments:
            url -- URL to verify
            base -- referring URL'''
            # Create instance of Thread
            dthread = Thread(target=multiverify, args=(url, base))
            # Put in pool (threads are started in batches below)
            pool.append(dthread)

        # Assignments
        pool, cache, multiverify = [], self._cache, self._multiverify
        Thread, width, good = self._thread, self.width, self._good
        # End processing once the URL cache is drained
        while cache:
            # Process URLs as long as width not exceeded
            if len(good) <= width:
                # Fetch item from cache (cache maps URL -> referrer)
                url, base = cache.popitem()
                # Make thread
                if url: urlthread(url, base)
                # Run threads once pool size is reached
                if len(pool) == threads or threads >= len(cache):
                    # Start threads
                    for thread in pool: thread.start()
                    # Busy-wait, emptying the pool as threads complete
                    # (isAlive is the Python 2-era Thread API name)
                    while pool:
                        for thread in pool:
                            if not thread.isAlive(): pool.remove(thread)
            # End if width reached
            elif len(good) >= width: break
---|
579 | |
---|
580 | def weburls(self, base=None, width=200, depth=5, thread=None): |
---|
581 | '''Returns a list of web paths. |
---|
582 | |
---|
583 | Arguments: |
---|
584 | base -- base web URL (default: None) |
---|
585 | width -- amount of resources to crawl (default: 200) |
---|
586 | depth -- depth in hierarchy to crawl (default: 5) |
---|
587 | thread -- number of threads to run (default: None)''' |
---|
588 | # Assignments |
---|
589 | self._visited, self._good, self._cache, self.badurls = {}, {}, {}, [] |
---|
590 | self.redirs, self.outside, self.badhtm, self.other = {}, {}, {}, {} |
---|
591 | onewalk, good, self._robot = self._onewalk, self._good, self._rparser() |
---|
592 | uparse, robot, multiwalk = self._uparse, self._robot, self._multiwalk |
---|
593 | cache = self._cache |
---|
594 | # Assign width |
---|
595 | if self.width and width == 200: width = self.width |
---|
596 | else: self.width = width |
---|
597 | # sgmlop crashes Python after too many iterations |
---|
598 | if width > 5000: self._parserpick(1) |
---|
599 | else: self._parserpick() |
---|
600 | # Use global base if present |
---|
601 | if not base: base = self.base |
---|
602 | # Verify URL and get child URLs |
---|
603 | newbase, rawurls = self._webopen((base, '')) |
---|
604 | if newbase: |
---|
605 | # Change base URL if different |
---|
606 | if newbase != base: base = newbase |
---|
607 | # Ensure there's a trailing '/' in base URL |
---|
608 | if base[-1] != '/': |
---|
609 | url = list(uparse.urlsplit(base)) |
---|
610 | url[1] = ''.join([url[1], '/']) |
---|
611 | base = uparse.urlunsplit(url) |
---|
612 | # Eliminate duplicates and put raw URLs in cache |
---|
613 | newurls = {} |
---|
614 | for rawurl in rawurls: newurls[rawurl] = 1 |
---|
615 | if newurls: |
---|
616 | # Cache URLs individually if threads are desired |
---|
617 | if thread: |
---|
618 | for newurl in newurls: cache[newurl] = base |
---|
619 | # Cache in group if no threads |
---|
620 | else: cache[base] = newurls |
---|
621 | # Make base URL, get split, and put in verified URL list |
---|
622 | self.base, self._sb = base, base.split('/') |
---|
623 | self._visited[base], good[base] = 1, 1 |
---|
624 | # If URL is bad, abort and raise error |
---|
625 | else: raise IOError, "URL is invalid" |
---|
626 | # Adjust dept to length of base URL |
---|
627 | if self.depth and depth == 6: self.depth += len(self._sb) |
---|
628 | else: self.depth = depth + len(self._sb) |
---|
629 | # Get robot limits |
---|
630 | robot.set_url(''.join([base, 'robots.txt'])) |
---|
631 | robot.read() |
---|
632 | # Get signature of bad URL |
---|
633 | self._webtest() |
---|
634 | # Get good URLs as long as total width isn't exceeded |
---|
635 | try: |
---|
636 | # Multiwalk if threaded |
---|
637 | if thread: self._multiwalk(thread) |
---|
638 | # Otherwise, use single thread |
---|
639 | else: |
---|
640 | for item in onewalk(): |
---|
641 | # Don't exceed maximum width |
---|
642 | if len(good) <= width: good[item] = 1 |
---|
643 | elif len(good) >= width: break |
---|
644 | # If user interrupts crawl, return what's done |
---|
645 | except KeyboardInterrupt: pass |
---|
646 | # Get URLs, sort them, and return list |
---|
647 | self.urls = good.keys() |
---|
648 | self.urls.sort() |
---|
649 | return self.urls |
---|
650 | |
---|
651 | def webpaths(self, b=None, w=200, d=5, t=None): |
---|
652 | '''Returns a list of web paths. |
---|
653 | |
---|
654 | Arguments: |
---|
655 | b -- base web URL (default: None) |
---|
656 | w -- amount of resources to crawl (default: 200) |
---|
657 | d -- depth in hierarchy to crawl (default: 5) |
---|
658 | t -- number of threads (default: None)''' |
---|
659 | |
---|
660 | def pathize(): |
---|
661 | '''Strips base URL from full URLs to produce paths''' |
---|
662 | for url in urls: |
---|
663 | # Remove base URL from path list |
---|
664 | url = url.replace(self.base, '') |
---|
665 | # Add default name 'index.html' to root URLs and directories |
---|
666 | if not url: url = 'index.html' |
---|
667 | elif url[-1] == '/': url = ''.join([url, 'index.html']) |
---|
668 | # Verify removal of base URL and remove it if found |
---|
669 | if url.find(':') != -1: url = urlsplit(url)[2:][0] |
---|
670 | yield url |
---|
671 | |
---|
672 | # Assignments |
---|
673 | urlsplit = self._uparse.urlsplit |
---|
674 | # Run weburls if base passed as an argument |
---|
675 | if b: self.weburls(b, w, d, t) |
---|
676 | # Strip off trailing resource or query from base URL |
---|
677 | if self.base[-1] != '/': self.base = '/'.join(self._sb[:-1]) |
---|
678 | urls = self.urls |
---|
679 | # Return path list after stripping base URL |
---|
680 | self.paths = list(pathize()) |
---|
681 | return self.paths |
---|
682 | |
---|
683 | def webmirror(self, root=None, t=None, base=None, width=200, depth=5): |
---|
684 | '''Mirrors a website on a local filesystem |
---|
685 | |
---|
686 | Arguments: |
---|
687 | root -- local filesystem path (default: None) |
---|
688 | t -- number of threads (default: None) |
---|
689 | base -- base web URL (default: None) |
---|
690 | width -- amount of resources to crawl (default: 200) |
---|
691 | depth -- depth in hierarchy to crawl (default: 5)''' |
---|
692 | if base: self.webspider(base, width, depth, t) |
---|
693 | return self._mirror((self.paths, self.urls), root, t) |
---|
694 | |
---|
695 | def webspider(self, b=None, w=200, d=5, t=None): |
---|
696 | '''Returns two lists of child URLs and paths |
---|
697 | |
---|
698 | Arguments: |
---|
699 | b -- base web URL (default: None) |
---|
700 | w -- amount of resources to crawl (default: 200) |
---|
701 | d -- depth in hierarchy to crawl (default: 5) |
---|
702 | t -- number of threads (default: None)''' |
---|
703 | if b: self.weburls(b, w, d, t) |
---|
704 | return self.webpaths(), self.urls |
---|
705 | |
---|
706 | def badurlreport(self, f=None, b=None, w=200, d=5, t=None): |
---|
707 | '''Pretties up a list of bad URLs |
---|
708 | |
---|
709 | Arguments: |
---|
710 | f -- output file for report (default: None) |
---|
711 | b -- base web URL (default: None) |
---|
712 | w -- amount of resources to crawl (default: 200) |
---|
713 | d -- depth in hierarchy to crawl (default: 5) |
---|
714 | t -- number of threads (default: None)''' |
---|
715 | if b: self.weburls(b, w, d, t) |
---|
716 | # Format report if information is available |
---|
717 | if self.badurls: |
---|
718 | # Number of bad URLs |
---|
719 | amount = str(len(self.badurls)) |
---|
720 | header = '%s broken URLs under %s on %s:\n' |
---|
721 | # Print referring URL pointing to bad URL |
---|
722 | body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.badurls]) |
---|
723 | report = self._formatreport(amount, header, body, f) |
---|
724 | # Return if just getting string |
---|
725 | if report: return report |
---|
726 | |
---|
727 | def badhtmreport(self, f=None, b=None, w=200, d=5, t=None): |
---|
728 | '''Pretties up a list of unparsed HTML URLs |
---|
729 | |
---|
730 | Arguments: |
---|
731 | f -- output file for report (default: None) |
---|
732 | b -- base web URL (default: None) |
---|
733 | w -- amount of resources to crawl (default: 200) |
---|
734 | d -- depth in hierarchy to crawl (default: 5) |
---|
735 | t -- number of threads (default: None)''' |
---|
736 | if b: self.weburls(b, w, d, t) |
---|
737 | # Format report if information is available |
---|
738 | if self.badhtm: |
---|
739 | amount = str(len(self.badhtm)) |
---|
740 | header = '%s unparsable HTML URLs under %s on %s:\n' |
---|
741 | body = '\n'.join(self.badhtm) |
---|
742 | report = self._formatreport(amount, header, body, f) |
---|
743 | # Return if just getting string |
---|
744 | if report: return report |
---|
745 | |
---|
746 | def redireport(self, f=None, b=None, w=200, d=5, t=None): |
---|
747 | '''Pretties up a list of URLs redirected to an external URL |
---|
748 | |
---|
749 | Arguments: |
---|
750 | f -- output file for report (default: None) |
---|
751 | b -- base web URL (default: None) |
---|
752 | w -- amount of resources to crawl (default: 200) |
---|
753 | d -- depth in hierarchy to crawl (default: 5) |
---|
754 | t -- number of threads (default: None)''' |
---|
755 | if b: self.weburls(b, w, d, t) |
---|
756 | # Format report if information is available |
---|
757 | if self.redirs: |
---|
758 | amount = str(len(self.redirs)) |
---|
759 | header = '%s redirects to external URLs under %s on %s:\n' |
---|
760 | # Print referring URL pointing to new URL |
---|
761 | body = '\n'.join([' -> '.join([i[0], i[1]]) for i in self.redirs]) |
---|
762 | report = self._formatreport(amount, header, body, f) |
---|
763 | # Return if just getting string |
---|
764 | if report: return report |
---|
765 | |
---|
766 | def outreport(self, f=None, b=None, w=200, d=5, t=None): |
---|
767 | '''Pretties up a list of outside URLs referenced under the base URL |
---|
768 | |
---|
769 | Arguments: |
---|
770 | f -- output file for report (default: None) |
---|
771 | b -- base web URL (default: None) |
---|
772 | w -- amount of resources to crawl (default: 200) |
---|
773 | d -- depth in hierarchy to crawl (default: 5) |
---|
774 | t -- number of threads (default: None)''' |
---|
775 | if b: self.weburls(b, w, d, t) |
---|
776 | # Format report if information is available |
---|
777 | if self.outside: |
---|
778 | amount = str(len(self.outside)) |
---|
779 | header = '%s links to external URLs under %s on %s:\n' |
---|
780 | body = '\n'.join(self.outside) |
---|
781 | report = self._formatreport(amount, header, body, f) |
---|
782 | # Return if just getting string |
---|
783 | if report: return report |
---|
784 | |
---|
785 | def othereport(self, f=None, b=None, w=200, d=5, t=None): |
---|
786 | '''Pretties up a list of non-HTTP/FTP URLs |
---|
787 | |
---|
788 | Arguments: |
---|
789 | f -- output file for report (default: None) |
---|
790 | b -- base web URL (default: None) |
---|
791 | w -- amount of resources to crawl (default: 200) |
---|
792 | d -- depth in hierarchy to crawl (default: 5) |
---|
793 | t -- number of threads (default: None)''' |
---|
794 | if b: self.weburls(b, w, d, t) |
---|
795 | # Format report if information is available |
---|
796 | if self.other: |
---|
797 | amount = str(len(self.other)) |
---|
798 | header = '%s non-FTP/non-HTTP URLs under %s on %s:\n' |
---|
799 | body = '\n'.join(self.other) |
---|
800 | report = self._formatreport(amount, header, body, f) |
---|
801 | # Return if just getting string |
---|
802 | if report: return report |
---|
803 | |
---|
804 | def urlreport(self, f=None, b=None, w=200, d=5, t=None): |
---|
805 | '''Pretties up a list of all URLs under a URL |
---|
806 | |
---|
807 | Arguments: |
---|
808 | f -- output file for report (default: None) |
---|
809 | b -- base web URL (default: None) |
---|
810 | w -- amount of resources to crawl (default: 200) |
---|
811 | d -- depth in hierarchy to crawl (default: 5) |
---|
812 | t -- number of threads (default: None)''' |
---|
813 | if b: self.weburls(b, w, d, t) |
---|
814 | # Format report if information is available |
---|
815 | if self.urls: |
---|
816 | amount = str(len(self.urls)) |
---|
817 | header = '%s verified URLs under %s on %s:\n' |
---|
818 | body = '\n'.join(self.urls) |
---|
819 | report = self._formatreport(amount, header, body, f) |
---|
820 | # Return if just getting string |
---|
821 | if report: return report |
---|
822 | |
---|
823 | def webreport(self, f=None, b=None, w=200, d=5, t=None, *vargs): |
---|
824 | '''Pretties up a list of logged information under a URL |
---|
825 | |
---|
826 | Arguments: |
---|
827 | f -- output file for report (default: None) |
---|
828 | b -- base web URL (default: None) |
---|
829 | w -- amount of resources to crawl (default: 200) |
---|
830 | d -- depth in hierarchy to crawl (default: 5) |
---|
831 | t -- number of threads (default: None) |
---|
832 | vargs -- report sections to include or exclude |
---|
833 | To override defaults: |
---|
834 | To include a section add 'badhtm', 'redirs', 'outside', or 'other' |
---|
835 | To exclude a section add 'badurls' or "urls"''' |
---|
836 | if b: self.weburls(b, w, d, t) |
---|
837 | # Defaults for report |
---|
838 | badurls, badhtm, redirs, urls, outside, other = 1, 0, 0, 1, 0, 0 |
---|
839 | # Create compilation list |
---|
840 | compile = [] |
---|
841 | # Override default report settings if argument is passed to vargs |
---|
842 | for arg in vargs: |
---|
843 | if arg == 'badurls': badurls = 0 |
---|
844 | elif arg == 'badhtm': badhtm = 1 |
---|
845 | elif arg == 'redirs': redirs = 1 |
---|
846 | elif arg == 'urls': urls = 0 |
---|
847 | elif arg == 'outside': outside = 1 |
---|
848 | elif arg == 'other': other = 1 |
---|
849 | # Compile report |
---|
850 | if badurls: |
---|
851 | badurls = self.badurlreport() |
---|
852 | if badurls: compile.append(badurls) |
---|
853 | if urls: |
---|
854 | urls = self.urlreport() |
---|
855 | if urls: compile.append(urls) |
---|
856 | if outside: |
---|
857 | outside = self.outreport() |
---|
858 | if outside: compile.append(outside) |
---|
859 | if redirs: |
---|
860 | redirs = self.redireport() |
---|
861 | if redirs: compile.append(redirs) |
---|
862 | if badhtm: |
---|
863 | badhtm = self.badhtmreport() |
---|
864 | if badhtm: compile.append(badhtm) |
---|
865 | if other: |
---|
866 | other = self.othereport() |
---|
867 | if other: compile.append(other) |
---|
868 | # Make report |
---|
869 | report = '\n\n'.join(compile) |
---|
870 | # Write to file if argument present |
---|
871 | if file: open(f, 'w').write(report) |
---|
872 | # Or return string |
---|
873 | else: return report |
---|
874 | |
---|
875 | def _formatreport(self, amount, header, body, file=None): |
---|
876 | '''Generic prettifier with date/time stamper |
---|
877 | |
---|
878 | Arguments: |
---|
879 | header -- title of report |
---|
880 | body -- body of report |
---|
881 | file -- output file for report (default: None)''' |
---|
882 | # Get current time |
---|
883 | localtime, strftime = self._localtime, self._formtime |
---|
884 | curtime = strftime('%A, %B %d, %Y at %I:%M %p', localtime()) |
---|
885 | # Make section header |
---|
886 | header = header % (amount, self.base, curtime) |
---|
887 | # Add header to body |
---|
888 | report = '\n'.join([header, body]) |
---|
889 | # Write to file if argument present |
---|
890 | if file: open(file, 'w').write(report) |
---|
891 | # Or return string |
---|
892 | else: return report |
---|
893 | |
---|
    def _mirror(self, lists, root=None, threads=None):
        '''Mirrors a site on a local filesystem based on lists passed to it

        Arguments:
        lists -- 2-tuple of parallel lists: (remote paths, matching URLs)
        root -- local filesystem path to mirror into (default: None);
            created if missing, current directory used if not given
        threads -- number of threads (default: None); when set, downloads
            run in batches of this many threads'''

        def download(url, np, op):
            '''Downloads one file that needs to be mirrored

            url -- source URL, np -- new local path,
            op -- old remote path (used in the FTP RETR command)'''
            # Skip files already mirrored locally
            if self._path.exists(np):
                return None
            # FTP URLs are fetched over an authenticated FTP session
            if url[:3] == 'ftp':
                # Open local file
                local = open(np, 'wb')
                # Download using FTP session
                ftp = ftpopen(base, name, password)
                ftp.retrbinary('RETR %s' % op, local.write)
                ftp.close()
                # Close local file
                local.close()
            # Use normal urlretrieve if no FTP required
            else: ulib.urlretrieve(url, np)

        def dlthread(url, np, op):
            '''Spawns a thread containing the download function'''
            # Create thread (not started yet; started in batches below)
            dthread = Thread(target=download, args=(url, np, op))
            # Add to thread pool
            pool.append(dthread)

        # Extract path and URL lists (assumed parallel: urls[i] fetches
        # paths[i])
        paths, urls = lists
        # Avoid outside namespace lookups inside the download loop
        ulib, makedirs, sep = self._ulib, self._os.makedirs, self._os.sep
        normcase, split = self._path.normcase, self._path.split
        exists, isdir = self._path.exists, self._path.isdir
        ftpopen = self._ftpopen
        # Create local names for thread class and thread pool
        if threads: Thread, pool = self._thread, []
        # Localize FTP credentials if they were set by an earlier FTP crawl;
        # 'base'/'name'/'password' stay unbound otherwise, which only
        # matters if an FTP URL is then downloaded
        try: base, name, password = self.base, self._name, self._password
        except AttributeError: pass
        # Change to directory if given...
        if root:
            if exists(root):
                if isdir(root): self._os.chdir(root)
            # Create root if it doesn't exist
            else:
                makedirs(root)
                self._os.chdir(root)
        # Otherwise use current directory
        else: root = self._os.getcwd()
        # Iterate over paths and download files
        for oldpath in paths:
            # Sync with the URL for oldpath
            # NOTE(review): paths.index() is an O(n) scan per item and
            # assumes paths has no duplicates — verify against callers
            url = urls[paths.index(oldpath)]
            # Create name of local copy (relative path: strip any leading
            # separator characters so joins stay under root)
            newpath = normcase(oldpath).lstrip(sep)
            # Get directory name
            dirname = split(newpath)[0]
            # If the directory exists, download the file directly
            if exists(dirname):
                if isdir(dirname):
                    if threads: dlthread(url, newpath, oldpath)
                    else: download(url, newpath, oldpath)
            # Don't create local directory if path in root of remote URL
            elif not dirname:
                if threads: dlthread(url, newpath, oldpath)
                else: download(url, newpath, oldpath)
            # Make local directory if it doesn't exist, then download file
            else:
                makedirs(dirname)
                if threads: dlthread(url, newpath, oldpath)
                else: download(url, newpath, oldpath)
            # Run threads if they've hit the max number of threads allowed
            if threads:
                # Run if max threads or final thread reached
                if len(pool) == threads or paths[-1] == oldpath:
                    # Start all threads
                    for thread in pool: thread.start()
                    # Clear the thread pool as threads finish (busy-wait)
                    # NOTE(review): isAlive() is the Python 2 API; it was
                    # renamed is_alive() and removed in Python 3.9
                    while pool:
                        for thread in pool:
                            if not thread.isAlive(): pool.remove(thread)
---|
980 | |
---|
981 | |
---|
# Module-level Spider instance whose bound methods are re-exported below,
# so Spider's functionality is usable as plain module functions
_inst = Spider()
# FTP crawling and mirroring
ftppaths = _inst.ftppaths
ftpurls = _inst.ftpurls
ftpmirror = _inst.ftpmirror
ftpspider = _inst.ftpspider
# Web crawling and mirroring
webpaths = _inst.webpaths
weburls = _inst.weburls
webmirror = _inst.webmirror
webspider = _inst.webspider
# Reporting
webreport = _inst.webreport
urlreport = _inst.urlreport
outreport = _inst.outreport
redireport = _inst.redireport
othereport = _inst.othereport
badurlreport = _inst.badurlreport
badhtmreport = _inst.badhtmreport
---|