Changeset 1015
- Timestamp:
- 11/30/07 09:34:02
- Files:
-
- qSEOptimizer/trunk/browser/keywords.py (modified) (3 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
qSEOptimizer/trunk/browser/keywords.py
r1012 r1015 1 import urllib 1 import urllib, re, os, commands 2 2 from xml.dom import minidom, Node 3 3 … … 24 24 # request html page of context object 25 25 url = self.context.absolute_url() 26 try:27 page = urllib.urlopen(url)28 except IOError:29 return _('Could not find requested page')26 #try: 27 #page = urllib.urlopen(url) 28 #except IOError: 29 #return _('Could not find requested page') 30 30 31 page_html = page.read()32 if not page_html:33 return _('Page is empty')31 #page_html = page.read() 32 #if not page_html: 33 #return _('Page is empty') 34 34 35 35 # extract words from body from html page 36 doc = minidom.parseString(page_html) 37 rootNode = doc.documentElement 38 bodies = rootNode.getElementsByTagName('body') 39 if len(bodies) > 0: 40 body = bodies[0] 36 37 # this block work only with valid html 38 #doc = minidom.parseString(page_html) 39 #rootNode = doc.documentElement 40 #bodies = rootNode.getElementsByTagName('body') 41 #if len(bodies) > 0: 42 #body = bodies[0] 43 #else: 44 #return _(u'Invalid page html') 45 #page_words = [] 46 #self.walkTextNodes(body, page_words) 47 48 # this block work even with invalid html 49 #pattern = re.compile('<\s*body[^>]*>(.*?)<\s*/\s*body\s*>', re.S|re.M|re.I) 50 #search = pattern.search(page_html) 51 #if search: 52 #body_html = search.group(1) 53 54 # extract words from url page using lynx browser 55 page_text = commands.getoutput('lynx --dump --nolist %s' % url) 56 if page_text and page_text != 'sh: lynx: command not found': 57 page_words = page_text.lower().split() 41 58 else: 42 return _(u'Invalid page html') 43 page_words = [] 44 self.walkTextNodes(body, page_words) 59 return _(u'Could not find lynx browser!') 45 60 46 61 # check every keyword on appearing in body of html page 47 missing = [keyword for keyword in keywords if keyword not in page_words] 62 missing = {} 63 for keyword in keywords: 64 if keyword not in page_words and keyword not in missing.keys(): 65 missing[keyword] = 1 48 66 49 67 # return list of 
def strip_tags(in_text):
    """Return *in_text* with every ``<...>`` span removed.

    A tag is everything from a ``<`` up to and including the next ``>``;
    the text between tags is kept unchanged.

    A ``<`` that is never followed by a ``>`` is not a tag: it and the rest
    of the text are kept verbatim.  (The previous implementation raised
    IndexError on such input because it kept popping characters past the
    end of its working list.)

    in_text -- the string to strip.
    Returns the stripped string.
    """
    kept = []
    pos = 0
    while True:
        start = in_text.find('<', pos)
        if start == -1:
            # no more tags
            break
        end = in_text.find('>', start + 1)
        if end == -1:
            # unterminated '<': treat the remainder as plain text
            break
        kept.append(in_text[pos:start])
        # resume right after the closing bracket
        pos = end + 1

    # whatever follows the last complete tag is plain text
    kept.append(in_text[pos:])
    return ''.join(kept)
