root/qSEOptimizer/trunk/browser/keywords.py

Revision 1019 (checked in by piv, 1 year ago)

version 1.4.0

  • Property svn:eol-style set to native
Line 
1 import urllib, re, os, commands
2 from xml.dom import minidom, Node
3
4 from zope.interface import implements
5 from Products.Five.browser import BrowserView
6
7 from Products.CMFCore.utils import getToolByName
8 from Products.CMFPlone import PloneMessageFactory as _
9
10 from interfaces import IValidateSEOKeywordsView
11
class ValidateSEOKeywordsView(BrowserView):
    """Browser view checking that SEO keywords occur on the context's page."""

    implements(IValidateSEOKeywordsView)

    def validateKeywords(self, text):
        """Report which keywords are missing from the rendered context page.

        See IValidateSEOKeywordsView.

        ``text`` holds one keyword or phrase per line.  Keywords are compared
        case-insensitively, blank lines are ignored and duplicates are
        reported only once.  The page at the context's absolute URL is
        rendered to plain text with the ``lynx`` browser and every keyword is
        searched for in that text as a whole word/phrase.

        Returns a message (built with the Plone message factory) that either
        lists the missing keywords, confirms that all were found, or explains
        why the check could not be done (empty keyword list, lynx unusable).
        """
        # extract keywords from text; blank lines are not keywords
        stripped = [line.strip() for line in text.lower().split('\n')]
        keywords = [keyword for keyword in stripped if keyword]
        if not keywords:
            return _(u'Keywords list is empty!')

        # The page is rendered to plain text by lynx instead of being parsed
        # here, so that invalid html does not matter.  walkTextNodes() and
        # strip_tags() below are helpers for parsing the html directly.
        url = '%s?qseo_without_additional_keywords=1' % self.context.absolute_url()
        page_text = self._fetchPageText(url)
        if not page_text:
            return _(u'Could not find lynx browser!')

        # check every distinct keyword on appearing in the page text
        missing = []
        seen = set()
        for keyword in keywords:
            if keyword in seen:
                continue
            seen.add(keyword)
            if not self._keywordPattern(keyword).search(page_text):
                missing.append(keyword)

        # return list of missing keywords
        # NOTE(review): the message id passed to _() contains the keyword
        # list, so the "missing" message cannot be looked up in a translation
        # catalog -- kept as is because callers may rely on the plain string.
        if missing:
            msg = u"""Next keywords did not appear on the page:\n%s""" % '\n'.join(missing)
        else:
            msg = u"""All keywords found on the page!"""
        return _(msg)

    def _fetchPageText(self, url):
        """Return the lower-cased text dump of ``url`` produced by lynx.

        Returns None when the lynx executable cannot be started.  The output
        may be an empty string when lynx printed nothing.
        """
        import subprocess

        # An argument list (no shell) keeps characters of the url such as
        # '?' and '&' from being interpreted by a shell, and a missing
        # executable shows up as OSError instead of a shell-specific message.
        try:
            process = subprocess.Popen(
                ['lynx', '--dump', '--nolist', url],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT)
        except OSError:
            return None

        output = process.communicate()[0]
        if not isinstance(output, str):
            # subprocess yields bytes on Python 3
            output = output.decode('utf-8', 'replace')
        if output.endswith('\n'):
            # commands.getoutput(), used previously, dropped one trailing newline
            output = output[:-1]
        return output.lower()

    def _keywordPattern(self, keyword):
        """Compile a case-insensitive whole-word pattern for ``keyword``.

        The keyword is matched literally (regex metacharacters are escaped).
        Any run of whitespace inside a phrase matches any run of whitespace
        on the page, because the text dump may break a phrase across lines.
        """
        words = [re.escape(word) for word in keyword.split()]
        # Lookarounds instead of \b: \b never matches next to a keyword that
        # starts or ends with a non-word character (e.g. "c++").
        return re.compile(r'(?<!\w)%s(?!\w)' % r'\s+'.join(words), re.I)

    def walkTextNodes(self, parent, page_words=None):
        """Collect the lower-cased words of all text nodes below ``parent``.

        ``parent`` is a DOM node; its element children are walked
        recursively.  Words are appended to ``page_words`` when a list is
        given, otherwise to a new list.  The list is returned.
        """
        if page_words is None:
            # a fresh list per call; a list default would be shared by all calls
            page_words = []
        for node in parent.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                self.walkTextNodes(node, page_words)
            elif node.nodeType == Node.TEXT_NODE:
                value = node.nodeValue
                if value is not None:
                    page_words.extend([word.lower() for word in value.split()])
        return page_words

    def strip_tags(self, in_text):
        """Return ``in_text`` without anything enclosed in angle brackets.

        Everything from a '<' up to and including the next '>' is removed.
        If a '<' is never closed, the rest of the text is dropped.
        """
        kept = []
        position = 0
        while True:
            tag_start = in_text.find('<', position)
            if tag_start == -1:
                # no more tags: keep the remaining text
                kept.append(in_text[position:])
                break
            kept.append(in_text[position:tag_start])
            tag_end = in_text.find('>', tag_start + 1)
            if tag_end == -1:
                # unterminated tag: nothing after it can be kept
                break
            position = tag_end + 1
        return ''.join(kept)
Note: See TracBrowser for help on using the browser.