source: products/qSEOptimizer/tags/1.5.0/browser/keywords.py @ 1

Last change on this file since 1 was 1, checked in by myroslav, 18 years ago

Building directory structure

  • Property svn:eol-style set to native
File size: 3.7 KB
Line 
1import urllib, re, os, commands
2from xml.dom import minidom, Node
3
4from zope.interface import implements
5from Products.Five.browser import BrowserView
6
7from Products.CMFCore.utils import getToolByName
8from Products.CMFPlone import PloneMessageFactory as _
9
10from interfaces import IValidateSEOKeywordsView
11
class ValidateSEOKeywordsView(BrowserView):
    """Browser view that checks whether every SEO keyword actually
    appears in the rendered text of the context page.

    The page text is obtained by dumping the context URL through the
    ``lynx`` text browser (so markup is already stripped); each keyword
    is then matched as a whole word, case-insensitively.
    """

    implements(IValidateSEOKeywordsView)

    def validateKeywords(self, text):
        """see interface

        ``text`` is a newline-separated list of keywords.  Returns an
        i18n message: either an error, a list of keywords missing from
        the page, or a success message.
        """
        # Guard against None/whitespace-only input.
        if not text or not text.strip():
            return _(u'Keywords list is empty!')

        # One keyword per line; drop blank lines so a trailing newline
        # cannot produce an empty keyword (an empty keyword would turn
        # the search pattern into r'\b\b', which matches almost anywhere).
        keywords = [kw.strip() for kw in text.lower().split('\n') if kw.strip()]
        if not keywords:
            return _(u'Keywords list is empty!')

        # Fetch the page body as plain text via the lynx text browser.
        # NOTE(review): the URL is interpolated into a shell command
        # unquoted; absolute_url() is normally safe, but this should be
        # shell-escaped if the URL can ever contain metacharacters.
        url = '%s?qseo_without_additional_keywords=1' % self.context.absolute_url()
        page_text = commands.getoutput('lynx --dump --nolist %s' % url).lower()
        if not page_text or page_text == 'sh: lynx: command not found':
            return _(u'Could not find lynx browser!')

        # Collect each missing keyword once, preserving input order.
        missing = []
        seen = {}
        for keyword in keywords:
            if keyword in seen:
                continue
            seen[keyword] = 1
            # re.escape so punctuation inside a keyword (e.g. "c++")
            # is matched literally instead of breaking the pattern.
            pattern = re.compile(r'\b%s\b' % re.escape(keyword), re.I)
            if not pattern.search(page_text):
                missing.append(keyword)

        # Return the list of missing keywords, or a success message.
        if missing:
            msg = u"""Next keywords did not appear on the page:\n%s""" % '\n'.join(missing)
        else:
            msg = u"""All keywords found on the page!"""
        return _(msg)

    def walkTextNodes(self, parent, page_words=None):
        """Recursively collect lower-cased words from every text node
        under ``parent`` (a minidom node) into ``page_words``.

        The list is mutated in place and also used as the accumulator
        for recursive calls.  The default is None (not ``[]``) to avoid
        the shared-mutable-default pitfall.
        """
        if page_words is None:
            page_words = []
        for node in parent.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                # Descend into child elements, sharing the accumulator.
                self.walkTextNodes(node, page_words)
            elif node.nodeType == Node.TEXT_NODE:
                value = node.nodeValue
                if value is not None:
                    page_words.extend([word.lower() for word in value.split()])

    def strip_tags(self, in_text):
        """Return ``in_text`` with every ``<...>`` tag removed.

        Single O(n) pass instead of the previous quadratic
        ``list.pop`` loop; an unterminated ``<`` simply swallows the
        rest of the string instead of raising IndexError.
        """
        out = []
        in_tag = False
        for ch in in_text:
            if in_tag:
                # Inside a tag: skip until the closing angle bracket.
                if ch == '>':
                    in_tag = False
            elif ch == '<':
                in_tag = True
            else:
                out.append(ch)
        return ''.join(out)
Note: See TracBrowser for help on using the repository browser.