source: products/quintagroup.seoptimizer/trunk/quintagroup/seoptimizer/browser/keywords.py @ 1313

Last change on this file since 1313 was 1313, checked in by liebster, 15 years ago

Added metatags order feature, which is managed by the metatags_order property of the configlet

  • Property svn:eol-style set to native
File size: 3.7 KB
Line 
1import urllib, re, os, commands
2from xml.dom import minidom, Node
3
4from zope.interface import implements
5from Products.Five.browser import BrowserView
6
7from Products.CMFCore.utils import getToolByName
8from Products.CMFPlone import PloneMessageFactory as _
9
10from interfaces import IValidateSEOKeywordsView
11
class ValidateSEOKeywordsView(BrowserView):
    """Browser view that checks whether SEO keywords occur on the page.

    The page of the context object is dumped to plain text with the
    external ``lynx`` browser and every keyword is searched in that text.
    """

    implements(IValidateSEOKeywordsView)

    def validateKeywords(self, text):
        """ see interface

        ``text`` holds one keyword (or phrase) per line; blank lines and
        repeated keywords are ignored.  Returns a message that either
        lists the keywords missing from the page text, reports that all
        of them were found, or explains why the check could not be done.
        """
        # Local import keeps this block self-contained; ``subprocess``
        # replaces the shell-based ``commands.getoutput`` call.
        import subprocess

        # extract keywords from text, dropping blank lines and duplicates
        # while keeping the order in which they were entered
        keywords = []
        seen = set()
        for line in text.lower().split('\n'):
            keyword = line.strip()
            if keyword and keyword not in seen:
                seen.add(keyword)
                keywords.append(keyword)
        if not keywords:
            return _(u'Keywords list is empty!')

        # request html page of context object
        url = '%s?qseo_without_additional_keywords=1' % self.context.absolute_url()

        # extract text of the page using lynx browser; the command is
        # passed as an argument list so the url never goes through a shell
        try:
            process = subprocess.Popen(['lynx', '--dump', '--nolist', url],
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            page_text = process.communicate()[0]
        except OSError:
            # raised when the lynx executable cannot be started
            return _(u'Could not find lynx browser!')

        # On interpreters where pipes yield bytes distinct from ``str``
        # decode them, so that text patterns can be searched in the dump.
        if not isinstance(page_text, str):
            page_text = page_text.decode('utf-8', 'replace')
        page_text = page_text.lower()
        if not page_text:
            # an empty dump is reported the same way as before
            return _(u'Could not find lynx browser!')

        # check every keyword on appearing in text of the page
        missing = [keyword for keyword in keywords
                   if not self._keywordPattern(keyword).search(page_text)]

        # return list of missing keywords
        if missing:
            msg = u"""Next keywords did not appear on the page:\n%s""" % '\n'.join(missing)
        else:
            msg = u"""All keywords found on the page!"""
        # NOTE(review): the message with the interpolated keywords is passed
        # to the message factory as is, exactly like before - confirm that
        # this is what the translation catalogs expect.
        return _(msg)

    def _keywordPattern(self, keyword):
        """Compile a pattern matching ``keyword`` as a whole word or phrase.

        The words of the keyword are escaped, so regular expression
        metacharacters typed by the user are matched literally.  Words are
        joined with ``\\s+`` so a phrase is still found when the text dump
        breaks it over several lines.  Lookarounds are used instead of
        ``\\b`` because ``\\b`` never matches next to a keyword that starts
        or ends with a non-word character (e.g. ``c++``).
        """
        words = [re.escape(word) for word in keyword.split()]
        return re.compile(r'(?<!\w)%s(?!\w)' % r'\s+'.join(words), re.I)

    def walkTextNodes(self, parent, page_words=None):
        """Collect lower-cased words of all text nodes below ``parent``.

        Words are appended to ``page_words`` (a new list is created when
        it is not given) and the list is returned as well.  The default is
        ``None`` rather than a list literal, so separate calls never share
        one accumulated list.
        """
        if page_words is None:
            page_words = []
        for node in parent.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                self.walkTextNodes(node, page_words)
            elif node.nodeType == Node.TEXT_NODE:
                value = node.nodeValue
                if value is not None:
                    page_words.extend([word.lower() for word in value.split()])
        return page_words

    def strip_tags(self, in_text):
        """Return ``in_text`` without anything enclosed in angle brackets.

        Everything from a ``<`` up to and including the next ``>`` is
        removed.  When a ``<`` is never closed, the rest of the text is
        dropped instead of raising ``IndexError``.
        """
        kept = []
        inside_tag = False
        for char in in_text:
            if inside_tag:
                # skip tag content; the right-angle bracket ends the tag
                if char == '>':
                    inside_tag = False
            elif char == '<':
                inside_tag = True
            else:
                kept.append(char)

        # convert the list back into text
        return ''.join(kept)
Note: See TracBrowser for help on using the repository browser.