import commands
import os
import pipes
import re
import urllib
from xml.dom import minidom, Node

from zope.interface import implements
from Products.Five.browser import BrowserView

from Products.CMFCore.utils import getToolByName
from Products.CMFPlone import PloneMessageFactory as _

from interfaces import IValidateSEOKeywordsView
---|
11 | |
---|
class ValidateSEOKeywordsView(BrowserView):
    """Check that the SEO keywords entered for the context object really
    occur in the rendered page, and report the ones that do not.
    """

    implements(IValidateSEOKeywordsView)

    def validateKeywords(self, text):
        """ see interface

        :param text: newline-separated keyword list, one keyword per line.
        :return: translated status message -- the list of keywords missing
            from the page, a success message, or an error message when the
            keyword list is empty or the lynx browser is not installed.
        """
        # extract keywords from text; drop blank lines so an empty entry
        # cannot later turn into an always-matching r'\b\b' pattern
        keywords = [line.strip() for line in text.lower().split('\n')
                    if line.strip()]
        if not keywords:
            return _(u'Keywords list is empty!')

        # request html page of context object
        url = '%s?qseo_without_additional_keywords=1' % self.context.absolute_url()

        # extract words from url page using lynx browser; pipes.quote keeps
        # shell metacharacters in the url from being interpreted by the
        # shell that commands.getoutput spawns
        page_text = commands.getoutput(
            'lynx --dump --nolist %s' % pipes.quote(url)).lower()
        if not page_text or page_text == 'sh: lynx: command not found':
            return _(u'Could not find lynx browser!')

        # check every keyword on appearing in body of html page;
        # `seen` keeps duplicated keywords from being reported twice
        missing = []
        seen = {}
        for keyword in keywords:
            if keyword in seen:
                continue
            seen[keyword] = 1
            # re.escape: keywords are literal text, not regex fragments
            if not re.search(r'\b%s\b' % re.escape(keyword), page_text, re.I):
                missing.append(keyword)

        # return list of missing keywords
        if missing:
            msg = u"""Next keywords did not appear on the page:\n%s""" % '\n'.join(missing)
        else:
            msg = u"""All keywords found on the page!"""
        return _(msg)

    def walkTextNodes(self, parent, page_words=None):
        """Recursively collect lower-cased words from every text node below
        *parent* (a minidom node) into *page_words*; returns that list.

        Helper retained for DOM-based page-text extraction (currently not
        called by validateKeywords).
        """
        # None-sentinel instead of a mutable default: the original []
        # default was shared between calls and accumulated words forever
        if page_words is None:
            page_words = []
        for node in parent.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                self.walkTextNodes(node, page_words)
            elif node.nodeType == Node.TEXT_NODE:
                value = node.nodeValue
                if value is not None:
                    page_words.extend([word.lower() for word in value.split()])
        return page_words

    def strip_tags(self, in_text):
        """Return *in_text* with every '<...>' tag removed.

        Works even on invalid html: an unterminated trailing '<...' is
        dropped instead of raising IndexError as the original list-popping
        implementation did.  Helper retained for regex-based page-text
        extraction (currently not called by validateKeywords).
        """
        chunks = []
        inside_tag = False
        for ch in in_text:
            if inside_tag:
                # swallow everything up to and including the closing '>'
                if ch == '>':
                    inside_tag = False
            elif ch == '<':
                inside_tag = True
            else:
                chunks.append(ch)
        # convert the list back into text
        return ''.join(chunks)