import urllib, re, os, commands
from xml.dom import minidom, Node

from zope.interface import implements
from Products.Five.browser import BrowserView

from Products.CMFPlone.utils import safe_unicode, getSiteEncoding
from Products.CMFCore.utils import getToolByName

from interfaces import IValidateSEOKeywordsView
from quintagroup.seoptimizer import SeoptimizerMessageFactory as _

class ValidateSEOKeywordsView(BrowserView):

    implements(IValidateSEOKeywordsView)

    def validateKeywords(self, text):
        """ see interface """
        ts = getToolByName(self.context, 'translation_service')
        # extract keywords from text
        enc = getSiteEncoding(self.context)
        if text.lower().strip():
            keywords = filter(None, map(lambda x: safe_unicode(x.strip(), enc),
                                        text.lower().strip().split('\n')))
        else:
            return ts.utranslate(domain='quintagroup.seoptimizer',
                                 msgid=_(u'Keywords list is empty!'),
                                 context=self.context)
        # request the html page of the context object
        url = '%s?without_metatag_keywords=1' % self.context.absolute_url()

        # extract words from the page at 'url' using the lynx browser
        # (the test page at 'url' is rendered without its keywords metatag)
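        # the shell command built below looks like, for example (URL is
        # hypothetical):
        #   lynx --dump --nolist http://localhost:8080/plone/front-page?without_metatag_keywords=1
        # --dump writes the text of the rendered page to stdout and --nolist
        # suppresses the numbered link list lynx would otherwise append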
        page_text = commands.getoutput('lynx --dump --nolist %s' % url).lower()
        if page_text and page_text != 'sh: lynx: command not found':
            page_text = safe_unicode(page_text, 'utf-8')
        else:
            return ts.utranslate(domain='quintagroup.seoptimizer',
                                 msgid=_(u'Could not find lynx browser!'),
                                 context=self.context)

        # count how many times each keyword appears in the body of the html page
        result = []
        for keyword in keywords:
            # escape the keyword so regexp metacharacters in it are taken literally
            keyword_on_page = unicode(len(re.findall(u'\\b%s\\b' % re.escape(keyword),
                                                     page_text, re.I|re.U)))
            result.append(' - '.join((keyword, keyword_on_page)))
        return ts.utranslate(domain='quintagroup.seoptimizer',
                             msgid=_(u'number_keywords',
                                     default=u'Number of keywords at page:\n${result}',
                                     mapping={'result': '\n'.join(result)}),
                             context=self.context)

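    # Note on validateKeywords above: the word-boundary regexp makes the count
    # whole-word and case-insensitive, e.g.
    #   re.findall(u'\\bplone\\b', u'plone: seo for plone sites', re.I|re.U)
    # yields two matches, so the keyword u'plone' would be reported as u'2'.
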
    def walkTextNodes(self, parent, page_words=None):
        # use None instead of a mutable default list, so repeated calls do not
        # accumulate words in one shared list
        if page_words is None:
            page_words = []
        for node in parent.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                self.walkTextNodes(node, page_words)
            elif node.nodeType == Node.TEXT_NODE:
                value = node.nodeValue
                if value is not None:
                    page_words.extend(map(lambda x: x.lower(), value.split()))
        return page_words

    def strip_tags(self, in_text):
        s_list = list(in_text)
        i = 0

        while i < len(s_list):
            if s_list[i] == '<':
                # pop everything from the left angle bracket up to the
                # matching right angle bracket
                while i < len(s_list) and s_list[i] != '>':
                    s_list.pop(i)

                # pop the right angle bracket, too (if the tag was closed)
                if i < len(s_list):
                    s_list.pop(i)
            else:
                i = i + 1

        # convert the list back into text
        return ''.join(s_list)
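
# Example use of the two helpers above, assuming a Zope/Plone environment in
# which this module's imports resolve (any context/request pair will do, since
# neither helper touches them):
#
#   view = ValidateSEOKeywordsView(context, request)
#   words = view.walkTextNodes(minidom.parseString('<p>Hello <b>SEO</b> world</p>'))
#   # words == [u'hello', u'seo', u'world']
#   view.strip_tags('<p>Hello <b>SEO</b> world</p>')
#   # -> 'Hello SEO world'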