import urllib, re, os, commands
from xml.dom import minidom, Node

from zope.interface import implements
from Products.Five.browser import BrowserView

from Products.CMFCore.utils import getToolByName

from interfaces import IValidateSEOKeywordsView
from quintagroup.seoptimizer import SeoptimizerMessageFactory as _

class ValidateSEOKeywordsView(BrowserView):

    implements(IValidateSEOKeywordsView)

    def validateKeywords(self, text):
        """ see interface """
        ts = getToolByName(self.context, 'translation_service')
        # extract keywords from the submitted text: one keyword or phrase per line
        if text.lower().strip():
            keywords = [kw.strip() for kw in text.lower().strip().split('\n')]
        else:
            return ts.utranslate(None, _(u'Keywords list is empty!'), context=self.context)
        # request the html page of the context object
        url = '%s?qseo_without_additional_keywords=1' % self.context.absolute_url()

        # the blocks below are earlier, disabled approaches to fetching and
        # extracting the page text in-process
        #try:
        #    page = urllib.urlopen(url)
        #except IOError:
        #    return _('Could not find requested page')

        #page_html = page.read()
        #if not page_html:
        #    return _('Page is empty')

        # extract words from the body of the html page

        # this block works only with valid html
        #doc = minidom.parseString(page_html)
        #rootNode = doc.documentElement
        #bodies = rootNode.getElementsByTagName('body')
        #if len(bodies) > 0:
        #    body = bodies[0]
        #else:
        #    return _(u'Invalid page html')
        #page_words = []
        #self.walkTextNodes(body, page_words)

        # this block works even with invalid html
        #pattern = re.compile('<\s*body[^>]*>(.*?)<\s*/\s*body\s*>', re.S|re.M|re.I)
        #search = pattern.search(page_html)
        #if search:
        #    body_html = search.group(1)
        #else:
        #    return _('Invalid html code on page')

        #page_text = self.strip_tags(body_html)
        #page_words = page_text.lower().split()

        # extract the page text by dumping the url with the lynx browser
        page_text = commands.getoutput('lynx --dump --nolist %s' % url).lower()
        # commands.getoutput() also captures the shell error printed when lynx is missing
        if page_text and page_text != 'sh: lynx: command not found':
            #page_words = page_text.lower().split()
            page_text = page_text.decode('utf8')
        else:
            return _(u'Could not find lynx browser!')

        # check every keyword for appearance in the body of the html page
        missing = []
        found = []
        # added/finded guard against reporting a duplicate keyword twice
        added = {}
        finded = {}
        for keyword in keywords:
            keyword = keyword.decode('utf8')
            if keyword:
                # escape the keyword so regex metacharacters are matched literally
                keyword_on_page = len(re.findall(u'\\b%s\\b' % re.escape(keyword),
                                                 page_text, re.I|re.U))
                if keyword not in added and not keyword_on_page:
                    missing.append(keyword)
                    added[keyword] = 1
                if keyword not in finded and keyword_on_page:
                    found.append(u'%s - %d' % (keyword, keyword_on_page))
                    finded[keyword] = 1
        # return the list of missing keywords, or a per-keyword count of matches
        if missing:
            msg = ts.utranslate(None,
                    _('missing_keywords',
                      default=u'The following keywords did not appear on the page:\n${missing}',
                      mapping={'missing': '\n'.join(missing)}),
                    context=self.context)
        else:
            msg = ts.utranslate(None,
                    _('finded_keywords',
                      default=u'All keywords were found on the page!\nDetails:\n${found}',
                      mapping={'found': '\n'.join(found)}),
                    context=self.context)
        return msg

    def walkTextNodes(self, parent, page_words=None):
        """ Recursively collect lower-cased words from all text nodes below parent. """
        # a mutable default argument would be shared between calls, so build the list here
        if page_words is None:
            page_words = []
        for node in parent.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                self.walkTextNodes(node, page_words)
            elif node.nodeType == Node.TEXT_NODE:
                value = node.nodeValue
                if value is not None:
                    page_words.extend([word.lower() for word in value.split()])
        return page_words

    def strip_tags(self, in_text):
        """ Remove everything between '<' and '>' (inclusive) from in_text. """
        s_list = list(in_text)
        i = 0

        while i < len(s_list):
            if s_list[i] == '<':
                # pop everything from the left-angle bracket until the right-angle bracket
                while i < len(s_list) and s_list[i] != '>':
                    s_list.pop(i)
                # pop the right-angle bracket, too (guard against an unclosed tag)
                if i < len(s_list):
                    s_list.pop(i)
            else:
                i = i + 1

        # convert the list back into text
        return ''.join(s_list)
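
# ---------------------------------------------------------------------------
# Hypothetical local smoke test, not part of the product: a minimal sketch
# (assuming the module can be imported outside of a running Zope instance)
# that exercises strip_tags() and the word-boundary matching used in
# validateKeywords(). The markup and keyword list are invented for
# illustration only.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    view = ValidateSEOKeywordsView(None, None)   # dummy context/request
    sample_html = '<p>Plone <b>SEO</b> optimizer</p>'
    sample_text = view.strip_tags(sample_html).lower()
    print sample_text                            # -> 'plone seo optimizer'
    for kw in (u'seo', u'robots'):
        hits = len(re.findall(u'\\b%s\\b' % re.escape(kw),
                              sample_text, re.I | re.U))
        print '%s: %d' % (kw, hits)              # seo: 1, robots: 0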