Changeset 2895 in products


Ignore:
Timestamp:
Oct 19, 2010 5:52:01 PM (14 years ago)
Author:
mylan
Message:

#233: Updated keywords calculation - replaced lynx usage with portal_transforms, added optional external page retrieval

File:
1 edited

Legend:

Unmodified
Added
Removed
  • quintagroup.seoptimizer/trunk/quintagroup/seoptimizer/browser/keywords.py

    r2890 r2895  
    1 import re, commands 
     1import re, commands, urllib2 
    22from xml.dom import Node 
    33 
    44from zope.interface import implements 
     5from zope.component import queryAdapter 
     6from zope.component import queryMultiAdapter 
    57from Products.Five.browser import BrowserView 
    68 
     
    1012from interfaces import IValidateSEOKeywordsView 
    1113from quintagroup.seoptimizer import SeoptimizerMessageFactory as _ 
    12  
    13 #from pyquery import PyQuery as pq 
     14from quintagroup.seoptimizer.browser.seo_configlet import ISEOConfigletSchema 
    1415 
    1516class ValidateSEOKeywordsView(BrowserView): 
     
    2122        text = self.request.get('text') 
    2223        ts = getToolByName(self.context, 'translation_service') 
     24        transforms = getToolByName(self.context, 'portal_transforms') 
     25        portal = getToolByName(self.context, 'portal_url').getPortalObject() 
     26        isExternal = queryAdapter(portal, ISEOConfigletSchema).external_keywords_test 
    2327        # extract keywords from text 
    2428        enc = getSiteEncoding(self.context) 
     
    3135                                 context=self.context) 
    3236        # request html page of context object 
    33         url = '%s?without_metatag_keywords=1' % self.context.absolute_url() 
    34  
    35         # extract words from the url page using the lynx browser (the test page at 'url' 
    36         # is rendered without metatag keywords) 
    37         #import pdb;pdb.set_trace() 
    38         page_text = commands.getoutput('lynx --dump --nolist %s' % url).lower() 
    39         if page_text and page_text != 'sh: lynx: command not found': 
    40             page_text = safe_unicode(page_text, 'utf-8') 
     37        if isExternal: 
     38            # Do not pass the timeout option because: 
     39            # 1. by default its value is taken from the global default timeout setting. 
     40            # 2. the timeout option was added in Python 2.6 (so it is only available in Plone 4+) 
     41            try: 
     42                html = urllib2.urlopen(self.context.absolute_url()) 
     43            except urllib2.URLError: 
     44                # In case the timeout period is exceeded 
     45                # or another URL connection error occurs. 
     46                html = unicode(self.context()).encode(enc) 
    4147        else: 
    42             return ts.utranslate(domain='quintagroup.seoptimizer', 
    43                                  msgid=_(u'Could not find lynx browser!'), 
    44                                  context=self.context) 
    45  
    46         # html = self.context() 
    47         # page_text = pq("body", html).text() 
    48  
     48            html = unicode(self.context()).encode(enc) 
     49        page_text = transforms.convert("html_to_text", html).getData() 
     50                                  
    4951        # check whether each keyword appears in the body of the html page 
    5052        result = [] 
Note: See TracChangeset for help on using the changeset viewer.