web/views/embedding.py
author Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
Mon, 16 Feb 2009 20:40:12 +0100
branchtls-sprint
changeset 642 2cf7b79d8e77
parent 431 18b4dd650ef8
child 631 99f5852f8604
permissions -rw-r--r--
[javascript] rewrote loadxhtml() plugin in order to be able to pass the XHR object to postAjaxLoad

"""Objects interacting together to provides the external page embeding
functionality.


:organization: Logilab
:copyright: 2001-2008 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
"""
__docformat__ = "restructuredtext en"

import re
from urlparse import urljoin
from urllib2 import urlopen, Request, HTTPError

from logilab.mtconverter import guess_encoding

from cubicweb import urlquote # XXX should use view.url_quote method
from cubicweb.interfaces import IEmbedable
from cubicweb.common.uilib import soup2xhtml
from cubicweb.common.selectors import (one_line_rset, score_entity_selector,
                                    match_search_state, implement_interface)
from cubicweb.common.view import NOINDEX, NOFOLLOW
from cubicweb.web.controller import Controller
from cubicweb.web.action import Action
from cubicweb.web.views import basetemplates


class ExternalTemplate(basetemplates.TheMainTemplate):
    """template embeding an external web pages into CubicWeb web interface
    """
    id = 'external'
    
    def call(self, body):
        # XXX fallback to HTML 4 mode when embeding ?
        self.set_request_content_type()
        self.process_rql(self.req.form.get('rql'))
        self.req.search_state = ('normal',)
        self.template_header(self.content_type, None, self.req._('external page'),
                             [NOINDEX, NOFOLLOW])
        self.content_header()
        self.w(body)
        self.content_footer()
        self.template_footer()


class EmbedController(Controller):
    id = 'embed'
    template = 'external'

    def publish(self, rset=None):
        req = self.req
        if 'custom_css' in req.form:
            req.add_css(req.form['custom_css'])
        embedded_url = req.form['url']
        allowed = self.config['embed-allowed']
        _ = req._
        if allowed is None or not allowed.match(embedded_url):
            body = '<h2>%s</h2><h3>%s</h3>' % (
                _('error while embedding page'),
                _('embedding this url is forbidden'))
        else:
            prefix = req.build_url(self.id, url='')
            authorization = req.get_header('Authorization')
            if authorization:
                headers = {'Authorization' : authorization}
            else:
                headers = {}
            try:
                body = embed_external_page(embedded_url, prefix,
                                           headers, req.form.get('custom_css'))
                body = soup2xhtml(body, self.req.encoding)
            except HTTPError, err:
                body = '<h2>%s</h2><h3>%s</h3>' % (
                    _('error while embedding page'), err)
        return self.vreg.main_template(req, self.template, body=body)


class EmbedAction(Action):
    """display an 'embed' link on entity implementing `embeded_url` method
    if the returned url match embeding configuration
    """
    id = 'embed'
    controller = 'embed'
    __selectors__ = (one_line_rset, match_search_state,
                     implement_interface, score_entity_selector)
    accepts_interfaces = (IEmbedable,)
    
    title = _('embed')
        
    @classmethod
    def score_entity(cls, entity):
        """return a score telling how well I can display the given 
        entity instance (required by the value_selector)
        """
        url = entity.embeded_url()
        if not url or not url.strip():
            return 0
        allowed = cls.config['embed-allowed']
        if allowed is None or not allowed.match(url):
            return 0
        return 1
    
    def url(self, row=0):
        entity = self.rset.get_entity(row, 0)
        url = urljoin(self.req.base_url(), entity.embeded_url())
        if self.req.form.has_key('rql'):
            return self.build_url(url=url, rql=self.req.form['rql'])
        return self.build_url(url=url)



# functions doing necessary substitutions to embed an external html page ######


BODY_RGX = re.compile('<body.*?>(.*?)</body>', re.I | re.S | re.U)
HREF_RGX = re.compile('<a\s+href="([^"]*)"', re.I | re.S | re.U)
SRC_RGX = re.compile('<img\s+src="([^"]*)"', re.I | re.S | re.U)


class replace_href:
    def __init__(self, prefix, custom_css=None):
        self.prefix = prefix
        self.custom_css = custom_css
        
    def __call__(self, match):
        original_url = match.group(1)
        url = self.prefix + urlquote(original_url, safe='')
        if self.custom_css is not None:
            if '?' in url:
                url = '%s&amp;custom_css=%s' % (url, self.custom_css)
            else:
                url = '%s?custom_css=%s' % (url, self.custom_css)
        return '<a href="%s"' % url

class absolutize_links:
    def __init__(self, embedded_url, tag, custom_css=None):
        self.embedded_url = embedded_url
        self.tag = tag
        self.custom_css = custom_css
    
    def __call__(self, match):
        original_url = match.group(1)
        if '://' in original_url:
            return match.group(0) # leave it unchanged
        return '%s="%s"' % (self.tag, urljoin(self.embedded_url, original_url))


def prefix_links(body, prefix, embedded_url, custom_css=None):
    filters = ((HREF_RGX, absolutize_links(embedded_url, '<a href', custom_css)),
               (SRC_RGX, absolutize_links(embedded_url, '<img src')),
               (HREF_RGX, replace_href(prefix, custom_css)))
    for rgx, repl in filters:
        body = rgx.sub(repl, body)
    return body
    
def embed_external_page(url, prefix, headers=None, custom_css=None):
    req = Request(url, headers=(headers or {}))
    content = urlopen(req).read()
    page_source = unicode(content, guess_encoding(content), 'replace')
    page_source =page_source
    match = BODY_RGX.search(page_source)
    if match is None:
        return page_source
    return prefix_links(match.group(1), prefix, url, custom_css)