[cache] factorize _validate_cache() logic implemented in wsgi and twisted handlers
authorAdrien Di Mascio <Adrien.DiMascio@logilab.fr>
Thu, 15 Mar 2012 17:59:27 +0100
changeset 8316 d5b1b75805dd
parent 8315 166e6d5d8e17
child 8318 e8a2fd7d9606
[cache] factorize _validate_cache() logic implemented in wsgi and twisted handlers
devtools/fake.py
etwist/http.py
etwist/request.py
web/httpcache.py
web/request.py
web/test/unittest_http.py
wsgi/request.py
--- a/devtools/fake.py	Thu Mar 15 17:57:40 2012 +0100
+++ b/devtools/fake.py	Thu Mar 15 17:59:27 2012 +0100
@@ -79,10 +79,6 @@
     def http_method(self):
         return self._http_method
 
-
-    def header_if_modified_since(self):
-        return None
-
     def relative_path(self, includeparams=True):
         """return the normalized path of the request (ie at least relative
         to the instance's root, but some other normalization may be needed
@@ -114,9 +110,6 @@
             return self.headers_out.getRawHeaders(header, [default])[0]
         return self.headers_out.getHeader(header, default)
 
-    def validate_cache(self):
-        pass
-
     def build_url_params(self, **kwargs):
         # overriden to get predictable resultts
         args = []
--- a/etwist/http.py	Thu Mar 15 17:57:40 2012 +0100
+++ b/etwist/http.py	Thu Mar 15 17:59:27 2012 +0100
@@ -43,19 +43,3 @@
 
     def __repr__(self):
         return "<%s.%s code=%d>" % (self.__module__, self.__class__.__name__, self._code)
-
-
-def not_modified_response(twisted_request, headers_in):
-    headers_out = Headers()
-
-    for header in (
-        # Required from sec 10.3.5:
-        'date', 'etag', 'content-location', 'expires',
-        'cache-control', 'vary',
-        # Others:
-        'server', 'proxy-authenticate', 'www-authenticate', 'warning'):
-        value = headers_in.getRawHeaders(header)
-        if value is not None:
-            headers_out.setRawHeaders(header, value)
-    return HTTPResponse(twisted_request=twisted_request,
-                        headers=headers_out)
--- a/etwist/request.py	Thu Mar 15 17:57:40 2012 +0100
+++ b/etwist/request.py	Thu Mar 15 17:59:27 2012 +0100
@@ -27,7 +27,6 @@
 from cubicweb.web.request import CubicWebRequestBase
 from cubicweb.web.httpcache import GMTOFFSET
 from cubicweb.web.http_headers import Headers
-from cubicweb.etwist.http import not_modified_response
 
 
 class CubicWebTwistedRequestAdapter(CubicWebRequestBase):
@@ -57,30 +56,3 @@
         if not includeparams:
             path = path.split('?', 1)[0]
         return path
-
-    def _validate_cache(self):
-        """raise a `DirectResponse` exception if a cached page along the way
-        exists and is still usable
-        """
-        if self.get_header('Cache-Control') in ('max-age=0', 'no-cache'):
-            # Expires header seems to be required by IE7
-            self.add_header('Expires', 'Sat, 01 Jan 2000 00:00:00 GMT')
-            return
-        # when using both 'Last-Modified' and 'ETag' response headers
-        # (i.e. using respectively If-Modified-Since and If-None-Match request
-        # headers, see
-        # http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.3.4 for
-        # reference
-        last_modified = self.headers_out.getHeader('last-modified')
-        if last_modified is not None:
-            status = self._twreq.setLastModified(last_modified)
-            if status != http.CACHED:
-                return
-        etag = self.headers_out.getRawHeaders('etag')
-        if etag is not None:
-            status = self._twreq.setETag(etag[0])
-            if status == http.CACHED:
-                response = not_modified_response(self._twreq, self._headers_in)
-                raise DirectResponse(response)
-        # Expires header seems to be required by IE7
-        self.add_header('Expires', 'Sat, 01 Jan 2000 00:00:00 GMT')
--- a/web/httpcache.py	Thu Mar 15 17:57:40 2012 +0100
+++ b/web/httpcache.py	Thu Mar 15 17:59:27 2012 +0100
@@ -147,3 +147,39 @@
 
 viewmod.StartupView.http_cache_manager = MaxAgeHTTPCacheManager
 viewmod.StartupView.cache_max_age = 60*60*2 # stay in http cache for 2 hours by default
+
+
+### HTTP Cache validator ############################################
+
+
+
+def get_validators(headers_in):
+    """return a list of http condition validators relevant to this request
+    """
+    result = []
+    for header, func in VALIDATORS:
+        value = headers_in.getHeader(header)
+        if value is not None:
+            result.append((func, value))
+    return result
+
+
+def if_modified_since(ref_date, headers_out):
+    last_modified = headers_out.getHeader('last-modified')
+    if last_modified is None:
+        return True
+    return ref_date < last_modified
+
+def if_none_match(tags, headers_out):
+    etag = headers_out.getHeader('etag')
+    if etag is None:
+        return True
+    return not ((etag in tags) or ('*' in tags))
+
+VALIDATORS = [
+    ('if-modified-since', if_modified_since),
+    #('if-unmodified-since', if_unmodified_since),
+    ('if-none-match', if_none_match),
+    #('if-modified-since', if_modified_since),
+]
+
--- a/web/request.py	Thu Mar 15 17:57:40 2012 +0100
+++ b/web/request.py	Thu Mar 15 17:59:27 2012 +0100
@@ -27,6 +27,7 @@
 from calendar import timegm
 from datetime import date, datetime
 from urlparse import urlsplit
+import httplib
 from itertools import count
 from warnings import warn
 
@@ -43,8 +44,8 @@
 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE_NOEXT
 from cubicweb.web import (INTERNAL_FIELD_VALUE, LOGGER, NothingToEdit,
                           RequestError, StatusResponse)
-from cubicweb.web.httpcache import GMTOFFSET
-from cubicweb.web.http_headers import Headers, Cookie
+from cubicweb.web.httpcache import GMTOFFSET, get_validators
+from cubicweb.web.http_headers import Headers, Cookie, parseDateTime
 
 _MARKER = object()
 
@@ -750,14 +751,33 @@
         return 'view'
 
     def validate_cache(self):
-        """raise a `DirectResponse` exception if a cached page along the way
+        """raise a `StatusResponse` exception if a cached page along the way
         exists and is still usable.
 
         calls the client-dependant implementation of `_validate_cache`
         """
-        self._validate_cache()
-        if self.http_method() == 'HEAD':
-            raise StatusResponse(200, '')
+        modified = True
+        if self.get_header('Cache-Control') not in ('max-age=0', 'no-cache'):
+            # Here, we search for any invalid 'not modified' condition
+            # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.3
+            validators = get_validators(self._headers_in)
+            if validators: # if we have no validators, the response is considered modified
+                modified = any(func(val, self.headers_out) for func, val in validators)
+        # Forge expected response
+        if modified:
+            if 'Expires' not in self.headers_out:
+                # Expires header seems to be required by IE7 -- Are you sure ?
+                self.add_header('Expires', 'Sat, 01 Jan 2000 00:00:00 GMT')
+            if self.http_method() == 'HEAD':
+                raise StatusResponse(200, '')
+            # /!\ no raise: the function returns and we keep processing the request
+        else:
+            # overwrite headers_out to forge a brand new not-modified response
+            self.headers_out = self._forge_cached_headers()
+            if self.http_method() in ('HEAD', 'GET'):
+                raise StatusResponse(httplib.NOT_MODIFIED)
+            else:
+                raise StatusResponse(httplib.PRECONDITION_FAILED)
 
     # abstract methods to override according to the web front-end #############
 
@@ -765,11 +785,19 @@
         """returns 'POST', 'GET', 'HEAD', etc."""
         raise NotImplementedError()
 
-    def _validate_cache(self):
-        """raise a `DirectResponse` exception if a cached page along the way
-        exists and is still usable
-        """
-        raise NotImplementedError()
+    def _forge_cached_headers(self):
+        # build the headers of a brand new not-modified response from the relevant incoming headers
+        headers = Headers()
+        for header in (
+            # Required from sec 10.3.5:
+            'date', 'etag', 'content-location', 'expires',
+            'cache-control', 'vary',
+            # Others:
+            'server', 'proxy-authenticate', 'www-authenticate', 'warning'):
+            value = self._headers_in.getRawHeaders(header)
+            if value is not None:
+                headers.setRawHeaders(header, value)
+        return headers
 
     def relative_path(self, includeparams=True):
         """return the normalized path of the request (ie at least relative
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/web/test/unittest_http.py	Thu Mar 15 17:59:27 2012 +0100
@@ -0,0 +1,282 @@
+from logilab.common.testlib import TestCase, unittest_main, tag, Tags
+
+from cubicweb.web import StatusResponse
+from cubicweb.devtools.fake import FakeRequest
+
+
+def _test_cache(hin, hout, method='GET'):
+    """forge and process a request
+
+    return status code and the request object
+
+    status is None if no cache is involved
+    """
+    # forge request
+    req = FakeRequest(method=method)
+    for key, value in hin:
+        req._headers_in.addRawHeader(key, str(value))
+    for key, value in hout:
+        req.headers_out.addRawHeader(key, str(value))
+    # process
+    status = None
+    try:
+        req.validate_cache()
+    except StatusResponse, ex:
+        status = ex.status
+    return status, req
+
+class HTTPCache(TestCase):
+    """Check that the http cache logic works as expected
+    (as far as we understood the RFC)
+
+    """
+    tags = TestCase.tags | Tags('http', 'cache')
+
+
+    def assertCache(self, expected, status, situation=''):
+        """simple assert for nicer message"""
+        if expected != status:
+            if expected is None:
+                expected = "MODIFIED"
+            if status is None:
+                status = "MODIFIED"
+            msg = 'expected %r got %r' % (expected, status)
+            if situation:
+                msg = "%s - when: %s" % (msg, situation)
+            self.fail(msg)
+
+    def test_IN_none_OUT_none(self):
+        #: test that no caching is requested when no data is available
+        #: on any side
+        status, req =_test_cache((),())
+        self.assertIsNone(status)
+
+    def test_IN_Some_OUT_none(self):
+        #: test that no caching is requested when no data is available
+        #: server (origin) side
+        hin = [('if-modified-since','Sat, 14 Apr 2012 14:39:32 GM'),
+              ]
+        status, req = _test_cache(hin, ())
+        self.assertIsNone(status)
+        hin = [('if-none-match','babar/huitre'),
+              ]
+        status, req = _test_cache(hin, ())
+        self.assertIsNone(status)
+        hin = [('if-modified-since','Sat, 14 Apr 2012 14:39:32 GM'),
+               ('if-none-match','babar/huitre'),
+              ]
+        status, req = _test_cache(hin, ())
+        self.assertIsNone(status)
+
+    def test_IN_none_OUT_Some(self):
+        #: test that no caching is requested when no data is provided
+        #: by the client
+        hout = [('last-modified','Sat, 14 Apr 2012 14:39:32 GM'),
+               ]
+        status, req = _test_cache((), hout)
+        self.assertIsNone(status)
+        hout = [('etag','babar/huitre'),
+               ]
+        status, req = _test_cache((), hout)
+        self.assertIsNone(status)
+        hout = [('last-modified', 'Sat, 14 Apr 2012 14:39:32 GM'),
+                ('etag','babar/huitre'),
+               ]
+        status, req = _test_cache((), hout)
+        self.assertIsNone(status)
+
+    @tag('last_modified')
+    def test_last_modified_newer(self):
+        #: test the proper behavior of modification date only
+        # newer
+        hin  = [('if-modified-since', 'Sat, 13 Apr 2012 14:39:32 GM'),
+               ]
+        hout = [('last-modified', 'Sat, 14 Apr 2012 14:39:32 GM'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(None, status, 'origin is newer than client')
+
+    @tag('last_modified')
+    def test_last_modified_older(self):
+        # older
+        hin  = [('if-modified-since', 'Sat, 15 Apr 2012 14:39:32 GM'),
+               ]
+        hout = [('last-modified', 'Sat, 14 Apr 2012 14:39:32 GM'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(304, status, 'origin is older than client')
+
+    @tag('last_modified')
+    def test_last_modified_same(self):
+        # same
+        hin  = [('if-modified-since', 'Sat, 14 Apr 2012 14:39:32 GM'),
+               ]
+        hout = [('last-modified', 'Sat, 14 Apr 2012 14:39:32 GM'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(304, status, 'origin is equal to client')
+
+    @tag('etag')
+    def test_etag_mismatch(self):
+        #: test the proper behavior of etag only
+        # etag mismatch
+        hin  = [('if-none-match', 'babar'),
+               ]
+        hout = [('etag', 'celestine'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(None, status, 'etag mismatch')
+
+    @tag('etag')
+    def test_etag_match(self):
+        # etag match
+        hin  = [('if-none-match', 'babar'),
+               ]
+        hout = [('etag', 'babar'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(304, status, 'etag match')
+        # etag match in multiple
+        hin  = [('if-none-match', 'loutre'),
+                ('if-none-match', 'babar'),
+               ]
+        hout = [('etag', 'babar'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(304, status, 'etag match in multiple')
+        # client use "*" as etag
+        hin  = [('if-none-match', '*'),
+               ]
+        hout = [('etag', 'babar'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(304, status, 'client use "*" as etag')
+
+    @tag('etag', 'last_modified')
+    def test_both(self):
+        #: test the proper behavior of etag and last-modified combined
+        # both wrong
+        hin  = [('if-none-match', 'babar'),
+                ('if-modified-since', 'Sat, 14 Apr 2012 14:39:32 GM'),
+               ]
+        hout = [('etag', 'loutre'),
+                ('last-modified', 'Sat, 15 Apr 2012 14:39:32 GM'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(None, status, 'both wrong')
+
+    @tag('etag', 'last_modified')
+    def test_both_etag_mismatch(self):
+        # both etag mismatch
+        hin  = [('if-none-match', 'babar'),
+                ('if-modified-since', 'Sat, 14 Apr 2012 14:39:32 GM'),
+               ]
+        hout = [('etag', 'loutre'),
+                ('last-modified', 'Sat, 13 Apr 2012 14:39:32 GM'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(None, status, 'both  but etag mismatch')
+
+    @tag('etag', 'last_modified')
+    def test_both_but_modified(self):
+        # both but modified
+        hin  = [('if-none-match', 'babar'),
+                ('if-modified-since', 'Sat, 14 Apr 2012 14:39:32 GM'),
+               ]
+        hout = [('etag', 'babar'),
+                ('last-modified', 'Sat, 15 Apr 2012 14:39:32 GM'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(None, status, 'both  but modified')
+
+    @tag('etag', 'last_modified')
+    def test_both_ok(self):
+        # both ok
+        hin  = [('if-none-match', 'babar'),
+                ('if-modified-since', 'Sat, 14 Apr 2012 14:39:32 GM'),
+               ]
+        hout = [('etag', 'babar'),
+                ('last-modified', 'Sat, 13 Apr 2012 14:39:32 GM'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(304, status, 'both ok')
+
+    @tag('etag', 'HEAD')
+    def test_head_verb(self):
+        #: check that a 200 (OK) status is properly raised without content on HEAD request
+        #: This logic does not really belong here :-/
+        # modified
+        hin  = [('if-none-match', 'babar'),
+               ]
+        hout = [('etag', 'rhino/really-not-babar'),
+               ]
+        status, req = _test_cache(hin, hout, method='HEAD')
+        self.assertCache(200, status, 'modifier HEAD verb')
+        # not modified
+        hin  = [('if-none-match', 'babar'),
+               ]
+        hout = [('etag', 'babar'),
+               ]
+        status, req = _test_cache(hin, hout, method='HEAD')
+        self.assertCache(304, status, 'not modifier HEAD verb')
+
+    @tag('etag', 'POST')
+    def test_post_verb(self):
+        # modified
+        hin  = [('if-none-match', 'babar'),
+               ]
+        hout = [('etag', 'rhino/really-not-babar'),
+               ]
+        status, req = _test_cache(hin, hout, method='POST')
+        self.assertCache(None, status, 'modifier HEAD verb')
+        # not modified
+        hin  = [('if-none-match', 'babar'),
+               ]
+        hout = [('etag', 'babar'),
+               ]
+        status, req = _test_cache(hin, hout, method='POST')
+        self.assertCache(412, status, 'not modifier HEAD verb')
+
+    @tag('expires')
+    def test_expires_added(self):
+        #: Check that Expires header is added:
+        #: - when the page is modified
+        #: - when none was already present
+        hin  = [('if-none-match', 'babar'),
+               ]
+        hout = [('etag', 'rhino/really-not-babar'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(None, status, 'modifier HEAD verb')
+        value = req.headers_out.getHeader('expires')
+        self.assertIsNotNone(value)
+
+    @tag('expires')
+    def test_expires_not_added(self):
+        #: Check that Expires header is not added if NOT-MODIFIED
+        hin  = [('if-none-match', 'babar'),
+               ]
+        hout = [('etag', 'babar'),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(304, status, 'not modifier HEAD verb')
+        value = req.headers_out.getHeader('expires')
+        self.assertIsNone(value)
+
+    @tag('expires')
+    def test_expires_no_overwrite(self):
+        #: Check that cache does not overwrite existing Expires header
+        hin  = [('if-none-match', 'babar'),
+               ]
+        DATE = 'Sat, 13 Apr 2012 14:39:32 GM'
+        hout = [('etag', 'rhino/really-not-babar'),
+                ('expires', DATE),
+               ]
+        status, req = _test_cache(hin, hout)
+        self.assertCache(None, status, 'not modifier HEAD verb')
+        value = req.headers_out.getRawHeaders('expires')
+        self.assertEqual(value, [DATE])
+
+
+if __name__ == '__main__':
+    unittest_main()
--- a/wsgi/request.py	Thu Mar 15 17:57:40 2012 +0100
+++ b/wsgi/request.py	Thu Mar 15 17:59:27 2012 +0100
@@ -150,19 +150,3 @@
         postdata = buf.getvalue()
         buf.close()
         return postdata
-
-    def _validate_cache(self):
-        """raise a `DirectResponse` exception if a cached page along the way
-        exists and is still usable
-        """
-        if self.get_header('Cache-Control') in ('max-age=0', 'no-cache'):
-            # Expires header seems to be required by IE7
-            self.add_header('Expires', 'Sat, 01 Jan 2000 00:00:00 GMT')
-            return
-#         try:
-#             http.checkPreconditions(self._twreq, _PreResponse(self))
-#         except http.HTTPError, ex:
-#             self.info('valid http cache, no actual rendering')
-#             raise DirectResponse(ex.response)
-        # Expires header seems to be required by IE7
-        self.add_header('Expires', 'Sat, 01 Jan 2000 00:00:00 GMT')