[http] implement 1587305: provide better implementation of Accept header parsing + tests
author Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
Fri, 01 Apr 2011 14:34:11 +0200
changeset 7164 93a19c1831aa
parent 7162 62561ea082d2
child 7165 b817d44cb606
[http] implement 1587305: provide better implementation of Accept header parsing + tests
web/request.py
web/test/unittest_request.py
--- a/web/request.py	Fri Apr 01 14:25:55 2011 +0200
+++ b/web/request.py	Fri Apr 01 14:34:11 2011 +0200
@@ -734,26 +734,14 @@
         return None, None
 
     def parse_accept_header(self, header):
-        """returns an ordered list of preferred languages"""
+        """returns an ordered list of accepted values"""
+        try:
+            value_parser, value_sort_key = ACCEPT_HEADER_PARSER[header.lower()]
+        except KeyError:
+            value_parser = value_sort_key = None
         accepteds = self.get_header(header, '')
-        values = []
-        for info in accepteds.split(','):
-            try:
-                value, scores = info.split(';', 1)
-            except ValueError:
-                value = info
-                score = 1.0
-            else:
-                for score in scores.split(';'):
-                    try:
-                        scorekey, scoreval = score.split('=')
-                        if scorekey == 'q': # XXX 'level'
-                            score = float(scoreval)
-                    except ValueError:
-                        continue
-            values.append((score, value))
-        values.sort(reverse=True)
-        return (value for (score, value) in values)
+        values = _parse_accept_header(accepteds, value_parser, value_sort_key)
+        return (raw_value for (raw_value, parsed_value, score) in values)
 
     def header_if_modified_since(self):
         """If the HTTP header If-modified-since is set, return the equivalent
@@ -858,5 +846,91 @@
                 self.parse_accept_header('Accept-Language')]
 
 
+
+## HTTP-accept parsers / utilities ############################################
+def _mimetype_sort_key(accept_info):
+    """accepted mimetypes must be sorted by :
+
+    1/ highest score first
+    2/ most specific mimetype first, e.g. :
+       - 'text/html;level=1' is more specific than 'text/html'
+       - 'text/html' is more specific than 'text/*'
+       - 'text/*' is itself more specific than '*/*'
+
+    """
+    raw_value, (media_type, media_subtype, media_type_params), score = accept_info
+    # FIXME: handle '+' in media_subtype ? (should xhtml+xml have a
+    # higher precedence than xml ?)
+    if media_subtype == '*':
+        score -= 0.0001
+    if media_type == '*':
+        score -= 0.0001
+    return 1./score, media_type, media_subtype, 1./(1+len(media_type_params))
+
+def _charset_sort_key(accept_info):
+    """accepted charsets must be sorted by :
+
+    1/ highest score first
+    2/ most specific charset first, e.g. :
+       - 'utf-8' is more specific than '*'
+    """
+    raw_value, value, score = accept_info
+    if value == '*':
+        score -= 0.0001
+    return 1./score, value
+
+def _parse_accept_header(raw_header, value_parser=None, value_sort_key=None):
+    """returns an ordered list of accepted types
+
+    returned value is a list of 3-tuples (raw_value, parsed_value, score),
+    ordered by `value_sort_key` (highest score first by default). Exact type
+    of `parsed_value` depends on what `value_parser` returns. If `value_parser`
+    is None, then the raw value, as found in the http header, is used.
+    """
+    if value_sort_key is None:
+        value_sort_key = lambda infos: 1./infos[-1]
+    values = []
+    for info in raw_header.split(','):
+        score = 1.0
+        other_params = {}
+        try:
+            value, infodef = info.split(';', 1)
+        except ValueError:
+            value = info
+        else:
+            for info in infodef.split(';'):
+                try:
+                    infokey, infoval = info.split('=')
+                    if infokey == 'q': # XXX 'level'
+                        score = float(infoval)
+                        continue
+                except ValueError:
+                    continue
+                other_params[infokey] = infoval
+        parsed_value = value_parser(value, other_params) if value_parser else value
+        values.append( (value.strip(), parsed_value, score) )
+    values.sort(key=value_sort_key)
+    return values
+
+
+def _mimetype_parser(value, other_params):
+    """return a 3-tuple
+    (type, subtype, type_params) corresponding to the mimetype definition
+    e.g. : for 'text/*', `mimetypeinfo` will be ('text', '*', {}), for
+    'text/html;level=1', `mimetypeinfo` will be ('text', 'html', {'level': '1'})
+    """
+    try:
+        media_type, media_subtype = value.strip().split('/')
+    except ValueError: # safety belt : '/' should always be present
+        media_type = value.strip()
+        media_subtype = '*'
+    return (media_type, media_subtype, other_params)
+
+
+ACCEPT_HEADER_PARSER = {
+    'accept': (_mimetype_parser, _mimetype_sort_key),
+    'accept-charset': (None, _charset_sort_key),
+    }
+
 from cubicweb import set_log_methods
 set_log_methods(CubicWebRequestBase, LOGGER)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/web/test/unittest_request.py	Fri Apr 01 14:34:11 2011 +0200
@@ -0,0 +1,69 @@
+"""misc. unittests for utility functions
+"""
+
+from logilab.common.testlib import TestCase, unittest_main
+
+from functools import partial
+
+from cubicweb.web.request import (_parse_accept_header,
+                                  _mimetype_sort_key, _mimetype_parser, _charset_sort_key)
+
+
+
+class AcceptParserTC(TestCase):
+
+    def test_parse_accept(self):
+        parse_accept_header = partial(_parse_accept_header,
+                                      value_parser=_mimetype_parser,
+                                      value_sort_key=_mimetype_sort_key)
+        # compare scores
+        self.assertEqual(parse_accept_header("audio/*;q=0.2, audio/basic"),
+                         [( ('audio/basic', ('audio', 'basic', {}), 1.0 ) ),
+                          ( ('audio/*', ('audio', '*', {}), 0.2 ) )])
+        self.assertEqual(parse_accept_header("text/plain;q=0.5, text/html, text/x-dvi;q=0.8, text/x-c"),
+                         [( ('text/html', ('text', 'html', {}), 1.0 ) ),
+                          ( ('text/x-c', ('text', 'x-c', {}), 1.0 ) ),
+                          ( ('text/x-dvi', ('text', 'x-dvi', {}), 0.8 ) ),
+                          ( ('text/plain', ('text', 'plain', {}), 0.5 ) )])
+        # compare mimetype precedence for a same given score
+        self.assertEqual(parse_accept_header("audio/*, audio/basic"),
+                         [( ('audio/basic', ('audio', 'basic', {}), 1.0 ) ),
+                          ( ('audio/*', ('audio', '*', {}), 1.0 ) )])
+        self.assertEqual(parse_accept_header("text/*, text/html, text/html;level=1, */*"),
+                         [( ('text/html', ('text', 'html', {'level': '1'}), 1.0 ) ),
+                          ( ('text/html', ('text', 'html', {}), 1.0 ) ),
+                          ( ('text/*', ('text', '*', {}), 1.0 ) ),
+                          ( ('*/*', ('*', '*', {}), 1.0 ) )])
+        # free party
+        self.assertEqual(parse_accept_header("text/*;q=0.3, text/html;q=0.7, text/html;level=1, text/html;level=2;q=0.4, */*;q=0.5"),
+                         [( ('text/html', ('text', 'html', {'level': '1'}), 1.0 ) ),
+                          ( ('text/html', ('text', 'html', {}), 0.7 ) ),
+                          ( ('*/*', ('*', '*', {}), 0.5 ) ),
+                          ( ('text/html', ('text', 'html', {'level': '2'}), 0.4 ) ),
+                          ( ('text/*', ('text', '*', {}), 0.3 ) )
+                          ])
+        # chrome sample header
+        self.assertEqual(parse_accept_header("application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"),
+                         [( ('application/xhtml+xml', ('application', 'xhtml+xml', {}), 1.0 ) ),
+                          ( ('application/xml', ('application', 'xml', {}), 1.0 ) ),
+                          ( ('image/png', ('image', 'png', {}), 1.0 ) ),
+                          ( ('text/html', ('text', 'html', {}), 0.9 ) ),
+                          ( ('text/plain', ('text', 'plain', {}), 0.8 ) ),
+                          ( ('*/*', ('*', '*', {}), 0.5 ) ),
+                          ])
+
+    def test_parse_accept_language(self):
+        self.assertEqual(_parse_accept_header('fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3'),
+                         [('fr', 'fr', 1.0), ('fr-fr', 'fr-fr', 0.8),
+                          ('en-us', 'en-us', 0.5), ('en', 'en', 0.3)])
+
+    def test_parse_accept_charset(self):
+        parse_accept_header = partial(_parse_accept_header,
+                                      value_sort_key=_charset_sort_key)
+        self.assertEqual(parse_accept_header('ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
+                         [('ISO-8859-1', 'ISO-8859-1', 1.0),
+                          ('utf-8', 'utf-8', 0.7),
+                          ('*', '*', 0.7)])
+
+if __name__ == '__main__':
+    unittest_main()