diff -r 058bb3dc685f -r 0b59724cb3f2 cubicweb/multipart.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cubicweb/multipart.py Sat Jan 16 13:48:51 2016 +0100 @@ -0,0 +1,416 @@ +# -*- coding: utf-8 -*- +''' +Parser for multipart/form-data +============================== + +This module provides a parser for the multipart/form-data format. It can read +from a file, a socket or a WSGI environment. The parser can be used to replace +cgi.FieldStorage (without the bugs) and works with Python 2.5+ and 3.x (2to3). + +Licence (MIT) +------------- + + Copyright (c) 2010, Marcel Hellkamp. + Inspired by the Werkzeug library: http://werkzeug.pocoo.org/ + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
##############################################################################
################################ Helper & Misc ################################
##############################################################################
# Some of these were copied from bottle: http://bottle.paws.de/

# ``MutableMapping`` lives in ``collections.abc`` since Python 3.3 and the old
# alias in ``collections`` was removed in Python 3.10, so the modern location
# is tried first and the legacy ones are only used as fallbacks.
try:
    from collections.abc import MutableMapping as DictMixin
except ImportError: # pragma: no cover (fallback for Python 2.6/2.7)
    try:
        from collections import MutableMapping as DictMixin
    except ImportError: # pragma: no cover (fallback for Python 2.5)
        from UserDict import DictMixin


class MultiDict(DictMixin):
    ''' A dict that remembers old values for each key.

        Values are stored in ``self.dict`` as one list per key. Item
        assignment appends to that list and item access returns its last
        element.
    '''

    def __init__(self, *a, **k):
        ''' Accept the same arguments as :class:`dict`; every initial value
            is stored through :meth:`append`. '''
        self.dict = dict()
        for key, value in dict(*a, **k).items():
            self[key] = value

    def __len__(self):
        return len(self.dict)

    def __iter__(self):
        return iter(self.dict)

    def __contains__(self, key):
        return key in self.dict

    def __delitem__(self, key):
        del self.dict[key]

    def keys(self):
        return self.dict.keys()

    def __getitem__(self, key):
        # KeyError is the sentinel that makes get() raise for missing keys.
        return self.get(key, KeyError, -1)

    def __setitem__(self, key, value):
        self.append(key, value)

    def append(self, key, value):
        ''' Add :value to the list of values stored for :key. '''
        self.dict.setdefault(key, []).append(value)

    def replace(self, key, value):
        ''' Drop all values stored for :key and store only :value. '''
        self.dict[key] = [value]

    def getall(self, key):
        ''' Return the list of values for :key (empty list if missing). '''
        return self.dict.get(key) or []

    def get(self, key, default=None, index=-1):
        ''' Return the value at position :index for :key.

            :param default: Returned if the key is missing. Passing the
                            ``KeyError`` class raises :exc:`KeyError` instead.
            :param index: Position in the value list (default: last value).
        '''
        if key not in self.dict:
            if default is KeyError:
                raise KeyError(key)
            # Return the default for any index; indexing a one-element list
            # here used to raise IndexError for indexes other than 0 and -1.
            return default
        return self.dict[key][index]

    def iterallitems(self):
        ''' Yield a ``(key, value)`` pair for every stored value. '''
        for key, values in self.dict.items():
            for value in values:
                yield key, value
def tob(data, enc='utf8'):
    ''' Convert strings to bytes (py2 and py3). Values that are not text
        are returned unchanged. '''
    return data.encode(enc) if isinstance(data, text_type) else data


def copy_file(stream, target, maxread=-1, buffer_size=2**16):
    ''' Read from :stream and write to :target until :maxread or EOF.

        :param stream: Object with a ``.read(size)`` method.
        :param target: Object with a ``.write(data)`` method.
        :param maxread: Maximum number of bytes to copy (negative: no limit).
        :param buffer_size: Maximum size of a single read.
        :return: The number of bytes copied.
    '''
    size, read, write = 0, stream.read, target.write
    while maxread < 0 or size < maxread:
        to_read = buffer_size if maxread < 0 else min(buffer_size, maxread-size)
        part = read(to_read)
        if not part:
            break
        write(part)
        size += len(part)
    return size

##############################################################################
################################ Header Parser ################################
##############################################################################

_special = re.escape('()<>@,;:\\"/[]?={} \t')
_re_special = re.compile('[%s]' % _special)
_qstr = '"(?:\\\\.|[^"])*"' # Quoted string
_value = '(?:[^%s]+|%s)' % (_special, _qstr) # Safe (unquoted) or quoted string
# Raw string: same pattern as before, without invalid '\s' string escapes.
_option = r'(?:;|^)\s*([^%s]+)\s*=\s*(%s)' % (_special, _value)
_re_option = re.compile(_option) # key=value part of a Content-Type like header


def header_quote(val):
    ''' Return :val unchanged if it contains no special character, else
        return it as a quoted string with backslashes and double quotes
        escaped. '''
    if not _re_special.search(val):
        return val
    return '"' + val.replace('\\', '\\\\').replace('"', '\\"') + '"'


def header_unquote(val, filename=False):
    ''' Strip the quotes of a quoted string and undo its escaping.

        :param val: The raw option value. Unquoted values are returned as is.
        :param filename: If true, a full Windows or UNC path (sent by IE6) is
                         reduced to its last path component.
    '''
    # Slices instead of indexes so that an empty value does not raise.
    if val[:1] == val[-1:] == '"':
        val = val[1:-1]
        # Only filenames get the path treatment; applying it to every option
        # would mangle values that merely start with an escaped backslash.
        if filename and (val[1:3] == ':\\' or val[:2] == '\\\\'):
            val = val.split('\\')[-1] # fix ie6 bug: full path --> filename
        return val.replace('\\\\', '\\').replace('\\"', '"')
    return val


def parse_options_header(header, options=None):
    ''' Split a Content-Type like header into its primary value and options.

        :param header: The header value, e.g. ``text/plain; charset=utf8``.
        :param options: Optional dict that receives the parsed options. It is
                        only used if the header contains a ``;``.
        :return: A ``(value, options)`` tuple. The value is lower-cased and
                 stripped; option keys are lower-cased, option values are
                 unquoted.
    '''
    ctype, sep, tail = header.partition(';')
    # Normalise the primary value in both branches; it used to be returned
    # verbatim whenever options were present.
    ctype = ctype.lower().strip()
    if not sep:
        return ctype, {}
    options = options or {}
    for match in _re_option.finditer(tail):
        key = match.group(1).lower()
        options[key] = header_unquote(match.group(2), key == 'filename')
    return ctype, options

##############################################################################
################################## Multipart ##################################
##############################################################################
class MultipartError(ValueError):
    ''' Raised when a multipart stream is malformed or exceeds a limit. '''


class MultipartParser(object):

    def __init__(self, stream, boundary, content_length=-1,
                 disk_limit=2**30, mem_limit=2**20, memfile_limit=2**18,
                 buffer_size=2**16, charset='latin1'):
        ''' Parse a multipart/form-data byte stream. This object is an iterator
            over the parts of the message.

            :param stream: A file-like stream. Must implement ``.read(size)``.
            :param boundary: The multipart boundary as a byte string.
            :param content_length: The maximum number of bytes to read.
            :param disk_limit: Upper bound for the size of unbuffered parts.
            :param mem_limit: Upper bound for the size of buffered parts
                              (capped by :disk_limit).
            :param memfile_limit: Size above which a part is moved from memory
                                  to a temporary file.
            :param buffer_size: Maximum size of a single read and of a single
                                line chunk (capped by :mem_limit).
            :param charset: Default charset handed to each part.
            :raise MultipartError: If the boundary does not fit into the
                                   buffer.
        '''
        self.stream, self.boundary = stream, boundary
        self.content_length = content_length
        self.disk_limit = disk_limit
        self.memfile_limit = memfile_limit
        self.mem_limit = min(mem_limit, self.disk_limit)
        self.buffer_size = min(buffer_size, self.mem_limit)
        self.charset = charset
        if self.buffer_size - 6 < len(boundary): # "--boundary--\r\n"
            raise MultipartError('Boundary does not fit into buffer_size.')
        self._done = []
        self._part_iter = None

    def __iter__(self):
        ''' Iterate over the parts of the multipart message. '''
        if not self._part_iter:
            self._part_iter = self._iterparse()
        # Replay the parts that were already parsed, then continue parsing.
        for part in self._done:
            yield part
        for part in self._part_iter:
            self._done.append(part)
            yield part

    def parts(self):
        ''' Returns a list with all parts of the multipart message. '''
        return list(iter(self))

    def get(self, name, default=None):
        ''' Return the first part with that name or a default value (None). '''
        for part in self:
            if name == part.name:
                return part
        return default

    def get_all(self, name):
        ''' Return a list of parts with that name. '''
        return [p for p in self if p.name == name]

    def _lineiter(self):
        ''' Iterate over a binary file-like object line by line. Each line is
            returned as a (line, line_ending) tuple. If the line does not fit
            into self.buffer_size, line_ending is empty and the rest of the line
            is returned with the next iteration.
        '''
        read = self.stream.read
        maxread, maxbuf = self.content_length, self.buffer_size
        _bcrnl, _bcr, _bnl, _bempty = b'\r\n', b'\r', b'\n', b''
        buffer = _bempty # buffer for the last (partial) line
        while True:
            data = read(maxbuf if maxread < 0 else min(maxbuf, maxread))
            maxread -= len(data)
            lines = (buffer+data).splitlines(True)
            # ``lines`` is empty for an empty stream; nothing to split then.
            # Otherwise be sure that the first line does not become too big.
            if lines and len(lines[0]) > self.buffer_size:
                first = lines[0]
                # at the same time don't split a '\r\n' accidentally
                if (len(first) == self.buffer_size+1 and
                        first.endswith(_bcrnl)):
                    splitpos = self.buffer_size - 1
                else:
                    splitpos = self.buffer_size
                lines[:1] = [first[:splitpos], first[splitpos:]]
            if data:
                # The last line may be incomplete: keep it for the next round.
                buffer = lines[-1]
                lines = lines[:-1]
            for line in lines:
                if line.endswith(_bcrnl): yield line[:-2], _bcrnl
                elif line.endswith(_bnl): yield line[:-1], _bnl
                elif line.endswith(_bcr): yield line[:-1], _bcr
                else: yield line, _bempty
            if not data:
                break

    def _complete(self, part):
        ''' Rewind the body of a finished part and return the part. '''
        if not part.file:
            # No blank line was seen, so the part never got a body file.
            raise MultipartError('Unexpected boundary in header section.')
        part.file.seek(0)
        return part

    def _iterparse(self):
        ''' Parse the stream and yield one :class:`MultipartPart` per part,
            with its body file rewound to the start.

            :raise MultipartError: If the stream does not start with the
                boundary, a part has no finished header section, a memory or
                disk limit is exceeded or the stream ends before the
                terminating boundary.
        '''
        lines, line = self._lineiter(), ''
        separator = tob('--') + tob(self.boundary)
        terminator = separator + tob('--')
        # Consume first boundary. Ignore leading blank lines
        for line, nl in lines:
            if line: break
        if line != separator:
            raise MultipartError("Stream does not start with boundary")
        # For each part in stream...
        mem_used, disk_used = 0, 0 # Track used resources to prevent DoS
        is_tail = False # True if the last line was incomplete (cut off)
        opts = {'buffer_size': self.buffer_size,
                'memfile_limit': self.memfile_limit,
                'charset': self.charset}
        part = MultipartPart(**opts)
        for line, nl in lines:
            if line == terminator and not is_tail:
                yield self._complete(part)
                break
            elif line == separator and not is_tail:
                if part.is_buffered(): mem_used += part.size
                else: disk_used += part.size
                yield self._complete(part)
                part = MultipartPart(**opts)
            else:
                is_tail = not nl # The next line continues this one
                part.feed(line, nl)
                if part.is_buffered():
                    if part.size + mem_used > self.mem_limit:
                        raise MultipartError("Memory limit reached.")
                elif part.size + disk_used > self.disk_limit:
                    raise MultipartError("Disk limit reached.")
        if line != terminator:
            raise MultipartError("Unexpected end of multipart stream.")


class MultipartPart(object):
    ''' A single part of a multipart message: its headers plus a body that is
        kept in memory or in a temporary file. '''

    def __init__(self, buffer_size=2**16, memfile_limit=2**18, charset='latin1'):
        self.headerlist = []
        self.headers = None
        self.file = False # replaced by a file object once the header ends
        self.size = 0
        self._buf = b'' # line ending held back until the next body line
        self.disposition, self.name, self.filename = None, None, None
        self.content_type, self.charset = None, charset
        self.memfile_limit = memfile_limit
        self.buffer_size = buffer_size
        # Defaults for attributes that finish_header() fills in.
        self.content_length = -1
        self.options = {}

    def feed(self, line, nl=b''):
        ''' Consume one line: header lines until the blank line that ends
            the header section, body lines afterwards. '''
        if self.file:
            return self.write_body(line, nl)
        return self.write_header(line, nl)

    def write_header(self, line, nl):
        ''' Parse one header line (bytes) and collect it in ``headerlist``.

            :raise MultipartError: If the line is incomplete or has no colon.
        '''
        line = line.decode(self.charset or 'latin1')
        if not nl: raise MultipartError('Unexpected end of line in header.')
        if not line.strip(): # blank line -> end of header segment
            self.finish_header()
        elif line[0] in ' \t' and self.headerlist:
            # Continuation line: append it to the previous header value.
            name, value = self.headerlist.pop()
            self.headerlist.append((name, value+line.strip()))
        else:
            if ':' not in line:
                raise MultipartError("Syntax error in header: No colon.")
            name, value = line.split(':', 1)
            self.headerlist.append((name.strip(), value.strip()))

    def write_body(self, line, nl):
        ''' Append one body line. Its line ending is held back and only
            written together with the next line.

            :raise MultipartError: If the body grows beyond Content-Length.
        '''
        if not line and not nl: return # This does not even flush the buffer
        self.size += len(line) + len(self._buf)
        self.file.write(self._buf + line)
        self._buf = nl
        if self.content_length > 0 and self.size > self.content_length:
            raise MultipartError('Size of body exceeds Content-Length header.')
        if self.size > self.memfile_limit and isinstance(self.file, BytesIO):
            # TODO: What about non-file uploads that exceed the memfile_limit?
            self.file, old = TemporaryFile(mode='w+b'), self.file
            old.seek(0)
            copy_file(old, self.file, self.size, self.buffer_size)

    def finish_header(self):
        ''' Create the body buffer and evaluate the collected headers.

            :raise MultipartError: If Content-Disposition is missing or
                                   Content-Length is not an integer.
        '''
        self.file = BytesIO()
        self.headers = Headers(self.headerlist)
        cdis = self.headers.get('Content-Disposition', '')
        ctype = self.headers.get('Content-Type', '')
        clen = self.headers.get('Content-Length', '-1')
        if not cdis:
            raise MultipartError('Content-Disposition header is missing.')
        self.disposition, self.options = parse_options_header(cdis)
        self.name = self.options.get('name')
        self.filename = self.options.get('filename')
        self.content_type, options = parse_options_header(ctype)
        self.charset = options.get('charset') or self.charset
        try:
            self.content_length = int(clen)
        except ValueError:
            # Report malformed input with the parser's own exception type.
            raise MultipartError('Invalid Content-Length header.')

    def is_buffered(self):
        ''' Return true if the data is fully buffered in memory.'''
        return isinstance(self.file, BytesIO)

    @property
    def value(self):
        ''' Data decoded with the specified charset '''
        pos = self.file.tell()
        self.file.seek(0)
        val = self.file.read()
        self.file.seek(pos)
        return val.decode(self.charset)

    def save_as(self, path):
        ''' Copy the whole body to a new file at :path and return the number
            of bytes written. The read position of the body is restored. '''
        pos = self.file.tell()
        # The context manager closes the target even if the copy fails.
        with open(path, 'wb') as fp:
            try:
                self.file.seek(0)
                size = copy_file(self.file, fp)
            finally:
                self.file.seek(pos)
        return size

##############################################################################
#################################### WSGI ####################################
##############################################################################

def parse_form_data(environ, charset='utf8', strict=False, **kw):
    ''' Parse form data from an environ dict and return a (forms, files) tuple.
        Both tuple values are dictionaries with the form-field name as a key
        (unicode) and lists as values (multiple values per key are possible).
        The forms-dictionary contains form-field values as unicode strings.
        The files-dictionary contains :class:`MultipartPart` instances, either
        because the form-field was a file-upload or the value is too big to fit
        into memory limits.

        :param environ: A WSGI environment dict.
        :param charset: The charset to use if unsure. (default: utf8)
        :param strict: If True, raise :exc:`MultipartError` on any parsing
                       errors. These are silently ignored by default.
        :param kw: Additional keyword arguments for :class:`MultipartParser`.
                   ``mem_limit`` also bounds url-encoded request bodies.
    '''

    forms, files = MultiDict(), MultiDict()
    try:
        if environ.get('REQUEST_METHOD', 'GET').upper() not in ('POST', 'PUT'):
            raise MultipartError("Request method other than POST or PUT.")
        try:
            # CONTENT_LENGTH may be absent or empty; both mean "unknown".
            content_length = int(environ.get('CONTENT_LENGTH') or -1)
        except ValueError:
            raise MultipartError("Invalid Content-Length header.")
        content_type = environ.get('CONTENT_TYPE', '')
        if not content_type:
            raise MultipartError("Missing Content-Type header.")
        content_type, options = parse_options_header(content_type)
        # Compare the media type case-insensitively.
        content_type = content_type.lower().strip()
        stream = environ.get('wsgi.input') or BytesIO()
        kw['charset'] = charset = options.get('charset', charset)
        if content_type == 'multipart/form-data':
            boundary = options.get('boundary', '')
            if not boundary:
                raise MultipartError("No boundary for multipart/form-data.")
            for part in MultipartParser(stream, boundary, content_length, **kw):
                if part.filename or not part.is_buffered():
                    files[part.name] = part
                else: # TODO: Big form-fields are in the files dict. really?
                    forms[part.name] = part.value
        elif content_type in ('application/x-www-form-urlencoded',
                              'application/x-url-encoded'):
            mem_limit = kw.get('mem_limit', 2**20)
            if content_length > mem_limit:
                raise MultipartError("Request too big. Increase MAXMEM.")
            if content_length < 0:
                data = stream.read(mem_limit)
                if stream.read(1): # There is more than fits into mem_limit
                    raise MultipartError("Request too big. Increase MAXMEM.")
            else:
                # Do not read past the declared length of the request body.
                data = stream.read(content_length)
            if PY3:
                try:
                    data = data.decode('ascii')
                except UnicodeDecodeError:
                    raise MultipartError("Request body is not url-encoded.")
            data = parse_qs(data, keep_blank_values=True)
            for key, values in data.items():
                for value in values:
                    if PY3:
                        forms[key] = value
                    else:
                        forms[key.decode(charset)] = value.decode(charset)
        else:
            raise MultipartError("Unsupported content type.")
    except MultipartError:
        if strict: raise
    return forms, files