multipart.py
changeset 11057 0b59724cb3f2
parent 11052 058bb3dc685f
child 11058 23eb30449fe5
--- a/multipart.py	Mon Jan 04 18:40:30 2016 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,416 +0,0 @@
-# -*- coding: utf-8 -*-
-'''
-Parser for multipart/form-data
-==============================
-
-This module provides a parser for the multipart/form-data format. It can read
-from a file, a socket or a WSGI environment. The parser can be used to replace
-cgi.FieldStorage (without the bugs) and works with Python 2.5+ and 3.x (2to3).
-
-Licence (MIT)
--------------
-
-    Copyright (c) 2010, Marcel Hellkamp.
-    Inspired by the Werkzeug library: http://werkzeug.pocoo.org/
-
-    Permission is hereby granted, free of charge, to any person obtaining a copy
-    of this software and associated documentation files (the "Software"), to deal
-    in the Software without restriction, including without limitation the rights
-    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-    copies of the Software, and to permit persons to whom the Software is
-    furnished to do so, subject to the following conditions:
-
-    The above copyright notice and this permission notice shall be included in
-    all copies or substantial portions of the Software.
-
-    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-    THE SOFTWARE.
-
-'''
-
-__author__ = 'Marcel Hellkamp'
-__version__ = '0.1'
-__license__ = 'MIT'
-
-from tempfile import TemporaryFile
-from wsgiref.headers import Headers
-import re, sys
-try:
-    from io import BytesIO
-except ImportError: # pragma: no cover (fallback for Python 2.5)
-    from StringIO import StringIO as BytesIO
-
-from six import PY3, text_type
-from six.moves.urllib.parse import parse_qs
-
-##############################################################################
-################################ Helper & Misc ################################
-##############################################################################
-# Some of these were copied from bottle: http://bottle.paws.de/
-
-try:
-    from collections import MutableMapping as DictMixin
-except ImportError: # pragma: no cover (fallback for Python 2.5)
-    from UserDict import DictMixin
-
-class MultiDict(DictMixin):
-    """ A dict that remembers old values for each key """
-    def __init__(self, *a, **k):
-        self.dict = dict()
-        for k, v in dict(*a, **k).items():
-            self[k] = v
-
-    def __len__(self): return len(self.dict)
-    def __iter__(self): return iter(self.dict)
-    def __contains__(self, key): return key in self.dict
-    def __delitem__(self, key): del self.dict[key]
-    def keys(self): return self.dict.keys()
-    def __getitem__(self, key): return self.get(key, KeyError, -1)
-    def __setitem__(self, key, value): self.append(key, value)
-
-    def append(self, key, value): self.dict.setdefault(key, []).append(value)
-    def replace(self, key, value): self.dict[key] = [value]
-    def getall(self, key): return self.dict.get(key) or []
-
-    def get(self, key, default=None, index=-1):
-        if key not in self.dict and default != KeyError:
-            return [default][index]
-        return self.dict[key][index]
-
-    def iterallitems(self):
-        for key, values in self.dict.items():
-            for value in values:
-                yield key, value
-
-def tob(data, enc='utf8'): # Convert strings to bytes (py2 and py3)
-    return data.encode(enc) if isinstance(data, text_type) else data
-
-def copy_file(stream, target, maxread=-1, buffer_size=2*16):
-    ''' Read from :stream and write to :target until :maxread or EOF. '''
-    size, read = 0, stream.read
-    while 1:
-        to_read = buffer_size if maxread < 0 else min(buffer_size, maxread-size)
-        part = read(to_read)
-        if not part: return size
-        target.write(part)
-        size += len(part)
-
-##############################################################################
-################################ Header Parser ################################
-##############################################################################
-
-_special = re.escape('()<>@,;:\\"/[]?={} \t')
-_re_special = re.compile('[%s]' % _special)
-_qstr = '"(?:\\\\.|[^"])*"' # Quoted string
-_value = '(?:[^%s]+|%s)' % (_special, _qstr) # Save or quoted string
-_option = '(?:;|^)\s*([^%s]+)\s*=\s*(%s)' % (_special, _value)
-_re_option = re.compile(_option) # key=value part of an Content-Type like header
-
-def header_quote(val):
-    if not _re_special.search(val):
-        return val
-    return '"' + val.replace('\\','\\\\').replace('"','\\"') + '"'
-
-def header_unquote(val, filename=False):
-    if val[0] == val[-1] == '"':
-        val = val[1:-1]
-        if val[1:3] == ':\\' or val[:2] == '\\\\': 
-            val = val.split('\\')[-1] # fix ie6 bug: full path --> filename
-        return val.replace('\\\\','\\').replace('\\"','"')
-    return val
-
-def parse_options_header(header, options=None):
-    if ';' not in header:
-        return header.lower().strip(), {}
-    ctype, tail = header.split(';', 1)
-    options = options or {}
-    for match in _re_option.finditer(tail):
-        key = match.group(1).lower()
-        value = header_unquote(match.group(2), key=='filename')
-        options[key] = value
-    return ctype, options
-
-##############################################################################
-################################## Multipart ##################################
-##############################################################################
-
-
-class MultipartError(ValueError): pass
-
-
-class MultipartParser(object):
-    
-    def __init__(self, stream, boundary, content_length=-1,
-                 disk_limit=2**30, mem_limit=2**20, memfile_limit=2**18,
-                 buffer_size=2**16, charset='latin1'):
-        ''' Parse a multipart/form-data byte stream. This object is an iterator
-            over the parts of the message.
-            
-            :param stream: A file-like stream. Must implement ``.read(size)``.
-            :param boundary: The multipart boundary as a byte string.
-            :param content_length: The maximum number of bytes to read.
-        '''
-        self.stream, self.boundary = stream, boundary
-        self.content_length = content_length
-        self.disk_limit = disk_limit
-        self.memfile_limit = memfile_limit
-        self.mem_limit = min(mem_limit, self.disk_limit)
-        self.buffer_size = min(buffer_size, self.mem_limit)
-        self.charset = charset
-        if self.buffer_size - 6 < len(boundary): # "--boundary--\r\n"
-            raise MultipartError('Boundary does not fit into buffer_size.')
-        self._done = []
-        self._part_iter = None
-    
-    def __iter__(self):
-        ''' Iterate over the parts of the multipart message. '''
-        if not self._part_iter:
-            self._part_iter = self._iterparse()
-        for part in self._done:
-            yield part
-        for part in self._part_iter:
-            self._done.append(part)
-            yield part
-    
-    def parts(self):
-        ''' Returns a list with all parts of the multipart message. '''
-        return list(iter(self))
-    
-    def get(self, name, default=None):
-        ''' Return the first part with that name or a default value (None). '''
-        for part in self:
-            if name == part.name:
-                return part
-        return default
-
-    def get_all(self, name):
-        ''' Return a list of parts with that name. '''
-        return [p for p in self if p.name == name]
-
-    def _lineiter(self):
-        ''' Iterate over a binary file-like object line by line. Each line is
-            returned as a (line, line_ending) tuple. If the line does not fit
-            into self.buffer_size, line_ending is empty and the rest of the line
-            is returned with the next iteration.
-        '''
-        read = self.stream.read
-        maxread, maxbuf = self.content_length, self.buffer_size
-        _bcrnl = tob('\r\n')
-        _bcr = _bcrnl[:1]
-        _bnl = _bcrnl[1:]
-        _bempty = _bcrnl[:0] # b'rn'[:0] -> b''
-        buffer = _bempty # buffer for the last (partial) line
-        while 1:
-            data = read(maxbuf if maxread < 0 else min(maxbuf, maxread))
-            maxread -= len(data)
-            lines = (buffer+data).splitlines(True)
-            len_first_line = len(lines[0])
-            # be sure that the first line does not become too big
-            if len_first_line > self.buffer_size:
-                # at the same time don't split a '\r\n' accidentally
-                if (len_first_line == self.buffer_size+1 and
-                    lines[0].endswith(_bcrnl)):
-                    splitpos = self.buffer_size - 1
-                else:
-                    splitpos = self.buffer_size
-                lines[:1] = [lines[0][:splitpos],
-                             lines[0][splitpos:]]
-            if data:
-                buffer = lines[-1]
-                lines = lines[:-1]
-            for line in lines:
-                if line.endswith(_bcrnl): yield line[:-2], _bcrnl
-                elif line.endswith(_bnl): yield line[:-1], _bnl
-                elif line.endswith(_bcr): yield line[:-1], _bcr
-                else:                     yield line, _bempty
-            if not data:
-                break
-    
-    def _iterparse(self):
-        lines, line = self._lineiter(), ''
-        separator = tob('--') + tob(self.boundary)
-        terminator = tob('--') + tob(self.boundary) + tob('--')
-        # Consume first boundary. Ignore leading blank lines
-        for line, nl in lines:
-            if line: break
-        if line != separator:
-            raise MultipartError("Stream does not start with boundary")
-        # For each part in stream...
-        mem_used, disk_used = 0, 0 # Track used resources to prevent DoS
-        is_tail = False # True if the last line was incomplete (cutted)
-        opts = {'buffer_size': self.buffer_size,
-                'memfile_limit': self.memfile_limit,
-                'charset': self.charset}
-        part = MultipartPart(**opts)
-        for line, nl in lines:
-            if line == terminator and not is_tail:
-                part.file.seek(0)
-                yield part
-                break
-            elif line == separator and not is_tail:
-                if part.is_buffered(): mem_used  += part.size
-                else:                  disk_used += part.size
-                part.file.seek(0)
-                yield part
-                part = MultipartPart(**opts)
-            else:
-                is_tail = not nl # The next line continues this one
-                part.feed(line, nl)
-                if part.is_buffered():
-                    if part.size + mem_used > self.mem_limit:
-                        raise MultipartError("Memory limit reached.")
-                elif part.size + disk_used > self.disk_limit:
-                    raise MultipartError("Disk limit reached.")
-        if line != terminator:
-            raise MultipartError("Unexpected end of multipart stream.")
-            
-
-class MultipartPart(object):
-    
-    def __init__(self, buffer_size=2**16, memfile_limit=2**18, charset='latin1'):
-        self.headerlist = []
-        self.headers = None
-        self.file = False
-        self.size = 0
-        self._buf = tob('')
-        self.disposition, self.name, self.filename = None, None, None
-        self.content_type, self.charset = None, charset
-        self.memfile_limit = memfile_limit
-        self.buffer_size = buffer_size
-
-    def feed(self, line, nl=''):
-        if self.file:
-            return self.write_body(line, nl)
-        return self.write_header(line, nl)
-
-    def write_header(self, line, nl):
-        line = line.decode(self.charset or 'latin1')
-        if not nl: raise MultipartError('Unexpected end of line in header.')
-        if not line.strip(): # blank line -> end of header segment
-            self.finish_header()
-        elif line[0] in ' \t' and self.headerlist:
-            name, value = self.headerlist.pop()
-            self.headerlist.append((name, value+line.strip()))
-        else:
-            if ':' not in line:
-                raise MultipartError("Syntax error in header: No colon.")
-            name, value = line.split(':', 1)
-            self.headerlist.append((name.strip(), value.strip()))
-
-    def write_body(self, line, nl):
-        if not line and not nl: return # This does not even flush the buffer
-        self.size += len(line) + len(self._buf)
-        self.file.write(self._buf + line)
-        self._buf = nl
-        if self.content_length > 0 and self.size > self.content_length:
-            raise MultipartError('Size of body exceeds Content-Length header.')
-        if self.size > self.memfile_limit and isinstance(self.file, BytesIO):
-            # TODO: What about non-file uploads that exceed the memfile_limit?
-            self.file, old = TemporaryFile(mode='w+b'), self.file
-            old.seek(0)
-            copy_file(old, self.file, self.size, self.buffer_size)
-
-    def finish_header(self):
-        self.file = BytesIO()
-        self.headers = Headers(self.headerlist)
-        cdis = self.headers.get('Content-Disposition','')
-        ctype = self.headers.get('Content-Type','')
-        clen = self.headers.get('Content-Length','-1')
-        if not cdis:
-            raise MultipartError('Content-Disposition header is missing.')
-        self.disposition, self.options = parse_options_header(cdis)
-        self.name = self.options.get('name')
-        self.filename = self.options.get('filename')
-        self.content_type, options = parse_options_header(ctype)
-        self.charset = options.get('charset') or self.charset
-        self.content_length = int(self.headers.get('Content-Length','-1'))
-
-    def is_buffered(self):
-        ''' Return true if the data is fully buffered in memory.'''
-        return isinstance(self.file, BytesIO)
-
-    @property
-    def value(self):
-        ''' Data decoded with the specified charset '''
-        pos = self.file.tell()
-        self.file.seek(0)
-        val = self.file.read()
-        self.file.seek(pos)
-        return val.decode(self.charset)
-    
-    def save_as(self, path):
-        fp = open(path, 'wb')
-        pos = self.file.tell()
-        try:
-            self.file.seek(0)
-            size = copy_file(self.file, fp)
-        finally:
-            self.file.seek(pos)
-        return size
-
-##############################################################################
-#################################### WSGI ####################################
-##############################################################################
-
-def parse_form_data(environ, charset='utf8', strict=False, **kw):
-    ''' Parse form data from an environ dict and return a (forms, files) tuple.
-        Both tuple values are dictionaries with the form-field name as a key
-        (unicode) and lists as values (multiple values per key are possible).
-        The forms-dictionary contains form-field values as unicode strings.
-        The files-dictionary contains :class:`MultipartPart` instances, either
-        because the form-field was a file-upload or the value is to big to fit
-        into memory limits.
-        
-        :param environ: An WSGI environment dict.
-        :param charset: The charset to use if unsure. (default: utf8)
-        :param strict: If True, raise :exc:`MultipartError` on any parsing
-                       errors. These are silently ignored by default.
-    '''
-        
-    forms, files = MultiDict(), MultiDict()
-    try:
-        if environ.get('REQUEST_METHOD','GET').upper() not in ('POST', 'PUT'):
-            raise MultipartError("Request method other than POST or PUT.")
-        content_length = int(environ.get('CONTENT_LENGTH', '-1'))
-        content_type = environ.get('CONTENT_TYPE', '')
-        if not content_type:
-            raise MultipartError("Missing Content-Type header.")
-        content_type, options = parse_options_header(content_type)
-        stream = environ.get('wsgi.input') or BytesIO()
-        kw['charset'] = charset = options.get('charset', charset)
-        if content_type == 'multipart/form-data':
-            boundary = options.get('boundary','')
-            if not boundary:
-                raise MultipartError("No boundary for multipart/form-data.")
-            for part in MultipartParser(stream, boundary, content_length, **kw):
-                if part.filename or not part.is_buffered():
-                    files[part.name] = part
-                else: # TODO: Big form-fields are in the files dict. really?
-                    forms[part.name] = part.value
-        elif content_type in ('application/x-www-form-urlencoded',
-                              'application/x-url-encoded'):
-            mem_limit = kw.get('mem_limit', 2**20)
-            if content_length > mem_limit:
-                raise MultipartError("Request too big. Increase MAXMEM.")
-            data = stream.read(mem_limit)
-            if stream.read(1): # These is more that does not fit mem_limit
-                raise MultipartError("Request too big. Increase MAXMEM.")
-            if PY3:
-                data = data.decode('ascii')
-            data = parse_qs(data, keep_blank_values=True)
-            for key, values in data.items():
-                for value in values:
-                    if PY3:
-                        forms[key] = value
-                    else:
-                        forms[key.decode(charset)] = value.decode(charset)
-        else:
-            raise MultipartError("Unsupported content type.")
-    except MultipartError:
-        if strict: raise
-    return forms, files