From 9aabe7eae3eeb3e64c5a0f3687118cd806158550 Mon Sep 17 00:00:00 2001 From: Khudyakov Artem Date: Mon, 29 Jul 2024 22:05:10 +0300 Subject: [PATCH] Fixed #35440 -- Simplified parse_header_parameters by leveraging stdlib's Message. The `parse_header_parameters` function historically used Python's `cgi` module (now deprecated). In 34e2148fc725e7200050f74130d7523e3cd8507a, the logic was inlined to work around this deprecation (#33173). Later, in d4d5427571b4bf3a21c902276c2a00215c2a37cc, the header parsing logic was further cleaned up to align with `multipartparser.py` (#33697). This change takes it a step further by replacing the copied `cgi` logic with Python's `email.message.Message` API for a more robust and maintainable header parsing implementation. Thanks to Raphael Gaschignard for testing, and to Adam Johnson and Shai Berger for reviews. Co-authored-by: Ben Cail Co-authored-by: Natalia <124304+nessita@users.noreply.github.com> --- django/utils/http.py | 58 ++++++++++++---------------------- docs/releases/6.0.txt | 4 +++ tests/utils_tests/test_http.py | 45 +++++++++++++++++++++++++- 3 files changed, 69 insertions(+), 38 deletions(-) diff --git a/django/utils/http.py b/django/utils/http.py index 9ca32eab08..1f9adeb707 100644 --- a/django/utils/http.py +++ b/django/utils/http.py @@ -3,8 +3,9 @@ import re import unicodedata from binascii import Error as BinasciiError from datetime import UTC, datetime -from email.utils import formatdate -from urllib.parse import quote, unquote +from email.message import Message +from email.utils import collapse_rfc2231_value, formatdate +from urllib.parse import quote from urllib.parse import urlencode as original_urlencode from urllib.parse import urlsplit @@ -24,6 +25,7 @@ ETAG_MATCH = _lazy_re_compile( re.X, ) +MAX_HEADER_LENGTH = 10_000 MONTHS = "jan feb mar apr may jun jul aug sep oct nov dec".split() __D = r"(?P<day>[0-9]{2})" __D2 = r"(?P<day>[ 0-9][0-9])" @@ -310,46 +312,28 @@ def escape_leading_slashes(url): return url -def 
_parseparam(s): - while s[:1] == ";": - s = s[1:] - end = s.find(";") - while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2: - end = s.find(";", end + 1) - if end < 0: - end = len(s) - f = s[:end] - yield f.strip() - s = s[end:] - - -def parse_header_parameters(line): +def parse_header_parameters(line, max_length=MAX_HEADER_LENGTH): """ Parse a Content-type like header. Return the main content-type and a dictionary of options. + + If `line` is longer than `max_length`, `ValueError` is raised. """ - parts = _parseparam(";" + line) - key = parts.__next__().lower() + if max_length is not None and line and len(line) > max_length: + raise ValueError("Unable to parse header parameters (value too long).") + + m = Message() + m["content-type"] = line + params = m.get_params() + pdict = {} - for p in parts: - i = p.find("=") - if i >= 0: - has_encoding = False - name = p[:i].strip().lower() - if name.endswith("*"): - # Lang/encoding embedded in the value (like "filename*=UTF-8''file.ext") - # https://tools.ietf.org/html/rfc2231#section-4 - name = name[:-1] - if p.count("'") == 2: - has_encoding = True - value = p[i + 1 :].strip() - if len(value) >= 2 and value[0] == value[-1] == '"': - value = value[1:-1] - value = value.replace("\\\\", "\\").replace('\\"', '"') - if has_encoding: - encoding, lang, value = value.split("'") - value = unquote(value, encoding=encoding) - pdict[name] = value + key = params.pop(0)[0].lower() + for name, value in params: + if not name: + continue + if isinstance(value, tuple): + value = collapse_rfc2231_value(value) + pdict[name] = value return key, pdict diff --git a/docs/releases/6.0.txt b/docs/releases/6.0.txt index 499163788f..fe090e3f2a 100644 --- a/docs/releases/6.0.txt +++ b/docs/releases/6.0.txt @@ -311,6 +311,10 @@ Miscellaneous * The :ref:`JSON <serialization-formats-json>` serializer now writes a newline at the end of the output, even without the ``indent`` option set. 
+* The undocumented ``django.utils.http.parse_header_parameters()`` function is + refactored to use Python's :py:class:`email.message.Message` for parsing. + Input headers exceeding 10000 characters will now raise :exc:`ValueError`. + .. _deprecated-features-6.0: Features deprecated in 6.0 diff --git a/tests/utils_tests/test_http.py b/tests/utils_tests/test_http.py index d18fb63c0c..3730c2fcf5 100644 --- a/tests/utils_tests/test_http.py +++ b/tests/utils_tests/test_http.py @@ -6,6 +6,7 @@ from unittest import mock from django.test import SimpleTestCase from django.utils.datastructures import MultiValueDict from django.utils.http import ( + MAX_HEADER_LENGTH, base36_to_int, content_disposition_header, escape_leading_slashes, @@ -424,6 +425,8 @@ class EscapeLeadingSlashesTests(unittest.TestCase): class ParseHeaderParameterTests(unittest.TestCase): def test_basic(self): tests = [ + ("", ("", {})), + (None, ("none", {})), ("text/plain", ("text/plain", {})), ("text/vnd.just.made.this.up ; ", ("text/vnd.just.made.this.up", {})), ("text/plain;charset=us-ascii", ("text/plain", {"charset": "us-ascii"})), @@ -447,10 +450,18 @@ class ParseHeaderParameterTests(unittest.TestCase): 'attachment; filename="strange;name";size=123;', ("attachment", {"filename": "strange;name", "size": "123"}), ), + ( + 'attachment; filename="strange;name";;;;size=123;;;', + ("attachment", {"filename": "strange;name", "size": "123"}), + ), ( 'form-data; name="files"; filename="fo\\"o;bar"', ("form-data", {"name": "files", "filename": 'fo"o;bar'}), ), + ( + 'form-data; name="files"; filename="\\"fo\\"o;b\\\\ar\\""', + ("form-data", {"name": "files", "filename": '"fo"o;b\\ar"'}), + ), ] for header, expected in tests: with self.subTest(header=header): @@ -480,12 +491,13 @@ class ParseHeaderParameterTests(unittest.TestCase): """ Test wrongly formatted RFC 2231 headers (missing double single quotes). Parsing should not crash (#24209). + But stdlib email still decodes (#35440). 
""" test_data = ( ( "Content-Type: application/x-stuff; " "title*='This%20is%20%2A%2A%2Afun%2A%2A%2A", - "'This%20is%20%2A%2A%2Afun%2A%2A%2A", + "'This is ***fun***", ), ("Content-Type: application/x-stuff; title*='foo.html", "'foo.html"), ("Content-Type: application/x-stuff; title*=bar.html", "bar.html"), @@ -494,6 +506,37 @@ class ParseHeaderParameterTests(unittest.TestCase): parsed = parse_header_parameters(raw_line) self.assertEqual(parsed[1]["title"], expected_title) + def test_header_max_length(self): + base_header = "Content-Type: application/x-stuff; title*=" + base_header_len = len(base_header) + + test_data = [ + (MAX_HEADER_LENGTH, {}), + (MAX_HEADER_LENGTH, {"max_length": None}), + (MAX_HEADER_LENGTH + 1, {"max_length": None}), + (100, {"max_length": 100}), + ] + for line_length, kwargs in test_data: + with self.subTest(line_length=line_length, kwargs=kwargs): + title = "x" * (line_length - base_header_len) + line = base_header + title + assert len(line) == line_length + + parsed = parse_header_parameters(line, **kwargs) + + expected = ("content-type: application/x-stuff", {"title": title}) + self.assertEqual(parsed, expected) + + def test_header_too_long(self): + test_data = [ + ("x" * (MAX_HEADER_LENGTH + 1), {}), + ("x" * 101, {"max_length": 100}), + ] + for line, kwargs in test_data: + with self.subTest(line_length=len(line), kwargs=kwargs): + with self.assertRaises(ValueError): + parse_header_parameters(line, **kwargs) + class ContentDispositionHeaderTests(unittest.TestCase): def test_basic(self):