Fixed #35440 -- Simplified parse_header_parameters by leveraging stdlib's Message.

The `parse_header_parameters` function historically used Python's `cgi` module (now deprecated). In 34e2148fc725e7200050f74130d7523e3cd8507a, the logic was inlined to work around this deprecation (#33173). Later, in d4d5427571b4bf3a21c902276c2a00215c2a37cc, the header parsing logic was further cleaned up to align with `multipartparser.py` (#33697). This change takes it a step further by replacing the copied `cgi` logic with Python's `email.message.Message` API for a more robust and maintainable header parsing implementation. Thanks to Raphael Gaschignard for testing, and to Adam Johnson and Shai Berger for reviews. Co-authored-by: Ben Cail <bcail@crossway.org> Co-authored-by: Natalia <124304+nessita@users.noreply.github.com>
2025-08-21 01:09:13 +00:00 · 2024-07-29 22:05:10 +03:00 · 2024-07-29 22:05:10 +03:00 · 9aabe7eae3
commit 9aabe7eae3
parent 0d92428d77
3 changed files with 69 additions and 38 deletions
--- a/django/utils/http.py
+++ b/django/utils/http.py
@ -3,8 +3,9 @@ import re
 import unicodedata
 from binascii import Error as BinasciiError
 from datetime import UTC, datetime
-from email.utils import formatdate
-from urllib.parse import quote, unquote
+from email.message import Message
+from email.utils import collapse_rfc2231_value, formatdate
+from urllib.parse import quote
 from urllib.parse import urlencode as original_urlencode
 from urllib.parse import urlsplit

@ -24,6 +25,7 @@ ETAG_MATCH = _lazy_re_compile(
    re.X,
 )

+MAX_HEADER_LENGTH = 10_000
 MONTHS = "jan feb mar apr may jun jul aug sep oct nov dec".split()
 __D = r"(?P<day>[0-9]{2})"
 __D2 = r"(?P<day>[ 0-9][0-9])"
@ -310,46 +312,28 @@ def escape_leading_slashes(url):
    return url


-def _parseparam(s):
-    while s[:1] == ";":
-        s = s[1:]
-        end = s.find(";")
-        while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2:
-            end = s.find(";", end + 1)
-        if end < 0:
-            end = len(s)
-        f = s[:end]
-        yield f.strip()
-        s = s[end:]
-
-
-def parse_header_parameters(line):
+def parse_header_parameters(line, max_length=MAX_HEADER_LENGTH):
    """
    Parse a Content-type like header.
    Return the main content-type and a dictionary of options.
+
+    If `line` is longer than `max_length`, `ValueError` is raised.
    """
-    parts = _parseparam(";" + line)
-    key = parts.__next__().lower()
+    if max_length is not None and line and len(line) > max_length:
+        raise ValueError("Unable to parse header parameters (value too long).")
+
+    m = Message()
+    m["content-type"] = line
+    params = m.get_params()
+
    pdict = {}
-    for p in parts:
-        i = p.find("=")
-        if i >= 0:
-            has_encoding = False
-            name = p[:i].strip().lower()
-            if name.endswith("*"):
-                # Lang/encoding embedded in the value (like "filename*=UTF-8''file.ext")
-                # https://tools.ietf.org/html/rfc2231#section-4
-                name = name[:-1]
-                if p.count("'") == 2:
-                    has_encoding = True
-            value = p[i + 1 :].strip()
-            if len(value) >= 2 and value[0] == value[-1] == '"':
-                value = value[1:-1]
-                value = value.replace("\\\\", "\\").replace('\\"', '"')
-            if has_encoding:
-                encoding, lang, value = value.split("'")
-                value = unquote(value, encoding=encoding)
-            pdict[name] = value
+    key = params.pop(0)[0].lower()
+    for name, value in params:
+        if not name:
+            continue
+        if isinstance(value, tuple):
+            value = collapse_rfc2231_value(value)
+        pdict[name] = value
    return key, pdict


--- a/docs/releases/6.0.txt
+++ b/docs/releases/6.0.txt
@ -311,6 +311,10 @@ Miscellaneous
 * The :ref:`JSON <serialization-formats-json>` serializer now writes a newline
  at the end of the output, even without the ``indent`` option set.

+* The undocumented ``django.utils.http.parse_header_parameters()`` function is
+  refactored to use Python's :py:class:`email.message.Message` for parsing.
+  Input headers exceeding 10000 characters will now raise :exc:`ValueError`.
+
 .. _deprecated-features-6.0:

 Features deprecated in 6.0
--- a/tests/utils_tests/test_http.py
+++ b/tests/utils_tests/test_http.py
@ -6,6 +6,7 @@ from unittest import mock
 from django.test import SimpleTestCase
 from django.utils.datastructures import MultiValueDict
 from django.utils.http import (
+    MAX_HEADER_LENGTH,
    base36_to_int,
    content_disposition_header,
    escape_leading_slashes,
@ -424,6 +425,8 @@ class EscapeLeadingSlashesTests(unittest.TestCase):
 class ParseHeaderParameterTests(unittest.TestCase):
    def test_basic(self):
        tests = [
+            ("", ("", {})),
+            (None, ("none", {})),
            ("text/plain", ("text/plain", {})),
            ("text/vnd.just.made.this.up ; ", ("text/vnd.just.made.this.up", {})),
            ("text/plain;charset=us-ascii", ("text/plain", {"charset": "us-ascii"})),
@ -447,10 +450,18 @@ class ParseHeaderParameterTests(unittest.TestCase):
                'attachment; filename="strange;name";size=123;',
                ("attachment", {"filename": "strange;name", "size": "123"}),
            ),
+            (
+                'attachment; filename="strange;name";;;;size=123;;;',
+                ("attachment", {"filename": "strange;name", "size": "123"}),
+            ),
            (
                'form-data; name="files"; filename="fo\\"o;bar"',
                ("form-data", {"name": "files", "filename": 'fo"o;bar'}),
            ),
+            (
+                'form-data; name="files"; filename="\\"fo\\"o;b\\\\ar\\""',
+                ("form-data", {"name": "files", "filename": '"fo"o;b\\ar"'}),
+            ),
        ]
        for header, expected in tests:
            with self.subTest(header=header):
@ -480,12 +491,13 @@ class ParseHeaderParameterTests(unittest.TestCase):
        """
        Test wrongly formatted RFC 2231 headers (missing double single quotes).
        Parsing should not crash (#24209).
+        But stdlib email still decodes (#35440).
        """
        test_data = (
            (
                "Content-Type: application/x-stuff; "
                "title*='This%20is%20%2A%2A%2Afun%2A%2A%2A",
-                "'This%20is%20%2A%2A%2Afun%2A%2A%2A",
+                "'This is ***fun***",
            ),
            ("Content-Type: application/x-stuff; title*='foo.html", "'foo.html"),
            ("Content-Type: application/x-stuff; title*=bar.html", "bar.html"),
@ -494,6 +506,37 @@ class ParseHeaderParameterTests(unittest.TestCase):
            parsed = parse_header_parameters(raw_line)
            self.assertEqual(parsed[1]["title"], expected_title)

+    def test_header_max_length(self):
+        base_header = "Content-Type: application/x-stuff; title*="
+        base_header_len = len(base_header)
+
+        test_data = [
+            (MAX_HEADER_LENGTH, {}),
+            (MAX_HEADER_LENGTH, {"max_length": None}),
+            (MAX_HEADER_LENGTH + 1, {"max_length": None}),
+            (100, {"max_length": 100}),
+        ]
+        for line_length, kwargs in test_data:
+            with self.subTest(line_length=line_length, kwargs=kwargs):
+                title = "x" * (line_length - base_header_len)
+                line = base_header + title
+                assert len(line) == line_length
+
+                parsed = parse_header_parameters(line, **kwargs)
+
+                expected = ("content-type: application/x-stuff", {"title": title})
+                self.assertEqual(parsed, expected)
+
+    def test_header_too_long(self):
+        test_data = [
+            ("x" * (MAX_HEADER_LENGTH + 1), {}),
+            ("x" * 101, {"max_length": 100}),
+        ]
+        for line, kwargs in test_data:
+            with self.subTest(line_length=len(line), kwargs=kwargs):
+                with self.assertRaises(ValueError):
+                    parse_header_parameters(line, **kwargs)
+

 class ContentDispositionHeaderTests(unittest.TestCase):
    def test_basic(self):