From 9aabe7eae3eeb3e64c5a0f3687118cd806158550 Mon Sep 17 00:00:00 2001
From: Khudyakov Artem <khudyak.artem@gmail.com>
Date: Mon, 29 Jul 2024 22:05:10 +0300
Subject: [PATCH] Fixed #35440 -- Simplified parse_header_parameters by
 leveraging stdlib's Message.

The `parse_header_parameters` function historically used Python's `cgi`
module (now deprecated). In 34e2148fc725e7200050f74130d7523e3cd8507a,
the logic was inlined to work around this deprecation (#33173). Later,
in d4d5427571b4bf3a21c902276c2a00215c2a37cc, the header parsing logic
was further cleaned up to align with `multipartparser.py` (#33697).

This change takes it a step further by replacing the copied `cgi` logic with
Python's `email.message.Message` API for a more robust and maintainable header
parsing implementation.

Thanks to Raphael Gaschignard for testing, and to Adam Johnson and Shai
Berger for reviews.

Co-authored-by: Ben Cail <bcail@crossway.org>
Co-authored-by: Natalia <124304+nessita@users.noreply.github.com>
---
 django/utils/http.py           | 58 ++++++++++++----------------------
 docs/releases/6.0.txt          |  4 +++
 tests/utils_tests/test_http.py | 45 +++++++++++++++++++++++++-
 3 files changed, 69 insertions(+), 38 deletions(-)
diff --git a/django/utils/http.py b/django/utils/http.py
index 9ca32eab08..1f9adeb707 100644
--- a/django/utils/http.py
+++ b/django/utils/http.py
@@ -3,8 +3,9 @@ import re
 import unicodedata
 from binascii import Error as BinasciiError
 from datetime import UTC, datetime
-from email.utils import formatdate
-from urllib.parse import quote, unquote
+from email.message import Message
+from email.utils import collapse_rfc2231_value, formatdate
+from urllib.parse import quote
 from urllib.parse import urlencode as original_urlencode
 from urllib.parse import urlsplit
 
@@ -24,6 +25,7 @@ ETAG_MATCH = _lazy_re_compile(
     re.X,
 )
 
+MAX_HEADER_LENGTH = 10_000
 MONTHS = "jan feb mar apr may jun jul aug sep oct nov dec".split()
 __D = r"(?P<day>[0-9]{2})"
 __D2 = r"(?P<day>[ 0-9][0-9])"
@@ -310,46 +312,28 @@ def escape_leading_slashes(url):
     return url
 
 
-def _parseparam(s):
-    while s[:1] == ";":
-        s = s[1:]
-        end = s.find(";")
-        while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2:
-            end = s.find(";", end + 1)
-        if end < 0:
-            end = len(s)
-        f = s[:end]
-        yield f.strip()
-        s = s[end:]
-
-
-def parse_header_parameters(line):
+def parse_header_parameters(line, max_length=MAX_HEADER_LENGTH):
     """
     Parse a Content-type like header.
     Return the main content-type and a dictionary of options.
+
+    If `line` is longer than `max_length`, `ValueError` is raised.
     """
-    parts = _parseparam(";" + line)
-    key = parts.__next__().lower()
+    if max_length is not None and line and len(line) > max_length:
+        raise ValueError("Unable to parse header parameters (value too long).")
+
+    m = Message()
+    m["content-type"] = line
+    params = m.get_params()
+
     pdict = {}
-    for p in parts:
-        i = p.find("=")
-        if i >= 0:
-            has_encoding = False
-            name = p[:i].strip().lower()
-            if name.endswith("*"):
-                # Lang/encoding embedded in the value (like "filename*=UTF-8''file.ext")
-                # https://tools.ietf.org/html/rfc2231#section-4
-                name = name[:-1]
-                if p.count("'") == 2:
-                    has_encoding = True
-            value = p[i + 1 :].strip()
-            if len(value) >= 2 and value[0] == value[-1] == '"':
-                value = value[1:-1]
-                value = value.replace("\\\\", "\\").replace('\\"', '"')
-            if has_encoding:
-                encoding, lang, value = value.split("'")
-                value = unquote(value, encoding=encoding)
-            pdict[name] = value
+    key = params.pop(0)[0].lower()
+    for name, value in params:
+        if not name:
+            continue
+        if isinstance(value, tuple):
+            value = collapse_rfc2231_value(value)
+        pdict[name] = value
     return key, pdict
 
 
diff --git a/docs/releases/6.0.txt b/docs/releases/6.0.txt
index 499163788f..fe090e3f2a 100644
--- a/docs/releases/6.0.txt
+++ b/docs/releases/6.0.txt
@@ -311,6 +311,10 @@ Miscellaneous
 * The :ref:`JSON <serialization-formats-json>` serializer now writes a newline
   at the end of the output, even without the ``indent`` option set.
 
+* The undocumented ``django.utils.http.parse_header_parameters()`` function is
+  refactored to use Python's :py:class:`email.message.Message` for parsing.
+  Input headers exceeding 10000 characters will now raise :exc:`ValueError`.
+
 .. _deprecated-features-6.0:
 
 Features deprecated in 6.0
diff --git a/tests/utils_tests/test_http.py b/tests/utils_tests/test_http.py
index d18fb63c0c..3730c2fcf5 100644
--- a/tests/utils_tests/test_http.py
+++ b/tests/utils_tests/test_http.py
@@ -6,6 +6,7 @@ from unittest import mock
 from django.test import SimpleTestCase
 from django.utils.datastructures import MultiValueDict
 from django.utils.http import (
+    MAX_HEADER_LENGTH,
     base36_to_int,
     content_disposition_header,
     escape_leading_slashes,
@@ -424,6 +425,8 @@ class EscapeLeadingSlashesTests(unittest.TestCase):
 class ParseHeaderParameterTests(unittest.TestCase):
     def test_basic(self):
         tests = [
+            ("", ("", {})),
+            (None, ("none", {})),
             ("text/plain", ("text/plain", {})),
             ("text/vnd.just.made.this.up ; ", ("text/vnd.just.made.this.up", {})),
             ("text/plain;charset=us-ascii", ("text/plain", {"charset": "us-ascii"})),
@@ -447,10 +450,18 @@ class ParseHeaderParameterTests(unittest.TestCase):
                 'attachment; filename="strange;name";size=123;',
                 ("attachment", {"filename": "strange;name", "size": "123"}),
             ),
+            (
+                'attachment; filename="strange;name";;;;size=123;;;',
+                ("attachment", {"filename": "strange;name", "size": "123"}),
+            ),
             (
                 'form-data; name="files"; filename="fo\\"o;bar"',
                 ("form-data", {"name": "files", "filename": 'fo"o;bar'}),
             ),
+            (
+                'form-data; name="files"; filename="\\"fo\\"o;b\\\\ar\\""',
+                ("form-data", {"name": "files", "filename": '"fo"o;b\\ar"'}),
+            ),
         ]
         for header, expected in tests:
             with self.subTest(header=header):
@@ -480,12 +491,13 @@ class ParseHeaderParameterTests(unittest.TestCase):
         """
         Test wrongly formatted RFC 2231 headers (missing double single quotes).
         Parsing should not crash (#24209).
+        But stdlib email still decodes (#35440).
         """
         test_data = (
             (
                 "Content-Type: application/x-stuff; "
                 "title*='This%20is%20%2A%2A%2Afun%2A%2A%2A",
-                "'This%20is%20%2A%2A%2Afun%2A%2A%2A",
+                "'This is ***fun***",
             ),
             ("Content-Type: application/x-stuff; title*='foo.html", "'foo.html"),
             ("Content-Type: application/x-stuff; title*=bar.html", "bar.html"),
@@ -494,6 +506,37 @@ class ParseHeaderParameterTests(unittest.TestCase):
             parsed = parse_header_parameters(raw_line)
             self.assertEqual(parsed[1]["title"], expected_title)
 
+    def test_header_max_length(self):
+        base_header = "Content-Type: application/x-stuff; title*="
+        base_header_len = len(base_header)
+
+        test_data = [
+            (MAX_HEADER_LENGTH, {}),
+            (MAX_HEADER_LENGTH, {"max_length": None}),
+            (MAX_HEADER_LENGTH + 1, {"max_length": None}),
+            (100, {"max_length": 100}),
+        ]
+        for line_length, kwargs in test_data:
+            with self.subTest(line_length=line_length, kwargs=kwargs):
+                title = "x" * (line_length - base_header_len)
+                line = base_header + title
+                assert len(line) == line_length
+
+                parsed = parse_header_parameters(line, **kwargs)
+
+                expected = ("content-type: application/x-stuff", {"title": title})
+                self.assertEqual(parsed, expected)
+
+    def test_header_too_long(self):
+        test_data = [
+            ("x" * (MAX_HEADER_LENGTH + 1), {}),
+            ("x" * 101, {"max_length": 100}),
+        ]
+        for line, kwargs in test_data:
+            with self.subTest(line_length=len(line), kwargs=kwargs):
+                with self.assertRaises(ValueError):
+                    parse_header_parameters(line, **kwargs)
+
 
 class ContentDispositionHeaderTests(unittest.TestCase):
     def test_basic(self):