1
0
mirror of https://github.com/django/django.git synced 2025-04-09 07:56:43 +00:00

Fixed #35440 -- Simplified parse_header_parameters by leveraging stdlib's Message.

The `parse_header_parameters` function historically used Python's `cgi`
module (now deprecated). In 34e2148fc725e7200050f74130d7523e3cd8507a,
the logic was inlined to work around this deprecation (#33173). Later,
in d4d5427571b4bf3a21c902276c2a00215c2a37cc, the header parsing logic
was further cleaned up to align with `multipartparser.py` (#33697).

This change takes it a step further by replacing the copied `cgi` logic with
Python's `email.message.Message` API for a more robust and maintainable header
parsing implementation.

Thanks to Raphael Gaschignard for testing, and to Adam Johnson and Shai
Berger for reviews.

Co-authored-by: Ben Cail <bcail@crossway.org>
Co-authored-by: Natalia <124304+nessita@users.noreply.github.com>
This commit is contained in:
Khudyakov Artem 2024-07-29 22:05:10 +03:00 committed by nessita
parent 0d92428d77
commit 9aabe7eae3
3 changed files with 69 additions and 38 deletions

View File

@ -3,8 +3,9 @@ import re
import unicodedata
from binascii import Error as BinasciiError
from datetime import UTC, datetime
from email.utils import formatdate
from urllib.parse import quote, unquote
from email.message import Message
from email.utils import collapse_rfc2231_value, formatdate
from urllib.parse import quote
from urllib.parse import urlencode as original_urlencode
from urllib.parse import urlsplit
@ -24,6 +25,7 @@ ETAG_MATCH = _lazy_re_compile(
re.X,
)
MAX_HEADER_LENGTH = 10_000
MONTHS = "jan feb mar apr may jun jul aug sep oct nov dec".split()
__D = r"(?P<day>[0-9]{2})"
__D2 = r"(?P<day>[ 0-9][0-9])"
@ -310,46 +312,28 @@ def escape_leading_slashes(url):
return url
def _parseparam(s):
while s[:1] == ";":
s = s[1:]
end = s.find(";")
while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2:
end = s.find(";", end + 1)
if end < 0:
end = len(s)
f = s[:end]
yield f.strip()
s = s[end:]
def parse_header_parameters(line):
def parse_header_parameters(line, max_length=MAX_HEADER_LENGTH):
"""
Parse a Content-type like header.
Return the main content-type and a dictionary of options.
If `line` is longer than `max_length`, `ValueError` is raised.
"""
parts = _parseparam(";" + line)
key = parts.__next__().lower()
if max_length is not None and line and len(line) > max_length:
raise ValueError("Unable to parse header parameters (value too long).")
m = Message()
m["content-type"] = line
params = m.get_params()
pdict = {}
for p in parts:
i = p.find("=")
if i >= 0:
has_encoding = False
name = p[:i].strip().lower()
if name.endswith("*"):
# Lang/encoding embedded in the value (like "filename*=UTF-8''file.ext")
# https://tools.ietf.org/html/rfc2231#section-4
name = name[:-1]
if p.count("'") == 2:
has_encoding = True
value = p[i + 1 :].strip()
if len(value) >= 2 and value[0] == value[-1] == '"':
value = value[1:-1]
value = value.replace("\\\\", "\\").replace('\\"', '"')
if has_encoding:
encoding, lang, value = value.split("'")
value = unquote(value, encoding=encoding)
pdict[name] = value
key = params.pop(0)[0].lower()
for name, value in params:
if not name:
continue
if isinstance(value, tuple):
value = collapse_rfc2231_value(value)
pdict[name] = value
return key, pdict

View File

@ -311,6 +311,10 @@ Miscellaneous
* The :ref:`JSON <serialization-formats-json>` serializer now writes a newline
at the end of the output, even without the ``indent`` option set.
* The undocumented ``django.utils.http.parse_header_parameters()`` function is
refactored to use Python's :py:class:`email.message.Message` for parsing.
Input headers exceeding 10000 characters will now raise :exc:`ValueError`.
.. _deprecated-features-6.0:
Features deprecated in 6.0

View File

@ -6,6 +6,7 @@ from unittest import mock
from django.test import SimpleTestCase
from django.utils.datastructures import MultiValueDict
from django.utils.http import (
MAX_HEADER_LENGTH,
base36_to_int,
content_disposition_header,
escape_leading_slashes,
@ -424,6 +425,8 @@ class EscapeLeadingSlashesTests(unittest.TestCase):
class ParseHeaderParameterTests(unittest.TestCase):
def test_basic(self):
tests = [
("", ("", {})),
(None, ("none", {})),
("text/plain", ("text/plain", {})),
("text/vnd.just.made.this.up ; ", ("text/vnd.just.made.this.up", {})),
("text/plain;charset=us-ascii", ("text/plain", {"charset": "us-ascii"})),
@ -447,10 +450,18 @@ class ParseHeaderParameterTests(unittest.TestCase):
'attachment; filename="strange;name";size=123;',
("attachment", {"filename": "strange;name", "size": "123"}),
),
(
'attachment; filename="strange;name";;;;size=123;;;',
("attachment", {"filename": "strange;name", "size": "123"}),
),
(
'form-data; name="files"; filename="fo\\"o;bar"',
("form-data", {"name": "files", "filename": 'fo"o;bar'}),
),
(
'form-data; name="files"; filename="\\"fo\\"o;b\\\\ar\\""',
("form-data", {"name": "files", "filename": '"fo"o;b\\ar"'}),
),
]
for header, expected in tests:
with self.subTest(header=header):
@ -480,12 +491,13 @@ class ParseHeaderParameterTests(unittest.TestCase):
"""
Test wrongly formatted RFC 2231 headers (missing double single quotes).
Parsing should not crash (#24209).
But stdlib email still decodes (#35440).
"""
test_data = (
(
"Content-Type: application/x-stuff; "
"title*='This%20is%20%2A%2A%2Afun%2A%2A%2A",
"'This%20is%20%2A%2A%2Afun%2A%2A%2A",
"'This is ***fun***",
),
("Content-Type: application/x-stuff; title*='foo.html", "'foo.html"),
("Content-Type: application/x-stuff; title*=bar.html", "bar.html"),
@ -494,6 +506,37 @@ class ParseHeaderParameterTests(unittest.TestCase):
parsed = parse_header_parameters(raw_line)
self.assertEqual(parsed[1]["title"], expected_title)
def test_header_max_length(self):
base_header = "Content-Type: application/x-stuff; title*="
base_header_len = len(base_header)
test_data = [
(MAX_HEADER_LENGTH, {}),
(MAX_HEADER_LENGTH, {"max_length": None}),
(MAX_HEADER_LENGTH + 1, {"max_length": None}),
(100, {"max_length": 100}),
]
for line_length, kwargs in test_data:
with self.subTest(line_length=line_length, kwargs=kwargs):
title = "x" * (line_length - base_header_len)
line = base_header + title
assert len(line) == line_length
parsed = parse_header_parameters(line, **kwargs)
expected = ("content-type: application/x-stuff", {"title": title})
self.assertEqual(parsed, expected)
def test_header_too_long(self):
test_data = [
("x" * (MAX_HEADER_LENGTH + 1), {}),
("x" * 101, {"max_length": 100}),
]
for line, kwargs in test_data:
with self.subTest(line_length=len(line), kwargs=kwargs):
with self.assertRaises(ValueError):
parse_header_parameters(line, **kwargs)
class ContentDispositionHeaderTests(unittest.TestCase):
def test_basic(self):