From 698d05c11c27d4ed5fd75194ac0edcf133bd7600 Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Sun, 15 Dec 2024 01:54:42 +0100 Subject: [PATCH] [5.2.x] Fixed #36013 -- Removed use of IDNA-2003 in django.utils.html. Removed obsolete and potentially problematic IDNA 2003 ("punycode") encoding of international domain names in smart_urlquote() and Urlizer, which are used (only) by AdminURLFieldWidget and the urlize/urlizetrunc template filters. Changed to use percent-encoded UTF-8, which defers IDNA details to the browser (like other URLs rendered by Django). Backport of 29ba75e6e57414f0e6f9528d08a520b8b931fb28 from main. --- AUTHORS | 1 + django/utils/html.py | 16 +++---- tests/admin_widgets/tests.py | 7 ++- .../filter_tests/test_urlize.py | 23 +++++++-- tests/utils_tests/test_html.py | 47 +++++++++++++++++-- 5 files changed, 74 insertions(+), 20 deletions(-) diff --git a/AUTHORS b/AUTHORS index c9a26fa6c8..9d8956bda0 100644 --- a/AUTHORS +++ b/AUTHORS @@ -735,6 +735,7 @@ answer newbie questions, and generally made Django that much better: Mihai Preda Mikaël Barbero Mike Axiak + Mike Edmunds Mike Grouchy Mike Malone Mike Richardson diff --git a/django/utils/html.py b/django/utils/html.py index ab1363e2de..30b3352556 100644 --- a/django/utils/html.py +++ b/django/utils/html.py @@ -11,7 +11,6 @@ from urllib.parse import parse_qsl, quote, unquote, urlencode, urlsplit, urlunsp from django.core.exceptions import SuspiciousOperation, ValidationError from django.core.validators import EmailValidator from django.utils.deprecation import RemovedInDjango60Warning -from django.utils.encoding import punycode from django.utils.functional import Promise, cached_property, keep_lazy, keep_lazy_text from django.utils.http import RFC3986_GENDELIMS, RFC3986_SUBDELIMS from django.utils.regex_helper import _lazy_re_compile @@ -245,17 +244,16 @@ def smart_urlquote(url): # see also https://bugs.python.org/issue16285 return quote(segment, safe=RFC3986_SUBDELIMS + RFC3986_GENDELIMS + "~") - # Handle IDN before quoting. try: scheme, netloc, path, query, fragment = urlsplit(url) except ValueError: # invalid IPv6 URL (normally square brackets in hostname part). return unquote_quote(url) - try: - netloc = punycode(netloc) # IDN -> ACE - except UnicodeError: # invalid domain part - return unquote_quote(url) + # Handle IDN as percent-encoded UTF-8 octets, per WHATWG URL Specification + # section 3.5 and RFC 3986 section 3.2.2. Defer any IDNA to the user agent. + # See #36013. + netloc = unquote_quote(netloc) if query: # Separately unquoting key/value, so as to not mix querystring separators @@ -356,10 +354,8 @@ class Urlizer: url = smart_urlquote("http://%s" % html.unescape(middle)) elif ":" not in middle and self.is_email_simple(middle): local, domain = middle.rsplit("@", 1) - try: - domain = punycode(domain) - except UnicodeError: - return word + # Encode per RFC 6068 Section 2 (items 1, 4, 5). Defer any IDNA + # to the user agent. See #36013. local = quote(local, safe="") domain = quote(domain, safe="") url = self.mailto_template.format(local=local, domain=domain) diff --git a/tests/admin_widgets/tests.py b/tests/admin_widgets/tests.py index 0cf1324623..74fc30a706 100644 --- a/tests/admin_widgets/tests.py +++ b/tests/admin_widgets/tests.py @@ -490,11 +490,13 @@ class AdminURLWidgetTest(SimpleTestCase): w = widgets.AdminURLFieldWidget() self.assertHTMLEqual( w.render("test", "http://example-äüö.com"), - '

Currently: ' + '

Currently: ' "http://example-äüö.com
" 'Change:

', ) + # Does not use obsolete IDNA-2003 encoding (#36013). + self.assertNotIn("fass.example.com", w.render("test", "http://faß.example.com")) def test_render_quoting(self): """ @@ -521,7 +523,8 @@ class AdminURLWidgetTest(SimpleTestCase): output = w.render("test", "http://example-äüö.com/some-text") self.assertEqual( HREF_RE.search(output)[1], - "http://xn--example--7za4pnc.com/%3Csometag%3Esome-text%3C/sometag%3E", + "http://example-%C3%A4%C3%BC%C3%B6.com/" + "%3Csometag%3Esome-text%3C/sometag%3E", ) self.assertEqual( TEXT_RE.search(output)[1], diff --git a/tests/template_tests/filter_tests/test_urlize.py b/tests/template_tests/filter_tests/test_urlize.py index 80dd94cd9f..c186acd948 100644 --- a/tests/template_tests/filter_tests/test_urlize.py +++ b/tests/template_tests/filter_tests/test_urlize.py @@ -229,19 +229,34 @@ class FunctionTests(SimpleTestCase): """ #13704 - Check urlize handles IDN correctly """ + # The "✶" below is \N{SIX POINTED BLACK STAR}, not "*" \N{ASTERISK}. self.assertEqual( urlize("http://c✶.ws"), - 'http://c✶.ws', + 'http://c✶.ws', ) self.assertEqual( urlize("www.c✶.ws"), - 'www.c✶.ws', + 'www.c✶.ws', ) self.assertEqual( - urlize("c✶.org"), 'c✶.org' + urlize("c✶.org"), + 'c✶.org', ) self.assertEqual( - urlize("info@c✶.org"), 'info@c✶.org' + urlize("info@c✶.org"), + 'info@c✶.org', + ) + + # Pre-encoded IDNA is urlized but not re-encoded. + self.assertEqual( + urlize("www.xn--iny-zx5a.com/idna2003"), + 'www.xn--iny-zx5a.com/idna2003', + ) + self.assertEqual( + urlize("www.xn--fa-hia.com/idna2008"), + 'www.xn--fa-hia.com/idna2008', ) def test_malformed(self): diff --git a/tests/utils_tests/test_html.py b/tests/utils_tests/test_html.py index 341e211c96..6d259d76d7 100644 --- a/tests/utils_tests/test_html.py +++ b/tests/utils_tests/test_html.py @@ -269,8 +269,26 @@ class TestUtilsHtml(SimpleTestCase): def test_smart_urlquote(self): items = ( - ("http://öäü.com/", "http://xn--4ca9at.com/"), - ("http://öäü.com/öäü/", "http://xn--4ca9at.com/%C3%B6%C3%A4%C3%BC/"), + # IDN is encoded as percent-encoded ("quoted") UTF-8 (#36013). + ("http://öäü.com/", "http://%C3%B6%C3%A4%C3%BC.com/"), + ("https://faß.example.com", "https://fa%C3%9F.example.com"), + ( + "http://öäü.com/öäü/", + "http://%C3%B6%C3%A4%C3%BC.com/%C3%B6%C3%A4%C3%BC/", + ), + ( + # Valid under IDNA 2008, but was invalid in IDNA 2003. + "https://މިހާރު.com", + "https://%DE%89%DE%A8%DE%80%DE%A7%DE%83%DE%AA.com", + ), + ( + # Valid under WHATWG URL Specification but not IDNA 2008. + "http://👓.ws", + "http://%F0%9F%91%93.ws", + ), + # Pre-encoded IDNA is left unchanged. + ("http://xn--iny-zx5a.com/idna2003", "http://xn--iny-zx5a.com/idna2003"), + ("http://xn--fa-hia.com/idna2008", "http://xn--fa-hia.com/idna2008"), # Everything unsafe is quoted, !*'();:@&=+$,/?#[]~ is considered # safe as per RFC. ( @@ -292,8 +310,10 @@ class TestUtilsHtml(SimpleTestCase): "django", ), ("http://.www.f oo.bar/", "http://.www.f%20oo.bar/"), + ('http://example.com">', "http://example.com%22%3E"), + ("http://10.22.1.1/", "http://10.22.1.1/"), + ("http://[fd00::1]/", "http://[fd00::1]/"), ) - # IDNs are properly quoted for value, output in items: with self.subTest(value=value, output=output): self.assertEqual(smart_urlquote(value), output) @@ -366,11 +386,21 @@ class TestUtilsHtml(SimpleTestCase): lazystr("Search for google.com/?q=!"), 'Search for google.com/?q=!', ), + ( + "http://www.foo.bar/", + 'http://www.foo.bar/', + ), + ( + "Look on www.نامه‌ای.com.", + "Look on www.نامه‌ای.com.", + ), ("foo@example.com", 'foo@example.com'), ( "test@" + "한.글." * 15 + "aaa", '' + "test@" + "한.글." * 15 @@ -383,6 +413,15 @@ class TestUtilsHtml(SimpleTestCase): 'yes+this=is&a%valid!email@example.com", ), + ( + "foo@faß.example.com", + 'foo@faß.example.com', + ), + ( + "idna-2008@މިހާރު.example.mv", + 'idna-2008@މިހާރު.example.mv', + ), ) for value, output in tests: with self.subTest(value=value):