Fixed #26005 -- Fixed some percent decoding cases in uri_to_iri().

2025-10-24 14:16:09 +00:00 · 2017-02-07 14:55:44 +01:00
parent 500532c95d
commit 03281d8fe7
4 changed files with 55 additions and 17 deletions
--- a/django/test/client.py
+++ b/django/test/client.py
@@ -6,7 +6,7 @@ import sys
 from copy import copy
 from importlib import import_module
 from io import BytesIO
-from urllib.parse import urljoin, urlparse, urlsplit
+from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit
 from django.conf import settings
 from django.core.handlers.base import BaseHandler
@@ -20,7 +20,7 @@ from django.template import TemplateDoesNotExist
 from django.test import signals
 from django.test.utils import ContextList
 from django.urls import resolve
-from django.utils.encoding import force_bytes, uri_to_iri
+from django.utils.encoding import force_bytes
 from django.utils.functional import SimpleLazyObject, curry
 from django.utils.http import urlencode
 from django.utils.itercompat import is_iterable
@@ -320,7 +320,7 @@ class RequestFactory:
        # If there are parameters, add them
        if parsed.params:
            path += ";" + parsed.params
-        path = uri_to_iri(path).encode()
+        path = unquote_to_bytes(path)
        # Replace the behavior where non-ASCII values in the WSGI environ are
        # arbitrarily decoded with ISO-8859-1.
        # Refs comment in `get_bytes_from_wsgi()`.
--- a/django/utils/encoding.py
+++ b/django/utils/encoding.py
@@ -2,7 +2,7 @@ import codecs
 import datetime
 import locale
 from decimal import Decimal
-from urllib.parse import quote, unquote_to_bytes
+from urllib.parse import quote
 from django.utils import six
 from django.utils.functional import Promise
@@ -151,20 +151,57 @@ def iri_to_uri(iri):
    return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
 # List of byte values that uri_to_iri() decodes from percent encoding.
 # First, the unreserved characters from RFC 3986:
 _ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)]
 _hextobyte = {
    (fmt % char).encode(): bytes((char,))
    for ascii_range in _ascii_ranges
    for char in ascii_range
    for fmt in ['%02x', '%02X']
 }
 # And then everything from 128 upwards, because bytes ≥ 128 are part of multibyte
 # unicode characters.
 _hexdig = '0123456789ABCDEFabcdef'
 _hextobyte.update({
    (a + b).encode(): bytes.fromhex(a + b)
    for a in _hexdig[8:] for b in _hexdig
 })
 def uri_to_iri(uri):
    """
    Converts a Uniform Resource Identifier (URI) into an Internationalized
    Resource Identifier (IRI).
-    This is the algorithm from section 3.2 of RFC 3987.
+    This is the algorithm from section 3.2 of RFC 3987, excluding step 4.
    Takes a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
-    a string containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/').
+    a string containing the encoded result (e.g. '/I%20♥%20Django/').
    """
    if uri is None:
        return uri
    uri = force_bytes(uri)
-    iri = unquote_to_bytes(uri)
+    # Fast selective unquote: First, split on '%' and then starting with the
    # second block, decode the first 2 bytes if they represent a hex code to
    # decode. The rest of the block is the part after '%AB', not containing
    # any '%'. Add that to the output without further processing.
    bits = uri.split(b'%')
    if len(bits) == 1:
        iri = uri
    else:
        parts = [bits[0]]
        append = parts.append
        hextobyte = _hextobyte
        for item in bits[1:]:
            hex = item[:2]
            if hex in hextobyte:
                append(hextobyte[item[:2]])
                append(item[2:])
            else:
                append(b'%')
                append(item)
        iri = b''.join(parts)
    return repercent_broken_unicode(iri).decode()
--- a/docs/ref/unicode.txt
+++ b/docs/ref/unicode.txt
@@ -195,19 +195,17 @@ result.
 Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
 implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
 It decodes percent-encodings that form valid UTF-8 sequences, leaving those
 for reserved characters and invalid UTF-8 sequences encoded.
 An example to demonstrate::
    >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
    '/♥♥/?utf8=✓'
-    >>> uri_to_iri('%A9helloworld')
+    >>> uri_to_iri('%A9hello%3Fworld')
-    '%A9helloworld'
+    '%A9hello%3Fworld'
-In the first example, the UTF-8 characters and reserved characters are
+In the first example, the UTF-8 characters are unquoted. In the second, the
-unquoted. In the second, the percent-encoding remains unchanged because it
+percent-encodings remain unchanged because they lie outside the valid UTF-8
-lies outside the valid UTF-8 range.
+range or represent a reserved character.
 Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
 following is always true::
--- a/tests/utils_tests/test_encoding.py
+++ b/tests/utils_tests/test_encoding.py
@@ -93,9 +93,11 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
    def test_uri_to_iri(self):
        cases = [
            # Valid UTF-8 sequences are decoded.
-            ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
+            ('/%e2%89%Ab%E2%99%a5%E2%89%aB/', '/≫♥≫/'),
            ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
-
+            ('/%41%5a%6B/', '/AZk/'),
            # Reserved and non-URL valid ASCII chars are not decoded.
            ('/%25%20%02%41%7b/', '/%25%20%02A%7b/'),
            # Broken UTF-8 sequences remain escaped.
            ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
            ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
@@ -112,11 +114,12 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
    def test_complementarity(self):
        cases = [
-            ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'),
+            ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen%20M\xfcnster/'),
            ('%&', '%&'),
            ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
            ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
            ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
            ('/%25%20%02%7b/', '/%25%20%02%7b/'),
            ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
            ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
            ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),