1
0
mirror of https://github.com/django/django.git synced 2025-03-27 01:30:46 +00:00

Fixed some percent decoding cases in uri_to_iri().

This commit is contained in:
Chronial 2017-02-07 14:55:44 +01:00 committed by Tim Graham
parent 500532c95d
commit 03281d8fe7
4 changed files with 55 additions and 17 deletions
django
docs/ref
tests/utils_tests

@ -6,7 +6,7 @@ import sys
from copy import copy from copy import copy
from importlib import import_module from importlib import import_module
from io import BytesIO from io import BytesIO
from urllib.parse import urljoin, urlparse, urlsplit from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit
from django.conf import settings from django.conf import settings
from django.core.handlers.base import BaseHandler from django.core.handlers.base import BaseHandler
@ -20,7 +20,7 @@ from django.template import TemplateDoesNotExist
from django.test import signals from django.test import signals
from django.test.utils import ContextList from django.test.utils import ContextList
from django.urls import resolve from django.urls import resolve
from django.utils.encoding import force_bytes, uri_to_iri from django.utils.encoding import force_bytes
from django.utils.functional import SimpleLazyObject, curry from django.utils.functional import SimpleLazyObject, curry
from django.utils.http import urlencode from django.utils.http import urlencode
from django.utils.itercompat import is_iterable from django.utils.itercompat import is_iterable
@ -320,7 +320,7 @@ class RequestFactory:
# If there are parameters, add them # If there are parameters, add them
if parsed.params: if parsed.params:
path += ";" + parsed.params path += ";" + parsed.params
path = uri_to_iri(path).encode() path = unquote_to_bytes(path)
# Replace the behavior where non-ASCII values in the WSGI environ are # Replace the behavior where non-ASCII values in the WSGI environ are
# arbitrarily decoded with ISO-8859-1. # arbitrarily decoded with ISO-8859-1.
# Refs comment in `get_bytes_from_wsgi()`. # Refs comment in `get_bytes_from_wsgi()`.

@ -2,7 +2,7 @@ import codecs
import datetime import datetime
import locale import locale
from decimal import Decimal from decimal import Decimal
from urllib.parse import quote, unquote_to_bytes from urllib.parse import quote
from django.utils import six from django.utils import six
from django.utils.functional import Promise from django.utils.functional import Promise
@ -151,20 +151,57 @@ def iri_to_uri(iri):
return quote(iri, safe="/#%[]=:;$&()+,!?*@'~") return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
# Lookup table used by uri_to_iri(): maps the two hex digits that follow a '%'
# (as bytes, e.g. b'41') to the single byte they decode to (e.g. b'A'). Only
# percent-encodings present in this table are decoded; everything else is left
# escaped.
#
# First, the unreserved characters from RFC 3986, section 2.3:
# ALPHA / DIGIT / "-" / "." / "_" / "~".
_ascii_ranges = [
    [45, 46, 95, 126],  # - . _ ~
    range(48, 58),      # 0-9
    range(65, 91),      # A-Z
    range(97, 123),     # a-z
]
_hextobyte = {
    # Register both the lower-case and upper-case spelling of each hex code.
    (fmt % char).encode(): bytes((char,))
    for ascii_range in _ascii_ranges
    for char in ascii_range
    for fmt in ['%02x', '%02X']
}
# And then every byte value from 128 (0x80) upwards, because bytes >= 128 are
# part of multibyte unicode characters. The first hex digit is restricted to
# 8-F (either case) and the second is any hex digit, so mixed-case codes such
# as b'Ab' are covered as well.
_hexdig = '0123456789ABCDEFabcdef'
_hextobyte.update({
    (a + b).encode(): bytes.fromhex(a + b)
    for a in _hexdig[8:] for b in _hexdig
})
def uri_to_iri(uri): def uri_to_iri(uri):
""" """
Converts a Uniform Resource Identifier(URI) into an Internationalized Converts a Uniform Resource Identifier(URI) into an Internationalized
Resource Identifier(IRI). Resource Identifier(IRI).
This is the algorithm from section 3.2 of RFC 3987. This is the algorithm from section 3.2 of RFC 3987, excluding step 4.
Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
a string containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/'). a string containing the encoded result (e.g. '/I%20♥%20Django/').
""" """
if uri is None: if uri is None:
return uri return uri
uri = force_bytes(uri) uri = force_bytes(uri)
iri = unquote_to_bytes(uri) # Fast selective unquote: First, split on '%' and then starting with the
# second block, decode the first 2 bytes if they represent a hex code to
# decode. The rest of the block is the part after '%AB', not containing
# any '%'. Add that to the output without further processing.
bits = uri.split(b'%')
if len(bits) == 1:
iri = uri
else:
parts = [bits[0]]
append = parts.append
hextobyte = _hextobyte
for item in bits[1:]:
hex = item[:2]
if hex in hextobyte:
append(hextobyte[item[:2]])
append(item[2:])
else:
append(b'%')
append(item)
iri = b''.join(parts)
return repercent_broken_unicode(iri).decode() return repercent_broken_unicode(iri).decode()

@ -195,19 +195,17 @@ result.
Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`. implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
It decodes all percent-encodings except those that don't represent a valid
UTF-8 sequence.
An example to demonstrate:: An example to demonstrate::
>>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93') >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
'/♥♥/?utf8=✓' '/♥♥/?utf8=✓'
>>> uri_to_iri('%A9helloworld') >>> uri_to_iri('%A9hello%3Fworld')
'%A9helloworld' '%A9hello%3Fworld'
In the first example, the UTF-8 characters and reserved characters are In the first example, the UTF-8 characters are unquoted. In the second, the
unquoted. In the second, the percent-encoding remains unchanged because it percent-encodings remain unchanged because they lie outside the valid UTF-8
lies outside the valid UTF-8 range. range or represent a reserved character.
Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
following is always true:: following is always true::

@ -93,9 +93,11 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
def test_uri_to_iri(self): def test_uri_to_iri(self):
cases = [ cases = [
# Valid UTF-8 sequences are decoded. # Valid UTF-8 sequences are decoded.
('/%E2%99%A5%E2%99%A5/', '/♥♥/'), ('/%e2%89%Ab%E2%99%a5%E2%89%aB/', '/≫♥≫/'),
('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'), ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
('/%41%5a%6B/', '/AZk/'),
# Reserved and non-URL valid ASCII chars are not decoded.
('/%25%20%02%41%7b/', '/%25%20%02A%7b/'),
# Broken UTF-8 sequences remain escaped. # Broken UTF-8 sequences remain escaped.
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'), ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'), ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
@ -112,11 +114,12 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
def test_complementarity(self): def test_complementarity(self):
cases = [ cases = [
('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'), ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen%20M\xfcnster/'),
('%&', '%&'), ('%&', '%&'),
('red&%E2%99%A5ros%#red', 'red&♥ros%#red'), ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
('/%E2%99%A5%E2%99%A5/', '/♥♥/'), ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'), ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
('/%25%20%02%7b/', '/%25%20%02%7b/'),
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'), ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'), ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'), ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),