mirror of
https://github.com/django/django.git
synced 2025-03-27 01:30:46 +00:00
Fixed #26005 -- Fixed some percent decoding cases in uri_to_iri().
This commit is contained in:
parent
500532c95d
commit
03281d8fe7
@ -6,7 +6,7 @@ import sys
|
|||||||
from copy import copy
|
from copy import copy
|
||||||
from importlib import import_module
|
from importlib import import_module
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from urllib.parse import urljoin, urlparse, urlsplit
|
from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.handlers.base import BaseHandler
|
from django.core.handlers.base import BaseHandler
|
||||||
@ -20,7 +20,7 @@ from django.template import TemplateDoesNotExist
|
|||||||
from django.test import signals
|
from django.test import signals
|
||||||
from django.test.utils import ContextList
|
from django.test.utils import ContextList
|
||||||
from django.urls import resolve
|
from django.urls import resolve
|
||||||
from django.utils.encoding import force_bytes, uri_to_iri
|
from django.utils.encoding import force_bytes
|
||||||
from django.utils.functional import SimpleLazyObject, curry
|
from django.utils.functional import SimpleLazyObject, curry
|
||||||
from django.utils.http import urlencode
|
from django.utils.http import urlencode
|
||||||
from django.utils.itercompat import is_iterable
|
from django.utils.itercompat import is_iterable
|
||||||
@ -320,7 +320,7 @@ class RequestFactory:
|
|||||||
# If there are parameters, add them
|
# If there are parameters, add them
|
||||||
if parsed.params:
|
if parsed.params:
|
||||||
path += ";" + parsed.params
|
path += ";" + parsed.params
|
||||||
path = uri_to_iri(path).encode()
|
path = unquote_to_bytes(path)
|
||||||
# Replace the behavior where non-ASCII values in the WSGI environ are
|
# Replace the behavior where non-ASCII values in the WSGI environ are
|
||||||
# arbitrarily decoded with ISO-8859-1.
|
# arbitrarily decoded with ISO-8859-1.
|
||||||
# Refs comment in `get_bytes_from_wsgi()`.
|
# Refs comment in `get_bytes_from_wsgi()`.
|
||||||
|
@ -2,7 +2,7 @@ import codecs
|
|||||||
import datetime
|
import datetime
|
||||||
import locale
|
import locale
|
||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
from urllib.parse import quote, unquote_to_bytes
|
from urllib.parse import quote
|
||||||
|
|
||||||
from django.utils import six
|
from django.utils import six
|
||||||
from django.utils.functional import Promise
|
from django.utils.functional import Promise
|
||||||
@ -151,20 +151,57 @@ def iri_to_uri(iri):
|
|||||||
return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
|
return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
|
||||||
|
|
||||||
|
|
||||||
|
# List of byte values that uri_to_iri() decodes from percent encoding.
|
||||||
|
# First, the unreserved characters from RFC 3986:
|
||||||
|
_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)]
|
||||||
|
_hextobyte = {
|
||||||
|
(fmt % char).encode(): bytes((char,))
|
||||||
|
for ascii_range in _ascii_ranges
|
||||||
|
for char in ascii_range
|
||||||
|
for fmt in ['%02x', '%02X']
|
||||||
|
}
|
||||||
|
# And then every value from 128 up, because bytes ≥ 128 are part of multibyte
|
||||||
|
# unicode characters.
|
||||||
|
_hexdig = '0123456789ABCDEFabcdef'
|
||||||
|
_hextobyte.update({
|
||||||
|
(a + b).encode(): bytes.fromhex(a + b)
|
||||||
|
for a in _hexdig[8:] for b in _hexdig
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
def uri_to_iri(uri):
|
def uri_to_iri(uri):
|
||||||
"""
|
"""
|
||||||
Converts a Uniform Resource Identifier(URI) into an Internationalized
|
Converts a Uniform Resource Identifier(URI) into an Internationalized
|
||||||
Resource Identifier(IRI).
|
Resource Identifier(IRI).
|
||||||
|
|
||||||
This is the algorithm from section 3.2 of RFC 3987.
|
This is the algorithm from section 3.2 of RFC 3987, excluding step 4.
|
||||||
|
|
||||||
Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
|
Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
|
||||||
a string containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/').
|
a string containing the encoded result (e.g. '/I%20♥%20Django/').
|
||||||
"""
|
"""
|
||||||
if uri is None:
|
if uri is None:
|
||||||
return uri
|
return uri
|
||||||
uri = force_bytes(uri)
|
uri = force_bytes(uri)
|
||||||
iri = unquote_to_bytes(uri)
|
# Fast selective unquote: First, split on '%' and then starting with the
|
||||||
|
# second block, decode the first 2 bytes if they represent a hex code to
|
||||||
|
# decode. The rest of the block is the part after '%AB', not containing
|
||||||
|
# any '%'. Add that to the output without further processing.
|
||||||
|
bits = uri.split(b'%')
|
||||||
|
if len(bits) == 1:
|
||||||
|
iri = uri
|
||||||
|
else:
|
||||||
|
parts = [bits[0]]
|
||||||
|
append = parts.append
|
||||||
|
hextobyte = _hextobyte
|
||||||
|
for item in bits[1:]:
|
||||||
|
hex = item[:2]
|
||||||
|
if hex in hextobyte:
|
||||||
|
append(hextobyte[item[:2]])
|
||||||
|
append(item[2:])
|
||||||
|
else:
|
||||||
|
append(b'%')
|
||||||
|
append(item)
|
||||||
|
iri = b''.join(parts)
|
||||||
return repercent_broken_unicode(iri).decode()
|
return repercent_broken_unicode(iri).decode()
|
||||||
|
|
||||||
|
|
||||||
|
@ -195,19 +195,17 @@ result.
|
|||||||
|
|
||||||
Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
|
Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
|
||||||
implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
|
implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
|
||||||
It decodes all percent-encodings except those that don't represent a valid
|
|
||||||
UTF-8 sequence.
|
|
||||||
|
|
||||||
An example to demonstrate::
|
An example to demonstrate::
|
||||||
|
|
||||||
>>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
|
>>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
|
||||||
'/♥♥/?utf8=✓'
|
'/♥♥/?utf8=✓'
|
||||||
>>> uri_to_iri('%A9helloworld')
|
>>> uri_to_iri('%A9hello%3Fworld')
|
||||||
'%A9helloworld'
|
'%A9hello%3Fworld'
|
||||||
|
|
||||||
In the first example, the UTF-8 characters and reserved characters are
|
In the first example, the UTF-8 characters are unquoted. In the second, the
|
||||||
unquoted. In the second, the percent-encoding remains unchanged because it
|
percent-encodings remain unchanged because they lie outside the valid UTF-8
|
||||||
lies outside the valid UTF-8 range.
|
range or represent a reserved character.
|
||||||
|
|
||||||
Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
|
Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
|
||||||
following is always true::
|
following is always true::
|
||||||
|
@ -93,9 +93,11 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
|
|||||||
def test_uri_to_iri(self):
|
def test_uri_to_iri(self):
|
||||||
cases = [
|
cases = [
|
||||||
# Valid UTF-8 sequences are decoded.
|
# Valid UTF-8 sequences are decoded.
|
||||||
('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
|
('/%e2%89%Ab%E2%99%a5%E2%89%aB/', '/≫♥≫/'),
|
||||||
('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
|
('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
|
||||||
|
('/%41%5a%6B/', '/AZk/'),
|
||||||
|
# Reserved and non-URL valid ASCII chars are not decoded.
|
||||||
|
('/%25%20%02%41%7b/', '/%25%20%02A%7b/'),
|
||||||
# Broken UTF-8 sequences remain escaped.
|
# Broken UTF-8 sequences remain escaped.
|
||||||
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
|
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
|
||||||
('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
|
('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
|
||||||
@ -112,11 +114,12 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
|
|||||||
|
|
||||||
def test_complementarity(self):
|
def test_complementarity(self):
|
||||||
cases = [
|
cases = [
|
||||||
('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'),
|
('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen%20M\xfcnster/'),
|
||||||
('%&', '%&'),
|
('%&', '%&'),
|
||||||
('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
|
('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
|
||||||
('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
|
('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
|
||||||
('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
|
('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
|
||||||
|
('/%25%20%02%7b/', '/%25%20%02%7b/'),
|
||||||
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
|
('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
|
||||||
('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
|
('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
|
||||||
('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),
|
('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user