From ff308a06047cd60806d604a7cf612e5656ee2ac9 Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Wed, 29 May 2024 14:48:27 +0100 Subject: [PATCH] Fixed 35467 -- Replaced urlparse with urlsplit where appropriate. This work should not generate any change of functionality, and `urlsplit` is approximately 6x faster. Most use cases of `urlparse` didn't touch the path, so they can be converted to `urlsplit` without any issue. Most of those which do use `.path`, simply parse the URL, mutate the querystring, then put them back together, which is also fine (so long as urlunsplit is used). --- django/contrib/admin/options.py | 4 ++-- django/contrib/admin/templatetags/admin_urls.py | 10 +++++----- django/contrib/auth/decorators.py | 6 +++--- django/contrib/auth/middleware.py | 6 +++--- django/contrib/auth/mixins.py | 6 +++--- django/contrib/auth/views.py | 10 +++++----- django/contrib/staticfiles/handlers.py | 4 ++-- django/forms/fields.py | 4 ++-- django/http/response.py | 4 ++-- django/middleware/common.py | 4 ++-- django/middleware/csrf.py | 10 +++++----- django/test/client.py | 17 ++++++----------- django/test/testcases.py | 12 +++++------- django/utils/http.py | 6 +++--- docs/ref/urlresolvers.txt | 4 ++-- tests/admin_views/tests.py | 10 +++++----- tests/csrf_tests/tests.py | 16 ++++++---------- 17 files changed, 61 insertions(+), 72 deletions(-) diff --git a/django/contrib/admin/options.py b/django/contrib/admin/options.py index 12467de74d..9cc891d807 100644 --- a/django/contrib/admin/options.py +++ b/django/contrib/admin/options.py @@ -6,7 +6,7 @@ import warnings from functools import partial, update_wrapper from urllib.parse import parse_qsl from urllib.parse import quote as urlquote -from urllib.parse import urlparse +from urllib.parse import urlsplit from django import forms from django.conf import settings @@ -1384,7 +1384,7 @@ class ModelAdmin(BaseModelAdmin): ) def _get_preserved_qsl(self, request, preserved_filters): - query_string = urlparse(request.build_absolute_uri()).query + query_string = urlsplit(request.build_absolute_uri()).query return parse_qsl(query_string.replace(preserved_filters, "")) def response_add(self, request, obj, post_url_continue=None): diff --git a/django/contrib/admin/templatetags/admin_urls.py b/django/contrib/admin/templatetags/admin_urls.py index 871b0d5f20..176e7a49ed 100644 --- a/django/contrib/admin/templatetags/admin_urls.py +++ b/django/contrib/admin/templatetags/admin_urls.py @@ -1,4 +1,4 @@ -from urllib.parse import parse_qsl, unquote, urlparse, urlunparse +from urllib.parse import parse_qsl, unquote, urlsplit, urlunsplit from django import template from django.contrib.admin.utils import quote @@ -24,8 +24,8 @@ def add_preserved_filters(context, url, popup=False, to_field=None): preserved_filters = context.get("preserved_filters") preserved_qsl = context.get("preserved_qsl") - parsed_url = list(urlparse(url)) - parsed_qs = dict(parse_qsl(parsed_url[4])) + parsed_url = list(urlsplit(url)) + parsed_qs = dict(parse_qsl(parsed_url[3])) merged_qs = {} if preserved_qsl: @@ -66,5 +66,5 @@ def add_preserved_filters(context, url, popup=False, to_field=None): merged_qs.update(parsed_qs) - parsed_url[4] = urlencode(merged_qs) - return urlunparse(parsed_url) + parsed_url[3] = urlencode(merged_qs) + return urlunsplit(parsed_url) diff --git a/django/contrib/auth/decorators.py b/django/contrib/auth/decorators.py index ea1cef0795..78e76a9ae9 100644 --- a/django/contrib/auth/decorators.py +++ b/django/contrib/auth/decorators.py @@ -1,6 +1,6 @@ import asyncio from functools import wraps -from urllib.parse import urlparse +from urllib.parse import urlsplit from asgiref.sync import async_to_sync, sync_to_async @@ -25,8 +25,8 @@ def user_passes_test( resolved_login_url = resolve_url(login_url or settings.LOGIN_URL) # If the login url is the same scheme and net location then just # use the path as the "next" url. - login_scheme, login_netloc = urlparse(resolved_login_url)[:2] - current_scheme, current_netloc = urlparse(path)[:2] + login_scheme, login_netloc = urlsplit(resolved_login_url)[:2] + current_scheme, current_netloc = urlsplit(path)[:2] if (not login_scheme or login_scheme == current_scheme) and ( not login_netloc or login_netloc == current_netloc ): diff --git a/django/contrib/auth/middleware.py b/django/contrib/auth/middleware.py index 761929d67d..cb409ee778 100644 --- a/django/contrib/auth/middleware.py +++ b/django/contrib/auth/middleware.py @@ -1,5 +1,5 @@ from functools import partial -from urllib.parse import urlparse +from urllib.parse import urlsplit from django.conf import settings from django.contrib import auth @@ -74,8 +74,8 @@ class LoginRequiredMiddleware(MiddlewareMixin): resolved_login_url = resolve_url(self.get_login_url(view_func)) # If the login url is the same scheme and net location then use the # path as the "next" url. - login_scheme, login_netloc = urlparse(resolved_login_url)[:2] - current_scheme, current_netloc = urlparse(path)[:2] + login_scheme, login_netloc = urlsplit(resolved_login_url)[:2] + current_scheme, current_netloc = urlsplit(path)[:2] if (not login_scheme or login_scheme == current_scheme) and ( not login_netloc or login_netloc == current_netloc ): diff --git a/django/contrib/auth/mixins.py b/django/contrib/auth/mixins.py index 0e46000d97..1f2e95ff00 100644 --- a/django/contrib/auth/mixins.py +++ b/django/contrib/auth/mixins.py @@ -1,4 +1,4 @@ -from urllib.parse import urlparse +from urllib.parse import urlsplit from django.conf import settings from django.contrib.auth import REDIRECT_FIELD_NAME @@ -51,8 +51,8 @@ class AccessMixin: resolved_login_url = resolve_url(self.get_login_url()) # If the login url is the same scheme and net location then use the # path as the "next" url. - login_scheme, login_netloc = urlparse(resolved_login_url)[:2] - current_scheme, current_netloc = urlparse(path)[:2] + login_scheme, login_netloc = urlsplit(resolved_login_url)[:2] + current_scheme, current_netloc = urlsplit(path)[:2] if (not login_scheme or login_scheme == current_scheme) and ( not login_netloc or login_netloc == current_netloc ): diff --git a/django/contrib/auth/views.py b/django/contrib/auth/views.py index 9a6d18bcd2..a18cfdb347 100644 --- a/django/contrib/auth/views.py +++ b/django/contrib/auth/views.py @@ -1,4 +1,4 @@ -from urllib.parse import urlparse, urlunparse +from urllib.parse import urlsplit, urlunsplit from django.conf import settings @@ -183,13 +183,13 @@ def redirect_to_login(next, login_url=None, redirect_field_name=REDIRECT_FIELD_N """ resolved_url = resolve_url(login_url or settings.LOGIN_URL) - login_url_parts = list(urlparse(resolved_url)) + login_url_parts = list(urlsplit(resolved_url)) if redirect_field_name: - querystring = QueryDict(login_url_parts[4], mutable=True) + querystring = QueryDict(login_url_parts[3], mutable=True) querystring[redirect_field_name] = next - login_url_parts[4] = querystring.urlencode(safe="/") + login_url_parts[3] = querystring.urlencode(safe="/") - return HttpResponseRedirect(urlunparse(login_url_parts)) + return HttpResponseRedirect(urlunsplit(login_url_parts)) # Class-based password reset views diff --git a/django/contrib/staticfiles/handlers.py b/django/contrib/staticfiles/handlers.py index 7394eff818..686718a355 100644 --- a/django/contrib/staticfiles/handlers.py +++ b/django/contrib/staticfiles/handlers.py @@ -36,13 +36,13 @@ class StaticFilesHandlerMixin: * the host is provided as part of the base_url * the request's path isn't under the media path (or equal) """ - return path.startswith(self.base_url[2]) and not self.base_url[1] + return path.startswith(self.base_url.path) and not self.base_url.netloc def file_path(self, url): """ Return the relative path to the media file on disk for the given URL. """ - relative_url = url.removeprefix(self.base_url[2]) + relative_url = url.removeprefix(self.base_url.path) return url2pathname(relative_url) def serve(self, request): diff --git a/django/forms/fields.py b/django/forms/fields.py index 4ec7b7aee7..1a58a60743 100644 --- a/django/forms/fields.py +++ b/django/forms/fields.py @@ -792,13 +792,13 @@ class URLField(CharField): def to_python(self, value): def split_url(url): """ - Return a list of url parts via urlparse.urlsplit(), or raise + Return a list of url parts via urlsplit(), or raise ValidationError for some malformed URLs. """ try: return list(urlsplit(url)) except ValueError: - # urlparse.urlsplit can raise a ValueError with some + # urlsplit can raise a ValueError with some # misformatted URLs. raise ValidationError(self.error_messages["invalid"], code="invalid") diff --git a/django/http/response.py b/django/http/response.py index eecd972cd6..0d756403db 100644 --- a/django/http/response.py +++ b/django/http/response.py @@ -9,7 +9,7 @@ import time import warnings from email.header import Header from http.client import responses -from urllib.parse import urlparse +from urllib.parse import urlsplit from asgiref.sync import async_to_sync, sync_to_async @@ -616,7 +616,7 @@ class HttpResponseRedirectBase(HttpResponse): def __init__(self, redirect_to, *args, **kwargs): super().__init__(*args, **kwargs) self["Location"] = iri_to_uri(redirect_to) - parsed = urlparse(str(redirect_to)) + parsed = urlsplit(str(redirect_to)) if parsed.scheme and parsed.scheme not in self.allowed_schemes: raise DisallowedRedirect( "Unsafe redirect to URL with protocol '%s'" % parsed.scheme diff --git a/django/middleware/common.py b/django/middleware/common.py index 9f71b9d278..bf22d00f01 100644 --- a/django/middleware/common.py +++ b/django/middleware/common.py @@ -1,5 +1,5 @@ import re -from urllib.parse import urlparse +from urllib.parse import urlsplit from django.conf import settings from django.core.exceptions import PermissionDenied @@ -171,7 +171,7 @@ class BrokenLinkEmailsMiddleware(MiddlewareMixin): # The referer is equal to the current URL, ignoring the scheme (assumed # to be a poorly implemented bot). - parsed_referer = urlparse(referer) + parsed_referer = urlsplit(referer) if parsed_referer.netloc in ["", domain] and parsed_referer.path == uri: return True diff --git a/django/middleware/csrf.py b/django/middleware/csrf.py index f7943494ba..5ae1aae5c6 100644 --- a/django/middleware/csrf.py +++ b/django/middleware/csrf.py @@ -8,7 +8,7 @@ against request forgeries from other sites. import logging import string from collections import defaultdict -from urllib.parse import urlparse +from urllib.parse import urlsplit from django.conf import settings from django.core.exceptions import DisallowedHost, ImproperlyConfigured @@ -174,7 +174,7 @@ class CsrfViewMiddleware(MiddlewareMixin): @cached_property def csrf_trusted_origins_hosts(self): return [ - urlparse(origin).netloc.lstrip("*") + urlsplit(origin).netloc.lstrip("*") for origin in settings.CSRF_TRUSTED_ORIGINS ] @@ -190,7 +190,7 @@ class CsrfViewMiddleware(MiddlewareMixin): """ allowed_origin_subdomains = defaultdict(list) for parsed in ( - urlparse(origin) + urlsplit(origin) for origin in settings.CSRF_TRUSTED_ORIGINS if "*" in origin ): @@ -284,7 +284,7 @@ class CsrfViewMiddleware(MiddlewareMixin): if request_origin in self.allowed_origins_exact: return True try: - parsed_origin = urlparse(request_origin) + parsed_origin = urlsplit(request_origin) except ValueError: return False request_scheme = parsed_origin.scheme @@ -300,7 +300,7 @@ class CsrfViewMiddleware(MiddlewareMixin): raise RejectRequest(REASON_NO_REFERER) try: - referer = urlparse(referer) + referer = urlsplit(referer) except ValueError: raise RejectRequest(REASON_MALFORMED_REFERER) diff --git a/django/test/client.py b/django/test/client.py index aa42c1f60a..a755aae05c 100644 --- a/django/test/client.py +++ b/django/test/client.py @@ -8,7 +8,7 @@ from functools import partial from http import HTTPStatus from importlib import import_module from io import BytesIO, IOBase -from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit +from urllib.parse import unquote_to_bytes, urljoin, urlsplit from asgiref.sync import sync_to_async @@ -458,11 +458,7 @@ class RequestFactory: return json.dumps(data, cls=self.json_encoder) if should_encode else data def _get_path(self, parsed): - path = parsed.path - # If there are parameters, add them - if parsed.params: - path += ";" + parsed.params - path = unquote_to_bytes(path) + path = unquote_to_bytes(parsed.path) # Replace the behavior where non-ASCII values in the WSGI environ are # arbitrarily decoded with ISO-8859-1. # Refs comment in `get_bytes_from_wsgi()`. @@ -647,7 +643,7 @@ class RequestFactory: **extra, ): """Construct an arbitrary HTTP request.""" - parsed = urlparse(str(path)) # path can be lazy + parsed = urlsplit(str(path)) # path can be lazy data = force_bytes(data, settings.DEFAULT_CHARSET) r = { "PATH_INFO": self._get_path(parsed), @@ -671,8 +667,7 @@ class RequestFactory: # If QUERY_STRING is absent or empty, we want to extract it from the URL. if not r.get("QUERY_STRING"): # WSGI requires latin-1 encoded strings. See get_path_info(). - query_string = parsed[4].encode().decode("iso-8859-1") - r["QUERY_STRING"] = query_string + r["QUERY_STRING"] = parsed.query.encode().decode("iso-8859-1") return self.request(**r) @@ -748,7 +743,7 @@ class AsyncRequestFactory(RequestFactory): **extra, ): """Construct an arbitrary HTTP request.""" - parsed = urlparse(str(path)) # path can be lazy. + parsed = urlsplit(str(path)) # path can be lazy. data = force_bytes(data, settings.DEFAULT_CHARSET) s = { "method": method, @@ -772,7 +767,7 @@ class AsyncRequestFactory(RequestFactory): else: # If QUERY_STRING is absent or empty, we want to extract it from # the URL. - s["query_string"] = parsed[4] + s["query_string"] = parsed.query if headers: extra.update(HttpHeaders.to_asgi_names(headers)) s["headers"] += [ diff --git a/django/test/testcases.py b/django/test/testcases.py index 0a802c887b..f1c6b5ae9c 100644 --- a/django/test/testcases.py +++ b/django/test/testcases.py @@ -21,7 +21,7 @@ from urllib.parse import ( urljoin, urlparse, urlsplit, - urlunparse, + urlunsplit, ) from urllib.request import url2pathname @@ -541,11 +541,9 @@ class SimpleTestCase(unittest.TestCase): def normalize(url): """Sort the URL's query string parameters.""" url = str(url) # Coerce reverse_lazy() URLs. - scheme, netloc, path, params, query, fragment = urlparse(url) + scheme, netloc, path, query, fragment = urlsplit(url) query_parts = sorted(parse_qsl(query)) - return urlunparse( - (scheme, netloc, path, params, urlencode(query_parts), fragment) - ) + return urlunsplit((scheme, netloc, path, urlencode(query_parts), fragment)) if msg_prefix: msg_prefix += ": " @@ -1637,11 +1635,11 @@ class FSFilesHandler(WSGIHandler): * the host is provided as part of the base_url * the request's path isn't under the media path (or equal) """ - return path.startswith(self.base_url[2]) and not self.base_url[1] + return path.startswith(self.base_url.path) and not self.base_url.netloc def file_path(self, url): """Return the relative path to the file on disk for the given URL.""" - relative_url = url.removeprefix(self.base_url[2]) + relative_url = url.removeprefix(self.base_url.path) return url2pathname(relative_url) def get_response(self, request): diff --git a/django/utils/http.py b/django/utils/http.py index 78dfee7fee..bf783562dd 100644 --- a/django/utils/http.py +++ b/django/utils/http.py @@ -6,7 +6,7 @@ from datetime import datetime, timezone from email.utils import formatdate from urllib.parse import quote, unquote from urllib.parse import urlencode as original_urlencode -from urllib.parse import urlparse +from urllib.parse import urlsplit from django.utils.datastructures import MultiValueDict from django.utils.regex_helper import _lazy_re_compile @@ -271,11 +271,11 @@ def url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False): def _url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False): # Chrome considers any URL with more than two slashes to be absolute, but - # urlparse is not so flexible. Treat any url with three slashes as unsafe. + # urlsplit is not so flexible. Treat any url with three slashes as unsafe. if url.startswith("///"): return False try: - url_info = urlparse(url) + url_info = urlsplit(url) except ValueError: # e.g. invalid IPv6 addresses return False # Forbid URLs like http:///example.com - with a scheme, but without a hostname. diff --git a/docs/ref/urlresolvers.txt b/docs/ref/urlresolvers.txt index eb0b991f1b..b335d1fc39 100644 --- a/docs/ref/urlresolvers.txt +++ b/docs/ref/urlresolvers.txt @@ -203,7 +203,7 @@ A :class:`ResolverMatch` object can also be assigned to a triple:: One possible use of :func:`~django.urls.resolve` would be to test whether a view would raise a ``Http404`` error before redirecting to it:: - from urllib.parse import urlparse + from urllib.parse import urlsplit from django.urls import resolve from django.http import Http404, HttpResponseRedirect @@ -215,7 +215,7 @@ view would raise a ``Http404`` error before redirecting to it:: # modify the request and response as required, e.g. change locale # and set corresponding locale cookie - view, args, kwargs = resolve(urlparse(next)[2]) + view, args, kwargs = resolve(urlsplit(next).path) kwargs["request"] = request try: view(*args, **kwargs) diff --git a/tests/admin_views/tests.py b/tests/admin_views/tests.py index d49e7d028b..cb6815e7a8 100644 --- a/tests/admin_views/tests.py +++ b/tests/admin_views/tests.py @@ -4,7 +4,7 @@ import re import unittest import zoneinfo from unittest import mock -from urllib.parse import parse_qsl, urljoin, urlparse +from urllib.parse import parse_qsl, urljoin, urlsplit from django import forms from django.contrib import admin @@ -357,7 +357,7 @@ class AdminViewBasicTest(AdminViewBasicTestCase): **save_option, }, ) - parsed_url = urlparse(response.url) + parsed_url = urlsplit(response.url) self.assertEqual(parsed_url.query, qsl) def test_change_query_string_persists(self): @@ -386,7 +386,7 @@ class AdminViewBasicTest(AdminViewBasicTestCase): **save_option, }, ) - parsed_url = urlparse(response.url) + parsed_url = urlsplit(response.url) self.assertEqual(parsed_url.query, qsl) def test_basic_edit_GET(self): @@ -8032,11 +8032,11 @@ class AdminKeepChangeListFiltersTests(TestCase): Assert that two URLs are equal despite the ordering of their querystring. Refs #22360. """ - parsed_url1 = urlparse(url1) + parsed_url1 = urlsplit(url1) path1 = parsed_url1.path parsed_qs1 = dict(parse_qsl(parsed_url1.query)) - parsed_url2 = urlparse(url2) + parsed_url2 = urlsplit(url2) path2 = parsed_url2.path parsed_qs2 = dict(parse_qsl(parsed_url2.query)) diff --git a/tests/csrf_tests/tests.py b/tests/csrf_tests/tests.py index 9407221cd1..b736276534 100644 --- a/tests/csrf_tests/tests.py +++ b/tests/csrf_tests/tests.py @@ -709,25 +709,21 @@ class CsrfViewMiddlewareTestMixin(CsrfFunctionTestMixin): response = mw.process_view(req, post_form_view, (), {}) self.assertContains(response, malformed_referer_msg, status_code=403) # missing scheme - # >>> urlparse('//example.com/') - # ParseResult( - # scheme='', netloc='example.com', path='/', params='', query='', fragment='', - # ) + # >>> urlsplit('//example.com/') + # SplitResult(scheme='', netloc='example.com', path='/', query='', fragment='') req.META["HTTP_REFERER"] = "//example.com/" self._check_referer_rejects(mw, req) response = mw.process_view(req, post_form_view, (), {}) self.assertContains(response, malformed_referer_msg, status_code=403) # missing netloc - # >>> urlparse('https://') - # ParseResult( - # scheme='https', netloc='', path='', params='', query='', fragment='', - # ) + # >>> urlsplit('https://') + # SplitResult(scheme='https', netloc='', path='', query='', fragment='') req.META["HTTP_REFERER"] = "https://" self._check_referer_rejects(mw, req) response = mw.process_view(req, post_form_view, (), {}) self.assertContains(response, malformed_referer_msg, status_code=403) # Invalid URL - # >>> urlparse('https://[') + # >>> urlsplit('https://[') # ValueError: Invalid IPv6 URL req.META["HTTP_REFERER"] = "https://[" self._check_referer_rejects(mw, req) @@ -979,7 +975,7 @@ class CsrfViewMiddlewareTestMixin(CsrfFunctionTestMixin): @override_settings(ALLOWED_HOSTS=["www.example.com"]) def test_bad_origin_cannot_be_parsed(self): """ - A POST request with an origin that can't be parsed by urlparse() is + A POST request with an origin that can't be parsed by urlsplit() is rejected. """ req = self._get_POST_request_with_token()