From 2bab9d6d9ea095c4bcaeede2df576708afd46291 Mon Sep 17 00:00:00 2001 From: Bouke Haarsma Date: Tue, 12 Nov 2013 07:54:01 +0100 Subject: [PATCH] Fixed #21389 -- Accept most valid language codes By removing the 'supported' keyword from the detection methods and only relying on a cached settings.LANGUAGES, the speed of said methods has been improved; around 4x raw performance. This allows us to stop checking Python's incomplete list of locales, and rely on a less restrictive regular expression for accepting certain locales. HTTP Accept-Language is defined as being case-insensitive, based on this fact extra performance improvements have been made; it wouldn't make sense to check for case differences. --- django/middleware/locale.py | 7 +-- django/utils/translation/__init__.py | 4 +- django/utils/translation/trans_null.py | 2 +- django/utils/translation/trans_real.py | 83 +++++++++++--------------- docs/releases/1.7.txt | 8 +++ tests/i18n/tests.py | 22 ++++++- 6 files changed, 68 insertions(+), 58 deletions(-) diff --git a/django/middleware/locale.py b/django/middleware/locale.py index 1f64387d78..0867e3d60f 100644 --- a/django/middleware/locale.py +++ b/django/middleware/locale.py @@ -1,7 +1,5 @@ "This is the locale selecting middleware that will look at accept headers" -from collections import OrderedDict - from django.conf import settings from django.core.urlresolvers import (is_valid_path, get_resolver, LocaleRegexURLResolver) @@ -21,7 +19,6 @@ class LocaleMiddleware(object): response_redirect_class = HttpResponseRedirect def __init__(self): - self._supported_languages = OrderedDict(settings.LANGUAGES) self._is_language_prefix_patterns_used = False for url_pattern in get_resolver(None).url_patterns: if isinstance(url_pattern, LocaleRegexURLResolver): @@ -37,9 +34,7 @@ class LocaleMiddleware(object): def process_response(self, request, response): language = translation.get_language() - language_from_path = translation.get_language_from_path( - request.path_info, 
supported=self._supported_languages - ) + language_from_path = translation.get_language_from_path(request.path_info) if (response.status_code == 404 and not language_from_path and self.is_language_prefix_patterns_used()): urlconf = getattr(request, 'urlconf', None) diff --git a/django/utils/translation/__init__.py b/django/utils/translation/__init__.py index db159d2ef0..3803ade6b4 100644 --- a/django/utils/translation/__init__.py +++ b/django/utils/translation/__init__.py @@ -187,8 +187,8 @@ def get_language_from_request(request, check_path=False): return _trans.get_language_from_request(request, check_path) -def get_language_from_path(path, supported=None): - return _trans.get_language_from_path(path, supported=supported) +def get_language_from_path(path): + return _trans.get_language_from_path(path) def templatize(src, origin=None): diff --git a/django/utils/translation/trans_null.py b/django/utils/translation/trans_null.py index 69f32bc243..75dedda5f7 100644 --- a/django/utils/translation/trans_null.py +++ b/django/utils/translation/trans_null.py @@ -68,5 +68,5 @@ def get_language_from_request(request, check_path=False): return settings.LANGUAGE_CODE -def get_language_from_path(request, supported=None): +def get_language_from_path(request): return None diff --git a/django/utils/translation/trans_real.py b/django/utils/translation/trans_real.py index ea872e1ec9..80f18d6c6b 100644 --- a/django/utils/translation/trans_real.py +++ b/django/utils/translation/trans_real.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from collections import OrderedDict -import locale import os import re import sys @@ -29,9 +28,9 @@ _active = local() # The default translation is based on the settings file. _default = None -# This is a cache for normalized accept-header languages to prevent multiple -# file lookups when checking the same locale on repeated requests. 
-_accepted = {} +# This is a cache of settings.LANGUAGES in an OrderedDict for easy lookups by +# key +_supported = None # magic gettext number to separate context from message CONTEXT_SEPARATOR = "\x04" @@ -63,9 +62,11 @@ def reset_cache(**kwargs): Reset global state when LANGUAGES setting has been changed, as some languages should no longer be accepted. """ - if kwargs['setting'] == 'LANGUAGES': - global _accepted - _accepted = {} + if kwargs['setting'] in ('LANGUAGES', 'LANGUAGE_CODE'): + global _supported + _supported = None + check_for_language.cache_clear() + get_supported_language_variant.cache_clear() def to_locale(language, to_lower=False): @@ -388,7 +389,7 @@ def all_locale_paths(): return [globalpath] + list(settings.LOCALE_PATHS) -@lru_cache.lru_cache(maxsize=None) +@lru_cache.lru_cache() def check_for_language(lang_code): """ Checks whether there is a global language file for the given language @@ -404,39 +405,42 @@ def check_for_language(lang_code): return False -def get_supported_language_variant(lang_code, supported=None, strict=False): +@lru_cache.lru_cache(maxsize=1000) +def get_supported_language_variant(lang_code, strict=False): """ Returns the language-code that's listed in supported languages, possibly selecting a more generic variant. Raises LookupError if nothing found. If `strict` is False (the default), the function will look for an alternative country-specific variant when the currently checked is not found. + + lru_cache should have a maxsize to prevent from memory exhaustion attacks, + as the provided language codes are taken from the HTTP request. See also + <https://www.djangoproject.com/weblog/2007/oct/26/security-fix/>. 
""" - if supported is None: + global _supported + if _supported is None: from django.conf import settings - supported = OrderedDict(settings.LANGUAGES) + _supported = OrderedDict(settings.LANGUAGES) if lang_code: # some browsers use deprecated language codes -- #18419 replacement = _BROWSERS_DEPRECATED_LOCALES.get(lang_code) - if lang_code not in supported and replacement in supported: + if lang_code not in _supported and replacement in _supported: return replacement - # if fr-CA is not supported, try fr-ca; if that fails, fallback to fr. + # if fr-ca is not supported, try fr. generic_lang_code = lang_code.split('-')[0] - variants = (lang_code, lang_code.lower(), generic_lang_code, - generic_lang_code.lower()) - for code in variants: - if code in supported and check_for_language(code): + for code in (lang_code, generic_lang_code): + if code in _supported and check_for_language(code): return code if not strict: # if fr-fr is not supported, try fr-ca. - for supported_code in supported: - if supported_code.startswith((generic_lang_code + '-', - generic_lang_code.lower() + '-')): + for supported_code in _supported: + if supported_code.startswith(generic_lang_code + '-'): return supported_code raise LookupError(lang_code) -def get_language_from_path(path, supported=None, strict=False): +def get_language_from_path(path, strict=False): """ Returns the language-code if there is a valid language-code found in the `path`. @@ -444,15 +448,12 @@ def get_language_from_path(path, supported=None, strict=False): If `strict` is False (the default), the function will look for an alternative country-specific variant when the currently checked is not found. 
""" - if supported is None: - from django.conf import settings - supported = OrderedDict(settings.LANGUAGES) regex_match = language_code_prefix_re.match(path) if not regex_match: return None lang_code = regex_match.group(1) try: - return get_supported_language_variant(lang_code, supported, strict=strict) + return get_supported_language_variant(lang_code, strict=strict) except LookupError: return None @@ -467,25 +468,26 @@ def get_language_from_request(request, check_path=False): If check_path is True, the URL path prefix will be checked for a language code, otherwise this is skipped for backwards compatibility. """ - global _accepted from django.conf import settings - supported = OrderedDict(settings.LANGUAGES) + global _supported + if _supported is None: + _supported = OrderedDict(settings.LANGUAGES) if check_path: - lang_code = get_language_from_path(request.path_info, supported) + lang_code = get_language_from_path(request.path_info) if lang_code is not None: return lang_code if hasattr(request, 'session'): # for backwards compatibility django_language is also checked (remove in 1.8) lang_code = request.session.get(LANGUAGE_SESSION_KEY, request.session.get('django_language')) - if lang_code in supported and lang_code is not None and check_for_language(lang_code): + if lang_code in _supported and lang_code is not None and check_for_language(lang_code): return lang_code lang_code = request.COOKIES.get(settings.LANGUAGE_COOKIE_NAME) try: - return get_supported_language_variant(lang_code, supported) + return get_supported_language_variant(lang_code) except LookupError: pass @@ -494,29 +496,16 @@ def get_language_from_request(request, check_path=False): if accept_lang == '*': break - # 'normalized' is the root name of the locale in POSIX format (which is - # the format used for the directories holding the MO files). 
- normalized = locale.locale_alias.get(to_locale(accept_lang, True)) - if not normalized: + if not language_code_re.search(accept_lang): continue - # Remove the default encoding from locale_alias. - normalized = normalized.split('.')[0] - - if normalized in _accepted: - # We've seen this locale before and have an MO file for it, so no - # need to check again. - return _accepted[normalized] try: - accept_lang = get_supported_language_variant(accept_lang, supported) + return get_supported_language_variant(accept_lang) except LookupError: continue - else: - _accepted[normalized] = accept_lang - return accept_lang try: - return get_supported_language_variant(settings.LANGUAGE_CODE, supported) + return get_supported_language_variant(settings.LANGUAGE_CODE) except LookupError: return settings.LANGUAGE_CODE @@ -732,7 +721,7 @@ def parse_accept_lang_header(lang_string): Any format errors in lang_string results in an empty list being returned. """ result = [] - pieces = accept_language_re.split(lang_string) + pieces = accept_language_re.split(lang_string.lower()) if pieces[-1]: return [] for i in range(0, len(pieces) - 1, 3): diff --git a/docs/releases/1.7.txt b/docs/releases/1.7.txt index 4d12a58e72..aef8842e36 100644 --- a/docs/releases/1.7.txt +++ b/docs/releases/1.7.txt @@ -1125,6 +1125,14 @@ Miscellaneous For example, if you use multi-inheritance, you need to define custom primary key fields on parent models, otherwise the default ``id`` fields will clash. +* :meth:`~django.utils.translation.parse_accept_lang_header` now returns + lowercase locales, instead of the case as it was provided. As locales should + be treated case-insensitive this allows us to speed up locale detection. + +* :meth:`~django.utils.translation.get_language_from_path` and + :meth:`~django.utils.translation.trans_real.get_supported_language_variant` + now no longer have a ``supported`` argument. + .. 
_deprecated-features-1.7: Features deprecated in 1.7 diff --git a/tests/i18n/tests.py b/tests/i18n/tests.py index 56c3f63316..cc25904ccc 100644 --- a/tests/i18n/tests.py +++ b/tests/i18n/tests.py @@ -821,10 +821,10 @@ class MiscTests(TestCase): p = trans_real.parse_accept_lang_header # Good headers. self.assertEqual([('de', 1.0)], p('de')) - self.assertEqual([('en-AU', 1.0)], p('en-AU')) + self.assertEqual([('en-au', 1.0)], p('en-AU')) self.assertEqual([('es-419', 1.0)], p('es-419')) self.assertEqual([('*', 1.0)], p('*;q=1.00')) - self.assertEqual([('en-AU', 0.123)], p('en-AU;q=0.123')) + self.assertEqual([('en-au', 0.123)], p('en-AU;q=0.123')) self.assertEqual([('en-au', 0.5)], p('en-au;q=0.5')) self.assertEqual([('en-au', 1.0)], p('en-au;q=1.0')) self.assertEqual([('da', 1.0), ('en', 0.5), ('en-gb', 0.25)], p('da, en-gb;q=0.25, en;q=0.5')) @@ -884,6 +884,24 @@ class MiscTests(TestCase): r.META = {'HTTP_ACCEPT_LANGUAGE': 'zh-cn,de'} self.assertEqual(g(r), 'zh-cn') + r.META = {'HTTP_ACCEPT_LANGUAGE': 'NL'} + self.assertEqual('nl', g(r)) + + r.META = {'HTTP_ACCEPT_LANGUAGE': 'fy'} + self.assertEqual('fy', g(r)) + + r.META = {'HTTP_ACCEPT_LANGUAGE': 'ia'} + self.assertEqual('ia', g(r)) + + r.META = {'HTTP_ACCEPT_LANGUAGE': 'sr-latn'} + self.assertEqual('sr-latn', g(r)) + + r.META = {'HTTP_ACCEPT_LANGUAGE': 'zh-hans'} + self.assertEqual('zh-hans', g(r)) + + r.META = {'HTTP_ACCEPT_LANGUAGE': 'zh-hant'} + self.assertEqual('zh-hant', g(r)) + @override_settings( LANGUAGES=( ('en', 'English'),