diff --git a/django/utils/html.py b/django/utils/html.py index 2687eb5232..4f74a7492c 100644 --- a/django/utils/html.py +++ b/django/utils/html.py @@ -17,6 +17,7 @@ TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '>'] DOTS = [u'·', u'*', u'\u2022', u'•', u'•', u'•'] unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)') +unquoted_percents_re = re.compile(r'%(?![0-9A-Fa-f]{2})') word_split_re = re.compile(r'(\s+)') punctuation_re = re.compile('^(?P(?:%s)*)(?P.*?)(?P(?:%s)*)$' % \ ('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]), @@ -100,6 +101,15 @@ def fix_ampersands(value): return unencoded_ampersands_re.sub('&', force_unicode(value)) fix_ampersands = allow_lazy(fix_ampersands, unicode) +def smart_urlquote(url): + """Quotes a URL with urlquote() unless it already appears to be percent-encoded.""" + # A URL is considered unquoted if it contains no % character, or if it + # contains a % not followed by two hexadecimal digits. See #9655. + if '%' not in url or unquoted_percents_re.search(url): + # See http://bugs.python.org/issue2637 + return urlquote(url, safe='!*\'();:@&=+$,/?#[]~') + return url + def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): """ Converts any URLs in text into clickable links. @@ -130,11 +140,11 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): # Make URL we want to point to. 
url = None if middle.startswith('http://') or middle.startswith('https://'): - url = urlquote(middle, safe='/&=:;#?+*') + url = smart_urlquote(middle) elif middle.startswith('www.') or ('@' not in middle and \ middle and middle[0] in string.ascii_letters + string.digits and \ (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))): - url = urlquote('http://%s' % middle, safe='/&=:;#?+*') + url = smart_urlquote('http://%s' % middle) elif '@' in middle and not ':' in middle and simple_email_re.match(middle): url = 'mailto:%s' % middle nofollow_attr = '' diff --git a/docs/releases/1.4.txt b/docs/releases/1.4.txt index de7afc83c3..9f9ce7fb59 100644 --- a/docs/releases/1.4.txt +++ b/docs/releases/1.4.txt @@ -1044,6 +1044,15 @@ Now, the flags are keyword arguments of :meth:`@register.filter See :ref:`filters and auto-escaping ` for more information. +The :tfilter:`urlize` filter no longer escapes every URL +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When a URL contains ``%`` characters and each one starts a ``%xx`` sequence, +where ``xx`` are two hexadecimal digits, :tfilter:`urlize` assumes that the URL +is already escaped, and doesn't apply URL escaping again. This is wrong for +URLs whose unquoted form contains a ``%xx`` sequence, but such URLs are very +unlikely to occur in the wild, since they would confuse browsers too. 
+ Session cookies now have the ``httponly`` flag by default ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/tests/regressiontests/defaultfilters/tests.py b/tests/regressiontests/defaultfilters/tests.py index 5e8c8f1538..00518344a3 100644 --- a/tests/regressiontests/defaultfilters/tests.py +++ b/tests/regressiontests/defaultfilters/tests.py @@ -238,6 +238,19 @@ class DefaultFiltersTests(TestCase): # Check urlize with https addresses self.assertEqual(urlize('https://google.com'), u'https://google.com') + # Check urlize doesn't overquote already quoted urls - see #9655 + self.assertEqual(urlize('http://hi.baidu.com/%D6%D8%D0%C2%BF'), + u'' + u'http://hi.baidu.com/%D6%D8%D0%C2%BF') + self.assertEqual(urlize('www.mystore.com/30%OffCoupons!'), + u'' + u'www.mystore.com/30%OffCoupons!') + self.assertEqual(urlize('http://en.wikipedia.org/wiki/Caf%C3%A9'), + u'' + u'http://en.wikipedia.org/wiki/Caf%C3%A9') + self.assertEqual(urlize('http://en.wikipedia.org/wiki/Café'), + u'' + u'http://en.wikipedia.org/wiki/Café') def test_wordcount(self): self.assertEqual(wordcount(''), 0)