(?:%s).*?[a-zA-Z].*?
\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) @@ -103,12 +104,22 @@ fix_ampersands = allow_lazy(fix_ampersands, unicode) def smart_urlquote(url): """Quotes an URL if it isn't already quoted.""" + # Handle IDN before quoting. + scheme, netloc, path, query, fragment = urlparse.urlsplit(url) + try: + netloc = netloc.encode('idna') # IDN -> ACE + except UnicodeError: # invalid domain part + pass + else: + url = urlparse.urlunsplit((scheme, netloc, path, query, fragment)) + # An URL is considered unquoted if it contains no % character, or if it # contains a % not followed by two hexadecimal digits. See #9655. if '%' not in url or unquoted_percents_re.search(url): # See http://bugs.python.org/issue2637 - return urlquote(url, safe='!*\'();:@&=+$,/?#[]~') - return url + url = urllib.quote(smart_str(url), safe='!*\'();:@&=+$,/?#[]~') + + return force_unicode(url) def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): """ @@ -145,8 +156,10 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): middle and middle[0] in string.ascii_letters + string.digits and \ (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))): url = smart_urlquote('http://%s' % middle) - elif '@' in middle and not ':' in middle and simple_email_re.match(middle): - url = 'mailto:%s' % middle + elif not ':' in middle and simple_email_re.match(middle): + local, domain = middle.rsplit('@', 1) + domain = domain.encode('idna') + url = 'mailto:%s@%s' % (local, domain) nofollow_attr = '' # Make link. if url: diff --git a/tests/regressiontests/defaultfilters/tests.py b/tests/regressiontests/defaultfilters/tests.py index 00518344a3..515840d87e 100644 --- a/tests/regressiontests/defaultfilters/tests.py +++ b/tests/regressiontests/defaultfilters/tests.py @@ -238,6 +238,7 @@ class DefaultFiltersTests(TestCase): # Check urlize with https addresses self.assertEqual(urlize('https://google.com'), u'https://google.com') + # Check urlize doesn't overquote already quoted urls - see #9655 self.assertEqual(urlize('http://hi.baidu.com/%D6%D8%D0%C2%BF'), u'' @@ -252,6 +253,16 @@ class DefaultFiltersTests(TestCase): u'' u'http://en.wikipedia.org/wiki/Café') + # Check urlize handles IDN correctly - see #13704 + self.assertEqual(urlize('http://c✶.ws'), + u'http://c✶.ws') + self.assertEqual(urlize('www.c✶.ws'), + u'www.c✶.ws') + self.assertEqual(urlize('c✶.org'), + u'c✶.org') + self.assertEqual(urlize('info@c✶.org'), + u'info@c✶.org') + def test_wordcount(self): self.assertEqual(wordcount(''), 0) self.assertEqual(wordcount(u'oneword'), 1)