From e567670b1abe61af4acfaa6a6a7e92a7acfa8b00 Mon Sep 17 00:00:00 2001 From: Claude Paroz Date: Thu, 14 Oct 2021 19:27:31 +0200 Subject: [PATCH] Fixed #33195 -- Refactored urlize() based on a class. This allows easier customization. --- django/utils/html.py | 171 ++++++++++++++++++++++++------------------- 1 file changed, 97 insertions(+), 74 deletions(-) diff --git a/django/utils/html.py b/django/utils/html.py index 2c8c1cc79e..cb4757818e 100644 --- a/django/utils/html.py +++ b/django/utils/html.py @@ -15,17 +15,6 @@ from django.utils.regex_helper import _lazy_re_compile from django.utils.safestring import SafeData, SafeString, mark_safe from django.utils.text import normalize_newlines -# Configuration for urlize() function. -TRAILING_PUNCTUATION_CHARS = '.,:;!' -WRAPPING_PUNCTUATION = [('(', ')'), ('[', ']')] - -word_split_re = _lazy_re_compile(r'''([\s<>"']+)''') -simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE) -simple_url_2_re = _lazy_re_compile( - r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$', - re.IGNORECASE -) - @keep_lazy(str, SafeString) def escape(text): @@ -229,48 +218,118 @@ def smart_urlquote(url): return urlunsplit((scheme, netloc, path, query, fragment)) -@keep_lazy_text -def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): +class Urlizer: """ Convert any URLs in text into clickable links. - Works on http://, https://, www. links, and also on links ending in one of + Work on http://, https://, www. links, and also on links ending in one of the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org). Links can have trailing punctuation (periods, commas, close-parens) and leading punctuation (opening parens) and it'll still do the right thing. - - If trim_url_limit is not None, truncate the URLs in the link text longer - than this limit to trim_url_limit - 1 characters and append an ellipsis. - - If nofollow is True, give the links a rel="nofollow" attribute. 
- - If autoescape is True, autoescape the link text and URLs. """ - safe_input = isinstance(text, SafeData) + trailing_punctuation_chars = '.,:;!' + wrapping_punctuation = [('(', ')'), ('[', ']')] - def trim_url(x, limit=trim_url_limit): - if limit is None or len(x) <= limit: + simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE) + simple_url_2_re = _lazy_re_compile( + r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$', + re.IGNORECASE + ) + word_split_re = _lazy_re_compile(r'''([\s<>"']+)''') + + mailto_template = 'mailto:{local}@{domain}' + url_template = '<a href="{href}"{attrs}>{url}</a>' + + def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False): + """ + If trim_url_limit is not None, truncate the URLs in the link text + longer than this limit to trim_url_limit - 1 characters and append an + ellipsis. + + If nofollow is True, give the links a rel="nofollow" attribute. + + If autoescape is True, autoescape the link text and URLs. + """ + self.trim_url_limit = trim_url_limit + self.nofollow = nofollow + self.autoescape = autoescape + self.safe_input = isinstance(text, SafeData) + + words = self.word_split_re.split(str(text)) + return ''.join([ + self.handle_word(word) for word in words + ]) + + def handle_word(self, word): + if '.' in word or '@' in word or ':' in word: + # lead: Punctuation trimmed from the beginning of the word. + # middle: State of the word. + # trail: Punctuation trimmed from the end of the word. + lead, middle, trail = self.trim_punctuation(word) + # Make URL we want to point to. 
+ url = None + nofollow_attr = ' rel="nofollow"' if self.nofollow else '' + if self.simple_url_re.match(middle): + url = smart_urlquote(html.unescape(middle)) + elif self.simple_url_2_re.match(middle): + url = smart_urlquote('http://%s' % html.unescape(middle)) + elif ':' not in middle and self.is_email_simple(middle): + local, domain = middle.rsplit('@', 1) + try: + domain = punycode(domain) + except UnicodeError: + return word + url = self.mailto_template.format(local=local, domain=domain) + nofollow_attr = '' + # Make link. + if url: + trimmed = self.trim_url(middle) + if self.autoescape and not self.safe_input: + lead, trail = escape(lead), escape(trail) + trimmed = escape(trimmed) + middle = self.url_template.format( + href=escape(url), + attrs=nofollow_attr, + url=trimmed, + ) + return mark_safe(f'{lead}{middle}{trail}') + else: + if self.safe_input: + return mark_safe(word) + elif self.autoescape: + return escape(word) + elif self.safe_input: + return mark_safe(word) + elif self.autoescape: + return escape(word) + return word + + def trim_url(self, x): + if self.trim_url_limit is None or len(x) <= self.trim_url_limit: return x - return '%s…' % x[:max(0, limit - 1)] + return '%s…' % x[:max(0, self.trim_url_limit - 1)] - def trim_punctuation(lead, middle, trail): + def trim_punctuation(self, word): """ - Trim trailing and wrapping punctuation from `middle`. Return the items - of the new state. + Trim trailing and wrapping punctuation from `word`. Return the items of + the new state. """ + lead, middle, trail = '', word, '' # Continue trimming until middle remains unchanged. trimmed_something = True while trimmed_something: trimmed_something = False # Trim wrapping punctuation. - for opening, closing in WRAPPING_PUNCTUATION: + for opening, closing in self.wrapping_punctuation: if middle.startswith(opening): middle = middle[len(opening):] lead += opening trimmed_something = True # Keep parentheses at the end only if they're balanced. 
- if (middle.endswith(closing) and - middle.count(closing) == middle.count(opening) + 1): + if ( + middle.endswith(closing) and + middle.count(closing) == middle.count(opening) + 1 + ): middle = middle[:-len(closing)] trail = closing + trail trimmed_something = True @@ -278,7 +337,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): # as encoded entities contain ';'). Unescape entities to avoid # breaking them by removing ';'. middle_unescaped = html.unescape(middle) - stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS) + stripped = middle_unescaped.rstrip(self.trailing_punctuation_chars) if middle_unescaped != stripped: punctuation_count = len(middle_unescaped) - len(stripped) trail = middle[-punctuation_count:] + trail @@ -286,6 +345,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): trimmed_something = True return lead, middle, trail + @staticmethod def is_email_simple(value): """Return True if value looks like an email address.""" # An @ must be in the middle of the value. @@ -301,50 +361,13 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): return False return True - words = word_split_re.split(str(text)) - for i, word in enumerate(words): - if '.' in word or '@' in word or ':' in word: - # lead: Current punctuation trimmed from the beginning of the word. - # middle: Current state of the word. - # trail: Current punctuation trimmed from the end of the word. - lead, middle, trail = '', word, '' - # Deal with punctuation. - lead, middle, trail = trim_punctuation(lead, middle, trail) - # Make URL we want to point to. 
- url = None - nofollow_attr = ' rel="nofollow"' if nofollow else '' - if simple_url_re.match(middle): - url = smart_urlquote(html.unescape(middle)) - elif simple_url_2_re.match(middle): - url = smart_urlquote('http://%s' % html.unescape(middle)) - elif ':' not in middle and is_email_simple(middle): - local, domain = middle.rsplit('@', 1) - try: - domain = punycode(domain) - except UnicodeError: - continue - url = 'mailto:%s@%s' % (local, domain) - nofollow_attr = '' +urlizer = Urlizer() - # Make link. - if url: - trimmed = trim_url(middle) - if autoescape and not safe_input: - lead, trail = escape(lead), escape(trail) - trimmed = escape(trimmed) - middle = '<a href="%s"%s>%s</a>' % (escape(url), nofollow_attr, trimmed) - words[i] = mark_safe('%s%s%s' % (lead, middle, trail)) - else: - if safe_input: - words[i] = mark_safe(word) - elif autoescape: - words[i] = escape(word) - elif safe_input: - words[i] = mark_safe(word) - elif autoescape: - words[i] = escape(word) - return ''.join(words) + +@keep_lazy_text +def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): + return urlizer(text, trim_url_limit=trim_url_limit, nofollow=nofollow, autoescape=autoescape) def avoid_wrapping(value):