Fixed #7267 - UnicodeDecodeError in clean_html

Thanks to Nikolay for the report, and gav and aaugustin for the patch.

git-svn-id: http://code.djangoproject.com/svn/django/trunk@16118 bcc190cf-cafb-0310-a4f2-bffc1f526a37
2025-10-26 07:06:08 +00:00 · 2011-04-28 14:08:53 +00:00
parent 2ac4f175ec
commit cf11e3789b
2 changed files with 16 additions and 4 deletions
--- a/django/utils/html.py
+++ b/django/utils/html.py
@@ -13,7 +13,7 @@ LEADING_PUNCTUATION  = ['(', '<', '&lt;']
 TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']

 # List of possible strings used for bullets in bulleted lists.
-DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
+DOTS = [u'&middot;', u'*', u'\u2022', u'&#149;', u'&bull;', u'&#8226;']

 unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
 word_split_re = re.compile(r'(\s+)')
@@ -180,13 +180,13 @@ def clean_html(text):
    text = html_gunk_re.sub('', text)
    # Convert hard-coded bullets into HTML unordered lists.
    def replace_p_tags(match):
-        s = match.group().replace('</p>', '</li>')
+        s = match.group().replace(u'</p>', u'</li>')
        for d in DOTS:
-            s = s.replace('<p>%s' % d, '<li>')
+            s = s.replace(u'<p>%s' % d, u'<li>')
        return u'<ul>\n%s\n</ul>' % s
    text = hard_coded_bullets_re.sub(replace_p_tags, text)
    # Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom
    # of the text.
-    text = trailing_empty_content_re.sub('', text)
+    text = trailing_empty_content_re.sub(u'', text)
    return text
 clean_html = allow_lazy(clean_html, unicode)