From 6ca6c36f82b97eafeada61384b2e2f1d0587da86 Mon Sep 17 00:00:00 2001
From: Claude Paroz
Date: Thu, 20 Mar 2014 16:50:50 +0100
Subject: [PATCH] Improved strip_tags and clarified documentation
The fact that strip_tags cannot guarantee to really strip all
non-safe HTML content was not clear enough. Also see:
https://www.djangoproject.com/weblog/2014/mar/22/strip-tags-advisory/
---
django/utils/html.py | 31 +++++++++++++++++++++++++++----
docs/ref/templates/builtins.txt | 12 +++++++++++-
docs/ref/utils.txt | 16 +++++++++++-----
tests/utils_tests/test_html.py | 2 ++
4 files changed, 51 insertions(+), 10 deletions(-)
diff --git a/django/utils/html.py b/django/utils/html.py
index b9444fc01f..8be7fd5153 100644
--- a/django/utils/html.py
+++ b/django/utils/html.py
@@ -118,7 +118,10 @@ linebreaks = allow_lazy(linebreaks, six.text_type)
class MLStripper(HTMLParser):
def __init__(self):
- HTMLParser.__init__(self)
+ if six.PY2:
+ HTMLParser.__init__(self)
+ else:
+ HTMLParser.__init__(self, strict=False)
self.reset()
self.fed = []
@@ -135,16 +138,36 @@ class MLStripper(HTMLParser):
return ''.join(self.fed)
-def strip_tags(value):
- """Returns the given HTML with all tags stripped."""
+def _strip_once(value):
+ """
+ Internal tag stripping utility used by strip_tags.
+ """
s = MLStripper()
try:
s.feed(value)
- s.close()
except HTMLParseError:
return value
+ try:
+ s.close()
+ except (HTMLParseError, UnboundLocalError) as err:
+ # UnboundLocalError because of http://bugs.python.org/issue17802
+ # on Python 3.2, triggered by strict=False mode of HTMLParser
+ return s.get_data() + s.rawdata
else:
return s.get_data()
+
+
+def strip_tags(value):
+ """Returns the given HTML with all tags stripped."""
+ while True:
+ if not ('<' in value or '>' in value):
+ return value
+ new_value = _strip_once(value)
+ if new_value == value:
+ # _strip_once was not able to detect more tags
+ return value
+ else:
+ value = new_value
strip_tags = allow_lazy(strip_tags)
diff --git a/docs/ref/templates/builtins.txt b/docs/ref/templates/builtins.txt
index a73ba87d75..34dd42252a 100644
--- a/docs/ref/templates/builtins.txt
+++ b/docs/ref/templates/builtins.txt
@@ -1985,7 +1985,7 @@ If ``value`` is ``10``, the output will be ``1.000000E+01``.
striptags
^^^^^^^^^
-Strips all [X]HTML tags.
+Makes all possible efforts to strip all [X]HTML tags.
For example::
@@ -1994,6 +1994,16 @@ For example::
If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``, the
output will be ``"Joel is a slug"``.
+.. admonition:: No safety guarantee
+
+ Note that ``striptags`` doesn't give any guarantee about its output being
+ entirely HTML safe, particularly with invalid HTML input. So **NEVER**
+ apply the ``safe`` filter to a ``striptags`` output.
+ If you are looking for something more robust, you can use the ``bleach``
+ Python library, notably its `clean`_ method.
+
+.. _clean: http://bleach.readthedocs.org/en/latest/clean.html
+
.. templatefilter:: time
time
diff --git a/docs/ref/utils.txt b/docs/ref/utils.txt
index ea6bffce34..5515c01a20 100644
--- a/docs/ref/utils.txt
+++ b/docs/ref/utils.txt
@@ -595,17 +595,23 @@ escaping HTML.
.. function:: strip_tags(value)
- Removes anything that looks like an html tag from the string, that is
- anything contained within ``<>``.
+ Tries to remove anything that looks like an HTML tag from the string, that
+ is anything contained within ``<>``.
+ Absolutely NO guarantee is provided about the resulting string being entirely
+ HTML safe. So NEVER mark safe the result of a ``strip_tags`` call without
+ escaping it first, for example with :func:`~django.utils.html.escape`.
For example::
strip_tags(value)
If ``value`` is ``"<b>Joel</b> <button>is</button> a <span>slug</span>"``
- the return value will be ``"Joel is a slug"``. Note that ``strip_tags``
- result may still contain unsafe HTML content, so you might use
- :func:`~django.utils.html.escape` to make it a safe string.
+ the return value will be ``"Joel is a slug"``.
+
+ If you are looking for a more robust solution, take a look at the `bleach`_
+ Python library.
+
+ .. _bleach: https://pypi.python.org/pypi/bleach
.. versionchanged:: 1.6
diff --git a/tests/utils_tests/test_html.py b/tests/utils_tests/test_html.py
index b4e61b9fd6..70de3a078e 100644
--- a/tests/utils_tests/test_html.py
+++ b/tests/utils_tests/test_html.py
@@ -80,6 +80,8 @@ class TestUtilsHtml(TestCase):
('a
b
c', 'abc'),
('de
f', 'def'),
('foobar', 'foobar'),
+ ('ript>test</script>', 'test'),
+ ('&h', 'alert()&h'),
)
for value, output in items:
self.check_output(f, value, output)