From 1896d531cb64873165195de46bb1dc5562887e5d Mon Sep 17 00:00:00 2001 From: Chris Cahoon Date: Tue, 16 Jun 2009 16:42:46 +0000 Subject: [PATCH] [soc2009/http-wsgi-improvements] Code and tests in support of http.charsets.determine_charset. This code determines the charset from a content-type or from the Accept-Charset request header. The code is fairly well documented, but that will be improved once the code is closer to its final form. The codec that corresponds to the charset is also returned, but it is not currently used by HttpResponse. git-svn-id: http://code.djangoproject.com/svn/django/branches/soc2009/http-wsgi-improvements@11014 bcc190cf-cafb-0310-a4f2-bffc1f526a37 --- django/http/__init__.py | 15 +- django/http/charsets.py | 351 +++++++++++++++++++++ tests/regressiontests/charsets/__init__.py | 0 tests/regressiontests/charsets/models.py | 0 tests/regressiontests/charsets/tests.py | 60 ++++ tests/regressiontests/charsets/urls.py | 9 + tests/regressiontests/charsets/views.py | 11 + 7 files changed, 443 insertions(+), 3 deletions(-) create mode 100644 django/http/charsets.py create mode 100644 tests/regressiontests/charsets/__init__.py create mode 100644 tests/regressiontests/charsets/models.py create mode 100644 tests/regressiontests/charsets/tests.py create mode 100644 tests/regressiontests/charsets/urls.py create mode 100644 tests/regressiontests/charsets/views.py diff --git a/django/http/__init__.py b/django/http/__init__.py index 683212fcd4..9f7e82ba46 100644 --- a/django/http/__init__.py +++ b/django/http/__init__.py @@ -13,6 +13,7 @@ except ImportError: from django.utils.datastructures import MultiValueDict, ImmutableList from django.utils.encoding import smart_str, iri_to_uri, force_unicode from django.http.multipartparser import MultiPartParser +from django.http.charsets import determine_charset from django.conf import settings from django.core.files import uploadhandler from utils import * @@ -272,14 +273,16 @@ class HttpResponse(object): 
status_code = 200 def __init__(self, content='', mimetype=None, status=None, - content_type=None): + content_type=None, origin_request=None): from django.conf import settings self._charset = settings.DEFAULT_CHARSET if mimetype: - content_type = mimetype # For backwards compatibility + content_type = mimetype # Mimetype is an alias for content-type + if origin_request or content_type: + self._charset, self._codec = determine_charset(content_type, origin_request) if not content_type: content_type = "%s; charset=%s" % (settings.DEFAULT_CONTENT_TYPE, - settings.DEFAULT_CHARSET) + self._charset) if not isinstance(content, basestring) and hasattr(content, '__iter__'): self._container = content self._is_string = False @@ -432,6 +435,12 @@ class HttpResponseNotAllowed(HttpResponse): HttpResponse.__init__(self) self['Allow'] = ', '.join(permitted_methods) +class HttpResponseNotAcceptable(HttpResponse): + status_code = 406 + + # http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html + # if we want to make this more verbose (compliant, actually) + class HttpResponseGone(HttpResponse): status_code = 410 diff --git a/django/http/charsets.py b/django/http/charsets.py new file mode 100644 index 0000000000..035786e3aa --- /dev/null +++ b/django/http/charsets.py @@ -0,0 +1,351 @@ +"Maps HTTP/1.1 charset names to Python codec names" + +import codecs +import re +from operator import itemgetter +from django.conf import settings + +CHARSET_CODECS = { + '437': 'cp437', + '850': 'cp850', + '852': 'cp852', + '855': 'cp855', + '857': 'cp857', + '860': 'cp860', + '861': 'cp861', + '862': 'cp862', + '863': 'cp863', + '865': 'cp865', + '869': 'cp869', + 'ansi_x3.4-1968': 'ascii', + 'ansi_x3.4-1986': 'ascii', + 'arabic': 'iso8859-6', + 'ascii': 'ascii', + 'asmo-708': 'iso8859-6', + 'big5': 'big5', + 'big5-hkscs': 'big5hkscs', + 'ccsid01140': 'cp1140', + 'chinese': 'gb2312', + 'cp-gr': 'cp869', + 'cp-is': 'cp861', + 'cp01140': 'cp1140', + 'cp037': 'cp037', + 'cp1026': 'cp1026', + 'cp154': 'ptcp154', 
'cp367': 'ascii', + 'cp424': 'cp424', + 'cp437': 'cp437', + 'cp500': 'cp500', + 'cp775': 'cp775', + 'cp819': 'iso8859-1', + 'cp850': 'cp850', + 'cp852': 'cp852', + 'cp855': 'cp855', + 'cp857': 'cp857', + 'cp860': 'cp860', + 'cp861': 'cp861', + 'cp862': 'cp862', + 'cp863': 'cp863', + 'cp864': 'cp864', + 'cp865': 'cp865', + 'cp869': 'cp869', + 'cp936': 'gbk', + 'csascii': 'ascii', + 'csbig5': 'big5', + 'cseuckr': 'euc_kr', + 'cseucpkdfmtjapanese': 'euc_jp', + 'csibm037': 'cp037', + 'csibm1026': 'cp1026', + 'csibm424': 'cp424', + 'csibm500': 'cp500', + 'csibm855': 'cp855', + 'csibm857': 'cp857', + 'csibm860': 'cp860', + 'csibm861': 'cp861', + 'csibm863': 'cp863', + 'csibm864': 'cp864', + 'csibm865': 'cp865', + 'csibm869': 'cp869', + 'csiso2022jp': 'iso2022_jp', + 'csiso2022jp2': 'iso2022_jp_2', + 'csiso58gb231280': 'gb2312', + 'csisolatin1': 'iso8859-1', + 'csisolatin2': 'iso8859-2', + 'csisolatin3': 'iso8859-3', + 'csisolatin4': 'iso8859-4', + 'csisolatin5': 'iso8859-9', + 'csisolatin6': 'iso8859-10', + 'csisolatinarabic': 'iso8859-6', + 'csisolatincyrillic': 'iso8859-5', + 'csisolatingreek': 'iso8859-7', + 'csisolatinhebrew': 'iso8859-8', + 'cskoi8r': 'koi8-r', + 'cspc775baltic': 'cp775', + 'cspc850multilingual': 'cp850', + 'cspc862latinhebrew': 'cp862', + 'cspc8codepage437': 'cp437', + 'cspcp852': 'cp852', + 'csptcp154': 'ptcp154', + 'csshiftjis': 'shift_jis', + 'cyrillic': 'iso8859-5', + 'cyrillic-asian': 'ptcp154', + 'ebcdic-cp-be': 'cp500', + 'ebcdic-cp-ca': 'cp037', + 'ebcdic-cp-ch': 'cp500', + 'ebcdic-cp-he': 'cp424', + 'ebcdic-cp-nl': 'cp037', + 'ebcdic-cp-us': 'cp037', + 'ebcdic-cp-wt': 'cp037', + 'ebcdic-us-37+euro': 'cp1140', + 'ecma-114': 'iso8859-6', + 'ecma-118': 'iso8859-7', + 'elot_928': 'iso8859-7', + 'euc-jp': 'euc_jp', + 'euc-kr': 'euc_kr', + 'extended_unix_code_packed_format_for_japanese': 'euc_jp', + 'gb18030': 'gb18030', + 'gb_2312-80': 'gb2312', + 'gbk': 'gbk', + 'greek': 'iso8859-7', + 'greek8': 'iso8859-7', + 'hebrew': 'iso8859-8', + 
'hz-gb-2312': 'hz', + 'ibm01140': 'cp1140', + 'ibm037': 'cp037', + 'ibm1026': 'cp1026', + 'ibm367': 'ascii', + 'ibm424': 'cp424', + 'ibm437': 'cp437', + 'ibm500': 'cp500', + 'ibm775': 'cp775', + 'ibm819': 'iso8859-1', + 'ibm850': 'cp850', + 'ibm852': 'cp852', + 'ibm855': 'cp855', + 'ibm857': 'cp857', + 'ibm860': 'cp860', + 'ibm861': 'cp861', + 'ibm862': 'cp862', + 'ibm863': 'cp863', + 'ibm864': 'cp864', + 'ibm865': 'cp865', + 'ibm869': 'cp869', + 'iso-2022-jp': 'iso2022_jp', + 'iso-2022-jp-2': 'iso2022_jp_2', + 'iso-8859-1': 'iso8859-1', + 'iso-8859-10': 'iso8859-10', + 'iso-8859-13': 'iso8859-13', + 'iso-8859-14': 'iso8859-14', + 'iso-8859-15': 'iso8859-15', + 'iso-8859-2': 'iso8859-2', + 'iso-8859-3': 'iso8859-3', + 'iso-8859-4': 'iso8859-4', + 'iso-8859-5': 'iso8859-5', + 'iso-8859-6': 'iso8859-6', + 'iso-8859-7': 'iso8859-7', + 'iso-8859-8': 'iso8859-8', + 'iso-8859-9': 'iso8859-9', + 'iso-celtic': 'iso8859-14', + 'iso-ir-100': 'iso8859-1', + 'iso-ir-101': 'iso8859-2', + 'iso-ir-109': 'iso8859-3', + 'iso-ir-110': 'iso8859-4', + 'iso-ir-126': 'iso8859-7', + 'iso-ir-127': 'iso8859-6', + 'iso-ir-138': 'iso8859-8', + 'iso-ir-144': 'iso8859-5', + 'iso-ir-148': 'iso8859-9', + 'iso-ir-157': 'iso8859-10', + 'iso-ir-199': 'iso8859-14', + 'iso-ir-58': 'gb2312', + 'iso-ir-6': 'ascii', + 'iso646-us': 'ascii', + 'iso_646.irv:1991': 'ascii', + 'iso_8859-1': 'iso8859-1', + 'iso_8859-10:1992': 'iso8859-10', + 'iso_8859-14': 'iso8859-14', + 'iso_8859-14:1998': 'iso8859-14', + 'iso_8859-15': 'iso8859-15', + 'iso_8859-1:1987': 'iso8859-1', + 'iso_8859-2': 'iso8859-2', + 'iso_8859-2:1987': 'iso8859-2', + 'iso_8859-3': 'iso8859-3', + 'iso_8859-3:1988': 'iso8859-3', + 'iso_8859-4': 'iso8859-4', + 'iso_8859-4:1988': 'iso8859-4', + 'iso_8859-5': 'iso8859-5', + 'iso_8859-5:1988': 'iso8859-5', + 'iso_8859-6': 'iso8859-6', + 'iso_8859-6:1987': 'iso8859-6', + 'iso_8859-7': 'iso8859-7', + 'iso_8859-7:1987': 'iso8859-7', + 'iso_8859-8': 'iso8859-8', + 'iso_8859-8:1988': 'iso8859-8', + 
'iso_8859-9': 'iso8859-9', + 'iso_8859-9:1989': 'iso8859-9', + 'koi8-r': 'koi8-r', + 'koi8-u': 'koi8-u', + 'l1': 'iso8859-1', + 'l2': 'iso8859-2', + 'l3': 'iso8859-3', + 'l4': 'iso8859-4', + 'l5': 'iso8859-9', + 'l6': 'iso8859-10', + 'l8': 'iso8859-14', + 'latin-9': 'iso8859-15', + 'latin1': 'iso8859-1', + 'latin2': 'iso8859-2', + 'latin3': 'iso8859-3', + 'latin4': 'iso8859-4', + 'latin5': 'iso8859-9', + 'latin6': 'iso8859-10', + 'latin8': 'iso8859-14', + 'ms936': 'gbk', + 'ms_kanji': 'shift_jis', + 'pt154': 'ptcp154', + 'ptcp154': 'ptcp154', + 'shift_jis': 'shift_jis', + 'us': 'ascii', + 'us-ascii': 'ascii', + 'utf-16': 'utf-16', + 'utf-16le': 'utf-16-be', + 'utf-32': 'utf-32', + 'utf-32be': 'utf-32-be', + 'utf-32le': 'utf-32-le', + 'utf-7': 'utf-7', + 'utf-8': 'utf-8', + 'windows-1250': 'cp1250', + 'windows-1251': 'cp1251', + 'windows-1252': 'cp1252', + 'windows-1253': 'cp1253', + 'windows-1254': 'cp1254', + 'windows-1255': 'cp1255', + 'windows-1256': 'cp1256', + 'windows-1257': 'cp1257', + 'windows-1258': 'cp1258', + 'windows-936': 'gbk' +} + +def get_codec(charset): + """ + Given the name or alias of a character set, find its Python codec if there is one. + + http://www.iana.org/assignments/character-sets contains valid aliases. + The documentation for the codecs module has the list of codecs. + + CHARSET_CODECS above has the codecs that correspond to character sets. + """ + try: + codec_name = CHARSET_CODECS[charset.strip().lower()] + codec = codecs.lookup(codec_name) + except KeyError: + #print "The charset %s is not supported by Django." % charset + codec = None + except LookupError: + #print "The encoding '%s' is not supported in this version of Python." 
% codec_name + codec = None + + return codec + +# Returns the key for the maximum value in a dictionary +max_dict_key = lambda l:sorted(l.iteritems(), key=itemgetter(1), reverse=True)[0][0] + +CONTENT_TYPE_RE = re.compile('.*; charset=([\w\d-]+);?') +ACCEPT_CHARSET_RE = re.compile('(?P<charset>([\w\d-]+)|(\*))(;q=(?P<q>[01](\.\d{1,3})?))?,?') +def determine_charset(content_type, request): + """ + Searches request headers from clients and mimetype settings (which may be set + by users) for indicators of which charset and encoding the response should use. + + Attempted partial support for HTTP RFC 2616 section 14.2 and ticket 10190. + + Returns the highest "quality" (priority) charset that Python supports. + + Precedence: supported charset specified in content-type + settings.DEFAULT_CHARSET, + supported, "accept"ed charset such that its q > q of settings.DEFAULT_CHARSET + iso-8859-1 if q > 0 or is unspecified + 406 error + + """ + codec = None + charset = None + + # Attempt to get the codec from a content-type, and verify that the charset is valid. + if content_type: + match = CONTENT_TYPE_RE.match(content_type) + if match: + charset = match.group(1) + codec = get_codec(charset) + if not codec: # Unsupported charset + # we should throw an exception here + print "No CODEC ON MIMETYPE" + + # Handle Accept-Charset (which we only do if we do not deal with content_type). + else: + if request and "ACCEPT_CHARSET" in request.META: + # Get list of matches for Accepted-Charsets. + # [{ charset : q }, { charset : q }] + match_iterator = ACCEPT_CHARSET_RE.finditer(request.META["ACCEPT_CHARSET"]) + accept_charset = [m.groupdict() for m in match_iterator] + else: + accept_charset = [] # use settings.DEFAULT_CHARSET + charset = settings.DEFAULT_CHARSET + + # Remove charsets we cannot encode and whose q values are 0 + charsets = _process_accept_charset(accept_charset) + + # If we did not get a charset from the content type, we get it from accept_charset. 
+ if not charset: + default_charset = settings.DEFAULT_CHARSET + fallback_charset = "ISO-8859-1" + # Prefer default_charset if its q value is 1 or we have no valid acceptable charsets. + max_q_charset = max_dict_key(charsets) + max_q_value = charsets[max_q_charset] + if max_q_value == 0 and fallback_charset not in charsets: + charset = fallback_charset + elif charsets[default_charset] == 1 or charsets[default_charset] == max_q_value: + charset = default_charset + # Get the highest valued acceptable charset (if we aren't going to the fallback + # or defaulting) + else: + charset = max_q_charset + + codec = get_codec(charset) + # We may reach here with no codec or no charset. We will change the status + # code in the HttpResponse. + return charset, codec + +# NOTE -- make sure we are not duping the processing of q values +def _process_accept_charset(accept_charset): + ''' + HTTP RFC 2616 section 14.2 dictates that q must be between 0 and 1. + This method normalizes charset quality values, cleans whitespace from charset + names, and excludes charsets without Python codecs and whose q values are 0. + ''' + accepted_charsets = {} + + default_value = 1 + wildcard = False + + for potential in accept_charset: + charset = potential["charset"].strip() + # The default quality value is 1 + if not potential["q"]: + q = 1. 
+ else: + q = float(potential["q"]) + # Exclude unsupported charsets (those without codecs in Python) + if get_codec(charset) and q >= 0 and q <= 1: + accepted_charsets[charset] = q + elif charset == "*" and q >= 0 and q <= 1: + default_value = q + wildcard = True + + if settings.DEFAULT_CHARSET not in accepted_charsets: + accepted_charsets[settings.DEFAULT_CHARSET] = default_value + if "ISO-8859-1" not in accepted_charsets and wildcard: + accepted_charsets["ISO-8859-1"] = default_value + + + return accepted_charsets \ No newline at end of file diff --git a/tests/regressiontests/charsets/__init__.py b/tests/regressiontests/charsets/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/regressiontests/charsets/models.py b/tests/regressiontests/charsets/models.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/regressiontests/charsets/tests.py b/tests/regressiontests/charsets/tests.py new file mode 100644 index 0000000000..03c62fb54c --- /dev/null +++ b/tests/regressiontests/charsets/tests.py @@ -0,0 +1,60 @@ +from django.test import Client, TestCase +import re +from django.conf import settings + +CONTENT_TYPE_RE = re.compile('.*; charset=([\w\d-]+);?') + + +def get_charset(response): + match = CONTENT_TYPE_RE.match(response.get("content-type","")) + if match: + charset = match.group(1) + else: + charset = None + return charset + +class ClientTest(TestCase): + + def test_good_accept_charset(self): + "Use Accept-Charset" + # The data is ignored, but let's check it doesn't crash the system + # anyway. 
+ + response = self.client.post('/accept_charset/', ACCEPT_CHARSET="ascii,utf-8;q=0") + self.assertEqual(get_charset(response), "ascii") + + # us is an alias for ascii + response = self.client.post('/accept_charset/', ACCEPT_CHARSET="us;q=0.8,*;q=0.9") + self.assertEqual(get_charset(response), settings.DEFAULT_CHARSET) + + response = self.client.post('/accept_charset/', ACCEPT_CHARSET="us;q=0.8,*;q=0.7") + self.assertEqual(get_charset(response), "us") + + response = self.client.post('/accept_charset/', ACCEPT_CHARSET="ascii;q=0.89,utf-8;q=.9") + self.assertEqual(get_charset(response), settings.DEFAULT_CHARSET) + + response = self.client.post('/accept_charset/', ACCEPT_CHARSET="utf-8;q=0") + self.assertEqual(get_charset(response), "ISO-8859-1") + + def test_bad_accept_charset(self): + "Do not use a malformed Accept-Charset" + # The data is ignored, but let's check it doesn't crash the system + # anyway. + + response = self.client.post('/accept_charset/', ACCEPT_CHARSET="this_is_junk") + self.assertEqual(get_charset(response), "utf-8") + + def test_good_content_type(self): + "Use content-type" + # The data is ignored, but let's check it doesn't crash the system + # anyway. + + response = self.client.post('/good_content_type/') + self.assertEqual(get_charset(response), "us") + + def test_bad_content_type(self): + "Use content-type" + # The data is ignored, but let's check it doesn't crash the system + # anyway. 
+ + response = self.client.post('/bad_content_type/') \ No newline at end of file diff --git a/tests/regressiontests/charsets/urls.py b/tests/regressiontests/charsets/urls.py new file mode 100644 index 0000000000..8df7e6054d --- /dev/null +++ b/tests/regressiontests/charsets/urls.py @@ -0,0 +1,9 @@ +from django.conf.urls.defaults import * + +import views + +urlpatterns = patterns('', + (r'^accept_charset/', views.accept_charset), + (r'^good_content_type/', views.good_content_type), + (r'^bad_content_type/', views.bad_content_type), +) diff --git a/tests/regressiontests/charsets/views.py b/tests/regressiontests/charsets/views.py new file mode 100644 index 0000000000..cb9fa3a682 --- /dev/null +++ b/tests/regressiontests/charsets/views.py @@ -0,0 +1,11 @@ +from django.http import HttpResponse +from django.shortcuts import render_to_response + +def accept_charset(request): + return HttpResponse("ASCII.", origin_request=request) + +def good_content_type(request): + return HttpResponse("ASCII.", content_type="text/html; charset=us") + +def bad_content_type(request): + return HttpResponse("ASCII.", content_type="text/html; charset=this_should_be_junk")