From 1896d531cb64873165195de46bb1dc5562887e5d Mon Sep 17 00:00:00 2001
From: Chris Cahoon <chris.cahoon@gmail.com>
Date: Tue, 16 Jun 2009 16:42:46 +0000
Subject: [PATCH] [soc2009/http-wsgi-improvements] Code and tests in support of
 http.charsets.determine_charset.

This code determines the charset from a content-type or from the Accept-Charset request header.
The code is fairly well documented, but the documentation will be improved once the code is
closer to its final form. The codec that corresponds to the charset is also returned, but it is
not currently used by HttpResponse.

git-svn-id: http://code.djangoproject.com/svn/django/branches/soc2009/http-wsgi-improvements@11014 bcc190cf-cafb-0310-a4f2-bffc1f526a37
---
 django/http/__init__.py                    |  15 +-
 django/http/charsets.py                    | 351 +++++++++++++++++++++
 tests/regressiontests/charsets/__init__.py |   0
 tests/regressiontests/charsets/models.py   |   0
 tests/regressiontests/charsets/tests.py    |  60 ++++
 tests/regressiontests/charsets/urls.py     |   9 +
 tests/regressiontests/charsets/views.py    |  11 +
 7 files changed, 443 insertions(+), 3 deletions(-)
 create mode 100644 django/http/charsets.py
 create mode 100644 tests/regressiontests/charsets/__init__.py
 create mode 100644 tests/regressiontests/charsets/models.py
 create mode 100644 tests/regressiontests/charsets/tests.py
 create mode 100644 tests/regressiontests/charsets/urls.py
 create mode 100644 tests/regressiontests/charsets/views.py

diff --git a/django/http/__init__.py b/django/http/__init__.py
index 683212fcd4..9f7e82ba46 100644
--- a/django/http/__init__.py
+++ b/django/http/__init__.py
@@ -13,6 +13,7 @@ except ImportError:
 from django.utils.datastructures import MultiValueDict, ImmutableList
 from django.utils.encoding import smart_str, iri_to_uri, force_unicode
 from django.http.multipartparser import MultiPartParser
+from django.http.charsets import determine_charset
 from django.conf import settings
 from django.core.files import uploadhandler
 from utils import *
@@ -272,14 +273,16 @@ class HttpResponse(object):
     status_code = 200
 
     def __init__(self, content='', mimetype=None, status=None,
-            content_type=None):
+            content_type=None, origin_request=None):
         from django.conf import settings
         self._charset = settings.DEFAULT_CHARSET
         if mimetype:
-            content_type = mimetype     # For backwards compatibility
+            content_type = mimetype     # Mimetype is an alias for content-type 
+        if origin_request or content_type:
+           self._charset, self._codec = determine_charset(content_type, origin_request)
         if not content_type:
             content_type = "%s; charset=%s" % (settings.DEFAULT_CONTENT_TYPE,
-                    settings.DEFAULT_CHARSET)
+                    self._charset)
         if not isinstance(content, basestring) and hasattr(content, '__iter__'):
             self._container = content
             self._is_string = False
@@ -432,6 +435,12 @@ class HttpResponseNotAllowed(HttpResponse):
         HttpResponse.__init__(self)
         self['Allow'] = ', '.join(permitted_methods)
 
+class HttpResponseNotAcceptable(HttpResponse):
+    status_code = 406
+
+    # To be fully compliant this response should be more verbose; see
+    # http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
+
 class HttpResponseGone(HttpResponse):
     status_code = 410
 
diff --git a/django/http/charsets.py b/django/http/charsets.py
new file mode 100644
index 0000000000..035786e3aa
--- /dev/null
+++ b/django/http/charsets.py
@@ -0,0 +1,351 @@
+"Maps HTTP/1.1 charset names to Python codec names."
+
+import codecs
+import re
+from operator import itemgetter
+from django.conf import settings
+
+CHARSET_CODECS = {
+    '437': 'cp437',
+    '850': 'cp850',
+    '852': 'cp852',
+    '855': 'cp855',
+    '857': 'cp857',
+    '860': 'cp860',
+    '861': 'cp861',
+    '862': 'cp862',
+    '863': 'cp863',
+    '865': 'cp865',
+    '869': 'cp869',
+    'ansi_x3.4-1968': 'ascii',
+    'ansi_x3.4-1986': 'ascii',
+    'arabic': 'iso8859-6',
+    'ascii': 'ascii',
+    'asmo-708': 'iso8859-6',
+    'big5': 'big5',
+    'big5-hkscs': 'big5hkscs',
+    'ccsid01140': 'cp1140',
+    'chinese': 'gb2312',
+    'cp-gr': 'cp869',
+    'cp-is': 'cp861',
+    'cp01140': 'cp1140',
+    'cp037': 'cp037',
+    'cp1026': 'cp1026',
+    'cp154': 'ptcp154',
+    'cp367': 'ascii',
+    'cp424': 'cp424',
+    'cp437': 'cp437',
+    'cp500': 'cp500',
+    'cp775': 'cp775',
+    'cp819': 'iso8859-1',
+    'cp850': 'cp850',
+    'cp852': 'cp852',
+    'cp855': 'cp855',
+    'cp857': 'cp857',
+    'cp860': 'cp860',
+    'cp861': 'cp861',
+    'cp862': 'cp862',
+    'cp863': 'cp863',
+    'cp864': 'cp864',
+    'cp865': 'cp865',
+    'cp869': 'cp869',
+    'cp936': 'gbk',
+    'csascii': 'ascii',
+    'csbig5': 'big5',
+    'cseuckr': 'euc_kr',
+    'cseucpkdfmtjapanese': 'euc_jp',
+    'csibm037': 'cp037',
+    'csibm1026': 'cp1026',
+    'csibm424': 'cp424',
+    'csibm500': 'cp500',
+    'csibm855': 'cp855',
+    'csibm857': 'cp857',
+    'csibm860': 'cp860',
+    'csibm861': 'cp861',
+    'csibm863': 'cp863',
+    'csibm864': 'cp864',
+    'csibm865': 'cp865',
+    'csibm869': 'cp869',
+    'csiso2022jp': 'iso2022_jp',
+    'csiso2022jp2': 'iso2022_jp_2',
+    'csiso58gb231280': 'gb2312',
+    'csisolatin1': 'iso8859-1',
+    'csisolatin2': 'iso8859-2',
+    'csisolatin3': 'iso8859-3',
+    'csisolatin4': 'iso8859-4',
+    'csisolatin5': 'iso8859-9',
+    'csisolatin6': 'iso8859-10',
+    'csisolatinarabic': 'iso8859-6',
+    'csisolatincyrillic': 'iso8859-5',
+    'csisolatingreek': 'iso8859-7',
+    'csisolatinhebrew': 'iso8859-8',
+    'cskoi8r': 'koi8-r',
+    'cspc775baltic': 'cp775',
+    'cspc850multilingual': 'cp850',
+    'cspc862latinhebrew': 'cp862',
+    'cspc8codepage437': 'cp437',
+    'cspcp852': 'cp852',
+    'csptcp154': 'ptcp154',
+    'csshiftjis': 'shift_jis',
+    'cyrillic': 'iso8859-5',
+    'cyrillic-asian': 'ptcp154',
+    'ebcdic-cp-be': 'cp500',
+    'ebcdic-cp-ca': 'cp037',
+    'ebcdic-cp-ch': 'cp500',
+    'ebcdic-cp-he': 'cp424',
+    'ebcdic-cp-nl': 'cp037',
+    'ebcdic-cp-us': 'cp037',
+    'ebcdic-cp-wt': 'cp037',
+    'ebcdic-us-37+euro': 'cp1140',
+    'ecma-114': 'iso8859-6',
+    'ecma-118': 'iso8859-7',
+    'elot_928': 'iso8859-7',
+    'euc-jp': 'euc_jp',
+    'euc-kr': 'euc_kr',
+    'extended_unix_code_packed_format_for_japanese': 'euc_jp',
+    'gb18030': 'gb18030',
+    'gb_2312-80': 'gb2312',
+    'gbk': 'gbk',
+    'greek': 'iso8859-7',
+    'greek8': 'iso8859-7',
+    'hebrew': 'iso8859-8',
+    'hz-gb-2312': 'hz',
+    'ibm01140': 'cp1140',
+    'ibm037': 'cp037',
+    'ibm1026': 'cp1026',
+    'ibm367': 'ascii',
+    'ibm424': 'cp424',
+    'ibm437': 'cp437',
+    'ibm500': 'cp500',
+    'ibm775': 'cp775',
+    'ibm819': 'iso8859-1',
+    'ibm850': 'cp850',
+    'ibm852': 'cp852',
+    'ibm855': 'cp855',
+    'ibm857': 'cp857',
+    'ibm860': 'cp860',
+    'ibm861': 'cp861',
+    'ibm862': 'cp862',
+    'ibm863': 'cp863',
+    'ibm864': 'cp864',
+    'ibm865': 'cp865',
+    'ibm869': 'cp869',
+    'iso-2022-jp': 'iso2022_jp',
+    'iso-2022-jp-2': 'iso2022_jp_2',
+    'iso-8859-1': 'iso8859-1',
+    'iso-8859-10': 'iso8859-10',
+    'iso-8859-13': 'iso8859-13',
+    'iso-8859-14': 'iso8859-14',
+    'iso-8859-15': 'iso8859-15',
+    'iso-8859-2': 'iso8859-2',
+    'iso-8859-3': 'iso8859-3',
+    'iso-8859-4': 'iso8859-4',
+    'iso-8859-5': 'iso8859-5',
+    'iso-8859-6': 'iso8859-6',
+    'iso-8859-7': 'iso8859-7',
+    'iso-8859-8': 'iso8859-8',
+    'iso-8859-9': 'iso8859-9',
+    'iso-celtic': 'iso8859-14',
+    'iso-ir-100': 'iso8859-1',
+    'iso-ir-101': 'iso8859-2',
+    'iso-ir-109': 'iso8859-3',
+    'iso-ir-110': 'iso8859-4',
+    'iso-ir-126': 'iso8859-7',
+    'iso-ir-127': 'iso8859-6',
+    'iso-ir-138': 'iso8859-8',
+    'iso-ir-144': 'iso8859-5',
+    'iso-ir-148': 'iso8859-9',
+    'iso-ir-157': 'iso8859-10',
+    'iso-ir-199': 'iso8859-14',
+    'iso-ir-58': 'gb2312',
+    'iso-ir-6': 'ascii',
+    'iso646-us': 'ascii',
+    'iso_646.irv:1991': 'ascii',
+    'iso_8859-1': 'iso8859-1',
+    'iso_8859-10:1992': 'iso8859-10',
+    'iso_8859-14': 'iso8859-14',
+    'iso_8859-14:1998': 'iso8859-14',
+    'iso_8859-15': 'iso8859-15',
+    'iso_8859-1:1987': 'iso8859-1',
+    'iso_8859-2': 'iso8859-2',
+    'iso_8859-2:1987': 'iso8859-2',
+    'iso_8859-3': 'iso8859-3',
+    'iso_8859-3:1988': 'iso8859-3',
+    'iso_8859-4': 'iso8859-4',
+    'iso_8859-4:1988': 'iso8859-4',
+    'iso_8859-5': 'iso8859-5',
+    'iso_8859-5:1988': 'iso8859-5',
+    'iso_8859-6': 'iso8859-6',
+    'iso_8859-6:1987': 'iso8859-6',
+    'iso_8859-7': 'iso8859-7',
+    'iso_8859-7:1987': 'iso8859-7',
+    'iso_8859-8': 'iso8859-8',
+    'iso_8859-8:1988': 'iso8859-8',
+    'iso_8859-9': 'iso8859-9',
+    'iso_8859-9:1989': 'iso8859-9',
+    'koi8-r': 'koi8-r',
+    'koi8-u': 'koi8-u',
+    'l1': 'iso8859-1',
+    'l2': 'iso8859-2',
+    'l3': 'iso8859-3',
+    'l4': 'iso8859-4',
+    'l5': 'iso8859-9',
+    'l6': 'iso8859-10',
+    'l8': 'iso8859-14',
+    'latin-9': 'iso8859-15',
+    'latin1': 'iso8859-1',
+    'latin2': 'iso8859-2',
+    'latin3': 'iso8859-3',
+    'latin4': 'iso8859-4',
+    'latin5': 'iso8859-9',
+    'latin6': 'iso8859-10',
+    'latin8': 'iso8859-14',
+    'ms936': 'gbk',
+    'ms_kanji': 'shift_jis',
+    'pt154': 'ptcp154',
+    'ptcp154': 'ptcp154',
+    'shift_jis': 'shift_jis',
+    'us': 'ascii',
+    'us-ascii': 'ascii',
+    'utf-16': 'utf-16',
+    'utf-16le': 'utf-16-le',
+    'utf-32': 'utf-32',
+    'utf-32be': 'utf-32-be',
+    'utf-32le': 'utf-32-le',
+    'utf-7': 'utf-7',
+    'utf-8': 'utf-8',
+    'windows-1250': 'cp1250',
+    'windows-1251': 'cp1251',
+    'windows-1252': 'cp1252',
+    'windows-1253': 'cp1253',
+    'windows-1254': 'cp1254',
+    'windows-1255': 'cp1255',
+    'windows-1256': 'cp1256',
+    'windows-1257': 'cp1257',
+    'windows-1258': 'cp1258',
+    'windows-936': 'gbk'
+}
+
+def get_codec(charset):
+    """
+    Given the name or alias of a character set, find its Python codec if there is one.
+    
+    http://www.iana.org/assignments/character-sets contains valid aliases.
+    The documentation for the codecs module has the list of codecs.
+    
+    CHARSET_CODECS above has the codecs that correspond to character sets.
+    """
+    try:
+        codec_name = CHARSET_CODECS[charset.strip().lower()]
+        codec = codecs.lookup(codec_name)     
+    except KeyError:
+        #print "The charset %s is not supported by Django." % charset
+        codec = None
+    except LookupError:
+        #print "The encoding '%s' is not supported in this version of Python." % codec_name
+        codec = None 
+    
+    return codec
+
+# Returns the key for the maximum value in a dictionary
+max_dict_key = lambda l:sorted(l.iteritems(), key=itemgetter(1), reverse=True)[0][0]
+
+CONTENT_TYPE_RE = re.compile('.*; charset=([\w\d-]+);?')
+ACCEPT_CHARSET_RE = re.compile('(?P<charset>([\w\d-]+)|(\*))(;q=(?P<q>[01](\.\d{1,3})?))?,?')
+def determine_charset(content_type, request):
+    """
+    Searches request headers from clients and mimetype settings (which may be set
+    by users) for indicators of which charset and encoding the response should use.
+
+    Attempted partial support for HTTP RFC 2616 section 14.2 and ticket 10190.
+
+    Returns a (charset, codec) tuple; codec is None if Python has no codec for
+    the charset, so that the caller can answer with an error status.
+
+    Precedence: charset named in content_type (Accept-Charset is then ignored),
+                settings.DEFAULT_CHARSET if content_type names no charset,
+                iso-8859-1 if every accepted q is 0 and it was not listed itself,
+                settings.DEFAULT_CHARSET if its q is 1 or no accepted q is higher,
+                supported, "accept"ed charset with the highest q.
+    """
+    # A content-type wins over Accept-Charset. When it carries no charset
+    # parameter we fall back to the default so that the response never ends
+    # up with a charset of None.
+    if content_type:
+        match = CONTENT_TYPE_RE.match(content_type)
+        if match:
+            charset = match.group(1)
+        else:
+            charset = settings.DEFAULT_CHARSET
+        # An unsupported charset simply yields codec None (no printing to stdout).
+        return charset, get_codec(charset)
+
+    # Handle Accept-Charset (which we only do if we do not deal with content_type).
+    # WSGI/CGI environments expose the header as HTTP_ACCEPT_CHARSET; the bare
+    # ACCEPT_CHARSET key is still honoured for callers that set it directly.
+    header = None
+    if request:
+        header = request.META.get("HTTP_ACCEPT_CHARSET",
+                                  request.META.get("ACCEPT_CHARSET"))
+    if header is None:
+        # Nothing to negotiate: use settings.DEFAULT_CHARSET.
+        charset = settings.DEFAULT_CHARSET
+        return charset, get_codec(charset)
+
+    # Get list of matches for Accept-Charset.
+    # [{ "charset" : name, "q" : value or None }, ...]
+    accept_charset = [m.groupdict() for m in ACCEPT_CHARSET_RE.finditer(header)]
+
+    # Map every charset we can encode to its q value.
+    charsets = _process_accept_charset(accept_charset)
+
+    default_charset = settings.DEFAULT_CHARSET
+    fallback_charset = "ISO-8859-1"
+    # Prefer default_charset if its q value is 1 or we have no valid acceptable charsets.
+    max_q_charset = max_dict_key(charsets)
+    max_q_value = charsets[max_q_charset]
+    if max_q_value == 0 and fallback_charset not in charsets:
+        charset = fallback_charset
+    elif charsets[default_charset] == 1 or charsets[default_charset] == max_q_value:
+        charset = default_charset
+    # Get the highest valued acceptable charset (if we aren't going to the fallback
+    # or defaulting)
+    else:
+        charset = max_q_charset
+
+    # The codec may be None here. We will change the status code in the HttpResponse.
+    return charset, get_codec(charset)
+
+# NOTE -- make sure we are not duping the processing of q values
+def _process_accept_charset(accept_charset):
+    '''
+    HTTP RFC 2616 section 14.2 dictates that q must be between 0 and 1.
+    This method reads charset quality values (defaulting to 1), strips whitespace
+    from charset names, and drops charsets without Python codecs; q = 0 entries are kept.
+    '''
+    accepted_charsets = {}
+    
+    default_value = 1
+    wildcard = False
+    
+    for potential in accept_charset:
+        charset = potential["charset"].strip()            
+        # The default quality value is 1
+        if not potential["q"]:
+            q = 1.
+        else:    
+            q = float(potential["q"])
+        # Exclude unsupported charsets (those without codecs in Python)
+        if get_codec(charset) and q >= 0 and q <= 1:
+            accepted_charsets[charset] = q
+        elif charset == "*" and q >= 0 and q <= 1:
+            default_value = q
+            wildcard = True
+            
+    if settings.DEFAULT_CHARSET not in accepted_charsets:
+        accepted_charsets[settings.DEFAULT_CHARSET] = default_value 
+    if "ISO-8859-1" not in accepted_charsets and wildcard: 
+        accepted_charsets["ISO-8859-1"] = default_value
+      
+      
+    return accepted_charsets
\ No newline at end of file
diff --git a/tests/regressiontests/charsets/__init__.py b/tests/regressiontests/charsets/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/regressiontests/charsets/models.py b/tests/regressiontests/charsets/models.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/regressiontests/charsets/tests.py b/tests/regressiontests/charsets/tests.py
new file mode 100644
index 0000000000..03c62fb54c
--- /dev/null
+++ b/tests/regressiontests/charsets/tests.py
@@ -0,0 +1,60 @@
+from django.test import Client, TestCase
+import re
+from django.conf import settings
+
+CONTENT_TYPE_RE = re.compile('.*; charset=([\w\d-]+);?')
+
+
+def get_charset(response):
+    match = CONTENT_TYPE_RE.match(response.get("content-type",""))
+    if match:
+        charset = match.group(1)
+    else:
+        charset = None
+    return charset
+
+class ClientTest(TestCase):
+    
+    def test_good_accept_charset(self):
+        "Use Accept-Charset"
+        # The data is ignored, but let's check it doesn't crash the system
+        # anyway.
+        
+        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="ascii,utf-8;q=0")
+        self.assertEqual(get_charset(response), "ascii")
+         
+        # us is an alias for ascii
+        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="us;q=0.8,*;q=0.9")
+        self.assertEqual(get_charset(response), settings.DEFAULT_CHARSET)
+        
+        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="us;q=0.8,*;q=0.7")
+        self.assertEqual(get_charset(response), "us")
+        
+        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="ascii;q=0.89,utf-8;q=.9")
+        self.assertEqual(get_charset(response), settings.DEFAULT_CHARSET)
+        
+        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="utf-8;q=0")
+        self.assertEqual(get_charset(response), "ISO-8859-1")  
+    
+    def test_bad_accept_charset(self):
+        "Do not use a malformed Accept-Charset"
+        # The data is ignored, but let's check it doesn't crash the system
+        # anyway.
+        
+        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="this_is_junk")
+        self.assertEqual(get_charset(response), "utf-8")
+        
+    def test_good_content_type(self):
+        "Use content-type"
+        # The data is ignored, but let's check it doesn't crash the system
+        # anyway.
+        
+        response = self.client.post('/good_content_type/')
+        self.assertEqual(get_charset(response), "us")
+        
+    def test_bad_content_type(self):
+        "Use content-type"
+        # The data is ignored, but let's check it doesn't crash the system
+        # anyway.
+        
+        response = self.client.post('/bad_content_type/')
\ No newline at end of file
diff --git a/tests/regressiontests/charsets/urls.py b/tests/regressiontests/charsets/urls.py
new file mode 100644
index 0000000000..8df7e6054d
--- /dev/null
+++ b/tests/regressiontests/charsets/urls.py
@@ -0,0 +1,9 @@
+from django.conf.urls.defaults import *
+
+import views
+
+urlpatterns = patterns('',
+    (r'^accept_charset/', views.accept_charset),
+    (r'^good_content_type/', views.good_content_type),
+    (r'^bad_content_type/', views.bad_content_type),
+)
diff --git a/tests/regressiontests/charsets/views.py b/tests/regressiontests/charsets/views.py
new file mode 100644
index 0000000000..cb9fa3a682
--- /dev/null
+++ b/tests/regressiontests/charsets/views.py
@@ -0,0 +1,11 @@
+from django.http import HttpResponse
+from django.shortcuts import render_to_response
+
+def accept_charset(request):
+    return HttpResponse("ASCII.", origin_request=request)
+
+def good_content_type(request):
+    return HttpResponse("ASCII.", content_type="text/html; charset=us")
+
+def bad_content_type(request):
+    return HttpResponse("ASCII.", content_type="text/html; charset=this_should_be_junk")