1
0
mirror of https://github.com/django/django.git synced 2025-07-04 01:39:20 +00:00

[soc2009/http-wsgi-improvements] Code and tests in support of http.charsets.determine_charset.

This code determines the charset from a content-type or from the Accept-Charset request header.
The code is fairly well documented, but the documentation will be improved once the code is closer to a
final form. The codec that corresponds to the charset is also returned, but it is not currently
used by HttpResponse.

git-svn-id: http://code.djangoproject.com/svn/django/branches/soc2009/http-wsgi-improvements@11014 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
Chris Cahoon 2009-06-16 16:42:46 +00:00
parent c435676176
commit 1896d531cb
7 changed files with 443 additions and 3 deletions

View File

@ -13,6 +13,7 @@ except ImportError:
from django.utils.datastructures import MultiValueDict, ImmutableList
from django.utils.encoding import smart_str, iri_to_uri, force_unicode
from django.http.multipartparser import MultiPartParser
from django.http.charsets import determine_charset
from django.conf import settings
from django.core.files import uploadhandler
from utils import *
@ -272,14 +273,16 @@ class HttpResponse(object):
status_code = 200
def __init__(self, content='', mimetype=None, status=None,
content_type=None):
content_type=None, origin_request=None):
from django.conf import settings
self._charset = settings.DEFAULT_CHARSET
if mimetype:
content_type = mimetype # For backwards compatibility
content_type = mimetype # Mimetype is an alias for content-type
if origin_request or content_type:
self._charset, self._codec = determine_charset(content_type, origin_request)
if not content_type:
content_type = "%s; charset=%s" % (settings.DEFAULT_CONTENT_TYPE,
settings.DEFAULT_CHARSET)
self._charset)
if not isinstance(content, basestring) and hasattr(content, '__iter__'):
self._container = content
self._is_string = False
@ -432,6 +435,12 @@ class HttpResponseNotAllowed(HttpResponse):
HttpResponse.__init__(self)
self['Allow'] = ', '.join(permitted_methods)
class HttpResponseNotAcceptable(HttpResponse):
    """HttpResponse subclass whose status code is 406 (Not Acceptable)."""
    status_code = 406
    # TODO: see http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
    # if we want to make this more verbose (compliant, actually)
class HttpResponseGone(HttpResponse):
status_code = 410

351
django/http/charsets.py Normal file
View File

@ -0,0 +1,351 @@
"Maps HTTP/1.1 charset names (and their aliases) to Python codec names"
import codecs
import re
from operator import itemgetter
from django.conf import settings
# Maps lower-cased charset names and aliases (as they may appear in a
# content-type or Accept-Charset header) to the name of the Python codec that
# implements them. Keys must be lower case: get_codec() lower-cases its
# argument before looking it up here.
CHARSET_CODECS = {
    '437': 'cp437',
    '850': 'cp850',
    '852': 'cp852',
    '855': 'cp855',
    '857': 'cp857',
    '860': 'cp860',
    '861': 'cp861',
    '862': 'cp862',
    '863': 'cp863',
    '865': 'cp865',
    '869': 'cp869',
    'ansi_x3.4-1968': 'ascii',
    'ansi_x3.4-1986': 'ascii',
    'arabic': 'iso8859-6',
    'ascii': 'ascii',
    'asmo-708': 'iso8859-6',
    'big5': 'big5',
    'big5-hkscs': 'big5hkscs',
    'ccsid01140': 'cp1140',
    'chinese': 'gb2312',
    'cp-gr': 'cp869',
    'cp-is': 'cp861',
    'cp01140': 'cp1140',
    'cp037': 'cp037',
    'cp1026': 'cp1026',
    'cp154': 'ptcp154',
    'cp367': 'ascii',
    'cp424': 'cp424',
    'cp437': 'cp437',
    'cp500': 'cp500',
    'cp775': 'cp775',
    'cp819': 'iso8859-1',
    'cp850': 'cp850',
    'cp852': 'cp852',
    'cp855': 'cp855',
    'cp857': 'cp857',
    'cp860': 'cp860',
    'cp861': 'cp861',
    'cp862': 'cp862',
    'cp863': 'cp863',
    'cp864': 'cp864',
    'cp865': 'cp865',
    'cp869': 'cp869',
    'cp936': 'gbk',
    'csascii': 'ascii',
    'csbig5': 'big5',
    'cseuckr': 'euc_kr',
    'cseucpkdfmtjapanese': 'euc_jp',
    'csibm037': 'cp037',
    'csibm1026': 'cp1026',
    'csibm424': 'cp424',
    'csibm500': 'cp500',
    'csibm855': 'cp855',
    'csibm857': 'cp857',
    'csibm860': 'cp860',
    'csibm861': 'cp861',
    'csibm863': 'cp863',
    'csibm864': 'cp864',
    'csibm865': 'cp865',
    'csibm869': 'cp869',
    'csiso2022jp': 'iso2022_jp',
    'csiso2022jp2': 'iso2022_jp_2',
    'csiso58gb231280': 'gb2312',
    'csisolatin1': 'iso8859-1',
    'csisolatin2': 'iso8859-2',
    'csisolatin3': 'iso8859-3',
    'csisolatin4': 'iso8859-4',
    'csisolatin5': 'iso8859-9',
    'csisolatin6': 'iso8859-10',
    'csisolatinarabic': 'iso8859-6',
    'csisolatincyrillic': 'iso8859-5',
    'csisolatingreek': 'iso8859-7',
    'csisolatinhebrew': 'iso8859-8',
    'cskoi8r': 'koi8-r',
    'cspc775baltic': 'cp775',
    'cspc850multilingual': 'cp850',
    'cspc862latinhebrew': 'cp862',
    'cspc8codepage437': 'cp437',
    'cspcp852': 'cp852',
    'csptcp154': 'ptcp154',
    'csshiftjis': 'shift_jis',
    'cyrillic': 'iso8859-5',
    'cyrillic-asian': 'ptcp154',
    'ebcdic-cp-be': 'cp500',
    'ebcdic-cp-ca': 'cp037',
    'ebcdic-cp-ch': 'cp500',
    'ebcdic-cp-he': 'cp424',
    'ebcdic-cp-nl': 'cp037',
    'ebcdic-cp-us': 'cp037',
    'ebcdic-cp-wt': 'cp037',
    'ebcdic-us-37+euro': 'cp1140',
    'ecma-114': 'iso8859-6',
    'ecma-118': 'iso8859-7',
    'elot_928': 'iso8859-7',
    'euc-jp': 'euc_jp',
    'euc-kr': 'euc_kr',
    'extended_unix_code_packed_format_for_japanese': 'euc_jp',
    'gb18030': 'gb18030',
    'gb_2312-80': 'gb2312',
    'gbk': 'gbk',
    'greek': 'iso8859-7',
    'greek8': 'iso8859-7',
    'hebrew': 'iso8859-8',
    'hz-gb-2312': 'hz',
    'ibm01140': 'cp1140',
    'ibm037': 'cp037',
    'ibm1026': 'cp1026',
    'ibm367': 'ascii',
    'ibm424': 'cp424',
    'ibm437': 'cp437',
    'ibm500': 'cp500',
    'ibm775': 'cp775',
    'ibm819': 'iso8859-1',
    'ibm850': 'cp850',
    'ibm852': 'cp852',
    'ibm855': 'cp855',
    'ibm857': 'cp857',
    'ibm860': 'cp860',
    'ibm861': 'cp861',
    'ibm862': 'cp862',
    'ibm863': 'cp863',
    'ibm864': 'cp864',
    'ibm865': 'cp865',
    'ibm869': 'cp869',
    'iso-2022-jp': 'iso2022_jp',
    'iso-2022-jp-2': 'iso2022_jp_2',
    'iso-8859-1': 'iso8859-1',
    'iso-8859-10': 'iso8859-10',
    'iso-8859-13': 'iso8859-13',
    'iso-8859-14': 'iso8859-14',
    'iso-8859-15': 'iso8859-15',
    'iso-8859-2': 'iso8859-2',
    'iso-8859-3': 'iso8859-3',
    'iso-8859-4': 'iso8859-4',
    'iso-8859-5': 'iso8859-5',
    'iso-8859-6': 'iso8859-6',
    'iso-8859-7': 'iso8859-7',
    'iso-8859-8': 'iso8859-8',
    'iso-8859-9': 'iso8859-9',
    'iso-celtic': 'iso8859-14',
    'iso-ir-100': 'iso8859-1',
    'iso-ir-101': 'iso8859-2',
    'iso-ir-109': 'iso8859-3',
    'iso-ir-110': 'iso8859-4',
    'iso-ir-126': 'iso8859-7',
    'iso-ir-127': 'iso8859-6',
    'iso-ir-138': 'iso8859-8',
    'iso-ir-144': 'iso8859-5',
    'iso-ir-148': 'iso8859-9',
    'iso-ir-157': 'iso8859-10',
    'iso-ir-199': 'iso8859-14',
    'iso-ir-58': 'gb2312',
    'iso-ir-6': 'ascii',
    'iso646-us': 'ascii',
    'iso_646.irv:1991': 'ascii',
    'iso_8859-1': 'iso8859-1',
    'iso_8859-10:1992': 'iso8859-10',
    'iso_8859-14': 'iso8859-14',
    'iso_8859-14:1998': 'iso8859-14',
    'iso_8859-15': 'iso8859-15',
    'iso_8859-1:1987': 'iso8859-1',
    'iso_8859-2': 'iso8859-2',
    'iso_8859-2:1987': 'iso8859-2',
    'iso_8859-3': 'iso8859-3',
    'iso_8859-3:1988': 'iso8859-3',
    'iso_8859-4': 'iso8859-4',
    'iso_8859-4:1988': 'iso8859-4',
    'iso_8859-5': 'iso8859-5',
    'iso_8859-5:1988': 'iso8859-5',
    'iso_8859-6': 'iso8859-6',
    'iso_8859-6:1987': 'iso8859-6',
    'iso_8859-7': 'iso8859-7',
    'iso_8859-7:1987': 'iso8859-7',
    'iso_8859-8': 'iso8859-8',
    'iso_8859-8:1988': 'iso8859-8',
    'iso_8859-9': 'iso8859-9',
    'iso_8859-9:1989': 'iso8859-9',
    'koi8-r': 'koi8-r',
    'koi8-u': 'koi8-u',
    'l1': 'iso8859-1',
    'l2': 'iso8859-2',
    'l3': 'iso8859-3',
    'l4': 'iso8859-4',
    'l5': 'iso8859-9',
    'l6': 'iso8859-10',
    'l8': 'iso8859-14',
    'latin-9': 'iso8859-15',
    'latin1': 'iso8859-1',
    'latin2': 'iso8859-2',
    'latin3': 'iso8859-3',
    'latin4': 'iso8859-4',
    'latin5': 'iso8859-9',
    'latin6': 'iso8859-10',
    'latin8': 'iso8859-14',
    'ms936': 'gbk',
    'ms_kanji': 'shift_jis',
    'pt154': 'ptcp154',
    'ptcp154': 'ptcp154',
    'shift_jis': 'shift_jis',
    'us': 'ascii',
    'us-ascii': 'ascii',
    'utf-16': 'utf-16',
    # Big- and little-endian UTF-16 are distinct codecs; each name must map
    # to the codec with the matching byte order.
    'utf-16be': 'utf-16-be',
    'utf-16le': 'utf-16-le',
    'utf-32': 'utf-32',
    'utf-32be': 'utf-32-be',
    'utf-32le': 'utf-32-le',
    'utf-7': 'utf-7',
    'utf-8': 'utf-8',
    'windows-1250': 'cp1250',
    'windows-1251': 'cp1251',
    'windows-1252': 'cp1252',
    'windows-1253': 'cp1253',
    'windows-1254': 'cp1254',
    'windows-1255': 'cp1255',
    'windows-1256': 'cp1256',
    'windows-1257': 'cp1257',
    'windows-1258': 'cp1258',
    'windows-936': 'gbk',
}
def get_codec(charset):
    """
    Look up the Python codec for the name or alias of a character set.

    The name is stripped of surrounding whitespace and lower-cased, translated
    to a Python codec name through CHARSET_CODECS, and then resolved with
    codecs.lookup(). Returns whatever codecs.lookup() returns, or None when
    the charset is not listed in CHARSET_CODECS or this version of Python has
    no codec of that name.

    http://www.iana.org/assignments/character-sets contains valid aliases.
    The documentation for the codecs module has the list of codecs.
    """
    normalized = charset.strip().lower()
    try:
        return codecs.lookup(CHARSET_CODECS[normalized])
    except LookupError:
        # Covers both failure modes: KeyError (a LookupError subclass) when
        # the charset is not in CHARSET_CODECS, and the LookupError raised by
        # codecs.lookup() when Python does not provide the codec.
        return None
def max_dict_key(l):
    """
    Return the key for the maximum value in the dictionary ``l``.

    When several keys share the maximum value, the first one in the
    dictionary's iteration order is returned. Raises ValueError if the
    dictionary is empty.
    """
    # max() finds the largest (key, value) pair in a single pass; .items()
    # works on both Python 2 and Python 3 dictionaries.
    return max(l.items(), key=itemgetter(1))[0]
# Captures the charset parameter of a content-type value such as
# "text/html; charset=utf-8" as group 1. The character class also accepts
# ".", ":" and "+" so that every name listed in CHARSET_CODECS (for example
# "ansi_x3.4-1968" or "iso_8859-1:1987") is captured whole.
CONTENT_TYPE_RE = re.compile(r'.*; charset=([\w.:+-]+);?')
# Matches one element of an Accept-Charset header: a charset name or "*"
# (group "charset"), optionally followed by ";q=<value>" (group "q", which is
# None when no quality value was parsed).
ACCEPT_CHARSET_RE = re.compile(r'(?P<charset>([\w.:+-]+)|(\*))(;q=(?P<q>[01](\.\d{1,3})?))?,?')
def determine_charset(content_type, request):
    """
    Searches request headers from clients and mimetype settings (which may be set
    by users) for indicators of which charset and encoding the response should use.

    Attempted partial support for HTTP RFC 2616 section 14.2 and ticket 10190.

    ``content_type`` is a content-type string (may be None or empty).
    ``request`` may be None; the only thing read from it is
    ``request.META["ACCEPT_CHARSET"]``.

    Precedence:
      1. If content_type is given, the charset named in it is used and the
         Accept-Charset header is not consulted.
      2. Without an Accept-Charset header, settings.DEFAULT_CHARSET is used.
      3. Otherwise the header is parsed and one of these is chosen:
           - ISO-8859-1 if the best quality value is 0 and ISO-8859-1 was
             not mentioned,
           - settings.DEFAULT_CHARSET if its q is 1 or ties the best q,
           - the charset with the highest q.

    Returns a ``(charset, codec)`` tuple. ``charset`` is None when content_type
    carries no charset parameter; ``codec`` is None when there is no charset
    or get_codec() does not support it. This function never raises for an
    unsupported charset: the caller is expected to inspect the result (for
    example to answer with a 406 status).
    """
    if content_type:
        match = CONTENT_TYPE_RE.match(content_type)
        if not match:
            # A content-type without a charset parameter: nothing to report.
            return None, None
        charset = match.group(1)
        # get_codec() yields None for an unsupported charset; that None is the
        # signal to the caller, so nothing is printed or raised here.
        return charset, get_codec(charset)

    # Handle Accept-Charset (which we only do if we do not deal with content_type).
    if not (request and "ACCEPT_CHARSET" in request.META):
        charset = settings.DEFAULT_CHARSET
        return charset, get_codec(charset)

    # List of parsed Accept-Charset elements: [{"charset": name, "q": q}, ...]
    accept_charset = [m.groupdict() for m in
                      ACCEPT_CHARSET_RE.finditer(request.META["ACCEPT_CHARSET"])]
    # {charset: q} for the charsets we can encode; always contains
    # settings.DEFAULT_CHARSET, so it is never empty.
    charsets = _process_accept_charset(accept_charset)

    default_charset = settings.DEFAULT_CHARSET
    fallback_charset = "ISO-8859-1"
    max_q_charset = max_dict_key(charsets)
    max_q_value = charsets[max_q_charset]

    if max_q_value == 0 and fallback_charset not in charsets:
        # Nothing acceptable was offered and the fallback was not ruled out.
        charset = fallback_charset
    elif charsets[default_charset] == 1 or charsets[default_charset] == max_q_value:
        # Prefer the default charset if its q value is 1 or ties the best one.
        charset = default_charset
    else:
        # Get the highest valued acceptable charset.
        charset = max_q_charset
    return charset, get_codec(charset)
# NOTE -- make sure we are not duping the processing of q values
def _process_accept_charset(accept_charset):
    '''
    Build a {charset: q} dictionary from parsed Accept-Charset elements.

    ``accept_charset`` is a list of dictionaries with the keys "charset" and
    "q"; a missing (None or empty) "q" counts as 1, the default quality value.
    HTTP RFC 2616 section 14.2 dictates that q must be between 0 and 1, so
    elements outside that range are dropped, as are charsets for which
    get_codec() finds no Python codec. Charset names are stripped of
    surrounding whitespace. Elements with q equal to 0 are kept.

    A "*" element supplies the quality given to settings.DEFAULT_CHARSET and
    to ISO-8859-1 when they are not listed explicitly. Without a wildcard,
    settings.DEFAULT_CHARSET is added with q 1 and ISO-8859-1 is not added.
    '''
    q_by_charset = {}
    wildcard_q = 1
    saw_wildcard = False

    for entry in accept_charset:
        name = entry["charset"].strip()
        raw_q = entry["q"]
        # The default quality value is 1
        quality = float(raw_q) if raw_q else 1.
        if not 0 <= quality <= 1:
            continue
        if get_codec(name):
            # Only charsets with a Python codec are recorded.
            q_by_charset[name] = quality
        elif name == "*":
            wildcard_q = quality
            saw_wildcard = True

    q_by_charset.setdefault(settings.DEFAULT_CHARSET, wildcard_q)
    if saw_wildcard and "ISO-8859-1" not in q_by_charset:
        q_by_charset["ISO-8859-1"] = wildcard_q
    return q_by_charset

View File

View File

@ -0,0 +1,60 @@
from django.test import Client, TestCase
import re
from django.conf import settings
# Captures the charset parameter of a content-type header value as group 1.
# Written as a raw string so the regex escapes are not treated as string escapes.
CONTENT_TYPE_RE = re.compile(r'.*; charset=([\w\d-]+);?')


def get_charset(response):
    """
    Return the charset named in the response's content-type header, or None
    when the header is missing or has no charset parameter.

    ``response`` only needs a dict-style ``get(name, default)`` method.
    """
    match = CONTENT_TYPE_RE.match(response.get("content-type", ""))
    return match.group(1) if match else None
class ClientTest(TestCase):
    # Posts to the charset test views with the test client and checks the
    # charset that ends up in the response's content-type header.

    def test_good_accept_charset(self):
        "Use Accept-Charset"
        # ascii is listed without a q value and utf-8 is given q=0, so ascii
        # is expected.
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="ascii,utf-8;q=0")
        self.assertEqual(get_charset(response), "ascii")

        # us is an alias for ascii
        # The wildcard's q (0.9) is higher than us (0.8): the default charset
        # is expected.
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="us;q=0.8,*;q=0.9")
        self.assertEqual(get_charset(response), settings.DEFAULT_CHARSET)

        # Here us (0.8) outranks the wildcard (0.7), so us is expected, spelled
        # the way the client sent it.
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="us;q=0.8,*;q=0.7")
        self.assertEqual(get_charset(response), "us")

        # NOTE(review): "q=.9" has no leading digit; presumably this checks
        # that such a value is tolerated -- confirm the intent.
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="ascii;q=0.89,utf-8;q=.9")
        self.assertEqual(get_charset(response), settings.DEFAULT_CHARSET)

        # Only utf-8 is mentioned and it is given q=0: ISO-8859-1 is expected.
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="utf-8;q=0")
        self.assertEqual(get_charset(response), "ISO-8859-1")

    def test_bad_accept_charset(self):
        "Do not use a malformed Accept-Charset"
        # "this_is_junk" is not a supported charset name.
        # NOTE(review): the expected value is hard-coded as "utf-8";
        # presumably it stands for settings.DEFAULT_CHARSET -- confirm.
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="this_is_junk")
        self.assertEqual(get_charset(response), "utf-8")

    def test_good_content_type(self):
        "Use content-type"
        # The view sets content_type="text/html; charset=us"; the charset it
        # names is expected unchanged.
        response = self.client.post('/good_content_type/')
        self.assertEqual(get_charset(response), "us")

    def test_bad_content_type(self):
        "Request a content-type that names an unknown charset"
        # The view names the charset "this_should_be_junk". There is no
        # assertion yet: the test only checks that the request completes
        # without raising.
        response = self.client.post('/bad_content_type/')

View File

@ -0,0 +1,9 @@
from django.conf.urls.defaults import *
import views
# Routes for the charset tests: each URL prefix is served by the view of the
# same name in the views module.
urlpatterns = patterns('',
    (r'^accept_charset/', views.accept_charset),
    (r'^good_content_type/', views.good_content_type),
    (r'^bad_content_type/', views.bad_content_type),
)

View File

@ -0,0 +1,11 @@
from django.http import HttpResponse
from django.shortcuts import render_to_response
def accept_charset(request):
    """Return a response built with the request passed as origin_request and no content type."""
    response = HttpResponse("ASCII.", origin_request=request)
    return response
def good_content_type(request):
    """Return a response whose content type names the charset "us"."""
    response = HttpResponse("ASCII.", content_type="text/html; charset=us")
    return response
def bad_content_type(request):
    """Return a response whose content type names the charset "this_should_be_junk"."""
    response = HttpResponse("ASCII.", content_type="text/html; charset=this_should_be_junk")
    return response