mirror of
https://github.com/django/django.git
synced 2025-07-04 17:59:13 +00:00
[soc2009/http-wsgi-improvements] Code and tests in support of http.charsets.determine_charset.
This code determines the charset from a content-type or from the Accept-Charset request header. The code is fairly well documented, but the documentation will be improved once the code is closer to a final form. The codec that corresponds to the charset is also returned, but it is not currently used by HttpResponse. git-svn-id: http://code.djangoproject.com/svn/django/branches/soc2009/http-wsgi-improvements@11014 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
parent
c435676176
commit
1896d531cb
@ -13,6 +13,7 @@ except ImportError:
|
||||
from django.utils.datastructures import MultiValueDict, ImmutableList
|
||||
from django.utils.encoding import smart_str, iri_to_uri, force_unicode
|
||||
from django.http.multipartparser import MultiPartParser
|
||||
from django.http.charsets import determine_charset
|
||||
from django.conf import settings
|
||||
from django.core.files import uploadhandler
|
||||
from utils import *
|
||||
@ -272,14 +273,16 @@ class HttpResponse(object):
|
||||
status_code = 200
|
||||
|
||||
def __init__(self, content='', mimetype=None, status=None,
|
||||
content_type=None):
|
||||
content_type=None, origin_request=None):
|
||||
from django.conf import settings
|
||||
self._charset = settings.DEFAULT_CHARSET
|
||||
if mimetype:
|
||||
content_type = mimetype # For backwards compatibility
|
||||
content_type = mimetype # Mimetype is an alias for content-type
|
||||
if origin_request or content_type:
|
||||
self._charset, self._codec = determine_charset(content_type, origin_request)
|
||||
if not content_type:
|
||||
content_type = "%s; charset=%s" % (settings.DEFAULT_CONTENT_TYPE,
|
||||
settings.DEFAULT_CHARSET)
|
||||
self._charset)
|
||||
if not isinstance(content, basestring) and hasattr(content, '__iter__'):
|
||||
self._container = content
|
||||
self._is_string = False
|
||||
@ -432,6 +435,12 @@ class HttpResponseNotAllowed(HttpResponse):
|
||||
HttpResponse.__init__(self)
|
||||
self['Allow'] = ', '.join(permitted_methods)
|
||||
|
||||
class HttpResponseNotAcceptable(HttpResponse):
    """HTTP response carrying status code 406 (Not Acceptable)."""
    status_code = 406

    # See http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
    # if we want to make this response more verbose (and, actually, more
    # compliant with the RFC).
|
||||
|
||||
class HttpResponseGone(HttpResponse):
|
||||
status_code = 410
|
||||
|
||||
|
351
django/http/charsets.py
Normal file
351
django/http/charsets.py
Normal file
@ -0,0 +1,351 @@
|
||||
"Maps HTTP/1.1 charset names (and their aliases) to Python codec names."
|
||||
|
||||
import codecs
|
||||
import re
|
||||
from operator import itemgetter
|
||||
from django.conf import settings
|
||||
|
||||
# Maps lower-cased charset names and aliases (as registered at
# http://www.iana.org/assignments/character-sets) to the name of the Python
# codec that implements them. Keys must be lower case because get_codec()
# lower-cases the name before looking it up.
CHARSET_CODECS = {
    '437': 'cp437',
    '850': 'cp850',
    '852': 'cp852',
    '855': 'cp855',
    '857': 'cp857',
    '860': 'cp860',
    '861': 'cp861',
    '862': 'cp862',
    '863': 'cp863',
    '865': 'cp865',
    '869': 'cp869',
    'ansi_x3.4-1968': 'ascii',
    'ansi_x3.4-1986': 'ascii',
    'arabic': 'iso8859-6',
    'ascii': 'ascii',
    'asmo-708': 'iso8859-6',
    'big5': 'big5',
    'big5-hkscs': 'big5hkscs',
    'ccsid01140': 'cp1140',
    'chinese': 'gb2312',
    'cp-gr': 'cp869',
    'cp-is': 'cp861',
    'cp01140': 'cp1140',
    'cp037': 'cp037',
    'cp1026': 'cp1026',
    'cp154': 'ptcp154',
    'cp367': 'ascii',
    'cp424': 'cp424',
    'cp437': 'cp437',
    'cp500': 'cp500',
    'cp775': 'cp775',
    'cp819': 'iso8859-1',
    'cp850': 'cp850',
    'cp852': 'cp852',
    'cp855': 'cp855',
    'cp857': 'cp857',
    'cp860': 'cp860',
    'cp861': 'cp861',
    'cp862': 'cp862',
    'cp863': 'cp863',
    'cp864': 'cp864',
    'cp865': 'cp865',
    'cp869': 'cp869',
    'cp936': 'gbk',
    'csascii': 'ascii',
    'csbig5': 'big5',
    'cseuckr': 'euc_kr',
    'cseucpkdfmtjapanese': 'euc_jp',
    'csibm037': 'cp037',
    'csibm1026': 'cp1026',
    'csibm424': 'cp424',
    'csibm500': 'cp500',
    'csibm855': 'cp855',
    'csibm857': 'cp857',
    'csibm860': 'cp860',
    'csibm861': 'cp861',
    'csibm863': 'cp863',
    'csibm864': 'cp864',
    'csibm865': 'cp865',
    'csibm869': 'cp869',
    'csiso2022jp': 'iso2022_jp',
    'csiso2022jp2': 'iso2022_jp_2',
    'csiso58gb231280': 'gb2312',
    'csisolatin1': 'iso8859-1',
    'csisolatin2': 'iso8859-2',
    'csisolatin3': 'iso8859-3',
    'csisolatin4': 'iso8859-4',
    'csisolatin5': 'iso8859-9',
    'csisolatin6': 'iso8859-10',
    'csisolatinarabic': 'iso8859-6',
    'csisolatincyrillic': 'iso8859-5',
    'csisolatingreek': 'iso8859-7',
    'csisolatinhebrew': 'iso8859-8',
    'cskoi8r': 'koi8-r',
    'cspc775baltic': 'cp775',
    'cspc850multilingual': 'cp850',
    'cspc862latinhebrew': 'cp862',
    'cspc8codepage437': 'cp437',
    'cspcp852': 'cp852',
    'csptcp154': 'ptcp154',
    'csshiftjis': 'shift_jis',
    'cyrillic': 'iso8859-5',
    'cyrillic-asian': 'ptcp154',
    'ebcdic-cp-be': 'cp500',
    'ebcdic-cp-ca': 'cp037',
    'ebcdic-cp-ch': 'cp500',
    'ebcdic-cp-he': 'cp424',
    'ebcdic-cp-nl': 'cp037',
    'ebcdic-cp-us': 'cp037',
    'ebcdic-cp-wt': 'cp037',
    'ebcdic-us-37+euro': 'cp1140',
    'ecma-114': 'iso8859-6',
    'ecma-118': 'iso8859-7',
    'elot_928': 'iso8859-7',
    'euc-jp': 'euc_jp',
    'euc-kr': 'euc_kr',
    'extended_unix_code_packed_format_for_japanese': 'euc_jp',
    'gb18030': 'gb18030',
    'gb_2312-80': 'gb2312',
    'gbk': 'gbk',
    'greek': 'iso8859-7',
    'greek8': 'iso8859-7',
    'hebrew': 'iso8859-8',
    'hz-gb-2312': 'hz',
    'ibm01140': 'cp1140',
    'ibm037': 'cp037',
    'ibm1026': 'cp1026',
    'ibm367': 'ascii',
    'ibm424': 'cp424',
    'ibm437': 'cp437',
    'ibm500': 'cp500',
    'ibm775': 'cp775',
    'ibm819': 'iso8859-1',
    'ibm850': 'cp850',
    'ibm852': 'cp852',
    'ibm855': 'cp855',
    'ibm857': 'cp857',
    'ibm860': 'cp860',
    'ibm861': 'cp861',
    'ibm862': 'cp862',
    'ibm863': 'cp863',
    'ibm864': 'cp864',
    'ibm865': 'cp865',
    'ibm869': 'cp869',
    'iso-2022-jp': 'iso2022_jp',
    'iso-2022-jp-2': 'iso2022_jp_2',
    'iso-8859-1': 'iso8859-1',
    'iso-8859-10': 'iso8859-10',
    'iso-8859-13': 'iso8859-13',
    'iso-8859-14': 'iso8859-14',
    'iso-8859-15': 'iso8859-15',
    'iso-8859-2': 'iso8859-2',
    'iso-8859-3': 'iso8859-3',
    'iso-8859-4': 'iso8859-4',
    'iso-8859-5': 'iso8859-5',
    'iso-8859-6': 'iso8859-6',
    'iso-8859-7': 'iso8859-7',
    'iso-8859-8': 'iso8859-8',
    'iso-8859-9': 'iso8859-9',
    'iso-celtic': 'iso8859-14',
    'iso-ir-100': 'iso8859-1',
    'iso-ir-101': 'iso8859-2',
    'iso-ir-109': 'iso8859-3',
    'iso-ir-110': 'iso8859-4',
    'iso-ir-126': 'iso8859-7',
    'iso-ir-127': 'iso8859-6',
    'iso-ir-138': 'iso8859-8',
    'iso-ir-144': 'iso8859-5',
    'iso-ir-148': 'iso8859-9',
    'iso-ir-157': 'iso8859-10',
    'iso-ir-199': 'iso8859-14',
    'iso-ir-58': 'gb2312',
    'iso-ir-6': 'ascii',
    'iso646-us': 'ascii',
    'iso_646.irv:1991': 'ascii',
    'iso_8859-1': 'iso8859-1',
    'iso_8859-10:1992': 'iso8859-10',
    'iso_8859-14': 'iso8859-14',
    'iso_8859-14:1998': 'iso8859-14',
    'iso_8859-15': 'iso8859-15',
    'iso_8859-1:1987': 'iso8859-1',
    'iso_8859-2': 'iso8859-2',
    'iso_8859-2:1987': 'iso8859-2',
    'iso_8859-3': 'iso8859-3',
    'iso_8859-3:1988': 'iso8859-3',
    'iso_8859-4': 'iso8859-4',
    'iso_8859-4:1988': 'iso8859-4',
    'iso_8859-5': 'iso8859-5',
    'iso_8859-5:1988': 'iso8859-5',
    'iso_8859-6': 'iso8859-6',
    'iso_8859-6:1987': 'iso8859-6',
    'iso_8859-7': 'iso8859-7',
    'iso_8859-7:1987': 'iso8859-7',
    'iso_8859-8': 'iso8859-8',
    'iso_8859-8:1988': 'iso8859-8',
    'iso_8859-9': 'iso8859-9',
    'iso_8859-9:1989': 'iso8859-9',
    'koi8-r': 'koi8-r',
    'koi8-u': 'koi8-u',
    'l1': 'iso8859-1',
    'l2': 'iso8859-2',
    'l3': 'iso8859-3',
    'l4': 'iso8859-4',
    'l5': 'iso8859-9',
    'l6': 'iso8859-10',
    'l8': 'iso8859-14',
    'latin-9': 'iso8859-15',
    'latin1': 'iso8859-1',
    'latin2': 'iso8859-2',
    'latin3': 'iso8859-3',
    'latin4': 'iso8859-4',
    'latin5': 'iso8859-9',
    'latin6': 'iso8859-10',
    'latin8': 'iso8859-14',
    'ms936': 'gbk',
    'ms_kanji': 'shift_jis',
    'pt154': 'ptcp154',
    'ptcp154': 'ptcp154',
    'shift_jis': 'shift_jis',
    'us': 'ascii',
    'us-ascii': 'ascii',
    'utf-16': 'utf-16',
    # Byte order matters: each explicit-endianness charset must map to the
    # codec with the same byte order.
    'utf-16be': 'utf-16-be',
    'utf-16le': 'utf-16-le',
    'utf-32': 'utf-32',
    'utf-32be': 'utf-32-be',
    'utf-32le': 'utf-32-le',
    'utf-7': 'utf-7',
    'utf-8': 'utf-8',
    'windows-1250': 'cp1250',
    'windows-1251': 'cp1251',
    'windows-1252': 'cp1252',
    'windows-1253': 'cp1253',
    'windows-1254': 'cp1254',
    'windows-1255': 'cp1255',
    'windows-1256': 'cp1256',
    'windows-1257': 'cp1257',
    'windows-1258': 'cp1258',
    'windows-936': 'gbk',
}
|
||||
|
||||
def get_codec(charset):
    """
    Given the name or alias of a character set, find its Python codec if there is one.

    http://www.iana.org/assignments/character-sets contains valid aliases.
    The documentation for the codecs module has the list of codecs.

    CHARSET_CODECS above maps each supported charset name to a codec name.
    The lookup ignores case and surrounding whitespace.

    Returns whatever codecs.lookup() returns for the matching codec, or None
    when charset is empty/None, is not listed in CHARSET_CODECS, or names a
    codec that this version of Python does not provide.
    """
    # Callers may legitimately have no charset at all (for example a
    # content type without a charset parameter); treat that as "no codec"
    # instead of failing on None.strip().
    if not charset:
        return None

    try:
        codec_name = CHARSET_CODECS[charset.strip().lower()]
    except KeyError:
        # The charset is not supported by Django.
        return None

    try:
        return codecs.lookup(codec_name)
    except LookupError:
        # The encoding is not supported in this version of Python.
        return None
|
||||
|
||||
def max_dict_key(l):
    """
    Returns the key for the maximum value in a dictionary.

    When several keys share the maximum value, the one that comes first in
    the dictionary's iteration order is returned (the sort is stable).
    Raises IndexError if the dictionary is empty.
    """
    # items() rather than iteritems() so this runs on both Python 2 and 3.
    return sorted(l.items(), key=itemgetter(1), reverse=True)[0][0]
|
||||
|
||||
# Captures the charset parameter of a content type such as
# "text/html; charset=utf-8". Raw strings keep the backslash escapes intact
# for the regex engine instead of relying on Python passing unknown string
# escapes through unchanged.
CONTENT_TYPE_RE = re.compile(r'.*; charset=([\w\d-]+);?')

# Matches one entry of an Accept-Charset header: a charset name or "*",
# optionally followed by ";q=<value>" and a separating comma. The named
# groups are "charset" and "q" ("q" is None when no quality value is given).
ACCEPT_CHARSET_RE = re.compile(r'(?P<charset>([\w\d-]+)|(\*))(;q=(?P<q>[01](\.\d{1,3})?))?,?')
|
||||
def determine_charset(content_type, request):
    """
    Searches request headers from clients and mimetype settings (which may be set
    by users) for indicators of which charset and encoding the response should use.

    Attempted partial support for HTTP RFC 2616 section 14.2 and ticket 10190.

    Returns a (charset, codec) tuple: the highest "quality" (priority) charset
    and the result of get_codec() for it. codec is None when the chosen
    charset has no codec; callers are expected to turn that into an error
    response.

    Precedence: charset specified in content_type (Accept-Charset is not
                    consulted at all when a content type is given; if the
                    content type names no charset, settings.DEFAULT_CHARSET)
                settings.DEFAULT_CHARSET,
                supported, "accept"ed charset such that its q > q of settings.DEFAULT_CHARSET
                iso-8859-1 if q > 0 or is unspecified
                406 error
    """
    charset = None

    if content_type:
        match = CONTENT_TYPE_RE.match(content_type)
        if match:
            # May name a charset without a codec; reported to the caller
            # through a None codec rather than printed to stdout.
            charset = match.group(1)
        else:
            # A content type such as "text/plain" carries no charset, so
            # use the configured default instead of returning no charset.
            charset = settings.DEFAULT_CHARSET
    else:
        # Handle Accept-Charset (only when no content type was supplied).
        header = None
        if request:
            # WSGI exposes the header as HTTP_ACCEPT_CHARSET; the bare
            # ACCEPT_CHARSET key is still honoured for existing callers.
            for key in ("HTTP_ACCEPT_CHARSET", "ACCEPT_CHARSET"):
                if key in request.META:
                    header = request.META[key]
                    break

        if header is None:
            charset = settings.DEFAULT_CHARSET
        else:
            # [{"charset": name, "q": value-or-None}, ...]
            accept_charset = [m.groupdict() for m in ACCEPT_CHARSET_RE.finditer(header)]
            # Drop charsets we cannot encode and normalise the q values.
            charsets = _process_accept_charset(accept_charset)

            default_charset = settings.DEFAULT_CHARSET
            fallback_charset = "ISO-8859-1"
            max_q_charset = max_dict_key(charsets)
            max_q_value = charsets[max_q_charset]
            default_q = charsets[default_charset]

            if max_q_value == 0 and fallback_charset not in charsets:
                # Everything usable was refused and the fallback was not
                # mentioned, so the fallback is still acceptable.
                charset = fallback_charset
            elif default_q == 1 or default_q == max_q_value:
                # Prefer the default whenever it is (one of) the best options.
                charset = default_charset
            else:
                # Otherwise take the highest valued acceptable charset.
                charset = max_q_charset

    # We may reach here with no codec. The status code is changed in the
    # HttpResponse.
    codec = None
    if charset:
        codec = get_codec(charset)
    return charset, codec
|
||||
|
||||
# NOTE -- make sure we are not duping the processing of q values
|
||||
def _process_accept_charset(accept_charset):
    '''
    HTTP RFC 2616 section 14.2 dictates that q must be between 0 and 1.
    This method normalizes charset quality values, cleans whitespace from charset
    names, and excludes charsets without Python codecs and entries whose q
    value lies outside [0, 1].

    accept_charset is a list of dicts with the keys "charset" and "q", where
    "q" is a string or None (None means the default quality of 1).

    Returns a dict mapping charset name to its float q value. Entries with
    q == 0 are kept so the caller can tell "refused" from "not mentioned".
    settings.DEFAULT_CHARSET is always present in the result; when the
    client did not mention it, it gets the wildcard's q value, or 1 if there
    was no wildcard. "ISO-8859-1" is added the same way, but only when a
    wildcard was sent.
    '''
    default_charset = settings.DEFAULT_CHARSET
    fallback_charset = "ISO-8859-1"

    # Charset names are case-insensitive, but the result is looked up by the
    # exact spellings above. Map any casing of those two names onto that
    # spelling so that e.g. "UTF-8;q=0" really overrides the implicit entry
    # for a default of "utf-8" instead of creating a second key.
    canonical = {fallback_charset.lower(): fallback_charset}
    canonical[default_charset.lower()] = default_charset

    accepted_charsets = {}
    default_value = 1
    wildcard = False

    for potential in accept_charset:
        charset = potential["charset"].strip()
        charset = canonical.get(charset.lower(), charset)

        # The default quality value is 1
        if potential["q"]:
            q = float(potential["q"])
        else:
            q = 1.

        if q < 0 or q > 1:
            continue

        if charset == "*":
            # The wildcard sets the quality of every charset not listed.
            default_value = q
            wildcard = True
        elif get_codec(charset):
            # Only charsets with a Python codec can actually be served.
            accepted_charsets[charset] = q

    if default_charset not in accepted_charsets:
        accepted_charsets[default_charset] = default_value
    if wildcard and fallback_charset not in accepted_charsets:
        accepted_charsets[fallback_charset] = default_value

    return accepted_charsets
|
0
tests/regressiontests/charsets/__init__.py
Normal file
0
tests/regressiontests/charsets/__init__.py
Normal file
0
tests/regressiontests/charsets/models.py
Normal file
0
tests/regressiontests/charsets/models.py
Normal file
60
tests/regressiontests/charsets/tests.py
Normal file
60
tests/regressiontests/charsets/tests.py
Normal file
@ -0,0 +1,60 @@
|
||||
from django.test import Client, TestCase
|
||||
import re
|
||||
from django.conf import settings
|
||||
|
||||
# Captures the charset parameter of a Content-Type header value. A raw string
# keeps the backslash escapes intact for the regex engine.
CONTENT_TYPE_RE = re.compile(r'.*; charset=([\w\d-]+);?')
|
||||
|
||||
|
||||
def get_charset(response):
    """
    Return the charset named in the response's content-type header, or None
    when the header is missing or carries no charset parameter.
    """
    found = CONTENT_TYPE_RE.match(response.get("content-type",""))
    if not found:
        return None
    return found.group(1)
|
||||
|
||||
class ClientTest(TestCase):
    """
    Exercises charset selection through the test client, using the views
    routed at /accept_charset/, /good_content_type/ and /bad_content_type/.
    """

    def test_good_accept_charset(self):
        "Well-formed Accept-Charset values select the response charset"
        # utf-8 is refused outright (q=0), so the only other listed charset
        # is expected.
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="ascii,utf-8;q=0")
        self.assertEqual(get_charset(response), "ascii")

        # us is an alias for ascii. The wildcard's q (0.9) is higher than
        # the q given for us (0.8), so the default charset is expected.
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="us;q=0.8,*;q=0.9")
        self.assertEqual(get_charset(response), settings.DEFAULT_CHARSET)

        # Here us outranks the wildcard, and the charset is expected back
        # under the alias the client sent.
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="us;q=0.8,*;q=0.7")
        self.assertEqual(get_charset(response), "us")

        # NOTE(review): "q=.9" has no leading digit; confirm this is the
        # intended input and not a typo for "q=0.9".
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="ascii;q=0.89,utf-8;q=.9")
        self.assertEqual(get_charset(response), settings.DEFAULT_CHARSET)

        # Presumably relies on settings.DEFAULT_CHARSET being utf-8, so that
        # refusing it leaves only the ISO-8859-1 fallback -- TODO confirm.
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="utf-8;q=0")
        self.assertEqual(get_charset(response), "ISO-8859-1")

    def test_bad_accept_charset(self):
        "Do not use a malformed Accept-Charset"
        # The expected value is hard-coded; presumably this assumes
        # settings.DEFAULT_CHARSET is utf-8 -- TODO confirm.
        response = self.client.post('/accept_charset/', ACCEPT_CHARSET="this_is_junk")
        self.assertEqual(get_charset(response), "utf-8")

    def test_good_content_type(self):
        "Use the charset given in the view's content-type"
        response = self.client.post('/good_content_type/')
        self.assertEqual(get_charset(response), "us")

    def test_bad_content_type(self):
        "Request a view whose content-type names an unsupported charset"
        # There is no assertion yet: this only checks that the request
        # completes without raising.
        response = self.client.post('/bad_content_type/')
|
9
tests/regressiontests/charsets/urls.py
Normal file
9
tests/regressiontests/charsets/urls.py
Normal file
@ -0,0 +1,9 @@
|
||||
from django.conf.urls.defaults import *
|
||||
|
||||
import views
|
||||
|
||||
# URL routes for the charset regression tests; each one maps to the view of
# the same name in views.py.
urlpatterns = patterns('',
    (r'^accept_charset/', views.accept_charset),
    (r'^good_content_type/', views.good_content_type),
    (r'^bad_content_type/', views.bad_content_type),
)
|
11
tests/regressiontests/charsets/views.py
Normal file
11
tests/regressiontests/charsets/views.py
Normal file
@ -0,0 +1,11 @@
|
||||
from django.http import HttpResponse
|
||||
from django.shortcuts import render_to_response
|
||||
|
||||
def accept_charset(request):
    """Return a response built with the originating request and no explicit content type."""
    return HttpResponse("ASCII.", origin_request=request)
|
||||
|
||||
def good_content_type(request):
    """Return a response whose content type names the charset "us"."""
    return HttpResponse("ASCII.", content_type="text/html; charset=us")
|
||||
|
||||
def bad_content_type(request):
    """Return a response whose content type names a made-up charset."""
    return HttpResponse("ASCII.", content_type="text/html; charset=this_should_be_junk")
|
Loading…
x
Reference in New Issue
Block a user