mirror of
https://github.com/django/django.git
synced 2025-07-04 09:49:12 +00:00
unicode: Made some documentation edits and inconsequential typo fixes throughout code
git-svn-id: http://code.djangoproject.com/svn/django/branches/unicode@5597 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
parent
c62d6eea19
commit
1feda14c8e
@ -14,7 +14,7 @@ class ContentTypeManager(models.Manager):
|
|||||||
try:
|
try:
|
||||||
ct = CONTENT_TYPE_CACHE[key]
|
ct = CONTENT_TYPE_CACHE[key]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
# The unicode() is needed around opts.verbose_name because it might
|
# The smart_unicode() is needed around opts.verbose_name_raw because it might
|
||||||
# be a django.utils.functional.__proxy__ object.
|
# be a django.utils.functional.__proxy__ object.
|
||||||
ct, created = self.model._default_manager.get_or_create(app_label=key[0],
|
ct, created = self.model._default_manager.get_or_create(app_label=key[0],
|
||||||
model=key[1], defaults={'name': smart_unicode(opts.verbose_name_raw)})
|
model=key[1], defaults={'name': smart_unicode(opts.verbose_name_raw)})
|
||||||
|
@ -7,7 +7,7 @@ from django.conf import settings
|
|||||||
|
|
||||||
def add_domain(domain, url):
|
def add_domain(domain, url):
|
||||||
if not url.startswith('http://'):
|
if not url.startswith('http://'):
|
||||||
# 'url' must already be ASCII and URL-quoted, so no need for encodign
|
# 'url' must already be ASCII and URL-quoted, so no need for encoding
|
||||||
# conversions here.
|
# conversions here.
|
||||||
url = u'http://%s%s' % (domain, url)
|
url = u'http://%s%s' % (domain, url)
|
||||||
return url
|
return url
|
||||||
|
@ -50,7 +50,7 @@ class HttpRequest(object):
|
|||||||
def _set_encoding(self, val):
|
def _set_encoding(self, val):
|
||||||
"""
|
"""
|
||||||
Sets the encoding used for GET/POST accesses. If the GET or POST
|
Sets the encoding used for GET/POST accesses. If the GET or POST
|
||||||
dictionary has already been created it is removed and recreated on the
|
dictionary has already been created, it is removed and recreated on the
|
||||||
next access (so that it is decoded correctly).
|
next access (so that it is decoded correctly).
|
||||||
"""
|
"""
|
||||||
self._encoding = val
|
self._encoding = val
|
||||||
@ -101,7 +101,7 @@ class QueryDict(MultiValueDict):
|
|||||||
This is immutable unless you create a copy of it.
|
This is immutable unless you create a copy of it.
|
||||||
|
|
||||||
Values retrieved from this class are converted from the default encoding to
|
Values retrieved from this class are converted from the default encoding to
|
||||||
unicode (this is done on retrieval, rather than input to avoid breaking
|
unicode (this is done on retrieval, rather than input, to avoid breaking
|
||||||
references or mutating referenced objects).
|
references or mutating referenced objects).
|
||||||
"""
|
"""
|
||||||
def __init__(self, query_string, mutable=False, encoding=None):
|
def __init__(self, query_string, mutable=False, encoding=None):
|
||||||
|
@ -116,7 +116,8 @@ make_list = stringfilter(make_list)
|
|||||||
|
|
||||||
def slugify(value):
|
def slugify(value):
|
||||||
"Converts to lowercase, removes non-alpha chars and converts spaces to hyphens"
|
"Converts to lowercase, removes non-alpha chars and converts spaces to hyphens"
|
||||||
# Don't compile patterns as unicode because \w then would mean any letter. Slugify is effectively an asciiization.
|
# Don't compile patterns as unicode because \w then would mean any letter.
|
||||||
|
# Slugify is effectively a conversion to ASCII.
|
||||||
value = re.sub('[^\w\s-]', '', value).strip().lower()
|
value = re.sub('[^\w\s-]', '', value).strip().lower()
|
||||||
return re.sub('[-\s]+', '-', value)
|
return re.sub('[-\s]+', '-', value)
|
||||||
slugify = stringfilter(slugify)
|
slugify = stringfilter(slugify)
|
||||||
|
@ -68,26 +68,24 @@ In Python code
|
|||||||
Standard translation
|
Standard translation
|
||||||
~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
Specify a translation string by using the function ``ugettext()``. Since you
|
Specify a translation string by using the function ``ugettext()``. It's
|
||||||
may well be typing this a lot, it's often worthwhile importing it as a shorter
|
convention to import this as a shorter alias, ``_``, to save typing.
|
||||||
alias and ``_`` is a very common choice.
|
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
Python's standard library ``gettext`` module installs ``_()`` into the
|
Python's standard library ``gettext`` module installs ``_()`` into the
|
||||||
global namespace, as an alias for ``gettext()``. In Django, we have chosen
|
global namespace, as an alias for ``gettext()``. In Django, we have chosen
|
||||||
not to follow this practice, for a couple of reasons:
|
not to follow this practice, for a couple of reasons:
|
||||||
|
|
||||||
1. For international character set (unicode) support, you really wanting
|
1. For international character set (Unicode) support, ``ugettext()`` is
|
||||||
to be using ``ugettext()``, rather than ``gettext()``. Sometimes, you
|
more useful than ``gettext()``. Sometimes, you should be using
|
||||||
should be using ``ugettext_lazy()`` as the default translation method
|
``ugettext_lazy()`` as the default translation method for a particular
|
||||||
for a particular file. By not installing ``_`` directly, the
|
file. Without ``_()`` in the global namespace, the developer has to
|
||||||
developer has to think about which is the most appropriate function
|
think about which is the most appropriate translation function.
|
||||||
to use.
|
|
||||||
|
|
||||||
2. Python's interactive shell uses ``_`` to represent "the previous
|
2. The underscore character (``_``) is used to represent "the previous
|
||||||
result". This is also used in doctest tests and having ``_()`` causes
|
result" in Python's interactive shell and doctest tests. Installing a
|
||||||
interference. Explicitly importing ``ugettext()`` as ``_()`` avoids
|
global ``_()`` function causes interference. Explicitly importing
|
||||||
this problem.
|
``ugettext()`` as ``_()`` avoids this problem.
|
||||||
|
|
||||||
In this example, the text ``"Welcome to my site."`` is marked as a translation
|
In this example, the text ``"Welcome to my site."`` is marked as a translation
|
||||||
string::
|
string::
|
||||||
@ -98,7 +96,7 @@ string::
|
|||||||
output = _("Welcome to my site.")
|
output = _("Welcome to my site.")
|
||||||
return HttpResponse(output)
|
return HttpResponse(output)
|
||||||
|
|
||||||
Obviously you could code this without using the alias. This example is
|
Obviously, you could code this without using the alias. This example is
|
||||||
identical to the previous one::
|
identical to the previous one::
|
||||||
|
|
||||||
from django.utils.translation import ugettext
|
from django.utils.translation import ugettext
|
||||||
@ -300,7 +298,7 @@ Working with lazy translation objects
|
|||||||
=====================================
|
=====================================
|
||||||
|
|
||||||
Using ``ugettext_lazy()`` and ``ungettext_lazy()`` to mark strings in models
|
Using ``ugettext_lazy()`` and ``ungettext_lazy()`` to mark strings in models
|
||||||
and utility functions is a common operation. When you are working with these
|
and utility functions is a common operation. When you're working with these
|
||||||
objects elsewhere in your code, you should ensure that you don't accidentally
|
objects elsewhere in your code, you should ensure that you don't accidentally
|
||||||
convert them to strings, because they should be converted as late as possible
|
convert them to strings, because they should be converted as late as possible
|
||||||
(so that the correct locale is in effect). This necessitates the use of a
|
(so that the correct locale is in effect). This necessitates the use of a
|
||||||
@ -328,20 +326,20 @@ rendering time).
|
|||||||
The allow_lazy() decorator
|
The allow_lazy() decorator
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
There are a lot of useful utility functions in Django (particularly in
|
Django offers many utility functions (particularly in ``django.utils``) that
|
||||||
``django.utils``) that take a string as their first argument and do something
|
take a string as their first argument and do something to that string. These
|
||||||
to that string. These functions are used by template filters as well as
|
functions are used by template filters as well as directly in other code.
|
||||||
directly in other code.
|
|
||||||
|
|
||||||
If you write your own similar functions, you will rapidly come across the
|
If you write your own similar functions and deal with translations, you'll
|
||||||
problem of what to do when the first argument is a lazy translation object.
|
face the problem of what to do when the first argument is a lazy translation
|
||||||
You don't want to convert it to a string immediately, because you may be using
|
object. You don't want to convert it to a string immediately, because you might
|
||||||
this function outside of a view (and hence the current thread's locale setting
|
be using this function outside of a view (and hence the current thread's locale
|
||||||
will not be correct). For cases like this, the
|
setting will not be correct).
|
||||||
``django.utils.functional.allow_lazy()`` decorator will be useful. It modifies
|
|
||||||
the function so that *if* it is called with a lazy translation as the first
|
For cases like this, use the ``django.utils.functional.allow_lazy()``
|
||||||
argument, the function evaluation is delayed until it needs to be converted to
|
decorator. It modifies the function so that *if* it's called with a lazy
|
||||||
a string.
|
translation as the first argument, the function evaluation is delayed until it
|
||||||
|
needs to be converted to a string.
|
||||||
|
|
||||||
For example::
|
For example::
|
||||||
|
|
||||||
@ -353,9 +351,9 @@ For example::
|
|||||||
fancy_utility_function = allow_lazy(fancy_utility_function, unicode)
|
fancy_utility_function = allow_lazy(fancy_utility_function, unicode)
|
||||||
|
|
||||||
The ``allow_lazy()`` decorator takes, in addition to the function to decorate,
|
The ``allow_lazy()`` decorator takes, in addition to the function to decorate,
|
||||||
a number of extra arguments specifying the type(s) that the original function
|
a number of extra arguments (``*args``) specifying the type(s) that the
|
||||||
can return. Usually, it will be enough to just include ``unicode`` here and
|
original function can return. Usually, it's enough to include ``unicode`` here
|
||||||
ensure that your function returns Unicode strings.
|
and ensure that your function returns only Unicode strings.
|
||||||
|
|
||||||
Using this decorator means you can write your function and assume that the
|
Using this decorator means you can write your function and assume that the
|
||||||
input is a proper string, then add support for lazy translation objects at the
|
input is a proper string, then add support for lazy translation objects at the
|
||||||
|
@ -1044,11 +1044,11 @@ iriencode
|
|||||||
~~~~~~~~~
|
~~~~~~~~~
|
||||||
|
|
||||||
Converts an IRI (Internationalized Resource Identifier) to a string that is
|
Converts an IRI (Internationalized Resource Identifier) to a string that is
|
||||||
suitable for including in a URL. This is necessary if you are trying to use
|
suitable for including in a URL. This is necessary if you're trying to use
|
||||||
strings containing non-ASCII characters in a URL.
|
strings containing non-ASCII characters in a URL.
|
||||||
|
|
||||||
You can use this filter after you have used the ``urlencode`` filter on a
|
It's safe to use this filter on a string that has already gone through the
|
||||||
string, without harm.
|
``urlencode`` filter.
|
||||||
|
|
||||||
join
|
join
|
||||||
~~~~
|
~~~~
|
||||||
|
@ -492,20 +492,21 @@ your own sanity when dealing with the interactive prompt, but also because
|
|||||||
objects' representations are used throughout Django's automatically-generated
|
objects' representations are used throughout Django's automatically-generated
|
||||||
admin.
|
admin.
|
||||||
|
|
||||||
.. admonition:: Why ``__unicode__`` and not ``__str__``?
|
.. admonition:: Why ``__unicode__()`` and not ``__str__()``?
|
||||||
|
|
||||||
If you are wondering why we add a ``__unicode__()`` method, rather than a
|
If you're familiar with Python, you might be in the habit of adding
|
||||||
simple ``__str__()`` method, it is because Django models will contain
|
``__str__()`` methods to your classes, not ``__unicode__()`` methods.
|
||||||
unicode strings by default. The values returned from the database, for
|
We use ``__unicode__()`` here because Django models deal with Unicode by
|
||||||
example, are all unicode strings. In most cases, your code should be
|
default. All data stored in your database is converted to Unicode when it's
|
||||||
prepared to handle non-ASCII characters and this is a litle fiddly in
|
returned.
|
||||||
``__str__()`` methods, since you have to worry about which encoding to
|
|
||||||
use, amongst other things. If you create a ``__unicode__()`` method,
|
Django models have a default ``__str__()`` method that calls ``__unicode__()``
|
||||||
Django will provide a ``__str__()`` method that calls your
|
and converts the result to a UTF-8 bytestring. This means that ``unicode(p)``
|
||||||
``__unicode__()`` and then converts the result to UTF-8 strings when
|
will return a Unicode string, and ``str(p)`` will return a normal string,
|
||||||
required. So ``unicode(p)`` will return a unicode string and ``str(p)``
|
with characters encoded as UTF-8.
|
||||||
will return a normal string, with the characters encoded as UTF-8 when
|
|
||||||
necessary..
|
If all of this is gibberish to you, just remember to add ``__unicode__()``
|
||||||
|
methods to your models. With any luck, things should Just Work for you.
|
||||||
|
|
||||||
Note these are normal Python methods. Let's add a custom method, just for
|
Note these are normal Python methods. Let's add a custom method, just for
|
||||||
demonstration::
|
demonstration::
|
||||||
|
270
docs/unicode.txt
270
docs/unicode.txt
@ -8,24 +8,24 @@ Django natively supports Unicode data everywhere. Providing your database can
|
|||||||
somehow store the data, you can safely pass around Unicode strings to
|
somehow store the data, you can safely pass around Unicode strings to
|
||||||
templates, models and the database.
|
templates, models and the database.
|
||||||
|
|
||||||
This files describes some things to be aware of if you are writing applications
|
This document tells you what you need to know if you're writing applications
|
||||||
which do not only use ASCII-encoded data.
|
that use data or templates that are encoded in something other than ASCII.
|
||||||
|
|
||||||
Creating the database
|
Creating the database
|
||||||
=====================
|
=====================
|
||||||
|
|
||||||
Make sure your database is configured to be able to store arbitrary string
|
Make sure your database is configured to be able to store arbitrary string
|
||||||
data. Normally, this means giving it an encoding of UTF-8 or UTF-16. If you use
|
data. Normally, this means giving it an encoding of UTF-8 or UTF-16. If you use
|
||||||
a more restrictive encoding -- for example, latin1 (iso8859-1) -- there will be
|
a more restrictive encoding -- for example, latin1 (iso8859-1) -- you won't be
|
||||||
some characters that you cannot store in the database and information will be
|
able to store certain characters in the database, and information will be lost.
|
||||||
lost.
|
|
||||||
|
|
||||||
* For MySQL users, refer to the `MySQL manual`_ (section 10.3.2 for MySQL 5.1)
|
* MySQL users, refer to the `MySQL manual`_ (section 10.3.2 for MySQL 5.1) for
|
||||||
for details on how to set or alter the database character set encoding.
|
details on how to set or alter the database character set encoding.
|
||||||
|
|
||||||
* For PostgreSQL users, refer to the `PostgreSQL manual`_ (section 21.2.2 in
|
* PostgreSQL users, refer to the `PostgreSQL manual`_ (section 21.2.2 in
|
||||||
PostgreSQL 8) for details on creating databases with the correct encoding.
|
PostgreSQL 8) for details on creating databases with the correct encoding.
|
||||||
|
|
||||||
* For SQLite users, there is nothing you need to do. SQLite always uses UTF-8
|
* SQLite users, there is nothing you need to do. SQLite always uses UTF-8
|
||||||
for internal encoding.
|
for internal encoding.
|
||||||
|
|
||||||
.. _MySQL manual: http://www.mysql.org/doc/refman/5.1/en/charset-database.html
|
.. _MySQL manual: http://www.mysql.org/doc/refman/5.1/en/charset-database.html
|
||||||
@ -37,119 +37,119 @@ convert strings retrieved from the database into Python Unicode strings. You
|
|||||||
don't even need to tell Django what encoding your database uses: that is
|
don't even need to tell Django what encoding your database uses: that is
|
||||||
handled transparently.
|
handled transparently.
|
||||||
|
|
||||||
|
For more, see the section "The database API" below.
|
||||||
|
|
||||||
General string handling
|
General string handling
|
||||||
=======================
|
=======================
|
||||||
|
|
||||||
Whenever you use strings with Django, you have two choices. You can use Unicode
|
Whenever you use strings with Django -- e.g., in database lookups, template
|
||||||
strings or you can use normal strings (sometimes called bytestrings) that are
|
rendering or anywhere else -- you have two choices for encoding those strings.
|
||||||
encoded using UTF-8.
|
You can use Unicode strings, or you can use normal strings (sometimes called
|
||||||
|
"bytestrings") that are encoded using UTF-8.
|
||||||
|
|
||||||
.. warning::
|
.. warning::
|
||||||
A bytestring does not carry any information with it about its encoding. So
|
A bytestring does not carry any information with it about its encoding.
|
||||||
we have to make an assumption and Django assumes that all bytestrings are
|
For that reason, we have to make an assumption, and Django assumes that all
|
||||||
in UTF-8. If you pass a string to Django that has been encoded in some
|
bytestrings are in UTF-8.
|
||||||
other format, things will go wrong in interesting ways. Usually Django will
|
|
||||||
raise a UnicodeDecodeError at some point.
|
|
||||||
|
|
||||||
If your code only uses ASCII data, you are quite safe to simply use your normal
|
If you pass a string to Django that has been encoded in some other format,
|
||||||
strings (since ASCII is a subset of UTF-8) and pass them around at will.
|
things will go wrong in interesting ways. Usually, Django will raise a
|
||||||
|
``UnicodeDecodeError`` at some point.
|
||||||
|
|
||||||
Do not be fooled into thinking that if your ``DEFAULT_CHARSET`` setting is set
|
If your code only uses ASCII data, it's safe to use your normal strings,
|
||||||
to something other than ``utf-8`` you can use that encoding in your
|
passing them around at will, because ASCII is a subset of UTF-8.
|
||||||
bytestrings! The ``DEFAULT_CHARSET`` only applies to the strings generated as
|
|
||||||
the result of template rendering (and email). Django will always assume UTF-8
|
Don't be fooled into thinking that if your ``DEFAULT_CHARSET`` setting is set
|
||||||
|
to something other than ``'utf-8'`` you can use that other encoding in your
|
||||||
|
bytestrings! ``DEFAULT_CHARSET`` only applies to the strings generated as
|
||||||
|
the result of template rendering (and e-mail). Django will always assume UTF-8
|
||||||
encoding for internal bytestrings. The reason for this is that the
|
encoding for internal bytestrings. The reason for this is that the
|
||||||
``DEFAULT_CHARSET`` setting is not actually under your control (if you are the
|
``DEFAULT_CHARSET`` setting is not actually under your control (if you are the
|
||||||
application developer). It is under the control of the person installing and
|
application developer). It's under the control of the person installing and
|
||||||
using your application and if they choose a different setting, your code must
|
using your application -- and if that person chooses a different setting, your
|
||||||
still continue to work. Ergo, it cannot rely on that setting.
|
code must still continue to work. Ergo, it cannot rely on that setting.
|
||||||
|
|
||||||
In most cases when Django is dealing with strings, it will convert them to
|
In most cases when Django is dealing with strings, it will convert them to
|
||||||
Unicode strings before doing anything else. So if you pass in a bytestring, be
|
Unicode strings before doing anything else. So, as a general rule, if you pass
|
||||||
prepared to receive a Unicode string back in the result.
|
in a bytestring, be prepared to receive a Unicode string back in the result.
|
||||||
|
|
||||||
.. _lazy translation:
|
|
||||||
|
|
||||||
Translated strings
|
Translated strings
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
There is actually a third type of string-like object you may encounter when
|
Aside from Unicode strings and bytestrings, there's a third type of string-like
|
||||||
using Django. If you are using the internationalization features of Django,
|
object you may encounter when using Django. The framework's
|
||||||
there is the concept of a "lazy translation". This is a string that has been
|
internationalization features introduce the concept of a "lazy translation" --
|
||||||
marked as translated, but the actual result is not determined until the object
|
a string that has been marked as translated but whose actual translation result
|
||||||
is used in a string. This is useful because the locale that should be used for
|
isn't determined until the object is used in a string. This feature is useful
|
||||||
the translation will not be known until the string is used, even though the
|
in cases where the translation locale is unknown until the string is used, even
|
||||||
string might have originally been created when the code was first imported.
|
though the string might have originally been created when the code was first
|
||||||
|
imported.
|
||||||
|
|
||||||
Normally, you won't have to worry about lazy translations. Just be aware that
|
Normally, you won't have to worry about lazy translations. Just be aware that
|
||||||
if you examine an object and it claims to be a
|
if you examine an object and it claims to be a
|
||||||
``django.utils.functional.__proxy__`` object, it is a lazy translation.
|
``django.utils.functional.__proxy__`` object, it is a lazy translation.
|
||||||
Calling ``unicode()`` with the translation as the argument will generate a
|
Calling ``unicode()`` with the lazy translation as the argument will generate a
|
||||||
string in the current locale.
|
Unicode string in the current locale.
|
||||||
|
|
||||||
For more details about lazy translation objects, refer to the
|
For more details about lazy translation objects, refer to the
|
||||||
internationalization_ documentation.
|
internationalization_ documentation.
|
||||||
|
|
||||||
.. _internationalization: ../i18n/#lazy-translation
|
.. _internationalization: ../i18n/#lazy-translation
|
||||||
|
|
||||||
.. _utility functions:
|
|
||||||
|
|
||||||
Useful utility functions
|
Useful utility functions
|
||||||
------------------------
|
------------------------
|
||||||
|
|
||||||
Since some string operations come up again and again, Django ships with a few
|
Because some string operations come up again and again, Django ships with a few
|
||||||
useful functions that should make working with unicode and bytestring objects
|
useful functions that should make working with Unicode and bytestring objects
|
||||||
a bit easier.
|
a bit easier.
|
||||||
|
|
||||||
Conversion functions
|
Conversion functions
|
||||||
~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
The ``django.utils.encoding`` module contains a few functions that are handy
|
The ``django.utils.encoding`` module contains a few functions that are handy
|
||||||
for converting back and forth between unicode and bytestrings.
|
for converting back and forth between Unicode and bytestrings.
|
||||||
|
|
||||||
* ``smart_unicode(s, encoding='utf-8', errors='strict')`` converts its
|
* ``smart_unicode(s, encoding='utf-8', errors='strict')`` converts its
|
||||||
input to unicode string. The ``encoding`` parameter specifies the input
|
input to a Unicode string. The ``encoding`` parameter specifies the input
|
||||||
encoding of any bytestring -- Django uses this internally when
|
encoding. (For example, Django uses this internally when processing form
|
||||||
processing form input data, for example, which might not be UTF-8
|
input data, which might not be UTF-8 encoded.) The ``errors`` parameter
|
||||||
encoded. The ``errors`` parameter takes any of the values that are
|
takes any of the values that are accepted by Python's ``unicode()``
|
||||||
accepted by Python's ``unicode()`` function for its error handling.
|
function for its error handling.
|
||||||
|
|
||||||
If you pass ``smart_unicode()`` an object that has a ``__unicode__``
|
If you pass ``smart_unicode()`` an object that has a ``__unicode__``
|
||||||
method, it will use that method to do the conversion.
|
method, it will use that method to do the conversion.
|
||||||
|
|
||||||
* ``force_unicode(s, encoding='utf-8', errors='strict')`` is identical to
|
* ``force_unicode(s, encoding='utf-8', errors='strict')`` is identical to
|
||||||
``smart_unicode()`` in almost all cases. The difference is when the
|
``smart_unicode()`` in almost all cases. The difference is when the
|
||||||
first argument is a `lazy translation`_ instance. Whilst
|
first argument is a `lazy translation`_ instance. While
|
||||||
``smart_unicode()`` preserves lazy translations, ``force_unicode()``
|
``smart_unicode()`` preserves lazy translations, ``force_unicode()``
|
||||||
forces those objects to a unicode string (causing the translation to
|
forces those objects to a Unicode string (causing the translation to
|
||||||
occur). Normally, you will want to use ``smart_unicode()``. However,
|
occur). Normally, you'll want to use ``smart_unicode()``. However,
|
||||||
``force_unicode()`` is useful in filters and template tags when you
|
``force_unicode()`` is useful in template tags and filters that
|
||||||
absolutely must have a string to work with, not just something that can
|
absolutely *must* have a string to work with, not just something that can
|
||||||
be converted to a string.
|
be converted to a string.
|
||||||
|
|
||||||
* ``smart_str(s, encoding='utf-8', strings_only=False, errors='strict')``
|
* ``smart_str(s, encoding='utf-8', strings_only=False, errors='strict')``
|
||||||
is essentially the opposite of ``smart_unicode()``. It forces the first
|
is essentially the opposite of ``smart_unicode()``. It forces the first
|
||||||
argument to a string. The ``strings_only`` parameter, if set to True,
|
argument to a bytestring. The ``strings_only`` parameter, if set to True,
|
||||||
will result in Python integers, booleans and ``None`` not being
|
will result in Python integers, booleans and ``None`` not being
|
||||||
converted to a string (they keep their original types). This is slightly
|
converted to a string (they keep their original types). This is slightly
|
||||||
different semantics from Python's builtin ``str()`` function, but the
|
different semantics from Python's builtin ``str()`` function, but the
|
||||||
difference is needed in a few places internally.
|
difference is needed in a few places within Django's internals.
|
||||||
|
|
||||||
Normally, you will only need to use ``smart_unicode()``. Call it as early as
|
Normally, you'll only need to use ``smart_unicode()``. Call it as early as
|
||||||
possible on any input data that might be either a unicode or bytestring and
|
possible on any input data that might be either Unicode or a bytestring, and
|
||||||
from then on you can treat the result as always being unicode.
|
from then on, you can treat the result as always being Unicode.
|
||||||
|
|
||||||
.. _uri_and_iri:
|
|
||||||
|
|
||||||
URI and IRI handling
|
URI and IRI handling
|
||||||
~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
Web frameworks have to deal with URLs (which are a type of URI_). One
|
Web frameworks have to deal with URLs (which are a type of URI_). One
|
||||||
requirement of URLs is that they are encoded using only ASCII characters.
|
requirement of URLs is that they are encoded using only ASCII characters.
|
||||||
However, in an international environment, you will often need to construct a
|
However, in an international environment, you might need to construct a
|
||||||
URL from an IRI_ (very loosely speaking, a URI that can contain unicode
|
URL from an IRI_ -- very loosely speaking, a URI that can contain Unicode
|
||||||
characters). Getting the quoting and conversion from IRI to URI correct can be
|
characters. Quoting and converting an IRI to a URI can be a little tricky, so
|
||||||
a little tricky, so Django provides some assistance.
|
Django provides some assistance.
|
||||||
|
|
||||||
* The function ``django.utils.encoding.iri_to_uri()`` implements the
|
* The function ``django.utils.encoding.iri_to_uri()`` implements the
|
||||||
conversion from IRI to URI as required by the specification (`RFC
|
conversion from IRI to URI as required by the specification (`RFC
|
||||||
@ -158,9 +158,9 @@ a little tricky, so Django provides some assistance.
|
|||||||
* The functions ``django.utils.http.urlquote()`` and
|
* The functions ``django.utils.http.urlquote()`` and
|
||||||
``django.utils.http.urlquote_plus()`` are versions of Python's standard
|
``django.utils.http.urlquote_plus()`` are versions of Python's standard
|
||||||
``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII
|
``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII
|
||||||
characters (the data is converted to UTF-8 prior to encoding).
|
characters. (The data is converted to UTF-8 prior to encoding.)
|
||||||
|
|
||||||
These two groups of functions have slightly different purposes and it is
|
These two groups of functions have slightly different purposes, and it's
|
||||||
important to keep them straight. Normally, you would use ``urlquote()`` on the
|
important to keep them straight. Normally, you would use ``urlquote()`` on the
|
||||||
individual portions of the IRI or URI path so that any reserved characters
|
individual portions of the IRI or URI path so that any reserved characters
|
||||||
such as '&' or '%' are correctly encoded. Then, you apply ``iri_to_uri()`` to
|
such as '&' or '%' are correctly encoded. Then, you apply ``iri_to_uri()`` to
|
||||||
@ -168,10 +168,9 @@ the full IRI and it converts any non-ASCII characters to the correct encoded
|
|||||||
values.
|
values.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
It isn't completely correct to say that ``iri_to_uri()`` implements the
|
Technically, it isn't correct to say that ``iri_to_uri()`` implements the
|
||||||
full algorithm in the IRI specification. It does not perform the
|
full algorithm in the IRI specification. It doesn't (yet) perform the
|
||||||
international domain name encoding portion of the algorithm (at the
|
international domain name encoding portion of the algorithm.
|
||||||
moment).
|
|
||||||
|
|
||||||
The ``iri_to_uri()`` function will not change ASCII characters that are
|
The ``iri_to_uri()`` function will not change ASCII characters that are
|
||||||
otherwise permitted in a URL. So, for example, the character '%' is not
|
otherwise permitted in a URL. So, for example, the character '%' is not
|
||||||
@ -208,45 +207,46 @@ double-quoting problems.
|
|||||||
Models
|
Models
|
||||||
======
|
======
|
||||||
|
|
||||||
Because all strings are returned from the database as unicode strings, model
|
Because all strings are returned from the database as Unicode strings, model
|
||||||
fields that are character based (CharField, TextField, URLField, etc.) will
|
fields that are character based (CharField, TextField, URLField, etc.) will
|
||||||
contain unicode values when Django retrieves the model from the database. This
|
contain Unicode values when Django retrieves data from the database. This
|
||||||
is always the case, even if the data could fit into an ASCII string.
|
is *always* the case, even if the data could fit into an ASCII bytestring.
|
||||||
|
|
||||||
As always, you can pass in bytestrings when creating a model or populating a
|
You can pass in bytestrings when creating a model or populating a field, and
|
||||||
field and Django will convert it to unicode when it needs to.
|
Django will convert them to Unicode when it needs to.
|
||||||
|
|
||||||
Choosing between ``__str__()`` and ``__unicode__()``
|
Choosing between ``__str__()`` and ``__unicode__()``
|
||||||
-----------------------------------------------------
|
----------------------------------------------------
|
||||||
|
|
||||||
One consequence of using unicode by default is that you have to take some care
|
One consequence of using Unicode by default is that you have to take some care
|
||||||
when printing data from the model. In particular, rather than writing a
|
when printing data from the model.
|
||||||
``__str__()`` method, it is recommended to write a ``__unicode__()`` method for
|
|
||||||
your model. In the ``__unicode__()`` method, you can quite safely return the
|
|
||||||
values of all your fields without having to worry about whether they fit into a
|
|
||||||
bytestring or not (the result of ``__str__()`` is *always* a bytestring, even
|
|
||||||
if you accidentally try to return a unicode object).
|
|
||||||
|
|
||||||
You can still create a ``__str__()`` method on your models if you wish, of
|
In particular, rather than giving your model a ``__str__()`` method, we
|
||||||
course. However, Django's ``Model`` base class automatically provides you with
|
recommend you implement a ``__unicode__()`` method. In the ``__unicode__()``
|
||||||
a ``__str__()`` method that calls your ``__unicode__()`` method and then
|
method, you can quite safely return the values of all your fields without
|
||||||
encodes the result correctly into UTF-8. So you would normally only create a
|
having to worry about whether they fit into a bytestring or not. (The way
|
||||||
``__unicode__()`` method and let Django handle the coercion to a bytestring
|
Python works, the result of ``__str__()`` is *always* a bytestring, even if you
|
||||||
when required.
|
accidentally try to return a Unicode object.)
|
||||||
|
|
||||||
|
You can still create a ``__str__()`` method on your models if you want, of
|
||||||
|
course, but you shouldn't need to do this unless you have a good reason.
|
||||||
|
Django's ``Model`` base class automatically provides a ``__str__()``
|
||||||
|
implementation that calls ``__unicode__()`` and encodes the result into UTF-8.
|
||||||
|
This means you'll normally only need to implement a ``__unicode__()`` method
|
||||||
|
and let Django handle the coercion to a bytestring when required.
|
||||||
|
|
||||||
Taking care in ``get_absolute_url()``
|
Taking care in ``get_absolute_url()``
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
|
|
||||||
URLs can only contain ASCII characters. If you are constructing a URL from
|
URLs can only contain ASCII characters. If you're constructing a URL from
|
||||||
pieces of data that might be non-ASCII, you must be careful to encode the
|
pieces of data that might be non-ASCII, be careful to encode the results in a
|
||||||
results in a way that is suitable for a URL. If you are using the
|
way that is suitable for a URL. The ``django.db.models.permalink()`` decorator
|
||||||
``django.db.models.permalink()`` decorator, this is handled automatically by
|
handles this for you automatically.
|
||||||
the decorator.
|
|
||||||
|
|
||||||
If you are constructing the URL manually, you need to take care of the
|
If you're constructing a URL manually (i.e., *not* using the ``permalink()``
|
||||||
encoding yourself. Normally, this would involve a combination of the
|
decorator), you'll need to take care of the encoding yourself. In this case,
|
||||||
``iri_to_uri()`` and ``urlquote()`` functions that were documented above_. For
|
use the ``iri_to_uri()`` and ``urlquote()`` functions that were documented
|
||||||
example::
|
above_. For example::
|
||||||
|
|
||||||
from django.utils.encoding import iri_to_uri
|
from django.utils.encoding import iri_to_uri
|
||||||
from django.utils.http import urlquote
|
from django.utils.http import urlquote
|
||||||
@ -265,28 +265,31 @@ non-ASCII characters would have been removed in quoting in the first line.)
|
|||||||
The database API
|
The database API
|
||||||
================
|
================
|
||||||
|
|
||||||
You can happily pass unicode strings or bytestrings as arguments to
|
You can pass either Unicode strings or UTF-8 bytestrings as arguments to
|
||||||
``filter()`` methods and the like in the database API. The following two
|
``filter()`` methods and the like in the database API. The following two
|
||||||
querysets are identical::
|
querysets are identical::
|
||||||
|
|
||||||
qs = People.objects.filter(name__contains=u'Å')
|
qs = People.objects.filter(name__contains=u'Å')
|
||||||
qs = People.objects.filter(name__contains='\xc3\x85') # UTF-8 encoding of Å
|
qs = People.objects.filter(name__contains='\xc3\x85') # UTF-8 encoding of Å
|
||||||
|
|
||||||
|
|
||||||
Templates
|
Templates
|
||||||
=========
|
=========
|
||||||
|
|
||||||
As usual, templates can be created from unicode or bytestrings. However, they
|
You can use either Unicode strings or bytestrings when creating templates manually::
|
||||||
can also be created by reading a file from disk and this creates a slight
|
|
||||||
complication: not all filesystems store their data encoded as UTF-8. If your
|
|
||||||
template files are not stored with a UTF-8 encoding, set the ``FILE_CHARSET``
|
|
||||||
setting to the encoding of the on-disk files. When Django reads in a template
|
|
||||||
file it will convert the data from this encoding to unicode.
|
|
||||||
|
|
||||||
When a template is rendered for sending out as an HTML document or an e-mail,
|
from django.template import Template
|
||||||
it may be convenient to use an encoding other than UTF-8. You should set the
|
t1 = Template('This is a bytestring template.')
|
||||||
``DEFAULT_CHARSET`` parameter to control the rendered template encoding (the
|
t2 = Template(u'This is a Unicode template.')
|
||||||
default setting is utf-8).
|
|
||||||
|
But the common case is to read templates from the filesystem, and this creates
|
||||||
|
a slight complication: not all filesystems store their data encoded as UTF-8.
|
||||||
|
If your template files are not stored with a UTF-8 encoding, set the ``FILE_CHARSET``
|
||||||
|
setting to the encoding of the files on disk. When Django reads in a template
|
||||||
|
file, it will convert the data from this encoding to Unicode. (``FILE_CHARSET``
|
||||||
|
is set to ``'utf-8'`` by default.)
|
||||||
|
|
||||||
|
The ``DEFAULT_CHARSET`` setting controls the encoding of rendered templates.
|
||||||
|
This is set to UTF-8 by default.
|
||||||
|
|
||||||
Template tags and filters
|
Template tags and filters
|
||||||
-------------------------
|
-------------------------
|
||||||
@ -299,18 +302,20 @@ A couple of tips to remember when writing your own template tags and filters:
|
|||||||
* Use ``force_unicode()`` in preference to ``smart_unicode()`` in these
|
* Use ``force_unicode()`` in preference to ``smart_unicode()`` in these
|
||||||
places. Tag rendering and filter calls occur as the template is being
|
places. Tag rendering and filter calls occur as the template is being
|
||||||
rendered, so there is no advantage to postponing the conversion of lazy
|
rendered, so there is no advantage to postponing the conversion of lazy
|
||||||
transation objects into strings any longer. It is easier to work solely
|
translation objects into strings. It's easier to work solely with Unicode
|
||||||
with Unicode strings at this point.
|
strings at that point.
|
||||||
|
|
||||||
E-mail
|
E-mail
|
||||||
======
|
======
|
||||||
|
|
||||||
Django's email framework (in ``django.core.mail``) supports unicode
|
Django's e-mail framework (in ``django.core.mail``) supports Unicode
|
||||||
transparently. You can use unicode data in the message bodies and any headers.
|
transparently. You can use Unicode data in the message bodies and any headers.
|
||||||
However, you must still respect the requirements of the email specifications,
|
However, you're still obligated to respect the requirements of the e-mail
|
||||||
so, for example, email addresses should use ASCII characters. The following
|
specifications, so, for example, e-mail addresses should use only ASCII
|
||||||
code is certainly possible (demonstrating the everything except e-mail
|
characters.
|
||||||
addresses can be non-ASCII)::
|
|
||||||
|
The following code example demonstrates that everything except e-mail addresses
|
||||||
|
can be non-ASCII::
|
||||||
|
|
||||||
from django.core.mail import EmailMessage
|
from django.core.mail import EmailMessage
|
||||||
|
|
||||||
@ -320,19 +325,20 @@ addresses can be non-ASCII)::
|
|||||||
body = u'...'
|
body = u'...'
|
||||||
EmailMessage(subject, body, sender, recipients).send()
|
EmailMessage(subject, body, sender, recipients).send()
|
||||||
|
|
||||||
|
|
||||||
Form submission
|
Form submission
|
||||||
===============
|
===============
|
||||||
|
|
||||||
HTML form submission is a tricky area. There is no guarantee that the
|
HTML form submission is a tricky area. There's no guarantee that the
|
||||||
submission will include encoding information.
|
submission will include encoding information, which means the framework might
|
||||||
|
have to guess at the encoding of submitted data.
|
||||||
|
|
||||||
Django adopts a "lazy" approach to decoding form data. The data in an
|
Django adopts a "lazy" approach to decoding form data. The data in an
|
||||||
``HttpRequest`` object is only decoded when you access it. In fact, most of
|
``HttpRequest`` object is only decoded when you access it. In fact, most of
|
||||||
the data is not decoded at all. Only the ``HttpRequest.GET`` and
|
the data is not decoded at all. Only the ``HttpRequest.GET`` and
|
||||||
``HttpRequest.POST`` data structures have any decoding applied to them. Those
|
``HttpRequest.POST`` data structures have any decoding applied to them. Those
|
||||||
two fields will return their members as unicode data. All other members will
|
two fields will return their members as Unicode data. All other attributes and
|
||||||
be returned exactly as they were submitted by the client.
|
methods of ``HttpRequest`` return data exactly as it was submitted by the
|
||||||
|
client.
|
||||||
|
|
||||||
By default, the ``DEFAULT_CHARSET`` setting is used as the assumed encoding
|
By default, the ``DEFAULT_CHARSET`` setting is used as the assumed encoding
|
||||||
for form data. If you need to change this for a particular form, you can set
|
for form data. If you need to change this for a particular form, you can set
|
||||||
@ -346,14 +352,12 @@ does this for you. For example::
|
|||||||
...
|
...
|
||||||
|
|
||||||
You can even change the encoding after having accessed ``request.GET`` or
|
You can even change the encoding after having accessed ``request.GET`` or
|
||||||
``request.POST`` and all subsequent accesses will use the new encoding.
|
``request.POST``, and all subsequent accesses will use the new encoding.
|
||||||
|
|
||||||
It will typically be very rare that you would need to worry about changing the
|
Most developers won't need to worry about changing form encoding, but this is
|
||||||
form encoding. However, if you are talking to a legacy system or a system
|
a useful feature for applications that talk to legacy systems whose encoding
|
||||||
beyond your control with particular ideas about encoding, you do have a way to
|
you cannot control.
|
||||||
control the decoding of the data.
|
|
||||||
|
|
||||||
For request features such as file uploads, no automatic decoding takes place,
|
|
||||||
because those attributes are normally treated as collections of bytes, rather
|
|
||||||
than strings. Any decoding would alter the meaning of the stream of bytes.
|
|
||||||
|
|
||||||
|
Django does not decode the data of file uploads, because that data is normally
|
||||||
|
treated as collections of bytes, rather than strings. Any automatic decoding
|
||||||
|
there would alter the meaning of the stream of bytes.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user