1
0
mirror of https://github.com/django/django.git synced 2025-07-04 01:39:20 +00:00

unicode: Made some documentation edits and inconsequential typo fixes throughout code

git-svn-id: http://code.djangoproject.com/svn/django/branches/unicode@5597 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
Adrian Holovaty 2007-07-03 18:29:56 +00:00
parent c62d6eea19
commit 1feda14c8e
8 changed files with 189 additions and 185 deletions

View File

@ -14,7 +14,7 @@ class ContentTypeManager(models.Manager):
try: try:
ct = CONTENT_TYPE_CACHE[key] ct = CONTENT_TYPE_CACHE[key]
except KeyError: except KeyError:
# The unicode() is needed around opts.verbose_name because it might # The smart_unicode() is needed around opts.verbose_name_raw because it might
# be a django.utils.functional.__proxy__ object. # be a django.utils.functional.__proxy__ object.
ct, created = self.model._default_manager.get_or_create(app_label=key[0], ct, created = self.model._default_manager.get_or_create(app_label=key[0],
model=key[1], defaults={'name': smart_unicode(opts.verbose_name_raw)}) model=key[1], defaults={'name': smart_unicode(opts.verbose_name_raw)})

View File

@ -7,7 +7,7 @@ from django.conf import settings
def add_domain(domain, url): def add_domain(domain, url):
if not url.startswith('http://'): if not url.startswith('http://'):
# 'url' must already be ASCII and URL-quoted, so no need for encodign # 'url' must already be ASCII and URL-quoted, so no need for encoding
# conversions here. # conversions here.
url = u'http://%s%s' % (domain, url) url = u'http://%s%s' % (domain, url)
return url return url

View File

@ -50,7 +50,7 @@ class HttpRequest(object):
def _set_encoding(self, val): def _set_encoding(self, val):
""" """
Sets the encoding used for GET/POST accesses. If the GET or POST Sets the encoding used for GET/POST accesses. If the GET or POST
dictionary has already been created it is removed and recreated on the dictionary has already been created, it is removed and recreated on the
next access (so that it is decoded correctly). next access (so that it is decoded correctly).
""" """
self._encoding = val self._encoding = val
@ -101,7 +101,7 @@ class QueryDict(MultiValueDict):
This is immutable unless you create a copy of it. This is immutable unless you create a copy of it.
Values retrieved from this class are converted from the default encoding to Values retrieved from this class are converted from the default encoding to
unicode (this is done on retrieval, rather than input to avoid breaking unicode (this is done on retrieval, rather than input, to avoid breaking
references or mutating referenced objects). references or mutating referenced objects).
""" """
def __init__(self, query_string, mutable=False, encoding=None): def __init__(self, query_string, mutable=False, encoding=None):

View File

@ -116,7 +116,8 @@ make_list = stringfilter(make_list)
def slugify(value): def slugify(value):
"Converts to lowercase, removes non-alpha chars and converts spaces to hyphens" "Converts to lowercase, removes non-alpha chars and converts spaces to hyphens"
# Don't compile patterns as unicode because \w then would mean any letter. Slugify is effectively an asciiization. # Don't compile patterns as unicode because \w then would mean any letter.
# Slugify is effectively a conversion to ASCII.
value = re.sub('[^\w\s-]', '', value).strip().lower() value = re.sub('[^\w\s-]', '', value).strip().lower()
return re.sub('[-\s]+', '-', value) return re.sub('[-\s]+', '-', value)
slugify = stringfilter(slugify) slugify = stringfilter(slugify)

View File

@ -68,26 +68,24 @@ In Python code
Standard translation Standard translation
~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~
Specify a translation string by using the function ``ugettext()``. Since you Specify a translation string by using the function ``ugettext()``. It's
may well be typing this a lot, it's often worthwhile importing it as a shorter convention to import this as a shorter alias, ``_``, to save typing.
alias and ``_`` is a very common choice.
.. note:: .. note::
Python's standard library ``gettext`` module installs ``_()`` into the Python's standard library ``gettext`` module installs ``_()`` into the
global namespace, as an alias for ``gettext()``. In Django, we have chosen global namespace, as an alias for ``gettext()``. In Django, we have chosen
not to follow this practice, for a couple of reasons: not to follow this practice, for a couple of reasons:
1. For international character set (unicode) support, you really wanting 1. For international character set (Unicode) support, ``ugettext()`` is
to be using ``ugettext()``, rather than ``gettext()``. Sometimes, you more useful than ``gettext()``. Sometimes, you should be using
should be using ``ugettext_lazy()`` as the default translation method ``ugettext_lazy()`` as the default translation method for a particular
for a particular file. By not installing ``_`` directly, the file. Without ``_()`` in the global namespace, the developer has to
developer has to think about which is the most appropriate function think about which is the most appropriate translation function.
to use.
2. Python's interactive shell uses ``_`` to represent "the previous 2. The underscore character (``_``) is used to represent "the previous
result". This is also used in doctest tests and having ``_()`` causes result" in Python's interactive shell and doctest tests. Installing a
interference. Explicitly importing ``ugettext()`` as ``_()`` avoids global ``_()`` function causes interference. Explicitly importing
this problem. ``ugettext()`` as ``_()`` avoids this problem.
In this example, the text ``"Welcome to my site."`` is marked as a translation In this example, the text ``"Welcome to my site."`` is marked as a translation
string:: string::
@ -98,7 +96,7 @@ string::
output = _("Welcome to my site.") output = _("Welcome to my site.")
return HttpResponse(output) return HttpResponse(output)
Obviously you could code this without using the alias. This example is Obviously, you could code this without using the alias. This example is
identical to the previous one:: identical to the previous one::
from django.utils.translation import ugettext from django.utils.translation import ugettext
@ -300,7 +298,7 @@ Working with lazy translation objects
===================================== =====================================
Using ``ugettext_lazy()`` and ``ungettext_lazy()`` to mark strings in models Using ``ugettext_lazy()`` and ``ungettext_lazy()`` to mark strings in models
and utility functions is a common operation. When you are working with these and utility functions is a common operation. When you're working with these
objects elsewhere in your code, you should ensure that you don't accidentally objects elsewhere in your code, you should ensure that you don't accidentally
convert them to strings, because they should be converted as late as possible convert them to strings, because they should be converted as late as possible
(so that the correct locale is in effect). This necessitates the use of a (so that the correct locale is in effect). This necessitates the use of a
@ -328,20 +326,20 @@ rendering time).
The allow_lazy() decorator The allow_lazy() decorator
-------------------------- --------------------------
There are a lot of useful utility functions in Django (particularly in Django offers many utility functions (particularly in ``django.utils``) that
``django.utils``) that take a string as their first argument and do something take a string as their first argument and do something to that string. These
to that string. These functions are used by template filters as well as functions are used by template filters as well as directly in other code.
directly in other code.
If you write your own similar functions, you will rapidly come across the If you write your own similar functions and deal with translations, you'll
problem of what to do when the first argument is a lazy translation object. face the problem of what to do when the first argument is a lazy translation
You don't want to convert it to a string immediately, because you may be using object. You don't want to convert it to a string immediately, because you might
this function outside of a view (and hence the current thread's locale setting be using this function outside of a view (and hence the current thread's locale
will not be correct). For cases like this, the setting will not be correct).
``django.utils.functional.allow_lazy()`` decorator will be useful. It modifies
the function so that *if* it is called with a lazy translation as the first For cases like this, use the ``django.utils.functional.allow_lazy()``
argument, the function evaluation is delayed until it needs to be converted to decorator. It modifies the function so that *if* it's called with a lazy
a string. translation as the first argument, the function evaluation is delayed until it
needs to be converted to a string.
For example:: For example::
@ -353,9 +351,9 @@ For example::
fancy_utility_function = allow_lazy(fancy_utility_function, unicode) fancy_utility_function = allow_lazy(fancy_utility_function, unicode)
The ``allow_lazy()`` decorator takes, in addition to the function to decorate, The ``allow_lazy()`` decorator takes, in addition to the function to decorate,
a number of extra arguments specifying the type(s) that the original function a number of extra arguments (``*args``) specifying the type(s) that the
can return. Usually, it will be enough to just include ``unicode`` here and original function can return. Usually, it's enough to include ``unicode`` here
ensure that your function returns Unicode strings. and ensure that your function returns only Unicode strings.
Using this decorator means you can write your function and assume that the Using this decorator means you can write your function and assume that the
input is a proper string, then add support for lazy translation objects at the input is a proper string, then add support for lazy translation objects at the

View File

@ -1044,11 +1044,11 @@ iriencode
~~~~~~~~~ ~~~~~~~~~
Converts an IRI (Internationalized Resource Identifier) to a string that is Converts an IRI (Internationalized Resource Identifier) to a string that is
suitable for including in a URL. This is necessary if you are trying to use suitable for including in a URL. This is necessary if you're trying to use
strings containing non-ASCII characters in a URL. strings containing non-ASCII characters in a URL.
You can use this filter after you have used the ``urlencode`` filter on a It's safe to use this filter on a string that has already gone through the
string, without harm. ``urlencode`` filter.
join join
~~~~ ~~~~

View File

@ -492,20 +492,21 @@ your own sanity when dealing with the interactive prompt, but also because
objects' representations are used throughout Django's automatically-generated objects' representations are used throughout Django's automatically-generated
admin. admin.
.. admonition:: Why ``__unicode__`` and not ``__str__``? .. admonition:: Why ``__unicode__()`` and not ``__str__()``?
If you are wondering why we add a ``__unicode__()`` method, rather than a If you're familiar with Python, you might be in the habit of adding
simple ``__str__()`` method, it is because Django models will contain ``__str__()`` methods to your classes, not ``__unicode__()`` methods.
unicode strings by default. The values returned from the database, for We use ``__unicode__()`` here because Django models deal with Unicode by
example, are all unicode strings. In most cases, your code should be default. All data stored in your database is converted to Unicode when it's
prepared to handle non-ASCII characters and this is a litle fiddly in returned.
``__str__()`` methods, since you have to worry about which encoding to
use, amongst other things. If you create a ``__unicode__()`` method, Django models have a default ``__str__()`` method that calls ``__unicode__()``
Django will provide a ``__str__()`` method that calls your and converts the result to a UTF-8 bytestring. This means that ``unicode(p)``
``__unicode__()`` and then converts the result to UTF-8 strings when will return a Unicode string, and ``str(p)`` will return a normal string,
required. So ``unicode(p)`` will return a unicode string and ``str(p)`` with characters encoded as UTF-8.
will return a normal string, with the characters encoded as UTF-8 when
necessary.. If all of this is gibberish to you, just remember to add ``__unicode__()``
methods to your models. With any luck, things should Just Work for you.
Note these are normal Python methods. Let's add a custom method, just for Note these are normal Python methods. Let's add a custom method, just for
demonstration:: demonstration::

View File

@ -8,24 +8,24 @@ Django natively supports Unicode data everywhere. Providing your database can
somehow store the data, you can safely pass around Unicode strings to somehow store the data, you can safely pass around Unicode strings to
templates, models and the database. templates, models and the database.
This files describes some things to be aware of if you are writing applications This document tells you what you need to know if you're writing applications
which do not only use ASCII-encoded data. that use data or templates that are encoded in something other than ASCII.
Creating the database Creating the database
===================== =====================
Make sure your database is configured to be able to store arbitrary string Make sure your database is configured to be able to store arbitrary string
data. Normally, this means giving it an encoding of UTF-8 or UTF-16. If you use data. Normally, this means giving it an encoding of UTF-8 or UTF-16. If you use
a more restrictive encoding -- for example, latin1 (iso8859-1) -- there will be a more restrictive encoding -- for example, latin1 (iso8859-1) -- you won't be
some characters that you cannot store in the database and information will be able to store certain characters in the database, and information will be lost.
lost.
* For MySQL users, refer to the `MySQL manual`_ (section 10.3.2 for MySQL 5.1) * MySQL users, refer to the `MySQL manual`_ (section 10.3.2 for MySQL 5.1) for
for details on how to set or alter the database character set encoding. details on how to set or alter the database character set encoding.
* For PostgreSQL users, refer to the `PostgreSQL manual`_ (section 21.2.2 in * PostgreSQL users, refer to the `PostgreSQL manual`_ (section 21.2.2 in
PostgreSQL 8) for details on creating databases with the correct encoding. PostgreSQL 8) for details on creating databases with the correct encoding.
* For SQLite users, there is nothing you need to do. SQLite always uses UTF-8 * SQLite users, there is nothing you need to do. SQLite always uses UTF-8
for internal encoding. for internal encoding.
.. _MySQL manual: http://www.mysql.org/doc/refman/5.1/en/charset-database.html .. _MySQL manual: http://www.mysql.org/doc/refman/5.1/en/charset-database.html
@ -37,119 +37,119 @@ convert strings retrieved from the database into Python Unicode strings. You
don't even need to tell Django what encoding your database uses: that is don't even need to tell Django what encoding your database uses: that is
handled transparently. handled transparently.
For more, see the section "The database API" below.
General string handling General string handling
======================= =======================
Whenever you use strings with Django, you have two choices. You can use Unicode Whenever you use strings with Django -- e.g., in database lookups, template
strings or you can use normal strings (sometimes called bytestrings) that are rendering or anywhere else -- you have two choices for encoding those strings.
encoded using UTF-8. You can use Unicode strings, or you can use normal strings (sometimes called
"bytestrings") that are encoded using UTF-8.
.. warning:: .. warning::
A bytestring does not carry any information with it about its encoding. So A bytestring does not carry any information with it about its encoding.
we have to make an assumption and Django assumes that all bytestrings are For that reason, we have to make an assumption, and Django assumes that all
in UTF-8. If you pass a string to Django that has been encoded in some bytestrings are in UTF-8.
other format, things will go wrong in interesting ways. Usually Django will
raise a UnicodeDecodeError at some point.
If your code only uses ASCII data, you are quite safe to simply use your normal If you pass a string to Django that has been encoded in some other format,
strings (since ASCII is a subset of UTF-8) and pass them around at will. things will go wrong in interesting ways. Usually, Django will raise a
``UnicodeDecodeError`` at some point.
Do not be fooled into thinking that if your ``DEFAULT_CHARSET`` setting is set If your code only uses ASCII data, it's safe to use your normal strings,
to something other than ``utf-8`` you can use that encoding in your passing them around at will, because ASCII is a subset of UTF-8.
bytestrings! The ``DEFAULT_CHARSET`` only applies to the strings generated as
the result of template rendering (and email). Django will always assume UTF-8 Don't be fooled into thinking that if your ``DEFAULT_CHARSET`` setting is set
to something other than ``'utf-8'`` you can use that other encoding in your
bytestrings! ``DEFAULT_CHARSET`` only applies to the strings generated as
the result of template rendering (and e-mail). Django will always assume UTF-8
encoding for internal bytestrings. The reason for this is that the encoding for internal bytestrings. The reason for this is that the
``DEFAULT_CHARSET`` setting is not actually under your control (if you are the ``DEFAULT_CHARSET`` setting is not actually under your control (if you are the
application developer). It is under the control of the person installing and application developer). It's under the control of the person installing and
using your application and if they choose a different setting, your code must using your application -- and if that person chooses a different setting, your
still continue to work. Ergo, it cannot rely on that setting. code must still continue to work. Ergo, it cannot rely on that setting.
In most cases when Django is dealing with strings, it will convert them to In most cases when Django is dealing with strings, it will convert them to
Unicode strings before doing anything else. So if you pass in a bytestring, be Unicode strings before doing anything else. So, as a general rule, if you pass
prepared to receive a Unicode string back in the result. in a bytestring, be prepared to receive a Unicode string back in the result.
.. _lazy translation:
Translated strings Translated strings
------------------ ------------------
There is actually a third type of string-like object you may encounter when Aside from Unicode strings and bytestrings, there's a third type of string-like
using Django. If you are using the internationalization features of Django, object you may encounter when using Django. The framework's
there is the concept of a "lazy translation". This is a string that has been internationalization features introduce the concept of a "lazy translation" --
marked as translated, but the actual result is not determined until the object a string that has been marked as translated but whose actual translation result
is used in a string. This is useful because the locale that should be used for isn't determined until the object is used in a string. This feature is useful
the translation will not be known until the string is used, even though the in cases where the translation locale is unknown until the string is used, even
string might have originally been created when the code was first imported. though the string might have originally been created when the code was first
imported.
Normally, you won't have to worry about lazy translations. Just be aware that Normally, you won't have to worry about lazy translations. Just be aware that
if you examine an object and it claims to be a if you examine an object and it claims to be a
``django.utils.functional.__proxy__`` object, it is a lazy translation. ``django.utils.functional.__proxy__`` object, it is a lazy translation.
Calling ``unicode()`` with the translation as the argument will generate a Calling ``unicode()`` with the lazy translation as the argument will generate a
string in the current locale. Unicode string in the current locale.
For more details about lazy translation objects, refer to the For more details about lazy translation objects, refer to the
internationalization_ documentation. internationalization_ documentation.
.. _internationalization: ../i18n/#lazy-translation .. _internationalization: ../i18n/#lazy-translation
.. _utility functions:
Useful utility functions Useful utility functions
------------------------ ------------------------
Since some string operations come up again and again, Django ships with a few Because some string operations come up again and again, Django ships with a few
useful functions that should make working with unicode and bytestring objects useful functions that should make working with Unicode and bytestring objects
a bit easier. a bit easier.
Conversion functions Conversion functions
~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~
The ``django.utils.encoding`` module contains a few functions that are handy The ``django.utils.encoding`` module contains a few functions that are handy
for converting back and forth between unicode and bytestrings. for converting back and forth between Unicode and bytestrings.
* ``smart_unicode(s, encoding='utf-8', errors='strict')`` converts its * ``smart_unicode(s, encoding='utf-8', errors='strict')`` converts its
input to unicode string. The ``encoding`` parameter specifies the input input to a Unicode string. The ``encoding`` parameter specifies the input
encoding of any bytestring -- Django uses this internally when encoding. (For example, Django uses this internally when processing form
processing form input data, for example, which might not be UTF-8 input data, which might not be UTF-8 encoded.) The ``errors`` parameter
encoded. The ``errors`` parameter takes any of the values that are takes any of the values that are accepted by Python's ``unicode()``
accepted by Python's ``unicode()`` function for its error handling. function for its error handling.
If you pass ``smart_unicode()`` an object that has a ``__unicode__`` If you pass ``smart_unicode()`` an object that has a ``__unicode__``
method, it will use that method to do the conversion. method, it will use that method to do the conversion.
* ``force_unicode(s, encoding='utf-8', errors='strict')`` is identical to * ``force_unicode(s, encoding='utf-8', errors='strict')`` is identical to
``smart_unicode()`` in almost all cases. The difference is when the ``smart_unicode()`` in almost all cases. The difference is when the
first argument is a `lazy translation`_ instance. Whilst first argument is a `lazy translation`_ instance. While
``smart_unicode()`` preserves lazy translations, ``force_unicode()`` ``smart_unicode()`` preserves lazy translations, ``force_unicode()``
forces those objects to a unicode string (causing the translation to forces those objects to a Unicode string (causing the translation to
occur). Normally, you will want to use ``smart_unicode()``. However, occur). Normally, you'll want to use ``smart_unicode()``. However,
``force_unicode()`` is useful in filters and template tags when you ``force_unicode()`` is useful in template tags and filters that
absolutely must have a string to work with, not just something that can absolutely *must* have a string to work with, not just something that can
be converted to a string. be converted to a string.
* ``smart_str(s, encoding='utf-8', strings_only=False, errors='strict')`` * ``smart_str(s, encoding='utf-8', strings_only=False, errors='strict')``
is essentially the opposite of ``smart_unicode()``. It forces the first is essentially the opposite of ``smart_unicode()``. It forces the first
argument to a string. The ``strings_only`` parameter, if set to True, argument to a bytestring. The ``strings_only`` parameter, if set to True,
will result in Python integers, booleans and ``None`` not being will result in Python integers, booleans and ``None`` not being
converted to a string (they keep their original types). This is slightly converted to a string (they keep their original types). This is slightly
different semantics from Python's builtin ``str()`` function, but the different semantics from Python's builtin ``str()`` function, but the
difference is needed in a few places internally. difference is needed in a few places within Django's internals.
Normally, you will only need to use ``smart_unicode()``. Call it as early as Normally, you'll only need to use ``smart_unicode()``. Call it as early as
possible on any input data that might be either a unicode or bytestring and possible on any input data that might be either Unicode or a bytestring, and
from then on you can treat the result as always being unicode. from then on, you can treat the result as always being Unicode.
.. _uri_and_iri:
URI and IRI handling URI and IRI handling
~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~
Web frameworks have to deal with URLs (which are a type of URI_). One Web frameworks have to deal with URLs (which are a type of URI_). One
requirement of URLs is that they are encoded using only ASCII characters. requirement of URLs is that they are encoded using only ASCII characters.
However, in an international environment, you will often need to construct a However, in an international environment, you might need to construct a
URL from an IRI_ (very loosely speaking, a URI that can contain unicode URL from an IRI_ -- very loosely speaking, a URI that can contain Unicode
characters). Getting the quoting and conversion from IRI to URI correct can be characters. Quoting and converting an IRI to URI can be a little tricky, so
a little tricky, so Django provides some assistance. Django provides some assistance.
* The function ``django.utils.encoding.iri_to_uri()`` implements the * The function ``django.utils.encoding.iri_to_uri()`` implements the
conversion from IRI to URI as required by the specification (`RFC conversion from IRI to URI as required by the specification (`RFC
@ -158,9 +158,9 @@ a little tricky, so Django provides some assistance.
* The functions ``django.utils.http.urlquote()`` and * The functions ``django.utils.http.urlquote()`` and
``django.utils.http.urlquote_plus()`` are versions of Python's standard ``django.utils.http.urlquote_plus()`` are versions of Python's standard
``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII ``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII
characters (the data is converted to UTF-8 prior to encoding). characters. (The data is converted to UTF-8 prior to encoding.)
These two groups of functions have slightly different purposes and it is These two groups of functions have slightly different purposes, and it's
important to keep them straight. Normally, you would use ``urlquote()`` on the important to keep them straight. Normally, you would use ``urlquote()`` on the
individual portions of the IRI or URI path so that any reserved characters individual portions of the IRI or URI path so that any reserved characters
such as '&' or '%' are correctly encoded. Then, you apply ``iri_to_uri()`` to such as '&' or '%' are correctly encoded. Then, you apply ``iri_to_uri()`` to
@ -168,10 +168,9 @@ the full IRI and it converts any non-ASCII characters to the correct encoded
values. values.
.. note:: .. note::
It isn't completely correct to say that ``iri_to_uri()`` implements the Technically, it isn't correct to say that ``iri_to_uri()`` implements the
full algorithm in the IRI specification. It does not perform the full algorithm in the IRI specification. It doesn't (yet) perform the
international domain name encoding portion of the algorithm (at the international domain name encoding portion of the algorithm.
moment).
The ``iri_to_uri()`` function will not change ASCII characters that are The ``iri_to_uri()`` function will not change ASCII characters that are
otherwise permitted in a URL. So, for example, the character '%' is not otherwise permitted in a URL. So, for example, the character '%' is not
@ -208,45 +207,46 @@ double-quoting problems.
Models Models
====== ======
Because all strings are returned from the database as unicode strings, model Because all strings are returned from the database as Unicode strings, model
fields that are character based (CharField, TextField, URLField, etc.) will fields that are character based (CharField, TextField, URLField, etc.) will
contain unicode values when Django retrieves the model from the database. This contain Unicode values when Django retrieves data from the database. This
is always the case, even if the data could fit into an ASCII string. is *always* the case, even if the data could fit into an ASCII bytestring.
As always, you can pass in bytestrings when creating a model or populating a You can pass in bytestrings when creating a model or populating a field, and
field and Django will convert it to unicode when it needs to. Django will convert it to Unicode when it needs to.
Choosing between ``__str__()`` and ``__unicode__()`` Choosing between ``__str__()`` and ``__unicode__()``
----------------------------------------------------- ----------------------------------------------------
One consequence of using unicode by default is that you have to take some care One consequence of using Unicode by default is that you have to take some care
when printing data from the model. In particular, rather than writing a when printing data from the model.
``__str__()`` method, it is recommended to write a ``__unicode__()`` method for
your model. In the ``__unicode__()`` method, you can quite safely return the
values of all your fields without having to worry about whether they fit into a
bytestring or not (the result of ``__str__()`` is *always* a bytestring, even
if you accidentally try to return a unicode object).
You can still create a ``__str__()`` method on your models if you wish, of In particular, rather than giving your model a ``__str__()`` method, we
course. However, Django's ``Model`` base class automatically provides you with recommend you implement a ``__unicode__()`` method. In the ``__unicode__()``
a ``__str__()`` method that calls your ``__unicode__()`` method and then method, you can quite safely return the values of all your fields without
encodes the result correctly into UTF-8. So you would normally only create a having to worry about whether they fit into a bytestring or not. (The way
``__unicode__()`` method and let Django handle the coercion to a bytestring Python works, the result of ``__str__()`` is *always* a bytestring, even if you
when required. accidentally try to return a Unicode object.)
You can still create a ``__str__()`` method on your models if you want, of
course, but you shouldn't need to do this unless you have a good reason.
Django's ``Model`` base class automatically provides a ``__str__()``
implementation that calls ``__unicode__()`` and encodes the result into UTF-8.
This means you'll normally only need to implement a ``__unicode__()`` method
and let Django handle the coercion to a bytestring when required.
Taking care in ``get_absolute_url()`` Taking care in ``get_absolute_url()``
------------------------------------- -------------------------------------
URLs can only contain ASCII characters. If you are constructing a URL from URLs can only contain ASCII characters. If you're constructing a URL from
pieces of data that might be non-ASCII, you must be careful to encode the pieces of data that might be non-ASCII, be careful to encode the results in a
results in a way that is suitable for a URL. If you are using the way that is suitable for a URL. The ``django.db.models.permalink()`` decorator
``django.db.models.permalink()`` decorator, this is handled automatically by handles this for you automatically.
the decorator.
If you are constructing the URL manually, you need to take care of the If you're constructing a URL manually (i.e., *not* using the ``permalink()``
encoding yourself. Normally, this would involve a combination of the decorator), you'll need to take care of the encoding yourself. In this case,
``iri_to_uri()`` and ``urlquote()`` functions that were documented above_. For use the ``iri_to_uri()`` and ``urlquote()`` functions that were documented
example:: above_. For example::
from django.utils.encoding import iri_to_uri from django.utils.encoding import iri_to_uri
from django.utils.http import urlquote from django.utils.http import urlquote
@ -265,28 +265,31 @@ non-ASCII characters would have been removed in quoting in the first line.)
The database API The database API
================ ================
You can happily pass unicode strings or bytestrings as arguments to You can pass either Unicode strings or UTF-8 bytestrings as arguments to
``filter()`` methods and the like in the database API. The following two ``filter()`` methods and the like in the database API. The following two
querysets are identical:: querysets are identical::
qs = People.objects.filter(name__contains=u'Å') qs = People.objects.filter(name__contains=u'Å')
qs = People.objects.filter(name__contains='\xc3\x85') # UTF-8 encoding of Å qs = People.objects.filter(name__contains='\xc3\x85') # UTF-8 encoding of Å
Templates Templates
========= =========
As usual, templates can be created from unicode or bytestrings. However, they You can use either Unicode or bytestrings when creating templates manually::
can also be created by reading a file from disk and this creates a slight
complication: not all filesystems store their data encoded as UTF-8. If your
template files are not stored with a UTF-8 encoding, set the ``FILE_CHARSET``
setting to the encoding of the on-disk files. When Django reads in a template
file it will convert the data from this encoding to unicode.
When a template is rendered for sending out as an HTML document or an e-mail, from django.template import Template
it may be convenient to use an encoding other than UTF-8. You should set the t1 = Template('This is a bytestring template.')
``DEFAULT_CHARSET`` parameter to control the rendered template encoding (the t2 = Template(u'This is a Unicode template.')
default setting is utf-8).
But the common case is to read templates from the filesystem, and this creates
a slight complication: not all filesystems store their data encoded as UTF-8.
If your template files are not stored with a UTF-8 encoding, set the ``FILE_CHARSET``
setting to the encoding of the files on disk. When Django reads in a template
file, it will convert the data from this encoding to Unicode. (``FILE_CHARSET``
is set to ``'utf-8'`` by default.)
The ``DEFAULT_CHARSET`` setting controls the encoding of rendered templates.
This is set to UTF-8 by default.
Template tags and filters Template tags and filters
------------------------- -------------------------
@ -299,18 +302,20 @@ A couple of tips to remember when writing your own template tags and filters:
* Use ``force_unicode()`` in preference to ``smart_unicode()`` in these * Use ``force_unicode()`` in preference to ``smart_unicode()`` in these
places. Tag rendering and filter calls occur as the template is being places. Tag rendering and filter calls occur as the template is being
rendered, so there is no advantage to postponing the conversion of lazy rendered, so there is no advantage to postponing the conversion of lazy
transation objects into strings any longer. It is easier to work solely translation objects into strings. It's easier to work solely with Unicode
with Unicode strings at this point. strings at that point.
E-mail E-mail
====== ======
Django's email framework (in ``django.core.mail``) supports unicode Django's e-mail framework (in ``django.core.mail``) supports Unicode
transparently. You can use unicode data in the message bodies and any headers. transparently. You can use Unicode data in the message bodies and any headers.
However, you must still respect the requirements of the email specifications, However, you're still obligated to respect the requirements of the e-mail
so, for example, email addresses should use ASCII characters. The following specifications, so, for example, e-mail addresses should use only ASCII
code is certainly possible (demonstrating the everything except e-mail characters.
addresses can be non-ASCII)::
The following code example demonstrates that everything except e-mail addresses
can be non-ASCII::
from django.core.mail import EmailMessage from django.core.mail import EmailMessage
@ -320,19 +325,20 @@ addresses can be non-ASCII)::
body = u'...' body = u'...'
EmailMessage(subject, body, sender, recipients).send() EmailMessage(subject, body, sender, recipients).send()
Form submission Form submission
=============== ===============
HTML form submission is a tricky area. There is no guarantee that the HTML form submission is a tricky area. There's no guarantee that the
submission will include encoding information. submission will include encoding information, which means the framework might
have to guess at the encoding of submitted data.
Django adopts a "lazy" approach to decoding form data. The data in an Django adopts a "lazy" approach to decoding form data. The data in an
``HttpRequest`` object is only decoded when you access it. In fact, most of ``HttpRequest`` object is only decoded when you access it. In fact, most of
the data is not decoded at all. Only the ``HttpRequest.GET`` and the data is not decoded at all. Only the ``HttpRequest.GET`` and
``HttpRequest.POST`` data structures have any decoding applied to them. Those ``HttpRequest.POST`` data structures have any decoding applied to them. Those
two fields will return their members as unicode data. All other members will two fields will return their members as Unicode data. All other attributes and
be returned exactly as they were submitted by the client. methods of ``HttpRequest`` return data exactly as it was submitted by the
client.
By default, the ``DEFAULT_CHARSET`` setting is used as the assumed encoding By default, the ``DEFAULT_CHARSET`` setting is used as the assumed encoding
for form data. If you need to change this for a particular form, you can set for form data. If you need to change this for a particular form, you can set
@ -346,14 +352,12 @@ does this for you. For example::
... ...
You can even change the encoding after having accessed ``request.GET`` or You can even change the encoding after having accessed ``request.GET`` or
``request.POST`` and all subsequent accesses will use the new encoding. ``request.POST``, and all subsequent accesses will use the new encoding.
It will typically be very rare that you would need to worry about changing the Most developers won't need to worry about changing form encoding, but this is
form encoding. However, if you are talking to a legacy system or a system a useful feature for applications that talk to legacy systems whose encoding
beyond your control with particular ideas about encoding, you do have a way to you cannot control.
control the decoding of the data.
For request features such as file uploads, no automatic decoding takes place,
because those attributes are normally treated as collections of bytes, rather
than strings. Any decoding would alter the meaning of the stream of bytes.
Django does not decode the data of file uploads, because that data is normally
treated as collections of bytes, rather than strings. Any automatic decoding
there would alter the meaning of the stream of bytes.