From 1962a96a30e02de78a674a2e02979c00cc55655b Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Fri, 5 Jun 2015 17:37:48 +0100 Subject: [PATCH] Fixed #24938 -- Added PostgreSQL trigram support. --- django/contrib/postgres/apps.py | 4 +- django/contrib/postgres/lookups.py | 5 ++ django/contrib/postgres/operations.py | 6 ++ django/contrib/postgres/search.py | 16 ++++++ docs/ref/contrib/postgres/lookups.txt | 26 +++++++++ docs/ref/contrib/postgres/operations.txt | 10 ++++ docs/ref/contrib/postgres/search.txt | 55 +++++++++++++++++++ docs/releases/1.10.txt | 4 ++ docs/topics/db/search.txt | 5 +- .../migrations/0001_setup_extensions.py | 4 +- tests/postgres_tests/test_trigram.py | 53 ++++++++++++++++++ 11 files changed, 184 insertions(+), 4 deletions(-) create mode 100644 tests/postgres_tests/test_trigram.py diff --git a/django/contrib/postgres/apps.py b/django/contrib/postgres/apps.py index e7cdfd3866..e7f76b2017 100644 --- a/django/contrib/postgres/apps.py +++ b/django/contrib/postgres/apps.py @@ -3,7 +3,7 @@ from django.db.backends.signals import connection_created from django.db.models import CharField, TextField from django.utils.translation import ugettext_lazy as _ -from .lookups import SearchLookup, Unaccent +from .lookups import SearchLookup, TrigramSimilar, Unaccent from .signals import register_hstore_handler @@ -17,3 +17,5 @@ class PostgresConfig(AppConfig): TextField.register_lookup(Unaccent) CharField.register_lookup(SearchLookup) TextField.register_lookup(SearchLookup) + CharField.register_lookup(TrigramSimilar) + TextField.register_lookup(TrigramSimilar) diff --git a/django/contrib/postgres/lookups.py b/django/contrib/postgres/lookups.py index a39c9679e7..53a62eacd1 100644 --- a/django/contrib/postgres/lookups.py +++ b/django/contrib/postgres/lookups.py @@ -60,3 +60,8 @@ class SearchLookup(SearchVectorExact): self.lhs = SearchVector(self.lhs) lhs, lhs_params = super(SearchLookup, self).process_lhs(qn, connection) return lhs, lhs_params + + +class 
TrigramSimilar(PostgresSimpleLookup): + lookup_name = 'trigram_similar' + operator = '%%' diff --git a/django/contrib/postgres/operations.py b/django/contrib/postgres/operations.py index 2a0a8402ed..5992f00725 100644 --- a/django/contrib/postgres/operations.py +++ b/django/contrib/postgres/operations.py @@ -40,3 +40,9 @@ class UnaccentExtension(CreateExtension): def __init__(self): self.name = 'unaccent' + + +class TrigramExtension(CreateExtension): + + def __init__(self): + self.name = 'pg_trgm' diff --git a/django/contrib/postgres/search.py b/django/contrib/postgres/search.py index 91358c62aa..4628f4cf19 100644 --- a/django/contrib/postgres/search.py +++ b/django/contrib/postgres/search.py @@ -185,3 +185,19 @@ class SearchRank(Func): SearchVectorField.register_lookup(SearchVectorExact) + + +class TrigramBase(Func): + def __init__(self, expression, string, **extra): + if not hasattr(string, 'resolve_expression'): + string = Value(string) + super(TrigramBase, self).__init__(expression, string, output_field=FloatField(), **extra) + + +class TrigramSimilarity(TrigramBase): + function = 'SIMILARITY' + + +class TrigramDistance(TrigramBase): + function = '' + arg_joiner = ' <-> ' diff --git a/docs/ref/contrib/postgres/lookups.txt b/docs/ref/contrib/postgres/lookups.txt index 1f0af07b0d..daf784e221 100644 --- a/docs/ref/contrib/postgres/lookups.txt +++ b/docs/ref/contrib/postgres/lookups.txt @@ -2,6 +2,32 @@ PostgreSQL specific lookups =========================== +Trigram similarity +================== + +.. fieldlookup:: trigram_similar + +.. versionadded:: 1.10 + +The ``trigram_similar`` lookup allows you to perform trigram lookups, +measuring the number of trigrams (three consecutive characters) shared, using a +dedicated PostgreSQL extension. A trigram lookup is given an expression and +returns results that have a similarity measurement greater than the current +similarity threshold. 
+ +To use it, add ``'django.contrib.postgres'`` in your :setting:`INSTALLED_APPS` +and activate the `pg_trgm extension +<http://www.postgresql.org/docs/current/static/pgtrgm.html>`_ on +PostgreSQL. You can install the extension using the +:class:`~django.contrib.postgres.operations.TrigramExtension` migration +operation. + +The ``trigram_similar`` lookup can be used on +:class:`~django.db.models.CharField` and :class:`~django.db.models.TextField`:: + + >>> City.objects.filter(name__trigram_similar="Middlesborough") + ['<City: Middlesbrough>'] + ``Unaccent`` ============ diff --git a/docs/ref/contrib/postgres/operations.txt b/docs/ref/contrib/postgres/operations.txt index b889105283..f4dd44ebbd 100644 --- a/docs/ref/contrib/postgres/operations.txt +++ b/docs/ref/contrib/postgres/operations.txt @@ -27,6 +27,16 @@ the ``django.contrib.postgres.operations`` module. which will install the ``hstore`` extension and also immediately set up the connection to interpret hstore data. +``TrigramExtension`` +==================== + +.. class:: TrigramExtension() + + .. versionadded:: 1.10 + + A subclass of :class:`~django.contrib.postgres.operations.CreateExtension` + that installs the ``pg_trgm`` extension. + ``UnaccentExtension`` ===================== diff --git a/docs/ref/contrib/postgres/search.txt b/docs/ref/contrib/postgres/search.txt index 592e730e8e..24000d341b 100644 --- a/docs/ref/contrib/postgres/search.txt +++ b/docs/ref/contrib/postgres/search.txt @@ -189,3 +189,58 @@ if it were an annotated ``SearchVector``:: [, ] .. _PostgreSQL documentation: http://www.postgresql.org/docs/current/static/textsearch-features.html#TEXTSEARCH-UPDATE-TRIGGERS + +Trigram similarity +================== + +Another approach to searching is trigram similarity. A trigram is a group of +three consecutive characters. In addition to the :lookup:`trigram_similar` +lookup, you can use a couple of other expressions. + +To use them, you need to activate the `pg_trgm extension +<http://www.postgresql.org/docs/current/static/pgtrgm.html>`_ on +PostgreSQL. 
You can install it using the +:class:`~django.contrib.postgres.operations.TrigramExtension` migration +operation. + +``TrigramSimilarity`` +--------------------- + +.. class:: TrigramSimilarity(expression, string, **extra) + +.. versionadded:: 1.10 + +Accepts a field name or expression, and a string or expression. Returns the +trigram similarity between the two arguments. + +Usage example:: + + >>> from django.contrib.postgres.search import TrigramSimilarity + >>> Author.objects.create(name='Katy Stevens') + >>> Author.objects.create(name='Stephen Keats') + >>> test = 'Katie Stephens' + >>> Author.objects.annotate( + ... similarity=TrigramSimilarity('name', test), + ... ).filter(similarity__gt=0.3).order_by('-similarity') + [<Author: Katy Stevens>, <Author: Stephen Keats>] + +``TrigramDistance`` +------------------- + +.. class:: TrigramDistance(expression, string, **extra) + +.. versionadded:: 1.10 + +Accepts a field name or expression, and a string or expression. Returns the +trigram distance between the two arguments. + +Usage example:: + + >>> from django.contrib.postgres.search import TrigramDistance + >>> Author.objects.create(name='Katy Stevens') + >>> Author.objects.create(name='Stephen Keats') + >>> test = 'Katie Stephens' + >>> Author.objects.annotate( + ... distance=TrigramDistance('name', test), + ... ).filter(distance__lte=0.7).order_by('distance') + [<Author: Katy Stevens>, <Author: Stephen Keats>] diff --git a/docs/releases/1.10.txt b/docs/releases/1.10.txt index 29af587eec..0463a284b3 100644 --- a/docs/releases/1.10.txt +++ b/docs/releases/1.10.txt @@ -33,6 +33,10 @@ search engine. You can search across multiple fields in your relational database, combine the searches with other lookups, use different language configurations and weightings, and rank the results by relevance. +It also now includes trigram support, using the :lookup:`trigram_similar` +lookup, and the :class:`~django.contrib.postgres.search.TrigramSimilarity` and +:class:`~django.contrib.postgres.search.TrigramDistance` expressions. 
+ Minor features -------------- diff --git a/docs/topics/db/search.txt b/docs/topics/db/search.txt index fd62c6909c..04d84552df 100644 --- a/docs/topics/db/search.txt +++ b/docs/topics/db/search.txt @@ -55,11 +55,12 @@ use :lookup:`unaccented comparison `:: This shows another issue, where we are matching against a different spelling of the name. In this case we have an asymmetry though - a search for ``Helen`` will pick up ``Helena`` or ``Hélène``, but not the reverse. Another option -would be to use a trigram comparison, which compares sequences of letters. +would be to use a :lookup:`trigram_similar` comparison, which compares +sequences of letters. For example:: - >>> Author.objects.filter(name__unaccent__lower__trigram='Hélène') + >>> Author.objects.filter(name__unaccent__lower__trigram_similar='Hélène') [, ] Now we have a different problem - the longer name of "Helena Bonham Carter" diff --git a/tests/postgres_tests/migrations/0001_setup_extensions.py b/tests/postgres_tests/migrations/0001_setup_extensions.py index 07d5bfc7e7..400dd091f4 100644 --- a/tests/postgres_tests/migrations/0001_setup_extensions.py +++ b/tests/postgres_tests/migrations/0001_setup_extensions.py @@ -5,12 +5,13 @@ from django.db import migrations try: from django.contrib.postgres.operations import ( - CreateExtension, HStoreExtension, UnaccentExtension, + CreateExtension, HStoreExtension, TrigramExtension, UnaccentExtension, ) except ImportError: from django.test import mock CreateExtension = mock.Mock() HStoreExtension = mock.Mock() + TrigramExtension = mock.Mock() UnaccentExtension = mock.Mock() @@ -21,5 +22,6 @@ class Migration(migrations.Migration): # dash in its name. 
CreateExtension('uuid-ossp'), HStoreExtension(), + TrigramExtension(), UnaccentExtension(), ] diff --git a/tests/postgres_tests/test_trigram.py b/tests/postgres_tests/test_trigram.py new file mode 100644 index 0000000000..b340b41869 --- /dev/null +++ b/tests/postgres_tests/test_trigram.py @@ -0,0 +1,53 @@ +from django.contrib.postgres.search import TrigramDistance, TrigramSimilarity +from django.test import modify_settings + +from . import PostgreSQLTestCase +from .models import CharFieldModel, TextFieldModel + + +@modify_settings(INSTALLED_APPS={'append': 'django.contrib.postgres'}) +class TrigramTest(PostgreSQLTestCase): + Model = CharFieldModel + + @classmethod + def setUpTestData(cls): + cls.Model.objects.bulk_create([ + cls.Model(field='Matthew'), + cls.Model(field='Cat sat on mat.'), + cls.Model(field='Dog sat on rug.'), + ]) + + def test_trigram_search(self): + self.assertQuerysetEqual( + self.Model.objects.filter(field__trigram_similar='Mathew'), + ['Matthew'], + transform=lambda instance: instance.field, + ) + + def test_trigram_similarity(self): + search = 'Bat sat on cat.' + self.assertQuerysetEqual( + self.Model.objects.filter( + field__trigram_similar=search, + ).annotate(similarity=TrigramSimilarity('field', search)).order_by('-similarity'), + [('Cat sat on mat.', 0.625), ('Dog sat on rug.', 0.333333)], + transform=lambda instance: (instance.field, instance.similarity), + ordered=True, + ) + + def test_trigram_similarity_alternate(self): + self.assertQuerysetEqual( + self.Model.objects.annotate( + distance=TrigramDistance('field', 'Bat sat on cat.'), + ).filter(distance__lte=0.7).order_by('distance'), + [('Cat sat on mat.', 0.375), ('Dog sat on rug.', 0.666667)], + transform=lambda instance: (instance.field, instance.distance), + ordered=True, + ) + + +class TrigramTextFieldTest(TrigramTest): + """ + TextField has the same behavior as CharField regarding trigram lookups. + """ + Model = TextFieldModel