From 4e4082f9396e21de0bd88dbfc651da9ad01c7c0c Mon Sep 17 00:00:00 2001 From: Nikita Marchant Date: Wed, 15 Sep 2021 12:57:49 +0200 Subject: [PATCH] Fixed #32492 -- Added TrigramWordSimilarity() and TrigramWordDistance() on PostgreSQL. --- AUTHORS | 1 + django/contrib/postgres/apps.py | 6 ++- django/contrib/postgres/lookups.py | 5 ++ django/contrib/postgres/search.py | 18 +++++++ docs/ref/contrib/postgres/lookups.txt | 30 ++++++++++-- docs/ref/contrib/postgres/search.txt | 47 ++++++++++++++++++- docs/releases/4.0.txt | 7 +++ .../migrations/0002_create_test_models.py | 2 +- tests/postgres_tests/models.py | 2 +- tests/postgres_tests/test_trigram.py | 39 ++++++++++++++- 10 files changed, 148 insertions(+), 9 deletions(-) diff --git a/AUTHORS b/AUTHORS index 265163357e..214aca2480 100644 --- a/AUTHORS +++ b/AUTHORS @@ -710,6 +710,7 @@ answer newbie questions, and generally made Django that much better: Nicola Larosa Nicolas Lara Nicolas Noé + Nikita Marchant Niran Babalola Nis Jørgensen Nowell Strite diff --git a/django/contrib/postgres/apps.py b/django/contrib/postgres/apps.py index 781c8728f2..91fc29ac05 100644 --- a/django/contrib/postgres/apps.py +++ b/django/contrib/postgres/apps.py @@ -13,7 +13,7 @@ from django.test.signals import setting_changed from django.utils.translation import gettext_lazy as _ from .indexes import OpClass -from .lookups import SearchLookup, TrigramSimilar, Unaccent +from .lookups import SearchLookup, TrigramSimilar, TrigramWordSimilar, Unaccent from .serializers import RangeSerializer from .signals import register_type_handlers @@ -33,6 +33,8 @@ def uninstall_if_needed(setting, value, enter, **kwargs): TextField._unregister_lookup(SearchLookup) CharField._unregister_lookup(TrigramSimilar) TextField._unregister_lookup(TrigramSimilar) + CharField._unregister_lookup(TrigramWordSimilar) + TextField._unregister_lookup(TrigramWordSimilar) # Disconnect this receiver until the next time this app is installed # and ready() connects it again to 
prevent unnecessary processing on # each setting change. @@ -65,5 +67,7 @@ class PostgresConfig(AppConfig): TextField.register_lookup(SearchLookup) CharField.register_lookup(TrigramSimilar) TextField.register_lookup(TrigramSimilar) + CharField.register_lookup(TrigramWordSimilar) + TextField.register_lookup(TrigramWordSimilar) MigrationWriter.register_serializer(RANGE_TYPES, RangeSerializer) IndexExpression.register_wrappers(OrderBy, OpClass, Collate) diff --git a/django/contrib/postgres/lookups.py b/django/contrib/postgres/lookups.py index 28d8590e1d..f7c6fc4b0c 100644 --- a/django/contrib/postgres/lookups.py +++ b/django/contrib/postgres/lookups.py @@ -58,3 +58,8 @@ class SearchLookup(SearchVectorExact): class TrigramSimilar(PostgresOperatorLookup): lookup_name = 'trigram_similar' postgres_operator = '%%' + + +class TrigramWordSimilar(PostgresOperatorLookup): + lookup_name = 'trigram_word_similar' + postgres_operator = '%%>' diff --git a/django/contrib/postgres/search.py b/django/contrib/postgres/search.py index f1640d85ba..164d359b91 100644 --- a/django/contrib/postgres/search.py +++ b/django/contrib/postgres/search.py @@ -293,6 +293,15 @@ class TrigramBase(Func): super().__init__(expression, string, **extra) +class TrigramWordBase(Func): + output_field = FloatField() + + def __init__(self, string, expression, **extra): + if not hasattr(string, 'resolve_expression'): + string = Value(string) + super().__init__(string, expression, **extra) + + class TrigramSimilarity(TrigramBase): function = 'SIMILARITY' @@ -300,3 +309,12 @@ class TrigramSimilarity(TrigramBase): class TrigramDistance(TrigramBase): function = '' arg_joiner = ' <-> ' + + +class TrigramWordDistance(TrigramWordBase): + function = '' + arg_joiner = ' <<-> ' + + +class TrigramWordSimilarity(TrigramWordBase): + function = 'WORD_SIMILARITY' diff --git a/docs/ref/contrib/postgres/lookups.txt b/docs/ref/contrib/postgres/lookups.txt index ab7a954bf2..d9f76318cc 100644 --- 
a/docs/ref/contrib/postgres/lookups.txt +++ b/docs/ref/contrib/postgres/lookups.txt @@ -14,9 +14,8 @@ returns results that have a similarity measurement greater than the current similarity threshold. To use it, add ``'django.contrib.postgres'`` in your :setting:`INSTALLED_APPS` -and activate the `pg_trgm extension -`_ on PostgreSQL. You can -install the extension using the +and activate the `pg_trgm extension`_ on PostgreSQL. You can install the +extension using the :class:`~django.contrib.postgres.operations.TrigramExtension` migration operation. @@ -26,6 +25,31 @@ The ``trigram_similar`` lookup can be used on >>> City.objects.filter(name__trigram_similar="Middlesborough") [''] +.. fieldlookup:: trigram_word_similar + +.. versionadded:: 4.0 + +The ``trigram_word_similar`` lookup allows you to perform trigram word +similarity lookups using a dedicated PostgreSQL extension. It can be +approximately understood as measuring the greatest number of trigrams shared +between the parameter and any substring of the field. A trigram word lookup is +given an expression and returns results that have a word similarity measurement +greater than the current similarity threshold. + +To use it, add ``'django.contrib.postgres'`` in your :setting:`INSTALLED_APPS` +and activate the `pg_trgm extension`_ on PostgreSQL. You can install the +extension using the +:class:`~django.contrib.postgres.operations.TrigramExtension` migration +operation. + +The ``trigram_word_similar`` lookup can be used on +:class:`~django.db.models.CharField` and :class:`~django.db.models.TextField`:: + + >>> Sentence.objects.filter(name__trigram_word_similar='Middlesborough') + [''] + +.. 
_`pg_trgm extension`: https://www.postgresql.org/docs/current/pgtrgm.html + ``Unaccent`` ============ diff --git a/docs/ref/contrib/postgres/search.txt b/docs/ref/contrib/postgres/search.txt index fe4e86f05e..cfed877d9c 100644 --- a/docs/ref/contrib/postgres/search.txt +++ b/docs/ref/contrib/postgres/search.txt @@ -280,8 +280,9 @@ Trigram similarity ================== Another approach to searching is trigram similarity. A trigram is a group of -three consecutive characters. In addition to the :lookup:`trigram_similar` -lookup, you can use a couple of other expressions. +three consecutive characters. In addition to the :lookup:`trigram_similar` and +:lookup:`trigram_word_similar` lookups, you can use a couple of other +expressions. To use them, you need to activate the `pg_trgm extension <https://www.postgresql.org/docs/current/pgtrgm.html>`_ on PostgreSQL. You can @@ -308,6 +309,27 @@ Usage example:: ... ).filter(similarity__gt=0.3).order_by('-similarity') [, ] +``TrigramWordSimilarity`` +------------------------- + +.. versionadded:: 4.0 + +.. class:: TrigramWordSimilarity(string, expression, **extra) + +Accepts a string or expression, and a field name or expression. Returns the +trigram word similarity between the two arguments. + +Usage example:: + + >>> from django.contrib.postgres.search import TrigramWordSimilarity + >>> Author.objects.create(name='Katy Stevens') + >>> Author.objects.create(name='Stephen Keats') + >>> test = 'Kat' + >>> Author.objects.annotate( + ... similarity=TrigramWordSimilarity(test, 'name'), + ... ).filter(similarity__gt=0.3).order_by('-similarity') + [] + ``TrigramDistance`` ------------------- @@ -326,3 +348,24 @@ Usage example:: ... distance=TrigramDistance('name', test), ... ).filter(distance__lte=0.7).order_by('distance') [, ] + +``TrigramWordDistance`` +----------------------- + +.. versionadded:: 4.0 + +.. class:: TrigramWordDistance(string, expression, **extra) + +Accepts a string or expression, and a field name or expression. 
Returns the +trigram word distance between the two arguments. + +Usage example:: + + >>> from django.contrib.postgres.search import TrigramWordDistance + >>> Author.objects.create(name='Katy Stevens') + >>> Author.objects.create(name='Stephen Keats') + >>> test = 'Kat' + >>> Author.objects.annotate( + ... distance=TrigramWordDistance(test, 'name'), + ... ).filter(distance__lte=0.7).order_by('distance') + [] diff --git a/docs/releases/4.0.txt b/docs/releases/4.0.txt index 6ec04824f2..cf8fb3dff4 100644 --- a/docs/releases/4.0.txt +++ b/docs/releases/4.0.txt @@ -200,6 +200,13 @@ Minor features expression allows using subqueries to construct lists of values on PostgreSQL. +* The new :lookup:`trigram_word_similar` lookup, and the + :class:`TrigramWordDistance() + <django.contrib.postgres.search.TrigramWordDistance>` and + :class:`TrigramWordSimilarity() + <django.contrib.postgres.search.TrigramWordSimilarity>` expressions allow + using trigram word similarity. + :mod:`django.contrib.redirects` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/tests/postgres_tests/migrations/0002_create_test_models.py b/tests/postgres_tests/migrations/0002_create_test_models.py index c3ab5efed7..1ce875441b 100644 --- a/tests/postgres_tests/migrations/0002_create_test_models.py +++ b/tests/postgres_tests/migrations/0002_create_test_models.py @@ -110,7 +110,7 @@ class Migration(migrations.Migration): name='CharFieldModel', fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), - ('field', models.CharField(max_length=16)), + ('field', models.CharField(max_length=64)), ], options=None, bases=None, diff --git a/tests/postgres_tests/models.py b/tests/postgres_tests/models.py index c7f55a2661..ddae9d1edb 100644 --- a/tests/postgres_tests/models.py +++ b/tests/postgres_tests/models.py @@ -83,7 +83,7 @@ class ArrayEnumModel(PostgreSQLModel): class CharFieldModel(models.Model): - field = models.CharField(max_length=16) + field = models.CharField(max_length=64) class TextFieldModel(models.Model): diff --git a/tests/postgres_tests/test_trigram.py 
b/tests/postgres_tests/test_trigram.py index a5d7d868be..079a32a19b 100644 --- a/tests/postgres_tests/test_trigram.py +++ b/tests/postgres_tests/test_trigram.py @@ -5,7 +5,8 @@ from .models import CharFieldModel, TextFieldModel try: from django.contrib.postgres.search import ( - TrigramDistance, TrigramSimilarity, + TrigramDistance, TrigramSimilarity, TrigramWordDistance, + TrigramWordSimilarity, ) except ImportError: pass @@ -30,6 +31,15 @@ class TrigramTest(PostgreSQLTestCase): transform=lambda instance: instance.field, ) + def test_trigram_word_search(self): + obj = self.Model.objects.create( + field='Gumby rides on the path of Middlesbrough', + ) + self.assertSequenceEqual( + self.Model.objects.filter(field__trigram_word_similar='Middlesborough'), + [obj], + ) + def test_trigram_similarity(self): search = 'Bat sat on cat.' # Round result of similarity because PostgreSQL 12+ uses greater @@ -43,6 +53,20 @@ class TrigramTest(PostgreSQLTestCase): ordered=True, ) + def test_trigram_word_similarity(self): + search = 'mat' + self.assertSequenceEqual( + self.Model.objects.filter( + field__trigram_word_similar=search, + ).annotate( + word_similarity=TrigramWordSimilarity(search, 'field'), + ).values('field', 'word_similarity').order_by('-word_similarity'), + [ + {'field': 'Cat sat on mat.', 'word_similarity': 1.0}, + {'field': 'Matthew', 'word_similarity': 0.75}, + ], + ) + def test_trigram_similarity_alternate(self): # Round result of distance because PostgreSQL 12+ uses greater # precision. 
@@ -55,6 +79,19 @@ class TrigramTest(PostgreSQLTestCase): ordered=True, ) + def test_trigram_word_similarity_alternate(self): + self.assertSequenceEqual( + self.Model.objects.annotate( + word_distance=TrigramWordDistance('mat', 'field'), + ).filter( + word_distance__lte=0.7, + ).values('field', 'word_distance').order_by('word_distance'), + [ + {'field': 'Cat sat on mat.', 'word_distance': 0}, + {'field': 'Matthew', 'word_distance': 0.25}, + ], + ) + class TrigramTextFieldTest(TrigramTest): """