From 8d160f154f0240a423e83ffe0690e472f837373c Mon Sep 17 00:00:00 2001 From: Matt Brewer Date: Fri, 17 Jun 2022 08:44:03 +0100 Subject: [PATCH] Fixed #33788 -- Added TrigramStrictWordSimilarity() and TrigramStrictWordDistance() on PostgreSQL. --- AUTHORS | 1 + django/contrib/postgres/apps.py | 12 ++++++- django/contrib/postgres/lookups.py | 5 +++ django/contrib/postgres/search.py | 9 +++++ docs/ref/contrib/postgres/lookups.txt | 25 ++++++++++++++ docs/ref/contrib/postgres/search.txt | 28 ++++++++++++++-- docs/releases/4.2.txt | 7 +++- tests/postgres_tests/test_trigram.py | 48 +++++++++++++++++++++++++++ 8 files changed, 130 insertions(+), 5 deletions(-) diff --git a/AUTHORS b/AUTHORS index b726a8a67b..d8a3cf9103 100644 --- a/AUTHORS +++ b/AUTHORS @@ -636,6 +636,7 @@ answer newbie questions, and generally made Django that much better: Mathieu Agopian Matías Bordese Matt Boersma + Matt Brewer Matt Croydon Matt Deacalion Stevens Matt Dennenbaum diff --git a/django/contrib/postgres/apps.py b/django/contrib/postgres/apps.py index 79fbe57c8f..494cea245a 100644 --- a/django/contrib/postgres/apps.py +++ b/django/contrib/postgres/apps.py @@ -11,7 +11,13 @@ from django.db.models.indexes import IndexExpression from django.utils.translation import gettext_lazy as _ from .indexes import OpClass -from .lookups import SearchLookup, TrigramSimilar, TrigramWordSimilar, Unaccent +from .lookups import ( + SearchLookup, + TrigramSimilar, + TrigramStrictWordSimilar, + TrigramWordSimilar, + Unaccent, +) from .serializers import RangeSerializer from .signals import register_type_handlers @@ -37,6 +43,8 @@ def uninstall_if_needed(setting, value, enter, **kwargs): TextField._unregister_lookup(TrigramSimilar) CharField._unregister_lookup(TrigramWordSimilar) TextField._unregister_lookup(TrigramWordSimilar) + CharField._unregister_lookup(TrigramStrictWordSimilar) + TextField._unregister_lookup(TrigramStrictWordSimilar) # Disconnect this receiver until the next time this app is installed # and 
ready() connects it again to prevent unnecessary processing on # each setting change. @@ -73,5 +81,7 @@ class PostgresConfig(AppConfig): TextField.register_lookup(TrigramSimilar) CharField.register_lookup(TrigramWordSimilar) TextField.register_lookup(TrigramWordSimilar) + CharField.register_lookup(TrigramStrictWordSimilar) + TextField.register_lookup(TrigramStrictWordSimilar) MigrationWriter.register_serializer(RANGE_TYPES, RangeSerializer) IndexExpression.register_wrappers(OrderBy, OpClass, Collate) diff --git a/django/contrib/postgres/lookups.py b/django/contrib/postgres/lookups.py index 9fed0eea30..f2f88ebc0a 100644 --- a/django/contrib/postgres/lookups.py +++ b/django/contrib/postgres/lookups.py @@ -63,3 +63,8 @@ class TrigramSimilar(PostgresOperatorLookup): class TrigramWordSimilar(PostgresOperatorLookup): lookup_name = "trigram_word_similar" postgres_operator = "%%>" + + +class TrigramStrictWordSimilar(PostgresOperatorLookup): + lookup_name = "trigram_strict_word_similar" + postgres_operator = "%%>>" diff --git a/django/contrib/postgres/search.py b/django/contrib/postgres/search.py index d43163a40b..2b57156263 100644 --- a/django/contrib/postgres/search.py +++ b/django/contrib/postgres/search.py @@ -366,5 +366,14 @@ class TrigramWordDistance(TrigramWordBase): arg_joiner = " <<-> " +class TrigramStrictWordDistance(TrigramWordBase): + function = "" + arg_joiner = " <<<-> " + + class TrigramWordSimilarity(TrigramWordBase): function = "WORD_SIMILARITY" + + +class TrigramStrictWordSimilarity(TrigramWordBase): + function = "STRICT_WORD_SIMILARITY" diff --git a/docs/ref/contrib/postgres/lookups.txt b/docs/ref/contrib/postgres/lookups.txt index 83f21e8dc1..3070227530 100644 --- a/docs/ref/contrib/postgres/lookups.txt +++ b/docs/ref/contrib/postgres/lookups.txt @@ -7,6 +7,9 @@ Trigram similarity .. 
fieldlookup:: trigram_similar +``trigram_similar`` +------------------- + The ``trigram_similar`` lookup allows you to perform trigram lookups, measuring the number of trigrams (three consecutive characters) shared, using a dedicated PostgreSQL extension. A trigram lookup is given an expression and @@ -27,6 +30,9 @@ The ``trigram_similar`` lookup can be used on .. fieldlookup:: trigram_word_similar +``trigram_word_similar`` +------------------------ + The ``trigram_word_similar`` lookup allows you to perform trigram word similarity lookups using a dedicated PostgreSQL extension. It can be approximately understood as measuring the greatest number of trigrams shared @@ -46,6 +52,25 @@ The ``trigram_word_similar`` lookup can be used on >>> Sentence.objects.filter(name__trigram_word_similar='Middlesborough') [''] +.. fieldlookup:: trigram_strict_word_similar + +``trigram_strict_word_similar`` +------------------------------- + +.. versionadded:: 4.2 + +Similar to :lookup:`trigram_word_similar`, except that it forces extent +boundaries to match word boundaries. + +To use it, add ``'django.contrib.postgres'`` in your :setting:`INSTALLED_APPS` +and activate the `pg_trgm extension`_ on PostgreSQL. You can install the +extension using the +:class:`~django.contrib.postgres.operations.TrigramExtension` migration +operation. + +The ``trigram_strict_word_similar`` lookup can be used on +:class:`~django.db.models.CharField` and :class:`~django.db.models.TextField`. + .. _`pg_trgm extension`: https://www.postgresql.org/docs/current/pgtrgm.html ``Unaccent`` diff --git a/docs/ref/contrib/postgres/search.txt b/docs/ref/contrib/postgres/search.txt index e36f4028fe..2e2877a750 100644 --- a/docs/ref/contrib/postgres/search.txt +++ b/docs/ref/contrib/postgres/search.txt @@ -286,9 +286,9 @@ Trigram similarity ================== Another approach to searching is trigram similarity. A trigram is a group of -three consecutive characters. 
In addition to the :lookup:`trigram_similar` and -:lookup:`trigram_word_similar` lookups, you can use a couple of other -expressions. +three consecutive characters. In addition to the :lookup:`trigram_similar`, +:lookup:`trigram_word_similar`, and :lookup:`trigram_strict_word_similar` +lookups, you can use a couple of other expressions. To use them, you need to activate the `pg_trgm extension <https://www.postgresql.org/docs/current/pgtrgm.html>`_ on PostgreSQL. You can @@ -334,6 +334,18 @@ Usage example:: ... ).filter(similarity__gt=0.3).order_by('-similarity') [] +``TrigramStrictWordSimilarity`` +------------------------------- + +.. class:: TrigramStrictWordSimilarity(string, expression, **extra) + +.. versionadded:: 4.2 + +Accepts a string or expression, and a field name or expression. Returns the +trigram strict word similarity between the two arguments. Similar to +:class:`TrigramWordSimilarity() <TrigramWordSimilarity>`, except that it forces +extent boundaries to match word boundaries. + ``TrigramDistance`` ------------------- @@ -371,3 +383,13 @@ Usage example:: ... distance=TrigramWordDistance(test, 'name'), ... ).filter(distance__lte=0.7).order_by('distance') [] + +``TrigramStrictWordDistance`` +----------------------------- + +.. class:: TrigramStrictWordDistance(string, expression, **extra) + +.. versionadded:: 4.2 + +Accepts a string or expression, and a field name or expression. Returns the +trigram strict word distance between the two arguments. diff --git a/docs/releases/4.2.txt b/docs/releases/4.2.txt index d43d391408..124470cf5b 100644 --- a/docs/releases/4.2.txt +++ b/docs/releases/4.2.txt @@ -65,7 +65,12 @@ Minor features :mod:`django.contrib.postgres` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* ... +* The new :lookup:`trigram_strict_word_similar` lookup, and the + :class:`TrigramStrictWordSimilarity() + <django.contrib.postgres.search.TrigramStrictWordSimilarity>` and + :class:`TrigramStrictWordDistance() + <django.contrib.postgres.search.TrigramStrictWordDistance>` expressions allow + using trigram strict word similarity. 
:mod:`django.contrib.redirects` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/tests/postgres_tests/test_trigram.py b/tests/postgres_tests/test_trigram.py index 6870e80737..2d7549fb4b 100644 --- a/tests/postgres_tests/test_trigram.py +++ b/tests/postgres_tests/test_trigram.py @@ -7,6 +7,8 @@ try: from django.contrib.postgres.search import ( TrigramDistance, TrigramSimilarity, + TrigramStrictWordDistance, + TrigramStrictWordSimilarity, TrigramWordDistance, TrigramWordSimilarity, ) @@ -43,6 +45,25 @@ class TrigramTest(PostgreSQLTestCase): self.Model.objects.filter(field__trigram_word_similar="Middlesborough"), [obj], ) + self.assertSequenceEqual( + self.Model.objects.filter(field__trigram_word_similar="Middle"), + [obj], + ) + + def test_trigram_strict_word_search_matched(self): + obj = self.Model.objects.create( + field="Gumby rides on the path of Middlesbrough", + ) + self.assertSequenceEqual( + self.Model.objects.filter( + field__trigram_strict_word_similar="Middlesborough" + ), + [obj], + ) + self.assertSequenceEqual( + self.Model.objects.filter(field__trigram_strict_word_similar="Middle"), + [], + ) def test_trigram_similarity(self): search = "Bat sat on cat." @@ -75,6 +96,19 @@ class TrigramTest(PostgreSQLTestCase): ], ) + def test_trigram_strict_word_similarity(self): + search = "matt" + self.assertSequenceEqual( + self.Model.objects.filter(field__trigram_word_similar=search) + .annotate(word_similarity=TrigramStrictWordSimilarity(search, "field")) + .values("field", "word_similarity") + .order_by("-word_similarity"), + [ + {"field": "Cat sat on mat.", "word_similarity": 0.5}, + {"field": "Matthew", "word_similarity": 0.44444445}, + ], + ) + def test_trigram_similarity_alternate(self): # Round result of distance because PostgreSQL uses greater precision. 
self.assertQuerysetEqual( @@ -104,6 +138,20 @@ class TrigramTest(PostgreSQLTestCase): ], ) + def test_trigram_strict_word_distance(self): + self.assertSequenceEqual( + self.Model.objects.annotate( + word_distance=TrigramStrictWordDistance("matt", "field"), + ) + .filter(word_distance__lte=0.7) + .values("field", "word_distance") + .order_by("word_distance"), + [ + {"field": "Cat sat on mat.", "word_distance": 0.5}, + {"field": "Matthew", "word_distance": 0.5555556}, + ], + ) + class TrigramTextFieldTest(TrigramTest): """