diff --git a/django/contrib/postgres/search.py b/django/contrib/postgres/search.py index 4ab27605cb..52e925d27a 100644 --- a/django/contrib/postgres/search.py +++ b/django/contrib/postgres/search.py @@ -1,3 +1,4 @@ +from django.db.backends.postgresql.psycopg_any import is_psycopg3 from django.db.models import ( CharField, Expression, @@ -10,9 +11,45 @@ from django.db.models import ( ) from django.db.models.expressions import CombinedExpression, register_combinable_fields from django.db.models.functions import Cast, Coalesce +from django.utils.regex_helper import _lazy_re_compile from .utils import CheckPostgresInstalledMixin +if is_psycopg3: + from psycopg.adapt import Dumper + + class UTF8Dumper(Dumper): + def dump(self, obj): + return bytes(obj, "utf-8") + + def quote_lexeme(value): + return UTF8Dumper(str).quote(psql_escape(value)).decode() + +else: + from psycopg2.extensions import adapt + + def quote_lexeme(value): + adapter = adapt(psql_escape(value)) + adapter.encoding = "utf-8" + return adapter.getquoted().decode() + + +spec_chars_re = _lazy_re_compile(r"['\0\[\]()|&:*!@<>\\]") +multiple_spaces_re = _lazy_re_compile(r"\s{2,}") + + +def normalize_spaces(val): + """Convert multiple spaces to single and strip from both sides.""" + if not (val := val.strip()): + return None + return multiple_spaces_re.sub(" ", val) + + +def psql_escape(query): + """Replace chars not fit for use in search queries with a single space.""" + query = spec_chars_re.sub(" ", query) + return normalize_spaces(query) + class SearchVectorExact(Lookup): lookup_name = "exact" @@ -205,6 +242,9 @@ class SearchQuery(SearchQueryCombinable, Func): invert=False, search_type="plain", ): + if isinstance(value, LexemeCombinable): + search_type = "raw" + self.function = self.SEARCH_TYPES.get(search_type) if self.function is None: raise ValueError("Unknown search_type argument '%s'." % search_type) @@ -383,3 +423,104 @@ class TrigramWordSimilarity(TrigramWordBase): class TrigramStrictWordSimilarity(TrigramWordBase): function = "STRICT_WORD_SIMILARITY" + + +class LexemeCombinable: + BITAND = "&" + BITOR = "|" + + def _combine(self, other, connector, reversed): + if not isinstance(other, LexemeCombinable): + raise TypeError( + "A Lexeme can only be combined with another Lexeme, " + f"got {other.__class__.__name__}." + ) + if reversed: + return CombinedLexeme(other, connector, self) + return CombinedLexeme(self, connector, other) + + # On Combinable, these are not implemented to reduce confusion with Q. In + # this case we are actually (ab)using them to do logical combination so + # it's consistent with other usage in Django. + def __or__(self, other): + return self._combine(other, self.BITOR, False) + + def __ror__(self, other): + return self._combine(other, self.BITOR, True) + + def __and__(self, other): + return self._combine(other, self.BITAND, False) + + def __rand__(self, other): + return self._combine(other, self.BITAND, True) + + +class Lexeme(LexemeCombinable, Value): + _output_field = SearchQueryField() + + def __init__( + self, value, output_field=None, *, invert=False, prefix=False, weight=None + ): + if value == "": + raise ValueError("Lexeme value cannot be empty.") + + if not isinstance(value, str): + raise TypeError( + f"Lexeme value must be a string, got {value.__class__.__name__}." + ) + + if weight is not None and ( + not isinstance(weight, str) or weight.lower() not in {"a", "b", "c", "d"} + ): + raise ValueError( + f"Weight must be one of 'A', 'B', 'C', and 'D', got {weight!r}." 
+            )
+
+        self.prefix = prefix
+        self.invert = invert
+        self.weight = weight
+        super().__init__(value, output_field=output_field)
+
+    def as_sql(self, compiler, connection):
+        param = quote_lexeme(self.value)
+        label = ""
+        if self.prefix:
+            label += "*"
+        if self.weight:
+            label += self.weight
+
+        if label:
+            param = f"{param}:{label}"
+        if self.invert:
+            param = f"!{param}"
+
+        return "%s", (param,)
+
+    def __invert__(self):
+        cloned = self.copy()
+        cloned.invert = not self.invert
+        return cloned
+
+
+class CombinedLexeme(LexemeCombinable, CombinedExpression):
+    _output_field = SearchQueryField()
+
+    def as_sql(self, compiler, connection):
+        value_params = []
+        lsql, params = compiler.compile(self.lhs)
+        value_params.extend(params)
+
+        rsql, params = compiler.compile(self.rhs)
+        value_params.extend(params)
+
+        combined_sql = f"({lsql} {self.connector} {rsql})"
+        combined_value = combined_sql % tuple(value_params)
+        return "%s", (combined_value,)
+
+    def __invert__(self):
+        # Apply De Morgan's theorem.
+        cloned = self.copy()
+        cloned.connector = self.BITAND if self.connector == self.BITOR else self.BITOR
+        cloned.lhs = ~self.lhs
+        cloned.rhs = ~self.rhs
+        return cloned
diff --git a/docs/ref/contrib/postgres/search.txt b/docs/ref/contrib/postgres/search.txt
index 4647fcbfa2..88e3cfaeb0 100644
--- a/docs/ref/contrib/postgres/search.txt
+++ b/docs/ref/contrib/postgres/search.txt
@@ -96,7 +96,7 @@ Examples:
 
 .. code-block:: pycon
 
-    >>> from django.contrib.postgres.search import SearchQuery
+    >>> from django.contrib.postgres.search import SearchQuery, Lexeme
     >>> SearchQuery("red tomato")  # two keywords
     >>> SearchQuery("tomato red")  # same results as above
     >>> SearchQuery("red tomato", search_type="phrase")  # a phrase
@@ -105,6 +105,7 @@ Examples:
     >>> SearchQuery(
     ...     "'tomato' ('red' OR 'green')", search_type="websearch"
     ... )  # websearch operators
+    >>> SearchQuery(Lexeme("tomato") & (Lexeme("red") | Lexeme("green")))  # Lexeme objects
 
 ``SearchQuery`` terms can be combined logically to provide more flexibility:
 
@@ -118,6 +119,10 @@ Examples:
 See :ref:`postgresql-fts-search-configuration` for an explanation of the
 ``config`` parameter.
 
+.. versionchanged:: 6.0
+
+    Support for :class:`Lexeme` objects was added.
+
 ``SearchRank``
 ==============
 
@@ -276,6 +281,53 @@ floats to :class:`SearchRank` as ``weights`` in the same order above:
 
     >>> rank = SearchRank(vector, query, weights=[0.2, 0.4, 0.6, 0.8])
     >>> Entry.objects.annotate(rank=rank).filter(rank__gte=0.3).order_by("-rank")
 
+``Lexeme``
+==========
+
+.. versionadded:: 6.0
+
+.. class:: Lexeme(value, output_field=None, *, invert=False, prefix=False, weight=None)
+
+``Lexeme`` objects allow search operators to be safely used with strings from
+an untrusted source. The content of each lexeme is escaped so that any
+operators that may exist in the string itself will not be interpreted.
+
+You can combine lexemes with other lexemes using the ``&`` and ``|`` operators
+and also negate them with the ``~`` operator. For example:
+
+.. code-block:: pycon
+
+    >>> from django.contrib.postgres.search import SearchQuery, SearchVector, Lexeme
+    >>> vector = SearchVector("body_text", "blog__tagline")
+    >>> Entry.objects.annotate(search=vector).filter(
+    ...     search=SearchQuery(Lexeme("fruit") & Lexeme("dessert"))
+    ... )
+
+.. code-block:: pycon
+
+    >>> Entry.objects.annotate(search=vector).filter(
+    ...     search=SearchQuery(Lexeme("fruit") & Lexeme("dessert") & ~Lexeme("banana"))
+    ... )
+
+``Lexeme`` objects also support term weighting and prefix matching:
+
+.. code-block:: pycon
+
+    >>> Entry.objects.annotate(search=vector).filter(
+    ...     search=SearchQuery(Lexeme("Pizza") | Lexeme("Cheese"))
+    ... )
+    >>> Entry.objects.annotate(search=vector).filter(
+    ...     search=SearchQuery(Lexeme("Pizza") | Lexeme("Cheese", weight="A"))
+    ... )
+    >>> Entry.objects.annotate(search=vector).filter(
+    ...     search=SearchQuery(Lexeme("za", prefix=True))
+    ... )
+
 Performance
 ===========
 
diff --git a/docs/releases/6.0.txt b/docs/releases/6.0.txt
index adfac83b8d..fba0935a2b 100644
--- a/docs/releases/6.0.txt
+++ b/docs/releases/6.0.txt
@@ -171,6 +171,12 @@ Minor features
 :mod:`django.contrib.postgres`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+* The new :class:`Lexeme <django.contrib.postgres.search.Lexeme>` expression
+  for full-text search provides fine-grained control over search terms.
+  ``Lexeme`` objects automatically escape their input and support logical
+  combination operators (``&``, ``|``, ``~``), prefix matching, and term
+  weighting.
+
 * Model fields, indexes, and constraints from :mod:`django.contrib.postgres`
   now include system checks to verify that ``django.contrib.postgres`` is an
   installed app.
diff --git a/tests/postgres_tests/test_search.py b/tests/postgres_tests/test_search.py
index a7118e7c79..c206c69747 100644
--- a/tests/postgres_tests/test_search.py
+++ b/tests/postgres_tests/test_search.py
@@ -6,6 +6,7 @@ All text copyright Python (Monty) Pictures. Thanks to sacred-texts.com for the
 transcript.
 """
 
+from django.db import connection
 from django.db.models import F, Value
 
 from . import PostgreSQLSimpleTestCase, PostgreSQLTestCase
@@ -13,11 +14,13 @@ from .models import Character, Line, LineSavedSearch, Scene
 
 try:
     from django.contrib.postgres.search import (
+        Lexeme,
         SearchConfig,
         SearchHeadline,
         SearchQuery,
         SearchRank,
         SearchVector,
+        quote_lexeme,
     )
 except ImportError:
     pass
@@ -769,3 +772,223 @@ class SearchHeadlineTests(GrailTestData, PostgreSQLTestCase):
             "Brave, brave, brave...
" "brave Sir Robin", ) + + +class TestLexemes(GrailTestData, PostgreSQLTestCase): + def test_and(self): + searched = Line.objects.annotate( + search=SearchVector("scene__setting", "dialogue"), + ).filter(search=SearchQuery(Lexeme("bedemir") & Lexeme("scales"))) + self.assertSequenceEqual(searched, [self.bedemir0]) + + def test_multiple_and(self): + searched = Line.objects.annotate( + search=SearchVector("scene__setting", "dialogue"), + ).filter( + search=SearchQuery( + Lexeme("bedemir") & Lexeme("scales") & Lexeme("nostrils") + ) + ) + self.assertSequenceEqual(searched, []) + + searched = Line.objects.annotate( + search=SearchVector("scene__setting", "dialogue"), + ).filter(search=SearchQuery(Lexeme("shall") & Lexeme("use") & Lexeme("larger"))) + self.assertSequenceEqual(searched, [self.bedemir0]) + + def test_or(self): + searched = Line.objects.annotate(search=SearchVector("dialogue")).filter( + search=SearchQuery(Lexeme("kneecaps") | Lexeme("nostrils")) + ) + self.assertCountEqual(searched, [self.verse1, self.verse2]) + + def test_multiple_or(self): + searched = Line.objects.annotate(search=SearchVector("dialogue")).filter( + search=SearchQuery( + Lexeme("kneecaps") | Lexeme("nostrils") | Lexeme("Sir Robin") + ) + ) + self.assertCountEqual(searched, [self.verse1, self.verse2, self.verse0]) + + def test_advanced(self): + """ + Combination of & and | + This is mainly helpful for checking the test_advanced_invert below + """ + searched = Line.objects.annotate(search=SearchVector("dialogue")).filter( + search=SearchQuery( + Lexeme("shall") & Lexeme("use") & Lexeme("larger") | Lexeme("nostrils") + ) + ) + self.assertCountEqual(searched, [self.bedemir0, self.verse2]) + + def test_invert(self): + searched = Line.objects.annotate(search=SearchVector("dialogue")).filter( + character=self.minstrel, search=SearchQuery(~Lexeme("kneecaps")) + ) + self.assertCountEqual(searched, [self.verse0, self.verse2]) + + def test_advanced_invert(self): + """ + Inverting a query that uses a combination of & and | + should return the opposite of test_advanced. 
+ """ + searched = Line.objects.annotate(search=SearchVector("dialogue")).filter( + search=SearchQuery( + ~( + Lexeme("shall") & Lexeme("use") & Lexeme("larger") + | Lexeme("nostrils") + ) + ) + ) + expected_result = Line.objects.exclude( + id__in=[self.bedemir0.id, self.verse2.id] + ) + self.assertCountEqual(searched, expected_result) + + def test_as_sql(self): + query = Line.objects.all().query + compiler = query.get_compiler(connection.alias) + + tests = ( + (Lexeme("a"), ("'a'",)), + (Lexeme("a", invert=True), ("!'a'",)), + (~Lexeme("a"), ("!'a'",)), + (Lexeme("a", prefix=True), ("'a':*",)), + (Lexeme("a", weight="D"), ("'a':D",)), + (Lexeme("a", invert=True, prefix=True, weight="D"), ("!'a':*D",)), + (Lexeme("a") | Lexeme("b") & ~Lexeme("c"), ("('a' | ('b' & !'c'))",)), + ( + ~(Lexeme("a") | Lexeme("b") & ~Lexeme("c")), + ("(!'a' & (!'b' | 'c'))",), + ), + ) + + for expression, expected_params in tests: + with self.subTest(expression=expression, expected_params=expected_params): + _, params = expression.as_sql(compiler, connection) + self.assertEqual(params, expected_params) + + def test_quote_lexeme(self): + tests = ( + ("L'amour piqué par une abeille", "'L amour piqué par une abeille'"), + ("'starting quote", "'starting quote'"), + ("ending quote'", "'ending quote'"), + ("double quo''te", "'double quo te'"), + ("triple quo'''te", "'triple quo te'"), + ("backslash\\", "'backslash'"), + ("exclamation!", "'exclamation'"), + ("ampers&nd", "'ampers nd'"), + ) + for lexeme, quoted in tests: + with self.subTest(lexeme=lexeme): + self.assertEqual(quote_lexeme(lexeme), quoted) + + def test_prefix_searching(self): + searched = Line.objects.annotate( + search=SearchVector("scene__setting", "dialogue"), + ).filter(search=SearchQuery(Lexeme("hear", prefix=True))) + + self.assertSequenceEqual(searched, [self.verse2]) + + def test_inverse_prefix_searching(self): + searched = Line.objects.annotate( + search=SearchVector("scene__setting", "dialogue"), + ).filter(search=SearchQuery(Lexeme("Robi", prefix=True, invert=True))) + self.assertEqual( + set(searched), + { + self.verse2, + self.bedemir0, + self.bedemir1, + self.french, + self.crowd, + self.witch, + self.duck, + }, + ) + + def test_lexemes_multiple_and(self): + searched = Line.objects.annotate( + search=SearchVector("scene__setting", "dialogue"), + ).filter( + search=SearchQuery( + Lexeme("Robi", prefix=True) & Lexeme("Camel", prefix=True) + ) + ) + + self.assertSequenceEqual(searched, [self.verse0]) + + def test_lexemes_multiple_or(self): + searched = Line.objects.annotate( + search=SearchVector("scene__setting", "dialogue"), + ).filter( + search=SearchQuery( + Lexeme("kneecap", prefix=True) | Lexeme("afrai", prefix=True) + ) + ) + + self.assertSequenceEqual(searched, [self.verse0, self.verse1]) + + def test_config_query_explicit(self): + searched = Line.objects.annotate( + search=SearchVector("scene__setting", "dialogue", config="french"), + ).filter(search=SearchQuery(Lexeme("cadeaux"), config="french")) + + self.assertSequenceEqual(searched, [self.french]) + + def test_config_query_implicit(self): + searched = Line.objects.annotate( + search=SearchVector("scene__setting", "dialogue", config="french"), + ).filter(search=Lexeme("cadeaux")) + + self.assertSequenceEqual(searched, [self.french]) + + def test_config_from_field_explicit(self): + searched = Line.objects.annotate( + search=SearchVector( + "scene__setting", "dialogue", config=F("dialogue_config") + ), + ).filter(search=SearchQuery(Lexeme("cadeaux"), config=F("dialogue_config"))) + 
self.assertSequenceEqual(searched, [self.french]) + + def test_config_from_field_implicit(self): + searched = Line.objects.annotate( + search=SearchVector( + "scene__setting", "dialogue", config=F("dialogue_config") + ), + ).filter(search=Lexeme("cadeaux")) + self.assertSequenceEqual(searched, [self.french]) + + def test_invalid_combinations(self): + msg = "A Lexeme can only be combined with another Lexeme, got NoneType." + with self.assertRaisesMessage(TypeError, msg): + Line.objects.filter(dialogue__search=None | Lexeme("kneecaps")) + + with self.assertRaisesMessage(TypeError, msg): + Line.objects.filter(dialogue__search=None & Lexeme("kneecaps")) + + def test_invalid_weights(self): + invalid_weights = ["E", "Drandom", "AB", "C ", 0, "", " ", [1, 2, 3]] + for weight in invalid_weights: + with self.subTest(weight=weight): + with self.assertRaisesMessage( + ValueError, + f"Weight must be one of 'A', 'B', 'C', and 'D', got {weight!r}.", + ): + Line.objects.filter( + dialogue__search=Lexeme("kneecaps", weight=weight) + ) + + def test_empty(self): + with self.assertRaisesMessage(ValueError, "Lexeme value cannot be empty."): + Line.objects.annotate( + search=SearchVector("scene__setting", "dialogue") + ).filter(search=SearchQuery(Lexeme(""))) + + def test_non_string_values(self): + msg = "Lexeme value must be a string, got NoneType." + with self.assertRaisesMessage(TypeError, msg): + Line.objects.annotate( + search=SearchVector("scene__setting", "dialogue") + ).filter(search=SearchQuery(Lexeme(None)))
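Reviewer note (not part of the patch): a minimal usage sketch of the ``Lexeme`` API added above. The ``Entry`` model, its ``body_text`` field, and the ``search_entries`` helper are hypothetical placeholders, not names from this patch.

    # Sketch only, assuming a hypothetical Entry model with a body_text field.
    from django.contrib.postgres.search import Lexeme, SearchQuery, SearchVector

    from myapp.models import Entry  # hypothetical model


    def search_entries(term, excluded_term):
        # Lexeme escapes its input (quotes, "&", "|", "!", ":", "*", etc. are
        # replaced with spaces), so operators typed by users are treated as
        # literal text. Passing a Lexeme, or a combination of Lexemes, to
        # SearchQuery switches it to search_type="raw" automatically.
        query = SearchQuery(Lexeme(term, prefix=True) & ~Lexeme(excluded_term))
        return Entry.objects.annotate(search=SearchVector("body_text")).filter(
            search=query
        )

Per the patch's ``as_sql`` implementations and the ``test_as_sql`` expectations, ``Lexeme("za", prefix=True)`` compiles to the tsquery parameter ``'za':*``, and the combined expression above to something like ``('za':* & !'spam')`` for ``term="za"`` and ``excluded_term="spam"``.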