
Fixed #28041 -- Added Lexeme expression to contrib.postgres.search.

This expression automatically escapes its input and allows
fine-grained control over search terms via logical combinations,
prefix matching, and term weighting.

Thanks Mariusz Felisiak, Adam Zapletal, Paolo Melchiorre,
Jacob Walls, Adam Johnson, and Simon Charette for reviews.

Co-authored-by: joetsoi <joetsoi@users.noreply.github.com>
Co-authored-by: Karl Hobley <karl@kaed.uk>
Co-authored-by: Alexandr Tatarinov <tatarinov1997@gmail.com>
Authored by GappleBee on 2017-04-06 16:42:49 +01:00; committed by Jacob Walls
parent e08fa42fa6
commit 218f69f05e
4 changed files with 423 additions and 1 deletions
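For orientation, a minimal usage sketch based on the documentation and tests added
in this commit (``Entry`` and its fields are the illustrative model used in the docs,
not part of the patch):

    >>> from django.contrib.postgres.search import Lexeme, SearchQuery, SearchVector
    >>> vector = SearchVector("body_text", "blog__tagline")
    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("fruit") & Lexeme("dessert") & ~Lexeme("banana"))
    ... )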

django/contrib/postgres/search.py

@@ -1,3 +1,4 @@
from django.db.backends.postgresql.psycopg_any import is_psycopg3
from django.db.models import (
    CharField,
    Expression,
@@ -10,9 +11,45 @@ from django.db.models import (
)
from django.db.models.expressions import CombinedExpression, register_combinable_fields
from django.db.models.functions import Cast, Coalesce
from django.utils.regex_helper import _lazy_re_compile

from .utils import CheckPostgresInstalledMixin

if is_psycopg3:
    from psycopg.adapt import Dumper

    class UTF8Dumper(Dumper):
        def dump(self, obj):
            return bytes(obj, "utf-8")

    def quote_lexeme(value):
        return UTF8Dumper(str).quote(psql_escape(value)).decode()

else:
    from psycopg2.extensions import adapt

    def quote_lexeme(value):
        adapter = adapt(psql_escape(value))
        adapter.encoding = "utf-8"
        return adapter.getquoted().decode()


spec_chars_re = _lazy_re_compile(r"['\0\[\]()|&:*!@<>\\]")
multiple_spaces_re = _lazy_re_compile(r"\s{2,}")


def normalize_spaces(val):
    """Convert multiple spaces to single and strip from both sides."""
    if not (val := val.strip()):
        return None
    return multiple_spaces_re.sub(" ", val)


def psql_escape(query):
    """Replace chars not fit for use in search queries with a single space."""
    query = spec_chars_re.sub(" ", query)
    return normalize_spaces(query)

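A quick sketch of how the two helpers above behave; the first case is derived from
this commit's ``quote_lexeme`` tests, the others follow directly from the regexes:

    >>> psql_escape("double quo''te")
    'double quo te'
    >>> psql_escape("   ")  # nothing left after stripping, so None is returned
    >>> normalize_spaces("red   tomato ")
    'red tomato'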

class SearchVectorExact(Lookup):
    lookup_name = "exact"
@@ -205,6 +242,9 @@ class SearchQuery(SearchQueryCombinable, Func):
        invert=False,
        search_type="plain",
    ):
        if isinstance(value, LexemeCombinable):
            search_type = "raw"
        self.function = self.SEARCH_TYPES.get(search_type)
        if self.function is None:
            raise ValueError("Unknown search_type argument '%s'." % search_type)
@@ -383,3 +423,104 @@ class TrigramWordSimilarity(TrigramWordBase):
class TrigramStrictWordSimilarity(TrigramWordBase):
    function = "STRICT_WORD_SIMILARITY"

class LexemeCombinable:
    BITAND = "&"
    BITOR = "|"

    def _combine(self, other, connector, reversed):
        if not isinstance(other, LexemeCombinable):
            raise TypeError(
                "A Lexeme can only be combined with another Lexeme, "
                f"got {other.__class__.__name__}."
            )
        if reversed:
            return CombinedLexeme(other, connector, self)
        return CombinedLexeme(self, connector, other)

    # On Combinable, these are not implemented to reduce confusion with Q. In
    # this case we are actually (ab)using them to do logical combination so
    # it's consistent with other usage in Django.
    def __or__(self, other):
        return self._combine(other, self.BITOR, False)

    def __ror__(self, other):
        return self._combine(other, self.BITOR, True)

    def __and__(self, other):
        return self._combine(other, self.BITAND, False)

    def __rand__(self, other):
        return self._combine(other, self.BITAND, True)


class Lexeme(LexemeCombinable, Value):
    _output_field = SearchQueryField()

    def __init__(
        self, value, output_field=None, *, invert=False, prefix=False, weight=None
    ):
        if value == "":
            raise ValueError("Lexeme value cannot be empty.")
        if not isinstance(value, str):
            raise TypeError(
                f"Lexeme value must be a string, got {value.__class__.__name__}."
            )
        if weight is not None and (
            not isinstance(weight, str) or weight.lower() not in {"a", "b", "c", "d"}
        ):
            raise ValueError(
                f"Weight must be one of 'A', 'B', 'C', and 'D', got {weight!r}."
            )
        self.prefix = prefix
        self.invert = invert
        self.weight = weight
        super().__init__(value, output_field=output_field)

    def as_sql(self, compiler, connection):
        param = quote_lexeme(self.value)
        label = ""
        if self.prefix:
            label += "*"
        if self.weight:
            label += self.weight
        if label:
            param = f"{param}:{label}"
        if self.invert:
            param = f"!{param}"
        return "%s", (param,)

    def __invert__(self):
        cloned = self.copy()
        cloned.invert = not self.invert
        return cloned


class CombinedLexeme(LexemeCombinable, CombinedExpression):
    _output_field = SearchQueryField()

    def as_sql(self, compiler, connection):
        value_params = []
        lsql, params = compiler.compile(self.lhs)
        value_params.extend(params)

        rsql, params = compiler.compile(self.rhs)
        value_params.extend(params)

        combined_sql = f"({lsql} {self.connector} {rsql})"
        combined_value = combined_sql % tuple(value_params)
        return "%s", (combined_value,)

    def __invert__(self):
        # Apply De Morgan's theorem.
        cloned = self.copy()
        cloned.connector = self.BITAND if self.connector == self.BITOR else self.BITOR
        cloned.lhs = ~self.lhs
        cloned.rhs = ~self.rhs
        return cloned
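For reference, a sketch of how inversion distributes over a combined lexeme via
De Morgan's laws. The expected parameters mirror the ``test_as_sql`` cases added
in the tests below; ``compiler`` and ``connection`` are assumed to come from a
queryset compiler, as in those tests:

    expression = ~(Lexeme("a") | Lexeme("b") & ~Lexeme("c"))
    _, params = expression.as_sql(compiler, connection)
    assert params == ("(!'a' & (!'b' | 'c'))",)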

docs/ref/contrib/postgres/search.txt

@@ -96,7 +96,7 @@ Examples:
.. code-block:: pycon

    >>> from django.contrib.postgres.search import SearchQuery, Lexeme
    >>> SearchQuery("red tomato")  # two keywords
    >>> SearchQuery("tomato red")  # same results as above
    >>> SearchQuery("red tomato", search_type="phrase")  # a phrase
@@ -105,6 +105,7 @@ Examples:
    >>> SearchQuery(
    ...     "'tomato' ('red' OR 'green')", search_type="websearch"
    ... )  # websearch operators
    >>> SearchQuery(Lexeme("tomato") & (Lexeme("red") | Lexeme("green")))  # Lexeme objects

``SearchQuery`` terms can be combined logically to provide more flexibility:
@@ -118,6 +119,10 @@ Examples:
See :ref:`postgresql-fts-search-configuration` for an explanation of the
``config`` parameter.

.. versionchanged:: 6.0

    :class:`Lexeme` objects were added.

``SearchRank``
==============
@@ -276,6 +281,53 @@ floats to :class:`SearchRank` as ``weights`` in the same order above:
    >>> rank = SearchRank(vector, query, weights=[0.2, 0.4, 0.6, 0.8])
    >>> Entry.objects.annotate(rank=rank).filter(rank__gte=0.3).order_by("-rank")

``Lexeme``
==========

.. versionadded:: 6.0

.. class:: Lexeme(value, output_field=None, *, invert=False, prefix=False, weight=None)

``Lexeme`` objects allow search operators to be used safely with strings from
an untrusted source. The content of each lexeme is escaped so that any
operators that may exist in the string itself will not be interpreted.

You can combine lexemes with other lexemes using the ``&`` and ``|`` operators
and negate them with the ``~`` operator. For example:

.. code-block:: pycon

    >>> from django.contrib.postgres.search import SearchQuery, SearchVector, Lexeme
    >>> vector = SearchVector("body_text", "blog__tagline")
    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("fruit") & Lexeme("dessert"))
    ... )
    <QuerySet [<Entry: Apple Crumble Recipes>, <Entry: Banana Split Recipes>]>

.. code-block:: pycon

    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("fruit") & Lexeme("dessert") & ~Lexeme("banana"))
    ... )
    <QuerySet [<Entry: Apple Crumble Recipes>]>

``Lexeme`` objects also support term weighting and prefix matching:

.. code-block:: pycon

    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("Pizza") | Lexeme("Cheese"))
    ... )
    <QuerySet [<Entry: Cheese on Toast recipes>, <Entry: Pizza recipes>]>
    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("Pizza") | Lexeme("Cheese", weight="A"))
    ... )
    <QuerySet [<Entry: Pizza recipes>]>
    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("za", prefix=True))
    ... )
    <QuerySet []>
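As an illustration of the escaping described above (the behavior follows from the
internal ``quote_lexeme`` helper this patch adds and its tests), operator
characters in untrusted input are replaced with spaces rather than interpreted,
so the whole string becomes a single escaped lexeme:

    >>> from django.contrib.postgres.search import Lexeme, SearchQuery, quote_lexeme
    >>> quote_lexeme("kneecaps | nostrils")
    "'kneecaps nostrils'"
    >>> query = SearchQuery(Lexeme("kneecaps") | Lexeme("nostrils"))  # an actual OR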

Performance
===========

docs/releases/6.0.txt

@@ -171,6 +171,12 @@ Minor features
:mod:`django.contrib.postgres`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* The new :class:`Lexeme <django.contrib.postgres.search.Lexeme>` expression
  for full text search provides fine-grained control over search terms.
  ``Lexeme`` objects automatically escape their input and support logical
  combination operators (``&``, ``|``, ``~``), prefix matching, and term
  weighting.

* Model fields, indexes, and constraints from :mod:`django.contrib.postgres`
  now include system checks to verify that ``django.contrib.postgres`` is an
  installed app.

tests/postgres_tests/test_search.py

@@ -6,6 +6,7 @@ All text copyright Python (Monty) Pictures. Thanks to sacred-texts.com for the
transcript.
"""

from django.db import connection
from django.db.models import F, Value

from . import PostgreSQLSimpleTestCase, PostgreSQLTestCase
@@ -13,11 +14,13 @@ from .models import Character, Line, LineSavedSearch, Scene
try:
    from django.contrib.postgres.search import (
        Lexeme,
        SearchConfig,
        SearchHeadline,
        SearchQuery,
        SearchRank,
        SearchVector,
        quote_lexeme,
    )
except ImportError:
    pass
@@ -769,3 +772,223 @@ class SearchHeadlineTests(GrailTestData, PostgreSQLTestCase):
"<b>Brave</b>, <b>brave</b>, <b>brave</b>...<br>" "<b>Brave</b>, <b>brave</b>, <b>brave</b>...<br>"
"<b>brave</b> <b>Sir</b> <b>Robin</b>", "<b>brave</b> <b>Sir</b> <b>Robin</b>",
) )


class TestLexemes(GrailTestData, PostgreSQLTestCase):
    def test_and(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("bedemir") & Lexeme("scales")))
        self.assertSequenceEqual(searched, [self.bedemir0])

    def test_multiple_and(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(
            search=SearchQuery(
                Lexeme("bedemir") & Lexeme("scales") & Lexeme("nostrils")
            )
        )
        self.assertSequenceEqual(searched, [])

        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("shall") & Lexeme("use") & Lexeme("larger")))
        self.assertSequenceEqual(searched, [self.bedemir0])

    def test_or(self):
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(Lexeme("kneecaps") | Lexeme("nostrils"))
        )
        self.assertCountEqual(searched, [self.verse1, self.verse2])

    def test_multiple_or(self):
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(
                Lexeme("kneecaps") | Lexeme("nostrils") | Lexeme("Sir Robin")
            )
        )
        self.assertCountEqual(searched, [self.verse1, self.verse2, self.verse0])

    def test_advanced(self):
        """
        Combination of & and |.

        This is mainly helpful for checking test_advanced_invert below.
        """
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(
                Lexeme("shall") & Lexeme("use") & Lexeme("larger") | Lexeme("nostrils")
            )
        )
        self.assertCountEqual(searched, [self.bedemir0, self.verse2])

    def test_invert(self):
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            character=self.minstrel, search=SearchQuery(~Lexeme("kneecaps"))
        )
        self.assertCountEqual(searched, [self.verse0, self.verse2])

    def test_advanced_invert(self):
        """
        Inverting a query that uses a combination of & and | should return
        the opposite of test_advanced.
        """
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(
                ~(
                    Lexeme("shall") & Lexeme("use") & Lexeme("larger")
                    | Lexeme("nostrils")
                )
            )
        )
        expected_result = Line.objects.exclude(
            id__in=[self.bedemir0.id, self.verse2.id]
        )
        self.assertCountEqual(searched, expected_result)

    def test_as_sql(self):
        query = Line.objects.all().query
        compiler = query.get_compiler(connection.alias)
        tests = (
            (Lexeme("a"), ("'a'",)),
            (Lexeme("a", invert=True), ("!'a'",)),
            (~Lexeme("a"), ("!'a'",)),
            (Lexeme("a", prefix=True), ("'a':*",)),
            (Lexeme("a", weight="D"), ("'a':D",)),
            (Lexeme("a", invert=True, prefix=True, weight="D"), ("!'a':*D",)),
            (Lexeme("a") | Lexeme("b") & ~Lexeme("c"), ("('a' | ('b' & !'c'))",)),
            (
                ~(Lexeme("a") | Lexeme("b") & ~Lexeme("c")),
                ("(!'a' & (!'b' | 'c'))",),
            ),
        )
        for expression, expected_params in tests:
            with self.subTest(expression=expression, expected_params=expected_params):
                _, params = expression.as_sql(compiler, connection)
                self.assertEqual(params, expected_params)

    def test_quote_lexeme(self):
        tests = (
            ("L'amour piqué par une abeille", "'L amour piqué par une abeille'"),
            ("'starting quote", "'starting quote'"),
            ("ending quote'", "'ending quote'"),
            ("double quo''te", "'double quo te'"),
            ("triple quo'''te", "'triple quo te'"),
            ("backslash\\", "'backslash'"),
            ("exclamation!", "'exclamation'"),
            ("ampers&nd", "'ampers nd'"),
        )
        for lexeme, quoted in tests:
            with self.subTest(lexeme=lexeme):
                self.assertEqual(quote_lexeme(lexeme), quoted)

    def test_prefix_searching(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("hear", prefix=True)))
        self.assertSequenceEqual(searched, [self.verse2])

    def test_inverse_prefix_searching(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("Robi", prefix=True, invert=True)))
        self.assertEqual(
            set(searched),
            {
                self.verse2,
                self.bedemir0,
                self.bedemir1,
                self.french,
                self.crowd,
                self.witch,
                self.duck,
            },
        )

    def test_lexemes_multiple_and(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(
            search=SearchQuery(
                Lexeme("Robi", prefix=True) & Lexeme("Camel", prefix=True)
            )
        )
        self.assertSequenceEqual(searched, [self.verse0])

    def test_lexemes_multiple_or(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(
            search=SearchQuery(
                Lexeme("kneecap", prefix=True) | Lexeme("afrai", prefix=True)
            )
        )
        self.assertSequenceEqual(searched, [self.verse0, self.verse1])

    def test_config_query_explicit(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue", config="french"),
        ).filter(search=SearchQuery(Lexeme("cadeaux"), config="french"))
        self.assertSequenceEqual(searched, [self.french])

    def test_config_query_implicit(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue", config="french"),
        ).filter(search=Lexeme("cadeaux"))
        self.assertSequenceEqual(searched, [self.french])

    def test_config_from_field_explicit(self):
        searched = Line.objects.annotate(
            search=SearchVector(
                "scene__setting", "dialogue", config=F("dialogue_config")
            ),
        ).filter(search=SearchQuery(Lexeme("cadeaux"), config=F("dialogue_config")))
        self.assertSequenceEqual(searched, [self.french])

    def test_config_from_field_implicit(self):
        searched = Line.objects.annotate(
            search=SearchVector(
                "scene__setting", "dialogue", config=F("dialogue_config")
            ),
        ).filter(search=Lexeme("cadeaux"))
        self.assertSequenceEqual(searched, [self.french])

    def test_invalid_combinations(self):
        msg = "A Lexeme can only be combined with another Lexeme, got NoneType."
        with self.assertRaisesMessage(TypeError, msg):
            Line.objects.filter(dialogue__search=None | Lexeme("kneecaps"))
        with self.assertRaisesMessage(TypeError, msg):
            Line.objects.filter(dialogue__search=None & Lexeme("kneecaps"))

    def test_invalid_weights(self):
        invalid_weights = ["E", "Drandom", "AB", "C ", 0, "", " ", [1, 2, 3]]
        for weight in invalid_weights:
            with self.subTest(weight=weight):
                with self.assertRaisesMessage(
                    ValueError,
                    f"Weight must be one of 'A', 'B', 'C', and 'D', got {weight!r}.",
                ):
                    Line.objects.filter(
                        dialogue__search=Lexeme("kneecaps", weight=weight)
                    )

    def test_empty(self):
        with self.assertRaisesMessage(ValueError, "Lexeme value cannot be empty."):
            Line.objects.annotate(
                search=SearchVector("scene__setting", "dialogue")
            ).filter(search=SearchQuery(Lexeme("")))

    def test_non_string_values(self):
        msg = "Lexeme value must be a string, got NoneType."
        with self.assertRaisesMessage(TypeError, msg):
            Line.objects.annotate(
                search=SearchVector("scene__setting", "dialogue")
            ).filter(search=SearchQuery(Lexeme(None)))