Fixed #28041 -- Added Lexeme expression to contrib.postgres.search.
This expression automatically escapes its input and allows fine-grained
control over prefix matching and term weighting via logical combinations.

Thanks Mariusz Felisiak, Adam Zapletal, Paolo Melchiorre, Jacob Walls,
Adam Johnson, and Simon Charette for reviews.

Co-authored-by: joetsoi <joetsoi@users.noreply.github.com>
Co-authored-by: Karl Hobley <karl@kaed.uk>
Co-authored-by: Alexandr Tatarinov <tatarinov1997@gmail.com>
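For context, a minimal usage sketch of the new API (the Entry model and its
body_text field are illustrative, borrowed from the documentation examples
added below):

    >>> from django.contrib.postgres.search import Lexeme, SearchQuery, SearchVector
    >>> # "dessert!" stands in for untrusted input; the "!" is escaped rather
    >>> # than being interpreted as a tsquery operator.
    >>> query = SearchQuery(Lexeme("dessert!") & Lexeme("frui", prefix=True))
    >>> Entry.objects.annotate(search=SearchVector("body_text")).filter(search=query)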
parent e08fa42fa6
commit 218f69f05e
@@ -1,3 +1,4 @@
from django.db.backends.postgresql.psycopg_any import is_psycopg3
from django.db.models import (
    CharField,
    Expression,
@@ -10,9 +11,45 @@ from django.db.models import (
)
from django.db.models.expressions import CombinedExpression, register_combinable_fields
from django.db.models.functions import Cast, Coalesce
from django.utils.regex_helper import _lazy_re_compile

from .utils import CheckPostgresInstalledMixin

if is_psycopg3:
    from psycopg.adapt import Dumper

    class UTF8Dumper(Dumper):
        def dump(self, obj):
            return bytes(obj, "utf-8")

    def quote_lexeme(value):
        return UTF8Dumper(str).quote(psql_escape(value)).decode()

else:
    from psycopg2.extensions import adapt

    def quote_lexeme(value):
        adapter = adapt(psql_escape(value))
        adapter.encoding = "utf-8"
        return adapter.getquoted().decode()


spec_chars_re = _lazy_re_compile(r"['\0\[\]()|&:*!@<>\\]")
multiple_spaces_re = _lazy_re_compile(r"\s{2,}")


def normalize_spaces(val):
    """Convert multiple spaces to single and strip from both sides."""
    if not (val := val.strip()):
        return None
    return multiple_spaces_re.sub(" ", val)


def psql_escape(query):
    """Replace chars not fit for use in search queries with a single space."""
    query = spec_chars_re.sub(" ", query)
    return normalize_spaces(query)


class SearchVectorExact(Lookup):
    lookup_name = "exact"
@@ -205,6 +242,9 @@ class SearchQuery(SearchQueryCombinable, Func):
        invert=False,
        search_type="plain",
    ):
        if isinstance(value, LexemeCombinable):
            search_type = "raw"

        self.function = self.SEARCH_TYPES.get(search_type)
        if self.function is None:
            raise ValueError("Unknown search_type argument '%s'." % search_type)
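For clarity, the effect of this branch (a sketch; the equivalence follows from
the escaping and raw-query handling added in this commit):

    from django.contrib.postgres.search import Lexeme, SearchQuery

    # Passing a LexemeCombinable forces the "raw" search type, so these two
    # queries are equivalent -- except that Lexeme escapes its input first.
    SearchQuery(Lexeme("tomato"))
    SearchQuery("'tomato'", search_type="raw")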
@@ -383,3 +423,104 @@ class TrigramWordSimilarity(TrigramWordBase):

class TrigramStrictWordSimilarity(TrigramWordBase):
    function = "STRICT_WORD_SIMILARITY"


class LexemeCombinable:
    BITAND = "&"
    BITOR = "|"

    def _combine(self, other, connector, reversed):
        if not isinstance(other, LexemeCombinable):
            raise TypeError(
                "A Lexeme can only be combined with another Lexeme, "
                f"got {other.__class__.__name__}."
            )
        if reversed:
            return CombinedLexeme(other, connector, self)
        return CombinedLexeme(self, connector, other)

    # On Combinable, these are not implemented to reduce confusion with Q. In
    # this case we are actually (ab)using them to do logical combination so
    # it's consistent with other usage in Django.
    def __or__(self, other):
        return self._combine(other, self.BITOR, False)

    def __ror__(self, other):
        return self._combine(other, self.BITOR, True)

    def __and__(self, other):
        return self._combine(other, self.BITAND, False)

    def __rand__(self, other):
        return self._combine(other, self.BITAND, True)


class Lexeme(LexemeCombinable, Value):
    _output_field = SearchQueryField()

    def __init__(
        self, value, output_field=None, *, invert=False, prefix=False, weight=None
    ):
        if value == "":
            raise ValueError("Lexeme value cannot be empty.")

        if not isinstance(value, str):
            raise TypeError(
                f"Lexeme value must be a string, got {value.__class__.__name__}."
            )

        if weight is not None and (
            not isinstance(weight, str) or weight.lower() not in {"a", "b", "c", "d"}
        ):
            raise ValueError(
                f"Weight must be one of 'A', 'B', 'C', and 'D', got {weight!r}."
            )

        self.prefix = prefix
        self.invert = invert
        self.weight = weight
        super().__init__(value, output_field=output_field)

    def as_sql(self, compiler, connection):
        param = quote_lexeme(self.value)
        label = ""
        if self.prefix:
            label += "*"
        if self.weight:
            label += self.weight

        if label:
            param = f"{param}:{label}"
        if self.invert:
            param = f"!{param}"

        return "%s", (param,)

    def __invert__(self):
        cloned = self.copy()
        cloned.invert = not self.invert
        return cloned


class CombinedLexeme(LexemeCombinable, CombinedExpression):
    _output_field = SearchQueryField()

    def as_sql(self, compiler, connection):
        value_params = []
        lsql, params = compiler.compile(self.lhs)
        value_params.extend(params)

        rsql, params = compiler.compile(self.rhs)
        value_params.extend(params)

        combined_sql = f"({lsql} {self.connector} {rsql})"
        combined_value = combined_sql % tuple(value_params)
        return "%s", (combined_value,)

    def __invert__(self):
        # Apply De Morgan's theorem.
        cloned = self.copy()
        cloned.connector = self.BITAND if self.connector == self.BITOR else self.BITOR
        cloned.lhs = ~self.lhs
        cloned.rhs = ~self.rhs
        return cloned
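Reviewer note: a small sketch of what these expressions compile to, reusing the
compiler setup from test_as_sql in the test changes below (expected strings are
taken from those assertions; Line is the test-suite model imported there):

    from django.db import connection
    from django.contrib.postgres.search import Lexeme

    from .models import Line  # as in the test module's imports

    compiler = Line.objects.all().query.get_compiler(connection.alias)
    expr = ~(Lexeme("a") | Lexeme("b") & ~Lexeme("c"))
    sql, params = expr.as_sql(compiler, connection)
    # sql == "%s"; params == ("(!'a' & (!'b' | 'c'))",)
    # CombinedLexeme.__invert__() applied De Morgan's theorem, and each value
    # was escaped and quoted by quote_lexeme() before being interpolated.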
@@ -96,7 +96,7 @@ Examples:

.. code-block:: pycon

    >>> from django.contrib.postgres.search import SearchQuery
    >>> from django.contrib.postgres.search import SearchQuery, Lexeme
    >>> SearchQuery("red tomato")  # two keywords
    >>> SearchQuery("tomato red")  # same results as above
    >>> SearchQuery("red tomato", search_type="phrase")  # a phrase
@@ -105,6 +105,7 @@ Examples:
    >>> SearchQuery(
    ...     "'tomato' ('red' OR 'green')", search_type="websearch"
    ... )  # websearch operators
    >>> SearchQuery(Lexeme("tomato") & (Lexeme("red") | Lexeme("green")))  # Lexeme objects

``SearchQuery`` terms can be combined logically to provide more flexibility:

@@ -118,6 +119,10 @@ Examples:
See :ref:`postgresql-fts-search-configuration` for an explanation of the
``config`` parameter.

.. versionchanged:: 6.0

    :class:`Lexeme` objects were added.

``SearchRank``
==============

@@ -276,6 +281,53 @@ floats to :class:`SearchRank` as ``weights`` in the same order above:
    >>> rank = SearchRank(vector, query, weights=[0.2, 0.4, 0.6, 0.8])
    >>> Entry.objects.annotate(rank=rank).filter(rank__gte=0.3).order_by("-rank")

``Lexeme``
==========

.. versionadded:: 6.0

.. class:: Lexeme(value, output_field=None, *, invert=False, prefix=False, weight=None)

``Lexeme`` objects allow search operators to be safely used with strings from
an untrusted source. The content of each lexeme is escaped so that any
operators that may exist in the string itself will not be interpreted.

You can combine lexemes with other lexemes using the ``&`` and ``|`` operators
and also negate them with the ``~`` operator. For example:

.. code-block:: pycon

    >>> from django.contrib.postgres.search import SearchQuery, SearchVector, Lexeme
    >>> vector = SearchVector("body_text", "blog__tagline")
    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("fruit") & Lexeme("dessert"))
    ... )
    <QuerySet [<Entry: Apple Crumble Recipes>, <Entry: Banana Split Recipes>]>

.. code-block:: pycon

    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("fruit") & Lexeme("dessert") & ~Lexeme("banana"))
    ... )
    <QuerySet [<Entry: Apple Crumble Recipes>]>

Lexeme objects also support term weighting and prefixes:

.. code-block:: pycon

    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("Pizza") | Lexeme("Cheese"))
    ... )
    <QuerySet [<Entry: Cheese on Toast recipes>, <Entry: Pizza recipes>]>
    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("Pizza") | Lexeme("Cheese", weight="A"))
    ... )
    <QuerySet [<Entry: Pizza recipes>]>
    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("za", prefix=True))
    ... )
    <QuerySet []>

Performance
===========

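A note on the weighting example above: a weight label on a lexeme only matches
tsvector positions carrying that weight, and PostgreSQL assigns weight D by
default, which is why adding weight="A" to the Cheese lexeme drops that match.
A sketch of pairing it with a weighted vector instead (not part of this commit;
it relies on the existing SearchVector weight argument, and weighted_vector is
an illustrative name):

    >>> weighted_vector = SearchVector("body_text", weight="A")
    >>> Entry.objects.annotate(search=weighted_vector).filter(
    ...     search=SearchQuery(Lexeme("Pizza") | Lexeme("Cheese", weight="A"))
    ... )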
@@ -171,6 +171,12 @@ Minor features
:mod:`django.contrib.postgres`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* The new :class:`Lexeme <django.contrib.postgres.search.Lexeme>` expression
  for full text search provides fine-grained control over search terms.
  ``Lexeme`` objects automatically escape their input and support logical
  combination operators (``&``, ``|``, ``~``), prefix matching, and term
  weighting.

* Model fields, indexes, and constraints from :mod:`django.contrib.postgres`
  now include system checks to verify that ``django.contrib.postgres`` is an
  installed app.
@@ -6,6 +6,7 @@ All text copyright Python (Monty) Pictures. Thanks to sacred-texts.com for the
transcript.
"""

from django.db import connection
from django.db.models import F, Value

from . import PostgreSQLSimpleTestCase, PostgreSQLTestCase
@@ -13,11 +14,13 @@ from .models import Character, Line, LineSavedSearch, Scene

try:
    from django.contrib.postgres.search import (
        Lexeme,
        SearchConfig,
        SearchHeadline,
        SearchQuery,
        SearchRank,
        SearchVector,
        quote_lexeme,
    )
except ImportError:
    pass
@@ -769,3 +772,223 @@ class SearchHeadlineTests(GrailTestData, PostgreSQLTestCase):
            "<b>Brave</b>, <b>brave</b>, <b>brave</b>...<br>"
            "<b>brave</b> <b>Sir</b> <b>Robin</b>",
        )


class TestLexemes(GrailTestData, PostgreSQLTestCase):
    def test_and(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("bedemir") & Lexeme("scales")))
        self.assertSequenceEqual(searched, [self.bedemir0])

    def test_multiple_and(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(
            search=SearchQuery(
                Lexeme("bedemir") & Lexeme("scales") & Lexeme("nostrils")
            )
        )
        self.assertSequenceEqual(searched, [])

        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("shall") & Lexeme("use") & Lexeme("larger")))
        self.assertSequenceEqual(searched, [self.bedemir0])

    def test_or(self):
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(Lexeme("kneecaps") | Lexeme("nostrils"))
        )
        self.assertCountEqual(searched, [self.verse1, self.verse2])

    def test_multiple_or(self):
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(
                Lexeme("kneecaps") | Lexeme("nostrils") | Lexeme("Sir Robin")
            )
        )
        self.assertCountEqual(searched, [self.verse1, self.verse2, self.verse0])

    def test_advanced(self):
        """
        Combination of & and |
        This is mainly helpful for checking the test_advanced_invert below
        """
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(
                Lexeme("shall") & Lexeme("use") & Lexeme("larger") | Lexeme("nostrils")
            )
        )
        self.assertCountEqual(searched, [self.bedemir0, self.verse2])

    def test_invert(self):
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            character=self.minstrel, search=SearchQuery(~Lexeme("kneecaps"))
        )
        self.assertCountEqual(searched, [self.verse0, self.verse2])

    def test_advanced_invert(self):
        """
        Inverting a query that uses a combination of & and |
        should return the opposite of test_advanced.
        """
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(
                ~(
                    Lexeme("shall") & Lexeme("use") & Lexeme("larger")
                    | Lexeme("nostrils")
                )
            )
        )
        expected_result = Line.objects.exclude(
            id__in=[self.bedemir0.id, self.verse2.id]
        )
        self.assertCountEqual(searched, expected_result)

    def test_as_sql(self):
        query = Line.objects.all().query
        compiler = query.get_compiler(connection.alias)

        tests = (
            (Lexeme("a"), ("'a'",)),
            (Lexeme("a", invert=True), ("!'a'",)),
            (~Lexeme("a"), ("!'a'",)),
            (Lexeme("a", prefix=True), ("'a':*",)),
            (Lexeme("a", weight="D"), ("'a':D",)),
            (Lexeme("a", invert=True, prefix=True, weight="D"), ("!'a':*D",)),
            (Lexeme("a") | Lexeme("b") & ~Lexeme("c"), ("('a' | ('b' & !'c'))",)),
            (
                ~(Lexeme("a") | Lexeme("b") & ~Lexeme("c")),
                ("(!'a' & (!'b' | 'c'))",),
            ),
        )

        for expression, expected_params in tests:
            with self.subTest(expression=expression, expected_params=expected_params):
                _, params = expression.as_sql(compiler, connection)
                self.assertEqual(params, expected_params)

    def test_quote_lexeme(self):
        tests = (
            ("L'amour piqué par une abeille", "'L amour piqué par une abeille'"),
            ("'starting quote", "'starting quote'"),
            ("ending quote'", "'ending quote'"),
            ("double quo''te", "'double quo te'"),
            ("triple quo'''te", "'triple quo te'"),
            ("backslash\\", "'backslash'"),
            ("exclamation!", "'exclamation'"),
            ("ampers&nd", "'ampers nd'"),
        )
        for lexeme, quoted in tests:
            with self.subTest(lexeme=lexeme):
                self.assertEqual(quote_lexeme(lexeme), quoted)

    def test_prefix_searching(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("hear", prefix=True)))

        self.assertSequenceEqual(searched, [self.verse2])

    def test_inverse_prefix_searching(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("Robi", prefix=True, invert=True)))
        self.assertEqual(
            set(searched),
            {
                self.verse2,
                self.bedemir0,
                self.bedemir1,
                self.french,
                self.crowd,
                self.witch,
                self.duck,
            },
        )

    def test_lexemes_multiple_and(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(
            search=SearchQuery(
                Lexeme("Robi", prefix=True) & Lexeme("Camel", prefix=True)
            )
        )

        self.assertSequenceEqual(searched, [self.verse0])

    def test_lexemes_multiple_or(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(
            search=SearchQuery(
                Lexeme("kneecap", prefix=True) | Lexeme("afrai", prefix=True)
            )
        )

        self.assertSequenceEqual(searched, [self.verse0, self.verse1])

    def test_config_query_explicit(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue", config="french"),
        ).filter(search=SearchQuery(Lexeme("cadeaux"), config="french"))

        self.assertSequenceEqual(searched, [self.french])

    def test_config_query_implicit(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue", config="french"),
        ).filter(search=Lexeme("cadeaux"))

        self.assertSequenceEqual(searched, [self.french])

    def test_config_from_field_explicit(self):
        searched = Line.objects.annotate(
            search=SearchVector(
                "scene__setting", "dialogue", config=F("dialogue_config")
            ),
        ).filter(search=SearchQuery(Lexeme("cadeaux"), config=F("dialogue_config")))
        self.assertSequenceEqual(searched, [self.french])

    def test_config_from_field_implicit(self):
        searched = Line.objects.annotate(
            search=SearchVector(
                "scene__setting", "dialogue", config=F("dialogue_config")
            ),
        ).filter(search=Lexeme("cadeaux"))
        self.assertSequenceEqual(searched, [self.french])

    def test_invalid_combinations(self):
        msg = "A Lexeme can only be combined with another Lexeme, got NoneType."
        with self.assertRaisesMessage(TypeError, msg):
            Line.objects.filter(dialogue__search=None | Lexeme("kneecaps"))

        with self.assertRaisesMessage(TypeError, msg):
            Line.objects.filter(dialogue__search=None & Lexeme("kneecaps"))

    def test_invalid_weights(self):
        invalid_weights = ["E", "Drandom", "AB", "C ", 0, "", " ", [1, 2, 3]]
        for weight in invalid_weights:
            with self.subTest(weight=weight):
                with self.assertRaisesMessage(
                    ValueError,
                    f"Weight must be one of 'A', 'B', 'C', and 'D', got {weight!r}.",
                ):
                    Line.objects.filter(
                        dialogue__search=Lexeme("kneecaps", weight=weight)
                    )

    def test_empty(self):
        with self.assertRaisesMessage(ValueError, "Lexeme value cannot be empty."):
            Line.objects.annotate(
                search=SearchVector("scene__setting", "dialogue")
            ).filter(search=SearchQuery(Lexeme("")))

    def test_non_string_values(self):
        msg = "Lexeme value must be a string, got NoneType."
        with self.assertRaisesMessage(TypeError, msg):
            Line.objects.annotate(
                search=SearchVector("scene__setting", "dialogue")
            ).filter(search=SearchQuery(Lexeme(None)))