
Fixed #28041 -- Added Lexeme expression to contrib.postgres.search.

This expression automatically escapes its input and allows
fine-grained control over search terms via logical combinations,
prefix matching, and term weighting.

Thanks Mariusz Felisiak, Adam Zapletal, Paolo Melchiorre,
Jacob Walls, Adam Johnson, and Simon Charette for reviews.

Co-authored-by: joetsoi <joetsoi@users.noreply.github.com>
Co-authored-by: Karl Hobley <karl@kaed.uk>
Co-authored-by: Alexandr Tatarinov <tatarinov1997@gmail.com>
Authored by GappleBee on 2017-04-06 16:42:49 +01:00; committed by Jacob Walls
parent e08fa42fa6
commit 218f69f05e
4 changed files with 423 additions and 1 deletions
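For orientation, a minimal usage sketch based on the documentation and tests added
in this commit (``Entry`` and its fields are the illustrative model used in the docs,
not part of the patch):

    >>> from django.contrib.postgres.search import Lexeme, SearchQuery, SearchVector
    >>> vector = SearchVector("body_text", "blog__tagline")
    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("fruit") & Lexeme("dessert") & ~Lexeme("banana"))
    ... )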

django/contrib/postgres/search.py

@@ -1,3 +1,4 @@
from django.db.backends.postgresql.psycopg_any import is_psycopg3
from django.db.models import (
    CharField,
    Expression,
@@ -10,9 +11,45 @@ from django.db.models import (
)
from django.db.models.expressions import CombinedExpression, register_combinable_fields
from django.db.models.functions import Cast, Coalesce
from django.utils.regex_helper import _lazy_re_compile

from .utils import CheckPostgresInstalledMixin

if is_psycopg3:
    from psycopg.adapt import Dumper

    class UTF8Dumper(Dumper):
        def dump(self, obj):
            return bytes(obj, "utf-8")

    def quote_lexeme(value):
        return UTF8Dumper(str).quote(psql_escape(value)).decode()

else:
    from psycopg2.extensions import adapt

    def quote_lexeme(value):
        adapter = adapt(psql_escape(value))
        adapter.encoding = "utf-8"
        return adapter.getquoted().decode()


spec_chars_re = _lazy_re_compile(r"['\0\[\]()|&:*!@<>\\]")
multiple_spaces_re = _lazy_re_compile(r"\s{2,}")


def normalize_spaces(val):
    """Convert multiple spaces to single and strip from both sides."""
    if not (val := val.strip()):
        return None
    return multiple_spaces_re.sub(" ", val)


def psql_escape(query):
    """Replace chars not fit for use in search queries with a single space."""
    query = spec_chars_re.sub(" ", query)
    return normalize_spaces(query)

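A quick sketch of how the two helpers above behave; the first case is derived from
this commit's ``quote_lexeme`` tests, the others follow directly from the regexes:

    >>> psql_escape("double quo''te")
    'double quo te'
    >>> psql_escape("   ")  # nothing left after stripping, so None is returned
    >>> normalize_spaces("red   tomato ")
    'red tomato'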

class SearchVectorExact(Lookup):
    lookup_name = "exact"
@@ -205,6 +242,9 @@ class SearchQuery(SearchQueryCombinable, Func):
        invert=False,
        search_type="plain",
    ):
        if isinstance(value, LexemeCombinable):
            search_type = "raw"
        self.function = self.SEARCH_TYPES.get(search_type)
        if self.function is None:
            raise ValueError("Unknown search_type argument '%s'." % search_type)
@@ -383,3 +423,104 @@ class TrigramWordSimilarity(TrigramWordBase):
class TrigramStrictWordSimilarity(TrigramWordBase):
    function = "STRICT_WORD_SIMILARITY"

class LexemeCombinable:
    BITAND = "&"
    BITOR = "|"

    def _combine(self, other, connector, reversed):
        if not isinstance(other, LexemeCombinable):
            raise TypeError(
                "A Lexeme can only be combined with another Lexeme, "
                f"got {other.__class__.__name__}."
            )
        if reversed:
            return CombinedLexeme(other, connector, self)
        return CombinedLexeme(self, connector, other)

    # On Combinable, these are not implemented to reduce confusion with Q. In
    # this case we are actually (ab)using them to do logical combination so
    # it's consistent with other usage in Django.
    def __or__(self, other):
        return self._combine(other, self.BITOR, False)

    def __ror__(self, other):
        return self._combine(other, self.BITOR, True)

    def __and__(self, other):
        return self._combine(other, self.BITAND, False)

    def __rand__(self, other):
        return self._combine(other, self.BITAND, True)


class Lexeme(LexemeCombinable, Value):
    _output_field = SearchQueryField()

    def __init__(
        self, value, output_field=None, *, invert=False, prefix=False, weight=None
    ):
        if value == "":
            raise ValueError("Lexeme value cannot be empty.")
        if not isinstance(value, str):
            raise TypeError(
                f"Lexeme value must be a string, got {value.__class__.__name__}."
            )
        if weight is not None and (
            not isinstance(weight, str) or weight.lower() not in {"a", "b", "c", "d"}
        ):
            raise ValueError(
                f"Weight must be one of 'A', 'B', 'C', and 'D', got {weight!r}."
            )
        self.prefix = prefix
        self.invert = invert
        self.weight = weight
        super().__init__(value, output_field=output_field)

    def as_sql(self, compiler, connection):
        param = quote_lexeme(self.value)
        label = ""
        if self.prefix:
            label += "*"
        if self.weight:
            label += self.weight
        if label:
            param = f"{param}:{label}"
        if self.invert:
            param = f"!{param}"
        return "%s", (param,)

    def __invert__(self):
        cloned = self.copy()
        cloned.invert = not self.invert
        return cloned


class CombinedLexeme(LexemeCombinable, CombinedExpression):
    _output_field = SearchQueryField()

    def as_sql(self, compiler, connection):
        value_params = []
        lsql, params = compiler.compile(self.lhs)
        value_params.extend(params)

        rsql, params = compiler.compile(self.rhs)
        value_params.extend(params)

        combined_sql = f"({lsql} {self.connector} {rsql})"
        combined_value = combined_sql % tuple(value_params)
        return "%s", (combined_value,)

    def __invert__(self):
        # Apply De Morgan's theorem.
        cloned = self.copy()
        cloned.connector = self.BITAND if self.connector == self.BITOR else self.BITOR
        cloned.lhs = ~self.lhs
        cloned.rhs = ~self.rhs
        return cloned
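For reference, a sketch of how inversion distributes over a combined lexeme via
De Morgan's laws. The expected parameters mirror the ``test_as_sql`` cases added
in the tests below; ``compiler`` and ``connection`` are assumed to come from a
queryset compiler, as in those tests:

    expression = ~(Lexeme("a") | Lexeme("b") & ~Lexeme("c"))
    _, params = expression.as_sql(compiler, connection)
    assert params == ("(!'a' & (!'b' | 'c'))",)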

docs/ref/contrib/postgres/search.txt

@@ -96,7 +96,7 @@ Examples:
.. code-block:: pycon

    >>> from django.contrib.postgres.search import SearchQuery, Lexeme
    >>> SearchQuery("red tomato")  # two keywords
    >>> SearchQuery("tomato red")  # same results as above
    >>> SearchQuery("red tomato", search_type="phrase")  # a phrase
@@ -105,6 +105,7 @@ Examples:
    >>> SearchQuery(
    ...     "'tomato' ('red' OR 'green')", search_type="websearch"
    ... )  # websearch operators
    >>> SearchQuery(Lexeme("tomato") & (Lexeme("red") | Lexeme("green")))  # Lexeme objects

``SearchQuery`` terms can be combined logically to provide more flexibility:
@@ -118,6 +119,10 @@ Examples:
See :ref:`postgresql-fts-search-configuration` for an explanation of the
``config`` parameter.

.. versionchanged:: 6.0

    :class:`Lexeme` objects were added.

``SearchRank``
==============
@@ -276,6 +281,53 @@ floats to :class:`SearchRank` as ``weights`` in the same order above:
    >>> rank = SearchRank(vector, query, weights=[0.2, 0.4, 0.6, 0.8])
    >>> Entry.objects.annotate(rank=rank).filter(rank__gte=0.3).order_by("-rank")

``Lexeme``
==========

.. versionadded:: 6.0

.. class:: Lexeme(value, output_field=None, *, invert=False, prefix=False, weight=None)

``Lexeme`` objects allow search operators to be used safely with strings from
an untrusted source. The content of each lexeme is escaped so that any
operators that may exist in the string itself will not be interpreted.

You can combine lexemes with other lexemes using the ``&`` and ``|`` operators
and negate them with the ``~`` operator. For example:

.. code-block:: pycon

    >>> from django.contrib.postgres.search import SearchQuery, SearchVector, Lexeme
    >>> vector = SearchVector("body_text", "blog__tagline")
    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("fruit") & Lexeme("dessert"))
    ... )
    <QuerySet [<Entry: Apple Crumble Recipes>, <Entry: Banana Split Recipes>]>

.. code-block:: pycon

    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("fruit") & Lexeme("dessert") & ~Lexeme("banana"))
    ... )
    <QuerySet [<Entry: Apple Crumble Recipes>]>

``Lexeme`` objects also support term weighting and prefix matching:

.. code-block:: pycon

    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("Pizza") | Lexeme("Cheese"))
    ... )
    <QuerySet [<Entry: Cheese on Toast recipes>, <Entry: Pizza recipes>]>
    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("Pizza") | Lexeme("Cheese", weight="A"))
    ... )
    <QuerySet [<Entry: Pizza recipes>]>
    >>> Entry.objects.annotate(search=vector).filter(
    ...     search=SearchQuery(Lexeme("za", prefix=True))
    ... )
    <QuerySet []>
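As an illustration of the escaping described above (the behavior follows from the
internal ``quote_lexeme`` helper this patch adds and its tests), operator
characters in untrusted input are replaced with spaces rather than interpreted,
so the whole string becomes a single escaped lexeme:

    >>> from django.contrib.postgres.search import Lexeme, SearchQuery, quote_lexeme
    >>> quote_lexeme("kneecaps | nostrils")
    "'kneecaps nostrils'"
    >>> query = SearchQuery(Lexeme("kneecaps") | Lexeme("nostrils"))  # an actual OR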

Performance
===========

docs/releases/6.0.txt

@@ -171,6 +171,12 @@ Minor features
:mod:`django.contrib.postgres`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* The new :class:`Lexeme <django.contrib.postgres.search.Lexeme>` expression
  for full text search provides fine-grained control over search terms.
  ``Lexeme`` objects automatically escape their input and support logical
  combination operators (``&``, ``|``, ``~``), prefix matching, and term
  weighting.

* Model fields, indexes, and constraints from :mod:`django.contrib.postgres`
  now include system checks to verify that ``django.contrib.postgres`` is an
  installed app.

tests/postgres_tests/test_search.py

@@ -6,6 +6,7 @@ All text copyright Python (Monty) Pictures. Thanks to sacred-texts.com for the
transcript.
"""

from django.db import connection
from django.db.models import F, Value

from . import PostgreSQLSimpleTestCase, PostgreSQLTestCase
@@ -13,11 +14,13 @@ from .models import Character, Line, LineSavedSearch, Scene
try:
    from django.contrib.postgres.search import (
        Lexeme,
        SearchConfig,
        SearchHeadline,
        SearchQuery,
        SearchRank,
        SearchVector,
        quote_lexeme,
    )
except ImportError:
    pass
@@ -769,3 +772,223 @@ class SearchHeadlineTests(GrailTestData, PostgreSQLTestCase):
"<b>Brave</b>, <b>brave</b>, <b>brave</b>...<br>" "<b>Brave</b>, <b>brave</b>, <b>brave</b>...<br>"
"<b>brave</b> <b>Sir</b> <b>Robin</b>", "<b>brave</b> <b>Sir</b> <b>Robin</b>",
) )


class TestLexemes(GrailTestData, PostgreSQLTestCase):
    def test_and(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("bedemir") & Lexeme("scales")))
        self.assertSequenceEqual(searched, [self.bedemir0])

    def test_multiple_and(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(
            search=SearchQuery(
                Lexeme("bedemir") & Lexeme("scales") & Lexeme("nostrils")
            )
        )
        self.assertSequenceEqual(searched, [])

        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("shall") & Lexeme("use") & Lexeme("larger")))
        self.assertSequenceEqual(searched, [self.bedemir0])

    def test_or(self):
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(Lexeme("kneecaps") | Lexeme("nostrils"))
        )
        self.assertCountEqual(searched, [self.verse1, self.verse2])

    def test_multiple_or(self):
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(
                Lexeme("kneecaps") | Lexeme("nostrils") | Lexeme("Sir Robin")
            )
        )
        self.assertCountEqual(searched, [self.verse1, self.verse2, self.verse0])

    def test_advanced(self):
        """
        Combination of & and |.

        This is mainly helpful for checking test_advanced_invert below.
        """
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(
                Lexeme("shall") & Lexeme("use") & Lexeme("larger") | Lexeme("nostrils")
            )
        )
        self.assertCountEqual(searched, [self.bedemir0, self.verse2])

    def test_invert(self):
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            character=self.minstrel, search=SearchQuery(~Lexeme("kneecaps"))
        )
        self.assertCountEqual(searched, [self.verse0, self.verse2])

    def test_advanced_invert(self):
        """
        Inverting a query that uses a combination of & and | should return
        the opposite of test_advanced.
        """
        searched = Line.objects.annotate(search=SearchVector("dialogue")).filter(
            search=SearchQuery(
                ~(
                    Lexeme("shall") & Lexeme("use") & Lexeme("larger")
                    | Lexeme("nostrils")
                )
            )
        )
        expected_result = Line.objects.exclude(
            id__in=[self.bedemir0.id, self.verse2.id]
        )
        self.assertCountEqual(searched, expected_result)

    def test_as_sql(self):
        query = Line.objects.all().query
        compiler = query.get_compiler(connection.alias)
        tests = (
            (Lexeme("a"), ("'a'",)),
            (Lexeme("a", invert=True), ("!'a'",)),
            (~Lexeme("a"), ("!'a'",)),
            (Lexeme("a", prefix=True), ("'a':*",)),
            (Lexeme("a", weight="D"), ("'a':D",)),
            (Lexeme("a", invert=True, prefix=True, weight="D"), ("!'a':*D",)),
            (Lexeme("a") | Lexeme("b") & ~Lexeme("c"), ("('a' | ('b' & !'c'))",)),
            (
                ~(Lexeme("a") | Lexeme("b") & ~Lexeme("c")),
                ("(!'a' & (!'b' | 'c'))",),
            ),
        )
        for expression, expected_params in tests:
            with self.subTest(expression=expression, expected_params=expected_params):
                _, params = expression.as_sql(compiler, connection)
                self.assertEqual(params, expected_params)

    def test_quote_lexeme(self):
        tests = (
            ("L'amour piqué par une abeille", "'L amour piqué par une abeille'"),
            ("'starting quote", "'starting quote'"),
            ("ending quote'", "'ending quote'"),
            ("double quo''te", "'double quo te'"),
            ("triple quo'''te", "'triple quo te'"),
            ("backslash\\", "'backslash'"),
            ("exclamation!", "'exclamation'"),
            ("ampers&nd", "'ampers nd'"),
        )
        for lexeme, quoted in tests:
            with self.subTest(lexeme=lexeme):
                self.assertEqual(quote_lexeme(lexeme), quoted)

    def test_prefix_searching(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("hear", prefix=True)))
        self.assertSequenceEqual(searched, [self.verse2])

    def test_inverse_prefix_searching(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(search=SearchQuery(Lexeme("Robi", prefix=True, invert=True)))
        self.assertEqual(
            set(searched),
            {
                self.verse2,
                self.bedemir0,
                self.bedemir1,
                self.french,
                self.crowd,
                self.witch,
                self.duck,
            },
        )

    def test_lexemes_multiple_and(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(
            search=SearchQuery(
                Lexeme("Robi", prefix=True) & Lexeme("Camel", prefix=True)
            )
        )
        self.assertSequenceEqual(searched, [self.verse0])

    def test_lexemes_multiple_or(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue"),
        ).filter(
            search=SearchQuery(
                Lexeme("kneecap", prefix=True) | Lexeme("afrai", prefix=True)
            )
        )
        self.assertSequenceEqual(searched, [self.verse0, self.verse1])

    def test_config_query_explicit(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue", config="french"),
        ).filter(search=SearchQuery(Lexeme("cadeaux"), config="french"))
        self.assertSequenceEqual(searched, [self.french])

    def test_config_query_implicit(self):
        searched = Line.objects.annotate(
            search=SearchVector("scene__setting", "dialogue", config="french"),
        ).filter(search=Lexeme("cadeaux"))
        self.assertSequenceEqual(searched, [self.french])

    def test_config_from_field_explicit(self):
        searched = Line.objects.annotate(
            search=SearchVector(
                "scene__setting", "dialogue", config=F("dialogue_config")
            ),
        ).filter(search=SearchQuery(Lexeme("cadeaux"), config=F("dialogue_config")))
        self.assertSequenceEqual(searched, [self.french])

    def test_config_from_field_implicit(self):
        searched = Line.objects.annotate(
            search=SearchVector(
                "scene__setting", "dialogue", config=F("dialogue_config")
            ),
        ).filter(search=Lexeme("cadeaux"))
        self.assertSequenceEqual(searched, [self.french])

    def test_invalid_combinations(self):
        msg = "A Lexeme can only be combined with another Lexeme, got NoneType."
        with self.assertRaisesMessage(TypeError, msg):
            Line.objects.filter(dialogue__search=None | Lexeme("kneecaps"))
        with self.assertRaisesMessage(TypeError, msg):
            Line.objects.filter(dialogue__search=None & Lexeme("kneecaps"))

    def test_invalid_weights(self):
        invalid_weights = ["E", "Drandom", "AB", "C ", 0, "", " ", [1, 2, 3]]
        for weight in invalid_weights:
            with self.subTest(weight=weight):
                with self.assertRaisesMessage(
                    ValueError,
                    f"Weight must be one of 'A', 'B', 'C', and 'D', got {weight!r}.",
                ):
                    Line.objects.filter(
                        dialogue__search=Lexeme("kneecaps", weight=weight)
                    )

    def test_empty(self):
        with self.assertRaisesMessage(ValueError, "Lexeme value cannot be empty."):
            Line.objects.annotate(
                search=SearchVector("scene__setting", "dialogue")
            ).filter(search=SearchQuery(Lexeme("")))

    def test_non_string_values(self):
        msg = "Lexeme value must be a string, got NoneType."
        with self.assertRaisesMessage(TypeError, msg):
            Line.objects.annotate(
                search=SearchVector("scene__setting", "dialogue")
            ).filter(search=SearchQuery(Lexeme(None)))