mirror of
https://github.com/django/django.git
synced 2025-07-04 09:49:12 +00:00
[search-api] Initial commit, Lucene working, Xapian and Hype almost working, needs polish.
git-svn-id: http://code.djangoproject.com/svn/django/branches/search-api@3636 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
parent
682aed446b
commit
91790e27cd
0
django/contrib/search/__init__.py
Normal file
0
django/contrib/search/__init__.py
Normal file
19
django/contrib/search/backends.py
Normal file
19
django/contrib/search/backends.py
Normal file
@ -0,0 +1,19 @@
|
||||
from default import DefaultIndexer
|
||||
|
||||
# Every optional backend is imported on a best-effort basis: when the import
# raises ImportError the backend's indexer class is simply left undefined and
# a notice is printed, so the remaining backends stay usable.
try:
    from xapian import XapianIndexer
except ImportError:
    print("Xapian backend will not be available due to an ImportError. "
          "Do you have Xapian and Xapwrap installed?")

try:
    from lucene import LuceneIndexer
except ImportError:
    print("Lucene backend will not be available due to an ImportError. "
          "Do you have Lucene and PyLucene installed?")

try:
    from hype import HypeIndexer
except ImportError:
    # The exception name was previously misspelled "importError" here.
    print("Hyper Estraier backend will not be available due to an ImportError. "
          "Do you have Hyper Estraier and Hype installed?")
|
214
django/contrib/search/base.py
Normal file
214
django/contrib/search/base.py
Normal file
@ -0,0 +1,214 @@
|
||||
from django.db import models
|
||||
from django.core.exceptions import ObjectDoesNotExist
|
||||
import sys
|
||||
|
||||
# For Python 2.3, which has no builtin `set`.
# Probe the name itself instead of using hasattr(__builtins__, 'set'):
# __builtins__ may be either the builtins module or its dict, and in the dict
# case hasattr() is always False, which forced the `sets` fallback even when
# a builtin `set` was available.
try:
    set
except NameError:
    from sets import Set as set
|
||||
|
||||
# FIXME: Methods that accept a field parameter claim to accept Field instances
|
||||
# or strings giving the object path. However, since there is no Field
|
||||
# attribute giving the Model it is bound to, these methods only work for
|
||||
# strings at the moment. This doesn't really affect the ease of use of the
|
||||
# library, as strings are actually easier to use.
|
||||
|
||||
def str_to_field(string, namespace=None):
    """Resolve the dotted object path `string` to a model Field.

    Example: 'Person.first_name' -> Person._meta.get_field('first_name')

    The path is walked one component at a time.  Once a model class is
    reached, further components are looked up with its `_meta.get_field`;
    a ForeignKey switches the lookup to the related model (`rel.to`).  Any
    other object is traversed with its `__getitem__`, `__getattribute__` or
    `__getattr__`, in that order of preference.  A path that ends on a model
    class resolves to that model's primary-key field.

    `namespace` is the dict-like object in which the first component is
    looked up.  If None, the caller's global namespace is used, thanks to
    the sys._getframe hack.  This matters so that, for example, with
    'models.Person.first_name' the caller's `models` module is used instead
    of the django.db.models module imported here.

    The returned Field gets a `_model` attribute naming the model it belongs
    to.  Raises ValueError if the path does not end on a Field; lookup
    errors raised by the individual getters propagate unchanged.
    """
    if namespace is None:
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        namespace = sys._getframe(1).f_globals

    obj = namespace
    getter = namespace.__getitem__
    model = None    # model whose fields `getter` currently resolves
    owner = None    # model that the most recently resolved object belongs to

    for objName in string.split('.'):
        # Deliberately not wrapped in try/except: the getters' own
        # exceptions are already descriptive.
        obj = getter(objName)

        if isinstance(obj, models.base.ModelBase):
            model = owner = obj
            getter = model._meta.get_field
        elif isinstance(obj, models.fields.related.ForeignKey):
            target = obj.rel.to
            # The ForeignKey itself lives on the model it was looked up on,
            # not on the model it points to.  Previously a path ending on a
            # ForeignKey was tagged with the related model.  When no owning
            # model is known, keep the old behaviour as a fallback.
            if model is None:
                owner = target
            else:
                owner = model
            model = target
            getter = model._meta.get_field
        else:
            owner = model
            # TODO: The rest of these could be more type-smart...
            if hasattr(obj, '__getitem__'):
                getter = obj.__getitem__
            elif hasattr(obj, '__getattribute__'):
                getter = obj.__getattribute__
            else:
                getter = obj.__getattr__

    if isinstance(obj, models.base.ModelBase):
        # A bare model path means "this model's primary key".
        obj = obj._meta.pk

    if not isinstance(obj, models.Field):
        raise ValueError("%r is not a Field object! (%r -> %r)" %
                         (objName, string, obj))

    # FIXME: There seems to be no way to get from a Field back to its Model,
    # so the model is recorded on the Field object itself.
    obj._model = owner
    return obj
|
||||
|
||||
|
||||
class Indexer(object):
    """Backend-independent bookkeeping for a search index.

    Tracks which fields are indexed as unnamed text (`text_fields`, a set)
    and which under a name (`attr_fields`, a dict of name -> Field).  The
    `search`, `index` and `update` methods raise NotImplementedError here
    and are meant to be provided by concrete backends.
    """

    def __init__(self, path, model, fields=None, attributes=None, namespace=None, **kwargs):
        """Initialize an Indexer whose index data is stored at `path`.

        `model` is the Model whose instances will be used as documents.
        Fields from other models can still be added, but the primary key of
        this model is always indexed (under the name 'pk' unless it was
        passed in explicitly).  String model names are not resolved here.

        `fields` may be optionally initialized as an iterable of unnamed
        Fields.  `attributes` may be optionally initialized as a mapping of
        field names to Fields; extra keyword arguments are treated the same
        way, with `attributes` taking precedence on a name clash.

        `namespace` is the dict-like object in which fields passed as object
        paths will be searched.  If None, the caller's global namespace will
        be used, thanks to the sys._getframe hack.

        Example: If `fields` is ['models.Person.first_name'], it is important
        that namespace['models'] refers to the intended module and NOT the
        django.db.models module imported here.
        """
        if fields is None:
            fields = []

        # `attributes` should take precedence over `kwargs`.
        named_fields = dict(kwargs)
        if attributes is not None:
            named_fields.update(attributes)

        if namespace is None:
            # FIXME: This uses the sys._getframe hack to get the caller's namespace.
            namespace = sys._getframe(1).f_globals

        # Backends may need to set up storage before anything else happens.
        self._prepare_path(path)

        self.path = path
        self.model = model
        self.text_fields = set([])
        self.attr_fields = {}

        for field in fields:
            self.add_field(field, namespace=namespace)

        for name, field in named_fields.items():
            self.add_field(field, name, namespace=namespace)

        # Make sure the primary key is indexed under some name.
        pk = self.model._meta.pk
        pk._model = self.model
        if pk not in self.text_fields and pk not in set(self.attr_fields.values()):
            self.add_field(pk, 'pk', namespace=namespace)

    def add_field(self, field, name=None, namespace=None):
        """Add the given field to the Indexer.

        `field` is either an object path string or a Field instance.  If
        `name` is empty or None the field is added to self.text_fields,
        otherwise it is stored in self.attr_fields under that name.
        `namespace` has the same meaning as in __init__.
        """
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        if namespace is None:
            namespace = sys._getframe(1).f_globals

        # FIXME: Detect duplicates, or user-knows-best?
        if isinstance(field, basestring):
            field = str_to_field(field, namespace)

        if name:
            self.attr_fields[name] = field
        else:
            self.text_fields.add(field)

    def remove_field(self, field=None, name=None, find_name=True, namespace=None):
        """Remove a field from the Indexer by object, by name, or both.

        `field` is either an object path string or a Field instance; if
        `name` is given, the field stored under that name is removed.  When
        both are given, both removals are carried out.  If `find_name` is
        True, self.attr_fields is also searched for `field`, otherwise only
        self.text_fields is.  `namespace` has the same meaning as in
        __init__.  Removing something that is not present is not an error.
        """
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        if namespace is None:
            namespace = sys._getframe(1).f_globals

        # No early return here: a `field` passed together with `name` must
        # still be processed below, as documented.
        if name and name in self.attr_fields:
            del self.attr_fields[name]

        if field:
            if isinstance(field, basestring):
                field = str_to_field(field, namespace)

            self.text_fields.discard(field)

            if find_name:
                # Iterate over a copy: entries are deleted while looping.
                for attr_name, attr_field in list(self.attr_fields.items()):
                    # TODO: Make sure identity is correct here
                    if attr_field is field:
                        del self.attr_fields[attr_name]

    def search(self, query_string, sortBy=None):
        """Query the index for `query_string`; must be provided by a backend.

        `sortBy` selects the ordering of the results; the accepted values
        are defined by the concrete backend.
        """
        raise NotImplementedError

    def index(self, document):
        """Add a single document to the index; must be provided by a backend."""
        raise NotImplementedError

    def update(self, force=False):
        """Bring the index up to date; must be provided by a backend."""
        raise NotImplementedError

    def _prepare_path(self, path):
        """Hook called by __init__ with `path` before any other attribute is
        set; the default does nothing."""
        pass
|
||||
|
||||
def test_indexer():
    """Smoke-test the field bookkeeping of Indexer with a throwaway model.

    Returns the Indexer that was exercised so it can be inspected further.
    """
    # Note: I'm not very good at writing tests.

    class Person(models.Model):
        first_name = models.CharField(maxlength=30)
        last_name = models.CharField(maxlength=30)
        description = models.TextField()

    # locals() is passed as the namespace so that the 'Person.*' object
    # paths resolve to the model defined just above.
    i = Indexer('', Person, ['Person.description'], {'first': 'Person.first_name'},
                last='Person.last_name', namespace=locals())

    get_field = Person._meta.get_field

    assert get_field('description') in i.text_fields
    # Indexer.__init__ always indexes the primary key as well (as 'pk'), so
    # the named fields are the two requested ones plus the key.  Comparing
    # against just the two requested fields could never succeed.
    assert set(['first', 'last', 'pk']) == set(i.attr_fields.keys())
    assert i.attr_fields['first'] == get_field('first_name')
    assert i.attr_fields['last'] == get_field('last_name')
    assert i.attr_fields['pk'] == Person._meta.pk

    i.remove_field('Person.description', namespace=locals())
    assert not i.text_fields

    i.remove_field(name='last')
    assert 'last' not in i.attr_fields

    print("Test succeeded.")
    return i
|
9
django/contrib/search/default.py
Normal file
9
django/contrib/search/default.py
Normal file
@ -0,0 +1,9 @@
|
||||
from base import Indexer
|
||||
|
||||
# This is the future home of a pure-Python text indexer.
|
||||
|
||||
# Alec Thomas has created a built-in indexer for his library here:
|
||||
# http://swapoff.org/wiki/pyndexter
|
||||
|
||||
class DefaultIndexer(Indexer):
    """Placeholder for the pure-Python indexer; adds nothing to Indexer yet."""
|
35
django/contrib/search/hype.py
Normal file
35
django/contrib/search/hype.py
Normal file
@ -0,0 +1,35 @@
|
||||
from base import Indexer
|
||||
from query import ResultSet, Hit
|
||||
|
||||
import hype
|
||||
|
||||
# TODO: This is very incomplete.
|
||||
|
||||
class HypeIndexer(Indexer):
    """Indexer backed by a Hyper Estraier database via the `hype` bindings.

    Still incomplete: `index` builds a document but never stores it.
    """
    # NOTE(review): this module is itself named hype.py, so a plain
    # `import hype` may resolve to this module instead of the bindings
    # under implicit relative imports -- confirm.

    def __init__(self, *args, **kwargs):
        """Set up the bookkeeping of Indexer, then open the database at
        `self.path` for writing, creating it if needed.

        Accepts the same arguments as Indexer.__init__.
        """
        # Capture the caller's globals here: from inside Indexer.__init__
        # the "caller" would be this method and object paths would be
        # resolved against this module.  An explicitly passed namespace
        # (keyword or fifth positional argument) is left alone.
        if len(args) < 5 and kwargs.get('namespace') is None:
            import sys
            kwargs['namespace'] = sys._getframe(1).f_globals
        # super() must be given this class; super(Indexer, self) skipped
        # Indexer.__init__ altogether, so self.path was never set.
        super(HypeIndexer, self).__init__(*args, **kwargs)
        self.db = hype.Database(self.path, hype.ESTDBWRITER | hype.ESTDBCREAT)

    def index(self, row):
        """Build a document for `row`, keyed by its primary key value."""
        document = hype.Document()
        document['@pk'] = row._get_pk_val()
        # TODO: pass the text of self.text_fields and store the document in
        # self.db; as written the document is discarded.
        document.add_text()

    def search(self, query_string, sortBy=None):
        """Search the database for `query_string` and wrap the result in a
        HypeResultSet.  `sortBy` is currently ignored."""
        searcher = self.db.search(query_string)
        return HypeResultSet(searcher, self)

    def close(self):
        """Close the underlying database."""
        self.db.close()


class HypeResultSet(ResultSet):
    """Result set over the object returned by the database's search()."""

    def __init__(self, hits, indexer):
        # Previously nothing set these attributes, so __len__ and __iter__
        # could only fail with AttributeError.
        self._hits = hits
        self._indexer = indexer

    def __len__(self):
        return len(self._hits)

    def __iter__(self):
        for hit in self._hits:
            yield HypeHit(hit, self._indexer)


class HypeHit(Hit):
    """A single Hyper Estraier hit.

    TODO: get_pk() (and a score) still have to be implemented before
    instances can be resolved or printed.
    """
|
162
django/contrib/search/lucene.py
Normal file
162
django/contrib/search/lucene.py
Normal file
@ -0,0 +1,162 @@
|
||||
from base import Indexer
|
||||
from query import ResultSet, Hit
|
||||
from itertools import imap
|
||||
import os, sys
|
||||
|
||||
import PyLucene
|
||||
|
||||
# WARNING!*
|
||||
# PyLucene wants you to use PyLucene.PythonThread for threading.
|
||||
# Look at samples/ThreadIndexFiles.py bundled with PyLucene.
|
||||
# * I'm not sure how important this is.
|
||||
|
||||
# TODO: Make Lucene aware of field types.
|
||||
|
||||
# Here's how to use me:
|
||||
#
|
||||
# class Person(models.Model):
|
||||
# first_name = models.CharField(maxlength=30)
|
||||
# last_name = models.CharField(maxlength=30)
|
||||
# biography = models.TextField()
|
||||
#
|
||||
# indexer = LuceneIndexer('/tmp/lucene-index', Person, ['Person.biography'],
|
||||
# {'first': 'Person.first_name',
|
||||
# 'last': 'Person.last_name'})
|
||||
# indexer.update() # Note, calling this multiple times without clearing old
|
||||
# # entries will cause duplicates in the index.
|
||||
# indexer.search("brian -last:beck")
|
||||
|
||||
class LuceneIndexer(Indexer):
    """Indexer backed by a PyLucene index stored in a filesystem directory.

    Named fields (`attr_fields`) are stored under their own names; the text
    of all unnamed fields (`text_fields`) is joined into one 'contents'
    field, which is the default field for searches.
    """

    def __init__(self, *args, **kwargs):
        """Accepts the same arguments as Indexer.__init__."""
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        # It has to happen here: from inside Indexer.__init__ the "caller"
        # would be this method.  An explicitly passed namespace (keyword or
        # fifth positional argument) is no longer overwritten.
        if len(args) < 5 and kwargs.get('namespace') is None:
            kwargs['namespace'] = sys._getframe(1).f_globals
        super(LuceneIndexer, self).__init__(*args, **kwargs)
        self.writer_closed = True

    def _prepare_path(self, path):
        # Lucene wants an abstraction of the directory.
        # Should look into storage in a Model-compatible database in the future...
        # NOTE(review): the second argument is presumably Lucene's `create`
        # flag; if so an existing index at `path` is discarded -- confirm.
        self._store = PyLucene.FSDirectory.getDirectory(path, True)

    def _acquire_writer(self):
        """Open the writer if it is closed.

        Returns True when this call opened it, i.e. when the caller is
        responsible for closing it again.
        """
        if self.writer_closed:
            self.open_writer()
            return True
        return False

    def update(self, documents=None):
        """Re-index `documents`, or every instance of the model if None.

        Each document is deleted from the index and then indexed again.
        """
        opened_here = self._acquire_writer()
        try:
            if documents is None:
                update_queue = self.model.objects.all()
            else:
                update_queue = documents

            for document in update_queue:
                self.delete(document)
                self.index(document)
        finally:
            # Do not leave the writer open if indexing fails.
            if opened_here:
                self.close_writer()

    def clear(self):
        """Delete every document, by position, through the writer."""
        opened_here = self._acquire_writer()
        try:
            for i in xrange(self._writer.docCount()):
                self._writer.deleteDocument(i)
        finally:
            if opened_here:
                self.close_writer()

    def delete(self, row):
        """Delete the documents whose 'pk' term equals the key of `row`."""
        reader = PyLucene.IndexReader.open(self.path)
        try:
            reader.deleteDocuments(PyLucene.Term('pk', str(row._get_pk_val())))
        finally:
            # Always release the reader, even if the deletion fails.
            reader.close()

    def open_writer(self):
        """Open an IndexWriter on the store and mark the writer as open."""
        self.writer_closed = False
        # NOTE(review): the third argument is presumably Lucene's `create`
        # flag; if so every open starts from an empty index -- confirm.
        self._writer = PyLucene.IndexWriter(self._store, PyLucene.StandardAnalyzer(), True)
        self._writer.setMaxFieldLength(1048576) # Max number of tokens stored per field?

    def close_writer(self):
        """Optimize and close the writer and mark it as closed."""
        self._writer.optimize()
        self._writer.close()
        self.writer_closed = True

    def index(self, row):
        """Add one document for the model instance `row` to the index."""
        opened_here = self._acquire_writer()
        try:
            document = PyLucene.Document()

            for name, field in self.attr_fields.iteritems():
                # FIXME: Assumes no Foreign Keys! Lame!
                value = getattr(row, field.name)
                document.add(PyLucene.Field(name, str(value),
                                            PyLucene.Field.Store.YES,
                                            PyLucene.Field.Index.TOKENIZED))

            # Lucene only seems to support one 'default' field.
            # However, we might want multiple fields to be searched
            # by default. Hopefully just joining their contents with
            # newlines solves this.
            contents = '\n'.join([str(getattr(row, field.name))
                                  for field in self.text_fields])
            # FIXME: Hardcoded 'contents' field.
            document.add(PyLucene.Field('contents', contents,
                                        PyLucene.Field.Store.YES,
                                        PyLucene.Field.Index.TOKENIZED))
            self._writer.addDocument(document)
        finally:
            if opened_here:
                self.close_writer()

    def search(self, query_string, default_field='contents', order_by='RELEVANCE'):
        """Parse `query_string` and return a LuceneResultSet of the hits.

        `default_field` is the field searched by terms that name no field.
        `order_by` is 'SCORE', 'INDEX', 'RELEVANCE' or a field name; a
        leading '-' on a field name reverses the order.  Raises ValueError
        if `order_by` names no field at all.
        """
        searcher = PyLucene.IndexSearcher(self._store)
        analyzer = PyLucene.StandardAnalyzer()
        query = PyLucene.QueryParser(default_field, analyzer).parse(query_string)

        if order_by == 'SCORE':
            sort = PyLucene.Sort(PyLucene.SortField.FIELD_SCORE)
        elif order_by == 'INDEX':
            sort = PyLucene.Sort.INDEXORDER
        elif order_by == 'RELEVANCE':
            sort = PyLucene.Sort.RELEVANCE
        else:
            reverse = order_by.startswith('-')
            # Stripping the signs with a `while order_by[0] ...` loop raised
            # IndexError for '' and for values made up of signs only.
            field_name = order_by.lstrip('+-')
            if not field_name:
                raise ValueError("order_by does not name a field: %r" % (order_by,))
            sort = PyLucene.Sort(PyLucene.SortField(field_name, reverse))

        hits = searcher.search(query, sort)
        return LuceneResultSet(hits, self)
|
||||
|
||||
|
||||
class LuceneResultSet(ResultSet):
    """Wraps the hits object returned by a PyLucene searcher."""

    def __init__(self, hits, indexer):
        # `indexer` is handed on to every LuceneHit created from `hits`.
        self._hits = hits
        self._indexer = indexer

    def __len__(self):
        return self._hits.length()

    def __iter__(self):
        for hit in self._hits:
            yield LuceneHit(hit, self._indexer)

    def __getitem__(self, item):
        # The indexer argument was missing here, although Hit.__init__
        # requires it, so subscripting always raised TypeError.
        return LuceneHit(self._hits.__getitem__(item), self._indexer)
|
||||
|
||||
|
||||
class LuceneHit(Hit):
    """A single search hit; `self.data` is the object handed to the
    constructor by LuceneResultSet."""

    def __getitem__(self, item):
        """Delegate subscripting to the wrapped hit."""
        lookup = self.data.__getitem__
        return lookup(item)

    def get_pk(self):
        """Return the value stored under the 'pk' field of the hit."""
        # FIXME: Hardcoded 'pk' field.
        stored_pk = self.data.get('pk')
        return stored_pk

    def get_score(self):
        """Return the score reported by the wrapped hit."""
        hit_score = self.data.getScore()
        return hit_score

    score = property(get_score)
|
27
django/contrib/search/models.py
Normal file
27
django/contrib/search/models.py
Normal file
@ -0,0 +1,27 @@
|
||||
from django.db import models
|
||||
|
||||
# Note: These aren't used yet, but they probably will be in the future.
|
||||
# This is because the only thing that really needs to be remembered
|
||||
# (the path to the index) is going to go in SETTINGS anyway.
|
||||
# But persistent info such as outdated rows, search statistics, etc.
|
||||
# could still be useful.
|
||||
|
||||
class Index(models.Model):
    """One search index, identified by the name of the model it covers."""
    model_name = models.CharField(maxlength=255)
|
||||
|
||||
class IndexedField(models.Model):
    """One indexed field, stored as an object path, belonging to an Index."""
    object_path = models.CharField(maxlength=255)
    # Despite its name, this points at the Index row, not at a model class.
    model = models.ForeignKey('Index')
|
||||
|
||||
class QueryLog(models.Model):
    """This is not a full log, but merely counts queries."""
    # Each distinct query text gets one row (the column is unique).
    query = models.CharField(maxlength=255, unique=True)
    query_count = models.IntegerField(default=1)
    # The field class must be reached through `models`; the bare name
    # DateTimeField is not defined in this module and raised NameError as
    # soon as the module was imported.
    last_date = models.DateTimeField()
    last_source = models.CharField("Some identifier for who sent the query", maxlength=255)
|
||||
|
||||
class Person(models.Model):
    """Sample model that exists only so the indexers can be tested."""
    # Two short name columns plus one free-text column.
    first_name = models.CharField(maxlength=30)
    last_name = models.CharField(maxlength=30)
    description = models.TextField()
|
36
django/contrib/search/query.py
Normal file
36
django/contrib/search/query.py
Normal file
@ -0,0 +1,36 @@
|
||||
class QueryParser(object):
    """Placeholder for a query language shared by all the backends."""
    # TODO: Make a common query language for all the backends.
|
||||
|
||||
|
||||
class ResultSet(object):
    """Interface of the object returned by a search.

    Every method raises NotImplementedError here; backends override them.
    """

    def __iter__(self):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

    def __getitem__(self, item):
        # The index parameter was missing, so subscripting raised TypeError
        # instead of the intended NotImplementedError.
        raise NotImplementedError
|
||||
|
||||
|
||||
class Hit(object):
    """One search result.

    `data` is the backend-specific hit and `indexer` the Indexer that
    produced it; the model is taken from the indexer.  Subclasses provide
    get_pk() and get_score().
    """

    def __init__(self, data, indexer):
        self.indexer = indexer
        self.model = indexer.model
        self.data = data

    def get_instance(self):
        """Fetch the model instance whose primary key equals get_pk(), after
        converting the key with the primary-key field's to_python()."""
        pk_field = self.model._meta.pk
        pk = pk_field.to_python(self.get_pk())
        return self.model.objects.get(**{pk_field.name: pk})

    instance = property(get_instance)

    def get_pk(self):
        """Return the primary key stored with the hit (backend-specific)."""
        raise NotImplementedError

    def get_score(self):
        """Return the score of the hit (backend-specific)."""
        raise NotImplementedError

    # __repr__ reads self.score, which the base class never defined.  The
    # lambda dispatches to the subclass's get_score(), so overriding that
    # method is enough; subclasses may still define their own property.
    score = property(lambda self: self.get_score())

    def __repr__(self):
        return "<%s: %s %s, Score: %s>" % (self.__class__.__name__,
                                           self.model._meta,
                                           self.get_pk(), self.score)
|
1
django/contrib/search/views.py
Normal file
1
django/contrib/search/views.py
Normal file
@ -0,0 +1 @@
|
||||
# Create your views here.
|
64
django/contrib/search/xapian.py
Normal file
64
django/contrib/search/xapian.py
Normal file
@ -0,0 +1,64 @@
|
||||
from datetime import datetime
from itertools import imap

import xapwrap.document
import xapwrap.index

from django.db import models

from base import Indexer
from query import ResultSet, Hit
|
||||
|
||||
# TODO: This is incomplete.
|
||||
|
||||
class XapianIndexer(Indexer):
    """Indexer backed by a Xapian index, accessed through xapwrap.

    Still incomplete.
    """

    def update(self, documents=None):
        """Index `documents`, or every instance of the model if None.

        Named fields become sort keys and unnamed fields become text
        fields; the primary key value is used as the document uid.
        """
        idx = xapwrap.index.Index(self.path, True)
        try:
            if documents is None:
                update_queue = self.model.objects.all()
            else:
                update_queue = documents

            # Iterate over the queue, not `documents`, which may be None.
            for row in update_queue:
                # Values are read from the row itself; reading them from
                # the model class yielded the same thing for every row.
                keys = [xapwrap.document.SortKey(name, getattr(row, field.name))
                        for name, field in self.attr_fields.iteritems()]
                # `fields` used to be undefined here.
                # NOTE(review): assumes TextField accepts the text as its
                # only argument -- confirm against xapwrap.
                fields = [xapwrap.document.TextField(str(getattr(row, field.name)))
                          for field in self.text_fields]

                d = xapwrap.document.Document(textFields=fields, sortFields=keys,
                                              uid=row._get_pk_val())
                idx.index(d)
        finally:
            # Release the index even if a row cannot be indexed.
            idx.close()

    def search(self, query, order_by='RELEVANCE'):
        """Search the index and return a XapianResultSet.

        `order_by` is 'RELEVANCE' or a sort key; a leading '-' on a string
        sort key selects descending order.
        """
        # `Index` alone is not a name in this module.
        idx = xapwrap.index.Index(self.path)
        if order_by == 'RELEVANCE':
            results = idx.search(query, sortByRelevence=True)
        else:
            ascending = True
            if isinstance(order_by, basestring):
                ascending = not order_by.startswith('-')
                # lstrip cannot fail on '' or on a value made of signs only.
                order_by = order_by.lstrip('+-')
            results = idx.search(query, order_by, sortAscending=ascending)
        # XapianResultSet requires the indexer as well as the hits.
        return XapianResultSet(results, self)
|
||||
|
||||
|
||||
class XapianResultSet(ResultSet):
    """Wraps the sequence of hits returned by a search."""

    def __init__(self, hits, indexer):
        # `indexer` is handed on to every XapianHit created from `hits`.
        self._hits = hits
        self._indexer = indexer

    def __len__(self):
        return len(self._hits)

    def __iter__(self):
        # A stray ')' in this loop header made the whole module a
        # SyntaxError.
        for hit in self._hits:
            yield XapianHit(hit, self._indexer)

    def __getitem__(self, item):
        return XapianHit(self._hits[item], self._indexer)
|
||||
|
||||
|
||||
class XapianHit(Hit):
    """A single hit; `self.data` is the mapping handed to the constructor.

    Derives from Hit so that it accepts the (hit, indexer) arguments it is
    created with; as a plain `object` subclass it could not.
    """

    def get_pk(self):
        # NOTE(review): assumes the hit mapping has a 'pk' key -- confirm
        # against what xapwrap's search() actually returns.
        return self.data['pk']

    def get_score(self):
        return self.data['score']

    score = property(get_score)
|
||||
|
Loading…
x
Reference in New Issue
Block a user