1
0
mirror of https://github.com/django/django.git synced 2025-07-04 09:49:12 +00:00

[search-api] Initial commit, Lucene working, Xapian and Hype almost working, needs polish.

git-svn-id: http://code.djangoproject.com/svn/django/branches/search-api@3636 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
Brian Beck 2006-08-21 19:02:19 +00:00
parent 682aed446b
commit 91790e27cd
10 changed files with 567 additions and 0 deletions

View File

View File

@ -0,0 +1,19 @@
# The default indexer has no third-party dependency, so it is imported
# unconditionally.
from default import DefaultIndexer

# Each remaining backend wraps an optional third-party library. A failed import
# only disables that backend: a notice is printed and loading continues.
try:
    from xapian import XapianIndexer
except ImportError:
    print("Xapian backend will not be available due to an ImportError. "
          "Do you have Xapian and Xapwrap installed?")

try:
    from lucene import LuceneIndexer
except ImportError:
    print("Lucene backend will not be available due to an ImportError. "
          "Do you have Lucene and PyLucene installed?")

try:
    from hype import HypeIndexer
except ImportError:
    print("Hyper Estraier backend will not be available due to an ImportError. "
          "Do you have Hyper Estraier and Hype installed?")

View File

@ -0,0 +1,214 @@
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
import sys
# For Python 2.3, which has no builtin `set`.
# Probe the name itself instead of `hasattr(__builtins__, 'set')`: in CPython
# `__builtins__` is a plain dict inside imported modules, so the hasattr() test
# is False there and the `sets` fallback would be taken even when the builtin
# exists.
try:
    set
except NameError:
    from sets import Set as set
# FIXME: Methods that accept a field parameter claim to accept Field instances
# or strings giving the object path. However, since there is no Field
# attribute giving the Model it is bound to, these methods only work for
# strings at the moment. This doesn't really affect the ease of use of the
# library, as strings are actually easier to use.
def str_to_field(string, namespace=None):
    """Resolve a dotted object path to a model Field.

    Each component of `string` is looked up in turn, starting in `namespace`.
    Once a Model class (or a ForeignKey, whose related model is then used) is
    reached, the next component is looked up with that model's
    `_meta.get_field`.

    Example: 'Person.first_name' -> Person._meta.get_field('first_name')

    If the path ends on a Model class, that model's primary key field is
    returned.

    `namespace` is the dict-like object the first component is looked up in.
    When it is None the caller's globals are used (via sys._getframe), so
    that e.g. 'models.Person.first_name' refers to the caller's `models`
    module rather than the django.db.models imported by this file.

    The returned Field gets a `_model` attribute set to the last model seen
    along the path. Raises ValueError if the path does not end on a Field;
    lookup errors raised by the individual getters propagate unchanged.
    """
    if namespace is None:
        # FIXME: sys._getframe hack. This must stay in this function body:
        # frame 1 is whoever called str_to_field.
        namespace = sys._getframe(1).f_globals

    current = namespace
    lookup = namespace.__getitem__
    owner = None
    part = None

    for part in string.split('.'):
        # No try/except on purpose: KeyError, AttributeError and
        # FieldDoesNotExist are descriptive enough on their own.
        current = lookup(part)
        if isinstance(current, models.base.ModelBase):
            owner = current
            lookup = owner._meta.get_field
            continue
        if isinstance(current, models.fields.related.ForeignKey):
            owner = current.rel.to
            lookup = owner._meta.get_field
            continue
        # TODO: The rest of these could be more type-smart...
        if hasattr(current, '__getitem__'):
            lookup = current.__getitem__
        elif hasattr(current, '__getattribute__'):
            lookup = current.__getattribute__
        else:
            lookup = current.__getattr__

    # A path that stops at a model means "that model's primary key".
    if isinstance(current, models.base.ModelBase):
        owner = current
        current = current._meta.pk

    if not isinstance(current, models.Field):
        raise ValueError("%r is not a Field object! (%r -> %r)" % \
                         (part, string, current))

    # FIXME: A Field does not expose the Model it belongs to, so the model is
    # stashed on the field here. Hopefully setting this attribute won't mess
    # anything up...
    current._model = owner
    return current
class Indexer(object):
    """Backend-independent base class for search indexers.

    Tracks which model fields are indexed: unnamed fields are kept in the
    set `text_fields`, named fields in the dict `attr_fields`
    (name -> Field). Backends subclass this and implement `search`, `index`
    and `update`; they may override `_prepare_path`.
    """

    def __init__(self, path, model, fields=None, attributes=None, namespace=None, **kwargs):
        """Initialize an Indexer whose index data is stored at `path`.

        `model` is the Model (or a dotted string name of the model, resolved
        in `namespace`) whose instances will be used as documents. Fields
        from other models can still be used in the index, but this model is
        the one returned from search results.

        `fields` may be optionally initialized as an iterable of unnamed
        Fields.

        `attributes` may be optionally initialized as a mapping of field
        names to Fields. Extra keyword arguments are treated the same way;
        on a name clash the entry in `attributes` wins.

        `namespace` is the dict-like object in which fields passed as object
        paths will be searched. If None, the caller's global namespace is
        used, thanks to the sys._getframe hack.

        Example: If `fields` is ['models.Person.first_name'], it is important
        that namespace['models'] refers to the intended module and NOT the
        django.db.models module imported here.

        The model's primary key is always indexed: if it is not among the
        given fields it is added under the name 'pk'.
        """
        if fields is None:
            fields = []
        if attributes is None:
            attributes = kwargs
        else:
            # `attributes` should take precedence to `kwargs`.
            kwargs.update(attributes)
            attributes = kwargs
        if namespace is None:
            # FIXME: This uses the sys._getframe hack to get the caller's
            # namespace. It must stay in this method body (frame 1 == caller).
            namespace = sys._getframe(1).f_globals

        if isinstance(model, basestring):
            # Honour the documented "string name of the model" form: look up
            # the first component in the namespace, then follow attributes.
            parts = model.split('.')
            model = namespace[parts[0]]
            for part in parts[1:]:
                model = getattr(model, part)

        self._prepare_path(path)

        self.path = path
        self.model = model
        self.text_fields = set([])
        self.attr_fields = {}

        for field in fields:
            self.add_field(field, namespace=namespace)

        for name, field in attributes.items():
            self.add_field(field, name, namespace=namespace)

        pk = self.model._meta.pk
        pk._model = self.model
        if pk not in self.text_fields and pk not in set(self.attr_fields.values()):
            self.add_field(pk, 'pk', namespace=namespace)

    def add_field(self, field, name=None, namespace=None):
        """Add the given field to the Indexer, where `field` is either
        an object path string or a Field instance. If `name` is None (or
        empty), the field is added to self.text_fields, otherwise it is
        stored in self.attr_fields under the given name.

        `namespace` has the same meaning as in __init__.
        """
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        if namespace is None:
            namespace = sys._getframe(1).f_globals

        # FIXME: Detect duplicates, or user-knows-best?
        if isinstance(field, basestring):
            field = str_to_field(field, namespace)

        if name:
            self.attr_fields[name] = field
        else:
            self.text_fields.add(field)

    def remove_field(self, field=None, name=None, find_name=True, namespace=None):
        """Remove the given field from the Indexer, where `field` is either
        an object path string or a Field instance. If `name` is given,
        the field with that name is removed. If both `field` and `name`
        are given, both are removed.

        If `find_name` is True, the named fields in self.attr_fields are
        also searched for `field` (compared by identity); otherwise only
        self.text_fields is searched.

        Unknown names and fields are ignored.

        `namespace` has the same meaning as in __init__.
        """
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        if namespace is None:
            namespace = sys._getframe(1).f_globals

        if name:
            # No early return here: a `field` passed alongside `name` must
            # still be processed, as the docstring promises.
            self.attr_fields.pop(name, None)

        if field:
            if isinstance(field, basestring):
                field = str_to_field(field, namespace)
            self.text_fields.discard(field)
            if find_name:
                # Iterate over a snapshot because entries are deleted below.
                for attr_name, attr_field in list(self.attr_fields.items()):
                    # TODO: Make sure identity is correct here
                    if attr_field is field:
                        del self.attr_fields[attr_name]

    def search(self, query_string, sortBy=None):
        """Query the index for `query_string` and return the matching hits.

        `sortBy` can have the same values as Model.objects.order_by, with
        'SCORE' being the default.

        Must be implemented by backends; the base class raises
        NotImplementedError.
        """
        raise NotImplementedError

    def index(self, document):
        """Add a single `document` to the index. Implemented by backends."""
        raise NotImplementedError

    def update(self, force=False):
        """Bring the index up to date. Implemented by backends."""
        raise NotImplementedError

    def _prepare_path(self, path):
        """Hook called by __init__ with `path` before any attribute is set.

        The base implementation does nothing.
        """
        pass
def test_indexer():
    """Smoke-test the field bookkeeping of Indexer.

    Builds an Indexer over a throwaway Person model with one text field and
    two named fields, checks they were registered, removes two of them again
    and returns the Indexer.
    """
    # Note: I'm not very good at writing tests.
    class Person(models.Model):
        first_name = models.CharField(maxlength=30)
        last_name = models.CharField(maxlength=30)
        description = models.TextField()

    get_field = Person._meta.get_field

    # Object paths are resolved against this function's locals, where
    # Person lives.
    indexer = Indexer('', Person, ['Person.description'],
                      {'first': 'Person.first_name'},
                      last='Person.last_name', namespace=locals())

    assert get_field('description') in indexer.text_fields
    expected_named = set([get_field('first_name'), get_field('last_name')])
    assert expected_named == set(indexer.attr_fields.values())
    assert 'first' in indexer.attr_fields and 'last' in indexer.attr_fields

    indexer.remove_field('Person.description', namespace=locals())
    assert not indexer.text_fields

    indexer.remove_field(name='last')
    assert 'last' not in indexer.attr_fields

    print("Test succeeded.")
    return indexer

View File

@ -0,0 +1,9 @@
from base import Indexer
# This is the future home of a pure-Python text indexer.
# Alec Thomas has created a built-in indexer for his library here:
# http://swapoff.org/wiki/pyndexter
class DefaultIndexer(Indexer):
    """Placeholder for a future pure-Python indexer.

    Adds nothing to Indexer yet, so search/index/update still raise
    NotImplementedError.
    """

View File

@ -0,0 +1,35 @@
from base import Indexer
from query import ResultSet, Hit
import hype
# TODO: This is very incomplete.
class HypeIndexer(Indexer):
    """Indexer backend on top of the `hype` (Hyper Estraier) bindings.

    TODO: This is very incomplete.
    """

    def __init__(self, *args, **kwargs):
        """Set up the Indexer state, then open the Hype database at self.path.

        Accepts the same arguments as Indexer.__init__.
        """
        # Local import: this module does not import sys at file level.
        import sys
        # Indexer.__init__ looks one frame up for the default namespace, which
        # would be this method. Supply our caller's globals instead, unless a
        # namespace was passed explicitly (keyword, or 5th positional).
        if len(args) < 5 and kwargs.get('namespace') is None:
            kwargs['namespace'] = sys._getframe(1).f_globals
        # super() must name this class; super(Indexer, self) would skip
        # Indexer.__init__ entirely and leave self.path unset.
        super(HypeIndexer, self).__init__(*args, **kwargs)
        self.db = hype.Database(self.path, hype.ESTDBWRITER | hype.ESTDBCREAT)

    def index(self, row):
        """Build a Hype document for `row`, keyed by its primary key.

        NOTE(review): unfinished. add_text() is called without any text and
        the document is never handed to self.db; complete this against the
        Hype API.
        """
        document = hype.Document()
        document['@pk'] = row._get_pk_val()
        document.add_text()

    def search(self, query_string, sortBy=None):
        """Search the database for `query_string`; return a HypeResultSet.

        `sortBy` is accepted for interface compatibility but not used yet.
        """
        searcher = self.db.search(query_string)
        return HypeResultSet(searcher, self)

    def close(self):
        """Close the underlying Hype database."""
        self.db.close()


class HypeResultSet(ResultSet):
    """Result set wrapping the object returned by the Hype database search."""

    def __init__(self, hits, indexer):
        # Stored under the names __len__ and __iter__ read below.
        self._hits = hits
        self._indexer = indexer

    def __len__(self):
        return len(self._hits)

    def __iter__(self):
        for hit in self._hits:
            yield HypeHit(hit, self._indexer)


class HypeHit(Hit):
    """A single Hype search result.

    NOTE(review): does not override get_pk() or define a score yet.
    """

View File

@ -0,0 +1,162 @@
from base import Indexer
from query import ResultSet, Hit
from itertools import imap
import os, sys
import PyLucene
# WARNING!*
# PyLucene wants you to use PyLucene.PythonThread for threading.
# Look at samples/ThreadIndexFiles.py bundled with PyLucene.
# * I'm not sure how important this is.
# TODO: Make Lucene aware of field types.
# Here's how to use me:
#
# class Person(models.Model):
# first_name = models.CharField(maxlength=30)
# last_name = models.CharField(maxlength=30)
# biography = models.TextField()
#
# indexer = LuceneIndexer('/tmp/lucene-index', Person, ['Person.biography'],
# {'first': 'Person.first_name',
# 'last': 'Person.last_name'})
# indexer.update() # Note, calling this multiple times without clearing old
# # entries will cause duplicates in the index.
# indexer.search("brian -last:beck")
class LuceneIndexer(Indexer):
    """Indexer backend that keeps its index in a PyLucene directory.

    Named fields (attr_fields) become Lucene fields of the same name; all
    unnamed text fields are joined into one field called 'contents'.
    """

    def __init__(self, *args, **kwargs):
        """Accepts the same arguments as Indexer.__init__."""
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        # Indexer.__init__ looks one frame up, which would be this method, so
        # the caller's globals are passed down explicitly. A namespace the
        # caller supplied (keyword, or 5th positional) is left untouched.
        if len(args) < 5 and kwargs.get('namespace') is None:
            kwargs['namespace'] = sys._getframe(1).f_globals
        super(LuceneIndexer, self).__init__(*args, **kwargs)
        self.writer_closed = True

    def _prepare_path(self, path):
        """Create the Lucene directory object for `path`."""
        # Lucene wants an abstraction of the directory.
        # Should look into storage in a Model-compatible database in the future...
        # NOTE(review): the second argument is presumably Lucene's `create`
        # flag; confirm whether True discards an existing index.
        self._store = PyLucene.FSDirectory.getDirectory(path, True)

    def _ensure_writer(self):
        """Open the writer if it is closed; return True if this call opened it."""
        if self.writer_closed:
            self.open_writer()
            return True
        return False

    def update(self, documents=None):
        """Re-index `documents`, or every instance of self.model if None.

        Each document is deleted from the index and then indexed again. If
        the writer was closed on entry it is closed again on exit, even when
        indexing fails.
        """
        opened_here = self._ensure_writer()
        try:
            if documents is None:
                update_queue = self.model.objects.all()
            else:
                update_queue = documents

            for document in update_queue:
                self.delete(document)
                self.index(document)
        finally:
            if opened_here:
                self.close_writer()

    def clear(self):
        """Delete every document, addressing them by number 0..docCount()-1.

        NOTE(review): relies on the writer exposing deleteDocument(int);
        confirm against the PyLucene version in use.
        """
        opened_here = self._ensure_writer()
        try:
            for i in xrange(self._writer.docCount()):
                self._writer.deleteDocument(i)
        finally:
            if opened_here:
                self.close_writer()

    def delete(self, row):
        """Delete the documents whose 'pk' term equals the primary key of `row`."""
        reader = PyLucene.IndexReader.open(self.path)
        try:
            reader.deleteDocuments(PyLucene.Term('pk', str(row._get_pk_val())))
        finally:
            # Always release the reader, even if the delete fails.
            reader.close()

    def open_writer(self):
        """Open an IndexWriter on the store and mark the writer as open."""
        self.writer_closed = False
        # NOTE(review): the third argument is presumably Lucene's `create`
        # flag; confirm whether True discards an existing index.
        self._writer = PyLucene.IndexWriter(self._store, PyLucene.StandardAnalyzer(), True)
        self._writer.setMaxFieldLength(1048576) # Max number of tokens stored per field?

    def close_writer(self):
        """Optimize the index, close the writer and mark it as closed."""
        self._writer.optimize()
        self._writer.close()
        self.writer_closed = True

    def index(self, row):
        """Add one model instance `row` to the index.

        If the writer was closed on entry it is closed again on exit, even
        when building or adding the document fails.
        """
        opened_here = self._ensure_writer()
        try:
            document = PyLucene.Document()

            for name, field in self.attr_fields.iteritems():
                # FIXME: Assumes no Foreign Keys! Lame!
                value = getattr(row, field.name)
                document.add(PyLucene.Field(name, str(value),
                                            PyLucene.Field.Store.YES,
                                            PyLucene.Field.Index.TOKENIZED))

            # Lucene only seems to support one 'default' field.
            # However, we might want multiple fields to be searched
            # by default. Hopefully just joining their contents with
            # newlines solves this.
            contents = '\n'.join([str(getattr(row, field.name))
                                  for field in self.text_fields])

            # FIXME: Hardcoded 'contents' field.
            document.add(PyLucene.Field('contents', contents,
                                        PyLucene.Field.Store.YES,
                                        PyLucene.Field.Index.TOKENIZED))

            self._writer.addDocument(document)
        finally:
            if opened_here:
                self.close_writer()

    def search(self, query_string, default_field='contents', order_by='RELEVANCE'):
        """Parse `query_string` and return a LuceneResultSet of the hits.

        `default_field` is the field searched by terms without a prefix.

        `order_by` is 'SCORE', 'INDEX', 'RELEVANCE', or a field name with an
        optional leading '+' or '-'; a leading '-' reverses the sort.

        Raises ValueError if `order_by` contains no field name.
        """
        searcher = PyLucene.IndexSearcher(self._store)
        analyzer = PyLucene.StandardAnalyzer()
        query = PyLucene.QueryParser(default_field, analyzer).parse(query_string)

        if order_by == 'SCORE':
            sort = PyLucene.Sort(PyLucene.SortField.FIELD_SCORE)
        elif order_by == 'INDEX':
            sort = PyLucene.Sort.INDEXORDER
        elif order_by == 'RELEVANCE':
            sort = PyLucene.Sort.RELEVANCE
        else:
            reverse = order_by.startswith('-')
            # lstrip() copes with '' and sign-only strings, where indexing
            # order_by[0] in a loop would raise IndexError.
            field_name = order_by.lstrip('+-')
            if not field_name:
                raise ValueError("order_by does not name a field: %r" % (order_by,))
            sort = PyLucene.Sort(PyLucene.SortField(field_name, reverse))

        hits = searcher.search(query, sort)
        return LuceneResultSet(hits, self)
class LuceneResultSet(ResultSet):
    """Result set over the hits object returned by the Lucene searcher."""

    def __init__(self, hits, indexer):
        """Keep the raw `hits` and the `indexer` that produced them."""
        self._hits = hits
        self._indexer = indexer

    def __len__(self):
        return self._hits.length()

    def __iter__(self):
        for hit in self._hits:
            yield LuceneHit(hit, self._indexer)

    def __getitem__(self, item):
        # Hit.__init__ takes (data, indexer); without the indexer this call
        # raises TypeError.
        return LuceneHit(self._hits.__getitem__(item), self._indexer)
class LuceneHit(Hit):
    """A single Lucene search result; `self.data` is the raw Lucene hit."""

    def __getitem__(self, item):
        """Delegate item access to the underlying hit."""
        raw_hit = self.data
        return raw_hit.__getitem__(item)

    def get_pk(self):
        """Return the value stored in the hit's 'pk' field."""
        # FIXME: Hardcoded 'pk' field.
        raw_hit = self.data
        return raw_hit.get('pk')

    def get_score(self):
        """Return the score reported by the underlying hit."""
        raw_hit = self.data
        return raw_hit.getScore()

    score = property(get_score)

View File

@ -0,0 +1,27 @@
from django.db import models
# Note: These aren't used yet, but they probably will be in the future.
# This is because the only thing that really needs to be remembered
# (the path to the index) is going to go in SETTINGS anyway.
# But persistent info such as outdated rows, search statistics, etc.
# could still be useful.
class Index(models.Model):
    """An index, identified by the name of its model (not used yet)."""

    model_name = models.CharField(maxlength=255)
class IndexedField(models.Model):
    """One indexed field, stored as an object path (not used yet)."""

    object_path = models.CharField(maxlength=255)
    # Despite the attribute name, this points at the Index row.
    model = models.ForeignKey('Index')
class QueryLog(models.Model):
    """This is not a full log, but merely counts queries."""

    # One row per distinct query string.
    query = models.CharField(maxlength=255, unique=True)
    query_count = models.IntegerField(default=1)
    # Only `models` is imported in this file, so the field class has to be
    # qualified; a bare DateTimeField() raises NameError at import time.
    last_date = models.DateTimeField()
    last_source = models.CharField("Some identifier for who sent the query", maxlength=255)
class Person(models.Model):
    """Sample model, present for testing only."""

    first_name = models.CharField(maxlength=30)
    last_name = models.CharField(maxlength=30)
    description = models.TextField()

View File

@ -0,0 +1,36 @@
class QueryParser(object):
    """Placeholder for a query parser shared by all backends.

    TODO: Make a common query language for all the backends.
    """
class ResultSet(object):
    """Interface for backend search results.

    Every method raises NotImplementedError here; backends override them.
    """

    def __iter__(self):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

    def __getitem__(self, item):
        # `item` is required by the indexing protocol: without it,
        # `results[0]` fails with TypeError before reaching this line.
        raise NotImplementedError
class Hit(object):
    """One search result.

    `data` is the backend's raw result object and `indexer` the Indexer that
    produced it; the indexer's model is kept as `self.model`. Subclasses
    implement get_pk() and may provide a `score` attribute.
    """

    def __init__(self, data, indexer):
        self.indexer = indexer
        self.model = indexer.model
        self.data = data

    def get_instance(self):
        """Fetch the model instance this hit refers to.

        The value from get_pk() is converted with the primary key field's
        to_python() and used to look the instance up through
        self.model.objects.get().
        """
        name = self.model._meta.pk.name
        pk = self.model._meta.pk.to_python(self.get_pk())
        return self.model.objects.get(**{name: pk})

    instance = property(get_instance)

    def get_pk(self):
        """Return the primary key stored with the hit. Implemented by backends."""
        raise NotImplementedError

    def __repr__(self):
        # `score` is only defined by some subclasses; fall back to None so
        # repr() never fails with AttributeError.
        return "<%s: %s %s, Score: %s>" % (self.__class__.__name__,
                                           self.model._meta,
                                           self.get_pk(),
                                           getattr(self, 'score', None))

View File

@ -0,0 +1 @@
# Create your views here.

View File

@ -0,0 +1,64 @@
from datetime import datetime
from itertools import imap

from django.db import models
import xapwrap.document
import xapwrap.index

from base import Indexer
from query import Hit, ResultSet
# TODO: This is incomplete.
class XapianIndexer(Indexer):
    """Indexer backend on top of xapwrap.

    TODO: This is incomplete.
    """

    def update(self, documents=None):
        """Index `documents`, or every instance of self.model if None.

        Named fields become xapwrap sort keys, unnamed text fields become
        text fields, and the row's primary key is used as the document uid.
        The index is closed again even if indexing fails.
        """
        idx = xapwrap.index.Index(self.path, True)
        try:
            if documents is None:
                update_queue = self.model.objects.all()
            else:
                update_queue = documents

            # Iterate the resolved queue; `documents` itself may be None.
            for row in update_queue:
                # Values come from the row being indexed, not the model class.
                keys = [xapwrap.document.SortKey(name, getattr(row, field.name))
                        for name, field in self.attr_fields.iteritems()]
                # NOTE(review): assumes xapwrap's TextField accepts the text
                # as its only argument - confirm against xapwrap.document.
                texts = [xapwrap.document.TextField(str(getattr(row, field.name)))
                         for field in self.text_fields]
                d = xapwrap.document.Document(textFields=texts, sortFields=keys,
                                              uid=row._get_pk_val())
                idx.index(d)
        finally:
            idx.close()

    def search(self, query, order_by='RELEVANCE'):
        """Search the index for `query` and return a XapianResultSet.

        `order_by` is 'RELEVANCE' or a sort key; for string keys a leading
        '-' requests descending order and leading '+'/'-' signs are removed
        before the key is passed on.
        """
        idx = xapwrap.index.Index(self.path)
        if order_by == 'RELEVANCE':
            # `sortByRelevence` is kept exactly as the original call spells it.
            results = idx.search(query, sortByRelevence=True)
        else:
            ascending = True
            if isinstance(order_by, basestring):
                ascending = not order_by.startswith('-')
                # lstrip() is safe for '' and sign-only strings, where
                # indexing order_by[0] would raise IndexError.
                order_by = order_by.lstrip('+-')
            results = idx.search(query, order_by, sortAscending=ascending)
        # XapianResultSet takes (hits, indexer).
        return XapianResultSet(results, self)
class XapianResultSet(ResultSet):
    """Result set over the hits returned by the xapwrap index search."""

    def __init__(self, hits, indexer):
        """Keep the raw `hits` and the `indexer` that produced them."""
        self._hits = hits
        self._indexer = indexer

    def __len__(self):
        return len(self._hits)

    def __iter__(self):
        # Wrap each raw hit lazily as it is iterated.
        for hit in self._hits:
            yield XapianHit(hit, self._indexer)
class XapianHit(Hit):
    """A single xapwrap search result; `self.data` is the raw hit.

    Derives from Hit so that XapianHit(hit, indexer) works and `self.data`
    is populated by Hit.__init__.
    """

    def get_pk(self):
        """Return the primary key stored in the hit."""
        # NOTE(review): documents are indexed with `uid=`; confirm the hit
        # really exposes the key as 'pk'.
        return self.data['pk']

    def get_score(self):
        """Return the hit's 'score' entry."""
        return self.data['score']

    score = property(get_score)