[search-api] Initial commit, Lucene working, Xapian and Hype almost working, needs polish.

git-svn-id: http://code.djangoproject.com/svn/django/branches/search-api@3636 bcc190cf-cafb-0310-a4f2-bffc1f526a37
2025-07-05 02:09:13 +00:00 · 2006-08-21 19:02:19 +00:00 · 2006-08-21 19:02:19 +00:00 · 91790e27cd
commit 91790e27cd
parent 682aed446b
10 changed files with 567 additions and 0 deletions
--- a/django/contrib/search/init.py
+++ b/django/contrib/search/init.py
--- a/django/contrib/search/backends.py
+++ b/django/contrib/search/backends.py
@ -0,0 +1,19 @@
 from default import DefaultIndexer
 try:
    from xapian import XapianIndexer
 except ImportError:
    print "Xapian backend will not be available due to an ImportError. " \
          "Do you have Xapian and Xapwrap installed?"
 try:
    from lucene import LuceneIndexer
 except ImportError:
    print "Lucene backend will not be available due to an ImportError. " \
          "Do you have Lucene and PyLucene installed?"
 try:
    from hype import HypeIndexer
 except ImportError:
    print "Hyper Estraier backend will not be available due to an importError. " \
          "Do you have Hyper Estraier and Hype installed?"
--- a/django/contrib/search/base.py
+++ b/django/contrib/search/base.py
@ -0,0 +1,214 @@
 from django.db import models
 from django.core.exceptions import ObjectDoesNotExist
 import sys
 # For Python 2.3, which has no builtin ``set``.
 # ``__builtins__`` is a module in ``__main__`` but a plain dict in imported
 # modules, so probing it with hasattr() always fails there. Referencing the
 # name directly works in both cases.
 try:
    set
 except NameError:
    from sets import Set as set
 # FIXME: Methods that accept a field parameter claim to accept Field instances
 # or strings giving the object path. However, since there is no Field
 # attribute giving the Model it is bound to, these methods only work for
 # strings at the moment. This doesn't really affect the ease of use of the
 # library, as strings are actually easier to use.
 def str_to_field(string, namespace=None):
    """Resolve the dotted object path `string` to a model Field.
    The path is walked one component at a time, starting in `namespace`.
    Once a Model class is reached, further components are looked up with
    its _meta.get_field; a ForeignKey continues the walk on the related
    model. A path that ends on a Model class resolves to that model's
    primary key field.
    Example: 'Person.first_name' -> Person._meta.get_field('first_name')
    `namespace` is the dict-like object in which the first component is
    looked up. If None, the caller's global namespace is used (via the
    sys._getframe hack), so that e.g. 'models.Person.first_name' finds the
    caller's models module rather than the django.db.models imported here.
    Raises ValueError if the path does not end on a Field; lookup errors
    (KeyError, AttributeError, FieldDoesNotExist) propagate unchanged.
    """
    # FIXME: This whole function is either silly or clever...
    parts = string.split('.')
    owner = None
    if namespace is None:
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        current = sys._getframe(1).f_globals
    else:
        current = namespace
    lookup = current.__getitem__
    for objName in parts:
        # No try/except here on purpose: the exceptions raised by the
        # individual lookups are already descriptive.
        current = lookup(objName)
        if isinstance(current, models.base.ModelBase):
            owner = current
            lookup = owner._meta.get_field
        elif isinstance(current, models.fields.related.ForeignKey):
            owner = current.rel.to
            lookup = owner._meta.get_field
        # TODO: The rest of these could be more type-smart...
        elif hasattr(current, '__getitem__'):
            lookup = current.__getitem__
        elif hasattr(current, '__getattribute__'):
            lookup = current.__getattribute__
        else:
            lookup = current.__getattr__
    # A bare model path stands for that model's primary key.
    if isinstance(current, models.base.ModelBase):
        owner = current
        current = current._meta.pk
    if not isinstance(current, models.Field):
        raise ValueError("%r is not a Field object! (%r -> %r)" % \
                         (objName, string, current))
    # FIXME: A Field has no reference back to its Model, so the owning model
    # is stashed on the field itself. Hopefully this doesn't mess anything up.
    current._model = owner
    return current
 class Indexer(object):
    """Base class for search-index backends.
    Keeps track of which model fields feed the index: `text_fields` is a
    set of unnamed fields and `attr_fields` maps a name to a field.
    Backends override search(), index(), update() and, when they need to
    set up storage, _prepare_path().
    """
    def __init__(self, path, model, fields=None, attributes=None, namespace=None, **kwargs):
        """Initialize an Indexer whose index data is stored at `path`.
        `model` is the Model (or string name of the model) whose instances will
        be used as documents. Note that fields from other models can still be
        used in the index, but this model will be the one returned from search
        results.
        `fields` may be optionally initialized as an iterable of unnamed Fields.
        `attributes` may be optionally initialized as a mapping of field names
        to Fields. Extra keyword arguments are treated as further named
        fields; on a name clash `attributes` wins.
        `namespace` is the dict-like object in which fields passed as object
        paths will be searched. If None, the caller's global namespace will be
        used, thanks to the sys._getframe hack.
        Example: If `fields` is ['models.Person.first_name'], it is important
        that namespace['models'] refers to the intended module and NOT the
        django.db.models module imported here.
        The model's primary key is registered under the name 'pk' unless it
        was already added as a field.
        """
        if fields is None:
            fields = []
        if attributes is None:
            attributes = kwargs
        else:
            # `attributes` should take precedence to `kwargs`.
            kwargs.update(attributes)
            attributes = kwargs
        if namespace is None:
            # FIXME: This uses the sys._getframe hack to get the caller's namespace.
            namespace = sys._getframe(1).f_globals
        if isinstance(model, basestring):
            # Honour the documented "string name of the model" form:
            # str_to_field resolves a model path to its pk field and records
            # the owning model on it as `_model`.
            model = str_to_field(model, namespace)._model
        self._prepare_path(path)
        self.path = path
        self.model = model
        self.text_fields = set([])
        self.attr_fields = {}
        for field in fields:
            self.add_field(field, namespace=namespace)
        for name, field in attributes.iteritems():
            self.add_field(field, name, namespace=namespace)
        pk = self.model._meta.pk
        pk._model = self.model
        if pk not in self.text_fields and pk not in set(self.attr_fields.values()):
            self.add_field(pk, 'pk', namespace=namespace)
    def add_field(self, field, name=None, namespace=None):
        """Add the given field to the Indexer, where `field` is either
        an object path string or a Field instance. If `name` is None,
        the field will be added to self.text_fields, otherwise it will be
        added to self.attr_fields with the given name.
        `namespace` has the same meaning as in __init__.
        """
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        if namespace is None:
            namespace = sys._getframe(1).f_globals
        # FIXME: Detect duplicates, or user-knows-best?
        if isinstance(field, basestring):
            field = str_to_field(field,  namespace)
        if name:
            self.attr_fields[name] = field
        else:
            self.text_fields.add(field)
    def remove_field(self, field=None, name=None, find_name=True, namespace=None):
        """Remove the given field from the Indexer, where `field` is either
        an object path string or a Field instance. If `name` is given,
        the field with that name is removed. If both `field` and `name`
        are given, both are removed if they refer to different fields.
        If `find_name` is True, the named fields in self.attr_fields are
        searched for `field`, otherwise only self.text_fields is searched.
        `namespace` has the same meaning as in __init__.
        """
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        if namespace is None:
            namespace = sys._getframe(1).f_globals
        # No early return after removing `name`: the docstring promises that
        # `field` is removed as well when both are given.
        if name and name in self.attr_fields:
            del self.attr_fields[name]
        if field:
            if isinstance(field, basestring):
                field = str_to_field(field, namespace)
            self.text_fields.discard(field)
            if find_name:
                # Iterate over a snapshot, since entries are deleted in the loop.
                for attr_name, f in list(self.attr_fields.items()):
                    # TODO: Make sure identity is correct here
                    if f is field:
                        del self.attr_fields[attr_name]
    def search(self, query_string, sortBy=None):
        """Query the index for `query_string` and return a result set.
        `sortBy` selects the ordering of the results.
        NOTE(review): the Lucene and Xapian backends call this parameter
        `order_by` and default it to 'RELEVANCE' -- the name should be
        unified once the API settles.
        """
        raise NotImplementedError
    def index(self, document):
        """Add a single document to the index. Backends must override this."""
        raise NotImplementedError
    def update(self, force=False):
        """Bring the index up to date. Backends must override this."""
        raise NotImplementedError
    def _prepare_path(self, path):
        """Hook called by __init__ before any attribute is set, so backends
        can set up storage for `path`. Does nothing by default.
        """
        pass
 def test_indexer():
    # Note: I'm not very good at writing tests.
    class Person(models.Model):
        first_name = models.CharField(maxlength=30)
        last_name = models.CharField(maxlength=30)
        description = models.TextField()
    i = Indexer('', Person, ['Person.description'], {'first': 'Person.first_name'},
                last='Person.last_name', namespace=locals())
    assert Person._meta.get_field('description') in i.text_fields
    assert set([Person._meta.get_field('first_name'),
                Person._meta.get_field('last_name')]) == \
           set(i.attr_fields.values())
    assert 'first' in i.attr_fields and 'last' in i.attr_fields
    i.remove_field('Person.description', namespace=locals())
    assert not i.text_fields
    i.remove_field(name='last')
    assert 'last' not in i.attr_fields
    print "Test succeeded."
    return i
--- a/django/contrib/search/default.py
+++ b/django/contrib/search/default.py
@ -0,0 +1,9 @@
 from base import Indexer
 # This is the future home of a pure-Python text indexer.
 # Alec Thomas has created a built-in indexer for his library here:
 #   http://swapoff.org/wiki/pyndexter
 class DefaultIndexer(Indexer):
    """Placeholder for the future pure-Python text indexer.
    Adds nothing to Indexer yet.
    """
--- a/django/contrib/search/hype.py
+++ b/django/contrib/search/hype.py
@ -0,0 +1,35 @@
 from base import Indexer
 from query import ResultSet, Hit
 import hype
 # TODO: This is very incomplete.
 class HypeIndexer(Indexer):
    """Indexer backend for Hyper Estraier through the Hype bindings.
    TODO: This is very incomplete; index() does not yet store anything.
    """
    def __init__(self, *args, **kwargs):
        """Accept the same arguments as Indexer.__init__, then open the
        database at `self.path` for writing, creating it if necessary.
        """
        import sys
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        # Without it Indexer.__init__ would resolve object-path strings in
        # this module's globals. `namespace` is the fifth positional argument
        # of Indexer.__init__ (after path, model, fields, attributes).
        if len(args) < 5 and kwargs.get('namespace') is None:
            kwargs['namespace'] = sys._getframe(1).f_globals
        # super() must name this class; naming Indexer would skip
        # Indexer.__init__ and leave self.path unset.
        super(HypeIndexer, self).__init__(*args, **kwargs)
        self.db = hype.Database(self.path, hype.ESTDBWRITER | hype.ESTDBCREAT)
    def index(self, row):
        """Build a document for `row`.
        TODO: the document receives no text and is never handed to self.db.
        """
        document = hype.Document()
        document['@pk'] = row._get_pk_val()
        document.add_text()
    def search(self, query_string, sortBy=None):
        """Run `query_string` against the database and wrap the result.
        `sortBy` is accepted for interface compatibility but not used yet.
        """
        searcher = self.db.search(query_string)
        # The result set needs the indexer to build its hits.
        return HypeResultSet(searcher, self)
    def close(self):
        """Close the underlying database."""
        self.db.close()
 class HypeResultSet(ResultSet):
    """ResultSet over the hits returned by a Hype search."""
    def __init__(self, hits, indexer=None):
        """Store the raw `hits` and the `indexer` that produced them.
        `indexer` is optional only so that one-argument construction keeps
        working; iterating requires it, since each hit reads indexer.model.
        """
        self._hits = hits
        self._indexer = indexer
    def __len__(self):
        return len(self._hits)
    def __iter__(self):
        for hit in self._hits:
            yield HypeHit(hit, self._indexer)
 class HypeHit(Hit):
    """Hit for the Hype backend; adds nothing to Hit yet."""
--- a/django/contrib/search/lucene.py
+++ b/django/contrib/search/lucene.py
@ -0,0 +1,162 @@
 from base import Indexer
 from query import ResultSet, Hit
 from itertools import imap
 import os, sys
 import PyLucene
 # WARNING!*
 # PyLucene wants you to use PyLucene.PythonThread for threading.
 # Look at samples/ThreadIndexFiles.py bundled with PyLucene.
 # * I'm not sure how important this is.
 # TODO: Make Lucene aware of field types.
 # Here's how to use me:
 #
 # class Person(models.Model):
 #     first_name = models.CharField(maxlength=30)
 #     last_name = models.CharField(maxlength=30)
 #     biography = models.TextField()
 #
 # indexer = LuceneIndexer('/tmp/lucene-index', Person, ['Person.biography'],
 #                         {'first': 'Person.first_name',
 #                          'last': 'Person.last_name'})
 # indexer.update() # Note, calling this multiple times without clearing old
 #                  # entries will cause duplicates in the index.
 # indexer.search("brian -last:beck")
 class LuceneIndexer(Indexer):
    """Indexer backend that keeps its index in a Lucene directory via PyLucene.
    Each named field in `attr_fields` becomes its own Lucene field; the
    values of all `text_fields` are joined into the single 'contents' field.
    """
    def __init__(self, *args, **kwargs):
        """Accept the same arguments as Indexer.__init__.
        Unless the caller supplies one, `namespace` defaults to the caller's
        globals, so object-path strings resolve there rather than here.
        """
        # FIXME: This uses the sys._getframe hack to get the caller's namespace.
        # Only fill it in when the caller gave none: `namespace` is the fifth
        # positional argument of Indexer.__init__ (after path, model, fields,
        # attributes), and an explicit value must not be overwritten.
        if len(args) < 5 and kwargs.get('namespace') is None:
            kwargs['namespace'] = sys._getframe(1).f_globals
        super(LuceneIndexer, self).__init__(*args, **kwargs)
        self.writer_closed = True
    def _prepare_path(self, path):
        # Lucene wants an abstraction of the directory.
        # Should look into storage in a Model-compatible database in the future...
        # NOTE(review): the second argument is presumably Lucene's `create`
        # flag -- confirm that it does not discard an existing index.
        self._store = PyLucene.FSDirectory.getDirectory(path, True)
    def update(self, documents=None):
        """Re-index `documents`, or every instance of self.model if None.
        Each document is deleted from the index before being added again.
        A writer opened here is closed again even if indexing fails.
        """
        opened_here = self.writer_closed
        if opened_here:
            self.open_writer()
        try:
            if documents is None:
                update_queue = self.model.objects.all()
            else:
                update_queue = documents
            for document in update_queue:
                self.delete(document)
                self.index(document)
        finally:
            if opened_here:
                self.close_writer()
    def clear(self):
        """Delete every document the writer reports via docCount()."""
        opened_here = self.writer_closed
        if opened_here:
            self.open_writer()
        try:
            for i in xrange(self._writer.docCount()):
                self._writer.deleteDocument(i)
        finally:
            if opened_here:
                self.close_writer()
    def delete(self, row):
        """Delete the documents whose 'pk' term matches the pk of `row`."""
        reader = PyLucene.IndexReader.open(self.path)
        try:
            reader.deleteDocuments(PyLucene.Term('pk', str(row._get_pk_val())))
        finally:
            # Release the reader even when the delete fails.
            reader.close()
    def open_writer(self):
        """Open self._writer on the store and mark it as open."""
        self.writer_closed = False
        # NOTE(review): the third argument is presumably Lucene's `create`
        # flag -- confirm that reopening does not discard existing documents.
        self._writer = PyLucene.IndexWriter(self._store, PyLucene.StandardAnalyzer(), True)
        self._writer.setMaxFieldLength(1048576) # Max number of tokens stored per field?
    def close_writer(self):
        """Optimize the index, close the writer and mark it as closed."""
        self._writer.optimize()
        self._writer.close()
        self.writer_closed = True
    def index(self, row):
        """Add `row` to the index as one Lucene document.
        Opens (and afterwards closes) the writer if it is not already open.
        """
        opened_here = self.writer_closed
        if opened_here:
            self.open_writer()
        try:
            document = PyLucene.Document()
            for name, field in self.attr_fields.iteritems():
                # FIXME: Assumes no Foreign Keys! Lame!
                value = getattr(row, field.name)
                document.add(PyLucene.Field(name, str(value),
                                            PyLucene.Field.Store.YES,
                                            PyLucene.Field.Index.TOKENIZED))
            # Lucene only seems to support one 'default' field.
            # However, we might want multiple fields to be searched
            # by default. Hopefully just joining their contents with
            # newlines solves this.
            contents = '\n'.join([str(getattr(row, field.name)) for field in \
                                  self.text_fields])
            # FIXME: Hardcoded 'contents' field.
            document.add(PyLucene.Field('contents', contents,
                                        PyLucene.Field.Store.YES,
                                        PyLucene.Field.Index.TOKENIZED))
            self._writer.addDocument(document)
        finally:
            if opened_here:
                self.close_writer()
    def search(self, query_string, default_field='contents', order_by='RELEVANCE'):
        """Parse `query_string` and return a LuceneResultSet of the hits.
        `default_field` is the field searched by unqualified query terms.
        `order_by` is 'SCORE', 'INDEX', 'RELEVANCE' or a field name; a
        leading '-' on a field name reverses the sort.
        """
        searcher = PyLucene.IndexSearcher(self._store)
        analyzer = PyLucene.StandardAnalyzer()
        query = PyLucene.QueryParser(default_field, analyzer).parse(query_string)
        if order_by == 'SCORE':
            sort_field = PyLucene.SortField.FIELD_SCORE
            sort = PyLucene.Sort(sort_field)
        elif order_by == 'INDEX':
            sort = PyLucene.Sort.INDEXORDER
        elif order_by == 'RELEVANCE':
            sort = PyLucene.Sort.RELEVANCE
        else:
            reverse = order_by.startswith('-')
            # lstrip() drops any run of leading sign characters and, unlike
            # indexing order_by[0] in a loop, cannot fail on a bare '-' or '+'.
            sort_field = PyLucene.SortField(order_by.lstrip('+-'), reverse)
            sort = PyLucene.Sort(sort_field)
        hits = searcher.search(query, sort)
        return LuceneResultSet(hits, self)
 class LuceneResultSet(ResultSet):
    """ResultSet over a PyLucene hits object; items are LuceneHit wrappers."""
    def __init__(self, hits, indexer):
        self._hits = hits
        self._indexer = indexer
    def __len__(self):
        return self._hits.length()
    def __iter__(self):
        for hit in self._hits:
            yield LuceneHit(hit, self._indexer)
    def __getitem__(self, item):
        # A hit is constructed from the raw hit AND the indexer, exactly as
        # in __iter__; omitting the indexer raised a TypeError.
        return LuceneHit(self._hits.__getitem__(item), self._indexer)
 class LuceneHit(Hit):
    """Hit backed by the raw PyLucene hit kept in `self.data`."""
    def __getitem__(self, item):
        """Delegate item lookup to the underlying hit."""
        raw = self.data
        return raw.__getitem__(item)
    def get_score(self):
        """Return the score reported by the underlying hit."""
        raw = self.data
        return raw.getScore()
    score = property(get_score)
    def get_pk(self):
        """Return the value the underlying hit stores under 'pk'."""
        # FIXME: Hardcoded 'pk' field.
        raw = self.data
        return raw.get('pk')
--- a/django/contrib/search/models.py
+++ b/django/contrib/search/models.py
@ -0,0 +1,27 @@
 from django.db import models
 # Note: These aren't used yet, but they probably will be in the future.
 # This is because the only thing that really needs to be remembered
 # (the path to the index) is going to go in SETTINGS anyway.
 # But persistent info such as outdated rows, search statistics, etc.
 # could still be useful.
 class Index(models.Model):
    """Names a model that has a search index.
    NOTE(review): per the module comment, nothing uses this table yet.
    """
    model_name = models.CharField(maxlength=255)
 class IndexedField(models.Model):
    """An object path attached to one Index row.
    NOTE(review): `object_path` presumably holds a dotted path such as
    'Person.first_name' -- confirm once this model is actually used.
    """
    object_path = models.CharField(maxlength=255)
    # Despite its name, this references an Index row, not a Model class.
    model = models.ForeignKey('Index')
 class QueryLog(models.Model):
    """This is not a full log, but merely counts queries."""
    query = models.CharField(maxlength=255, unique=True)
    query_count = models.IntegerField(default=1)
    # Only `models` is imported in this module, so the field class has to be
    # qualified; the bare name DateTimeField raised a NameError on import.
    last_date = models.DateTimeField()
    # NOTE(review): a Field's first positional argument is its verbose name;
    # this descriptive sentence would presumably fit better as help text.
    last_source = models.CharField("Some identifier for who sent the query", maxlength=255)
 class Person(models.Model):
    """Sample model that exists only for testing."""
    first_name = models.CharField(maxlength=30)
    last_name = models.CharField(maxlength=30)
    description = models.TextField()
--- a/django/contrib/search/query.py
+++ b/django/contrib/search/query.py
@ -0,0 +1,36 @@
 class QueryParser(object):
    """Placeholder for a query parser shared by all backends.
    TODO: Make a common query language for all the backends.
    """
 class ResultSet(object):
    """Interface for the collection of hits returned by a search.
    Backends must implement iteration, len() and item access.
    """
    def __iter__(self):
        raise NotImplementedError
    def __len__(self):
        raise NotImplementedError
    def __getitem__(self, item):
        # Item access always passes the index; without the parameter,
        # `result_set[0]` raised TypeError instead of NotImplementedError.
        raise NotImplementedError
 class Hit(object):
    """One search result, wrapping the backend's raw hit in `self.data`.
    Subclasses must implement get_pk() and may provide a `score` attribute.
    """
    def __init__(self, data, indexer):
        """Keep the raw hit `data`, the `indexer` and the indexer's model."""
        self.indexer = indexer
        self.model = indexer.model
        self.data = data
    def get_instance(self):
        """Fetch the model instance whose primary key equals get_pk()."""
        name = self.model._meta.pk.name
        pk = self.model._meta.pk.to_python(self.get_pk())
        return self.model.objects.get(**{name: pk})
    instance = property(get_instance)
    def get_pk(self):
        """Return the primary key stored with the hit. Must be overridden."""
        raise NotImplementedError
    def __repr__(self):
        # Not every hit class defines `score`; fall back to None so that
        # repr() does not raise AttributeError.
        score = getattr(self, 'score', None)
        return "<%s: %s %s, Score: %s>" % (self.__class__.__name__,
                                           self.model._meta,
                                           self.get_pk(), score)
--- a/django/contrib/search/views.py
+++ b/django/contrib/search/views.py
@ -0,0 +1 @@
 # Create your views here.
--- a/django/contrib/search/xapian.py
+++ b/django/contrib/search/xapian.py
@ -0,0 +1,64 @@
 from datetime import datetime
 from itertools import imap
 import xapwrap.index
 import xapwrap.document
 from django.db import models
 from base import Indexer
 # ResultSet and Hit are defined in query.py, not base.py.
 from query import ResultSet, Hit
 # TODO: This is incomplete.
 class XapianIndexer(Indexer):
    """Indexer backend for Xapian through the xapwrap wrapper (incomplete)."""
    def update(self, documents=None):
        """Index `documents`, or every instance of self.model if None.
        Each row becomes one xapwrap Document whose uid is the row's pk.
        """
        idx = xapwrap.index.Index(self.path, True)
        try:
            if documents is None:
                update_queue = self.model.objects.all()
            else:
                update_queue = documents
            # Iterate the resolved queue; `documents` itself may be None.
            for row in update_queue:
                # Values come from the row being indexed, not the model class.
                keys = [xapwrap.document.SortKey(name, getattr(row, field.name))
                        for name, field in self.attr_fields.iteritems()]
                # NOTE(review): assumes xapwrap accepts plain strings as
                # textFields -- confirm, or wrap them in TextField objects.
                texts = [str(getattr(row, field.name))
                         for field in self.text_fields]
                d = xapwrap.document.Document(textFields=texts, sortFields=keys, uid=row._get_pk_val())
                idx.index(d)
        finally:
            # Close the index even if a row fails to index.
            idx.close()
    def search(self, query, order_by='RELEVANCE'):
        """Search the index and return an XapianResultSet.
        `order_by` is 'RELEVANCE' or a sort key name; a leading '-' on the
        name requests descending order.
        """
        idx = xapwrap.index.Index(self.path)
        if order_by == 'RELEVANCE':
            results = idx.search(query, sortByRelevence=True)
        else:
            ascending = True
            if isinstance(order_by, basestring):
                ascending = not order_by.startswith('-')
                # lstrip() cannot fail on a bare '-' or '+', unlike indexing
                # order_by[0] in a loop.
                order_by = order_by.lstrip('+-')
            results = idx.search(query, order_by, sortAscending=ascending)
        # The result set needs the indexer to build its hits.
        return XapianResultSet(results, self)
 class XapianResultSet(ResultSet):
    """ResultSet over the hits returned by a xapwrap search."""
    def __init__(self, hits, indexer):
        self._hits = hits
        self._indexer = indexer
    def __len__(self):
        return len(self._hits)
    def __iter__(self):
        # A stray ')' on the `for` line made the whole module a SyntaxError.
        for hit in self._hits:
            yield XapianHit(hit, self._indexer)
 class XapianHit(Hit):
    """Hit backed by the raw xapwrap result kept in `self.data`.
    Derives from Hit so that it can be built from (hit, indexer) and so
    that `self.data` is actually set.
    """
    def get_pk(self):
        # NOTE(review): documents are indexed with `uid=`; confirm that
        # xapwrap results really expose the key as 'pk'.
        return self.data['pk']
    def get_score(self):
        """Return the value the raw result stores under 'score'."""
        return self.data['score']
    score = property(get_score)