Contrasting examples of the lucene api versus pythonic lupyne idioms.


Basic indexing and searching example adapted from http://lucene.apache.org/core/4_3_0/core/index.html

import lucene
from org.apache.lucene import analysis, document, index, queryparser, search, store, util
from lupyne import engine

### lucene ###

analyzer = analysis.standard.StandardAnalyzer(util.Version.LUCENE_CURRENT)

# Store the index in memory:
directory = store.RAMDirectory()
# To store an index on disk, use this instead:
#Directory directory = FSDirectory.open(File("/tmp/testindex"))
config = index.IndexWriterConfig(util.Version.LUCENE_CURRENT, analyzer)
iwriter = index.IndexWriter(directory, config)
doc = document.Document()
text = "This is the text to be indexed."
doc.add(document.Field("fieldname", text, document.TextField.TYPE_STORED))

# Now search the index:
ireader = index.IndexReader.open(directory)
isearcher = search.IndexSearcher(ireader)
# Parse a simple query that searches for "text":
parser = queryparser.classic.QueryParser(util.Version.LUCENE_CURRENT, "fieldname", analyzer)
query = parser.parse("text")
hits = isearcher.search(query, None, 1000).scoreDocs
assert len(hits) == 1
# Iterate through the results:
for hit in hits:
    hitDoc = isearcher.doc(hit.doc)
    assert hitDoc['fieldname'] == text

### lupyne ###

# Store the index in memory:
indexer = engine.Indexer()              # Indexer combines Writer and Searcher; RAMDirectory and StandardAnalyzer are defaults
indexer.set('fieldname', stored=True)   # settings for all documents of indexer; indexed and tokenized is the default
indexer.add(fieldname=text)             # add document
indexer.commit()                        # commit changes and refresh searcher

# Now search the index:
hits = indexer.search('text', field='fieldname')    # parsing handled if necessary
assert len(hits) == 1
for hit in hits:                                    # hits support mapping interface
    assert hit['fieldname'] == text
# closing is handled automatically


Convenient Query creation.

Operator overloading is used for combining boolean clauses.

import lucene
from org.apache.lucene import index, search
from org.apache.lucene.search import spans
from lupyne.engine import Query

### lucene ###

q1 = search.TermQuery(index.Term('text', 'lucene'))
q2 = search.PhraseQuery()
q2.add(index.Term('text', 'search'))
q2.add(index.Term('text', 'engine'))
q3 = search.BooleanQuery()
q3.add(q1, search.BooleanClause.Occur.MUST)
q3.add(q2, search.BooleanClause.Occur.MUST)
assert str(q3) == '+text:lucene +text:"search engine"'

q1 = spans.SpanTermQuery(index.Term('text', 'hello'))
q2 = spans.SpanTermQuery(index.Term('text', 'world'))
q3 = spans.SpanPositionRangeQuery(q1, 0, 10)
q4 = spans.SpanNearQuery([q1, q2], 0, True)
q5 = spans.SpanNotQuery(q3, q4)
assert str(q5).startswith('spanNot(spanPosRange(text:hello, 0, 10), spanNear([text:hello, text:world], 0, true)')

### lupyne ###

q = Query.term('text', 'lucene') & Query.phrase('text', 'search', 'engine')
assert isinstance(q, search.BooleanQuery)
assert str(q) == '+text:lucene +text:"search engine"'

q = Query.span('text', 'hello')[:10] - Query.near('text', 'hello', 'world')
assert isinstance(q, spans.SpanQuery)
assert str(q).startswith('spanNot(spanPosRange(text:hello, 0, 10), spanNear([text:hello, text:world], 0, true)')


Advanced searching with custom fields.

Prefix and Range queries are a potential pitfall in Lucene.
As the queries expand to more terms, the performance drops off precipitously.
A common example is where datetimes are indexed, but a large span of date ranges are being searched.
The usual workaround is to only index the amount of granularity needed, e.g., just the dates.
But this may not be sufficient, or the datetimes may be necessary for other searches.

The general solution is to index the term values into a prefix tree.
Then each query can expand to only values of the appropriate granularity.
Lucene's NumericFields encode numbers to be sortable, so it is also able to cluster prefixes into the same field.
Whereas Lupyne's NestedField assumes the value is already a sortable string, so different fields must be used to cluster the prefixes.
There are trade-offs to each approach:
 * NumericFields support range queries natively, but must translate prefix queries.
 * NestedFields support prefix queries optimally, but must translate range queries.
 * NumericFields only support numbers, and result in unreadable values in the index.
 * NestedFields support any searchable values, but pollute the field namespace.

Lupyne PointFields and DateTimeFields are now implemented as NumericFields since both are easily encoded as numbers.
NestedFields could still be used however, as demonstrated on dates below.

from datetime import date
import lucene
from org.apache.lucene import search
from lupyne import engine

docs = [
    {'city': 'San Francisco', 'state': 'CA', 'incorporated': '1850-04-15', 'population': 808976, 'longitude': -122.4192, 'latitude': 37.7752},
    {'city': 'Los Angeles', 'state': 'CA', 'incorporated': '1850-04-04', 'population': 3849378, 'longitude': -118.2434, 'latitude': 34.0521},
    {'city': 'Portland', 'state': 'OR', 'incorporated': '1851-02-08', 'population': 575930, 'longitude': -122.6703, 'latitude': 45.5238},

indexer = engine.Indexer()
indexer.set('city', stored=True, indexed=False)
indexer.set('state', stored=True, indexed=False)
# set method supports custom field types inheriting their default settings
indexer.set('incorporated', engine.DateTimeField)
indexer.set('year-month-day', engine.NestedField, sep='-')
indexer.set('population', engine.NumericField, type=int)
indexer.set('point', engine.PointField, precision=10)
# assigned fields can have a different key from their underlying field name
indexer.fields['location'] = engine.NestedField('state.city')

for doc in docs:
    doc['year-month-day'] = doc['incorporated']
    point = doc.pop('longitude'), doc.pop('latitude')
    location = doc['state'] + '.' + doc['city']
    incorporated = map(int, doc.pop('incorporated').split('-'))
    indexer.add(doc, location=location, incorporated=date(*incorporated), point=[point])

query = indexer.fields['incorporated'].prefix([1850])
assert query.max.doubleValue() - query.min.doubleValue() == 60 * 60 * 24 * 365
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Los Angeles']
query = indexer.fields['incorporated'].range(date(1850, 4, 10), None)
assert query.max is None
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Portland']

query = indexer.fields['year-month-day'].prefix('1850')
assert str(query) == 'year:1850*'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Los Angeles']
query = indexer.fields['year-month-day'].range('1850-04-10', None)
assert str(query) == 'year-month-day:[1850-04-10 TO *}'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Portland']

query = indexer.fields['population'].range(0, 1000000)
assert str(query) == 'population:[0 TO 1000000}'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Portland']

cities = ['San Francisco', 'Los Angeles', 'Portland']
for index, distance in enumerate([1e3, 1e5, 2e5, 1e6]):
    query = indexer.fields['point'].within(-122.4, 37.7, distance=distance)
    assert isinstance(query, search.BooleanQuery) and len(query) <= 4
    assert set(hit['city'] for hit in indexer.search(query)) == set(cities[:index])

query = indexer.fields['location'].prefix('CA.San')
# works like any prefix query
assert str(query) == 'state.city:CA.San*'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco']
query = indexer.fields['location'].prefix('CA')
# optimized to search the best field
assert str(query) == 'state:CA*'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Los Angeles']


PyLucene has several pitfalls when collecting or sorting a large query result.
Generally they involve the overhead of traversing the VM in an internal loop.

Lucene also requires supplying a maximum doc count for searches,
and supplying an excessively large count is a poor workaround because the collection heap is pre-allocated.

Finally the custom sorting interface, although well-supported in PyLucene, has horrible performance.
The sort key of every potential doc must realistically be cached,
but the overhead of O(n log n) comparison calls dispatched through the VM is far worse than iterating ScoreDocs.

To mitigate all these problems, Lupyne first provides a unified search interface.
The same Hits type is returned regardless of optional doc count or sorting parameters.
As with lucene, the result is fully evaluated but each individual Hit object will only be loaded on demand.
Internally a CachingCollector is used when all docs are requested.

The search method allows lucene Sort parameters to be passed through, since that's still optimal.
Additionally the hits themselves can be sorted afterwards with any python callable key.
The IndexSearcher.comparator method is convenient for creating a sort key table from indexed fields.
The upshot is custom sorting and sorting large results are both easier and faster.

Custom sorting isn't necessary in the below example of course, just there for demonstration.

import lucene
from org.apache.lucene import search, util
from org.apache.pylucene.search import PythonFieldComparator, PythonFieldComparatorSource
from lupyne import engine

colors = 'red', 'green', 'blue', 'cyan', 'magenta', 'yellow'
indexer = engine.Indexer()
indexer.set('color', stored=True, tokenized=False)
for color in colors:

### lucene ###

searcher = search.IndexSearcher(indexer.indexReader)
sorttype = getattr(search.SortField, 'Type', search.SortField).STRING
topdocs = searcher.search(search.MatchAllDocsQuery(), None, 10, search.Sort(search.SortField('color', sorttype)))
assert [searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs] == sorted(colors)

class ComparatorSource(PythonFieldComparatorSource):
    class newComparator(PythonFieldComparator):
        def __init__(self, name, numHits, sortPos, reversed):
            self.name = name
            self.values = [None] * numHits
        def setNextReader(self, reader, *args):
            if not args:
                reader = reader.reader()
            br = util.BytesRef()
            comparator = search.FieldCache.DEFAULT.getTermsIndex(reader, self.name)
            self.comparator = [comparator.get(id, br) or br.utf8ToString() for id in range(reader.maxDoc())]
            return self
        def compare(self, slot1, slot2):
            return cmp(self.values[slot1], self.values[slot2])
        def setBottom(self, slot):
            self._bottom = self.values[slot]
        def compareBottom(self, doc):
            return cmp(self._bottom, self.comparator[doc])
        def copy(self, slot, doc):
            self.values[slot] = self.comparator[doc]
        def value(self, slot):

sorter = search.Sort(search.SortField('color', ComparatorSource()))
# still must supply excessive doc count to use the sorter
topdocs = searcher.search(search.MatchAllDocsQuery(), None, 10, sorter)
assert [searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs] == sorted(colors)

### lupyne ###

hits = indexer.search(sort='color')
assert [hit['color'] for hit in hits] == sorted(colors)
comparator = indexer.comparator('color')
assert list(comparator) == list(colors)
hits = indexer.search().sorted(comparator.__getitem__)
assert [hit['color'] for hit in hits] == sorted(colors)


Custom server.

Fields settings are assigned directly to the root.
Indexing is done here just to populate the example.

A custom filter and sorter are demonstrated by transforming a date field into a year field.
Filters are also used for faceting;  sorters are also used for grouping.

Example queries:
 * http://localhost:8080/search?q=date:17*&group=year
 * http://localhost:8080/search?q=date:17*&group=year&sort=-year
 * http://localhost:8080/search?count=0&facets=year
 * http://localhost:8080/search?q=text:right&count=3&facets=year

import lucene
from lupyne import engine, server
from test import fixture

def parse(date):
    return int(date.utf8ToString().split('-')[0])

if __name__ == '__main__':
    root = server.WebIndexer()
    # assign field settings
    root.indexer.set('amendment', stored=True, tokenized=False)
    root.indexer.set('date', stored=True, tokenized=False)
    # populate index
    for doc in fixture.constitution.docs():
        if 'amendment' in doc:
    # assign custom filter and sorter based on year
    root.searcher.sorters['year'] = engine.SortField('date', int, parse)
    years = set(date.split('-')[0] for date in root.searcher.terms('date'))
    root.searcher.filters['year'] = dict((year, engine.Query.prefix('date', year).filter()) for year in years)
    # start with pretty-printing
    server.start(root, config={'global': {'tools.json_out.indent': 2}})

Table Of Contents

Previous topic


This Page