borislav.nikolov@booking.com
Java based distributed RESTful service on top of Lucene indexes.
information retrieval library based on inverted indexes and boolean model
a few lines of code are worth a thousand words
documents in the inverted index are sorted by document id
# Toy corpus: the array index of each entry is its document id.
my $book = [
    '... perl / ruby benchmark ...',   # 0
    '... perl / python benchmark ...', # 1
    '... ruby / python benchmark ...', # 2
    '... perl is awesome ...',         # 3
];

# Inverted index: term => ids of the documents that contain the term,
# listed in ascending document-id order.
my $inverted = {
    'benchmark' => [ 0, 1, 2 ],
    'perl'      => [ 0, 1, 3 ],
    'ruby'      => [ 0, 2 ],
    'python'    => [ 1, 2 ],
};

# free stats:
my $n_documents      = scalar(@{ $book });             # size of the corpus
my $n_perl_documents = scalar(@{ $inverted->{perl} }); # documents containing 'perl'

my @ids = search($inverted, "perl AND ruby");
# matching: $book->[0] "... perl / ruby benchmark ..."

# Reuse @ids here: a second `my @ids` in the same scope would mask the first
# declaration and trigger a warning under `use warnings`.
@ids = search($inverted, "perl OR ruby");
# matching: $book->[0], "... perl / ruby benchmark ..."
#           $book->[2], "... ruby / python benchmark ..."
#           $book->[1], "... perl / python benchmark ..."
#           $book->[3], "... perl is awesome ..."  <- below ruby/python?
#                        yes: "ruby" is less common than "perl"
#include <stdio.h>
#include <sys/time.h>
/*
 * Micro-benchmark: time one million iterations of a trivial
 * "is i even? then count it" loop and print the elapsed wall-clock
 * time in milliseconds.
 */
int main(void) {
    int i;
    int j = 0; /* must start at 0: incrementing an uninitialised int is undefined behaviour */
    struct timeval t0, t1;

    gettimeofday(&t0, 0);
    for (i = 0; i < 1000000; i++)
        if (i % 2 == 0)
            j++;
    gettimeofday(&t1, 0);

    /* microseconds elapsed, divided down to milliseconds; the cast keeps the
     * argument in agreement with %ld whatever time_t/suseconds_t happen to be */
    printf("%ld\n", (long)(((t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec) / 1000));
    return 0;
}
took: 5ms
split the work into 120 pieces, took: 47us
# The builtin time() only has one-second resolution, which cannot measure a
# query that takes milliseconds; Time::HiRes::time() returns fractional seconds.
use Time::HiRes ();

# Build one shard's worth of documents (100M documents split over 120 pieces).
# Every 10th document contains both terms, all the others only 'perl'.
my @docs = ();
for my $doc_no (1 .. (100_000_000 / 120)) { # 833333
    push @docs, { name => ($doc_no % 10 == 0 ? 'perl ruby' : 'perl') };
}
$engine->index(\@docs);

# Time a boolean query requiring both terms (perl AND ruby).
my $t0 = Time::HiRes::time();
my $results = search({
    bool => {
        must => [
            { term => { name => "perl" } },
            { term => { name => "ruby" } },
        ]
    }
});
# Elapsed time is printed in (fractional) seconds.
print "took: @{[ Time::HiRes::time() - $t0 ]}\n";
5.8ms when every 10th document matches
0.7ms when all matching documents are in a block
//14 times slower than
for(i = 0; i < 9000; i++) { if (i % 2 == 0) j++; }
# Illustrative posting lists for two terms (the "..." stand for elided ids).
# Ids are listed in ascending order; 'perl' occurs in far more documents
# than 'ruby'.
{
'perl' => [ 1, 3, 277, 46000, 64973, 78688, ... ], # 876_962 documents
'ruby' => [ 300, 456, 736, 837, 7278 ... 50000, ... ] # 51_345 documents
}
/**
 * Finds the next document id on which every sub-query agrees, starting
 * from a candidate id.
 *
 * Each query from index 1 onwards that is still behind the candidate is
 * advanced to it. If one of them lands past the candidate, its position
 * becomes the new candidate, the lead is advanced to it, and the scan
 * restarts. When a full pass finds nobody overshooting, the candidate is
 * returned.
 *
 * NOTE(review): presumably `lead` is queries.get(0), which is why the scan
 * starts at index 1 - confirm against the enclosing class.
 *
 * @param target candidate document id
 * @return the candidate id once a full pass over the queries completes
 *         without any of them overshooting it
 */
int doNext(int target) throws Exception {
    while (true) {
        boolean overshot = false;
        for (int idx = 1; idx < queries.size(); idx++) {
            Primitive other = queries.get(idx);
            if (other.docID() >= target)
                continue; // already at the candidate, nothing to move
            other.advance(target);
            if (other.docID() > target) {
                // this query has no such document: adopt its position as the
                // new candidate and restart the scan from the lead
                target = other.docID();
                overshot = true;
                break;
            }
        }
        if (!overshot)
            return target;
        target = lead.advance(target);
    }
}
/**
 * Steps the lead to its next document and returns whatever doNext()
 * settles on when started from that position.
 */
int next() throws Exception {
    int candidate = lead.nextDoc();
    return doNext(candidate);
}
// Drain the scorer: hand every document id it yields to collect() until it
// reports NO_MORE_DOCS. Java does not allow a declaration inside an
// expression, so the loop variable is declared in the for-header instead.
for (int doc = scorer.next(); doc != NO_MORE_DOCS; doc = scorer.next()) {
    collect(doc);
}
always stop shard reallocation first
coordinates cluster wide changes, like:
mostly due to hardware issues, triggering weird behavior in unpredictable ways
we just want to store and search
// read [ { "field": "value" }, .. ] from STDIN and store each hash as a Lucene document
// thank you stackoverflow
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48,whitespace);
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
IndexWriter writer = new IndexWriter(new NIOFSDirectory(ROOT),config);
JsonReader json = new JsonReader(new InputStreamReader(System.in, "UTF-8"));
Gson gson = new Gson();
Map<String,String> map = new HashMap<String,String>();
json.beginArray();
while (json.hasNext()) {
map = (Map<String,String>) gson.fromJson(json, map.getClass());
Document doc = new Document();
for (Map.Entry<String,String> entry : map.entrySet())
doc.add(new Field(entry.getKey(), entry.getValue(),
Field.Store.YES,
Field.Index.ANALYZED));
writer.addDocument(doc);
}
json.endArray();
json.close();
// return top documents matching 'perl AND ruby' as [ { "key":"value" }.. ]
// (at most 100 hits; each hit carries its stored fields plus a "_score" entry)
IndexReader reader = DirectoryReader.open(writer, false);
List<Map<String,String>> output = new ArrayList<Map<String,String>>();
try {
    IndexSearcher searcher = new IndexSearcher(reader);
    // must be declared as BooleanQuery: the Query base class has no add()
    BooleanQuery q = new BooleanQuery();
    q.add(new TermQuery(new Term("name", "perl")), BooleanClause.Occur.MUST);
    q.add(new TermQuery(new Term("name", "ruby")), BooleanClause.Occur.MUST);
    TopDocs results = searcher.search(q, null, 100);
    ScoreDoc[] hits = results.scoreDocs;
    for (int i = 0; i < hits.length; i++) {
        Document doc = searcher.doc(hits[i].doc);
        // Map is an interface, so a concrete HashMap is instantiated
        Map<String,String> item = new HashMap<String,String>();
        // the score is a float; the result map only holds strings
        item.put("_score", String.valueOf(hits[i].score));
        for (IndexableField field : doc.getFields())
            item.put(field.name(), field.stringValue());
        output.add(item);
    }
} finally {
    // close the reader even when the search throws
    reader.close();
}
return output;
using sun.http or netty or jetty or whatever tty you like, you can easily hack your own search service
simple topN recipe
sort { $b->{_score} <=> $a->{_score} } @r