Use multi-key binary search to convert document IDs to corresponding indices.

Improved performance by 7x.
This commit is contained in:
Kishore Nallan 2017-02-07 22:42:58 +05:30
parent 536e605a35
commit c115c2c4a4
5 changed files with 123 additions and 10 deletions

View File

@ -27,6 +27,9 @@
- ~~drop collection should remove all records from the store~~
- Pagination parameter
- UTF-8 support for fuzzy search
- ~~Multi-key binary search during scoring~~
- Assumption that all tokens match for scoring is no longer true
- Intersection without unpacking
- Facets
- Filters
- Support search operators like +, - etc.
@ -39,7 +42,6 @@
- Space sensitivity
- Use bitmap index instead of compressed array for doc list
- Throw errors when schema is broken
- Assumption that all tokens match for scoring is no longer true
- Primary_rank_scores and secondary_rank_scores hashmaps should be combined
- Proper logging
- Clean special characters before indexing

View File

@ -4,6 +4,7 @@
#include <cstdlib>
#include <for.h>
#include <cstring>
#include <vector>
#include <limits>
#include <iostream>
#include "array_base.h"
@ -18,6 +19,13 @@ private:
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
}
// Binary-searches positions [imin, imax] (both inclusive) of the packed
// buffer `in` for the first element that is >= `value`, reading elements
// through for_select_bits(in, base, bits, position).
//
// Returns the position that was settled on: the first position whose element
// is >= `value`, or `imax` when no element in the range is that large.
// `*actual` receives the element stored at the returned position, so a caller
// can tell an exact hit (`*actual == value`) from a mere lower bound.
uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,
uint32_t bits, uint32_t value, uint32_t *actual);
// Recursive helper behind the multi-key indexOf(): takes the middle entry of
// values[low_vindex..high_vindex], locates it with lower_bound_search_bits()
// inside array positions [low_index, high_index], stores the resulting
// position in indices[<that entry's index>], then recurses on the entries to
// the left and to the right of it.
//
// `base` and `bits` are the decoding parameters handed on to
// lower_bound_search_bits(). All four bounds are inclusive; the call is a
// no-op when either range is empty.
// NOTE(review): the divide-and-conquer scheme presumably requires `values` to
// be sorted ascending - confirm against callers.
void binary_search_indices(const uint32_t *values, int low_vindex, int high_vindex,
int low_index, int high_index, uint32_t base, uint32_t bits,
uint32_t *indices);
public:
// FIXME: this should be a constructor instead of a setter
@ -29,6 +37,8 @@ public:
uint32_t indexOf(uint32_t value);
// Multi-key lookup: resolves the array position of each of the `values_len`
// entries of `values` in a single pass and writes it to the matching slot of
// `indices` (which must have room for `values_len` entries). Returns without
// touching `indices` when `values_len` is 0.
// NOTE(review): presumably expects `values` to be sorted ascending - confirm.
void indexOf(const uint32_t *values, const size_t values_len, uint32_t* indices);
// returns false if malloc fails
bool append(uint32_t value);

View File

@ -5,6 +5,7 @@
#include <intersection.h>
#include <match_score.h>
#include <string_utils.h>
#include <art.h>
Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
const std::vector<field> &search_fields, const std::vector<std::string> rank_fields):
@ -448,6 +449,13 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank,
const size_t result_size) const {
const int max_token_rank = 250;
spp::sparse_hash_map<art_leaf*, uint32_t*> leaf_to_indices;
for (art_leaf *token_leaf : query_suggestion) {
uint32_t *indices = new uint32_t[result_size];
token_leaf->values->ids.indexOf(result_ids, result_size, indices);
leaf_to_indices.emplace(token_leaf, indices);
}
for(auto i=0; i<result_size; i++) {
uint32_t seq_id = result_ids[i];
@ -461,7 +469,7 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank,
// for each token in the query, find the positions that it appears in this document
for (art_leaf *token_leaf : query_suggestion) {
std::vector<uint16_t> positions;
uint32_t doc_index = token_leaf->values->ids.indexOf(seq_id);
uint32_t doc_index = leaf_to_indices.at(token_leaf)[i];
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
token_leaf->values->offsets.getLength() :
@ -570,14 +578,6 @@ void Collection::remove(std::string id) {
uint32_t seq_id_values[1] = {seq_id};
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
/*
auto len = leaf->values->offset_index.getLength();
for(auto i=0; i<len; i++) {
std::cout << "i: " << i << ", val: " << leaf->values->offset_index.at(i) << std::endl;
}
std::cout << "----" << std::endl;
*/
uint32_t start_offset = leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
leaf->values->offsets.getLength() :

View File

@ -13,7 +13,35 @@
using namespace std;
// Copies result_ids[low..high] (inclusive bounds) into the same positions of
// `results`, visiting the range midpoint-first the way a binary search would.
// An empty range (high < low) is a no-op.
//
// `results` is written through at(), so an index outside the vector raises
// std::out_of_range instead of corrupting memory.
void find_indices(const uint32_t *result_ids, int low, int high, std::vector<uint32_t> & results) {
    if(high < low) {
        return;
    }

    // Keep the midpoint signed: an unsigned pivot of 0 would make `pivot - 1`
    // wrap around to SIZE_MAX before being narrowed back to int. Computing it
    // as low + half the width also avoids overflowing `low + high`.
    const int pivot = low + (high - low) / 2;

    results.at(pivot) = result_ids[pivot];

    find_indices(result_ids, low, pivot - 1, results);
    find_indices(result_ids, pivot + 1, high, results);
}
int main(int argc, char* argv[]) {
std::vector<uint32_t> results(3);
uint32_t *result_ids = new uint32_t[3];
/*for(auto i = 0; i < 100; i++) {
result_ids[i] = i;
}*/
result_ids[0] = 6;
result_ids[1] = 19;
result_ids[2] = 21;
find_indices(result_ids, 0, 2, results);
//std::sort(results.begin(), results.end());
for(auto i : results) {
std::cout << i << std::endl;
}
return 0;
const std::string state_dir_path = "/tmp/typesense-data";
std::vector<field> fields_to_index = {field("title", field_types::STRING)};

View File

@ -61,6 +61,79 @@ uint32_t sorted_array::indexOf(uint32_t value) {
return length;
}
// Lower-bound binary search over positions [imin, imax] (inclusive) of the
// packed buffer `in`; elements are read via for_select_bits(in, base, bits, i).
//
// Returns the first position whose element is >= `value`, or `imax` when no
// element in the range qualifies. `*actual` is set to the element stored at
// the returned position, which lets the caller detect an exact match.
uint32_t sorted_array::lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,
                                               uint32_t bits, uint32_t value, uint32_t *actual) {
    uint32_t lo = imin;
    uint32_t hi = imax;

    // Narrow the window until at most two candidate positions are left.
    while (lo + 1 < hi) {
        const uint32_t mid = lo + ((hi - lo) / 2);

        if (for_select_bits(in, base, bits, mid) < value) {
            lo = mid;
        } else {
            hi = mid;
        }
    }

    // Prefer the lower candidate when it already satisfies the bound.
    const uint32_t lo_value = for_select_bits(in, base, bits, lo);
    if (lo_value >= value) {
        *actual = lo_value;
        return lo;
    }

    *actual = for_select_bits(in, base, bits, hi);
    return hi;
}
// Resolves the array position of every entry in values[low_vindex..high_vindex]
// (inclusive), knowing that all of them lie within array positions
// [low_index, high_index] (inclusive). Results go to indices[k] for each k in
// the value range.
//
// Strategy: locate the middle value with a lower-bound search, then use the
// position that search returned to bound the searches for the remaining
// values. Values left of the pivot cannot sit after that position and values
// right of it cannot sit before it. This requires `values` to be sorted in
// ascending order.
//
// `base` and `bits` are the decoding parameters forwarded to
// lower_bound_search_bits().
void sorted_array::binary_search_indices(const uint32_t *values, int low_vindex, int high_vindex,
                                         int low_index, int high_index, uint32_t base, uint32_t bits,
                                         uint32_t *indices) {
    if(high_vindex < low_vindex || high_index < low_index) {
        return ;
    }

    // Signed midpoint: an unsigned pivot of 0 would make `pivot - 1` wrap
    // around instead of producing the -1 that terminates the recursion.
    const int pivot_vindex = low_vindex + (high_vindex - low_vindex) / 2;

    uint32_t actual_value = 0;
    const uint32_t in_index = lower_bound_search_bits(in+METADATA_OVERHEAD, low_index, high_index, base, bits,
                                                      values[pivot_vindex], &actual_value);

    // The lower-bound position is recorded even when the value itself is not
    // stored (actual_value != values[pivot_vindex]); callers that may pass
    // absent values have to verify the hit themselves.
    indices[pivot_vindex] = in_index;

    // Split the array range at the position just found, NOT at the midpoint
    // of the range: the midpoint has no relation to where the neighbouring
    // values live and can exclude their real positions. Both halves keep
    // `in_index` so that duplicate values still resolve.
    const int split_index = static_cast<int>(in_index);

    binary_search_indices(values, low_vindex, pivot_vindex-1, low_index, split_index,
                          base, bits, indices);
    binary_search_indices(values, pivot_vindex+1, high_vindex, split_index, high_index,
                          base, bits, indices);
}
// Multi-key lookup: resolves the array position of each of the `values_len`
// entries in `values` and writes it to the matching slot of `indices`, which
// must have room for `values_len` entries.
//
// The first and last values are located first, so that the recursive search
// for everything in between only has to look at the slice of the array that
// lies between them. `values` is therefore expected to be sorted ascending.
//
// Nothing is written when `values_len` is 0. When the array itself is empty
// every slot is set to `length`.
// NOTE(review): `length` looks like the value the single-key indexOf() falls
// back to for a missing value - confirm that callers treat it as "not found".
void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint32_t *indices) {
    if(values_len == 0) {
        return ;
    }

    if(length == 0) {
        // `length - 1` below would wrap around and send the search into
        // memory that does not belong to the array.
        for(size_t i = 0; i < values_len; i++) {
            indices[i] = length;
        }
        return ;
    }

    uint32_t base = *(uint32_t *)(in + 0);
    uint32_t bits = *(in + 4);

    uint32_t actual_value = 0;

    // One search per boundary is enough: the search is deterministic, so
    // retrying until the value is found would never terminate for a value
    // that is not stored. The lower-bound position is a valid range limit
    // whether or not the boundary value is present.
    const uint32_t low_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits,
                                                       values[0], &actual_value);
    const uint32_t high_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits,
                                                        values[values_len-1], &actual_value);

    binary_search_indices(values, 0, static_cast<int>(values_len-1),
                          static_cast<int>(low_index), static_cast<int>(high_index),
                          base, bits, indices);
}
void sorted_array::remove_values(uint32_t *sorted_values, uint32_t values_length) {
uint32_t *curr_array = uncompress();