mirror of
https://github.com/typesense/typesense.git
synced 2025-05-19 21:22:25 +08:00
Use multi-key binary search to convert document IDs to corresponding indices.
Improved performance by 7x.
This commit is contained in:
parent
536e605a35
commit
c115c2c4a4
4
TODO.md
4
TODO.md
@ -27,6 +27,9 @@
|
||||
- ~~drop collection should remove all records from the store~~
|
||||
- Pagination parameter
|
||||
- UTF-8 support for fuzzy search
|
||||
- ~~Multi-key binary search during scoring~~
|
||||
- Assumption that all tokens match for scoring is no longer true
|
||||
- Intersection without unpacking
|
||||
- Facets
|
||||
- Filters
|
||||
- Support search operators like +, - etc.
|
||||
@ -39,7 +42,6 @@
|
||||
- Space sensitivity
|
||||
- Use bitmap index instead of compressed array for doc list
|
||||
- Throw errors when schema is broken
|
||||
- Assumption that all tokens match for scoring is no longer true
|
||||
- Primary_rank_scores and secondary_rank_scores hashmaps should be combined
|
||||
- Proper logging
|
||||
- clean special chars before indexing
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <cstdlib>
|
||||
#include <for.h>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include "array_base.h"
|
||||
@ -18,6 +19,13 @@ private:
|
||||
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
|
||||
}
|
||||
|
||||
uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,
|
||||
uint32_t bits, uint32_t value, uint32_t *actual);
|
||||
|
||||
void binary_search_indices(const uint32_t *values, int low_vindex, int high_vindex,
|
||||
int low_index, int high_index, uint32_t base, uint32_t bits,
|
||||
uint32_t *indices);
|
||||
|
||||
public:
|
||||
|
||||
// FIXME: this should be a constructor instead of a setter
|
||||
@ -29,6 +37,8 @@ public:
|
||||
|
||||
uint32_t indexOf(uint32_t value);
|
||||
|
||||
void indexOf(const uint32_t *values, const size_t values_len, uint32_t* indices);
|
||||
|
||||
// returns false if malloc fails
|
||||
bool append(uint32_t value);
|
||||
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <intersection.h>
|
||||
#include <match_score.h>
|
||||
#include <string_utils.h>
|
||||
#include <art.h>
|
||||
|
||||
Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
|
||||
const std::vector<field> &search_fields, const std::vector<std::string> rank_fields):
|
||||
@ -448,6 +449,13 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank,
|
||||
const size_t result_size) const {
|
||||
|
||||
const int max_token_rank = 250;
|
||||
spp::sparse_hash_map<art_leaf*, uint32_t*> leaf_to_indices;
|
||||
|
||||
for (art_leaf *token_leaf : query_suggestion) {
|
||||
uint32_t *indices = new uint32_t[result_size];
|
||||
token_leaf->values->ids.indexOf(result_ids, result_size, indices);
|
||||
leaf_to_indices.emplace(token_leaf, indices);
|
||||
}
|
||||
|
||||
for(auto i=0; i<result_size; i++) {
|
||||
uint32_t seq_id = result_ids[i];
|
||||
@ -461,7 +469,7 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank,
|
||||
// for each token in the query, find the positions that it appears in this document
|
||||
for (art_leaf *token_leaf : query_suggestion) {
|
||||
std::vector<uint16_t> positions;
|
||||
uint32_t doc_index = token_leaf->values->ids.indexOf(seq_id);
|
||||
uint32_t doc_index = leaf_to_indices.at(token_leaf)[i];
|
||||
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
|
||||
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
|
||||
token_leaf->values->offsets.getLength() :
|
||||
@ -570,14 +578,6 @@ void Collection::remove(std::string id) {
|
||||
uint32_t seq_id_values[1] = {seq_id};
|
||||
|
||||
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
|
||||
|
||||
/*
|
||||
auto len = leaf->values->offset_index.getLength();
|
||||
for(auto i=0; i<len; i++) {
|
||||
std::cout << "i: " << i << ", val: " << leaf->values->offset_index.at(i) << std::endl;
|
||||
}
|
||||
std::cout << "----" << std::endl;
|
||||
*/
|
||||
uint32_t start_offset = leaf->values->offset_index.at(doc_index);
|
||||
uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
|
||||
leaf->values->offsets.getLength() :
|
||||
|
@ -13,7 +13,35 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
/**
 * Scratch helper: copies result_ids[low..high] into the same positions of
 * `results`, visiting the range in binary-partition order (midpoint first,
 * then the left half, then the right half).
 *
 * @param result_ids source array; must be readable at every index in [low, high]
 * @param low        first index of the range (inclusive)
 * @param high       last index of the range (inclusive); high < low is an empty range
 * @param results    destination vector; `at()` throws std::out_of_range when an
 *                   index in [low, high] is not a valid position
 */
void find_indices(const uint32_t *result_ids, int low, int high, std::vector<uint32_t> & results) {
    if(high < low) {
        return;
    }

    // The pivot stays signed so that `pivot - 1` is a plain -1 when pivot is 0
    // (an unsigned pivot would wrap around and rely on a narrowing conversion),
    // and the midpoint is computed without adding the two bounds together.
    const int pivot = low + (high - low) / 2;

    results.at(static_cast<size_t>(pivot)) = result_ids[pivot];

    find_indices(result_ids, low, pivot - 1, results);
    find_indices(result_ids, pivot + 1, high, results);
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
std::vector<uint32_t> results(3);
|
||||
uint32_t *result_ids = new uint32_t[3];
|
||||
/*for(auto i = 0; i < 100; i++) {
|
||||
result_ids[i] = i;
|
||||
}*/
|
||||
result_ids[0] = 6;
|
||||
result_ids[1] = 19;
|
||||
result_ids[2] = 21;
|
||||
|
||||
find_indices(result_ids, 0, 2, results);
|
||||
//std::sort(results.begin(), results.end());
|
||||
for(auto i : results) {
|
||||
std::cout << i << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
|
||||
const std::string state_dir_path = "/tmp/typesense-data";
|
||||
|
||||
std::vector<field> fields_to_index = {field("title", field_types::STRING)};
|
||||
|
@ -61,6 +61,79 @@ uint32_t sorted_array::indexOf(uint32_t value) {
|
||||
return length;
|
||||
}
|
||||
|
||||
/**
 * Binary search for the lower bound of `value` inside positions [imin, imax]
 * of a packed block.
 *
 * Elements are read through for_select_bits(in, base, bits, position)
 * (presumably libfor's random-access decoder — confirm against for.h).
 *
 * @param in     start of the packed payload
 * @param imin   first position to consider (inclusive)
 * @param imax   last position to consider (inclusive)
 * @param base   base value forwarded to for_select_bits
 * @param bits   bit width forwarded to for_select_bits
 * @param value  value being searched for
 * @param actual out: the element stored at the returned position
 * @return `imin` when the element there is >= value after narrowing, otherwise
 *         `imax`. When every element in the range is smaller than `value` the
 *         result is `imax` and `*actual` is smaller than `value`, so callers
 *         must compare `*actual` with `value` to detect a miss.
 */
uint32_t sorted_array::lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,
                                               uint32_t bits, uint32_t value, uint32_t *actual) {
    // Narrow the window until the two bounds are equal or adjacent.
    while (imin + 1 < imax) {
        const uint32_t middle = imin + ((imax - imin) / 2);

        if (for_select_bits(in, base, bits, middle) < value) {
            imin = middle;
        } else {
            imax = middle;
        }
    }

    // At most two candidates remain; prefer the lower one when it qualifies.
    const uint32_t lower_candidate = for_select_bits(in, base, bits, imin);
    if (lower_candidate >= value) {
        *actual = lower_candidate;
        return imin;
    }

    *actual = for_select_bits(in, base, bits, imax);
    return imax;
}
|
||||
|
||||
void sorted_array::binary_search_indices(const uint32_t *values, int low_vindex, int high_vindex,
|
||||
int low_index, int high_index, uint32_t base, uint32_t bits,
|
||||
uint32_t *indices) {
|
||||
uint32_t actual_value = 0;
|
||||
|
||||
if(high_vindex >= low_vindex && high_index >= low_index) {
|
||||
size_t pivot_vindex = (low_vindex + high_vindex) / 2;
|
||||
|
||||
uint32_t in_index = lower_bound_search_bits(in+METADATA_OVERHEAD, low_index, high_index, base, bits,
|
||||
values[pivot_vindex], &actual_value);
|
||||
//if(actual_value == values[pivot_vindex]) {
|
||||
indices[pivot_vindex] = in_index;
|
||||
//}
|
||||
|
||||
size_t pivot_index = (low_index + high_index) / 2;
|
||||
|
||||
binary_search_indices(values, low_vindex, pivot_vindex-1, low_index, pivot_index-1,
|
||||
base, bits, indices);
|
||||
binary_search_indices(values, pivot_vindex+1, high_vindex, pivot_index+1, high_index,
|
||||
base, bits, indices);
|
||||
}
|
||||
}
|
||||
|
||||
void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint32_t *indices) {
|
||||
if(values_len == 0) {
|
||||
return ;
|
||||
}
|
||||
|
||||
uint32_t base = *(uint32_t *)(in + 0);
|
||||
uint32_t bits = *(in + 4);
|
||||
|
||||
uint32_t low_index, high_index;
|
||||
uint32_t actual_value = 0;
|
||||
|
||||
do {
|
||||
low_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[0], &actual_value);
|
||||
} while(actual_value != values[0]);
|
||||
|
||||
do {
|
||||
high_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits, values[values_len-1], &actual_value);
|
||||
} while(actual_value != values[values_len-1]);
|
||||
|
||||
binary_search_indices(values, 0, values_len-1, low_index, high_index, base, bits, indices);
|
||||
}
|
||||
|
||||
void sorted_array::remove_values(uint32_t *sorted_values, uint32_t values_length) {
|
||||
uint32_t *curr_array = uncompress();
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user