diff --git a/TODO.md b/TODO.md
index 921f4b96..3ad58a09 100644
--- a/TODO.md
+++ b/TODO.md
@@ -27,6 +27,9 @@
 - ~~drop collection should remove all records from the store~~
 - Pagination parameter
 - UTF-8 support for fuzzy search
+- ~~Multi-key binary search during scoring~~
+- Assumption that all tokens match for scoring is no longer true
+- Intersection without unpacking
 - Facets
 - Filters
 - Support search operators like +, - etc.
@@ -39,7 +42,6 @@
 - Space sensitivity
 - Use bitmap index instead of compressed array for doc list
 - Throw errors when schema is broken
-- Assumption that all tokens match for scoring is no longer true
 - Primary_rank_scores and secondary_rank_scores hashmaps should be combined
 - Proper logging
 - clean special chars before indexing
diff --git a/include/sorted_array.h b/include/sorted_array.h
index 70da9856..099b32b0 100644
--- a/include/sorted_array.h
+++ b/include/sorted_array.h
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "array_base.h"
@@ -18,6 +19,13 @@ private:
         return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
     }
 
+    uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,
+                                     uint32_t bits, uint32_t value, uint32_t *actual);
+
+    void binary_search_indices(const uint32_t *values, int low_vindex, int high_vindex,
+                               int low_index, int high_index, uint32_t base, uint32_t bits,
+                               uint32_t *indices);
+
 public:
 
     // FIXME: this should be a constructor instead of a setter
@@ -29,6 +37,8 @@ public:
 
     uint32_t indexOf(uint32_t value);
 
+    void indexOf(const uint32_t *values, const size_t values_len, uint32_t* indices);
+
     // returns false if malloc fails
     bool append(uint32_t value);
 
diff --git a/src/collection.cpp b/src/collection.cpp
index 8dbcc373..d23f72d8 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 
 Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
                        const std::vector &search_fields, const std::vector rank_fields):
@@ -448,6 +449,13 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank,
                                const size_t result_size) const {
     const int max_token_rank = 250;
+    // the index buffers are owned by the map (as vectors), so they are released when it goes out of scope
+    spp::sparse_hash_map<art_leaf *, std::vector<uint32_t>> leaf_to_indices;
+    for (art_leaf *token_leaf : query_suggestion) {
+        std::vector<uint32_t> indices(result_size);
+        token_leaf->values->ids.indexOf(result_ids, result_size, indices.data());
+        leaf_to_indices.emplace(token_leaf, std::move(indices));
+    }
 
     for(auto i=0; i &topster, const int & token_rank,
         // for each token in the query, find the positions that it appears in this document
         for (art_leaf *token_leaf : query_suggestion) {
             std::vector positions;
-            uint32_t doc_index = token_leaf->values->ids.indexOf(seq_id);
+            uint32_t doc_index = leaf_to_indices.at(token_leaf)[i];
             uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
             uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
                                   token_leaf->values->offsets.getLength() :
@@ -570,14 +578,6 @@ void Collection::remove(std::string id) {
             uint32_t seq_id_values[1] = {seq_id};
 
             uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
-
-            /*
-            auto len = leaf->values->offset_index.getLength();
-            for(auto i=0; ivalues->offset_index.at(i) << std::endl;
-            }
-            std::cout << "----" << std::endl;
-            */
             uint32_t start_offset = leaf->values->offset_index.at(doc_index);
             uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
                                   leaf->values->offsets.getLength() :
diff --git a/src/main/main.cpp b/src/main/main.cpp
index c7c47004..e6a002d7 100644
--- a/src/main/main.cpp
+++ b/src/main/main.cpp
@@ -13,7 +13,30 @@
 
 using namespace std;
 
+// Copies result_ids[k] into results[k] for every k in [low, high]: the midpoint first, then each half.
+void find_indices(const uint32_t *result_ids, int low, int high, std::vector<uint32_t> & results) {
+    if(high >= low) {
+        // keep the pivot signed: as a size_t, `pivot-1` wraps around when the pivot is 0
+        const int pivot = low + (high - low) / 2;
+        results.at(pivot) = result_ids[pivot];
+        find_indices(result_ids, low, pivot-1, results);
+        find_indices(result_ids, pivot+1, high, results);
+    }
+}
+
 int main(int argc, char* argv[]) {
+    // FIXME: scratch code -- the unconditional `return 0` below leaves the rest of main unreachable
+    std::vector<uint32_t> results(3);
+    const uint32_t result_ids[3] = {6, 19, 21};  // stack array: nothing to free before returning
+
+    find_indices(result_ids, 0, 2, results);
+    for(auto i : results) {
+        std::cout << i << std::endl;
+    }
+
+    return 0;
+
+
     const std::string state_dir_path = "/tmp/typesense-data";
 
     std::vector fields_to_index = {field("title", field_types::STRING)};
diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp
index 4f5bdebd..21f5792d 100644
--- a/src/sorted_array.cpp
+++ b/src/sorted_array.cpp
@@ -61,6 +61,102 @@ uint32_t sorted_array::indexOf(uint32_t value) {
     return length;
 }
 
+/**
+ * Lower-bound binary search over the packed values at positions [imin, imax] (both inclusive).
+ * Assumes the packed values are stored in ascending order.
+ *
+ * Returns the first position in that range whose value is >= `value`; when every value in the
+ * range is smaller, `imax` is returned. `*actual` receives the value stored at the returned
+ * position, so the caller can compare it with `value` to tell an exact match from a miss.
+ */
+uint32_t sorted_array::lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,
+                                               uint32_t bits, uint32_t value, uint32_t *actual) {
+    while (imin + 1 < imax) {
+        const uint32_t imid = imin + ((imax - imin) / 2);
+
+        if (for_select_bits(in, base, bits, imid) >= value) {
+            imax = imid;
+        } else {
+            imin = imid;
+        }
+    }
+
+    // at most two candidates remain; prefer the lower position
+    const uint32_t v = for_select_bits(in, base, bits, imin);
+    if (v >= value) {
+        *actual = v;
+        return imin;
+    }
+
+    *actual = for_select_bits(in, base, bits, imax);
+    return imax;
+}
+
+/**
+ * Resolves values[low_vindex..high_vindex] into indices[low_vindex..high_vindex], restricting the
+ * search to the stored positions [low_index, high_index] (all bounds inclusive).
+ *
+ * The middle value is located first. Because `values` is sorted in ascending order, everything before
+ * it can only sit at or below the found position and everything after it at or above that position.
+ * A value that is not stored is reported as `length`, the same fall-through value indexOf(uint32_t) returns.
+ */
+void sorted_array::binary_search_indices(const uint32_t *values, int low_vindex, int high_vindex,
+                                         int low_index, int high_index, uint32_t base, uint32_t bits,
+                                         uint32_t *indices) {
+    if(high_vindex < low_vindex || high_index < low_index) {
+        return ;
+    }
+
+    const int pivot_vindex = low_vindex + (high_vindex - low_vindex) / 2;
+    uint32_t actual_value = 0;
+
+    const uint32_t in_index = lower_bound_search_bits(in+METADATA_OVERHEAD, low_index, high_index, base, bits,
+                                                      values[pivot_vindex], &actual_value);
+    indices[pivot_vindex] = (actual_value == values[pivot_vindex]) ? in_index : length;
+
+    // Split at the position that was actually found, not at the midpoint of the position range.
+    // in_index stays in both halves so that duplicate or missing values can never fall outside a range.
+    binary_search_indices(values, low_vindex, pivot_vindex-1, low_index, in_index,
+                          base, bits, indices);
+    binary_search_indices(values, pivot_vindex+1, high_vindex, in_index, high_index,
+                          base, bits, indices);
+}
+
+/**
+ * Batch counterpart of indexOf(uint32_t): looks up every entry of `values` and writes its position
+ * into the matching slot of `indices`.
+ *
+ * `values` must be sorted in ascending order and `indices` must have room for `values_len` entries.
+ * A value that is not stored is reported as `length`, the same fall-through value indexOf(uint32_t) returns.
+ */
+void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint32_t *indices) {
+    if(values_len == 0) {
+        return ;
+    }
+
+    if(length == 0) {
+        // nothing is stored (and `length-1` below would wrap around): every lookup is a miss
+        for(size_t i = 0; i < values_len; i++) {
+            indices[i] = length;
+        }
+        return ;
+    }
+
+    uint32_t base = *(uint32_t *)(in + 0);
+    uint32_t bits = *(in + 4);
+    uint32_t actual_value = 0;
+
+    // The positions of the smallest and the largest value bound the positions of everything in between.
+    // Each bound is searched exactly once: the search is deterministic, so retrying until an exact
+    // match shows up would spin forever whenever the value is absent.
+    const uint32_t low_index = lower_bound_search_bits(in+METADATA_OVERHEAD, 0, length-1, base, bits,
+                                                       values[0], &actual_value);
+    const uint32_t high_index = lower_bound_search_bits(in+METADATA_OVERHEAD, low_index, length-1, base, bits,
+                                                        values[values_len-1], &actual_value);
+
+    binary_search_indices(values, 0, values_len-1, low_index, high_index, base, bits, indices);
+}
+
 void sorted_array::remove_values(uint32_t *sorted_values, uint32_t values_length) {
     uint32_t *curr_array = uncompress();
 