From 1d54c07a50a05c46a074ea0ac77b41887397ce59 Mon Sep 17 00:00:00 2001 From: krunal1313 Date: Mon, 22 May 2023 14:48:29 +0530 Subject: [PATCH] threshold based migration changes --- include/facet_index.h | 7 + include/index.h | 1 + include/num_tree.h | 11 ++ src/collection.cpp | 20 ++- src/facet_index.cpp | 48 ++++++ src/index.cpp | 161 ++++++++++++++++---- src/num_tree.cpp | 25 +++ test/collection_optimized_faceting_test.cpp | 6 +- 8 files changed, 239 insertions(+), 40 deletions(-) diff --git a/include/facet_index.h b/include/facet_index.h index 4c6d1c0c..d5a878c8 100644 --- a/include/facet_index.h +++ b/include/facet_index.h @@ -39,6 +39,7 @@ private: struct facet_index_counter { tsl::htrie_map facet_index_map; std::vector counter_list; + bool is_migrated = false; facet_index_counter() { facet_index_map.clear(); @@ -74,4 +75,10 @@ public: size_t intersect(const std::string& val, const uint32_t* result_ids, int result_id_len, int max_facet_count, std::map& found, bool is_wildcard_no_filter_query); + + size_t get_facet_indexes(const std::string& field, std::map>& seqid_index_map); + + bool get_migrated (const std::string& field) const; + + void set_migrated(const std::string& field, bool val); }; \ No newline at end of file diff --git a/include/index.h b/include/index.h index ec000a07..3897c999 100644 --- a/include/index.h +++ b/include/index.h @@ -522,6 +522,7 @@ private: void initialize_facet_indexes(const field& facet_field); + void migrate_facet_to_new_index(const std::string& field); static Option embed_fields(nlohmann::json& document, diff --git a/include/num_tree.h b/include/num_tree.h index 77455e45..b684419c 100644 --- a/include/num_tree.h +++ b/include/num_tree.h @@ -42,6 +42,7 @@ private: }; std::vector counter_list; + bool is_migrated = false; public: @@ -80,4 +81,14 @@ public: bool is_wildcard_no_filter_query); size_t counter_list_size() const; + + size_t get_facet_indexes(std::map>& facets); + + bool get_migrated () const { + return is_migrated; + } + + void set_migrated(bool val) { + is_migrated = true; + } }; \ No newline at end of file diff --git a/src/collection.cpp b/src/collection.cpp index 3ca03923..b46041bf 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1937,7 +1937,7 @@ Option Collection::search(std::string raw_query, } } } else if(a_facet.is_intersected) { - LOG(INFO) << "used intersection"; + //LOG(INFO) << "used intersection"; std::vector> facet_counts; for (const auto & kv : a_facet.result_map) { @@ -1945,10 +1945,18 @@ Option Collection::search(std::string raw_query, } auto max_facets = std::min(max_facet_values, facet_counts.size()); - std::sort(facet_counts.begin(), facet_counts.end(), - [&](const auto& p1, const auto& p2) { - return std::tie(p1.second, p1.first) > std::tie(p2.second, p2.first); - }); + auto nthElement = max_facets == facet_counts.size() ? max_facets - 1 : max_facets; + + std::nth_element(facet_counts.begin(), facet_counts.begin() + nthElement, + facet_counts.end(), [&](const auto& kv1, const auto& kv2) { + size_t a_count = kv1.second; + size_t b_count = kv2.second; + + size_t a_value_size = UINT64_MAX - kv1.first.size(); + size_t b_value_size = UINT64_MAX - kv2.first.size(); + + return std::tie(a_count, a_value_size) > std::tie(b_count, b_value_size); + }); for(int i = 0; i < max_facets; ++i) { const auto& kv = facet_counts[i]; @@ -1956,7 +1964,7 @@ Option Collection::search(std::string raw_query, facet_values.emplace_back(facet_value); } } else { - LOG(INFO) << "used hashes"; + //LOG(INFO) << "used hashes"; std::vector> facet_hash_counts; for (const auto & kv : a_facet.result_map) { diff --git a/src/facet_index.cpp b/src/facet_index.cpp index ef3556a4..564394db 100644 --- a/src/facet_index.cpp +++ b/src/facet_index.cpp @@ -150,3 +150,51 @@ facet_index_t::~facet_index_t() { facet_field_map.clear(); } + //used for migrating string and int64 facets +size_t facet_index_t::get_facet_indexes(const std::string& field, + std::map>& seqid_index_map) { + + const auto& facet_field_it = facet_field_map.find(field); + if(facet_field_it == facet_field_map.end()) { + return 0; + } + + auto& facet_index_map = facet_field_it->second.facet_index_map; + + std::vector id_list; + + for(auto facet_index_map_it = facet_index_map.begin(); + facet_index_map_it != facet_index_map.end(); ++facet_index_map_it) { + + auto ids = facet_index_map_it->id_list_ptr; + ids_t::uncompress(ids, id_list); + + //emplacing seq_id=>count_index + for(const auto& id : id_list) { + seqid_index_map[id].emplace_back(facet_index_map_it->index); + } + + id_list.clear(); + } + + return seqid_index_map.size(); +} + + +bool facet_index_t::get_migrated (const std::string& field) const { + const auto it = facet_field_map.find(field); + if(it != facet_field_map.end()) { + return it->second.is_migrated; + } + + return false; +} + +void facet_index_t::set_migrated(const std::string& field, bool val) { + const auto it = facet_field_map.find(field); + if(it != facet_field_map.end()) { + it->second.is_migrated = val; + } + + return; +} diff --git a/src/index.cpp b/src/index.cpp index 926c4baf..41524890 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -37,6 +37,7 @@ search_cutoff = true; \ break;\ } +#define FACET_INDEX_THRESHOLD 10 spp::sparse_hash_map Index::text_match_sentinel_value; spp::sparse_hash_map Index::seq_id_sentinel_value; @@ -602,6 +603,75 @@ size_t Index::batch_memory_index(Index *index, std::vector& iter_b return num_indexed; } +void Index::migrate_facet_to_new_index(const std::string& field) { + + //LOG(INFO) << "migrating facet " << field << " to old index"; + + std::map> facets_indexes; + auto& facet_index = facet_index_v3[field]; + auto& single_facet_index = single_val_facet_index_v3[field]; + + //migrate string and int64 facets first + if(facet_index_v4->get_facet_indexes(field, facets_indexes)) { + + for(const auto& kv : facets_indexes) { + + const auto hash_count = kv.second.size(); + const auto seq_id = kv.first; + + if(hash_count > 1) { + auto& facet_dim_index = facet_index[seq_id % ARRAY_FACET_DIM]; + if(facet_dim_index == nullptr) { + LOG(ERROR) << "Error, facet index not initialized for field " << field; + } else { + facet_hash_values_t fhashvalues; + fhashvalues.hashes = std::move(kv.second); + facet_dim_index->emplace(seq_id, std::move(fhashvalues)); + } + } else { + auto& facet_dim_index = single_facet_index[seq_id % ARRAY_FACET_DIM]; + if(facet_dim_index == nullptr) { + LOG(ERROR) << "Error, facet index not initialized for field " << field; + } else { + facet_dim_index->emplace(seq_id, kv.second[0]); + } + } + } + } + + //now extract remaining facets from numerical index + facets_indexes.clear(); + auto numerical_index_it = numerical_index.find(field); + if(numerical_index_it != numerical_index_it) { + auto num_tree = numerical_index_it->second; + if(num_tree->get_facet_indexes(facets_indexes)) { + + for(const auto& kv : facets_indexes) { + const auto hash_count = kv.second.size(); + const auto seq_id = kv.first; + + if(hash_count > 1) { + auto& facet_dim_index = facet_index[seq_id % ARRAY_FACET_DIM]; + if(facet_dim_index == nullptr) { + LOG(ERROR) << "Error, facet index not initialized for field " << field; + } else { + facet_hash_values_t fhashvalues; + fhashvalues.hashes = std::move(kv.second); + facet_dim_index->emplace(seq_id, std::move(fhashvalues)); + } + } else { + auto& facet_dim_index = single_facet_index[seq_id % ARRAY_FACET_DIM]; + if(facet_dim_index == nullptr) { + LOG(ERROR) << "Error, facet index not initialized for field " << field; + } else { + facet_dim_index->emplace(seq_id, kv.second[0]); + } + } + } + } + } +} + void Index::index_field_in_memory(const field& afield, std::vector& iter_batch) { // indexes a given field of all documents in the batch @@ -635,6 +705,37 @@ void Index::index_field_in_memory(const field& afield, std::vector std::unordered_map> token_to_doc_offsets; int64_t max_score = INT64_MIN; + + auto facet_threshold_count = 0; + bool is_migrated = false; + if(afield.is_string()) { + facet_threshold_count = facet_index_v4->get_facet_count(afield.name); + is_migrated = facet_index_v4->get_migrated(afield.name); + } else { + auto numerical_index_it = numerical_index.find(afield.name); + if(numerical_index_it != numerical_index.end()) { + facet_threshold_count = numerical_index_it->second->counter_list_size(); + is_migrated = numerical_index_it->second->get_migrated(); + } + } + +#ifdef TEST_BUILD + facet_threshold_count = FACET_INDEX_THRESHOLD + 1; + is_migrated = true; +#endif + if(!is_migrated && (facet_threshold_count > FACET_INDEX_THRESHOLD)) { + migrate_facet_to_new_index(afield.name); + + if(afield.is_string()) { + facet_index_v4->set_migrated(afield.name, true); + } else { + auto numerical_index_it = numerical_index.find(afield.name); + if(numerical_index_it != numerical_index.end()) { + numerical_index_it->second->set_migrated(true); + } + } + } + for(const auto& record: iter_batch) { if(!record.indexed.ok()) { // some records could have been invalidated upstream @@ -654,6 +755,7 @@ void Index::index_field_in_memory(const field& afield, std::vector } std::string value; + uint32_t fhash = 0; if(afield.facet) { if(afield.is_array()) { @@ -661,41 +763,36 @@ void Index::index_field_in_memory(const field& afield, std::vector for(size_t i = 0; i < document[afield.name].size(); ++i) { if(afield.type == field_types::INT32_ARRAY) { int32_t raw_val = document[afield.name][i].get(); - uint32_t hash = reinterpret_cast(raw_val); - fhashvalues.hashes.emplace_back(hash); + fhash = reinterpret_cast(raw_val); } else if(afield.type == field_types::INT64_ARRAY) { int64_t raw_val = document[afield.name][i].get(); value = std::to_string(raw_val); - auto index = facet_index_v4->insert(afield.name, value, seq_id); - fhashvalues.hashes.emplace_back(index); + fhash = facet_index_v4->insert(afield.name, value, seq_id); } else if(afield.type == field_types::STRING_ARRAY) { value = document[afield.name][i]; - auto index = facet_index_v4->insert(afield.name, value, seq_id, true); - fhashvalues.hashes.emplace_back(index); + fhash = facet_index_v4->insert(afield.name, value, seq_id, true); } else if(afield.type == field_types::FLOAT_ARRAY) { float raw_val = document[afield.name][i].get(); - uint32_t hash = reinterpret_cast(raw_val); - fhashvalues.hashes.emplace_back(hash); + fhash = reinterpret_cast(raw_val); } else if(afield.type == field_types::BOOL_ARRAY) { bool raw_val = document[afield.name][i].get(); - uint32_t hash = (uint32_t)raw_val; - fhashvalues.hashes.emplace_back(hash); + fhash = (uint32_t)raw_val; + } + + if(facet_threshold_count > FACET_INDEX_THRESHOLD) { + fhashvalues.hashes.emplace_back(fhash); } } - fhashvalues.length = fhashvalues.hashes.size(); - //LOG(INFO) << "fhashvalues.length " << fhashvalues.length; - //fhashvalues.length = field_index_it->second.facet_hashes.size(); - //fhashvalues.hashes = new uint32_t[field_index_it->second.facet_hashes.size()]; - // for(size_t i = 0; i < field_index_it->second.facet_hashes.size(); i++) { - // fhashvalues.hashes[i] = field_index_it->second.facet_hashes[i]; - // } + if(facet_threshold_count > FACET_INDEX_THRESHOLD) { + fhashvalues.length = fhashvalues.hashes.size(); - auto& facet_dim_index = facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM]; - if(facet_dim_index == nullptr) { - LOG(ERROR) << "Error, facet index not initialized for field " << afield.name; - } else { - facet_dim_index->emplace(seq_id, std::move(fhashvalues)); + auto& facet_dim_index = facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM]; + if(facet_dim_index == nullptr) { + LOG(ERROR) << "Error, facet index not initialized for field " << afield.name; + } else { + facet_dim_index->emplace(seq_id, std::move(fhashvalues)); + } } } else { uint32_t fhash; @@ -721,14 +818,16 @@ void Index::index_field_in_memory(const field& afield, std::vector bool raw_val = document[afield.name].get(); fhash = (uint32_t)raw_val; } - //fhash = field_index_it->second.facet_hashes[0]; + + if(facet_threshold_count > FACET_INDEX_THRESHOLD) { - auto& facet_dim_index = single_val_facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM]; - if(facet_dim_index == nullptr) { - LOG(ERROR) << "Error, facet index not initialized for field " << afield.name; - } else { - facet_dim_index->emplace(seq_id, std::move(fhash)); - } + auto& facet_dim_index = single_val_facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM]; + if(facet_dim_index == nullptr) { + LOG(ERROR) << "Error, facet index not initialized for field " << afield.name; + } else { + facet_dim_index->emplace(seq_id, std::move(fhash)); + } + } } } @@ -1238,8 +1337,8 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, use_hashes = false; #endif - if(results_size && facet_records && ((facet_records <= 10 || is_wildcard_query) && - !use_facet_query && group_limit == 0) + if(results_size && facet_records && ((facet_records <= FACET_INDEX_THRESHOLD + || is_wildcard_query) && !use_facet_query && group_limit == 0) && !use_hashes || use_facet_intersection) { //LOG(INFO) << "Using intersection to find facets"; a_facet.is_intersected = true; diff --git a/src/num_tree.cpp b/src/num_tree.cpp index b7028065..feadf74f 100644 --- a/src/num_tree.cpp +++ b/src/num_tree.cpp @@ -401,3 +401,28 @@ size_t num_tree_t::intersect(const uint32_t* result_ids, int result_ids_len, int return found.size(); } + +size_t num_tree_t::get_facet_indexes(std::map>& facets) { + + //check if facet field + if(counter_list.empty()) { + return 0; + } + + std::vector id_list; + + for(auto int64map_it = int64map.begin(); int64map_it != int64map.end(); ++int64map_it) { + + auto ids = int64map_it->second; + ids_t::uncompress(ids, id_list); + + //emplacing seq_id=>count_index + for(const auto& id : id_list) { + facets[id].emplace_back(int64map_it->first); + } + + id_list.clear(); + } + + return facets.size(); +} diff --git a/test/collection_optimized_faceting_test.cpp b/test/collection_optimized_faceting_test.cpp index 972e1169..86501db4 100644 --- a/test/collection_optimized_faceting_test.cpp +++ b/test/collection_optimized_faceting_test.cpp @@ -562,8 +562,8 @@ TEST_F(CollectionOptimizedFacetingTest, FacetCountOnSimilarStrings) { ASSERT_EQ(2, results["hits"].size()); ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); - ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); - ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); collectionManager.drop_collection("coll1"); } @@ -1225,4 +1225,4 @@ TEST_F(CollectionOptimizedFacetingTest, StringLengthTest) { //string facet length is restricted to 100 ASSERT_TRUE(100 == longStr.size()); -} \ No newline at end of file +}