From df8f6849fe6f4eec8da028266117385bca7cf0e9 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Thu, 15 Dec 2022 14:50:09 +0530
Subject: [PATCH] Facet estimation for large result sets.

Adds two search parameters, `facet_sample_percent` and `facet_sample_threshold`.
When the result set is larger than the threshold and the sample percent is
below 100, facet counting only visits a sampled subset of the result ids
(drawn with a constant-seed RNG so that counts stay stable across identical
requests), scales the counts back up, and flags the affected facets with
`"sampled": true` in the response.
---
 include/collection.h                   |   4 +-
 include/field.h                        |   2 +
 include/index.h                        |  11 +-
 src/collection.cpp                     |  26 +--
 src/collection_manager.cpp             |  13 +-
 src/index.cpp                          |  56 ++++-
 test/collection_faceting_test.cpp      | 309 ++++++++++++++++++++++++-
 test/collection_schema_change_test.cpp |   2 +-
 test/collection_specific_test.cpp      | 236 ------------------
 9 files changed, 385 insertions(+), 274 deletions(-)

diff --git a/include/collection.h b/include/collection.h
index 72cdd76c..19d1fe9b 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -408,7 +408,9 @@ public:
                           const size_t facet_query_num_typos = 2,
                           const size_t filter_curated_hits_option = 2,
                           const bool prioritize_token_position = false,
-                          const std::string& vector_query_str = "") const;
+                          const std::string& vector_query_str = "",
+                          const size_t facet_sample_percent = 100,
+                          const size_t facet_sample_threshold = 0) const;
 
     Option<bool> get_filter_ids(const std::string & simple_filter_query, std::vector<std::pair<size_t, uint32_t*>>& index_ids);
 
diff --git a/include/field.h b/include/field.h
index c3d284ab..d0ccbe80 100644
--- a/include/field.h
+++ b/include/field.h
@@ -682,6 +682,8 @@ struct facet {
 
     bool is_range_query;
 
+    bool sampled = false;
+
     bool get_range(int64_t key, std::pair<int64_t, std::string>& range_pair) {
 
         if(facet_range_map.empty())
 
diff --git a/include/index.h b/include/index.h
index 95a8a4ff..035998e8 100644
--- a/include/index.h
+++ b/include/index.h
@@ -132,6 +132,8 @@ struct search_args {
     std::vector<std::vector<KV*>> override_result_kvs;
 
     vector_query_t& vector_query;
+    size_t facet_sample_percent;
+    size_t facet_sample_threshold;
 
     search_args(std::vector<query_tokens_t> field_query_tokens, std::vector<search_field_t> search_fields,
                 filter_node_t* filter_tree_root, std::vector<facet>& facets,
@@ -145,7 +147,8 @@
                 size_t concurrency, size_t search_cutoff_ms, size_t min_len_1typo, size_t min_len_2typo,
                 size_t max_candidates, const std::vector<enable_t>& infixes, const size_t max_extra_prefix,
                 const size_t max_extra_suffix, const size_t facet_query_num_typos,
-                const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query) :
+                const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query,
+                size_t facet_sample_percent, size_t facet_sample_threshold) :
             field_query_tokens(field_query_tokens), search_fields(search_fields),
             filter_tree_root(filter_tree_root), facets(facets), included_ids(included_ids),
             excluded_ids(excluded_ids), sort_fields_std(sort_fields_std),
@@ -159,7 +162,8 @@
             min_len_1typo(min_len_1typo), min_len_2typo(min_len_2typo), max_candidates(max_candidates),
             infixes(infixes), max_extra_prefix(max_extra_prefix), max_extra_suffix(max_extra_suffix),
             facet_query_num_typos(facet_query_num_typos), filter_curated_hits(filter_curated_hits),
-            split_join_tokens(split_join_tokens), vector_query(vector_query) {
+            split_join_tokens(split_join_tokens), vector_query(vector_query),
+            facet_sample_percent(facet_sample_percent), facet_sample_threshold(facet_sample_threshold) {
 
         const size_t topster_size = std::max((size_t)1, max_hits);  // needs to be atleast 1 since scoring is mandatory
         topster = new Topster(topster_size, group_limit);
@@ -357,6 +361,7 @@ private:
     void log_leaves(int cost, const std::string &token, const std::vector<art_leaf*> &leaves) const;
 
     void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
+                   bool estimate_facets, size_t facet_sample_percent,
                   const std::vector<facet_info_t>& facet_infos,
                   size_t group_limit, const std::vector<std::string>& group_by_fields,
                   const uint32_t* result_ids, size_t results_size) const;
 
@@ -645,7 +650,7 @@ public:
                 size_t max_candidates, const std::vector<enable_t>& infixes,
                 const size_t max_extra_prefix, const size_t max_extra_suffix,
                 const size_t facet_query_num_typos, const bool filter_curated_hits, enable_t split_join_tokens,
-                const vector_query_t& vector_query) const;
+                const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold) const;
 
     void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name);
 
diff --git a/src/collection.cpp b/src/collection.cpp
index ffacd8ca..d2c697db 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -866,7 +866,9 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
                                           const size_t facet_query_num_typos,
                                           const size_t filter_curated_hits_option,
                                           const bool prioritize_token_position,
-                                          const std::string& vector_query_str) const {
+                                          const std::string& vector_query_str,
+                                          const size_t facet_sample_percent,
+                                          const size_t facet_sample_threshold) const {
 
     std::shared_lock lock(mutex);
 
@@ -911,6 +913,10 @@
         }
     }
 
+    if(facet_sample_percent > 100) {
+        return Option<nlohmann::json>(400, "Value of `facet_sample_percent` must be less than or equal to 100.");
+    }
+
     if(raw_group_by_fields.empty()) {
         group_limit = 0;
     }
@@ -1302,7 +1308,8 @@
                                           search_stop_millis,
                                           min_len_1typo, min_len_2typo, max_candidates, infixes,
                                           max_extra_prefix, max_extra_suffix, facet_query_num_typos,
-                                          filter_curated_hits, split_join_tokens, vector_query);
+                                          filter_curated_hits, split_join_tokens, vector_query,
+                                          facet_sample_percent, facet_sample_threshold);
 
     index->run_search(search_params);
 
@@ -1319,12 +1326,6 @@
     // for grouping we have to aggregate group set sizes to a count value
     if(group_limit) {
-        for(auto& acc_facet: facets) {
-            for(auto& facet_kv: acc_facet.result_map) {
-                facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
-            }
-        }
-
         total_found = search_params->groups_processed.size() + override_result_kvs.size();
     } else {
         total_found = search_params->all_result_ids_len;
@@ -1430,8 +1431,6 @@
     // handle which fields have to be highlighted
 
     std::vector<highlight_field_t> highlight_items;
-    bool has_atleast_one_fully_highlighted_field = false;
-
     std::vector<std::string> highlight_field_names;
     StringUtils::split(highlight_fields, highlight_field_names, ",");
@@ -1442,12 +1441,6 @@
         process_highlight_fields(weighted_search_fields, raw_search_fields,
                                  include_fields_full, exclude_fields_full,
                                  highlight_field_names, highlight_full_field_names,
                                  infixes, q_tokens, search_params->qtoken_set, highlight_items);
-
-        for(auto& highlight_item: highlight_items) {
-            if(highlight_item.fully_highlighted) {
-                has_atleast_one_fully_highlighted_field = true;
-            }
-        }
     }
 
     nlohmann::json result = nlohmann::json::object();
@@ -1657,6 +1650,7 @@
     for(facet & a_facet: facets) {
         nlohmann::json facet_result = nlohmann::json::object();
         facet_result["field_name"] = a_facet.field_name;
+        facet_result["sampled"] = a_facet.sampled;
         facet_result["counts"] = nlohmann::json::array();
 
         std::vector<facet_value_t> facet_values;
 
diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp
index 2633d39e..e04ce8f9 100644
--- a/src/collection_manager.cpp
+++ b/src/collection_manager.cpp
@@ -695,6 +695,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
     const char *EXHAUSTIVE_SEARCH = "exhaustive_search";
     const char *SPLIT_JOIN_TOKENS = "split_join_tokens";
 
+    const char *FACET_SAMPLE_PERCENT = "facet_sample_percent";
+    const char *FACET_SAMPLE_THRESHOLD = "facet_sample_threshold";
+
     // enrich params with values from embedded params
     for(auto& item: embedded_params.items()) {
         if(item.key() == "expires_at") {
@@ -720,7 +723,6 @@
     // end check for mandatory params
 
-
     const std::string& raw_query = req_params[QUERY];
     std::vector<uint32_t> num_typos = {2};
     size_t min_len_1typo = 4;
@@ -772,6 +774,9 @@
     size_t max_extra_prefix = INT16_MAX;
     size_t max_extra_suffix = INT16_MAX;
 
+    size_t facet_sample_percent = 100;
+    size_t facet_sample_threshold = 0;
+
     std::unordered_map<std::string, size_t*> unsigned_int_values = {
         {MIN_LEN_1TYPO, &min_len_1typo},
         {MIN_LEN_2TYPO, &min_len_2typo},
@@ -790,6 +795,8 @@
         {MAX_CANDIDATES, &max_candidates},
         {FACET_QUERY_NUM_TYPOS, &facet_query_num_typos},
         {FILTER_CURATED_HITS, &filter_curated_hits_option},
+        {FACET_SAMPLE_PERCENT, &facet_sample_percent},
+        {FACET_SAMPLE_THRESHOLD, &facet_sample_threshold},
     };
 
     std::unordered_map<std::string, std::string*> str_values = {
@@ -982,7 +989,9 @@
                                           facet_query_num_typos,
                                           filter_curated_hits_option,
                                           prioritize_token_position,
-                                          vector_query
+                                          vector_query,
+                                          facet_sample_percent,
+                                          facet_sample_threshold
     );
 
     uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
 
diff --git a/src/index.cpp b/src/index.cpp
index f5f7a2d1..f85b55d3 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include <random>
 #include
 #include
 #include
@@ -1228,6 +1229,7 @@ void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::s
 }
 
 void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
+                      bool estimate_facets, size_t facet_sample_percent,
                       const std::vector<facet_info_t>& facet_infos,
                       const size_t group_limit, const std::vector<std::string>& group_by_fields,
                       const uint32_t* result_ids, size_t results_size) const {
@@ -1247,8 +1249,21 @@
 
     const auto& field_facet_mapping = field_facet_mapping_it->second;
 
+    // used for sampling facets (if enabled)
+    std::mt19937 gen(137723);  // use constant seed to make sure that counts don't jump around
+    std::uniform_int_distribution<> distr(1, 100);  // 1 to 100 inclusive
+
     for(size_t i = 0; i < results_size; i++) {
         uint32_t doc_seq_id = result_ids[i];
+
+        // if sampling is enabled, we will skip a portion of the results to speed things up
+        if(estimate_facets) {
+            size_t num = distr(gen);
+            if(num > facet_sample_percent) {
+                continue;
+            }
+        }
+
         const auto& facet_hashes_it = field_facet_mapping[doc_seq_id % ARRAY_FACET_DIM]->find(doc_seq_id);
 
         if(facet_hashes_it == field_facet_mapping[doc_seq_id % ARRAY_FACET_DIM]->end()) {
@@ -1265,7 +1280,7 @@
                 compute_facet_stats(a_facet, fhash, facet_field.type);
             }
 
-            if(a_facet.is_range_query){
+            if(a_facet.is_range_query) {
                 auto sort_index_it = sort_index.find(a_facet.field_name);
 
                 if(sort_index_it != sort_index.end()){
@@ -1285,8 +1300,7 @@
                         }
                     }
                 }
-            }
-            else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
+            } else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
                 facet_count_t& facet_count = a_facet.result_map[fhash];
 
                 //LOG(INFO) << "field: " << a_facet.field_name << ", doc id: " << doc_seq_id << ", hash: " << fhash;
@@ -1980,7 +1994,9 @@ void Index::run_search(search_args* search_params) {
                    search_params->facet_query_num_typos,
                    search_params->filter_curated_hits,
                    search_params->split_join_tokens,
-                   search_params->vector_query);
+                   search_params->vector_query,
+                   search_params->facet_sample_percent,
+                   search_params->facet_sample_threshold);
 }
 
 void Index::collate_included_ids(const std::vector<std::string>& q_included_tokens,
@@ -2430,7 +2446,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
                    size_t max_candidates, const std::vector<enable_t>& infixes,
                    const size_t max_extra_prefix, const size_t max_extra_suffix,
                    const size_t facet_query_num_typos, const bool filter_curated_hits, const enable_t split_join_tokens,
-                   const vector_query_t& vector_query) const {
+                   const vector_query_t& vector_query,
+                   size_t facet_sample_percent, size_t facet_sample_threshold) const {
 
     // process the filters
 
@@ -2784,6 +2801,8 @@
     delete [] exclude_token_ids;
     delete [] excluded_result_ids;
 
+    bool estimate_facets = (facet_sample_percent < 100 && all_result_ids_len > facet_sample_threshold);
+
     if(!facets.empty()) {
         const size_t num_threads = std::min(concurrency, all_result_ids_len);
         const size_t window_size = (num_threads == 0) ? 0 :
@@ -2820,9 +2839,11 @@
             thread_pool->enqueue([this, thread_id, &facet_batches, &facet_query, group_limit, group_by_fields,
                                          batch_result_ids, batch_res_len, &facet_infos,
+                                         estimate_facets, facet_sample_percent,
                                          &num_processed, &m_process, &cv_process]() {
                 auto fq = facet_query;
-                do_facets(facet_batches[thread_id], fq, facet_infos, group_limit, group_by_fields,
+                do_facets(facet_batches[thread_id], fq, estimate_facets, facet_sample_percent,
+                          facet_infos, group_limit, group_by_fields,
                           batch_result_ids, batch_res_len);
                 std::unique_lock<std::mutex> lock(m_process);
                 num_processed++;
@@ -2844,8 +2865,8 @@
                     if(group_limit) {
                         // we have to add all group sets
                         acc_facet.hash_groups[facet_kv.first].insert(
-                            this_facet.hash_groups[facet_kv.first].begin(),
-                            this_facet.hash_groups[facet_kv.first].end()
+                                this_facet.hash_groups[facet_kv.first].begin(),
+                                this_facet.hash_groups[facet_kv.first].end()
                         );
                     } else {
                         size_t count = 0;
@@ -2872,6 +2893,22 @@
             }
         }
     }
 
+    for(auto & acc_facet: facets) {
+        for(auto& facet_kv: acc_facet.result_map) {
+            if(group_limit) {
+                facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
+            }
+
+            if(estimate_facets) {
+                facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent));
+            }
+        }
+
+        if(estimate_facets) {
+            acc_facet.sampled = true;
+        }
+    }
+
     /*long long int timeMillisF = std::chrono::duration_cast<std::chrono::milliseconds>(
             std::chrono::high_resolution_clock::now() - beginF).count();
     LOG(INFO) << "Time for faceting: " << timeMillisF;*/
@@ -2880,7 +2917,8 @@
         std::vector<facet_info_t> facet_infos(facets.size());
         compute_facet_infos(facets, facet_query, facet_query_num_typos,
                             &included_ids_vec[0], included_ids_vec.size(), group_by_fields, max_candidates, facet_infos);
-        do_facets(facets, facet_query, facet_infos, group_limit, group_by_fields, &included_ids_vec[0], included_ids_vec.size());
+        do_facets(facets, facet_query, estimate_facets, facet_sample_percent,
+                  facet_infos, group_limit, group_by_fields, &included_ids_vec[0], included_ids_vec.size());
 
         all_result_ids_len += curated_topster->size;
 
diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp
index 688a0512..8b0f9717 100644
--- a/test/collection_faceting_test.cpp
+++ b/test/collection_faceting_test.cpp
@@ -73,8 +73,9 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
     ASSERT_EQ(5, results["hits"].size());
     ASSERT_EQ(1, results["facet_counts"].size());
-    ASSERT_EQ(3, results["facet_counts"][0].size());
+    ASSERT_EQ(4, results["facet_counts"][0].size());
     ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]);
+    ASSERT_EQ(false, results["facet_counts"][0]["sampled"].get<bool>());
     ASSERT_EQ(4, results["facet_counts"][0]["counts"].size());
     ASSERT_EQ(1, results["facet_counts"][0]["stats"].size());
     ASSERT_EQ(4, results["facet_counts"][0]["stats"]["total_values"].get<size_t>());
@@ -981,7 +982,6 @@ TEST_F(CollectionFacetingTest, FacetByNestedIntField) {
 }
 
 TEST_F(CollectionFacetingTest, FacetParseTest){
-
     std::vector<field> fields = {
         field("score", field_types::INT32, true),
         field("grade", field_types::INT32, true),
@@ -1008,8 +1008,6 @@
     ASSERT_TRUE(range_facets[1].is_range_query);
     ASSERT_GT(range_facets[1].facet_range_map.size(), 0);
 
-
-
     std::vector<std::string> normal_facet_fields {
         "score",
         "grade"
@@ -1022,9 +1020,7 @@
     ASSERT_STREQ("score", normal_facets[0].field_name.c_str());
     ASSERT_STREQ("grade", normal_facets[1].field_name.c_str());
 
-
-
     std::vector<std::string> mixed_facet_fields {
         "score",
         "grade(A:[80, 100], B:[60, 80], C:[40, 60])",
@@ -1044,3 +1040,304 @@
     ASSERT_STREQ("rank", mixed_facets[2].field_name.c_str());
 }
+
+
+TEST_F(CollectionFacetingTest, RangeFacetTest) {
+    std::vector<field> fields = {field("place", field_types::STRING, false),
+                                 field("state", field_types::STRING, false),
+                                 field("visitors", field_types::INT32, true),};
+    Collection* coll1 = collectionManager.create_collection(
+            "coll1", 1, fields, "", 0, "", {}, {}
+    ).get();
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["place"] = "Mysore Palace";
+    doc1["state"] = "Karnataka";
+    doc1["visitors"] = 235486;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["place"] = "Hampi";
+    doc2["state"] = "Karnataka";
+    doc2["visitors"] = 187654;
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["place"] = "Mahabalipuram";
+    doc3["state"] = "TamilNadu";
+    doc3["visitors"] = 174684;
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["place"] = "Meenakshi Amman Temple";
+    doc4["state"] = "TamilNadu";
+    doc4["visitors"] = 246676;
+
+    nlohmann::json doc5;
+    doc5["id"] = "4";
+    doc5["place"] = "Statue of Unity";
+    doc5["state"] = "Gujarat";
+    doc5["visitors"] = 345878;
+
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
+
+    auto results = coll1->search("Karnataka", {"state"},
+                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
+                                 {}, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                 "", "", {}, 1000,
+                                 true, false, true, "", true).get();
+    ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
+    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
+    ASSERT_STREQ("Busy", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+
+    auto results2 = coll1->search("Gujarat", {"state"},
+                                  "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true).get();
+    ASSERT_EQ(1, results2["facet_counts"][0]["counts"].size());
+    ASSERT_EQ(1, results2["facet_counts"][0]["counts"][0]["count"].get<size_t>());
+    ASSERT_STREQ("VeryBusy", results2["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_TRUE(results2["facet_counts"][0]["counts"][1]["value"] == nullptr);
+
+    collectionManager.drop_collection("coll1");
+}
+
+TEST_F(CollectionFacetingTest, RangeFacetContinuity) {
+    std::vector<field> fields = {field("place", field_types::STRING, false),
+                                 field("state", field_types::STRING, false),
+                                 field("visitors", field_types::INT32, true),};
+    Collection* coll1 = collectionManager.create_collection(
+            "coll1", 1, fields, "", 0, "", {}, {}
+    ).get();
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["place"] = "Mysore Palace";
+    doc1["state"] = "Karnataka";
+    doc1["visitors"] = 235486;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["place"] = "Hampi";
+    doc2["state"] = "Karnataka";
+    doc2["visitors"] = 187654;
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["place"] = "Mahabalipuram";
+    doc3["state"] = "TamilNadu";
+    doc3["visitors"] = 174684;
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["place"] = "Meenakshi Amman Temple";
+    doc4["state"] = "TamilNadu";
+    doc4["visitors"] = 246676;
+
+    nlohmann::json doc5;
+    doc5["id"] = "4";
+    doc5["place"] = "Statue of Unity";
+    doc5["state"] = "Gujarat";
+    doc5["visitors"] = 345878;
+
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
+
+    auto results = coll1->search("TamilNadu", {"state"},
+                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200001, 500000])"},
+                                 {}, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                 "", "", {}, 1000,
+                                 true, false, true, "", true);
+    ASSERT_STREQ("Ranges in range facet syntax should be continous.", results.error().c_str());
+
+    auto results2 = coll1->search("TamilNadu", {"state"},
+                                  "", {"visitors(Busy:[0, 200000], VeryBusy:[199999, 500000])"},
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true);
+    ASSERT_STREQ("Ranges in range facet syntax should be continous.", results2.error().c_str());
+
+    collectionManager.drop_collection("coll1");
+}
+
+TEST_F(CollectionFacetingTest, RangeFacetTypo) {
+    std::vector<field> fields = {field("place", field_types::STRING, false),
+                                 field("state", field_types::STRING, false),
+                                 field("visitors", field_types::INT32, true),};
+    Collection* coll1 = collectionManager.create_collection(
+            "coll1", 1, fields, "", 0, "", {}, {}
+    ).get();
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["place"] = "Mysore Palace";
+    doc1["state"] = "Karnataka";
+    doc1["visitors"] = 235486;
+
+    nlohmann::json doc2;
+    doc2["id"] = "1";
+    doc2["place"] = "Hampi";
+    doc2["state"] = "Karnataka";
+    doc2["visitors"] = 187654;
+
+    nlohmann::json doc3;
+    doc3["id"] = "2";
+    doc3["place"] = "Mahabalipuram";
+    doc3["state"] = "TamilNadu";
+    doc3["visitors"] = 174684;
+
+    nlohmann::json doc4;
+    doc4["id"] = "3";
+    doc4["place"] = "Meenakshi Amman Temple";
+    doc4["state"] = "TamilNadu";
+    doc4["visitors"] = 246676;
+
+    nlohmann::json doc5;
+    doc5["id"] = "4";
+    doc5["place"] = "Statue of Unity";
+    doc5["state"] = "Gujarat";
+    doc5["visitors"] = 345878;
+
+
+    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
+    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
+
+    auto results = coll1->search("TamilNadu", {"state"},
+                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000)"}, //missing ']' at end
+                                 {}, {2}, 10,
+                                 1, FREQUENCY, {true},
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                 "", "", {}, 1000,
+                                 true, false, true, "", true);
+    ASSERT_STREQ("Error splitting the range string.", results.error().c_str());
+
+    auto results2 = coll1->search("TamilNadu", {"state"},
+                                  "", {"visitors(Busy:[0, 200000], VeryBusy:200000, 500000])"}, //missing '[' in second range
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true);
+    ASSERT_STREQ("Error splitting the range string.", results2.error().c_str());
+
+    auto results3 = coll1->search("TamilNadu", {"state"},
+                                  "", {"visitors(Busy:[0, 200000] VeryBusy:[200000, 500000])"}, //missing ',' between ranges
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true);
+    ASSERT_STREQ("Error splitting the range string.", results3.error().c_str());
+
+    auto results4 = coll1->search("TamilNadu", {"state"},
+                                  "", {"visitors(Busy:[0 200000], VeryBusy:[200000, 500000])"}, //missing ',' between first range's values
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true);
+    ASSERT_STREQ("Range String range pattern not matched.", results4.error().c_str());
+
+    auto results5 = coll1->search("TamilNadu", {"state"},
+                                  "", {"visitors(Busy:[0, 200000 VeryBusy:200000, 500000])"}, //missing '],' and '['
+                                  {}, {2}, 10,
+                                  1, FREQUENCY, {true},
+                                  10, spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                  "", "", {}, 1000,
+                                  true, false, true, "", true);
+    ASSERT_STREQ("Range String range pattern not matched.", results5.error().c_str());
+
+    collectionManager.drop_collection("coll1");
+}
+
+TEST_F(CollectionFacetingTest, SampleFacetCounts) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "color", "type": "string", "facet": true}
+        ]
+    })"_json;
+
+    Collection* coll1 = collectionManager.create_collection(schema).get();
+
+    for(size_t i = 0; i < 1000; i++) {
+        nlohmann::json doc;
+        if(i % 2 == 0) {
+            doc["color"] = "blue";
+        } else {
+            doc["color"] = "red";
+        }
+
+        ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    }
+
+    auto res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
+                             spp::sparse_hash_set<std::string>(),
+                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
+                             "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                             4, {off}, 3, 3, 2, 2, false, "", 10, 0).get();
+
+    ASSERT_EQ(1000, res["found"].get<size_t>());
+    ASSERT_EQ(1, res["facet_counts"].size());
+    ASSERT_EQ(2, res["facet_counts"][0]["counts"].size());
+
+    // verify approximate counts
+    ASSERT_GE(res["facet_counts"][0]["counts"][0]["count"].get<size_t>(), 250);
+    ASSERT_GE(res["facet_counts"][0]["counts"][1]["count"].get<size_t>(), 250);
+    ASSERT_TRUE(res["facet_counts"][0]["sampled"].get<bool>());
+
+    // when sample threshold is high, don't estimate
+    res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
+                        spp::sparse_hash_set<std::string>(),
+                        spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
+                        "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                        4, {off}, 3, 3, 2, 2, false, "", 10, 10000).get();
+
+    ASSERT_EQ(1000, res["found"].get<size_t>());
+    ASSERT_EQ(1, res["facet_counts"].size());
+    ASSERT_EQ(2, res["facet_counts"][0]["counts"].size());
+
+    // verify exact counts
+    ASSERT_EQ(500, res["facet_counts"][0]["counts"][0]["count"].get<size_t>());
+    ASSERT_EQ(500, res["facet_counts"][0]["counts"][1]["count"].get<size_t>());
+    ASSERT_FALSE(res["facet_counts"][0]["sampled"].get<bool>());
+
+    // test for sample percent > 100
+
+    auto res_op = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
+                                spp::sparse_hash_set<std::string>(),
+                                spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
+                                "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                                4, {off}, 3, 3, 2, 2, false, "", 200, 0);
+
+    ASSERT_FALSE(res_op.ok());
+    ASSERT_EQ("Value of `facet_sample_percent` must be less than or equal to 100.", res_op.error());
+}
diff --git a/test/collection_schema_change_test.cpp b/test/collection_schema_change_test.cpp
index 4555766a..c32ba142 100644
--- a/test/collection_schema_change_test.cpp
+++ b/test/collection_schema_change_test.cpp
@@ -583,7 +583,7 @@ TEST_F(CollectionSchemaChangeTest, AbilityToDropAndReAddIndexAtTheSameTime) {
     ASSERT_EQ(1, res["found"].get<size_t>());
     ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
     ASSERT_EQ(1, res["facet_counts"].size());
-    ASSERT_EQ(3, res["facet_counts"][0].size());
+    ASSERT_EQ(4, res["facet_counts"][0].size());
     ASSERT_EQ("title", res["facet_counts"][0]["field_name"]);
     ASSERT_EQ(1, res["facet_counts"][0]["counts"].size());
     ASSERT_EQ("123", res["facet_counts"][0]["counts"][0]["value"].get<std::string>());
diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp
index 95ae5039..e051783e 100644
--- a/test/collection_specific_test.cpp
+++ b/test/collection_specific_test.cpp
@@ -2918,239 +2918,3 @@ TEST_F(CollectionSpecificTest, DontHighlightPunctuation) {
 
     collectionManager.drop_collection("coll1");
 }
-
-TEST_F(CollectionSpecificTest, RangeFacetTest) {
-    std::vector<field> fields = {field("place", field_types::STRING, false),
-                                 field("state", field_types::STRING, false),
-                                 field("visitors", field_types::INT32, true),};
-    Collection* coll1 = collectionManager.create_collection(
-            "coll1", 1, fields, "", 0, "", {}, {}
-    ).get();
-    nlohmann::json doc1;
-    doc1["id"] = "0";
-    doc1["place"] = "Mysore Palace";
-    doc1["state"] = "Karnataka";
-    doc1["visitors"] = 235486;
-
-    nlohmann::json doc2;
-    doc2["id"] = "1";
-    doc2["place"] = "Hampi";
-    doc2["state"] = "Karnataka";
-    doc2["visitors"] = 187654;
-
-    nlohmann::json doc3;
-    doc3["id"] = "2";
-    doc3["place"] = "Mahabalipuram";
-    doc3["state"] = "TamilNadu";
-    doc3["visitors"] = 174684;
-
-    nlohmann::json doc4;
-    doc4["id"] = "3";
-    doc4["place"] = "Meenakshi Amman Temple";
-    doc4["state"] = "TamilNadu";
-    doc4["visitors"] = 246676;
-
-    nlohmann::json doc5;
-    doc5["id"] = "4";
-    doc5["place"] = "Staue of Unity";
-    doc5["state"] = "Gujarat";
-    doc5["visitors"] = 345878;
-
-
-    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
-
-    auto results = coll1->search("Karnataka", {"state"},
-                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
-                                 {}, {2}, 10,
-                                 1, FREQUENCY, {true},
-                                 10, spp::sparse_hash_set<std::string>(),
-                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                 "", "", {}, 1000,
-                                 true, false, true, "", true).get();
-    ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
-    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
-    ASSERT_STREQ("Busy", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
-
-    auto results2 = coll1->search("Gujarat", {"state"},
-                                  "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true).get();
-    ASSERT_EQ(1, results2["facet_counts"][0]["counts"].size());
-    ASSERT_EQ(1, results2["facet_counts"][0]["counts"][0]["count"].get<size_t>());
-    ASSERT_STREQ("VeryBusy", results2["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
-    ASSERT_TRUE(results2["facet_counts"][0]["counts"][1]["value"] == nullptr);
-
-    collectionManager.drop_collection("coll1");
-}
-
-TEST_F(CollectionSpecificTest, RangeFacetContinuity) {
-    std::vector<field> fields = {field("place", field_types::STRING, false),
-                                 field("state", field_types::STRING, false),
-                                 field("visitors", field_types::INT32, true),};
-    Collection* coll1 = collectionManager.create_collection(
-            "coll1", 1, fields, "", 0, "", {}, {}
-    ).get();
-    nlohmann::json doc1;
-    doc1["id"] = "0";
-    doc1["place"] = "Mysore Palace";
-    doc1["state"] = "Karnataka";
-    doc1["visitors"] = 235486;
-
-    nlohmann::json doc2;
-    doc2["id"] = "1";
-    doc2["place"] = "Hampi";
-    doc2["state"] = "Karnataka";
-    doc2["visitors"] = 187654;
-
-    nlohmann::json doc3;
-    doc3["id"] = "2";
-    doc3["place"] = "Mahabalipuram";
-    doc3["state"] = "TamilNadu";
-    doc3["visitors"] = 174684;
-
-    nlohmann::json doc4;
-    doc4["id"] = "3";
-    doc4["place"] = "Meenakshi Amman Temple";
-    doc4["state"] = "TamilNadu";
-    doc4["visitors"] = 246676;
-
-    nlohmann::json doc5;
-    doc5["id"] = "4";
-    doc5["place"] = "Staue of Unity";
-    doc5["state"] = "Gujarat";
-    doc5["visitors"] = 345878;
-
-
-    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
-
-    auto results = coll1->search("TamilNadu", {"state"},
-                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200001, 500000])"},
-                                 {}, {2}, 10,
-                                 1, FREQUENCY, {true},
-                                 10, spp::sparse_hash_set<std::string>(),
-                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                 "", "", {}, 1000,
-                                 true, false, true, "", true);
-    ASSERT_STREQ("Ranges in range facet syntax should be continous.", results.error().c_str());
-
-    auto results2 = coll1->search("TamilNadu", {"state"},
-                                  "", {"visitors(Busy:[0, 200000], VeryBusy:[199999, 500000])"},
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true);
-    ASSERT_STREQ("Ranges in range facet syntax should be continous.", results2.error().c_str());
-
-    collectionManager.drop_collection("coll1");
-}
-
-TEST_F(CollectionSpecificTest, RangeFacetTypo) {
-    std::vector<field> fields = {field("place", field_types::STRING, false),
-                                 field("state", field_types::STRING, false),
-                                 field("visitors", field_types::INT32, true),};
-    Collection* coll1 = collectionManager.create_collection(
-            "coll1", 1, fields, "", 0, "", {}, {}
-    ).get();
-    nlohmann::json doc1;
-    doc1["id"] = "0";
-    doc1["place"] = "Mysore Palace";
-    doc1["state"] = "Karnataka";
-    doc1["visitors"] = 235486;
-
-    nlohmann::json doc2;
-    doc2["id"] = "1";
-    doc2["place"] = "Hampi";
-    doc2["state"] = "Karnataka";
-    doc2["visitors"] = 187654;
-
-    nlohmann::json doc3;
-    doc3["id"] = "2";
-    doc3["place"] = "Mahabalipuram";
-    doc3["state"] = "TamilNadu";
-    doc3["visitors"] = 174684;
-
-    nlohmann::json doc4;
-    doc4["id"] = "3";
-    doc4["place"] = "Meenakshi Amman Temple";
-    doc4["state"] = "TamilNadu";
-    doc4["visitors"] = 246676;
-
-    nlohmann::json doc5;
-    doc5["id"] = "4";
-    doc5["place"] = "Staue of Unity";
-    doc5["state"] = "Gujarat";
-    doc5["visitors"] = 345878;
-
-
-    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc2.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc3.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc4.dump()).ok());
-    ASSERT_TRUE(coll1->add(doc5.dump()).ok());
-
-    auto results = coll1->search("TamilNadu", {"state"},
-                                 "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000)"}, //missing ']' at end
-                                 {}, {2}, 10,
-                                 1, FREQUENCY, {true},
-                                 10, spp::sparse_hash_set<std::string>(),
-                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                 "", "", {}, 1000,
-                                 true, false, true, "", true);
-    ASSERT_STREQ("Error splitting the range string.", results.error().c_str());
-
-    auto results2 = coll1->search("TamilNadu", {"state"},
-                                  "", {"visitors(Busy:[0, 200000], VeryBusy:200000, 500000])"}, //missing '[' in second range
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true);
-    ASSERT_STREQ("Error splitting the range string.", results2.error().c_str());
-
-    auto results3 = coll1->search("TamilNadu", {"state"},
-                                  "", {"visitors(Busy:[0, 200000] VeryBusy:[200000, 500000])"}, //missing ',' between ranges
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true);
-    ASSERT_STREQ("Error splitting the range string.", results3.error().c_str());
-
-    auto results4 = coll1->search("TamilNadu", {"state"},
-                                  "", {"visitors(Busy:[0 200000], VeryBusy:[200000, 500000])"}, //missing ',' between first range's values
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true);
-    ASSERT_STREQ("Range String range pattern not matched.", results4.error().c_str());
-
-    auto results5 = coll1->search("TamilNadu", {"state"},
-                                  "", {"visitors(Busy:[0, 200000 VeryBusy:200000, 500000])"}, //missing '],' and '['
-                                  {}, {2}, 10,
-                                  1, FREQUENCY, {true},
-                                  10, spp::sparse_hash_set<std::string>(),
-                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
-                                  "", "", {}, 1000,
-                                  true, false, true, "", true);
-    ASSERT_STREQ("Range String range pattern not matched.", results5.error().c_str());
-
-    collectionManager.drop_collection("coll1");
-}
\ No newline at end of file
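
The sampling scheme in this patch is easy to study in isolation. The following standalone sketch (illustrative only, not Typesense code; the alternating document set and the SAMPLE_PERCENT / SAMPLE_THRESHOLD constants are made-up stand-ins for the `facet_sample_percent` and `facet_sample_threshold` request parameters) mirrors what the patch does in Index::search and Index::do_facets: decide whether to estimate, skip unsampled result ids using a constant-seed RNG, then scale the surviving counts back up by 100 / sample-percent:

    #include <iostream>
    #include <map>
    #include <random>
    #include <string>
    #include <vector>

    int main() {
        // Hypothetical result set: 1000 docs alternating between two facet
        // values, mirroring the SampleFacetCounts test above.
        std::vector<std::string> facet_values(1000);
        for(size_t i = 0; i < facet_values.size(); i++) {
            facet_values[i] = (i % 2 == 0) ? "blue" : "red";
        }

        const size_t SAMPLE_PERCENT = 10;   // stand-in for facet_sample_percent=10
        const size_t SAMPLE_THRESHOLD = 0;  // stand-in for facet_sample_threshold=0

        // Sampling kicks in only when the result set exceeds the threshold,
        // exactly like the estimate_facets flag in Index::search.
        bool estimate = (SAMPLE_PERCENT < 100 && facet_values.size() > SAMPLE_THRESHOLD);

        // Constant seed, as in the patch, so repeated identical searches
        // sample the same documents and report stable counts.
        std::mt19937 gen(137723);
        std::uniform_int_distribution<> distr(1, 100);  // 1 to 100 inclusive

        std::map<std::string, size_t> counts;
        for(const auto& value : facet_values) {
            if(estimate && size_t(distr(gen)) > SAMPLE_PERCENT) {
                continue;  // skip ~90% of docs when sampling at 10%
            }
            counts[value]++;
        }

        for(auto& [value, count] : counts) {
            if(estimate) {
                // Scale the sampled count back up to approximate the true count.
                count = size_t(double(count) * (100.0 / SAMPLE_PERCENT));
            }
            std::cout << value << ": ~" << count << "\n";  // roughly 500 each
        }
        return 0;
    }

With a 10 percent sample over the 1000 alternating documents, each value keeps roughly 50 sampled hits and is scaled back to roughly 500, which is why the SampleFacetCounts test only asserts ASSERT_GE(..., 250) rather than an exact count. The constant seed (137723) is the key design choice: identical requests sample the same documents, so estimated counts stay stable between requests instead of jumping around, at the cost of a consistent rather than averaged-out sampling error. At the HTTP layer, the patch wires both knobs through CollectionManager::do_search, so a request would pass them as query parameters, e.g. `facet_sample_percent=10&facet_sample_threshold=1000`.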