diff --git a/BUILD b/BUILD index a78c2f28..78cd3a3a 100644 --- a/BUILD +++ b/BUILD @@ -134,7 +134,7 @@ TEST_COPTS = [ "-Wno-unused-parameter", "-Werror=return-type", "-g", - "-DFORCE_INTERSECTION", + "-DTEST_BUILD" ] config_setting( diff --git a/include/collection.h b/include/collection.h index 0eb820bb..68e9956e 100644 --- a/include/collection.h +++ b/include/collection.h @@ -460,10 +460,8 @@ public: const text_match_type_t match_type = max_score, const size_t facet_sample_percent = 100, const size_t facet_sample_threshold = 0, - const size_t page_offset = UINT32_MAX -#ifdef FORCE_INTERSECTION - , bool force_intersection = false -#endif + const size_t page_offset = UINT32_MAX, + bool force_intersection = false ) const; Option get_filter_ids(const std::string & filter_query, filter_result_t& filter_result) const; diff --git a/include/index.h b/include/index.h index d347437d..d244967a 100644 --- a/include/index.h +++ b/include/index.h @@ -363,10 +363,8 @@ private: const std::vector& facet_infos, size_t group_limit, const std::vector& group_by_fields, const uint32_t* result_ids, size_t results_size, - int max_facet_count, bool is_wildcard_query, bool no_filters_provided -#ifdef FORCE_INTERSECTION - , bool force_intersection = false -#endif + int max_facet_count, bool is_wildcard_query, bool no_filters_provided, + bool force_intersection = false ) const; bool static_filter_query_eval(const override_t* override, std::vector& tokens, @@ -637,12 +635,8 @@ public: // Public operations - Option run_search(search_args* search_params, - const std::string& collection_name -#ifdef FORCE_INTERSECTION - , bool force_intersection -#endif - ); + Option run_search(search_args* search_params, const std::string& collection_name, + bool force_intersection); Option search(std::vector& field_query_tokens, const std::vector& the_fields, const text_match_type_t match_type, @@ -667,11 +661,7 @@ public: const size_t max_extra_suffix, const size_t facet_query_num_typos, const bool filter_curated_hits, enable_t split_join_tokens, const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold, - const std::string& collection_name -#ifdef FORCE_INTERSECTION - , bool force_intersection = false -#endif - ) const; + const std::string& collection_name, bool force_intersection = false) const; void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name); diff --git a/src/collection.cpp b/src/collection.cpp index ecbed5ec..2966dc25 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1070,11 +1070,8 @@ Option Collection::search(std::string raw_query, const text_match_type_t match_type, const size_t facet_sample_percent, const size_t facet_sample_threshold, - const size_t page_offset -#ifdef FORCE_INTERSECTION - , bool force_intersection -#endif - ) const { + const size_t page_offset, + bool force_intersection) const { std::shared_lock lock(mutex); @@ -1524,11 +1521,7 @@ Option Collection::search(std::string raw_query, std::unique_ptr search_params_guard(search_params); - auto search_op = index->run_search(search_params, name -#ifdef FORCE_INTERSECTION - , force_intersection -#endif - ); + auto search_op = index->run_search(search_params, name, force_intersection); if (!search_op.ok()) { return Option(search_op.code(), search_op.error()); diff --git a/src/index.cpp b/src/index.cpp index 4080e0de..23b3ecbb 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1194,11 +1194,8 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, const std::vector& facet_infos, const size_t group_limit, const std::vector& group_by_fields, const uint32_t* result_ids, size_t results_size, - int max_facet_count, bool is_wildcard_query, bool no_filters_provided -#ifdef FORCE_INTERSECTION - , bool force_intersection -#endif - ) const { + int max_facet_count, bool is_wildcard_query, bool no_filters_provided, + bool force_intersection) const { // assumed that facet fields have already been validated upstream for(size_t findex=0; findex < facets.size(); findex++) { auto& a_facet = facets[findex]; @@ -1224,20 +1221,21 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, } } -#ifdef FORCE_INTERSECTION bool use_hashes = false; if(!force_intersection) { use_hashes = true; } + +#ifndef TEST_BUILD + // non-test build should not accidentally set this flag + force_intersection = false; + use_hashes = false; #endif if(results_size && facet_records && ((facet_records <= 10 || is_wildcard_query) && !use_facet_query && group_limit == 0 && no_filters_provided) -#ifdef FORCE_INTERSECTION - && !use_hashes || force_intersection -#endif - ) { + && !use_hashes || force_intersection) { //LOG(INFO) << "Using intersection to find facets"; a_facet.is_intersected = true; @@ -2347,11 +2345,8 @@ Option Index::get_approximate_reference_filter_ids_with_lock(filter_node_t return rearrange_filter_tree(filter_tree_root, filter_ids_length); } -Option Index::run_search(search_args* search_params, const std::string& collection_name -#ifdef FORCE_INTERSECTION - , bool force_intersection -#endif - ) { +Option Index::run_search(search_args* search_params, const std::string& collection_name, + bool force_intersection) { return search(search_params->field_query_tokens, search_params->search_fields, search_params->match_type, @@ -2386,11 +2381,8 @@ Option Index::run_search(search_args* search_params, const std::string& co search_params->vector_query, search_params->facet_sample_percent, search_params->facet_sample_threshold, - collection_name -#ifdef FORCE_INTERSECTION - , force_intersection -#endif - ); + collection_name, + force_intersection); } void Index::collate_included_ids(const std::vector& q_included_tokens, @@ -2839,11 +2831,7 @@ Option Index::search(std::vector& field_query_tokens, cons const bool filter_curated_hits, const enable_t split_join_tokens, const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold, - const std::string& collection_name -#ifdef FORCE_INTERSECTION - , bool force_intersection -#endif - ) const { + const std::string& collection_name, bool force_intersection) const { std::shared_lock lock(mutex); uint32_t filter_ids_length = 0; @@ -3368,11 +3356,7 @@ Option Index::search(std::vector& field_query_tokens, cons batch_result_ids, batch_res_len, &facet_infos, max_facet_values, is_wildcard_query, no_filters_provided, estimate_facets, facet_sample_percent, &parent_search_begin, &parent_search_stop_ms, &parent_search_cutoff, - &num_processed, &m_process, &cv_process -#ifdef FORCE_INTERSECTION - , force_intersection -#endif - ]() { + &num_processed, &m_process, &cv_process, force_intersection]() { search_begin_us = parent_search_begin; search_stop_us = parent_search_stop_ms; search_cutoff = parent_search_cutoff; @@ -3382,11 +3366,8 @@ Option Index::search(std::vector& field_query_tokens, cons do_facets(facet_batches[thread_id], fq, estimate_facets, facet_sample_percent, facet_infos, group_limit, group_by_fields, batch_result_ids, batch_res_len, max_facet_values, - is_wildcard_query, no_filters_provided -#ifdef FORCE_INTERSECTION - , force_intersection -#endif - ); + is_wildcard_query, no_filters_provided, + force_intersection); std::unique_lock lock(m_process); num_processed++; parent_search_cutoff = parent_search_cutoff || search_cutoff; @@ -3471,11 +3452,8 @@ Option Index::search(std::vector& field_query_tokens, cons max_candidates, facet_infos); do_facets(facets, facet_query, estimate_facets, facet_sample_percent, facet_infos, group_limit, group_by_fields, &included_ids_vec[0], - included_ids_vec.size(), max_facet_values, is_wildcard_query, no_filters_provided -#ifdef FORCE_INTERSECTION - , force_intersection -#endif - ); + included_ids_vec.size(), max_facet_values, is_wildcard_query, no_filters_provided, + force_intersection); all_result_ids_len += curated_topster->size; diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index 8dd99404..2f107d6a 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -1596,1188 +1596,3 @@ TEST_F(CollectionFacetingTest, FloatFieldValueTruncation) { ASSERT_EQ("300", results["facet_counts"][0]["counts"][0]["value"].get()); } - - -class CollectionOptimizedFacetingTest : public ::testing::Test { -protected: - Store *store; - CollectionManager & collectionManager = CollectionManager::get_instance(); - std::atomic quit = false; - - std::vector query_fields; - std::vector sort_fields; - - void setupCollection() { - std::string state_dir_path = "/tmp/typesense_test/collection_optimized_faceting"; - LOG(INFO) << "Truncating and creating: " << state_dir_path; - system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); - - store = new Store(state_dir_path); - collectionManager.init(store, 1.0, "auth_key", quit); - collectionManager.load(8, 1000); - } - - virtual void SetUp() { - setupCollection(); - } - - virtual void TearDown() { - collectionManager.dispose(); - delete store; - } -}; - -TEST_F(CollectionOptimizedFacetingTest, FacetCounts) { - Collection *coll_array_fields; - - std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl"); - std::vector fields = {field("name", field_types::STRING, false), - field("name_facet", field_types::STRING, true), - field("age", field_types::INT32, true), - field("years", field_types::INT32_ARRAY, true), - field("rating", field_types::FLOAT, true), - field("timestamps", field_types::INT64_ARRAY, true), - field("tags", field_types::STRING_ARRAY, true), - field("optional_facet", field_types::INT64_ARRAY, true, true),}; - - std::vector sort_fields = { sort_by("age", "DESC") }; - - coll_array_fields = collectionManager.get_collection("coll_array_fields").get(); - if(coll_array_fields == nullptr) { - coll_array_fields = collectionManager.create_collection("coll_array_fields", 4, fields, "age").get(); - } - - std::string json_line; - - while (std::getline(infile, json_line)) { - nlohmann::json document = nlohmann::json::parse(json_line); - document["name_facet"] = document["name"]; - const std::string & patched_json_line = document.dump(); - coll_array_fields->add(patched_json_line); - } - - infile.close(); - - query_fields = {"name"}; - std::vector facets = {"tags"}; - - // single facet with no filters - nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, - {0}, 10, 1, FREQUENCY, {false}, 1UL, - spp::sparse_hash_set(), - spp::sparse_hash_set(), - 10UL, "", 30UL, 4UL, "", 1UL, "", "", {}, - 3UL, "", "", {}, 4294967295UL, true, - false, true, "", false, 6000000UL, 4UL, 7UL, fallback, - 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, - 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(5, results["hits"].size()); - - ASSERT_EQ(1, results["facet_counts"].size()); - ASSERT_EQ(4, results["facet_counts"][0].size()); - ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]); - ASSERT_EQ(false, results["facet_counts"][0]["sampled"].get()); - ASSERT_EQ(4, results["facet_counts"][0]["counts"].size()); - ASSERT_EQ(1, results["facet_counts"][0]["stats"].size()); - ASSERT_EQ(4, results["facet_counts"][0]["stats"]["total_values"].get()); - - ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); - ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][0]["count"]); - - ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); - ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]); - - ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][2]["value"].get().c_str()); - ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][2]["count"]); - - ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][3]["value"].get().c_str()); - ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][3]["count"]); - - // facet with facet count limit - results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, {0}, 10, 1, - FREQUENCY, {false}, 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 2, "", 30UL, 4UL, "", 1UL, - "", "", {}, 3UL, "", "", {}, 4294967295UL, true, - false, true, "", false, 6000000UL, 4UL, 7UL, fallback, 4UL, {off}, - 32767UL, 32767UL, 2UL, 2UL, false, "", true, 0UL, max_score, 100UL, - 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(5, results["hits"].size()); - - ASSERT_EQ(1, results["facet_counts"].size()); - ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get().c_str()); - ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); - - ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); - ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][0]["count"]); - - ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); - ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]); - - // 2 facets, 1 text query with no filters - facets.clear(); - facets.push_back("tags"); - facets.push_back("name_facet"); - results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, - {0}, 10, 1, FREQUENCY, {false}, 1UL, - spp::sparse_hash_set(), - spp::sparse_hash_set(), - 10UL, "", 30UL, 4UL, "", 1UL, "", "", {}, - 3UL, "", "", {}, 4294967295UL, true, - false, true, "", false, 6000000UL, 4UL, 7UL, fallback, - 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, - 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(5, results["hits"].size()); - ASSERT_EQ(2, results["facet_counts"].size()); - - ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get().c_str()); - ASSERT_STREQ("name_facet", results["facet_counts"][1]["field_name"].get().c_str()); - - // facet value must one that's stored, not indexed (i.e. no tokenization/standardization) - ASSERT_STREQ("Jeremy Howard", results["facet_counts"][1]["counts"][0]["value"].get().c_str()); - ASSERT_EQ(5, (int) results["facet_counts"][1]["counts"][0]["count"]); - - // facet with wildcard - results = coll_array_fields->search("Jeremy", query_fields, "", {"ag*"}, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, 1UL, spp::sparse_hash_set(), - spp::sparse_hash_set(), - 10UL, "", 30UL, 4UL, "", 1UL, "", "", {}, - 3UL, "", "", {}, 4294967295UL, true, - false, true, "", false, 6000000UL, 4UL, 7UL, fallback, - 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, - 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(5, results["hits"].size()); - ASSERT_EQ(1, results["facet_counts"].size()); - ASSERT_STREQ("age", results["facet_counts"][0]["field_name"].get().c_str()); - - // facet on a float field without query to check on stats - results = coll_array_fields->search("*", query_fields, "", {"rating"}, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(5, results["facet_counts"][0]["stats"].size()); - ASSERT_FLOAT_EQ(4.880199885368347, results["facet_counts"][0]["stats"]["avg"].get()); - ASSERT_FLOAT_EQ(0.0, results["facet_counts"][0]["stats"]["min"].get()); - ASSERT_FLOAT_EQ(9.99899959564209, results["facet_counts"][0]["stats"]["max"].get()); - ASSERT_FLOAT_EQ(24.400999426841736, results["facet_counts"][0]["stats"]["sum"].get()); - ASSERT_FLOAT_EQ(5, results["facet_counts"][0]["stats"]["total_values"].get()); - - // check for "0" case - ASSERT_STREQ("0", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); - ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get()); - - facets.clear(); - facets.push_back("tags"); - - // empty facet query value should return all facets without any filtering of facets - results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "tags: ", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(5, results["hits"].size()); - - results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "tags:", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(5, results["hits"].size()); - - // Wildcard facet_by can have partial matches - results = coll_array_fields->search("*", query_fields, "", {"nam*"}, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(5, results["hits"].size()); - ASSERT_EQ(1, results["facet_counts"].size()); - ASSERT_EQ("name_facet", results["facet_counts"][0]["field_name"].get()); - - // Wildcard facet_by having no counts should not be returned - results = coll_array_fields->search("*", query_fields, "", {"optio*"}, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(5, results["hits"].size()); - ASSERT_EQ(0, results["facet_counts"].size()); - - results = coll_array_fields->search("*", query_fields, "", {"optional_facet"}, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(5, results["hits"].size()); - ASSERT_EQ(1, results["facet_counts"].size()); - ASSERT_EQ("optional_facet", results["facet_counts"][0]["field_name"].get()); - - // bad facet query syntax - auto res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "foobar", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_FALSE(res_op.ok()); - ASSERT_STREQ("Facet query must be in the `facet_field: value` format.", res_op.error().c_str()); - - // unknown facet field - res_op = coll_array_fields->search("*", query_fields, "", {"foobar"}, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "foobar: baz", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_FALSE(res_op.ok()); - ASSERT_STREQ("Could not find a facet field named `foobar` in the schema.", res_op.error().c_str()); - - // only prefix matching is valid - res_op = coll_array_fields->search("*", query_fields, "", {"*_facet"}, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_FALSE(res_op.ok()); - ASSERT_STREQ("Only prefix matching with a wildcard is allowed.", res_op.error().c_str()); - - // unknown wildcard facet field - res_op = coll_array_fields->search("*", query_fields, "", {"foo*"}, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_FALSE(res_op.ok()); - ASSERT_STREQ("Could not find a facet field for `foo*` in the schema.", res_op.error().c_str()); - - // when facet query is given but no facet fields are specified, must return an error message - res_op = coll_array_fields->search("*", query_fields, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "tags: foo", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_FALSE(res_op.ok()); - ASSERT_STREQ("The `facet_query` parameter is supplied without a `facet_by` parameter.", res_op.error().c_str()); - - res_op = coll_array_fields->search("*", query_fields, "", {""}, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "tags: foo", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_FALSE(res_op.ok()); - ASSERT_STREQ("Could not find a facet field named `` in the schema.", res_op.error().c_str()); - - // given facet query field must be part of facet fields requested - res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, - {false}, Index::DROP_TOKENS_THRESHOLD, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "name_facet: jeremy", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_FALSE(res_op.ok()); - ASSERT_STREQ("Facet query refers to a facet field `name_facet` that is not part of `facet_by` parameter.", res_op.error().c_str()); - - collectionManager.drop_collection("coll_array_fields"); -} - -TEST_F(CollectionOptimizedFacetingTest, FacetCountsBool) { - Collection *coll1; - - std::vector fields = {field("title", field_types::STRING, false), - field("points", field_types::INT32, false), - field("in_stock", field_types::BOOL, true)}; - - std::vector sort_fields = {sort_by("points", "DESC")}; - - coll1 = collectionManager.get_collection("coll1").get(); - if (coll1 == nullptr) { - coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); - } - - nlohmann::json doc; - doc["id"] = "100"; - doc["title"] = "Ford Mustang"; - doc["points"] = 25; - doc["in_stock"] = true; - - coll1->add(doc.dump()); - - doc["id"] = "101"; - doc["title"] = "Tesla Model S"; - doc["points"] = 40; - doc["in_stock"] = false; - - coll1->add(doc.dump()); - - doc["id"] = "102"; - doc["title"] = "Ford Mustang GT"; - doc["points"] = 10; - doc["in_stock"] = true; - - coll1->add(doc.dump()); - - std::vector facets = {"in_stock"}; - - nlohmann::json results = coll1->search("Ford", {"title"}, "", facets, sort_fields, {0}, 10, 1, - token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10,"", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(1, results["facet_counts"].size()); - ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); - ASSERT_EQ(1, results["facet_counts"][0]["stats"].size()); - ASSERT_FLOAT_EQ(1, results["facet_counts"][0]["stats"]["total_values"].get()); - - ASSERT_STREQ("in_stock", results["facet_counts"][0]["field_name"].get().c_str()); - ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); - ASSERT_STREQ("true", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); - - collectionManager.drop_collection("coll1"); -} - -TEST_F(CollectionOptimizedFacetingTest, FacetCountsFloatPrecision) { - Collection *coll1; - - std::vector fields = {field("title", field_types::STRING, false), - field("points", field_types::FLOAT, true)}; - - std::vector sort_fields = {sort_by("points", "DESC")}; - - coll1 = collectionManager.get_collection("coll1").get(); - if (coll1 == nullptr) { - coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); - } - - nlohmann::json doc; - doc["id"] = "100"; - doc["title"] = "Ford Mustang"; - doc["points"] = 113.4; - - coll1->add(doc.dump()); - - std::vector facets = {"points"}; - - nlohmann::json results = coll1->search("*", {"title"}, "", facets, sort_fields, {0}, 10, 1, - token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10,"", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(1, results["facet_counts"].size()); - ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); - - ASSERT_STREQ("points", results["facet_counts"][0]["field_name"].get().c_str()); - ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); - ASSERT_STREQ("113.4", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); - ASSERT_STREQ("113.4",results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); - - collectionManager.drop_collection("coll1"); -} - -TEST_F(CollectionOptimizedFacetingTest, FacetStatOnFloatFields) { - Collection *coll_float_fields; - - std::ifstream infile(std::string(ROOT_DIR)+"test/float_documents.jsonl"); - std::vector fields = { - field("title", field_types::STRING, false), - field("score", field_types::FLOAT, false), - field("average", field_types::FLOAT, true) - }; - - std::vector sort_fields_desc = { sort_by("average", "DESC") }; - - coll_float_fields = collectionManager.get_collection("coll_float_fields").get(); - if(coll_float_fields == nullptr) { - coll_float_fields = collectionManager.create_collection("coll_float_fields", 4, fields, "average").get(); - } - - std::string json_line; - - while (std::getline(infile, json_line)) { - coll_float_fields->add(json_line); - } - - infile.close(); - - query_fields = {"title"}; - auto res_op = coll_float_fields->search("Jeremy", query_fields, "", {"average"}, sort_fields_desc, {0}, 10, - 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - auto results = res_op.get(); - - ASSERT_EQ(7, results["hits"].size()); - - ASSERT_EQ(5, results["facet_counts"][0]["stats"].size()); - ASSERT_FLOAT_EQ(-21.3799991607666, results["facet_counts"][0]["stats"]["min"].get()); - ASSERT_FLOAT_EQ(300, results["facet_counts"][0]["stats"]["max"].get()); - ASSERT_FLOAT_EQ(277.8160007725237, results["facet_counts"][0]["stats"]["sum"].get()); - ASSERT_FLOAT_EQ(39.68800011036053, results["facet_counts"][0]["stats"]["avg"].get()); - ASSERT_FLOAT_EQ(7, results["facet_counts"][0]["stats"]["total_values"].get()); -} - -TEST_F(CollectionOptimizedFacetingTest, FacetCountOnSimilarStrings) { - Collection *coll1; - - std::vector fields = {field("categories", field_types::STRING_ARRAY, true), - field("points", field_types::INT32, true)}; - - std::vector sort_fields = {sort_by("points", "DESC")}; - - coll1 = collectionManager.get_collection("coll1").get(); - if (coll1 == nullptr) { - coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); - } - - nlohmann::json doc; - doc["id"] = "100"; - doc["categories"] = {"England in India"}; - doc["points"] = 25; - - coll1->add(doc.dump()); - - doc["id"] = "101"; - doc["categories"] = {"India in England"}; - doc["points"] = 50; - - coll1->add(doc.dump()); - - std::vector facets = {"categories"}; - - nlohmann::json results = coll1->search("*", {"categories"}, "", facets, sort_fields, {0}, 10, 1, - token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30UL, 4UL, - "", 1UL, "", "", {}, 3UL, "", "", {}, - 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(2, results["hits"].size()); - ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); - - ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); - ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); - - collectionManager.drop_collection("coll1"); -} - -TEST_F(CollectionOptimizedFacetingTest, FacetByNestedIntField) { - nlohmann::json schema = R"({ - "name": "coll1", - "enable_nested_fields": true, - "fields": [ - {"name": "details", "type": "object", "optional": false }, - {"name": "company.num_employees", "type": "int32", "optional": false, "facet": true }, - {"name": "companyRank", "type": "int32", "optional": false, "facet": true } - ] - })"_json; - - auto op = collectionManager.create_collection(schema); - ASSERT_TRUE(op.ok()); - Collection* coll1 = op.get(); - - auto doc1 = R"({ - "details": {"count": 1000}, - "company": {"num_employees": 2000}, - "companyRank": 100 - })"_json; - - auto doc2 = R"({ - "details": {"count": 2000}, - "company": {"num_employees": 2000}, - "companyRank": 101 - })"_json; - - ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok()); - ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok()); - - std::vector sort_fields = { sort_by("details.count", "ASC") }; - - auto results = coll1->search("*", {}, "", {"company.num_employees"}, sort_fields, {0}, 10, 1, - token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 1UL, "", "", {}, 3UL, - "", "", {}, 4294967295UL, true, false, true, "", false, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(2, results["found"].get()); - ASSERT_EQ(1, results["facet_counts"].size()); - ASSERT_EQ("company.num_employees", results["facet_counts"][0]["field_name"]); - ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); - ASSERT_EQ(2, results["facet_counts"][0]["counts"][0]["count"].get()); - ASSERT_EQ("2000", results["facet_counts"][0]["counts"][0]["value"].get()); - - // Nested wildcard faceting - std::vector wildcard_facets; - coll1->parse_facet("company.*", wildcard_facets); - - ASSERT_EQ(1, wildcard_facets.size()); - ASSERT_EQ("company.num_employees", wildcard_facets[0].field_name); - - wildcard_facets.clear(); - coll1->parse_facet("company*", wildcard_facets); - - ASSERT_EQ(2, wildcard_facets.size()); - ASSERT_EQ("company.num_employees", wildcard_facets[0].field_name); - ASSERT_EQ("companyRank", wildcard_facets[1].field_name); -} - -TEST_F(CollectionOptimizedFacetingTest, FacetParseTest){ - std::vector fields = { - field("score", field_types::INT32, true), - field("grade", field_types::INT32, true), - field("rank", field_types::INT32, true), - field("range", field_types::INT32, true), - field("scale", field_types::INT32, false), - }; - - Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); - - std::vector range_facet_fields { - "score(fail:[0, 40], pass:[40, 100])", - "grade(A:[80, 100], B:[60, 80], C:[40, 60])" - }; - std::vector range_facets; - for(const std::string & facet_field: range_facet_fields) { - coll1->parse_facet(facet_field, range_facets); - } - ASSERT_EQ(2, range_facets.size()); - - ASSERT_STREQ("score", range_facets[0].field_name.c_str()); - ASSERT_TRUE(range_facets[0].is_range_query); - ASSERT_GT(range_facets[0].facet_range_map.size(), 0); - - ASSERT_STREQ("grade", range_facets[1].field_name.c_str()); - ASSERT_TRUE(range_facets[1].is_range_query); - ASSERT_GT(range_facets[1].facet_range_map.size(), 0); - - std::vector normal_facet_fields { - "score", - "grade" - }; - std::vector normal_facets; - for(const std::string & facet_field: normal_facet_fields) { - coll1->parse_facet(facet_field, normal_facets); - } - ASSERT_EQ(2, normal_facets.size()); - - ASSERT_STREQ("score", normal_facets[0].field_name.c_str()); - ASSERT_STREQ("grade", normal_facets[1].field_name.c_str()); - - std::vector wildcard_facet_fields { - "ran*", - "sc*", - }; - std::vector wildcard_facets; - for(const std::string & facet_field: wildcard_facet_fields) { - coll1->parse_facet(facet_field, wildcard_facets); - } - - ASSERT_EQ(3, wildcard_facets.size()); - - std::set expected{"range", "rank", "score"}; - for (size_t i = 0; i < wildcard_facets.size(); i++) { - ASSERT_TRUE(expected.count(wildcard_facets[i].field_name) == 1); - } - - wildcard_facets.clear(); - coll1->parse_facet("*", wildcard_facets); - - // Last field is not a facet. - ASSERT_EQ(fields.size() - 1, wildcard_facets.size()); - - expected.clear(); - for (size_t i = 0; i < fields.size() - 1; i++) { - expected.insert(fields[i].name); - } - - for (size_t i = 0; i < wildcard_facets.size(); i++) { - ASSERT_TRUE(expected.count(wildcard_facets[i].field_name) == 1); - } - - std::vector mixed_facet_fields { - "score", - "grade(A:[80, 100], B:[60, 80], C:[40, 60])", - "ra*", - }; - - std::vector mixed_facets; - for(const std::string & facet_field: mixed_facet_fields) { - coll1->parse_facet(facet_field, mixed_facets); - } - ASSERT_EQ(4, mixed_facets.size()); - - std::vector mixed_facets_ptr; - for(auto& f: mixed_facets) { - mixed_facets_ptr.push_back(&f); - } - - std::sort(mixed_facets_ptr.begin(), mixed_facets_ptr.end(), [](const facet* f1, const facet* f2) { - return f1->field_name < f2->field_name; - }); - - ASSERT_EQ("score", mixed_facets_ptr[3]->field_name); - - ASSERT_EQ("grade", mixed_facets_ptr[0]->field_name); - ASSERT_TRUE(mixed_facets_ptr[0]->is_range_query); - ASSERT_GT(mixed_facets_ptr[0]->facet_range_map.size(), 0); - - ASSERT_EQ("rank", mixed_facets_ptr[2]->field_name); - ASSERT_EQ("range", mixed_facets_ptr[1]->field_name); -} - -TEST_F(CollectionOptimizedFacetingTest, RangeFacetTest) { - std::vector fields = {field("place", field_types::STRING, false), - field("state", field_types::STRING, false), - field("visitors", field_types::INT32, true), - field("trackingFrom", field_types::INT32, true),}; - Collection* coll1 = collectionManager.create_collection( - "coll1", 1, fields, "", 0, "", {}, {} - ).get(); - nlohmann::json doc1; - doc1["id"] = "0"; - doc1["place"] = "Mysore Palace"; - doc1["state"] = "Karnataka"; - doc1["visitors"] = 235486; - doc1["trackingFrom"] = 1900; - - nlohmann::json doc2; - doc2["id"] = "1"; - doc2["place"] = "Hampi"; - doc2["state"] = "Karnataka"; - doc2["visitors"] = 187654; - doc2["trackingFrom"] = 1900; - - nlohmann::json doc3; - doc3["id"] = "2"; - doc3["place"] = "Mahabalipuram"; - doc3["state"] = "TamilNadu"; - doc3["visitors"] = 174684; - doc3["trackingFrom"] = 1900; - - nlohmann::json doc4; - doc4["id"] = "3"; - doc4["place"] = "Meenakshi Amman Temple"; - doc4["state"] = "TamilNadu"; - doc4["visitors"] = 246676; - doc4["trackingFrom"] = 2000; - - nlohmann::json doc5; - doc5["id"] = "4"; - doc5["place"] = "Staue of Unity"; - doc5["state"] = "Gujarat"; - doc5["visitors"] = 345878; - doc5["trackingFrom"] = 2000; - - ASSERT_TRUE(coll1->add(doc1.dump()).ok()); - ASSERT_TRUE(coll1->add(doc2.dump()).ok()); - ASSERT_TRUE(coll1->add(doc3.dump()).ok()); - ASSERT_TRUE(coll1->add(doc4.dump()).ok()); - ASSERT_TRUE(coll1->add(doc5.dump()).ok()); - - auto results = coll1->search("Karnataka", {"state"}, - "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, - true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); - ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); - ASSERT_EQ("Busy", results["facet_counts"][0]["counts"][0]["value"].get()); - ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]); - ASSERT_EQ("VeryBusy", results["facet_counts"][0]["counts"][1]["value"].get()); - - auto results2 = coll1->search("Gujarat", {"state"}, - "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, - true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(1, results2["facet_counts"][0]["counts"].size()); - ASSERT_EQ(1, results2["facet_counts"][0]["counts"][0]["count"].get()); - ASSERT_STREQ("VeryBusy", results2["facet_counts"][0]["counts"][0]["value"].get().c_str()); - ASSERT_TRUE(results2["facet_counts"][0]["counts"][1]["value"] == nullptr); - - // ensure that unknown facet field are handled - - auto results3 = coll1->search("Gujarat", {"state"}, - "", {"visitorsz(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, true, false, true, "", true, 6000000UL, 4UL, - 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, 0UL, - max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_FALSE(results3.ok()); - ASSERT_EQ("Could not find a facet field named `visitorsz` in the schema.", results3.error()); - - auto results4 = coll1->search("*", {"state"}, - "", {"trackingFrom(Old:[0, 1910], New:[1910, 2100])"}, - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); - - ASSERT_EQ(2, results4["facet_counts"][0]["counts"].size()); - ASSERT_EQ(3, results4["facet_counts"][0]["counts"][0]["count"].get()); - ASSERT_EQ("Old", results4["facet_counts"][0]["counts"][0]["value"].get()); - - ASSERT_EQ(2, results4["facet_counts"][0]["counts"][1]["count"].get()); - ASSERT_EQ("New", results4["facet_counts"][0]["counts"][1]["value"].get()); - - // ensure that only integer fields are allowed - auto rop = coll1->search("Karnataka", {"state"}, - "", {"state(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, - true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_FALSE(rop.ok()); - ASSERT_EQ("Range facet is restricted to only int32 and int64 fields.", rop.error()); - - // ensure that bad facet range values are handled - rop = coll1->search("Karnataka", {"state"}, - "", {"visitors(Busy:[alpha, 200000], VeryBusy:[200000, beta])"}, - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, - true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_FALSE(rop.ok()); - ASSERT_EQ("Facet range value is not valid.", rop.error()); - - collectionManager.drop_collection("coll1"); -} - -TEST_F(CollectionOptimizedFacetingTest, RangeFacetContinuity) { - std::vector fields = {field("place", field_types::STRING, false), - field("state", field_types::STRING, false), - field("visitors", field_types::INT32, true),}; - Collection* coll1 = collectionManager.create_collection( - "coll1", 1, fields, "", 0, "", {}, {} - ).get(); - nlohmann::json doc1; - doc1["id"] = "0"; - doc1["place"] = "Mysore Palace"; - doc1["state"] = "Karnataka"; - doc1["visitors"] = 235486; - - nlohmann::json doc2; - doc2["id"] = "1"; - doc2["place"] = "Hampi"; - doc2["state"] = "Karnataka"; - doc2["visitors"] = 187654; - - nlohmann::json doc3; - doc3["id"] = "2"; - doc3["place"] = "Mahabalipuram"; - doc3["state"] = "TamilNadu"; - doc3["visitors"] = 174684; - - nlohmann::json doc4; - doc4["id"] = "3"; - doc4["place"] = "Meenakshi Amman Temple"; - doc4["state"] = "TamilNadu"; - doc4["visitors"] = 246676; - - nlohmann::json doc5; - doc5["id"] = "4"; - doc5["place"] = "Staue of Unity"; - doc5["state"] = "Gujarat"; - doc5["visitors"] = 345878; - - - ASSERT_TRUE(coll1->add(doc1.dump()).ok()); - ASSERT_TRUE(coll1->add(doc2.dump()).ok()); - ASSERT_TRUE(coll1->add(doc3.dump()).ok()); - ASSERT_TRUE(coll1->add(doc4.dump()).ok()); - ASSERT_TRUE(coll1->add(doc5.dump()).ok()); - - auto results = coll1->search("TamilNadu", {"state"}, - "", {"visitors(Busy:[0, 200000], VeryBusy:[200001, 500000])"}, - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, - true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_STREQ("Ranges in range facet syntax should be continous.", results.error().c_str()); - - auto results2 = coll1->search("TamilNadu", {"state"}, - "", {"visitors(Busy:[0, 200000], VeryBusy:[199999, 500000])"}, - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, - true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_STREQ("Ranges in range facet syntax should be continous.", results2.error().c_str()); - - collectionManager.drop_collection("coll1"); -} - -TEST_F(CollectionOptimizedFacetingTest, RangeFacetTypo) { - std::vector fields = {field("place", field_types::STRING, false), - field("state", field_types::STRING, false), - field("visitors", field_types::INT32, true),}; - Collection* coll1 = collectionManager.create_collection( - "coll1", 1, fields, "", 0, "", {}, {} - ).get(); - nlohmann::json doc1; - doc1["id"] = "0"; - doc1["place"] = "Mysore Palace"; - doc1["state"] = "Karnataka"; - doc1["visitors"] = 235486; - - nlohmann::json doc2; - doc2["id"] = "1"; - doc2["place"] = "Hampi"; - doc2["state"] = "Karnataka"; - doc2["visitors"] = 187654; - - nlohmann::json doc3; - doc3["id"] = "2"; - doc3["place"] = "Mahabalipuram"; - doc3["state"] = "TamilNadu"; - doc3["visitors"] = 174684; - - nlohmann::json doc4; - doc4["id"] = "3"; - doc4["place"] = "Meenakshi Amman Temple"; - doc4["state"] = "TamilNadu"; - doc4["visitors"] = 246676; - - nlohmann::json doc5; - doc5["id"] = "4"; - doc5["place"] = "Staue of Unity"; - doc5["state"] = "Gujarat"; - doc5["visitors"] = 345878; - - - ASSERT_TRUE(coll1->add(doc1.dump()).ok()); - ASSERT_TRUE(coll1->add(doc2.dump()).ok()); - ASSERT_TRUE(coll1->add(doc3.dump()).ok()); - ASSERT_TRUE(coll1->add(doc4.dump()).ok()); - ASSERT_TRUE(coll1->add(doc5.dump()).ok()); - - auto results = coll1->search("TamilNadu", {"state"}, - "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000)"}, //missing ']' at end - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, - true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_STREQ("Error splitting the facet range values.", results.error().c_str()); - - auto results2 = coll1->search("TamilNadu", {"state"}, - "", {"visitors(Busy:[0, 200000], VeryBusy:200000, 500000])"}, //missing '[' in second range - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, - true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_STREQ("Error splitting the facet range values.", results2.error().c_str()); - - auto results3 = coll1->search("TamilNadu", {"state"}, - "", {"visitors(Busy:[0, 200000] VeryBusy:[200000, 500000])"}, //missing ',' between ranges - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, - true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_STREQ("Error splitting the facet range values.", results3.error().c_str()); - - auto results4 = coll1->search("TamilNadu", {"state"}, - "", {"visitors(Busy:[0 200000], VeryBusy:[200000, 500000])"}, //missing ',' between first ranges values - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, - true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_STREQ("Facet range value is not valid.", results4.error().c_str()); - - auto results5 = coll1->search("TamilNadu", {"state"}, - "", {"visitors(Busy:[0, 200000 VeryBusy:200000, 500000])"}, //missing '],' and '[' - {}, {2}, 10, - 1, FREQUENCY, {true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, - "", "", {}, 1000, - true, false, true, "", true, 6000000UL, - 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, - "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); - - ASSERT_STREQ("Facet range value is not valid.", results5.error().c_str()); - - collectionManager.drop_collection("coll1"); -} - -TEST_F(CollectionOptimizedFacetingTest, SampleFacetCounts) { - nlohmann::json schema = R"({ - "name": "coll1", - "fields": [ - {"name": "color", "type": "string", "facet": true} - ] - })"_json; - - Collection* coll1 = collectionManager.create_collection(schema).get(); - - std::mt19937 gen(137723); // use constant seed to make sure that counts don't jump around - std::uniform_int_distribution<> distr(1, 100); // 1 to 100 inclusive - - size_t count_blue = 0, count_red = 0; - - for(size_t i = 0; i < 1000; i++) { - nlohmann::json doc; - if(distr(gen) % 2 == 0) { - doc["color"] = "blue"; - count_blue++; - } else { - doc["color"] = "red"; - count_red++; - } - - ASSERT_TRUE(coll1->add(doc.dump()).ok()); - } - - auto res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, - "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, - 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 10, 0, 4294967295UL, true).get(); - - ASSERT_EQ(1000, res["found"].get()); - ASSERT_EQ(1, res["facet_counts"].size()); - ASSERT_EQ(2, res["facet_counts"][0]["counts"].size()); - - // verify approximate counts - ASSERT_GE(res["facet_counts"][0]["counts"][0]["count"].get(), 250); - ASSERT_GE(res["facet_counts"][0]["counts"][1]["count"].get(), 250); - ASSERT_TRUE(res["facet_counts"][0]["sampled"].get()); - - // when sample threshold is high, don't estimate - res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, - "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, - 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 10, 10000, 4294967295UL, true).get(); - - ASSERT_EQ(1000, res["found"].get()); - ASSERT_EQ(1, res["facet_counts"].size()); - ASSERT_EQ(2, res["facet_counts"][0]["counts"].size()); - - for(size_t i = 0; i < res["facet_counts"][0]["counts"].size(); i++) { - if(res["facet_counts"][0]["counts"][i]["value"].get() == "red") { - ASSERT_EQ(count_red, res["facet_counts"][0]["counts"][i]["count"].get()); - } else { - ASSERT_EQ(count_blue, res["facet_counts"][0]["counts"][i]["count"].get()); - } - } - - ASSERT_FALSE(res["facet_counts"][0]["sampled"].get()); - - // test for sample percent > 100 - - auto res_op = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5, - spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, - "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, - 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 200, 0, 4294967295UL, true); - - ASSERT_FALSE(res_op.ok()); - ASSERT_EQ("Value of `facet_sample_percent` must be less than 100.", res_op.error()); -} - -TEST_F(CollectionOptimizedFacetingTest, FacetOnArrayFieldWithSpecialChars) { - std::vector fields = { - field("tags", field_types::STRING_ARRAY, true), - field("points", field_types::INT32, true), - }; - - Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); - - nlohmann::json doc; - doc["tags"] = {"gamma"}; - doc["points"] = 10; - ASSERT_TRUE(coll1->add(doc.dump()).ok()); - - doc["tags"] = {"alpha", "| . |", "beta", "gamma"}; - doc["points"] = 10; - ASSERT_TRUE(coll1->add(doc.dump()).ok()); - - auto results = coll1->search("*", {}, - "", {"tags"}, {}, {2}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, - "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, - 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 100, 0, 4294967295UL, true).get(); - - ASSERT_EQ(1, results["facet_counts"].size()); - ASSERT_EQ(4, results["facet_counts"][0]["counts"].size()); - - for(size_t i = 0; i < results["facet_counts"][0]["counts"].size(); i++) { - auto fvalue = results["facet_counts"][0]["counts"][i]["value"].get(); - if(fvalue == "gamma") { - ASSERT_EQ(2, results["facet_counts"][0]["counts"][i]["count"].get()); - } else { - ASSERT_EQ(1, results["facet_counts"][0]["counts"][i]["count"].get()); - } - } -} - -TEST_F(CollectionOptimizedFacetingTest, StringLengthTest) { - std::vector fields = { - field("tags", field_types::STRING_ARRAY, true), - field("points", field_types::INT32, true), - }; - - Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); - - nlohmann::json doc; - doc["tags"] = {"gamma"}; - doc["points"] = 10; - ASSERT_TRUE(coll1->add(doc.dump()).ok()); - - doc["tags"] = {"beta"}; - doc["points"] = 10; - ASSERT_TRUE(coll1->add(doc.dump()).ok()); - - doc["tags"] = {"alpha"}; - doc["points"] = 10; - ASSERT_TRUE(coll1->add(doc.dump()).ok()); - - std::string longStr = ""; - for(auto i = 0; i < 8; ++i) { - longStr+="alphabetagamma"; - } - - ASSERT_TRUE(112 == longStr.size()); - - std::vector vec; - vec.emplace_back(longStr); - doc["tags"] = vec; - doc["points"] = 10; - ASSERT_TRUE(coll1->add(doc.dump()).ok()); - - auto results = coll1->search("*", {}, - "", {"tags"}, {}, {2}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, - "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, - 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 100, 0, 4294967295UL, true).get(); - - - ASSERT_EQ(1, results["facet_counts"].size()); - ASSERT_EQ(4, results["facet_counts"][0]["counts"].size()); - - longStr = results["facet_counts"][0]["counts"][3]["value"]; - - //string facet length is restricted to 100 - ASSERT_TRUE(100 == longStr.size()); -} \ No newline at end of file diff --git a/test/collection_optimized_faceting_test.cpp b/test/collection_optimized_faceting_test.cpp new file mode 100644 index 00000000..6e14774b --- /dev/null +++ b/test/collection_optimized_faceting_test.cpp @@ -0,0 +1,1192 @@ +#include +#include +#include +#include +#include +#include +#include "collection.h" + + +class CollectionOptimizedFacetingTest : public ::testing::Test { +protected: + Store *store; + CollectionManager & collectionManager = CollectionManager::get_instance(); + std::atomic quit = false; + + std::vector query_fields; + std::vector sort_fields; + + void setupCollection() { + std::string state_dir_path = "/tmp/typesense_test/collection_optimized_faceting"; + LOG(INFO) << "Truncating and creating: " << state_dir_path; + system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); + + store = new Store(state_dir_path); + collectionManager.init(store, 1.0, "auth_key", quit); + collectionManager.load(8, 1000); + } + + virtual void SetUp() { + setupCollection(); + } + + virtual void TearDown() { + collectionManager.dispose(); + delete store; + } +}; + +TEST_F(CollectionOptimizedFacetingTest, FacetCounts) { + Collection *coll_array_fields; + + std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl"); + std::vector fields = {field("name", field_types::STRING, false), + field("name_facet", field_types::STRING, true), + field("age", field_types::INT32, true), + field("years", field_types::INT32_ARRAY, true), + field("rating", field_types::FLOAT, true), + field("timestamps", field_types::INT64_ARRAY, true), + field("tags", field_types::STRING_ARRAY, true), + field("optional_facet", field_types::INT64_ARRAY, true, true),}; + + std::vector sort_fields = { sort_by("age", "DESC") }; + + coll_array_fields = collectionManager.get_collection("coll_array_fields").get(); + if(coll_array_fields == nullptr) { + coll_array_fields = collectionManager.create_collection("coll_array_fields", 4, fields, "age").get(); + } + + std::string json_line; + + while (std::getline(infile, json_line)) { + nlohmann::json document = nlohmann::json::parse(json_line); + document["name_facet"] = document["name"]; + const std::string & patched_json_line = document.dump(); + coll_array_fields->add(patched_json_line); + } + + infile.close(); + + query_fields = {"name"}; + std::vector facets = {"tags"}; + + // single facet with no filters + nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, + {0}, 10, 1, FREQUENCY, {false}, 1UL, + spp::sparse_hash_set(), + spp::sparse_hash_set(), + 10UL, "", 30UL, 4UL, "", 1UL, "", "", {}, + 3UL, "", "", {}, 4294967295UL, true, + false, true, "", false, 6000000UL, 4UL, 7UL, fallback, + 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, + 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(4, results["facet_counts"][0].size()); + ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]); + ASSERT_EQ(false, results["facet_counts"][0]["sampled"].get()); + ASSERT_EQ(4, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["stats"].size()); + ASSERT_EQ(4, results["facet_counts"][0]["stats"]["total_values"].get()); + + ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][0]["count"]); + + ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]); + + ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][2]["value"].get().c_str()); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][2]["count"]); + + ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][3]["value"].get().c_str()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][3]["count"]); + + // facet with facet count limit + results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, {0}, 10, 1, + FREQUENCY, {false}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 2, "", 30UL, 4UL, "", 1UL, + "", "", {}, 3UL, "", "", {}, 4294967295UL, true, + false, true, "", false, 6000000UL, 4UL, 7UL, fallback, 4UL, {off}, + 32767UL, 32767UL, 2UL, 2UL, false, "", true, 0UL, max_score, 100UL, + 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); + + ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][0]["count"]); + + ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]); + + // 2 facets, 1 text query with no filters + facets.clear(); + facets.push_back("tags"); + facets.push_back("name_facet"); + results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, + {0}, 10, 1, FREQUENCY, {false}, 1UL, + spp::sparse_hash_set(), + spp::sparse_hash_set(), + 10UL, "", 30UL, 4UL, "", 1UL, "", "", {}, + 3UL, "", "", {}, 4294967295UL, true, + false, true, "", false, 6000000UL, 4UL, 7UL, fallback, + 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, + 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(2, results["facet_counts"].size()); + + ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_STREQ("name_facet", results["facet_counts"][1]["field_name"].get().c_str()); + + // facet value must one that's stored, not indexed (i.e. no tokenization/standardization) + ASSERT_STREQ("Jeremy Howard", results["facet_counts"][1]["counts"][0]["value"].get().c_str()); + ASSERT_EQ(5, (int) results["facet_counts"][1]["counts"][0]["count"]); + + // facet with wildcard + results = coll_array_fields->search("Jeremy", query_fields, "", {"ag*"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, 1UL, spp::sparse_hash_set(), + spp::sparse_hash_set(), + 10UL, "", 30UL, 4UL, "", 1UL, "", "", {}, + 3UL, "", "", {}, 4294967295UL, true, + false, true, "", false, 6000000UL, 4UL, 7UL, fallback, + 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, + 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_STREQ("age", results["facet_counts"][0]["field_name"].get().c_str()); + + // facet on a float field without query to check on stats + results = coll_array_fields->search("*", query_fields, "", {"rating"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["facet_counts"][0]["stats"].size()); + ASSERT_FLOAT_EQ(4.880199885368347, results["facet_counts"][0]["stats"]["avg"].get()); + ASSERT_FLOAT_EQ(0.0, results["facet_counts"][0]["stats"]["min"].get()); + ASSERT_FLOAT_EQ(9.99899959564209, results["facet_counts"][0]["stats"]["max"].get()); + ASSERT_FLOAT_EQ(24.400999426841736, results["facet_counts"][0]["stats"]["sum"].get()); + ASSERT_FLOAT_EQ(5, results["facet_counts"][0]["stats"]["total_values"].get()); + + // check for "0" case + ASSERT_STREQ("0", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get()); + + facets.clear(); + facets.push_back("tags"); + + // empty facet query value should return all facets without any filtering of facets + results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "tags: ", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + + results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "tags:", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + + // Wildcard facet_by can have partial matches + results = coll_array_fields->search("*", query_fields, "", {"nam*"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ("name_facet", results["facet_counts"][0]["field_name"].get()); + + // Wildcard facet_by having no counts should not be returned + results = coll_array_fields->search("*", query_fields, "", {"optio*"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(0, results["facet_counts"].size()); + + results = coll_array_fields->search("*", query_fields, "", {"optional_facet"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ("optional_facet", results["facet_counts"][0]["field_name"].get()); + + // bad facet query syntax + auto res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "foobar", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Facet query must be in the `facet_field: value` format.", res_op.error().c_str()); + + // unknown facet field + res_op = coll_array_fields->search("*", query_fields, "", {"foobar"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "foobar: baz", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Could not find a facet field named `foobar` in the schema.", res_op.error().c_str()); + + // only prefix matching is valid + res_op = coll_array_fields->search("*", query_fields, "", {"*_facet"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Only prefix matching with a wildcard is allowed.", res_op.error().c_str()); + + // unknown wildcard facet field + res_op = coll_array_fields->search("*", query_fields, "", {"foo*"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Could not find a facet field for `foo*` in the schema.", res_op.error().c_str()); + + // when facet query is given but no facet fields are specified, must return an error message + res_op = coll_array_fields->search("*", query_fields, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "tags: foo", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("The `facet_query` parameter is supplied without a `facet_by` parameter.", res_op.error().c_str()); + + res_op = coll_array_fields->search("*", query_fields, "", {""}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "tags: foo", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Could not find a facet field named `` in the schema.", res_op.error().c_str()); + + // given facet query field must be part of facet fields requested + res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "name_facet: jeremy", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Facet query refers to a facet field `name_facet` that is not part of `facet_by` parameter.", res_op.error().c_str()); + + collectionManager.drop_collection("coll_array_fields"); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetCountsBool) { + Collection *coll1; + + std::vector fields = {field("title", field_types::STRING, false), + field("points", field_types::INT32, false), + field("in_stock", field_types::BOOL, true)}; + + std::vector sort_fields = {sort_by("points", "DESC")}; + + coll1 = collectionManager.get_collection("coll1").get(); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); + } + + nlohmann::json doc; + doc["id"] = "100"; + doc["title"] = "Ford Mustang"; + doc["points"] = 25; + doc["in_stock"] = true; + + coll1->add(doc.dump()); + + doc["id"] = "101"; + doc["title"] = "Tesla Model S"; + doc["points"] = 40; + doc["in_stock"] = false; + + coll1->add(doc.dump()); + + doc["id"] = "102"; + doc["title"] = "Ford Mustang GT"; + doc["points"] = 10; + doc["in_stock"] = true; + + coll1->add(doc.dump()); + + std::vector facets = {"in_stock"}; + + nlohmann::json results = coll1->search("Ford", {"title"}, "", facets, sort_fields, {0}, 10, 1, + token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10,"", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["stats"].size()); + ASSERT_FLOAT_EQ(1, results["facet_counts"][0]["stats"]["total_values"].get()); + + ASSERT_STREQ("in_stock", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("true", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetCountsFloatPrecision) { + Collection *coll1; + + std::vector fields = {field("title", field_types::STRING, false), + field("points", field_types::FLOAT, true)}; + + std::vector sort_fields = {sort_by("points", "DESC")}; + + coll1 = collectionManager.get_collection("coll1").get(); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); + } + + nlohmann::json doc; + doc["id"] = "100"; + doc["title"] = "Ford Mustang"; + doc["points"] = 113.4; + + coll1->add(doc.dump()); + + std::vector facets = {"points"}; + + nlohmann::json results = coll1->search("*", {"title"}, "", facets, sort_fields, {0}, 10, 1, + token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10,"", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + + ASSERT_STREQ("points", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("113.4", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("113.4",results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetStatOnFloatFields) { + Collection *coll_float_fields; + + std::ifstream infile(std::string(ROOT_DIR)+"test/float_documents.jsonl"); + std::vector fields = { + field("title", field_types::STRING, false), + field("score", field_types::FLOAT, false), + field("average", field_types::FLOAT, true) + }; + + std::vector sort_fields_desc = { sort_by("average", "DESC") }; + + coll_float_fields = collectionManager.get_collection("coll_float_fields").get(); + if(coll_float_fields == nullptr) { + coll_float_fields = collectionManager.create_collection("coll_float_fields", 4, fields, "average").get(); + } + + std::string json_line; + + while (std::getline(infile, json_line)) { + coll_float_fields->add(json_line); + } + + infile.close(); + + query_fields = {"title"}; + auto res_op = coll_float_fields->search("Jeremy", query_fields, "", {"average"}, sort_fields_desc, {0}, 10, + 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + auto results = res_op.get(); + + ASSERT_EQ(7, results["hits"].size()); + + ASSERT_EQ(5, results["facet_counts"][0]["stats"].size()); + ASSERT_FLOAT_EQ(-21.3799991607666, results["facet_counts"][0]["stats"]["min"].get()); + ASSERT_FLOAT_EQ(300, results["facet_counts"][0]["stats"]["max"].get()); + ASSERT_FLOAT_EQ(277.8160007725237, results["facet_counts"][0]["stats"]["sum"].get()); + ASSERT_FLOAT_EQ(39.68800011036053, results["facet_counts"][0]["stats"]["avg"].get()); + ASSERT_FLOAT_EQ(7, results["facet_counts"][0]["stats"]["total_values"].get()); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetCountOnSimilarStrings) { + Collection *coll1; + + std::vector fields = {field("categories", field_types::STRING_ARRAY, true), + field("points", field_types::INT32, true)}; + + std::vector sort_fields = {sort_by("points", "DESC")}; + + coll1 = collectionManager.get_collection("coll1").get(); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); + } + + nlohmann::json doc; + doc["id"] = "100"; + doc["categories"] = {"England in India"}; + doc["points"] = 25; + + coll1->add(doc.dump()); + + doc["id"] = "101"; + doc["categories"] = {"India in England"}; + doc["points"] = 50; + + coll1->add(doc.dump()); + + std::vector facets = {"categories"}; + + nlohmann::json results = coll1->search("*", {"categories"}, "", facets, sort_fields, {0}, 10, 1, + token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(2, results["hits"].size()); + ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); + + ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetByNestedIntField) { + nlohmann::json schema = R"({ + "name": "coll1", + "enable_nested_fields": true, + "fields": [ + {"name": "details", "type": "object", "optional": false }, + {"name": "company.num_employees", "type": "int32", "optional": false, "facet": true }, + {"name": "companyRank", "type": "int32", "optional": false, "facet": true } + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll1 = op.get(); + + auto doc1 = R"({ + "details": {"count": 1000}, + "company": {"num_employees": 2000}, + "companyRank": 100 + })"_json; + + auto doc2 = R"({ + "details": {"count": 2000}, + "company": {"num_employees": 2000}, + "companyRank": 101 + })"_json; + + ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok()); + ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok()); + + std::vector sort_fields = { sort_by("details.count", "ASC") }; + + auto results = coll1->search("*", {}, "", {"company.num_employees"}, sort_fields, {0}, 10, 1, + token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 1UL, "", "", {}, 3UL, + "", "", {}, 4294967295UL, true, false, true, "", false, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(2, results["found"].get()); + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ("company.num_employees", results["facet_counts"][0]["field_name"]); + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(2, results["facet_counts"][0]["counts"][0]["count"].get()); + ASSERT_EQ("2000", results["facet_counts"][0]["counts"][0]["value"].get()); + + // Nested wildcard faceting + std::vector wildcard_facets; + coll1->parse_facet("company.*", wildcard_facets); + + ASSERT_EQ(1, wildcard_facets.size()); + ASSERT_EQ("company.num_employees", wildcard_facets[0].field_name); + + wildcard_facets.clear(); + coll1->parse_facet("company*", wildcard_facets); + + ASSERT_EQ(2, wildcard_facets.size()); + ASSERT_EQ("company.num_employees", wildcard_facets[0].field_name); + ASSERT_EQ("companyRank", wildcard_facets[1].field_name); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetParseTest){ + std::vector fields = { + field("score", field_types::INT32, true), + field("grade", field_types::INT32, true), + field("rank", field_types::INT32, true), + field("range", field_types::INT32, true), + field("scale", field_types::INT32, false), + }; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + std::vector range_facet_fields { + "score(fail:[0, 40], pass:[40, 100])", + "grade(A:[80, 100], B:[60, 80], C:[40, 60])" + }; + std::vector range_facets; + for(const std::string & facet_field: range_facet_fields) { + coll1->parse_facet(facet_field, range_facets); + } + ASSERT_EQ(2, range_facets.size()); + + ASSERT_STREQ("score", range_facets[0].field_name.c_str()); + ASSERT_TRUE(range_facets[0].is_range_query); + ASSERT_GT(range_facets[0].facet_range_map.size(), 0); + + ASSERT_STREQ("grade", range_facets[1].field_name.c_str()); + ASSERT_TRUE(range_facets[1].is_range_query); + ASSERT_GT(range_facets[1].facet_range_map.size(), 0); + + std::vector normal_facet_fields { + "score", + "grade" + }; + std::vector normal_facets; + for(const std::string & facet_field: normal_facet_fields) { + coll1->parse_facet(facet_field, normal_facets); + } + ASSERT_EQ(2, normal_facets.size()); + + ASSERT_STREQ("score", normal_facets[0].field_name.c_str()); + ASSERT_STREQ("grade", normal_facets[1].field_name.c_str()); + + std::vector wildcard_facet_fields { + "ran*", + "sc*", + }; + std::vector wildcard_facets; + for(const std::string & facet_field: wildcard_facet_fields) { + coll1->parse_facet(facet_field, wildcard_facets); + } + + ASSERT_EQ(3, wildcard_facets.size()); + + std::set expected{"range", "rank", "score"}; + for (size_t i = 0; i < wildcard_facets.size(); i++) { + ASSERT_TRUE(expected.count(wildcard_facets[i].field_name) == 1); + } + + wildcard_facets.clear(); + coll1->parse_facet("*", wildcard_facets); + + // Last field is not a facet. + ASSERT_EQ(fields.size() - 1, wildcard_facets.size()); + + expected.clear(); + for (size_t i = 0; i < fields.size() - 1; i++) { + expected.insert(fields[i].name); + } + + for (size_t i = 0; i < wildcard_facets.size(); i++) { + ASSERT_TRUE(expected.count(wildcard_facets[i].field_name) == 1); + } + + std::vector mixed_facet_fields { + "score", + "grade(A:[80, 100], B:[60, 80], C:[40, 60])", + "ra*", + }; + + std::vector mixed_facets; + for(const std::string & facet_field: mixed_facet_fields) { + coll1->parse_facet(facet_field, mixed_facets); + } + ASSERT_EQ(4, mixed_facets.size()); + + std::vector mixed_facets_ptr; + for(auto& f: mixed_facets) { + mixed_facets_ptr.push_back(&f); + } + + std::sort(mixed_facets_ptr.begin(), mixed_facets_ptr.end(), [](const facet* f1, const facet* f2) { + return f1->field_name < f2->field_name; + }); + + ASSERT_EQ("score", mixed_facets_ptr[3]->field_name); + + ASSERT_EQ("grade", mixed_facets_ptr[0]->field_name); + ASSERT_TRUE(mixed_facets_ptr[0]->is_range_query); + ASSERT_GT(mixed_facets_ptr[0]->facet_range_map.size(), 0); + + ASSERT_EQ("rank", mixed_facets_ptr[2]->field_name); + ASSERT_EQ("range", mixed_facets_ptr[1]->field_name); +} + +TEST_F(CollectionOptimizedFacetingTest, RangeFacetTest) { + std::vector fields = {field("place", field_types::STRING, false), + field("state", field_types::STRING, false), + field("visitors", field_types::INT32, true), + field("trackingFrom", field_types::INT32, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", {}, {} + ).get(); + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["place"] = "Mysore Palace"; + doc1["state"] = "Karnataka"; + doc1["visitors"] = 235486; + doc1["trackingFrom"] = 1900; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["place"] = "Hampi"; + doc2["state"] = "Karnataka"; + doc2["visitors"] = 187654; + doc2["trackingFrom"] = 1900; + + nlohmann::json doc3; + doc3["id"] = "2"; + doc3["place"] = "Mahabalipuram"; + doc3["state"] = "TamilNadu"; + doc3["visitors"] = 174684; + doc3["trackingFrom"] = 1900; + + nlohmann::json doc4; + doc4["id"] = "3"; + doc4["place"] = "Meenakshi Amman Temple"; + doc4["state"] = "TamilNadu"; + doc4["visitors"] = 246676; + doc4["trackingFrom"] = 2000; + + nlohmann::json doc5; + doc5["id"] = "4"; + doc5["place"] = "Staue of Unity"; + doc5["state"] = "Gujarat"; + doc5["visitors"] = 345878; + doc5["trackingFrom"] = 2000; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + ASSERT_TRUE(coll1->add(doc3.dump()).ok()); + ASSERT_TRUE(coll1->add(doc4.dump()).ok()); + ASSERT_TRUE(coll1->add(doc5.dump()).ok()); + + auto results = coll1->search("Karnataka", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("Busy", results["facet_counts"][0]["counts"][0]["value"].get()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_EQ("VeryBusy", results["facet_counts"][0]["counts"][1]["value"].get()); + + auto results2 = coll1->search("Gujarat", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(1, results2["facet_counts"][0]["counts"].size()); + ASSERT_EQ(1, results2["facet_counts"][0]["counts"][0]["count"].get()); + ASSERT_STREQ("VeryBusy", results2["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_TRUE(results2["facet_counts"][0]["counts"][1]["value"] == nullptr); + + // ensure that unknown facet field are handled + + auto results3 = coll1->search("Gujarat", {"state"}, + "", {"visitorsz(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", true, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, 0UL, + max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(results3.ok()); + ASSERT_EQ("Could not find a facet field named `visitorsz` in the schema.", results3.error()); + + auto results4 = coll1->search("*", {"state"}, + "", {"trackingFrom(Old:[0, 1910], New:[1910, 2100])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(2, results4["facet_counts"][0]["counts"].size()); + ASSERT_EQ(3, results4["facet_counts"][0]["counts"][0]["count"].get()); + ASSERT_EQ("Old", results4["facet_counts"][0]["counts"][0]["value"].get()); + + ASSERT_EQ(2, results4["facet_counts"][0]["counts"][1]["count"].get()); + ASSERT_EQ("New", results4["facet_counts"][0]["counts"][1]["value"].get()); + + // ensure that only integer fields are allowed + auto rop = coll1->search("Karnataka", {"state"}, + "", {"state(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(rop.ok()); + ASSERT_EQ("Range facet is restricted to only int32 and int64 fields.", rop.error()); + + // ensure that bad facet range values are handled + rop = coll1->search("Karnataka", {"state"}, + "", {"visitors(Busy:[alpha, 200000], VeryBusy:[200000, beta])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(rop.ok()); + ASSERT_EQ("Facet range value is not valid.", rop.error()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, RangeFacetContinuity) { + std::vector fields = {field("place", field_types::STRING, false), + field("state", field_types::STRING, false), + field("visitors", field_types::INT32, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", {}, {} + ).get(); + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["place"] = "Mysore Palace"; + doc1["state"] = "Karnataka"; + doc1["visitors"] = 235486; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["place"] = "Hampi"; + doc2["state"] = "Karnataka"; + doc2["visitors"] = 187654; + + nlohmann::json doc3; + doc3["id"] = "2"; + doc3["place"] = "Mahabalipuram"; + doc3["state"] = "TamilNadu"; + doc3["visitors"] = 174684; + + nlohmann::json doc4; + doc4["id"] = "3"; + doc4["place"] = "Meenakshi Amman Temple"; + doc4["state"] = "TamilNadu"; + doc4["visitors"] = 246676; + + nlohmann::json doc5; + doc5["id"] = "4"; + doc5["place"] = "Staue of Unity"; + doc5["state"] = "Gujarat"; + doc5["visitors"] = 345878; + + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + ASSERT_TRUE(coll1->add(doc3.dump()).ok()); + ASSERT_TRUE(coll1->add(doc4.dump()).ok()); + ASSERT_TRUE(coll1->add(doc5.dump()).ok()); + + auto results = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:[200001, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Ranges in range facet syntax should be continous.", results.error().c_str()); + + auto results2 = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:[199999, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Ranges in range facet syntax should be continous.", results2.error().c_str()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, RangeFacetTypo) { + std::vector fields = {field("place", field_types::STRING, false), + field("state", field_types::STRING, false), + field("visitors", field_types::INT32, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", {}, {} + ).get(); + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["place"] = "Mysore Palace"; + doc1["state"] = "Karnataka"; + doc1["visitors"] = 235486; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["place"] = "Hampi"; + doc2["state"] = "Karnataka"; + doc2["visitors"] = 187654; + + nlohmann::json doc3; + doc3["id"] = "2"; + doc3["place"] = "Mahabalipuram"; + doc3["state"] = "TamilNadu"; + doc3["visitors"] = 174684; + + nlohmann::json doc4; + doc4["id"] = "3"; + doc4["place"] = "Meenakshi Amman Temple"; + doc4["state"] = "TamilNadu"; + doc4["visitors"] = 246676; + + nlohmann::json doc5; + doc5["id"] = "4"; + doc5["place"] = "Staue of Unity"; + doc5["state"] = "Gujarat"; + doc5["visitors"] = 345878; + + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + ASSERT_TRUE(coll1->add(doc3.dump()).ok()); + ASSERT_TRUE(coll1->add(doc4.dump()).ok()); + ASSERT_TRUE(coll1->add(doc5.dump()).ok()); + + auto results = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000)"}, //missing ']' at end + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Error splitting the facet range values.", results.error().c_str()); + + auto results2 = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:200000, 500000])"}, //missing '[' in second range + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Error splitting the facet range values.", results2.error().c_str()); + + auto results3 = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000] VeryBusy:[200000, 500000])"}, //missing ',' between ranges + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Error splitting the facet range values.", results3.error().c_str()); + + auto results4 = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0 200000], VeryBusy:[200000, 500000])"}, //missing ',' between first ranges values + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Facet range value is not valid.", results4.error().c_str()); + + auto results5 = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000 VeryBusy:200000, 500000])"}, //missing '],' and '[' + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Facet range value is not valid.", results5.error().c_str()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, SampleFacetCounts) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "color", "type": "string", "facet": true} + ] + })"_json; + + Collection* coll1 = collectionManager.create_collection(schema).get(); + + std::mt19937 gen(137723); // use constant seed to make sure that counts don't jump around + std::uniform_int_distribution<> distr(1, 100); // 1 to 100 inclusive + + size_t count_blue = 0, count_red = 0; + + for(size_t i = 0; i < 1000; i++) { + nlohmann::json doc; + if(distr(gen) % 2 == 0) { + doc["color"] = "blue"; + count_blue++; + } else { + doc["color"] = "red"; + count_red++; + } + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + } + + auto res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 10, 0, 4294967295UL, true).get(); + + ASSERT_EQ(1000, res["found"].get()); + ASSERT_EQ(1, res["facet_counts"].size()); + ASSERT_EQ(2, res["facet_counts"][0]["counts"].size()); + + // verify approximate counts + ASSERT_GE(res["facet_counts"][0]["counts"][0]["count"].get(), 250); + ASSERT_GE(res["facet_counts"][0]["counts"][1]["count"].get(), 250); + ASSERT_TRUE(res["facet_counts"][0]["sampled"].get()); + + // when sample threshold is high, don't estimate + res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 10, 10000, 4294967295UL, true).get(); + + ASSERT_EQ(1000, res["found"].get()); + ASSERT_EQ(1, res["facet_counts"].size()); + ASSERT_EQ(2, res["facet_counts"][0]["counts"].size()); + + for(size_t i = 0; i < res["facet_counts"][0]["counts"].size(); i++) { + if(res["facet_counts"][0]["counts"][i]["value"].get() == "red") { + ASSERT_EQ(count_red, res["facet_counts"][0]["counts"][i]["count"].get()); + } else { + ASSERT_EQ(count_blue, res["facet_counts"][0]["counts"][i]["count"].get()); + } + } + + ASSERT_FALSE(res["facet_counts"][0]["sampled"].get()); + + // test for sample percent > 100 + + auto res_op = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 200, 0, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_EQ("Value of `facet_sample_percent` must be less than 100.", res_op.error()); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetOnArrayFieldWithSpecialChars) { + std::vector fields = { + field("tags", field_types::STRING_ARRAY, true), + field("points", field_types::INT32, true), + }; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + nlohmann::json doc; + doc["tags"] = {"gamma"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["tags"] = {"alpha", "| . |", "beta", "gamma"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"tags"}, {}, {2}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 100, 0, 4294967295UL, true).get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(4, results["facet_counts"][0]["counts"].size()); + + for(size_t i = 0; i < results["facet_counts"][0]["counts"].size(); i++) { + auto fvalue = results["facet_counts"][0]["counts"][i]["value"].get(); + if(fvalue == "gamma") { + ASSERT_EQ(2, results["facet_counts"][0]["counts"][i]["count"].get()); + } else { + ASSERT_EQ(1, results["facet_counts"][0]["counts"][i]["count"].get()); + } + } +} + +TEST_F(CollectionOptimizedFacetingTest, StringLengthTest) { + std::vector fields = { + field("tags", field_types::STRING_ARRAY, true), + field("points", field_types::INT32, true), + }; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + nlohmann::json doc; + doc["tags"] = {"gamma"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["tags"] = {"beta"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["tags"] = {"alpha"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + std::string longStr = ""; + for(auto i = 0; i < 8; ++i) { + longStr+="alphabetagamma"; + } + + ASSERT_TRUE(112 == longStr.size()); + + std::vector vec; + vec.emplace_back(longStr); + doc["tags"] = vec; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"tags"}, {}, {2}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 100, 0, 4294967295UL, true).get(); + + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(4, results["facet_counts"][0]["counts"].size()); + + longStr = results["facet_counts"][0]["counts"][3]["value"]; + + //string facet length is restricted to 100 + ASSERT_TRUE(100 == longStr.size()); +} \ No newline at end of file