From 92c38c3fa5ea0d1593fb21fe06ba792e9a119000 Mon Sep 17 00:00:00 2001 From: krunal1313 Date: Mon, 1 May 2023 12:01:35 +0530 Subject: [PATCH] adding tests and optimizing indexing --- BUILD | 1 + include/collection.h | 10 +- include/facet_index.h | 4 +- include/index.h | 23 +- src/collection.cpp | 15 +- src/facet_index.cpp | 43 +- src/field.cpp | 2 +- src/index.cpp | 171 +++-- src/num_tree.cpp | 25 +- test/collection_faceting_test.cpp | 1191 ++++++++++++++++++++++++++++- 10 files changed, 1373 insertions(+), 112 deletions(-) diff --git a/BUILD b/BUILD index f47ced69..a78c2f28 100644 --- a/BUILD +++ b/BUILD @@ -134,6 +134,7 @@ TEST_COPTS = [ "-Wno-unused-parameter", "-Werror=return-type", "-g", + "-DFORCE_INTERSECTION", ] config_setting( diff --git a/include/collection.h b/include/collection.h index caaf897f..0eb820bb 100644 --- a/include/collection.h +++ b/include/collection.h @@ -182,8 +182,8 @@ private: std::vector& new_fields, bool enable_nested_fields); - static bool facet_count_compare(const std::pair& a, - const std::pair& b) { + static bool facet_count_compare(const std::pair& a, + const std::pair& b) { return std::tie(a.second.count, a.first) > std::tie(b.second.count, b.first); } @@ -460,7 +460,11 @@ public: const text_match_type_t match_type = max_score, const size_t facet_sample_percent = 100, const size_t facet_sample_threshold = 0, - const size_t page_offset = UINT32_MAX) const; + const size_t page_offset = UINT32_MAX +#ifdef FORCE_INTERSECTION + , bool force_intersection = false +#endif + ) const; Option get_filter_ids(const std::string & filter_query, filter_result_t& filter_result) const; diff --git a/include/facet_index.h b/include/facet_index.h index cafb6c74..491148b5 100644 --- a/include/facet_index.h +++ b/include/facet_index.h @@ -52,7 +52,5 @@ public: size_t get_facet_count(const std::string& field); size_t intersect(const std::string& val, const uint32_t* result_ids, int result_id_len, - int max_facet_count, std::map& found, bool is_wildcard_no_filter_query); - - std::string get_facet_by_count_index(const std::string& field, uint32_t count_index); + int max_facet_count, std::map& found, bool is_wildcard_no_filter_query); }; \ No newline at end of file diff --git a/include/index.h b/include/index.h index d045d923..d347437d 100644 --- a/include/index.h +++ b/include/index.h @@ -363,7 +363,11 @@ private: const std::vector& facet_infos, size_t group_limit, const std::vector& group_by_fields, const uint32_t* result_ids, size_t results_size, - int max_facet_count, bool is_wildcard_query, bool no_filters_provided) const; + int max_facet_count, bool is_wildcard_query, bool no_filters_provided +#ifdef FORCE_INTERSECTION + , bool force_intersection = false +#endif + ) const; bool static_filter_query_eval(const override_t* override, std::vector& tokens, filter_node_t*& filter_tree_root) const; @@ -520,6 +524,8 @@ private: static void compute_facet_stats(facet &a_facet, const std::string& raw_value, const std::string & field_type); + static void compute_facet_stats(facet &a_facet, const int64_t raw_value, const std::string & field_type); + static void get_doc_changes(const index_operation_t op, nlohmann::json &update_doc, const nlohmann::json &old_doc, nlohmann::json &new_doc, nlohmann::json &del_doc); @@ -631,7 +637,12 @@ public: // Public operations - Option run_search(search_args* search_params, const std::string& collection_name); + Option run_search(search_args* search_params, + const std::string& collection_name +#ifdef FORCE_INTERSECTION + , bool force_intersection +#endif + ); Option search(std::vector& field_query_tokens, const std::vector& the_fields, const text_match_type_t match_type, @@ -656,7 +667,11 @@ public: const size_t max_extra_suffix, const size_t facet_query_num_typos, const bool filter_curated_hits, enable_t split_join_tokens, const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold, - const std::string& collection_name) const; + const std::string& collection_name +#ifdef FORCE_INTERSECTION + , bool force_intersection = false +#endif + ) const; void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name); @@ -942,6 +957,8 @@ public: uint32_t filter_ids_length, std::set& curated_ids, std::map>& included_ids_map, std::vector& included_ids_vec) const; + + int64_t get_doc_val_from_sort_index(const std::string& field_name, uint32_t doc_seq_id) const; }; template diff --git a/src/collection.cpp b/src/collection.cpp index 19aa663a..ecbed5ec 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1070,7 +1070,11 @@ Option Collection::search(std::string raw_query, const text_match_type_t match_type, const size_t facet_sample_percent, const size_t facet_sample_threshold, - const size_t page_offset) const { + const size_t page_offset +#ifdef FORCE_INTERSECTION + , bool force_intersection +#endif + ) const { std::shared_lock lock(mutex); @@ -1520,7 +1524,12 @@ Option Collection::search(std::string raw_query, std::unique_ptr search_params_guard(search_params); - auto search_op = index->run_search(search_params, name); + auto search_op = index->run_search(search_params, name +#ifdef FORCE_INTERSECTION + , force_intersection +#endif + ); + if (!search_op.ok()) { return Option(search_op.code(), search_op.error()); } @@ -1937,7 +1946,7 @@ Option Collection::search(std::string raw_query, auto max_facets = std::min(max_facet_values, facet_counts.size()); std::sort(facet_counts.begin(), facet_counts.end(), [&](const auto& p1, const auto& p2) { - return p1.second > p2.second; + return std::tie(p1.second, p1.first) > std::tie(p2.second, p2.first); }); for(int i = 0; i < max_facets; ++i) { diff --git a/src/facet_index.cpp b/src/facet_index.cpp index 2d49a7df..902134f2 100644 --- a/src/facet_index.cpp +++ b/src/facet_index.cpp @@ -39,23 +39,17 @@ uint32_t facet_index_t::insert(const std::string& field, const std::string& valu counter_list.emplace_back(sv, facet_count); } else { auto counter_it = counter_list.begin(); - //remove node from list - for(counter_it = counter_list.begin(); counter_it != counter_list.end(); ++counter_it) { - if(counter_it->facet_value == sv) { - //found facet in first node - counter_list.erase(counter_it); - break; - } - } - - //find position in list and add node with updated count + count_list node(sv, facet_count); for(counter_it = counter_list.begin(); counter_it != counter_list.end(); ++counter_it) { - // LOG (INFO) << "inserting in middle or front facet " << node.facet_value - // << " with count " << node.count; - if(counter_it->count <= facet_count) { - counter_list.emplace(counter_it, node); + if(counter_it->facet_value == sv) { + counter_it->count = facet_count; + + auto prev_node = std::prev(counter_it); + if(prev_node->count < counter_it->count) { + std::swap(prev_node, counter_it); + } break; } } @@ -127,9 +121,6 @@ size_t facet_index_t::intersect(const std::string& field, const uint32_t* result ids_t::uncompress(ids, id_list); const auto ids_len = id_list.size(); for(int i = 0; i < result_ids_len; ++i) { - // if(std::binary_search(id_list.begin(), id_list.end(), result_ids[i])) { - // ++count; - // } uint32_t* out = nullptr; count = ArrayUtils::and_scalar(id_list.data(), id_list.size(), result_ids, result_ids_len, &out); @@ -148,24 +139,6 @@ size_t facet_index_t::intersect(const std::string& field, const uint32_t* result return found.size(); } -std::string facet_index_t::get_facet_by_count_index(const std::string& field, uint32_t count_index) { - - const auto& facet_field_it = facet_field_map.find(field); - - if(facet_field_it == facet_field_map.end()) { - return ""; - } - std::string result = ""; - auto facet_index_map = facet_field_it->second.facet_index_map; - - for(auto it = facet_index_map.begin(); it != facet_index_map.end(); ++it) { - if(it.value().index == count_index) { - result = it.key(); - } - } - return result; -} - facet_index_t::~facet_index_t() { facet_field_map.clear(); } diff --git a/src/field.cpp b/src/field.cpp index bcc8470a..6e1a4de6 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -652,7 +652,7 @@ Option field::json_field_to_field(bool enable_nested_fields, nlohmann::jso if(field_json["type"] == field_types::INT32 || field_json["type"] == field_types::INT64 || field_json["type"] == field_types::FLOAT || field_json["type"] == field_types::BOOL || field_json["type"] == field_types::GEOPOINT || field_json["type"] == field_types::GEOPOINT_ARRAY) { - if(field_json.count(fields::num_dim) == 0) { + if((field_json.count(fields::num_dim) == 0) || (field_json[fields::facet])) { field_json[fields::sort] = true; } else { field_json[fields::sort] = false; diff --git a/src/index.cpp b/src/index.cpp index 26d2fb49..4080e0de 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1139,12 +1139,66 @@ void Index::compute_facet_stats(facet &a_facet, const std::string& raw_value, co } } +void Index::compute_facet_stats(facet &a_facet, const int64_t raw_value, const std::string & field_type) { + if(field_type == field_types::INT32 || field_type == field_types::INT32_ARRAY) { + int32_t val = raw_value; + if (val < a_facet.stats.fvmin) { + a_facet.stats.fvmin = val; + } + if (val > a_facet.stats.fvmax) { + a_facet.stats.fvmax = val; + } + a_facet.stats.fvsum += val; + a_facet.stats.fvcount++; + } else if(field_type == field_types::INT64 || field_type == field_types::INT64_ARRAY) { + int64_t val = raw_value; + if(val < a_facet.stats.fvmin) { + a_facet.stats.fvmin = val; + } + if(val > a_facet.stats.fvmax) { + a_facet.stats.fvmax = val; + } + a_facet.stats.fvsum += val; + a_facet.stats.fvcount++; + } else if(field_type == field_types::FLOAT || field_type == field_types::FLOAT_ARRAY) { + float val = int64_t_to_float(raw_value); + if(val < a_facet.stats.fvmin) { + a_facet.stats.fvmin = val; + } + if(val > a_facet.stats.fvmax) { + a_facet.stats.fvmax = val; + } + a_facet.stats.fvsum += val; + a_facet.stats.fvcount++; + } +} + +int64_t Index::get_doc_val_from_sort_index(const std::string& field_name, uint32_t doc_seq_id) const { + + auto sort_index_it = sort_index.find(field_name); + + if(sort_index_it != sort_index.end()){ + auto doc_id_val_map = sort_index_it->second; + auto doc_seq_id_it = doc_id_val_map->find(doc_seq_id); + + if(doc_seq_id_it != doc_id_val_map->end()){ + return doc_seq_id_it->second; + } + } + + return 0; +} + void Index::do_facets(std::vector & facets, facet_query_t & facet_query, bool estimate_facets, size_t facet_sample_percent, const std::vector& facet_infos, const size_t group_limit, const std::vector& group_by_fields, const uint32_t* result_ids, size_t results_size, - int max_facet_count, bool is_wildcard_query, bool no_filters_provided) const { + int max_facet_count, bool is_wildcard_query, bool no_filters_provided +#ifdef FORCE_INTERSECTION + , bool force_intersection +#endif + ) const { // assumed that facet fields have already been validated upstream for(size_t findex=0; findex < facets.size(); findex++) { auto& a_facet = facets[findex]; @@ -1170,32 +1224,45 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, } } - if(results_size && facet_records && (facet_records <= 10 || is_wildcard_query) && - !use_facet_query && group_limit == 0 && no_filters_provided) { +#ifdef FORCE_INTERSECTION + bool use_hashes = false; + + if(!force_intersection) { + use_hashes = true; + } +#endif + + if(results_size && facet_records && ((facet_records <= 10 || is_wildcard_query) && + !use_facet_query && group_limit == 0 && no_filters_provided) +#ifdef FORCE_INTERSECTION + && !use_hashes || force_intersection +#endif + ) { //LOG(INFO) << "Using intersection to find facets"; a_facet.is_intersected = true; std::map facet_results; - - if(facet_field.is_string()) { - facet_index_v4->intersect(a_facet.field_name, result_ids, - results_size, max_facet_count, facet_results, is_wildcard_query & no_filters_provided); - } else { - std::map facet_counts; - numerical_index.at(a_facet.field_name)->intersect(result_ids, + if(!facet_field.name.empty()) { + if(facet_field.is_string()) { + facet_index_v4->intersect(a_facet.field_name, result_ids, + results_size, max_facet_count, facet_results, is_wildcard_query & no_filters_provided); + } else { + std::map facet_counts; + numerical_index.at(a_facet.field_name)->intersect(result_ids, results_size, max_facet_count, facet_counts, is_wildcard_query & no_filters_provided); - for(const auto& kv : facet_counts) { - std::string val; - if(facet_field.is_float()) { - val = std::to_string(int64_t_to_float(kv.first)); - } else if(facet_field.is_bool()) { - val = kv.first == 1 ? "true" : "false"; - } else { - val = std::to_string(kv.first); + for(const auto& kv : facet_counts) { + std::string val; + if(facet_field.is_float()) { + val = StringUtils::float_to_str(int64_t_to_float(kv.first)); + } else if(facet_field.is_bool()) { + val = kv.first == 1 ? "true" : "false"; + } else { + val = std::to_string(kv.first); + } + + facet_results[val] = kv.second; } - - facet_results[val] = kv.second; } } @@ -1269,32 +1336,24 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, RETURN_CIRCUIT_BREAKER } + int64_t doc_val = 0; for(size_t j = 0; j < facet_hash_count; j++) { if(facet_field.is_array()) { fhash = facet_map_it->second.hashes[j]; } if(should_compute_stats) { - std::string fvalue = - facet_index_v4->get_facet_by_count_index(a_facet.field_name, fhash); - if(!fvalue.empty()) { - compute_facet_stats(a_facet, fvalue, facet_field.type); - } + doc_val = get_doc_val_from_sort_index(a_facet.field_name, doc_seq_id); + compute_facet_stats(a_facet, doc_val, facet_field.type); } + if(a_facet.is_range_query) { - auto sort_index_it = sort_index.find(a_facet.field_name); - if(sort_index_it != sort_index.end()){ - auto doc_id_val_map = sort_index_it->second; - auto doc_seq_id_it = doc_id_val_map->find(doc_seq_id); - if(doc_seq_id_it != doc_id_val_map->end()){ + doc_val = get_doc_val_from_sort_index(a_facet.field_name, doc_seq_id); - std::string doc_val = std::to_string(doc_seq_id_it->second); - std::pair range_pair {}; - if(a_facet.get_range(doc_val, range_pair)) { - const auto& range_id = range_pair.first; - facet_count_t& facet_count = a_facet.result_map[range_id]; - facet_count.count += 1; - } - } + std::pair range_pair {}; + if(a_facet.get_range(std::to_string(doc_val), range_pair)) { + const auto& range_id = range_pair.first; + facet_count_t& facet_count = a_facet.result_map[range_id]; + facet_count.count += 1; } } else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) { std::string fhash_str = std::to_string(fhash); @@ -2288,7 +2347,11 @@ Option Index::get_approximate_reference_filter_ids_with_lock(filter_node_t return rearrange_filter_tree(filter_tree_root, filter_ids_length); } -Option Index::run_search(search_args* search_params, const std::string& collection_name) { +Option Index::run_search(search_args* search_params, const std::string& collection_name +#ifdef FORCE_INTERSECTION + , bool force_intersection +#endif + ) { return search(search_params->field_query_tokens, search_params->search_fields, search_params->match_type, @@ -2323,7 +2386,11 @@ Option Index::run_search(search_args* search_params, const std::string& co search_params->vector_query, search_params->facet_sample_percent, search_params->facet_sample_threshold, - collection_name); + collection_name +#ifdef FORCE_INTERSECTION + , force_intersection +#endif + ); } void Index::collate_included_ids(const std::vector& q_included_tokens, @@ -2772,7 +2839,11 @@ Option Index::search(std::vector& field_query_tokens, cons const bool filter_curated_hits, const enable_t split_join_tokens, const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold, - const std::string& collection_name) const { + const std::string& collection_name +#ifdef FORCE_INTERSECTION + , bool force_intersection +#endif + ) const { std::shared_lock lock(mutex); uint32_t filter_ids_length = 0; @@ -3297,7 +3368,11 @@ Option Index::search(std::vector& field_query_tokens, cons batch_result_ids, batch_res_len, &facet_infos, max_facet_values, is_wildcard_query, no_filters_provided, estimate_facets, facet_sample_percent, &parent_search_begin, &parent_search_stop_ms, &parent_search_cutoff, - &num_processed, &m_process, &cv_process]() { + &num_processed, &m_process, &cv_process +#ifdef FORCE_INTERSECTION + , force_intersection +#endif + ]() { search_begin_us = parent_search_begin; search_stop_us = parent_search_stop_ms; search_cutoff = parent_search_cutoff; @@ -3307,7 +3382,11 @@ Option Index::search(std::vector& field_query_tokens, cons do_facets(facet_batches[thread_id], fq, estimate_facets, facet_sample_percent, facet_infos, group_limit, group_by_fields, batch_result_ids, batch_res_len, max_facet_values, - is_wildcard_query, no_filters_provided); + is_wildcard_query, no_filters_provided +#ifdef FORCE_INTERSECTION + , force_intersection +#endif + ); std::unique_lock lock(m_process); num_processed++; parent_search_cutoff = parent_search_cutoff || search_cutoff; @@ -3392,7 +3471,11 @@ Option Index::search(std::vector& field_query_tokens, cons max_candidates, facet_infos); do_facets(facets, facet_query, estimate_facets, facet_sample_percent, facet_infos, group_limit, group_by_fields, &included_ids_vec[0], - included_ids_vec.size(), max_facet_values, is_wildcard_query, no_filters_provided); + included_ids_vec.size(), max_facet_values, is_wildcard_query, no_filters_provided +#ifdef FORCE_INTERSECTION + , force_intersection +#endif + ); all_result_ids_len += curated_topster->size; diff --git a/src/num_tree.cpp b/src/num_tree.cpp index 6e433eb1..7f5aabd4 100644 --- a/src/num_tree.cpp +++ b/src/num_tree.cpp @@ -22,23 +22,17 @@ void num_tree_t::insert(int64_t value, uint32_t id, bool is_facet) { counter_list.emplace_back(value, facet_count); } else { auto counter_it = counter_list.begin(); - //remove node from list - for(counter_it = counter_list.begin(); counter_it != counter_list.end(); ++counter_it) { - if(counter_it->facet_value == value) { - //found facet in first node - counter_list.erase(counter_it); - break; - } - } - - //find position in list and add node with updated count + count_list node(value, facet_count); for(counter_it = counter_list.begin(); counter_it != counter_list.end(); ++counter_it) { - // LOG (INFO) << "inserting in middle or front facet " << node.facet_value - // << " with count " << node.count; - if(counter_it->count <= facet_count) { - counter_list.emplace(counter_it, node); + if(counter_it->facet_value == value) { + counter_it->count = facet_count; + + auto prev_node = std::prev(counter_it); + if(prev_node->count < counter_it->count) { + std::swap(prev_node, counter_it); + } break; } } @@ -389,9 +383,6 @@ size_t num_tree_t::intersect(const uint32_t* result_ids, int result_ids_len, int ids_t::uncompress(ids, id_list); const auto ids_len = id_list.size(); for(int i = 0; i < result_ids_len; ++i) { - // if(std::binary_search(id_list.begin(), id_list.end(), result_ids[i])) { - // ++count; - // } uint32_t* out = nullptr; count = ArrayUtils::and_scalar(id_list.data(), id_list.size(), result_ids, result_ids_len, &out); diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index 649a91d0..63081292 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -252,7 +252,7 @@ TEST_F(CollectionFacetingTest, FacetCounts) { ASSERT_FLOAT_EQ(5, results["facet_counts"][0]["stats"]["total_values"].get()); // check for "0" case - ASSERT_STREQ("0.000000", results["facet_counts"][0]["counts"][2]["value"].get().c_str()); + ASSERT_STREQ("0", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get()); // facet query on a float field @@ -506,8 +506,8 @@ TEST_F(CollectionFacetingTest, FacetCountsFloatPrecision) { ASSERT_STREQ("points", results["facet_counts"][0]["field_name"].get().c_str()); ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); - ASSERT_STREQ("113.400002", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); - ASSERT_STREQ("113.400002",results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + ASSERT_STREQ("113.4", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("113.4",results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); collectionManager.drop_collection("coll1"); } @@ -1574,3 +1574,1188 @@ TEST_F(CollectionFacetingTest, FacetOnArrayFieldWithSpecialChars) { } } } + + +class CollectionOptimizedFacetingTest : public ::testing::Test { +protected: + Store *store; + CollectionManager & collectionManager = CollectionManager::get_instance(); + std::atomic quit = false; + + std::vector query_fields; + std::vector sort_fields; + + void setupCollection() { + std::string state_dir_path = "/tmp/typesense_test/collection_optimized_faceting"; + LOG(INFO) << "Truncating and creating: " << state_dir_path; + system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); + + store = new Store(state_dir_path); + collectionManager.init(store, 1.0, "auth_key", quit); + collectionManager.load(8, 1000); + } + + virtual void SetUp() { + setupCollection(); + } + + virtual void TearDown() { + collectionManager.dispose(); + delete store; + } +}; + +TEST_F(CollectionOptimizedFacetingTest, FacetCounts) { + Collection *coll_array_fields; + + std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl"); + std::vector fields = {field("name", field_types::STRING, false), + field("name_facet", field_types::STRING, true), + field("age", field_types::INT32, true), + field("years", field_types::INT32_ARRAY, true), + field("rating", field_types::FLOAT, true), + field("timestamps", field_types::INT64_ARRAY, true), + field("tags", field_types::STRING_ARRAY, true), + field("optional_facet", field_types::INT64_ARRAY, true, true),}; + + std::vector sort_fields = { sort_by("age", "DESC") }; + + coll_array_fields = collectionManager.get_collection("coll_array_fields").get(); + if(coll_array_fields == nullptr) { + coll_array_fields = collectionManager.create_collection("coll_array_fields", 4, fields, "age").get(); + } + + std::string json_line; + + while (std::getline(infile, json_line)) { + nlohmann::json document = nlohmann::json::parse(json_line); + document["name_facet"] = document["name"]; + const std::string & patched_json_line = document.dump(); + coll_array_fields->add(patched_json_line); + } + + infile.close(); + + query_fields = {"name"}; + std::vector facets = {"tags"}; + + // single facet with no filters + nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, + {0}, 10, 1, FREQUENCY, {false}, 1UL, + spp::sparse_hash_set(), + spp::sparse_hash_set(), + 10UL, "", 30UL, 4UL, "", 1UL, "", "", {}, + 3UL, "", "", {}, 4294967295UL, true, + false, true, "", false, 6000000UL, 4UL, 7UL, fallback, + 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, + 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(4, results["facet_counts"][0].size()); + ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]); + ASSERT_EQ(false, results["facet_counts"][0]["sampled"].get()); + ASSERT_EQ(4, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["stats"].size()); + ASSERT_EQ(4, results["facet_counts"][0]["stats"]["total_values"].get()); + + ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][0]["count"]); + + ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]); + + ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][2]["value"].get().c_str()); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][2]["count"]); + + ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][3]["value"].get().c_str()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][3]["count"]); + + // facet with facet count limit + results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, {0}, 10, 1, + FREQUENCY, {false}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 2, "", 30UL, 4UL, "", 1UL, + "", "", {}, 3UL, "", "", {}, 4294967295UL, true, + false, true, "", false, 6000000UL, 4UL, 7UL, fallback, 4UL, {off}, + 32767UL, 32767UL, 2UL, 2UL, false, "", true, 0UL, max_score, 100UL, + 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); + + ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][0]["count"]); + + ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]); + + // 2 facets, 1 text query with no filters + facets.clear(); + facets.push_back("tags"); + facets.push_back("name_facet"); + results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, + {0}, 10, 1, FREQUENCY, {false}, 1UL, + spp::sparse_hash_set(), + spp::sparse_hash_set(), + 10UL, "", 30UL, 4UL, "", 1UL, "", "", {}, + 3UL, "", "", {}, 4294967295UL, true, + false, true, "", false, 6000000UL, 4UL, 7UL, fallback, + 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, + 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(2, results["facet_counts"].size()); + + ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_STREQ("name_facet", results["facet_counts"][1]["field_name"].get().c_str()); + + // facet value must one that's stored, not indexed (i.e. no tokenization/standardization) + ASSERT_STREQ("Jeremy Howard", results["facet_counts"][1]["counts"][0]["value"].get().c_str()); + ASSERT_EQ(5, (int) results["facet_counts"][1]["counts"][0]["count"]); + + // facet with wildcard + results = coll_array_fields->search("Jeremy", query_fields, "", {"ag*"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, 1UL, spp::sparse_hash_set(), + spp::sparse_hash_set(), + 10UL, "", 30UL, 4UL, "", 1UL, "", "", {}, + 3UL, "", "", {}, 4294967295UL, true, + false, true, "", false, 6000000UL, 4UL, 7UL, fallback, + 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, + 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_STREQ("age", results["facet_counts"][0]["field_name"].get().c_str()); + + // facet on a float field without query to check on stats + results = coll_array_fields->search("*", query_fields, "", {"rating"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["facet_counts"][0]["stats"].size()); + ASSERT_FLOAT_EQ(4.880199885368347, results["facet_counts"][0]["stats"]["avg"].get()); + ASSERT_FLOAT_EQ(0.0, results["facet_counts"][0]["stats"]["min"].get()); + ASSERT_FLOAT_EQ(9.99899959564209, results["facet_counts"][0]["stats"]["max"].get()); + ASSERT_FLOAT_EQ(24.400999426841736, results["facet_counts"][0]["stats"]["sum"].get()); + ASSERT_FLOAT_EQ(5, results["facet_counts"][0]["stats"]["total_values"].get()); + + // check for "0" case + ASSERT_STREQ("0", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get()); + + facets.clear(); + facets.push_back("tags"); + + // empty facet query value should return all facets without any filtering of facets + results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "tags: ", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + + results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "tags:", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + + // Wildcard facet_by can have partial matches + results = coll_array_fields->search("*", query_fields, "", {"nam*"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ("name_facet", results["facet_counts"][0]["field_name"].get()); + + // Wildcard facet_by having no counts should not be returned + results = coll_array_fields->search("*", query_fields, "", {"optio*"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(0, results["facet_counts"].size()); + + results = coll_array_fields->search("*", query_fields, "", {"optional_facet"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ("optional_facet", results["facet_counts"][0]["field_name"].get()); + + // bad facet query syntax + auto res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "foobar", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Facet query must be in the `facet_field: value` format.", res_op.error().c_str()); + + // unknown facet field + res_op = coll_array_fields->search("*", query_fields, "", {"foobar"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "foobar: baz", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Could not find a facet field named `foobar` in the schema.", res_op.error().c_str()); + + // only prefix matching is valid + res_op = coll_array_fields->search("*", query_fields, "", {"*_facet"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Only prefix matching with a wildcard is allowed.", res_op.error().c_str()); + + // unknown wildcard facet field + res_op = coll_array_fields->search("*", query_fields, "", {"foo*"}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Could not find a facet field for `foo*` in the schema.", res_op.error().c_str()); + + // when facet query is given but no facet fields are specified, must return an error message + res_op = coll_array_fields->search("*", query_fields, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "tags: foo", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("The `facet_query` parameter is supplied without a `facet_by` parameter.", res_op.error().c_str()); + + res_op = coll_array_fields->search("*", query_fields, "", {""}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "tags: foo", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Could not find a facet field named `` in the schema.", res_op.error().c_str()); + + // given facet query field must be part of facet fields requested + res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "name_facet: jeremy", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Facet query refers to a facet field `name_facet` that is not part of `facet_by` parameter.", res_op.error().c_str()); + + collectionManager.drop_collection("coll_array_fields"); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetCountsBool) { + Collection *coll1; + + std::vector fields = {field("title", field_types::STRING, false), + field("points", field_types::INT32, false), + field("in_stock", field_types::BOOL, true)}; + + std::vector sort_fields = {sort_by("points", "DESC")}; + + coll1 = collectionManager.get_collection("coll1").get(); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); + } + + nlohmann::json doc; + doc["id"] = "100"; + doc["title"] = "Ford Mustang"; + doc["points"] = 25; + doc["in_stock"] = true; + + coll1->add(doc.dump()); + + doc["id"] = "101"; + doc["title"] = "Tesla Model S"; + doc["points"] = 40; + doc["in_stock"] = false; + + coll1->add(doc.dump()); + + doc["id"] = "102"; + doc["title"] = "Ford Mustang GT"; + doc["points"] = 10; + doc["in_stock"] = true; + + coll1->add(doc.dump()); + + std::vector facets = {"in_stock"}; + + nlohmann::json results = coll1->search("Ford", {"title"}, "", facets, sort_fields, {0}, 10, 1, + token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10,"", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["stats"].size()); + ASSERT_FLOAT_EQ(1, results["facet_counts"][0]["stats"]["total_values"].get()); + + ASSERT_STREQ("in_stock", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("true", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetCountsFloatPrecision) { + Collection *coll1; + + std::vector fields = {field("title", field_types::STRING, false), + field("points", field_types::FLOAT, true)}; + + std::vector sort_fields = {sort_by("points", "DESC")}; + + coll1 = collectionManager.get_collection("coll1").get(); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); + } + + nlohmann::json doc; + doc["id"] = "100"; + doc["title"] = "Ford Mustang"; + doc["points"] = 113.4; + + coll1->add(doc.dump()); + + std::vector facets = {"points"}; + + nlohmann::json results = coll1->search("*", {"title"}, "", facets, sort_fields, {0}, 10, 1, + token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10,"", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + + ASSERT_STREQ("points", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("113.4", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("113.4",results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetStatOnFloatFields) { + Collection *coll_float_fields; + + std::ifstream infile(std::string(ROOT_DIR)+"test/float_documents.jsonl"); + std::vector fields = { + field("title", field_types::STRING, false), + field("score", field_types::FLOAT, false), + field("average", field_types::FLOAT, true) + }; + + std::vector sort_fields_desc = { sort_by("average", "DESC") }; + + coll_float_fields = collectionManager.get_collection("coll_float_fields").get(); + if(coll_float_fields == nullptr) { + coll_float_fields = collectionManager.create_collection("coll_float_fields", 4, fields, "average").get(); + } + + std::string json_line; + + while (std::getline(infile, json_line)) { + coll_float_fields->add(json_line); + } + + infile.close(); + + query_fields = {"title"}; + auto res_op = coll_float_fields->search("Jeremy", query_fields, "", {"average"}, sort_fields_desc, {0}, 10, + 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + auto results = res_op.get(); + + ASSERT_EQ(7, results["hits"].size()); + + ASSERT_EQ(5, results["facet_counts"][0]["stats"].size()); + ASSERT_FLOAT_EQ(-21.3799991607666, results["facet_counts"][0]["stats"]["min"].get()); + ASSERT_FLOAT_EQ(300, results["facet_counts"][0]["stats"]["max"].get()); + ASSERT_FLOAT_EQ(277.8160007725237, results["facet_counts"][0]["stats"]["sum"].get()); + ASSERT_FLOAT_EQ(39.68800011036053, results["facet_counts"][0]["stats"]["avg"].get()); + ASSERT_FLOAT_EQ(7, results["facet_counts"][0]["stats"]["total_values"].get()); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetCountOnSimilarStrings) { + Collection *coll1; + + std::vector fields = {field("categories", field_types::STRING_ARRAY, true), + field("points", field_types::INT32, true)}; + + std::vector sort_fields = {sort_by("points", "DESC")}; + + coll1 = collectionManager.get_collection("coll1").get(); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); + } + + nlohmann::json doc; + doc["id"] = "100"; + doc["categories"] = {"England in India"}; + doc["points"] = 25; + + coll1->add(doc.dump()); + + doc["id"] = "101"; + doc["categories"] = {"India in England"}; + doc["points"] = 50; + + coll1->add(doc.dump()); + + std::vector facets = {"categories"}; + + nlohmann::json results = coll1->search("*", {"categories"}, "", facets, sort_fields, {0}, 10, 1, + token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30UL, 4UL, + "", 1UL, "", "", {}, 3UL, "", "", {}, + 4294967295UL, true, false, true, "", false, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(2, results["hits"].size()); + ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); + + ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetByNestedIntField) { + nlohmann::json schema = R"({ + "name": "coll1", + "enable_nested_fields": true, + "fields": [ + {"name": "details", "type": "object", "optional": false }, + {"name": "company.num_employees", "type": "int32", "optional": false, "facet": true }, + {"name": "companyRank", "type": "int32", "optional": false, "facet": true } + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll1 = op.get(); + + auto doc1 = R"({ + "details": {"count": 1000}, + "company": {"num_employees": 2000}, + "companyRank": 100 + })"_json; + + auto doc2 = R"({ + "details": {"count": 2000}, + "company": {"num_employees": 2000}, + "companyRank": 101 + })"_json; + + ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok()); + ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok()); + + std::vector sort_fields = { sort_by("details.count", "ASC") }; + + auto results = coll1->search("*", {}, "", {"company.num_employees"}, sort_fields, {0}, 10, 1, + token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 1UL, "", "", {}, 3UL, + "", "", {}, 4294967295UL, true, false, true, "", false, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(2, results["found"].get()); + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ("company.num_employees", results["facet_counts"][0]["field_name"]); + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(2, results["facet_counts"][0]["counts"][0]["count"].get()); + ASSERT_EQ("2000", results["facet_counts"][0]["counts"][0]["value"].get()); + + // Nested wildcard faceting + std::vector wildcard_facets; + coll1->parse_facet("company.*", wildcard_facets); + + ASSERT_EQ(1, wildcard_facets.size()); + ASSERT_EQ("company.num_employees", wildcard_facets[0].field_name); + + wildcard_facets.clear(); + coll1->parse_facet("company*", wildcard_facets); + + ASSERT_EQ(2, wildcard_facets.size()); + ASSERT_EQ("company.num_employees", wildcard_facets[0].field_name); + ASSERT_EQ("companyRank", wildcard_facets[1].field_name); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetParseTest){ + std::vector fields = { + field("score", field_types::INT32, true), + field("grade", field_types::INT32, true), + field("rank", field_types::INT32, true), + field("range", field_types::INT32, true), + field("scale", field_types::INT32, false), + }; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + std::vector range_facet_fields { + "score(fail:[0, 40], pass:[40, 100])", + "grade(A:[80, 100], B:[60, 80], C:[40, 60])" + }; + std::vector range_facets; + for(const std::string & facet_field: range_facet_fields) { + coll1->parse_facet(facet_field, range_facets); + } + ASSERT_EQ(2, range_facets.size()); + + ASSERT_STREQ("score", range_facets[0].field_name.c_str()); + ASSERT_TRUE(range_facets[0].is_range_query); + ASSERT_GT(range_facets[0].facet_range_map.size(), 0); + + ASSERT_STREQ("grade", range_facets[1].field_name.c_str()); + ASSERT_TRUE(range_facets[1].is_range_query); + ASSERT_GT(range_facets[1].facet_range_map.size(), 0); + + std::vector normal_facet_fields { + "score", + "grade" + }; + std::vector normal_facets; + for(const std::string & facet_field: normal_facet_fields) { + coll1->parse_facet(facet_field, normal_facets); + } + ASSERT_EQ(2, normal_facets.size()); + + ASSERT_STREQ("score", normal_facets[0].field_name.c_str()); + ASSERT_STREQ("grade", normal_facets[1].field_name.c_str()); + + std::vector wildcard_facet_fields { + "ran*", + "sc*", + }; + std::vector wildcard_facets; + for(const std::string & facet_field: wildcard_facet_fields) { + coll1->parse_facet(facet_field, wildcard_facets); + } + + ASSERT_EQ(3, wildcard_facets.size()); + + std::set expected{"range", "rank", "score"}; + for (size_t i = 0; i < wildcard_facets.size(); i++) { + ASSERT_TRUE(expected.count(wildcard_facets[i].field_name) == 1); + } + + wildcard_facets.clear(); + coll1->parse_facet("*", wildcard_facets); + + // Last field is not a facet. + ASSERT_EQ(fields.size() - 1, wildcard_facets.size()); + + expected.clear(); + for (size_t i = 0; i < fields.size() - 1; i++) { + expected.insert(fields[i].name); + } + + for (size_t i = 0; i < wildcard_facets.size(); i++) { + ASSERT_TRUE(expected.count(wildcard_facets[i].field_name) == 1); + } + + std::vector mixed_facet_fields { + "score", + "grade(A:[80, 100], B:[60, 80], C:[40, 60])", + "ra*", + }; + + std::vector mixed_facets; + for(const std::string & facet_field: mixed_facet_fields) { + coll1->parse_facet(facet_field, mixed_facets); + } + ASSERT_EQ(4, mixed_facets.size()); + + std::vector mixed_facets_ptr; + for(auto& f: mixed_facets) { + mixed_facets_ptr.push_back(&f); + } + + std::sort(mixed_facets_ptr.begin(), mixed_facets_ptr.end(), [](const facet* f1, const facet* f2) { + return f1->field_name < f2->field_name; + }); + + ASSERT_EQ("score", mixed_facets_ptr[3]->field_name); + + ASSERT_EQ("grade", mixed_facets_ptr[0]->field_name); + ASSERT_TRUE(mixed_facets_ptr[0]->is_range_query); + ASSERT_GT(mixed_facets_ptr[0]->facet_range_map.size(), 0); + + ASSERT_EQ("rank", mixed_facets_ptr[2]->field_name); + ASSERT_EQ("range", mixed_facets_ptr[1]->field_name); +} + +TEST_F(CollectionOptimizedFacetingTest, RangeFacetTest) { + std::vector fields = {field("place", field_types::STRING, false), + field("state", field_types::STRING, false), + field("visitors", field_types::INT32, true), + field("trackingFrom", field_types::INT32, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", {}, {} + ).get(); + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["place"] = "Mysore Palace"; + doc1["state"] = "Karnataka"; + doc1["visitors"] = 235486; + doc1["trackingFrom"] = 1900; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["place"] = "Hampi"; + doc2["state"] = "Karnataka"; + doc2["visitors"] = 187654; + doc2["trackingFrom"] = 1900; + + nlohmann::json doc3; + doc3["id"] = "2"; + doc3["place"] = "Mahabalipuram"; + doc3["state"] = "TamilNadu"; + doc3["visitors"] = 174684; + doc3["trackingFrom"] = 1900; + + nlohmann::json doc4; + doc4["id"] = "3"; + doc4["place"] = "Meenakshi Amman Temple"; + doc4["state"] = "TamilNadu"; + doc4["visitors"] = 246676; + doc4["trackingFrom"] = 2000; + + nlohmann::json doc5; + doc5["id"] = "4"; + doc5["place"] = "Staue of Unity"; + doc5["state"] = "Gujarat"; + doc5["visitors"] = 345878; + doc5["trackingFrom"] = 2000; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + ASSERT_TRUE(coll1->add(doc3.dump()).ok()); + ASSERT_TRUE(coll1->add(doc4.dump()).ok()); + ASSERT_TRUE(coll1->add(doc5.dump()).ok()); + + auto results = coll1->search("Karnataka", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(2, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("Busy", results["facet_counts"][0]["counts"][0]["value"].get()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_EQ("VeryBusy", results["facet_counts"][0]["counts"][1]["value"].get()); + + auto results2 = coll1->search("Gujarat", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(1, results2["facet_counts"][0]["counts"].size()); + ASSERT_EQ(1, results2["facet_counts"][0]["counts"][0]["count"].get()); + ASSERT_STREQ("VeryBusy", results2["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_TRUE(results2["facet_counts"][0]["counts"][1]["value"] == nullptr); + + // ensure that unknown facet field are handled + + auto results3 = coll1->search("Gujarat", {"state"}, + "", {"visitorsz(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", true, 6000000UL, 4UL, + 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, "", true, 0UL, + max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(results3.ok()); + ASSERT_EQ("Could not find a facet field named `visitorsz` in the schema.", results3.error()); + + auto results4 = coll1->search("*", {"state"}, + "", {"trackingFrom(Old:[0, 1910], New:[1910, 2100])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true).get(); + + ASSERT_EQ(2, results4["facet_counts"][0]["counts"].size()); + ASSERT_EQ(3, results4["facet_counts"][0]["counts"][0]["count"].get()); + ASSERT_EQ("Old", results4["facet_counts"][0]["counts"][0]["value"].get()); + + ASSERT_EQ(2, results4["facet_counts"][0]["counts"][1]["count"].get()); + ASSERT_EQ("New", results4["facet_counts"][0]["counts"][1]["value"].get()); + + // ensure that only integer fields are allowed + auto rop = coll1->search("Karnataka", {"state"}, + "", {"state(Busy:[0, 200000], VeryBusy:[200000, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(rop.ok()); + ASSERT_EQ("Range facet is restricted to only int32 and int64 fields.", rop.error()); + + // ensure that bad facet range values are handled + rop = coll1->search("Karnataka", {"state"}, + "", {"visitors(Busy:[alpha, 200000], VeryBusy:[200000, beta])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_FALSE(rop.ok()); + ASSERT_EQ("Facet range value is not valid.", rop.error()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, RangeFacetContinuity) { + std::vector fields = {field("place", field_types::STRING, false), + field("state", field_types::STRING, false), + field("visitors", field_types::INT32, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", {}, {} + ).get(); + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["place"] = "Mysore Palace"; + doc1["state"] = "Karnataka"; + doc1["visitors"] = 235486; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["place"] = "Hampi"; + doc2["state"] = "Karnataka"; + doc2["visitors"] = 187654; + + nlohmann::json doc3; + doc3["id"] = "2"; + doc3["place"] = "Mahabalipuram"; + doc3["state"] = "TamilNadu"; + doc3["visitors"] = 174684; + + nlohmann::json doc4; + doc4["id"] = "3"; + doc4["place"] = "Meenakshi Amman Temple"; + doc4["state"] = "TamilNadu"; + doc4["visitors"] = 246676; + + nlohmann::json doc5; + doc5["id"] = "4"; + doc5["place"] = "Staue of Unity"; + doc5["state"] = "Gujarat"; + doc5["visitors"] = 345878; + + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + ASSERT_TRUE(coll1->add(doc3.dump()).ok()); + ASSERT_TRUE(coll1->add(doc4.dump()).ok()); + ASSERT_TRUE(coll1->add(doc5.dump()).ok()); + + auto results = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:[200001, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Ranges in range facet syntax should be continous.", results.error().c_str()); + + auto results2 = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:[199999, 500000])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Ranges in range facet syntax should be continous.", results2.error().c_str()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, RangeFacetTypo) { + std::vector fields = {field("place", field_types::STRING, false), + field("state", field_types::STRING, false), + field("visitors", field_types::INT32, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", {}, {} + ).get(); + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["place"] = "Mysore Palace"; + doc1["state"] = "Karnataka"; + doc1["visitors"] = 235486; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["place"] = "Hampi"; + doc2["state"] = "Karnataka"; + doc2["visitors"] = 187654; + + nlohmann::json doc3; + doc3["id"] = "2"; + doc3["place"] = "Mahabalipuram"; + doc3["state"] = "TamilNadu"; + doc3["visitors"] = 174684; + + nlohmann::json doc4; + doc4["id"] = "3"; + doc4["place"] = "Meenakshi Amman Temple"; + doc4["state"] = "TamilNadu"; + doc4["visitors"] = 246676; + + nlohmann::json doc5; + doc5["id"] = "4"; + doc5["place"] = "Staue of Unity"; + doc5["state"] = "Gujarat"; + doc5["visitors"] = 345878; + + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + ASSERT_TRUE(coll1->add(doc3.dump()).ok()); + ASSERT_TRUE(coll1->add(doc4.dump()).ok()); + ASSERT_TRUE(coll1->add(doc5.dump()).ok()); + + auto results = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000)"}, //missing ']' at end + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Error splitting the facet range values.", results.error().c_str()); + + auto results2 = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000], VeryBusy:200000, 500000])"}, //missing '[' in second range + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Error splitting the facet range values.", results2.error().c_str()); + + auto results3 = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000] VeryBusy:[200000, 500000])"}, //missing ',' between ranges + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Error splitting the facet range values.", results3.error().c_str()); + + auto results4 = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0 200000], VeryBusy:[200000, 500000])"}, //missing ',' between first ranges values + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Facet range value is not valid.", results4.error().c_str()); + + auto results5 = coll1->search("TamilNadu", {"state"}, + "", {"visitors(Busy:[0, 200000 VeryBusy:200000, 500000])"}, //missing '],' and '[' + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000000UL, + 4UL, 7UL, fallback, 4UL, {off}, 32767UL, 32767UL, 2UL, 2UL, false, + "", true, 0UL, max_score, 100UL, 0UL, 4294967295UL, true); + + ASSERT_STREQ("Facet range value is not valid.", results5.error().c_str()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionOptimizedFacetingTest, SampleFacetCounts) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "color", "type": "string", "facet": true} + ] + })"_json; + + Collection* coll1 = collectionManager.create_collection(schema).get(); + + std::mt19937 gen(137723); // use constant seed to make sure that counts don't jump around + std::uniform_int_distribution<> distr(1, 100); // 1 to 100 inclusive + + size_t count_blue = 0, count_red = 0; + + for(size_t i = 0; i < 1000; i++) { + nlohmann::json doc; + if(distr(gen) % 2 == 0) { + doc["color"] = "blue"; + count_blue++; + } else { + doc["color"] = "red"; + count_red++; + } + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + } + + auto res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 10, 0, 4294967295UL, true).get(); + + ASSERT_EQ(1000, res["found"].get()); + ASSERT_EQ(1, res["facet_counts"].size()); + ASSERT_EQ(2, res["facet_counts"][0]["counts"].size()); + + // verify approximate counts + ASSERT_GE(res["facet_counts"][0]["counts"][0]["count"].get(), 250); + ASSERT_GE(res["facet_counts"][0]["counts"][1]["count"].get(), 250); + ASSERT_TRUE(res["facet_counts"][0]["sampled"].get()); + + // when sample threshold is high, don't estimate + res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 10, 10000, 4294967295UL, true).get(); + + ASSERT_EQ(1000, res["found"].get()); + ASSERT_EQ(1, res["facet_counts"].size()); + ASSERT_EQ(2, res["facet_counts"][0]["counts"].size()); + + for(size_t i = 0; i < res["facet_counts"][0]["counts"].size(); i++) { + if(res["facet_counts"][0]["counts"][i]["value"].get() == "red") { + ASSERT_EQ(count_red, res["facet_counts"][0]["counts"][i]["count"].get()); + } else { + ASSERT_EQ(count_blue, res["facet_counts"][0]["counts"][i]["count"].get()); + } + } + + ASSERT_FALSE(res["facet_counts"][0]["sampled"].get()); + + // test for sample percent > 100 + + auto res_op = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 200, 0, 4294967295UL, true); + + ASSERT_FALSE(res_op.ok()); + ASSERT_EQ("Value of `facet_sample_percent` must be less than 100.", res_op.error()); +} + +TEST_F(CollectionOptimizedFacetingTest, FacetOnArrayFieldWithSpecialChars) { + std::vector fields = { + field("tags", field_types::STRING_ARRAY, true), + field("points", field_types::INT32, true), + }; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + nlohmann::json doc; + doc["tags"] = {"gamma"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["tags"] = {"alpha", "| . |", "beta", "gamma"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"tags"}, {}, {2}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 100, 0, 4294967295UL, true).get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(4, results["facet_counts"][0]["counts"].size()); + + for(size_t i = 0; i < results["facet_counts"][0]["counts"].size(); i++) { + auto fvalue = results["facet_counts"][0]["counts"][i]["value"].get(); + if(fvalue == "gamma") { + ASSERT_EQ(2, results["facet_counts"][0]["counts"][i]["count"].get()); + } else { + ASSERT_EQ(1, results["facet_counts"][0]["counts"][i]["count"].get()); + } + } +} + +TEST_F(CollectionOptimizedFacetingTest, StringLengthTest) { + std::vector fields = { + field("tags", field_types::STRING_ARRAY, true), + field("points", field_types::INT32, true), + }; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + nlohmann::json doc; + doc["tags"] = {"gamma"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["tags"] = {"beta"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["tags"] = {"alpha"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + std::string longStr = ""; + for(auto i = 0; i < 8; ++i) { + longStr+="alphabetagamma"; + } + + ASSERT_TRUE(112 == longStr.size()); + + std::vector vec; + vec.emplace_back(longStr); + doc["tags"] = vec; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"tags"}, {}, {2}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 100, 0, 4294967295UL, true).get(); + + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(4, results["facet_counts"][0]["counts"].size()); + + longStr = results["facet_counts"][0]["counts"][3]["value"]; + + //string facet length is restricted to 100 + ASSERT_TRUE(100 == longStr.size()); +} \ No newline at end of file