Facet estimation for large result sets.

Kishore Nallan 2022-12-15 14:50:09 +05:30
parent e836af159f
commit df8f6849fe
9 changed files with 385 additions and 274 deletions
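The change works as follows: when `facet_sample_percent` is below 100 and the number of hits exceeds `facet_sample_threshold`, facet counting visits only a random subset of the result ids (using a fixed-seed RNG so that counts stay stable across identical searches) and then scales the observed counts back up by `100 / facet_sample_percent`. Each facet in the response carries a `sampled` flag. Below is a minimal standalone sketch of this idea (hypothetical demo code, not part of the commit; `estimate_facet_counts` and its inputs are invented for illustration, though the seed, the 1–100 distribution, and the scaling mirror the diff):

#include <cstdint>
#include <cstdio>
#include <random>
#include <unordered_map>
#include <vector>

// Hypothetical sketch of the sampling approach in this commit: when the
// result set is large, count facet values for only ~sample_percent percent
// of the hits, then scale the observed counts back up.
std::unordered_map<uint32_t, size_t> estimate_facet_counts(
        const std::vector<uint32_t>& result_ids,
        const std::vector<uint32_t>& facet_value_of_doc, // doc id -> facet value
        size_t sample_percent, size_t sample_threshold) {
    const bool estimate = (sample_percent < 100 &&
                           result_ids.size() > sample_threshold);
    // constant seed so that repeated searches return stable counts
    std::mt19937 gen(137723);
    std::uniform_int_distribution<> distr(1, 100); // 1 to 100 inclusive

    std::unordered_map<uint32_t, size_t> counts;
    for(uint32_t doc_id : result_ids) {
        // if sampling is enabled, skip a portion of the hits
        if(estimate && static_cast<size_t>(distr(gen)) > sample_percent) {
            continue;
        }
        counts[facet_value_of_doc[doc_id]]++;
    }
    if(estimate) {
        // extrapolate observed counts to the full result set
        for(auto& kv : counts) {
            kv.second = size_t(double(kv.second) * (100.0 / sample_percent));
        }
    }
    return counts;
}

int main() {
    // 1000 docs, alternating between two facet values (0 and 1)
    std::vector<uint32_t> ids(1000), values(1000);
    for(uint32_t i = 0; i < 1000; i++) { ids[i] = i; values[i] = i % 2; }
    auto counts = estimate_facet_counts(ids, values, /*percent=*/10,
                                        /*threshold=*/0);
    for(auto& kv : counts) {
        printf("facet %u ~= %zu\n", kv.first, kv.second); // roughly 500 each
    }
}

Using a constant seed trades statistical independence for repeatability: the same query over the same collection always samples the same documents, so facet counts don't jump around between requests.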

View File

@@ -408,7 +408,9 @@ public:
const size_t facet_query_num_typos = 2,
const size_t filter_curated_hits_option = 2,
const bool prioritize_token_position = false,
const std::string& vector_query_str = "") const;
const std::string& vector_query_str = "",
const size_t facet_sample_percent = 100,
const size_t facet_sample_threshold = 0) const;
Option<bool> get_filter_ids(const std::string & simple_filter_query,
std::vector<std::pair<size_t, uint32_t*>>& index_ids);

View File

@@ -682,6 +682,8 @@ struct facet {
bool is_range_query;
bool sampled = false;
bool get_range(int64_t key, std::pair<int64_t, std::string>& range_pair)
{
if(facet_range_map.empty())

View File

@@ -132,6 +132,8 @@ struct search_args {
std::vector<std::vector<KV*>> override_result_kvs;
vector_query_t& vector_query;
size_t facet_sample_percent;
size_t facet_sample_threshold;
search_args(std::vector<query_tokens_t> field_query_tokens, std::vector<search_field_t> search_fields,
filter_node_t* filter_tree_root, std::vector<facet>& facets,
@@ -145,7 +147,8 @@ struct search_args {
size_t concurrency, size_t search_cutoff_ms,
size_t min_len_1typo, size_t min_len_2typo, size_t max_candidates, const std::vector<enable_t>& infixes,
const size_t max_extra_prefix, const size_t max_extra_suffix, const size_t facet_query_num_typos,
const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query) :
const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query,
size_t facet_sample_percent, size_t facet_sample_threshold) :
field_query_tokens(field_query_tokens),
search_fields(search_fields), filter_tree_root(filter_tree_root), facets(facets),
included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std),
@@ -159,7 +162,8 @@
min_len_1typo(min_len_1typo), min_len_2typo(min_len_2typo), max_candidates(max_candidates),
infixes(infixes), max_extra_prefix(max_extra_prefix), max_extra_suffix(max_extra_suffix),
facet_query_num_typos(facet_query_num_typos), filter_curated_hits(filter_curated_hits),
split_join_tokens(split_join_tokens), vector_query(vector_query) {
split_join_tokens(split_join_tokens), vector_query(vector_query),
facet_sample_percent(facet_sample_percent), facet_sample_threshold(facet_sample_threshold) {
const size_t topster_size = std::max((size_t)1, max_hits); // needs to be at least 1 since scoring is mandatory
topster = new Topster(topster_size, group_limit);
@@ -357,6 +361,7 @@ private:
void log_leaves(int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const;
void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
bool estimate_facets, size_t facet_sample_percent,
const std::vector<facet_info_t>& facet_infos,
size_t group_limit, const std::vector<std::string>& group_by_fields,
const uint32_t* result_ids, size_t results_size) const;
@@ -645,7 +650,7 @@ public:
size_t max_candidates, const std::vector<enable_t>& infixes, const size_t max_extra_prefix,
const size_t max_extra_suffix, const size_t facet_query_num_typos,
const bool filter_curated_hits, enable_t split_join_tokens,
const vector_query_t& vector_query) const;
const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold) const;
void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name);

View File

@@ -866,7 +866,9 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
const size_t facet_query_num_typos,
const size_t filter_curated_hits_option,
const bool prioritize_token_position,
const std::string& vector_query_str) const {
const std::string& vector_query_str,
const size_t facet_sample_percent,
const size_t facet_sample_threshold) const {
std::shared_lock lock(mutex);
@@ -911,6 +913,10 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
}
}
if(facet_sample_percent > 100) {
return Option<nlohmann::json>(400, "Value of `facet_sample_percent` must be less than or equal to 100.");
}
if(raw_group_by_fields.empty()) {
group_limit = 0;
}
@@ -1302,7 +1308,8 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
search_stop_millis,
min_len_1typo, min_len_2typo, max_candidates, infixes,
max_extra_prefix, max_extra_suffix, facet_query_num_typos,
filter_curated_hits, split_join_tokens, vector_query);
filter_curated_hits, split_join_tokens, vector_query,
facet_sample_percent, facet_sample_threshold);
index->run_search(search_params);
@@ -1319,12 +1326,6 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
// for grouping we have to aggregate group set sizes to a count value
if(group_limit) {
for(auto& acc_facet: facets) {
for(auto& facet_kv: acc_facet.result_map) {
facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
}
}
total_found = search_params->groups_processed.size() + override_result_kvs.size();
} else {
total_found = search_params->all_result_ids_len;
@@ -1430,8 +1431,6 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
// handle which fields have to be highlighted
std::vector<highlight_field_t> highlight_items;
bool has_atleast_one_fully_highlighted_field = false;
std::vector<std::string> highlight_field_names;
StringUtils::split(highlight_fields, highlight_field_names, ",");
@@ -1442,12 +1441,6 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
process_highlight_fields(weighted_search_fields, raw_search_fields, include_fields_full, exclude_fields_full,
highlight_field_names, highlight_full_field_names, infixes, q_tokens,
search_params->qtoken_set, highlight_items);
for(auto& highlight_item: highlight_items) {
if(highlight_item.fully_highlighted) {
has_atleast_one_fully_highlighted_field = true;
}
}
}
nlohmann::json result = nlohmann::json::object();
@@ -1657,6 +1650,7 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
for(facet & a_facet: facets) {
nlohmann::json facet_result = nlohmann::json::object();
facet_result["field_name"] = a_facet.field_name;
facet_result["sampled"] = a_facet.sampled;
facet_result["counts"] = nlohmann::json::array();
std::vector<facet_value_t> facet_values;

View File

@@ -695,6 +695,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
const char *EXHAUSTIVE_SEARCH = "exhaustive_search";
const char *SPLIT_JOIN_TOKENS = "split_join_tokens";
const char *FACET_SAMPLE_PERCENT = "facet_sample_percent";
const char *FACET_SAMPLE_THRESHOLD = "facet_sample_threshold";
// enrich params with values from embedded params
for(auto& item: embedded_params.items()) {
if(item.key() == "expires_at") {
@@ -720,7 +723,6 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
// end check for mandatory params
const std::string& raw_query = req_params[QUERY];
std::vector<uint32_t> num_typos = {2};
size_t min_len_1typo = 4;
@@ -772,6 +774,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
size_t max_extra_prefix = INT16_MAX;
size_t max_extra_suffix = INT16_MAX;
size_t facet_sample_percent = 100;
size_t facet_sample_threshold = 0;
std::unordered_map<std::string, size_t*> unsigned_int_values = {
{MIN_LEN_1TYPO, &min_len_1typo},
{MIN_LEN_2TYPO, &min_len_2typo},
@@ -790,6 +795,8 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
{MAX_CANDIDATES, &max_candidates},
{FACET_QUERY_NUM_TYPOS, &facet_query_num_typos},
{FILTER_CURATED_HITS, &filter_curated_hits_option},
{FACET_SAMPLE_PERCENT, &facet_sample_percent},
{FACET_SAMPLE_THRESHOLD, &facet_sample_threshold},
};
std::unordered_map<std::string, std::string*> str_values = {
@@ -982,7 +989,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
facet_query_num_typos,
filter_curated_hits_option,
prioritize_token_position,
vector_query
vector_query,
facet_sample_percent,
facet_sample_threshold
);
uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(

View File

@@ -4,6 +4,7 @@
#include <chrono>
#include <set>
#include <unordered_map>
#include <random>
#include <array_utils.h>
#include <match_score.h>
#include <string_utils.h>
@@ -1228,6 +1229,7 @@ void Index::compute_facet_stats(facet &a_facet, uint64_t raw_value, const std::s
}
void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
bool estimate_facets, size_t facet_sample_percent,
const std::vector<facet_info_t>& facet_infos,
const size_t group_limit, const std::vector<std::string>& group_by_fields,
const uint32_t* result_ids, size_t results_size) const {
@@ -1247,8 +1249,21 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const auto& field_facet_mapping = field_facet_mapping_it->second;
// used for sampling facets (if enabled)
std::mt19937 gen(137723); // use constant seed to make sure that counts don't jump around
std::uniform_int_distribution<> distr(1, 100); // 1 to 100 inclusive
for(size_t i = 0; i < results_size; i++) {
uint32_t doc_seq_id = result_ids[i];
// if sampling is enabled, we will skip a portion of the results to speed things up
if(estimate_facets) {
size_t num = distr(gen);
if(num > facet_sample_percent) {
continue;
}
}
const auto& facet_hashes_it = field_facet_mapping[doc_seq_id % ARRAY_FACET_DIM]->find(doc_seq_id);
if(facet_hashes_it == field_facet_mapping[doc_seq_id % ARRAY_FACET_DIM]->end()) {
@@ -1265,7 +1280,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
compute_facet_stats(a_facet, fhash, facet_field.type);
}
if(a_facet.is_range_query){
if(a_facet.is_range_query) {
auto sort_index_it = sort_index.find(a_facet.field_name);
if(sort_index_it != sort_index.end()){
@@ -1285,8 +1300,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
}
}
}
}
else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
} else if(!use_facet_query || fquery_hashes.find(fhash) != fquery_hashes.end()) {
facet_count_t& facet_count = a_facet.result_map[fhash];
//LOG(INFO) << "field: " << a_facet.field_name << ", doc id: " << doc_seq_id << ", hash: " << fhash;
@@ -1980,7 +1994,9 @@ void Index::run_search(search_args* search_params) {
search_params->facet_query_num_typos,
search_params->filter_curated_hits,
search_params->split_join_tokens,
search_params->vector_query);
search_params->vector_query,
search_params->facet_sample_percent,
search_params->facet_sample_threshold);
}
void Index::collate_included_ids(const std::vector<token_t>& q_included_tokens,
@@ -2430,7 +2446,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
size_t max_candidates, const std::vector<enable_t>& infixes, const size_t max_extra_prefix,
const size_t max_extra_suffix, const size_t facet_query_num_typos,
const bool filter_curated_hits, const enable_t split_join_tokens,
const vector_query_t& vector_query) const {
const vector_query_t& vector_query,
size_t facet_sample_percent, size_t facet_sample_threshold) const {
// process the filters
@@ -2784,6 +2801,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
delete [] exclude_token_ids;
delete [] excluded_result_ids;
bool estimate_facets = (facet_sample_percent < 100 && all_result_ids_len > facet_sample_threshold);
if(!facets.empty()) {
const size_t num_threads = std::min(concurrency, all_result_ids_len);
const size_t window_size = (num_threads == 0) ? 0 :
@@ -2820,9 +2839,11 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
thread_pool->enqueue([this, thread_id, &facet_batches, &facet_query, group_limit, group_by_fields,
batch_result_ids, batch_res_len, &facet_infos,
estimate_facets, facet_sample_percent,
&num_processed, &m_process, &cv_process]() {
auto fq = facet_query;
do_facets(facet_batches[thread_id], fq, facet_infos, group_limit, group_by_fields,
do_facets(facet_batches[thread_id], fq, estimate_facets, facet_sample_percent,
facet_infos, group_limit, group_by_fields,
batch_result_ids, batch_res_len);
std::unique_lock<std::mutex> lock(m_process);
num_processed++;
@@ -2844,8 +2865,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
if(group_limit) {
// we have to add all group sets
acc_facet.hash_groups[facet_kv.first].insert(
this_facet.hash_groups[facet_kv.first].begin(),
this_facet.hash_groups[facet_kv.first].end()
this_facet.hash_groups[facet_kv.first].begin(),
this_facet.hash_groups[facet_kv.first].end()
);
} else {
size_t count = 0;
@@ -2872,6 +2893,22 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
}
}
for(auto & acc_facet: facets) {
for(auto& facet_kv: acc_facet.result_map) {
if(group_limit) {
facet_kv.second.count = acc_facet.hash_groups[facet_kv.first].size();
}
if(estimate_facets) {
facet_kv.second.count = size_t(double(facet_kv.second.count) * (100.0f / facet_sample_percent));
}
}
if(estimate_facets) {
acc_facet.sampled = true;
}
}
/*long long int timeMillisF = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - beginF).count();
LOG(INFO) << "Time for faceting: " << timeMillisF;*/
@@ -2880,7 +2917,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
std::vector<facet_info_t> facet_infos(facets.size());
compute_facet_infos(facets, facet_query, facet_query_num_typos,
&included_ids_vec[0], included_ids_vec.size(), group_by_fields, max_candidates, facet_infos);
do_facets(facets, facet_query, facet_infos, group_limit, group_by_fields, &included_ids_vec[0], included_ids_vec.size());
do_facets(facets, facet_query, estimate_facets, facet_sample_percent,
facet_infos, group_limit, group_by_fields, &included_ids_vec[0], included_ids_vec.size());
all_result_ids_len += curated_topster->size;
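In the extrapolation loop above, each sampled count is multiplied by `100 / facet_sample_percent` before being returned; because the product is truncated back to `size_t`, estimates can land slightly below the true scaled value. A tiny illustrative check of the arithmetic (assumed numbers, not from the commit):

#include <cstddef>
#include <cstdio>

int main() {
    // Assumed example: a facet value observed 47 times in a 10% sample
    // extrapolates to an estimated count of 470 over the full result set.
    size_t observed = 47, percent = 10;
    size_t estimated = size_t(double(observed) * (100.0 / percent));
    printf("estimated count: %zu\n", estimated); // prints 470
}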

View File

@@ -73,8 +73,9 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(3, results["facet_counts"][0].size());
ASSERT_EQ(4, results["facet_counts"][0].size());
ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]);
ASSERT_EQ(false, results["facet_counts"][0]["sampled"].get<bool>());
ASSERT_EQ(4, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(1, results["facet_counts"][0]["stats"].size());
ASSERT_EQ(4, results["facet_counts"][0]["stats"]["total_values"].get<size_t>());
@@ -981,7 +982,6 @@ TEST_F(CollectionFacetingTest, FacetByNestedIntField) {
}
TEST_F(CollectionFacetingTest, FacetParseTest){
std::vector<field> fields = {
field("score", field_types::INT32, true),
field("grade", field_types::INT32, true),
@@ -1008,8 +1008,6 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
ASSERT_TRUE(range_facets[1].is_range_query);
ASSERT_GT(range_facets[1].facet_range_map.size(), 0);
std::vector<std::string> normal_facet_fields {
"score",
"grade"
@@ -1022,9 +1020,7 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
ASSERT_STREQ("score", normal_facets[0].field_name.c_str());
ASSERT_STREQ("grade", normal_facets[1].field_name.c_str());
std::vector<std::string> mixed_facet_fields {
"score",
"grade(A:[80, 100], B:[60, 80], C:[40, 60])",
@@ -1044,3 +1040,304 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
ASSERT_STREQ("rank", mixed_facets[2].field_name.c_str());
}
TEST_F(CollectionFacetingTest, RangeFacetTest) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("Karnataka", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Busy", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
auto results2 = coll1->search("Gujarat", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(1, results2["facet_counts"][0]["counts"].size());
ASSERT_EQ(1, results2["facet_counts"][0]["counts"][0]["count"].get<std::size_t>());
ASSERT_STREQ("VeryBusy", results2["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_TRUE(results2["facet_counts"][0]["counts"][1]["value"] == nullptr);
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionFacetingTest, RangeFacetContinuity) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200001, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Ranges in range facet syntax should be continous.", results.error().c_str());
auto results2 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[199999, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Ranges in range facet syntax should be continous.", results2.error().c_str());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionFacetingTest, RangeFacetTypo) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000)"}, //missing ']' at end
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results.error().c_str());
auto results2 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:200000, 500000])"}, //missing '[' in second range
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results2.error().c_str());
auto results3 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000] VeryBusy:[200000, 500000])"}, //missing ',' between ranges
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results3.error().c_str());
auto results4 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0 200000], VeryBusy:[200000, 500000])"}, //missing ',' between first ranges values
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Range String range pattern not matched.", results4.error().c_str());
auto results5 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000 VeryBusy:200000, 500000])"}, //missing '],' and '['
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Range String range pattern not matched.", results5.error().c_str());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionFacetingTest, SampleFacetCounts) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "color", "type": "string", "facet": true}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
for(size_t i = 0; i < 1000; i++) {
nlohmann::json doc;
if(i % 2 == 0) {
doc["color"] = "blue";
} else {
doc["color"] = "red";
}
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
auto res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 3, 3, 2, 2, false, "", 10, 0).get();
ASSERT_EQ(1000, res["found"].get<size_t>());
ASSERT_EQ(1, res["facet_counts"].size());
ASSERT_EQ(2, res["facet_counts"][0]["counts"].size());
// verify approximate counts
ASSERT_GE(res["facet_counts"][0]["counts"][0]["count"].get<size_t>(), 250);
ASSERT_GE(res["facet_counts"][0]["counts"][1]["count"].get<size_t>(), 250);
ASSERT_TRUE(res["facet_counts"][0]["sampled"].get<bool>());
// when sample threshold is high, don't estimate
res = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 3, 3, 2, 2, false, "", 10, 10000).get();
ASSERT_EQ(1000, res["found"].get<size_t>());
ASSERT_EQ(1, res["facet_counts"].size());
ASSERT_EQ(2, res["facet_counts"][0]["counts"].size());
// verify exact counts (no sampling)
ASSERT_EQ(500, res["facet_counts"][0]["counts"][0]["count"].get<size_t>());
ASSERT_EQ(500, res["facet_counts"][0]["counts"][1]["count"].get<size_t>());
ASSERT_FALSE(res["facet_counts"][0]["sampled"].get<bool>());
// test for sample percent > 100
auto res_op = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 3, 3, 2, 2, false, "", 200, 0);
ASSERT_FALSE(res_op.ok());
ASSERT_EQ("Value of `facet_sample_percent` must be less than 100.", res_op.error());
}

View File

@@ -583,7 +583,7 @@ TEST_F(CollectionSchemaChangeTest, AbilityToDropAndReAddIndexAtTheSameTime) {
ASSERT_EQ(1, res["found"].get<size_t>());
ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ(1, res["facet_counts"].size());
ASSERT_EQ(3, res["facet_counts"][0].size());
ASSERT_EQ(4, res["facet_counts"][0].size());
ASSERT_EQ("title", res["facet_counts"][0]["field_name"]);
ASSERT_EQ(1, res["facet_counts"][0]["counts"].size());
ASSERT_EQ("123", res["facet_counts"][0]["counts"][0]["value"].get<std::string>());

View File

@@ -2918,239 +2918,3 @@ TEST_F(CollectionSpecificTest, DontHighlightPunctuation) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, RangeFacetTest) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("Karnataka", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Busy", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
auto results2 = coll1->search("Gujarat", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true).get();
ASSERT_EQ(1, results2["facet_counts"][0]["counts"].size());
ASSERT_EQ(1, results2["facet_counts"][0]["counts"][0]["count"].get<std::size_t>());
ASSERT_STREQ("VeryBusy", results2["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_TRUE(results2["facet_counts"][0]["counts"][1]["value"] == nullptr);
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, RangeFacetContinuity) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200001, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Ranges in range facet syntax should be continous.", results.error().c_str());
auto results2 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[199999, 500000])"},
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Ranges in range facet syntax should be continous.", results2.error().c_str());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, RangeFacetTypo) {
std::vector<field> fields = {field("place", field_types::STRING, false),
field("state", field_types::STRING, false),
field("visitors", field_types::INT32, true),};
Collection* coll1 = collectionManager.create_collection(
"coll1", 1, fields, "", 0, "", {}, {}
).get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["place"] = "Mysore Palace";
doc1["state"] = "Karnataka";
doc1["visitors"] = 235486;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["place"] = "Hampi";
doc2["state"] = "Karnataka";
doc2["visitors"] = 187654;
nlohmann::json doc3;
doc3["id"] = "2";
doc3["place"] = "Mahabalipuram";
doc3["state"] = "TamilNadu";
doc3["visitors"] = 174684;
nlohmann::json doc4;
doc4["id"] = "3";
doc4["place"] = "Meenakshi Amman Temple";
doc4["state"] = "TamilNadu";
doc4["visitors"] = 246676;
nlohmann::json doc5;
doc5["id"] = "4";
doc5["place"] = "Staue of Unity";
doc5["state"] = "Gujarat";
doc5["visitors"] = 345878;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
ASSERT_TRUE(coll1->add(doc4.dump()).ok());
ASSERT_TRUE(coll1->add(doc5.dump()).ok());
auto results = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:[200000, 500000)"}, //missing ']' at end
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results.error().c_str());
auto results2 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000], VeryBusy:200000, 500000])"}, //missing '[' in second range
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results2.error().c_str());
auto results3 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000] VeryBusy:[200000, 500000])"}, //missing ',' between ranges
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Error splitting the range string.", results3.error().c_str());
auto results4 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0 200000], VeryBusy:[200000, 500000])"}, //missing ',' between first ranges values
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Range String range pattern not matched.", results4.error().c_str());
auto results5 = coll1->search("TamilNadu", {"state"},
"", {"visitors(Busy:[0, 200000 VeryBusy:200000, 500000])"}, //missing '],' and '['
{}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000,
true, false, true, "", true);
ASSERT_STREQ("Range String range pattern not matched.", results5.error().c_str());
collectionManager.drop_collection("coll1");
}