Exclude int64 fields from facet stats, use raw integer and float values as facet hashes, and fix tests

This commit is contained in:
krunal1313 2023-05-04 14:19:53 +05:30
parent 81862c29d3
commit 007096baf4
4 changed files with 37 additions and 52 deletions

View File

@ -1958,10 +1958,12 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
auto the_field = search_schema.at(a_facet.field_name);
// keep only top K facets
auto max_facets = std::min(max_facet_values, facet_hash_counts.size());
std::nth_element(facet_hash_counts.begin(), facet_hash_counts.begin() + max_facets,
facet_hash_counts.end(), Collection::facet_count_compare);
auto nthElement = max_facets == facet_hash_counts.size() ? max_facets - 1 : max_facets;
std::nth_element(facet_hash_counts.begin(), facet_hash_counts.begin() + nthElement,
facet_hash_counts.end(), Collection::facet_count_compare);
for(size_t fi = 0; fi < max_facets; fi++) {
// remap facet value hash with actual string
auto & kv = facet_hash_counts[fi];
@ -2672,13 +2674,9 @@ bool Collection::facet_value_to_string(const facet &a_facet, const facet_count_t
} else if(search_schema.at(a_facet.field_name).type == field_types::FLOAT) {
float raw_val = document[a_facet.field_name].get<float>();
value = StringUtils::float_to_str(raw_val);
if(value != "0") {
value.erase ( value.find_last_not_of('0') + 1, std::string::npos ); // remove trailing zeros
}
} else if(search_schema.at(a_facet.field_name).type == field_types::FLOAT_ARRAY) {
float raw_val = document[a_facet.field_name][facet_count.array_pos].get<float>();
value = StringUtils::float_to_str(raw_val);
value.erase ( value.find_last_not_of('0') + 1, std::string::npos ); // remove trailing zeros
} else if(search_schema.at(a_facet.field_name).type == field_types::BOOL) {
value = std::to_string(document[a_facet.field_name].get<bool>());
value = (value == "1") ? "true" : "false";

View File

@ -660,9 +660,8 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
for(size_t i = 0; i < document[afield.name].size(); ++i) {
if(afield.type == field_types::INT32_ARRAY) {
int32_t raw_val = document[afield.name][i].get<int32_t>();
value = std::to_string(raw_val);
auto index = facet_index_v4->insert(afield.name, value, seq_id);
fhashvalues.hashes.emplace_back(index);
uint32_t hash = reinterpret_cast<uint32_t&>(raw_val);
fhashvalues.hashes.emplace_back(hash);
} else if(afield.type == field_types::INT64_ARRAY) {
int64_t raw_val = document[afield.name][i].get<int64_t>();
value = std::to_string(raw_val);
@ -674,13 +673,12 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
fhashvalues.hashes.emplace_back(index);
} else if(afield.type == field_types::FLOAT_ARRAY) {
float raw_val = document[afield.name][i].get<float>();
value = StringUtils::float_to_str(raw_val);
auto index = facet_index_v4->insert(afield.name, value, seq_id);
fhashvalues.hashes.emplace_back(index);
uint32_t hash = reinterpret_cast<uint32_t&>(raw_val);
fhashvalues.hashes.emplace_back(hash);
} else if(afield.type == field_types::BOOL_ARRAY) {
value = std::to_string(document[afield.name][i].get<bool>());
auto index = facet_index_v4->insert(afield.name, value, seq_id);
fhashvalues.hashes.emplace_back(index);
bool raw_val = document[afield.name][i].get<bool>();
uint32_t hash = reinterpret_cast<uint32_t&>(raw_val);
fhashvalues.hashes.emplace_back(hash);
}
}
fhashvalues.length = fhashvalues.hashes.size();
@ -696,8 +694,7 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
if(afield.type == field_types::INT32) {
int32_t raw_val = document[afield.name].get<int32_t>();
value = std::to_string(raw_val);
fhash = facet_index_v4->insert(afield.name, value, seq_id);
fhash = reinterpret_cast<uint32_t&>(raw_val);
}
else if(afield.type == field_types::INT64) {
int64_t raw_val = document[afield.name].get<int64_t>();
@ -710,12 +707,11 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
}
else if(afield.type == field_types::FLOAT) {
float raw_val = document[afield.name].get<float>();
value = StringUtils::float_to_str(raw_val);
fhash = facet_index_v4->insert(afield.name, value, seq_id);
fhash = reinterpret_cast<uint32_t&>(raw_val);
}
else if(afield.type == field_types::BOOL) {
value = std::to_string(document[afield.name].get<bool>());
fhash = facet_index_v4->insert(afield.name, value, seq_id);
bool raw_val = document[afield.name].get<bool>();
fhash = reinterpret_cast<uint32_t&>(raw_val);
}
auto& facet_dim_index = single_val_facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM];
@ -1139,7 +1135,7 @@ void Index::compute_facet_stats(facet &a_facet, const std::string& raw_value, co
}
}
void Index::compute_facet_stats(facet &a_facet, const int64_t raw_value, const std::string & field_type) {
void Index::compute_facet_stats(facet &a_facet, int64_t raw_value, const std::string & field_type) {
if(field_type == field_types::INT32 || field_type == field_types::INT32_ARRAY) {
int32_t val = raw_value;
if (val < a_facet.stats.fvmin) {
@ -1161,7 +1157,7 @@ void Index::compute_facet_stats(facet &a_facet, const int64_t raw_value, const s
a_facet.stats.fvsum += val;
a_facet.stats.fvcount++;
} else if(field_type == field_types::FLOAT || field_type == field_types::FLOAT_ARRAY) {
float val = int64_t_to_float(raw_value);
float val = reinterpret_cast<float&>(raw_value);
if(val < a_facet.stats.fvmin) {
a_facet.stats.fvmin = val;
}
@ -1298,7 +1294,6 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
continue;
}
int64_t doc_val = INT64_MAX;
auto sort_index_it = sort_index.find(a_facet.field_name);
for(size_t i = 0; i < results_size; i++) {
@ -1340,17 +1335,11 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
fhash = facet_map_it->second.hashes[j];
}
if(should_compute_stats) {
doc_val = get_doc_val_from_sort_index(sort_index_it, doc_seq_id);
if(doc_val != INT64_MAX) {
compute_facet_stats(a_facet, doc_val, facet_field.type);
}
compute_facet_stats(a_facet, fhash, facet_field.type);
}
if(a_facet.is_range_query) {
if(doc_val == INT64_MAX) {
doc_val = get_doc_val_from_sort_index(sort_index_it, doc_seq_id);
}
int64_t doc_val = get_doc_val_from_sort_index(sort_index_it, doc_seq_id);
std::pair<std::string, std::string> range_pair {};
if(a_facet.get_range(std::to_string(doc_val), range_pair)) {
@ -4903,7 +4892,9 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
facet_infos[findex].should_compute_stats = (facet_field.type != field_types::STRING &&
facet_field.type != field_types::BOOL &&
facet_field.type != field_types::STRING_ARRAY &&
facet_field.type != field_types::BOOL_ARRAY);
facet_field.type != field_types::BOOL_ARRAY &&
facet_field.type != field_types::INT64 &&
facet_field.type != field_types::INT64_ARRAY);
if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) {
facet_infos[findex].use_facet_query = true;

View File

@ -231,13 +231,12 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
ASSERT_STREQ("age", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("21", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>2</mark>1", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
ASSERT_STREQ("24", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>2</mark>4", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("24", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>2</mark>4", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
ASSERT_STREQ("21", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>2</mark>1", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
// facet on a float field without query to check on stats
results = coll_array_fields->search("*", query_fields, "", {"rating"}, sort_fields, {0}, 10, 1, FREQUENCY,
@ -289,11 +288,6 @@ TEST_F(CollectionFacetingTest, FacetCounts) {
ASSERT_STREQ("1421890022", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>142189002</mark>2", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
ASSERT_EQ(5, results["facet_counts"][0]["stats"].size());
ASSERT_FLOAT_EQ(348974822.0, results["facet_counts"][0]["stats"]["min"].get<double>());
ASSERT_FLOAT_EQ(1453426022.0, results["facet_counts"][0]["stats"]["max"].get<double>());
ASSERT_FLOAT_EQ(13275854664.0, results["facet_counts"][0]["stats"]["sum"].get<double>());
ASSERT_FLOAT_EQ(1106321222.0, results["facet_counts"][0]["stats"]["avg"].get<double>());
ASSERT_FLOAT_EQ(1, results["facet_counts"][0]["stats"]["total_values"].get<size_t>());
// facet query that does not match any indexed value
@ -958,10 +952,11 @@ TEST_F(CollectionFacetingTest, FacetValuesShouldBeNormalized) {
ASSERT_EQ(3, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(3, results["facet_counts"][0]["counts"].size());
// any document is chosen as representative
ASSERT_EQ("bu-qu", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());
ASSERT_EQ("Buqu", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());
ASSERT_EQ("BUQU", results["facet_counts"][0]["counts"][1]["value"].get<std::string>());
ASSERT_EQ("bu-qu", results["facet_counts"][0]["counts"][2]["value"].get<std::string>());
collectionManager.drop_collection("coll1");
}
@ -992,10 +987,11 @@ TEST_F(CollectionFacetingTest, FacetArrayValuesShouldBeNormalized) {
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
ASSERT_EQ(3, results["facet_counts"][0]["counts"].size());
// any document is chosen as representative
ASSERT_EQ("bu-qu", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());
ASSERT_EQ("Buqu", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());
ASSERT_EQ("BUQU", results["facet_counts"][0]["counts"][1]["value"].get<std::string>());
ASSERT_EQ("bu-qu", results["facet_counts"][0]["counts"][2]["value"].get<std::string>());
collectionManager.drop_collection("coll1");
}

View File

@ -452,10 +452,10 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) {
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());