threshold based migration changes

This commit is contained in:
krunal1313 2023-05-22 14:48:29 +05:30
parent 5804a9c9a2
commit 1d54c07a50
8 changed files with 239 additions and 40 deletions

View File

@ -39,6 +39,7 @@ private:
struct facet_index_counter {
tsl::htrie_map<char, facet_index_struct> facet_index_map;
std::vector<count_list> counter_list;
bool is_migrated = false;
facet_index_counter() {
facet_index_map.clear();
@ -74,4 +75,10 @@ public:
size_t intersect(const std::string& val, const uint32_t* result_ids, int result_id_len,
int max_facet_count, std::map<std::string, uint32_t>& found, bool is_wildcard_no_filter_query);
size_t get_facet_indexes(const std::string& field, std::map<uint32_t, std::vector<uint32_t>>& seqid_index_map);
bool get_migrated (const std::string& field) const;
void set_migrated(const std::string& field, bool val);
};

View File

@ -522,6 +522,7 @@ private:
void initialize_facet_indexes(const field& facet_field);
void migrate_facet_to_new_index(const std::string& field);
static Option<bool> embed_fields(nlohmann::json& document,

View File

@ -42,6 +42,7 @@ private:
};
std::vector<count_list> counter_list;
bool is_migrated = false;
public:
@ -80,4 +81,14 @@ public:
bool is_wildcard_no_filter_query);
size_t counter_list_size() const;
size_t get_facet_indexes(std::map<uint32_t, std::vector<uint32_t>>& facets);
bool get_migrated () const {
return is_migrated;
}
// Sets the is_migrated flag to `val`.
// The flag is stored exactly as requested so callers can also reset it to false.
void set_migrated(bool val) {
    is_migrated = val;
}
};

View File

@ -1937,7 +1937,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
}
}
} else if(a_facet.is_intersected) {
LOG(INFO) << "used intersection";
//LOG(INFO) << "used intersection";
std::vector<std::pair<std::string, uint32_t>> facet_counts;
for (const auto & kv : a_facet.result_map) {
@ -1945,10 +1945,18 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
}
auto max_facets = std::min(max_facet_values, facet_counts.size());
std::sort(facet_counts.begin(), facet_counts.end(),
[&](const auto& p1, const auto& p2) {
return std::tie(p1.second, p1.first) > std::tie(p2.second, p2.first);
});
auto nthElement = max_facets == facet_counts.size() ? max_facets - 1 : max_facets;
std::nth_element(facet_counts.begin(), facet_counts.begin() + nthElement,
facet_counts.end(), [&](const auto& kv1, const auto& kv2) {
size_t a_count = kv1.second;
size_t b_count = kv2.second;
size_t a_value_size = UINT64_MAX - kv1.first.size();
size_t b_value_size = UINT64_MAX - kv2.first.size();
return std::tie(a_count, a_value_size) > std::tie(b_count, b_value_size);
});
for(int i = 0; i < max_facets; ++i) {
const auto& kv = facet_counts[i];
@ -1956,7 +1964,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
facet_values.emplace_back(facet_value);
}
} else {
LOG(INFO) << "used hashes";
//LOG(INFO) << "used hashes";
std::vector<std::pair<uint32_t, facet_count_t>> facet_hash_counts;
for (const auto & kv : a_facet.result_map) {

View File

@ -150,3 +150,51 @@ facet_index_t::~facet_index_t() {
facet_field_map.clear();
}
//used for migrating string and int64 facets
size_t facet_index_t::get_facet_indexes(const std::string& field,
std::map<uint32_t, std::vector<uint32_t>>& seqid_index_map) {
const auto& facet_field_it = facet_field_map.find(field);
if(facet_field_it == facet_field_map.end()) {
return 0;
}
auto& facet_index_map = facet_field_it->second.facet_index_map;
std::vector<uint32_t> id_list;
for(auto facet_index_map_it = facet_index_map.begin();
facet_index_map_it != facet_index_map.end(); ++facet_index_map_it) {
auto ids = facet_index_map_it->id_list_ptr;
ids_t::uncompress(ids, id_list);
//emplacing seq_id=>count_index
for(const auto& id : id_list) {
seqid_index_map[id].emplace_back(facet_index_map_it->index);
}
id_list.clear();
}
return seqid_index_map.size();
}
// Returns the is_migrated flag stored for `field`, or false when the field
// has no entry in facet_field_map.
bool facet_index_t::get_migrated (const std::string& field) const {
    const auto field_it = facet_field_map.find(field);
    return field_it != facet_field_map.end() && field_it->second.is_migrated;
}
// Stores `val` as the is_migrated flag of `field`.
// Does nothing when the field has no entry in facet_field_map.
void facet_index_t::set_migrated(const std::string& field, bool val) {
    const auto field_it = facet_field_map.find(field);
    if(field_it == facet_field_map.end()) {
        return;
    }

    field_it->second.is_migrated = val;
}

View File

@ -37,6 +37,7 @@
search_cutoff = true; \
break;\
}
// Threshold that facet counts are compared against: index_field_in_memory() migrates a
// field and writes per-document facet hashes once its facet count exceeds this value, and
// do_facets() compares facet_records against it when choosing the intersection path.
#define FACET_INDEX_THRESHOLD 10
spp::sparse_hash_map<uint32_t, int64_t> Index::text_match_sentinel_value;
spp::sparse_hash_map<uint32_t, int64_t> Index::seq_id_sentinel_value;
@ -602,6 +603,75 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record>& iter_b
return num_indexed;
}
// Copies the facet data of `field` into the seq_id-keyed facet indexes.
//
// Sources, processed in order:
//   1. facet_index_v4 (string and int64 facets)
//   2. the field's entry in numerical_index (remaining numeric facets)
//
// For each seq_id, a document with more than one facet entry is stored in
// facet_index_v3[field]; a document with exactly one entry is stored in
// single_val_facet_index_v3[field].
//
// NOTE(review): the single/multi decision is based on the number of entries found for a
// document, not on whether the field is an array -- confirm that is what readers expect.
void Index::migrate_facet_to_new_index(const std::string& field) {
    std::map<uint32_t, std::vector<uint32_t>> facets_indexes;

    auto& facet_index = facet_index_v3[field];
    auto& single_facet_index = single_val_facet_index_v3[field];

    // Writes every seq_id => hashes entry into the matching facet index.
    // Takes the map by non-const reference so the hash vectors can be moved out instead of
    // copied; the map is discarded or cleared by the caller right afterwards.
    const auto store_facets = [&](std::map<uint32_t, std::vector<uint32_t>>& seq_id_hashes) {
        for(auto& kv : seq_id_hashes) {
            const auto seq_id = kv.first;
            auto& hashes = kv.second;

            if(hashes.empty()) {
                // nothing to store; also guards the front() access below
                continue;
            }

            if(hashes.size() > 1) {
                auto& facet_dim_index = facet_index[seq_id % ARRAY_FACET_DIM];
                if(facet_dim_index == nullptr) {
                    LOG(ERROR) << "Error, facet index not initialized for field " << field;
                    continue;
                }

                facet_hash_values_t fhashvalues;
                fhashvalues.hashes = std::move(hashes);
                // keep length in sync with hashes, as index_field_in_memory() does
                fhashvalues.length = fhashvalues.hashes.size();
                facet_dim_index->emplace(seq_id, std::move(fhashvalues));
            } else {
                auto& facet_dim_index = single_facet_index[seq_id % ARRAY_FACET_DIM];
                if(facet_dim_index == nullptr) {
                    LOG(ERROR) << "Error, facet index not initialized for field " << field;
                    continue;
                }

                facet_dim_index->emplace(seq_id, hashes.front());
            }
        }
    };

    // migrate string and int64 facets first
    if(facet_index_v4->get_facet_indexes(field, facets_indexes)) {
        store_facets(facets_indexes);
    }

    // now extract remaining facets from numerical index
    facets_indexes.clear();

    auto numerical_index_it = numerical_index.find(field);
    // compare against end(): comparing the iterator with itself is always false and
    // would skip the numerical migration entirely
    if(numerical_index_it != numerical_index.end()) {
        auto num_tree = numerical_index_it->second;
        if(num_tree->get_facet_indexes(facets_indexes)) {
            store_facets(facets_indexes);
        }
    }
}
void Index::index_field_in_memory(const field& afield, std::vector<index_record>& iter_batch) {
// indexes a given field of all documents in the batch
@ -635,6 +705,37 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
std::unordered_map<std::string, std::vector<art_document>> token_to_doc_offsets;
int64_t max_score = INT64_MIN;
auto facet_threshold_count = 0;
bool is_migrated = false;
if(afield.is_string()) {
facet_threshold_count = facet_index_v4->get_facet_count(afield.name);
is_migrated = facet_index_v4->get_migrated(afield.name);
} else {
auto numerical_index_it = numerical_index.find(afield.name);
if(numerical_index_it != numerical_index.end()) {
facet_threshold_count = numerical_index_it->second->counter_list_size();
is_migrated = numerical_index_it->second->get_migrated();
}
}
#ifdef TEST_BUILD
facet_threshold_count = FACET_INDEX_THRESHOLD + 1;
is_migrated = true;
#endif
if(!is_migrated && (facet_threshold_count > FACET_INDEX_THRESHOLD)) {
migrate_facet_to_new_index(afield.name);
if(afield.is_string()) {
facet_index_v4->set_migrated(afield.name, true);
} else {
auto numerical_index_it = numerical_index.find(afield.name);
if(numerical_index_it != numerical_index.end()) {
numerical_index_it->second->set_migrated(true);
}
}
}
for(const auto& record: iter_batch) {
if(!record.indexed.ok()) {
// some records could have been invalidated upstream
@ -654,6 +755,7 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
}
std::string value;
uint32_t fhash = 0;
if(afield.facet) {
if(afield.is_array()) {
@ -661,41 +763,36 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
for(size_t i = 0; i < document[afield.name].size(); ++i) {
if(afield.type == field_types::INT32_ARRAY) {
int32_t raw_val = document[afield.name][i].get<int32_t>();
uint32_t hash = reinterpret_cast<uint32_t&>(raw_val);
fhashvalues.hashes.emplace_back(hash);
fhash = reinterpret_cast<uint32_t&>(raw_val);
} else if(afield.type == field_types::INT64_ARRAY) {
int64_t raw_val = document[afield.name][i].get<int64_t>();
value = std::to_string(raw_val);
auto index = facet_index_v4->insert(afield.name, value, seq_id);
fhashvalues.hashes.emplace_back(index);
fhash = facet_index_v4->insert(afield.name, value, seq_id);
} else if(afield.type == field_types::STRING_ARRAY) {
value = document[afield.name][i];
auto index = facet_index_v4->insert(afield.name, value, seq_id, true);
fhashvalues.hashes.emplace_back(index);
fhash = facet_index_v4->insert(afield.name, value, seq_id, true);
} else if(afield.type == field_types::FLOAT_ARRAY) {
float raw_val = document[afield.name][i].get<float>();
uint32_t hash = reinterpret_cast<uint32_t&>(raw_val);
fhashvalues.hashes.emplace_back(hash);
fhash = reinterpret_cast<uint32_t&>(raw_val);
} else if(afield.type == field_types::BOOL_ARRAY) {
bool raw_val = document[afield.name][i].get<bool>();
uint32_t hash = (uint32_t)raw_val;
fhashvalues.hashes.emplace_back(hash);
fhash = (uint32_t)raw_val;
}
if(facet_threshold_count > FACET_INDEX_THRESHOLD) {
fhashvalues.hashes.emplace_back(fhash);
}
}
fhashvalues.length = fhashvalues.hashes.size();
//LOG(INFO) << "fhashvalues.length " << fhashvalues.length;
//fhashvalues.length = field_index_it->second.facet_hashes.size();
//fhashvalues.hashes = new uint32_t[field_index_it->second.facet_hashes.size()];
// for(size_t i = 0; i < field_index_it->second.facet_hashes.size(); i++) {
// fhashvalues.hashes[i] = field_index_it->second.facet_hashes[i];
// }
if(facet_threshold_count > FACET_INDEX_THRESHOLD) {
fhashvalues.length = fhashvalues.hashes.size();
auto& facet_dim_index = facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM];
if(facet_dim_index == nullptr) {
LOG(ERROR) << "Error, facet index not initialized for field " << afield.name;
} else {
facet_dim_index->emplace(seq_id, std::move(fhashvalues));
auto& facet_dim_index = facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM];
if(facet_dim_index == nullptr) {
LOG(ERROR) << "Error, facet index not initialized for field " << afield.name;
} else {
facet_dim_index->emplace(seq_id, std::move(fhashvalues));
}
}
} else {
uint32_t fhash;
@ -721,14 +818,16 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
bool raw_val = document[afield.name].get<bool>();
fhash = (uint32_t)raw_val;
}
//fhash = field_index_it->second.facet_hashes[0];
if(facet_threshold_count > FACET_INDEX_THRESHOLD) {
auto& facet_dim_index = single_val_facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM];
if(facet_dim_index == nullptr) {
LOG(ERROR) << "Error, facet index not initialized for field " << afield.name;
} else {
facet_dim_index->emplace(seq_id, std::move(fhash));
}
auto& facet_dim_index = single_val_facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM];
if(facet_dim_index == nullptr) {
LOG(ERROR) << "Error, facet index not initialized for field " << afield.name;
} else {
facet_dim_index->emplace(seq_id, std::move(fhash));
}
}
}
}
@ -1238,8 +1337,8 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
use_hashes = false;
#endif
if(results_size && facet_records && ((facet_records <= 10 || is_wildcard_query) &&
!use_facet_query && group_limit == 0)
if(results_size && facet_records && ((facet_records <= FACET_INDEX_THRESHOLD
|| is_wildcard_query) && !use_facet_query && group_limit == 0)
&& !use_hashes || use_facet_intersection) {
//LOG(INFO) << "Using intersection to find facets";
a_facet.is_intersected = true;

View File

@ -401,3 +401,28 @@ size_t num_tree_t::intersect(const uint32_t* result_ids, int result_ids_len, int
return found.size();
}
size_t num_tree_t::get_facet_indexes(std::map<uint32_t, std::vector<uint32_t>>& facets) {
//check if facet field
if(counter_list.empty()) {
return 0;
}
std::vector<uint32_t> id_list;
for(auto int64map_it = int64map.begin(); int64map_it != int64map.end(); ++int64map_it) {
auto ids = int64map_it->second;
ids_t::uncompress(ids, id_list);
//emplacing seq_id=>count_index
for(const auto& id : id_list) {
facets[id].emplace_back(int64map_it->first);
}
id_list.clear();
}
return facets.size();
}

View File

@ -562,8 +562,8 @@ TEST_F(CollectionOptimizedFacetingTest, FacetCountOnSimilarStrings) {
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}
@ -1225,4 +1225,4 @@ TEST_F(CollectionOptimizedFacetingTest, StringLengthTest) {
//string facet length is restricted to 100
ASSERT_TRUE(100 == longStr.size());
}
}