mirror of
https://github.com/typesense/typesense.git
synced 2025-05-17 04:02:36 +08:00
threshold based migration changes
This commit is contained in:
parent
5804a9c9a2
commit
1d54c07a50
@ -39,6 +39,7 @@ private:
|
||||
struct facet_index_counter {
|
||||
tsl::htrie_map<char, facet_index_struct> facet_index_map;
|
||||
std::vector<count_list> counter_list;
|
||||
bool is_migrated = false;
|
||||
|
||||
facet_index_counter() {
|
||||
facet_index_map.clear();
|
||||
@ -74,4 +75,10 @@ public:
|
||||
|
||||
size_t intersect(const std::string& val, const uint32_t* result_ids, int result_id_len,
|
||||
int max_facet_count, std::map<std::string, uint32_t>& found, bool is_wildcard_no_filter_query);
|
||||
|
||||
size_t get_facet_indexes(const std::string& field, std::map<uint32_t, std::vector<uint32_t>>& seqid_index_map);
|
||||
|
||||
bool get_migrated (const std::string& field) const;
|
||||
|
||||
void set_migrated(const std::string& field, bool val);
|
||||
};
|
@ -522,6 +522,7 @@ private:
|
||||
|
||||
void initialize_facet_indexes(const field& facet_field);
|
||||
|
||||
void migrate_facet_to_new_index(const std::string& field);
|
||||
|
||||
|
||||
static Option<bool> embed_fields(nlohmann::json& document,
|
||||
|
@ -42,6 +42,7 @@ private:
|
||||
};
|
||||
|
||||
std::vector<count_list> counter_list;
|
||||
bool is_migrated = false;
|
||||
|
||||
public:
|
||||
|
||||
@ -80,4 +81,14 @@ public:
|
||||
bool is_wildcard_no_filter_query);
|
||||
|
||||
size_t counter_list_size() const;
|
||||
|
||||
size_t get_facet_indexes(std::map<uint32_t, std::vector<uint32_t>>& facets);
|
||||
|
||||
bool get_migrated () const {
|
||||
return is_migrated;
|
||||
}
|
||||
|
||||
// Stores `val` in this tree's `is_migrated` flag.
// The flag previously was forced to `true` regardless of the argument, which
// made it impossible to reset the migration state through this setter.
void set_migrated(bool val) {
    is_migrated = val;
}
|
||||
};
|
@ -1937,7 +1937,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
|
||||
}
|
||||
}
|
||||
} else if(a_facet.is_intersected) {
|
||||
LOG(INFO) << "used intersection";
|
||||
//LOG(INFO) << "used intersection";
|
||||
std::vector<std::pair<std::string, uint32_t>> facet_counts;
|
||||
|
||||
for (const auto & kv : a_facet.result_map) {
|
||||
@ -1945,10 +1945,18 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
|
||||
}
|
||||
|
||||
auto max_facets = std::min(max_facet_values, facet_counts.size());
|
||||
std::sort(facet_counts.begin(), facet_counts.end(),
|
||||
[&](const auto& p1, const auto& p2) {
|
||||
return std::tie(p1.second, p1.first) > std::tie(p2.second, p2.first);
|
||||
});
|
||||
auto nthElement = max_facets == facet_counts.size() ? max_facets - 1 : max_facets;
|
||||
|
||||
std::nth_element(facet_counts.begin(), facet_counts.begin() + nthElement,
|
||||
facet_counts.end(), [&](const auto& kv1, const auto& kv2) {
|
||||
size_t a_count = kv1.second;
|
||||
size_t b_count = kv2.second;
|
||||
|
||||
size_t a_value_size = UINT64_MAX - kv1.first.size();
|
||||
size_t b_value_size = UINT64_MAX - kv2.first.size();
|
||||
|
||||
return std::tie(a_count, a_value_size) > std::tie(b_count, b_value_size);
|
||||
});
|
||||
|
||||
for(int i = 0; i < max_facets; ++i) {
|
||||
const auto& kv = facet_counts[i];
|
||||
@ -1956,7 +1964,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
|
||||
facet_values.emplace_back(facet_value);
|
||||
}
|
||||
} else {
|
||||
LOG(INFO) << "used hashes";
|
||||
//LOG(INFO) << "used hashes";
|
||||
std::vector<std::pair<uint32_t, facet_count_t>> facet_hash_counts;
|
||||
|
||||
for (const auto & kv : a_facet.result_map) {
|
||||
|
@ -150,3 +150,51 @@ facet_index_t::~facet_index_t() {
|
||||
facet_field_map.clear();
|
||||
}
|
||||
|
||||
//used for migrating string and int64 facets
|
||||
size_t facet_index_t::get_facet_indexes(const std::string& field,
|
||||
std::map<uint32_t, std::vector<uint32_t>>& seqid_index_map) {
|
||||
|
||||
const auto& facet_field_it = facet_field_map.find(field);
|
||||
if(facet_field_it == facet_field_map.end()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto& facet_index_map = facet_field_it->second.facet_index_map;
|
||||
|
||||
std::vector<uint32_t> id_list;
|
||||
|
||||
for(auto facet_index_map_it = facet_index_map.begin();
|
||||
facet_index_map_it != facet_index_map.end(); ++facet_index_map_it) {
|
||||
|
||||
auto ids = facet_index_map_it->id_list_ptr;
|
||||
ids_t::uncompress(ids, id_list);
|
||||
|
||||
//emplacing seq_id=>count_index
|
||||
for(const auto& id : id_list) {
|
||||
seqid_index_map[id].emplace_back(facet_index_map_it->index);
|
||||
}
|
||||
|
||||
id_list.clear();
|
||||
}
|
||||
|
||||
return seqid_index_map.size();
|
||||
}
|
||||
|
||||
|
||||
bool facet_index_t::get_migrated (const std::string& field) const {
|
||||
const auto it = facet_field_map.find(field);
|
||||
if(it != facet_field_map.end()) {
|
||||
return it->second.is_migrated;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void facet_index_t::set_migrated(const std::string& field, bool val) {
|
||||
const auto it = facet_field_map.find(field);
|
||||
if(it != facet_field_map.end()) {
|
||||
it->second.is_migrated = val;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
161
src/index.cpp
161
src/index.cpp
@ -37,6 +37,7 @@
|
||||
search_cutoff = true; \
|
||||
break;\
|
||||
}
|
||||
#define FACET_INDEX_THRESHOLD 10
|
||||
|
||||
spp::sparse_hash_map<uint32_t, int64_t> Index::text_match_sentinel_value;
|
||||
spp::sparse_hash_map<uint32_t, int64_t> Index::seq_id_sentinel_value;
|
||||
@ -602,6 +603,75 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record>& iter_b
|
||||
return num_indexed;
|
||||
}
|
||||
|
||||
void Index::migrate_facet_to_new_index(const std::string& field) {
|
||||
|
||||
//LOG(INFO) << "migrating facet " << field << " to old index";
|
||||
|
||||
std::map<uint32_t, std::vector<uint32_t>> facets_indexes;
|
||||
auto& facet_index = facet_index_v3[field];
|
||||
auto& single_facet_index = single_val_facet_index_v3[field];
|
||||
|
||||
//migrate string and int64 facets first
|
||||
if(facet_index_v4->get_facet_indexes(field, facets_indexes)) {
|
||||
|
||||
for(const auto& kv : facets_indexes) {
|
||||
|
||||
const auto hash_count = kv.second.size();
|
||||
const auto seq_id = kv.first;
|
||||
|
||||
if(hash_count > 1) {
|
||||
auto& facet_dim_index = facet_index[seq_id % ARRAY_FACET_DIM];
|
||||
if(facet_dim_index == nullptr) {
|
||||
LOG(ERROR) << "Error, facet index not initialized for field " << field;
|
||||
} else {
|
||||
facet_hash_values_t fhashvalues;
|
||||
fhashvalues.hashes = std::move(kv.second);
|
||||
facet_dim_index->emplace(seq_id, std::move(fhashvalues));
|
||||
}
|
||||
} else {
|
||||
auto& facet_dim_index = single_facet_index[seq_id % ARRAY_FACET_DIM];
|
||||
if(facet_dim_index == nullptr) {
|
||||
LOG(ERROR) << "Error, facet index not initialized for field " << field;
|
||||
} else {
|
||||
facet_dim_index->emplace(seq_id, kv.second[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//now extract remaining facets from numerical index
|
||||
facets_indexes.clear();
|
||||
auto numerical_index_it = numerical_index.find(field);
|
||||
if(numerical_index_it != numerical_index_it) {
|
||||
auto num_tree = numerical_index_it->second;
|
||||
if(num_tree->get_facet_indexes(facets_indexes)) {
|
||||
|
||||
for(const auto& kv : facets_indexes) {
|
||||
const auto hash_count = kv.second.size();
|
||||
const auto seq_id = kv.first;
|
||||
|
||||
if(hash_count > 1) {
|
||||
auto& facet_dim_index = facet_index[seq_id % ARRAY_FACET_DIM];
|
||||
if(facet_dim_index == nullptr) {
|
||||
LOG(ERROR) << "Error, facet index not initialized for field " << field;
|
||||
} else {
|
||||
facet_hash_values_t fhashvalues;
|
||||
fhashvalues.hashes = std::move(kv.second);
|
||||
facet_dim_index->emplace(seq_id, std::move(fhashvalues));
|
||||
}
|
||||
} else {
|
||||
auto& facet_dim_index = single_facet_index[seq_id % ARRAY_FACET_DIM];
|
||||
if(facet_dim_index == nullptr) {
|
||||
LOG(ERROR) << "Error, facet index not initialized for field " << field;
|
||||
} else {
|
||||
facet_dim_index->emplace(seq_id, kv.second[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Index::index_field_in_memory(const field& afield, std::vector<index_record>& iter_batch) {
|
||||
// indexes a given field of all documents in the batch
|
||||
|
||||
@ -635,6 +705,37 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
|
||||
std::unordered_map<std::string, std::vector<art_document>> token_to_doc_offsets;
|
||||
int64_t max_score = INT64_MIN;
|
||||
|
||||
|
||||
auto facet_threshold_count = 0;
|
||||
bool is_migrated = false;
|
||||
if(afield.is_string()) {
|
||||
facet_threshold_count = facet_index_v4->get_facet_count(afield.name);
|
||||
is_migrated = facet_index_v4->get_migrated(afield.name);
|
||||
} else {
|
||||
auto numerical_index_it = numerical_index.find(afield.name);
|
||||
if(numerical_index_it != numerical_index.end()) {
|
||||
facet_threshold_count = numerical_index_it->second->counter_list_size();
|
||||
is_migrated = numerical_index_it->second->get_migrated();
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef TEST_BUILD
|
||||
facet_threshold_count = FACET_INDEX_THRESHOLD + 1;
|
||||
is_migrated = true;
|
||||
#endif
|
||||
if(!is_migrated && (facet_threshold_count > FACET_INDEX_THRESHOLD)) {
|
||||
migrate_facet_to_new_index(afield.name);
|
||||
|
||||
if(afield.is_string()) {
|
||||
facet_index_v4->set_migrated(afield.name, true);
|
||||
} else {
|
||||
auto numerical_index_it = numerical_index.find(afield.name);
|
||||
if(numerical_index_it != numerical_index.end()) {
|
||||
numerical_index_it->second->set_migrated(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(const auto& record: iter_batch) {
|
||||
if(!record.indexed.ok()) {
|
||||
// some records could have been invalidated upstream
|
||||
@ -654,6 +755,7 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
|
||||
}
|
||||
|
||||
std::string value;
|
||||
uint32_t fhash = 0;
|
||||
|
||||
if(afield.facet) {
|
||||
if(afield.is_array()) {
|
||||
@ -661,41 +763,36 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
|
||||
for(size_t i = 0; i < document[afield.name].size(); ++i) {
|
||||
if(afield.type == field_types::INT32_ARRAY) {
|
||||
int32_t raw_val = document[afield.name][i].get<int32_t>();
|
||||
uint32_t hash = reinterpret_cast<uint32_t&>(raw_val);
|
||||
fhashvalues.hashes.emplace_back(hash);
|
||||
fhash = reinterpret_cast<uint32_t&>(raw_val);
|
||||
} else if(afield.type == field_types::INT64_ARRAY) {
|
||||
int64_t raw_val = document[afield.name][i].get<int64_t>();
|
||||
value = std::to_string(raw_val);
|
||||
auto index = facet_index_v4->insert(afield.name, value, seq_id);
|
||||
fhashvalues.hashes.emplace_back(index);
|
||||
fhash = facet_index_v4->insert(afield.name, value, seq_id);
|
||||
} else if(afield.type == field_types::STRING_ARRAY) {
|
||||
value = document[afield.name][i];
|
||||
auto index = facet_index_v4->insert(afield.name, value, seq_id, true);
|
||||
fhashvalues.hashes.emplace_back(index);
|
||||
fhash = facet_index_v4->insert(afield.name, value, seq_id, true);
|
||||
} else if(afield.type == field_types::FLOAT_ARRAY) {
|
||||
float raw_val = document[afield.name][i].get<float>();
|
||||
uint32_t hash = reinterpret_cast<uint32_t&>(raw_val);
|
||||
fhashvalues.hashes.emplace_back(hash);
|
||||
fhash = reinterpret_cast<uint32_t&>(raw_val);
|
||||
} else if(afield.type == field_types::BOOL_ARRAY) {
|
||||
bool raw_val = document[afield.name][i].get<bool>();
|
||||
uint32_t hash = (uint32_t)raw_val;
|
||||
fhashvalues.hashes.emplace_back(hash);
|
||||
fhash = (uint32_t)raw_val;
|
||||
}
|
||||
|
||||
if(facet_threshold_count > FACET_INDEX_THRESHOLD) {
|
||||
fhashvalues.hashes.emplace_back(fhash);
|
||||
}
|
||||
}
|
||||
fhashvalues.length = fhashvalues.hashes.size();
|
||||
//LOG(INFO) << "fhashvalues.length " << fhashvalues.length;
|
||||
//fhashvalues.length = field_index_it->second.facet_hashes.size();
|
||||
//fhashvalues.hashes = new uint32_t[field_index_it->second.facet_hashes.size()];
|
||||
|
||||
// for(size_t i = 0; i < field_index_it->second.facet_hashes.size(); i++) {
|
||||
// fhashvalues.hashes[i] = field_index_it->second.facet_hashes[i];
|
||||
// }
|
||||
if(facet_threshold_count > FACET_INDEX_THRESHOLD) {
|
||||
fhashvalues.length = fhashvalues.hashes.size();
|
||||
|
||||
auto& facet_dim_index = facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM];
|
||||
if(facet_dim_index == nullptr) {
|
||||
LOG(ERROR) << "Error, facet index not initialized for field " << afield.name;
|
||||
} else {
|
||||
facet_dim_index->emplace(seq_id, std::move(fhashvalues));
|
||||
auto& facet_dim_index = facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM];
|
||||
if(facet_dim_index == nullptr) {
|
||||
LOG(ERROR) << "Error, facet index not initialized for field " << afield.name;
|
||||
} else {
|
||||
facet_dim_index->emplace(seq_id, std::move(fhashvalues));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t fhash;
|
||||
@ -721,14 +818,16 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
|
||||
bool raw_val = document[afield.name].get<bool>();
|
||||
fhash = (uint32_t)raw_val;
|
||||
}
|
||||
//fhash = field_index_it->second.facet_hashes[0];
|
||||
|
||||
if(facet_threshold_count > FACET_INDEX_THRESHOLD) {
|
||||
|
||||
auto& facet_dim_index = single_val_facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM];
|
||||
if(facet_dim_index == nullptr) {
|
||||
LOG(ERROR) << "Error, facet index not initialized for field " << afield.name;
|
||||
} else {
|
||||
facet_dim_index->emplace(seq_id, std::move(fhash));
|
||||
}
|
||||
auto& facet_dim_index = single_val_facet_index_v3[afield.name][seq_id % ARRAY_FACET_DIM];
|
||||
if(facet_dim_index == nullptr) {
|
||||
LOG(ERROR) << "Error, facet index not initialized for field " << afield.name;
|
||||
} else {
|
||||
facet_dim_index->emplace(seq_id, std::move(fhash));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1238,8 +1337,8 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
use_hashes = false;
|
||||
#endif
|
||||
|
||||
if(results_size && facet_records && ((facet_records <= 10 || is_wildcard_query) &&
|
||||
!use_facet_query && group_limit == 0)
|
||||
if(results_size && facet_records && ((facet_records <= FACET_INDEX_THRESHOLD
|
||||
|| is_wildcard_query) && !use_facet_query && group_limit == 0)
|
||||
&& !use_hashes || use_facet_intersection) {
|
||||
//LOG(INFO) << "Using intersection to find facets";
|
||||
a_facet.is_intersected = true;
|
||||
|
@ -401,3 +401,28 @@ size_t num_tree_t::intersect(const uint32_t* result_ids, int result_ids_len, int
|
||||
|
||||
return found.size();
|
||||
}
|
||||
|
||||
// Collects, per document, the keys of `int64map` under which that document is stored.
//
// For every (key, compressed id list) entry of `int64map` the id list is uncompressed
// and the entry's key is appended to `facets[seq_id]` for each id in the list.
// NOTE(review): the key is narrowed to uint32_t by the output container; the original
// inline comment called the stored number a "count_index" - confirm which is intended.
//
// Returns 0 without touching `facets` when `counter_list` is empty (the original
// uses this as its "is this a facet field" check); otherwise returns `facets.size()`.
size_t num_tree_t::get_facet_indexes(std::map<uint32_t, std::vector<uint32_t>>& facets) {
    const bool has_counters = !counter_list.empty();
    if(!has_counters) {
        return 0;
    }

    // scratch buffer reused across entries; emptied at the end of every iteration
    std::vector<uint32_t> seq_ids;

    for(auto entry_it = int64map.begin(); entry_it != int64map.end(); ++entry_it) {
        auto compressed_ids = entry_it->second;
        ids_t::uncompress(compressed_ids, seq_ids);

        for(const auto& seq_id : seq_ids) {
            facets[seq_id].emplace_back(entry_it->first);
        }

        seq_ids.clear();
    }

    return facets.size();
}
|
||||
|
@ -562,8 +562,8 @@ TEST_F(CollectionOptimizedFacetingTest, FacetCountOnSimilarStrings) {
|
||||
ASSERT_EQ(2, results["hits"].size());
|
||||
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
|
||||
|
||||
ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("England in India", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("India in England", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
|
||||
|
||||
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
@ -1225,4 +1225,4 @@ TEST_F(CollectionOptimizedFacetingTest, StringLengthTest) {
|
||||
|
||||
//string facet length is restricted to 100
|
||||
ASSERT_TRUE(100 == longStr.size());
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user