Add geo_range_index.

This commit is contained in:
Harpreet Sangar 2023-06-13 16:00:25 +05:30
parent 0321396f98
commit 334ea25b21
5 changed files with 161 additions and 37 deletions

View File

@ -308,7 +308,9 @@ private:
spp::sparse_hash_map<std::string, NumericTrie*> range_index;
spp::sparse_hash_map<std::string, spp::sparse_hash_map<std::string, std::vector<uint32_t>>*> geopoint_index;
spp::sparse_hash_map<std::string, NumericTrie*> geo_range_index;
// spp::sparse_hash_map<std::string, spp::sparse_hash_map<std::string, std::vector<uint32_t>>*> geopoint_index;
// geo_array_field => (seq_id => values) used for exact filtering of geo array records
spp::sparse_hash_map<std::string, spp::sparse_hash_map<uint32_t, int64_t*>*> geo_array_index;

View File

@ -14,6 +14,8 @@ class NumericTrie {
void insert_helper(const int64_t& value, const uint32_t& seq_id, char& level, const char& max_level);
void insert_geopoint_helper(const uint64_t& cell_id, const uint32_t& seq_id, char& level, const char& max_level);
void search_range_helper(const int64_t& low,const int64_t& high, const char& max_level,
std::vector<Node*>& matches);
@ -35,7 +37,13 @@ class NumericTrie {
delete [] children;
}
void insert(const int64_t& value, const uint32_t& seq_id, const char& max_level);
void insert(const int64_t& cell_id, const uint32_t& seq_id, const char& max_level);
void insert_geopoint(const uint64_t& cell_id, const uint32_t& seq_id, const char& max_level);
void search_geopoint(const uint64_t& cell_id, const char& max_index_level, uint32_t*& ids, uint32_t& ids_length);
void delete_geopoint(const uint64_t& cell_id, uint32_t id, const char& max_level);
void get_all_ids(uint32_t*& ids, uint32_t& ids_length);
@ -110,6 +118,12 @@ public:
void insert(const int64_t& value, const uint32_t& seq_id);
void insert_geopoint(const uint64_t& cell_id, const uint32_t& seq_id);
void search_geopoint(const uint64_t& cell_id, uint32_t*& ids, uint32_t& ids_length);
void delete_geopoint(const uint64_t& cell_id, uint32_t id);
void search_range(const int64_t& low, const bool& low_inclusive,
const int64_t& high, const bool& high_inclusive,
uint32_t*& ids, uint32_t& ids_length);

View File

@ -892,13 +892,21 @@ void filter_result_iterator_t::init() {
S2RegionTermIndexer::Options options;
options.set_index_contains_points_only(true);
S2RegionTermIndexer indexer(options);
auto const& geo_range_index = index->geo_range_index.at(a_filter.field_name);
for (const auto& term : indexer.GetQueryTerms(*query_region, "")) {
auto geo_index = index->geopoint_index.at(a_filter.field_name);
const auto& ids_it = geo_index->find(term);
if(ids_it != geo_index->end()) {
geo_result_ids.insert(geo_result_ids.end(), ids_it->second.begin(), ids_it->second.end());
auto cell = S2CellId::FromToken(term);
uint32_t* geo_ids = nullptr;
uint32_t geo_ids_length = 0;
geo_range_index->search_geopoint(cell.id(), geo_ids, geo_ids_length);
geo_result_ids.reserve(geo_result_ids.size() + geo_ids_length);
for (uint32_t i = 0; i < geo_ids_length; i++) {
geo_result_ids.push_back(geo_ids[i]);
}
delete [] geo_ids;
}
gfx::timsort(geo_result_ids.begin(), geo_result_ids.end());

View File

@ -78,8 +78,7 @@ Index::Index(const std::string& name, const uint32_t collection_id, const Store*
art_tree_init(t);
search_index.emplace(a_field.name, t);
} else if(a_field.is_geopoint()) {
auto field_geo_index = new spp::sparse_hash_map<std::string, std::vector<uint32_t>>();
geopoint_index.emplace(a_field.name, field_geo_index);
geo_range_index.emplace(a_field.name, new NumericTrie(64));
if(!a_field.is_single_geopoint()) {
spp::sparse_hash_map<uint32_t, int64_t*> * doc_to_geos = new spp::sparse_hash_map<uint32_t, int64_t*>();
@ -141,12 +140,12 @@ Index::~Index() {
search_index.clear();
for(auto & name_index: geopoint_index) {
for(auto & name_index: geo_range_index) {
delete name_index.second;
name_index.second = nullptr;
}
geopoint_index.clear();
geo_range_index.clear();
for(auto& name_index: geo_array_index) {
for(auto& kv: *name_index.second) {
@ -810,10 +809,10 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
num_tree->insert(value, seq_id);
});
} else if(afield.type == field_types::GEOPOINT || afield.type == field_types::GEOPOINT_ARRAY) {
auto geo_index = geopoint_index.at(afield.name);
auto geopoint_range_index = geo_range_index.at(afield.name);
iterate_and_index_numerical_field(iter_batch, afield,
[&afield, &geo_array_index=geo_array_index, geo_index](const index_record& record, uint32_t seq_id) {
[&afield, &geo_array_index=geo_array_index, geopoint_range_index](const index_record& record, uint32_t seq_id) {
// nested geopoint value inside an array of object will be a simple array so must be treated as geopoint
bool nested_obj_arr_geopoint = (afield.nested && afield.type == field_types::GEOPOINT_ARRAY &&
!record.doc[afield.name].empty() && record.doc[afield.name][0].is_number());
@ -827,9 +826,8 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
S2RegionTermIndexer indexer(options);
S2Point point = S2LatLng::FromDegrees(latlongs[li], latlongs[li+1]).ToPoint();
for(const auto& term: indexer.GetIndexTerms(point, "")) {
(*geo_index)[term].push_back(seq_id);
}
auto cell = S2CellId(point);
geopoint_range_index->insert_geopoint(cell.id(), seq_id);
}
if(nested_obj_arr_geopoint) {
@ -857,9 +855,9 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
for(size_t li = 0; li < latlongs.size(); li++) {
auto& latlong = latlongs[li];
S2Point point = S2LatLng::FromDegrees(latlong[0], latlong[1]).ToPoint();
for(const auto& term: indexer.GetIndexTerms(point, "")) {
(*geo_index)[term].push_back(seq_id);
}
auto cell = S2CellId(point);
geopoint_range_index->insert_geopoint(cell.id(), seq_id);
int64_t packed_latlong = GeoPoint::pack_lat_lng(latlong[0], latlong[1]);
packed_latlongs[li + 1] = packed_latlong;
@ -1589,7 +1587,7 @@ void Index::numeric_not_equals_filter(num_tree_t* const num_tree,
bool Index::field_is_indexed(const std::string& field_name) const {
return search_index.count(field_name) != 0 ||
numerical_index.count(field_name) != 0 ||
geopoint_index.count(field_name) != 0;
geo_range_index.count(field_name) != 0;
}
void Index::aproximate_numerical_match(num_tree_t* const num_tree,
@ -5471,7 +5469,7 @@ void Index::remove_field(uint32_t seq_id, const nlohmann::json& document, const
}
}
} else if(search_field.is_geopoint()) {
auto geo_index = geopoint_index[field_name];
auto geopoint_range_index = geo_range_index[field_name];
S2RegionTermIndexer::Options options;
options.set_index_contains_points_only(true);
S2RegionTermIndexer indexer(options);
@ -5482,17 +5480,8 @@ void Index::remove_field(uint32_t seq_id, const nlohmann::json& document, const
for(const std::vector<double>& latlong: latlongs) {
S2Point point = S2LatLng::FromDegrees(latlong[0], latlong[1]).ToPoint();
for(const auto& term: indexer.GetIndexTerms(point, "")) {
auto term_it = geo_index->find(term);
if(term_it == geo_index->end()) {
continue;
}
std::vector<uint32_t>& ids = term_it->second;
ids.erase(std::remove(ids.begin(), ids.end(), seq_id), ids.end());
if(ids.empty()) {
geo_index->erase(term);
}
}
auto cell = S2CellId(point);
geopoint_range_index->delete_geopoint(cell.id(), seq_id);
}
if(!search_field.is_single_geopoint()) {
@ -5644,8 +5633,7 @@ void Index::refresh_schemas(const std::vector<field>& new_fields, const std::vec
art_tree_init(t);
search_index.emplace(new_field.name, t);
} else if(new_field.is_geopoint()) {
auto field_geo_index = new spp::sparse_hash_map<std::string, std::vector<uint32_t>>();
geopoint_index.emplace(new_field.name, field_geo_index);
geo_range_index.emplace(new_field.name, new NumericTrie(64));
if(!new_field.is_single_geopoint()) {
auto geo_array_map = new spp::sparse_hash_map<uint32_t, int64_t*>();
geo_array_index.emplace(new_field.name, geo_array_map);
@ -5695,8 +5683,8 @@ void Index::refresh_schemas(const std::vector<field>& new_fields, const std::vec
delete search_index[del_field.name];
search_index.erase(del_field.name);
} else if(del_field.is_geopoint()) {
delete geopoint_index[del_field.name];
geopoint_index.erase(del_field.name);
delete geo_range_index[del_field.name];
geo_range_index.erase(del_field.name);
if(!del_field.is_single_geopoint()) {
spp::sparse_hash_map<uint32_t, int64_t*>* geo_array_map = geo_array_index[del_field.name];

View File

@ -18,6 +18,30 @@ void NumericTrie::insert(const int64_t& value, const uint32_t& seq_id) {
}
}
// Records `seq_id` under `cell_id` in the trie. Geo cell ids are stored in
// `positive_trie`, which is allocated on the first insertion.
void NumericTrie::insert_geopoint(const uint64_t& cell_id, const uint32_t& seq_id) {
    if (!positive_trie) {
        positive_trie = new NumericTrie::Node();
    }

    positive_trie->insert_geopoint(cell_id, seq_id, max_level);
}
// Looks up the ids stored under `cell_id` and hands them back through
// `ids` / `ids_length`. When nothing has been inserted yet (no root node),
// both out-parameters are left exactly as the caller passed them.
void NumericTrie::search_geopoint(const uint64_t& cell_id, uint32_t*& ids, uint32_t& ids_length) {
    if (positive_trie != nullptr) {
        positive_trie->search_geopoint(cell_id, max_level, ids, ids_length);
    }
}
// Removes `id` from the entries stored under `cell_id`.
// A trie without a root node has nothing to remove, so this is a no-op then.
void NumericTrie::delete_geopoint(const uint64_t& cell_id, uint32_t id) {
    if (positive_trie != nullptr) {
        positive_trie->delete_geopoint(cell_id, id, max_level);
    }
}
void NumericTrie::search_range(const int64_t& low, const bool& low_inclusive,
const int64_t& high, const bool& high_inclusive,
uint32_t*& ids, uint32_t& ids_length) {
@ -369,9 +393,14 @@ NumericTrie::iterator_t NumericTrie::search_equal_to(const int64_t& value) {
return NumericTrie::iterator_t(matches);
}
void NumericTrie::Node::insert(const int64_t& value, const uint32_t& seq_id, const char& max_level) {
void NumericTrie::Node::insert(const int64_t& cell_id, const uint32_t& seq_id, const char& max_level) {
char level = 0;
return insert_helper(value, seq_id, level, max_level);
return insert_helper(cell_id, seq_id, level, max_level);
}
// Entry point for inserting a geo cell id below this node: starts the
// recursive helper at level 0, i.e. at this node itself.
void NumericTrie::Node::insert_geopoint(const uint64_t& cell_id, const uint32_t& seq_id, const char& max_level) {
    char start_level = 0;
    insert_geopoint_helper(cell_id, seq_id, start_level, max_level);
}
inline int get_index(const int64_t& value, const char& level, const char& max_level) {
@ -385,6 +414,10 @@ inline int get_index(const int64_t& value, const char& level, const char& max_le
return (value >> (8 * (max_level - level))) & 0xFF;
}
// Returns the byte of `cell_id` that selects the child slot at `level`.
// Level 1 maps to the byte shifted down by 8 * (max_level - 1) bits, and
// level == max_level maps to the lowest byte.
inline int get_geopoint_index(const uint64_t& cell_id, const char& level, const char& max_level) {
    const int bytes_below = max_level - level;
    const uint64_t shifted = cell_id >> (8 * bytes_below);
    return static_cast<int>(shifted & 0xFF);
}
void NumericTrie::Node::insert_helper(const int64_t& value, const uint32_t& seq_id, char& level, const char& max_level) {
if (level > max_level) {
return;
@ -409,6 +442,85 @@ void NumericTrie::Node::insert_helper(const int64_t& value, const uint32_t& seq_
}
}
void NumericTrie::Node::insert_geopoint_helper(const uint64_t& cell_id, const uint32_t& seq_id, char& level,
const char& max_level) {
if (level > max_level) {
return;
}
// Root node contains all the sequence ids present in the tree.
if (!seq_ids.contains(seq_id)) {
seq_ids.append(seq_id);
}
if (++level <= max_level) {
if (children == nullptr) {
children = new NumericTrie::Node* [EXPANSE]{nullptr};
}
auto index = get_geopoint_index(cell_id, level, max_level);
if (children[index] == nullptr) {
children[index] = new NumericTrie::Node();
}
return children[index]->insert_geopoint_helper(cell_id, seq_id, level, max_level);
}
}
// Computes how many leading bytes of `cell_id` need to be prefix-matched.
//
// Trailing zero bytes carry no information, so the level is reduced by one
// for each of them. Example: for 0x47E66C3000000000 with max_level 8 the
// lower four bytes are zero, so only the top four bytes matter and the
// result is 4. The result never counts down past zero.
char get_max_search_level(const uint64_t& cell_id, const char& max_level) {
    char search_level = max_level;

    for (uint64_t byte_mask = 0xff; (cell_id & byte_mask) == 0; byte_mask <<= 8) {
        if (--search_level <= 0) {
            break;
        }
    }

    return search_level;
}
void NumericTrie::Node::search_geopoint(const uint64_t& cell_id, const char& max_index_level,
uint32_t*& ids, uint32_t& ids_length) {
char level = 1;
Node* root = this;
auto index = get_geopoint_index(cell_id, level, max_index_level);
auto max_search_level = get_max_search_level(cell_id, max_index_level);
while (level < max_search_level) {
if (root->children == nullptr || root->children[index] == nullptr) {
return;
}
root = root->children[index];
index = get_geopoint_index(cell_id, ++level, max_index_level);
}
root->get_all_ids(ids, ids_length);
}
void NumericTrie::Node::delete_geopoint(const uint64_t& cell_id, uint32_t id, const char& max_level) {
char level = 1;
Node* root = this;
auto index = get_geopoint_index(cell_id, level, max_level);
while (level < max_level) {
root->seq_ids.remove_value(id);
if (root->children == nullptr || root->children[index] == nullptr) {
return;
}
root = root->children[index];
index = get_geopoint_index(cell_id, ++level, max_level);
}
if (root->children != nullptr || root->children[index] != nullptr) {
delete root->children[index];
root->children[index] = nullptr;
}
}
void NumericTrie::Node::get_all_ids(uint32_t*& ids, uint32_t& ids_length) {
ids = seq_ids.uncompress();
ids_length = seq_ids.getLength();