From 334ea25b2194538a7ba5731ed5060a6d971dd1af Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Tue, 13 Jun 2023 16:00:25 +0530 Subject: [PATCH] Add `geo_range_index`. --- include/index.h | 4 +- include/numeric_range_trie_test.h | 16 ++++- src/filter_result_iterator.cpp | 16 +++-- src/index.cpp | 46 +++++------- src/numeric_range_trie.cpp | 116 +++++++++++++++++++++++++++++- 5 files changed, 161 insertions(+), 37 deletions(-) diff --git a/include/index.h b/include/index.h index ef23dc54..11d6155d 100644 --- a/include/index.h +++ b/include/index.h @@ -308,7 +308,9 @@ private: spp::sparse_hash_map range_index; - spp::sparse_hash_map>*> geopoint_index; + spp::sparse_hash_map geo_range_index; + +// spp::sparse_hash_map>*> geopoint_index; // geo_array_field => (seq_id => values) used for exact filtering of geo array records spp::sparse_hash_map*> geo_array_index; diff --git a/include/numeric_range_trie_test.h b/include/numeric_range_trie_test.h index a8422524..ed695a70 100644 --- a/include/numeric_range_trie_test.h +++ b/include/numeric_range_trie_test.h @@ -14,6 +14,8 @@ class NumericTrie { void insert_helper(const int64_t& value, const uint32_t& seq_id, char& level, const char& max_level); + void insert_geopoint_helper(const uint64_t& cell_id, const uint32_t& seq_id, char& level, const char& max_level); + void search_range_helper(const int64_t& low,const int64_t& high, const char& max_level, std::vector& matches); @@ -35,7 +37,13 @@ class NumericTrie { delete [] children; } - void insert(const int64_t& value, const uint32_t& seq_id, const char& max_level); + void insert(const int64_t& cell_id, const uint32_t& seq_id, const char& max_level); + + void insert_geopoint(const uint64_t& cell_id, const uint32_t& seq_id, const char& max_level); + + void search_geopoint(const uint64_t& cell_id, const char& max_index_level, uint32_t*& ids, uint32_t& ids_length); + + void delete_geopoint(const uint64_t& cell_id, uint32_t id, const char& max_level); void get_all_ids(uint32_t*& 
ids, uint32_t& ids_length); @@ -110,6 +118,12 @@ public: void insert(const int64_t& value, const uint32_t& seq_id); + void insert_geopoint(const uint64_t& cell_id, const uint32_t& seq_id); + + void search_geopoint(const uint64_t& cell_id, uint32_t*& ids, uint32_t& ids_length); + + void delete_geopoint(const uint64_t& cell_id, uint32_t id); + void search_range(const int64_t& low, const bool& low_inclusive, const int64_t& high, const bool& high_inclusive, uint32_t*& ids, uint32_t& ids_length); diff --git a/src/filter_result_iterator.cpp b/src/filter_result_iterator.cpp index ee326b32..5fb432f2 100644 --- a/src/filter_result_iterator.cpp +++ b/src/filter_result_iterator.cpp @@ -892,13 +892,21 @@ void filter_result_iterator_t::init() { S2RegionTermIndexer::Options options; options.set_index_contains_points_only(true); S2RegionTermIndexer indexer(options); + auto const& geo_range_index = index->geo_range_index.at(a_filter.field_name); for (const auto& term : indexer.GetQueryTerms(*query_region, "")) { - auto geo_index = index->geopoint_index.at(a_filter.field_name); - const auto& ids_it = geo_index->find(term); - if(ids_it != geo_index->end()) { - geo_result_ids.insert(geo_result_ids.end(), ids_it->second.begin(), ids_it->second.end()); + auto cell = S2CellId::FromToken(term); + uint32_t* geo_ids = nullptr; + uint32_t geo_ids_length = 0; + + geo_range_index->search_geopoint(cell.id(), geo_ids, geo_ids_length); + + geo_result_ids.reserve(geo_result_ids.size() + geo_ids_length); + for (uint32_t i = 0; i < geo_ids_length; i++) { + geo_result_ids.push_back(geo_ids[i]); } + + delete [] geo_ids; } gfx::timsort(geo_result_ids.begin(), geo_result_ids.end()); diff --git a/src/index.cpp b/src/index.cpp index 949d3740..48b9b6bc 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -78,8 +78,7 @@ Index::Index(const std::string& name, const uint32_t collection_id, const Store* art_tree_init(t); search_index.emplace(a_field.name, t); } else if(a_field.is_geopoint()) { - auto 
field_geo_index = new spp::sparse_hash_map>(); - geopoint_index.emplace(a_field.name, field_geo_index); + geo_range_index.emplace(a_field.name, new NumericTrie(64)); if(!a_field.is_single_geopoint()) { spp::sparse_hash_map * doc_to_geos = new spp::sparse_hash_map(); @@ -141,12 +140,12 @@ Index::~Index() { search_index.clear(); - for(auto & name_index: geopoint_index) { + for(auto & name_index: geo_range_index) { delete name_index.second; name_index.second = nullptr; } - geopoint_index.clear(); + geo_range_index.clear(); for(auto& name_index: geo_array_index) { for(auto& kv: *name_index.second) { @@ -810,10 +809,10 @@ void Index::index_field_in_memory(const field& afield, std::vector num_tree->insert(value, seq_id); }); } else if(afield.type == field_types::GEOPOINT || afield.type == field_types::GEOPOINT_ARRAY) { - auto geo_index = geopoint_index.at(afield.name); + auto geopoint_range_index = geo_range_index.at(afield.name); iterate_and_index_numerical_field(iter_batch, afield, - [&afield, &geo_array_index=geo_array_index, geo_index](const index_record& record, uint32_t seq_id) { + [&afield, &geo_array_index=geo_array_index, geopoint_range_index](const index_record& record, uint32_t seq_id) { // nested geopoint value inside an array of object will be a simple array so must be treated as geopoint bool nested_obj_arr_geopoint = (afield.nested && afield.type == field_types::GEOPOINT_ARRAY && !record.doc[afield.name].empty() && record.doc[afield.name][0].is_number()); @@ -827,9 +826,8 @@ void Index::index_field_in_memory(const field& afield, std::vector S2RegionTermIndexer indexer(options); S2Point point = S2LatLng::FromDegrees(latlongs[li], latlongs[li+1]).ToPoint(); - for(const auto& term: indexer.GetIndexTerms(point, "")) { - (*geo_index)[term].push_back(seq_id); - } + auto cell = S2CellId(point); + geopoint_range_index->insert_geopoint(cell.id(), seq_id); } if(nested_obj_arr_geopoint) { @@ -857,9 +855,9 @@ void Index::index_field_in_memory(const field& afield, 
std::vector for(size_t li = 0; li < latlongs.size(); li++) { auto& latlong = latlongs[li]; S2Point point = S2LatLng::FromDegrees(latlong[0], latlong[1]).ToPoint(); - for(const auto& term: indexer.GetIndexTerms(point, "")) { - (*geo_index)[term].push_back(seq_id); - } + + auto cell = S2CellId(point); + geopoint_range_index->insert_geopoint(cell.id(), seq_id); int64_t packed_latlong = GeoPoint::pack_lat_lng(latlong[0], latlong[1]); packed_latlongs[li + 1] = packed_latlong; @@ -1589,7 +1587,7 @@ void Index::numeric_not_equals_filter(num_tree_t* const num_tree, bool Index::field_is_indexed(const std::string& field_name) const { return search_index.count(field_name) != 0 || numerical_index.count(field_name) != 0 || - geopoint_index.count(field_name) != 0; + geo_range_index.count(field_name) != 0; } void Index::aproximate_numerical_match(num_tree_t* const num_tree, @@ -5471,7 +5469,7 @@ void Index::remove_field(uint32_t seq_id, const nlohmann::json& document, const } } } else if(search_field.is_geopoint()) { - auto geo_index = geopoint_index[field_name]; + auto geopoint_range_index = geo_range_index[field_name]; S2RegionTermIndexer::Options options; options.set_index_contains_points_only(true); S2RegionTermIndexer indexer(options); @@ -5482,17 +5480,8 @@ void Index::remove_field(uint32_t seq_id, const nlohmann::json& document, const for(const std::vector& latlong: latlongs) { S2Point point = S2LatLng::FromDegrees(latlong[0], latlong[1]).ToPoint(); - for(const auto& term: indexer.GetIndexTerms(point, "")) { - auto term_it = geo_index->find(term); - if(term_it == geo_index->end()) { - continue; - } - std::vector& ids = term_it->second; - ids.erase(std::remove(ids.begin(), ids.end(), seq_id), ids.end()); - if(ids.empty()) { - geo_index->erase(term); - } - } + auto cell = S2CellId(point); + geopoint_range_index->delete_geopoint(cell.id(), seq_id); } if(!search_field.is_single_geopoint()) { @@ -5644,8 +5633,7 @@ void Index::refresh_schemas(const std::vector& new_fields, const 
std::vec art_tree_init(t); search_index.emplace(new_field.name, t); } else if(new_field.is_geopoint()) { - auto field_geo_index = new spp::sparse_hash_map>(); - geopoint_index.emplace(new_field.name, field_geo_index); + geo_range_index.emplace(new_field.name, new NumericTrie(64)); if(!new_field.is_single_geopoint()) { auto geo_array_map = new spp::sparse_hash_map(); geo_array_index.emplace(new_field.name, geo_array_map); @@ -5695,8 +5683,8 @@ void Index::refresh_schemas(const std::vector& new_fields, const std::vec delete search_index[del_field.name]; search_index.erase(del_field.name); } else if(del_field.is_geopoint()) { - delete geopoint_index[del_field.name]; - geopoint_index.erase(del_field.name); + delete geo_range_index[del_field.name]; + geo_range_index.erase(del_field.name); if(!del_field.is_single_geopoint()) { spp::sparse_hash_map* geo_array_map = geo_array_index[del_field.name]; diff --git a/src/numeric_range_trie.cpp b/src/numeric_range_trie.cpp index 9d9f4aa0..86090304 100644 --- a/src/numeric_range_trie.cpp +++ b/src/numeric_range_trie.cpp @@ -18,6 +18,30 @@ void NumericTrie::insert(const int64_t& value, const uint32_t& seq_id) { } } +void NumericTrie::insert_geopoint(const uint64_t& cell_id, const uint32_t& seq_id) { + if (positive_trie == nullptr) { + positive_trie = new NumericTrie::Node(); + } + + positive_trie->insert_geopoint(cell_id, seq_id, max_level); +} + +void NumericTrie::search_geopoint(const uint64_t& cell_id, uint32_t*& ids, uint32_t& ids_length) { + if (positive_trie == nullptr) { + return; + } + + positive_trie->search_geopoint(cell_id, max_level, ids, ids_length); +} + +void NumericTrie::delete_geopoint(const uint64_t& cell_id, uint32_t id) { + if (positive_trie == nullptr) { + return; + } + + positive_trie->delete_geopoint(cell_id, id, max_level); +} + void NumericTrie::search_range(const int64_t& low, const bool& low_inclusive, const int64_t& high, const bool& high_inclusive, uint32_t*& ids, uint32_t& ids_length) { @@ -369,9 
+393,14 @@ NumericTrie::iterator_t NumericTrie::search_equal_to(const int64_t& value) { return NumericTrie::iterator_t(matches); } -void NumericTrie::Node::insert(const int64_t& value, const uint32_t& seq_id, const char& max_level) { +void NumericTrie::Node::insert(const int64_t& cell_id, const uint32_t& seq_id, const char& max_level) { char level = 0; - return insert_helper(value, seq_id, level, max_level); + return insert_helper(cell_id, seq_id, level, max_level); +} + +void NumericTrie::Node::insert_geopoint(const uint64_t& cell_id, const uint32_t& seq_id, const char& max_level) { + char level = 0; + return insert_geopoint_helper(cell_id, seq_id, level, max_level); } inline int get_index(const int64_t& value, const char& level, const char& max_level) { @@ -385,6 +414,10 @@ inline int get_index(const int64_t& value, const char& level, const char& max_le return (value >> (8 * (max_level - level))) & 0xFF; } +inline int get_geopoint_index(const uint64_t& cell_id, const char& level, const char& max_level) { + return (cell_id >> (8 * (max_level - level))) & 0xFF; +} + void NumericTrie::Node::insert_helper(const int64_t& value, const uint32_t& seq_id, char& level, const char& max_level) { if (level > max_level) { return; @@ -409,6 +442,85 @@ void NumericTrie::Node::insert_helper(const int64_t& value, const uint32_t& seq_ } } +void NumericTrie::Node::insert_geopoint_helper(const uint64_t& cell_id, const uint32_t& seq_id, char& level, + const char& max_level) { + if (level > max_level) { + return; + } + + // Root node contains all the sequence ids present in the tree. 
+ if (!seq_ids.contains(seq_id)) { + seq_ids.append(seq_id); + } + + if (++level <= max_level) { + if (children == nullptr) { + children = new NumericTrie::Node* [EXPANSE]{nullptr}; + } + + auto index = get_geopoint_index(cell_id, level, max_level); + if (children[index] == nullptr) { + children[index] = new NumericTrie::Node(); + } + + return children[index]->insert_geopoint_helper(cell_id, seq_id, level, max_level); + } +} + +char get_max_search_level(const uint64_t& cell_id, const char& max_level) { + // For cell id 0x47E66C3000000000, we only have to prefix match the top four bytes since the rest of the bytes are 0. + // So the max search level would be 4 in this case. + + uint64_t mask = 0xff; + char i = max_level; + while (((cell_id & mask) == 0) && --i > 0) { + mask <<= 8; + } + + return i; +} + +void NumericTrie::Node::search_geopoint(const uint64_t& cell_id, const char& max_index_level, + uint32_t*& ids, uint32_t& ids_length) { + char level = 1; + Node* root = this; + auto index = get_geopoint_index(cell_id, level, max_index_level); + auto max_search_level = get_max_search_level(cell_id, max_index_level); + + while (level < max_search_level) { + if (root->children == nullptr || root->children[index] == nullptr) { + return; + } + + root = root->children[index]; + index = get_geopoint_index(cell_id, ++level, max_index_level); + } + + root->get_all_ids(ids, ids_length); +} + +void NumericTrie::Node::delete_geopoint(const uint64_t& cell_id, uint32_t id, const char& max_level) { + char level = 1; + Node* root = this; + auto index = get_geopoint_index(cell_id, level, max_level); + + while (level < max_level) { + root->seq_ids.remove_value(id); + + if (root->children == nullptr || root->children[index] == nullptr) { + return; + } + + root = root->children[index]; + index = get_geopoint_index(cell_id, ++level, max_level); + } + + if (root->children != nullptr && root->children[index] != nullptr) { // must be && so a null children array is never indexed + delete root->children[index]; + root->children[index] = nullptr; + } 
+} + void NumericTrie::Node::get_all_ids(uint32_t*& ids, uint32_t& ids_length) { ids = seq_ids.uncompress(); ids_length = seq_ids.getLength();