Support array of geo points.

Kishore Nallan 2021-09-24 12:43:26 +05:30
parent 4596ca04c2
commit 043535ee6b
9 changed files with 526 additions and 152 deletions

CMakeLists.txt

@@ -58,7 +58,6 @@ include(cmake/GoogleTest.cmake)
include(cmake/TestResources.cmake)
include(cmake/Iconv.cmake)
include(cmake/Jemalloc.cmake)
include(cmake/h3.cmake)
include(cmake/s2.cmake)
include(cmake/lrucache.cmake)
include(cmake/kakasi.cmake)
@@ -92,7 +91,6 @@ include_directories(${DEP_ROOT_DIR}/${ICONV_NAME}/include)
include_directories(${DEP_ROOT_DIR}/${BRPC_NAME}/include)
include_directories(${DEP_ROOT_DIR}/${BRAFT_NAME}/include)
include_directories(${DEP_ROOT_DIR}/${JEMALLOC_NAME}/include/jemalloc)
include_directories(${DEP_ROOT_DIR}/${H3_NAME}/build/src/h3lib/include)
include_directories(${DEP_ROOT_DIR}/${S2_NAME}/src)
include_directories(${DEP_ROOT_DIR}/${LRUCACHE_NAME}/include)
include_directories(${DEP_ROOT_DIR}/${KAKASI_NAME}/build/include)
@@ -105,7 +103,6 @@ link_directories(${DEP_ROOT_DIR}/${H2O_NAME}/build)
link_directories(${DEP_ROOT_DIR}/${ROCKSDB_NAME})
link_directories(${DEP_ROOT_DIR}/${ICONV_NAME}/lib/.libs)
link_directories(${DEP_ROOT_DIR}/${JEMALLOC_NAME}/lib)
link_directories(${DEP_ROOT_DIR}/${H3_NAME}/build/lib)
link_directories(${DEP_ROOT_DIR}/${S2_NAME}/build)
link_directories(${DEP_ROOT_DIR}/${KAKASI_NAME}/build/lib)
@@ -156,7 +153,7 @@ else()
endif()
set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIES})
set(CORE_LIBS kakasi h2o-evloop braft brpc iconv ${ICU_ALL_LIBRARIES} ${CURL_LIBRARIES} for s2 h3
set(CORE_LIBS kakasi h2o-evloop braft brpc iconv ${ICU_ALL_LIBRARIES} ${CURL_LIBRARIES} for s2
${LevelDB_LIBRARIES} ${ROCKSDB_LIBS}
glog ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${STACKTRACE_LIBS}
${OPENSSL_LIBRARIES} ${ZLIB_LIBRARIES} ${JEMALLOC_LIBRARIES}

cmake/h3.cmake

@@ -1,44 +0,0 @@
# Download and build H3
set(H3_VERSION 3.7.1)
set(H3_NAME h3-${H3_VERSION})
set(H3_TAR_PATH ${DEP_ROOT_DIR}/${H3_NAME}.tar.gz)
if(NOT EXISTS ${H3_TAR_PATH})
message(STATUS "Downloading https://github.com/uber/h3/archive/v${H3_VERSION}.tar.gz")
file(DOWNLOAD https://github.com/uber/h3/archive/v${H3_VERSION}.tar.gz ${H3_TAR_PATH})
endif()
if(NOT EXISTS ${DEP_ROOT_DIR}/${H3_NAME})
message(STATUS "Extracting ${H3_NAME}...")
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${H3_TAR_PATH} WORKING_DIRECTORY ${DEP_ROOT_DIR}/)
endif()
if(NOT EXISTS ${DEP_ROOT_DIR}/${H3_NAME}/build/lib/libh3.a)
message("Configuring ${H3_NAME}...")
file(REMOVE_RECURSE ${DEP_ROOT_DIR}/${H3_NAME}/build)
file(MAKE_DIRECTORY ${DEP_ROOT_DIR}/${H3_NAME}/build)
execute_process(COMMAND ${CMAKE_COMMAND}
"-DCMAKE_FIND_LIBRARY_SUFFIXES=.a"
"-H${DEP_ROOT_DIR}/${H3_NAME}"
"-B${DEP_ROOT_DIR}/${H3_NAME}/build"
RESULT_VARIABLE
H3_CONFIGURE)
if(NOT H3_CONFIGURE EQUAL 0)
message(FATAL_ERROR "${H3_NAME} configure failed!")
endif()
if(BUILD_DEPS STREQUAL "yes")
message("Building ${H3_NAME} locally...")
execute_process(COMMAND ${CMAKE_COMMAND} --build
"${DEP_ROOT_DIR}/${H3_NAME}/build"
--target h3
RESULT_VARIABLE
H3_BUILD)
if(NOT H3_BUILD EQUAL 0)
message(FATAL_ERROR "${H3_NAME} build failed!")
endif()
endif()
endif()

include/field.h

@@ -24,6 +24,7 @@ namespace field_types {
static const std::string INT64_ARRAY = "int64[]";
static const std::string FLOAT_ARRAY = "float[]";
static const std::string BOOL_ARRAY = "bool[]";
static const std::string GEOPOINT_ARRAY = "geopoint[]";
static bool is_string_or_array(const std::string& type_def) {
return type_def == "string*";
@@ -70,6 +71,10 @@ struct field {
return (type == field_types::BOOL);
}
bool is_single_geopoint() const {
return (type == field_types::GEOPOINT);
}
bool is_integer() const {
return (type == field_types::INT32 || type == field_types::INT32_ARRAY ||
type == field_types::INT64 || type == field_types::INT64_ARRAY);
@@ -92,7 +97,7 @@ struct field {
}
bool is_geopoint() const {
return (type == field_types::GEOPOINT);
return (type == field_types::GEOPOINT || type == field_types::GEOPOINT_ARRAY);
}
bool is_string() const {
@@ -106,7 +111,8 @@ struct field {
bool is_array() const {
return (type == field_types::STRING_ARRAY || type == field_types::INT32_ARRAY ||
type == field_types::FLOAT_ARRAY ||
type == field_types::INT64_ARRAY || type == field_types::BOOL_ARRAY);
type == field_types::INT64_ARRAY || type == field_types::BOOL_ARRAY ||
type == field_types::GEOPOINT_ARRAY);
}
bool is_singular() const {

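Taken together, the helpers above give indexing code a clean three-way dispatch: is_geopoint() now matches both the scalar and array variants, is_single_geopoint() isolates the scalar case, and is_array() covers geopoint[] alongside the other array types. A minimal sketch of how a caller can branch on the new type (the function here is hypothetical; the helpers are the ones declared above):

    // Dispatch sketch using the new field helpers.
    void handle_geo(const field& f) {
        if(f.is_geopoint()) {
            if(f.is_single_geopoint()) {
                // value is a single [lat, lng] pair
            } else {
                // geopoint[]: value is an array of [lat, lng] pairs
            }
        }
    }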
include/index.h

@@ -15,7 +15,6 @@
#include <field.h>
#include <option.h>
#include <set>
#include <h3api.h>
#include "string_utils.h"
#include "num_tree.h"
#include "magic_enum.hpp"
@@ -385,6 +384,9 @@ private:
// sort_field => (seq_id => value)
spp::sparse_hash_map<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> sort_index;
// geo_array_field => (seq_id => values) used for exact filtering of geo array records
spp::sparse_hash_map<std::string, spp::sparse_hash_map<uint32_t, int64_t*>*> geo_array_index;
// this is used for wildcard queries
sorted_array seq_ids;
@@ -632,11 +634,11 @@ public:
const std::map<std::string, field> & facet_schema,
const std::string& fallback_field_type);
static bool is_point_in_polygon(const Geofence& poly, const GeoCoord& point);
//static bool is_point_in_polygon(const Geofence& poly, const GeoCoord& point);
static double transform_for_180th_meridian(Geofence& poly);
//static double transform_for_180th_meridian(Geofence& poly);
static void transform_for_180th_meridian(GeoCoord& point, double offset);
//static void transform_for_180th_meridian(GeoCoord& point, double offset);
art_leaf* get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len);
@@ -644,8 +646,6 @@ public:
void refresh_schemas(const std::vector<field>& new_fields);
bool field_contains_string(const std::string& field_name, const std::string& value);
// the following methods are not synchronized because their parent calls are synchronized or they are const/static
static Option<uint32_t> validate_index_in_memory(nlohmann::json &document, uint32_t seq_id,
@@ -670,5 +670,9 @@ public:
void curate_filtered_ids(const std::vector<filter>& filters, const std::set<uint32_t>& curated_ids,
const uint32_t* exclude_token_ids, size_t exclude_token_ids_size, uint32_t*& filter_ids,
uint32_t& filter_ids_length, const std::vector<uint32_t>& curated_ids_sorted) const;
void populate_sort_mapping(int* sort_order, std::vector<size_t>& geopoint_indices,
const std::vector<sort_by>& sort_fields_std,
std::array<spp::sparse_hash_map<uint32_t, int64_t>*, 3>& field_values) const;
};
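For geopoint arrays, geo_array_index maps each document to a heap-allocated, length-prefixed int64_t buffer: slot 0 holds the point count and slots 1..n hold the packed lat/lng values (the indexing code in src/index.cpp below establishes this layout). A sketch of walking one entry, under that assumption:

    // Layout assumed: lat_lngs = [count, packed_1, packed_2, ..., packed_count]
    void visit_points(const int64_t* lat_lngs) {
        for(int64_t li = 0; li < lat_lngs[0]; li++) {
            int64_t packed_latlng = lat_lngs[li + 1];
            // GeoPoint::unpack_lat_lng(packed_latlng, s2_lat_lng) recovers the point
        }
    }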

src/collection.cpp

@@ -12,7 +12,6 @@
#include <system_metrics.h>
#include <tokenizer.h>
#include <collection_manager.h>
#include <h3api.h>
#include <regex>
#include <list>
#include <posting.h>

src/index.cpp

@@ -9,7 +9,6 @@
#include <string_utils.h>
#include <art.h>
#include <tokenizer.h>
#include <h3api.h>
#include <s2/s2point.h>
#include <s2/s2latlng.h>
#include <s2/s2region_term_indexer.h>
@@ -42,6 +41,11 @@ Index::Index(const std::string& name, const uint32_t collection_id, const Store*
} else if(fname_field.second.is_geopoint()) {
auto field_geo_index = new spp::sparse_hash_map<std::string, std::vector<uint32_t>>();
geopoint_index.emplace(fname_field.first, field_geo_index);
if(!fname_field.second.is_single_geopoint()) {
spp::sparse_hash_map<uint32_t, int64_t*> * doc_to_geos = new spp::sparse_hash_map<uint32_t, int64_t*>();
geo_array_index.emplace(fname_field.first, doc_to_geos);
}
} else {
num_tree_t* num_tree = new num_tree_t;
numerical_index.emplace(fname_field.first, num_tree);
@@ -56,8 +60,10 @@
}
for(const auto & pair: sort_schema) {
spp::sparse_hash_map<uint32_t, int64_t> * doc_to_score = new spp::sparse_hash_map<uint32_t, int64_t>();
sort_index.emplace(pair.first, doc_to_score);
if(pair.second.type != field_types::GEOPOINT_ARRAY) {
spp::sparse_hash_map<uint32_t, int64_t> * doc_to_score = new spp::sparse_hash_map<uint32_t, int64_t>();
sort_index.emplace(pair.first, doc_to_score);
}
}
for(const auto& pair: facet_schema) {
@@ -86,6 +92,17 @@ Index::~Index() {
geopoint_index.clear();
for(auto& name_index: geo_array_index) {
for(auto& kv: *name_index.second) {
delete [] kv.second;
}
delete name_index.second;
name_index.second = nullptr;
}
geo_array_index.clear();
for(auto & name_tree: numerical_index) {
delete name_tree.second;
name_tree.second = nullptr;
@@ -168,7 +185,7 @@ Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t
bool is_facet = (facet_schema.count(field_name) != 0);
// non-string, non-geo faceted field should be indexed as faceted string field as well
if(field_pair.second.facet && !field_pair.second.is_string() && field_pair.second.type != field_types::GEOPOINT) {
if(field_pair.second.facet && !field_pair.second.is_string() && !field_pair.second.is_geopoint()) {
art_tree *t = search_index.at(field_pair.second.faceted_name());
if(field_pair.second.is_array()) {
@@ -233,18 +250,47 @@ Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t
bool value = document[field_name];
num_tree->insert(value, seq_id);
} else if(field_pair.second.type == field_types::GEOPOINT) {
const std::vector<double>& latlong = document[field_name];
auto geo_index = geopoint_index.at(field_name);
S2RegionTermIndexer::Options options;
options.set_index_contains_points_only(true);
S2RegionTermIndexer indexer(options);
const std::vector<double>& latlong = document[field_name];
S2Point point = S2LatLng::FromDegrees(latlong[0], latlong[1]).ToPoint();
for(const auto& term: indexer.GetIndexTerms(point, "")) {
auto geo_index = geopoint_index.at(field_name);
(*geo_index)[term].push_back(seq_id);
}
} else if(field_pair.second.type == field_types::STRING_ARRAY) {
art_tree *t = search_index.at(field_name);
index_string_array_field(document[field_name], points, t, seq_id, is_facet, field_pair.second);
} else if(field_pair.second.type == field_types::GEOPOINT_ARRAY) {
const std::vector<std::vector<double>>& latlongs = document[field_name];
auto geo_index = geopoint_index.at(field_name);
S2RegionTermIndexer::Options options;
options.set_index_contains_points_only(true);
S2RegionTermIndexer indexer(options);
int64_t* packed_latlongs = new int64_t[latlongs.size() + 1];
packed_latlongs[0] = latlongs.size();
for(size_t li = 0; li < latlongs.size(); li++) {
auto& latlong = latlongs[li];
S2Point point = S2LatLng::FromDegrees(latlong[0], latlong[1]).ToPoint();
std::set<std::string> terms;
for(const auto& term: indexer.GetIndexTerms(point, "")) {
terms.insert(term);
}
for(const auto& term: terms) {
(*geo_index)[term].push_back(seq_id);
}
int64_t packed_latlong = GeoPoint::pack_lat_lng(latlong[0], latlong[1]);
packed_latlongs[li + 1] = packed_latlong;
}
geo_array_index.at(field_name)->emplace(seq_id, packed_latlongs);
}
else if(field_pair.second.is_array()) {
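Each pair in the hunk above is collapsed into one int64_t with GeoPoint::pack_lat_lng before being stored in the length-prefixed buffer. The encoding itself lives in GeoPoint and is not part of this diff; a plausible sketch of such a packing (an assumption, not the actual implementation) keeps latitude in the high 32 bits and longitude in the low 32 bits as fixed-point values:

    // Hypothetical packing: each coordinate as fixed-point with 7 decimal
    // places (~1 cm of resolution), halves joined into a single int64_t.
    int64_t pack_lat_lng_sketch(double lat, double lng) {
        int32_t lat_fp = static_cast<int32_t>(lat * 10000000.0);
        int32_t lng_fp = static_cast<int32_t>(lng * 10000000.0);
        return static_cast<int64_t>(
            (static_cast<uint64_t>(static_cast<uint32_t>(lat_fp)) << 32) |
            static_cast<uint32_t>(lng_fp));
    }

The per-point std::set simply deduplicates the covering terms returned by S2RegionTermIndexer::GetIndexTerms, so a posting list never records the same seq_id twice for a single point's terms.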
@@ -433,6 +479,18 @@ Option<uint32_t> Index::validate_index_in_memory(nlohmann::json& document, uint3
if (!coerce_op.ok()) {
return coerce_op;
}
} else if (a_field.type == field_types::GEOPOINT_ARRAY) {
if(!item.is_array() || item.size() != 2) {
return Option<>(400, "Field `" + field_name + "` must contain 2 element arrays: [ [lat, lng],... ].");
}
if(!(item[0].is_number() && item[1].is_number())) {
// one or more elements is not a number, try to coerce
Option<uint32_t> coerce_op = coerce_geopoint(dirty_values, a_field, document, field_name, it, true, array_ele_erased);
if(!coerce_op.ok()) {
return coerce_op;
}
}
}
if(!array_ele_erased) {
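Concretely, a geopoint[] value passes validation only as an array of 2-element numeric arrays. A comment-style sketch of payloads and outcomes, derived from the checks above and from the tests later in this commit:

    // "loc": [[13.22, 80.30], [12.98, 80.23]]  -> accepted
    // "loc": ["48.91", "2.33"]                  -> rejected: elements must be [lat, lng] arrays
    // "loc": [["foo", "bar"]]                   -> coercion attempted; rejected if not numeric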
@@ -938,24 +996,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array,
std::array<spp::sparse_hash_map<uint32_t, int64_t>*, 3> field_values;
std::vector<size_t> geopoint_indices;
for (size_t i = 0; i < sort_fields.size(); i++) {
sort_order[i] = 1;
if (sort_fields[i].order == sort_field_const::asc) {
sort_order[i] = -1;
}
if (sort_fields[i].name == sort_field_const::text_match) {
field_values[i] = &text_match_sentinel_value;
} else if (sort_fields[i].name == sort_field_const::seq_id) {
field_values[i] = &seq_id_sentinel_value;
} else if (sort_index.count(sort_fields[i].name) != 0) {
field_values[i] = sort_index.at(sort_fields[i].name);
if (sort_schema.at(sort_fields[i].name).is_geopoint()) {
geopoint_indices.push_back(i);
}
}
}
populate_sort_mapping(sort_order, geopoint_indices, sort_fields, field_values);
size_t combination_limit = exhaustive_search ? Index::COMBINATION_MAX_LIMIT : Index::COMBINATION_MIN_LIMIT;
@@ -1241,14 +1282,37 @@ void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length,
std::vector<uint32_t> exact_geo_result_ids;
for(auto result_id: geo_result_ids) {
int64_t lat_lng = sort_index.at(f.name)->at(result_id);
S2LatLng s2_lat_lng;
GeoPoint::unpack_lat_lng(lat_lng, s2_lat_lng);
if (!query_region->Contains(s2_lat_lng.ToPoint())) {
continue;
if(f.is_single_geopoint()) {
for(auto result_id: geo_result_ids) {
// no need to check for existence of `result_id` because of the indexer-based pre-filtering above
int64_t lat_lng = sort_index.at(f.name)->at(result_id);
S2LatLng s2_lat_lng;
GeoPoint::unpack_lat_lng(lat_lng, s2_lat_lng);
if (query_region->Contains(s2_lat_lng.ToPoint())) {
exact_geo_result_ids.push_back(result_id);
}
}
} else {
for(auto result_id: geo_result_ids) {
int64_t* lat_lngs = geo_array_index.at(f.name)->at(result_id);
bool point_found = false;
// any one point inside the region is enough
for(size_t li = 0; li < lat_lngs[0]; li++) {
int64_t lat_lng = lat_lngs[li + 1];
S2LatLng s2_lat_lng;
GeoPoint::unpack_lat_lng(lat_lng, s2_lat_lng);
if (query_region->Contains(s2_lat_lng.ToPoint())) {
point_found = true;
break;
}
}
if(point_found) {
exact_geo_result_ids.push_back(result_id);
}
}
exact_geo_result_ids.push_back(result_id);
}
std::sort(exact_geo_result_ids.begin(), exact_geo_result_ids.end());
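The array branch applies an "any point matches" rule: a document survives exact filtering if at least one of its points lies inside the query region. A standalone sketch of that predicate, assuming the length-prefixed layout described earlier (GeoPoint is the project's packing helper, header not shown):

    #include <s2/s2latlng.h>
    #include <s2/s2region.h>

    // True if any packed point of a document falls inside the query region.
    bool any_point_in_region(const int64_t* lat_lngs, const S2Region& query_region) {
        for(int64_t li = 0; li < lat_lngs[0]; li++) {
            S2LatLng s2_lat_lng;
            GeoPoint::unpack_lat_lng(lat_lngs[li + 1], s2_lat_lng);
            if(query_region.Contains(s2_lat_lng.ToPoint())) {
                return true;
            }
        }
        return false;
    }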
@@ -2335,29 +2399,11 @@ void Index::search_wildcard(const std::vector<std::string>& qtokens, const std::
const string& field, uint32_t*& all_result_ids, size_t& all_result_ids_len,
const uint32_t* filter_ids, uint32_t filter_ids_length) const {
// FIXME: duplicated
int sort_order[3]; // 1 or -1 based on DESC or ASC respectively
std::array<spp::sparse_hash_map<uint32_t, int64_t>*, 3> field_values;
std::vector<size_t> geopoint_indices;
for (size_t i = 0; i < sort_fields_std.size(); i++) {
sort_order[i] = 1;
if (sort_fields_std[i].order == sort_field_const::asc) {
sort_order[i] = -1;
}
if (sort_fields_std[i].name == sort_field_const::text_match) {
field_values[i] = &text_match_sentinel_value;
} else if (sort_fields_std[i].name == sort_field_const::seq_id) {
field_values[i] = &seq_id_sentinel_value;
} else if (sort_index.count(sort_fields_std[i].name) != 0) {
field_values[i] = sort_index.at(sort_fields_std[i].name);
if (sort_schema.at(sort_fields_std[i].name).is_geopoint()) {
geopoint_indices.push_back(i);
}
}
}
populate_sort_mapping(sort_order, geopoint_indices, sort_fields_std, field_values);
uint32_t token_bits = 255;
std::vector<posting_list_t::iterator_t> plists;
@@ -2379,6 +2425,34 @@ void Index::search_wildcard(const std::vector<std::string>& qtokens, const std::
all_result_ids = new_all_result_ids;
}
void Index::populate_sort_mapping(int* sort_order, std::vector<size_t>& geopoint_indices,
const std::vector<sort_by>& sort_fields_std,
std::array<spp::sparse_hash_map<uint32_t, int64_t>*, 3>& field_values) const {
for (size_t i = 0; i < sort_fields_std.size(); i++) {
sort_order[i] = 1;
if (sort_fields_std[i].order == sort_field_const::asc) {
sort_order[i] = -1;
}
if (sort_fields_std[i].name == sort_field_const::text_match) {
field_values[i] = &text_match_sentinel_value;
} else if (sort_fields_std[i].name == sort_field_const::seq_id) {
field_values[i] = &seq_id_sentinel_value;
} else if (sort_schema.count(sort_fields_std[i].name) != 0) {
if (sort_schema.at(sort_fields_std[i].name).type == field_types::GEOPOINT_ARRAY) {
geopoint_indices.push_back(i);
field_values[i] = nullptr; // GEOPOINT_ARRAY uses a multi-valued index
} else {
field_values[i] = sort_index.at(sort_fields_std[i].name);
if (sort_schema.at(sort_fields_std[i].name).is_geopoint()) {
geopoint_indices.push_back(i);
}
}
}
}
}
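This helper extracts the mapping loop that search_candidates() and search_wildcard() previously duplicated (see the removals in the two hunks above); both call sites now reduce to the same few lines:

    int sort_order[3];  // +1 for DESC, -1 for ASC
    std::array<spp::sparse_hash_map<uint32_t, int64_t>*, 3> field_values;
    std::vector<size_t> geopoint_indices;  // sort slots that need geo distances
    populate_sort_mapping(sort_order, geopoint_indices, sort_fields_std, field_values);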
/*
1. Split the query into tokens
2. Outer loop will generate bounded cartesian product with costs for each token
@@ -2633,18 +2707,37 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
for(auto& i: geopoint_indices) {
spp::sparse_hash_map<uint32_t, int64_t>* geopoints = field_values[i];
int64_t dist = INT32_MAX;
S2LatLng reference_lat_lng;
GeoPoint::unpack_lat_lng(sort_fields[i].geopoint, reference_lat_lng);
auto it = geopoints->find(seq_id);
int64_t dist = INT32_MAX;
if(geopoints != nullptr) {
auto it = geopoints->find(seq_id);
if(it != geopoints->end()) {
int64_t packed_latlng = it->second;
S2LatLng s2_lat_lng;
GeoPoint::unpack_lat_lng(packed_latlng, s2_lat_lng);
dist = GeoPoint::distance(s2_lat_lng, reference_lat_lng);
if(it != geopoints->end()) {
int64_t packed_latlng = it->second;
S2LatLng s2_lat_lng;
GeoPoint::unpack_lat_lng(packed_latlng, s2_lat_lng);
dist = GeoPoint::distance(s2_lat_lng, reference_lat_lng);
}
} else {
// nullptr indicates a geopoint array field
auto field_it = geo_array_index.at(sort_fields[i].name);
auto it = field_it->find(seq_id);
if(it != field_it->end()) {
int64_t* latlngs = it->second;
for(size_t li = 0; li < latlngs[0]; li++) {
S2LatLng s2_lat_lng;
int64_t packed_latlng = latlngs[li + 1];
GeoPoint::unpack_lat_lng(packed_latlng, s2_lat_lng);
int64_t this_dist = GeoPoint::distance(s2_lat_lng, reference_lat_lng);
if(this_dist < dist) {
dist = this_dist;
}
}
}
}
if(dist < sort_fields[i].exclude_radius) {
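In other words, a geopoint[] document sorts by its closest point: the loop above takes the minimum distance from any of the document's points to the reference location, with INT32_MAX as the "no value" sentinel. An equivalent standalone sketch, assuming the packed layout described earlier:

    #include <cstdint>

    // Minimum distance from any packed point to the reference location.
    int64_t min_geo_distance(const int64_t* latlngs, const S2LatLng& reference_lat_lng) {
        int64_t dist = INT32_MAX;  // sentinel: no point found
        for(int64_t li = 0; li < latlngs[0]; li++) {
            S2LatLng s2_lat_lng;
            GeoPoint::unpack_lat_lng(latlngs[li + 1], s2_lat_lng);
            int64_t this_dist = GeoPoint::distance(s2_lat_lng, reference_lat_lng);
            if(this_dist < dist) {
                dist = this_dist;
            }
        }
        return dist;
    }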
@@ -2939,13 +3032,31 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
options.set_index_contains_points_only(true);
S2RegionTermIndexer indexer(options);
const std::vector<double>& latlong = document[field_name];
S2Point point = S2LatLng::FromDegrees(latlong[0], latlong[1]).ToPoint();
for(const auto& term: indexer.GetIndexTerms(point, "")) {
std::vector<uint32_t>& ids = (*geo_index)[term];
ids.erase(std::remove(ids.begin(), ids.end(), seq_id), ids.end());
if(ids.empty()) {
geo_index->erase(term);
const std::vector<std::vector<double>>& latlongs = search_field.is_single_geopoint() ?
std::vector<std::vector<double>>{document[field_name].get<std::vector<double>>()} :
document[field_name].get<std::vector<std::vector<double>>>();
for(const std::vector<double>& latlong: latlongs) {
S2Point point = S2LatLng::FromDegrees(latlong[0], latlong[1]).ToPoint();
for(const auto& term: indexer.GetIndexTerms(point, "")) {
auto term_it = geo_index->find(term);
if(term_it == geo_index->end()) {
continue;
}
std::vector<uint32_t>& ids = term_it->second;
ids.erase(std::remove(ids.begin(), ids.end(), seq_id), ids.end());
if(ids.empty()) {
geo_index->erase(term);
}
}
}
if(!search_field.is_single_geopoint()) {
spp::sparse_hash_map<uint32_t, int64_t*>*& field_geo_array_map = geo_array_index.at(field_name);
auto geo_array_it = field_geo_array_map->find(seq_id);
if(geo_array_it != field_geo_array_map->end()) {
delete [] geo_array_it->second;
field_geo_array_map->erase(seq_id);
}
}
}
@@ -3017,6 +3128,10 @@ void Index::refresh_schemas(const std::vector<field>& new_fields) {
} else if(new_field.is_geopoint()) {
auto field_geo_index = new spp::sparse_hash_map<std::string, std::vector<uint32_t>>();
geopoint_index.emplace(new_field.name, field_geo_index);
if(!new_field.is_single_geopoint()) {
auto geo_array_map = new spp::sparse_hash_map<uint32_t, int64_t*>();
geo_array_index.emplace(new_field.name, geo_array_map);
}
} else {
num_tree_t* num_tree = new num_tree_t;
numerical_index.emplace(new_field.name, num_tree);
@@ -3290,7 +3405,7 @@ Option<uint32_t> Index::coerce_bool(const DIRTY_VALUES& dirty_values, const fiel
Option<uint32_t> Index::coerce_geopoint(const DIRTY_VALUES& dirty_values, const field& a_field, nlohmann::json &document,
const std::string &field_name,
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased) {
std::string suffix = is_array ? "a array of" : "a";
std::string suffix = is_array ? "an array of" : "a";
auto& item = is_array ? array_iter.value() : document[field_name];
if(dirty_values == DIRTY_VALUES::REJECT) {
@@ -3313,19 +3428,19 @@ Option<uint32_t> Index::coerce_geopoint(const DIRTY_VALUES& dirty_values, const
// try to value coerce into a geopoint
if(!document[field_name][0].is_number() && document[field_name][0].is_string()) {
if(StringUtils::is_float(document[field_name][0])) {
document[field_name][0] = std::stof(document[field_name][0].get<std::string>());
if(!item[0].is_number() && item[0].is_string()) {
if(StringUtils::is_float(item[0])) {
item[0] = std::stof(item[0].get<std::string>());
}
}
if(!document[field_name][1].is_number() && document[field_name][1].is_string()) {
if(StringUtils::is_float(document[field_name][1])) {
document[field_name][1] = std::stof(document[field_name][1].get<std::string>());
if(!item[1].is_number() && item[1].is_string()) {
if(StringUtils::is_float(item[1])) {
item[1] = std::stof(item[1].get<std::string>());
}
}
if(!document[field_name][0].is_number() || !document[field_name][1].is_number()) {
if(!item[0].is_number() || !item[1].is_number()) {
if(dirty_values == DIRTY_VALUES::COERCE_OR_DROP) {
if(!a_field.optional) {
return Option<>(400, "Field `" + field_name + "` must be " + suffix + " geopoint.");
@@ -3457,6 +3572,7 @@ void Index::scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_do
}
}
/*
// https://stackoverflow.com/questions/924171/geo-fencing-point-inside-outside-polygon
// NOTE: polygon and point should have been transformed with `transform_for_180th_meridian`
bool Index::is_point_in_polygon(const Geofence& poly, const GeoCoord &point) {
@@ -3511,16 +3627,4 @@ double Index::transform_for_180th_meridian(Geofence &poly) {
void Index::transform_for_180th_meridian(GeoCoord &point, double offset) {
point.lon = point.lon < 0.0 ? point.lon + offset : point.lon;
}
bool Index::field_contains_string(const std::string& field_name, const std::string& value) {
std::shared_lock lock(mutex);
auto field_it = search_index.find(field_name);
if(field_it != search_index.end()) {
art_tree* t = field_it->second;
art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char *)value.c_str(), value.size()+1);
return (leaf != nullptr);
}
return false;
}
*/
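The hand-rolled polygon helpers above are retired rather than ported because S2 models the Earth as a unit sphere, so a loop built from the same vertices needs no 180th-meridian transform. A sketch mirroring the commented-out code (only one vertex of the original polygon survives in this excerpt; the query point is illustrative):

    #include <s2/s2loop.h>
    #include <s2/s2latlng.h>

    std::vector<S2Point> vertices;
    vertices.push_back(S2LatLng::FromDegrees(67.63378886620751, 179.87924212491276).ToPoint());
    // ... push the remaining polygon vertices the same way ...
    S2Loop region(vertices);
    // Contains() works across the antimeridian with no coordinate shifting:
    bool inside = region.Contains(S2LatLng::FromDegrees(67.6, 179.9).ToPoint());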

test/collection_filtering_test.cpp

@@ -4,7 +4,6 @@
#include <fstream>
#include <algorithm>
#include <collection_manager.h>
#include <h3api.h>
#include "collection.h"
class CollectionFilteringTest : public ::testing::Test {
@@ -1069,10 +1068,151 @@ TEST_F(CollectionFilteringTest, GeoPointFiltering) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionFilteringTest, GeoPointArrayFiltering) {
Collection *coll1;
std::vector<field> fields = {field("title", field_types::STRING, false),
field("loc", field_types::GEOPOINT_ARRAY, false),
field("points", field_types::INT32, false),};
coll1 = collectionManager.get_collection("coll1").get();
if(coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
}
std::vector<std::vector<std::vector<std::string>>> records = {
{ {"Alpha Inc", "Ennore", "13.22112, 80.30511"},
{"Alpha Inc", "Velachery", "12.98973, 80.23095"}
},
{
{"Veera Inc", "Thiruvallur", "13.12752, 79.90136"},
},
{
{"B1 Inc", "Bengaluru", "12.98246, 77.5847"},
{"B1 Inc", "Hosur", "12.74147, 77.82915"},
{"B1 Inc", "Vellore", "12.91866, 79.13075"},
},
{
{"M Inc", "Nashik", "20.11282, 73.79458"},
{"M Inc", "Pune", "18.56309, 73.855"},
}
};
for(size_t i=0; i<records.size(); i++) {
nlohmann::json doc;
doc["id"] = std::to_string(i);
doc["title"] = records[i][0][0];
doc["points"] = i;
std::vector<std::vector<double>> lat_lngs;
for(size_t k = 0; k < records[i].size(); k++) {
std::vector<std::string> lat_lng_str;
StringUtils::split(records[i][k][2], lat_lng_str, ", ");
std::vector<double> lat_lng = {
std::stod(lat_lng_str[0]),
std::stod(lat_lng_str[1])
};
lat_lngs.push_back(lat_lng);
}
doc["loc"] = lat_lngs;
auto add_op = coll1->add(doc.dump());
ASSERT_TRUE(add_op.ok());
}
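Each document therefore carries a nested array for loc; for record 0 above, doc.dump() produces JSON along these lines (keys in nlohmann's default alphabetical order):

    // {"id":"0","loc":[[13.22112,80.30511],[12.98973,80.23095]],"points":0,"title":"Alpha Inc"}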
// pick a location close to Chennai
auto results = coll1->search("*",
{}, "loc: (13.12631, 80.20252, 100 km)",
{}, {}, {0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
// pick a location close to none of the spots
results = coll1->search("*",
{}, "loc: (13.62601, 79.39559, 10 km)",
{}, {}, {0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(0, results["found"].get<size_t>());
// pick a large radius covering all points
results = coll1->search("*",
{}, "loc: (21.20714729927276, 78.99153966917213, 1000 km)",
{}, {}, {0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(4, results["found"].get<size_t>());
// 1 mile radius
results = coll1->search("*",
{}, "loc: (12.98941, 80.23073, 1 mi)",
{}, {}, {0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
// when geo field is formatted badly, show meaningful error
nlohmann::json bad_doc;
bad_doc["id"] = "1000";
bad_doc["title"] = "Test record";
bad_doc["loc"] = {"48.91", "2.33"};
bad_doc["points"] = 1000;
auto add_op = coll1->add(bad_doc.dump(), CREATE, "", DIRTY_VALUES::REJECT);
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Field `loc` must contain 2 element arrays: [ [lat, lng],... ].", add_op.error());
bad_doc["loc"] = "foobar";
add_op = coll1->add(bad_doc.dump(), CREATE, "", DIRTY_VALUES::REJECT);
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Field `loc` must be an array.", add_op.error());
bad_doc["loc"] = nlohmann::json::array();
nlohmann::json points = nlohmann::json::array();
points.push_back("foo");
points.push_back("bar");
bad_doc["loc"].push_back(points);
add_op = coll1->add(bad_doc.dump(), CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT);
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Field `loc` must be an array of geopoint.", add_op.error());
bad_doc["loc"][0][0] = "2.33";
bad_doc["loc"][0][1] = "bar";
add_op = coll1->add(bad_doc.dump(), CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT);
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Field `loc` must be an array of geopoint.", add_op.error());
bad_doc["loc"][0][0] = "foo";
bad_doc["loc"][0][1] = "2.33";
add_op = coll1->add(bad_doc.dump(), CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT);
ASSERT_FALSE(add_op.ok());
ASSERT_EQ("Field `loc` must be an array of geopoint.", add_op.error());
// under coercion mode, it should work
bad_doc["loc"][0][0] = "48.91";
bad_doc["loc"][0][1] = "2.33";
add_op = coll1->add(bad_doc.dump(), CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT);
ASSERT_TRUE(add_op.ok());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionFilteringTest, GeoPointRemoval) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("loc1", field_types::GEOPOINT, false),
field("loc2", field_types::GEOPOINT, false),
field("loc2", field_types::GEOPOINT_ARRAY, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
@@ -1081,7 +1221,8 @@ TEST_F(CollectionFilteringTest, GeoPointRemoval) {
doc["id"] = "0";
doc["title"] = "Palais Garnier";
doc["loc1"] = {48.872576479306765, 2.332291112241466};
doc["loc2"] = {48.84620987789056, 2.345152755563131};
doc["loc2"] = nlohmann::json::array();
doc["loc2"][0] = {48.84620987789056, 2.345152755563131};
doc["points"] = 100;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
@@ -1093,6 +1234,13 @@ TEST_F(CollectionFilteringTest, GeoPointRemoval) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
results = coll1->search("*",
{}, "loc2: (48.87491151802846, 2.343945883701618, 10 km)",
{}, {}, {0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
// remove the document, index another document and try querying again
coll1->remove("0");
doc["id"] = "1";
@@ -1105,6 +1253,13 @@ TEST_F(CollectionFilteringTest, GeoPointRemoval) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
results = coll1->search("*",
{}, "loc2: (48.87491151802846, 2.343945883701618, 10 km)",
{}, {}, {0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
}
TEST_F(CollectionFilteringTest, GeoPolygonFiltering) {

test/collection_sorting_test.cpp

@@ -4,7 +4,6 @@
#include <fstream>
#include <algorithm>
#include <collection_manager.h>
#include <h3api.h>
#include "collection.h"
class CollectionSortingTest : public ::testing::Test {
@@ -860,3 +859,157 @@ TEST_F(CollectionSortingTest, GeoPointSortingWithPrecision) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSortingTest, GeoPointAsOptionalField) {
Collection* coll1;
std::vector<field> fields = {field("title", field_types::STRING, false),
field("loc", field_types::GEOPOINT, false, true),
field("points", field_types::INT32, false),};
coll1 = collectionManager.get_collection("coll1").get();
if (coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
}
std::vector<std::vector<std::string>> records = {
{"Tibetan Colony", "32.24678, 77.19239"},
{"Civil Hospital", "32.23959, 77.18763"},
{"Johnson Lodge", "32.24751, 77.18814"},
{"Lion King Rock", "32.24493, 77.17038"},
{"Jai Durga Handloom", "32.25749, 77.17583"},
{"Panduropa", "32.26059, 77.21798"},
{"Police Station", "32.23743, 77.18639"},
{"Panduropa Post", "32.26263, 77.2196"},
};
for (size_t i = 0; i < records.size(); i++) {
nlohmann::json doc;
std::vector<std::string> lat_lng;
StringUtils::split(records[i][1], lat_lng, ", ");
double lat = std::stod(lat_lng[0]);
double lng = std::stod(lat_lng[1]);
doc["id"] = std::to_string(i);
doc["title"] = records[i][0];
if(i != 2) {
doc["loc"] = {lat, lng};
}
doc["points"] = i;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
std::vector<sort_by> geo_sort_fields = {
sort_by("loc(32.24348, 77.1893, precision: 0.9 km)", "ASC"),
sort_by("points", "DESC"),
};
auto results = coll1->search("*",
{}, "loc: (32.24348, 77.1893, 20 km)",
{}, geo_sort_fields, {0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(7, results["found"].get<size_t>());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSortingTest, GeoPointArraySorting) {
Collection *coll1;
std::vector<field> fields = {field("title", field_types::STRING, false),
field("loc", field_types::GEOPOINT_ARRAY, false),
field("points", field_types::INT32, false),};
coll1 = collectionManager.get_collection("coll1").get();
if(coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
}
std::vector<std::vector<std::vector<std::string>>> records = {
{ {"Alpha Inc", "Ennore", "13.22112, 80.30511"},
{"Alpha Inc", "Velachery", "12.98973, 80.23095"}
},
{
{"Veera Inc", "Thiruvallur", "13.12752, 79.90136"},
},
{
{"B1 Inc", "Bengaluru", "12.98246, 77.5847"},
{"B1 Inc", "Hosur", "12.74147, 77.82915"},
{"B1 Inc", "Vellore", "12.91866, 79.13075"},
},
{
{"M Inc", "Nashik", "20.11282, 73.79458"},
{"M Inc", "Pune", "18.56309, 73.855"},
}
};
for(size_t i=0; i<records.size(); i++) {
nlohmann::json doc;
doc["id"] = std::to_string(i);
doc["title"] = records[i][0][0];
doc["points"] = i;
std::vector<std::vector<double>> lat_lngs;
for(size_t k = 0; k < records[i].size(); k++) {
std::vector<std::string> lat_lng_str;
StringUtils::split(records[i][k][2], lat_lng_str, ", ");
std::vector<double> lat_lng = {
std::stod(lat_lng_str[0]),
std::stod(lat_lng_str[1])
};
lat_lngs.push_back(lat_lng);
}
doc["loc"] = lat_lngs;
auto add_op = coll1->add(doc.dump());
ASSERT_TRUE(add_op.ok());
}
std::vector<sort_by> geo_sort_fields = {
sort_by("loc(13.12631, 80.20252)", "ASC"),
sort_by("points", "DESC"),
};
// pick a location close to Chennai
auto results = coll1->search("*",
{}, "loc: (13.12631, 80.20252, 100 km)",
{}, geo_sort_fields, {0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
// pick a large radius covering all points
geo_sort_fields = {
sort_by("loc(13.03388, 79.25868)", "ASC"),
sort_by("points", "DESC"),
};
results = coll1->search("*",
{}, "loc: (13.03388, 79.25868, 1000 km)",
{}, geo_sort_fields, {0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(4, results["found"].get<size_t>());
ASSERT_STREQ("2", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][2]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("3", results["hits"][3]["document"]["id"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}

test/index_test.cpp

@@ -63,7 +63,7 @@ TEST(IndexTest, ScrubReindexDoc) {
pool.shutdown();
}
TEST(IndexTest, PointInPolygon180thMeridian) {
/*TEST(IndexTest, PointInPolygon180thMeridian) {
// somewhere in far eastern russia
GeoCoord verts[3] = {
{67.63378886620751, 179.87924212491276},
@@ -72,13 +72,13 @@ };
};
/*std::vector<S2Point> vertices;
*//*std::vector<S2Point> vertices;
for(size_t point_index = 0; point_index < 4; point_index++) {
S2Point vertex = S2LatLng::FromDegrees(verts[point_index].lat, verts[point_index].lon).ToPoint();
vertices.emplace_back(vertex);
}
S2Loop region(vertices);*/
S2Loop region(vertices);*//*
Geofence poly1{3, verts};
double offset = Index::transform_for_180th_meridian(poly1);
@@ -96,19 +96,19 @@ Index::transform_for_180th_meridian(point4, offset);
Index::transform_for_180th_meridian(point4, offset);
Index::transform_for_180th_meridian(point5, offset);
/*ASSERT_TRUE(region.Contains(S2LatLng::FromDegrees(point1.lat, point1.lon).ToPoint()));
*//*ASSERT_TRUE(region.Contains(S2LatLng::FromDegrees(point1.lat, point1.lon).ToPoint()));
ASSERT_TRUE(region.Contains(S2LatLng::FromDegrees(point2.lat, point2.lon).ToPoint()));
ASSERT_TRUE(region.Contains(S2LatLng::FromDegrees(point3.lat, point3.lon).ToPoint()));
ASSERT_FALSE(region.Contains(S2LatLng::FromDegrees(point4.lat, point4.lon).ToPoint()));
ASSERT_FALSE(region.Contains(S2LatLng::FromDegrees(point5.lat, point5.lon).ToPoint()));
*/
*//*
ASSERT_TRUE(Index::is_point_in_polygon(poly1, point1));
ASSERT_TRUE(Index::is_point_in_polygon(poly1, point2));
ASSERT_TRUE(Index::is_point_in_polygon(poly1, point3));
ASSERT_FALSE(Index::is_point_in_polygon(poly1, point4));
ASSERT_FALSE(Index::is_point_in_polygon(poly1, point5));
}
}*/
TEST(IndexTest, GeoPointPackUnpack) {
std::vector<std::pair<double, double>> latlngs = {