Skip exact filtering beyond threshold in case of geo polygon.

This commit is contained in:
Harpreet Sangar 2023-05-23 10:09:28 +05:30
parent 4c2811f46d
commit b825ce752a
3 changed files with 69 additions and 54 deletions

View File

@ -172,7 +172,8 @@ Option<bool> validate_geofilter_distance(std::string& raw_value, const string& f
Option<bool> filter::parse_geopoint_filter_value(string& raw_value, const string& format_err_msg, filter& filter_exp) {
// FORMAT:
// [ ([48.853, 2.344], radius: 1km, exact_filter_radius: 100km), ([48.8662, 2.3255, 48.8581, 2.3209, 48.8561, 2.3448, 48.8641, 2.3469]) ]
// [ ([48.853, 2.344], radius: 1km, exact_filter_radius: 100km),
// ([48.8662, 2.3255, 48.8581, 2.3209, 48.8561, 2.3448, 48.8641, 2.3469], exact_filter_radius: 100km) ]
// Every open parenthesis represent a geo filter value.
auto open_parenthesis_count = std::count(raw_value.begin(), raw_value.end(), '(');
@ -181,11 +182,9 @@ Option<bool> filter::parse_geopoint_filter_value(string& raw_value, const string
}
filter_exp.comparators.push_back(LESS_THAN_EQUALS);
bool is_multivalued = raw_value[0] == '[';
bool is_multivalued = open_parenthesis_count > 1;
size_t i = is_multivalued;
// Adding polygonal values at last since they don't have any parameters associated with them.
std::vector<std::string> polygons;
for (auto j = 0; j < open_parenthesis_count; j++) {
if (is_multivalued) {
auto pos = raw_value.find('(', i);
@ -206,57 +205,55 @@ Option<bool> filter::parse_geopoint_filter_value(string& raw_value, const string
}
// [48.853, 2.344], radius: 1km, exact_filter_radius: 100km
// [48.8662, 2.3255, 48.8581, 2.3209, 48.8561, 2.3448, 48.8641, 2.3469]
// [48.8662, 2.3255, 48.8581, 2.3209, 48.8561, 2.3448, 48.8641, 2.3469], exact_filter_radius: 100km
std::string value_str = raw_value.substr(i, value_end_index - i);
StringUtils::trim(value_str);
if (value_str.empty() || value_str[0] != '[' || value_str.find(']', 1) == std::string::npos) {
return Option<bool>(400, format_err_msg);
} else {
std::vector<std::string> filter_values;
StringUtils::split(value_str, filter_values, ",");
if(filter_values.size() < 3) {
return Option<bool>(400, format_err_msg);
}
}
auto points_str = value_str.substr(1, value_str.find(']', 1) - 1);
std::vector<std::string> geo_points;
StringUtils::split(points_str, geo_points, ",");
bool is_polygon = value_str.back() == ']';
if (geo_points.size() < 2 || geo_points.size() % 2) {
return Option<bool>(400, format_err_msg);
}
bool is_polygon = geo_points.size() > 2;
for (const auto& geo_point: geo_points) {
if (!StringUtils::is_float(geo_point) ||
(!is_polygon && (geo_point == "nan" || geo_point == "NaN"))) {
if (geo_point == "nan" || geo_point == "NaN" || !StringUtils::is_float(geo_point)) {
return Option<bool>(400, format_err_msg);
}
}
if (is_polygon) {
polygons.push_back(points_str);
continue;
filter_exp.values.push_back(points_str);
}
// Handle options.
// , radius: 1km, exact_filter_radius: 100km
i = raw_value.find(']', i);
i++;
i = raw_value.find(']', i) + 1;
std::vector<std::string> options;
StringUtils::split(raw_value.substr(i, value_end_index - i), options, ",");
if (options.empty()) {
// Missing radius option
return Option<bool>(400, format_err_msg);
if (!is_polygon) {
// Missing radius option
return Option<bool>(400, format_err_msg);
}
nlohmann::json param;
param[EXACT_GEO_FILTER_RADIUS_KEY] = DEFAULT_EXACT_GEO_FILTER_RADIUS_VALUE;
filter_exp.params.push_back(param);
continue;
}
bool is_radius_present = false;
for (auto const& option: options) {
if (option.empty()) {
continue;
}
std::vector<std::string> key_value;
StringUtils::split(option, key_value, ":");
@ -264,7 +261,7 @@ Option<bool> filter::parse_geopoint_filter_value(string& raw_value, const string
continue;
}
if (key_value[0] == GEO_FILTER_RADIUS_KEY) {
if (key_value[0] == GEO_FILTER_RADIUS_KEY && !is_polygon) {
is_radius_present = true;
std::string distance, unit;
@ -293,10 +290,16 @@ Option<bool> filter::parse_geopoint_filter_value(string& raw_value, const string
nlohmann::json param;
param[EXACT_GEO_FILTER_RADIUS_KEY] = exact_under_radius;
filter_exp.params.push_back(param);
// Only EXACT_GEO_FILTER_RADIUS_KEY option would be present for a polygon. We can also stop if we've
// parsed the radius in case of a single geopoint since there are only two options.
if (is_polygon || is_radius_present) {
break;
}
}
}
if (!is_radius_present) {
if (!is_radius_present && !is_polygon) {
return Option<bool>(400, format_err_msg);
}
@ -308,10 +311,6 @@ Option<bool> filter::parse_geopoint_filter_value(string& raw_value, const string
}
}
for (auto const& polygon: polygons) {
filter_exp.values.push_back(polygon);
}
return Option<bool>(true);
}

View File

@ -765,7 +765,7 @@ void filter_result_iterator_t::init() {
bool is_polygon = StringUtils::is_float(filter_value_parts.back());
S2Region* query_region;
double radius;
double query_radius_meters;
if (is_polygon) {
const int num_verts = int(filter_value_parts.size()) / 2;
std::vector<S2Point> vertices;
@ -790,22 +790,24 @@ void filter_result_iterator_t::init() {
} else {
query_region = loop;
}
query_radius_meters = S2Earth::RadiansToMeters(query_region->GetCapBound().GetRadius().radians());
} else {
radius = std::stof(filter_value_parts[2]);
query_radius_meters = std::stof(filter_value_parts[2]);
const auto& unit = filter_value_parts[3];
if (unit == "km") {
radius *= 1000;
query_radius_meters *= 1000;
} else {
// assume "mi" (validated upstream)
radius *= 1609.34;
query_radius_meters *= 1609.34;
}
S1Angle query_radius = S1Angle::Radians(S2Earth::MetersToRadians(radius));
S1Angle query_radius_radians = S1Angle::Radians(S2Earth::MetersToRadians(query_radius_meters));
double query_lat = std::stod(filter_value_parts[0]);
double query_lng = std::stod(filter_value_parts[1]);
S2Point center = S2LatLng::FromDegrees(query_lat, query_lng).ToPoint();
query_region = new S2Cap(center, query_radius);
query_region = new S2Cap(center, query_radius_radians);
}
std::unique_ptr<S2Region> query_region_guard(query_region);
@ -825,8 +827,8 @@ void filter_result_iterator_t::init() {
geo_result_ids.erase(std::unique( geo_result_ids.begin(), geo_result_ids.end() ), geo_result_ids.end());
// Skip exact filtering step if query radius is greater than the threshold.
if (!is_polygon && fi < a_filter.params.size() &&
radius > a_filter.params[fi][filter::EXACT_GEO_FILTER_RADIUS_KEY].get<double>()) {
if (fi < a_filter.params.size() &&
query_radius_meters > a_filter.params[fi][filter::EXACT_GEO_FILTER_RADIUS_KEY].get<double>()) {
uint32_t* out = nullptr;
filter_result.count = ArrayUtils::or_scalar(geo_result_ids.data(), geo_result_ids.size(),
filter_result.docs, filter_result.count, &out);

View File

@ -85,7 +85,7 @@ TEST_F(GeoFilteringTest, GeoPointFiltering) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
results = coll1->search("*", {}, "loc: [([48.90615, 2.34358], radius: 1 km), ([48.8462, 2.34515], radius: 1 km)]",
{}, {}, {0}, 10, 1, FREQUENCY).get();
@ -115,9 +115,9 @@ TEST_F(GeoFilteringTest, GeoPointFiltering) {
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_STREQ("6", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("5", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("3", results["hits"][2]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ("6", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("5", results["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("3", results["hits"][2]["document"]["id"].get<std::string>());
// when geo query had NaN
auto gop = coll1->search("*", {}, "loc: ([NaN, nan], radius: 1 mi)",
@ -206,8 +206,8 @@ TEST_F(GeoFilteringTest, GeoPointArrayFiltering) {
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
// Default value of exact_filter_radius is 10km, exact filtering is not performed.
results = coll1->search("*",
@ -217,9 +217,9 @@ TEST_F(GeoFilteringTest, GeoPointArrayFiltering) {
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_EQ(3, results["hits"].size());
ASSERT_STREQ("2", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][2]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ("2", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][2]["document"]["id"].get<std::string>());
// pick location close to none of the spots
results = coll1->search("*",
@ -244,7 +244,7 @@ TEST_F(GeoFilteringTest, GeoPointArrayFiltering) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
@ -355,9 +355,9 @@ TEST_F(GeoFilteringTest, GeoPolygonFiltering) {
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_EQ(3, results["hits"].size());
ASSERT_STREQ("8", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("4", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][2]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ("8", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("4", results["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][2]["document"]["id"].get<std::string>());
// should work even if points of polygon are clockwise
@ -389,6 +389,8 @@ TEST_F(GeoFilteringTest, GeoPolygonFilteringSouthAmerica) {
std::vector<std::vector<std::string>> records = {
{"North of Equator", "4.48615, -71.38049"},
{"South of Equator", "-8.48587, -71.02892"},
{"North of Equator, outside polygon", "4.13377, -56.00459"},
{"South of Equator, outside polygon", "-4.5041, -57.34523"},
};
for(size_t i=0; i<records.size(); i++) {
@ -408,8 +410,7 @@ TEST_F(GeoFilteringTest, GeoPolygonFilteringSouthAmerica) {
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
// pick a polygon that covers both points
// polygon only covers 2 points but all points are returned since exact filtering is not performed.
auto results = coll1->search("*",
{}, "loc: ([13.3163, -82.3585, "
"-29.134, -82.3585, "
@ -417,9 +418,22 @@ TEST_F(GeoFilteringTest, GeoPolygonFilteringSouthAmerica) {
"13.3163, -59.8528])",
{}, {}, {0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(4, results["found"].get<size_t>());
ASSERT_EQ(4, results["hits"].size());
results = coll1->search("*",
{}, "loc: ([13.3163, -82.3585, "
"-29.134, -82.3585, "
"-29.134, -59.8528, "
"13.3163, -59.8528], exact_filter_radius: 2703km)",
{}, {}, {0}, 10, 1, FREQUENCY).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}