Support remove_matched_tokens=false when replace query is used.

This commit is contained in:
Kishore Nallan 2022-08-10 13:34:01 +05:30
parent dfa7191f32
commit 83ed38e3e6
4 changed files with 287 additions and 271 deletions

View File

@ -26,6 +26,7 @@
#include <tsl/htrie_map.h>
#include "id_list.h"
#include "synonym_index.h"
#include "override.h"
static constexpr size_t ARRAY_FACET_DIM = 4;
using facet_map_t = spp::sparse_hash_map<uint32_t, facet_hash_values_t>;
@ -78,277 +79,6 @@ struct query_tokens_t {
std::vector<std::vector<std::string>> q_synonyms;
};
// Holds one parsed override definition: the matching `rule` plus the actions
// (`includes`, `excludes`, `filter_by`, `sort_by`, `replace_query`, ...) that parse()
// reads from the JSON form and to_json() writes back.
struct override_t {
    // Declared here, defined outside this view. Presumably the values accepted for
    // `rule.match` -- TODO confirm against the definitions.
    static const std::string MATCH_EXACT;
    static const std::string MATCH_CONTAINS;

    // The `rule` object of the JSON definition.
    struct rule_t {
        std::string query;
        std::string match;
        // set by parse() when `query` contains a `{` that is followed later by a `}`
        bool dynamic_query = false;
    };

    // One entry of the `includes` array.
    struct add_hit_t {
        std::string doc_id;
        uint32_t position = 0;
    };

    // One entry of the `excludes` array.
    struct drop_hit_t {
        std::string doc_id;
    };

    std::string id;

    rule_t rule;
    std::vector<add_hit_t> add_hits;
    std::vector<drop_hit_t> drop_hits;

    std::string filter_by;
    // when the JSON omits this key, parse() sets it to true iff `filter_by` is present
    bool remove_matched_tokens = false;
    bool filter_curated_hits = false;

    bool stop_processing = true;

    std::string sort_by;

    std::string replace_query;

    // epoch seconds; -1 means "not set" (to_json() omits the key in that case)
    int64_t effective_from_ts = -1;
    int64_t effective_to_ts = -1;

    override_t() = default;

    // Parses and validates the JSON definition of an override into `override`.
    //
    // override_json : the definition; must be an object with a `rule` object holding `query`
    //                 and `match`, plus at least one of `includes`, `excludes`, `filter_by`,
    //                 `sort_by`, `remove_matched_tokens` or `replace_query`.
    // id            : identifier to assign; when empty, the `id` key of `override_json` is used.
    // override      : output object. It is filled in field by field, so when an error is
    //                 returned after the up-front validation it may already be partially
    //                 populated. `includes` / `excludes` entries are appended to whatever
    //                 `add_hits` / `drop_hits` already hold.
    //
    // Returns Option<bool>(true) on success, otherwise an Option carrying code 400 and a
    // message describing the first problem found. Values read as strings or numbers are
    // type-checked first, so a wrongly typed field produces that 400 result instead of an
    // exception from nlohmann::json::get<T>().
    static Option<bool> parse(const nlohmann::json& override_json, const std::string& id, override_t& override) {
        if(!override_json.is_object()) {
            return Option<bool>(400, "Bad JSON.");
        }

        if(override_json.count("rule") == 0 || !override_json["rule"].is_object()) {
            return Option<bool>(400, "Missing `rule` definition.");
        }

        if(override_json["rule"].count("query") == 0 || override_json["rule"].count("match") == 0) {
            return Option<bool>(400, "The `rule` definition must contain a `query` and `match`.");
        }

        // at least one action (or an explicit `remove_matched_tokens`) has to be present
        if(override_json.count("includes") == 0 && override_json.count("excludes") == 0 &&
           override_json.count("filter_by") == 0 && override_json.count("sort_by") == 0 &&
           override_json.count("remove_matched_tokens") == 0 &&
           override_json.count("replace_query") == 0) {
            return Option<bool>(400, "Must contain one of: `includes`, `excludes`, "
                                     "`filter_by`, `sort_by`, `remove_matched_tokens`, `replace_query`.");
        }

        if(override_json.count("includes") != 0) {
            if(!override_json["includes"].is_array()) {
                return Option<bool>(400, "The `includes` value must be an array.");
            }

            for(const auto & include_obj: override_json["includes"]) {
                if(!include_obj.is_object()) {
                    return Option<bool>(400, "The `includes` value must be an array of objects.");
                }

                if(include_obj.count("id") == 0 || include_obj.count("position") == 0) {
                    return Option<bool>(400, "Inclusion definition must define both `id` and `position` keys.");
                }

                if(!include_obj["id"].is_string()) {
                    return Option<bool>(400, "Inclusion `id` must be a string.");
                }

                // NOTE(review): a negative integer also satisfies is_number_integer() and is
                // later converted to uint32_t -- confirm whether it should be rejected here.
                if(!include_obj["position"].is_number_integer()) {
                    return Option<bool>(400, "Inclusion `position` must be an integer.");
                }
            }
        }

        if(override_json.count("excludes") != 0) {
            if(!override_json["excludes"].is_array()) {
                return Option<bool>(400, "The `excludes` value must be an array.");
            }

            for(const auto & exclude_obj: override_json["excludes"]) {
                if(!exclude_obj.is_object()) {
                    return Option<bool>(400, "The `excludes` value must be an array of objects.");
                }

                if(exclude_obj.count("id") == 0) {
                    return Option<bool>(400, "Exclusion definition must define an `id`.");
                }

                if(!exclude_obj["id"].is_string()) {
                    return Option<bool>(400, "Exclusion `id` must be a string.");
                }
            }
        }

        if(override_json.count("filter_by") != 0) {
            if(!override_json["filter_by"].is_string()) {
                return Option<bool>(400, "The `filter_by` must be a string.");
            }

            if(override_json["filter_by"].get<std::string>().empty()) {
                return Option<bool>(400, "The `filter_by` must be a non-empty string.");
            }
        }

        if(override_json.count("remove_matched_tokens") != 0) {
            if (!override_json["remove_matched_tokens"].is_boolean()) {
                return Option<bool>(400, "The `remove_matched_tokens` must be a boolean.");
            }
        }

        if(override_json.count("filter_curated_hits") != 0) {
            if (!override_json["filter_curated_hits"].is_boolean()) {
                return Option<bool>(400, "The `filter_curated_hits` must be a boolean.");
            }
        }

        if(override_json.count("stop_processing") != 0) {
            if (!override_json["stop_processing"].is_boolean()) {
                return Option<bool>(400, "The `stop_processing` must be a boolean.");
            }
        }

        // The remaining type checks sit right where each value is read, so inputs that were
        // already rejected by an earlier check keep getting exactly the same error.
        if(!id.empty()) {
            override.id = id;
        } else if(override_json.count("id") != 0) {
            if(!override_json["id"].is_string()) {
                return Option<bool>(400, "Override `id` must be a string.");
            }

            override.id = override_json["id"].get<std::string>();
        } else {
            return Option<bool>(400, "Override `id` not provided.");
        }

        if(!override_json["rule"]["query"].is_string() || !override_json["rule"]["match"].is_string()) {
            return Option<bool>(400, "The `rule` definition must contain a string `query` and `match`.");
        }

        override.rule.query = override_json["rule"]["query"].get<std::string>();
        override.rule.match = override_json["rule"]["match"].get<std::string>();

        if (override_json.count("includes") != 0) {
            for(const auto & include: override_json["includes"]) {
                add_hit_t add_hit;
                add_hit.doc_id = include["id"].get<std::string>();
                add_hit.position = include["position"].get<uint32_t>();
                override.add_hits.push_back(add_hit);
            }
        }

        if (override_json.count("excludes") != 0) {
            for(const auto & exclude: override_json["excludes"]) {
                drop_hit_t drop_hit;
                drop_hit.doc_id = exclude["id"].get<std::string>();
                override.drop_hits.push_back(drop_hit);
            }
        }

        if (override_json.count("filter_by") != 0) {
            override.filter_by = override_json["filter_by"].get<std::string>();
        }

        if (override_json.count("sort_by") != 0) {
            if(!override_json["sort_by"].is_string()) {
                return Option<bool>(400, "The `sort_by` must be a string.");
            }

            override.sort_by = override_json["sort_by"].get<std::string>();
        }

        if (override_json.count("replace_query") != 0) {
            // Only an enabled `remove_matched_tokens` conflicts with `replace_query`; an
            // explicit `false` is accepted. The value was verified to be a boolean above.
            if(override_json.count("remove_matched_tokens") != 0 && override_json["remove_matched_tokens"].get<bool>()) {
                return Option<bool>(400, "Only one of `replace_query` or `remove_matched_tokens` can be specified.");
            }

            if(!override_json["replace_query"].is_string()) {
                return Option<bool>(400, "The `replace_query` must be a string.");
            }

            override.replace_query = override_json["replace_query"].get<std::string>();
        }

        if(override_json.count("remove_matched_tokens") != 0) {
            override.remove_matched_tokens = override_json["remove_matched_tokens"].get<bool>();
        } else {
            // not specified: enabled exactly when a `filter_by` is present
            override.remove_matched_tokens = (override_json.count("filter_by") != 0);
        }

        if(override_json.count("filter_curated_hits") != 0) {
            override.filter_curated_hits = override_json["filter_curated_hits"].get<bool>();
        }

        if(override_json.count("stop_processing") != 0) {
            override.stop_processing = override_json["stop_processing"].get<bool>();
        }

        // is_number() (rather than is_number_integer()) keeps accepting every value that
        // get<int64_t>() converted before; only non-numeric values are turned into an error.
        if(override_json.count("effective_from_ts") != 0) {
            if(!override_json["effective_from_ts"].is_number()) {
                return Option<bool>(400, "The `effective_from_ts` must be a number.");
            }

            override.effective_from_ts = override_json["effective_from_ts"].get<int64_t>();
        }

        if(override_json.count("effective_to_ts") != 0) {
            if(!override_json["effective_to_ts"].is_number()) {
                return Option<bool>(400, "The `effective_to_ts` must be a number.");
            }

            override.effective_to_ts = override_json["effective_to_ts"].get<int64_t>();
        }

        // we have to also detect if it is a dynamic query rule
        // (the scan goes on after a match, so the query can be trimmed more than once)
        size_t i = 0;
        while(i < override.rule.query.size()) {
            if(override.rule.query[i] == '{') {
                // look for closing curly
                i++;
                while(i < override.rule.query.size()) {
                    if(override.rule.query[i] == '}') {
                        override.rule.dynamic_query = true;
                        // remove spaces around curlies
                        override.rule.query = StringUtils::trim_curly_spaces(override.rule.query);
                        break;
                    }
                    i++;
                }
            }
            i++;
        }

        return Option<bool>(true);
    }

    // Serialises this override to JSON. `id`, `rule`, `includes`, `excludes` and the three
    // boolean flags are always written; `filter_by`, `sort_by` and `replace_query` only when
    // non-empty, and the effective timestamps only when they differ from -1.
    nlohmann::json to_json() const {
        nlohmann::json override;
        override["id"] = id;
        override["rule"]["query"] = rule.query;
        override["rule"]["match"] = rule.match;

        override["includes"] = nlohmann::json::array();

        for(const auto & add_hit: add_hits) {
            nlohmann::json include;
            include["id"] = add_hit.doc_id;
            include["position"] = add_hit.position;
            override["includes"].push_back(include);
        }

        override["excludes"] = nlohmann::json::array();
        for(const auto & drop_hit: drop_hits) {
            nlohmann::json exclude;
            exclude["id"] = drop_hit.doc_id;
            override["excludes"].push_back(exclude);
        }

        if(!filter_by.empty()) {
            override["filter_by"] = filter_by;
        }

        if(!sort_by.empty()) {
            override["sort_by"] = sort_by;
        }

        if(!replace_query.empty()) {
            override["replace_query"] = replace_query;
        }

        if(effective_from_ts != -1) {
            override["effective_from_ts"] = effective_from_ts;
        }

        if(effective_to_ts != -1) {
            override["effective_to_ts"] = effective_to_ts;
        }

        override["remove_matched_tokens"] = remove_matched_tokens;
        override["filter_curated_hits"] = filter_curated_hits;
        override["stop_processing"] = stop_processing;

        return override;
    }
};
enum enable_t {
always,
fallback,

49
include/override.h Normal file
View File

@ -0,0 +1,49 @@
#pragma once
#include <string>
#include <json.hpp>
#include "option.h"
// Holds one parsed override definition: the matching `rule` plus the actions
// (`includes`, `excludes`, `filter_by`, `sort_by`, `replace_query`, ...) that parse()
// reads from the JSON form and to_json() writes back.
struct override_t {
// Declared here, defined outside this view. Presumably the values accepted for
// `rule.match` -- TODO confirm against the definitions.
static const std::string MATCH_EXACT;
static const std::string MATCH_CONTAINS;
// The `rule` object of the JSON definition.
struct rule_t {
std::string query;
std::string match;
// set by parse() when `query` contains a `{` that is followed later by a `}`
bool dynamic_query = false;
};
// One entry of the `includes` array.
struct add_hit_t {
std::string doc_id;
uint32_t position = 0;
};
// One entry of the `excludes` array.
struct drop_hit_t {
std::string doc_id;
};
// taken from the `id` argument of parse(), or from the JSON `id` key when that argument is empty
std::string id;
rule_t rule;
// filled from `includes` / `excludes`; parse() appends to these vectors
std::vector<add_hit_t> add_hits;
std::vector<drop_hit_t> drop_hits;
std::string filter_by;
// when the JSON omits this key, parse() sets it to true iff `filter_by` is present
bool remove_matched_tokens = false;
bool filter_curated_hits = false;
bool stop_processing = true;
std::string sort_by;
// parse() rejects a definition that combines this with `remove_matched_tokens: true`
std::string replace_query;
// epoch seconds; -1 means "not set" (to_json() omits the key in that case)
int64_t effective_from_ts = -1;
int64_t effective_to_ts = -1;
override_t() = default;
// Validates `override_json` and fills `override` from it; when `id` is non-empty it takes
// precedence over the JSON `id` key. Returns Option<bool>(true) on success, otherwise an
// Option carrying code 400 and an error message.
static Option<bool> parse(const nlohmann::json& override_json, const std::string& id, override_t& override);
// Serialises this override to JSON; empty strings and -1 timestamps are left out.
nlohmann::json to_json() const;
};

232
src/override.cpp Normal file
View File

@ -0,0 +1,232 @@
#include <string_utils.h>
#include "override.h"
// Parses and validates the JSON definition of an override into `override`.
//
// override_json : the definition; must be an object with a `rule` object holding `query`
//                 and `match`, plus at least one of `includes`, `excludes`, `filter_by`,
//                 `sort_by`, `remove_matched_tokens` or `replace_query`.
// id            : identifier to assign; when empty, the `id` key of `override_json` is used.
// override      : output object. It is filled in field by field, so when an error is
//                 returned after the up-front validation it may already be partially
//                 populated. `includes` / `excludes` entries are appended to whatever
//                 `add_hits` / `drop_hits` already hold.
//
// Returns Option<bool>(true) on success, otherwise an Option carrying code 400 and a
// message describing the first problem found. Values read as strings or numbers are
// type-checked first, so a wrongly typed field produces that 400 result instead of an
// exception from nlohmann::json::get<T>().
Option<bool> override_t::parse(const nlohmann::json& override_json, const std::string& id, override_t& override) {
    if(!override_json.is_object()) {
        return Option<bool>(400, "Bad JSON.");
    }

    if(override_json.count("rule") == 0 || !override_json["rule"].is_object()) {
        return Option<bool>(400, "Missing `rule` definition.");
    }

    if(override_json["rule"].count("query") == 0 || override_json["rule"].count("match") == 0) {
        return Option<bool>(400, "The `rule` definition must contain a `query` and `match`.");
    }

    // at least one action (or an explicit `remove_matched_tokens`) has to be present
    if(override_json.count("includes") == 0 && override_json.count("excludes") == 0 &&
       override_json.count("filter_by") == 0 && override_json.count("sort_by") == 0 &&
       override_json.count("remove_matched_tokens") == 0 &&
       override_json.count("replace_query") == 0) {
        return Option<bool>(400, "Must contain one of: `includes`, `excludes`, "
                                 "`filter_by`, `sort_by`, `remove_matched_tokens`, `replace_query`.");
    }

    if(override_json.count("includes") != 0) {
        if(!override_json["includes"].is_array()) {
            return Option<bool>(400, "The `includes` value must be an array.");
        }

        for(const auto & include_obj: override_json["includes"]) {
            if(!include_obj.is_object()) {
                return Option<bool>(400, "The `includes` value must be an array of objects.");
            }

            if(include_obj.count("id") == 0 || include_obj.count("position") == 0) {
                return Option<bool>(400, "Inclusion definition must define both `id` and `position` keys.");
            }

            if(!include_obj["id"].is_string()) {
                return Option<bool>(400, "Inclusion `id` must be a string.");
            }

            // NOTE(review): a negative integer also satisfies is_number_integer() and is
            // later converted to uint32_t -- confirm whether it should be rejected here.
            if(!include_obj["position"].is_number_integer()) {
                return Option<bool>(400, "Inclusion `position` must be an integer.");
            }
        }
    }

    if(override_json.count("excludes") != 0) {
        if(!override_json["excludes"].is_array()) {
            return Option<bool>(400, "The `excludes` value must be an array.");
        }

        for(const auto & exclude_obj: override_json["excludes"]) {
            if(!exclude_obj.is_object()) {
                return Option<bool>(400, "The `excludes` value must be an array of objects.");
            }

            if(exclude_obj.count("id") == 0) {
                return Option<bool>(400, "Exclusion definition must define an `id`.");
            }

            if(!exclude_obj["id"].is_string()) {
                return Option<bool>(400, "Exclusion `id` must be a string.");
            }
        }
    }

    if(override_json.count("filter_by") != 0) {
        if(!override_json["filter_by"].is_string()) {
            return Option<bool>(400, "The `filter_by` must be a string.");
        }

        if(override_json["filter_by"].get<std::string>().empty()) {
            return Option<bool>(400, "The `filter_by` must be a non-empty string.");
        }
    }

    if(override_json.count("remove_matched_tokens") != 0) {
        if (!override_json["remove_matched_tokens"].is_boolean()) {
            return Option<bool>(400, "The `remove_matched_tokens` must be a boolean.");
        }
    }

    if(override_json.count("filter_curated_hits") != 0) {
        if (!override_json["filter_curated_hits"].is_boolean()) {
            return Option<bool>(400, "The `filter_curated_hits` must be a boolean.");
        }
    }

    if(override_json.count("stop_processing") != 0) {
        if (!override_json["stop_processing"].is_boolean()) {
            return Option<bool>(400, "The `stop_processing` must be a boolean.");
        }
    }

    // The remaining type checks sit right where each value is read, so inputs that were
    // already rejected by an earlier check keep getting exactly the same error.
    if(!id.empty()) {
        override.id = id;
    } else if(override_json.count("id") != 0) {
        if(!override_json["id"].is_string()) {
            return Option<bool>(400, "Override `id` must be a string.");
        }

        override.id = override_json["id"].get<std::string>();
    } else {
        return Option<bool>(400, "Override `id` not provided.");
    }

    if(!override_json["rule"]["query"].is_string() || !override_json["rule"]["match"].is_string()) {
        return Option<bool>(400, "The `rule` definition must contain a string `query` and `match`.");
    }

    override.rule.query = override_json["rule"]["query"].get<std::string>();
    override.rule.match = override_json["rule"]["match"].get<std::string>();

    if (override_json.count("includes") != 0) {
        for(const auto & include: override_json["includes"]) {
            add_hit_t add_hit;
            add_hit.doc_id = include["id"].get<std::string>();
            add_hit.position = include["position"].get<uint32_t>();
            override.add_hits.push_back(add_hit);
        }
    }

    if (override_json.count("excludes") != 0) {
        for(const auto & exclude: override_json["excludes"]) {
            drop_hit_t drop_hit;
            drop_hit.doc_id = exclude["id"].get<std::string>();
            override.drop_hits.push_back(drop_hit);
        }
    }

    if (override_json.count("filter_by") != 0) {
        override.filter_by = override_json["filter_by"].get<std::string>();
    }

    if (override_json.count("sort_by") != 0) {
        if(!override_json["sort_by"].is_string()) {
            return Option<bool>(400, "The `sort_by` must be a string.");
        }

        override.sort_by = override_json["sort_by"].get<std::string>();
    }

    if (override_json.count("replace_query") != 0) {
        // Only an enabled `remove_matched_tokens` conflicts with `replace_query`; an
        // explicit `false` is accepted. The value was verified to be a boolean above.
        if(override_json.count("remove_matched_tokens") != 0 && override_json["remove_matched_tokens"].get<bool>()) {
            return Option<bool>(400, "Only one of `replace_query` or `remove_matched_tokens` can be specified.");
        }

        if(!override_json["replace_query"].is_string()) {
            return Option<bool>(400, "The `replace_query` must be a string.");
        }

        override.replace_query = override_json["replace_query"].get<std::string>();
    }

    if(override_json.count("remove_matched_tokens") != 0) {
        override.remove_matched_tokens = override_json["remove_matched_tokens"].get<bool>();
    } else {
        // not specified: enabled exactly when a `filter_by` is present
        override.remove_matched_tokens = (override_json.count("filter_by") != 0);
    }

    if(override_json.count("filter_curated_hits") != 0) {
        override.filter_curated_hits = override_json["filter_curated_hits"].get<bool>();
    }

    if(override_json.count("stop_processing") != 0) {
        override.stop_processing = override_json["stop_processing"].get<bool>();
    }

    // is_number() (rather than is_number_integer()) keeps accepting every value that
    // get<int64_t>() converted before; only non-numeric values are turned into an error.
    if(override_json.count("effective_from_ts") != 0) {
        if(!override_json["effective_from_ts"].is_number()) {
            return Option<bool>(400, "The `effective_from_ts` must be a number.");
        }

        override.effective_from_ts = override_json["effective_from_ts"].get<int64_t>();
    }

    if(override_json.count("effective_to_ts") != 0) {
        if(!override_json["effective_to_ts"].is_number()) {
            return Option<bool>(400, "The `effective_to_ts` must be a number.");
        }

        override.effective_to_ts = override_json["effective_to_ts"].get<int64_t>();
    }

    // we have to also detect if it is a dynamic query rule
    // (the scan goes on after a match, so the query can be trimmed more than once)
    size_t i = 0;
    while(i < override.rule.query.size()) {
        if(override.rule.query[i] == '{') {
            // look for closing curly
            i++;
            while(i < override.rule.query.size()) {
                if(override.rule.query[i] == '}') {
                    override.rule.dynamic_query = true;
                    // remove spaces around curlies
                    override.rule.query = StringUtils::trim_curly_spaces(override.rule.query);
                    break;
                }
                i++;
            }
        }
        i++;
    }

    return Option<bool>(true);
}
// Serialises this override to JSON. `id`, `rule`, `includes`, `excludes` and the three
// boolean flags are always written; `filter_by`, `sort_by` and `replace_query` only when
// non-empty, and the effective timestamps only when they differ from -1.
nlohmann::json override_t::to_json() const {
    nlohmann::json doc;

    doc["id"] = id;

    // each reference below is used only until the next key is added to `doc`
    nlohmann::json& rule_doc = doc["rule"];
    rule_doc["query"] = rule.query;
    rule_doc["match"] = rule.match;

    nlohmann::json& includes = doc["includes"];
    includes = nlohmann::json::array();
    for(const add_hit_t& hit: add_hits) {
        nlohmann::json entry;
        entry["id"] = hit.doc_id;
        entry["position"] = hit.position;
        includes.push_back(entry);
    }

    nlohmann::json& excludes = doc["excludes"];
    excludes = nlohmann::json::array();
    for(const drop_hit_t& hit: drop_hits) {
        nlohmann::json entry;
        entry["id"] = hit.doc_id;
        excludes.push_back(entry);
    }

    // optional string fields: emitted only when they carry a value
    const auto put_if_not_empty = [&doc](const char* key, const std::string& value) {
        if(!value.empty()) {
            doc[key] = value;
        }
    };

    // optional timestamps: -1 marks "not set"
    const auto put_if_set = [&doc](const char* key, int64_t ts) {
        if(ts != -1) {
            doc[key] = ts;
        }
    };

    put_if_not_empty("filter_by", filter_by);
    put_if_not_empty("sort_by", sort_by);
    put_if_not_empty("replace_query", replace_query);

    put_if_set("effective_from_ts", effective_from_ts);
    put_if_set("effective_to_ts", effective_to_ts);

    doc["remove_matched_tokens"] = remove_matched_tokens;
    doc["filter_curated_hits"] = filter_curated_hits;
    doc["stop_processing"] = stop_processing;

    return doc;
}

View File

@ -839,6 +839,11 @@ TEST_F(CollectionOverrideTest, ReplaceQuery) {
op = override_t::parse(override_json, "rule-1", override_rule);
ASSERT_FALSE(op.ok());
ASSERT_EQ("Only one of `replace_query` or `remove_matched_tokens` can be specified.", op.error());
// it's okay when it's explicitly set to false
override_json["remove_matched_tokens"] = false;
op = override_t::parse(override_json, "rule-1", override_rule);
ASSERT_TRUE(op.ok());
}
TEST_F(CollectionOverrideTest, WindowForRule) {