Fix synonym loading regression.

This commit is contained in:
Kishore Nallan 2023-01-06 20:52:05 +05:30
parent 7a63d930bf
commit 44d489ed54
7 changed files with 106 additions and 68 deletions

View File

@@ -10,19 +10,20 @@
struct synonym_t {
std::string id;
std::string raw_root;
// used in code and differs from API + storage format
std::vector<std::string> root;
std::vector<std::string> raw_synonyms;
// used in code and differs from API + storage format
std::vector<std::vector<std::string>> synonyms;
std::string locale;
std::vector<char> symbols;
synonym_t() = default;
synonym_t(const std::string& id, const std::vector<std::string>& root,
const std::vector<std::vector<std::string>>& synonyms):
id(id), root(root), synonyms(synonyms) {
}
nlohmann::json to_view_json() const;
static Option<bool> parse(const nlohmann::json& synonym_json, synonym_t& syn);

View File

@@ -1579,6 +1579,27 @@ bool put_synonym(const std::shared_ptr<http_req>& req, const std::shared_ptr<htt
return false;
}
// These checks should be inside `add_synonym` but older versions of Typesense wrongly persisted
// `root` as an array, so we have to do it here so that on-disk synonyms are loaded properly
if(syn_json.count("root") != 0 && !syn_json["root"].is_string()) {
res->set_400("Key `root` should be a string.");
return false;
}
if(syn_json.count("synonyms") && syn_json["synonyms"].is_array()) {
if(syn_json["synonyms"].empty()) {
res->set_400("Could not find a valid string array of `synonyms`");
return false;
}
for(const auto& synonym: syn_json["synonyms"]) {
if (!synonym.is_string() || synonym.empty()) {
res->set_400("Could not find a valid string array of `synonyms`");
return false;
}
}
}
syn_json["id"] = synonym_id;
Option<bool> upsert_op = collection->add_synonym(syn_json);

View File

@@ -194,10 +194,6 @@ Option<bool> synonym_t::parse(const nlohmann::json& synonym_json, synonym_t& syn
return Option<bool>(400, "Could not find an array of `synonyms`");
}
if(synonym_json.count("root") != 0 && !synonym_json["root"].is_string()) {
return Option<bool>(400, "Key `root` should be a string.");
}
if (!synonym_json["synonyms"].is_array() || synonym_json["synonyms"].empty()) {
return Option<bool>(400, "Could not find an array of `synonyms`");
}
@@ -228,17 +224,51 @@ Option<bool> synonym_t::parse(const nlohmann::json& synonym_json, synonym_t& syn
if(synonym_json.count("root") != 0) {
std::vector<std::string> tokens;
Tokenizer(synonym_json["root"], true, false, syn.locale, syn.symbols).tokenize(tokens);
if(synonym_json["root"].is_string()) {
Tokenizer(synonym_json["root"].get<std::string>(), true, false, syn.locale, syn.symbols).tokenize(tokens);
syn.raw_root = synonym_json["root"].get<std::string>();
} else if(synonym_json["root"].is_array()) {
// Typesense 0.23.1 and below incorrectly stored root as array
for(const auto& root_ele: synonym_json["root"]) {
if(!root_ele.is_string()) {
return Option<bool>(400, "Synonym root is not valid.");
}
tokens.push_back(root_ele.get<std::string>());
}
syn.raw_root = StringUtils::join(tokens, " ");
} else {
return Option<bool>(400, "Key `root` should be a string.");
}
syn.root = tokens;
}
for(const auto& synonym: synonym_json["synonyms"]) {
if(!synonym.is_string() || synonym == "") {
std::vector<std::string> tokens;
if(synonym.is_string()) {
Tokenizer(synonym.get<std::string>(), true, false, syn.locale, syn.symbols).tokenize(tokens);
syn.raw_synonyms.push_back(synonym.get<std::string>());
} else if(synonym.is_array()) {
// Typesense 0.23.1 and below incorrectly stored synonym as array
if(synonym.empty()) {
return Option<bool>(400, "Could not find a valid string array of `synonyms`");
}
for(const auto& ele: synonym) {
if(!ele.is_string() || ele.get<std::string>().empty()) {
return Option<bool>(400, "Could not find a valid string array of `synonyms`");
}
tokens.push_back(ele.get<std::string>());
}
syn.raw_synonyms.push_back(StringUtils::join(tokens, " "));
} else {
return Option<bool>(400, "Could not find a valid string array of `synonyms`");
}
std::vector<std::string> tokens;
Tokenizer(synonym, true, false, syn.locale, syn.symbols).tokenize(tokens);
syn.synonyms.push_back(tokens);
}
@@ -249,12 +279,12 @@ Option<bool> synonym_t::parse(const nlohmann::json& synonym_json, synonym_t& syn
nlohmann::json synonym_t::to_view_json() const {
nlohmann::json obj;
obj["id"] = id;
obj["root"] = StringUtils::join(root, " ");
obj["root"] = raw_root;
obj["synonyms"] = nlohmann::json::array();
for(const auto& synonym: synonyms) {
obj["synonyms"].push_back(StringUtils::join(synonym, " "));
for(const auto& synonym: raw_synonyms) {
obj["synonyms"].push_back(synonym);
}
if(!locale.empty()) {

View File

@@ -406,13 +406,9 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
collection1->remove_override("deleted-rule");
// make some synonym operation
synonym_t synonym1("id1", {"smart", "phone"}, {{"iphone"}});
synonym_t synonym2("id2", {"mobile", "phone"}, {{"samsung", "phone"}});
synonym_t synonym3("id3", {}, {{"football"}, {"foot", "ball"}});
ASSERT_TRUE(collection1->add_synonym(synonym1.to_view_json()).ok());
ASSERT_TRUE(collection1->add_synonym(synonym2.to_view_json()).ok());
ASSERT_TRUE(collection1->add_synonym(synonym3.to_view_json()).ok());
ASSERT_TRUE(collection1->add_synonym(R"({"id": "id1", "root": "smart phone", "synonyms": ["iphone"]})"_json).ok());
ASSERT_TRUE(collection1->add_synonym(R"({"id": "id2", "root": "mobile phone", "synonyms": ["samsung phone"]})"_json).ok());
ASSERT_TRUE(collection1->add_synonym(R"({"id": "id3", "synonyms": ["football", "foot ball"]})"_json).ok());
collection1->remove_synonym("id2");

View File

@@ -2064,12 +2064,9 @@ TEST_F(CollectionOverrideTest, DynamicFilteringWithSynonyms) {
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
ASSERT_TRUE(coll1->add(doc3.dump()).ok());
synonym_t synonym1{"sneakers-shoes", {"sneakers"}, {{"shoes"}} };
synonym_t synonym2{"boots-shoes", {"boots"}, {{"shoes"}} };
synonym_t synonym3{"exciting-amazing", {"exciting"}, {{"amazing"}} };
coll1->add_synonym(synonym1.to_view_json());
coll1->add_synonym(synonym2.to_view_json());
coll1->add_synonym(synonym3.to_view_json());
coll1->add_synonym(R"({"id": "sneakers-shoes", "root": "sneakers", "synonyms": ["shoes"]})"_json);
coll1->add_synonym(R"({"id": "boots-shoes", "root": "boots", "synonyms": ["shoes"]})"_json);
coll1->add_synonym(R"({"id": "exciting-amazing", "root": "exciting", "synonyms": ["amazing"]})"_json);
std::vector<sort_by> sort_fields = { sort_by("_text_match", "DESC"), sort_by("points", "DESC") };
@@ -2514,8 +2511,7 @@ TEST_F(CollectionOverrideTest, SynonymsAppliedToOverridenQuery) {
coll1->add_override(override_contains);
synonym_t synonym1{"shoes-sneakers", {"shoes"}, {{"sneakers"}} };
coll1->add_synonym(synonym1.to_view_json());
coll1->add_synonym(R"({"id": "", "root": "shoes", "synonyms": ["sneakers"]})"_json);
auto results = coll1->search("expensive shoes", {"name"}, "",
{}, sort_fields, {2}, 10, 1, FREQUENCY, {true}, 0).get();

View File

@@ -123,26 +123,16 @@ TEST_F(CollectionSynonymsTest, SynonymParsingFromJson) {
// synonyms bad type
nlohmann::json syn_json_bad_type1 = {
{"id", "syn-1"},
{"root", "Ocean"},
{"synonyms", {"Sea", 1} }
};
nlohmann::json syn_json_bad_type1 = R"({
"id": "syn-1",
"root": "Ocean",
"synonyms": [["Sea", 1]]
})"_json;
syn_op = synonym_t::parse(syn_json_bad_type1, synonym);
ASSERT_FALSE(syn_op.ok());
ASSERT_STREQ("Could not find a valid string array of `synonyms`", syn_op.error().c_str());
nlohmann::json syn_json_bad_type2 = {
{"id", "syn-1"},
{"root", "Ocean"},
{"synonyms", "foo" }
};
syn_op = synonym_t::parse(syn_json_bad_type2, synonym);
ASSERT_FALSE(syn_op.ok());
ASSERT_STREQ("Could not find an array of `synonyms`", syn_op.error().c_str());
nlohmann::json syn_json_bad_type3 = {
{"id", "syn-1"},
{"root", "Ocean"},
@@ -154,11 +144,11 @@ TEST_F(CollectionSynonymsTest, SynonymParsingFromJson) {
ASSERT_STREQ("Could not find an array of `synonyms`", syn_op.error().c_str());
// empty string in synonym list
nlohmann::json syn_json_bad_type4 = {
{"id", "syn-1"},
{"root", "Ocean"},
{"synonyms", {""} }
};
nlohmann::json syn_json_bad_type4 = R"({
"id": "syn-1",
"root": "Ocean",
"synonyms": [["Foo", ""]]
})"_json;
syn_op = synonym_t::parse(syn_json_bad_type4, synonym);
ASSERT_FALSE(syn_op.ok());
@@ -707,10 +697,8 @@ TEST_F(CollectionSynonymsTest, SynonymFieldOrdering) {
}
TEST_F(CollectionSynonymsTest, DeleteAndUpsertDuplicationOfSynonms) {
synonym_t synonym1{"ipod-synonyms", {}, {{"ipod"}, {"i", "pod"}, {"pod"}}};
synonym_t synonym2{"samsung-synonyms", {}, {{"s3"}, {"s3", "phone"}, {"samsung"}}};
coll_mul_fields->add_synonym(synonym1.to_view_json());
coll_mul_fields->add_synonym(synonym2.to_view_json());
coll_mul_fields->add_synonym(R"({"id": "ipod-synonyms", "root": "ipod", "synonyms": ["i pod", "ipod"]})"_json);
coll_mul_fields->add_synonym(R"({"id": "samsung-synonyms", "root": "s3", "synonyms": ["s3 phone", "samsung"]})"_json);
ASSERT_EQ(2, coll_mul_fields->get_synonyms().size());
coll_mul_fields->remove_synonym("ipod-synonyms");
@@ -720,14 +708,14 @@ TEST_F(CollectionSynonymsTest, DeleteAndUpsertDuplicationOfSynonms) {
// try to upsert synonym with same ID
synonym2.root = {"s3", "smartphone"};
auto upsert_op = coll_mul_fields->add_synonym(synonym2.to_view_json());
auto upsert_op = coll_mul_fields->add_synonym(R"({"id": "samsung-synonyms", "root": "s3 smartphone",
"synonyms": ["s3 phone", "samsung"]})"_json);
ASSERT_TRUE(upsert_op.ok());
ASSERT_EQ(1, coll_mul_fields->get_synonyms().size());
synonym_t synonym2_updated;
coll_mul_fields->get_synonym(synonym2.id, synonym2_updated);
coll_mul_fields->get_synonym("samsung-synonyms", synonym2_updated);
ASSERT_EQ("s3", synonym2_updated.root[0]);
ASSERT_EQ("smartphone", synonym2_updated.root[1]);
@@ -737,7 +725,16 @@ TEST_F(CollectionSynonymsTest, DeleteAndUpsertDuplicationOfSynonms) {
}
TEST_F(CollectionSynonymsTest, SynonymJsonSerialization) {
synonym_t synonym1{"ipod-synonyms", {"apple", "ipod"}, {{"ipod"}, {"i", "pod"}, {"pod"}}};
synonym_t synonym1;
synonym1.id = "ipod-synonyms";
synonym1.root = {"apple", "ipod"};
synonym1.raw_root = "apple ipod";
synonym1.raw_synonyms = {"ipod", "i pod", "pod"};
synonym1.synonyms.push_back({"ipod"});
synonym1.synonyms.push_back({"i", "pod"});
synonym1.synonyms.push_back({"pod"});
nlohmann::json obj = synonym1.to_view_json();
ASSERT_STREQ("ipod-synonyms", obj["id"].get<std::string>().c_str());
ASSERT_STREQ("apple ipod", obj["root"].get<std::string>().c_str());
@@ -817,8 +814,9 @@ TEST_F(CollectionSynonymsTest, SynonymExpansionAndCompressionRanking) {
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
synonym_t synonym1{"syn-1", {"lululemon"}, {{"lulu", "lemon"}}};
coll1->add_synonym(synonym1.to_view_json());
nlohmann::json foo = R"({"id": "syn-1", "root": "lululemon", "synonyms": ["lulu lemon"]})"_json;
coll1->add_synonym(R"({"id": "syn-1", "root": "lululemon", "synonyms": ["lulu lemon"]})"_json);
auto res = coll1->search("lululemon", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 0).get();
ASSERT_EQ(2, res["hits"].size());
@@ -832,9 +830,7 @@ TEST_F(CollectionSynonymsTest, SynonymExpansionAndCompressionRanking) {
ASSERT_EQ(res["hits"][0]["text_match"].get<size_t>(), res["hits"][1]["text_match"].get<size_t>());
// now with compression synonym
synonym1.root = {"lulu", "lemon"};
synonym1.synonyms = {{"lululemon"}};
coll1->add_synonym(synonym1.to_view_json());
coll1->add_synonym(R"({"id": "syn-1", "root": "lulu lemon", "synonyms": ["lululemon"]})"_json);
res = coll1->search("lulu lemon", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 0).get();
ASSERT_EQ(2, res["hits"].size());
@@ -876,7 +872,7 @@ TEST_F(CollectionSynonymsTest, SynonymQueriesMustHavePrefixEnabled) {
}
synonym_t synonym1{"syn-1", {"ns"}, {{"nonstick"}}};
coll1->add_synonym(synonym1.to_view_json());
coll1->add_synonym(R"({"id": "syn-1", "root": "ns", "synonyms": ["nonstick"]})"_json);
auto res = coll1->search("ns cook", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 0).get();
ASSERT_EQ(1, res["hits"].size());

View File

@@ -354,6 +354,4 @@ TEST(TokenizerTest, ShouldTokenizeWithDifferentSymbolConfigs) {
ASSERT_EQ("ความ", tokens[0]);
ASSERT_EQ("เหลื่อม", tokens[1]);
ASSERT_EQ("ล้ํา", tokens[2]);
LOG(INFO) << "here";
}