From 44d489ed545a08e27f075269f30ea8f39f4563ee Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 6 Jan 2023 20:52:05 +0530 Subject: [PATCH] Fix synonym loading regression. --- include/synonym_index.h | 13 ++++--- src/core_api.cpp | 21 ++++++++++ src/synonym_index.cpp | 52 +++++++++++++++++++------ test/collection_manager_test.cpp | 10 ++--- test/collection_override_test.cpp | 12 ++---- test/collection_synonyms_test.cpp | 64 +++++++++++++++---------------- test/tokenizer_test.cpp | 2 - 7 files changed, 106 insertions(+), 68 deletions(-) diff --git a/include/synonym_index.h b/include/synonym_index.h index 995be21e..a8b855ee 100644 --- a/include/synonym_index.h +++ b/include/synonym_index.h @@ -10,19 +10,20 @@ struct synonym_t { std::string id; + + std::string raw_root; + // used in code and differs from API + storage format std::vector root; + + std::vector raw_synonyms; + // used in code and differs from API + storage format std::vector> synonyms; + std::string locale; std::vector symbols; synonym_t() = default; - synonym_t(const std::string& id, const std::vector& root, - const std::vector>& synonyms): - id(id), root(root), synonyms(synonyms) { - - } - nlohmann::json to_view_json() const; static Option parse(const nlohmann::json& synonym_json, synonym_t& syn); diff --git a/src/core_api.cpp b/src/core_api.cpp index 1599ddd2..c2facf97 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -1579,6 +1579,27 @@ bool put_synonym(const std::shared_ptr& req, const std::shared_ptrset_400("Key `root` should be a string."); + return false; + } + + if(syn_json.count("synonyms") && syn_json["synonyms"].is_array()) { + if(syn_json["synonyms"].empty()) { + res->set_400("Could not find a valid string array of `synonyms`"); + return false; + } + + for(const auto& synonym: syn_json["synonyms"]) { + if (!synonym.is_string() || synonym.empty()) { + res->set_400("Could not find a valid string array of `synonyms`"); + return false; + } + } + } + syn_json["id"] = synonym_id; Option upsert_op = collection->add_synonym(syn_json); diff --git a/src/synonym_index.cpp b/src/synonym_index.cpp index 73bb86be..79505c4b 100644 --- a/src/synonym_index.cpp +++ b/src/synonym_index.cpp @@ -194,10 +194,6 @@ Option synonym_t::parse(const nlohmann::json& synonym_json, synonym_t& syn return Option(400, "Could not find an array of `synonyms`"); } - if(synonym_json.count("root") != 0 && !synonym_json["root"].is_string()) { - return Option(400, "Key `root` should be a string."); - } - if (!synonym_json["synonyms"].is_array() || synonym_json["synonyms"].empty()) { return Option(400, "Could not find an array of `synonyms`"); } @@ -228,17 +224,51 @@ Option synonym_t::parse(const nlohmann::json& synonym_json, synonym_t& syn if(synonym_json.count("root") != 0) { std::vector tokens; - Tokenizer(synonym_json["root"], true, false, syn.locale, syn.symbols).tokenize(tokens); + + if(synonym_json["root"].is_string()) { + Tokenizer(synonym_json["root"].get(), true, false, syn.locale, syn.symbols).tokenize(tokens); + syn.raw_root = synonym_json["root"].get(); + } else if(synonym_json["root"].is_array()) { + // Typesense 0.23.1 and below incorrectly stored root as array + for(const auto& root_ele: synonym_json["root"]) { + if(!root_ele.is_string()) { + return Option(400, "Synonym root is not valid."); + } + + tokens.push_back(root_ele.get()); + } + + syn.raw_root = StringUtils::join(tokens, " "); + } else { + return Option(400, "Key `root` should be a string."); + } + syn.root = tokens; } for(const auto& synonym: synonym_json["synonyms"]) { - if(!synonym.is_string() || synonym == "") { + std::vector tokens; + if(synonym.is_string()) { + Tokenizer(synonym.get(), true, false, syn.locale, syn.symbols).tokenize(tokens); + syn.raw_synonyms.push_back(synonym.get()); + } else if(synonym.is_array()) { + // Typesense 0.23.1 and below incorrectly stored synonym as array + if(synonym.empty()) { + return Option(400, "Could not find a valid string array of `synonyms`"); + } + + for(const auto& ele: synonym) { + if(!ele.is_string() || ele.get().empty()) { + return Option(400, "Could not find a valid string array of `synonyms`"); + } + tokens.push_back(ele.get()); + } + + syn.raw_synonyms.push_back(StringUtils::join(tokens, " ")); + } else { return Option(400, "Could not find a valid string array of `synonyms`"); } - std::vector tokens; - Tokenizer(synonym, true, false, syn.locale, syn.symbols).tokenize(tokens); syn.synonyms.push_back(tokens); } @@ -249,12 +279,12 @@ Option synonym_t::parse(const nlohmann::json& synonym_json, synonym_t& syn nlohmann::json synonym_t::to_view_json() const { nlohmann::json obj; obj["id"] = id; - obj["root"] = StringUtils::join(root, " "); + obj["root"] = raw_root; obj["synonyms"] = nlohmann::json::array(); - for(const auto& synonym: synonyms) { - obj["synonyms"].push_back(StringUtils::join(synonym, " ")); + for(const auto& synonym: raw_synonyms) { + obj["synonyms"].push_back(synonym); } if(!locale.empty()) { diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index e61eaac6..75a1c77c 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -406,13 +406,9 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) { collection1->remove_override("deleted-rule"); // make some synonym operation - synonym_t synonym1("id1", {"smart", "phone"}, {{"iphone"}}); - synonym_t synonym2("id2", {"mobile", "phone"}, {{"samsung", "phone"}}); - synonym_t synonym3("id3", {}, {{"football"}, {"foot", "ball"}}); - - ASSERT_TRUE(collection1->add_synonym(synonym1.to_view_json()).ok()); - ASSERT_TRUE(collection1->add_synonym(synonym2.to_view_json()).ok()); - ASSERT_TRUE(collection1->add_synonym(synonym3.to_view_json()).ok()); + ASSERT_TRUE(collection1->add_synonym(R"({"id": "id1", "root": "smart phone", "synonyms": ["iphone"]})"_json).ok()); + ASSERT_TRUE(collection1->add_synonym(R"({"id": "id2", "root": "mobile phone", "synonyms": ["samsung phone"]})"_json).ok()); + ASSERT_TRUE(collection1->add_synonym(R"({"id": "id3", "synonyms": ["football", "foot ball"]})"_json).ok()); collection1->remove_synonym("id2"); diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index 8c5de954..376b2fb9 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -2064,12 +2064,9 @@ TEST_F(CollectionOverrideTest, DynamicFilteringWithSynonyms) { ASSERT_TRUE(coll1->add(doc2.dump()).ok()); ASSERT_TRUE(coll1->add(doc3.dump()).ok()); - synonym_t synonym1{"sneakers-shoes", {"sneakers"}, {{"shoes"}} }; - synonym_t synonym2{"boots-shoes", {"boots"}, {{"shoes"}} }; - synonym_t synonym3{"exciting-amazing", {"exciting"}, {{"amazing"}} }; - coll1->add_synonym(synonym1.to_view_json()); - coll1->add_synonym(synonym2.to_view_json()); - coll1->add_synonym(synonym3.to_view_json()); + coll1->add_synonym(R"({"id": "sneakers-shoes", "root": "sneakers", "synonyms": ["shoes"]})"_json); + coll1->add_synonym(R"({"id": "boots-shoes", "root": "boots", "synonyms": ["shoes"]})"_json); + coll1->add_synonym(R"({"id": "exciting-amazing", "root": "exciting", "synonyms": ["amazing"]})"_json); std::vector sort_fields = { sort_by("_text_match", "DESC"), sort_by("points", "DESC") }; @@ -2514,8 +2511,7 @@ TEST_F(CollectionOverrideTest, SynonymsAppliedToOverridenQuery) { coll1->add_override(override_contains); - synonym_t synonym1{"shoes-sneakers", {"shoes"}, {{"sneakers"}} }; - coll1->add_synonym(synonym1.to_view_json()); + coll1->add_synonym(R"({"id": "", "root": "shoes", "synonyms": ["sneakers"]})"_json); auto results = coll1->search("expensive shoes", {"name"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY, {true}, 0).get(); diff --git a/test/collection_synonyms_test.cpp b/test/collection_synonyms_test.cpp index 8568e834..fe59ae54 100644 --- a/test/collection_synonyms_test.cpp +++ b/test/collection_synonyms_test.cpp @@ -123,26 +123,16 @@ TEST_F(CollectionSynonymsTest, SynonymParsingFromJson) { // synonyms bad type - nlohmann::json syn_json_bad_type1 = { - {"id", "syn-1"}, - {"root", "Ocean"}, - {"synonyms", {"Sea", 1} } - }; + nlohmann::json syn_json_bad_type1 = R"({ + "id": "syn-1", + "root": "Ocean", + "synonyms": [["Sea", 1]] + })"_json; syn_op = synonym_t::parse(syn_json_bad_type1, synonym); ASSERT_FALSE(syn_op.ok()); ASSERT_STREQ("Could not find a valid string array of `synonyms`", syn_op.error().c_str()); - nlohmann::json syn_json_bad_type2 = { - {"id", "syn-1"}, - {"root", "Ocean"}, - {"synonyms", "foo" } - }; - - syn_op = synonym_t::parse(syn_json_bad_type2, synonym); - ASSERT_FALSE(syn_op.ok()); - ASSERT_STREQ("Could not find an array of `synonyms`", syn_op.error().c_str()); - nlohmann::json syn_json_bad_type3 = { {"id", "syn-1"}, {"root", "Ocean"}, @@ -154,11 +144,11 @@ TEST_F(CollectionSynonymsTest, SynonymParsingFromJson) { ASSERT_STREQ("Could not find an array of `synonyms`", syn_op.error().c_str()); // empty string in synonym list - nlohmann::json syn_json_bad_type4 = { - {"id", "syn-1"}, - {"root", "Ocean"}, - {"synonyms", {""} } - }; + nlohmann::json syn_json_bad_type4 = R"({ + "id": "syn-1", + "root": "Ocean", + "synonyms": [["Foo", ""]] + })"_json; syn_op = synonym_t::parse(syn_json_bad_type4, synonym); ASSERT_FALSE(syn_op.ok()); @@ -707,10 +697,8 @@ TEST_F(CollectionSynonymsTest, SynonymFieldOrdering) { } TEST_F(CollectionSynonymsTest, DeleteAndUpsertDuplicationOfSynonms) { - synonym_t synonym1{"ipod-synonyms", {}, {{"ipod"}, {"i", "pod"}, {"pod"}}}; - synonym_t synonym2{"samsung-synonyms", {}, {{"s3"}, {"s3", "phone"}, {"samsung"}}}; - coll_mul_fields->add_synonym(synonym1.to_view_json()); - coll_mul_fields->add_synonym(synonym2.to_view_json()); + coll_mul_fields->add_synonym(R"({"id": "ipod-synonyms", "root": "ipod", "synonyms": ["i pod", "ipod"]})"_json); + coll_mul_fields->add_synonym(R"({"id": "samsung-synonyms", "root": "s3", "synonyms": ["s3 phone", "samsung"]})"_json); ASSERT_EQ(2, coll_mul_fields->get_synonyms().size()); coll_mul_fields->remove_synonym("ipod-synonyms"); @@ -720,14 +708,14 @@ TEST_F(CollectionSynonymsTest, DeleteAndUpsertDuplicationOfSynonms) { // try to upsert synonym with same ID - synonym2.root = {"s3", "smartphone"}; - auto upsert_op = coll_mul_fields->add_synonym(synonym2.to_view_json()); + auto upsert_op = coll_mul_fields->add_synonym(R"({"id": "samsung-synonyms", "root": "s3 smartphone", + "synonyms": ["s3 phone", "samsung"]})"_json); ASSERT_TRUE(upsert_op.ok()); ASSERT_EQ(1, coll_mul_fields->get_synonyms().size()); synonym_t synonym2_updated; - coll_mul_fields->get_synonym(synonym2.id, synonym2_updated); + coll_mul_fields->get_synonym("samsung-synonyms", synonym2_updated); ASSERT_EQ("s3", synonym2_updated.root[0]); ASSERT_EQ("smartphone", synonym2_updated.root[1]); @@ -737,7 +725,16 @@ TEST_F(CollectionSynonymsTest, DeleteAndUpsertDuplicationOfSynonms) { } TEST_F(CollectionSynonymsTest, SynonymJsonSerialization) { - synonym_t synonym1{"ipod-synonyms", {"apple", "ipod"}, {{"ipod"}, {"i", "pod"}, {"pod"}}}; + synonym_t synonym1; + synonym1.id = "ipod-synonyms"; + synonym1.root = {"apple", "ipod"}; + synonym1.raw_root = "apple ipod"; + + synonym1.raw_synonyms = {"ipod", "i pod", "pod"}; + synonym1.synonyms.push_back({"ipod"}); + synonym1.synonyms.push_back({"i", "pod"}); + synonym1.synonyms.push_back({"pod"}); + nlohmann::json obj = synonym1.to_view_json(); ASSERT_STREQ("ipod-synonyms", obj["id"].get().c_str()); ASSERT_STREQ("apple ipod", obj["root"].get().c_str()); @@ -817,8 +814,9 @@ TEST_F(CollectionSynonymsTest, SynonymExpansionAndCompressionRanking) { ASSERT_TRUE(coll1->add(doc.dump()).ok()); } - synonym_t synonym1{"syn-1", {"lululemon"}, {{"lulu", "lemon"}}}; - coll1->add_synonym(synonym1.to_view_json()); + nlohmann::json foo = R"({"id": "syn-1", "root": "lululemon", "synonyms": ["lulu lemon"]})"_json; + + coll1->add_synonym(R"({"id": "syn-1", "root": "lululemon", "synonyms": ["lulu lemon"]})"_json); auto res = coll1->search("lululemon", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 0).get(); ASSERT_EQ(2, res["hits"].size()); @@ -832,9 +830,7 @@ TEST_F(CollectionSynonymsTest, SynonymExpansionAndCompressionRanking) { ASSERT_EQ(res["hits"][0]["text_match"].get(), res["hits"][1]["text_match"].get()); // now with compression synonym - synonym1.root = {"lulu", "lemon"}; - synonym1.synonyms = {{"lululemon"}}; - coll1->add_synonym(synonym1.to_view_json()); + coll1->add_synonym(R"({"id": "syn-1", "root": "lulu lemon", "synonyms": ["lululemon"]})"_json); res = coll1->search("lulu lemon", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 0).get(); ASSERT_EQ(2, res["hits"].size()); @@ -876,7 +872,7 @@ TEST_F(CollectionSynonymsTest, SynonymQueriesMustHavePrefixEnabled) { } synonym_t synonym1{"syn-1", {"ns"}, {{"nonstick"}}}; - coll1->add_synonym(synonym1.to_view_json()); + coll1->add_synonym(R"({"id": "syn-1", "root": "ns", "synonyms": ["nonstick"]})"_json); auto res = coll1->search("ns cook", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 0).get(); ASSERT_EQ(1, res["hits"].size()); diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp index caaafbf1..eaabe031 100644 --- a/test/tokenizer_test.cpp +++ b/test/tokenizer_test.cpp @@ -354,6 +354,4 @@ TEST(TokenizerTest, ShouldTokenizeWithDifferentSymbolConfigs) { ASSERT_EQ("ความ", tokens[0]); ASSERT_EQ("เหลื่อม", tokens[1]); ASSERT_EQ("ล้ํา", tokens[2]); - - LOG(INFO) << "here"; }