diff --git a/include/stopwords_manager.h b/include/stopwords_manager.h index 90a461a9..050f4067 100644 --- a/include/stopwords_manager.h +++ b/include/stopwords_manager.h @@ -36,7 +36,9 @@ public: Option get_stopword(const std::string&, spp::sparse_hash_set&) const; - Option upsert_stopword(const std::string&, const nlohmann::json&, const std::string&); + Option upsert_stopword(const std::string&, const nlohmann::json&); Option delete_stopword(const std::string&); + + void load_stopword_config(const std::string&, const nlohmann::json&); }; \ No newline at end of file diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index ea495600..6a55d44c 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -8,6 +8,7 @@ #include "batched_indexer.h" #include "logger.h" #include "magic_enum.hpp" +#include "stopwords_manager.h" constexpr const size_t CollectionManager::DEFAULT_NUM_MEMORY_SHARDS; @@ -285,6 +286,27 @@ Option CollectionManager::load(const size_t collection_batch_size, const s iter->Next(); } + //load stopwords + std::string stopword_prefix_key = std::string(StopwordsManager::STOPWORD_PREFIX) + "_"; + std::string stopword_upper_bound_key = std::string(StopwordsManager::STOPWORD_PREFIX) + "`"; // cannot inline this + rocksdb::Slice stopword_upper_bound(stopword_upper_bound_key); + + iter = store->scan(stopword_prefix_key, &stopword_upper_bound); + while(iter->Valid() && iter->key().starts_with(stopword_prefix_key)) { + std::vector parts; + std::string stopword_name = iter->key().ToString().substr(stopword_prefix_key.size()); + nlohmann::json stopword_obj = nlohmann::json::parse(iter->value().ToString(), nullptr, false); + + if(!stopword_obj.is_discarded() && stopword_obj.is_object()) { + StopwordsManager::get_instance().load_stopword_config(stopword_name, stopword_obj); + } else { + LOG(INFO) << "Invalid object for stopword " << stopword_name; + } + + iter->Next(); + } + + // restore query suggestions configs std::vector analytics_config_jsons; store->scan_fill(AnalyticsManager::ANALYTICS_RULE_PREFIX, diff --git a/src/core_api.cpp b/src/core_api.cpp index 0a3f8edc..8aef1a3f 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -1969,7 +1969,7 @@ bool put_upsert_stopword(const std::shared_ptr& req, const std::shared return false; } - Option success_op = stopwordManager.upsert_stopword(stopword_name, req_json[STOPWORD_VALUES], req_json[STOPWORD_LOCALE]); + Option success_op = stopwordManager.upsert_stopword(stopword_name, req_json); if(!success_op.ok()) { res->set_500(success_op.error()); return false; diff --git a/src/stopwords_manager.cpp b/src/stopwords_manager.cpp index 69ec9bf7..87f917c8 100644 --- a/src/stopwords_manager.cpp +++ b/src/stopwords_manager.cpp @@ -22,16 +22,18 @@ Option StopwordsManager::get_stopword(const std::string& stopword_name, sp return Option(404, "Stopword `" + stopword_name +"` not found."); } -Option StopwordsManager::upsert_stopword(const std::string& stopword_name, const nlohmann::json& stopwords, const std::string& locale) { +Option StopwordsManager::upsert_stopword(const std::string& stopword_name, const nlohmann::json& stopwords_json) { std::unique_lock lock(mutex); - bool inserted = store->insert(get_stopword_key(stopword_name), stopwords.dump()); + bool inserted = store->insert(get_stopword_key(stopword_name), stopwords_json.dump()); if(!inserted) { return Option(500, "Unable to insert into store."); } std::vector tokens; spp::sparse_hash_set stopwords_set; + const auto& stopwords = stopwords_json["stopwords"]; + const auto& locale = stopwords_json["locale"]; for (const auto &stopword: stopwords.items()) { const auto& val = stopword.value().get(); @@ -65,3 +67,23 @@ Option StopwordsManager::delete_stopword(const std::string& stopword_name) stopword_configs.erase(stopword_name); return Option(true); } + +void StopwordsManager::load_stopword_config(const std::string& stopword_name, const nlohmann::json& stopwords_json) { + std::unique_lock lock(mutex); + + std::vector tokens; + spp::sparse_hash_set stopwords_set; + const auto& stopwords = stopwords_json["stopwords"]; + const auto& locale = stopwords_json["locale"]; + + for (const auto &stopword: stopwords.items()) { + const auto& val = stopword.value().get(); + Tokenizer(val, true, false, locale, {}, {}).tokenize(tokens); + + for(const auto& tok : tokens) { + stopwords_set.emplace(tok); + } + tokens.clear(); + } + stopword_configs[stopword_name] = stopwords_set; +} \ No newline at end of file diff --git a/test/stopwords_manager_test.cpp b/test/stopwords_manager_test.cpp index 76b406f5..784e88c3 100644 --- a/test/stopwords_manager_test.cpp +++ b/test/stopwords_manager_test.cpp @@ -30,24 +30,24 @@ protected: TEST_F(StopwordsManagerTest, UpsertGetStopwords) { auto stopwords1 = R"( - {"stopwords": ["america", "europe"]} + {"stopwords": ["america", "europe"], "locale": "en"} )"_json; - auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1["stopwords"], "en"); + auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1); ASSERT_TRUE(upsert_op.ok()); auto stopwords2 = R"( - {"stopwords": ["a", "an", "the"]} + {"stopwords": ["a", "an", "the"], "locale": "en"} )"_json; - upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2["stopwords"], "en"); + upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2); ASSERT_TRUE(upsert_op.ok()); auto stopwords3 = R"( - {"stopwords": ["India", "United States", "Japan", "China"]} + {"stopwords": ["India", "United States", "Japan", "China"], "locale": "en"} )"_json; - upsert_op = stopwordsManager.upsert_stopword("countries", stopwords3["stopwords"], "en"); + upsert_op = stopwordsManager.upsert_stopword("countries", stopwords3); ASSERT_TRUE(upsert_op.ok()); auto stopword_config = stopwordsManager.get_stopwords(); @@ -74,9 +74,9 @@ TEST_F(StopwordsManagerTest, UpsertGetStopwords) { } TEST_F(StopwordsManagerTest, GetStopword) { - auto stopwords = R"({"stopwords": ["a", "an", "the"]})"_json; + auto stopwords = R"({"stopwords": ["a", "an", "the"], "locale": "en"})"_json; - auto upsert_op = stopwordsManager.upsert_stopword("articles", stopwords["stopwords"], "en"); + auto upsert_op = stopwordsManager.upsert_stopword("articles", stopwords); ASSERT_TRUE(upsert_op.ok()); spp::sparse_hash_set stopwords_set; @@ -94,9 +94,9 @@ TEST_F(StopwordsManagerTest, GetStopword) { ASSERT_EQ("Stopword `country` not found.", get_op.error()); //try fetching stopwords with token - stopwords = R"({"stopwords": ["India", "United States", "Japan"]})"_json; + stopwords = R"({"stopwords": ["India", "United States", "Japan"], "locale": "en"})"_json; - upsert_op = stopwordsManager.upsert_stopword("country", stopwords["stopwords"], "en"); + upsert_op = stopwordsManager.upsert_stopword("country", stopwords); ASSERT_TRUE(upsert_op.ok()); get_op = stopwordsManager.get_stopword("country", stopwords_set); @@ -106,17 +106,17 @@ TEST_F(StopwordsManagerTest, GetStopword) { TEST_F(StopwordsManagerTest, DeleteStopword) { auto stopwords1 = R"( - {"stopwords": ["america", "europe"]} + {"stopwords": ["america", "europe"], "locale": "en"} )"_json; - auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1["stopwords"], "en"); + auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1); ASSERT_TRUE(upsert_op.ok()); auto stopwords2 = R"( - {"stopwords": ["a", "an", "the"]} + {"stopwords": ["a", "an", "the"], "locale": "en"} )"_json; - upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2["stopwords"], "en"); + upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2); ASSERT_TRUE(upsert_op.ok()); spp::sparse_hash_set stopwords_set; @@ -138,11 +138,11 @@ TEST_F(StopwordsManagerTest, DeleteStopword) { } TEST_F(StopwordsManagerTest, UpdateStopword) { - auto stopwords = R"( - {"stopwords": ["america", "europe"]} + auto stopwords_json = R"( + {"stopwords": ["america", "europe"], "locale": "en"} )"_json; - auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords["stopwords"], "en"); + auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords_json); ASSERT_TRUE(upsert_op.ok()); auto stopword_config = stopwordsManager.get_stopwords(); @@ -152,10 +152,10 @@ TEST_F(StopwordsManagerTest, UpdateStopword) { ASSERT_TRUE(stopword_config["continents"].find("europe") != stopword_config["continents"].end()); //adding new words with same name should replace the stopwords set - stopwords = R"( - {"stopwords": ["india", "china", "japan"]} + stopwords_json = R"( + {"stopwords": ["india", "china", "japan"], "locale": "en"} )"_json; - upsert_op = stopwordsManager.upsert_stopword("continents", stopwords["stopwords"], "en"); + upsert_op = stopwordsManager.upsert_stopword("continents", stopwords_json); ASSERT_TRUE(upsert_op.ok()); stopword_config = stopwordsManager.get_stopwords();