load stopwords on startup

This commit is contained in:
krunal 2023-07-18 20:18:25 +05:30
parent 368e5c1789
commit 7b90c8afc3
5 changed files with 70 additions and 24 deletions

View File

@ -36,7 +36,9 @@ public:
Option<bool> get_stopword(const std::string&, spp::sparse_hash_set<std::string>&) const;
Option<bool> upsert_stopword(const std::string&, const nlohmann::json&, const std::string&);
Option<bool> upsert_stopword(const std::string&, const nlohmann::json&);
Option<bool> delete_stopword(const std::string&);
void load_stopword_config(const std::string&, const nlohmann::json&);
};

View File

@ -8,6 +8,7 @@
#include "batched_indexer.h"
#include "logger.h"
#include "magic_enum.hpp"
#include "stopwords_manager.h"
constexpr const size_t CollectionManager::DEFAULT_NUM_MEMORY_SHARDS;
@ -285,6 +286,27 @@ Option<bool> CollectionManager::load(const size_t collection_batch_size, const s
iter->Next();
}
//load stopwords
std::string stopword_prefix_key = std::string(StopwordsManager::STOPWORD_PREFIX) + "_";
std::string stopword_upper_bound_key = std::string(StopwordsManager::STOPWORD_PREFIX) + "`"; // cannot inline this
rocksdb::Slice stopword_upper_bound(stopword_upper_bound_key);
iter = store->scan(stopword_prefix_key, &stopword_upper_bound);
while(iter->Valid() && iter->key().starts_with(stopword_prefix_key)) {
std::vector<std::string> parts;
std::string stopword_name = iter->key().ToString().substr(stopword_prefix_key.size());
nlohmann::json stopword_obj = nlohmann::json::parse(iter->value().ToString(), nullptr, false);
if(!stopword_obj.is_discarded() && stopword_obj.is_object()) {
StopwordsManager::get_instance().load_stopword_config(stopword_name, stopword_obj);
} else {
LOG(INFO) << "Invalid object for stopword " << stopword_name;
}
iter->Next();
}
// restore query suggestions configs
std::vector<std::string> analytics_config_jsons;
store->scan_fill(AnalyticsManager::ANALYTICS_RULE_PREFIX,

View File

@ -1969,7 +1969,7 @@ bool put_upsert_stopword(const std::shared_ptr<http_req>& req, const std::shared
return false;
}
Option<bool> success_op = stopwordManager.upsert_stopword(stopword_name, req_json[STOPWORD_VALUES], req_json[STOPWORD_LOCALE]);
Option<bool> success_op = stopwordManager.upsert_stopword(stopword_name, req_json);
if(!success_op.ok()) {
res->set_500(success_op.error());
return false;

View File

@ -22,16 +22,18 @@ Option<bool> StopwordsManager::get_stopword(const std::string& stopword_name, sp
return Option<bool>(404, "Stopword `" + stopword_name +"` not found.");
}
Option<bool> StopwordsManager::upsert_stopword(const std::string& stopword_name, const nlohmann::json& stopwords, const std::string& locale) {
Option<bool> StopwordsManager::upsert_stopword(const std::string& stopword_name, const nlohmann::json& stopwords_json) {
std::unique_lock lock(mutex);
bool inserted = store->insert(get_stopword_key(stopword_name), stopwords.dump());
bool inserted = store->insert(get_stopword_key(stopword_name), stopwords_json.dump());
if(!inserted) {
return Option<bool>(500, "Unable to insert into store.");
}
std::vector<std::string> tokens;
spp::sparse_hash_set<std::string> stopwords_set;
const auto& stopwords = stopwords_json["stopwords"];
const auto& locale = stopwords_json["locale"];
for (const auto &stopword: stopwords.items()) {
const auto& val = stopword.value().get<std::string>();
@ -65,3 +67,23 @@ Option<bool> StopwordsManager::delete_stopword(const std::string& stopword_name)
stopword_configs.erase(stopword_name);
return Option<bool>(true);
}
/**
 * Rebuilds the in-memory stopword set for `stopword_name` from an already
 * persisted config object, without writing anything back to the store.
 *
 * Every string entry under "stopwords" is tokenized and each resulting token
 * is added to the set, which then replaces any existing entry in
 * `stopword_configs` under the same name.
 *
 * The config is tolerated defensively because it is read from disk:
 *  - if "stopwords" is missing or is not an array, nothing is loaded;
 *  - if "locale" is missing or is not a string, an empty locale is used;
 *  - non-string entries inside "stopwords" are skipped.
 *
 * @param stopword_name   name under which the stopword set is registered
 * @param stopwords_json  config object holding "stopwords" and optionally "locale"
 */
void StopwordsManager::load_stopword_config(const std::string& stopword_name, const nlohmann::json& stopwords_json) {
    std::unique_lock lock(mutex);

    // The const operator[] of nlohmann::json must not be used with a key that
    // may be absent, so both keys are looked up via find(). find() yields
    // end() when the key is missing or the value is not an object.
    const auto stopwords_it = stopwords_json.find("stopwords");
    if(stopwords_it == stopwords_json.end() || !stopwords_it->is_array()) {
        return;
    }

    std::string locale;
    const auto locale_it = stopwords_json.find("locale");
    if(locale_it != stopwords_json.end() && locale_it->is_string()) {
        locale = locale_it->get<std::string>();
    }

    std::vector<std::string> tokens;
    spp::sparse_hash_set<std::string> stopwords_set;

    for(const auto& stopword : *stopwords_it) {
        if(!stopword.is_string()) {
            // get<std::string>() would throw on a non-string value
            continue;
        }

        const std::string val = stopword.get<std::string>();
        Tokenizer(val, true, false, locale, {}, {}).tokenize(tokens);

        for(const auto& tok : tokens) {
            stopwords_set.emplace(tok);
        }

        // the token buffer is reused across entries
        tokens.clear();
    }

    stopword_configs[stopword_name] = stopwords_set;
}

View File

@ -30,24 +30,24 @@ protected:
TEST_F(StopwordsManagerTest, UpsertGetStopwords) {
auto stopwords1 = R"(
{"stopwords": ["america", "europe"]}
{"stopwords": ["america", "europe"], "locale": "en"}
)"_json;
auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1["stopwords"], "en");
auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1);
ASSERT_TRUE(upsert_op.ok());
auto stopwords2 = R"(
{"stopwords": ["a", "an", "the"]}
{"stopwords": ["a", "an", "the"], "locale": "en"}
)"_json;
upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2["stopwords"], "en");
upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2);
ASSERT_TRUE(upsert_op.ok());
auto stopwords3 = R"(
{"stopwords": ["India", "United States", "Japan", "China"]}
{"stopwords": ["India", "United States", "Japan", "China"], "locale": "en"}
)"_json;
upsert_op = stopwordsManager.upsert_stopword("countries", stopwords3["stopwords"], "en");
upsert_op = stopwordsManager.upsert_stopword("countries", stopwords3);
ASSERT_TRUE(upsert_op.ok());
auto stopword_config = stopwordsManager.get_stopwords();
@ -74,9 +74,9 @@ TEST_F(StopwordsManagerTest, UpsertGetStopwords) {
}
TEST_F(StopwordsManagerTest, GetStopword) {
auto stopwords = R"({"stopwords": ["a", "an", "the"]})"_json;
auto stopwords = R"({"stopwords": ["a", "an", "the"], "locale": "en"})"_json;
auto upsert_op = stopwordsManager.upsert_stopword("articles", stopwords["stopwords"], "en");
auto upsert_op = stopwordsManager.upsert_stopword("articles", stopwords);
ASSERT_TRUE(upsert_op.ok());
spp::sparse_hash_set<std::string> stopwords_set;
@ -94,9 +94,9 @@ TEST_F(StopwordsManagerTest, GetStopword) {
ASSERT_EQ("Stopword `country` not found.", get_op.error());
//try fetching stopwords with token
stopwords = R"({"stopwords": ["India", "United States", "Japan"]})"_json;
stopwords = R"({"stopwords": ["India", "United States", "Japan"], "locale": "en"})"_json;
upsert_op = stopwordsManager.upsert_stopword("country", stopwords["stopwords"], "en");
upsert_op = stopwordsManager.upsert_stopword("country", stopwords);
ASSERT_TRUE(upsert_op.ok());
get_op = stopwordsManager.get_stopword("country", stopwords_set);
@ -106,17 +106,17 @@ TEST_F(StopwordsManagerTest, GetStopword) {
TEST_F(StopwordsManagerTest, DeleteStopword) {
auto stopwords1 = R"(
{"stopwords": ["america", "europe"]}
{"stopwords": ["america", "europe"], "locale": "en"}
)"_json;
auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1["stopwords"], "en");
auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1);
ASSERT_TRUE(upsert_op.ok());
auto stopwords2 = R"(
{"stopwords": ["a", "an", "the"]}
{"stopwords": ["a", "an", "the"], "locale": "en"}
)"_json;
upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2["stopwords"], "en");
upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2);
ASSERT_TRUE(upsert_op.ok());
spp::sparse_hash_set<std::string> stopwords_set;
@ -138,11 +138,11 @@ TEST_F(StopwordsManagerTest, DeleteStopword) {
}
TEST_F(StopwordsManagerTest, UpdateStopword) {
auto stopwords = R"(
{"stopwords": ["america", "europe"]}
auto stopwords_json = R"(
{"stopwords": ["america", "europe"], "locale": "en"}
)"_json;
auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords["stopwords"], "en");
auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords_json);
ASSERT_TRUE(upsert_op.ok());
auto stopword_config = stopwordsManager.get_stopwords();
@ -152,10 +152,10 @@ TEST_F(StopwordsManagerTest, UpdateStopword) {
ASSERT_TRUE(stopword_config["continents"].find("europe") != stopword_config["continents"].end());
//adding new words with same name should replace the stopwords set
stopwords = R"(
{"stopwords": ["india", "china", "japan"]}
stopwords_json = R"(
{"stopwords": ["india", "china", "japan"], "locale": "en"}
)"_json;
upsert_op = stopwordsManager.upsert_stopword("continents", stopwords["stopwords"], "en");
upsert_op = stopwordsManager.upsert_stopword("continents", stopwords_json);
ASSERT_TRUE(upsert_op.ok());
stopword_config = stopwordsManager.get_stopwords();