Mirror of https://github.com/typesense/typesense.git (synced 2025-05-22 06:40:30 +08:00)
load stopwords on startup

parent 368e5c1789
commit 7b90c8afc3
@@ -36,7 +36,9 @@ public:
     Option<bool> get_stopword(const std::string&, spp::sparse_hash_set<std::string>&) const;

-    Option<bool> upsert_stopword(const std::string&, const nlohmann::json&, const std::string&);
+    Option<bool> upsert_stopword(const std::string&, const nlohmann::json&);

     Option<bool> delete_stopword(const std::string&);
+
+    void load_stopword_config(const std::string&, const nlohmann::json&);
 };
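A note on the reshaped interface: upsert_stopword now takes the full JSON config and persists it, while the new load_stopword_config only rebuilds the in-memory token set. A minimal usage sketch under that reading of the diff ("common_articles" is a made-up set name, and the includes are assumed from the repo):

    // Sketch only; not code from this commit.
    // #include "stopwords_manager.h"
    void sketch_usage() {
        auto cfg = nlohmann::json::parse(R"({"stopwords": ["a", "an", "the"], "locale": "en"})");

        // Write path (API handler): persist the config and build the token set.
        Option<bool> op = StopwordsManager::get_instance().upsert_stopword("common_articles", cfg);

        // Startup path (CollectionManager::load): rebuild the token set from an
        // already-persisted config without writing to the store again.
        StopwordsManager::get_instance().load_stopword_config("common_articles", cfg);
    }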
@@ -8,6 +8,7 @@
 #include "batched_indexer.h"
 #include "logger.h"
 #include "magic_enum.hpp"
+#include "stopwords_manager.h"

 constexpr const size_t CollectionManager::DEFAULT_NUM_MEMORY_SHARDS;

@@ -285,6 +286,27 @@ Option<bool> CollectionManager::load(const size_t collection_batch_size, const s
         iter->Next();
     }

+    //load stopwords
+    std::string stopword_prefix_key = std::string(StopwordsManager::STOPWORD_PREFIX) + "_";
+    std::string stopword_upper_bound_key = std::string(StopwordsManager::STOPWORD_PREFIX) + "`"; // cannot inline this
+    rocksdb::Slice stopword_upper_bound(stopword_upper_bound_key);
+
+    iter = store->scan(stopword_prefix_key, &stopword_upper_bound);
+    while(iter->Valid() && iter->key().starts_with(stopword_prefix_key)) {
+        std::vector<std::string> parts;
+        std::string stopword_name = iter->key().ToString().substr(stopword_prefix_key.size());
+        nlohmann::json stopword_obj = nlohmann::json::parse(iter->value().ToString(), nullptr, false);
+
+        if(!stopword_obj.is_discarded() && stopword_obj.is_object()) {
+            StopwordsManager::get_instance().load_stopword_config(stopword_name, stopword_obj);
+        } else {
+            LOG(INFO) << "Invalid object for stopword " << stopword_name;
+        }
+
+        iter->Next();
+    }
+
     // restore query suggestions configs
     std::vector<std::string> analytics_config_jsons;
     store->scan_fill(AnalyticsManager::ANALYTICS_RULE_PREFIX,
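The backtick upper bound in the stopword scan above relies on ASCII ordering: '`' (0x60) is the character immediately after '_' (0x5F), so the half-open range [PREFIX + "_", PREFIX + "`") covers exactly the stopword keyspace and nothing else. A standalone sketch of that invariant (the real value of StopwordsManager::STOPWORD_PREFIX is not shown in this diff; "$SW" is only a placeholder):

    // Sketch only; illustrates the key-range trick, not code from this commit.
    #include <cassert>
    #include <string>

    int main() {
        const std::string prefix = "$SW";        // placeholder for StopwordsManager::STOPWORD_PREFIX
        const std::string lower  = prefix + "_"; // every stopword key starts with prefix + "_"
        const std::string upper  = prefix + "`"; // '`' (0x60) sorts immediately after '_' (0x5F)

        assert(lower < upper);                   // the range is non-empty
        assert(prefix + "_articles" < upper);    // any "<prefix>_<name>" key falls inside the bound
        assert(!(prefix + "a" < upper));         // keys outside the "_" namespace do not
        return 0;
    }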
@@ -1969,7 +1969,7 @@ bool put_upsert_stopword(const std::shared_ptr<http_req>& req, const std::shared
         return false;
     }

-    Option<bool> success_op = stopwordManager.upsert_stopword(stopword_name, req_json[STOPWORD_VALUES], req_json[STOPWORD_LOCALE]);
+    Option<bool> success_op = stopwordManager.upsert_stopword(stopword_name, req_json);
     if(!success_op.ok()) {
         res->set_500(success_op.error());
         return false;
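The handler now forwards the whole request body and lets StopwordsManager pull out the "stopwords" and "locale" keys itself. A sketch of the body shape, taken from the updated tests (the endpoint path and the STOPWORD_VALUES / STOPWORD_LOCALE constants are not shown in this diff):

    // Illustrative only; "articles" is a sample set name from the tests.
    auto req_json = nlohmann::json::parse(R"(
        {"stopwords": ["a", "an", "the"], "locale": "en"}
    )");
    // Option<bool> success_op = stopwordManager.upsert_stopword("articles", req_json);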
@@ -22,16 +22,18 @@ Option<bool> StopwordsManager::get_stopword(const std::string& stopword_name, sp
     return Option<bool>(404, "Stopword `" + stopword_name +"` not found.");
 }

-Option<bool> StopwordsManager::upsert_stopword(const std::string& stopword_name, const nlohmann::json& stopwords, const std::string& locale) {
+Option<bool> StopwordsManager::upsert_stopword(const std::string& stopword_name, const nlohmann::json& stopwords_json) {
     std::unique_lock lock(mutex);

-    bool inserted = store->insert(get_stopword_key(stopword_name), stopwords.dump());
+    bool inserted = store->insert(get_stopword_key(stopword_name), stopwords_json.dump());
     if(!inserted) {
         return Option<bool>(500, "Unable to insert into store.");
     }

     std::vector<std::string> tokens;
     spp::sparse_hash_set<std::string> stopwords_set;
+    const auto& stopwords = stopwords_json["stopwords"];
+    const auto& locale = stopwords_json["locale"];

     for (const auto &stopword: stopwords.items()) {
         const auto& val = stopword.value().get<std::string>();
@@ -65,3 +67,23 @@ Option<bool> StopwordsManager::delete_stopword(const std::string& stopword_name)
     stopword_configs.erase(stopword_name);
     return Option<bool>(true);
 }
+
+void StopwordsManager::load_stopword_config(const std::string& stopword_name, const nlohmann::json& stopwords_json) {
+    std::unique_lock lock(mutex);
+
+    std::vector<std::string> tokens;
+    spp::sparse_hash_set<std::string> stopwords_set;
+    const auto& stopwords = stopwords_json["stopwords"];
+    const auto& locale = stopwords_json["locale"];
+
+    for (const auto &stopword: stopwords.items()) {
+        const auto& val = stopword.value().get<std::string>();
+        Tokenizer(val, true, false, locale, {}, {}).tokenize(tokens);
+
+        for(const auto& tok : tokens) {
+            stopwords_set.emplace(tok);
+        }
+        tokens.clear();
+    }
+    stopword_configs[stopword_name] = stopwords_set;
+}
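For clarity, a hedged sketch of what load_stopword_config produces, mirroring the updated tests: each entry is run through the Tokenizer, so multi-word entries contribute one token apiece (this assumes the "en" tokenizer lowercases and splits on whitespace, which is how the tests read):

    // Sketch only; relies on the singleton accessor and get_stopword signature shown in this diff.
    auto countries = nlohmann::json::parse(R"(
        {"stopwords": ["India", "United States", "Japan"], "locale": "en"}
    )");
    StopwordsManager::get_instance().load_stopword_config("countries", countries);

    spp::sparse_hash_set<std::string> stopwords_set;
    auto get_op = StopwordsManager::get_instance().get_stopword("countries", stopwords_set);
    // Under the assumptions above, stopwords_set would hold: "india", "united", "states", "japan"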
@@ -30,24 +30,24 @@ protected:

 TEST_F(StopwordsManagerTest, UpsertGetStopwords) {
     auto stopwords1 = R"(
-        {"stopwords": ["america", "europe"]}
+        {"stopwords": ["america", "europe"], "locale": "en"}
     )"_json;

-    auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1["stopwords"], "en");
+    auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1);
     ASSERT_TRUE(upsert_op.ok());

     auto stopwords2 = R"(
-        {"stopwords": ["a", "an", "the"]}
+        {"stopwords": ["a", "an", "the"], "locale": "en"}
     )"_json;

-    upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2["stopwords"], "en");
+    upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2);
     ASSERT_TRUE(upsert_op.ok());

     auto stopwords3 = R"(
-        {"stopwords": ["India", "United States", "Japan", "China"]}
+        {"stopwords": ["India", "United States", "Japan", "China"], "locale": "en"}
     )"_json;

-    upsert_op = stopwordsManager.upsert_stopword("countries", stopwords3["stopwords"], "en");
+    upsert_op = stopwordsManager.upsert_stopword("countries", stopwords3);
     ASSERT_TRUE(upsert_op.ok());

     auto stopword_config = stopwordsManager.get_stopwords();
@@ -74,9 +74,9 @@ TEST_F(StopwordsManagerTest, UpsertGetStopwords) {
 }

 TEST_F(StopwordsManagerTest, GetStopword) {
-    auto stopwords = R"({"stopwords": ["a", "an", "the"]})"_json;
+    auto stopwords = R"({"stopwords": ["a", "an", "the"], "locale": "en"})"_json;

-    auto upsert_op = stopwordsManager.upsert_stopword("articles", stopwords["stopwords"], "en");
+    auto upsert_op = stopwordsManager.upsert_stopword("articles", stopwords);
     ASSERT_TRUE(upsert_op.ok());

     spp::sparse_hash_set<std::string> stopwords_set;
@@ -94,9 +94,9 @@ TEST_F(StopwordsManagerTest, GetStopword) {
     ASSERT_EQ("Stopword `country` not found.", get_op.error());

     //try fetching stopwords with token
-    stopwords = R"({"stopwords": ["India", "United States", "Japan"]})"_json;
+    stopwords = R"({"stopwords": ["India", "United States", "Japan"], "locale": "en"})"_json;

-    upsert_op = stopwordsManager.upsert_stopword("country", stopwords["stopwords"], "en");
+    upsert_op = stopwordsManager.upsert_stopword("country", stopwords);
     ASSERT_TRUE(upsert_op.ok());

     get_op = stopwordsManager.get_stopword("country", stopwords_set);
@@ -106,17 +106,17 @@ TEST_F(StopwordsManagerTest, GetStopword) {

 TEST_F(StopwordsManagerTest, DeleteStopword) {
     auto stopwords1 = R"(
-        {"stopwords": ["america", "europe"]}
+        {"stopwords": ["america", "europe"], "locale": "en"}
     )"_json;

-    auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1["stopwords"], "en");
+    auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords1);
     ASSERT_TRUE(upsert_op.ok());

     auto stopwords2 = R"(
-        {"stopwords": ["a", "an", "the"]}
+        {"stopwords": ["a", "an", "the"], "locale": "en"}
     )"_json;

-    upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2["stopwords"], "en");
+    upsert_op = stopwordsManager.upsert_stopword("articles", stopwords2);
     ASSERT_TRUE(upsert_op.ok());

     spp::sparse_hash_set<std::string> stopwords_set;
@@ -138,11 +138,11 @@ TEST_F(StopwordsManagerTest, DeleteStopword) {
 }

 TEST_F(StopwordsManagerTest, UpdateStopword) {
-    auto stopwords = R"(
-        {"stopwords": ["america", "europe"]}
+    auto stopwords_json = R"(
+        {"stopwords": ["america", "europe"], "locale": "en"}
     )"_json;

-    auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords["stopwords"], "en");
+    auto upsert_op = stopwordsManager.upsert_stopword("continents", stopwords_json);
     ASSERT_TRUE(upsert_op.ok());

     auto stopword_config = stopwordsManager.get_stopwords();
@@ -152,10 +152,10 @@ TEST_F(StopwordsManagerTest, UpdateStopword) {
     ASSERT_TRUE(stopword_config["continents"].find("europe") != stopword_config["continents"].end());

     //adding new words with same name should replace the stopwords set
-    stopwords = R"(
-        {"stopwords": ["india", "china", "japan"]}
+    stopwords_json = R"(
+        {"stopwords": ["india", "china", "japan"], "locale": "en"}
     )"_json;
-    upsert_op = stopwordsManager.upsert_stopword("continents", stopwords["stopwords"], "en");
+    upsert_op = stopwordsManager.upsert_stopword("continents", stopwords_json);
     ASSERT_TRUE(upsert_op.ok());

     stopword_config = stopwordsManager.get_stopwords();