#pragma once #include #include "sparsepp.h" #include "json.hpp" #include "string_utils.h" #include "option.h" #include "tokenizer.h" #include "store.h" struct synonym_t { std::string id; std::vector root; std::vector> synonyms; synonym_t() = default; synonym_t(const std::string& id, const std::vector& root, const std::vector>& synonyms): id(id), root(root), synonyms(synonyms) { } explicit synonym_t(const nlohmann::json& synonym) { id = synonym["id"].get(); if(synonym.count("root") != 0) { root = synonym["root"].get>(); } synonyms = synonym["synonyms"].get>>(); } nlohmann::json to_json() const { nlohmann::json obj; obj["id"] = id; obj["root"] = root; obj["synonyms"] = synonyms; return obj; } nlohmann::json to_view_json() const { nlohmann::json obj; obj["id"] = id; obj["root"] = StringUtils::join(root, " "); obj["synonyms"] = nlohmann::json::array(); for(const auto& synonym: synonyms) { obj["synonyms"].push_back(StringUtils::join(synonym, " ")); } return obj; } static Option parse(const nlohmann::json& synonym_json, synonym_t& syn) { if(synonym_json.count("id") == 0) { return Option(400, "Missing `id` field."); } if(synonym_json.count("synonyms") == 0) { return Option(400, "Could not find an array of `synonyms`"); } if(synonym_json.count("root") != 0 && !synonym_json["root"].is_string()) { return Option(400, "Key `root` should be a string."); } if (!synonym_json["synonyms"].is_array() || synonym_json["synonyms"].empty()) { return Option(400, "Could not find an array of `synonyms`"); } for(const auto& synonym: synonym_json["synonyms"]) { if(!synonym.is_string() || synonym == "") { return Option(400, "Could not find a valid string array of `synonyms`"); } std::vector tokens; Tokenizer(synonym, true).tokenize(tokens); syn.synonyms.push_back(tokens); } if(synonym_json.count("root") != 0) { std::vector tokens; Tokenizer(synonym_json["root"], true).tokenize(tokens); syn.root = tokens; } syn.id = synonym_json["id"]; return Option(true); } static uint64_t get_hash(const std::vector& tokens) { uint64_t hash = 1; for(size_t i=0; i < tokens.size(); i++) { auto& token = tokens[i]; uint64_t token_hash = StringUtils::hash_wy(token.c_str(), token.size()); if(i == 0) { hash = token_hash; } else { hash = StringUtils::hash_combine(hash, token_hash); } } return hash; } }; class SynonymIndex { private: mutable std::shared_mutex mutex; Store* store; spp::sparse_hash_map synonym_definitions; spp::sparse_hash_map> synonym_index; void synonym_reduction_internal(const std::vector& tokens, size_t start_window_size, size_t start_index_pos, std::set& processed_syn_hashes, std::vector>& results) const; public: static constexpr const char* COLLECTION_SYNONYM_PREFIX = "$CY"; SynonymIndex(Store* store): store(store) { } static std::string get_synonym_key(const std::string & collection_name, const std::string & synonym_id); void synonym_reduction(const std::vector& tokens, std::vector>& results) const; spp::sparse_hash_map get_synonyms(); bool get_synonym(const std::string& id, synonym_t& synonym); Option add_synonym(const std::string & collection_name, const synonym_t& synonym); Option remove_synonym(const std::string & collection_name, const std::string & id); };