Synonyms should be able to belong to multiple sets.

2025-05-20 05:32:30 +08:00 · 2020-12-18 18:29:25 +05:30 · 2020-12-18 18:29:25 +05:30 · 90ac320c93
commit 90ac320c93
parent 8a310bbb82
3 changed files with 74 additions and 47 deletions
--- a/include/collection.h
+++ b/include/collection.h
@ -230,7 +230,7 @@ private:
    std::unordered_map<std::string, field> sort_schema;

    spp::sparse_hash_map<std::string, synonym_t> synonym_definitions;
-    spp::sparse_hash_map<uint64_t, std::string> synonym_index;
+    spp::sparse_hash_map<uint64_t, std::vector<std::string>> synonym_index;

    std::string default_sorting_field;

--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -1917,11 +1917,11 @@ Option<bool> Collection::add_synonym(const synonym_t& synonym) {

    if(!synonym.root.empty()) {
        uint64_t root_hash = synonym_t::get_hash(synonym.root);
-        synonym_index[root_hash] = synonym.id;
+        synonym_index[root_hash].emplace_back(synonym.id);
    } else {
        for(const auto & syn_tokens : synonym.synonyms) {
            uint64_t syn_hash = synonym_t::get_hash(syn_tokens);
-            synonym_index[syn_hash] = synonym.id;
+            synonym_index[syn_hash].emplace_back(synonym.id);
        }
    }

@ -1970,56 +1970,59 @@ void Collection::synonym_reduction_internal(const std::vector<std::string>& toke

            if(syn_itr != synonym_index.end() && processed_syn_hashes.count(syn_hash) == 0) {
                // tokens in this window match a synonym: reconstruct tokens and rerun synonym mapping against matches
-                const auto& syn_id = syn_itr->second;
-                const auto& syn_def = synonym_definitions[syn_id];
+                const auto& syn_ids = syn_itr->second;

-                for(const auto& syn_def_tokens: syn_def.synonyms) {
-                    std::vector<std::string> new_tokens;
+                for(const auto& syn_id: syn_ids) {
+                    const auto &syn_def = synonym_definitions[syn_id];

-                    for(size_t i = 0; i < start_index; i++) {
-                        new_tokens.push_back(tokens[i]);
-                    }
+                    for (const auto &syn_def_tokens: syn_def.synonyms) {
+                        std::vector<std::string> new_tokens;

-                    std::vector<uint64_t> syn_def_hashes;
-                    uint64_t syn_def_hash = 1;
-
-                    for(size_t i=0; i < syn_def_tokens.size(); i++) {
-                        const auto& syn_def_token = syn_def_tokens[i];
-                        new_tokens.push_back(syn_def_token);
-                        uint64_t token_hash = StringUtils::hash_wy(syn_def_token.c_str(),
-                                                                   syn_def_token.size());
-
-                        if(i == 0) {
-                            syn_def_hash = token_hash;
-                        } else {
-                            syn_def_hash = Index::hash_combine(syn_def_hash, token_hash);
+                        for (size_t i = 0; i < start_index; i++) {
+                            new_tokens.push_back(tokens[i]);
                        }

-                        syn_def_hashes.push_back(token_hash);
+                        std::vector<uint64_t> syn_def_hashes;
+                        uint64_t syn_def_hash = 1;
+
+                        for (size_t i = 0; i < syn_def_tokens.size(); i++) {
+                            const auto &syn_def_token = syn_def_tokens[i];
+                            new_tokens.push_back(syn_def_token);
+                            uint64_t token_hash = StringUtils::hash_wy(syn_def_token.c_str(),
+                                                                       syn_def_token.size());
+
+                            if (i == 0) {
+                                syn_def_hash = token_hash;
+                            } else {
+                                syn_def_hash = Index::hash_combine(syn_def_hash, token_hash);
+                            }
+
+                            syn_def_hashes.push_back(token_hash);
+                        }
+
+                        if (syn_def_hash == syn_hash) {
+                            // skip over token matching itself in the group
+                            continue;
+                        }
+
+                        for (size_t i = start_index + window_len; i < tokens.size(); i++) {
+                            new_tokens.push_back(tokens[i]);
+                        }
+
+                        processed_syn_hashes.emplace(syn_def_hash);
+                        processed_syn_hashes.emplace(syn_hash);
+
+                        for (uint64_t h: syn_def_hashes) {
+                            processed_syn_hashes.emplace(h);
+                        }
+
+                        for (uint64_t h: syn_hashes) {
+                            processed_syn_hashes.emplace(h);
+                        }
+
+                        recursed = true;
+                        synonym_reduction_internal(new_tokens, window_len, start_index, processed_syn_hashes, results);
                    }
-
-                    if(syn_def_hash == syn_hash) {
-                        // skip over token matching itself in the group
-                        continue;
-                    }
-
-                    for(size_t i = start_index+window_len; i < tokens.size(); i++) {
-                        new_tokens.push_back(tokens[i]);
-                    }
-
-                    processed_syn_hashes.emplace(syn_def_hash);
-                    processed_syn_hashes.emplace(syn_hash);
-
-                    for(uint64_t h: syn_def_hashes) {
-                        processed_syn_hashes.emplace(h);
-                    }
-
-                    for(uint64_t h: syn_hashes) {
-                        processed_syn_hashes.emplace(h);
-                    }
-
-                    recursed = true;
-                    synonym_reduction_internal(new_tokens, window_len, start_index, processed_syn_hashes, results);
                }
            }
        }
--- a/test/collection_synonyms_test.cpp
+++ b/test/collection_synonyms_test.cpp
@ -267,6 +267,30 @@ TEST_F(CollectionSynonymsTest, SynonymReductionMultiWay) {
    ASSERT_STREQ("states", results[3][0].c_str());
 }

+TEST_F(CollectionSynonymsTest, SynonymBelongingToMultipleSets) {
+    synonym_t synonym1{"iphone-synonyms", {}, {{"i", "phone"}, {"smart", "phone"}}};
+    synonym_t synonym2{"samsung-synonyms", {}, {{"smart", "phone"}, {"galaxy", "phone"}, {"samsung", "phone"}}};
+    coll_mul_fields->add_synonym(synonym1);
+    coll_mul_fields->add_synonym(synonym2);
+
+    std::vector<std::vector<std::string>> results;
+    coll_mul_fields->synonym_reduction({"smart", "phone"}, results);
+
+    ASSERT_EQ(3, results.size());
+    ASSERT_EQ(2, results[0].size());
+    ASSERT_EQ(2, results[1].size());
+    ASSERT_EQ(2, results[2].size());
+
+    ASSERT_STREQ("i", results[0][0].c_str());
+    ASSERT_STREQ("phone", results[0][1].c_str());
+
+    ASSERT_STREQ("galaxy", results[1][0].c_str());
+    ASSERT_STREQ("phone", results[1][1].c_str());
+
+    ASSERT_STREQ("samsung", results[2][0].c_str());
+    ASSERT_STREQ("phone", results[2][1].c_str());
+}
+
 TEST_F(CollectionSynonymsTest, OneWaySynonym) {
    nlohmann::json syn_json = {
        {"id", "syn-1"},