diff --git a/include/collection.h b/include/collection.h
index f8111013..5f46acc5 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -230,7 +230,7 @@ private:
     std::unordered_map<std::string, field> sort_schema;
 
     spp::sparse_hash_map<std::string, synonym_t> synonym_definitions;
 
-    spp::sparse_hash_map<uint64_t, std::string> synonym_index;
+    spp::sparse_hash_map<uint64_t, std::vector<std::string>> synonym_index;
 
     std::string default_sorting_field;
diff --git a/src/collection.cpp b/src/collection.cpp
index 57c053a0..de795e33 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -1917,11 +1917,11 @@ Option<bool> Collection::add_synonym(const synonym_t& synonym) {
 
     if(!synonym.root.empty()) {
         uint64_t root_hash = synonym_t::get_hash(synonym.root);
-        synonym_index[root_hash] = synonym.id;
+        synonym_index[root_hash].emplace_back(synonym.id);
     } else {
         for(const auto & syn_tokens : synonym.synonyms) {
             uint64_t syn_hash = synonym_t::get_hash(syn_tokens);
-            synonym_index[syn_hash] = synonym.id;
+            synonym_index[syn_hash].emplace_back(synonym.id);
         }
     }
 
@@ -1970,56 +1970,59 @@ void Collection::synonym_reduction_internal(const std::vector<std::string>& tokens,
 
         if(syn_itr != synonym_index.end() && processed_syn_hashes.count(syn_hash) == 0) {
             // tokens in this window match a synonym: reconstruct tokens and rerun synonym mapping against matches
-            const auto& syn_id = syn_itr->second;
-            const auto& syn_def = synonym_definitions[syn_id];
+            const auto& syn_ids = syn_itr->second;
 
-            for(const auto& syn_def_tokens: syn_def.synonyms) {
-                std::vector<std::string> new_tokens;
+            for(const auto& syn_id: syn_ids) {
+                const auto &syn_def = synonym_definitions[syn_id];
 
-                for(size_t i = 0; i < start_index; i++) {
-                    new_tokens.push_back(tokens[i]);
-                }
+                for (const auto &syn_def_tokens: syn_def.synonyms) {
+                    std::vector<std::string> new_tokens;
 
-                std::vector<uint64_t> syn_def_hashes;
-                uint64_t syn_def_hash = 1;
-
-                for(size_t i=0; i < syn_def_tokens.size(); i++) {
-                    const auto& syn_def_token = syn_def_tokens[i];
-                    new_tokens.push_back(syn_def_token);
-                    uint64_t token_hash = StringUtils::hash_wy(syn_def_token.c_str(),
-                                                               syn_def_token.size());
-
-                    if(i == 0) {
-                        syn_def_hash = token_hash;
-                    } else {
-                        syn_def_hash = Index::hash_combine(syn_def_hash, token_hash);
+                    for (size_t i = 0; i < start_index; i++) {
+                        new_tokens.push_back(tokens[i]);
                     }
 
-                    syn_def_hashes.push_back(token_hash);
+                    std::vector<uint64_t> syn_def_hashes;
+                    uint64_t syn_def_hash = 1;
+
+                    for (size_t i = 0; i < syn_def_tokens.size(); i++) {
+                        const auto &syn_def_token = syn_def_tokens[i];
+                        new_tokens.push_back(syn_def_token);
+                        uint64_t token_hash = StringUtils::hash_wy(syn_def_token.c_str(),
+                                                                   syn_def_token.size());
+
+                        if (i == 0) {
+                            syn_def_hash = token_hash;
+                        } else {
+                            syn_def_hash = Index::hash_combine(syn_def_hash, token_hash);
+                        }
+
+                        syn_def_hashes.push_back(token_hash);
+                    }
+
+                    if (syn_def_hash == syn_hash) {
+                        // skip over token matching itself in the group
+                        continue;
+                    }
+
+                    for (size_t i = start_index + window_len; i < tokens.size(); i++) {
+                        new_tokens.push_back(tokens[i]);
+                    }
+
+                    processed_syn_hashes.emplace(syn_def_hash);
+                    processed_syn_hashes.emplace(syn_hash);
+
+                    for (uint64_t h: syn_def_hashes) {
+                        processed_syn_hashes.emplace(h);
+                    }
+
+                    for (uint64_t h: syn_hashes) {
+                        processed_syn_hashes.emplace(h);
+                    }
+
+                    recursed = true;
+                    synonym_reduction_internal(new_tokens, window_len, start_index, processed_syn_hashes, results);
                 }
-
-                if(syn_def_hash == syn_hash) {
-                    // skip over token matching itself in the group
-                    continue;
-                }
-
-                for(size_t i = start_index+window_len; i < tokens.size(); i++) {
-                    new_tokens.push_back(tokens[i]);
-                }
-
-                processed_syn_hashes.emplace(syn_def_hash);
-                processed_syn_hashes.emplace(syn_hash);
-
-                for(uint64_t h: syn_def_hashes) {
-                    processed_syn_hashes.emplace(h);
-                }
-
-                for(uint64_t h: syn_hashes) {
-                    processed_syn_hashes.emplace(h);
-                }
-
-                recursed = true;
-                synonym_reduction_internal(new_tokens, window_len, start_index, processed_syn_hashes, results);
             }
         }
     }
diff --git a/test/collection_synonyms_test.cpp b/test/collection_synonyms_test.cpp
index 539b1c6c..4b1a973d 100644
--- a/test/collection_synonyms_test.cpp
+++ b/test/collection_synonyms_test.cpp
@@ -267,6 +267,30 @@ TEST_F(CollectionSynonymsTest, SynonymReductionMultiWay) {
     ASSERT_STREQ("states", results[3][0].c_str());
 }
 
+TEST_F(CollectionSynonymsTest, SynonymBelongingToMultipleSets) {
+    synonym_t synonym1{"iphone-synonyms", {}, {{"i", "phone"}, {"smart", "phone"}}};
+    synonym_t synonym2{"samsung-synonyms", {}, {{"smart", "phone"}, {"galaxy", "phone"}, {"samsung", "phone"}}};
+    coll_mul_fields->add_synonym(synonym1);
+    coll_mul_fields->add_synonym(synonym2);
+
+    std::vector<std::vector<std::string>> results;
+    coll_mul_fields->synonym_reduction({"smart", "phone"}, results);
+
+    ASSERT_EQ(3, results.size());
+    ASSERT_EQ(2, results[0].size());
+    ASSERT_EQ(2, results[1].size());
+    ASSERT_EQ(2, results[2].size());
+
+    ASSERT_STREQ("i", results[0][0].c_str());
+    ASSERT_STREQ("phone", results[0][1].c_str());
+
+    ASSERT_STREQ("galaxy", results[1][0].c_str());
+    ASSERT_STREQ("phone", results[1][1].c_str());
+
+    ASSERT_STREQ("samsung", results[2][0].c_str());
+    ASSERT_STREQ("phone", results[2][1].c_str());
+}
+
 TEST_F(CollectionSynonymsTest, OneWaySynonym) {
     nlohmann::json syn_json = {
         {"id", "syn-1"},