Merge pull request #1232 from krunal1313/MultipleSynonymSubstitution

adding test for multiple synonym substitution in query
This commit is contained in:
Kishore Nallan 2023-09-22 12:03:18 +05:30 committed by GitHub
commit 30f21d6d7d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 67 additions and 10 deletions

View File

@ -56,7 +56,8 @@ private:
size_t start_window_size,
size_t start_index_pos,
std::set<uint64_t>& processed_syn_hashes,
std::vector<std::vector<std::string>>& results) const;
std::vector<std::vector<std::string>>& results,
const std::vector<std::string>& orig_tokens) const;
public:

View File

@ -4,7 +4,8 @@
void SynonymIndex::synonym_reduction_internal(const std::vector<std::string>& tokens,
size_t start_window_size, size_t start_index_pos,
std::set<uint64_t>& processed_syn_hashes,
std::vector<std::vector<std::string>>& results) const {
std::vector<std::vector<std::string>>& results,
const std::vector<std::string>& orig_tokens) const {
bool recursed = false;
@ -59,11 +60,6 @@ void SynonymIndex::synonym_reduction_internal(const std::vector<std::string>& to
syn_def_hashes.push_back(token_hash);
}
if (syn_def_hash == syn_hash) {
// skip over token matching itself in the group
continue;
}
for (size_t i = start_index + window_len; i < tokens.size(); i++) {
new_tokens.push_back(tokens[i]);
}
@ -80,7 +76,8 @@ void SynonymIndex::synonym_reduction_internal(const std::vector<std::string>& to
}
recursed = true;
synonym_reduction_internal(new_tokens, window_len, start_index, processed_syn_hashes, results);
synonym_reduction_internal(new_tokens, window_len,
start_index, processed_syn_hashes, results, orig_tokens);
}
}
}
@ -90,7 +87,7 @@ void SynonymIndex::synonym_reduction_internal(const std::vector<std::string>& to
start_index_pos = 0;
}
if(!recursed && !processed_syn_hashes.empty()) {
if(!recursed && !processed_syn_hashes.empty() && tokens != orig_tokens) {
results.emplace_back(tokens);
}
}
@ -102,7 +99,7 @@ void SynonymIndex::synonym_reduction(const std::vector<std::string>& tokens,
}
std::set<uint64_t> processed_syn_hashes;
synonym_reduction_internal(tokens, tokens.size(), 0, processed_syn_hashes, results);
synonym_reduction_internal(tokens, tokens.size(), 0, processed_syn_hashes, results, tokens);
}
Option<bool> SynonymIndex::add_synonym(const std::string & collection_name, const synonym_t& synonym,

View File

@ -1008,3 +1008,62 @@ TEST_F(CollectionSynonymsTest, SynonymForKorean) {
res = coll1->search("구울", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 0).get();
ASSERT_EQ(3, res["hits"].size());
}
// Checks that a two-token query still finds the document when each token is
// swapped for another member of its synonym group: every combination of
// {blazer, suit} x {male, man} is expected to return the single indexed record.
TEST_F(CollectionSynonymsTest, MultipleSynonymSubstitution) {
    nlohmann::json schema = R"({
"name": "coll2",
"fields": [
{"name": "title", "type": "string"},
{"name": "gender", "type": "string"}
]
})"_json;

    auto create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(create_op.ok());
    Collection* coll2 = create_op.get();

    // Each row is {title, gender}; only one document is indexed.
    std::vector<std::vector<std::string>> rows = {
        {"Beautiful Blazer", "Male"},
    };

    // Document ids are the row positions rendered as strings ("0", "1", ...).
    size_t next_id = 0;
    for(const auto& row : rows) {
        nlohmann::json doc;
        doc["id"] = std::to_string(next_id);
        doc["title"] = row[0];
        doc["gender"] = row[1];

        auto insert_op = coll2->add(doc.dump());
        ASSERT_TRUE(insert_op.ok());
        ++next_id;
    }

    // Two independent synonym groups, one per queried field.
    nlohmann::json garment_synonyms = R"({
"id": "foobar",
"synonyms": ["blazer", "suit"]
})"_json;

    nlohmann::json gender_synonyms = R"({
"id": "foobar2",
"synonyms": ["male", "man"]
})"_json;

    ASSERT_TRUE(coll2->add_synonym(garment_synonyms).ok());
    ASSERT_TRUE(coll2->add_synonym(gender_synonyms).ok());

    // Runs the query against both fields with the same search arguments every
    // time and reports how many hits came back.
    const auto hits_for = [&coll2](const char* query) {
        auto res = coll2->search(query, {"title", "gender"}, "", {},
                                 {}, {0}, 10, 1, FREQUENCY, {true}, 0).get();
        return res["hits"].size();
    };

    // No substitution, one substitution on either side, then both substituted.
    ASSERT_EQ(1, hits_for("blazer male"));
    ASSERT_EQ(1, hits_for("blazer man"));
    ASSERT_EQ(1, hits_for("suit male"));
    ASSERT_EQ(1, hits_for("suit man"));
}