Split / joined token should not be prefix searched.

This commit is contained in:
Kishore Nallan 2025-01-21 12:57:53 +05:30
parent ec8331a9fb
commit 3114efec08
2 changed files with 52 additions and 2 deletions

@ -3513,8 +3513,10 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
if(split_join_tokens == always || (all_result_ids_len == 0 && split_join_tokens == fallback)) {
std::vector<std::vector<std::string>> space_resolved_queries;
std::vector<std::string> orig_q_include_tokens;
for (size_t i = 0; i < num_search_fields; i++) {
std::vector<std::string> orig_q_include_tokens;
orig_q_include_tokens.clear();
for(auto& q_include_token: field_query_tokens[i].q_include_tokens) {
orig_q_include_tokens.push_back(q_include_token.value);
}
@ -3532,7 +3534,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
std::vector<token_t> resolved_tokens;
for(size_t j=0; j < resolved_query.size(); j++) {
bool is_prefix = (j == resolved_query.size()-1);
bool is_prefix = (j == resolved_query.size()-1 &&
orig_q_include_tokens.back() == resolved_query.back());
resolved_tokens.emplace_back(j, space_resolved_queries[0][j], is_prefix,
space_resolved_queries[0][j].size(), 0);
}

@ -2193,6 +2193,53 @@ TEST_F(CollectionSpecificTest, SplitJoinTokenAlways) {
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, SplitJoinTokenShouldNotBePrefixSearched) {
    // Regression test: a query token that was produced by splitting or joining
    // the original query tokens must not be treated as a prefix when searching.
    std::vector<field> fields = {field("title", field_types::STRING, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();

    // Indexed in this order; the assertions below refer to the documents by
    // the ids "0" through "3", matching this insertion order.
    const std::vector<std::string> titles = {
        "Non stick cookware",
        "Nonstick cookware",
        "Non cookwareable",
        "Non Scratchable Pottery",
    };

    for(const auto& title: titles) {
        nlohmann::json doc;
        doc["title"] = title;
        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    // Every query in this test uses the same search arguments; only the query text varies.
    auto search_title = [&](const std::string& query) {
        return coll1->search(query, {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 0).get();
    };

    // "cook ware": only the two "cookware" documents are expected;
    // "cookwareable" should not match.
    auto results = search_title("cook ware");
    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());

    // The full word "nonscratchable" is expected to find "Non Scratchable Pottery",
    // both on its own and when it is the last token of a longer query.
    results = search_title("nonscratchable");
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());

    results = search_title("pottery nonscratchable");
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());

    // The partial word "nonscratch" is expected to find nothing, with or
    // without another token in front of it.
    results = search_title("pottery nonscratch");
    ASSERT_EQ(0, results["hits"].size());

    results = search_title("nonscratch");
    ASSERT_EQ(0, results["hits"].size());

    // prefix search on non-joined token should work
    results = search_title("nonscratchable po");
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, TokenCountOfWordsFarApart) {
// word proximity is calculated using a moving window of X tokens. If only 1 token is present in the best matched
// window, proximity ends up being perfect. So we've to ensure that scoring uses total tokens found and not