Split / joined token should not be prefix searched.

2025-05-16 03:12:32 +08:00 · 2025-01-21 12:57:53 +05:30 · 2025-01-21 12:57:53 +05:30 · 3114efec08
commit 3114efec08
parent ec8331a9fb
2 changed files with 52 additions and 2 deletions
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -3513,8 +3513,10 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
        if(split_join_tokens == always || (all_result_ids_len == 0 && split_join_tokens == fallback)) {
            std::vector<std::vector<std::string>> space_resolved_queries;

+            std::vector<std::string> orig_q_include_tokens;
+
            for (size_t i = 0; i < num_search_fields; i++) {
-                std::vector<std::string> orig_q_include_tokens;
+                orig_q_include_tokens.clear();
                for(auto& q_include_token: field_query_tokens[i].q_include_tokens) {
                    orig_q_include_tokens.push_back(q_include_token.value);
                }
@@ -3532,7 +3534,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                std::vector<token_t> resolved_tokens;

                for(size_t j=0; j < resolved_query.size(); j++) {
-                    bool is_prefix = (j == resolved_query.size()-1);
+                    bool is_prefix = (j == resolved_query.size()-1 &&
+                                        orig_q_include_tokens.back() == resolved_query.back());
                    resolved_tokens.emplace_back(j, space_resolved_queries[0][j], is_prefix,
                                                 space_resolved_queries[0][j].size(), 0);
                }
--- a/test/collection_specific_test.cpp
+++ b/test/collection_specific_test.cpp
@@ -2193,6 +2193,53 @@ TEST_F(CollectionSpecificTest, SplitJoinTokenAlways) {
    collectionManager.drop_collection("coll1");
 }

+TEST_F(CollectionSpecificTest, SplitJoinTokenShouldNotBePrefixSearched) {
+    // Regression test: a query token produced by splitting/joining the typed tokens must be matched whole, not as a prefix.
+    std::vector<field> fields = {field("title", field_types::STRING, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();
+    // The ids asserted below ("0".."3") are expected to follow the insertion order of these four documents.
+    nlohmann::json doc;
+    doc["title"] = "Non stick cookware";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    doc["title"] = "Nonstick cookware";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    doc["title"] = "Non cookwareable";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    doc["title"] = "Non Scratchable Pottery";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    // NOTE(review): the positional args {0} and {true} presumably mean "no typos" and "prefix search on" — confirm against Collection::search.
+    // "cook ware" is presumably joined into "cookware"; the joined token must not prefix-match "cookwareable" (3rd doc)
+    auto results = coll1->search("cook ware", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 0).get();
+    ASSERT_EQ(2, results["hits"].size());
+    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
+    // a single joined word is presumably split into "non scratchable", so only the last document should be hit
+    results = coll1->search("nonscratchable", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 0).get();
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
+    // same word, now as the final query token following a regular token
+    results = coll1->search("pottery nonscratchable", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 0).get();
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
+    // "nonscratch" could only reach "Scratchable" through a prefix match on a split token, so no hits are expected
+    results = coll1->search("pottery nonscratch", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 0).get();
+    ASSERT_EQ(0, results["hits"].size());
+
+    results = coll1->search("nonscratch", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 0).get();
+    ASSERT_EQ(0, results["hits"].size());
+
+    // the last token "po" is used as typed (not split/joined), so it is still expected to prefix-match "Pottery"
+    results = coll1->search("nonscratchable po", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 0).get();
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
+
+    collectionManager.drop_collection("coll1");
+}
+
 TEST_F(CollectionSpecificTest, TokenCountOfWordsFarApart) {
    // word proximity is calculated using a moving window of X tokens. If only 1 token is present in the best matched
    // window, proximity ends up being perfect. So we've to ensure that scoring uses total tokens found and not