diff --git a/include/index.h b/include/index.h
index 32c8b5ad..ffec847c 100644
--- a/include/index.h
+++ b/include/index.h
@@ -612,7 +612,7 @@ private:
                               const std::string &field_name, nlohmann::json::iterator& array_iter,
                               bool is_array, bool& array_ele_erased);
 
-    bool common_results_exist(std::vector<art_leaf*>& leaves) const;
+    bool common_results_exist(std::vector<art_leaf*>& leaves, bool must_match_phrase) const;
 
 public:
     // for limiting number of results on multiple candidates / query rewrites
diff --git a/src/index.cpp b/src/index.cpp
index 0c666115..29289d41 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -4411,7 +4411,7 @@ void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const stri
         leaves.push_back(leaf);
     }
 
-    if(leaves.size() == qtokens.size() && common_results_exist(leaves)) {
+    if(leaves.size() == qtokens.size() && common_results_exist(leaves, false)) {
         return ;
     }
 
@@ -4457,7 +4457,7 @@ void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const stri
         leaves.push_back(leaf);
     }
 
-    if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves)) {
+    if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves, false)) {
         resolved_queries.push_back(candidate_tokens);
         return;
     }
@@ -4487,7 +4487,7 @@ void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const stri
                                                                          second_part.length() + 1));
             std::vector<art_leaf*> part_leaves = {first_leaf, second_leaf};
 
-            if(second_leaf != nullptr && common_results_exist(part_leaves)) {
+            if(second_leaf != nullptr && common_results_exist(part_leaves, true)) {
                 candidate_tokens.push_back(first_part);
                 candidate_tokens.push_back(second_part);
                 found_split = true;
@@ -4506,9 +4506,9 @@ void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const stri
 
     leaves.clear();
 
-    for(auto& token: candidate_tokens) {
-        art_leaf* leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) token.c_str(),
-                                                           token.length() + 1));
+    for(auto& candidate_token: candidate_tokens) {
+        art_leaf* leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) candidate_token.c_str(),
+                                                           candidate_token.length() + 1));
         if(leaf == nullptr) {
             break;
         }
@@ -4516,14 +4516,14 @@ void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const stri
         leaves.push_back(leaf);
     }
 
-    if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves)) {
+    if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves, false)) {
         resolved_queries.push_back(candidate_tokens);
         return;
     }
 }
 
-bool Index::common_results_exist(std::vector<art_leaf*>& leaves) const {
+bool Index::common_results_exist(std::vector<art_leaf*>& leaves, bool must_match_phrase) const {
     std::vector<uint32_t> result_ids;
     std::vector<void*> leaf_vals;
 
@@ -4532,7 +4532,23 @@ bool Index::common_results_exist(std::vector<art_leaf*>& leaves) const {
     }
 
     posting_t::intersect(leaf_vals, result_ids);
-    return !result_ids.empty();
+
+    if(result_ids.empty()) {
+        return false;
+    }
+
+    if(!must_match_phrase) {
+        return !result_ids.empty();
+    }
+
+    uint32_t* phrase_ids = new uint32_t[result_ids.size()];
+    size_t num_phrase_ids;
+
+    posting_t::get_phrase_matches(leaf_vals, false, &result_ids[0], result_ids.size(),
+                                  phrase_ids, num_phrase_ids);
+    bool phrase_exists = (num_phrase_ids != 0);
+    delete [] phrase_ids;
+    return phrase_exists;
 }
 
 /*
diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp
index 0fd2dde7..e9148d84 100644
--- a/test/collection_specific_test.cpp
+++ b/test/collection_specific_test.cpp
@@ -1916,7 +1916,7 @@ TEST_F(CollectionSpecificTest, DroppedTokensShouldNotBeUsedForPrefixSearch) {
     collectionManager.drop_collection("coll1");
 }
 
-TEST_F(CollectionSpecificTest, SearchShouldJoinToken) {
+TEST_F(CollectionSpecificTest, SearchShouldSplitAndJoinTokens) {
     // when the first document containing a token already cannot fit compact posting list
 
     std::vector<field> fields = {field("title", field_types::STRING, false),};
@@ -1932,6 +1932,20 @@
     results = coll1->search("pressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
     ASSERT_EQ(1, results["hits"].size());
 
+    results = coll1->search("nonstick pressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    ASSERT_EQ(1, results["hits"].size());
+
+    results = coll1->search("the pressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    ASSERT_EQ(1, results["hits"].size());
+
+    // splitting requires tokens to co-occur as a phrase in the dataset
+
+    results = coll1->search("the pressureis", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    ASSERT_EQ(0, results["hits"].size());
+
+    results = coll1->search("greatcooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    ASSERT_EQ(0, results["hits"].size());
+
     results = coll1->search("t h e", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
     ASSERT_EQ(1, results["hits"].size());