mirror of
https://github.com/typesense/typesense.git
synced 2025-05-23 15:23:40 +08:00
Split tokens should require co-occurrence.
This commit is contained in:
parent
f4d3ecc84e
commit
36382ed3f1
@ -612,7 +612,7 @@ private:
|
||||
const std::string &field_name,
|
||||
nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);
|
||||
|
||||
bool common_results_exist(std::vector<art_leaf*>& leaves) const;
|
||||
bool common_results_exist(std::vector<art_leaf*>& leaves, bool must_match_phrase) const;
|
||||
|
||||
public:
|
||||
// for limiting number of results on multiple candidates / query rewrites
|
||||
|
@ -4411,7 +4411,7 @@ void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const stri
|
||||
leaves.push_back(leaf);
|
||||
}
|
||||
|
||||
if(leaves.size() == qtokens.size() && common_results_exist(leaves)) {
|
||||
if(leaves.size() == qtokens.size() && common_results_exist(leaves, false)) {
|
||||
return ;
|
||||
}
|
||||
|
||||
@ -4457,7 +4457,7 @@ void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const stri
|
||||
leaves.push_back(leaf);
|
||||
}
|
||||
|
||||
if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves)) {
|
||||
if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves, false)) {
|
||||
resolved_queries.push_back(candidate_tokens);
|
||||
return;
|
||||
}
|
||||
@ -4487,7 +4487,7 @@ void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const stri
|
||||
second_part.length() + 1));
|
||||
|
||||
std::vector<art_leaf*> part_leaves = {first_leaf, second_leaf};
|
||||
if(second_leaf != nullptr && common_results_exist(part_leaves)) {
|
||||
if(second_leaf != nullptr && common_results_exist(part_leaves, true)) {
|
||||
candidate_tokens.push_back(first_part);
|
||||
candidate_tokens.push_back(second_part);
|
||||
found_split = true;
|
||||
@ -4506,9 +4506,9 @@ void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const stri
|
||||
|
||||
leaves.clear();
|
||||
|
||||
for(auto& token: candidate_tokens) {
|
||||
art_leaf* leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) token.c_str(),
|
||||
token.length() + 1));
|
||||
for(auto& candidate_token: candidate_tokens) {
|
||||
art_leaf* leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) candidate_token.c_str(),
|
||||
candidate_token.length() + 1));
|
||||
if(leaf == nullptr) {
|
||||
break;
|
||||
}
|
||||
@ -4516,14 +4516,14 @@ void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const stri
|
||||
leaves.push_back(leaf);
|
||||
}
|
||||
|
||||
if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves)) {
|
||||
if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves, false)) {
|
||||
resolved_queries.push_back(candidate_tokens);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool Index::common_results_exist(std::vector<art_leaf*>& leaves) const {
|
||||
bool Index::common_results_exist(std::vector<art_leaf*>& leaves, bool must_match_phrase) const {
|
||||
std::vector<uint32_t> result_ids;
|
||||
std::vector<void*> leaf_vals;
|
||||
|
||||
@ -4532,7 +4532,23 @@ bool Index::common_results_exist(std::vector<art_leaf*>& leaves) const {
|
||||
}
|
||||
|
||||
posting_t::intersect(leaf_vals, result_ids);
|
||||
return !result_ids.empty();
|
||||
|
||||
if(result_ids.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!must_match_phrase) {
|
||||
return !result_ids.empty();
|
||||
}
|
||||
|
||||
uint32_t* phrase_ids = new uint32_t[result_ids.size()];
|
||||
size_t num_phrase_ids;
|
||||
|
||||
posting_t::get_phrase_matches(leaf_vals, false, &result_ids[0], result_ids.size(),
|
||||
phrase_ids, num_phrase_ids);
|
||||
bool phrase_exists = (num_phrase_ids != 0);
|
||||
delete [] phrase_ids;
|
||||
return phrase_exists;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1916,7 +1916,7 @@ TEST_F(CollectionSpecificTest, DroppedTokensShouldNotBeUsedForPrefixSearch) {
|
||||
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
|
||||
TEST_F(CollectionSpecificTest, SearchShouldJoinToken) {
|
||||
TEST_F(CollectionSpecificTest, SearchShouldSplitAndJoinTokens) {
|
||||
// when the first document containing a token already cannot fit in a compact posting list
|
||||
std::vector<field> fields = {field("title", field_types::STRING, false),};
|
||||
|
||||
@ -1932,6 +1932,20 @@ TEST_F(CollectionSpecificTest, SearchShouldJoinToken) {
|
||||
results = coll1->search("pressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
|
||||
ASSERT_EQ(1, results["hits"].size());
|
||||
|
||||
results = coll1->search("nonstick pressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
|
||||
ASSERT_EQ(1, results["hits"].size());
|
||||
|
||||
results = coll1->search("the pressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
|
||||
ASSERT_EQ(1, results["hits"].size());
|
||||
|
||||
// splitting requires tokens to co-occur as a phrase in the dataset
|
||||
|
||||
results = coll1->search("the pressureis", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
|
||||
ASSERT_EQ(0, results["hits"].size());
|
||||
|
||||
results = coll1->search("greatcooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
|
||||
ASSERT_EQ(0, results["hits"].size());
|
||||
|
||||
results = coll1->search("t h e", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
|
||||
ASSERT_EQ(1, results["hits"].size());
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user