mirror of
https://github.com/typesense/typesense.git
synced 2025-05-18 04:32:38 +08:00
Fix verbatim match on array.
This commit is contained in:
parent
fa607f0013
commit
a409df8dad
@ -700,10 +700,9 @@ public:
|
||||
static void concat_topster_ids(Topster* topster, spp::sparse_hash_map<uint64_t, std::vector<KV*>>& topster_ids);
|
||||
|
||||
int64_t score_results2(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
|
||||
const bool field_is_array, const uint32_t total_cost,
|
||||
const size_t field_id, const bool field_is_array, const uint32_t total_cost,
|
||||
int64_t& match_score,
|
||||
const uint32_t seq_id, const int sort_order[3],
|
||||
const size_t group_limit, const std::vector<std::string>& group_by_fields,
|
||||
const bool prioritize_exact_match,
|
||||
const bool single_exact_query_token,
|
||||
size_t num_query_tokens,
|
||||
|
@ -3118,9 +3118,9 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
|
||||
single_exact_query_token = true;
|
||||
}
|
||||
|
||||
score_results2(sort_fields, searched_queries.size(), field_is_array,
|
||||
score_results2(sort_fields, searched_queries.size(), fi, field_is_array,
|
||||
total_cost, field_match_score,
|
||||
seq_id, sort_order, group_limit, group_by_fields,
|
||||
seq_id, sort_order,
|
||||
prioritize_exact_match, single_exact_query_token,
|
||||
query_tokens.size(), syn_orig_num_tokens, token_postings);
|
||||
|
||||
@ -3543,9 +3543,8 @@ void Index::do_infix_search(const size_t num_search_fields, const std::vector<se
|
||||
auto seq_id = raw_infix_ids[i];
|
||||
|
||||
int64_t match_score = 0;
|
||||
score_results2(sort_fields, searched_queries.size(), field_is_array,
|
||||
0, match_score, seq_id, sort_order, group_limit, group_by_fields,
|
||||
false, false, 1, -1, {});
|
||||
score_results2(sort_fields, searched_queries.size(), field_id, field_is_array,
|
||||
0, match_score, seq_id, sort_order, false, false, 1, -1, {});
|
||||
|
||||
int64_t scores[3] = {0};
|
||||
int64_t match_score_index = 0;
|
||||
@ -3870,9 +3869,8 @@ void Index::search_wildcard(const std::vector<filter>& filters,
|
||||
const uint32_t seq_id = batch_result_ids[i];
|
||||
int64_t match_score = 0;
|
||||
|
||||
score_results2(sort_fields, (uint16_t) searched_queries.size(), false, 0,
|
||||
match_score, seq_id, sort_order, group_limit, group_by_fields, false,
|
||||
false, 1, -1, plists);
|
||||
score_results2(sort_fields, (uint16_t) searched_queries.size(), 0, false, 0,
|
||||
match_score, seq_id, sort_order, false, false, 1, -1, plists);
|
||||
|
||||
int64_t scores[3] = {0};
|
||||
int64_t match_score_index = 0;
|
||||
@ -4168,10 +4166,11 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect
|
||||
}
|
||||
|
||||
int64_t Index::score_results2(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
|
||||
const bool field_is_array, const uint32_t total_cost,
|
||||
const size_t field_id,
|
||||
const bool field_is_array,
|
||||
const uint32_t total_cost,
|
||||
int64_t& match_score,
|
||||
const uint32_t seq_id, const int sort_order[3],
|
||||
const size_t group_limit, const std::vector<std::string>& group_by_fields,
|
||||
const bool prioritize_exact_match,
|
||||
const bool single_exact_query_token,
|
||||
size_t num_query_tokens,
|
||||
@ -4183,8 +4182,8 @@ int64_t Index::score_results2(const std::vector<sort_by> & sort_fields, const ui
|
||||
|
||||
if (posting_lists.size() <= 1) {
|
||||
const uint8_t is_verbatim_match = uint8_t(
|
||||
prioritize_exact_match && single_exact_query_token &&
|
||||
posting_list_t::is_single_token_verbatim_match(posting_lists[0], field_is_array)
|
||||
prioritize_exact_match && single_exact_query_token &&
|
||||
posting_list_t::is_single_token_verbatim_match(posting_lists[0], field_is_array)
|
||||
);
|
||||
size_t words_present = (num_query_tokens == 1 && syn_orig_num_tokens != -1) ? syn_orig_num_tokens : 1;
|
||||
size_t distance = (num_query_tokens == 1 && syn_orig_num_tokens != -1) ? syn_orig_num_tokens-1 : 0;
|
||||
@ -4203,11 +4202,18 @@ int64_t Index::score_results2(const std::vector<sort_by> & sort_fields, const ui
|
||||
const Match &match = Match(seq_id, token_positions, false, prioritize_exact_match);
|
||||
uint64_t this_match_score = match.get_match_score(total_cost, posting_lists.size());
|
||||
|
||||
// Within a field, only a subset of query tokens can match (unique_words), but even a smaller set
|
||||
// might be available within the window used for proximity calculation (this_words_present)
|
||||
|
||||
auto this_words_present = ((this_match_score >> 24) & 0xFF);
|
||||
auto unique_words = field_is_array ? this_words_present : ((this_match_score >> 32) & 0xFF);
|
||||
auto typo_score = ((this_match_score >> 16) & 0xFF);
|
||||
auto proximity = ((this_match_score >> 8) & 0xFF);
|
||||
auto verbatim = (this_match_score & 0xFF);
|
||||
|
||||
// for array we have to compare with total query tokens to account for global context
|
||||
auto verbatim = field_is_array ?
|
||||
(this_match_score & 0xFF) && (int64_t)(num_query_tokens == this_words_present) :
|
||||
(this_match_score & 0xFF);
|
||||
|
||||
if(syn_orig_num_tokens != -1 && num_query_tokens == posting_lists.size()) {
|
||||
unique_words = syn_orig_num_tokens;
|
||||
@ -4215,9 +4221,6 @@ int64_t Index::score_results2(const std::vector<sort_by> & sort_fields, const ui
|
||||
proximity = 100 - (syn_orig_num_tokens - 1);
|
||||
}
|
||||
|
||||
// Within a field, only a subset of query tokens can match (unique_words), but even a smaller set
|
||||
// might be available within the window used for proximity calculation (this_words_present)
|
||||
|
||||
uint64_t mod_match_score = (
|
||||
(int64_t(this_words_present) << 32) |
|
||||
(int64_t(unique_words) << 24) |
|
||||
@ -4231,12 +4234,17 @@ int64_t Index::score_results2(const std::vector<sort_by> & sort_fields, const ui
|
||||
}
|
||||
|
||||
/*std::ostringstream os;
|
||||
os << "seq_id: " << seq_id
|
||||
os << "seq_id: " << seq_id << ", field_id: " << field_id
|
||||
<< ", this_words_present: " << this_words_present
|
||||
<< ", unique_words: " << unique_words
|
||||
<< ", typo_score: " << typo_score
|
||||
<< ", proximity: " << proximity
|
||||
<< ", verbatim: " << verbatim
|
||||
<< ", mod_match_score: " << mod_match_score
|
||||
<< ", token_positions: " << token_positions.size()
|
||||
<< ", num_query_tokens: " << num_query_tokens
|
||||
<< ", posting_lists.size: " << posting_lists.size()
|
||||
<< ", array_index: " << kv.first
|
||||
<< std::endl;
|
||||
LOG(INFO) << os.str();*/
|
||||
}
|
||||
|
@ -229,3 +229,33 @@ TEST_F(CollectionSpecificMoreTest, MatchedSegmentMoreImportantThanTotalMatches)
|
||||
ASSERT_EQ("2", results["hits"][1]["document"]["id"].get<std::string>());
|
||||
ASSERT_EQ("1", results["hits"][2]["document"]["id"].get<std::string>());
|
||||
}
|
||||
|
||||
TEST_F(CollectionSpecificMoreTest, VerbatimMatchNotOnPartialTokenMatch) {
|
||||
std::vector<field> fields = {field("title", field_types::STRING, false),
|
||||
field("tags", field_types::STRING_ARRAY, false)};
|
||||
|
||||
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();
|
||||
|
||||
nlohmann::json doc1;
|
||||
doc1["id"] = "0";
|
||||
doc1["title"] = "Thirteen Fourteen";
|
||||
doc1["tags"] = {"foo", "bar", "Hundred", "Thirteen Fourteen"};
|
||||
|
||||
nlohmann::json doc2;
|
||||
doc2["id"] = "1";
|
||||
doc2["title"] = "One Eleven Thirteen Fourteen Three";
|
||||
doc2["tags"] = {"foo", "bar", "Hundred", "One Eleven Thirteen Fourteen Three"};
|
||||
|
||||
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
|
||||
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
|
||||
|
||||
auto results = coll1->search("hundred thirteen fourteen", {"tags"},
|
||||
"", {}, {}, {2}, 10,
|
||||
1, FREQUENCY, {true},
|
||||
1, spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 5, {}, {}, {}, 0,
|
||||
"<mark>", "</mark>", {}, 1000, true).get();
|
||||
|
||||
ASSERT_EQ(2, results["hits"].size());
|
||||
ASSERT_EQ(results["hits"][0]["text_match"].get<size_t>(), results["hits"][1]["text_match"].get<size_t>());
|
||||
}
|
||||
|
@ -3687,8 +3687,7 @@ TEST_F(CollectionTest, MultiFieldMatchRankingOnArray) {
|
||||
ASSERT_EQ(2, results["found"].get<size_t>());
|
||||
ASSERT_EQ(2, results["hits"].size());
|
||||
|
||||
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
|
||||
ASSERT_EQ(results["hits"][0]["text_match"].get<size_t>(), results["hits"][0]["text_match"].get<size_t>());
|
||||
|
||||
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user