mirror of
https://github.com/typesense/typesense.git
synced 2025-05-21 06:02:26 +08:00
Fixed an edge case of exact match across multiple fields.
This commit is contained in:
parent
27ddda5792
commit
09e2e62312
@ -1663,7 +1663,7 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
|
||||
|
||||
uint32_t token_bits = (uint32_t(1) << 31); // topmost bit set to guarantee at least 1 bit set
|
||||
uint64_t total_typos = 0, total_distances = 0, exact_matches = 0;
|
||||
uint64_t num_exact_matches = 0;
|
||||
uint64_t num_query_matches = 0;
|
||||
|
||||
//LOG(INFO) << "Init pop count: " << __builtin_popcount(token_bits);
|
||||
|
||||
@ -1687,7 +1687,7 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
|
||||
exact_matches += (((match_score & 0xFF)) + 1) * weight;
|
||||
|
||||
if(field_typos == 0 && tokens_found == field_query_tokens[i].q_include_tokens.size()) {
|
||||
num_exact_matches++;
|
||||
num_query_matches++;
|
||||
}
|
||||
|
||||
/*LOG(INFO) << "seq_id: " << seq_id << ", total_typos: " << (255 - ((match_score >> 8) & 0xFF))
|
||||
@ -1733,14 +1733,15 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
|
||||
|
||||
if(words_present != 0) {
|
||||
uint64_t match_score = Match::get_match_score(words_present, 0, 0);
|
||||
total_distances += ((100 - (match_score & 0xFF)) + 1) * weight;
|
||||
|
||||
uint64_t tokens_found = ((match_score >> 16) & 0xFF);
|
||||
uint64_t field_typos = 255 - ((match_score >> 8) & 0xFF);
|
||||
uint64_t tokens_found = ((match_score >> 24) & 0xFF);
|
||||
uint64_t field_typos = 255 - ((match_score >> 16) & 0xFF);
|
||||
total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * weight;
|
||||
total_typos += (field_typos + 1) * weight;
|
||||
|
||||
if(field_typos == 0 && tokens_found == field_query_tokens[i].q_include_tokens.size()) {
|
||||
num_exact_matches++;
|
||||
num_query_matches++;
|
||||
exact_matches++;
|
||||
}
|
||||
//LOG(INFO) << "seq_id: " << seq_id << ", total_typos: " << ((match_score >> 8) & 0xFF);
|
||||
}
|
||||
@ -1751,10 +1752,11 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
|
||||
total_distances = std::min<uint64_t>(100, total_distances);
|
||||
|
||||
uint64_t aggregated_score = (
|
||||
(num_exact_matches << 24) |
|
||||
(tokens_present << 16) |
|
||||
((255 - total_typos) << 8) |
|
||||
(100 - total_distances)
|
||||
(num_query_matches << 32) |
|
||||
(tokens_present << 24) |
|
||||
((255 - total_typos) << 16) |
|
||||
((100 - total_distances) << 8) |
|
||||
(exact_matches)
|
||||
);
|
||||
|
||||
/*LOG(INFO) << "seq id: " << seq_id << ", tokens_present: " << tokens_present
|
||||
|
@ -737,7 +737,7 @@ TEST_F(CollectionTest, ArrayStringFieldHighlight) {
|
||||
ASSERT_EQ(2, results["hits"].size());
|
||||
ASSERT_EQ(2, results["hits"][0]["highlights"].size());
|
||||
|
||||
ids = {"0", "1"};
|
||||
ids = {"1", "0"};
|
||||
|
||||
for (size_t i = 0; i < results["hits"].size(); i++) {
|
||||
nlohmann::json result = results["hits"].at(i);
|
||||
@ -746,43 +746,43 @@ TEST_F(CollectionTest, ArrayStringFieldHighlight) {
|
||||
ASSERT_STREQ(id.c_str(), result_id.c_str());
|
||||
}
|
||||
|
||||
ASSERT_EQ(3, results["hits"][0]["highlights"][0].size());
|
||||
ASSERT_STREQ("title", results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("The <mark>Truth</mark> About Forever", results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
|
||||
ASSERT_EQ(1, results["hits"][0]["highlights"][0]["matched_tokens"].size());
|
||||
ASSERT_STREQ("Truth", results["hits"][0]["highlights"][0]["matched_tokens"][0].get<std::string>().c_str());
|
||||
ASSERT_EQ(4, results["hits"][0]["highlights"][0].size());
|
||||
ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str(), "tags");
|
||||
ASSERT_EQ(2, results["hits"][0]["highlights"][0]["snippets"].size());
|
||||
ASSERT_STREQ("<mark>truth</mark>", results["hits"][0]["highlights"][0]["snippets"][0].get<std::string>().c_str());
|
||||
ASSERT_STREQ("plain <mark>truth</mark>", results["hits"][0]["highlights"][0]["snippets"][1].get<std::string>().c_str());
|
||||
ASSERT_EQ(2, results["hits"][0]["highlights"][0]["matched_tokens"].size());
|
||||
ASSERT_STREQ("truth", results["hits"][0]["highlights"][0]["matched_tokens"][0][0].get<std::string>().c_str());
|
||||
ASSERT_STREQ("truth", results["hits"][0]["highlights"][0]["matched_tokens"][1][0].get<std::string>().c_str());
|
||||
ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"].size());
|
||||
ASSERT_EQ(1, results["hits"][0]["highlights"][0]["indices"][0]);
|
||||
ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"][1]);
|
||||
|
||||
ASSERT_EQ(4, results["hits"][0]["highlights"][1].size());
|
||||
ASSERT_STREQ(results["hits"][0]["highlights"][1]["field"].get<std::string>().c_str(), "tags");
|
||||
ASSERT_EQ(2, results["hits"][0]["highlights"][1]["snippets"].size());
|
||||
ASSERT_STREQ("the <mark>truth</mark>", results["hits"][0]["highlights"][1]["snippets"][0].get<std::string>().c_str());
|
||||
ASSERT_STREQ("<mark>truth</mark> about", results["hits"][0]["highlights"][1]["snippets"][1].get<std::string>().c_str());
|
||||
ASSERT_EQ(3, results["hits"][0]["highlights"][1].size());
|
||||
ASSERT_STREQ("title", results["hits"][0]["highlights"][1]["field"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("Plain <mark>Truth</mark>", results["hits"][0]["highlights"][1]["snippet"].get<std::string>().c_str());
|
||||
ASSERT_EQ(1, results["hits"][0]["highlights"][1]["matched_tokens"].size());
|
||||
ASSERT_STREQ("Truth", results["hits"][0]["highlights"][1]["matched_tokens"][0].get<std::string>().c_str());
|
||||
|
||||
ASSERT_EQ(2, results["hits"][0]["highlights"][1]["matched_tokens"].size());
|
||||
ASSERT_STREQ("truth", results["hits"][0]["highlights"][1]["matched_tokens"][0][0].get<std::string>().c_str());
|
||||
ASSERT_STREQ("truth", results["hits"][0]["highlights"][1]["matched_tokens"][1][0].get<std::string>().c_str());
|
||||
ASSERT_EQ(3, results["hits"][1]["highlights"][0].size());
|
||||
ASSERT_STREQ("title", results["hits"][1]["highlights"][0]["field"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("The <mark>Truth</mark> About Forever", results["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
|
||||
ASSERT_EQ(1, results["hits"][1]["highlights"][0]["matched_tokens"].size());
|
||||
ASSERT_STREQ("Truth", results["hits"][1]["highlights"][0]["matched_tokens"][0].get<std::string>().c_str());
|
||||
|
||||
ASSERT_EQ(2, results["hits"][0]["highlights"][1]["indices"].size());
|
||||
ASSERT_EQ(0, results["hits"][0]["highlights"][1]["indices"][0]);
|
||||
ASSERT_EQ(2, results["hits"][0]["highlights"][1]["indices"][1]);
|
||||
ASSERT_EQ(4, results["hits"][1]["highlights"][1].size());
|
||||
ASSERT_STREQ(results["hits"][1]["highlights"][1]["field"].get<std::string>().c_str(), "tags");
|
||||
ASSERT_EQ(2, results["hits"][1]["highlights"][1]["snippets"].size());
|
||||
ASSERT_STREQ("the <mark>truth</mark>", results["hits"][1]["highlights"][1]["snippets"][0].get<std::string>().c_str());
|
||||
ASSERT_STREQ("<mark>truth</mark> about", results["hits"][1]["highlights"][1]["snippets"][1].get<std::string>().c_str());
|
||||
|
||||
ASSERT_EQ(4, results["hits"][1]["highlights"][0].size());
|
||||
ASSERT_STREQ(results["hits"][1]["highlights"][0]["field"].get<std::string>().c_str(), "tags");
|
||||
ASSERT_EQ(2, results["hits"][1]["highlights"][0]["snippets"].size());
|
||||
ASSERT_STREQ("<mark>truth</mark>", results["hits"][1]["highlights"][0]["snippets"][0].get<std::string>().c_str());
|
||||
ASSERT_STREQ("plain <mark>truth</mark>", results["hits"][1]["highlights"][0]["snippets"][1].get<std::string>().c_str());
|
||||
ASSERT_EQ(2, results["hits"][1]["highlights"][0]["matched_tokens"].size());
|
||||
ASSERT_STREQ("truth", results["hits"][1]["highlights"][0]["matched_tokens"][0][0].get<std::string>().c_str());
|
||||
ASSERT_STREQ("truth", results["hits"][1]["highlights"][0]["matched_tokens"][1][0].get<std::string>().c_str());
|
||||
ASSERT_EQ(2, results["hits"][1]["highlights"][0]["indices"].size());
|
||||
ASSERT_EQ(1, results["hits"][1]["highlights"][0]["indices"][0]);
|
||||
ASSERT_EQ(2, results["hits"][1]["highlights"][0]["indices"][1]);
|
||||
ASSERT_EQ(2, results["hits"][1]["highlights"][1]["matched_tokens"].size());
|
||||
ASSERT_STREQ("truth", results["hits"][1]["highlights"][1]["matched_tokens"][0][0].get<std::string>().c_str());
|
||||
ASSERT_STREQ("truth", results["hits"][1]["highlights"][1]["matched_tokens"][1][0].get<std::string>().c_str());
|
||||
|
||||
ASSERT_EQ(3, results["hits"][1]["highlights"][1].size());
|
||||
ASSERT_STREQ("title", results["hits"][1]["highlights"][1]["field"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("Plain <mark>Truth</mark>", results["hits"][1]["highlights"][1]["snippet"].get<std::string>().c_str());
|
||||
ASSERT_EQ(1, results["hits"][1]["highlights"][1]["matched_tokens"].size());
|
||||
ASSERT_STREQ("Truth", results["hits"][1]["highlights"][1]["matched_tokens"][0].get<std::string>().c_str());
|
||||
ASSERT_EQ(2, results["hits"][1]["highlights"][1]["indices"].size());
|
||||
ASSERT_EQ(0, results["hits"][1]["highlights"][1]["indices"][0]);
|
||||
ASSERT_EQ(2, results["hits"][1]["highlights"][1]["indices"][1]);
|
||||
|
||||
// highlight fields must be ordered based on match score
|
||||
results = coll_array_text->search("amazing movie", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
|
||||
@ -3085,6 +3085,50 @@ TEST_F(CollectionTest, MultiFieldRelevance5) {
|
||||
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
|
||||
// Regression test for ranking when the same query matches in more than one field.
// Two documents share the artist "Taylor Swift"; only document "0" also has a title
// that is exactly "Taylor Swift" (document "1" has the longer title "Taylor Swift Song").
// Searching "taylor swift" over {title, artist} must return document "0" before "1".
TEST_F(CollectionTest, MultiFieldRelevance6) {
|
||||
// with exact match preference: the document whose title equals the query is expected first
|
||||
Collection *coll1;
|
||||
|
||||
// Schema: two string fields that are searched, plus an int32 "points" field.
std::vector<field> fields = {field("title", field_types::STRING, false),
|
||||
field("artist", field_types::STRING, false),
|
||||
field("points", field_types::INT32, false),};
|
||||
|
||||
// Reuse "coll1" if it already exists, otherwise create it.
coll1 = collectionManager.get_collection("coll1").get();
|
||||
if(coll1 == nullptr) {
|
||||
// NOTE(review): "points" is presumably the default sorting field -- confirm against create_collection().
coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
|
||||
}
|
||||
|
||||
// Each record is {title, artist}.
std::vector<std::vector<std::string>> records = {
|
||||
{"Taylor Swift", "Taylor Swift"},
|
||||
{"Taylor Swift Song", "Taylor Swift"},
|
||||
};
|
||||
|
||||
// Index the records; the loop index doubles as the document id and as its "points" value,
// so document "1" carries more points than document "0".
for(size_t i=0; i<records.size(); i++) {
|
||||
nlohmann::json doc;
|
||||
|
||||
doc["id"] = std::to_string(i);
|
||||
doc["title"] = records[i][0];
|
||||
doc["artist"] = records[i][1];
|
||||
doc["points"] = i;
|
||||
|
||||
ASSERT_TRUE(coll1->add(doc.dump()).ok());
|
||||
}
|
||||
|
||||
// Query both text fields with the exact string stored in record 0's title.
// NOTE(review): the trailing {1, 1} looks like equal per-field weights -- confirm against
// the Collection::search() signature, which is not visible here.
auto results = coll1->search("taylor swift",
|
||||
{"title", "artist"}, "", {}, {}, 2, 10, 1, FREQUENCY,
|
||||
true, 10, spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
|
||||
"<mark>", "</mark>", {1, 1}).get();
|
||||
|
||||
// Both documents must be found.
ASSERT_EQ(2, results["found"].get<size_t>());
|
||||
ASSERT_EQ(2, results["hits"].size());
|
||||
|
||||
// Document "0" must rank above document "1" even though it has the lower "points" value.
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
|
||||
|
||||
// Drop the collection so the name "coll1" is free again.
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
|
||||
TEST_F(CollectionTest, ExactMatch) {
|
||||
Collection *coll1;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user