mirror of
https://github.com/typesense/typesense.git
synced 2025-05-21 22:33:27 +08:00
Verbatim match must not overpower weight.
This commit is contained in:
parent
87e2d6914f
commit
133c64d2d2
@ -2287,6 +2287,10 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
|
||||
}
|
||||
|
||||
//auto begin0 = std::chrono::high_resolution_clock::now();
|
||||
/*size_t total_q_tokens = field_query_tokens[0].q_include_tokens.size();
|
||||
for(const auto& phrase: field_query_tokens[0].q_phrases) {
|
||||
total_q_tokens += phrase.size();
|
||||
}*/
|
||||
|
||||
for(auto& seq_id_kvs: topster_ids) {
|
||||
const uint64_t seq_id = seq_id_kvs.first;
|
||||
@ -2312,7 +2316,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
|
||||
uint32_t token_bits = (uint32_t(1) << 31); // top most bit set to guarantee atleast 1 bit set
|
||||
uint64_t total_typos = 0, total_distances = 0, min_typos = 1000;
|
||||
|
||||
uint64_t verbatim_match_fields = 0; // query matching field verbatim
|
||||
uint64_t verbatim_match_fields = 0; // field value *exactly* same as query tokens
|
||||
uint64_t exact_match_fields = 0; // number of fields that contains all of query tokens
|
||||
uint64_t max_weighted_tokens_match = 0; // weighted max number of tokens matched in a field
|
||||
uint64_t total_token_matches = 0; // total matches across fields (including fuzzy ones)
|
||||
@ -2325,10 +2329,6 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
|
||||
const size_t weight = search_fields[i].weight;
|
||||
|
||||
//LOG(INFO) << "--- field index: " << i << ", priority: " << priority;
|
||||
// using `5` here because typo + prefix combo score range is: 0 - 5
|
||||
// 0 1 2
|
||||
// 0,1 2,3 4,5
|
||||
int64_t MAX_SUM_TYPOS = 5 * field_query_tokens[i].q_include_tokens.size();
|
||||
|
||||
if(existing_field_kvs.count(field_id) != 0) {
|
||||
// for existing field, we will simply sum field-wise weighted scores
|
||||
@ -2370,13 +2370,11 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::string& field = search_fields[i].name;
|
||||
const bool field_prefix = (i < prefixes.size()) ? prefixes[i] : prefixes[0];
|
||||
|
||||
// compute approximate match score for this field from actual query
|
||||
|
||||
const std::string& field = search_fields[i].name;
|
||||
size_t words_present = 0;
|
||||
|
||||
// FIXME: must consider phrase tokens also
|
||||
for(size_t token_index=0; token_index < field_query_tokens[i].q_include_tokens.size(); token_index++) {
|
||||
const auto& token = field_query_tokens[i].q_include_tokens[token_index];
|
||||
const art_leaf* leaf = (art_leaf *) art_search(search_index.at(field), (const unsigned char*) token.c_str(),
|
||||
@ -2450,13 +2448,13 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens,
|
||||
// protect most significant byte from overflow, since topster uses int64_t
|
||||
verbatim_match_fields = std::min<uint64_t>(INT8_MAX, verbatim_match_fields);
|
||||
|
||||
exact_match_fields += verbatim_match_fields;
|
||||
exact_match_fields = std::min<uint64_t>(255, exact_match_fields);
|
||||
max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);
|
||||
total_typos = std::min<uint64_t>(255, total_typos);
|
||||
total_distances = std::min<uint64_t>(100, total_distances);
|
||||
|
||||
uint64_t aggregated_score = (
|
||||
(verbatim_match_fields << 56) | // field value *exactly* same as query tokens
|
||||
(exact_match_fields << 48) | // number of fields that contain *all tokens* in the query
|
||||
(max_weighted_tokens_match << 40) | // weighted max number of tokens matched in a field
|
||||
(uniq_tokens_found << 32) | // number of unique tokens found across fields including typos
|
||||
|
@ -2244,3 +2244,76 @@ TEST_F(CollectionSpecificTest, HandleLargeWeights) {
|
||||
|
||||
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
|
||||
// Disabled scenario: both documents carry the query token "mark" in `title`, and the
// search spans `title` and `description`. The test expects document "1" to be ranked
// ahead of document "0".
TEST_F(CollectionSpecificTest, DISABLED_ExactMatchOnAFieldIgnoresOtherFieldScores) {
    std::vector<field> schema_fields = {
        field("title", field_types::STRING, false),
        field("description", field_types::STRING, false),
        field("points", field_types::INT32, false),
    };

    Collection* collection = collectionManager.create_collection("coll1", 1, schema_fields, "points").get();

    // Builds one JSON document; keys are assigned in the same order for every document.
    const auto make_doc = [](const std::string& id, const std::string& title,
                             const std::string& description, int points) {
        nlohmann::json doc;
        doc["id"] = id;
        doc["title"] = title;
        doc["description"] = description;
        doc["points"] = points;
        return doc;
    };

    ASSERT_TRUE(collection->add(make_doc("0", "Mark Antony", "Marriage Counsellor", 100).dump()).ok());
    ASSERT_TRUE(collection->add(make_doc("1", "Mark Spencer", "Sales Expert", 200).dump()).ok());

    // NOTE(review): `{3, 1}` is presumably the per-field weight list (title, description) -- confirm
    // against the Collection::search signature.
    auto search_result = collection->search(
            "mark", {"title", "description"},
            "", {}, {}, {2, 2}, 10,
            1, FREQUENCY, {true, true},
            10, spp::sparse_hash_set<std::string>(),
            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
            "<mark>", "</mark>", {3, 1}, 1000, true).get();

    // Expected ranking, best hit first.
    const std::vector<std::string> expected_order = {"1", "0"};

    ASSERT_EQ(expected_order.size(), search_result["hits"].size());

    for(size_t rank = 0; rank < expected_order.size(); rank++) {
        ASSERT_EQ(expected_order[rank], search_result["hits"][rank]["document"]["id"].get<std::string>());
    }

    collectionManager.drop_collection("coll1");
}
|
||||
|
||||
// Document "1" has a `description` that is exactly the query ("Shoes"), while document "0"
// contains the query token only inside its `title` ("Basketball Shoes"). The test expects
// document "0" to be ranked first.
TEST_F(CollectionSpecificTest, VerbatimMatchShouldNotOverpowerHigherWeightedField) {
    std::vector<field> schema_fields = {
        field("title", field_types::STRING, false),
        field("description", field_types::STRING, false),
        field("points", field_types::INT32, false),
    };

    Collection* collection = collectionManager.create_collection("coll1", 1, schema_fields, "points").get();

    // Builds one JSON document; keys are assigned in the same order for every document.
    const auto make_doc = [](const std::string& id, const std::string& title,
                             const std::string& description, int points) {
        nlohmann::json doc;
        doc["id"] = id;
        doc["title"] = title;
        doc["description"] = description;
        doc["points"] = points;
        return doc;
    };

    ASSERT_TRUE(collection->add(make_doc("0", "Basketball Shoes", "Basketball", 100).dump()).ok());
    ASSERT_TRUE(collection->add(make_doc("1", "Nike Jordan", "Shoes", 200).dump()).ok());

    // NOTE(review): `{4, 1}` is presumably the per-field weight list (title, description) -- confirm
    // against the Collection::search signature.
    auto search_result = collection->search(
            "shoes", {"title", "description"},
            "", {}, {}, {2, 2}, 10,
            1, FREQUENCY, {true, true},
            10, spp::sparse_hash_set<std::string>(),
            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
            "<mark>", "</mark>", {4, 1}, 1000, true).get();

    // Expected ranking, best hit first.
    const std::vector<std::string> expected_order = {"0", "1"};

    ASSERT_EQ(expected_order.size(), search_result["hits"].size());

    for(size_t rank = 0; rank < expected_order.size(); rank++) {
        ASSERT_EQ(expected_order[rank], search_result["hits"][rank]["document"]["id"].get<std::string>());
    }

    collectionManager.drop_collection("coll1");
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user