Merge pull request #1145 from ozanarmagan/v0.25-join

Fix wrong hybrid search text match score
This commit is contained in:
Kishore Nallan 2023-08-09 19:01:13 +05:30 committed by GitHub
commit b0376e5acf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 61 additions and 2 deletions

View File

@ -31,6 +31,10 @@ struct KV {
this->scores[0] = scores[0];
this->scores[1] = scores[1];
this->scores[2] = scores[2];
if(match_score_index >= 0) {
this->text_match_score = scores[match_score_index];
}
}
KV() = default;

View File

@ -1950,10 +1950,10 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
if(field_order_kv->match_score_index == CURATED_RECORD_IDENTIFIER) {
wrapper_doc["curated"] = true;
} else if(field_order_kv->match_score_index >= 0) {
wrapper_doc["text_match"] = field_order_kv->scores[field_order_kv->match_score_index];
wrapper_doc["text_match"] = field_order_kv->text_match_score;
wrapper_doc["text_match_info"] = nlohmann::json::object();
populate_text_match_info(wrapper_doc["text_match_info"],
field_order_kv->scores[field_order_kv->match_score_index], match_type);
field_order_kv->text_match_score, match_type);
if(!vector_query.field_name.empty()) {
wrapper_doc["hybrid_search_info"] = nlohmann::json::object();
wrapper_doc["hybrid_search_info"]["rank_fusion_score"] = Index::int64_t_to_float(field_order_kv->scores[field_order_kv->match_score_index]);

View File

@ -3213,6 +3213,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
auto result = result_it->second;
// old_score + ((1 / rank_of_document) * WEIGHT)
result->vector_distance = vec_result.second;
result->text_match_score = result->scores[result->match_score_index];
int64_t match_score = float_to_int64_t(
(int64_t_to_float(result->scores[result->match_score_index])) +
((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT));
@ -3234,6 +3235,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
int64_t match_score_index = -1;
compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second);
KV kv(searched_queries.size(), doc_id, doc_id, match_score_index, scores);
kv.text_match_score = 0;
kv.vector_distance = vec_result.second;
topster->add(&kv);
vec_search_ids.push_back(doc_id);
@ -4163,6 +4165,7 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
if(match_score_index != -1) {
kv.scores[match_score_index] = aggregated_score;
kv.text_match_score = aggregated_score;
}
int ret = topster->add(&kv);

View File

@ -2530,4 +2530,56 @@ TEST_F(CollectionSpecificMoreTest, ApproxFilterMatchCount) {
delete filter_tree_root;
collectionManager.drop_collection("Collection");
}
// Hybrid search (keyword field + auto-embedding field) where the query has no
// keyword hit in `product_name`: every hit must report an all-zero
// `text_match_info` block.
TEST_F(CollectionSpecificMoreTest, HybridSearchTextMatchInfo) {
    // `embedding` is generated from `product_description` with the ts/e5-small model.
    auto schema_json =
        R"({
"name": "Products",
"fields": [
{"name": "product_id", "type": "string"},
{"name": "product_name", "type": "string", "infix": true},
{"name": "product_description", "type": "string"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["product_description"], "model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;

    std::vector<nlohmann::json> product_docs;
    product_docs.push_back(R"({
"product_id": "product_a",
"product_name": "shampoo",
"product_description": "Our new moisturizing shampoo is perfect for those with dry or damaged hair."
})"_json);
    product_docs.push_back(R"({
"product_id": "product_b",
"product_name": "soap",
"product_description": "Introducing our all-natural, organic soap bar made with essential oils and botanical ingredients."
})"_json);

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto collection_create_op = collectionManager.create_collection(schema_json);
    ASSERT_TRUE(collection_create_op.ok());
    auto coll1 = collection_create_op.get();

    // Index both products; each insert must succeed.
    for (const auto& product_doc : product_docs) {
        auto insert_op = coll1->add(product_doc.dump());
        ASSERT_TRUE(insert_op.ok());
    }

    auto results = coll1->search("natural products", {"product_name", "embedding"},
                                 "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {true},
                                 0, spp::sparse_hash_set<std::string>()).get();

    ASSERT_EQ(2, results["hits"].size());

    // Shorthand for the `text_match_info` object of the hit at `hit_idx`.
    const auto text_match_info = [&results](size_t hit_idx) -> nlohmann::json& {
        return results["hits"][hit_idx]["text_match_info"];
    };

    // It's a hybrid search with only vector match
    ASSERT_EQ("0", text_match_info(0)["score"].get<std::string>());
    ASSERT_EQ("0", text_match_info(1)["score"].get<std::string>());

    // Neither hit may report any matched field or token.
    const std::vector<std::string> counter_keys = {"fields_matched", "tokens_matched"};
    for (const auto& counter_key : counter_keys) {
        for (size_t hit_idx = 0; hit_idx < 2; ++hit_idx) {
            ASSERT_EQ(0, text_match_info(hit_idx)[counter_key].get<size_t>());
        }
    }
}