Merge pull request #1145 from ozanarmagan/v0.25-join

Fix wrong hybrid search text match score
2025-05-21 14:12:27 +08:00 · 2023-08-09 19:01:13 +05:30 · 2023-08-09 19:01:13 +05:30 · b0376e5acf
commit b0376e5acf
parent b3f248bd93 379604cad1
4 changed files with 61 additions and 2 deletions
--- a/include/topster.h
+++ b/include/topster.h
@ -31,6 +31,10 @@ struct KV {
        this->scores[0] = scores[0];
        this->scores[1] = scores[1];
        this->scores[2] = scores[2];
+
+        if(match_score_index >= 0) {
+            this->text_match_score = scores[match_score_index];
+        }
    }

    KV() = default;
--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -1950,10 +1950,10 @@ Option<nlohmann::json> Collection::search(std::string  raw_query,
            if(field_order_kv->match_score_index == CURATED_RECORD_IDENTIFIER) {
                wrapper_doc["curated"] = true;
            } else if(field_order_kv->match_score_index >= 0) {
-                wrapper_doc["text_match"] = field_order_kv->scores[field_order_kv->match_score_index];
+                wrapper_doc["text_match"] = field_order_kv->text_match_score;
                wrapper_doc["text_match_info"] = nlohmann::json::object();
                populate_text_match_info(wrapper_doc["text_match_info"],
-                                        field_order_kv->scores[field_order_kv->match_score_index], match_type);
+                                        field_order_kv->text_match_score, match_type);
                if(!vector_query.field_name.empty()) {
                    wrapper_doc["hybrid_search_info"] = nlohmann::json::object();
                    wrapper_doc["hybrid_search_info"]["rank_fusion_score"] = Index::int64_t_to_float(field_order_kv->scores[field_order_kv->match_score_index]);
--- a/src/index.cpp
+++ b/src/index.cpp
@ -3213,6 +3213,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                        auto result = result_it->second;
                        // old_score + ((1 / rank_of_document) * WEIGHT)
                        result->vector_distance = vec_result.second;
+                        result->text_match_score  = result->scores[result->match_score_index];
                        int64_t match_score = float_to_int64_t(
                                (int64_t_to_float(result->scores[result->match_score_index])) +
                                ((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT));
@ -3234,6 +3235,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                        int64_t match_score_index = -1;
                        compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second);
                        KV kv(searched_queries.size(), doc_id, doc_id, match_score_index, scores);
+                        kv.text_match_score = 0;
                        kv.vector_distance = vec_result.second;
                        topster->add(&kv);
                        vec_search_ids.push_back(doc_id);
@ -4163,6 +4165,7 @@ void Index::search_across_fields(const std::vector<token_t>& query_tokens,
        KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores);
        if(match_score_index != -1) {
            kv.scores[match_score_index] = aggregated_score;
+            kv.text_match_score = aggregated_score;
        }

        int ret = topster->add(&kv);
--- a/test/collection_specific_more_test.cpp
+++ b/test/collection_specific_more_test.cpp
@ -2530,4 +2530,56 @@ TEST_F(CollectionSpecificMoreTest, ApproxFilterMatchCount) {

    delete filter_tree_root;
    collectionManager.drop_collection("Collection");
+}
+
+TEST_F(CollectionSpecificMoreTest, HybridSearchTextMatchInfo) { // hits reached only through the vector field must report zeroed text match info
+    auto schema_json = // "embedding" is declared as embedded from product_description with model ts/e5-small
+            R"({
+                "name": "Products",
+                "fields": [
+                    {"name": "product_id", "type": "string"},
+                    {"name": "product_name", "type": "string", "infix": true},
+                    {"name": "product_description", "type": "string"},
+                    {"name": "embedding", "type":"float[]", "embed":{"from": ["product_description"], "model_config": {"model_name": "ts/e5-small"}}}
+                ]
+            })"_json;
+    std::vector<nlohmann::json> documents = { // product_name values ("shampoo", "soap") contain neither query token verbatim
+            R"({
+                "product_id": "product_a",
+                "product_name": "shampoo",
+                "product_description": "Our new moisturizing shampoo is perfect for those with dry or damaged hair."
+            })"_json,
+            R"({
+                "product_id": "product_b",
+                "product_name": "soap",
+                "product_description": "Introducing our all-natural, organic soap bar made with essential oils and botanical ingredients."
+            })"_json
+    };
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); // NOTE(review): presumably where the embedding model files are looked up — confirm
+
+    auto collection_create_op = collectionManager.create_collection(schema_json);
+    ASSERT_TRUE(collection_create_op.ok());
+    for (auto const &json: documents) { // index both documents, failing fast if any insert is rejected
+        auto add_op = collection_create_op.get()->add(json.dump());
+        ASSERT_TRUE(add_op.ok());
+    }
+
+    auto coll1 = collection_create_op.get();
+    auto results = coll1->search("natural products", {"product_name", "embedding"}, // query the text field and the embedding field together
+                                 "", {}, {}, {2}, 10, // NOTE(review): meaning of the remaining positional arguments is not visible here — confirm against Collection::search
+                                 1, FREQUENCY, {true},
+                                 0, spp::sparse_hash_set<std::string>()).get();
+
+    ASSERT_EQ(2, results["hits"].size()); // both documents are expected back
+
+    // It's a hybrid search with only vector match, so the text match score and counts asserted below are all expected to be zero
+    ASSERT_EQ("0", results["hits"][0]["text_match_info"]["score"].get<std::string>()); // score is read as a string
+    ASSERT_EQ("0", results["hits"][1]["text_match_info"]["score"].get<std::string>());
+
+    ASSERT_EQ(0, results["hits"][0]["text_match_info"]["fields_matched"].get<size_t>());
+    ASSERT_EQ(0, results["hits"][1]["text_match_info"]["fields_matched"].get<size_t>());
+
+    ASSERT_EQ(0, results["hits"][0]["text_match_info"]["tokens_matched"].get<size_t>());
+    ASSERT_EQ(0, results["hits"][1]["text_match_info"]["tokens_matched"].get<size_t>());
 }