diff --git a/src/index.cpp b/src/index.cpp index c05b8c5d..ac7be1d6 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -3981,8 +3981,6 @@ void Index::search_across_fields(const std::vector& query_tokens, dropped_token_its.push_back(std::move(token_fields)); } - - // one iterator for each token, each underlying iterator contains results of token across multiple fields std::vector token_its; @@ -4074,6 +4072,28 @@ void Index::search_across_fields(const std::vector& query_tokens, } } + size_t query_len = query_tokens.size(); + + // check if seq_id exists in any of the dropped_token iters + for(size_t ti = 0; ti < dropped_token_its.size(); ti++) { + or_iterator_t& token_fields_iters = dropped_token_its[ti]; + if(token_fields_iters.skip_to(seq_id) && token_fields_iters.id() == seq_id) { + query_len++; + const std::vector& field_iters = token_fields_iters.get_its(); + for(size_t fi = 0; fi < field_iters.size(); fi++) { + const posting_list_t::iterator_t& field_iter = field_iters[fi]; + if(field_iter.id() == seq_id) { + // not all fields might contain a given token + field_to_tokens[field_iter.get_field_id()].push_back(field_iter.clone()); + } + } + } + } + + if(syn_orig_num_tokens != -1) { + query_len = syn_orig_num_tokens; + } + int64_t best_field_match_score = 0, best_field_weight = 0; uint32_t num_matching_fields = 0; @@ -4127,18 +4147,6 @@ void Index::search_across_fields(const std::vector& query_tokens, compute_sort_scores(sort_fields, sort_order, field_values, geopoint_indices, seq_id, filter_index, best_field_match_score, scores, match_score_index); - size_t query_len = query_tokens.size(); - - // check if seq_id exists in any of the dropped_token iters and increment matching fields accordingly - for(auto& dropped_token_it: dropped_token_its) { - if(dropped_token_it.skip_to(seq_id) && dropped_token_it.id() == seq_id) { - query_len++; - } - } - - if(syn_orig_num_tokens != -1) { - query_len = syn_orig_num_tokens; - } query_len = std::min(15, query_len); // NOTE: `query_len` is total tokens matched across fields. diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 83181283..4e7347e6 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -1816,6 +1816,36 @@ TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring) ASSERT_EQ("1", res["hits"][1]["document"]["id"].get()); } +TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring2) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "name", "type": "string"} + ] + })"_json; + + Collection *coll1 = collectionManager.create_collection(schema).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["name"] = "Elizabeth Arden 5th Avenue Eau de Parfum 125ml"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["name"] = "Avène Sun Very High Protection Mineral Cream SPF50+ 50ml"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto res = coll1->search("avène eau mineral", {"name"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 5, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 20, {}, {}, {}, 0, + "", "", {3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 0, 0, 0, 2, false, "", true, 0, max_weight).get(); + + ASSERT_EQ(2, res["hits"].size()); + ASSERT_EQ("1", res["hits"][0]["document"]["id"].get()); + ASSERT_EQ("0", res["hits"][1]["document"]["id"].get()); +} + TEST_F(CollectionSpecificMoreTest, NonNestedFieldNameWithDot) { nlohmann::json schema = R"({ "name": "coll1",