Fix vector query by id returning k+1 hits.

This commit is contained in:
Kishore Nallan 2024-04-05 21:33:55 +05:30
parent 15114a6c87
commit ec02d9fe9a
2 changed files with 20 additions and 1 deletion

View File

@ -3012,6 +3012,13 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
continue;
}
if(vector_query.query_doc_given && nearest_ids.size() >= k-1) {
// When an id-based vector query is made, we ask for K+1 results to account for the query
// record itself being returned. However, when the filter clause does not match the
// query record, we would otherwise end up returning 1 extra hit, so stop collecting here.
break;
}
uint64_t distinct_id = seq_id;
if (group_limit != 0) {
distinct_id = 1;
@ -3049,6 +3056,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
if(group_limit != 0 && ret < 2) {
groups_processed[distinct_id]++;
}
nearest_ids.push_back(seq_id);
}

View File

@ -174,6 +174,18 @@ TEST_F(CollectionVectorTest, BasicVectorQuerying) {
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
// when id does not match filter, don't return k+1 hits
results = coll1->search("*", {}, "id:!=1", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([], id: 1, k:1)").get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
// `k` value should override per_page
results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
@ -1356,7 +1368,6 @@ TEST_F(CollectionVectorTest, DistanceThresholdTest) {
}
TEST_F(CollectionVectorTest, HybridSearchSortByGeopoint) {
nlohmann::json schema = R"({
"name": "objects",