diff --git a/include/field.h b/include/field.h
index ec9156c9..981d4834 100644
--- a/include/field.h
+++ b/include/field.h
@@ -604,7 +604,7 @@ struct sort_by {
 struct vector_query_t {
     std::string field_name;
     size_t k = 0;
-    size_t flat_search_cutoff = 1000;
+    size_t flat_search_cutoff = 0;
     std::vector<float> values;
 
     void _reset() {
diff --git a/src/index.cpp b/src/index.cpp
index eba9f9f1..7413d94d 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -2515,9 +2515,16 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
         if(!no_filters_provided && filter_ids_length < vector_query.flat_search_cutoff) {
             for(size_t i = 0; i < filter_ids_length; i++) {
                 auto seq_id = filter_ids[i];
-                const std::vector<float>& values = field_vector_index->vecdex->getDataByLabel<float>(seq_id);
-                float dist;
+                std::vector<float> values;
+                try {
+                    values = field_vector_index->vecdex->getDataByLabel<float>(seq_id);
+                } catch(...) {
+                    // likely not found
+                    continue;
+                }
+
+                float dist;
                 if(field_vector_index->distance_type == cosine) {
                     std::vector<float> normalized_q(vector_query.values.size());
                     hnsw_index_t::normalize_vector(vector_query.values, normalized_q);
 
@@ -2565,6 +2572,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
         }
 
         if(!nearest_ids.empty()) {
+            std::sort(nearest_ids.begin(), nearest_ids.end());  // seq_ids should be in ascending order
             all_result_ids = new uint32_t[nearest_ids.size()];
             std::copy(nearest_ids.begin(), nearest_ids.end(), all_result_ids);
             all_result_ids_len = nearest_ids.size();
diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp
index d4ad9ac3..6467068e 100644
--- a/test/collection_vector_search_test.cpp
+++ b/test/collection_vector_search_test.cpp
@@ -319,6 +319,107 @@ TEST_F(CollectionVectorTest, VecSearchWithFiltering) {
     ASSERT_EQ(1, results["hits"].size());
 }
 
+TEST_F(CollectionVectorTest, VecSearchWithFilteringWithMissingVectorValues) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "title", "type": "string"},
+            {"name": "points", "type": "int32"},
+            {"name": "vec", "type": "float[]", "num_dim": 4, "optional": true}
+        ]
+    })"_json;
+
+    Collection* coll1 = collectionManager.create_collection(schema).get();
+
+    std::mt19937 rng;
+    rng.seed(47);
+    std::uniform_real_distribution<> distrib;
+
+    size_t num_docs = 20;
+
+    for (size_t i = 0; i < num_docs; i++) {
+        nlohmann::json doc;
+        doc["id"] = std::to_string(i);
+        doc["title"] = std::to_string(i) + " title";
+        doc["points"] = i;
+
+        std::vector<float> values;
+        for(size_t j = 0; j < 4; j++) {
+            values.push_back(distrib(rng));
+        }
+
+        if(i != 5 && i != 15) {
+            doc["vec"] = values;
+        }
+        ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    }
+
+    auto results = coll1->search("*", {}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                                 spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                 "", 10, {}, {}, {}, 0,
+                                 "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
+                                 fallback,
+                                 4, {off}, 32767, 32767, 2,
+                                 false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get();
+
+    ASSERT_EQ(18, results["found"].get<size_t>());
+    ASSERT_EQ(18, results["hits"].size());
+
+    // with points:<10, non-flat-search
+
+    results = coll1->search("*", {}, "points:<10", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10, {}, {}, {}, 0,
+                            "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
+                            fallback,
+                            4, {off}, 32767, 32767, 2,
+                            false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 0)").get();
+
+    ASSERT_EQ(9, results["found"].get<size_t>());
+    ASSERT_EQ(9, results["hits"].size());
+
+    // with points:<10, flat-search
+    results = coll1->search("*", {}, "points:<10", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10, {}, {}, {}, 0,
+                            "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
+                            fallback,
+                            4, {off}, 32767, 32767, 2,
+                            false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 1000)").get();
+
+    ASSERT_EQ(9, results["found"].get<size_t>());
+    ASSERT_EQ(9, results["hits"].size());
+
+    // single point
+
+    results = coll1->search("*", {}, "points:1", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10, {}, {}, {}, 0,
+                            "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
+                            fallback,
+                            4, {off}, 32767, 32767, 2,
+                            false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 0)").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+
+    results = coll1->search("*", {}, "points:1", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10, {}, {}, {}, 0,
+                            "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
+                            fallback,
+                            4, {off}, 32767, 32767, 2,
+                            false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 1000)").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+}
+
 TEST_F(CollectionVectorTest, VectorSearchTestDeletion) {
     nlohmann::json schema = R"({
         "name": "coll1",