Vector flat search: handle missing values.

This commit is contained in:
Kishore Nallan 2022-10-21 09:28:26 +05:30
parent 790fac008b
commit ae49dab8fd
3 changed files with 112 additions and 3 deletions

View File

@ -604,7 +604,7 @@ struct sort_by {
struct vector_query_t {
std::string field_name;
size_t k = 0;
size_t flat_search_cutoff = 1000;
size_t flat_search_cutoff = 0;
std::vector<float> values;
void _reset() {

View File

@ -2515,9 +2515,16 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
if(!no_filters_provided && filter_ids_length < vector_query.flat_search_cutoff) {
for(size_t i = 0; i < filter_ids_length; i++) {
auto seq_id = filter_ids[i];
const std::vector<float>& values = field_vector_index->vecdex->getDataByLabel<float>(seq_id);
float dist;
std::vector<float> values;
try {
values = field_vector_index->vecdex->getDataByLabel<float>(seq_id);
} catch(...) {
// likely not found
continue;
}
float dist;
if(field_vector_index->distance_type == cosine) {
std::vector<float> normalized_q(vector_query.values.size());
hnsw_index_t::normalize_vector(vector_query.values, normalized_q);
@ -2565,6 +2572,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
}
if(!nearest_ids.empty()) {
std::sort(nearest_ids.begin(), nearest_ids.end()); // seq_ids should be in ascending order
all_result_ids = new uint32[nearest_ids.size()];
std::copy(nearest_ids.begin(), nearest_ids.end(), all_result_ids);
all_result_ids_len = nearest_ids.size();

View File

@ -319,6 +319,107 @@ TEST_F(CollectionVectorTest, VecSearchWithFiltering) {
ASSERT_EQ(1, results["hits"].size());
}
// Regression test: vector search must tolerate documents that have no value
// for an optional vector field. Such documents cannot be scored and must be
// silently excluded from vector query results — in both the flat-search and
// the HNSW code paths — instead of crashing or corrupting the result set.
TEST_F(CollectionVectorTest, VecSearchWithFilteringWithMissingVectorValues) {
// "vec" is declared optional, so documents may legitimately omit it.
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "points", "type": "int32"},
{"name": "vec", "type": "float[]", "num_dim": 4, "optional": true}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
// Fixed seed: generated vectors (and hence distances) are deterministic.
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib;
size_t num_docs = 20;
for (size_t i = 0; i < num_docs; i++) {
nlohmann::json doc;
doc["id"] = std::to_string(i);
doc["title"] = std::to_string(i) + " title";
doc["points"] = i;
std::vector<float> values;
for(size_t j = 0; j < 4; j++) {
values.push_back(distrib(rng));
}
// Documents 5 and 15 deliberately get no "vec" value.
if(i != 5 && i != 15) {
doc["vec"] = values;
}
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
// Wildcard vector query, no filter: only the 18 docs that actually carry a
// vector can match; docs 5 and 15 must simply be absent from the results.
auto results = coll1->search("*", {}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get();
ASSERT_EQ(18, results["found"].get<size_t>());
ASSERT_EQ(18, results["hits"].size());
// with points:<10, non-flat-search: flat_search_cutoff of 0 means the
// filtered-id count is never below the cutoff, forcing the HNSW path.
// Docs 0-9 match the filter, minus doc 5 (no vector) = 9 results.
results = coll1->search("*", {}, "points:<10", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 0)").get();
ASSERT_EQ(9, results["found"].get<size_t>());
ASSERT_EQ(9, results["hits"].size());
// with points:<10, flat-search: 9 filtered ids < cutoff of 1000, so the
// brute-force scan runs; it must skip doc 5 (getDataByLabel has no entry)
// and return the same 9 results as the HNSW path above.
results = coll1->search("*", {}, "points:<10", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 1000)").get();
ASSERT_EQ(9, results["found"].get<size_t>());
ASSERT_EQ(9, results["hits"].size());
// single point: doc 1 has a vector, so both paths must return exactly it.
// First the HNSW path (cutoff 0) ...
results = coll1->search("*", {}, "points:1", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 0)").get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
// ... then the flat path (cutoff 1000).
results = coll1->search("*", {}, "points:1", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 1000)").get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
}
TEST_F(CollectionVectorTest, VectorSearchTestDeletion) {
nlohmann::json schema = R"({
"name": "coll1",