Merge branch 'v0.25-join' into v0.26-facets

Kishore Nallan 2023-09-15 07:15:32 +05:30
commit 714f27667e
4 changed files with 153 additions and 19 deletions

View File

@@ -643,7 +643,8 @@ public:
const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold,
const std::string& collection_name, facet_index_type_t facet_index_type = DETECT) const;
void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name);
void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name,
const bool is_update);
Option<uint32_t> remove(const uint32_t seq_id, const nlohmann::json & document,
const std::vector<field>& del_fields, const bool is_update);
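
Note: the header change threads an is_update flag from remove() down into each per-field remove_field() call. A minimal sketch of the intended call pattern, with simplified types and a hypothetical IndexSketch class rather than the actual Typesense implementation:

// Hedged sketch (not the real Typesense code): the flag is simply forwarded so
// each field handler can decide what cleanup to skip during an update.
#include <cstdint>
#include <string>
#include <nlohmann/json.hpp>

struct IndexSketch {
    void remove_field(uint32_t seq_id, const nlohmann::json& document,
                      const std::string& field_name, const bool is_update) {
        // per-field cleanup; vector fields can now skip deletion when is_update is true
        (void)seq_id; (void)document; (void)field_name; (void)is_update;
    }

    void remove(const uint32_t seq_id, const nlohmann::json& document, const bool is_update) {
        for (auto it = document.begin(); it != document.end(); ++it) {
            remove_field(seq_id, document, it.key(), is_update);
        }
    }
};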

View File

@@ -3616,8 +3616,6 @@ Option<bool> Index::search_across_fields(const std::vector<token_t>& query_token
dropped_token_its.push_back(std::move(token_fields));
}
// one iterator for each token, each underlying iterator contains results of token across multiple fields
std::vector<or_iterator_t> token_its;
@@ -3712,6 +3710,28 @@ Option<bool> Index::search_across_fields(const std::vector<token_t>& query_token
}
}
size_t query_len = query_tokens.size();
// check if seq_id exists in any of the dropped_token iters
for(size_t ti = 0; ti < dropped_token_its.size(); ti++) {
or_iterator_t& token_fields_iters = dropped_token_its[ti];
if(token_fields_iters.skip_to(seq_id) && token_fields_iters.id() == seq_id) {
query_len++;
const std::vector<posting_list_t::iterator_t>& field_iters = token_fields_iters.get_its();
for(size_t fi = 0; fi < field_iters.size(); fi++) {
const posting_list_t::iterator_t& field_iter = field_iters[fi];
if(field_iter.id() == seq_id) {
// not all fields might contain a given token
field_to_tokens[field_iter.get_field_id()].push_back(field_iter.clone());
}
}
}
}
if(syn_orig_num_tokens != -1) {
query_len = syn_orig_num_tokens;
}
int64_t best_field_match_score = 0, best_field_weight = 0;
uint32_t num_matching_fields = 0;
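
Note: the new block above follows a skip-to-and-collect posting-list pattern: advance each dropped-token iterator to the candidate seq_id and, on a hit, count the token toward query_len and remember which fields contained it. A standalone sketch of that idea, with the or_iterator_t flattened into a plain list of per-field stand-in iterators (not the actual Typesense types):

#include <cstdint>
#include <unordered_map>
#include <vector>

// Stand-in for an iterator positioned on a sorted list of document ids for one field.
struct FieldIterSketch {
    std::vector<uint32_t> ids;
    size_t pos = 0;
    uint16_t field_id = 0;
    bool skip_to(uint32_t target) {
        while (pos < ids.size() && ids[pos] < target) pos++;
        return pos < ids.size();
    }
    uint32_t id() const { return pos < ids.size() ? ids[pos] : UINT32_MAX; }
};

// For each dropped token, advance to seq_id; if the token matches this document,
// count it toward the query length and record which fields contained it
// (not every field contains every token).
size_t collect_dropped_tokens(std::vector<std::vector<FieldIterSketch>>& dropped_tokens,
                              uint32_t seq_id,
                              std::unordered_map<uint16_t, size_t>& fields_hit) {
    size_t extra_len = 0;
    for (auto& field_iters : dropped_tokens) {
        bool token_matches = false;
        for (auto& it : field_iters) {
            if (it.skip_to(seq_id) && it.id() == seq_id) {
                token_matches = true;
                fields_hit[it.field_id]++;
            }
        }
        if (token_matches) extra_len++;
    }
    return extra_len;
}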
@@ -3770,18 +3790,6 @@ Option<bool> Index::search_across_fields(const std::vector<token_t>& query_token
return;
}
size_t query_len = query_tokens.size();
// check if seq_id exists in any of the dropped_token iters and increment matching fields accordingly
for(auto& dropped_token_it: dropped_token_its) {
if(dropped_token_it.skip_to(seq_id) && dropped_token_it.id() == seq_id) {
query_len++;
}
}
if(syn_orig_num_tokens != -1) {
query_len = syn_orig_num_tokens;
}
query_len = std::min<size_t>(15, query_len);
// NOTE: `query_len` is total tokens matched across fields.
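
Note: the cap at 15 suggests the matched-token count occupies a narrow fixed-width slot in the packed text-match score. A sketch of that style of packing, with assumed bit widths that are illustrative only and not the actual Typesense score layout:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Assumed layout, for illustration: 4 bits for tokens matched, placed above the
// rest of the match score. 15 is the largest value that fits in 4 bits.
uint64_t pack_match_score(uint64_t rest_of_score, size_t tokens_matched) {
    const uint64_t capped = std::min<size_t>(15, tokens_matched);
    return (capped << 60) | (rest_of_score & ((1ULL << 60) - 1));
}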
@@ -5877,7 +5885,8 @@ void Index::remove_facet_token(const field& search_field, spp::sparse_hash_map<s
}
}
void Index::remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name) {
void Index::remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name,
const bool is_update) {
const auto& search_field_it = search_schema.find(field_name);
if(search_field_it == search_schema.end()) {
return;
@@ -5949,7 +5958,10 @@ void Index::remove_field(uint32_t seq_id, const nlohmann::json& document, const
}
}
} else if(search_field.num_dim) {
vector_index[search_field.name]->vecdex->markDelete(seq_id);
if(!is_update) {
// since vector index supports upsert natively, we should not attempt to delete for update
vector_index[search_field.name]->vecdex->markDelete(seq_id);
}
} else if(search_field.is_float()) {
const std::vector<float>& values = search_field.is_single_float() ?
std::vector<float>{document[field_name].get<float>()} :
@@ -6045,7 +6057,7 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
}
try {
remove_field(seq_id, document, the_field.name);
remove_field(seq_id, document, the_field.name, is_update);
} catch(const std::exception& e) {
LOG(WARNING) << "Error while removing field `" << the_field.name << "` from document, message: "
<< e.what();
@@ -6055,7 +6067,7 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
for(auto it = document.begin(); it != document.end(); ++it) {
const std::string& field_name = it.key();
try {
remove_field(seq_id, document, field_name);
remove_field(seq_id, document, field_name, is_update);
} catch(const std::exception& e) {
LOG(WARNING) << "Error while removing field `" << field_name << "` from document, message: "
<< e.what();
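
Note: the is_update guard matters for vector fields because, as the comment in the hunk above says, the vector index supports upsert natively: tombstoning the old point during an update would be redundant, since re-indexing replaces the vector for the same id. A hedged sketch of that branch using simplified stand-in types, not the real vector_index / vecdex classes:

#include <cstdint>
#include <unordered_map>
#include <vector>

// Minimal stand-in for a vector index that supports native upsert (replace-by-id).
struct VecIndexSketch {
    std::unordered_map<uint32_t, std::vector<float>> points;
    void mark_delete(uint32_t seq_id) { points.erase(seq_id); }
    void upsert(uint32_t seq_id, std::vector<float> values) { points[seq_id] = std::move(values); }
};

// Mirrors the guarded branch in Index::remove_field(): only tombstone when the
// document is truly being removed, not when it is about to be re-indexed.
void remove_vector_field(VecIndexSketch& vecdex, uint32_t seq_id, bool is_update) {
    if (!is_update) {
        vecdex.mark_delete(seq_id);
    }
    // on update, the follow-up indexing step calls upsert(seq_id, new_values),
    // which replaces the stored vector, so deleting here is unnecessary
}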

View File

@@ -1826,6 +1826,36 @@ TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring)
ASSERT_EQ("1", res["hits"][1]["document"]["id"].get<std::string>());
}
TEST_F(CollectionSpecificMoreTest, ConsiderDroppedTokensDuringTextMatchScoring2) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "name", "type": "string"}
]
})"_json;
Collection *coll1 = collectionManager.create_collection(schema).get();
nlohmann::json doc;
doc["id"] = "0";
doc["name"] = "Elizabeth Arden 5th Avenue Eau de Parfum 125ml";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["id"] = "1";
doc["name"] = "Avène Sun Very High Protection Mineral Cream SPF50+ 50ml";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
auto res = coll1->search("avène eau mineral", {"name"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 5,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 0, 0, 0, 2, false, "", true, 0, max_weight).get();
ASSERT_EQ(2, res["hits"].size());
ASSERT_EQ("1", res["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", res["hits"][1]["document"]["id"].get<std::string>());
}
TEST_F(CollectionSpecificMoreTest, NonNestedFieldNameWithDot) {
nlohmann::json schema = R"({
"name": "coll1",

View File

@@ -306,6 +306,97 @@ TEST_F(CollectionVectorTest, VectorUnchangedUpsert) {
ASSERT_EQ(1, results["found"].get<size_t>());
}
TEST_F(CollectionVectorTest, VectorManyUpserts) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "points", "type": "int32"},
{"name": "vec", "type": "float[]", "num_dim": 3}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
size_t d = 3;
size_t n = 50;
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib;
std::vector<std::string> import_records;
// first insert n docs
for (size_t i = 0; i < n; i++) {
nlohmann::json doc;
doc["id"] = std::to_string(i);
doc["title"] = std::to_string(i) + " title";
doc["points"] = i;
std::vector<float> values;
for (size_t j = 0; j < d; j++) {
values.push_back(distrib(rng));
}
doc["vec"] = values;
import_records.push_back(doc.dump());
}
nlohmann::json document;
nlohmann::json import_response = coll1->add_many(import_records, document);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(n, import_response["num_imported"].get<int>());
import_records.clear();
size_t num_new_records = 0;
// upsert mix of old + new docs
for (size_t i = 0; i < n; i++) {
nlohmann::json doc;
auto id = i;
if(i % 2 != 0) {
id = (i + 1000);
num_new_records++;
}
doc["id"] = std::to_string(id);
doc["title"] = std::to_string(id) + " title";
doc["points"] = id;
std::vector<float> values;
for (size_t j = 0; j < d; j++) {
values.push_back(distrib(rng) + 0.01);
}
doc["vec"] = values;
import_records.push_back(doc.dump());
}
import_response = coll1->add_many(import_records, document, UPSERT);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(n, import_response["num_imported"].get<int>());
import_records.clear();
/*for(size_t i = 0; i < 100; i++) {
auto results = coll1->search("*", {}, "", {}, {}, {0}, 200, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.12, 0.44, 0.55])").get();
if(results["found"].get<size_t>() != n+num_new_records) {
LOG(INFO) << results["found"].get<size_t>();
}
}*/
//LOG(INFO) << "Expected: " << n + num_new_records;
//ASSERT_EQ(n + num_new_records, results["found"].get<size_t>());
//ASSERT_EQ(n + num_new_records, results["hits"].size());
}
TEST_F(CollectionVectorTest, VectorPartialUpdate) {
nlohmann::json schema = R"({
"name": "coll1",