Reduce no-op operations during updates to improve performance.

2025-05-21 06:02:26 +08:00 · 2021-05-08 17:13:40 +05:30 · 2021-05-08 17:13:40 +05:30 · f9a037a4d5
commit f9a037a4d5
parent 2f56c1aa5a
5 changed files with 34 additions and 14 deletions
--- a/include/index.h
+++ b/include/index.h
@@ -353,10 +353,11 @@ public:
                const std::vector<std::string>& group_by_fields,
                const std::string& default_sorting_field) const;

-    Option<uint32_t> remove(const uint32_t seq_id, const nlohmann::json & document);
+    Option<uint32_t> remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update);

    Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id,
-                                     const std::string & default_sorting_field);
+                                     const std::string & default_sorting_field,
+                                     const bool is_update);

    static size_t batch_memory_index(Index *index,
                                     std::vector<index_record> & iter_batch,
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -357,7 +357,7 @@ Option<uint32_t> Collection::index_in_memory(nlohmann::json &document, uint32_t
    }

    Index* index = indices[seq_id % num_memory_shards];
-    index->index_in_memory(document, seq_id, default_sorting_field);
+    index->index_in_memory(document, seq_id, default_sorting_field, is_update);

    num_documents += 1;
    return Option<>(200);
@@ -1672,7 +1672,7 @@ void Collection::remove_document(const nlohmann::json & document, const uint32_t
        std::unique_lock lock(mutex);

        Index* index = indices[seq_id % num_memory_shards];
-        index->remove(seq_id, document);
+        index->remove(seq_id, document, false);
        num_documents -= 1;
    }

--- a/src/index.cpp
+++ b/src/index.cpp
@@ -18,9 +18,11 @@ Index::Index(const std::string name, const std::unordered_map<std::string, field

    for(const auto & fname_field: search_schema) {
        if(fname_field.second.is_string()) {
-            art_tree *t = new art_tree;
-            art_tree_init(t);
-            search_index.emplace(fname_field.first, t);
+            if(fname_field.second.index) {
+                art_tree *t = new art_tree;
+                art_tree_init(t);
+                search_index.emplace(fname_field.first, t);
+            }
        } else {
            num_tree_t* num_tree = new num_tree_t;
            numerical_index.emplace(fname_field.first, num_tree);
@@ -105,7 +107,8 @@ int64_t Index::float_to_in64_t(float f) {
 }

 Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t seq_id,
-                                        const std::string & default_sorting_field) {
+                                        const std::string & default_sorting_field,
+                                        const bool is_update) {

    std::unique_lock lock(mutex);

@@ -121,7 +124,10 @@ Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t
        points = get_points_from_doc(document, default_sorting_field);
    }

-    seq_ids.append(seq_id);
+    if(!is_update) {
+        // for updates, the seq_id will already exist
+        seq_ids.append(seq_id);
+    }

    // assumes that validation has already been done
    for(const auto& field_pair: search_schema) {
@@ -450,13 +456,13 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
                // scrub string fields to reduce delete ops
                get_doc_changes(index_rec.doc, index_rec.old_doc, index_rec.new_doc, index_rec.del_doc);
                index->scrub_reindex_doc(index_rec.doc, index_rec.del_doc, index_rec.old_doc);
-                index->remove(index_rec.seq_id, index_rec.del_doc);
+                index->remove(index_rec.seq_id, index_rec.del_doc, index_rec.is_update);
            }

            Option<uint32_t> index_mem_op(0);

            try {
-                index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id, default_sorting_field);
+                index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id, default_sorting_field, index_rec.is_update);
            } catch(const std::exception& e) {
                const std::string& error_msg = std::string("Fatal error during indexing: ") + e.what();
                LOG(ERROR) << error_msg << ", document: " << index_rec.doc;
@@ -464,7 +470,7 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
            }

            if(!index_mem_op.ok()) {
-                index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field);
+                index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field, true);
                index_rec.index_failure(index_mem_op.code(), index_mem_op.error());
                continue;
            }
@@ -2388,7 +2394,7 @@ void Index::remove_and_shift_offset_index(sorted_array& offset_index, const uint
    delete[] new_array;
 }

-Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & document) {
+Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update) {
    std::unique_lock lock(mutex);

    for(auto it = document.begin(); it != document.end(); ++it) {
@@ -2498,7 +2504,7 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
        }
    }

-    if(seq_ids.contains(seq_id)) {
+    if(!is_update) {
        seq_ids.remove_value(seq_id);
    }

--- a/src/sorted_array.cpp
+++ b/src/sorted_array.cpp
@@ -253,6 +253,10 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint
 }

 void sorted_array::remove_value(uint32_t value) {
+    if(length == 0) {
+        return ;
+    }
+
    // A lower bound search returns the first element in the sequence that is >= `value`
    // So, `found_val` will be either equal or greater than `value`
    uint32_t found_val;
--- a/test/collection_all_fields_test.cpp
+++ b/test/collection_all_fields_test.cpp
@@ -923,6 +923,8 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) {
    auto add_op = coll1->add(doc.dump(), CREATE);
    ASSERT_TRUE(add_op.ok());

+    ASSERT_EQ(0, coll1->_get_indexes()[0]->_get_search_index().count("post"));
+
    auto res_op = coll1->search("Amazon", {"description_txt"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false);
    ASSERT_FALSE(res_op.ok());
    ASSERT_EQ("Could not find a field named `description_txt` in the schema.", res_op.error());
@@ -936,6 +938,8 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) {
    auto update_op = coll1->add(doc.dump(), UPDATE, "0");
    ASSERT_TRUE(add_op.ok());

+    ASSERT_EQ(0, coll1->_get_indexes()[0]->_get_search_index().count("post"));
+
    auto res = coll1->search("Amazon", {"company_name"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
    ASSERT_EQ("Some post updated.", res["hits"][0]["document"]["post"].get<std::string>());

@@ -943,6 +947,11 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) {
    auto del_op = coll1->remove("0");
    ASSERT_TRUE(del_op.ok());

+    // facet search should also be disabled
+    auto fs_op = coll1->search("Amazon", {"company_name"}, "", {"description_txt"}, sort_fields, 0, 10, 1, FREQUENCY, false);
+    ASSERT_FALSE(fs_op.ok());
+    ASSERT_EQ("Could not find a facet field named `description_txt` in the schema.", fs_op.error());
+
    fields = {field("company_name", field_types::STRING, false),
              field("num_employees", field_types::INT32, false),
              field("post", field_types::STRING, false, false, false),