From 40b5eca7d0351b76a1132fda8478394988158f66 Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishorenc@gmail.com>
Date: Sat, 8 May 2021 17:13:40 +0530
Subject: [PATCH] Reduce no-op operations during updates to fix perf.

---
 include/index.h                     |  5 +++--
 src/collection.cpp                  |  4 ++--
 src/index.cpp                       | 26 ++++++++++++++++----------
 src/sorted_array.cpp                |  4 ++++
 test/collection_all_fields_test.cpp |  9 +++++++++
 5 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/include/index.h b/include/index.h
index 180b84a7..d29abf5f 100644
--- a/include/index.h
+++ b/include/index.h
@@ -353,10 +353,11 @@ public:
                 const std::vector<std::string>& group_by_fields,
                 const std::string& default_sorting_field) const;
 
-    Option<uint32_t> remove(const uint32_t seq_id, const nlohmann::json & document);
+    Option<uint32_t> remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update);
 
     Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id,
-                                     const std::string & default_sorting_field);
+                                     const std::string & default_sorting_field,
+                                     const bool is_update);
 
     static size_t batch_memory_index(Index *index,
                                      std::vector<index_record> & iter_batch,
diff --git a/src/collection.cpp b/src/collection.cpp
index ae047c33..723a3ad4 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -357,7 +357,7 @@ Option<uint32_t> Collection::index_in_memory(nlohmann::json &document, uint32_t
     }
 
     Index* index = indices[seq_id % num_memory_shards];
-    index->index_in_memory(document, seq_id, default_sorting_field);
+    index->index_in_memory(document, seq_id, default_sorting_field, is_update);
 
     num_documents += 1;
     return Option<>(200);
@@ -1672,7 +1672,7 @@ void Collection::remove_document(const nlohmann::json & document, const uint32_t
         std::unique_lock lock(mutex);
 
         Index* index = indices[seq_id % num_memory_shards];
-        index->remove(seq_id, document);
+        index->remove(seq_id, document, false);
         num_documents -= 1;
     }
 
diff --git a/src/index.cpp b/src/index.cpp
index 858a79e1..b7cc46a1 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -18,9 +18,11 @@ Index::Index(const std::string name, const std::unordered_map<std::string, field
 
     for(const auto & fname_field: search_schema) {
         if(fname_field.second.is_string()) {
-            art_tree *t = new art_tree;
-            art_tree_init(t);
-            search_index.emplace(fname_field.first, t);
+            if(fname_field.second.index) {
+                art_tree *t = new art_tree;
+                art_tree_init(t);
+                search_index.emplace(fname_field.first, t);
+            }
         } else {
             num_tree_t* num_tree = new num_tree_t;
             numerical_index.emplace(fname_field.first, num_tree);
@@ -105,7 +107,8 @@ int64_t Index::float_to_in64_t(float f) {
 }
 
 Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t seq_id,
-                                        const std::string & default_sorting_field) {
+                                        const std::string & default_sorting_field,
+                                        const bool is_update) {
 
     std::unique_lock lock(mutex);
 
@@ -121,7 +124,10 @@ Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t
         points = get_points_from_doc(document, default_sorting_field);
     }
 
-    seq_ids.append(seq_id);
+    if(!is_update) {
+        // for updates, the seq_id will already exist
+        seq_ids.append(seq_id);
+    }
 
     // assumes that validation has already been done
     for(const auto& field_pair: search_schema) {
@@ -450,13 +456,13 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
                 // scrub string fields to reduce delete ops
                 get_doc_changes(index_rec.doc, index_rec.old_doc, index_rec.new_doc, index_rec.del_doc);
                 index->scrub_reindex_doc(index_rec.doc, index_rec.del_doc, index_rec.old_doc);
-                index->remove(index_rec.seq_id, index_rec.del_doc);
+                index->remove(index_rec.seq_id, index_rec.del_doc, index_rec.is_update);
             }
 
             Option<uint32_t> index_mem_op(0);
 
             try {
-                index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id, default_sorting_field);
+                index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id, default_sorting_field, index_rec.is_update);
             } catch(const std::exception& e) {
                 const std::string& error_msg = std::string("Fatal error during indexing: ") + e.what();
                 LOG(ERROR) << error_msg << ", document: " << index_rec.doc;
@@ -464,7 +470,7 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
             }
 
             if(!index_mem_op.ok()) {
-                index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field);
+                index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field, true);
                 index_rec.index_failure(index_mem_op.code(), index_mem_op.error());
                 continue;
             }
@@ -2388,7 +2394,7 @@ void Index::remove_and_shift_offset_index(sorted_array& offset_index, const uint
     delete[] new_array;
 }
 
-Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & document) {
+Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update) {
     std::unique_lock lock(mutex);
 
     for(auto it = document.begin(); it != document.end(); ++it) {
@@ -2498,7 +2504,7 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
         }
     }
 
-    if(seq_ids.contains(seq_id)) {
+    if(!is_update) {
         seq_ids.remove_value(seq_id);
     }
 
diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp
index e72c953b..3666bfb0 100644
--- a/src/sorted_array.cpp
+++ b/src/sorted_array.cpp
@@ -253,6 +253,10 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint
 }
 
 void sorted_array::remove_value(uint32_t value) {
+    if(length == 0) {
+        return ;
+    }
+
     // A lower bound search returns the first element in the sequence that is >= `value`
     // So, `found_val` will be either equal or greater than `value`
     uint32_t found_val;
diff --git a/test/collection_all_fields_test.cpp b/test/collection_all_fields_test.cpp
index e84b5ed2..3147e838 100644
--- a/test/collection_all_fields_test.cpp
+++ b/test/collection_all_fields_test.cpp
@@ -923,6 +923,8 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) {
     auto add_op = coll1->add(doc.dump(), CREATE);
     ASSERT_TRUE(add_op.ok());
 
+    ASSERT_EQ(0, coll1->_get_indexes()[0]->_get_search_index().count("post"));
+
     auto res_op = coll1->search("Amazon", {"description_txt"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false);
     ASSERT_FALSE(res_op.ok());
     ASSERT_EQ("Could not find a field named `description_txt` in the schema.", res_op.error());
@@ -936,6 +938,8 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) {
     auto update_op = coll1->add(doc.dump(), UPDATE, "0");
     ASSERT_TRUE(add_op.ok());
 
+    ASSERT_EQ(0, coll1->_get_indexes()[0]->_get_search_index().count("post"));
+
     auto res = coll1->search("Amazon", {"company_name"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
     ASSERT_EQ("Some post updated.", res["hits"][0]["document"]["post"].get<std::string>());
 
@@ -943,6 +947,11 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) {
     auto del_op = coll1->remove("0");
     ASSERT_TRUE(del_op.ok());
 
+    // facet search should also be disabled
+    auto fs_op = coll1->search("Amazon", {"company_name"}, "", {"description_txt"}, sort_fields, 0, 10, 1, FREQUENCY, false);
+    ASSERT_FALSE(fs_op.ok());
+    ASSERT_EQ("Could not find a facet field named `description_txt` in the schema.", fs_op.error());
+
     fields = {field("company_name", field_types::STRING, false),
               field("num_employees", field_types::INT32, false),
               field("post", field_types::STRING, false, false, false),