From 40b5eca7d0351b76a1132fda8478394988158f66 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 8 May 2021 17:13:40 +0530 Subject: [PATCH] Reduce no-op operations during updates to fix perf. --- include/index.h | 5 +++-- src/collection.cpp | 4 ++-- src/index.cpp | 26 ++++++++++++++++---------- src/sorted_array.cpp | 4 ++++ test/collection_all_fields_test.cpp | 9 +++++++++ 5 files changed, 34 insertions(+), 14 deletions(-) diff --git a/include/index.h b/include/index.h index 180b84a7..d29abf5f 100644 --- a/include/index.h +++ b/include/index.h @@ -353,10 +353,11 @@ public: const std::vector& group_by_fields, const std::string& default_sorting_field) const; - Option remove(const uint32_t seq_id, const nlohmann::json & document); + Option remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update); Option index_in_memory(const nlohmann::json & document, uint32_t seq_id, - const std::string & default_sorting_field); + const std::string & default_sorting_field, + const bool is_update); static size_t batch_memory_index(Index *index, std::vector & iter_batch, diff --git a/src/collection.cpp b/src/collection.cpp index ae047c33..723a3ad4 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -357,7 +357,7 @@ Option Collection::index_in_memory(nlohmann::json &document, uint32_t } Index* index = indices[seq_id % num_memory_shards]; - index->index_in_memory(document, seq_id, default_sorting_field); + index->index_in_memory(document, seq_id, default_sorting_field, is_update); num_documents += 1; return Option<>(200); @@ -1672,7 +1672,7 @@ void Collection::remove_document(const nlohmann::json & document, const uint32_t std::unique_lock lock(mutex); Index* index = indices[seq_id % num_memory_shards]; - index->remove(seq_id, document); + index->remove(seq_id, document, false); num_documents -= 1; } diff --git a/src/index.cpp b/src/index.cpp index 858a79e1..b7cc46a1 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -18,9 +18,11 @@ Index::Index(const std::string name, const std::unordered_map Index::index_in_memory(const nlohmann::json &document, uint32_t seq_id, - const std::string & default_sorting_field) { + const std::string & default_sorting_field, + const bool is_update) { std::unique_lock lock(mutex); @@ -121,7 +124,10 @@ Option Index::index_in_memory(const nlohmann::json &document, uint32_t points = get_points_from_doc(document, default_sorting_field); } - seq_ids.append(seq_id); + if(!is_update) { + // for updates, the seq_id will already exist + seq_ids.append(seq_id); + } // assumes that validation has already been done for(const auto& field_pair: search_schema) { @@ -450,13 +456,13 @@ size_t Index::batch_memory_index(Index *index, std::vector & iter_ // scrub string fields to reduce delete ops get_doc_changes(index_rec.doc, index_rec.old_doc, index_rec.new_doc, index_rec.del_doc); index->scrub_reindex_doc(index_rec.doc, index_rec.del_doc, index_rec.old_doc); - index->remove(index_rec.seq_id, index_rec.del_doc); + index->remove(index_rec.seq_id, index_rec.del_doc, index_rec.is_update); } Option index_mem_op(0); try { - index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id, default_sorting_field); + index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id, default_sorting_field, index_rec.is_update); } catch(const std::exception& e) { const std::string& error_msg = std::string("Fatal error during indexing: ") + e.what(); LOG(ERROR) << error_msg << ", document: " << index_rec.doc; @@ -464,7 +470,7 @@ size_t Index::batch_memory_index(Index *index, std::vector & iter_ } if(!index_mem_op.ok()) { - index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field); + index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field, true); index_rec.index_failure(index_mem_op.code(), index_mem_op.error()); continue; } @@ -2388,7 +2394,7 @@ void Index::remove_and_shift_offset_index(sorted_array& offset_index, const uint delete[] new_array; } -Option Index::remove(const uint32_t seq_id, const nlohmann::json & document) { +Option Index::remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update) { std::unique_lock lock(mutex); for(auto it = document.begin(); it != document.end(); ++it) { @@ -2498,7 +2504,7 @@ Option Index::remove(const uint32_t seq_id, const nlohmann::json & doc } } - if(seq_ids.contains(seq_id)) { + if(!is_update) { seq_ids.remove_value(seq_id); } diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp index e72c953b..3666bfb0 100644 --- a/src/sorted_array.cpp +++ b/src/sorted_array.cpp @@ -253,6 +253,10 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint } void sorted_array::remove_value(uint32_t value) { + if(length == 0) { + return ; + } + // A lower bound search returns the first element in the sequence that is >= `value` // So, `found_val` will be either equal or greater than `value` uint32_t found_val; diff --git a/test/collection_all_fields_test.cpp b/test/collection_all_fields_test.cpp index e84b5ed2..3147e838 100644 --- a/test/collection_all_fields_test.cpp +++ b/test/collection_all_fields_test.cpp @@ -923,6 +923,8 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) { auto add_op = coll1->add(doc.dump(), CREATE); ASSERT_TRUE(add_op.ok()); + ASSERT_EQ(0, coll1->_get_indexes()[0]->_get_search_index().count("post")); + auto res_op = coll1->search("Amazon", {"description_txt"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false); ASSERT_FALSE(res_op.ok()); ASSERT_EQ("Could not find a field named `description_txt` in the schema.", res_op.error()); @@ -936,6 +938,8 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) { auto update_op = coll1->add(doc.dump(), UPDATE, "0"); ASSERT_TRUE(add_op.ok()); + ASSERT_EQ(0, coll1->_get_indexes()[0]->_get_search_index().count("post")); + auto res = coll1->search("Amazon", {"company_name"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get(); ASSERT_EQ("Some post updated.", res["hits"][0]["document"]["post"].get()); @@ -943,6 +947,11 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) { auto del_op = coll1->remove("0"); ASSERT_TRUE(del_op.ok()); + // facet search should also be disabled + auto fs_op = coll1->search("Amazon", {"company_name"}, "", {"description_txt"}, sort_fields, 0, 10, 1, FREQUENCY, false); + ASSERT_FALSE(fs_op.ok()); + ASSERT_EQ("Could not find a facet field named `description_txt` in the schema.", fs_op.error()); + fields = {field("company_name", field_types::STRING, false), field("num_employees", field_types::INT32, false), field("post", field_types::STRING, false, false, false),