Reduce no-op operations during updates to improve performance.

This commit is contained in:
Kishore Nallan 2021-05-08 17:13:40 +05:30
parent 2f56c1aa5a
commit f9a037a4d5
5 changed files with 34 additions and 14 deletions

View File

@ -353,10 +353,11 @@ public:
const std::vector<std::string>& group_by_fields,
const std::string& default_sorting_field) const;
Option<uint32_t> remove(const uint32_t seq_id, const nlohmann::json & document);
Option<uint32_t> remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update);
Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id,
const std::string & default_sorting_field);
const std::string & default_sorting_field,
const bool is_update);
static size_t batch_memory_index(Index *index,
std::vector<index_record> & iter_batch,

View File

@ -357,7 +357,7 @@ Option<uint32_t> Collection::index_in_memory(nlohmann::json &document, uint32_t
}
Index* index = indices[seq_id % num_memory_shards];
index->index_in_memory(document, seq_id, default_sorting_field);
index->index_in_memory(document, seq_id, default_sorting_field, is_update);
num_documents += 1;
return Option<>(200);
@ -1672,7 +1672,7 @@ void Collection::remove_document(const nlohmann::json & document, const uint32_t
std::unique_lock lock(mutex);
Index* index = indices[seq_id % num_memory_shards];
index->remove(seq_id, document);
index->remove(seq_id, document, false);
num_documents -= 1;
}

View File

@ -18,9 +18,11 @@ Index::Index(const std::string name, const std::unordered_map<std::string, field
for(const auto & fname_field: search_schema) {
if(fname_field.second.is_string()) {
art_tree *t = new art_tree;
art_tree_init(t);
search_index.emplace(fname_field.first, t);
if(fname_field.second.index) {
art_tree *t = new art_tree;
art_tree_init(t);
search_index.emplace(fname_field.first, t);
}
} else {
num_tree_t* num_tree = new num_tree_t;
numerical_index.emplace(fname_field.first, num_tree);
@ -105,7 +107,8 @@ int64_t Index::float_to_in64_t(float f) {
}
Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t seq_id,
const std::string & default_sorting_field) {
const std::string & default_sorting_field,
const bool is_update) {
std::unique_lock lock(mutex);
@ -121,7 +124,10 @@ Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t
points = get_points_from_doc(document, default_sorting_field);
}
seq_ids.append(seq_id);
if(!is_update) {
// for updates, the seq_id will already exist
seq_ids.append(seq_id);
}
// assumes that validation has already been done
for(const auto& field_pair: search_schema) {
@ -450,13 +456,13 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
// scrub string fields to reduce delete ops
get_doc_changes(index_rec.doc, index_rec.old_doc, index_rec.new_doc, index_rec.del_doc);
index->scrub_reindex_doc(index_rec.doc, index_rec.del_doc, index_rec.old_doc);
index->remove(index_rec.seq_id, index_rec.del_doc);
index->remove(index_rec.seq_id, index_rec.del_doc, index_rec.is_update);
}
Option<uint32_t> index_mem_op(0);
try {
index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id, default_sorting_field);
index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id, default_sorting_field, index_rec.is_update);
} catch(const std::exception& e) {
const std::string& error_msg = std::string("Fatal error during indexing: ") + e.what();
LOG(ERROR) << error_msg << ", document: " << index_rec.doc;
@ -464,7 +470,7 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
}
if(!index_mem_op.ok()) {
index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field);
index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field, true);
index_rec.index_failure(index_mem_op.code(), index_mem_op.error());
continue;
}
@ -2388,7 +2394,7 @@ void Index::remove_and_shift_offset_index(sorted_array& offset_index, const uint
delete[] new_array;
}
Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & document) {
Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & document, const bool is_update) {
std::unique_lock lock(mutex);
for(auto it = document.begin(); it != document.end(); ++it) {
@ -2498,7 +2504,7 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
}
}
if(seq_ids.contains(seq_id)) {
if(!is_update) {
seq_ids.remove_value(seq_id);
}

View File

@ -253,6 +253,10 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint
}
void sorted_array::remove_value(uint32_t value) {
if(length == 0) {
return ;
}
// A lower bound search returns the first element in the sequence that is >= `value`
// So, `found_val` will be either equal or greater than `value`
uint32_t found_val;

View File

@ -923,6 +923,8 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) {
auto add_op = coll1->add(doc.dump(), CREATE);
ASSERT_TRUE(add_op.ok());
ASSERT_EQ(0, coll1->_get_indexes()[0]->_get_search_index().count("post"));
auto res_op = coll1->search("Amazon", {"description_txt"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false);
ASSERT_FALSE(res_op.ok());
ASSERT_EQ("Could not find a field named `description_txt` in the schema.", res_op.error());
@ -936,6 +938,8 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) {
auto update_op = coll1->add(doc.dump(), UPDATE, "0");
ASSERT_TRUE(add_op.ok());
ASSERT_EQ(0, coll1->_get_indexes()[0]->_get_search_index().count("post"));
auto res = coll1->search("Amazon", {"company_name"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ("Some post updated.", res["hits"][0]["document"]["post"].get<std::string>());
@ -943,6 +947,11 @@ TEST_F(CollectionAllFieldsTest, DoNotIndexFieldMarkedAsNonIndex) {
auto del_op = coll1->remove("0");
ASSERT_TRUE(del_op.ok());
// facet search should also be disabled
auto fs_op = coll1->search("Amazon", {"company_name"}, "", {"description_txt"}, sort_fields, 0, 10, 1, FREQUENCY, false);
ASSERT_FALSE(fs_op.ok());
ASSERT_EQ("Could not find a facet field named `description_txt` in the schema.", fs_op.error());
fields = {field("company_name", field_types::STRING, false),
field("num_employees", field_types::INT32, false),
field("post", field_types::STRING, false, false, false),