From c066120fb1b61936c1860088188e735a3483c0d1 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 1 Jul 2023 13:46:03 +0530 Subject: [PATCH] Fix edge case in updating empty array strings. --- include/index.h | 3 - src/index.cpp | 95 ++++------------ test/collection_specific_more_test.cpp | 147 +++++++++++++++++++++++++ test/index_test.cpp | 60 ---------- 4 files changed, 171 insertions(+), 134 deletions(-) diff --git a/include/index.h b/include/index.h index 3f93e182..5c1b77e0 100644 --- a/include/index.h +++ b/include/index.h @@ -631,9 +631,6 @@ public: const std::vector& local_token_separators, const std::vector& local_symbols_to_index); - static void scrub_reindex_doc(const tsl::htrie_map& search_schema, - nlohmann::json& update_doc, nlohmann::json& del_doc, const nlohmann::json& old_doc); - static void tokenize_string_field(const nlohmann::json& document, const field& search_field, std::vector& tokens, const std::string& locale, diff --git a/src/index.cpp b/src/index.cpp index 5cfb2287..4e6ac914 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -454,7 +454,6 @@ void Index::validate_and_preprocess(Index *index, std::vector& ite // scrub string fields to reduce delete ops get_doc_changes(index_rec.operation, search_schema, index_rec.doc, index_rec.old_doc, index_rec.new_doc, index_rec.del_doc); - scrub_reindex_doc(search_schema, index_rec.doc, index_rec.del_doc, index_rec.old_doc); if(generate_embeddings) { for(auto& field: index_rec.doc.items()) { @@ -6236,11 +6235,19 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map& search_schema, - nlohmann::json& update_doc, - nlohmann::json& del_doc, - const nlohmann::json& old_doc) { - - /*LOG(INFO) << "update_doc: " << update_doc; - LOG(INFO) << "old_doc: " << old_doc; - LOG(INFO) << "del_doc: " << del_doc;*/ - - // del_doc contains fields that exist in both update doc and old doc - // But we will only remove fields that are different - - std::vector unchanged_keys; - - for(auto it = del_doc.cbegin(); it != del_doc.cend(); it++) { - const std::string& field_name = it.key(); - - const auto& search_field_it = search_schema.find(field_name); - if(search_field_it == search_schema.end()) { continue; } - if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) { - continue; - } - - const auto search_field = search_field_it.value(); // copy, don't use reference! - - // compare values between old and update docs: - // if they match, we will remove them from both del and update docs - - if(update_doc.contains(search_field.name)) { - if(update_doc[search_field.name].is_null()) { - // we don't allow null values to be stored or indexed but need to be removed from stored doc - update_doc.erase(search_field.name); - } - else if(update_doc[search_field.name] == old_doc[search_field.name]) { - unchanged_keys.push_back(field_name); + if(old_doc.contains(it.key())) { + if(old_doc[it.key()] == it.value()) { + // unchanged so should not be part of update doc + it = update_doc.erase(it); + continue; + } else { + // delete this old value from index + del_doc[it.key()] = old_doc[it.key()]; } } - } - for(const auto& unchanged_key: unchanged_keys) { - del_doc.erase(unchanged_key); - update_doc.erase(unchanged_key); + it++; } } diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 370fd45f..f53ce1a7 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -1225,6 +1225,153 @@ TEST_F(CollectionSpecificMoreTest, UpsertUpdateEmplaceShouldAllRemoveIndex) { ASSERT_EQ(1, results["found"].get()); } +TEST_F(CollectionSpecificMoreTest, UpdateWithEmptyArray) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "tags", "type": "string[]"} + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll1 = op.get(); + + auto doc1 = R"({ + "id": "0", + "tags": ["alpha", "beta", "gamma"] + })"_json; + + ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok()); + + auto doc2 = R"({ + "id": "1", + "tags": ["one", "two"] + })"_json; + + ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok()); + + // via update + + auto doc_update = R"({ + "id": "0", + "tags": [] + })"_json; + ASSERT_TRUE(coll1->add(doc_update.dump(), UPDATE).ok()); + + auto results = coll1->search("alpha", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); + ASSERT_EQ(0, results["found"].get()); + + // via upsert + + doc_update = R"({ + "id": "1", + "tags": [] + })"_json; + ASSERT_TRUE(coll1->add(doc_update.dump(), UPSERT).ok()); + + results = coll1->search("one", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); + ASSERT_EQ(0, results["found"].get()); +} + +TEST_F(CollectionSpecificMoreTest, UpdateArrayWithNullValue) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "tags", "type": "string[]", "optional": true} + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll1 = op.get(); + + auto doc1 = R"({ + "id": "0", + "tags": ["alpha", "beta", "gamma"] + })"_json; + + ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok()); + + auto doc2 = R"({ + "id": "1", + "tags": ["one", "two"] + })"_json; + + ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok()); + + // via update + + auto doc_update = R"({ + "id": "0", + "tags": null + })"_json; + ASSERT_TRUE(coll1->add(doc_update.dump(), UPDATE).ok()); + + auto results = coll1->search("alpha", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); + ASSERT_EQ(0, results["found"].get()); + + // via upsert + + doc_update = R"({ + "id": "1", + "tags": null + })"_json; + ASSERT_TRUE(coll1->add(doc_update.dump(), UPSERT).ok()); + + results = coll1->search("one", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); + ASSERT_EQ(0, results["found"].get()); +} + +TEST_F(CollectionSpecificMoreTest, ReplaceArrayElement) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "tags", "type": "string[]"} + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll1 = op.get(); + + auto doc1 = R"({ + "id": "0", + "tags": ["alpha", "beta", "gamma"] + })"_json; + + ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok()); + + auto doc2 = R"({ + "id": "1", + "tags": ["one", "two", "three"] + })"_json; + + ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok()); + + // via update + + auto doc_update = R"({ + "id": "0", + "tags": ["alpha", "gamma"] + })"_json; + ASSERT_TRUE(coll1->add(doc_update.dump(), UPDATE).ok()); + + auto results = coll1->search("beta", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); + ASSERT_EQ(0, results["found"].get()); + + // via upsert + + doc_update = R"({ + "id": "1", + "tags": ["one", "three"] + })"_json; + ASSERT_TRUE(coll1->add(doc_update.dump(), UPSERT).ok()); + + results = coll1->search("two", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); + ASSERT_EQ(0, results["found"].get()); +} + TEST_F(CollectionSpecificMoreTest, UnorderedWeightingOfFields) { nlohmann::json schema = R"({ "name": "coll1", diff --git a/test/index_test.cpp b/test/index_test.cpp index d788d5f5..8b2b1fcf 100644 --- a/test/index_test.cpp +++ b/test/index_test.cpp @@ -3,66 +3,6 @@ #include #include -TEST(IndexTest, ScrubReindexDoc) { - tsl::htrie_map search_schema; - search_schema.emplace("title", field("title", field_types::STRING, false)); - search_schema.emplace("points", field("points", field_types::INT32, false)); - search_schema.emplace("cast", field("cast", field_types::STRING_ARRAY, false)); - search_schema.emplace("movie", field("movie", field_types::BOOL, false)); - - ThreadPool pool(4); - - Index index("index", 1, nullptr, nullptr, &pool, search_schema, {}, {}); - nlohmann::json old_doc; - old_doc["id"] = "1"; - old_doc["title"] = "One more thing."; - old_doc["points"] = 100; - old_doc["cast"] = {"John Wick", "Jeremy Renner"}; - old_doc["movie"] = true; - - // all fields remain same - - nlohmann::json update_doc1, del_doc1; - update_doc1 = old_doc; - del_doc1 = old_doc; - - index.scrub_reindex_doc(search_schema, update_doc1, del_doc1, old_doc); - ASSERT_EQ(1, del_doc1.size()); - ASSERT_STREQ("1", del_doc1["id"].get().c_str()); - - // when only some fields are updated - - nlohmann::json update_doc2, del_doc2; - update_doc2["id"] = "1"; - update_doc2["points"] = 100; - update_doc2["cast"] = {"Jack"}; - - del_doc2 = update_doc2; - - index.scrub_reindex_doc(search_schema, update_doc2, del_doc2, old_doc); - ASSERT_EQ(2, del_doc2.size()); - ASSERT_STREQ("1", del_doc2["id"].get().c_str()); - std::vector cast = del_doc2["cast"].get>(); - ASSERT_EQ(1, cast.size()); - ASSERT_STREQ("Jack", cast[0].c_str()); - - // containing fields not part of search schema - - nlohmann::json update_doc3, del_doc3; - update_doc3["id"] = "1"; - update_doc3["title"] = "The Lawyer"; - update_doc3["foo"] = "Bar"; - - del_doc3 = update_doc3; - index.scrub_reindex_doc(search_schema, update_doc3, del_doc3, old_doc); - ASSERT_EQ(3, del_doc3.size()); - ASSERT_STREQ("1", del_doc3["id"].get().c_str()); - ASSERT_STREQ("The Lawyer", del_doc3["title"].get().c_str()); - ASSERT_STREQ("Bar", del_doc3["foo"].get().c_str()); - - pool.shutdown(); -} - /*TEST(IndexTest, PointInPolygon180thMeridian) { // somewhere in far eastern russia GeoCoord verts[3] = {