From c066120fb1b61936c1860088188e735a3483c0d1 Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishorenc@gmail.com>
Date: Sat, 1 Jul 2023 13:46:03 +0530
Subject: [PATCH] Fix edge case in updating empty array strings.

---
 include/index.h                        |   3 -
 src/index.cpp                          |  95 ++++------------
 test/collection_specific_more_test.cpp | 147 +++++++++++++++++++++++++
 test/index_test.cpp                    |  60 ----------
 4 files changed, 171 insertions(+), 134 deletions(-)
diff --git a/include/index.h b/include/index.h
index 3f93e182..5c1b77e0 100644
--- a/include/index.h
+++ b/include/index.h
@@ -631,9 +631,6 @@ public:
                                              const std::vector<char>& local_token_separators,
                                              const std::vector<char>& local_symbols_to_index);
 
-    static void scrub_reindex_doc(const tsl::htrie_map<char, field>& search_schema,
-                                  nlohmann::json& update_doc, nlohmann::json& del_doc, const nlohmann::json& old_doc);
-
     static void tokenize_string_field(const nlohmann::json& document,
                                       const field& search_field, std::vector<std::string>& tokens,
                                       const std::string& locale,
diff --git a/src/index.cpp b/src/index.cpp
index 5cfb2287..4e6ac914 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -454,7 +454,6 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite
                 // scrub string fields to reduce delete ops
                 get_doc_changes(index_rec.operation, search_schema, index_rec.doc, index_rec.old_doc,
                                 index_rec.new_doc, index_rec.del_doc);
-                scrub_reindex_doc(search_schema, index_rec.doc, index_rec.del_doc, index_rec.old_doc);
 
                 if(generate_embeddings) {
                     for(auto& field: index_rec.doc.items()) {
@@ -6236,11 +6235,19 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<cha
 
     if(op == UPSERT) {
         new_doc = update_doc;
+        // since UPSERT could replace a doc with lesser fields, we have to add those missing fields to del_doc
+        for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
+            if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
+                continue;
+            }
+
+            if(!update_doc.contains(it.key())) {
+                del_doc[it.key()] = it.value();
+            }
+        }
     } else {
-        new_doc = old_doc;
-
         handle_doc_ops(search_schema, update_doc, old_doc);
-
+        new_doc = old_doc;
         new_doc.merge_patch(update_doc);
 
         if(old_doc.contains(".flat")) {
@@ -6251,87 +6258,33 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<cha
         }
     }
 
-    for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
-        if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
-            continue;
-        }
-
-        if(op == UPSERT && !update_doc.contains(it.key())) {
-            del_doc[it.key()] = it.value();
-        }
-    }
-
     auto it = update_doc.begin();
-    for(; it != update_doc.end(); ) {
-        if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
+    while(it != update_doc.end()) {
+        if(it.value().is_object() || (it.value().is_array() && !it.value().empty() && it.value()[0].is_object())) {
             ++it;
             continue;
         }
 
-        // if the update doc contains a field that exists in old, we record that (for delete + reindex)
-        bool field_exists_in_old_doc = (old_doc.count(it.key()) != 0);
-        if(field_exists_in_old_doc) {
-            // key exists in the stored doc, so it must be reindexed
-            // we need to check for this because a field can be optional
-            del_doc[it.key()] = old_doc[it.key()];
-        }
-
-        // adds new key or overrides existing key from `old_doc`
         if(it.value().is_null()) {
-            // null values should not indexed
+            // null values should not be indexed
             new_doc.erase(it.key());
+            del_doc[it.key()] = old_doc[it.key()];
             it = update_doc.erase(it);
-        } else {
-            ++it;
-        }
-    }
-}
-
-void Index::scrub_reindex_doc(const tsl::htrie_map<char, field>& search_schema,
-                              nlohmann::json& update_doc,
-                              nlohmann::json& del_doc,
-                              const nlohmann::json& old_doc) {
-
-    /*LOG(INFO) << "update_doc: " << update_doc;
-    LOG(INFO) << "old_doc: " << old_doc;
-    LOG(INFO) << "del_doc: " << del_doc;*/
-
-    // del_doc contains fields that exist in both update doc and old doc
-    // But we will only remove fields that are different
-
-    std::vector<std::string> unchanged_keys;
-
-    for(auto it = del_doc.cbegin(); it != del_doc.cend(); it++) {
-        const std::string& field_name = it.key();
-
-        const auto& search_field_it = search_schema.find(field_name);
-        if(search_field_it == search_schema.end()) {
             continue;
         }
 
-        if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
-            continue;
-        }
-
-        const auto search_field = search_field_it.value();  // copy, don't use reference!
-
-        // compare values between old and update docs:
-        // if they match, we will remove them from both del and update docs
-
-        if(update_doc.contains(search_field.name)) {
-            if(update_doc[search_field.name].is_null()) {
-                // we don't allow null values to be stored or indexed but need to be removed from stored doc
-                update_doc.erase(search_field.name);
-            }
-            else if(update_doc[search_field.name] == old_doc[search_field.name]) {
-                unchanged_keys.push_back(field_name);
+        if(old_doc.contains(it.key())) {
+            if(old_doc[it.key()] == it.value()) {
+                // unchanged so should not be part of update doc
+                it = update_doc.erase(it);
+                continue;
+            } else {
+                // delete this old value from index
+                del_doc[it.key()] = old_doc[it.key()];
             }
         }
-    }
 
-    for(const auto& unchanged_key: unchanged_keys) {
-        del_doc.erase(unchanged_key);
-        update_doc.erase(unchanged_key);
+        it++;
     }
 }
 
diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp
index 370fd45f..f53ce1a7 100644
--- a/test/collection_specific_more_test.cpp
+++ b/test/collection_specific_more_test.cpp
@@ -1225,6 +1225,153 @@ TEST_F(CollectionSpecificMoreTest, UpsertUpdateEmplaceShouldAllRemoveIndex) {
     ASSERT_EQ(1, results["found"].get<size_t>());
 }
 
+TEST_F(CollectionSpecificMoreTest, UpdateWithEmptyArray) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+          {"name": "tags", "type": "string[]"}
+        ]
+    })"_json;
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll1 = op.get();
+
+    auto doc1 = R"({
+        "id": "0",
+        "tags": ["alpha", "beta", "gamma"]
+    })"_json;
+
+    ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok());
+
+    auto doc2 = R"({
+        "id": "1",
+        "tags": ["one", "two"]
+    })"_json;
+
+    ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok());
+
+    // via update
+
+    auto doc_update = R"({
+        "id": "0",
+        "tags": []
+    })"_json;
+    ASSERT_TRUE(coll1->add(doc_update.dump(), UPDATE).ok());
+
+    auto results = coll1->search("alpha", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(0, results["found"].get<size_t>());
+
+    // via upsert
+
+    doc_update = R"({
+        "id": "1",
+        "tags": []
+    })"_json;
+    ASSERT_TRUE(coll1->add(doc_update.dump(), UPSERT).ok());
+
+    results = coll1->search("one", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(0, results["found"].get<size_t>());
+}
+
+TEST_F(CollectionSpecificMoreTest, UpdateArrayWithNullValue) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+          {"name": "tags", "type": "string[]", "optional": true}
+        ]
+    })"_json;
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll1 = op.get();
+
+    auto doc1 = R"({
+        "id": "0",
+        "tags": ["alpha", "beta", "gamma"]
+    })"_json;
+
+    ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok());
+
+    auto doc2 = R"({
+        "id": "1",
+        "tags": ["one", "two"]
+    })"_json;
+
+    ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok());
+
+    // via update
+
+    auto doc_update = R"({
+        "id": "0",
+        "tags": null
+    })"_json;
+    ASSERT_TRUE(coll1->add(doc_update.dump(), UPDATE).ok());
+
+    auto results = coll1->search("alpha", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(0, results["found"].get<size_t>());
+
+    // via upsert
+
+    doc_update = R"({
+        "id": "1",
+        "tags": null
+    })"_json;
+    ASSERT_TRUE(coll1->add(doc_update.dump(), UPSERT).ok());
+
+    results = coll1->search("one", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(0, results["found"].get<size_t>());
+}
+
+TEST_F(CollectionSpecificMoreTest, ReplaceArrayElement) {
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+          {"name": "tags", "type": "string[]"}
+        ]
+    })"_json;
+
+    auto op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(op.ok());
+    Collection* coll1 = op.get();
+
+    auto doc1 = R"({
+        "id": "0",
+        "tags": ["alpha", "beta", "gamma"]
+    })"_json;
+
+    ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok());
+
+    auto doc2 = R"({
+        "id": "1",
+        "tags": ["one", "two", "three"]
+    })"_json;
+
+    ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok());
+
+    // via update
+
+    auto doc_update = R"({
+        "id": "0",
+        "tags": ["alpha", "gamma"]
+    })"_json;
+    ASSERT_TRUE(coll1->add(doc_update.dump(), UPDATE).ok());
+
+    auto results = coll1->search("beta", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(0, results["found"].get<size_t>());
+
+    // via upsert
+
+    doc_update = R"({
+        "id": "1",
+        "tags": ["one", "three"]
+    })"_json;
+    ASSERT_TRUE(coll1->add(doc_update.dump(), UPSERT).ok());
+
+    results = coll1->search("two", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(0, results["found"].get<size_t>());
+}
+
 TEST_F(CollectionSpecificMoreTest, UnorderedWeightingOfFields) {
     nlohmann::json schema = R"({
         "name": "coll1",
diff --git a/test/index_test.cpp b/test/index_test.cpp
index d788d5f5..8b2b1fcf 100644
--- a/test/index_test.cpp
+++ b/test/index_test.cpp
@@ -3,66 +3,6 @@
 #include <vector>
 #include <s2/s2loop.h>
 
-TEST(IndexTest, ScrubReindexDoc) {
-    tsl::htrie_map<char, field> search_schema;
-    search_schema.emplace("title", field("title", field_types::STRING, false));
-    search_schema.emplace("points", field("points", field_types::INT32, false));
-    search_schema.emplace("cast", field("cast", field_types::STRING_ARRAY, false));
-    search_schema.emplace("movie", field("movie", field_types::BOOL, false));
-
-    ThreadPool pool(4);
-
-    Index index("index", 1, nullptr, nullptr, &pool, search_schema, {}, {});
-    nlohmann::json old_doc;
-    old_doc["id"] = "1";
-    old_doc["title"] = "One more thing.";
-    old_doc["points"] = 100;
-    old_doc["cast"] = {"John Wick", "Jeremy Renner"};
-    old_doc["movie"] = true;
-
-    // all fields remain same
-
-    nlohmann::json update_doc1, del_doc1;
-    update_doc1 = old_doc;
-    del_doc1 = old_doc;
-
-    index.scrub_reindex_doc(search_schema, update_doc1, del_doc1, old_doc);
-    ASSERT_EQ(1, del_doc1.size());
-    ASSERT_STREQ("1", del_doc1["id"].get<std::string>().c_str());
-
-    // when only some fields are updated
-
-    nlohmann::json update_doc2, del_doc2;
-    update_doc2["id"] = "1";
-    update_doc2["points"] = 100;
-    update_doc2["cast"] = {"Jack"};
-
-    del_doc2 = update_doc2;
-
-    index.scrub_reindex_doc(search_schema, update_doc2, del_doc2, old_doc);
-    ASSERT_EQ(2, del_doc2.size());
-    ASSERT_STREQ("1", del_doc2["id"].get<std::string>().c_str());
-    std::vector<std::string> cast = del_doc2["cast"].get<std::vector<std::string>>();
-    ASSERT_EQ(1, cast.size());
-    ASSERT_STREQ("Jack", cast[0].c_str());
-
-    // containing fields not part of search schema
-
-    nlohmann::json update_doc3, del_doc3;
-    update_doc3["id"] = "1";
-    update_doc3["title"] = "The Lawyer";
-    update_doc3["foo"] = "Bar";
-
-    del_doc3 = update_doc3;
-    index.scrub_reindex_doc(search_schema, update_doc3, del_doc3, old_doc);
-    ASSERT_EQ(3, del_doc3.size());
-    ASSERT_STREQ("1", del_doc3["id"].get<std::string>().c_str());
-    ASSERT_STREQ("The Lawyer", del_doc3["title"].get<std::string>().c_str());
-    ASSERT_STREQ("Bar", del_doc3["foo"].get<std::string>().c_str());
-
-    pool.shutdown();
-}
-
 /*TEST(IndexTest, PointInPolygon180thMeridian) {
     // somewhere in far eastern russia
     GeoCoord verts[3] = {