Fix edge case in updating empty array strings.

This commit is contained in:
Kishore Nallan 2023-07-01 13:46:03 +05:30
parent e60094c88b
commit c066120fb1
4 changed files with 171 additions and 134 deletions

View File

@ -631,9 +631,6 @@ public:
const std::vector<char>& local_token_separators,
const std::vector<char>& local_symbols_to_index);
static void scrub_reindex_doc(const tsl::htrie_map<char, field>& search_schema,
nlohmann::json& update_doc, nlohmann::json& del_doc, const nlohmann::json& old_doc);
static void tokenize_string_field(const nlohmann::json& document,
const field& search_field, std::vector<std::string>& tokens,
const std::string& locale,

View File

@ -454,7 +454,6 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite
// scrub string fields to reduce delete ops
get_doc_changes(index_rec.operation, search_schema, index_rec.doc, index_rec.old_doc,
index_rec.new_doc, index_rec.del_doc);
scrub_reindex_doc(search_schema, index_rec.doc, index_rec.del_doc, index_rec.old_doc);
if(generate_embeddings) {
for(auto& field: index_rec.doc.items()) {
@ -6236,11 +6235,19 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<cha
if(op == UPSERT) {
new_doc = update_doc;
// since UPSERT could replace a doc with lesser fields, we have to add those missing fields to del_doc
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
continue;
}
if(!update_doc.contains(it.key())) {
del_doc[it.key()] = it.value();
}
}
} else {
new_doc = old_doc;
handle_doc_ops(search_schema, update_doc, old_doc);
new_doc = old_doc;
new_doc.merge_patch(update_doc);
if(old_doc.contains(".flat")) {
@ -6251,87 +6258,33 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<cha
}
}
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
continue;
}
if(op == UPSERT && !update_doc.contains(it.key())) {
del_doc[it.key()] = it.value();
}
}
auto it = update_doc.begin();
for(; it != update_doc.end(); ) {
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
while(it != update_doc.end()) {
if(it.value().is_object() || (it.value().is_array() && !it.value().empty() && it.value()[0].is_object())) {
++it;
continue;
}
// if the update doc contains a field that exists in old, we record that (for delete + reindex)
bool field_exists_in_old_doc = (old_doc.count(it.key()) != 0);
if(field_exists_in_old_doc) {
// key exists in the stored doc, so it must be reindexed
// we need to check for this because a field can be optional
del_doc[it.key()] = old_doc[it.key()];
}
// adds new key or overrides existing key from `old_doc`
if(it.value().is_null()) {
// null values should not indexed
// null values should not be indexed
new_doc.erase(it.key());
del_doc[it.key()] = old_doc[it.key()];
it = update_doc.erase(it);
} else {
++it;
}
}
}
void Index::scrub_reindex_doc(const tsl::htrie_map<char, field>& search_schema,
nlohmann::json& update_doc,
nlohmann::json& del_doc,
const nlohmann::json& old_doc) {
/*LOG(INFO) << "update_doc: " << update_doc;
LOG(INFO) << "old_doc: " << old_doc;
LOG(INFO) << "del_doc: " << del_doc;*/
// del_doc contains fields that exist in both update doc and old doc
// But we will only remove fields that are different
std::vector<std::string> unchanged_keys;
for(auto it = del_doc.cbegin(); it != del_doc.cend(); it++) {
const std::string& field_name = it.key();
const auto& search_field_it = search_schema.find(field_name);
if(search_field_it == search_schema.end()) {
continue;
}
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
continue;
}
const auto search_field = search_field_it.value(); // copy, don't use reference!
// compare values between old and update docs:
// if they match, we will remove them from both del and update docs
if(update_doc.contains(search_field.name)) {
if(update_doc[search_field.name].is_null()) {
// we don't allow null values to be stored or indexed but need to be removed from stored doc
update_doc.erase(search_field.name);
}
else if(update_doc[search_field.name] == old_doc[search_field.name]) {
unchanged_keys.push_back(field_name);
if(old_doc.contains(it.key())) {
if(old_doc[it.key()] == it.value()) {
// unchanged so should not be part of update doc
it = update_doc.erase(it);
continue;
} else {
// delete this old value from index
del_doc[it.key()] = old_doc[it.key()];
}
}
}
for(const auto& unchanged_key: unchanged_keys) {
del_doc.erase(unchanged_key);
update_doc.erase(unchanged_key);
it++;
}
}

View File

@ -1225,6 +1225,153 @@ TEST_F(CollectionSpecificMoreTest, UpsertUpdateEmplaceShouldAllRemoveIndex) {
ASSERT_EQ(1, results["found"].get<size_t>());
}
TEST_F(CollectionSpecificMoreTest, UpdateWithEmptyArray) {
    // Replacing a string[] field with an empty array must remove all of the
    // previously indexed tokens, both via UPDATE and via UPSERT.
    nlohmann::json schema = R"({
        "name": "coll1",
        "fields": [
            {"name": "tags", "type": "string[]"}
        ]
    })"_json;

    auto create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(create_op.ok());
    Collection* coll = create_op.get();

    auto first_doc = R"({
        "id": "0",
        "tags": ["alpha", "beta", "gamma"]
    })"_json;
    ASSERT_TRUE(coll->add(first_doc.dump(), CREATE).ok());

    auto second_doc = R"({
        "id": "1",
        "tags": ["one", "two"]
    })"_json;
    ASSERT_TRUE(coll->add(second_doc.dump(), CREATE).ok());

    // clear the array via UPDATE: old tokens of doc 0 must no longer match
    auto partial_doc = R"({
        "id": "0",
        "tags": []
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPDATE).ok());

    auto res = coll->search("alpha", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());

    // clear the array via UPSERT: same expectation for doc 1
    partial_doc = R"({
        "id": "1",
        "tags": []
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPSERT).ok());

    res = coll->search("one", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());
}
TEST_F(CollectionSpecificMoreTest, UpdateArrayWithNullValue) {
    // Setting an optional string[] field to null must drop the field and
    // un-index its old tokens, both via UPDATE and via UPSERT.
    nlohmann::json schema = R"({
        "name": "coll1",
        "fields": [
            {"name": "tags", "type": "string[]", "optional": true}
        ]
    })"_json;

    auto create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(create_op.ok());
    Collection* coll = create_op.get();

    auto first_doc = R"({
        "id": "0",
        "tags": ["alpha", "beta", "gamma"]
    })"_json;
    ASSERT_TRUE(coll->add(first_doc.dump(), CREATE).ok());

    auto second_doc = R"({
        "id": "1",
        "tags": ["one", "two"]
    })"_json;
    ASSERT_TRUE(coll->add(second_doc.dump(), CREATE).ok());

    // null out the field via UPDATE: doc 0 must no longer match
    auto partial_doc = R"({
        "id": "0",
        "tags": null
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPDATE).ok());

    auto res = coll->search("alpha", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());

    // null out the field via UPSERT: same expectation for doc 1
    partial_doc = R"({
        "id": "1",
        "tags": null
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPSERT).ok());

    res = coll->search("one", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());
}
TEST_F(CollectionSpecificMoreTest, ReplaceArrayElement) {
    // Shrinking a string[] field must un-index the removed element, both via
    // UPDATE and via UPSERT.
    nlohmann::json schema = R"({
        "name": "coll1",
        "fields": [
            {"name": "tags", "type": "string[]"}
        ]
    })"_json;

    auto create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(create_op.ok());
    Collection* coll = create_op.get();

    auto first_doc = R"({
        "id": "0",
        "tags": ["alpha", "beta", "gamma"]
    })"_json;
    ASSERT_TRUE(coll->add(first_doc.dump(), CREATE).ok());

    auto second_doc = R"({
        "id": "1",
        "tags": ["one", "two", "three"]
    })"_json;
    ASSERT_TRUE(coll->add(second_doc.dump(), CREATE).ok());

    // drop "beta" via UPDATE: the removed token must no longer match
    auto partial_doc = R"({
        "id": "0",
        "tags": ["alpha", "gamma"]
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPDATE).ok());

    auto res = coll->search("beta", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());

    // drop "two" via UPSERT: same expectation for doc 1
    partial_doc = R"({
        "id": "1",
        "tags": ["one", "three"]
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPSERT).ok());

    res = coll->search("two", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());
}
TEST_F(CollectionSpecificMoreTest, UnorderedWeightingOfFields) {
nlohmann::json schema = R"({
"name": "coll1",

View File

@ -3,66 +3,6 @@
#include <vector>
#include <s2/s2loop.h>
TEST(IndexTest, ScrubReindexDoc) {
    // scrub_reindex_doc() removes, from both del_doc and update_doc, any
    // schema field whose value is identical in old_doc and update_doc; fields
    // outside the schema and changed fields are left alone.
    tsl::htrie_map<char, field> search_schema;
    search_schema.emplace("title", field("title", field_types::STRING, false));
    search_schema.emplace("points", field("points", field_types::INT32, false));
    search_schema.emplace("cast", field("cast", field_types::STRING_ARRAY, false));
    search_schema.emplace("movie", field("movie", field_types::BOOL, false));

    ThreadPool pool(4);
    Index index("index", 1, nullptr, nullptr, &pool, search_schema, {}, {});

    nlohmann::json old_doc;
    old_doc["id"] = "1";
    old_doc["title"] = "One more thing.";
    old_doc["points"] = 100;
    old_doc["cast"] = {"John Wick", "Jeremy Renner"};
    old_doc["movie"] = true;

    // case 1: every field unchanged — only "id" (not in schema) survives
    nlohmann::json full_update, full_del;
    full_update = old_doc;
    full_del = old_doc;
    index.scrub_reindex_doc(search_schema, full_update, full_del, old_doc);
    ASSERT_EQ(1, full_del.size());
    ASSERT_STREQ("1", full_del["id"].get<std::string>().c_str());

    // case 2: subset of fields; "points" is unchanged (scrubbed) while
    // "cast" differs from old_doc, so it stays alongside "id"
    nlohmann::json partial_update, partial_del;
    partial_update["id"] = "1";
    partial_update["points"] = 100;
    partial_update["cast"] = {"Jack"};
    partial_del = partial_update;
    index.scrub_reindex_doc(search_schema, partial_update, partial_del, old_doc);
    ASSERT_EQ(2, partial_del.size());
    ASSERT_STREQ("1", partial_del["id"].get<std::string>().c_str());
    std::vector<std::string> cast_values = partial_del["cast"].get<std::vector<std::string>>();
    ASSERT_EQ(1, cast_values.size());
    ASSERT_STREQ("Jack", cast_values[0].c_str());

    // case 3: fields absent from the search schema ("foo") are untouched,
    // and a changed schema field ("title") is kept
    nlohmann::json extra_update, extra_del;
    extra_update["id"] = "1";
    extra_update["title"] = "The Lawyer";
    extra_update["foo"] = "Bar";
    extra_del = extra_update;
    index.scrub_reindex_doc(search_schema, extra_update, extra_del, old_doc);
    ASSERT_EQ(3, extra_del.size());
    ASSERT_STREQ("1", extra_del["id"].get<std::string>().c_str());
    ASSERT_STREQ("The Lawyer", extra_del["title"].get<std::string>().c_str());
    ASSERT_STREQ("Bar", extra_del["foo"].get<std::string>().c_str());

    pool.shutdown();
}
/*TEST(IndexTest, PointInPolygon180thMeridian) {
// somewhere in far eastern russia
GeoCoord verts[3] = {