Array append/remove perf improvements.

2025-05-17 20:22:32 +08:00 · 2020-10-20 06:55:58 +05:30 · 2020-10-20 06:55:58 +05:30 · bccc3ac362
commit bccc3ac362
parent 0508700ca3
6 changed files with 296 additions and 23 deletions
--- a/include/sorted_array.h
+++ b/include/sorted_array.h
@ -52,5 +52,7 @@ public:

    bool insert(size_t index, uint32_t value);

+    void remove_value(uint32_t value);
+
    void remove_values(uint32_t *sorted_values, uint32_t sorted_values_length);
 };
--- a/src/index.cpp
+++ b/src/index.cpp
@ -1911,7 +1911,6 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc

            art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), key, key_len);
            if(leaf != nullptr) {
-                uint32_t seq_id_values[1] = {seq_id};
                uint32_t doc_index = leaf->values->ids.indexOf(seq_id);

                if(doc_index == leaf->values->ids.getLength()) {
@ -1928,7 +1927,7 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
                remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);

                leaf->values->offsets.remove_index(start_offset, end_offset);
-                leaf->values->ids.remove_values(seq_id_values, 1);
+                leaf->values->ids.remove_value(seq_id);

                /*len = leaf->values->offset_index.getLength();
                for(auto i=0; i<len; i++) {
--- a/src/sorted_array.cpp
+++ b/src/sorted_array.cpp
@ -23,24 +23,21 @@ size_t sorted_array::append(uint32_t value) {
    if(value < max) {
        // we will have to re-encode the whole sequence again
        uint32_t* arr = uncompress(length+1);
-        size_t i = 0;
-        while(i < length+1) {
-            if(value < arr[i]) {
-                break;
-            }
-            i++;
-        }

-        for(size_t j=length; j>i; j--) {
+        // find the index of the first element that is >= `value`
+        uint32_t found_val;
+        uint32_t gte_index = for_lower_bound_search(in, length, value, &found_val);
+
+        for(size_t j=length; j>gte_index; j--) {
            arr[j] = arr[j-1];
        }

-        arr[i] = value;
+        arr[gte_index] = value;

        load(arr, length+1);
        delete [] arr;

-        return i;
+        return gte_index;
    } else {
        uint32_t size_required = sorted_append_size_required(value, length+1);
        size_t min_expected_size = size_required + FOR_ELE_SIZE;
@ -104,7 +101,11 @@ uint32_t sorted_array::indexOf(uint32_t value) {

    uint32_t actual;
    uint32_t index = for_lower_bound_search(in, length, value, &actual);
-    if(actual == value) return index;
+
+    if(actual == value) {
+        return index;
+    }
+
    return length;
 }

@ -193,6 +194,28 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint
    binary_search_indices(values, head, tail, low_index, high_index, base, bits, indices);
 }

+// Removes `value` from the array; a no-op when the array is empty or `value` is absent.
+void sorted_array::remove_value(uint32_t value) {
+    // Nothing to search in an empty array. NOTE(review): assumes the search below may leave
+    // `found_val` unset for an empty sequence -- confirm against for_lower_bound_search.
+    if(length == 0) {
+        return ;
+    }
+    // A lower bound search returns the first element in the sequence that is >= `value`
+    uint32_t found_val;
+    uint32_t found_index = for_lower_bound_search(in, length, value, &found_val);
+    if(found_val != value) {
+        return ;
+    }
+
+    uint32_t *curr_array = uncompress();
+    if(found_index + 1 < length) {
+        // shift the tail one slot to the left, overwriting the removed element
+        memmove(&curr_array[found_index], &curr_array[found_index+1], sizeof(uint32_t) * (length - found_index - 1));
+    }
+    load(curr_array, length - 1);
+    delete [] curr_array;
+}
+
 void sorted_array::remove_values(uint32_t *sorted_values, uint32_t sorted_values_length) {
    uint32_t *curr_array = uncompress();

--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@ -14,6 +14,9 @@ protected:
    CollectionManager & collectionManager = CollectionManager::get_instance();
    std::vector<sort_by> sort_fields;

+    // used for generating random text
+    std::vector<std::string> words;
+
    void setupCollection() {
        std::string state_dir_path = "/tmp/typesense_test/collection";
        LOG(INFO) << "Truncating and creating: " << state_dir_path;
@ -48,6 +51,12 @@ protected:
        }

        infile.close();
+
+        std::ifstream words_file(std::string(ROOT_DIR)+"test/resources/common100_english.txt");
+        std::stringstream strstream;
+        strstream << words_file.rdbuf();
+        words_file.close();
+        StringUtils::split(strstream.str(), words, "\n");
    }

    virtual void SetUp() {
@ -59,6 +68,18 @@ protected:
        collectionManager.dispose();
        delete store;
    }
+
+    // Returns `num_words` randomly picked entries of `words`, joined by single spaces.
+    std::string get_text(size_t num_words) {
+        // seed once: time() ticks in seconds, so re-seeding per call repeated the same text
+        static bool seeded = false;
+        if(!seeded) { srand((unsigned) time(nullptr)); seeded = true; }
+        std::vector<std::string> strs;
+        for(size_t i = 0; i < num_words && !words.empty(); i++) {
+            strs.push_back(words[rand() % words.size()]);  // actual word count, not a fixed 100
+        }
+        return StringUtils::join(strs, " ");
+    }
 };

 TEST_F(CollectionTest, VerifyCountOfDocuments) {
@ -1410,6 +1431,90 @@ TEST_F(CollectionTest, ImportDocumentsUpsert) {
    ASSERT_EQ(70, results["hits"][0]["document"]["points"].get<uint32_t>());
 }

+
+// Imports documents that carry no `title`, then upserts a `title` array onto every
+// document twice (the second pass overrides the titles written by the first).
+TEST_F(CollectionTest, ImportDocumentsUpsertOptional) {
+    const size_t NUM_RECORDS = 1000;
+
+    std::vector<field> fields = {
+            field("title", field_types::STRING_ARRAY, false, true),
+            field("points", field_types::INT32, false)
+    };
+
+    Collection *coll1 = collectionManager.get_collection("coll1");
+    if(coll1 == nullptr) {
+        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
+    }
+
+    // builds one upsert payload per id, each holding four freshly generated 10-word titles
+    auto make_title_upserts = [&]() {
+        std::vector<std::string> upserts;
+        for(size_t id = 0; id < NUM_RECORDS; id++) {
+            nlohmann::json updoc;
+            updoc["id"] = std::to_string(id);
+            updoc["title"] = {
+                get_text(10),
+                get_text(10),
+                get_text(10),
+                get_text(10),
+            };
+            upserts.push_back(updoc.dump());
+        }
+        return upserts;
+    };
+
+    // step 1: import records without title
+
+    std::vector<std::string> records;
+    for(size_t id = 0; id < NUM_RECORDS; id++) {
+        nlohmann::json doc;
+        doc["id"] = std::to_string(id);
+        doc["points"] = id;
+        records.push_back(doc.dump());
+    }
+
+    nlohmann::json document;
+    nlohmann::json import_response = coll1->add_many(records, document, false);
+    ASSERT_TRUE(import_response["success"].get<bool>());
+    ASSERT_EQ(1000, import_response["num_imported"].get<int>());
+
+    // step 2: upsert documents with title
+
+    records = make_title_upserts();
+
+    auto started_at = std::chrono::high_resolution_clock::now();
+    import_response = coll1->add_many(records, document, true);
+    auto elapsed_micros = std::chrono::duration_cast<std::chrono::microseconds>(
+            std::chrono::high_resolution_clock::now() - started_at).count();
+
+    //LOG(INFO) << "Time taken for first upsert: " << elapsed_micros;
+
+    ASSERT_TRUE(import_response["success"].get<bool>());
+    ASSERT_EQ(1000, import_response["num_imported"].get<int>());
+
+    // step 3: run upsert again with title override
+
+    records = make_title_upserts();
+
+    started_at = std::chrono::high_resolution_clock::now();
+    import_response = coll1->add_many(records, document, true);
+    elapsed_micros = std::chrono::duration_cast<std::chrono::microseconds>(
+            std::chrono::high_resolution_clock::now() - started_at).count();
+
+    //LOG(INFO) << "Time taken for second upsert: " << elapsed_micros;
+
+    ASSERT_TRUE(import_response["success"].get<bool>());
+    ASSERT_EQ(1000, import_response["num_imported"].get<int>());
+}
+
 TEST_F(CollectionTest, ImportDocuments) {
    Collection *coll_mul_fields;

--- a/test/resources/common100_english.txt
+++ b/test/resources/common100_english.txt
@ -0,0 +1,100 @@
+the
+of
+to
+and
+a
+in
+is
+it
+you
+that
+he
+was
+for
+on
+are
+with
+as
+I
+his
+they
+be
+at
+one
+have
+this
+from
+or
+had
+by
+not
+word
+but
+what
+some
+we
+can
+out
+other
+were
+all
+there
+when
+up
+use
+your
+how
+said
+an
+each
+she
+which
+do
+their
+time
+if
+will
+way
+about
+many
+then
+them
+write
+would
+like
+so
+these
+her
+long
+make
+thing
+see
+him
+two
+has
+look
+more
+day
+could
+go
+come
+did
+number
+sound
+no
+most
+people
+my
+over
+know
+water
+than
+call
+first
+who
+may
+down
+side
+been
+now
+find
--- a/test/sorted_array_test.cpp
+++ b/test/sorted_array_test.cpp
@ -12,7 +12,8 @@ TEST(SortedArrayTest, Append) {
    EXPECT_EQ(arr.indexOf(100), 0);  // when not found must be equal to length (0 in this case)

    for(uint32_t i=0; i < SIZE; i++) {
-        arr.append(i);
+        size_t appended_index = arr.append(i);
+        ASSERT_EQ(i, appended_index);
    }

    EXPECT_EQ(arr.getLength(), SIZE);
@ -28,7 +29,8 @@ TEST(SortedArrayTest, Append) {
    EXPECT_EQ(arr.indexOf(SIZE+1), SIZE);

    sorted_array arr_small;
-    arr_small.append(100);
+    size_t appended_index = arr_small.append(100);
+    EXPECT_EQ(0, appended_index);
    EXPECT_EQ(arr_small.getLength(), 1);
    EXPECT_EQ(arr_small.at(0), 100);
 }
@ -36,18 +38,34 @@ TEST(SortedArrayTest, Append) {
 TEST(SortedArrayTest, AppendOutOfOrder) {
    sorted_array arr;
    for(size_t i=5; i<=10; i++) {
-        arr.append(i);
+        size_t appended_index = arr.append(i);
+        ASSERT_EQ(i-5, appended_index);
    }

    EXPECT_EQ(6, arr.getLength());

-    arr.append(1);
-    arr.append(3);
-    arr.append(2);
-    arr.append(4);
-    arr.append(11);
-    arr.append(14);
-    arr.append(12);
+    int appended_index = -1;
+
+    appended_index = arr.append(1);
+    ASSERT_EQ(0, appended_index);
+
+    appended_index = arr.append(3);
+    ASSERT_EQ(1, appended_index);
+
+    appended_index = arr.append(2);
+    ASSERT_EQ(1, appended_index);
+
+    appended_index = arr.append(4);
+    ASSERT_EQ(3, appended_index);
+
+    appended_index = arr.append(11);
+    ASSERT_EQ(10, appended_index);
+
+    appended_index = arr.append(14);
+    ASSERT_EQ(11, appended_index);
+
+    appended_index = arr.append(12);
+    ASSERT_EQ(11, appended_index);

    EXPECT_EQ(13, arr.getLength());
 }
@ -136,6 +154,32 @@ TEST(SortedArrayTest, Uncompress) {
    delete[] raw_sorted_arr;
 }

+// Removes a handful of values (the first, the last and some in between) one at a time
+// and checks that the length shrinks accordingly and none of them remains in the array.
+TEST(SortedArrayTest, RemoveValue) {
+    const size_t SIZE = 10*1000;
+
+    sorted_array arr;
+    for(size_t v = 0; v < SIZE; v++) {
+        arr.append(v);
+    }
+
+    const uint32_t removed[5] = {0, 100, 1000, 2000, SIZE-1};
+    for(uint32_t gone : removed) {
+        arr.remove_value(gone);
+    }
+
+    ASSERT_EQ(arr.getLength(), SIZE-5);
+
+    // every surviving element must differ from each removed value
+    for(size_t pos = 0; pos < SIZE-5; pos++) {
+        const uint32_t remaining = arr.at(pos);
+        for(uint32_t gone : removed) {
+            ASSERT_FALSE(remaining == gone);
+        }
+    }
+}
+
 TEST(SortedArrayTest, RemoveValues) {
    sorted_array arr;