Merge branch 'doc-update'

Kishore Nallan 2020-10-25 20:33:28 +05:30
commit 3d1ea448b6
30 changed files with 1499 additions and 302 deletions

View File

@ -52,7 +52,7 @@ Here's a quick example showcasing how you can create a collection, index a docum
Let's begin by starting the Typesense server via Docker:
```
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.15.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.16.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
```
We have [API Clients](#api-clients) in a couple of languages, but let's use the Python client for this example.

View File

@ -18,6 +18,8 @@ private:
}
public:
void load(const uint32_t *sorted_array, uint32_t array_length, uint32_t m, uint32_t M);
uint32_t at(uint32_t index);
bool contains(uint32_t value);
@ -26,5 +28,7 @@ public:
bool append(uint32_t value);
bool insert(size_t index, const uint32_t* values, size_t num_values);
void remove_index(uint32_t start_index, uint32_t end_index);
};

View File

@ -36,7 +36,8 @@ public:
in = nullptr;
}
uint32_t* uncompress();
// len determines length of output buffer (default: length of input)
uint32_t* uncompress(uint32_t len=0);
uint32_t getSizeInBytes();
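
A minimal sketch (not part of this commit) of the pattern the new `len` parameter enables: uncompress into a buffer with extra headroom, shift elements to make room, then `load()` the grown buffer back. It assumes `arr` is an `array`, and `index`, `new_value`, `new_min`, `new_max` are hypothetical values; the same pattern appears in `array::insert()` later in this diff.
```
// Sketch only: grow a compressed array in place via uncompress(len).
uint32_t* buf = arr.uncompress(arr.getLength() + 1);          // buffer sized for one extra element
memmove(&buf[index + 1], &buf[index],
        sizeof(uint32_t) * (arr.getLength() - index));        // shift the tail right by one slot
buf[index] = new_value;                                       // write the inserted value
arr.load(buf, arr.getLength() + 1, new_min, new_max);         // re-compress the grown buffer
delete [] buf;
```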

View File

@ -96,9 +96,9 @@ typedef struct {
* of arbitrary size, as they include the key.
*/
typedef struct {
art_values* values;
int32_t max_score;
uint32_t key_len;
int64_t max_score;
art_values* values;
unsigned char key[];
} art_leaf;

View File

@ -92,6 +92,11 @@ struct override_t {
}
};
struct doc_seq_id_t {
uint32_t seq_id;
bool is_new;
};
class Collection {
private:
@ -150,7 +155,9 @@ private:
void highlight_result(const field &search_field, const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV* field_order_kv, const nlohmann::json &document,
StringUtils & string_utils, size_t snippet_threshold,
StringUtils & string_utils,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
bool highlighted_fully,
highlight_t &highlight);
@ -217,13 +224,16 @@ public:
std::string get_default_sorting_field();
Option<uint32_t> to_doc(const std::string & json_str, nlohmann::json & document);
Option<doc_seq_id_t> to_doc(const std::string& json_str, nlohmann::json& document,
const index_operation_t& operation, const std::string& id="");
nlohmann::json get_summary_json();
Option<nlohmann::json> add(const std::string & json_str);
Option<nlohmann::json> add(const std::string & json_str,
const index_operation_t& operation=CREATE, const std::string& id="");
nlohmann::json add_many(std::vector<std::string>& json_lines);
nlohmann::json add_many(std::vector<std::string>& json_lines, nlohmann::json& document,
const index_operation_t& operation=CREATE, const std::string& id="");
Option<nlohmann::json> search(const std::string & query, const std::vector<std::string> & search_fields,
const std::string & simple_filter_query, const std::vector<std::string> & facet_fields,
@ -236,6 +246,7 @@ public:
size_t max_facet_values=10,
const std::string & simple_facet_query = "",
const size_t snippet_threshold = 30,
const size_t highlight_affix_num_tokens = 4,
const std::string & highlight_full_fields = "",
size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD,
const std::map<size_t, std::vector<std::string>>& pinned_hits={},
@ -263,7 +274,7 @@ public:
Option<bool> get_document_from_store(const std::string & seq_id_key, nlohmann::json & document);
Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id);
Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id, bool is_update);
size_t par_index_in_memory(std::vector<std::vector<index_record>> & iter_batch, std::vector<size_t>& indexed_counts);
@ -296,5 +307,9 @@ public:
size_t &num_indexed);
bool is_exceeding_memory_threshold() const;
void get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc,
nlohmann::json &new_doc,
nlohmann::json &del_doc);
};
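
A minimal sketch (not part of this commit) of how the widened `add()` signature might be called for the three operations; the document body is invented and `coll` is assumed to be a valid `Collection*` obtained from `CollectionManager`.
```
// Hypothetical usage of the new add() overload.
void add_examples(Collection* coll) {
    const std::string body = R"({"id": "42", "title": "Foo", "points": 10})";

    coll->add(body);                                   // CREATE (default): indexes a new doc; 409 if id "42" already exists
    coll->add(body, UPSERT);                           // UPSERT: creates the document if missing, otherwise updates it
    coll->add(body, index_operation_t::UPDATE, "42");  // UPDATE: updates an existing document; 404 if id "42" is not found
}
```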

View File

@ -23,6 +23,8 @@ bool get_export_documents(http_req& req, http_res& res);
bool post_add_document(http_req& req, http_res& res);
bool patch_update_document(http_req& req, http_res& res);
bool post_import_documents(http_req& req, http_res& res);
bool get_fetch_document(http_req& req, http_res& res);

View File

@ -127,6 +127,8 @@ public:
void put(const std::string & path, bool (*handler)(http_req & req, http_res & res), bool async_req=false, bool async_res=false);
void patch(const std::string & path, bool (*handler)(http_req & req, http_res & res), bool async_req=false, bool async_res=false);
void del(const std::string & path, bool (*handler)(http_req & req, http_res & res), bool async_req=false, bool async_res=false);
void on(const std::string & message, bool (*handler)(void*));

View File

@ -79,15 +79,29 @@ struct search_args {
};
};
enum index_operation_t {
CREATE,
UPSERT,
UPDATE,
DELETE
};
struct index_record {
size_t position; // position of record in the original request
size_t position; // position of record in the original request
uint32_t seq_id;
nlohmann::json document;
Option<bool> indexed; // indicates if the indexing operation was a success
nlohmann::json doc;
nlohmann::json old_doc;
nlohmann::json new_doc;
nlohmann::json del_doc;
index_record(size_t record_pos, uint32_t seq_id, const nlohmann::json& doc):
position(record_pos), seq_id(seq_id), document(doc), indexed(true) {
index_operation_t operation;
bool is_update;
Option<bool> indexed; // indicates if the indexing operation was a success
index_record(size_t record_pos, uint32_t seq_id, const nlohmann::json& doc, index_operation_t operation):
position(record_pos), seq_id(seq_id), doc(doc), operation(operation), is_update(false), indexed(false) {
}
@ -95,7 +109,7 @@ struct index_record {
indexed = Option<bool>(err_code, err_msg);
}
void index_success(const index_record & record) {
void index_success() {
indexed = Option<bool>(true);
}
};
@ -154,32 +168,32 @@ private:
size_t & all_result_ids_len,
const size_t typo_tokens_threshold);
void insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
void insert_doc(const int64_t score, art_tree *t, uint32_t seq_id,
const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const;
void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id,
void index_string_field(const std::string & text, const int64_t score, art_tree *t, uint32_t seq_id,
int facet_id, const field & a_field);
void index_string_array_field(const std::vector<std::string> & strings, const uint32_t score, art_tree *t,
void index_string_array_field(const std::vector<std::string> & strings, const int64_t score, art_tree *t,
uint32_t seq_id, int facet_id, const field & a_field);
void index_int32_field(const int32_t value, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_int32_field(const int32_t value, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_int64_field(const int64_t value, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_int64_field(const int64_t value, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_float_field(const float value, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_float_field(const float value, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_bool_field(const bool value, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_bool_field(const bool value, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_int32_array_field(const std::vector<int32_t> & values, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_int32_array_field(const std::vector<int32_t> & values, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_int64_array_field(const std::vector<int64_t> & values, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_int64_array_field(const std::vector<int64_t> & values, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_float_array_field(const std::vector<float> & values, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_float_array_field(const std::vector<float> & values, const int64_t score, art_tree *t, uint32_t seq_id) const;
void index_bool_array_field(const std::vector<bool> & values, const uint32_t score, art_tree *t, uint32_t seq_id) const;
void index_bool_array_field(const std::vector<bool> & values, const int64_t score, art_tree *t, uint32_t seq_id) const;
void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
void remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
const uint32_t indices_length);
uint32_t* collate_leaf_ids(const std::vector<const art_leaf *> &leaves, size_t& result_ids_len) const;
@ -238,21 +252,22 @@ public:
spp::sparse_hash_set<uint64_t>& groups_processed,
const uint32_t *result_ids, const size_t result_size);
static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);
static int64_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);
Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id,
const std::string & default_sorting_field);
const std::string & default_sorting_field, bool is_update);
static Option<uint32_t> validate_index_in_memory(const nlohmann::json &document, uint32_t seq_id,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema);
const std::map<std::string, field> & facet_schema,
bool is_update);
static size_t batch_memory_index(Index *index,
std::vector<index_record> & iter_batch,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema);
std::vector<index_record> & iter_batch,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema);
const spp::sparse_hash_map<std::string, art_tree *> &_get_search_index() const;
@ -291,5 +306,10 @@ public:
void eq_str_filter_plain(const uint32_t *strt_ids, size_t strt_ids_size,
const std::vector<art_leaf *> &query_suggestion,
uint32_t *exact_strt_ids, size_t& exact_strt_size) const;
void scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_doc, nlohmann::json& old_doc);
void tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field,
std::vector<std::string>& tokens);
};

View File

@ -8,6 +8,7 @@
#include <limits>
#include <iostream>
#include "array_base.h"
#include "logger.h"
class sorted_array: public array_base {
private:
@ -16,7 +17,15 @@ private:
uint32_t m = std::min(min, value);
uint32_t M = std::max(max, value);
uint32_t bnew = required_bits(M - m);
return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
uint32_t size_bits = for_compressed_size_bits(new_length, bnew);
/*if(new_length == 15) {
LOG(INFO) << "value: " << value << ", m: " << m << ", M: " << M << ", bnew: "
<< bnew << ", size_bits: " << size_bits;
}*/
return METADATA_OVERHEAD + 4 + size_bits;
}
uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,
@ -39,7 +48,11 @@ public:
void indexOf(const uint32_t *values, const size_t values_len, uint32_t* indices);
// returns false if malloc fails
bool append(uint32_t value);
size_t append(uint32_t value);
void remove_values(uint32_t *sorted_values, uint32_t values_length);
bool insert(size_t index, uint32_t value);
void remove_value(uint32_t value);
void remove_values(uint32_t *sorted_values, uint32_t sorted_values_length);
};

View File

@ -199,6 +199,15 @@ struct StringUtils {
return (*p == 0) && val >= std::numeric_limits<int32_t>::min() && val <= std::numeric_limits<int32_t>::max();
}
static bool is_bool(std::string &s) {
if(s.empty()) {
return false;
}
StringUtils::tolowercase(s);
return s == "true" || s == "false";
}
static void toupper(std::string& str) {
std::transform(str.begin(), str.end(), str.begin(), ::toupper);
}

View File

@ -41,6 +41,47 @@ bool array::append(uint32_t value) {
return true;
}
void array::load(const uint32_t *sorted_array, const uint32_t array_length, const uint32_t m, const uint32_t M) {
min = m;
max = M;
uint32_t size_required = (uint32_t) (unsorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR);
uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out);
uint32_t actual_size = for_compress_unsorted(sorted_array, out, array_length);
free(in);
in = nullptr;
in = out;
length = array_length;
size_bytes = size_required;
length_bytes = actual_size;
}
bool array::insert(size_t index, const uint32_t* values, size_t num_values) {
if(index >= length) {
return false;
}
uint32_t *curr_array = uncompress(length+num_values);
memmove(&curr_array[index+num_values], &curr_array[index], sizeof(uint32_t)*(length-index));
uint32_t m = min, M = max;
for(size_t i=0; i<num_values; i++) {
uint32_t value = values[i];
if(value < m) m = value;
if(value > M) M = value;
curr_array[index+i] = value;
}
load(curr_array, length+num_values, m, M);
delete [] curr_array;
return true;
}
void array::remove_index(uint32_t start_index, uint32_t end_index) {
uint32_t *curr_array = uncompress();

View File

@ -1,7 +1,8 @@
#include "array_base.h"
uint32_t* array_base::uncompress() {
uint32_t *out = new uint32_t[length];
uint32_t* array_base::uncompress(uint32_t len) {
uint32_t actual_len = std::max(len, length);
uint32_t *out = new uint32_t[actual_len];
for_uncompress(in, out, length);
return out;
}

View File

@ -39,6 +39,8 @@ static void art_fuzzy_recurse(unsigned char p, unsigned char c, const art_node *
void art_int_fuzzy_recurse(art_node *n, int depth, const unsigned char* int_str, int int_str_len,
NUM_COMPARATOR comparator, std::vector<const art_leaf *> &results);
static void insert_and_shift_offset_index(sorted_array& offset_index, const uint32_t index, const uint32_t num_offsets);
bool compare_art_leaf_frequency(const art_leaf *a, const art_leaf *b) {
return a->values->ids.getLength() > b->values->ids.getLength();
}
@ -408,15 +410,42 @@ art_leaf* art_maximum(art_tree *t) {
static void add_document_to_leaf(const art_document *document, art_leaf *leaf) {
leaf->max_score = MAX(leaf->max_score, document->score);
leaf->values->ids.append(document->id);
uint32_t curr_index = leaf->values->offsets.getLength();
leaf->values->offset_index.append(curr_index);
size_t inserted_index = leaf->values->ids.append(document->id);
for(uint32_t i=0; i<document->offsets_len; i++) {
leaf->values->offsets.append(document->offsets[i]);
if(inserted_index == leaf->values->ids.getLength()-1) {
// treat as appends
uint32_t curr_index = leaf->values->offsets.getLength();
leaf->values->offset_index.append(curr_index);
for(uint32_t i=0; i<document->offsets_len; i++) {
leaf->values->offsets.append(document->offsets[i]);
}
} else {
uint32_t existing_offset_index = leaf->values->offset_index.at(inserted_index);
insert_and_shift_offset_index(leaf->values->offset_index, inserted_index, document->offsets_len);
leaf->values->offsets.insert(existing_offset_index, document->offsets, document->offsets_len);
}
}
void insert_and_shift_offset_index(sorted_array& offset_index, const uint32_t index, const uint32_t num_offsets) {
uint32_t existing_offset_index = offset_index.at(index);
uint32_t length = offset_index.getLength();
uint32_t new_length = length + 1;
uint32_t *curr_array = offset_index.uncompress(new_length);
memmove(&curr_array[index+1], &curr_array[index], sizeof(uint32_t)*(length - index));
curr_array[index] = existing_offset_index;
uint32_t curr_index = index + 1;
while(curr_index < new_length) {
curr_array[curr_index] += num_offsets;
curr_index++;
}
offset_index.load(curr_array, new_length);
delete [] curr_array;
}
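
A small worked example (numbers invented) of what insert_and_shift_offset_index does when a document id lands in the middle of a leaf's id list:
```
// Illustration only, with made-up numbers.
// offset_index before: [0, 3, 7]    -> docs at positions 0, 1, 2 start at offsets 0, 3, 7
// A new doc is inserted at position 1 with 2 offsets (num_offsets = 2):
//   insert_and_shift_offset_index(offset_index, 1, 2);
// offset_index after:  [0, 3, 5, 9] -> the old entry 3 is duplicated for the new doc,
//                                      and every later entry is shifted by num_offsets.
```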
static art_leaf* make_leaf(const unsigned char *key, uint32_t key_len, art_document *document) {
art_leaf *l = (art_leaf *) malloc(sizeof(art_leaf) + key_len);
l->values = new art_values;

View File

@ -5,7 +5,7 @@ constexpr const char* AuthManager::DOCUMENTS_SEARCH_ACTION;
Option<bool> AuthManager::init(Store *store) {
// This function must be idempotent, i.e. when called multiple times, must produce the same state without leaks
LOG(INFO) << "AuthManager::init()";
//LOG(INFO) << "AuthManager::init()";
this->store = store;
@ -157,7 +157,7 @@ bool AuthManager::authenticate(const std::string& req_api_key, const std::string
}
// enrich params with values from embedded_params
for (const auto& it: embedded_params.items()){
for(auto it = embedded_params.begin(); it != embedded_params.end(); ++it) {
if(params.count(it.key()) == 0) {
params[it.key()] = it.value();
} else if(it.key() == "filter_by") {

View File

@ -8,7 +8,6 @@
#include <art.h>
#include <thread>
#include <future>
#include <chrono>
#include <rocksdb/write_batch.h>
#include <system_metrics.h>
#include "topster.h"
@ -99,33 +98,75 @@ void Collection::increment_next_seq_id_field() {
next_seq_id++;
}
Option<uint32_t> Collection::to_doc(const std::string & json_str, nlohmann::json & document) {
Option<doc_seq_id_t> Collection::to_doc(const std::string & json_str, nlohmann::json& document,
const index_operation_t& operation, const std::string& id) {
try {
document = nlohmann::json::parse(json_str);
} catch(const std::exception& e) {
LOG(ERROR) << "JSON error: " << e.what();
return Option<uint32_t>(400, std::string("Bad JSON: ") + e.what());
return Option<doc_seq_id_t>(400, std::string("Bad JSON: ") + e.what());
}
if(!document.is_object()) {
return Option<uint32_t>(400, "Bad JSON: not a properly formed document.");
return Option<doc_seq_id_t>(400, "Bad JSON: not a properly formed document.");
}
uint32_t seq_id = get_next_seq_id();
std::string seq_id_str = std::to_string(seq_id);
if(document.count("id") != 0 && id != "" && document["id"] != id) {
return Option<doc_seq_id_t>(400, "The `id` of the resource does not match the `id` in the JSON body.");
}
if(document.count("id") == 0 && !id.empty()) {
// use the explicit ID (usually from a PUT request) if document body does not have it
document["id"] = id;
}
if(document.count("id") != 0 && document["id"] == "") {
return Option<doc_seq_id_t>(400, "The `id` should not be empty.");
}
if(document.count("id") == 0) {
document["id"] = seq_id_str;
} else if(!document["id"].is_string()) {
return Option<uint32_t>(400, "Document's `id` field should be a string.");
}
if(operation == UPDATE) {
return Option<doc_seq_id_t>(400, "For update, the `id` key must be provided.");
}
// for UPSERT or CREATE, if a document does not have an ID, we will treat it as a new doc
uint32_t seq_id = get_next_seq_id();
document["id"] = std::to_string(seq_id);
return Option<doc_seq_id_t>(doc_seq_id_t{seq_id, true});
} else {
if(!document["id"].is_string()) {
return Option<doc_seq_id_t>(400, "Document's `id` field should be a string.");
}
const std::string& doc_id = document["id"];
if(doc_exists(doc_id)) {
return Option<uint32_t>(409, std::string("A document with id ") + doc_id + " already exists.");
}
const std::string& doc_id = document["id"];
return Option<uint32_t>(seq_id);
// try to get the corresponding sequence id from disk if present
std::string seq_id_str;
StoreStatus seq_id_status = store->get(get_doc_id_key(doc_id), seq_id_str);
if(seq_id_status == StoreStatus::ERROR) {
return Option<doc_seq_id_t>(500, "Error fetching the sequence key for document with id: " + doc_id);
}
if(seq_id_status == StoreStatus::FOUND) {
if(operation == CREATE) {
return Option<doc_seq_id_t>(409, std::string("A document with id ") + doc_id + " already exists.");
}
// UPSERT or UPDATE
uint32_t seq_id = (uint32_t) std::stoul(seq_id_str);
return Option<doc_seq_id_t>(doc_seq_id_t{seq_id, false});
} else {
if(operation == UPDATE) {
// for UPDATE, a document with given ID must be found
return Option<doc_seq_id_t>(404, "Could not find a document with id: " + doc_id);
} else {
// for UPSERT or CREATE, if a document with given ID is not found, we will treat it as a new doc
uint32_t seq_id = get_next_seq_id();
return Option<doc_seq_id_t>(doc_seq_id_t{seq_id, true});
}
}
}
}
nlohmann::json Collection::get_summary_json() {
@ -152,45 +193,48 @@ nlohmann::json Collection::get_summary_json() {
return json_response;
}
Option<nlohmann::json> Collection::add(const std::string & json_str) {
Option<nlohmann::json> Collection::add(const std::string & json_str,
const index_operation_t& operation, const std::string& id) {
nlohmann::json document;
Option<uint32_t> doc_seq_id_op = to_doc(json_str, document);
std::vector<std::string> json_lines = {json_str};
const nlohmann::json& res = add_many(json_lines, document, operation, id);
if(!doc_seq_id_op.ok()) {
return Option<nlohmann::json>(doc_seq_id_op.code(), doc_seq_id_op.error());
}
if(!res["success"].get<bool>()) {
nlohmann::json res_doc;
/*if(is_exceeding_memory_threshold()) {
return Option<nlohmann::json>(403, "Max memory ratio exceeded.");
}*/
try {
res_doc = nlohmann::json::parse(json_lines[0]);
} catch(const std::exception& e) {
LOG(ERROR) << "JSON error: " << e.what();
return Option<nlohmann::json>(400, std::string("Bad JSON: ") + e.what());
}
const uint32_t seq_id = doc_seq_id_op.get();
const std::string seq_id_str = std::to_string(seq_id);
const Option<uint32_t> & index_memory_op = index_in_memory(document, seq_id);
if(!index_memory_op.ok()) {
return Option<nlohmann::json>(index_memory_op.code(), index_memory_op.error());
}
const std::string& serialized_json = document.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore);
rocksdb::WriteBatch batch;
batch.Put(get_doc_id_key(document["id"]), seq_id_str);
batch.Put(get_seq_id_key(seq_id), serialized_json);
bool write_ok = store->batch_write(batch);
if(!write_ok) {
remove_document(document, seq_id, false); // remove from in-memory store too
return Option<nlohmann::json>(500, "Could not write to on-disk storage.");
return Option<nlohmann::json>(res_doc["code"].get<size_t>(), res_doc["error"].get<std::string>());
}
return Option<nlohmann::json>(document);
}
nlohmann::json Collection::add_many(std::vector<std::string>& json_lines) {
//LOG(INFO) << "Memory ratio. Max = " << max_memory_ratio << ", Used = " << SystemMetrics::used_memory_ratio();
void Collection::get_doc_changes(const nlohmann::json &document, nlohmann::json &old_doc,
nlohmann::json &new_doc, nlohmann::json &del_doc) {
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
new_doc[it.key()] = it.value();
}
for(auto it = document.begin(); it != document.end(); ++it) {
new_doc[it.key()] = it.value();
if(old_doc.count(it.key()) != 0) {
// key exists in the stored doc, so it must be reindexed
// we need to check for this because a field can be optional
del_doc[it.key()] = old_doc[it.key()];
}
}
}
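
A worked example (documents invented) of what get_doc_changes produces for a partial update:
```
// Illustration only, with made-up documents.
// old_doc (stored):        {"id": "42", "title": "Foo", "points": 10}
// document (update body):  {"id": "42", "points": 25}
//
// new_doc: {"id": "42", "title": "Foo", "points": 25}   // old_doc overlaid with the update body
// del_doc: {"id": "42", "points": 10}                   // keys being replaced, with their old values,
//                                                       // so they can be removed from the index first
```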
nlohmann::json Collection::add_many(std::vector<std::string>& json_lines, nlohmann::json& document,
const index_operation_t& operation, const std::string& id) {
//LOG(INFO) << "Memory ratio. Max = " << max_memory_ratio << ", Used = " << SystemMetrics::used_memory_ratio();
std::vector<std::vector<index_record>> iter_batch;
for(size_t i = 0; i < num_memory_shards; i++) {
@ -203,16 +247,23 @@ nlohmann::json Collection::add_many(std::vector<std::string>& json_lines) {
for(size_t i=0; i < json_lines.size(); i++) {
const std::string & json_line = json_lines[i];
nlohmann::json document;
Option<uint32_t> doc_seq_id_op = to_doc(json_line, document);
Option<doc_seq_id_t> doc_seq_id_op = to_doc(json_line, document, operation, id);
const uint32_t seq_id = doc_seq_id_op.ok() ? doc_seq_id_op.get() : 0;
index_record record(i, seq_id, document);
const uint32_t seq_id = doc_seq_id_op.ok() ? doc_seq_id_op.get().seq_id : 0;
index_record record(i, seq_id, document, operation);
// NOTE: we overwrite the input json_lines with result to avoid memory pressure
record.is_update = false;
if(!doc_seq_id_op.ok()) {
record.index_failure(doc_seq_id_op.code(), doc_seq_id_op.error());
} else {
record.is_update = !doc_seq_id_op.get().is_new;
if(record.is_update) {
get_document_from_store(get_seq_id_key(seq_id), record.old_doc);
get_doc_changes(document, record.old_doc, record.new_doc, record.del_doc);
}
}
/*
@ -261,45 +312,74 @@ void Collection::batch_index(std::vector<std::vector<index_record>> &index_batch
// store only documents that were indexed in-memory successfully
for(auto& index_batch: index_batches) {
for(auto& index_record: index_batch) {
nlohmann::json res;
if(index_record.indexed.ok()) {
const std::string& seq_id_str = std::to_string(index_record.seq_id);
const std::string& serialized_json = index_record.document.dump(-1, ' ', false,
nlohmann::detail::error_handler_t::ignore);
if(index_record.is_update) {
const std::string& serialized_json = index_record.new_doc.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore);
bool write_ok = store->insert(get_seq_id_key(index_record.seq_id), serialized_json);
rocksdb::WriteBatch batch;
batch.Put(get_doc_id_key(index_record.document["id"]), seq_id_str);
batch.Put(get_seq_id_key(index_record.seq_id), serialized_json);
bool write_ok = store->batch_write(batch);
if(!write_ok) {
// we will attempt to reindex the old doc on a best-effort basis
remove_document(index_record.new_doc, index_record.seq_id, false);
index_in_memory(index_record.old_doc, index_record.seq_id, false);
index_record.index_failure(500, "Could not write to on-disk storage.");
} else {
num_indexed++;
index_record.index_success();
}
if(!write_ok) {
index_record.indexed = Option<bool>(500, "Could not write to on-disk storage.");
// remove from in-memory store to keep the state synced
remove_document(index_record.document, index_record.seq_id, false);
} else {
const std::string& seq_id_str = std::to_string(index_record.seq_id);
const std::string& serialized_json = index_record.doc.dump(-1, ' ', false,
nlohmann::detail::error_handler_t::ignore);
rocksdb::WriteBatch batch;
batch.Put(get_doc_id_key(index_record.doc["id"]), seq_id_str);
batch.Put(get_seq_id_key(index_record.seq_id), serialized_json);
bool write_ok = store->batch_write(batch);
if(!write_ok) {
// remove from in-memory store to keep the state synced
remove_document(index_record.doc, index_record.seq_id, false);
index_record.index_failure(500, "Could not write to on-disk storage.");
} else {
num_indexed++;
index_record.index_success();
}
}
json_out[index_record.position] = R"({"success": true})";
num_indexed++;
res["success"] = index_record.indexed.ok();
if(!index_record.indexed.ok()) {
res["document"] = json_out[index_record.position];
res["error"] = index_record.indexed.error();
res["code"] = index_record.indexed.code();
}
} else {
nlohmann::json res;
res["success"] = false;
res["error"] = index_record.indexed.error();
res["document"] = json_out[index_record.position];
json_out[index_record.position] = res.dump();
res["error"] = index_record.indexed.error();
res["code"] = index_record.indexed.code();
}
json_out[index_record.position] = res.dump();
}
}
}
Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
Option<uint32_t> validation_op = Index::validate_index_in_memory(document, seq_id, default_sorting_field,
search_schema, facet_schema);
Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id, bool is_update) {
if(!is_update) {
// for update, validation should be done prior
Option<uint32_t> validation_op = Index::validate_index_in_memory(document, seq_id, default_sorting_field,
search_schema, facet_schema, is_update);
if(!validation_op.ok()) {
return validation_op;
if(!validation_op.ok()) {
return validation_op;
}
}
Index* index = indices[seq_id % num_memory_shards];
index->index_in_memory(document, seq_id, default_sorting_field);
index->index_in_memory(document, seq_id, default_sorting_field, is_update);
num_documents += 1;
return Option<>(200);
@ -418,6 +498,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
const size_t max_facet_values,
const std::string & simple_facet_query,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
const std::string & highlight_full_fields,
size_t typo_tokens_threshold,
const std::map<size_t, std::vector<std::string>>& pinned_hits,
@ -992,7 +1073,8 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
highlight_t highlight;
highlight_result(search_field, searched_queries, field_order_kv, document,
string_utils, snippet_threshold, highlighted_fully, highlight);
string_utils, snippet_threshold, highlight_affix_num_tokens,
highlighted_fully, highlight);
if(!highlight.snippets.empty()) {
highlights.push_back(highlight);
@ -1238,7 +1320,9 @@ void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t
void Collection::highlight_result(const field &search_field,
const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV* field_order_kv, const nlohmann::json & document,
StringUtils & string_utils, size_t snippet_threshold,
StringUtils & string_utils,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
bool highlighted_fully,
highlight_t & highlight) {
@ -1316,6 +1400,10 @@ void Collection::highlight_result(const field &search_field,
if(match.offsets[i].offset != MAX_DISPLACEMENT) {
size_t token_index = (size_t)(match.offsets[i].offset);
token_indices.push_back(token_index);
if(token_index >= tokens.size()) {
LOG(ERROR) << "Highlight token index " << token_index << " is greater than length of store field.";
continue;
}
std::string token = tokens[token_index];
string_utils.unicode_normalize(token);
token_hits.insert(token);
@ -1324,12 +1412,15 @@ void Collection::highlight_result(const field &search_field,
auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
size_t prefix_length = highlight_affix_num_tokens;
size_t suffix_length = highlight_affix_num_tokens + 1;
// For longer strings, pick surrounding tokens within `highlight_affix_num_tokens` tokens of min_index and max_index for the snippet
const size_t start_index = (tokens.size() <= snippet_threshold) ? 0 :
std::max(0, (int)(*(minmax.first) - 4));
std::max(0, (int)(*(minmax.first) - prefix_length));
const size_t end_index = (tokens.size() <= snippet_threshold) ? tokens.size() :
std::min((int)tokens.size(), (int)(*(minmax.second) + 5));
std::min((int)tokens.size(), (int)(*(minmax.second) + suffix_length));
std::stringstream snippet_stream;
for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) {
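
A worked example (numbers invented) of how the new `highlight_affix_num_tokens` parameter shapes the snippet window, given the computation above:
```
// Illustration only, with invented numbers:
// tokens.size() = 100 (> snippet_threshold of 30), highlight_affix_num_tokens = 4,
// matched token indices span 10..12.
// prefix_length = 4, suffix_length = 5
// start_index = max(0, 10 - 4)   = 6
// end_index   = min(100, 12 + 5) = 17
// The snippet therefore covers tokens [6, 17): four tokens of context on each side of the match.
```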
@ -1401,7 +1492,7 @@ Option<nlohmann::json> Collection::get(const std::string & id) {
return Option<nlohmann::json>(500, "Error while fetching the document.");
}
uint32_t seq_id = (uint32_t) std::stol(seq_id_str);
uint32_t seq_id = (uint32_t) std::stoul(seq_id_str);
std::string parsed_document;
StoreStatus doc_status = store->get(get_seq_id_key(seq_id), parsed_document);
@ -1450,7 +1541,7 @@ Option<std::string> Collection::remove(const std::string & id, const bool remove
return Option<std::string>(500, "Error while fetching the document.");
}
uint32_t seq_id = (uint32_t) std::stol(seq_id_str);
uint32_t seq_id = (uint32_t) std::stoul(seq_id_str);
std::string parsed_document;
StoreStatus doc_status = store->get(get_seq_id_key(seq_id), parsed_document);

View File

@ -173,7 +173,7 @@ Option<bool> CollectionManager::load(const size_t init_batch_size) {
}
num_valid_docs++;
iter_batch[seq_id % collection->get_num_memory_shards()].emplace_back(index_record(0, seq_id, document));
iter_batch[seq_id % collection->get_num_memory_shards()].emplace_back(index_record(0, seq_id, document, CREATE));
// Peek and check for last record right here so that we handle batched indexing correctly
// Without doing this, the "last batch" would have to be indexed outside the loop.
@ -195,7 +195,7 @@ Option<bool> CollectionManager::load(const size_t init_batch_size) {
if(num_indexed != num_records) {
const Option<std::string> & index_error_op = get_first_index_error(iter_batch[i]);
if(index_error_op.ok()) {
if(!index_error_op.ok()) {
return Option<bool>(false, index_error_op.get());
}
}

View File

@ -27,6 +27,18 @@ bool handle_authentication(std::map<std::string, std::string>& req_params, const
return collectionManager.auth_key_matches(auth_key, rpath.action, collection, req_params);
}
index_operation_t get_index_operation(const std::string& action) {
if(action == "create") {
return CREATE;
} else if(action == "update") {
return UPDATE;
} else if(action == "upsert") {
return UPSERT;
}
return CREATE;
}
bool get_collections(http_req & req, http_res & res) {
CollectionManager & collectionManager = CollectionManager::get_instance();
std::vector<Collection*> collections = collectionManager.get_collections();
@ -254,6 +266,9 @@ bool get_search(http_req & req, http_res & res) {
// strings under this length will be fully highlighted, instead of showing a snippet of the relevant portion
const char *SNIPPET_THRESHOLD = "snippet_threshold";
// the number of tokens that should surround the highlighted text
const char *HIGHLIGHT_AFFIX_NUM_TOKENS = "highlight_affix_num_tokens";
// list of fields which will be highlighted fully without snippeting
const char *HIGHLIGHT_FULL_FIELDS = "highlight_full_fields";
@ -290,6 +305,10 @@ bool get_search(http_req & req, http_res & res) {
req.params[SNIPPET_THRESHOLD] = "30";
}
if(req.params.count(HIGHLIGHT_AFFIX_NUM_TOKENS) == 0) {
req.params[HIGHLIGHT_AFFIX_NUM_TOKENS] = "4";
}
if(req.params.count(HIGHLIGHT_FULL_FIELDS) == 0) {
req.params[HIGHLIGHT_FULL_FIELDS] = "";
}
@ -362,6 +381,11 @@ bool get_search(http_req & req, http_res & res) {
return false;
}
if(!StringUtils::is_uint32_t(req.params[HIGHLIGHT_AFFIX_NUM_TOKENS])) {
res.set_400("Parameter `" + std::string(HIGHLIGHT_AFFIX_NUM_TOKENS) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint32_t(req.params[GROUP_LIMIT])) {
res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer.");
return false;
@ -474,6 +498,7 @@ bool get_search(http_req & req, http_res & res) {
static_cast<size_t>(std::stol(req.params[MAX_FACET_VALUES])),
req.params[FACET_QUERY],
static_cast<size_t>(std::stol(req.params[SNIPPET_THRESHOLD])),
static_cast<size_t>(std::stol(req.params[HIGHLIGHT_AFFIX_NUM_TOKENS])),
req.params[HIGHLIGHT_FULL_FIELDS],
typo_tokens_threshold,
pinned_hits,
@ -579,11 +604,16 @@ bool post_import_documents(http_req& req, http_res& res) {
//LOG(INFO) << "post_import_documents";
//LOG(INFO) << "req.first_chunk=" << req.first_chunk_aggregate << ", last_chunk=" << req.last_chunk_aggregate;
const char *BATCH_SIZE = "batch_size";
const char *ACTION = "action";
if(req.params.count(BATCH_SIZE) == 0) {
req.params[BATCH_SIZE] = "40";
}
if(req.params.count(ACTION) == 0) {
req.params[ACTION] = "create";
}
if(!StringUtils::is_uint32_t(req.params[BATCH_SIZE])) {
req.last_chunk_aggregate = true;
res.final = true;
@ -592,6 +622,14 @@ bool post_import_documents(http_req& req, http_res& res) {
return false;
}
if(req.params[ACTION] != "create" && req.params[ACTION] != "update" && req.params[ACTION] != "upsert") {
req.last_chunk_aggregate = true;
res.final = true;
res.set_400("Parameter `" + std::string(ACTION) + "` must be a create|update|upsert.");
HttpServer::stream_response(req, res);
return false;
}
const size_t IMPORT_BATCH_SIZE = std::stoi(req.params[BATCH_SIZE]);
if(IMPORT_BATCH_SIZE == 0) {
@ -667,8 +705,11 @@ bool post_import_documents(http_req& req, http_res& res) {
//LOG(INFO) << "single_partial_record_body: " << single_partial_record_body;
const index_operation_t operation = get_index_operation(req.params[ACTION]);
if(!single_partial_record_body) {
nlohmann::json json_res = collection->add_many(json_lines);
nlohmann::json document;
nlohmann::json json_res = collection->add_many(json_lines, document, operation);
//const std::string& import_summary_json = json_res.dump();
//response_stream << import_summary_json << "\n";
@ -698,6 +739,16 @@ bool post_import_documents(http_req& req, http_res& res) {
}
bool post_add_document(http_req & req, http_res & res) {
const char *ACTION = "action";
if(req.params.count(ACTION) == 0) {
req.params[ACTION] = "create";
}
if(req.params[ACTION] != "create" && req.params[ACTION] != "update" && req.params[ACTION] != "upsert") {
res.set_400("Parameter `" + std::string(ACTION) + "` must be a create|update|upsert.");
return false;
}
CollectionManager & collectionManager = CollectionManager::get_instance();
Collection* collection = collectionManager.get_collection(req.params["collection"]);
@ -706,7 +757,8 @@ bool post_add_document(http_req & req, http_res & res) {
return false;
}
Option<nlohmann::json> inserted_doc_op = collection->add(req.body);
const index_operation_t operation = get_index_operation(req.params[ACTION]);
Option<nlohmann::json> inserted_doc_op = collection->add(req.body, operation);
if(!inserted_doc_op.ok()) {
res.set(inserted_doc_op.code(), inserted_doc_op.error());
@ -717,6 +769,28 @@ bool post_add_document(http_req & req, http_res & res) {
return true;
}
bool patch_update_document(http_req & req, http_res & res) {
std::string doc_id = req.params["id"];
CollectionManager & collectionManager = CollectionManager::get_instance();
Collection* collection = collectionManager.get_collection(req.params["collection"]);
if(collection == nullptr) {
res.set_404();
return false;
}
Option<nlohmann::json> upserted_doc_op = collection->add(req.body, index_operation_t::UPDATE, doc_id);
if(!upserted_doc_op.ok()) {
res.set(upserted_doc_op.code(), upserted_doc_op.error());
return false;
}
res.set_201(upserted_doc_op.get().dump());
return true;
}
bool get_fetch_document(http_req & req, http_res & res) {
std::string doc_id = req.params["id"];
@ -1044,7 +1118,7 @@ bool get_key(http_req &req, http_res &res) {
AuthManager &auth_manager = collectionManager.getAuthManager();
const std::string& key_id_str = req.params["id"];
uint32_t key_id = (uint32_t) std::stol(key_id_str);
uint32_t key_id = (uint32_t) std::stoul(key_id_str);
const Option<api_key_t>& key_op = auth_manager.get_key(key_id);
@ -1066,7 +1140,7 @@ bool del_key(http_req &req, http_res &res) {
AuthManager &auth_manager = collectionManager.getAuthManager();
const std::string& key_id_str = req.params["id"];
uint32_t key_id = (uint32_t) std::stol(key_id_str);
uint32_t key_id = (uint32_t) std::stoul(key_id_str);
const Option<api_key_t> &del_op = auth_manager.remove_key(key_id);

View File

@ -129,6 +129,7 @@ int HttpServer::create_listener() {
ctx.globalconf->server_name = h2o_strdup(nullptr, "", SIZE_MAX);
ctx.globalconf->http2.active_stream_window_size = ACTIVE_STREAM_WINDOW_SIZE;
ctx.globalconf->http2.idle_timeout = REQ_TIMEOUT_MS;
ctx.globalconf->max_request_entity_size = (1024 * 1024 * 1024); // 1 GB
ctx.globalconf->http1.req_timeout = REQ_TIMEOUT_MS;
ctx.globalconf->http1.req_io_timeout = REQ_TIMEOUT_MS;
@ -705,6 +706,13 @@ void HttpServer::put(const std::string & path, bool (*handler)(http_req &, http_
routes.emplace_back(rpath.route_hash(), rpath);
}
void HttpServer::patch(const std::string & path, bool (*handler)(http_req &, http_res &), bool async_req, bool async_res) {
std::vector<std::string> path_parts;
StringUtils::split(path, path_parts, "/");
route_path rpath("PATCH", path_parts, handler, async_req, async_res);
routes.emplace_back(rpath.route_hash(), rpath);
}
void HttpServer::del(const std::string & path, bool (*handler)(http_req &, http_res &), bool async_req, bool async_res) {
std::vector<std::string> path_parts;
StringUtils::split(path, path_parts, "/");

View File

@ -56,8 +56,8 @@ Index::~Index() {
sort_index.clear();
}
int32_t Index::get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field) {
int32_t points = 0;
int64_t Index::get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field) {
int64_t points = 0;
if(!default_sorting_field.empty()) {
if(document[default_sorting_field].is_number_float()) {
@ -85,8 +85,15 @@ int64_t Index::float_to_in64_t(float f) {
}
Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t seq_id,
const std::string & default_sorting_field) {
int32_t points = get_points_from_doc(document, default_sorting_field);
const std::string & default_sorting_field, bool is_update) {
int64_t points = 0;
if(is_update && document.count(default_sorting_field) == 0) {
points = sort_index[default_sorting_field]->at(seq_id);
} else {
points = get_points_from_doc(document, default_sorting_field);
}
std::unordered_map<std::string, size_t> facet_to_id;
size_t i_facet = 0;
@ -104,7 +111,7 @@ Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t
for(const std::pair<std::string, field> & field_pair: search_schema) {
const std::string & field_name = field_pair.first;
if(field_pair.second.optional && document.count(field_name) == 0) {
if((field_pair.second.optional || is_update) && document.count(field_name) == 0) {
continue;
}
@ -212,17 +219,22 @@ Option<uint32_t> Index::index_in_memory(const nlohmann::json &document, uint32_t
Option<uint32_t> Index::validate_index_in_memory(const nlohmann::json &document, uint32_t seq_id,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
const std::map<std::string, field> & facet_schema) {
if(document.count(default_sorting_field) == 0) {
const std::map<std::string, field> & facet_schema,
bool is_update) {
bool has_default_sort_field = (document.count(default_sorting_field) != 0);
if(!has_default_sort_field && !is_update) {
return Option<>(400, "Field `" + default_sorting_field + "` has been declared as a default sorting field, "
"but is not found in the document.");
}
if(!document[default_sorting_field].is_number_integer() && !document[default_sorting_field].is_number_float()) {
if(has_default_sort_field &&
!document[default_sorting_field].is_number_integer() && !document[default_sorting_field].is_number_float()) {
return Option<>(400, "Default sorting field `" + default_sorting_field + "` must be a single valued numerical field.");
}
if(search_schema.at(default_sorting_field).is_single_float() &&
if(has_default_sort_field && search_schema.at(default_sorting_field).is_single_float() &&
document[default_sorting_field].get<float>() > std::numeric_limits<float>::max()) {
return Option<>(400, "Default sorting field `" + default_sorting_field + "` exceeds maximum value of a float.");
}
@ -230,7 +242,7 @@ Option<uint32_t> Index::validate_index_in_memory(const nlohmann::json &document,
for(const std::pair<std::string, field> & field_pair: search_schema) {
const std::string & field_name = field_pair.first;
if(field_pair.second.optional && document.count(field_name) == 0) {
if((field_pair.second.optional || is_update) && document.count(field_name) == 0) {
continue;
}
@ -309,6 +321,48 @@ Option<uint32_t> Index::validate_index_in_memory(const nlohmann::json &document,
return Option<>(200);
}
void Index::scrub_reindex_doc(nlohmann::json& update_doc, nlohmann::json& del_doc, nlohmann::json& old_doc) {
auto it = del_doc.cbegin();
while(it != del_doc.cend()) {
const std::string& field_name = it.key();
const auto& search_field_it = search_schema.find(field_name);
if(search_field_it == search_schema.end()) {
++it;
continue;
}
const auto& search_field = search_field_it->second;
// Go through all the field names and find the keys+values so that they can be removed from in-memory index
std::vector<std::string> reindex_tokens;
std::vector<std::string> old_tokens;
tokenize_doc_field(update_doc, field_name, search_field, reindex_tokens);
tokenize_doc_field(old_doc, field_name, search_field, old_tokens);
if(old_tokens.size() != reindex_tokens.size()) {
++it;
continue;
}
bool exact_match = true;
for(size_t i=0; i<reindex_tokens.size(); i++) {
const std::string& reindex_val = reindex_tokens[i];
const std::string& old_val = old_tokens[i];
if(reindex_val != old_val) {
exact_match = false;
break;
}
}
if(exact_match) {
it = del_doc.erase(it);
} else {
++it;
}
}
}
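
A small illustration (documents invented) of what scrub_reindex_doc keeps in del_doc before the removal pass:
```
// Illustration only, with made-up documents.
// update_doc: {"title": "Foo Bar", "points": 25}
// old_doc:    {"title": "Foo Bar", "points": 10}
// del_doc in: {"title": "Foo Bar", "points": 10}
//
// "title" tokenizes identically in update_doc and old_doc, so it is erased from del_doc.
// "points" differs, so it stays.
// del_doc out: {"points": 10}  -- only fields whose values actually changed are removed and reindexed.
```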
size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_batch,
const std::string & default_sorting_field,
const std::unordered_map<std::string, field> & search_schema,
@ -322,29 +376,42 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
continue;
}
Option<uint32_t> validation_op = validate_index_in_memory(index_rec.document, index_rec.seq_id,
default_sorting_field,
search_schema, facet_schema);
if(index_rec.operation != DELETE) {
Option<uint32_t> validation_op = validate_index_in_memory(index_rec.doc, index_rec.seq_id,
default_sorting_field,
search_schema, facet_schema, index_rec.is_update);
if(!validation_op.ok()) {
index_rec.index_failure(validation_op.code(), validation_op.error());
continue;
if(!validation_op.ok()) {
index_rec.index_failure(validation_op.code(), validation_op.error());
continue;
}
if(index_rec.is_update) {
// scrub string fields to reduce delete ops
index->scrub_reindex_doc(index_rec.doc, index_rec.del_doc, index_rec.old_doc);
index->remove(index_rec.seq_id, index_rec.del_doc);
}
Option<uint32_t> index_mem_op = index->index_in_memory(index_rec.doc, index_rec.seq_id,
default_sorting_field, index_rec.is_update);
if(!index_mem_op.ok()) {
index->index_in_memory(index_rec.del_doc, index_rec.seq_id, default_sorting_field, true);
index_rec.index_failure(index_mem_op.code(), index_mem_op.error());
continue;
}
index_rec.index_success();
if(!index_rec.is_update) {
num_indexed++;
}
}
Option<uint32_t> index_mem_op = index->index_in_memory(index_rec.document, index_rec.seq_id, default_sorting_field);
if(!index_mem_op.ok()) {
index_rec.index_failure(index_mem_op.code(), index_mem_op.error());
continue;
}
index_rec.index_success(index_rec);
num_indexed++;
}
return num_indexed;
}
void Index::insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
void Index::insert_doc(const int64_t score, art_tree *t, uint32_t seq_id,
const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const {
for(auto & kv: token_to_offsets) {
art_document art_doc;
@ -369,13 +436,14 @@ void Index::insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
art_doc.offsets[i] = kv.second[i];
}
//LOG(INFO) << "key: " << key << ", art_doc.id: " << art_doc.id;
art_insert(t, key, key_len, &art_doc, num_hits);
delete [] art_doc.offsets;
art_doc.offsets = nullptr;
}
}
void Index::index_int32_field(const int32_t value, uint32_t score, art_tree *t, uint32_t seq_id) const {
void Index::index_int32_field(const int32_t value, int64_t score, art_tree *t, uint32_t seq_id) const {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
@ -398,7 +466,7 @@ void Index::index_int32_field(const int32_t value, uint32_t score, art_tree *t,
art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}
void Index::index_int64_field(const int64_t value, uint32_t score, art_tree *t, uint32_t seq_id) const {
void Index::index_int64_field(const int64_t value, int64_t score, art_tree *t, uint32_t seq_id) const {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
@ -421,7 +489,7 @@ void Index::index_int64_field(const int64_t value, uint32_t score, art_tree *t,
art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}
void Index::index_bool_field(const bool value, const uint32_t score, art_tree *t, uint32_t seq_id) const {
void Index::index_bool_field(const bool value, const int64_t score, art_tree *t, uint32_t seq_id) const {
const int KEY_LEN = 1;
unsigned char key[KEY_LEN];
key[0] = value ? '1' : '0';
@ -443,7 +511,7 @@ void Index::index_bool_field(const bool value, const uint32_t score, art_tree *t
art_insert(t, key, KEY_LEN, &art_doc, num_hits);
}
void Index::index_float_field(const float value, uint32_t score, art_tree *t, uint32_t seq_id) const {
void Index::index_float_field(const float value, int64_t score, art_tree *t, uint32_t seq_id) const {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
@ -484,7 +552,7 @@ uint64_t Index::facet_token_hash(const field & a_field, const std::string &token
return hash;
}
void Index::index_string_field(const std::string & text, const uint32_t score, art_tree *t,
void Index::index_string_field(const std::string & text, const int64_t score, art_tree *t,
uint32_t seq_id, int facet_id, const field & a_field) {
std::vector<std::string> tokens;
StringUtils::split(text, tokens, " ");
@ -506,6 +574,10 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
token_to_offsets[token].push_back(i);
}
/*if(seq_id == 0) {
LOG(INFO) << "field name: " << a_field.name;
}*/
insert_doc(score, t, seq_id, token_to_offsets);
if(facet_id >= 0) {
@ -513,7 +585,7 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
}
}
void Index::index_string_array_field(const std::vector<std::string> & strings, const uint32_t score, art_tree *t,
void Index::index_string_array_field(const std::vector<std::string> & strings, const int64_t score, art_tree *t,
uint32_t seq_id, int facet_id, const field & a_field) {
std::unordered_map<std::string, std::vector<uint32_t>> token_positions;
@ -565,28 +637,28 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
insert_doc(score, t, seq_id, token_positions);
}
void Index::index_int32_array_field(const std::vector<int32_t> & values, const uint32_t score, art_tree *t,
void Index::index_int32_array_field(const std::vector<int32_t> & values, const int64_t score, art_tree *t,
uint32_t seq_id) const {
for(const int32_t value: values) {
index_int32_field(value, score, t, seq_id);
}
}
void Index::index_int64_array_field(const std::vector<int64_t> & values, const uint32_t score, art_tree *t,
void Index::index_int64_array_field(const std::vector<int64_t> & values, const int64_t score, art_tree *t,
uint32_t seq_id) const {
for(const int64_t value: values) {
index_int64_field(value, score, t, seq_id);
}
}
void Index::index_bool_array_field(const std::vector<bool> & values, const uint32_t score, art_tree *t,
void Index::index_bool_array_field(const std::vector<bool> & values, const int64_t score, art_tree *t,
uint32_t seq_id) const {
for(const bool value: values) {
index_bool_field(value, score, t, seq_id);
}
}
void Index::index_float_array_field(const std::vector<float> & values, const uint32_t score, art_tree *t,
void Index::index_float_array_field(const std::vector<float> & values, const int64_t score, art_tree *t,
uint32_t seq_id) const {
for(const float value: values) {
index_float_field(value, score, t, seq_id);
@ -996,7 +1068,7 @@ Option<uint32_t> Index::do_filtering(uint32_t** filter_ids_out, const std::vecto
bool found_filter = false;
if(!f.is_array()) {
found_filter = (str_tokens.size() == fvalues.size());
found_filter = (query_suggestion.size() == fvalues.size());
} else {
uint64_t filter_hash = 1;
@ -1712,6 +1784,11 @@ void Index::populate_token_positions(const std::vector<art_leaf *>& query_sugges
// a) last element is array_index b) second and third last elements will be largest offset
// (last element is repeated to indicate end of offsets for a given array index)
/*uint32_t* offsets = token_leaf->values->offsets.uncompress();
for(size_t ii=0; ii < token_leaf->values->offsets.getLength(); ii++) {
LOG(INFO) << "offset: " << offsets[ii];
}*/
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
token_leaf->values->offsets.getLength() :
@ -1767,8 +1844,8 @@ inline std::vector<art_leaf *> Index::next_suggestion(const std::vector<token_ca
return query_suggestion;
}
void Index::remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
const uint32_t indices_length) {
void Index::remove_and_shift_offset_index(sorted_array& offset_index, const uint32_t* indices_sorted,
const uint32_t indices_length) {
uint32_t *curr_array = offset_index.uncompress();
uint32_t *new_array = new uint32_t[offset_index.getLength()];
@ -1801,83 +1878,27 @@ void Index::remove_and_shift_offset_index(sorted_array &offset_index, const uint
}
Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & document) {
for(auto & name_field: search_schema) {
if(name_field.second.optional && document.count(name_field.first) == 0) {
std::unordered_map<std::string, size_t> facet_to_index;
get_facet_to_index(facet_to_index);
for(auto it = document.begin(); it != document.end(); ++it) {
const std::string& field_name = it.key();
const auto& search_field_it = search_schema.find(field_name);
if(search_field_it == search_schema.end()) {
continue;
}
const auto& search_field = search_field_it->second;
// Go through all the field names and find the keys+values so that they can be removed from in-memory index
std::vector<std::string> tokens;
if(name_field.second.type == field_types::STRING) {
StringUtils::split(document[name_field.first], tokens, " ");
} else if(name_field.second.type == field_types::STRING_ARRAY) {
std::vector<std::string> values = document[name_field.first].get<std::vector<std::string>>();
for(const std::string & value: values) {
StringUtils::split(value, tokens, " ");
}
} else if(name_field.second.type == field_types::INT32) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
int32_t value = document[name_field.first].get<int32_t>();
encode_int32(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
} else if(name_field.second.type == field_types::INT32_ARRAY) {
std::vector<int32_t> values = document[name_field.first].get<std::vector<int32_t>>();
for(const int32_t value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_int32(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
}
} else if(name_field.second.type == field_types::INT64) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
int64_t value = document[name_field.first].get<int64_t>();
encode_int64(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
} else if(name_field.second.type == field_types::INT64_ARRAY) {
std::vector<int64_t> values = document[name_field.first].get<std::vector<int64_t>>();
for(const int64_t value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_int64(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
}
} else if(name_field.second.type == field_types::FLOAT) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
int64_t value = document[name_field.first].get<int64_t>();
encode_float(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
} else if(name_field.second.type == field_types::FLOAT_ARRAY) {
std::vector<float> values = document[name_field.first].get<std::vector<float>>();
for(const float value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_float(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
}
} else if(name_field.second.type == field_types::BOOL) {
const int KEY_LEN = 1;
unsigned char key[KEY_LEN];
bool value = document[name_field.first].get<bool>();
key[0] = value ? '1' : '0';
tokens.push_back(std::string((char*)key, KEY_LEN));
} else if(name_field.second.type == field_types::BOOL_ARRAY) {
std::vector<bool> values = document[name_field.first].get<std::vector<bool>>();
for(const bool value: values) {
const int KEY_LEN = 1;
unsigned char key[KEY_LEN];
key[0] = value ? '1' : '0';
tokens.push_back(std::string((char*)key, KEY_LEN));
}
}
tokenize_doc_field(document, field_name, search_field, tokens);
for(auto & token: tokens) {
const unsigned char *key;
int key_len;
if(name_field.second.type == field_types::STRING_ARRAY || name_field.second.type == field_types::STRING) {
if(search_field.type == field_types::STRING_ARRAY || search_field.type == field_types::STRING) {
string_utils.unicode_normalize(token);
key = (const unsigned char *) token.c_str();
key_len = (int) (token.length() + 1);
@ -1886,9 +1907,8 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
key_len = (int) (token.length());
}
art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len);
if(leaf != NULL) {
uint32_t seq_id_values[1] = {seq_id};
art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), key, key_len);
if(leaf != nullptr) {
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
if(doc_index == leaf->values->ids.getLength()) {
@ -1905,7 +1925,7 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);
leaf->values->offsets.remove_index(start_offset, end_offset);
leaf->values->ids.remove_values(seq_id_values, 1);
leaf->values->ids.remove_value(seq_id);
/*len = leaf->values->offset_index.getLength();
for(auto i=0; i<len; i++) {
@ -1914,25 +1934,96 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
LOG(INFO) << "----";*/
if(leaf->values->ids.getLength() == 0) {
art_values* values = (art_values*) art_delete(search_index.at(name_field.first), key, key_len);
art_values* values = (art_values*) art_delete(search_index.at(field_name), key, key_len);
delete values;
values = nullptr;
}
}
}
}
// remove facets if any
facet_index_v2.erase(seq_id);
// remove facets
if(facet_to_index.count(field_name) != 0 && facet_index_v2.count(seq_id) != 0) {
size_t facet_index = facet_to_index[field_name];
std::vector<std::vector<uint64_t>>& facet_values = facet_index_v2[seq_id];
facet_values[facet_index].clear();
}
// remove sort index if any
for(auto & field_doc_value_map: sort_index) {
field_doc_value_map.second->erase(seq_id);
// remove sort field
if(sort_index.count(field_name) != 0) {
sort_index[field_name]->erase(seq_id);
}
}
return Option<uint32_t>(seq_id);
}
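
The numeric branches above rely on helpers such as `encode_int32`, `encode_int64` and `encode_float` to turn each value into a fixed-length byte key, so the ART tree can compare numbers the same way it compares string tokens. As a rough sketch only (assuming an order-preserving big-endian scheme, which may differ from the actual helpers), a signed 32-bit value could be encoded like this:

```
// Illustrative sketch, not the actual encode_int32 implementation:
// flipping the sign bit maps the signed range onto the unsigned range, and
// writing the bytes big-endian makes lexicographic key order match numeric
// order, which is what a trie-based index needs for range comparisons.
#include <cstdint>

void example_encode_int32(int32_t n, unsigned char out[4]) {
    uint32_t u = static_cast<uint32_t>(n) ^ 0x80000000u;  // flip sign bit
    out[0] = static_cast<unsigned char>((u >> 24) & 0xFF);
    out[1] = static_cast<unsigned char>((u >> 16) & 0xFF);
    out[2] = static_cast<unsigned char>((u >> 8) & 0xFF);
    out[3] = static_cast<unsigned char>(u & 0xFF);
}
```
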
void Index::tokenize_doc_field(const nlohmann::json& document, const std::string& field_name, const field& search_field,
std::vector<std::string>& tokens) {
if(search_field.type == field_types::STRING) {
StringUtils::split(document[field_name], tokens, " ");
} else if(search_field.type == field_types::STRING_ARRAY) {
const std::vector<std::string>& values = document[field_name].get<std::vector<std::string>>();
for(const std::string & value: values) {
StringUtils::split(value, tokens, " ");
}
} else if(search_field.type == field_types::INT32) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
const int32_t& value = document[field_name].get<int32_t>();
encode_int32(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
} else if(search_field.type == field_types::INT32_ARRAY) {
const std::vector<int32_t>& values = document[field_name].get<std::vector<int32_t>>();
for(const int32_t value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_int32(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
}
} else if(search_field.type == field_types::INT64) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
const int64_t& value = document[field_name].get<int64_t>();
encode_int64(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
} else if(search_field.type == field_types::INT64_ARRAY) {
const std::vector<int64_t>& values = document[field_name].get<std::vector<int64_t>>();
for(const int64_t value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_int64(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
}
} else if(search_field.type == field_types::FLOAT) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
const float value = document[field_name].get<float>();
encode_float(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
} else if(search_field.type == field_types::FLOAT_ARRAY) {
const std::vector<float>& values = document[field_name].get<std::vector<float>>();
for(const float value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_float(value, key);
tokens.emplace_back((char*)key, KEY_LEN);
}
} else if(search_field.type == field_types::BOOL) {
const int KEY_LEN = 1;
unsigned char key[KEY_LEN];
const bool& value = document[field_name].get<bool>();
key[0] = value ? '1' : '0';
tokens.emplace_back((char*)key, KEY_LEN);
} else if(search_field.type == field_types::BOOL_ARRAY) {
const std::vector<bool>& values = document[field_name].get<std::vector<bool>>();
for(const bool value: values) {
const int KEY_LEN = 1;
unsigned char key[KEY_LEN];
key[0] = value ? '1' : '0';
tokens.emplace_back((char*)key, KEY_LEN);
}
}
}
art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) {
const art_tree *t = search_index.at(field_name);
return (art_leaf*) art_search(t, token, (int) token_len);

View File

@ -21,6 +21,7 @@ void master_server_routes() {
// document management - `/documents/:id` end-points must be placed last in the list
server->post("/collections/:collection/documents", post_add_document);
server->patch("/collections/:collection/documents/:id", patch_update_document);
server->get("/collections/:collection/documents/search", get_search);
server->post("/collections/:collection/documents/import", post_import_documents, true, true);

View File

@ -1,5 +1,6 @@
#include "sorted_array.h"
#include "array_utils.h"
#include "logger.h"
void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_length) {
min = array_length != 0 ? sorted_array[0] : 0;
@ -18,28 +19,67 @@ void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_lengt
length_bytes = actual_size;
}
bool sorted_array::append(uint32_t value) {
uint32_t size_required = sorted_append_size_required(value, length+1);
size_t sorted_array::append(uint32_t value) {
if(value < max) {
// we will have to re-encode the whole sequence again
uint32_t* arr = uncompress(length+1);
if(size_required+FOR_ELE_SIZE > size_bytes) {
// grow the array first
size_t new_size = (size_t) (size_required * FOR_GROWTH_FACTOR);
uint8_t *new_location = (uint8_t *) realloc(in, new_size);
if(new_location == NULL) {
abort();
// find the index of the first element that is >= `value`
uint32_t found_val;
uint32_t gte_index = for_lower_bound_search(in, length, value, &found_val);
for(size_t j=length; j>gte_index; j--) {
arr[j] = arr[j-1];
}
in = new_location;
size_bytes = (uint32_t) new_size;
arr[gte_index] = value;
load(arr, length+1);
delete [] arr;
return gte_index;
} else {
uint32_t size_required = sorted_append_size_required(value, length+1);
size_t min_expected_size = size_required + FOR_ELE_SIZE;
if(size_bytes < min_expected_size) {
// grow the array first
size_t new_size = min_expected_size * FOR_GROWTH_FACTOR;
uint8_t *new_location = (uint8_t *) realloc(in, new_size);
if(new_location == NULL) {
abort();
}
in = new_location;
size_bytes = (uint32_t) new_size;
//LOG(INFO) << "new_size: " << new_size;
}
uint32_t new_length_bytes = for_append_sorted(in, length, value);
if(new_length_bytes == 0) return false;
length_bytes = new_length_bytes;
length++;
if(value < min) min = value;
if(value > max) max = value;
return length-1;
}
}
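
When a value arrives out of order, the new append path above uncompresses the sequence, shifts everything after the lower-bound position, and re-encodes the whole array, while in-order appends keep the cheaper FOR append path. A plain std::vector sketch of the same "lower bound, shift, write" contract (illustrative only, not the compressed implementation):

```
#include <algorithm>
#include <cstdint>
#include <vector>

// Returns the index at which `value` ends up, mirroring what the
// FOR-encoded append above reports for out-of-order inserts.
size_t sorted_insert(std::vector<uint32_t>& arr, uint32_t value) {
    auto it = std::lower_bound(arr.begin(), arr.end(), value);
    size_t index = static_cast<size_t>(it - arr.begin());
    arr.insert(it, value);
    return index;
}
```
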
bool sorted_array::insert(size_t index, uint32_t value) {
if(index >= length) {
return false;
}
uint32_t new_length_bytes = for_append_sorted(in, length, value);
if(new_length_bytes == 0) return false;
uint32_t *curr_array = uncompress(length+1);
memmove(&curr_array[index+1], &curr_array[index], sizeof(uint32_t)*(length-index));
curr_array[index] = value;
length_bytes = new_length_bytes;
length++;
load(curr_array, length+1);
if(value < min) min = value;
if(value > max) max = value;
delete [] curr_array;
return true;
}
@ -61,7 +101,11 @@ uint32_t sorted_array::indexOf(uint32_t value) {
uint32_t actual;
uint32_t index = for_lower_bound_search(in, length, value, &actual);
if(actual == value) return index;
if(actual == value) {
return index;
}
return length;
}
@ -150,20 +194,40 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint
binary_search_indices(values, head, tail, low_index, high_index, base, bits, indices);
}
void sorted_array::remove_values(uint32_t *sorted_values, uint32_t values_length) {
void sorted_array::remove_value(uint32_t value) {
// A lower bound search returns the first element in the sequence that is >= `value`
// So, `found_val` will be either equal to or greater than `value`
uint32_t found_val;
uint32_t found_index = for_lower_bound_search(in, length, value, &found_val);
if(found_val != value) {
return ;
}
uint32_t *curr_array = uncompress();
if(found_index + 1 < length) {
memmove(&curr_array[found_index], &curr_array[found_index+1], sizeof(uint32_t) * (length - found_index - 1));
}
size_t new_length = (length == 0) ? 0 : (length - 1);
load(curr_array, new_length);
delete [] curr_array;
}
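
remove_value relies on the lower-bound property described in the comment above: if the element found is not exactly `value`, the array is left untouched. For illustration, the equivalent operation on a plain std::vector:

```
#include <algorithm>
#include <cstdint>
#include <vector>

// Remove a single occurrence of `value`, or do nothing if it is absent.
void sorted_remove(std::vector<uint32_t>& arr, uint32_t value) {
    auto it = std::lower_bound(arr.begin(), arr.end(), value);
    if(it != arr.end() && *it == value) {
        arr.erase(it);
    }
}
```
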
void sorted_array::remove_values(uint32_t *sorted_values, uint32_t sorted_values_length) {
uint32_t *curr_array = uncompress();
uint32_t *new_array = new uint32_t[length];
uint32_t new_index = 0;
uint32_t curr_index = 0;
uint32_t sorted_values_index = 0;
uint32_t curr_index = 0;
while(curr_index < length) {
if(sorted_values_index < values_length && curr_array[curr_index] >= sorted_values[sorted_values_index]) {
// skip copying
if(curr_array[curr_index] == sorted_values[sorted_values_index]) {
curr_index++;
}
if(sorted_values_index < sorted_values_length && sorted_values[sorted_values_index] == curr_array[curr_index]) {
curr_index++;
sorted_values_index++;
} else {
new_array[new_index++] = curr_array[curr_index++];

View File

@ -45,6 +45,31 @@ TEST(ArrayTest, Append) {
}
}
TEST(ArrayTest, InsertValues) {
std::vector<uint32_t> eles = {10, 1, 4, 5, 7};
array arr;
for(size_t i=0; i < eles.size(); i++) {
arr.append(eles[i]);
}
uint32_t insert_arr[2] = {2, 3};
arr.insert(2, insert_arr, 2);
eles = {10, 1, 2, 3, 4, 5, 7};
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
uint32_t insert_arr2[2] = {20, 25};
arr.insert(6, insert_arr2, 2);
eles = {10, 1, 2, 3, 4, 5, 20, 25, 7};
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
}
TEST(ArrayTest, Uncompress) {
const size_t SIZE = 10*1000;

View File

@ -71,6 +71,11 @@ TEST_F(CollectionFacetingTest, FacetFieldStringFiltering) {
ASSERT_EQ(0, results["hits"].size());
ASSERT_EQ(0, results["found"].get<size_t>());
// multiple tokens but with a typo on one of them
results = coll_str->search("*", query_fields, "starring:= ssamuel l. Jackson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
ASSERT_EQ(0, results["found"].get<size_t>());
// same should succeed when verbatim filter is made
results = coll_str->search("*", query_fields, "starring:= samuel l. Jackson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(2, results["hits"].size());
@ -85,6 +90,11 @@ TEST_F(CollectionFacetingTest, FacetFieldStringFiltering) {
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ(2, results["found"].get<size_t>());
// contains when only 1 token matches
results = coll_str->search("*", query_fields, "starring: samuel johnson", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ(2, results["found"].get<size_t>());
collectionManager.drop_collection("coll_str");
}
@ -131,6 +141,9 @@ TEST_F(CollectionFacetingTest, FacetFieldStringArrayFiltering) {
results = coll_array_fields->search("Jeremy", query_fields, "tags:= FINE", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
results = coll_array_fields->search("Jeremy", query_fields, "tags:= FFINE PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
// partial token filtering should be done without the "=" operator
results = coll_array_fields->search("Jeremy", query_fields, "tags: PLATINUM", facets, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());

View File

@ -64,7 +64,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size"}, 2).get();
@ -107,7 +107,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
res = coll_group->search("*", {}, "", {"brand"}, sort_size, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 2).get();
@ -147,7 +147,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
@ -194,7 +194,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
@ -230,7 +230,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
auto res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 100);
@ -240,7 +240,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 0);
@ -252,7 +252,7 @@ TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"brand"}, 1).get();
@ -322,7 +322,7 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) {
auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"colors"}, 2).get();

View File

@ -213,7 +213,13 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
// create a new collection manager to ensure that it restores the records from the disk backed store
CollectionManager & collectionManager2 = CollectionManager::get_instance();
collectionManager2.init(store, 1.0, "auth_key");
collectionManager2.load();
auto load_op = collectionManager2.load();
if(!load_op.ok()) {
LOG(ERROR) << load_op.error();
}
ASSERT_TRUE(load_op.ok());
collection1 = collectionManager2.get_collection("collection1");
ASSERT_NE(nullptr, collection1);

View File

@ -271,7 +271,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}).get();
@ -289,7 +289,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, hidden_hits).get();
@ -305,7 +305,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, hidden_hits).get();
@ -341,7 +341,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
{}, {hidden_hits}).get();
@ -362,7 +362,7 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}).get();
@ -383,7 +383,7 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}, {"cast"}, 2).get();

View File

@ -14,6 +14,9 @@ protected:
CollectionManager & collectionManager = CollectionManager::get_instance();
std::vector<sort_by> sort_fields;
// used for generating random text
std::vector<std::string> words;
void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/collection";
LOG(INFO) << "Truncating and creating: " << state_dir_path;
@ -48,6 +51,12 @@ protected:
}
infile.close();
std::ifstream words_file(std::string(ROOT_DIR)+"test/resources/common100_english.txt");
std::stringstream strstream;
strstream << words_file.rdbuf();
words_file.close();
StringUtils::split(strstream.str(), words, "\n");
}
virtual void SetUp() {
@ -59,6 +68,18 @@ protected:
collectionManager.dispose();
delete store;
}
std::string get_text(size_t num_words) {
time_t t;
srand((unsigned) time(&t));
std::vector<std::string> strs;
for(size_t i = 0 ; i < num_words ; i++ ) {
int word_index = rand() % 100;
strs.push_back(words[word_index]);
}
return StringUtils::join(strs, " ");
}
};
TEST_F(CollectionTest, VerifyCountOfDocuments) {
@ -558,14 +579,14 @@ TEST_F(CollectionTest, TypoTokensThreshold) {
// Query expansion should happen only based on the `typo_tokens_threshold` value
auto results = collection->search("launch", {"title"}, "", {}, sort_fields, 2, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "", 0).get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "", 0).get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(5, results["found"].get<size_t>());
results = collection->search("launch", {"title"}, "", {}, sort_fields, 2, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "", 10).get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "", 10).get();
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(7, results["found"].get<size_t>());
@ -1296,6 +1317,243 @@ std::vector<nlohmann::json> import_res_to_json(const std::vector<std::string>& i
return out;
}
TEST_F(CollectionTest, ImportDocumentsUpsert) {
Collection *coll_mul_fields;
std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
std::stringstream strstream;
strstream << infile.rdbuf();
infile.close();
std::vector<std::string> import_records;
StringUtils::split(strstream.str(), import_records, "\n");
std::vector<field> fields = {
field("title", field_types::STRING, false),
field("starring", field_types::STRING, false),
field("cast", field_types::STRING_ARRAY, false),
field("points", field_types::INT32, false)
};
coll_mul_fields = collectionManager.get_collection("coll_mul_fields");
if(coll_mul_fields == nullptr) {
coll_mul_fields = collectionManager.create_collection("coll_mul_fields", 1, fields, "points").get();
}
// try importing records
nlohmann::json document;
nlohmann::json import_response = coll_mul_fields->add_many(import_records, document);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(18, import_response["num_imported"].get<int>());
// update + upsert records
std::vector<std::string> more_records = {R"({"id": "0", "title": "The Fifth Harry"})",
R"({"id": "2", "cast": ["Chris Fisher", "Rand Alan"]})",
R"({"id": "18", "title": "Back Again Forest", "points": 45, "starring": "Ronald Wells", "cast": ["Dant Saren"]})",
R"({"id": "6", "points": 77})"};
import_response = coll_mul_fields->add_many(more_records, document, UPSERT);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(4, import_response["num_imported"].get<int>());
std::vector<nlohmann::json> import_results = import_res_to_json(more_records);
ASSERT_EQ(4, import_results.size());
for(size_t i=0; i<4; i++) {
ASSERT_TRUE(import_results[i]["success"].get<bool>());
ASSERT_EQ(1, import_results[i].size());
}
auto results = coll_mul_fields->search("*", query_fields, "", {}, sort_fields, 0, 30, 1, FREQUENCY, false).get();
ASSERT_EQ(19, results["hits"].size());
ASSERT_EQ(19, coll_mul_fields->get_num_documents());
results = coll_mul_fields->search("back again forest", query_fields, "", {}, sort_fields, 0, 30, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_STREQ("Back Again Forest", coll_mul_fields->get("18").get()["title"].get<std::string>().c_str());
results = coll_mul_fields->search("fifth", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_STREQ("The <mark>Fifth</mark> Harry", results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("The Woman in the <mark>Fifth</mark> from Kristin", results["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
results = coll_mul_fields->search("burgundy", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
results = coll_mul_fields->search("harry", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
results = coll_mul_fields->search("captain america", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ(77, results["hits"][0]["document"]["points"].get<size_t>());
// upserting with some bad docs
more_records = {R"({"id": "1", "title": "Wake up, Harry"})",
R"({"id": "90", "cast": ["Kim Werrel", "Random Wake"]})", // missing fields
R"({"id": "5", "points": 60})",
R"({"id": "24", "starring": "John", "cast": ["John Kim"], "points": 11})"}; // missing fields
import_response = coll_mul_fields->add_many(more_records, document, UPSERT);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(2, import_response["num_imported"].get<int>());
import_results = import_res_to_json(more_records);
ASSERT_FALSE(import_results[1]["success"].get<bool>());
ASSERT_FALSE(import_results[3]["success"].get<bool>());
ASSERT_STREQ("Field `points` has been declared as a default sorting field, but is not found in the document.", import_results[1]["error"].get<std::string>().c_str());
ASSERT_STREQ("Field `title` has been declared in the schema, but is not found in the document.", import_results[3]["error"].get<std::string>().c_str());
// try to duplicate records without upsert option
more_records = {R"({"id": "1", "title": "Wake up, Harry"})",
R"({"id": "5", "points": 60})"};
import_response = coll_mul_fields->add_many(more_records, document, CREATE);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(0, import_response["num_imported"].get<int>());
import_results = import_res_to_json(more_records);
ASSERT_FALSE(import_results[0]["success"].get<bool>());
ASSERT_FALSE(import_results[1]["success"].get<bool>());
ASSERT_STREQ("A document with id 1 already exists.", import_results[0]["error"].get<std::string>().c_str());
ASSERT_STREQ("A document with id 5 already exists.", import_results[1]["error"].get<std::string>().c_str());
// update document with verbatim fields, except for points
more_records = {R"({"id": "3", "cast":["Matt Damon","Ben Affleck","Minnie Driver"],
"points":70,"starring":"Robin Williams","starring_facet":"Robin Williams",
"title":"Good Will Hunting"})"};
import_response = coll_mul_fields->add_many(more_records, document, UPDATE);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(1, import_response["num_imported"].get<int>());
results = coll_mul_fields->search("Good Will Hunting", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(70, results["hits"][0]["document"]["points"].get<uint32_t>());
// updating a document that does not exist should fail, others should succeed
more_records = {R"({"id": "20", "points": 51})",
R"({"id": "1", "points": 64})"};
import_response = coll_mul_fields->add_many(more_records, document, UPDATE);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(1, import_response["num_imported"].get<int>());
import_results = import_res_to_json(more_records);
ASSERT_FALSE(import_results[0]["success"].get<bool>());
ASSERT_TRUE(import_results[1]["success"].get<bool>());
ASSERT_STREQ("Could not find a document with id: 20", import_results[0]["error"].get<std::string>().c_str());
ASSERT_EQ(404, import_results[0]["code"].get<size_t>());
results = coll_mul_fields->search("wake up harry", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false).get();
ASSERT_EQ(64, results["hits"][0]["document"]["points"].get<uint32_t>());
// trying to create documents with existing IDs should fail
more_records = {R"({"id": "2", "points": 51})",
R"({"id": "1", "points": 64})"};
import_response = coll_mul_fields->add_many(more_records, document, CREATE);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(0, import_response["num_imported"].get<int>());
import_results = import_res_to_json(more_records);
ASSERT_FALSE(import_results[0]["success"].get<bool>());
ASSERT_FALSE(import_results[1]["success"].get<bool>());
ASSERT_STREQ("A document with id 2 already exists.", import_results[0]["error"].get<std::string>().c_str());
ASSERT_STREQ("A document with id 1 already exists.", import_results[1]["error"].get<std::string>().c_str());
ASSERT_EQ(409, import_results[0]["code"].get<size_t>());
ASSERT_EQ(409, import_results[1]["code"].get<size_t>());
}
TEST_F(CollectionTest, ImportDocumentsUpsertOptional) {
Collection *coll1;
std::vector<field> fields = {
field("title", field_types::STRING_ARRAY, false, true),
field("points", field_types::INT32, false)
};
coll1 = collectionManager.get_collection("coll1");
if(coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
}
std::vector<std::string> records;
size_t NUM_RECORDS = 1000;
for(size_t i=0; i<NUM_RECORDS; i++) {
nlohmann::json doc;
doc["id"] = std::to_string(i);
doc["points"] = i;
records.push_back(doc.dump());
}
// import records without title
nlohmann::json document;
nlohmann::json import_response = coll1->add_many(records, document, CREATE);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
// upsert documents with title
records.clear();
for(size_t i=0; i<NUM_RECORDS; i++) {
nlohmann::json updoc;
updoc["id"] = std::to_string(i);
updoc["title"] = {
get_text(10),
get_text(10),
get_text(10),
get_text(10),
};
records.push_back(updoc.dump());
}
auto begin = std::chrono::high_resolution_clock::now();
import_response = coll1->add_many(records, document, UPSERT);
auto time_micros = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::high_resolution_clock::now() - begin).count();
//LOG(INFO) << "Time taken for first upsert: " << time_micros;
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
// run upsert again with title override
records.clear();
for(size_t i=0; i<NUM_RECORDS; i++) {
nlohmann::json updoc;
updoc["id"] = std::to_string(i);
updoc["title"] = {
get_text(10),
get_text(10),
get_text(10),
get_text(10),
};
records.push_back(updoc.dump());
}
begin = std::chrono::high_resolution_clock::now();
import_response = coll1->add_many(records, document, UPSERT);
time_micros = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::high_resolution_clock::now() - begin).count();
//LOG(INFO) << "Time taken for second upsert: " << time_micros;
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
}
TEST_F(CollectionTest, ImportDocuments) {
Collection *coll_mul_fields;
@ -1320,8 +1578,8 @@ TEST_F(CollectionTest, ImportDocuments) {
}
// try importing records
nlohmann::json import_response = coll_mul_fields->add_many(import_records);
nlohmann::json document;
nlohmann::json import_response = coll_mul_fields->add_many(import_records, document);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(18, import_response["num_imported"].get<int>());
@ -1346,7 +1604,7 @@ TEST_F(CollectionTest, ImportDocuments) {
// verify that empty import is handled gracefully
std::vector<std::string> empty_records;
import_response = coll_mul_fields->add_many(empty_records);
import_response = coll_mul_fields->add_many(empty_records, document);
ASSERT_TRUE(import_response["success"].get<bool>());
ASSERT_EQ(0, import_response["num_imported"].get<int>());
@ -1360,7 +1618,7 @@ TEST_F(CollectionTest, ImportDocuments) {
"{\"title\": \"Test4\", \"points\": 55, "
"\"cast\": [\"Tom Skerritt\"] }"};
import_response = coll_mul_fields->add_many(more_records);
import_response = coll_mul_fields->add_many(more_records, document);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(2, import_response["num_imported"].get<int>());
@ -1385,7 +1643,7 @@ TEST_F(CollectionTest, ImportDocuments) {
"{\"id\": \"id1\", \"title\": \"Test1\", \"starring\": \"Rand Fish\", \"points\": 12, "
"\"cast\": [\"Tom Skerritt\"] }"};
import_response = coll_mul_fields->add_many(more_records);
import_response = coll_mul_fields->add_many(more_records, document);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(1, import_response["num_imported"].get<int>());
@ -1403,7 +1661,7 @@ TEST_F(CollectionTest, ImportDocuments) {
// valid JSON but not a document
more_records = {"[]"};
import_response = coll_mul_fields->add_many(more_records);
import_response = coll_mul_fields->add_many(more_records, document);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(0, import_response["num_imported"].get<int>());
@ -1417,7 +1675,7 @@ TEST_F(CollectionTest, ImportDocuments) {
// invalid JSON
more_records = {"{"};
import_response = coll_mul_fields->add_many(more_records);
import_response = coll_mul_fields->add_many(more_records, document);
ASSERT_FALSE(import_response["success"].get<bool>());
ASSERT_EQ(0, import_response["num_imported"].get<int>());
@ -1756,7 +2014,7 @@ TEST_F(CollectionTest, IndexingWithBadData) {
sample_collection = collectionManager.create_collection("sample_collection", 4, fields, "age").get();
}
const Option<nlohmann::json> & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\", \"age\": 29, \"average\": 78}");
const Option<nlohmann::json> & search_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 29, \"average\": 78}");
ASSERT_FALSE(search_fields_missing_op1.ok());
ASSERT_STREQ("Field `tags` has been declared in the schema, but is not found in the document.",
search_fields_missing_op1.error().c_str());
@ -2210,9 +2468,169 @@ TEST_F(CollectionTest, SearchHighlightShouldFollowThreshold) {
ASSERT_STREQ("fox jumped over the <mark>lazy</mark> dog and ran straight",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
// specify the number of surrounding tokens to return
size_t highlight_affix_num_tokens = 2;
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, highlight_affix_num_tokens).get();
ASSERT_STREQ("over the <mark>lazy</mark> dog and",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
highlight_affix_num_tokens = 0;
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, highlight_affix_num_tokens).get();
ASSERT_STREQ("<mark>lazy</mark>",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionTest, UpdateDocument) {
Collection *coll1;
std::vector<field> fields = {field("title", field_types::STRING, true),
field("tags", field_types::STRING_ARRAY, true),
field("points", field_types::INT32, false)};
std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};
coll1 = collectionManager.get_collection("coll1");
if (coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
}
nlohmann::json doc;
doc["id"] = "100";
doc["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.";
doc["tags"] = {"NEWS", "LAZY"};
doc["points"] = 25;
auto add_op = coll1->add(doc.dump());
ASSERT_TRUE(add_op.ok());
auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.",
res["hits"][0]["document"]["title"].get<std::string>().c_str());
// try changing the title and searching for an older token
doc["title"] = "The quick brown fox.";
add_op = coll1->add(doc.dump(), UPSERT);
ASSERT_TRUE(add_op.ok());
ASSERT_EQ(1, coll1->get_num_documents());
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(0, res["hits"].size());
res = coll1->search("quick", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("The quick brown fox.", res["hits"][0]["document"]["title"].get<std::string>().c_str());
// try to update document tags without `id`
nlohmann::json doc2;
doc2["tags"] = {"SENTENCE"};
add_op = coll1->add(doc2.dump(), UPDATE);
ASSERT_FALSE(add_op.ok());
ASSERT_STREQ("For update, the `id` key must be provided.", add_op.error().c_str());
// now change tags with id
doc2["id"] = "100";
add_op = coll1->add(doc2.dump(), UPDATE);
ASSERT_TRUE(add_op.ok());
// check for old tag
res = coll1->search("NEWS", {"tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(0, res["hits"].size());
// now check for new tag and also try faceting on that field
res = coll1->search("SENTENCE", {"tags"}, "", {"tags"}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("SENTENCE", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
// try changing points
nlohmann::json doc3;
doc3["points"] = 99;
doc3["id"] = "100";
add_op = coll1->add(doc3.dump(), UPDATE);
ASSERT_TRUE(add_op.ok());
res = coll1->search("*", {"tags"}, "points: > 90", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ(99, res["hits"][0]["document"]["points"].get<size_t>());
// id can be passed by param
nlohmann::json doc4;
doc4["points"] = 105;
add_op = coll1->add(doc4.dump(), UPSERT, "100");
ASSERT_TRUE(add_op.ok());
res = coll1->search("*", {"tags"}, "points: > 101", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ(105, res["hits"][0]["document"]["points"].get<size_t>());
// try to change a field with bad value and verify that old document is put back
doc4["points"] = "abc";
add_op = coll1->add(doc4.dump(), UPSERT, "100");
ASSERT_FALSE(add_op.ok());
res = coll1->search("*", {"tags"}, "points: > 101", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ(105, res["hits"][0]["document"]["points"].get<size_t>());
// when explicit path id does not match doc id, error should be returned
nlohmann::json doc5;
doc5["id"] = "800";
doc5["title"] = "The Secret Seven";
doc5["points"] = 250;
doc5["tags"] = {"BOOK", "ENID BLYTON"};
add_op = coll1->add(doc5.dump(), UPSERT, "799");
ASSERT_FALSE(add_op.ok());
ASSERT_EQ(400, add_op.code());
ASSERT_STREQ("The `id` of the resource does not match the `id` in the JSON body.", add_op.error().c_str());
// passing an empty id should not succeed
nlohmann::json doc6;
doc6["id"] = "";
doc6["title"] = "The Secret Seven";
doc6["points"] = 250;
doc6["tags"] = {"BOOK", "ENID BLYTON"};
add_op = coll1->add(doc6.dump(), UPDATE);
ASSERT_FALSE(add_op.ok());
ASSERT_EQ(400, add_op.code());
ASSERT_STREQ("The `id` should not be empty.", add_op.error().c_str());
}
TEST_F(CollectionTest, SearchHighlightFieldFully) {
Collection *coll1;
@ -2240,7 +2658,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2249,14 +2667,14 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
// should not return value key when highlight_full_fields is not specified
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "").get();
ASSERT_EQ(2, res["hits"][0]["highlights"][0].size());
// query multiple fields
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title, tags").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(2, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2269,7 +2687,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
spp::sparse_hash_set<std::string> excluded_fields = {"tags"};
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
excluded_fields, 10, "", 5, "title, tags").get();
excluded_fields, 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(1, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2279,7 +2697,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
excluded_fields = {"tags", "title"};
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
excluded_fields, 10, "", 5, "title, tags").get();
excluded_fields, 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(0, res["hits"][0]["highlights"].size());
collectionManager.drop_collection("coll1");

test/index_test.cpp Normal file (+59 lines)
View File

@ -0,0 +1,59 @@
#include <gtest/gtest.h>
#include "index.h"
#include <vector>
TEST(IndexTest, ScrubReindexDoc) {
std::unordered_map<std::string, field> search_schema;
search_schema.emplace("title", field("title", field_types::STRING, false));
search_schema.emplace("points", field("title", field_types::INT32, false));
search_schema.emplace("cast", field("cast", field_types::STRING_ARRAY, false));
search_schema.emplace("movie", field("movie", field_types::BOOL, false));
Index index("index", search_schema, {}, {});
nlohmann::json old_doc;
old_doc["id"] = "1";
old_doc["title"] = "One more thing.";
old_doc["points"] = 100;
old_doc["cast"] = {"John Wick", "Jeremy Renner"};
old_doc["movie"] = true;
// all fields remain the same
nlohmann::json update_doc1, del_doc1;
update_doc1 = old_doc;
del_doc1 = old_doc;
index.scrub_reindex_doc(update_doc1, del_doc1, old_doc);
ASSERT_EQ(1, del_doc1.size());
ASSERT_STREQ("1", del_doc1["id"].get<std::string>().c_str());
// when only some fields are updated
nlohmann::json update_doc2, del_doc2;
update_doc2["id"] = "1";
update_doc2["points"] = 100;
update_doc2["cast"] = {"Jack"};
del_doc2 = update_doc2;
index.scrub_reindex_doc(update_doc2, del_doc2, old_doc);
ASSERT_EQ(2, del_doc2.size());
ASSERT_STREQ("1", del_doc2["id"].get<std::string>().c_str());
std::vector<std::string> cast = del_doc2["cast"].get<std::vector<std::string>>();
ASSERT_EQ(1, cast.size());
ASSERT_STREQ("Jack", cast[0].c_str());
// contains fields that are not part of the search schema
nlohmann::json update_doc3, del_doc3;
update_doc3["id"] = "1";
update_doc3["title"] = "The Lawyer";
update_doc3["foo"] = "Bar";
del_doc3 = update_doc3;
index.scrub_reindex_doc(update_doc3, del_doc3, old_doc);
ASSERT_EQ(3, del_doc3.size());
ASSERT_STREQ("1", del_doc3["id"].get<std::string>().c_str());
ASSERT_STREQ("The Lawyer", del_doc3["title"].get<std::string>().c_str());
ASSERT_STREQ("Bar", del_doc3["foo"].get<std::string>().c_str());
}
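
The assertions above pin down the scrubbing behaviour for partial updates: fields that belong to the search schema and are unchanged from the old document are dropped from the delete doc, while `id`, changed fields, and fields outside the schema survive. A minimal sketch of that rule, inferred from the test rather than taken from the actual scrub_reindex_doc implementation:

```
#include <string>
#include <unordered_set>
#include <vector>
#include "json.hpp"  // nlohmann::json, as used by the tests

// Drop schema fields whose values did not change, so only modified fields
// are un-indexed and re-indexed. Inferred behaviour; illustrative only.
void scrub_del_doc(nlohmann::json& del_doc, const nlohmann::json& old_doc,
                   const std::unordered_set<std::string>& schema_fields) {
    std::vector<std::string> unchanged;
    for(auto it = del_doc.begin(); it != del_doc.end(); ++it) {
        if(it.key() == "id" || schema_fields.count(it.key()) == 0) {
            continue;
        }
        if(old_doc.contains(it.key()) && old_doc[it.key()] == it.value()) {
            unchanged.push_back(it.key());
        }
    }
    for(const auto& key: unchanged) {
        del_doc.erase(key);
    }
}
```
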

View File

@ -0,0 +1,100 @@
the
of
to
and
a
in
is
it
you
that
he
was
for
on
are
with
as
I
his
they
be
at
one
have
this
from
or
had
by
not
word
but
what
some
we
can
out
other
were
all
there
when
up
use
your
how
said
an
each
she
which
do
their
time
if
will
way
about
many
then
them
write
would
like
so
these
her
long
make
thing
see
him
two
has
look
more
day
could
go
come
did
number
sound
no
most
people
my
over
know
water
than
call
first
who
may
down
side
been
now
find

View File

@ -12,7 +12,8 @@ TEST(SortedArrayTest, Append) {
EXPECT_EQ(arr.indexOf(100), 0); // when not found must be equal to length (0 in this case)
for(uint32_t i=0; i < SIZE; i++) {
arr.append(i);
size_t appended_index = arr.append(i);
ASSERT_EQ(i, appended_index);
}
EXPECT_EQ(arr.getLength(), SIZE);
@ -28,11 +29,94 @@ TEST(SortedArrayTest, Append) {
EXPECT_EQ(arr.indexOf(SIZE+1), SIZE);
sorted_array arr_small;
arr_small.append(100);
size_t appended_index = arr_small.append(100);
EXPECT_EQ(0, appended_index);
EXPECT_EQ(arr_small.getLength(), 1);
EXPECT_EQ(arr_small.at(0), 100);
}
TEST(SortedArrayTest, AppendOutOfOrder) {
sorted_array arr;
for(size_t i=5; i<=10; i++) {
size_t appended_index = arr.append(i);
ASSERT_EQ(i-5, appended_index);
}
EXPECT_EQ(6, arr.getLength());
int appended_index = -1;
appended_index = arr.append(1);
ASSERT_EQ(0, appended_index);
appended_index = arr.append(3);
ASSERT_EQ(1, appended_index);
appended_index = arr.append(2);
ASSERT_EQ(1, appended_index);
appended_index = arr.append(4);
ASSERT_EQ(3, appended_index);
appended_index = arr.append(11);
ASSERT_EQ(10, appended_index);
appended_index = arr.append(14);
ASSERT_EQ(11, appended_index);
appended_index = arr.append(12);
ASSERT_EQ(11, appended_index);
EXPECT_EQ(13, arr.getLength());
}
TEST(SortedArrayTest, InsertAtIndex) {
std::vector<uint32_t> eles;
sorted_array arr;
for(size_t i=5; i<=9; i++) {
arr.append(i);
}
arr.append(11);
eles = {5, 6, 7, 8, 9, 11};
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
arr.insert(0, 1);
eles = { 1, 5, 6, 7, 8, 9, 11 };
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
ASSERT_EQ(1, arr.at(0));
ASSERT_EQ(5, arr.at(1));
arr.insert(1, 2);
eles = {1, 2, 5, 6, 7, 8, 9, 11};
ASSERT_EQ(1, arr.at(0));
ASSERT_EQ(2, arr.at(1));
ASSERT_EQ(8, arr.getLength());
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
arr.insert(7, 10);
eles = { 1, 2, 5, 6, 7, 8, 9, 10, 11};
ASSERT_EQ(10, arr.at(7));
ASSERT_EQ(11, arr.at(8));
ASSERT_EQ(9, arr.getLength());
for(size_t i=0; i < eles.size(); i++) {
ASSERT_EQ(eles[i], arr.at(i));
}
ASSERT_FALSE(arr.insert(9, 12)); // index out of range
}
TEST(SortedArrayTest, Load) {
sorted_array arr;
@ -70,6 +154,32 @@ TEST(SortedArrayTest, Uncompress) {
delete[] raw_sorted_arr;
}
TEST(SortedArrayTest, RemoveValue) {
sorted_array arr;
const size_t SIZE = 10*1000;
for(size_t i=0; i<SIZE; i++) {
arr.append(i);
}
uint32_t values[5] = {0, 100, 1000, 2000, SIZE-1};
for(size_t i=0; i<5; i++) {
arr.remove_value(values[i]);
}
ASSERT_EQ(arr.getLength(), SIZE-5);
for(size_t i=0; i<SIZE-5; i++) {
uint32_t value = arr.at(i);
ASSERT_FALSE(value == 0);
ASSERT_FALSE(value == 100);
ASSERT_FALSE(value == 1000);
ASSERT_FALSE(value == 2000);
ASSERT_FALSE(value == SIZE-1);
}
}
TEST(SortedArrayTest, RemoveValues) {
sorted_array arr;