From 61bfdf027ba2305f1d738767c96bc8d6cf613aba Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishore@kishorelive.com>
Date: Sun, 21 May 2017 15:59:16 +0530
Subject: [PATCH] Fix valgrind errors, plugging other leaks.

---
 TODO.md                    |   1 +
 include/array.h            |   2 +-
 include/array_base.h       |  14 -----
 include/collection.h       |   6 +-
 include/sorted_array.h     |   2 +-
 src/collection.cpp         |  48 +++++++---------
 src/collection_manager.cpp |   2 +-
 src/http_server.cpp        |   2 +-
 src/main/main.cpp          | 110 ++++++++++++++++++++++---------------
 src/sorted_array.cpp       |   6 +-
 10 files changed, 98 insertions(+), 95 deletions(-)

diff --git a/TODO.md b/TODO.md
index f9f3a64e..57dd05c1 100644
--- a/TODO.md
+++ b/TODO.md
@@ -36,6 +36,7 @@
 - ~~Facet limit (hardcode to top 10)~~
 - ~~Deprecate old split function~~
 - ID should not have "/"
+- Test for sorted_array::indexOf when length is 0
 - Handle store-get() not finding a key
 - Fix API response codes
 - Test for search without any sort_by given
diff --git a/include/array.h b/include/array.h
index c390705a..cb402b96 100644
--- a/include/array.h
+++ b/include/array.h
@@ -14,7 +14,7 @@ private:
         uint32_t m = std::min(min, value);
         uint32_t M = std::max(max, value);
         uint32_t bnew = required_bits(M - m);
-        return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
+        return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
     }
 
 public:
diff --git a/include/array_base.h b/include/array_base.h
index a0ad124c..53ef54df 100644
--- a/include/array_base.h
+++ b/include/array_base.h
@@ -24,20 +24,6 @@ protected:
         return (uint32_t) (v == 0 ? 0 : 32 - __builtin_clz(v));
     }
 
-    uint32_t inline sorted_append_size_required(uint32_t value, uint32_t new_length) {
-        uint32_t m = std::min(min, value);
-        uint32_t M = std::max(max, value);
-        uint32_t bnew = required_bits(M - m);
-        return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
-    }
-
-    uint32_t inline unsorted_append_size_required(uint32_t value, uint32_t new_length) {
-        uint32_t m = std::min(min, value);
-        uint32_t M = std::max(max, value);
-        uint32_t bnew = required_bits(M - m);
-        return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
-    }
-
 public:
     array_base(const uint32_t n=2) {
         size_bytes = METADATA_OVERHEAD + (n * FOR_ELE_SIZE);
diff --git a/include/collection.h b/include/collection.h
index b43d8941..4e95bb5f 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -15,7 +15,7 @@ struct facet_value {
     spp::sparse_hash_map<std::string, uint32_t> value_index;
     spp::sparse_hash_map<uint32_t, std::string> index_value;
 
-    spp::sparse_hash_map<uint32_t, std::vector<uint32_t>*> doc_values;
+    spp::sparse_hash_map<uint32_t, std::vector<uint32_t>> doc_values;
 
     uint32_t get_value_index(const std::string & value) {
         if(value_index.count(value) != 0) {
@@ -29,9 +29,9 @@ struct facet_value {
     }
 
     void index_values(uint32_t doc_seq_id, const std::vector<std::string> & values) {
-        std::vector<uint32_t>* value_vec = new std::vector<uint32_t>(values.size());
+        std::vector<uint32_t> value_vec(values.size());
         for(auto i = 0; i < values.size(); i++) {
-            (*value_vec)[i] = get_value_index(values[i]);
+            value_vec[i] = get_value_index(values[i]);
         }
         doc_values.emplace(doc_seq_id, value_vec);
     }
diff --git a/include/sorted_array.h b/include/sorted_array.h
index 8798e0b6..93553cab 100644
--- a/include/sorted_array.h
+++ b/include/sorted_array.h
@@ -16,7 +16,7 @@ private:
         uint32_t m = std::min(min, value);
         uint32_t M = std::max(max, value);
         uint32_t bnew = required_bits(M - m);
-        return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
+        return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
     }
 
     uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,
diff --git a/src/collection.cpp b/src/collection.cpp
index d808699a..2055ef3a 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -33,20 +33,13 @@ Collection::Collection(const std::string name, const uint32_t collection_id, con
 }
 
 Collection::~Collection() {
-    for(std::pair<std::string, field> name_field: search_schema) {
+    for(auto & name_field: search_schema) {
         art_tree *t = search_index.at(name_field.first);
         art_tree_destroy(t);
         t = nullptr;
     }
 
-    for(std::pair<std::string, field> name_field: facet_schema) {
-        facet_value & fvalue = facet_index.at(name_field.first);
-        for(auto doc_value: fvalue.doc_values) {
-            delete doc_value.second;
-        }
-    }
-
-    for(std::pair<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> name_map: sort_index) {
+    for(auto & name_map: sort_index) {
         delete name_map.second;
     }
 }
@@ -271,7 +264,7 @@ void Collection::index_string_field(const std::string & text, const uint32_t sco
     } else {
         StringUtils::split(text, tokens, " ");
         for(uint32_t i=0; i<tokens.size(); i++) {
-            auto token = tokens[i];
+            auto & token = tokens[i];
             transform(token.begin(), token.end(), token.begin(), tolower);
             token_to_offsets[token].push_back(i);
         }
@@ -336,9 +329,9 @@ void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, si
             uint32_t doc_seq_id = result_ids[i];
             if(fvalue.doc_values.count(doc_seq_id) != 0) {
                 // for every result document, get the values associated and increment counter
-                std::vector<uint32_t>* value_indices = fvalue.doc_values.at(doc_seq_id);
-                for(auto j = 0; j < value_indices->size(); j++) {
-                    const std::string & facet_value = fvalue.index_value.at(value_indices->at(j));
+                const std::vector<uint32_t> & value_indices = fvalue.doc_values.at(doc_seq_id);
+                for(auto j = 0; j < value_indices.size(); j++) {
+                    const std::string & facet_value = fvalue.index_value.at(value_indices.at(j));
                     a_facet.result_map[facet_value] += 1;
                 }
             }
@@ -646,7 +639,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
 
     result["hits"] = nlohmann::json::array();
 
-    for(auto field_order_kv: field_order_kvs) {
+    for(auto & field_order_kv: field_order_kvs) {
         std::string value;
         const std::string &seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key);
         store->get(seq_id_key, value);
@@ -676,7 +669,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
                   });
 
         for(auto i = 0; i < std::min((size_t)10, value_to_count.size()); i++) {
-            auto kv = value_to_count[i];
+            auto & kv = value_to_count[i];
             nlohmann::json facet_value_count = nlohmann::json::object();
             facet_value_count["value"] = kv.first;
             facet_value_count["count"] = kv.second;
@@ -1004,8 +997,6 @@ void Collection::remove_and_shift_offset_index(sorted_array &offset_index, const
 }
 
 Option<std::string> Collection::remove(std::string id) {
-    nlohmann::json result = nlohmann::json::object();
-
     std::string seq_id_str;
     StoreStatus status = store->get(get_doc_id_key(id), seq_id_str);
 
@@ -1020,7 +1011,7 @@ Option<std::string> Collection::remove(std::string id) {
 
     nlohmann::json document = nlohmann::json::parse(parsed_document);
 
-    for(auto name_field: search_schema) {
+    for(auto & name_field: search_schema) {
         std::vector<std::string> tokens;
         if(name_field.second.type == field_types::STRING) {
             StringUtils::split(document[name_field.first], tokens, " ");
@@ -1056,7 +1047,7 @@ Option<std::string> Collection::remove(std::string id) {
             }
         }
 
-        for(auto token: tokens) {
+        for(auto & token: tokens) {
             const unsigned char *key;
             int key_len;
 
@@ -1069,19 +1060,16 @@ Option<std::string> Collection::remove(std::string id) {
                 key_len = (int) (token.length());
             }
 
-            if(token == "https://twitter.com/yogalayout") {
-                std::cout << "token https://twitter.com/yogalayout" << std::endl;
-            }
-
             art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len);
             if(leaf != NULL) {
                 uint32_t seq_id_values[1] = {seq_id};
+                uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
 
-                if(leaf->values->ids.getLength() == 0) {
-                    std::cout << "HEY!!!" << std::endl;
+                if(doc_index == leaf->values->ids.getLength()) {
+                    // not found - happens when a token repeats in a field, e.g. "is it or is it not?"
+                    continue;
                 }
 
-                uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
                 uint32_t start_offset = leaf->values->offset_index.at(doc_index);
                 uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
                                       leaf->values->offsets.getLength() :
@@ -1100,19 +1088,21 @@ Option<std::string> Collection::remove(std::string id) {
                 std::cout << "----" << std::endl;*/
 
                 if(leaf->values->ids.getLength() == 0) {
-                    art_delete(search_index.at(name_field.first), key, key_len);
+                    art_values* values = (art_values*) art_delete(search_index.at(name_field.first), key, key_len);
+                    delete values;
+                    values = nullptr;
                 }
             }
         }
     }
 
     // remove facets if any
-    for(auto field_facet_value: facet_index) {
+    for(auto & field_facet_value: facet_index) {
         field_facet_value.second.doc_values.erase(seq_id);
     }
 
     // remove sort index if any
-    for(auto field_doc_value_map: sort_index) {
+    for(auto & field_doc_value_map: sort_index) {
         field_doc_value_map.second->erase(seq_id);
     }
 
diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp
index 27caf2f7..b068be90 100644
--- a/src/collection_manager.cpp
+++ b/src/collection_manager.cpp
@@ -23,7 +23,7 @@ void CollectionManager::init(Store *store) {
     std::vector<std::string> collection_meta_jsons;
     store->scan_fill(Collection::COLLECTION_META_PREFIX, collection_meta_jsons);
 
-    for(auto collection_meta_json: collection_meta_jsons) {
+    for(auto & collection_meta_json: collection_meta_jsons) {
         nlohmann::json collection_meta = nlohmann::json::parse(collection_meta_json);
         std::string this_collection_name = collection_meta[COLLECTION_NAME_KEY].get<std::string>();
 
diff --git a/src/http_server.cpp b/src/http_server.cpp
index 13941e2e..94472a0f 100644
--- a/src/http_server.cpp
+++ b/src/http_server.cpp
@@ -101,7 +101,7 @@ std::map<std::string, std::string> HttpServer::parse_query(const std::string& qu
         std::string key = (*i)[1].str();
         std::string raw_value = (*i)[2].str();
         std::string value = StringUtils::url_decode(raw_value);
-        if(query_map.count(key) == 0) {
+        if(query_map.count(key) == 0) {
             query_map[key] = value;
         } else {
             query_map[key] = query_map[key] + "&&" + value;
diff --git a/src/main/main.cpp b/src/main/main.cpp
index ddde0e68..07a16106 100644
--- a/src/main/main.cpp
+++ b/src/main/main.cpp
@@ -8,70 +8,98 @@
 #include <unordered_map>
 #include <queue>
 #include "string_utils.h"
+#include <sys/resource.h>
 #include "collection.h"
 #include "collection_manager.h"
 
 using namespace std;
 
-void find_indices(const uint32_t *result_ids, int low, int high, std::vector<uint32_t> & results) {
-    if(high >= low) {
-        size_t pivot = (low + high) / 2;
-        //std::cout << pivot << std::endl;
-        results.at(pivot) = result_ids[pivot];
-        find_indices(result_ids, low, pivot-1, results);
-        find_indices(result_ids, pivot+1, high, results);
-    }
-}
-
 int main(int argc, char* argv[]) {
-    std::vector<uint32_t> results(3);
-    uint32_t *result_ids = new uint32_t[3];
-    /*for(auto i = 0; i < 100; i++) {
-        result_ids[i] = i;
-    }*/
-    result_ids[0] = 6;
-    result_ids[1] = 19;
-    result_ids[2] = 21;
-
-    find_indices(result_ids, 0, 2, results);
-    //std::sort(results.begin(), results.end());
-    for(auto i : results) {
-        std::cout << i << std::endl;
-    }
-
-
+    // argv[1] is opened below as the JSONL input file; bail out early when it is missing.
+    if(argc < 2) {
+        std::cerr << "Missing argument: path to a JSONL documents file." << std::endl;
+        return 1;
+    }
+
     const std::string state_dir_path = "/tmp/typesense-data";
-
-    std::vector<field> fields_to_index = {field("title", field_types::STRING)};
-    std::vector<field> sort_fields = { field("points", "INT32")};
     Store *store = new Store("/tmp/typesense-data");
 
     CollectionManager & collectionManager = CollectionManager::get_instance();
     collectionManager.init(store);
 
-    Collection *collection = collectionManager.get_collection("collection");
-    if(collection == nullptr) {
-        collection = collectionManager.create_collection("collection", fields_to_index, {}, sort_fields);
-        std::ifstream infile(std::string(ROOT_DIR)+"test/documents.jsonl");
-        //std::ifstream infile(argv[1]);
+    std::vector<field> fields_to_index = {
+            field("lang", field_types::STRING),
+            field("description", field_types::STRING),
+            field("topics", field_types::STRING_ARRAY),
+            field("stars", field_types::INT32),
+            field("repo_name", field_types::STRING),
+            field("org", field_types::STRING)
+    };
 
+    std::vector<field> facet_fields_index = {
+//            field("lang", field_types::STRING),
+//            field("org", field_types::STRING),
+//            field("topics", field_types::STRING_ARRAY)
+    };
+
+    std::vector<field> sort_fields = { field("stars", "INT32")};
+
+    Collection *collection = collectionManager.get_collection("github_top1k");
+
+    if(collection == nullptr) {
+        collection = collectionManager.create_collection("github_top1k", fields_to_index, facet_fields_index, sort_fields);
+    }
+
+    int j = 0;
+    while(j < 1) {
+        j++;
+
+        std::ifstream infile(argv[1]);
         std::string json_line;
 
+        cout << "BEGINNING Iteration: " << j << endl << flush;
+        auto begin = std::chrono::high_resolution_clock::now();
+
         while (std::getline(infile, json_line)) {
-            collection->add(json_line);
+            nlohmann::json document = nlohmann::json::parse(json_line);
+            document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
+            collection->add(document.dump());
         }
 
         infile.close();
-        cout << "FINISHED INDEXING!" << endl << flush;
+
+        long long int timeMillis =
+                std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
+
+        std::cout << "Time taken for insertion: " << timeMillis << "ms" << std::endl;
+        begin = std::chrono::high_resolution_clock::now();
+
+        std::ifstream infile2(argv[1]);
+
+        int counter = 0;
+
+        while (std::getline(infile2, json_line)) {
+            counter++;
+            nlohmann::json document = nlohmann::json::parse(json_line);
+            document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
+            collection->remove(document["id"]);
+            /*if (counter % 100 == 0) {
+                std::cout << "Removed " << counter << " so far..." << std::endl;
+            }*/
+        }
+
+        infile2.close();
+
+        timeMillis =
+                std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
+
+        struct rusage r_usage;
+        getrusage(RUSAGE_SELF,&r_usage);
+        std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl;
+        std::cout << "Time taken for deletion: " << timeMillis << "ms" << std::endl;
     }
 
-    //collection->remove("foo");
-
-    auto begin = std::chrono::high_resolution_clock::now();
-    std::vector<std::string> search_fields = {"title"};
-    collection->search("the", search_fields, "", {}, { sort_field("points", "DESC") }, 1, 100, MAX_SCORE, 0);
-    long long int timeMillis =
-            std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
-    cout << "Time taken: " << timeMillis << "us" << endl;
+    delete collection;
+    delete store;
     return 0;
 }
\ No newline at end of file
diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp
index 9f32aea5..8c8310fa 100644
--- a/src/sorted_array.cpp
+++ b/src/sorted_array.cpp
@@ -2,7 +2,7 @@
 #include "array_utils.h"
 
 void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_length) {
-    min = sorted_array[0];
+    min = array_length != 0 ? sorted_array[0] : 0;
     max = array_length > 1 ? sorted_array[array_length-1] : min;
 
     uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR);
@@ -55,6 +55,10 @@ bool sorted_array::contains(uint32_t value) {
 }
 
 uint32_t sorted_array::indexOf(uint32_t value) {
+    if(length == 0) {
+        return length;
+    }
+
     uint32_t actual;
     uint32_t index = for_lower_bound_search(in, length, value, &actual);
     if(actual == value) return index;