Fix valgrind errors, plugging other leaks.

2025-05-18 04:32:38 +08:00 · 2017-05-21 15:59:16 +05:30 · 2017-05-21 15:59:16 +05:30 · 61bfdf027b
commit 61bfdf027b
parent 0bb8cf13bf
10 changed files with 98 additions and 95 deletions
--- a/TODO.md
+++ b/TODO.md
@ -36,6 +36,7 @@
 - ~~Facet limit (hardcode to top 10)~~
 - ~~Deprecate old split function~~
 - ID should not have "/"
+- Test for sorted_array::indexOf when length is 0
 - Handle store-get() not finding a key
 - Fix API response codes
 - Test for search without any sort_by given
--- a/include/array.h
+++ b/include/array.h
@ -14,7 +14,7 @@ private:
        uint32_t m = std::min(min, value);
        uint32_t M = std::max(max, value);
        uint32_t bnew = required_bits(M - m);
-        return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
+        return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
    }

 public:
--- a/include/array_base.h
+++ b/include/array_base.h
@ -24,20 +24,6 @@ protected:
        return (uint32_t) (v == 0 ? 0 : 32 - __builtin_clz(v));
    }

-    uint32_t inline sorted_append_size_required(uint32_t value, uint32_t new_length) {
-        uint32_t m = std::min(min, value);
-        uint32_t M = std::max(max, value);
-        uint32_t bnew = required_bits(M - m);
-        return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
-    }
-
-    uint32_t inline unsorted_append_size_required(uint32_t value, uint32_t new_length) {
-        uint32_t m = std::min(min, value);
-        uint32_t M = std::max(max, value);
-        uint32_t bnew = required_bits(M - m);
-        return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
-    }
-
 public:
    array_base(const uint32_t n=2) {
        size_bytes = METADATA_OVERHEAD + (n * FOR_ELE_SIZE);
--- a/include/collection.h
+++ b/include/collection.h
@ -15,7 +15,7 @@ struct facet_value {
    spp::sparse_hash_map<std::string, uint32_t> value_index;
    spp::sparse_hash_map<uint32_t, std::string> index_value;

-    spp::sparse_hash_map<uint32_t, std::vector<uint32_t>*> doc_values;
+    spp::sparse_hash_map<uint32_t, std::vector<uint32_t>> doc_values;

    uint32_t get_value_index(const std::string & value) {
        if(value_index.count(value) != 0) {
@ -29,9 +29,9 @@ struct facet_value {
    }

    void index_values(uint32_t doc_seq_id, const std::vector<std::string> & values) {
-        std::vector<uint32_t>* value_vec = new std::vector<uint32_t>(values.size());
+        std::vector<uint32_t> value_vec(values.size());
        for(auto i = 0; i < values.size(); i++) {
-            (*value_vec)[i] = get_value_index(values[i]);
+            value_vec[i] = get_value_index(values[i]);
        }
        doc_values.emplace(doc_seq_id, value_vec);
    }
--- a/include/sorted_array.h
+++ b/include/sorted_array.h
@ -16,7 +16,7 @@ private:
        uint32_t m = std::min(min, value);
        uint32_t M = std::max(max, value);
        uint32_t bnew = required_bits(M - m);
-        return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
+        return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
    }

    uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,
--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -33,20 +33,13 @@ Collection::Collection(const std::string name, const uint32_t collection_id, con
 }

 Collection::~Collection() {
-    for(std::pair<std::string, field> name_field: search_schema) {
+    for(auto & name_field: search_schema) {
        art_tree *t = search_index.at(name_field.first);
        art_tree_destroy(t);
        t = nullptr;
    }

-    for(std::pair<std::string, field> name_field: facet_schema) {
-        facet_value & fvalue = facet_index.at(name_field.first);
-        for(auto doc_value: fvalue.doc_values) {
-            delete doc_value.second;
-        }
-    }
-
-    for(std::pair<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> name_map: sort_index) {
+    for(auto & name_map: sort_index) {
        delete name_map.second;
    }
 }
@ -271,7 +264,7 @@ void Collection::index_string_field(const std::string & text, const uint32_t sco
    } else {
        StringUtils::split(text, tokens, " ");
        for(uint32_t i=0; i<tokens.size(); i++) {
-            auto token = tokens[i];
+            auto & token = tokens[i];
            transform(token.begin(), token.end(), token.begin(), tolower);
            token_to_offsets[token].push_back(i);
        }
@ -336,9 +329,9 @@ void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, si
            uint32_t doc_seq_id = result_ids[i];
            if(fvalue.doc_values.count(doc_seq_id) != 0) {
                // for every result document, get the values associated and increment counter
-                std::vector<uint32_t>* value_indices = fvalue.doc_values.at(doc_seq_id);
-                for(auto j = 0; j < value_indices->size(); j++) {
-                    const std::string & facet_value = fvalue.index_value.at(value_indices->at(j));
+                const std::vector<uint32_t> & value_indices = fvalue.doc_values.at(doc_seq_id);
+                for(auto j = 0; j < value_indices.size(); j++) {
+                    const std::string & facet_value = fvalue.index_value.at(value_indices.at(j));
                    a_facet.result_map[facet_value] += 1;
                }
            }
@ -646,7 +639,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri

    result["hits"] = nlohmann::json::array();

-    for(auto field_order_kv: field_order_kvs) {
+    for(auto & field_order_kv: field_order_kvs) {
        std::string value;
        const std::string &seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key);
        store->get(seq_id_key, value);
@ -676,7 +669,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
                  });

        for(auto i = 0; i < std::min((size_t)10, value_to_count.size()); i++) {
-            auto kv = value_to_count[i];
+            auto & kv = value_to_count[i];
            nlohmann::json facet_value_count = nlohmann::json::object();
            facet_value_count["value"] = kv.first;
            facet_value_count["count"] = kv.second;
@ -1004,8 +997,6 @@ void Collection::remove_and_shift_offset_index(sorted_array &offset_index, const
 }

 Option<std::string> Collection::remove(std::string id) {
-    nlohmann::json result = nlohmann::json::object();
-
    std::string seq_id_str;
    StoreStatus status = store->get(get_doc_id_key(id), seq_id_str);

@ -1020,7 +1011,7 @@ Option<std::string> Collection::remove(std::string id) {

    nlohmann::json document = nlohmann::json::parse(parsed_document);

-    for(auto name_field: search_schema) {
+    for(auto & name_field: search_schema) {
        std::vector<std::string> tokens;
        if(name_field.second.type == field_types::STRING) {
            StringUtils::split(document[name_field.first], tokens, " ");
@ -1056,7 +1047,7 @@ Option<std::string> Collection::remove(std::string id) {
            }
        }

-        for(auto token: tokens) {
+        for(auto & token: tokens) {
            const unsigned char *key;
            int key_len;

@ -1069,19 +1060,16 @@ Option<std::string> Collection::remove(std::string id) {
                key_len = (int) (token.length());
            }

-            if(token == "https://twitter.com/yogalayout") {
-                std::cout << "token https://twitter.com/yogalayout" << std::endl;
-            }
-
            art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len);
            if(leaf != NULL) {
                uint32_t seq_id_values[1] = {seq_id};
+                uint32_t doc_index = leaf->values->ids.indexOf(seq_id);

-                if(leaf->values->ids.getLength() == 0) {
-                    std::cout << "HEY!!!" << std::endl;
+                if(doc_index == leaf->values->ids.getLength()) {
+                    // not found - happens when 2 tokens repeat in a field, e.g. "is it or is it not?"
+                    continue;
                }

-                uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
                uint32_t start_offset = leaf->values->offset_index.at(doc_index);
                uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
                                      leaf->values->offsets.getLength() :
@ -1100,19 +1088,21 @@ Option<std::string> Collection::remove(std::string id) {
                std::cout << "----" << std::endl;*/

                if(leaf->values->ids.getLength() == 0) {
-                    art_delete(search_index.at(name_field.first), key, key_len);
+                    art_values* values = (art_values*) art_delete(search_index.at(name_field.first), key, key_len);
+                    delete values;
+                    values = nullptr;
                }
            }
        }
    }

    // remove facets if any
-    for(auto field_facet_value: facet_index) {
+    for(auto & field_facet_value: facet_index) {
        field_facet_value.second.doc_values.erase(seq_id);
    }

    // remove sort index if any
-    for(auto field_doc_value_map: sort_index) {
+    for(auto & field_doc_value_map: sort_index) {
        field_doc_value_map.second->erase(seq_id);
    }

--- a/src/collection_manager.cpp
+++ b/src/collection_manager.cpp
@ -23,7 +23,7 @@ void CollectionManager::init(Store *store) {
    std::vector<std::string> collection_meta_jsons;
    store->scan_fill(Collection::COLLECTION_META_PREFIX, collection_meta_jsons);

-    for(auto collection_meta_json: collection_meta_jsons) {
+    for(auto & collection_meta_json: collection_meta_jsons) {
        nlohmann::json collection_meta = nlohmann::json::parse(collection_meta_json);
        std::string this_collection_name = collection_meta[COLLECTION_NAME_KEY].get<std::string>();

--- a/src/http_server.cpp
+++ b/src/http_server.cpp
@ -101,7 +101,8 @@ std::map<std::string, std::string> HttpServer::parse_query(const std::string& qu
        std::string key = (*i)[1].str();
        std::string raw_value = (*i)[2].str();
        std::string value = StringUtils::url_decode(raw_value);
+        // Test membership on `key`, not `value`: a repeated key must append with "&&" instead of overwriting.
        if(query_map.count(key) == 0) {
            query_map[key] = value;
        } else {
            query_map[key] = query_map[key] + "&&" + value;
--- a/src/main/main.cpp
+++ b/src/main/main.cpp
@ -8,70 +8,92 @@
 #include <unordered_map>
 #include <queue>
 #include "string_utils.h"
+#include <sys/resource.h>
 #include "collection.h"
 #include "collection_manager.h"

 using namespace std;

-void find_indices(const uint32_t *result_ids, int low, int high, std::vector<uint32_t> & results) {
-    if(high >= low) {
-        size_t pivot = (low + high) / 2;
-        //std::cout << pivot << std::endl;
-        results.at(pivot) = result_ids[pivot];
-        find_indices(result_ids, low, pivot-1, results);
-        find_indices(result_ids, pivot+1, high, results);
-    }
-}
-
 int main(int argc, char* argv[]) {
-    std::vector<uint32_t> results(3);
-    uint32_t *result_ids = new uint32_t[3];
-    /*for(auto i = 0; i < 100; i++) {
-        result_ids[i] = i;
-    }*/
-    result_ids[0] = 6;
-    result_ids[1] = 19;
-    result_ids[2] = 21;
-
-    find_indices(result_ids, 0, 2, results);
-    //std::sort(results.begin(), results.end());
-    for(auto i : results) {
-        std::cout << i << std::endl;
-    }
-
-
    const std::string state_dir_path = "/tmp/typesense-data";
-
-    std::vector<field> fields_to_index = {field("title", field_types::STRING)};
-    std::vector<field> sort_fields = { field("points", "INT32")};
    Store *store = new Store("/tmp/typesense-data");

    CollectionManager & collectionManager = CollectionManager::get_instance();
    collectionManager.init(store);

-    Collection *collection = collectionManager.get_collection("collection");
-    if(collection == nullptr) {
-        collection = collectionManager.create_collection("collection", fields_to_index, {}, sort_fields);
-        std::ifstream infile(std::string(ROOT_DIR)+"test/documents.jsonl");
-        //std::ifstream infile(argv[1]);
+    std::vector<field> fields_to_index = {
+            field("lang", field_types::STRING),
+            field("description", field_types::STRING),
+            field("topics", field_types::STRING_ARRAY),
+            field("stars", field_types::INT32),
+            field("repo_name", field_types::STRING),
+            field("org", field_types::STRING)
+    };

+    std::vector<field> facet_fields_index = {
+//            field("lang", field_types::STRING),
+//            field("org", field_types::STRING),
+//            field("topics", field_types::STRING_ARRAY)
+    };
+
+    std::vector<field> sort_fields = { field("stars", "INT32")};
+
+    Collection *collection = collectionManager.get_collection("github_top1k");
+
+    if(collection == nullptr) {
+        collection = collectionManager.create_collection("github_top1k", fields_to_index, facet_fields_index, sort_fields);
+    }
+
+    int j = 0;
+    while(j < 1) {
+        j++;
+
+        std::ifstream infile(argv[1]);
        std::string json_line;

+        cout << "BEGINNING Iteration: " << j << endl << flush;
+        auto begin = std::chrono::high_resolution_clock::now();
+
        while (std::getline(infile, json_line)) {
-            collection->add(json_line);
+            nlohmann::json document = nlohmann::json::parse(json_line);
+            document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
+            collection->add(document.dump());
        }

        infile.close();
-        cout << "FINISHED INDEXING!" << endl << flush;
+
+        long long int timeMillis =
+                std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
+
+        std::cout << "Time taken for insertion: " << timeMillis << "ms" << std::endl;
+        begin = std::chrono::high_resolution_clock::now();
+
+        std::ifstream infile2(argv[1]);
+
+        int counter = 0;
+
+        while (std::getline(infile2, json_line)) {
+            counter++;
+            nlohmann::json document = nlohmann::json::parse(json_line);
+            document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
+            collection->remove(document["id"]);
+            /*if (counter % 100 == 0) {
+                std::cout << "Removed " << counter << " so far..." << std::endl;
+            }*/
+        }
+
+        infile2.close();
+
+        timeMillis =
+                std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
+
+        struct rusage r_usage;
+        getrusage(RUSAGE_SELF,&r_usage);
+        std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl;
+        std::cout << "Time taken for deletion: " << timeMillis << "ms" << std::endl;
    }

-    //collection->remove("foo");
-
-    auto begin = std::chrono::high_resolution_clock::now();
-    std::vector<std::string> search_fields = {"title"};
-    collection->search("the", search_fields, "", {}, { sort_field("points", "DESC") }, 1, 100, MAX_SCORE, 0);
-    long long int timeMillis =
-            std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
-    cout << "Time taken: " << timeMillis << "us" << endl;
+    delete collection;
+    delete store;
    return 0;
 }
--- a/src/sorted_array.cpp
+++ b/src/sorted_array.cpp
@ -2,7 +2,7 @@
 #include "array_utils.h"

 void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_length) {
-    min = sorted_array[0];
+    min = array_length != 0 ? sorted_array[0] : 0;
    max = array_length > 1 ? sorted_array[array_length-1] : min;

    uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR);
@ -55,6 +55,10 @@ bool sorted_array::contains(uint32_t value) {
 }

 uint32_t sorted_array::indexOf(uint32_t value) {
+    if(length == 0) {
+        return length;
+    }
+
    uint32_t actual;
    uint32_t index = for_lower_bound_search(in, length, value, &actual);
    if(actual == value) return index;