From 61bfdf027ba2305f1d738767c96bc8d6cf613aba Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sun, 21 May 2017 15:59:16 +0530 Subject: [PATCH] Fix valgrind errors, plugging other leaks. --- TODO.md | 1 + include/array.h | 2 +- include/array_base.h | 14 ----- include/collection.h | 6 +- include/sorted_array.h | 2 +- src/collection.cpp | 48 +++++++--------- src/collection_manager.cpp | 2 +- src/http_server.cpp | 2 +- src/main/main.cpp | 110 ++++++++++++++++++++++--------------- src/sorted_array.cpp | 6 +- 10 files changed, 98 insertions(+), 95 deletions(-) diff --git a/TODO.md b/TODO.md index f9f3a64e..57dd05c1 100644 --- a/TODO.md +++ b/TODO.md @@ -36,6 +36,7 @@ - ~~Facet limit (hardcode to top 10)~~ - ~~Deprecate old split function~~ - ID should not have "/" +- Test for sorted_array::indexOf when length is 0 - Handle store-get() not finding a key - Fix API response codes - Test for search without any sort_by given diff --git a/include/array.h b/include/array.h index c390705a..cb402b96 100644 --- a/include/array.h +++ b/include/array.h @@ -14,7 +14,7 @@ private: uint32_t m = std::min(min, value); uint32_t M = std::max(max, value); uint32_t bnew = required_bits(M - m); - return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew); + return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew); } public: diff --git a/include/array_base.h b/include/array_base.h index a0ad124c..53ef54df 100644 --- a/include/array_base.h +++ b/include/array_base.h @@ -24,20 +24,6 @@ protected: return (uint32_t) (v == 0 ? 
0 : 32 - __builtin_clz(v)); } - uint32_t inline sorted_append_size_required(uint32_t value, uint32_t new_length) { - uint32_t m = std::min(min, value); - uint32_t M = std::max(max, value); - uint32_t bnew = required_bits(M - m); - return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew); - } - - uint32_t inline unsorted_append_size_required(uint32_t value, uint32_t new_length) { - uint32_t m = std::min(min, value); - uint32_t M = std::max(max, value); - uint32_t bnew = required_bits(M - m); - return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew); - } - public: array_base(const uint32_t n=2) { size_bytes = METADATA_OVERHEAD + (n * FOR_ELE_SIZE); diff --git a/include/collection.h b/include/collection.h index b43d8941..4e95bb5f 100644 --- a/include/collection.h +++ b/include/collection.h @@ -15,7 +15,7 @@ struct facet_value { spp::sparse_hash_map value_index; spp::sparse_hash_map index_value; - spp::sparse_hash_map*> doc_values; + spp::sparse_hash_map> doc_values; uint32_t get_value_index(const std::string & value) { if(value_index.count(value) != 0) { @@ -29,9 +29,9 @@ struct facet_value { } void index_values(uint32_t doc_seq_id, const std::vector & values) { - std::vector* value_vec = new std::vector(values.size()); + std::vector value_vec(values.size()); for(auto i = 0; i < values.size(); i++) { - (*value_vec)[i] = get_value_index(values[i]); + value_vec[i] = get_value_index(values[i]); } doc_values.emplace(doc_seq_id, value_vec); } diff --git a/include/sorted_array.h b/include/sorted_array.h index 8798e0b6..93553cab 100644 --- a/include/sorted_array.h +++ b/include/sorted_array.h @@ -16,7 +16,7 @@ private: uint32_t m = std::min(min, value); uint32_t M = std::max(max, value); uint32_t bnew = required_bits(M - m); - return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew); + return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew); } uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t 
imax, uint32_t base, diff --git a/src/collection.cpp b/src/collection.cpp index d808699a..2055ef3a 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -33,20 +33,13 @@ Collection::Collection(const std::string name, const uint32_t collection_id, con } Collection::~Collection() { - for(std::pair name_field: search_schema) { + for(auto & name_field: search_schema) { art_tree *t = search_index.at(name_field.first); art_tree_destroy(t); t = nullptr; } - for(std::pair name_field: facet_schema) { - facet_value & fvalue = facet_index.at(name_field.first); - for(auto doc_value: fvalue.doc_values) { - delete doc_value.second; - } - } - - for(std::pair*> name_map: sort_index) { + for(auto & name_map: sort_index) { delete name_map.second; } } @@ -271,7 +264,7 @@ void Collection::index_string_field(const std::string & text, const uint32_t sco } else { StringUtils::split(text, tokens, " "); for(uint32_t i=0; i & facets, uint32_t* result_ids, si uint32_t doc_seq_id = result_ids[i]; if(fvalue.doc_values.count(doc_seq_id) != 0) { // for every result document, get the values associated and increment counter - std::vector* value_indices = fvalue.doc_values.at(doc_seq_id); - for(auto j = 0; j < value_indices->size(); j++) { - const std::string & facet_value = fvalue.index_value.at(value_indices->at(j)); + const std::vector & value_indices = fvalue.doc_values.at(doc_seq_id); + for(auto j = 0; j < value_indices.size(); j++) { + const std::string & facet_value = fvalue.index_value.at(value_indices.at(j)); a_facet.result_map[facet_value] += 1; } } @@ -646,7 +639,7 @@ nlohmann::json Collection::search(std::string query, const std::vectorget(seq_id_key, value); @@ -676,7 +669,7 @@ nlohmann::json Collection::search(std::string query, const std::vector Collection::remove(std::string id) { - nlohmann::json result = nlohmann::json::object(); - std::string seq_id_str; StoreStatus status = store->get(get_doc_id_key(id), seq_id_str); @@ -1020,7 +1011,7 @@ Option 
Collection::remove(std::string id) { nlohmann::json document = nlohmann::json::parse(parsed_document); - for(auto name_field: search_schema) { + for(auto & name_field: search_schema) { std::vector tokens; if(name_field.second.type == field_types::STRING) { StringUtils::split(document[name_field.first], tokens, " "); @@ -1056,7 +1047,7 @@ Option Collection::remove(std::string id) { } } - for(auto token: tokens) { + for(auto & token: tokens) { const unsigned char *key; int key_len; @@ -1069,19 +1060,16 @@ Option Collection::remove(std::string id) { key_len = (int) (token.length()); } - if(token == "https://twitter.com/yogalayout") { - std::cout << "token https://twitter.com/yogalayout" << std::endl; - } - art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len); if(leaf != NULL) { uint32_t seq_id_values[1] = {seq_id}; + uint32_t doc_index = leaf->values->ids.indexOf(seq_id); - if(leaf->values->ids.getLength() == 0) { - std::cout << "HEY!!!" << std::endl; + if(doc_index == leaf->values->ids.getLength()) { + // not found - happens when 2 tokens repeat in a field, e.g "is it or is is not?" + continue; } - uint32_t doc_index = leaf->values->ids.indexOf(seq_id); uint32_t start_offset = leaf->values->offset_index.at(doc_index); uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ? 
leaf->values->offsets.getLength() : @@ -1100,19 +1088,21 @@ Option Collection::remove(std::string id) { std::cout << "----" << std::endl;*/ if(leaf->values->ids.getLength() == 0) { - art_delete(search_index.at(name_field.first), key, key_len); + art_values* values = (art_values*) art_delete(search_index.at(name_field.first), key, key_len); + delete values; + values = nullptr; } } } } // remove facets if any - for(auto field_facet_value: facet_index) { + for(auto & field_facet_value: facet_index) { field_facet_value.second.doc_values.erase(seq_id); } // remove sort index if any - for(auto field_doc_value_map: sort_index) { + for(auto & field_doc_value_map: sort_index) { field_doc_value_map.second->erase(seq_id); } diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 27caf2f7..b068be90 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -23,7 +23,7 @@ void CollectionManager::init(Store *store) { std::vector collection_meta_jsons; store->scan_fill(Collection::COLLECTION_META_PREFIX, collection_meta_jsons); - for(auto collection_meta_json: collection_meta_jsons) { + for(auto & collection_meta_json: collection_meta_jsons) { nlohmann::json collection_meta = nlohmann::json::parse(collection_meta_json); std::string this_collection_name = collection_meta[COLLECTION_NAME_KEY].get(); diff --git a/src/http_server.cpp b/src/http_server.cpp index 13941e2e..94472a0f 100644 --- a/src/http_server.cpp +++ b/src/http_server.cpp @@ -101,7 +101,7 @@ std::map HttpServer::parse_query(const std::string& qu std::string key = (*i)[1].str(); std::string raw_value = (*i)[2].str(); std::string value = StringUtils::url_decode(raw_value); - if(query_map.count(key) == 0) { + if(query_map.count(key) == 0) { query_map[key] = value; } else { query_map[key] = query_map[key] + "&&" + value; diff --git a/src/main/main.cpp b/src/main/main.cpp index ddde0e68..07a16106 100644 --- a/src/main/main.cpp +++ b/src/main/main.cpp @@ -8,70 +8,92 @@ #include #include
#include "string_utils.h" +#include #include "collection.h" #include "collection_manager.h" using namespace std; -void find_indices(const uint32_t *result_ids, int low, int high, std::vector & results) { - if(high >= low) { - size_t pivot = (low + high) / 2; - //std::cout << pivot << std::endl; - results.at(pivot) = result_ids[pivot]; - find_indices(result_ids, low, pivot-1, results); - find_indices(result_ids, pivot+1, high, results); - } -} - int main(int argc, char* argv[]) { - std::vector results(3); - uint32_t *result_ids = new uint32_t[3]; - /*for(auto i = 0; i < 100; i++) { - result_ids[i] = i; - }*/ - result_ids[0] = 6; - result_ids[1] = 19; - result_ids[2] = 21; - - find_indices(result_ids, 0, 2, results); - //std::sort(results.begin(), results.end()); - for(auto i : results) { - std::cout << i << std::endl; - } - - const std::string state_dir_path = "/tmp/typesense-data"; - - std::vector fields_to_index = {field("title", field_types::STRING)}; - std::vector sort_fields = { field("points", "INT32")}; Store *store = new Store("/tmp/typesense-data"); CollectionManager & collectionManager = CollectionManager::get_instance(); collectionManager.init(store); - Collection *collection = collectionManager.get_collection("collection"); - if(collection == nullptr) { - collection = collectionManager.create_collection("collection", fields_to_index, {}, sort_fields); - std::ifstream infile(std::string(ROOT_DIR)+"test/documents.jsonl"); - //std::ifstream infile(argv[1]); + std::vector fields_to_index = { + field("lang", field_types::STRING), + field("description", field_types::STRING), + field("topics", field_types::STRING_ARRAY), + field("stars", field_types::INT32), + field("repo_name", field_types::STRING), + field("org", field_types::STRING) + }; + std::vector facet_fields_index = { +// field("lang", field_types::STRING), +// field("org", field_types::STRING), +// field("topics", field_types::STRING_ARRAY) + }; + + std::vector sort_fields = { field("stars", 
"INT32")}; + + Collection *collection = collectionManager.get_collection("github_top1k"); + + if(collection == nullptr) { + collection = collectionManager.create_collection("github_top1k", fields_to_index, facet_fields_index, sort_fields); + } + + int j = 0; + while(j < 1) { + j++; + + std::ifstream infile(argv[1]); std::string json_line; + cout << "BEGINNING Iteration: " << j << endl << flush; + auto begin = std::chrono::high_resolution_clock::now(); + while (std::getline(infile, json_line)) { - collection->add(json_line); + nlohmann::json document = nlohmann::json::parse(json_line); + document["id"] = document["org"].get() + ":" + document["repo_name"].get(); + collection->add(document.dump()); } infile.close(); - cout << "FINISHED INDEXING!" << endl << flush; + + long long int timeMillis = + std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); + + std::cout << "Time taken for insertion: " << timeMillis << "ms" << std::endl; + begin = std::chrono::high_resolution_clock::now(); + + std::ifstream infile2(argv[1]); + + int counter = 0; + + while (std::getline(infile2, json_line)) { + counter++; + nlohmann::json document = nlohmann::json::parse(json_line); + document["id"] = document["org"].get() + ":" + document["repo_name"].get(); + collection->remove(document["id"]); + /*if (counter % 100 == 0) { + std::cout << "Removed " << counter << " so far..." 
<< std::endl; + }*/ + } + + infile2.close(); + + timeMillis = + std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); + + struct rusage r_usage; + getrusage(RUSAGE_SELF,&r_usage); + std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl; + std::cout << "Time taken for deletion: " << timeMillis << "ms" << std::endl; } - //collection->remove("foo"); - - auto begin = std::chrono::high_resolution_clock::now(); - std::vector search_fields = {"title"}; - collection->search("the", search_fields, "", {}, { sort_field("points", "DESC") }, 1, 100, MAX_SCORE, 0); - long long int timeMillis = - std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); - cout << "Time taken: " << timeMillis << "us" << endl; + delete collection; + delete store; return 0; } \ No newline at end of file diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp index 9f32aea5..8c8310fa 100644 --- a/src/sorted_array.cpp +++ b/src/sorted_array.cpp @@ -2,7 +2,7 @@ #include "array_utils.h" void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_length) { - min = sorted_array[0]; + min = array_length != 0 ? sorted_array[0] : 0; max = array_length > 1 ? sorted_array[array_length-1] : min; uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR); @@ -55,6 +55,10 @@ bool sorted_array::contains(uint32_t value) { } uint32_t sorted_array::indexOf(uint32_t value) { + if(length == 0) { + return length; + } + uint32_t actual; uint32_t index = for_lower_bound_search(in, length, value, &actual); if(actual == value) return index;