Fix valgrind errors, plugging other leaks.

This commit is contained in:
Kishore Nallan 2017-05-21 15:59:16 +05:30
parent 0bb8cf13bf
commit 61bfdf027b
10 changed files with 98 additions and 95 deletions

View File

@ -36,6 +36,7 @@
- ~~Facet limit (hardcode to top 10)~~
- ~~Deprecate old split function~~
- ID should not have "/"
- Test for sorted_array::indexOf when length is 0
- Handle store->get() not finding a key
- Fix API response codes
- Test for search without any sort_by given

View File

@ -14,7 +14,7 @@ private:
uint32_t m = std::min(min, value);
uint32_t M = std::max(max, value);
uint32_t bnew = required_bits(M - m);
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
}
public:

View File

@ -24,20 +24,6 @@ protected:
return (uint32_t) (v == 0 ? 0 : 32 - __builtin_clz(v));
}
uint32_t inline sorted_append_size_required(uint32_t value, uint32_t new_length) {
uint32_t m = std::min(min, value);
uint32_t M = std::max(max, value);
uint32_t bnew = required_bits(M - m);
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
}
uint32_t inline unsorted_append_size_required(uint32_t value, uint32_t new_length) {
uint32_t m = std::min(min, value);
uint32_t M = std::max(max, value);
uint32_t bnew = required_bits(M - m);
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
}
public:
array_base(const uint32_t n=2) {
size_bytes = METADATA_OVERHEAD + (n * FOR_ELE_SIZE);

View File

@ -15,7 +15,7 @@ struct facet_value {
spp::sparse_hash_map<std::string, uint32_t> value_index;
spp::sparse_hash_map<uint32_t, std::string> index_value;
spp::sparse_hash_map<uint32_t, std::vector<uint32_t>*> doc_values;
spp::sparse_hash_map<uint32_t, std::vector<uint32_t>> doc_values;
uint32_t get_value_index(const std::string & value) {
if(value_index.count(value) != 0) {
@ -29,9 +29,9 @@ struct facet_value {
}
void index_values(uint32_t doc_seq_id, const std::vector<std::string> & values) {
std::vector<uint32_t>* value_vec = new std::vector<uint32_t>(values.size());
std::vector<uint32_t> value_vec(values.size());
for(auto i = 0; i < values.size(); i++) {
(*value_vec)[i] = get_value_index(values[i]);
value_vec[i] = get_value_index(values[i]);
}
doc_values.emplace(doc_seq_id, value_vec);
}

View File

@ -16,7 +16,7 @@ private:
uint32_t m = std::min(min, value);
uint32_t M = std::max(max, value);
uint32_t bnew = required_bits(M - m);
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
}
uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,

View File

@ -33,20 +33,13 @@ Collection::Collection(const std::string name, const uint32_t collection_id, con
}
Collection::~Collection() {
for(std::pair<std::string, field> name_field: search_schema) {
for(auto & name_field: search_schema) {
art_tree *t = search_index.at(name_field.first);
art_tree_destroy(t);
t = nullptr;
}
for(std::pair<std::string, field> name_field: facet_schema) {
facet_value & fvalue = facet_index.at(name_field.first);
for(auto doc_value: fvalue.doc_values) {
delete doc_value.second;
}
}
for(std::pair<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> name_map: sort_index) {
for(auto & name_map: sort_index) {
delete name_map.second;
}
}
@ -271,7 +264,7 @@ void Collection::index_string_field(const std::string & text, const uint32_t sco
} else {
StringUtils::split(text, tokens, " ");
for(uint32_t i=0; i<tokens.size(); i++) {
auto token = tokens[i];
auto & token = tokens[i];
transform(token.begin(), token.end(), token.begin(), tolower);
token_to_offsets[token].push_back(i);
}
@ -336,9 +329,9 @@ void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, si
uint32_t doc_seq_id = result_ids[i];
if(fvalue.doc_values.count(doc_seq_id) != 0) {
// for every result document, get the values associated and increment counter
std::vector<uint32_t>* value_indices = fvalue.doc_values.at(doc_seq_id);
for(auto j = 0; j < value_indices->size(); j++) {
const std::string & facet_value = fvalue.index_value.at(value_indices->at(j));
const std::vector<uint32_t> & value_indices = fvalue.doc_values.at(doc_seq_id);
for(auto j = 0; j < value_indices.size(); j++) {
const std::string & facet_value = fvalue.index_value.at(value_indices.at(j));
a_facet.result_map[facet_value] += 1;
}
}
@ -646,7 +639,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
result["hits"] = nlohmann::json::array();
for(auto field_order_kv: field_order_kvs) {
for(auto & field_order_kv: field_order_kvs) {
std::string value;
const std::string &seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key);
store->get(seq_id_key, value);
@ -676,7 +669,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
});
for(auto i = 0; i < std::min((size_t)10, value_to_count.size()); i++) {
auto kv = value_to_count[i];
auto & kv = value_to_count[i];
nlohmann::json facet_value_count = nlohmann::json::object();
facet_value_count["value"] = kv.first;
facet_value_count["count"] = kv.second;
@ -1004,8 +997,6 @@ void Collection::remove_and_shift_offset_index(sorted_array &offset_index, const
}
Option<std::string> Collection::remove(std::string id) {
nlohmann::json result = nlohmann::json::object();
std::string seq_id_str;
StoreStatus status = store->get(get_doc_id_key(id), seq_id_str);
@ -1020,7 +1011,7 @@ Option<std::string> Collection::remove(std::string id) {
nlohmann::json document = nlohmann::json::parse(parsed_document);
for(auto name_field: search_schema) {
for(auto & name_field: search_schema) {
std::vector<std::string> tokens;
if(name_field.second.type == field_types::STRING) {
StringUtils::split(document[name_field.first], tokens, " ");
@ -1056,7 +1047,7 @@ Option<std::string> Collection::remove(std::string id) {
}
}
for(auto token: tokens) {
for(auto & token: tokens) {
const unsigned char *key;
int key_len;
@ -1069,19 +1060,16 @@ Option<std::string> Collection::remove(std::string id) {
key_len = (int) (token.length());
}
if(token == "https://twitter.com/yogalayout") {
std::cout << "token https://twitter.com/yogalayout" << std::endl;
}
art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len);
if(leaf != NULL) {
uint32_t seq_id_values[1] = {seq_id};
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
if(leaf->values->ids.getLength() == 0) {
std::cout << "HEY!!!" << std::endl;
if(doc_index == leaf->values->ids.getLength()) {
// not found - happens when a token repeats within a field, e.g. "is it or is it not?"
continue;
}
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
uint32_t start_offset = leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
leaf->values->offsets.getLength() :
@ -1100,19 +1088,21 @@ Option<std::string> Collection::remove(std::string id) {
std::cout << "----" << std::endl;*/
if(leaf->values->ids.getLength() == 0) {
art_delete(search_index.at(name_field.first), key, key_len);
art_values* values = (art_values*) art_delete(search_index.at(name_field.first), key, key_len);
delete values;
values = nullptr;
}
}
}
}
// remove facets if any
for(auto field_facet_value: facet_index) {
for(auto & field_facet_value: facet_index) {
field_facet_value.second.doc_values.erase(seq_id);
}
// remove sort index if any
for(auto field_doc_value_map: sort_index) {
for(auto & field_doc_value_map: sort_index) {
field_doc_value_map.second->erase(seq_id);
}

View File

@ -23,7 +23,7 @@ void CollectionManager::init(Store *store) {
std::vector<std::string> collection_meta_jsons;
store->scan_fill(Collection::COLLECTION_META_PREFIX, collection_meta_jsons);
for(auto collection_meta_json: collection_meta_jsons) {
for(auto & collection_meta_json: collection_meta_jsons) {
nlohmann::json collection_meta = nlohmann::json::parse(collection_meta_json);
std::string this_collection_name = collection_meta[COLLECTION_NAME_KEY].get<std::string>();

View File

@ -101,7 +101,7 @@ std::map<std::string, std::string> HttpServer::parse_query(const std::string& qu
std::string key = (*i)[1].str();
std::string raw_value = (*i)[2].str();
std::string value = StringUtils::url_decode(raw_value);
if(query_map.count(key) == 0) {
if(query_map.count(value) == 0) {
query_map[key] = value;
} else {
query_map[key] = query_map[key] + "&&" + value;

View File

@ -8,70 +8,92 @@
#include <unordered_map>
#include <queue>
#include "string_utils.h"
#include <sys/resource.h>
#include "collection.h"
#include "collection_manager.h"
using namespace std;
// Copies result_ids[low..high] (both bounds inclusive) into the same positions
// of `results`, visiting the range in binary-partition order: the midpoint
// first, then the left half, then the right half.
//
// result_ids: source array; must be readable at every index in [low, high].
// low, high:  inclusive index bounds, expected to be non-negative; nothing is
//             written when high < low.
// results:    destination; std::vector::at() throws std::out_of_range if a
//             visited index is not smaller than results.size().
void find_indices(const uint32_t *result_ids, int low, int high, std::vector<uint32_t> & results) {
    if(high < low) {
        return;
    }

    // Keep the midpoint signed so that `pivot - 1` becomes -1 (an empty range)
    // when pivot is 0 instead of wrapping around an unsigned type, and compute
    // it as low + half-width so the sum cannot overflow for large bounds.
    const int pivot = low + (high - low) / 2;

    results.at(static_cast<size_t>(pivot)) = result_ids[pivot];

    find_indices(result_ids, low, pivot - 1, results);
    find_indices(result_ids, pivot + 1, high, results);
}
int main(int argc, char* argv[]) {
std::vector<uint32_t> results(3);
uint32_t *result_ids = new uint32_t[3];
/*for(auto i = 0; i < 100; i++) {
result_ids[i] = i;
}*/
result_ids[0] = 6;
result_ids[1] = 19;
result_ids[2] = 21;
find_indices(result_ids, 0, 2, results);
//std::sort(results.begin(), results.end());
for(auto i : results) {
std::cout << i << std::endl;
}
const std::string state_dir_path = "/tmp/typesense-data";
std::vector<field> fields_to_index = {field("title", field_types::STRING)};
std::vector<field> sort_fields = { field("points", "INT32")};
Store *store = new Store("/tmp/typesense-data");
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store);
Collection *collection = collectionManager.get_collection("collection");
if(collection == nullptr) {
collection = collectionManager.create_collection("collection", fields_to_index, {}, sort_fields);
std::ifstream infile(std::string(ROOT_DIR)+"test/documents.jsonl");
//std::ifstream infile(argv[1]);
std::vector<field> fields_to_index = {
field("lang", field_types::STRING),
field("description", field_types::STRING),
field("topics", field_types::STRING_ARRAY),
field("stars", field_types::INT32),
field("repo_name", field_types::STRING),
field("org", field_types::STRING)
};
std::vector<field> facet_fields_index = {
// field("lang", field_types::STRING),
// field("org", field_types::STRING),
// field("topics", field_types::STRING_ARRAY)
};
std::vector<field> sort_fields = { field("stars", "INT32")};
Collection *collection = collectionManager.get_collection("github_top1k");
if(collection == nullptr) {
collection = collectionManager.create_collection("github_top1k", fields_to_index, facet_fields_index, sort_fields);
}
int j = 0;
while(j < 1) {
j++;
std::ifstream infile(argv[1]);
std::string json_line;
cout << "BEGINNING Iteration: " << j << endl << flush;
auto begin = std::chrono::high_resolution_clock::now();
while (std::getline(infile, json_line)) {
collection->add(json_line);
nlohmann::json document = nlohmann::json::parse(json_line);
document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
collection->add(document.dump());
}
infile.close();
cout << "FINISHED INDEXING!" << endl << flush;
long long int timeMillis =
std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
std::cout << "Time taken for insertion: " << timeMillis << "ms" << std::endl;
begin = std::chrono::high_resolution_clock::now();
std::ifstream infile2(argv[1]);
int counter = 0;
while (std::getline(infile2, json_line)) {
counter++;
nlohmann::json document = nlohmann::json::parse(json_line);
document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
collection->remove(document["id"]);
/*if (counter % 100 == 0) {
std::cout << "Removed " << counter << " so far..." << std::endl;
}*/
}
infile2.close();
timeMillis =
std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
struct rusage r_usage;
getrusage(RUSAGE_SELF,&r_usage);
std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl;
std::cout << "Time taken for deletion: " << timeMillis << "ms" << std::endl;
}
//collection->remove("foo");
auto begin = std::chrono::high_resolution_clock::now();
std::vector<std::string> search_fields = {"title"};
collection->search("the", search_fields, "", {}, { sort_field("points", "DESC") }, 1, 100, MAX_SCORE, 0);
long long int timeMillis =
std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
cout << "Time taken: " << timeMillis << "us" << endl;
delete collection;
delete store;
return 0;
}

View File

@ -2,7 +2,7 @@
#include "array_utils.h"
void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_length) {
min = sorted_array[0];
min = array_length != 0 ? sorted_array[0] : 0;
max = array_length > 1 ? sorted_array[array_length-1] : min;
uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR);
@ -55,6 +55,10 @@ bool sorted_array::contains(uint32_t value) {
}
uint32_t sorted_array::indexOf(uint32_t value) {
if(length == 0) {
return length;
}
uint32_t actual;
uint32_t index = for_lower_bound_search(in, length, value, &actual);
if(actual == value) return index;