mirror of
https://github.com/typesense/typesense.git
synced 2025-05-18 04:32:38 +08:00
Fix valgrind errors, plugging other leaks.
This commit is contained in:
parent
0bb8cf13bf
commit
61bfdf027b
1
TODO.md
1
TODO.md
@ -36,6 +36,7 @@
|
||||
- ~~Facet limit (hardcode to top 10)~~
|
||||
- ~~Deprecate old split function~~
|
||||
- ID should not have "/"
|
||||
- Test for sorted_array::indexOf when length is 0
|
||||
- Handle store-get() not finding a key
|
||||
- Fix API response codes
|
||||
- Test for search without any sort_by given
|
||||
|
@ -14,7 +14,7 @@ private:
|
||||
uint32_t m = std::min(min, value);
|
||||
uint32_t M = std::max(max, value);
|
||||
uint32_t bnew = required_bits(M - m);
|
||||
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
|
||||
return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -24,20 +24,6 @@ protected:
|
||||
return (uint32_t) (v == 0 ? 0 : 32 - __builtin_clz(v));
|
||||
}
|
||||
|
||||
// Computes a byte-size estimate for the buffer after appending `value`, given
// the element count `new_length` the buffer would then hold.
//
// value      - the element about to be appended.
// new_length - element count passed through to for_compressed_size_bits().
// Returns METADATA_OVERHEAD plus whatever for_compressed_size_bits() reports
// for `new_length` elements at the computed bit width.
//
// NOTE(review): `min` and `max` are not declared in this block; presumably
// they are the container's currently tracked bounds - confirm.
uint32_t inline sorted_append_size_required(uint32_t value, uint32_t new_length) {
|
||||
// Lower bound of the value range once `value` is included.
uint32_t m = std::min(min, value);
|
||||
// Upper bound of the value range once `value` is included.
uint32_t M = std::max(max, value);
|
||||
// Bit width derived from the spread between the two bounds.
uint32_t bnew = required_bits(M - m);
|
||||
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
|
||||
}
|
||||
|
||||
// Computes a byte-size estimate for the buffer after appending `value`, given
// the element count `new_length` the buffer would then hold. The body is
// statement-for-statement the same as sorted_append_size_required().
//
// value      - the element about to be appended.
// new_length - element count passed through to for_compressed_size_bits().
// Returns METADATA_OVERHEAD plus whatever for_compressed_size_bits() reports
// for `new_length` elements at the computed bit width.
//
// NOTE(review): `min` and `max` are not declared in this block; presumably
// they are the container's currently tracked bounds - confirm.
uint32_t inline unsorted_append_size_required(uint32_t value, uint32_t new_length) {
|
||||
// Lower bound of the value range once `value` is included.
uint32_t m = std::min(min, value);
|
||||
// Upper bound of the value range once `value` is included.
uint32_t M = std::max(max, value);
|
||||
// Bit width derived from the spread between the two bounds.
uint32_t bnew = required_bits(M - m);
|
||||
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
|
||||
}
|
||||
|
||||
public:
|
||||
array_base(const uint32_t n=2) {
|
||||
size_bytes = METADATA_OVERHEAD + (n * FOR_ELE_SIZE);
|
||||
|
@ -15,7 +15,7 @@ struct facet_value {
|
||||
spp::sparse_hash_map<std::string, uint32_t> value_index;
|
||||
spp::sparse_hash_map<uint32_t, std::string> index_value;
|
||||
|
||||
spp::sparse_hash_map<uint32_t, std::vector<uint32_t>*> doc_values;
|
||||
spp::sparse_hash_map<uint32_t, std::vector<uint32_t>> doc_values;
|
||||
|
||||
uint32_t get_value_index(const std::string & value) {
|
||||
if(value_index.count(value) != 0) {
|
||||
@ -29,9 +29,9 @@ struct facet_value {
|
||||
}
|
||||
|
||||
void index_values(uint32_t doc_seq_id, const std::vector<std::string> & values) {
|
||||
std::vector<uint32_t>* value_vec = new std::vector<uint32_t>(values.size());
|
||||
std::vector<uint32_t> value_vec(values.size());
|
||||
for(auto i = 0; i < values.size(); i++) {
|
||||
(*value_vec)[i] = get_value_index(values[i]);
|
||||
value_vec[i] = get_value_index(values[i]);
|
||||
}
|
||||
doc_values.emplace(doc_seq_id, value_vec);
|
||||
}
|
||||
|
@ -16,7 +16,7 @@ private:
|
||||
uint32_t m = std::min(min, value);
|
||||
uint32_t M = std::max(max, value);
|
||||
uint32_t bnew = required_bits(M - m);
|
||||
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
|
||||
return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
|
||||
}
|
||||
|
||||
uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,
|
||||
|
@ -33,20 +33,13 @@ Collection::Collection(const std::string name, const uint32_t collection_id, con
|
||||
}
|
||||
|
||||
Collection::~Collection() {
|
||||
for(std::pair<std::string, field> name_field: search_schema) {
|
||||
for(auto & name_field: search_schema) {
|
||||
art_tree *t = search_index.at(name_field.first);
|
||||
art_tree_destroy(t);
|
||||
t = nullptr;
|
||||
}
|
||||
|
||||
for(std::pair<std::string, field> name_field: facet_schema) {
|
||||
facet_value & fvalue = facet_index.at(name_field.first);
|
||||
for(auto doc_value: fvalue.doc_values) {
|
||||
delete doc_value.second;
|
||||
}
|
||||
}
|
||||
|
||||
for(std::pair<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> name_map: sort_index) {
|
||||
for(auto & name_map: sort_index) {
|
||||
delete name_map.second;
|
||||
}
|
||||
}
|
||||
@ -271,7 +264,7 @@ void Collection::index_string_field(const std::string & text, const uint32_t sco
|
||||
} else {
|
||||
StringUtils::split(text, tokens, " ");
|
||||
for(uint32_t i=0; i<tokens.size(); i++) {
|
||||
auto token = tokens[i];
|
||||
auto & token = tokens[i];
|
||||
transform(token.begin(), token.end(), token.begin(), tolower);
|
||||
token_to_offsets[token].push_back(i);
|
||||
}
|
||||
@ -336,9 +329,9 @@ void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, si
|
||||
uint32_t doc_seq_id = result_ids[i];
|
||||
if(fvalue.doc_values.count(doc_seq_id) != 0) {
|
||||
// for every result document, get the values associated and increment counter
|
||||
std::vector<uint32_t>* value_indices = fvalue.doc_values.at(doc_seq_id);
|
||||
for(auto j = 0; j < value_indices->size(); j++) {
|
||||
const std::string & facet_value = fvalue.index_value.at(value_indices->at(j));
|
||||
const std::vector<uint32_t> & value_indices = fvalue.doc_values.at(doc_seq_id);
|
||||
for(auto j = 0; j < value_indices.size(); j++) {
|
||||
const std::string & facet_value = fvalue.index_value.at(value_indices.at(j));
|
||||
a_facet.result_map[facet_value] += 1;
|
||||
}
|
||||
}
|
||||
@ -646,7 +639,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
|
||||
|
||||
result["hits"] = nlohmann::json::array();
|
||||
|
||||
for(auto field_order_kv: field_order_kvs) {
|
||||
for(auto & field_order_kv: field_order_kvs) {
|
||||
std::string value;
|
||||
const std::string &seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key);
|
||||
store->get(seq_id_key, value);
|
||||
@ -676,7 +669,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
|
||||
});
|
||||
|
||||
for(auto i = 0; i < std::min((size_t)10, value_to_count.size()); i++) {
|
||||
auto kv = value_to_count[i];
|
||||
auto & kv = value_to_count[i];
|
||||
nlohmann::json facet_value_count = nlohmann::json::object();
|
||||
facet_value_count["value"] = kv.first;
|
||||
facet_value_count["count"] = kv.second;
|
||||
@ -1004,8 +997,6 @@ void Collection::remove_and_shift_offset_index(sorted_array &offset_index, const
|
||||
}
|
||||
|
||||
Option<std::string> Collection::remove(std::string id) {
|
||||
nlohmann::json result = nlohmann::json::object();
|
||||
|
||||
std::string seq_id_str;
|
||||
StoreStatus status = store->get(get_doc_id_key(id), seq_id_str);
|
||||
|
||||
@ -1020,7 +1011,7 @@ Option<std::string> Collection::remove(std::string id) {
|
||||
|
||||
nlohmann::json document = nlohmann::json::parse(parsed_document);
|
||||
|
||||
for(auto name_field: search_schema) {
|
||||
for(auto & name_field: search_schema) {
|
||||
std::vector<std::string> tokens;
|
||||
if(name_field.second.type == field_types::STRING) {
|
||||
StringUtils::split(document[name_field.first], tokens, " ");
|
||||
@ -1056,7 +1047,7 @@ Option<std::string> Collection::remove(std::string id) {
|
||||
}
|
||||
}
|
||||
|
||||
for(auto token: tokens) {
|
||||
for(auto & token: tokens) {
|
||||
const unsigned char *key;
|
||||
int key_len;
|
||||
|
||||
@ -1069,19 +1060,16 @@ Option<std::string> Collection::remove(std::string id) {
|
||||
key_len = (int) (token.length());
|
||||
}
|
||||
|
||||
if(token == "https://twitter.com/yogalayout") {
|
||||
std::cout << "token https://twitter.com/yogalayout" << std::endl;
|
||||
}
|
||||
|
||||
art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len);
|
||||
if(leaf != NULL) {
|
||||
uint32_t seq_id_values[1] = {seq_id};
|
||||
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
|
||||
|
||||
if(leaf->values->ids.getLength() == 0) {
|
||||
std::cout << "HEY!!!" << std::endl;
|
||||
if(doc_index == leaf->values->ids.getLength()) {
|
||||
// not found - happens when 2 tokens repeat in a field, e.g "is it or is is not?"
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
|
||||
uint32_t start_offset = leaf->values->offset_index.at(doc_index);
|
||||
uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
|
||||
leaf->values->offsets.getLength() :
|
||||
@ -1100,19 +1088,21 @@ Option<std::string> Collection::remove(std::string id) {
|
||||
std::cout << "----" << std::endl;*/
|
||||
|
||||
if(leaf->values->ids.getLength() == 0) {
|
||||
art_delete(search_index.at(name_field.first), key, key_len);
|
||||
art_values* values = (art_values*) art_delete(search_index.at(name_field.first), key, key_len);
|
||||
delete values;
|
||||
values = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// remove facets if any
|
||||
for(auto field_facet_value: facet_index) {
|
||||
for(auto & field_facet_value: facet_index) {
|
||||
field_facet_value.second.doc_values.erase(seq_id);
|
||||
}
|
||||
|
||||
// remove sort index if any
|
||||
for(auto field_doc_value_map: sort_index) {
|
||||
for(auto & field_doc_value_map: sort_index) {
|
||||
field_doc_value_map.second->erase(seq_id);
|
||||
}
|
||||
|
||||
|
@ -23,7 +23,7 @@ void CollectionManager::init(Store *store) {
|
||||
std::vector<std::string> collection_meta_jsons;
|
||||
store->scan_fill(Collection::COLLECTION_META_PREFIX, collection_meta_jsons);
|
||||
|
||||
for(auto collection_meta_json: collection_meta_jsons) {
|
||||
for(auto & collection_meta_json: collection_meta_jsons) {
|
||||
nlohmann::json collection_meta = nlohmann::json::parse(collection_meta_json);
|
||||
std::string this_collection_name = collection_meta[COLLECTION_NAME_KEY].get<std::string>();
|
||||
|
||||
|
@ -101,7 +101,7 @@ std::map<std::string, std::string> HttpServer::parse_query(const std::string& qu
|
||||
std::string key = (*i)[1].str();
|
||||
std::string raw_value = (*i)[2].str();
|
||||
std::string value = StringUtils::url_decode(raw_value);
|
||||
if(query_map.count(key) == 0) {
|
||||
if(query_map.count(value) == 0) {
|
||||
query_map[key] = value;
|
||||
} else {
|
||||
query_map[key] = query_map[key] + "&&" + value;
|
||||
|
@ -8,70 +8,92 @@
|
||||
#include <unordered_map>
|
||||
#include <queue>
|
||||
#include "string_utils.h"
|
||||
#include <sys/resource.h>
|
||||
#include "collection.h"
|
||||
#include "collection_manager.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Recursively walks the inclusive index range [low, high], midpoint first,
// copying result_ids[i] into results.at(i) for each index visited.
//
// result_ids - source array, read at each visited index.
// low, high  - inclusive bounds of the sub-range; nothing happens when
//              high < low.
// results    - destination vector, written through at(), so an index beyond
//              its size raises std::out_of_range.
void find_indices(const uint32_t *result_ids, int low, int high, std::vector<uint32_t> & results) {
|
||||
if(high >= low) {
|
||||
// NOTE(review): `pivot` is size_t while low/high are int; `low + high` is
// evaluated as int before the conversion.
size_t pivot = (low + high) / 2;
|
||||
//std::cout << pivot << std::endl;
|
||||
results.at(pivot) = result_ids[pivot];
|
||||
// NOTE(review): when pivot is 0, `pivot-1` wraps in unsigned arithmetic before
// being narrowed back to the int parameter; the recursion relies on that
// narrowing producing a value below `low` - confirm on the target platform.
find_indices(result_ids, low, pivot-1, results);
|
||||
find_indices(result_ids, pivot+1, high, results);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
std::vector<uint32_t> results(3);
|
||||
uint32_t *result_ids = new uint32_t[3];
|
||||
/*for(auto i = 0; i < 100; i++) {
|
||||
result_ids[i] = i;
|
||||
}*/
|
||||
result_ids[0] = 6;
|
||||
result_ids[1] = 19;
|
||||
result_ids[2] = 21;
|
||||
|
||||
find_indices(result_ids, 0, 2, results);
|
||||
//std::sort(results.begin(), results.end());
|
||||
for(auto i : results) {
|
||||
std::cout << i << std::endl;
|
||||
}
|
||||
|
||||
|
||||
const std::string state_dir_path = "/tmp/typesense-data";
|
||||
|
||||
std::vector<field> fields_to_index = {field("title", field_types::STRING)};
|
||||
std::vector<field> sort_fields = { field("points", "INT32")};
|
||||
Store *store = new Store("/tmp/typesense-data");
|
||||
|
||||
CollectionManager & collectionManager = CollectionManager::get_instance();
|
||||
collectionManager.init(store);
|
||||
|
||||
Collection *collection = collectionManager.get_collection("collection");
|
||||
if(collection == nullptr) {
|
||||
collection = collectionManager.create_collection("collection", fields_to_index, {}, sort_fields);
|
||||
std::ifstream infile(std::string(ROOT_DIR)+"test/documents.jsonl");
|
||||
//std::ifstream infile(argv[1]);
|
||||
std::vector<field> fields_to_index = {
|
||||
field("lang", field_types::STRING),
|
||||
field("description", field_types::STRING),
|
||||
field("topics", field_types::STRING_ARRAY),
|
||||
field("stars", field_types::INT32),
|
||||
field("repo_name", field_types::STRING),
|
||||
field("org", field_types::STRING)
|
||||
};
|
||||
|
||||
std::vector<field> facet_fields_index = {
|
||||
// field("lang", field_types::STRING),
|
||||
// field("org", field_types::STRING),
|
||||
// field("topics", field_types::STRING_ARRAY)
|
||||
};
|
||||
|
||||
std::vector<field> sort_fields = { field("stars", "INT32")};
|
||||
|
||||
Collection *collection = collectionManager.get_collection("github_top1k");
|
||||
|
||||
if(collection == nullptr) {
|
||||
collection = collectionManager.create_collection("github_top1k", fields_to_index, facet_fields_index, sort_fields);
|
||||
}
|
||||
|
||||
int j = 0;
|
||||
while(j < 1) {
|
||||
j++;
|
||||
|
||||
std::ifstream infile(argv[1]);
|
||||
std::string json_line;
|
||||
|
||||
cout << "BEGINNING Iteration: " << j << endl << flush;
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
|
||||
while (std::getline(infile, json_line)) {
|
||||
collection->add(json_line);
|
||||
nlohmann::json document = nlohmann::json::parse(json_line);
|
||||
document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
|
||||
collection->add(document.dump());
|
||||
}
|
||||
|
||||
infile.close();
|
||||
cout << "FINISHED INDEXING!" << endl << flush;
|
||||
|
||||
long long int timeMillis =
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
|
||||
|
||||
std::cout << "Time taken for insertion: " << timeMillis << "ms" << std::endl;
|
||||
begin = std::chrono::high_resolution_clock::now();
|
||||
|
||||
std::ifstream infile2(argv[1]);
|
||||
|
||||
int counter = 0;
|
||||
|
||||
while (std::getline(infile2, json_line)) {
|
||||
counter++;
|
||||
nlohmann::json document = nlohmann::json::parse(json_line);
|
||||
document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
|
||||
collection->remove(document["id"]);
|
||||
/*if (counter % 100 == 0) {
|
||||
std::cout << "Removed " << counter << " so far..." << std::endl;
|
||||
}*/
|
||||
}
|
||||
|
||||
infile2.close();
|
||||
|
||||
timeMillis =
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
|
||||
|
||||
struct rusage r_usage;
|
||||
getrusage(RUSAGE_SELF,&r_usage);
|
||||
std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl;
|
||||
std::cout << "Time taken for deletion: " << timeMillis << "ms" << std::endl;
|
||||
}
|
||||
|
||||
//collection->remove("foo");
|
||||
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
std::vector<std::string> search_fields = {"title"};
|
||||
collection->search("the", search_fields, "", {}, { sort_field("points", "DESC") }, 1, 100, MAX_SCORE, 0);
|
||||
long long int timeMillis =
|
||||
std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
|
||||
cout << "Time taken: " << timeMillis << "us" << endl;
|
||||
delete collection;
|
||||
delete store;
|
||||
return 0;
|
||||
}
|
@ -2,7 +2,7 @@
|
||||
#include "array_utils.h"
|
||||
|
||||
void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_length) {
|
||||
min = sorted_array[0];
|
||||
min = array_length != 0 ? sorted_array[0] : 0;
|
||||
max = array_length > 1 ? sorted_array[array_length-1] : min;
|
||||
|
||||
uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR);
|
||||
@ -55,6 +55,10 @@ bool sorted_array::contains(uint32_t value) {
|
||||
}
|
||||
|
||||
uint32_t sorted_array::indexOf(uint32_t value) {
|
||||
if(length == 0) {
|
||||
return length;
|
||||
}
|
||||
|
||||
uint32_t actual;
|
||||
uint32_t index = for_lower_bound_search(in, length, value, &actual);
|
||||
if(actual == value) return index;
|
||||
|
Loading…
x
Reference in New Issue
Block a user