From 76febf74d09ee92b84fc61fb88236312c611f099 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 22 May 2018 06:17:06 +0530 Subject: [PATCH] Improve benchmark utility. --- src/main/benchmark.cpp | 146 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 130 insertions(+), 16 deletions(-) diff --git a/src/main/benchmark.cpp b/src/main/benchmark.cpp index a7bdb913..784a72a0 100644 --- a/src/main/benchmark.cpp +++ b/src/main/benchmark.cpp @@ -7,15 +7,40 @@ #include #include #include +#include #include "collection.h" #include "string_utils.h" #include "collection_manager.h" using namespace std; -int main(int argc, char* argv[]) { - system("rm -rf /tmp/typesense-data && mkdir -p /tmp/typesense-data"); +std::string get_query(StringUtils & string_utils, std::string & text) { + std::vector tokens; + std::vector normalized_tokens; + StringUtils::split(text, tokens, " "); + for(uint32_t i=0; i(2 - 0 + 1)); + size_t rand_index = 0 + (rand() % static_cast(tokens.size()-1 - 0 + 1)); + size_t end_index = std::min(rand_index+rand_len, tokens.size()-1); + + std::stringstream ss; + for(auto i = rand_index; i <= end_index; i++) { + if(i != rand_index) { + ss << " "; + } + ss << normalized_tokens[i]; + } + + return ss.str(); +} + +void benchmark_hn_titles(char* file_path) { std::vector fields_to_index = { field("title", field_types::STRING, false), field("points", field_types::INT32, false) }; @@ -28,35 +53,124 @@ int main(int argc, char* argv[]) { collection = collectionManager.create_collection("hnstories_direct", fields_to_index, "points").get(); } - std::ifstream infile(argv[1]); + std::ifstream infile(file_path); std::string json_line; - auto begin = std::chrono::high_resolution_clock::now(); + StringUtils string_utils; + std::vector queries; + size_t counter = 0; + + auto begin0 = std::chrono::high_resolution_clock::now(); while (std::getline(infile, json_line)) { + counter++; collection->add(json_line); + + if(counter % 100 == 0) { + nlohmann::json obj = nlohmann::json::parse(json_line); + std::string title = obj["title"]; + std::string query = get_query(string_utils, title); + queries.push_back(query); + } + } + + infile.close(); + long long int timeMillis0 = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin0).count(); + + std::cout << "FINISHED INDEXING!" << flush << std::endl; + std::cout << "Time taken: " << timeMillis0 << "ms" << std::endl; + + std::vector search_fields = {"title"}; + uint64_t results_total = 0; // to prevent no-op optimization! + + auto begin = std::chrono::high_resolution_clock::now(); + + for(size_t i = 0; i < queries.size(); i++) { + auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("points", "DESC")}, 2, 10, 1, MAX_SCORE, true); + if(results_op.ok() != true) { + exit(2); + } + auto results = results_op.get(); + results_total += results["hits"].size(); + } + + long long int timeMillis = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); + std::cout << "Number of queries: " << queries.size() << std::endl; + std::cout << "Time taken: " << timeMillis << "ms" << std::endl; + std::cout << "Results total: " << results_total << std::endl; +} + +void benchmark_reactjs_pages(char* file_path) { + std::vector fields_to_index = { + field("url", field_types::STRING, false), + field("h1", field_types::STRING, false), + field("h2", field_types::STRING_ARRAY, false), + field("h3", field_types::STRING_ARRAY, false), + field("h4", field_types::STRING_ARRAY, false), + field("h5", field_types::STRING_ARRAY, false), + field("h6", field_types::STRING_ARRAY, false), + field("p", field_types::STRING_ARRAY, false), + field("dummy_sorting_field", field_types::INT32, false) + }; + + Store *store = new Store("/tmp/typesense-data"); + CollectionManager & collectionManager = CollectionManager::get_instance(); + collectionManager.init(store, "abcd", "1234"); + + Collection *collection = collectionManager.get_collection("reactjs_pages"); + if(collection == nullptr) { + collection = collectionManager.create_collection("reactjs_pages", fields_to_index, "dummy_sorting_field").get(); + } + + std::ifstream infile(file_path); + + std::string json_line; + StringUtils string_utils; + std::vector queries; + size_t counter = 0; + + while (std::getline(infile, json_line)) { + counter++; + collection->add(json_line); + + if(counter % 1 == 0) { + nlohmann::json obj = nlohmann::json::parse(json_line); + std::string title = obj["p"][0]; + std::string query = get_query(string_utils, title); + queries.push_back(query); + } } infile.close(); std::cout << "FINISHED INDEXING!" << flush << std::endl; - /*std::vector search_fields = {"title"}; - - std::vector queries = {"the", "and", "to", "of", "in"}; - auto counter = 0; - uint64_t results_total = 0; // to prevent optimizations! + std::vector search_fields = {"h1", "h2", "h3", "h4", "h5", "h6", "p"}; + uint64_t results_total = 0; // to prevent no-op optimization! auto begin = std::chrono::high_resolution_clock::now(); - while(counter < 3000) { - auto i = counter % 5; - auto results = collection->search(queries[i], search_fields, "", { }, {sort_field("points", "DESC")}, 1, 10, 1, MAX_SCORE, 0).get(); - results_total += results.size(); - counter++; - }*/ + for(size_t i = 0; i < queries.size(); i++) { + auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("dummy_sorting_field", "DESC")}, 2, 10, 1, + MAX_SCORE, true, 10, spp::sparse_hash_set(), {"p"}); + if(results_op.ok() != true) { + exit(2); + } + auto results = results_op.get(); + results_total += results["hits"].size(); + } long long int timeMillis = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); + std::cout << "Number of queries: " << queries.size() << std::endl; std::cout << "Time taken: " << timeMillis << "ms" << std::endl; - //std::cout << "Total: " << results_total << std::endl; + std::cout << "Results total: " << results_total << std::endl; +} + +int main(int argc, char* argv[]) { + srand(time(NULL)); + system("rm -rf /tmp/typesense-data && mkdir -p /tmp/typesense-data"); + + //benchmark_hn_titles(argv[1]); + benchmark_reactjs_pages(argv[1]); + return 0; } \ No newline at end of file