Improve benchmark utility.

This commit is contained in:
Kishore Nallan 2018-05-22 06:17:06 +05:30
parent 0014e19ea6
commit 76febf74d0

View File

@ -7,15 +7,40 @@
#include <art.h>
#include <unordered_map>
#include <queue>
#include <ctime>
#include "collection.h"
#include "string_utils.h"
#include "collection_manager.h"
using namespace std;
int main(int argc, char* argv[]) {
system("rm -rf /tmp/typesense-data && mkdir -p /tmp/typesense-data");
std::string get_query(StringUtils & string_utils, std::string & text) {
std::vector<std::string> tokens;
std::vector<std::string> normalized_tokens;
StringUtils::split(text, tokens, " ");
for(uint32_t i=0; i<tokens.size(); i++) {
auto token = tokens[i];
string_utils.unicode_normalize(token);
normalized_tokens.push_back(token);
}
size_t rand_len = 0 + (rand() % static_cast<int>(2 - 0 + 1));
size_t rand_index = 0 + (rand() % static_cast<int>(tokens.size()-1 - 0 + 1));
size_t end_index = std::min(rand_index+rand_len, tokens.size()-1);
std::stringstream ss;
for(auto i = rand_index; i <= end_index; i++) {
if(i != rand_index) {
ss << " ";
}
ss << normalized_tokens[i];
}
return ss.str();
}
void benchmark_hn_titles(char* file_path) {
std::vector<field> fields_to_index = { field("title", field_types::STRING, false),
field("points", field_types::INT32, false) };
@ -28,35 +53,124 @@ int main(int argc, char* argv[]) {
collection = collectionManager.create_collection("hnstories_direct", fields_to_index, "points").get();
}
std::ifstream infile(argv[1]);
std::ifstream infile(file_path);
std::string json_line;
auto begin = std::chrono::high_resolution_clock::now();
StringUtils string_utils;
std::vector<std::string> queries;
size_t counter = 0;
auto begin0 = std::chrono::high_resolution_clock::now();
while (std::getline(infile, json_line)) {
counter++;
collection->add(json_line);
if(counter % 100 == 0) {
nlohmann::json obj = nlohmann::json::parse(json_line);
std::string title = obj["title"];
std::string query = get_query(string_utils, title);
queries.push_back(query);
}
}
infile.close();
long long int timeMillis0 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin0).count();
std::cout << "FINISHED INDEXING!" << flush << std::endl;
std::cout << "Time taken: " << timeMillis0 << "ms" << std::endl;
std::vector<std::string> search_fields = {"title"};
uint64_t results_total = 0; // to prevent no-op optimization!
auto begin = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < queries.size(); i++) {
auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("points", "DESC")}, 2, 10, 1, MAX_SCORE, true);
if(results_op.ok() != true) {
exit(2);
}
auto results = results_op.get();
results_total += results["hits"].size();
}
long long int timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
std::cout << "Number of queries: " << queries.size() << std::endl;
std::cout << "Time taken: " << timeMillis << "ms" << std::endl;
std::cout << "Results total: " << results_total << std::endl;
}
void benchmark_reactjs_pages(char* file_path) {
std::vector<field> fields_to_index = {
field("url", field_types::STRING, false),
field("h1", field_types::STRING, false),
field("h2", field_types::STRING_ARRAY, false),
field("h3", field_types::STRING_ARRAY, false),
field("h4", field_types::STRING_ARRAY, false),
field("h5", field_types::STRING_ARRAY, false),
field("h6", field_types::STRING_ARRAY, false),
field("p", field_types::STRING_ARRAY, false),
field("dummy_sorting_field", field_types::INT32, false)
};
Store *store = new Store("/tmp/typesense-data");
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store, "abcd", "1234");
Collection *collection = collectionManager.get_collection("reactjs_pages");
if(collection == nullptr) {
collection = collectionManager.create_collection("reactjs_pages", fields_to_index, "dummy_sorting_field").get();
}
std::ifstream infile(file_path);
std::string json_line;
StringUtils string_utils;
std::vector<std::string> queries;
size_t counter = 0;
while (std::getline(infile, json_line)) {
counter++;
collection->add(json_line);
if(counter % 1 == 0) {
nlohmann::json obj = nlohmann::json::parse(json_line);
std::string title = obj["p"][0];
std::string query = get_query(string_utils, title);
queries.push_back(query);
}
}
infile.close();
std::cout << "FINISHED INDEXING!" << flush << std::endl;
/*std::vector<std::string> search_fields = {"title"};
std::vector<string> queries = {"the", "and", "to", "of", "in"};
auto counter = 0;
uint64_t results_total = 0; // to prevent optimizations!
std::vector<std::string> search_fields = {"h1", "h2", "h3", "h4", "h5", "h6", "p"};
uint64_t results_total = 0; // to prevent no-op optimization!
auto begin = std::chrono::high_resolution_clock::now();
while(counter < 3000) {
auto i = counter % 5;
auto results = collection->search(queries[i], search_fields, "", { }, {sort_field("points", "DESC")}, 1, 10, 1, MAX_SCORE, 0).get();
results_total += results.size();
counter++;
}*/
for(size_t i = 0; i < queries.size(); i++) {
auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("dummy_sorting_field", "DESC")}, 2, 10, 1,
MAX_SCORE, true, 10, spp::sparse_hash_set<std::string>(), {"p"});
if(results_op.ok() != true) {
exit(2);
}
auto results = results_op.get();
results_total += results["hits"].size();
}
long long int timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
std::cout << "Number of queries: " << queries.size() << std::endl;
std::cout << "Time taken: " << timeMillis << "ms" << std::endl;
//std::cout << "Total: " << results_total << std::endl;
std::cout << "Results total: " << results_total << std::endl;
}
int main(int argc, char* argv[]) {
srand(time(NULL));
system("rm -rf /tmp/typesense-data && mkdir -p /tmp/typesense-data");
//benchmark_hn_titles(argv[1]);
benchmark_reactjs_pages(argv[1]);
return 0;
}