From f4b8b4c627954abfd96876cb82d1208a9556f5f9 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 19 Dec 2022 17:25:55 +0530 Subject: [PATCH 01/10] Fix symbols_to_index key in response. --- src/synonym_index.cpp | 4 ++-- test/collection_synonyms_test.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/synonym_index.cpp b/src/synonym_index.cpp index 7a0293f3..73bb86be 100644 --- a/src/synonym_index.cpp +++ b/src/synonym_index.cpp @@ -262,9 +262,9 @@ nlohmann::json synonym_t::to_view_json() const { } if(!symbols.empty()) { - obj["symbols"] = nlohmann::json::array(); + obj["symbols_to_index"] = nlohmann::json::array(); for(char c: symbols) { - obj["symbols"].push_back(std::string(1, c)); + obj["symbols_to_index"].push_back(std::string(1, c)); } } diff --git a/test/collection_synonyms_test.cpp b/test/collection_synonyms_test.cpp index 0a5ee304..8568e834 100644 --- a/test/collection_synonyms_test.cpp +++ b/test/collection_synonyms_test.cpp @@ -97,9 +97,9 @@ TEST_F(CollectionSynonymsTest, SynonymParsingFromJson) { ASSERT_STREQ("#", synonym_plus.synonyms[1][0].c_str()); nlohmann::json view_json = synonym_plus.to_view_json(); - ASSERT_EQ(2, view_json["symbols"].size()); - ASSERT_EQ("+", view_json["symbols"][0].get()); - ASSERT_EQ("#", view_json["symbols"][1].get()); + ASSERT_EQ(2, view_json["symbols_to_index"].size()); + ASSERT_EQ("+", view_json["symbols_to_index"][0].get()); + ASSERT_EQ("#", view_json["symbols_to_index"][1].get()); // when `id` is not given nlohmann::json syn_json_without_id = { From 61beb7f317d1d6c245216f787a45122e7145c87b Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 20 Dec 2022 16:29:39 +0530 Subject: [PATCH 02/10] Terminate batch indexing iter early on resource exhaustion. --- src/batched_indexer.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/batched_indexer.cpp b/src/batched_indexer.cpp index 9d8bea08..98d314e1 100644 --- a/src/batched_indexer.cpp +++ b/src/batched_indexer.cpp @@ -203,8 +203,9 @@ void BatchedIndexer::run() { if (resource_check != cached_resource_stat_t::OK && orig_req->http_method != "DELETE") { orig_res->set_422("Rejecting write: running out of resource type: " + std::string(magic_enum::enum_name(resource_check))); - orig_res->final = true; - async_res = false; + async_req_res_t* async_req_res = new async_req_res_t(orig_req, orig_res, true); + server->get_message_dispatcher()->send_message(HttpServer::STREAM_RESPONSE_MESSAGE, async_req_res); + break; } else if(route_found) { From a10cf167caa968faf19c050a320ce211ded25ffb Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 21 Dec 2022 14:39:06 +0530 Subject: [PATCH 03/10] Fix edge case for facet counts with empty strings in array. --- src/collection.cpp | 2 +- src/index.cpp | 8 +++----- test/collection_faceting_test.cpp | 33 +++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index d9a35b81..221a8017 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1658,7 +1658,7 @@ Option Collection::search(const std::string & raw_query, facet_result["field_name"] = a_facet.field_name; facet_result["counts"] = nlohmann::json::array(); - std::vector> facet_hash_counts; + std::vector> facet_hash_counts; for (const auto & kv : a_facet.result_map) { facet_hash_counts.emplace_back(kv); } diff --git a/src/index.cpp b/src/index.cpp index 99ae695d..52d56f94 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1170,16 +1170,14 @@ void Index::tokenize_string_array_with_facets(const std::vector& st } } - //LOG(INFO) << "Str: " << str << ", last_token: " << last_token; + if(is_facet) { + facet_hashes.push_back(facet_hash); + } if(token_set.empty()) { continue; } - if(is_facet) { - facet_hashes.push_back(facet_hash); - } - for(auto& the_token: token_set) { // repeat last element to indicate end of offsets for this array index token_to_offsets[the_token].push_back(token_to_offsets[the_token].back()); diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index d5b07c1c..497eb588 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -979,3 +979,36 @@ TEST_F(CollectionFacetingTest, FacetByNestedIntField) { ASSERT_EQ(2, results["facet_counts"][0]["counts"][0]["count"].get()); ASSERT_EQ("2000", results["facet_counts"][0]["counts"][0]["value"].get()); } + +TEST_F(CollectionFacetingTest, FacetOnArrayFieldWithSpecialChars) { + std::vector fields = { + field("tags", field_types::STRING_ARRAY, true), + field("points", field_types::INT32, true), + }; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + nlohmann::json doc; + doc["tags"] = {"gamma"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["tags"] = {"alpha", "| . |", "beta", "gamma"}; + doc["points"] = 10; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"tags"}, {}, {2}, 10, 1, FREQUENCY, {true}, 1).get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(4, results["facet_counts"][0]["counts"].size()); + + for(size_t i = 0; i < results["facet_counts"][0]["counts"].size(); i++) { + auto fvalue = results["facet_counts"][0]["counts"][i]["value"].get(); + if(fvalue == "gamma") { + ASSERT_EQ(2, results["facet_counts"][0]["counts"][i]["count"].get()); + } else { + ASSERT_EQ(1, results["facet_counts"][0]["counts"][i]["count"].get()); + } + } +} From a5628f9940786d443d5a3e6a0af1502863d3a1db Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 22 Dec 2022 14:43:53 +0530 Subject: [PATCH 04/10] Fix slow exports. --- include/core_api_utils.h | 1 + src/core_api.cpp | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/include/core_api_utils.h b/include/core_api_utils.h index eca4d6af..4e057035 100644 --- a/include/core_api_utils.h +++ b/include/core_api_utils.h @@ -24,6 +24,7 @@ struct export_state_t: public req_state_t { std::vector offsets; std::set include_fields; std::set exclude_fields; + size_t export_batch_size = 100; std::string* res_body; bool filtered_export = false; diff --git a/src/core_api.cpp b/src/core_api.cpp index 6d815582..acb351ce 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -588,6 +588,7 @@ bool get_export_documents(const std::shared_ptr& req, const std::share const char* FILTER_BY = "filter_by"; const char* INCLUDE_FIELDS = "include_fields"; const char* EXCLUDE_FIELDS = "exclude_fields"; + const char* BATCH_SIZE = "batch_size"; export_state_t* export_state = nullptr; @@ -617,6 +618,10 @@ bool get_export_documents(const std::shared_ptr& req, const std::share export_state->exclude_fields = std::set(exclude_fields_vec.begin(), exclude_fields_vec.end()); } + if(req->params.count(BATCH_SIZE) != 0 && StringUtils::is_uint32_t(req->params[BATCH_SIZE])) { + export_state->export_batch_size = std::stoul(req->params[BATCH_SIZE]); + } + if(simple_filter_query.empty()) { export_state->iter_upper_bound_key = collection->get_seq_id_collection_prefix() + "`"; // cannot inline this export_state->iter_upper_bound = new rocksdb::Slice(export_state->iter_upper_bound_key); @@ -644,10 +649,12 @@ bool get_export_documents(const std::shared_ptr& req, const std::share if(export_state->it != nullptr) { rocksdb::Iterator* it = export_state->it; + size_t batch_counter = 0; + res->body.clear(); - if(it->Valid() && it->key().ToString().compare(0, seq_id_prefix.size(), seq_id_prefix) == 0) { + while(it->Valid() && it->key().ToString().compare(0, seq_id_prefix.size(), seq_id_prefix) == 0) { if(export_state->include_fields.empty() && export_state->exclude_fields.empty()) { - res->body = it->value().ToString(); + res->body += it->value().ToString(); } else { nlohmann::json doc = nlohmann::json::parse(it->value().ToString()); nlohmann::json filtered_doc; @@ -663,7 +670,7 @@ bool get_export_documents(const std::shared_ptr& req, const std::share } } - res->body = filtered_doc.dump(); + res->body += filtered_doc.dump(); } it->Next(); @@ -677,10 +684,15 @@ bool get_export_documents(const std::shared_ptr& req, const std::share req->last_chunk_aggregate = true; res->final = true; } + + batch_counter++; + if(batch_counter == export_state->export_batch_size) { + break; + } } } else { bool done; - stateful_export_docs(export_state, 100, done); + stateful_export_docs(export_state, export_state->export_batch_size, done); if(!done) { req->last_chunk_aggregate = false; From 7f348250b626874fdf383f0d9d7a1354b3cfee35 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 23 Dec 2022 19:44:59 +0530 Subject: [PATCH 05/10] Add guard for really large documents eating memory during load. --- src/collection_manager.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 742ea395..922fee38 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -1222,6 +1222,7 @@ Option CollectionManager::load_collection(const nlohmann::json &collection size_t num_found_docs = 0; size_t num_valid_docs = 0; size_t num_indexed_docs = 0; + size_t batch_doc_str_size = 0; auto begin = std::chrono::high_resolution_clock::now(); @@ -1230,14 +1231,17 @@ Option CollectionManager::load_collection(const nlohmann::json &collection const uint32_t seq_id = Collection::get_seq_id_from_key(iter->key().ToString()); nlohmann::json document; + const std::string& doc_string = iter->value().ToString(); try { - document = nlohmann::json::parse(iter->value().ToString()); + document = nlohmann::json::parse(doc_string); } catch(const std::exception& e) { LOG(ERROR) << "JSON error: " << e.what(); return Option(400, "Bad JSON."); } + batch_doc_str_size += doc_string.size(); + if(collection->get_enable_nested_fields()) { std::vector flattened_fields; field::flatten_doc(document, collection->get_nested_fields(), true, flattened_fields); @@ -1254,10 +1258,14 @@ Option CollectionManager::load_collection(const nlohmann::json &collection iter->Next(); bool last_record = !(iter->Valid() && iter->key().starts_with(seq_id_prefix)); + // if expected memory usage exceeds 250M, we index the accumulated set without caring about batch size + bool exceeds_batch_mem_threshold = ((batch_doc_str_size * 7) > (250 * 1014 * 1024)); + // batch must match atleast the number of shards - if((num_valid_docs % batch_size == 0) || last_record) { + if(exceeds_batch_mem_threshold || (num_valid_docs % batch_size == 0) || last_record) { size_t num_records = index_records.size(); size_t num_indexed = collection->batch_index_in_memory(index_records); + batch_doc_str_size = 0; if(num_indexed != num_records) { const Option & index_error_op = get_first_index_error(index_records); From 0e1d70ebf68579f6e5d8bd060f5877a866e32c87 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sun, 25 Dec 2022 21:04:22 +0530 Subject: [PATCH 06/10] Add flag to disable old highlight structure. --- include/collection.h | 3 +- src/collection.cpp | 61 +++++++++++++++++++++----------------- src/collection_manager.cpp | 7 ++++- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/include/collection.h b/include/collection.h index 75ad74da..a7391cb5 100644 --- a/include/collection.h +++ b/include/collection.h @@ -408,7 +408,8 @@ public: const size_t facet_query_num_typos = 2, const size_t filter_curated_hits_option = 2, const bool prioritize_token_position = false, - const std::string& vector_query_str = "") const; + const std::string& vector_query_str = "", + const bool enable_highlight_v1 = true) const; Option get_filter_ids(const std::string & simple_filter_query, std::vector>& index_ids); diff --git a/src/collection.cpp b/src/collection.cpp index 221a8017..c22327ee 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -865,7 +865,8 @@ Option Collection::search(const std::string & raw_query, const size_t facet_query_num_typos, const size_t filter_curated_hits_option, const bool prioritize_token_position, - const std::string& vector_query_str) const { + const std::string& vector_query_str, + const bool enable_highlight_v1) const { std::shared_lock lock(mutex); @@ -1497,7 +1498,11 @@ Option Collection::search(const std::string & raw_query, } nlohmann::json wrapper_doc; - wrapper_doc["highlights"] = nlohmann::json::array(); + + if(enable_highlight_v1) { + wrapper_doc["highlights"] = nlohmann::json::array(); + } + std::vector highlights; StringUtils string_utils; @@ -1568,34 +1573,36 @@ Option Collection::search(const std::string & raw_query, prune_doc(highlight_res, hfield_names, tsl::htrie_set(), ""); } - std::sort(highlights.begin(), highlights.end()); + if(enable_highlight_v1) { + std::sort(highlights.begin(), highlights.end()); - for(const auto & highlight: highlights) { - auto field_it = search_schema.find(highlight.field); - if(field_it == search_schema.end() || field_it->nested) { - // nested field highlighting will be available only in the new highlight structure. - continue; - } - - nlohmann::json h_json = nlohmann::json::object(); - h_json["field"] = highlight.field; - - if(!highlight.indices.empty()) { - h_json["matched_tokens"] = highlight.matched_tokens; - h_json["indices"] = highlight.indices; - h_json["snippets"] = highlight.snippets; - if(!highlight.values.empty()) { - h_json["values"] = highlight.values; + for(const auto & highlight: highlights) { + auto field_it = search_schema.find(highlight.field); + if(field_it == search_schema.end() || field_it->nested) { + // nested field highlighting will be available only in the new highlight structure. + continue; } - } else { - h_json["matched_tokens"] = highlight.matched_tokens[0]; - h_json["snippet"] = highlight.snippets[0]; - if(!highlight.values.empty() && !highlight.values[0].empty()) { - h_json["value"] = highlight.values[0]; - } - } - wrapper_doc["highlights"].push_back(h_json); + nlohmann::json h_json = nlohmann::json::object(); + h_json["field"] = highlight.field; + + if(!highlight.indices.empty()) { + h_json["matched_tokens"] = highlight.matched_tokens; + h_json["indices"] = highlight.indices; + h_json["snippets"] = highlight.snippets; + if(!highlight.values.empty()) { + h_json["values"] = highlight.values; + } + } else { + h_json["matched_tokens"] = highlight.matched_tokens[0]; + h_json["snippet"] = highlight.snippets[0]; + if(!highlight.values.empty() && !highlight.values[0].empty()) { + h_json["value"] = highlight.values[0]; + } + } + + wrapper_doc["highlights"].push_back(h_json); + } } //wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 922fee38..55b20d1d 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -695,6 +695,8 @@ Option CollectionManager::do_search(std::map& re const char *EXHAUSTIVE_SEARCH = "exhaustive_search"; const char *SPLIT_JOIN_TOKENS = "split_join_tokens"; + const char *ENABLE_HIGHLIGHT_V1 = "enable_highlight_v1"; + // enrich params with values from embedded params for(auto& item: embedded_params.items()) { if(item.key() == "expires_at") { @@ -771,6 +773,7 @@ Option CollectionManager::do_search(std::map& re std::vector infixes; size_t max_extra_prefix = INT16_MAX; size_t max_extra_suffix = INT16_MAX; + bool enable_highlight_v1 = true; std::unordered_map unsigned_int_values = { {MIN_LEN_1TYPO, &min_len_1typo}, @@ -810,6 +813,7 @@ Option CollectionManager::do_search(std::map& re {PRE_SEGMENTED_QUERY, &pre_segmented_query}, {EXHAUSTIVE_SEARCH, &exhaustive_search}, {ENABLE_OVERRIDES, &enable_overrides}, + {ENABLE_HIGHLIGHT_V1, &enable_highlight_v1}, }; std::unordered_map*> str_list_values = { @@ -976,7 +980,8 @@ Option CollectionManager::do_search(std::map& re facet_query_num_typos, filter_curated_hits_option, prioritize_token_position, - vector_query + vector_query, + enable_highlight_v1 ); uint64_t timeMillis = std::chrono::duration_cast( From a9b926e24b3d04f34e94e91de5c3445068449c21 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 28 Dec 2022 17:20:36 +0530 Subject: [PATCH 07/10] Don't wrap () for empty filter strings. --- src/auth_manager.cpp | 8 +++++++- test/core_api_utils_test.cpp | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/auth_manager.cpp b/src/auth_manager.cpp index ee3eed7b..f4e5c431 100644 --- a/src/auth_manager.cpp +++ b/src/auth_manager.cpp @@ -394,7 +394,13 @@ bool AuthManager::add_item_to_params(std::map& req_par if(req_params.count(item.key()) == 0) { req_params[item.key()] = str_value; } else if(item.key() == "filter_by") { - req_params[item.key()] = "(" + req_params[item.key()] + ") && (" + str_value + ")"; + if(!req_params[item.key()].empty() && !str_value.empty()) { + req_params[item.key()] = "(" + req_params[item.key()] + ") && (" + str_value + ")"; + } else if(req_params[item.key()].empty() && !str_value.empty()) { + req_params[item.key()] = "(" + str_value + ")"; + } else if(!req_params[item.key()].empty() && str_value.empty()) { + req_params[item.key()] = "(" + req_params[item.key()] + ")"; + } } else if(overwrite) { req_params[item.key()] = str_value; } diff --git a/test/core_api_utils_test.cpp b/test/core_api_utils_test.cpp index fecd2606..d8d8787b 100644 --- a/test/core_api_utils_test.cpp +++ b/test/core_api_utils_test.cpp @@ -199,6 +199,29 @@ TEST_F(CoreAPIUtilsTest, MultiSearchEmbeddedKeys) { // ensure that req params are appended to (embedded params are also rolled into req params) ASSERT_EQ("((user_id: 100) && (age: > 100)) && (foo: bar)", req->params["filter_by"]); + // when empty filter_by is present in req params, don't add () + req->params["filter_by"] = ""; + post_multi_search(req, res); + ASSERT_EQ("((age: > 100)) && (foo: bar)", req->params["filter_by"]); + + // when empty filter_by in collection search params, don't add () + req->params["filter_by"] = "user_id: 100"; + search["filter_by"] = ""; + body["searches"].clear(); + body["searches"].push_back(search); + req->body = body.dump(); + post_multi_search(req, res); + ASSERT_EQ("((user_id: 100)) && (foo: bar)", req->params["filter_by"]); + + // when both are empty, don't add () + req->params["filter_by"] = ""; + search["filter_by"] = ""; + body["searches"].clear(); + body["searches"].push_back(search); + req->body = body.dump(); + post_multi_search(req, res); + ASSERT_EQ("(foo: bar)", req->params["filter_by"]); + // try setting max search limit req->embedded_params_vec[0]["limit_multi_searches"] = 0; ASSERT_FALSE(post_multi_search(req, res)); From bf0f7430a0b9732a797c8331e74492f536962300 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sun, 1 Jan 2023 18:00:50 +0530 Subject: [PATCH 08/10] Allow vector query to pass a document ID. --- include/collection_manager.h | 2 - include/field.h | 14 --- include/index.h | 1 + include/vector_query_ops.h | 32 +++++ src/collection.cpp | 6 +- src/collection_manager.cpp | 109 ----------------- src/index.cpp | 5 + src/vector_query_ops.cpp | 159 +++++++++++++++++++++++++ test/collection_manager_test.cpp | 37 ------ test/collection_vector_search_test.cpp | 27 +++++ test/vector_query_ops_test.cpp | 73 ++++++++++++ 11 files changed, 301 insertions(+), 164 deletions(-) create mode 100644 include/vector_query_ops.h create mode 100644 src/vector_query_ops.cpp create mode 100644 test/vector_query_ops_test.cpp diff --git a/include/collection_manager.h b/include/collection_manager.h index 686f5af9..e65d9ce4 100644 --- a/include/collection_manager.h +++ b/include/collection_manager.h @@ -181,8 +181,6 @@ public: static bool parse_sort_by_str(std::string sort_by_str, std::vector& sort_fields); - static bool parse_vector_query_str(std::string vector_query_str, vector_query_t& vector_query); - // symlinks Option resolve_symlink(const std::string & symlink_name) const; diff --git a/include/field.h b/include/field.h index b6057e1b..35813c95 100644 --- a/include/field.h +++ b/include/field.h @@ -609,20 +609,6 @@ struct sort_by { } }; -struct vector_query_t { - std::string field_name; - size_t k = 0; - size_t flat_search_cutoff = 0; - std::vector values; - - void _reset() { - // used for testing only - field_name.clear(); - k = 0; - values.clear(); - } -}; - class GeoPoint { constexpr static const double EARTH_RADIUS = 3958.75; constexpr static const double METER_CONVERT = 1609.00; diff --git a/include/index.h b/include/index.h index 95a8a4ff..de993e7c 100644 --- a/include/index.h +++ b/include/index.h @@ -27,6 +27,7 @@ #include "id_list.h" #include "synonym_index.h" #include "override.h" +#include "vector_query_ops.h" #include "hnswlib/hnswlib.h" static constexpr size_t ARRAY_FACET_DIM = 4; diff --git a/include/vector_query_ops.h b/include/vector_query_ops.h new file mode 100644 index 00000000..d3424e30 --- /dev/null +++ b/include/vector_query_ops.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include +#include "option.h" + +class Collection; + +struct vector_query_t { + std::string field_name; + size_t k = 0; + size_t flat_search_cutoff = 0; + std::vector values; + + uint32_t seq_id = 0; + bool query_doc_given = false; + + void _reset() { + // used for testing only + field_name.clear(); + k = 0; + values.clear(); + seq_id = 0; + query_doc_given = false; + } +}; + +class VectorQueryOps { +public: + static Option parse_vector_query_str(std::string vector_query_str, vector_query_t& vector_query, + const Collection* coll); +}; \ No newline at end of file diff --git a/src/collection.cpp b/src/collection.cpp index c22327ee..ca608293 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -15,6 +15,7 @@ #include "topster.h" #include "logger.h" #include "thread_local_vars.h" +#include "vector_query_ops.h" const std::string override_t::MATCH_EXACT = "exact"; const std::string override_t::MATCH_CONTAINS = "contains"; @@ -921,8 +922,9 @@ Option Collection::search(const std::string & raw_query, return Option(400, "Vector query is supported only on wildcard (q=*) searches."); } - if(!CollectionManager::parse_vector_query_str(vector_query_str, vector_query)) { - return Option(400, "The `vector_query` parameter is malformed."); + auto parse_vector_op = VectorQueryOps::parse_vector_query_str(vector_query_str, vector_query, this); + if(!parse_vector_op.ok()) { + return Option(400, parse_vector_op.error()); } auto vector_field_it = search_schema.find(vector_query.field_name); diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 55b20d1d..69e1a425 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -1411,112 +1411,3 @@ Option CollectionManager::clone_collection(const string& existing_n return Option(new_coll); } - -bool CollectionManager::parse_vector_query_str(std::string vector_query_str, vector_query_t& vector_query) { - // FORMAT: - // field_name([0.34, 0.66, 0.12, 0.68], exact: false, k: 10) - size_t i = 0; - while(i < vector_query_str.size()) { - if(vector_query_str[i] != ':') { - vector_query.field_name += vector_query_str[i]; - i++; - } else { - if(vector_query_str[i] != ':') { - // missing ":" - return false; - } - - // field name is done - i++; - - StringUtils::trim(vector_query.field_name); - - while(i < vector_query_str.size() && vector_query_str[i] != '(') { - i++; - } - - if(vector_query_str[i] != '(') { - // missing "(" - return false; - } - - i++; - - while(i < vector_query_str.size() && vector_query_str[i] != '[') { - i++; - } - - if(vector_query_str[i] != '[') { - // missing opening "[" - return false; - } - - i++; - - std::string values_str; - while(i < vector_query_str.size() && vector_query_str[i] != ']') { - values_str += vector_query_str[i]; - i++; - } - - if(vector_query_str[i] != ']') { - // missing closing "]" - return false; - } - - i++; - - std::vector svalues; - StringUtils::split(values_str, svalues, ","); - - for(auto& svalue: svalues) { - if(!StringUtils::is_float(svalue)) { - return false; - } - - vector_query.values.push_back(std::stof(svalue)); - } - - if(i == vector_query_str.size()-1) { - // missing params - return true; - } - - std::string param_str = vector_query_str.substr(i, (vector_query_str.size() - i)); - std::vector param_kvs; - StringUtils::split(param_str, param_kvs, ","); - - for(auto& param_kv_str: param_kvs) { - if(param_kv_str.back() == ')') { - param_kv_str.pop_back(); - } - - std::vector param_kv; - StringUtils::split(param_kv_str, param_kv, ":"); - if(param_kv.size() != 2) { - return false; - } - - if(param_kv[0] == "k") { - if(!StringUtils::is_uint32_t(param_kv[1])) { - return false; - } - - vector_query.k = std::stoul(param_kv[1]); - } - - if(param_kv[0] == "flat_search_cutoff") { - if(!StringUtils::is_uint32_t(param_kv[1])) { - return false; - } - - vector_query.flat_search_cutoff = std::stoi(param_kv[1]); - } - } - - return true; - } - } - - return false; -} diff --git a/src/index.cpp b/src/index.cpp index 52d56f94..93b7375d 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2561,6 +2561,11 @@ void Index::search(std::vector& field_query_tokens, const std::v for (const auto& dist_label : dist_labels) { uint32 seq_id = dist_label.second; + + if(vector_query.query_doc_given && vector_query.seq_id == seq_id) { + continue; + } + uint64_t distinct_id = seq_id; if (group_limit != 0) { distinct_id = get_distinct_id(group_by_fields, seq_id); diff --git a/src/vector_query_ops.cpp b/src/vector_query_ops.cpp new file mode 100644 index 00000000..b4cd3ffa --- /dev/null +++ b/src/vector_query_ops.cpp @@ -0,0 +1,159 @@ +#include "vector_query_ops.h" +#include "string_utils.h" +#include "collection.h" + +Option VectorQueryOps::parse_vector_query_str(std::string vector_query_str, vector_query_t& vector_query, + const Collection* coll) { + // FORMAT: + // field_name([0.34, 0.66, 0.12, 0.68], exact: false, k: 10) + size_t i = 0; + while(i < vector_query_str.size()) { + if(vector_query_str[i] != ':') { + vector_query.field_name += vector_query_str[i]; + i++; + } else { + if(vector_query_str[i] != ':') { + // missing ":" + return Option(400, "Malformed vector query string: `:` is missing."); + } + + // field name is done + i++; + + StringUtils::trim(vector_query.field_name); + + while(i < vector_query_str.size() && vector_query_str[i] != '(') { + i++; + } + + if(vector_query_str[i] != '(') { + // missing "(" + return Option(400, "Malformed vector query string."); + } + + i++; + + while(i < vector_query_str.size() && vector_query_str[i] != '[') { + i++; + } + + if(vector_query_str[i] != '[') { + // missing opening "[" + return Option(400, "Malformed vector query string."); + } + + i++; + + std::string values_str; + while(i < vector_query_str.size() && vector_query_str[i] != ']') { + values_str += vector_query_str[i]; + i++; + } + + if(vector_query_str[i] != ']') { + // missing closing "]" + return Option(400, "Malformed vector query string."); + } + + i++; + + std::vector svalues; + StringUtils::split(values_str, svalues, ","); + + for(auto& svalue: svalues) { + if(!StringUtils::is_float(svalue)) { + return Option(400, "Malformed vector query string: one of the vector values is not a float."); + } + + vector_query.values.push_back(std::stof(svalue)); + } + + if(i == vector_query_str.size()-1) { + // missing params + if(vector_query.values.empty()) { + // when query values are missing, atleast the `id` parameter must be present + return Option(400, "When a vector query value is empty, an `id` parameter must be present."); + } + + return Option(true); + } + + std::string param_str = vector_query_str.substr(i, (vector_query_str.size() - i)); + std::vector param_kvs; + StringUtils::split(param_str, param_kvs, ","); + + for(auto& param_kv_str: param_kvs) { + if(param_kv_str.back() == ')') { + param_kv_str.pop_back(); + } + + std::vector param_kv; + StringUtils::split(param_kv_str, param_kv, ":"); + if(param_kv.size() != 2) { + return Option(400, "Malformed vector query string."); + } + + if(param_kv[0] == "id") { + if(!vector_query.values.empty()) { + // cannot pass both vector values and id + return Option(400, "Malformed vector query string: cannot pass both vector query " + "and `id` parameter."); + } + + Option id_op = coll->doc_id_to_seq_id(param_kv[1]); + if(!id_op.ok()) { + return Option(400, "Document id referenced in vector query is not found."); + } + + nlohmann::json document; + auto doc_op = coll->get_document_from_store(id_op.get(), document); + if(!doc_op.ok()) { + return Option(400, "Document id referenced in vector query is not found."); + } + + if(!document.contains(vector_query.field_name) || !document[vector_query.field_name].is_array()) { + return Option(400, "Document referenced in vector query does not contain a valid " + "vector field."); + } + + for(auto& fvalue: document[vector_query.field_name]) { + if(!fvalue.is_number_float()) { + return Option(400, "Document referenced in vector query does not contain a valid " + "vector field."); + } + + vector_query.values.push_back(fvalue.get()); + } + + vector_query.query_doc_given = true; + vector_query.seq_id = id_op.get(); + } + + if(param_kv[0] == "k") { + if(!StringUtils::is_uint32_t(param_kv[1])) { + return Option(400, "Malformed vector query string: `k` parameter must be an integer."); + } + + vector_query.k = std::stoul(param_kv[1]); + } + + if(param_kv[0] == "flat_search_cutoff") { + if(!StringUtils::is_uint32_t(param_kv[1])) { + return Option(400, "Malformed vector query string: " + "`flat_search_cutoff` parameter must be an integer."); + } + + vector_query.flat_search_cutoff = std::stoi(param_kv[1]); + } + } + + if(!vector_query.query_doc_given && vector_query.values.empty()) { + return Option(400, "When a vector query value is empty, an `id` parameter must be present."); + } + + return Option(true); + } + } + + return Option(400, "Malformed vector query string."); +} diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index c20edb12..a3faceda 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -989,43 +989,6 @@ TEST_F(CollectionManagerTest, ParseSortByClause) { ASSERT_FALSE(sort_by_parsed); } -TEST_F(CollectionManagerTest, ParseVectorQueryString) { - vector_query_t vector_query; - bool parsed = CollectionManager::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10)", vector_query); - ASSERT_TRUE(parsed); - ASSERT_EQ("vec", vector_query.field_name); - ASSERT_EQ(10, vector_query.k); - std::vector fvs = {0.34, 0.66, 0.12, 0.68}; - ASSERT_EQ(fvs.size(), vector_query.values.size()); - for(size_t i = 0; i < fvs.size(); i++) { - ASSERT_EQ(fvs[i], vector_query.values[i]); - } - - vector_query._reset(); - parsed = CollectionManager::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10)", vector_query); - ASSERT_TRUE(parsed); - - vector_query._reset(); - parsed = CollectionManager::parse_vector_query_str("vec:[0.34, 0.66, 0.12, 0.68], k: 10)", vector_query); - ASSERT_FALSE(parsed); - - vector_query._reset(); - parsed = CollectionManager::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10", vector_query); - ASSERT_TRUE(parsed); - - vector_query._reset(); - parsed = CollectionManager::parse_vector_query_str("vec:(0.34, 0.66, 0.12, 0.68, k: 10)", vector_query); - ASSERT_FALSE(parsed); - - vector_query._reset(); - parsed = CollectionManager::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], )", vector_query); - ASSERT_FALSE(parsed); - - vector_query._reset(); - parsed = CollectionManager::parse_vector_query_str("vec([0.34, 0.66, 0.12, 0.68])", vector_query); - ASSERT_FALSE(parsed); -} - TEST_F(CollectionManagerTest, Presets) { // try getting on a blank slate auto presets = collectionManager.get_presets(); diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index 18467fa2..9ae9a9fb 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -144,6 +144,33 @@ TEST_F(CollectionVectorTest, BasicVectorQuerying) { ASSERT_FALSE(res_op.ok()); ASSERT_EQ("Field `zec` does not have a vector query index.", res_op.error()); + // pass `id` of existing doc instead of vector, query doc should be omitted from results + results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([], id: 1)").get(); + + ASSERT_EQ(2, results["found"].get()); + ASSERT_EQ(2, results["hits"].size()); + + ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get().c_str()); + + // when `id` does not exist, return appropriate error + res_op = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([], id: 100)"); + + ASSERT_FALSE(res_op.ok()); + ASSERT_EQ("Document id referenced in vector query is not found.", res_op.error()); + // only supported with wildcard queries res_op = coll1->search("title", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), diff --git a/test/vector_query_ops_test.cpp b/test/vector_query_ops_test.cpp new file mode 100644 index 00000000..96661fce --- /dev/null +++ b/test/vector_query_ops_test.cpp @@ -0,0 +1,73 @@ +#include +#include "vector_query_ops.h" + +class VectorQueryOpsTest : public ::testing::Test { +protected: + void setupCollection() { + } + + virtual void SetUp() { + setupCollection(); + } + + virtual void TearDown() { + + } +}; + +TEST_F(VectorQueryOpsTest, ParseVectorQueryString) { + vector_query_t vector_query; + auto parsed = VectorQueryOps::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10)", vector_query, nullptr); + ASSERT_TRUE(parsed.ok()); + ASSERT_EQ("vec", vector_query.field_name); + ASSERT_EQ(10, vector_query.k); + std::vector fvs = {0.34, 0.66, 0.12, 0.68}; + ASSERT_EQ(fvs.size(), vector_query.values.size()); + for (size_t i = 0; i < fvs.size(); i++) { + ASSERT_EQ(fvs[i], vector_query.values[i]); + } + + vector_query._reset(); + parsed = VectorQueryOps::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10)", vector_query, nullptr); + ASSERT_TRUE(parsed.ok()); + + vector_query._reset(); + parsed = VectorQueryOps::parse_vector_query_str("vec:([])", vector_query, nullptr); + ASSERT_FALSE(parsed.ok()); + ASSERT_EQ("When a vector query value is empty, an `id` parameter must be present.", parsed.error()); + + // cannot pass both vector and id + vector_query._reset(); + parsed = VectorQueryOps::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], id: 10)", vector_query, nullptr); + ASSERT_FALSE(parsed.ok()); + ASSERT_EQ("Malformed vector query string: cannot pass both vector query and `id` parameter.", parsed.error()); + + vector_query._reset(); + parsed = VectorQueryOps::parse_vector_query_str("vec:([], k: 10)", vector_query, nullptr); + ASSERT_FALSE(parsed.ok()); + ASSERT_EQ("When a vector query value is empty, an `id` parameter must be present.", parsed.error()); + + vector_query._reset(); + parsed = VectorQueryOps::parse_vector_query_str("vec:[0.34, 0.66, 0.12, 0.68], k: 10)", vector_query, nullptr); + ASSERT_FALSE(parsed.ok()); + ASSERT_EQ("Malformed vector query string.", parsed.error()); + + vector_query._reset(); + parsed = VectorQueryOps::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10", vector_query, nullptr); + ASSERT_TRUE(parsed.ok()); + + vector_query._reset(); + parsed = VectorQueryOps::parse_vector_query_str("vec:(0.34, 0.66, 0.12, 0.68, k: 10)", vector_query, nullptr); + ASSERT_FALSE(parsed.ok()); + ASSERT_EQ("Malformed vector query string.", parsed.error()); + + vector_query._reset(); + parsed = VectorQueryOps::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], )", vector_query, nullptr); + ASSERT_FALSE(parsed.ok()); + ASSERT_EQ("Malformed vector query string.", parsed.error()); + + vector_query._reset(); + parsed = VectorQueryOps::parse_vector_query_str("vec([0.34, 0.66, 0.12, 0.68])", vector_query, nullptr); + ASSERT_FALSE(parsed.ok()); + ASSERT_EQ("Malformed vector query string.", parsed.error()); +} From 1a66a25e2f3c071f50d0ecb33a869f6ea53adc6c Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 2 Jan 2023 18:53:36 +0530 Subject: [PATCH 09/10] Enable slow searches logging by default (30s cutoff) --- include/collection.h | 3 ++- include/config.h | 23 ++++++++++++++++++++ include/http_data.h | 37 +++++++++++++++++++++++++-------- include/or_iterator.h | 15 ++++++-------- include/posting_list.h | 15 ++++++-------- include/string_utils.h | 2 ++ include/thread_local_vars.h | 4 ++-- src/collection.cpp | 9 +++++--- src/collection_manager.cpp | 2 +- src/http_server.cpp | 2 -- src/index.cpp | 38 +++++++++++++++++----------------- src/string_utils.cpp | 4 ++++ src/thread_local_vars.cpp | 4 ++-- src/typesense_server_utils.cpp | 2 ++ 14 files changed, 103 insertions(+), 57 deletions(-) diff --git a/include/collection.h b/include/collection.h index a7391cb5..859967b5 100644 --- a/include/collection.h +++ b/include/collection.h @@ -409,7 +409,8 @@ public: const size_t filter_curated_hits_option = 2, const bool prioritize_token_position = false, const std::string& vector_query_str = "", - const bool enable_highlight_v1 = true) const; + const bool enable_highlight_v1 = true, + const uint64_t search_time_start_us = 0) const; Option get_filter_ids(const std::string & simple_filter_query, std::vector>& index_ids); diff --git a/include/config.h b/include/config.h index 305c7f81..59095649 100644 --- a/include/config.h +++ b/include/config.h @@ -59,6 +59,8 @@ private: std::atomic skip_writes; + std::atomic log_slow_searches_time_ms; + protected: Config() { @@ -80,6 +82,7 @@ protected: this->disk_used_max_percentage = 100; this->memory_used_max_percentage = 100; this->skip_writes = false; + this->log_slow_searches_time_ms = 30 * 1000; } Config(Config const&) { @@ -142,6 +145,10 @@ public: this->log_slow_requests_time_ms = log_slow_requests_time_ms; } + void set_log_slow_searches_time_ms(int log_slow_searches_time_ms) { + this->log_slow_searches_time_ms = log_slow_searches_time_ms; + } + void set_healthy_read_lag(size_t healthy_read_lag) { this->healthy_read_lag = healthy_read_lag; } @@ -245,6 +252,10 @@ public: return this->log_slow_requests_time_ms; } + int get_log_slow_searches_time_ms() const { + return this->log_slow_searches_time_ms; + } + size_t get_num_collections_parallel_load() const { return this->num_collections_parallel_load; } @@ -364,6 +375,10 @@ public: this->log_slow_requests_time_ms = std::stoi(get_env("TYPESENSE_LOG_SLOW_REQUESTS_TIME_MS")); } + if(!get_env("TYPESENSE_LOG_SLOW_SEARCHES_TIME_MS").empty()) { + this->log_slow_searches_time_ms = std::stoi(get_env("TYPESENSE_LOG_SLOW_SEARCHES_TIME_MS")); + } + if(!get_env("TYPESENSE_NUM_COLLECTIONS_PARALLEL_LOAD").empty()) { this->num_collections_parallel_load = std::stoi(get_env("TYPESENSE_NUM_COLLECTIONS_PARALLEL_LOAD")); } @@ -513,6 +528,10 @@ public: this->log_slow_requests_time_ms = (int) reader.GetInteger("server", "log-slow-requests-time-ms", -1); } + if(reader.Exists("server", "log-slow-searches-time-ms")) { + this->log_slow_searches_time_ms = (int) reader.GetInteger("server", "log-slow-searches-time-ms", 30*1000); + } + if(reader.Exists("server", "num-collections-parallel-load")) { this->num_collections_parallel_load = (int) reader.GetInteger("server", "num-collections-parallel-load", 0); } @@ -643,6 +662,10 @@ public: this->log_slow_requests_time_ms = options.get("log-slow-requests-time-ms"); } + if(options.exist("log-slow-searches-time-ms")) { + this->log_slow_searches_time_ms = options.get("log-slow-searches-time-ms"); + } + if(options.exist("num-collections-parallel-load")) { this->num_collections_parallel_load = options.get("num-collections-parallel-load"); } diff --git a/include/http_data.h b/include/http_data.h index 9c4228d2..f51ed2d1 100644 --- a/include/http_data.h +++ b/include/http_data.h @@ -261,8 +261,8 @@ struct http_req { chunk_len(0), body(body), body_index(0), data(nullptr), ready(false), log_index(0), is_diposed(false), client_ip(client_ip) { - start_ts = std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()).count(); + const auto& tv = _req->processed_at.at; + start_ts = (tv.tv_sec * 1000 * 1000) + tv.tv_usec; if(_req != nullptr) { is_http_v1 = (_req->version < 0x200); @@ -279,21 +279,40 @@ struct http_req { std::chrono::system_clock::now().time_since_epoch()).count(); uint64_t ms_since_start = (now - start_ts) / 1000; - std::string metric_identifier = http_method + " " + path_without_query; + const std::string metric_identifier = http_method + " " + path_without_query; AppMetrics::get_instance().increment_duration(metric_identifier, ms_since_start); AppMetrics::get_instance().increment_write_metrics(route_hash, ms_since_start); - if(config.get_log_slow_requests_time_ms() >= 0 && int(ms_since_start) >= config.get_log_slow_requests_time_ms()) { + bool log_slow_searches = config.get_log_slow_searches_time_ms() >= 0 && + int(ms_since_start) >= config.get_log_slow_searches_time_ms() && + (path_without_query == "/multi_search" || + StringUtils::ends_with(path_without_query, "/documents/search")); + + bool log_slow_requests = config.get_log_slow_requests_time_ms() >= 0 && + int(ms_since_start) >= config.get_log_slow_requests_time_ms(); + + if(log_slow_searches || log_slow_requests) { // log slow request if logging is enabled std::string query_string = "?"; - for(const auto& kv: params) { - if(kv.first != AUTH_HEADER) { - query_string += kv.first + "=" + kv.second + "&"; + bool is_multi_search_query = (path_without_query == "/multi_search"); + + if(is_multi_search_query) { + StringUtils::erase_char(body, '\n'); + } else { + // ignore params map of multi_search since it is mutated for every search object in the POST body + for(const auto& kv: params) { + if(kv.first != AUTH_HEADER) { + query_string += kv.first + "=" + kv.second + "&"; + } } } + std::string full_url_path = metric_identifier + query_string; - LOG(INFO) << "SLOW REQUEST: " << "(" + std::to_string(ms_since_start) + " ms) " - << client_ip << " " << full_url_path; + + // NOTE: we log the `body` ONLY for multi-search query + LOG(INFO) << "event=slow_request, time=" << ms_since_start << " ms" + << ", client_ip=" << client_ip << ", endpoint=" << full_url_path + << ", body=" << (is_multi_search_query ? body : ""); } } diff --git a/include/or_iterator.h b/include/or_iterator.h index d98b94a9..67fd5ddf 100644 --- a/include/or_iterator.h +++ b/include/or_iterator.h @@ -68,9 +68,8 @@ bool or_iterator_t::intersect(std::vector& its, result_iter_state while(its.size() == it_size && its[0].valid()) { num_processed++; - if (num_processed % 65536 == 0 && - std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) { + if (num_processed % 65536 == 0 && (std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { search_cutoff = true; break; } @@ -100,9 +99,8 @@ bool or_iterator_t::intersect(std::vector& its, result_iter_state while(its.size() == it_size && !at_end2(its)) { num_processed++; - if (num_processed % 65536 == 0 && - std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) { + if (num_processed % 65536 == 0 && (std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { search_cutoff = true; break; } @@ -138,9 +136,8 @@ bool or_iterator_t::intersect(std::vector& its, result_iter_state while(its.size() == it_size && !at_end(its)) { num_processed++; - if (num_processed % 65536 == 0 && - std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) { + if (num_processed % 65536 == 0 && (std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { search_cutoff = true; break; } diff --git a/include/posting_list.h b/include/posting_list.h index 95a57cbc..dea742f7 100644 --- a/include/posting_list.h +++ b/include/posting_list.h @@ -211,9 +211,8 @@ bool posting_list_t::block_intersect(std::vector& it case 1: while(its[0].valid()) { num_processed++; - if (num_processed % 65536 == 0 && - std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) { + if (num_processed % 65536 == 0 && (std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { search_cutoff = true; break; } @@ -228,9 +227,8 @@ bool posting_list_t::block_intersect(std::vector& it case 2: while(!at_end2(its)) { num_processed++; - if (num_processed % 65536 == 0 && - std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) { + if (num_processed % 65536 == 0 && (std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { search_cutoff = true; break; } @@ -249,9 +247,8 @@ bool posting_list_t::block_intersect(std::vector& it default: while(!at_end(its)) { num_processed++; - if (num_processed % 65536 == 0 && - std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) { + if (num_processed % 65536 == 0 && (std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { search_cutoff = true; break; } diff --git a/include/string_utils.h b/include/string_utils.h index 3472b582..01d3dccd 100644 --- a/include/string_utils.h +++ b/include/string_utils.h @@ -315,6 +315,8 @@ struct StringUtils { static void replace_all(std::string& subject, const std::string& search, const std::string& replace); + static void erase_char(std::string& str, const char c); + static std::string trim_curly_spaces(const std::string& str); static bool ends_with(std::string const &str, std::string const &ending); diff --git a/include/thread_local_vars.h b/include/thread_local_vars.h index d4dd725f..0ec8ed1f 100644 --- a/include/thread_local_vars.h +++ b/include/thread_local_vars.h @@ -4,6 +4,6 @@ extern thread_local int64_t write_log_index; // These are used for circuit breaking search requests // NOTE: if you fork off main search thread, care must be taken to initialize these from parent thread values -extern thread_local std::chrono::high_resolution_clock::time_point search_begin; -extern thread_local int64_t search_stop_ms; +extern thread_local uint64_t search_begin_us; +extern thread_local uint64_t search_stop_us; extern thread_local bool search_cutoff; \ No newline at end of file diff --git a/src/collection.cpp b/src/collection.cpp index ca608293..edb6e388 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -867,13 +867,16 @@ Option Collection::search(const std::string & raw_query, const size_t filter_curated_hits_option, const bool prioritize_token_position, const std::string& vector_query_str, - const bool enable_highlight_v1) const { + const bool enable_highlight_v1, + const uint64_t search_time_start_us) const { std::shared_lock lock(mutex); // setup thread local vars - search_stop_ms = search_stop_millis; - search_begin = std::chrono::high_resolution_clock::now(); + search_stop_us = search_stop_millis * 1000; + search_begin_us = (search_time_start_us != 0) ? search_time_start_us : + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); search_cutoff = false; if(raw_query != "*" && raw_search_fields.empty()) { diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 69e1a425..81098b6c 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -767,7 +767,7 @@ Option CollectionManager::do_search(std::map& re size_t filter_curated_hits_option = 2; std::string highlight_fields; bool exhaustive_search = false; - size_t search_cutoff_ms = 3600000; + size_t search_cutoff_ms = 30 * 1000; enable_t split_join_tokens = fallback; size_t max_candidates = 0; std::vector infixes; diff --git a/src/http_server.cpp b/src/http_server.cpp index 6dcf8ed2..ec155639 100644 --- a/src/http_server.cpp +++ b/src/http_server.cpp @@ -474,8 +474,6 @@ int HttpServer::catch_all_handler(h2o_handler_t *_h2o_handler, h2o_req_t *req) { } } - - std::shared_ptr request = std::make_shared(req, rpath->http_method, path_without_query, route_hash, query_map, embedded_params_vec, api_auth_key_sent, body, client_ip); diff --git a/src/index.cpp b/src/index.cpp index 93b7375d..39c80189 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -23,17 +23,17 @@ #include #include "logger.h" -#define RETURN_CIRCUIT_BREAKER if(std::chrono::duration_cast(\ - std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) { \ - search_cutoff = true; \ - return ;\ - } +#define RETURN_CIRCUIT_BREAKER if((std::chrono::duration_cast( \ + std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { \ + search_cutoff = true; \ + return ;\ + } -#define BREAK_CIRCUIT_BREAKER if(std::chrono::duration_cast(\ - std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) { \ - search_cutoff = true; \ - break;\ - } +#define BREAK_CIRCUIT_BREAKER if((std::chrono::duration_cast( \ + std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { \ + search_cutoff = true; \ + break;\ + } spp::sparse_hash_map Index::text_match_sentinel_value; spp::sparse_hash_map Index::seq_id_sentinel_value; @@ -2328,8 +2328,8 @@ void Index::search_infix(const std::string& query, const std::string& field_name auto search_tree = search_index.at(field_name); - const auto parent_search_begin = search_begin; - const auto parent_search_stop_ms = search_stop_ms; + const auto parent_search_begin = search_begin_us; + const auto parent_search_stop_ms = search_stop_us; auto parent_search_cutoff = search_cutoff; for(auto infix_set: infix_sets) { @@ -2337,7 +2337,7 @@ void Index::search_infix(const std::string& query, const std::string& field_name &num_processed, &m_process, &cv_process, &parent_search_begin, &parent_search_stop_ms, &parent_search_cutoff]() { - search_begin = parent_search_begin; + search_begin_us = parent_search_begin; search_cutoff = parent_search_cutoff; auto op_search_stop_ms = parent_search_stop_ms/2; @@ -2362,8 +2362,8 @@ void Index::search_infix(const std::string& query, const std::string& field_name // check for search cutoff but only once every 2^10 docs to reduce overhead if(((num_iterated + 1) % (1 << 12)) == 0) { - if (std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - search_begin).count() > op_search_stop_ms) { + if ((std::chrono::duration_cast(std::chrono::system_clock::now(). + time_since_epoch()).count() - search_begin_us) > op_search_stop_ms) { search_cutoff = true; break; } @@ -4335,8 +4335,8 @@ void Index::search_wildcard(filter_node_t const* const& filter_tree_root, size_t num_queued = 0; size_t filter_index = 0; - const auto parent_search_begin = search_begin; - const auto parent_search_stop_ms = search_stop_ms; + const auto parent_search_begin = search_begin_us; + const auto parent_search_stop_ms = search_stop_us; auto parent_search_cutoff = search_cutoff; for(size_t thread_id = 0; thread_id < num_threads && filter_index < filter_ids_length; thread_id++) { @@ -4361,8 +4361,8 @@ void Index::search_wildcard(filter_node_t const* const& filter_tree_root, batch_result_ids, batch_res_len, &num_processed, &m_process, &cv_process]() { - search_begin = parent_search_begin; - search_stop_ms = parent_search_stop_ms; + search_begin_us = parent_search_begin; + search_stop_us = parent_search_stop_ms; search_cutoff = parent_search_cutoff; size_t filter_index = 0; diff --git a/src/string_utils.cpp b/src/string_utils.cpp index a4374b65..e7893004 100644 --- a/src/string_utils.cpp +++ b/src/string_utils.cpp @@ -217,6 +217,10 @@ void StringUtils::replace_all(std::string& subject, const std::string& search, c } } +void StringUtils::erase_char(std::string& str, const char c) { + str.erase(std::remove(str.begin(), str.end(), c), str.cend()); +} + std::string StringUtils::trim_curly_spaces(const std::string& str) { std::string left_trimmed; int i = 0; diff --git a/src/thread_local_vars.cpp b/src/thread_local_vars.cpp index 4cb3562a..258d8b5f 100644 --- a/src/thread_local_vars.cpp +++ b/src/thread_local_vars.cpp @@ -2,6 +2,6 @@ #include "thread_local_vars.h" thread_local int64_t write_log_index = 0; -thread_local std::chrono::high_resolution_clock::time_point search_begin; -thread_local int64_t search_stop_ms; +thread_local uint64_t search_begin_us; +thread_local uint64_t search_stop_us; thread_local bool search_cutoff = false; diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index 4ad8d321..64b945fa 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -105,6 +105,8 @@ void init_cmdline_options(cmdline::parser & options, int argc, char **argv) { options.add("memory-used-max-percentage", '\0', "Reject writes when memory usage exceeds this percentage. Default: 100 (never reject).", false, 100); options.add("skip-writes", '\0', "Skip all writes except config changes. Default: false.", false, false); + options.add("log-slow-searches-time-ms", '\0', "When >= 0, searches that take longer than this duration are logged.", false, 30*1000); + // DEPRECATED options.add("listen-address", 'h', "[DEPRECATED: use `api-address`] Address to which Typesense API service binds.", false, "0.0.0.0"); options.add("listen-port", 'p', "[DEPRECATED: use `api-port`] Port on which Typesense API service listens.", false, 8108); From f380bd5fa96a2ba5a2d56fe06187585a392fd2ca Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 2 Jan 2023 19:33:31 +0530 Subject: [PATCH 10/10] Take care of underlying req being null. --- include/collection_manager.h | 3 ++- include/http_data.h | 8 +++++--- src/collection_manager.cpp | 7 +++++-- src/core_api.cpp | 6 ++++-- test/collection_manager_test.cpp | 7 +++++-- 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/collection_manager.h b/include/collection_manager.h index e65d9ce4..6c4f8e1f 100644 --- a/include/collection_manager.h +++ b/include/collection_manager.h @@ -177,7 +177,8 @@ public: static Option do_search(std::map& req_params, nlohmann::json& embedded_params, - std::string& results_json_str); + std::string& results_json_str, + uint64_t start_ts); static bool parse_sort_by_str(std::string sort_by_str, std::vector& sort_fields); diff --git a/include/http_data.h b/include/http_data.h index f51ed2d1..29d9bd2e 100644 --- a/include/http_data.h +++ b/include/http_data.h @@ -261,11 +261,13 @@ struct http_req { chunk_len(0), body(body), body_index(0), data(nullptr), ready(false), log_index(0), is_diposed(false), client_ip(client_ip) { - const auto& tv = _req->processed_at.at; - start_ts = (tv.tv_sec * 1000 * 1000) + tv.tv_usec; - if(_req != nullptr) { + const auto& tv = _req->processed_at.at; + start_ts = (tv.tv_sec * 1000 * 1000) + tv.tv_usec; is_http_v1 = (_req->version < 0x200); + } else { + start_ts = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); } } diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 81098b6c..ffbb704f 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -630,7 +630,9 @@ Option add_unsigned_int_list_param(const std::string& param_name, const st Option CollectionManager::do_search(std::map& req_params, nlohmann::json& embedded_params, - std::string& results_json_str) { + std::string& results_json_str, + uint64_t start_ts) { + auto begin = std::chrono::high_resolution_clock::now(); const char *NUM_TYPOS = "num_typos"; @@ -981,7 +983,8 @@ Option CollectionManager::do_search(std::map& re filter_curated_hits_option, prioritize_token_position, vector_query, - enable_highlight_v1 + enable_highlight_v1, + start_ts ); uint64_t timeMillis = std::chrono::duration_cast( diff --git a/src/core_api.cpp b/src/core_api.cpp index acb351ce..1599ddd2 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -376,7 +376,8 @@ bool get_search(const std::shared_ptr& req, const std::shared_ptr search_op = CollectionManager::do_search(req->params, req->embedded_params_vec[0], results_json_str); + Option search_op = CollectionManager::do_search(req->params, req->embedded_params_vec[0], + results_json_str, req->start_ts); if(!search_op.ok()) { res->set(search_op.code(), search_op.error()); @@ -523,7 +524,8 @@ bool post_multi_search(const std::shared_ptr& req, const std::shared_p } std::string results_json_str; - Option search_op = CollectionManager::do_search(req->params, req->embedded_params_vec[i], results_json_str); + Option search_op = CollectionManager::do_search(req->params, req->embedded_params_vec[i], + results_json_str, req->start_ts); if(search_op.ok()) { response["results"].push_back(nlohmann::json::parse(results_json_str)); diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index a3faceda..c0643ed2 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -526,7 +526,10 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) { embedded_params["filter_by"] = "points: 200"; std::string json_res; - auto search_op = collectionManager.do_search(req_params, embedded_params, json_res); + auto now_ts = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + + auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); ASSERT_TRUE(search_op.ok()); nlohmann::json res_obj = nlohmann::json::parse(json_res); @@ -540,7 +543,7 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) { req_params["filter_by"] = "year: 1922"; req_params["q"] = "*"; - search_op = collectionManager.do_search(req_params, embedded_params, json_res); + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); ASSERT_TRUE(search_op.ok()); res_obj = nlohmann::json::parse(json_res);