From f1cd6038ea064c179bc80f0dfa4ad31bd88882cd Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 21 Aug 2023 16:44:37 +0530 Subject: [PATCH] Trim query suggestions before aggregation. --- include/analytics_manager.h | 2 +- include/tokenizer.h | 2 +- include/tsconfig.h | 4 ++ src/analytics_manager.cpp | 3 +- src/collection.cpp | 1 + src/collection_manager.cpp | 2 +- src/tokenizer.cpp | 12 ++++-- test/collection_manager_test.cpp | 64 ++++++++++++++++++++++++++++++++ 8 files changed, 82 insertions(+), 8 deletions(-) diff --git a/include/analytics_manager.h b/include/analytics_manager.h index 4207f7cf..0f098a7a 100644 --- a/include/analytics_manager.h +++ b/include/analytics_manager.h @@ -79,7 +79,7 @@ public: Option remove_rule(const std::string& name); void add_suggestion(const std::string& query_collection, - std::string& query, bool live_query, const std::string& user_id); + const std::string& query, bool live_query, const std::string& user_id); void stop(); diff --git a/include/tokenizer.h b/include/tokenizer.h index a6a88ab6..36de0940 100644 --- a/include/tokenizer.h +++ b/include/tokenizer.h @@ -88,5 +88,5 @@ public: bool should_skip_char(char c); - static void normalize_ascii(std::string& text); + static std::string normalize_ascii_no_spaces(const std::string& text); }; \ No newline at end of file diff --git a/include/tsconfig.h b/include/tsconfig.h index 1e6d8f62..dc382e80 100644 --- a/include/tsconfig.h +++ b/include/tsconfig.h @@ -787,6 +787,10 @@ public: cors_domains.insert(cors_values_vec.begin(), cors_values_vec.end()); } + void set_enable_search_analytics(bool enable_search_analytics) { + this->enable_search_analytics = enable_search_analytics; + } + // validation Option is_valid() { diff --git a/src/analytics_manager.cpp b/src/analytics_manager.cpp index b23c6cc2..5385bdf9 100644 --- a/src/analytics_manager.cpp +++ b/src/analytics_manager.cpp @@ -203,7 +203,7 @@ Option AnalyticsManager::remove_popular_queries_index(const std::string &n return
Option(true); } -void AnalyticsManager::add_suggestion(const std::string &query_collection, std::string &query, +void AnalyticsManager::add_suggestion(const std::string &query_collection, const std::string& query, const bool live_query, const std::string& user_id) { // look up suggestion collections for the query collection std::unique_lock lock(mutex); @@ -212,7 +212,6 @@ void AnalyticsManager::add_suggestion(const std::string &query_collection, std:: for(const auto& suggestion_collection: suggestion_collections_it->second) { const auto& popular_queries_it = popular_queries.find(suggestion_collection); if(popular_queries_it != popular_queries.end()) { - Tokenizer::normalize_ascii(query); popular_queries_it->second->add(query, live_query, user_id); } } diff --git a/src/collection.cpp b/src/collection.cpp index 50aa82fe..7775b367 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -4952,6 +4952,7 @@ void Collection::hide_credential(nlohmann::json& json, const std::string& creden } } } + Option Collection::truncate_after_top_k(const string &field_name, size_t k) { std::shared_lock slock(mutex); diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 9aec9e5c..7ded7e02 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -1112,7 +1112,7 @@ Option CollectionManager::do_search(std::map& re if(Config::get_instance().get_enable_search_analytics()) { if(result.count("found") != 0 && result["found"].get() != 0) { - std::string analytics_query = raw_query; + std::string analytics_query = Tokenizer::normalize_ascii_no_spaces(raw_query); AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query, true, req_params["x-typesense-user-id"]); } diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 464349ed..5688e27d 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "tokenizer.h" Tokenizer::Tokenizer(const std::string& input, bool normalize, bool no_op, 
const std::string& locale, @@ -349,10 +350,15 @@ bool Tokenizer::should_skip_char(char c) { return is_ascii_char(c) && get_stream_mode(c) != INDEX; } -void Tokenizer::normalize_ascii(std::string& text) { - for(size_t i = 0; i < text.size(); i++) { +std::string Tokenizer::normalize_ascii_no_spaces(const std::string& text) { + std::string analytics_query = text; + StringUtils::trim(analytics_query); + + for(size_t i = 0; i < analytics_query.size(); i++) { - if(is_ascii_char(text[i])) { - text[i] = std::tolower(text[i]); + if(is_ascii_char(analytics_query[i])) { /* classify the byte that is actually lowercased: after trim() the copy's indices no longer line up with text */ + analytics_query[i] = std::tolower(analytics_query[i]); } } + + return analytics_query; } diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index e821e51b..12ed2cd3 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include "string_utils.h" #include "collection.h" @@ -24,6 +25,8 @@ protected: collectionManager.init(store, 1.0, "auth_key", quit); collectionManager.load(8, 1000); + AnalyticsManager::get_instance().init(store); + schema = R"({ "name": "collection1", "enable_nested_fields": true, @@ -587,6 +590,67 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) { collectionManager.drop_collection("coll1"); } +TEST_F(CollectionManagerTest, QuerySuggestionsShouldBeTrimmed) { + std::vector fields = {field("title", field_types::STRING, false, false, true, "", -1, 1), + field("year", field_types::INT32, false), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["title"] = "Tom Sawyer"; + doc1["year"] = 1876; + doc1["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + + Config::get_instance().set_enable_search_analytics(true); + + nlohmann::json analytics_rule = R"({ + "name": "top_search_queries", + "type": "popular_queries", + "params": { + "limit": 100,
"source": { + "collections": ["coll1"] + }, + "destination": { + "collection": "top_queries" + } + } + })"_json; + + auto create_op = AnalyticsManager::get_instance().create_rule(analytics_rule, false, true); + ASSERT_TRUE(create_op.ok()); + + nlohmann::json embedded_params; + std::map req_params; + req_params["collection"] = "coll1"; + req_params["q"] = " tom "; + req_params["query_by"] = "title"; + + std::string json_res; + auto now_ts = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + + auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + json_res.clear(); + req_params["q"] = " "; + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + // check that suggestions have been trimmed + auto popular_queries = AnalyticsManager::get_instance().get_popular_queries(); + ASSERT_EQ(2, popular_queries["top_queries"]->get_user_prefix_queries()[""].size()); + ASSERT_EQ("tom", popular_queries["top_queries"]->get_user_prefix_queries()[""][0].query); + ASSERT_EQ("", popular_queries["top_queries"]->get_user_prefix_queries()[""][1].query); + + collectionManager.drop_collection("coll1"); +} + TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) { Collection *coll1;