From 9d9ffd3bf9a9c4fa5942f2b33fd45edb8edfc590 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 27 Dec 2023 15:50:41 +0530 Subject: [PATCH] Add option to expand search query for suggestion aggregation. --- include/analytics_manager.h | 8 ++- include/collection.h | 4 ++ include/query_analytics.h | 8 ++- include/tokenizer.h | 2 + src/analytics_manager.cpp | 16 ++++-- src/collection.cpp | 54 +++++++++++++++++++-- src/collection_manager.cpp | 5 +- src/event_manager.cpp | 2 +- src/index.cpp | 2 +- src/query_analytics.cpp | 13 +++-- src/tokenizer.cpp | 5 ++ test/analytics_manager_test.cpp | 67 +++++++++++++++++++++++--- test/collection_override_test.cpp | 3 ++ test/collection_specific_more_test.cpp | 27 +++++++++++ test/popular_queries_test.cpp | 20 ++++---- 15 files changed, 201 insertions(+), 35 deletions(-) diff --git a/include/analytics_manager.h b/include/analytics_manager.h index 0aaa43dc..e3c21dbe 100644 --- a/include/analytics_manager.h +++ b/include/analytics_manager.h @@ -120,6 +120,7 @@ private: std::vector query_collections; size_t limit; std::string rule_type; + bool expand_query = false; void to_json(nlohmann::json& obj) const { obj["name"] = name; @@ -128,6 +129,10 @@ private: obj["params"]["limit"] = limit; obj["params"]["source"]["collections"] = query_collections; obj["params"]["destination"]["collection"] = suggestion_collection; + + if(rule_type == POPULAR_QUERIES_TYPE) { + obj["params"]["expand_query"] = expand_query; + } } }; @@ -195,7 +200,8 @@ public: Option remove_rule(const std::string& name); void add_suggestion(const std::string& query_collection, - const std::string& query, bool live_query, const std::string& user_id); + const std::string& query, const std::string& expanded_query, + bool live_query, const std::string& user_id); void stop(); diff --git a/include/collection.h b/include/collection.h index bb0f581a..c2d3a2e8 100644 --- a/include/collection.h +++ b/include/collection.h @@ -681,6 +681,10 @@ public: friend class filter_result_iterator_t; std::shared_mutex& get_lifecycle_mutex(); + + void expand_search_query(const string& raw_query, size_t offset, size_t total, const search_args* search_params, + const std::vector>& result_group_kvs, + const std::vector& raw_search_fields, string& first_q) const; }; template diff --git a/include/query_analytics.h b/include/query_analytics.h index c6aca8c3..3becb3ff 100644 --- a/include/query_analytics.h +++ b/include/query_analytics.h @@ -23,6 +23,8 @@ private: size_t k; const size_t max_size; + bool expand_query = false; + // counts aggregated within the current node tsl::htrie_map local_counts; std::shared_mutex lmutex; @@ -34,8 +36,8 @@ public: QueryAnalytics(size_t k); - void add(const std::string& value, const bool live_query, const std::string& user_id, - uint64_t now_ts_us = 0); + void add(const std::string& value, const std::string& expanded_key, + const bool live_query, const std::string& user_id, uint64_t now_ts_us = 0); void compact_user_queries(uint64_t now_ts_us); @@ -48,4 +50,6 @@ public: std::unordered_map> get_user_prefix_queries(); tsl::htrie_map get_local_counts(); + + void set_expand_query(bool expand_query); }; diff --git a/include/tokenizer.h b/include/tokenizer.h index 36de0940..e6726ad4 100644 --- a/include/tokenizer.h +++ b/include/tokenizer.h @@ -89,4 +89,6 @@ public: bool should_skip_char(char c); static std::string normalize_ascii_no_spaces(const std::string& text); + + static bool has_word_tokenizer(const std::string& locale); }; \ No newline at end of file diff --git a/src/analytics_manager.cpp b/src/analytics_manager.cpp index dc820b40..f76dbd60 100644 --- a/src/analytics_manager.cpp +++ b/src/analytics_manager.cpp @@ -72,11 +72,16 @@ Option AnalyticsManager::create_queries_index(nlohmann::json &payload, boo } size_t limit = 1000; + bool expand_query = false; if(params.contains("limit") && params["limit"].is_number_integer()) { limit = params["limit"].get(); } + if(params.contains("expand_query") && params["expand_query"].is_boolean()) { + expand_query = params["expand_query"].get(); + } + if(!params["source"].contains("collections") || !params["source"]["collections"].is_array()) { return Option(400, "Must contain a valid list of source collections."); } @@ -99,6 +104,7 @@ Option AnalyticsManager::create_queries_index(nlohmann::json &payload, boo suggestion_config.name = suggestion_config_name; suggestion_config.suggestion_collection = suggestion_collection; suggestion_config.limit = limit; + suggestion_config.expand_query = expand_query; suggestion_config.rule_type = payload["type"]; if(payload["type"] == POPULAR_QUERIES_TYPE) { @@ -150,7 +156,8 @@ Option AnalyticsManager::create_queries_index(nlohmann::json &payload, boo } if(payload["type"] == POPULAR_QUERIES_TYPE) { - QueryAnalytics *popularQueries = new QueryAnalytics(limit); + QueryAnalytics* popularQueries = new QueryAnalytics(limit); + popularQueries->set_expand_query(suggestion_config.expand_query); popular_queries.emplace(suggestion_collection, popularQueries); } else if(payload["type"] == NOHITS_QUERIES_TYPE) { QueryAnalytics *noresultsQueries = new QueryAnalytics(limit); @@ -256,7 +263,8 @@ Option AnalyticsManager::remove_queries_index(const std::string &name) { return Option(true); } -void AnalyticsManager::add_suggestion(const std::string &query_collection, const std::string& query, +void AnalyticsManager::add_suggestion(const std::string &query_collection, + const std::string& query, const std::string& expanded_query, const bool live_query, const std::string& user_id) { // look up suggestion collections for the query collection std::unique_lock lock(mutex); @@ -265,7 +273,7 @@ void AnalyticsManager::add_suggestion(const std::string &query_collection, const for(const auto& suggestion_collection: suggestion_collections_it->second) { const auto& popular_queries_it = popular_queries.find(suggestion_collection); if(popular_queries_it != popular_queries.end()) { - popular_queries_it->second->add(query, live_query, user_id); + popular_queries_it->second->add(query, expanded_query, live_query, user_id); } } } @@ -327,7 +335,7 @@ void AnalyticsManager::add_nohits_query(const std::string &query_collection, con for(const auto& suggestion_collection: suggestion_collections_it->second) { const auto& noresults_queries_it = nohits_queries.find(suggestion_collection); if(noresults_queries_it != nohits_queries.end()) { - noresults_queries_it->second->add(query, live_query, user_id); + noresults_queries_it->second->add(query, query, live_query, user_id); } } } diff --git a/src/collection.cpp b/src/collection.cpp index e2564a40..788c9383 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2381,6 +2381,10 @@ Option Collection::search(std::string raw_query, nlohmann::json docs_array = nlohmann::json::array(); + // handle analytics query expansion + std::string first_q = raw_query; + expand_search_query(raw_query, offset, total, search_params, result_group_kvs, raw_search_fields, first_q); + // construct results array for(long result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) { const std::vector & kv_group = result_group_kvs[result_kvs_index]; @@ -2774,9 +2778,7 @@ Option Collection::search(std::string raw_query, highlight_t highlight; if(!facet_query.query.empty()) { - bool is_cyrillic = Tokenizer::is_cyrillic(the_field.locale); - bool use_word_tokenizer = the_field.locale == "th" || the_field.locale == "ja" || - Tokenizer::is_cyrillic(the_field.locale); + bool use_word_tokenizer = Tokenizer::has_word_tokenizer(the_field.locale); bool normalise = !use_word_tokenizer; std::vector fquery_tokens; @@ -2911,6 +2913,7 @@ Option Collection::search(std::string raw_query, result["request_params"]["collection_name"] = name; result["request_params"]["per_page"] = per_page; result["request_params"]["q"] = raw_query; + result["request_params"]["first_q"] = first_q; if(!override_metadata.empty()) { result["metadata"] = override_metadata; @@ -2922,6 +2925,48 @@ Option Collection::search(std::string raw_query, return Option(result); } +void Collection::expand_search_query(const string& raw_query, size_t offset, size_t total, const search_args* search_params, + const std::vector>& result_group_kvs, + const std::vector& raw_search_fields, string& first_q) const { + if(!Config::get_instance().get_enable_search_analytics()) { + return ; + } + + if(offset == 0 && !raw_search_fields.empty() && !search_params->searched_queries.empty() && + total != 0 && !result_group_kvs.empty()) { + // we have to map raw_query (which could contain a prefix) back to expanded version + auto search_field_it = search_schema.find(raw_search_fields[0]); + if(search_field_it == search_schema.end() || Tokenizer::has_word_tokenizer(search_field_it->locale)) { + return ; + } + + first_q = ""; + auto q_index = result_group_kvs[0][0]->query_index; + if(q_index >= search_params->searched_queries.size()) { + return ; + } + + const auto& qleaves = search_params->searched_queries[q_index]; + Tokenizer tokenizer(raw_query, true, false, search_field_it->locale, symbols_to_index, token_separators); + std::string raw_token; + size_t raw_token_index = 0, tok_start = 0, tok_end = 0; + + while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) { + if(raw_token_index < qleaves.size()) { + auto leaf = qleaves[raw_token_index]; + std::string tok(reinterpret_cast(leaf->key), leaf->key_len - 1); + if(StringUtils::begins_with(tok, raw_token)) { + first_q += tok + " "; + } + } + } + + if(!first_q.empty()) { + first_q.pop_back(); + } + } +} + void Collection::copy_highlight_doc(std::vector& hightlight_items, const bool nested_fields_enabled, const nlohmann::json& src, nlohmann::json& dst) { @@ -3577,8 +3622,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea tsl::htrie_set matched_tokens; - bool use_word_tokenizer = search_field.locale == "th" || search_field.locale == "ja" || - Tokenizer::is_cyrillic(search_field.locale); + bool use_word_tokenizer = Tokenizer::has_word_tokenizer(search_field.locale); bool normalise = !use_word_tokenizer; std::vector raw_query_tokens; diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 3939162f..6227aff8 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -1544,7 +1544,10 @@ Option CollectionManager::do_search(std::map& re if(Config::get_instance().get_enable_search_analytics()) { if(result.count("found") != 0 && result["found"].get() != 0) { std::string analytics_query = Tokenizer::normalize_ascii_no_spaces(raw_query); - AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query, + const std::string& expanded_query = Tokenizer::normalize_ascii_no_spaces( + result["request_params"]["first_q"].get()); + + AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query, expanded_query, true, req_params["x-typesense-user-id"]); AnalyticsManager::get_instance().add_query_hits_count(orig_coll_name, analytics_query, req_params["x-typesense-user-id"], diff --git a/src/event_manager.cpp b/src/event_manager.cpp index c759d430..03b512e0 100644 --- a/src/event_manager.cpp +++ b/src/event_manager.cpp @@ -45,7 +45,7 @@ Option EventManager::add_event(const nlohmann::json& event, const std::str for(const auto& coll: event_data_val["collections"]) { std::string query = event_data_query_it.get(); - AnalyticsManager::get_instance().add_suggestion(coll.get(), query, false, ""); + AnalyticsManager::get_instance().add_suggestion(coll.get(), query, query, false, ""); } } else if(event_type == "query_click") { if (!event.contains("data")) { diff --git a/src/index.cpp b/src/index.cpp index 0287f443..5f8c5a38 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1985,7 +1985,7 @@ void Index::collate_included_ids(const std::vector& q_included_tokens, scores[1] = int64_t(1); scores[2] = int64_t(1); - KV kv(searched_queries.size(), seq_id, distinct_id, 0, scores); + KV kv(0, seq_id, distinct_id, 0, scores); curated_topster->add(&kv); } } diff --git a/src/query_analytics.cpp b/src/query_analytics.cpp index 9bf56a94..efd78964 100644 --- a/src/query_analytics.cpp +++ b/src/query_analytics.cpp @@ -8,7 +8,8 @@ QueryAnalytics::QueryAnalytics(size_t k) : k(k), max_size(k * 2) { } -void QueryAnalytics::add(const std::string& key, const bool live_query, const std::string& user_id, uint64_t now_ts_us) { +void QueryAnalytics::add(const std::string& key, const std::string& expanded_key, + const bool live_query, const std::string& user_id, uint64_t now_ts_us) { if(live_query) { // live query must be aggregated first to their final form as they could be prefix queries if(now_ts_us == 0) { @@ -23,7 +24,9 @@ void QueryAnalytics::add(const std::string& key, const bool live_query, const st auto& queries = user_prefix_queries[user_id]; if(queries.size() < 100) { - queries.emplace_back(key, now_ts_us); + // only live queries could send expanded queries + const std::string& actual_key = expand_query ? expanded_key : key; + queries.emplace_back(actual_key, now_ts_us); } umutex.unlock(); @@ -90,7 +93,7 @@ void QueryAnalytics::compact_user_queries(uint64_t now_ts_us) { (queries[i + 1].timestamp - queries[i].timestamp); if(diff_micros > QUERY_FINALIZATION_INTERVAL_MICROS) { - add(queries[i].query, false, ""); + add(queries[i].query, queries[i].query, false, ""); last_consolidated_index = i; } } @@ -116,3 +119,7 @@ tsl::htrie_map QueryAnalytics::get_local_counts() { std::unique_lock lk(lmutex); return local_counts; } + +void QueryAnalytics::set_expand_query(bool expand_query) { + this->expand_query = expand_query; +} diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 70a9bc4b..221fbb94 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -370,3 +370,8 @@ std::string Tokenizer::normalize_ascii_no_spaces(const std::string& text) { return analytics_query; } + +bool Tokenizer::has_word_tokenizer(const std::string& locale) { + bool use_word_tokenizer = locale == "th" || locale == "ja" || Tokenizer::is_cyrillic(locale); + return use_word_tokenizer; +} diff --git a/test/analytics_manager_test.cpp b/test/analytics_manager_test.cpp index a057bd18..fcd5ae21 100644 --- a/test/analytics_manager_test.cpp +++ b/test/analytics_manager_test.cpp @@ -85,29 +85,82 @@ TEST_F(AnalyticsManagerTest, AddSuggestion) { auto create_op = analyticsManager.create_rule(analytics_rule, false, true); ASSERT_TRUE(create_op.ok()); - std::string q = "foobar"; - analyticsManager.add_suggestion("titles", q, true, "1"); + std::string q = "coo"; + analyticsManager.add_suggestion("titles", q, "cool", true, "1"); auto popularQueries = analyticsManager.get_popular_queries(); auto userQueries = popularQueries["top_queries"]->get_user_prefix_queries()["1"]; ASSERT_EQ(1, userQueries.size()); - ASSERT_EQ("foobar", userQueries[0].query); + ASSERT_EQ("coo", userQueries[0].query); // expanded query is NOT stored since it's not enabled // add another query which is more popular q = "buzzfoo"; - analyticsManager.add_suggestion("titles", q, true, "1"); - analyticsManager.add_suggestion("titles", q, true, "2"); - analyticsManager.add_suggestion("titles", q, true, "3"); + analyticsManager.add_suggestion("titles", q, q, true, "1"); + analyticsManager.add_suggestion("titles", q, q, true, "2"); + analyticsManager.add_suggestion("titles", q, q, true, "3"); popularQueries = analyticsManager.get_popular_queries(); userQueries = popularQueries["top_queries"]->get_user_prefix_queries()["1"]; ASSERT_EQ(2, userQueries.size()); - ASSERT_EQ("foobar", userQueries[0].query); + ASSERT_EQ("coo", userQueries[0].query); ASSERT_EQ("buzzfoo", userQueries[1].query); ASSERT_TRUE(analyticsManager.remove_rule("top_search_queries").ok()); } +TEST_F(AnalyticsManagerTest, AddSuggestionWithExpandedQuery) { + nlohmann::json titles_schema = R"({ + "name": "titles", + "fields": [ + {"name": "title", "type": "string"} + ] + })"_json; + + Collection* titles_coll = collectionManager.create_collection(titles_schema).get(); + + nlohmann::json doc; + doc["title"] = "Cool trousers"; + ASSERT_TRUE(titles_coll->add(doc.dump()).ok()); + + // create a collection to store suggestions + nlohmann::json suggestions_schema = R"({ + "name": "top_queries", + "fields": [ + {"name": "q", "type": "string" }, + {"name": "count", "type": "int32" } + ] + })"_json; + + Collection* suggestions_coll = collectionManager.create_collection(suggestions_schema).get(); + + nlohmann::json analytics_rule = R"({ + "name": "top_search_queries", + "type": "popular_queries", + "params": { + "limit": 100, + "expand_query": true, + "source": { + "collections": ["titles"] + }, + "destination": { + "collection": "top_queries" + } + } + })"_json; + + auto create_op = analyticsManager.create_rule(analytics_rule, false, true); + ASSERT_TRUE(create_op.ok()); + + analyticsManager.add_suggestion("titles", "c", "cool", true, "1"); + + auto popularQueries = analyticsManager.get_popular_queries(); + auto userQueries = popularQueries["top_queries"]->get_user_prefix_queries()["1"]; + ASSERT_EQ(1, userQueries.size()); + ASSERT_EQ("cool", userQueries[0].query); + + ASSERT_TRUE(analyticsManager.remove_rule("top_search_queries").ok()); +} + TEST_F(AnalyticsManagerTest, GetAndDeleteSuggestions) { nlohmann::json analytics_rule = R"({ "name": "top_search_queries", diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index 6d06df10..4812bf37 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -56,6 +56,8 @@ protected: }; TEST_F(CollectionOverrideTest, ExcludeIncludeExactQueryMatch) { + Config::get_instance().set_enable_search_analytics(true); + nlohmann::json override_json = { {"id", "exclude-rule"}, { @@ -207,6 +209,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeExactQueryMatch) { ASSERT_EQ(4, results["found"].get()); coll_mul_fields->remove_override("include-rule"); + Config::get_instance().set_enable_search_analytics(false); } TEST_F(CollectionOverrideTest, OverrideJSONValidation) { diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 05361729..e44afbf3 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -2555,6 +2555,33 @@ TEST_F(CollectionSpecificMoreTest, CrossFieldTypoAndPrefixWithWeights) { ASSERT_EQ(1, res["hits"].size()); } +TEST_F(CollectionSpecificMoreTest, AnalyticsFullFirstQuery) { + Config::get_instance().set_enable_search_analytics(true); + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "title", "type": "string"}, + {"name": "color", "type": "string"} + ] + })"_json; + + Collection* coll1 = collectionManager.create_collection(schema).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["title"] = "Cool trousers"; + doc["color"] = "blue"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto res = coll1->search("co", {"title", "color"}, "", {}, {}, {2, 0}, 10, 1, FREQUENCY, {true}, 0, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, + "", "", {2, 3}).get(); + ASSERT_EQ(1, res["hits"].size()); + ASSERT_EQ("cool", res["request_params"]["first_q"].get()); + Config::get_instance().set_enable_search_analytics(false); +} + TEST_F(CollectionSpecificMoreTest, TruncateAterTopK) { nlohmann::json schema = R"({ "name": "coll1", diff --git a/test/popular_queries_test.cpp b/test/popular_queries_test.cpp index 32462b52..3120edc6 100644 --- a/test/popular_queries_test.cpp +++ b/test/popular_queries_test.cpp @@ -25,7 +25,7 @@ TEST_F(PopularQueriesTest, PrefixQueryCompaction) { ASSERT_TRUE(queries.empty()); // compaction after user has typed first prefix but before compaction interval has happened - pq.add("f", true, "0", now_ts_us+1); + pq.add("f", "f", true, "0", now_ts_us+1); pq.compact_user_queries(now_ts_us+2); queries = pq.get_user_prefix_queries(); ASSERT_EQ(1, queries.size()); @@ -46,9 +46,9 @@ TEST_F(PopularQueriesTest, PrefixQueryCompaction) { // 3 letter search pq.reset_local_counts(); - pq.add("f", true, "0", now_ts_us+1); - pq.add("fo", true, "0", now_ts_us+2); - pq.add("foo", true, "0", now_ts_us+3); + pq.add("f", "f", true, "0", now_ts_us+1); + pq.add("fo", "fo", true, "0", now_ts_us+2); + pq.add("foo", "foo", true, "0", now_ts_us+3); pq.compact_user_queries(now_ts_us + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 100); queries = pq.get_user_prefix_queries(); ASSERT_EQ(0, queries.size()); @@ -59,10 +59,10 @@ TEST_F(PopularQueriesTest, PrefixQueryCompaction) { // 3 letter search + start of next search pq.reset_local_counts(); - pq.add("f", true, "0", now_ts_us+1); - pq.add("fo", true, "0", now_ts_us+2); - pq.add("foo", true, "0", now_ts_us+3); - pq.add("b", true, "0", now_ts_us + 3 + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 100); + pq.add("f", "f", true, "0", now_ts_us+1); + pq.add("fo", "fo", true, "0", now_ts_us+2); + pq.add("foo", "foo", true, "0", now_ts_us+3); + pq.add("b", "b", true, "0", now_ts_us + 3 + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 100); pq.compact_user_queries(now_ts_us + 3 + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 100 + 1); queries = pq.get_user_prefix_queries(); ASSERT_EQ(1, queries.size()); @@ -75,8 +75,8 @@ TEST_F(PopularQueriesTest, PrefixQueryCompaction) { // continue with that query auto prev_ts = now_ts_us + 3 + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 100 + 1; - pq.add("ba", true, "0", prev_ts+1); - pq.add("bar", true, "0", prev_ts+2); + pq.add("ba", "ba", true, "0", prev_ts+1); + pq.add("bar", "bar", true, "0", prev_ts+2); pq.compact_user_queries(prev_ts + 2 + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 1); queries = pq.get_user_prefix_queries(); ASSERT_EQ(0, queries.size());