Add option to expand search query for suggestion aggregation.

This commit is contained in:
Kishore Nallan 2023-12-27 15:50:41 +05:30
parent 97cf43b999
commit 9d9ffd3bf9
15 changed files with 201 additions and 35 deletions

View File

@ -120,6 +120,7 @@ private:
std::vector<std::string> query_collections;
size_t limit;
std::string rule_type;
bool expand_query = false;
void to_json(nlohmann::json& obj) const {
obj["name"] = name;
@ -128,6 +129,10 @@ private:
obj["params"]["limit"] = limit;
obj["params"]["source"]["collections"] = query_collections;
obj["params"]["destination"]["collection"] = suggestion_collection;
if(rule_type == POPULAR_QUERIES_TYPE) {
obj["params"]["expand_query"] = expand_query;
}
}
};
@ -195,7 +200,8 @@ public:
Option<bool> remove_rule(const std::string& name);
void add_suggestion(const std::string& query_collection,
const std::string& query, bool live_query, const std::string& user_id);
const std::string& query, const std::string& expanded_query,
bool live_query, const std::string& user_id);
void stop();

View File

@ -681,6 +681,10 @@ public:
friend class filter_result_iterator_t;
std::shared_mutex& get_lifecycle_mutex();
void expand_search_query(const string& raw_query, size_t offset, size_t total, const search_args* search_params,
const std::vector<std::vector<KV*>>& result_group_kvs,
const std::vector<std::string>& raw_search_fields, string& first_q) const;
};
template<class T>

View File

@ -23,6 +23,8 @@ private:
size_t k;
const size_t max_size;
bool expand_query = false;
// counts aggregated within the current node
tsl::htrie_map<char, uint32_t> local_counts;
std::shared_mutex lmutex;
@ -34,8 +36,8 @@ public:
QueryAnalytics(size_t k);
void add(const std::string& value, const bool live_query, const std::string& user_id,
uint64_t now_ts_us = 0);
void add(const std::string& value, const std::string& expanded_key,
const bool live_query, const std::string& user_id, uint64_t now_ts_us = 0);
void compact_user_queries(uint64_t now_ts_us);
@ -48,4 +50,6 @@ public:
std::unordered_map<std::string, std::vector<QWithTimestamp>> get_user_prefix_queries();
tsl::htrie_map<char, uint32_t> get_local_counts();
void set_expand_query(bool expand_query);
};

View File

@ -89,4 +89,6 @@ public:
bool should_skip_char(char c);
static std::string normalize_ascii_no_spaces(const std::string& text);
static bool has_word_tokenizer(const std::string& locale);
};

View File

@ -72,11 +72,16 @@ Option<bool> AnalyticsManager::create_queries_index(nlohmann::json &payload, boo
}
size_t limit = 1000;
bool expand_query = false;
if(params.contains("limit") && params["limit"].is_number_integer()) {
limit = params["limit"].get<size_t>();
}
if(params.contains("expand_query") && params["expand_query"].is_boolean()) {
expand_query = params["expand_query"].get<bool>();
}
if(!params["source"].contains("collections") || !params["source"]["collections"].is_array()) {
return Option<bool>(400, "Must contain a valid list of source collections.");
}
@ -99,6 +104,7 @@ Option<bool> AnalyticsManager::create_queries_index(nlohmann::json &payload, boo
suggestion_config.name = suggestion_config_name;
suggestion_config.suggestion_collection = suggestion_collection;
suggestion_config.limit = limit;
suggestion_config.expand_query = expand_query;
suggestion_config.rule_type = payload["type"];
if(payload["type"] == POPULAR_QUERIES_TYPE) {
@ -150,7 +156,8 @@ Option<bool> AnalyticsManager::create_queries_index(nlohmann::json &payload, boo
}
if(payload["type"] == POPULAR_QUERIES_TYPE) {
QueryAnalytics *popularQueries = new QueryAnalytics(limit);
QueryAnalytics* popularQueries = new QueryAnalytics(limit);
popularQueries->set_expand_query(suggestion_config.expand_query);
popular_queries.emplace(suggestion_collection, popularQueries);
} else if(payload["type"] == NOHITS_QUERIES_TYPE) {
QueryAnalytics *noresultsQueries = new QueryAnalytics(limit);
@ -256,7 +263,8 @@ Option<bool> AnalyticsManager::remove_queries_index(const std::string &name) {
return Option<bool>(true);
}
void AnalyticsManager::add_suggestion(const std::string &query_collection, const std::string& query,
void AnalyticsManager::add_suggestion(const std::string &query_collection,
const std::string& query, const std::string& expanded_query,
const bool live_query, const std::string& user_id) {
// look up suggestion collections for the query collection
std::unique_lock lock(mutex);
@ -265,7 +273,7 @@ void AnalyticsManager::add_suggestion(const std::string &query_collection, const
for(const auto& suggestion_collection: suggestion_collections_it->second) {
const auto& popular_queries_it = popular_queries.find(suggestion_collection);
if(popular_queries_it != popular_queries.end()) {
popular_queries_it->second->add(query, live_query, user_id);
popular_queries_it->second->add(query, expanded_query, live_query, user_id);
}
}
}
@ -327,7 +335,7 @@ void AnalyticsManager::add_nohits_query(const std::string &query_collection, con
for(const auto& suggestion_collection: suggestion_collections_it->second) {
const auto& noresults_queries_it = nohits_queries.find(suggestion_collection);
if(noresults_queries_it != nohits_queries.end()) {
noresults_queries_it->second->add(query, live_query, user_id);
noresults_queries_it->second->add(query, query, live_query, user_id);
}
}
}

View File

@ -2381,6 +2381,10 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
nlohmann::json docs_array = nlohmann::json::array();
// handle analytics query expansion
std::string first_q = raw_query;
expand_search_query(raw_query, offset, total, search_params, result_group_kvs, raw_search_fields, first_q);
// construct results array
for(long result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) {
const std::vector<KV*> & kv_group = result_group_kvs[result_kvs_index];
@ -2774,9 +2778,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
highlight_t highlight;
if(!facet_query.query.empty()) {
bool is_cyrillic = Tokenizer::is_cyrillic(the_field.locale);
bool use_word_tokenizer = the_field.locale == "th" || the_field.locale == "ja" ||
Tokenizer::is_cyrillic(the_field.locale);
bool use_word_tokenizer = Tokenizer::has_word_tokenizer(the_field.locale);
bool normalise = !use_word_tokenizer;
std::vector<std::string> fquery_tokens;
@ -2911,6 +2913,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
result["request_params"]["collection_name"] = name;
result["request_params"]["per_page"] = per_page;
result["request_params"]["q"] = raw_query;
result["request_params"]["first_q"] = first_q;
if(!override_metadata.empty()) {
result["metadata"] = override_metadata;
@ -2922,6 +2925,48 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
return Option<nlohmann::json>(result);
}
/**
 * Maps the raw search query (whose tokens may only be prefixes of indexed words) back to the
 * full tokens that produced the top hit, for use by search analytics.
 *
 * The expansion is attempted only when search analytics is enabled, the request is for the first
 * page (`offset == 0`), the search found something (`total != 0`), and the first queried field
 * exists in the schema and its locale does not use a word tokenizer.
 *
 * @param raw_query          The query string as sent by the client.
 * @param offset             Result offset of the request; only `0` is expanded.
 * @param total              Number of results found; `0` disables expansion.
 * @param search_params      Search arguments; `searched_queries` supplies the matched leaves.
 * @param result_group_kvs   Grouped result entries; the first entry of the first group selects
 *                           which searched query is used for the expansion.
 * @param raw_search_fields  Queried field names; only the first one is consulted (for locale).
 * @param first_q            In/out. Overwritten with the space-joined expanded tokens ONLY when
 *                           at least one token could be expanded; otherwise left untouched so the
 *                           caller's initial value (e.g. the raw query) is never replaced with an
 *                           empty string.
 */
void Collection::expand_search_query(const string& raw_query, size_t offset, size_t total, const search_args* search_params,
                                     const std::vector<std::vector<KV*>>& result_group_kvs,
                                     const std::vector<std::string>& raw_search_fields, string& first_q) const {
    if(!Config::get_instance().get_enable_search_analytics()) {
        return ;
    }

    // expansion applies only to the first page of a search that actually produced hits
    if(offset != 0 || total == 0 || raw_search_fields.empty() ||
       search_params->searched_queries.empty() || result_group_kvs.empty()) {
        return ;
    }

    // guard the [0][0] access below: an empty leading group has no entry to read query_index from
    if(result_group_kvs[0].empty()) {
        return ;
    }

    // we have to map raw_query (which could contain a prefix) back to expanded version
    auto search_field_it = search_schema.find(raw_search_fields[0]);
    if(search_field_it == search_schema.end() || Tokenizer::has_word_tokenizer(search_field_it->locale)) {
        return ;
    }

    const auto q_index = result_group_kvs[0][0]->query_index;
    if(q_index >= search_params->searched_queries.size()) {
        // query index does not refer to a searched query: keep first_q as provided by the caller
        return ;
    }

    const auto& qleaves = search_params->searched_queries[q_index];
    Tokenizer tokenizer(raw_query, true, false, search_field_it->locale, symbols_to_index, token_separators);

    // build into a local so that first_q is modified only when the expansion yields something
    std::string expanded_q;
    std::string raw_token;
    size_t raw_token_index = 0, tok_start = 0, tok_end = 0;

    while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) {
        if(raw_token_index >= qleaves.size()) {
            continue;
        }

        const auto leaf = qleaves[raw_token_index];
        if(leaf->key_len == 0) {
            // avoids unsigned underflow in `key_len - 1` below
            continue;
        }

        // the last byte of the key is excluded (presumably a trailing terminator -- TODO confirm)
        const std::string tok(reinterpret_cast<char*>(leaf->key), leaf->key_len - 1);

        // tokens whose matched leaf does not start with the raw token are left out of the expansion
        if(StringUtils::begins_with(tok, raw_token)) {
            if(!expanded_q.empty()) {
                expanded_q += ' ';
            }
            expanded_q += tok;
        }
    }

    if(!expanded_q.empty()) {
        first_q = expanded_q;
    }
}
void Collection::copy_highlight_doc(std::vector<highlight_field_t>& hightlight_items,
const bool nested_fields_enabled,
const nlohmann::json& src, nlohmann::json& dst) {
@ -3577,8 +3622,7 @@ void Collection::highlight_result(const std::string& raw_query, const field &sea
tsl::htrie_set<char> matched_tokens;
bool use_word_tokenizer = search_field.locale == "th" || search_field.locale == "ja" ||
Tokenizer::is_cyrillic(search_field.locale);
bool use_word_tokenizer = Tokenizer::has_word_tokenizer(search_field.locale);
bool normalise = !use_word_tokenizer;
std::vector<std::string> raw_query_tokens;

View File

@ -1544,7 +1544,10 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
if(Config::get_instance().get_enable_search_analytics()) {
if(result.count("found") != 0 && result["found"].get<size_t>() != 0) {
std::string analytics_query = Tokenizer::normalize_ascii_no_spaces(raw_query);
AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query,
const std::string& expanded_query = Tokenizer::normalize_ascii_no_spaces(
result["request_params"]["first_q"].get<std::string>());
AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query, expanded_query,
true, req_params["x-typesense-user-id"]);
AnalyticsManager::get_instance().add_query_hits_count(orig_coll_name, analytics_query,
req_params["x-typesense-user-id"],

View File

@ -45,7 +45,7 @@ Option<bool> EventManager::add_event(const nlohmann::json& event, const std::str
for(const auto& coll: event_data_val["collections"]) {
std::string query = event_data_query_it.get<std::string>();
AnalyticsManager::get_instance().add_suggestion(coll.get<std::string>(), query, false, "");
AnalyticsManager::get_instance().add_suggestion(coll.get<std::string>(), query, query, false, "");
}
} else if(event_type == "query_click") {
if (!event.contains("data")) {

View File

@ -1985,7 +1985,7 @@ void Index::collate_included_ids(const std::vector<token_t>& q_included_tokens,
scores[1] = int64_t(1);
scores[2] = int64_t(1);
KV kv(searched_queries.size(), seq_id, distinct_id, 0, scores);
KV kv(0, seq_id, distinct_id, 0, scores);
curated_topster->add(&kv);
}
}

View File

@ -8,7 +8,8 @@ QueryAnalytics::QueryAnalytics(size_t k) : k(k), max_size(k * 2) {
}
void QueryAnalytics::add(const std::string& key, const bool live_query, const std::string& user_id, uint64_t now_ts_us) {
void QueryAnalytics::add(const std::string& key, const std::string& expanded_key,
const bool live_query, const std::string& user_id, uint64_t now_ts_us) {
if(live_query) {
// live query must be aggregated first to their final form as they could be prefix queries
if(now_ts_us == 0) {
@ -23,7 +24,9 @@ void QueryAnalytics::add(const std::string& key, const bool live_query, const st
auto& queries = user_prefix_queries[user_id];
if(queries.size() < 100) {
queries.emplace_back(key, now_ts_us);
// only live queries could send expanded queries
const std::string& actual_key = expand_query ? expanded_key : key;
queries.emplace_back(actual_key, now_ts_us);
}
umutex.unlock();
@ -90,7 +93,7 @@ void QueryAnalytics::compact_user_queries(uint64_t now_ts_us) {
(queries[i + 1].timestamp - queries[i].timestamp);
if(diff_micros > QUERY_FINALIZATION_INTERVAL_MICROS) {
add(queries[i].query, false, "");
add(queries[i].query, queries[i].query, false, "");
last_consolidated_index = i;
}
}
@ -116,3 +119,7 @@ tsl::htrie_map<char, uint32_t> QueryAnalytics::get_local_counts() {
std::unique_lock lk(lmutex);
return local_counts;
}
// Stores the flag that QueryAnalytics::add() consults when choosing between the raw key and the
// expanded key for live queries.
void QueryAnalytics::set_expand_query(bool expand_query) {
    // qualified name selects the member, which the parameter of the same name would otherwise hide
    QueryAnalytics::expand_query = expand_query;
}

View File

@ -370,3 +370,8 @@ std::string Tokenizer::normalize_ascii_no_spaces(const std::string& text) {
return analytics_query;
}
// Returns true when `locale` is "th", "ja", or a locale for which Tokenizer::is_cyrillic() is true.
bool Tokenizer::has_word_tokenizer(const std::string& locale) {
    if(locale == "th" || locale == "ja") {
        return true;
    }

    return Tokenizer::is_cyrillic(locale);
}

View File

@ -85,29 +85,82 @@ TEST_F(AnalyticsManagerTest, AddSuggestion) {
auto create_op = analyticsManager.create_rule(analytics_rule, false, true);
ASSERT_TRUE(create_op.ok());
std::string q = "foobar";
analyticsManager.add_suggestion("titles", q, true, "1");
std::string q = "coo";
analyticsManager.add_suggestion("titles", q, "cool", true, "1");
auto popularQueries = analyticsManager.get_popular_queries();
auto userQueries = popularQueries["top_queries"]->get_user_prefix_queries()["1"];
ASSERT_EQ(1, userQueries.size());
ASSERT_EQ("foobar", userQueries[0].query);
ASSERT_EQ("coo", userQueries[0].query); // expanded query is NOT stored since it's not enabled
// add another query which is more popular
q = "buzzfoo";
analyticsManager.add_suggestion("titles", q, true, "1");
analyticsManager.add_suggestion("titles", q, true, "2");
analyticsManager.add_suggestion("titles", q, true, "3");
analyticsManager.add_suggestion("titles", q, q, true, "1");
analyticsManager.add_suggestion("titles", q, q, true, "2");
analyticsManager.add_suggestion("titles", q, q, true, "3");
popularQueries = analyticsManager.get_popular_queries();
userQueries = popularQueries["top_queries"]->get_user_prefix_queries()["1"];
ASSERT_EQ(2, userQueries.size());
ASSERT_EQ("foobar", userQueries[0].query);
ASSERT_EQ("coo", userQueries[0].query);
ASSERT_EQ("buzzfoo", userQueries[1].query);
ASSERT_TRUE(analyticsManager.remove_rule("top_search_queries").ok());
}
// Verifies that a popular_queries rule created with "expand_query": true records the expanded
// form of a live query ("cool") instead of the raw prefix that was typed ("c").
TEST_F(AnalyticsManagerTest, AddSuggestionWithExpandedQuery) {
// source collection that the analytics rule listens to
nlohmann::json titles_schema = R"({
"name": "titles",
"fields": [
{"name": "title", "type": "string"}
]
})"_json;
Collection* titles_coll = collectionManager.create_collection(titles_schema).get();
nlohmann::json doc;
doc["title"] = "Cool trousers";
ASSERT_TRUE(titles_coll->add(doc.dump()).ok());
// create a collection to store suggestions
nlohmann::json suggestions_schema = R"({
"name": "top_queries",
"fields": [
{"name": "q", "type": "string" },
{"name": "count", "type": "int32" }
]
})"_json;
Collection* suggestions_coll = collectionManager.create_collection(suggestions_schema).get();
// rule with query expansion switched on
nlohmann::json analytics_rule = R"({
"name": "top_search_queries",
"type": "popular_queries",
"params": {
"limit": 100,
"expand_query": true,
"source": {
"collections": ["titles"]
},
"destination": {
"collection": "top_queries"
}
}
})"_json;
auto create_op = analyticsManager.create_rule(analytics_rule, false, true);
ASSERT_TRUE(create_op.ok());
// live query from user "1": raw query is "c", expanded query is "cool"
analyticsManager.add_suggestion("titles", "c", "cool", true, "1");
auto popularQueries = analyticsManager.get_popular_queries();
auto userQueries = popularQueries["top_queries"]->get_user_prefix_queries()["1"];
ASSERT_EQ(1, userQueries.size());
// the expanded form must be the one stored for the user
ASSERT_EQ("cool", userQueries[0].query);
// remove the rule so that it does not linger after this test
ASSERT_TRUE(analyticsManager.remove_rule("top_search_queries").ok());
}
TEST_F(AnalyticsManagerTest, GetAndDeleteSuggestions) {
nlohmann::json analytics_rule = R"({
"name": "top_search_queries",

View File

@ -56,6 +56,8 @@ protected:
};
TEST_F(CollectionOverrideTest, ExcludeIncludeExactQueryMatch) {
Config::get_instance().set_enable_search_analytics(true);
nlohmann::json override_json = {
{"id", "exclude-rule"},
{
@ -207,6 +209,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeExactQueryMatch) {
ASSERT_EQ(4, results["found"].get<uint32_t>());
coll_mul_fields->remove_override("include-rule");
Config::get_instance().set_enable_search_analytics(false);
}
TEST_F(CollectionOverrideTest, OverrideJSONValidation) {

View File

@ -2555,6 +2555,33 @@ TEST_F(CollectionSpecificMoreTest, CrossFieldTypoAndPrefixWithWeights) {
ASSERT_EQ(1, res["hits"].size());
}
// Verifies that, with search analytics enabled, a search for the prefix "co" reports the full
// matched token "cool" under request_params.first_q of the search response.
TEST_F(CollectionSpecificMoreTest, AnalyticsFullFirstQuery) {
// first_q is checked with analytics enabled; the flag is switched back off at the end of the test
Config::get_instance().set_enable_search_analytics(true);
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "color", "type": "string"}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
nlohmann::json doc;
doc["id"] = "0";
doc["title"] = "Cool trousers";
doc["color"] = "blue";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
// search for the prefix "co" across both fields
// NOTE(review): the remaining positional arguments follow Collection::search's signature,
// which is not visible here -- confirm their meaning against the declaration
auto res = coll1->search("co", {"title", "color"}, "", {}, {}, {2, 0}, 10, 1, FREQUENCY, {true}, 0,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {2, 3}).get();
ASSERT_EQ(1, res["hits"].size());
// the prefix must have been mapped back to the full token of the matching document
ASSERT_EQ("cool", res["request_params"]["first_q"].get<std::string>());
// restore the global flag so that later tests are not affected
Config::get_instance().set_enable_search_analytics(false);
}
TEST_F(CollectionSpecificMoreTest, TruncateAterTopK) {
nlohmann::json schema = R"({
"name": "coll1",

View File

@ -25,7 +25,7 @@ TEST_F(PopularQueriesTest, PrefixQueryCompaction) {
ASSERT_TRUE(queries.empty());
// compaction after user has typed first prefix but before compaction interval has happened
pq.add("f", true, "0", now_ts_us+1);
pq.add("f", "f", true, "0", now_ts_us+1);
pq.compact_user_queries(now_ts_us+2);
queries = pq.get_user_prefix_queries();
ASSERT_EQ(1, queries.size());
@ -46,9 +46,9 @@ TEST_F(PopularQueriesTest, PrefixQueryCompaction) {
// 3 letter search
pq.reset_local_counts();
pq.add("f", true, "0", now_ts_us+1);
pq.add("fo", true, "0", now_ts_us+2);
pq.add("foo", true, "0", now_ts_us+3);
pq.add("f", "f", true, "0", now_ts_us+1);
pq.add("fo", "fo", true, "0", now_ts_us+2);
pq.add("foo", "foo", true, "0", now_ts_us+3);
pq.compact_user_queries(now_ts_us + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 100);
queries = pq.get_user_prefix_queries();
ASSERT_EQ(0, queries.size());
@ -59,10 +59,10 @@ TEST_F(PopularQueriesTest, PrefixQueryCompaction) {
// 3 letter search + start of next search
pq.reset_local_counts();
pq.add("f", true, "0", now_ts_us+1);
pq.add("fo", true, "0", now_ts_us+2);
pq.add("foo", true, "0", now_ts_us+3);
pq.add("b", true, "0", now_ts_us + 3 + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 100);
pq.add("f", "f", true, "0", now_ts_us+1);
pq.add("fo", "fo", true, "0", now_ts_us+2);
pq.add("foo", "foo", true, "0", now_ts_us+3);
pq.add("b", "b", true, "0", now_ts_us + 3 + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 100);
pq.compact_user_queries(now_ts_us + 3 + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 100 + 1);
queries = pq.get_user_prefix_queries();
ASSERT_EQ(1, queries.size());
@ -75,8 +75,8 @@ TEST_F(PopularQueriesTest, PrefixQueryCompaction) {
// continue with that query
auto prev_ts = now_ts_us + 3 + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 100 + 1;
pq.add("ba", true, "0", prev_ts+1);
pq.add("bar", true, "0", prev_ts+2);
pq.add("ba", "ba", true, "0", prev_ts+1);
pq.add("bar", "bar", true, "0", prev_ts+2);
pq.compact_user_queries(prev_ts + 2 + QueryAnalytics::QUERY_FINALIZATION_INTERVAL_MICROS + 1);
queries = pq.get_user_prefix_queries();
ASSERT_EQ(0, queries.size());