Merge branch 'v0.25-join' into v0.26-facets

This commit is contained in:
Kishore Nallan 2023-08-23 20:19:16 +05:30
commit bfa744353b
11 changed files with 110 additions and 11 deletions

View File

@ -79,7 +79,7 @@ public:
Option<bool> remove_rule(const std::string& name);
void add_suggestion(const std::string& query_collection,
std::string& query, bool live_query, const std::string& user_id);
const std::string& query, bool live_query, const std::string& user_id);
void stop();

View File

@ -88,5 +88,5 @@ public:
bool should_skip_char(char c);
static void normalize_ascii(std::string& text);
static std::string normalize_ascii_no_spaces(const std::string& text);
};

View File

@ -787,6 +787,10 @@ public:
cors_domains.insert(cors_values_vec.begin(), cors_values_vec.end());
}
// Stores the given flag in the enable_search_analytics member, replacing the previous value.
void set_enable_search_analytics(bool enable_search_analytics) {
this->enable_search_analytics = enable_search_analytics;
}
// validation
Option<bool> is_valid() {

View File

@ -203,7 +203,7 @@ Option<bool> AnalyticsManager::remove_popular_queries_index(const std::string &n
return Option<bool>(true);
}
void AnalyticsManager::add_suggestion(const std::string &query_collection, std::string &query,
void AnalyticsManager::add_suggestion(const std::string &query_collection, const std::string& query,
const bool live_query, const std::string& user_id) {
// look up suggestion collections for the query collection
std::unique_lock lock(mutex);
@ -212,7 +212,6 @@ void AnalyticsManager::add_suggestion(const std::string &query_collection, std::
for(const auto& suggestion_collection: suggestion_collections_it->second) {
const auto& popular_queries_it = popular_queries.find(suggestion_collection);
if(popular_queries_it != popular_queries.end()) {
Tokenizer::normalize_ascii(query);
popular_queries_it->second->add(query, live_query, user_id);
}
}
@ -235,6 +234,8 @@ void AnalyticsManager::run(ReplicationState* raft_server) {
}
persist_suggestions(raft_server, prev_persistence_s);
prev_persistence_s = std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::system_clock::now().time_since_epoch()).count();
lk.unlock();
}
@ -270,8 +271,6 @@ void AnalyticsManager::persist_suggestions(ReplicationState *raft_server, uint64
continue;
}
prev_persistence_s = now_ts_seconds;
std::string import_payload;
popularQueries->serialize_as_docs(import_payload);

View File

@ -5110,9 +5110,13 @@ void Collection::hide_credential(nlohmann::json& json, const std::string& creden
}
Option<bool> Collection::truncate_after_top_k(const string &field_name, size_t k) {
std::shared_lock slock(mutex);
std::vector<uint32_t> seq_ids;
auto op = index->seq_ids_outside_top_k(field_name, k, seq_ids);
slock.unlock();
if(!op.ok()) {
return op;
}

View File

@ -1242,7 +1242,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
if(Config::get_instance().get_enable_search_analytics()) {
if(result.count("found") != 0 && result["found"].get<size_t>() != 0) {
std::string analytics_query = raw_query;
std::string analytics_query = Tokenizer::normalize_ascii_no_spaces(raw_query);
AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query,
true, req_params["x-typesense-user-id"]);
}

View File

@ -607,7 +607,7 @@ Option<bool> field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j
}
if(!the_fields.empty() && !the_fields.back().embed.empty()) {
embed_json_field_indices.emplace_back(i, i);
embed_json_field_indices.emplace_back(i, the_fields.size()-1);
}
}

View File

@ -5886,6 +5886,7 @@ size_t Index::num_seq_ids() const {
Option<bool> Index::seq_ids_outside_top_k(const std::string& field_name, size_t k,
std::vector<uint32_t>& outside_seq_ids) {
std::shared_lock lock(mutex);
if (numerical_index.count(field_name) != 0) {
auto field_it = numerical_index.find(field_name);

View File

@ -1,5 +1,6 @@
#include <sstream>
#include <algorithm>
#include <string_utils.h>
#include "tokenizer.h"
Tokenizer::Tokenizer(const std::string& input, bool normalize, bool no_op, const std::string& locale,
@ -349,10 +350,15 @@ bool Tokenizer::should_skip_char(char c) {
return is_ascii_char(c) && get_stream_mode(c) != INDEX;
}
void Tokenizer::normalize_ascii(std::string& text) {
for(size_t i = 0; i < text.size(); i++) {
std::string Tokenizer::normalize_ascii_no_spaces(const std::string& text) {
std::string analytics_query = text;
StringUtils::trim(analytics_query);
for(size_t i = 0; i < analytics_query.size(); i++) {
if(is_ascii_char(text[i])) {
text[i] = std::tolower(text[i]);
analytics_query[i] = std::tolower(analytics_query[i]);
}
}
return analytics_query;
}

View File

@ -3,6 +3,7 @@
#include <vector>
#include <fstream>
#include <collection_manager.h>
#include <analytics_manager.h>
#include "string_utils.h"
#include "collection.h"
@ -24,6 +25,8 @@ protected:
collectionManager.init(store, 1.0, "auth_key", quit);
collectionManager.load(8, 1000);
AnalyticsManager::get_instance().init(store);
schema = R"({
"name": "collection1",
"enable_nested_fields": true,
@ -587,6 +590,67 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) {
collectionManager.drop_collection("coll1");
}
// Runs two searches against "coll1" with search analytics enabled -- one for " tom " and one
// for a whitespace-only query -- and expects the queries recorded for the "top_queries"
// destination to be "tom" and "" respectively.
// NOTE(review): the analytics flag stays enabled and the "top_search_queries" rule is not
// removed before the test returns -- confirm this cannot leak into other tests.
TEST_F(CollectionManagerTest, QuerySuggestionsShouldBeTrimmed) {
    std::vector<field> schema_fields = {field("title", field_types::STRING, false, false, true, "", -1, 1),
                                        field("year", field_types::INT32, false),
                                        field("points", field_types::INT32, false),};

    Collection* coll = collectionManager.create_collection("coll1", 1, schema_fields, "points").get();

    // a single document that the "tom" query can match
    nlohmann::json doc;
    doc["id"] = "0";
    doc["title"] = "Tom Sawyer";
    doc["year"] = 1876;
    doc["points"] = 100;
    ASSERT_TRUE(coll->add(doc.dump()).ok());

    Config::get_instance().set_enable_search_analytics(true);

    // popular-queries rule: source collection "coll1", destination "top_queries"
    nlohmann::json analytics_rule = R"({
"name": "top_search_queries",
"type": "popular_queries",
"params": {
"limit": 100,
"source": {
"collections": ["coll1"]
},
"destination": {
"collection": "top_queries"
}
}
})"_json;

    auto rule_op = AnalyticsManager::get_instance().create_rule(analytics_rule, false, true);
    ASSERT_TRUE(rule_op.ok());

    nlohmann::json embedded_params;
    std::string json_res;
    std::map<std::string, std::string> req_params = {
        {"collection", "coll1"},
        {"q", " tom "},
        {"query_by", "title"},
    };

    const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
    auto now_ts = std::chrono::duration_cast<std::chrono::microseconds>(since_epoch).count();

    // first search: query padded with spaces
    auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
    ASSERT_TRUE(search_op.ok());

    // second search: whitespace-only query
    json_res.clear();
    req_params["q"] = " ";
    search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
    ASSERT_TRUE(search_op.ok());

    // both recorded suggestions must be in trimmed form
    auto popular_queries = AnalyticsManager::get_instance().get_popular_queries();
    auto& top_queries = popular_queries["top_queries"];
    ASSERT_EQ(2, top_queries->get_user_prefix_queries()[""].size());
    ASSERT_EQ("tom", top_queries->get_user_prefix_queries()[""][0].query);
    ASSERT_EQ("", top_queries->get_user_prefix_queries()[""][1].query);

    collectionManager.drop_collection("coll1");
}
TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) {
Collection *coll1;

View File

@ -1161,6 +1161,27 @@ TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) {
"or make the embedding field optional.", add_op.error());
}
// Creates a collection whose schema lists an explicit "id" field ahead of a field embedded
// from "name", then expects get_fields() to report 2 fields, the second with num_dim == 384.
TEST_F(CollectionVectorTest, EmbeddingFieldWithIdFieldPrecedingInSchema) {
    nlohmann::json schema = R"({
"name": "objects",
"fields": [
{"name": "id", "type": "string"},
{"name": "name", "type": "string"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(create_op.ok());

    auto coll_fields = create_op.get()->get_fields();
    ASSERT_EQ(2, coll_fields.size());
    ASSERT_EQ(384, coll_fields[1].num_dim);
}
TEST_F(CollectionVectorTest, SkipEmbeddingOpWhenValueExists) {
nlohmann::json schema = R"({
"name": "objects",