diff --git a/include/analytics_manager.h b/include/analytics_manager.h index 4207f7cf..0f098a7a 100644 --- a/include/analytics_manager.h +++ b/include/analytics_manager.h @@ -79,7 +79,7 @@ public: Option remove_rule(const std::string& name); void add_suggestion(const std::string& query_collection, - std::string& query, bool live_query, const std::string& user_id); + const std::string& query, bool live_query, const std::string& user_id); void stop(); diff --git a/include/tokenizer.h b/include/tokenizer.h index a6a88ab6..36de0940 100644 --- a/include/tokenizer.h +++ b/include/tokenizer.h @@ -88,5 +88,5 @@ public: bool should_skip_char(char c); - static void normalize_ascii(std::string& text); + static std::string normalize_ascii_no_spaces(const std::string& text); }; \ No newline at end of file diff --git a/include/tsconfig.h b/include/tsconfig.h index 1e6d8f62..dc382e80 100644 --- a/include/tsconfig.h +++ b/include/tsconfig.h @@ -787,6 +787,10 @@ public: cors_domains.insert(cors_values_vec.begin(), cors_values_vec.end()); } + /* Overwrites the enable_search_analytics member with the given flag; the test added in this patch calls it with true before searching. */ void set_enable_search_analytics(bool enable_search_analytics) { + this->enable_search_analytics = enable_search_analytics; + } + // validation Option is_valid() { diff --git a/src/analytics_manager.cpp b/src/analytics_manager.cpp index b23c6cc2..31ac078a 100644 --- a/src/analytics_manager.cpp +++ b/src/analytics_manager.cpp @@ -203,7 +203,7 @@ Option AnalyticsManager::remove_popular_queries_index(const std::string &n return Option(true); } -void AnalyticsManager::add_suggestion(const std::string &query_collection, std::string &query, +void AnalyticsManager::add_suggestion(const std::string &query_collection, const std::string& query, const bool live_query, const std::string& user_id) { // look up suggestion collections for the query collection std::unique_lock lock(mutex); @@ -212,7 +212,6 @@ void AnalyticsManager::add_suggestion(const std::string &query_collection, std:: for(const auto& suggestion_collection:
suggestion_collections_it->second) { const auto& popular_queries_it = popular_queries.find(suggestion_collection); if(popular_queries_it != popular_queries.end()) { - Tokenizer::normalize_ascii(query); popular_queries_it->second->add(query, live_query, user_id); } } @@ -235,6 +234,8 @@ void AnalyticsManager::run(ReplicationState* raft_server) { } persist_suggestions(raft_server, prev_persistence_s); + prev_persistence_s = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); lk.unlock(); } @@ -270,8 +271,6 @@ void AnalyticsManager::persist_suggestions(ReplicationState *raft_server, uint64 continue; } - prev_persistence_s = now_ts_seconds; - std::string import_payload; popularQueries->serialize_as_docs(import_payload); diff --git a/src/collection.cpp b/src/collection.cpp index ff5b8d94..a10e1fc4 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -5110,9 +5110,13 @@ void Collection::hide_credential(nlohmann::json& json, const std::string& creden } Option Collection::truncate_after_top_k(const string &field_name, size_t k) { + std::shared_lock slock(mutex); + std::vector seq_ids; auto op = index->seq_ids_outside_top_k(field_name, k, seq_ids); + slock.unlock(); + if(!op.ok()) { return op; } diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 7791b2a9..481e69b5 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -1242,7 +1242,7 @@ Option CollectionManager::do_search(std::map& re if(Config::get_instance().get_enable_search_analytics()) { if(result.count("found") != 0 && result["found"].get() != 0) { - std::string analytics_query = raw_query; + std::string analytics_query = Tokenizer::normalize_ascii_no_spaces(raw_query); AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query, true, req_params["x-typesense-user-id"]); } diff --git a/src/field.cpp b/src/field.cpp index b7f551fa..fec95589 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -607,7 +607,7 @@ 
Option field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j } if(!the_fields.empty() && !the_fields.back().embed.empty()) { - embed_json_field_indices.emplace_back(i, i); + embed_json_field_indices.emplace_back(i, the_fields.size()-1); } } diff --git a/src/index.cpp b/src/index.cpp index 1b18d871..6c3d2601 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -5886,6 +5886,7 @@ size_t Index::num_seq_ids() const { Option Index::seq_ids_outside_top_k(const std::string& field_name, size_t k, std::vector& outside_seq_ids) { + std::shared_lock lock(mutex); if (numerical_index.count(field_name) != 0) { auto field_it = numerical_index.find(field_name); diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 464349ed..5688e27d 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "tokenizer.h" Tokenizer::Tokenizer(const std::string& input, bool normalize, bool no_op, const std::string& locale, @@ -349,10 +350,15 @@ bool Tokenizer::should_skip_char(char c) { return is_ascii_char(c) && get_stream_mode(c) != INDEX; } -void Tokenizer::normalize_ascii(std::string& text) { - for(size_t i = 0; i < text.size(); i++) { - if(is_ascii_char(text[i])) { - text[i] = std::tolower(text[i]); +/* Returns a copy of text that has been passed through StringUtils::trim and then had every byte accepted by is_ascii_char lowercased; the input itself is not modified. The ASCII check reads the trimmed copy rather than text, because after trimming index i of the copy need not refer to the same byte as index i of text. */ std::string Tokenizer::normalize_ascii_no_spaces(const std::string& text) { + std::string analytics_query = text; + StringUtils::trim(analytics_query); + + for(size_t i = 0; i < analytics_query.size(); i++) { + if(is_ascii_char(analytics_query[i])) { + analytics_query[i] = std::tolower(analytics_query[i]); } } + + return analytics_query; } diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index 3c18db44..cce53b18 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include "string_utils.h" #include "collection.h" @@ -24,6 +25,8 @@ protected: collectionManager.init(store, 1.0, "auth_key", quit); collectionManager.load(8, 1000); +
AnalyticsManager::get_instance().init(store); + schema = R"({ "name": "collection1", "enable_nested_fields": true, @@ -587,6 +590,67 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) { collectionManager.drop_collection("coll1"); } +TEST_F(CollectionManagerTest, QuerySuggestionsShouldBeTrimmed) { + std::vector fields = {field("title", field_types::STRING, false, false, true, "", -1, 1), + field("year", field_types::INT32, false), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["title"] = "Tom Sawyer"; + doc1["year"] = 1876; + doc1["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + + Config::get_instance().set_enable_search_analytics(true); + + nlohmann::json analytics_rule = R"({ + "name": "top_search_queries", + "type": "popular_queries", + "params": { + "limit": 100, + "source": { + "collections": ["coll1"] + }, + "destination": { + "collection": "top_queries" + } + } + })"_json; + + /* Register the popular_queries rule defined above with coll1 as source and top_queries as destination; the two searches that follow use q values padded with whitespace. NOTE(review): nothing visible in this test turns enable_search_analytics back off or removes the rule afterwards - confirm fixture teardown covers that. */ auto create_op = AnalyticsManager::get_instance().create_rule(analytics_rule, false, true); + ASSERT_TRUE(create_op.ok()); + + nlohmann::json embedded_params; + std::map req_params; + req_params["collection"] = "coll1"; + req_params["q"] = " tom "; + req_params["query_by"] = "title"; + + std::string json_res; + auto now_ts = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + + auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + json_res.clear(); + req_params["q"] = " "; + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + + // check that suggestions have been trimmed + auto popular_queries = AnalyticsManager::get_instance().get_popular_queries(); + ASSERT_EQ(2,
popular_queries["top_queries"]->get_user_prefix_queries()[""].size()); /* Both searches are expected under the empty user id key: the first recorded as tom without padding, the second as an empty string. */ + ASSERT_EQ("tom", popular_queries["top_queries"]->get_user_prefix_queries()[""][0].query); + ASSERT_EQ("", popular_queries["top_queries"]->get_user_prefix_queries()[""][1].query); + + collectionManager.drop_collection("coll1"); +} + TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) { Collection *coll1; diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index 02bef6ff..694e501f 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -1161,6 +1161,27 @@ TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) { "or make the embedding field optional.", add_op.error()); } +TEST_F(CollectionVectorTest, EmbeddingFieldWithIdFieldPrecedingInSchema) { /* Creates a collection whose schema lists an explicit id field ahead of the name and embedding fields, then expects get_fields() to return 2 entries with num_dim 384 on the second one. */ + auto schema = R"({ + "name": "objects", + "fields": [ + {"name": "id", "type": "string"}, + {"name": "name", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + auto fs = coll->get_fields(); + ASSERT_EQ(2, fs.size()); + ASSERT_EQ(384, fs[1].num_dim); +} + TEST_F(CollectionVectorTest, SkipEmbeddingOpWhenValueExists) { nlohmann::json schema = R"({ "name": "objects",