mirror of
https://github.com/typesense/typesense.git
synced 2025-05-22 06:40:30 +08:00
Merge remote-tracking branch 'up/v0.26-facets' into v0.26-facets
This commit is contained in:
commit
23f28359a5
@ -79,7 +79,7 @@ public:
|
||||
Option<bool> remove_rule(const std::string& name);
|
||||
|
||||
void add_suggestion(const std::string& query_collection,
|
||||
std::string& query, bool live_query, const std::string& user_id);
|
||||
const std::string& query, bool live_query, const std::string& user_id);
|
||||
|
||||
void stop();
|
||||
|
||||
|
@ -102,6 +102,8 @@ public:
|
||||
|
||||
static void to_expanded_id_lists(const std::vector<void*>& raw_id_lists, std::vector<id_list_t*>& id_lists,
|
||||
std::vector<id_list_t*>& expanded_id_lists);
|
||||
|
||||
static void* create(const std::vector<uint32_t>& ids);
|
||||
};
|
||||
|
||||
template<class T>
|
||||
|
@ -88,5 +88,5 @@ public:
|
||||
|
||||
bool should_skip_char(char c);
|
||||
|
||||
static void normalize_ascii(std::string& text);
|
||||
static std::string normalize_ascii_no_spaces(const std::string& text);
|
||||
};
|
@ -787,6 +787,10 @@ public:
|
||||
cors_domains.insert(cors_values_vec.begin(), cors_values_vec.end());
|
||||
}
|
||||
|
||||
// Stores the given flag in this object's `enable_search_analytics` member.
void set_enable_search_analytics(bool enable_search_analytics) {
|
||||
// `this->` disambiguates the member from the identically named parameter.
this->enable_search_analytics = enable_search_analytics;
|
||||
}
|
||||
|
||||
// validation
|
||||
|
||||
Option<bool> is_valid() {
|
||||
|
@ -203,7 +203,7 @@ Option<bool> AnalyticsManager::remove_popular_queries_index(const std::string &n
|
||||
return Option<bool>(true);
|
||||
}
|
||||
|
||||
void AnalyticsManager::add_suggestion(const std::string &query_collection, std::string &query,
|
||||
void AnalyticsManager::add_suggestion(const std::string &query_collection, const std::string& query,
|
||||
const bool live_query, const std::string& user_id) {
|
||||
// look up suggestion collections for the query collection
|
||||
std::unique_lock lock(mutex);
|
||||
@ -212,7 +212,6 @@ void AnalyticsManager::add_suggestion(const std::string &query_collection, std::
|
||||
for(const auto& suggestion_collection: suggestion_collections_it->second) {
|
||||
const auto& popular_queries_it = popular_queries.find(suggestion_collection);
|
||||
if(popular_queries_it != popular_queries.end()) {
|
||||
Tokenizer::normalize_ascii(query);
|
||||
popular_queries_it->second->add(query, live_query, user_id);
|
||||
}
|
||||
}
|
||||
@ -235,6 +234,8 @@ void AnalyticsManager::run(ReplicationState* raft_server) {
|
||||
}
|
||||
|
||||
persist_suggestions(raft_server, prev_persistence_s);
|
||||
prev_persistence_s = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
std::chrono::system_clock::now().time_since_epoch()).count();
|
||||
|
||||
lk.unlock();
|
||||
}
|
||||
@ -270,8 +271,6 @@ void AnalyticsManager::persist_suggestions(ReplicationState *raft_server, uint64
|
||||
continue;
|
||||
}
|
||||
|
||||
prev_persistence_s = now_ts_seconds;
|
||||
|
||||
std::string import_payload;
|
||||
popularQueries->serialize_as_docs(import_payload);
|
||||
|
||||
|
@ -5396,9 +5396,13 @@ void Collection::hide_credential(nlohmann::json& json, const std::string& creden
|
||||
}
|
||||
|
||||
Option<bool> Collection::truncate_after_top_k(const string &field_name, size_t k) {
|
||||
std::shared_lock slock(mutex);
|
||||
|
||||
std::vector<uint32_t> seq_ids;
|
||||
auto op = index->seq_ids_outside_top_k(field_name, k, seq_ids);
|
||||
|
||||
slock.unlock();
|
||||
|
||||
if(!op.ok()) {
|
||||
return op;
|
||||
}
|
||||
|
@ -1269,7 +1269,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
|
||||
|
||||
if(Config::get_instance().get_enable_search_analytics()) {
|
||||
if(result.count("found") != 0 && result["found"].get<size_t>() != 0) {
|
||||
std::string analytics_query = raw_query;
|
||||
std::string analytics_query = Tokenizer::normalize_ascii_no_spaces(raw_query);
|
||||
AnalyticsManager::get_instance().add_suggestion(orig_coll_name, analytics_query,
|
||||
true, req_params["x-typesense-user-id"]);
|
||||
}
|
||||
|
@ -55,7 +55,7 @@ void facet_index_t::insert(const std::string& field_name, bool is_string,
|
||||
if(facet_index.has_value_index) {
|
||||
count_list.emplace_back(fvalue.facet_value, seq_ids.size(), facet_id);
|
||||
fis.facet_count_it = std::prev(count_list.end());
|
||||
fis.seq_ids = SET_COMPACT_IDS(compact_id_list_t::create(seq_ids.size(), seq_ids));
|
||||
fis.seq_ids = ids_t::create(seq_ids);
|
||||
}
|
||||
|
||||
fvalue_index.emplace(fvalue.facet_value, fis);
|
||||
|
@ -607,7 +607,7 @@ Option<bool> field::json_fields_to_fields(bool enable_nested_fields, nlohmann::j
|
||||
}
|
||||
|
||||
if(!the_fields.empty() && !the_fields.back().embed.empty()) {
|
||||
embed_json_field_indices.emplace_back(i, i);
|
||||
embed_json_field_indices.emplace_back(i, the_fields.size()-1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -375,6 +375,19 @@ size_t ids_t::intersect_count(void*& obj, const uint32_t* result_ids, size_t res
|
||||
}
|
||||
}
|
||||
|
||||
void* ids_t::create(const std::vector<uint32_t>& ids) {
|
||||
if(ids.size() < COMPACT_LIST_THRESHOLD_LENGTH) {
|
||||
return SET_COMPACT_IDS(compact_id_list_t::create(ids.size(), ids));
|
||||
} else {
|
||||
id_list_t* pl = new id_list_t(ids_t::MAX_BLOCK_ELEMENTS);
|
||||
for(auto id: ids) {
|
||||
pl->upsert(id);
|
||||
}
|
||||
|
||||
return pl;
|
||||
}
|
||||
}
|
||||
|
||||
void ids_t::block_intersector_t::split_lists(size_t concurrency,
|
||||
std::vector<std::vector<id_list_t::iterator_t>>& partial_its_vec) {
|
||||
const size_t num_blocks = this->id_lists[0]->num_blocks();
|
||||
|
@ -6342,21 +6342,16 @@ size_t Index::num_seq_ids() const {
|
||||
|
||||
Option<bool> Index::seq_ids_outside_top_k(const std::string& field_name, size_t k,
|
||||
std::vector<uint32_t>& outside_seq_ids) {
|
||||
if (numerical_index.count(field_name) != 0) {
|
||||
auto field_it = numerical_index.find(field_name);
|
||||
|
||||
if(field_it == sort_index.end()) {
|
||||
return Option<bool>(400, "Field not found in numerical index.");
|
||||
}
|
||||
|
||||
std::shared_lock lock(mutex);
|
||||
auto field_it = numerical_index.find(field_name);
|
||||
if(field_it != numerical_index.end()) {
|
||||
field_it->second->seq_ids_outside_top_k(k, outside_seq_ids);
|
||||
|
||||
return Option<bool>(true);
|
||||
}
|
||||
|
||||
if (range_index.count(field_name) != 0) {
|
||||
auto trie = range_index[field_name];
|
||||
trie->seq_ids_outside_top_k(k, outside_seq_ids);
|
||||
auto range_trie_it = range_index.find(field_name);
|
||||
if (range_trie_it != range_index.end()) {
|
||||
range_trie_it->second->seq_ids_outside_top_k(k, outside_seq_ids);
|
||||
return Option<bool>(true);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include <string_utils.h>
|
||||
#include "tokenizer.h"
|
||||
|
||||
Tokenizer::Tokenizer(const std::string& input, bool normalize, bool no_op, const std::string& locale,
|
||||
@ -349,10 +350,15 @@ bool Tokenizer::should_skip_char(char c) {
|
||||
return is_ascii_char(c) && get_stream_mode(c) != INDEX;
|
||||
}
|
||||
|
||||
void Tokenizer::normalize_ascii(std::string& text) {
|
||||
for(size_t i = 0; i < text.size(); i++) {
|
||||
std::string Tokenizer::normalize_ascii_no_spaces(const std::string& text) {
|
||||
std::string analytics_query = text;
|
||||
StringUtils::trim(analytics_query);
|
||||
|
||||
for(size_t i = 0; i < analytics_query.size(); i++) {
|
||||
if(is_ascii_char(text[i])) {
|
||||
text[i] = std::tolower(text[i]);
|
||||
analytics_query[i] = std::tolower(analytics_query[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return analytics_query;
|
||||
}
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
#include <collection_manager.h>
|
||||
#include <analytics_manager.h>
|
||||
#include "string_utils.h"
|
||||
#include "collection.h"
|
||||
|
||||
@ -24,6 +25,8 @@ protected:
|
||||
collectionManager.init(store, 1.0, "auth_key", quit);
|
||||
collectionManager.load(8, 1000);
|
||||
|
||||
AnalyticsManager::get_instance().init(store);
|
||||
|
||||
schema = R"({
|
||||
"name": "collection1",
|
||||
"enable_nested_fields": true,
|
||||
@ -587,6 +590,67 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) {
|
||||
collectionManager.drop_collection("coll1");
|
||||
}
|
||||
|
||||
// Runs two searches whose queries carry surrounding whitespace and checks that
// the queries captured by the popular-queries analytics rule are stored trimmed.
TEST_F(CollectionManagerTest, QuerySuggestionsShouldBeTrimmed) {
    std::vector<field> coll_fields = {field("title", field_types::STRING, false, false, true, "", -1, 1),
                                      field("year", field_types::INT32, false),
                                      field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, coll_fields, "points").get();

    // Index a single document so that the searches below can produce hits.
    nlohmann::json book;
    book["id"] = "0";
    book["title"] = "Tom Sawyer";
    book["year"] = 1876;
    book["points"] = 100;

    ASSERT_TRUE(coll1->add(book.dump()).ok());

    Config::get_instance().set_enable_search_analytics(true);

    // Rule that aggregates queries made on `coll1` into `top_queries`.
    nlohmann::json popular_queries_rule = R"({
"name": "top_search_queries",
"type": "popular_queries",
"params": {
"limit": 100,
"source": {
"collections": ["coll1"]
},
"destination": {
"collection": "top_queries"
}
}
})"_json;

    auto rule_op = AnalyticsManager::get_instance().create_rule(popular_queries_rule, false, true);
    ASSERT_TRUE(rule_op.ok());

    nlohmann::json embedded_params;
    std::map<std::string, std::string> req_params;
    req_params["collection"] = "coll1";
    req_params["q"] = " tom ";
    req_params["query_by"] = "title";

    std::string json_res;
    const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
    auto now_ts = std::chrono::duration_cast<std::chrono::microseconds>(since_epoch).count();

    // First search: query padded with whitespace on both sides.
    auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
    ASSERT_TRUE(search_op.ok());

    // Second search: query made of whitespace only.
    json_res.clear();
    req_params["q"] = " ";
    search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
    ASSERT_TRUE(search_op.ok());

    // check that suggestions have been trimmed
    auto queries_by_destination = AnalyticsManager::get_instance().get_popular_queries();
    const auto& top_queries = queries_by_destination["top_queries"];

    ASSERT_EQ(2, top_queries->get_user_prefix_queries()[""].size());
    ASSERT_EQ("tom", top_queries->get_user_prefix_queries()[""][0].query);
    ASSERT_EQ("", top_queries->get_user_prefix_queries()[""][1].query);

    collectionManager.drop_collection("coll1");
}
|
||||
|
||||
TEST_F(CollectionManagerTest, RestoreAutoSchemaDocsOnRestart) {
|
||||
Collection *coll1;
|
||||
|
||||
|
@ -1161,6 +1161,27 @@ TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) {
|
||||
"or make the embedding field optional.", add_op.error());
|
||||
}
|
||||
|
||||
// Creates a collection whose schema declares an explicit `id` field ahead of an
// auto-embedding field, then checks the resulting field count and the embedding
// dimension reported for the second returned field.
TEST_F(CollectionVectorTest, EmbeddingFieldWithIdFieldPrecedingInSchema) {
    nlohmann::json schema = R"({
"name": "objects",
"fields": [
{"name": "id", "type": "string"},
{"name": "name", "type": "string"},
{"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
]
})"_json;

    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(create_op.ok());

    Collection* objects_coll = create_op.get();
    auto coll_fields = objects_coll->get_fields();

    // Three fields are declared but only two are expected back
    // (presumably `id` is not kept as a regular field -- confirm).
    ASSERT_EQ(2, coll_fields.size());
    ASSERT_EQ(384, coll_fields[1].num_dim);
}
|
||||
|
||||
TEST_F(CollectionVectorTest, SkipEmbeddingOpWhenValueExists) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "objects",
|
||||
|
Loading…
x
Reference in New Issue
Block a user