diff --git a/.bazelversion b/.bazelversion new file mode 100644 index 00000000..7cbea073 --- /dev/null +++ b/.bazelversion @@ -0,0 +1 @@ +5.2.0 \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 280e69e9..af855337 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -45,6 +45,7 @@ jobs: workflow_conclusion: "" if_no_artifact_found: warn skip_unpack: true + branch: ${{ github.base_ref || github.head_ref || github.ref_name }} - name: Uncompress bazel cache run: | diff --git a/include/analytics_manager.h b/include/analytics_manager.h index 9d9e9fd3..08c44010 100644 --- a/include/analytics_manager.h +++ b/include/analytics_manager.h @@ -46,6 +46,11 @@ struct click_event_t { } }; +struct popular_clicks_t { + std::string counter_field; + std::map docid_counts; +}; + struct query_hits_count_t { std::string query; uint64_t timestamp; @@ -137,6 +142,9 @@ private: // suggestion collection => nohits queries std::unordered_map nohits_queries; + // collection => popular clicks + std::unordered_map popular_clicks; + //query collection => click events std::unordered_map> query_collection_click_events; @@ -163,6 +171,7 @@ public: static constexpr const char* QUERY_HITS_COUNT = "$QH"; static constexpr const char* POPULAR_QUERIES_TYPE = "popular_queries"; static constexpr const char* NOHITS_QUERIES_TYPE = "nohits_queries"; + static constexpr const char* POPULAR_CLICKS_TYPE = "popular_clicks"; static AnalyticsManager& get_instance() { static AnalyticsManager instance; @@ -191,6 +200,8 @@ public: void dispose(); + Store* get_analytics_store(); + void persist_query_events(ReplicationState *raft_server, uint64_t prev_persistence_s); std::unordered_map get_popular_queries(); @@ -200,8 +211,12 @@ public: void persist_query_hits_click_events(ReplicationState *raft_server, uint64_t prev_persistence_s); + void persist_popular_clicks(ReplicationState *raft_server, uint64_t prev_persistence_s); + nlohmann::json get_click_events(); + std::unordered_map get_popular_clicks(); + Option write_events_to_store(nlohmann::json& event_jsons); void add_nohits_query(const std::string& query_collection, diff --git a/include/collection.h b/include/collection.h index 844a1176..bb0f581a 100644 --- a/include/collection.h +++ b/include/collection.h @@ -399,6 +399,8 @@ public: std::vector get_fields(); + bool contains_field(const std::string&); + std::unordered_map get_dynamic_fields(); tsl::htrie_map get_schema(); diff --git a/include/store.h b/include/store.h index ae50d1ab..9cf7a1d3 100644 --- a/include/store.h +++ b/include/store.h @@ -84,7 +84,8 @@ public: Store(const std::string & state_dir_path, const size_t wal_ttl_secs = 24*60*60, - const size_t wal_size_mb = 1024, bool disable_wal = true): state_dir_path(state_dir_path) { + const size_t wal_size_mb = 1024, bool disable_wal = true, + const size_t db_compaction_interval = 604800): state_dir_path(state_dir_path) { // Optimize RocksDB options.IncreaseParallelism(); options.OptimizeLevelStyleCompaction(); @@ -94,7 +95,7 @@ public: options.max_write_buffer_number = 2; options.merge_operator.reset(new UInt64AddOperator); options.compression = rocksdb::CompressionType::kSnappyCompression; - options.periodic_compaction_seconds = 604800; + options.periodic_compaction_seconds = db_compaction_interval; options.max_log_file_size = 4*1048576; options.keep_log_file_num = 5; diff --git a/include/string_utils.h b/include/string_utils.h index 74cfc537..11f16967 100644 --- a/include/string_utils.h +++ b/include/string_utils.h @@ -335,4 +335,6 @@ struct StringUtils { static Option tokenize_filter_query(const std::string& filter_query, std::queue& tokens); static Option split_include_fields(const std::string& include_fields, std::vector& tokens); + + static size_t get_occurence_count(const std::string& str, char symbol); }; diff --git a/include/tsconfig.h b/include/tsconfig.h index 67a924e8..da12ffe0 100644 --- a/include/tsconfig.h +++ b/include/tsconfig.h @@ -72,6 +72,8 @@ private: uint32_t housekeeping_interval; + uint32_t db_compaction_interval; + protected: Config() { @@ -100,6 +102,7 @@ protected: this->enable_search_analytics = false; this->analytics_flush_interval = 3600; // in seconds this->housekeeping_interval = 1800; // in seconds + this->db_compaction_interval = 604800; // in seconds } Config(Config const&) { @@ -309,6 +312,10 @@ public: return this->housekeeping_interval; } + size_t get_db_compaction_interval() const { + return this->db_compaction_interval; + } + size_t get_thread_pool_size() const { return this->thread_pool_size; } @@ -449,6 +456,10 @@ public: this->housekeeping_interval = std::stoi(get_env("TYPESENSE_HOUSEKEEPING_INTERVAL")); } + if(!get_env("TYPESENSE_DB_COMPACTION_INTERVAL").empty()) { + this->db_compaction_interval = std::stoi(get_env("TYPESENSE_DB_COMPACTION_INTERVAL")); + } + if(!get_env("TYPESENSE_THREAD_POOL_SIZE").empty()) { this->thread_pool_size = std::stoi(get_env("TYPESENSE_THREAD_POOL_SIZE")); } @@ -620,6 +631,10 @@ public: this->housekeeping_interval = (int) reader.GetInteger("server", "housekeeping-interval", 1800); } + if(reader.Exists("server", "db-compaction-interval")) { + this->db_compaction_interval = (int) reader.GetInteger("server", "db-compaction-interval", 1800); + } + if(reader.Exists("server", "thread-pool-size")) { this->thread_pool_size = (int) reader.GetInteger("server", "thread-pool-size", 0); } @@ -782,6 +797,10 @@ public: this->housekeeping_interval = options.get("housekeeping-interval"); } + if(options.exist("db-compaction-interval")) { + this->db_compaction_interval = options.get("db-compaction-interval"); + } + if(options.exist("thread-pool-size")) { this->thread_pool_size = options.get("thread-pool-size"); } diff --git a/src/analytics_manager.cpp b/src/analytics_manager.cpp index 74c3b1df..6db4978b 100644 --- a/src/analytics_manager.cpp +++ b/src/analytics_manager.cpp @@ -43,7 +43,8 @@ Option AnalyticsManager::create_rule(nlohmann::json& payload, bool upsert, return Option(400, "Bad or missing params."); } - if(payload["type"] == POPULAR_QUERIES_TYPE || payload["type"] == NOHITS_QUERIES_TYPE) { + if(payload["type"] == POPULAR_QUERIES_TYPE || payload["type"] == NOHITS_QUERIES_TYPE + || payload["type"] == POPULAR_CLICKS_TYPE) { return create_queries_index(payload, upsert, write_to_disk); } @@ -84,6 +85,15 @@ Option AnalyticsManager::create_queries_index(nlohmann::json &payload, boo return Option(400, "Must contain a valid destination collection."); } + std::string counter_field; + + if(params["destination"].contains("counter_field")) { + if (!params["destination"]["counter_field"].is_string()) { + return Option(400, "Must contain a valid counter_field."); + } + counter_field = params["destination"]["counter_field"].get(); + } + const std::string& suggestion_collection = params["destination"]["collection"].get(); suggestion_config_t suggestion_config; suggestion_config.name = suggestion_config_name; @@ -98,6 +108,19 @@ Option AnalyticsManager::create_queries_index(nlohmann::json &payload, boo if (!upsert && nohits_queries.count(suggestion_collection) != 0) { return Option(400, "There's already another configuration for this destination collection."); } + } else if(payload["type"] == POPULAR_CLICKS_TYPE) { + if (!upsert && popular_clicks.count(suggestion_collection) != 0) { + return Option(400, "There's already another configuration for this destination collection."); + } + + auto coll = CollectionManager::get_instance().get_collection(suggestion_collection).get(); + if(coll != nullptr) { + if (!coll->contains_field(counter_field)) { + return Option(404, "counter_field `" + counter_field + "` not found in destination collection."); + } + } else { + return Option(404, "Collection `" + suggestion_collection + "` not found."); + } } for(const auto& coll: params["source"]["collections"]) { @@ -131,6 +154,8 @@ Option AnalyticsManager::create_queries_index(nlohmann::json &payload, boo } else if(payload["type"] == NOHITS_QUERIES_TYPE) { QueryAnalytics *noresultsQueries = new QueryAnalytics(limit); nohits_queries.emplace(suggestion_collection, noresultsQueries); + } else if(payload["type"] == POPULAR_CLICKS_TYPE) { + popular_clicks.emplace(suggestion_collection, popular_clicks_t{counter_field, {}}); } if(write_to_disk) { @@ -278,6 +303,13 @@ Option AnalyticsManager::add_click_event(const std::string &query_collecti click_event_t click_event(query, now_ts_useconds, user_id, doc_id, position); click_events_vec.emplace_back(click_event); + auto popular_clicks_it = popular_clicks.find(query_collection); + if(popular_clicks_it != popular_clicks.end()) { + popular_clicks_it->second.docid_counts[doc_id]++; + } else { + LOG(ERROR) << "collection " << query_collection << " not found in analytics rule."; + } + return Option(true); } @@ -346,6 +378,7 @@ void AnalyticsManager::run(ReplicationState* raft_server) { checkEventsExpiry(); persist_query_events(raft_server, prev_persistence_s); persist_query_hits_click_events(raft_server, prev_persistence_s); + persist_popular_clicks(raft_server, prev_persistence_s); prev_persistence_s = std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch()).count(); @@ -508,6 +541,37 @@ void AnalyticsManager::persist_query_hits_click_events(ReplicationState *raft_se } } +void AnalyticsManager::persist_popular_clicks(ReplicationState *raft_server, uint64_t prev_persistence_s) { + auto send_http_response = [&](const std::string& import_payload, const std::string& collection) { + std::string leader_url = raft_server->get_leader_url(); + if (!leader_url.empty()) { + const std::string &base_url = leader_url + "collections/" + collection; + std::string res; + + const std::string &update_url = base_url + "/documents/import?action=update"; + std::map res_headers; + long status_code = HttpClient::post_response(update_url, import_payload, + res, res_headers, {}, 10 * 1000, true); + + if (status_code != 200) { + LOG(ERROR) << "Error while sending popular_clicks events to leader. " + << "Status code: " << status_code << ", response: " << res; + } + } + }; + + for(const auto& popular_clicks_it : popular_clicks) { + auto coll = popular_clicks_it.first; + nlohmann::json doc; + auto counter_field = popular_clicks_it.second.counter_field; + for(const auto& popular_click : popular_clicks_it.second.docid_counts) { + doc["id"] = popular_click.first; + doc[counter_field] = popular_click.second; + send_http_response(doc.dump(), coll); + } + } +} + void AnalyticsManager::stop() { quit = true; cv.notify_all(); @@ -534,6 +598,10 @@ void AnalyticsManager::init(Store* store, Store* analytics_store) { this->analytics_store = analytics_store; } +Store* AnalyticsManager::get_analytics_store() { + return this->analytics_store; +} + std::unordered_map AnalyticsManager::get_popular_queries() { std::unique_lock lk(mutex); return popular_queries; @@ -544,6 +612,11 @@ std::unordered_map AnalyticsManager::get_nohits_qu return nohits_queries; } +std::unordered_map AnalyticsManager::get_popular_clicks() { + std::unique_lock lk(mutex); + return popular_clicks; +} + nlohmann::json AnalyticsManager::get_click_events() { std::unique_lock lk(mutex); std::vector click_event_jsons; diff --git a/src/collection.cpp b/src/collection.cpp index df98dae0..e2564a40 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2155,6 +2155,10 @@ Option Collection::search(std::string raw_query, parse_search_query(query, q_include_tokens, field_query_tokens[0].q_exclude_tokens, field_query_tokens[0].q_phrases, "", false, stopwords_set); + + process_filter_overrides(filter_overrides, q_include_tokens, token_order, filter_tree_root, + included_ids, excluded_ids, override_metadata); + for(size_t i = 0; i < q_include_tokens.size(); i++) { auto& q_include_token = q_include_tokens[i]; field_query_tokens[0].q_include_tokens.emplace_back(i, q_include_token, (i == q_include_tokens.size() - 1), @@ -3328,7 +3332,7 @@ void Collection::parse_search_query(const std::string &query, std::vector Collection::get_fields() { return fields; } +bool Collection::contains_field(const std::string &field) { + std::shared_lock lock(mutex); + return search_schema.find(field) != search_schema.end(); +} + std::unordered_map Collection::get_dynamic_fields() { std::shared_lock lock(mutex); return dynamic_fields; @@ -5841,7 +5850,7 @@ bool Collection::get_enable_nested_fields() { Option Collection::parse_facet(const std::string& facet_field, std::vector& facets) const { const std::regex base_pattern(".+\\(.*\\)"); - const std::regex range_pattern("[[a-z A-Z]+:\\[([+-]?([0-9]*[.])?[0-9]*)\\,\\s*([+-]?([0-9]*[.])?[0-9]*)\\]"); + const std::regex range_pattern("[[0-9]*[a-z A-Z]+[0-9]*:\\[([+-]?([0-9]*[.])?[0-9]*)\\,\\s*([+-]?([0-9]*[.])?[0-9]*)\\]"); const std::string _alpha = "_alpha"; if ((facet_field.find(":") != std::string::npos) diff --git a/src/core_api.cpp b/src/core_api.cpp index 6383819b..ffa7ba6a 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -2791,10 +2791,71 @@ bool put_conversation_model(const std::shared_ptr& req, const std::sha res->set_200(model.dump()); return true; } -bool get_click_events(const std::shared_ptr& req, const std::shared_ptr& res) { - auto click_events = AnalyticsManager::get_instance().get_click_events(); - res->set_200(click_events.dump()); +bool get_click_events(const std::shared_ptr& req, const std::shared_ptr& res) { + auto analytics_store = AnalyticsManager::get_instance().get_analytics_store(); + if (!analytics_store) { + LOG(ERROR) << "Analytics store not initialized."; + return true; + } + + export_state_t *export_state = nullptr; + auto click_event_prefix = std::string(AnalyticsManager::CLICK_EVENT) + "_"; + if (req->data == nullptr) { + export_state = new export_state_t(); + req->data = export_state; + + export_state->iter_upper_bound_key = std::string(AnalyticsManager::CLICK_EVENT) + "`"; + export_state->iter_upper_bound = new rocksdb::Slice(export_state->iter_upper_bound_key); + export_state->it = analytics_store->scan(click_event_prefix, export_state->iter_upper_bound); + } else { + export_state = dynamic_cast(req->data); + } + + if (export_state->it != nullptr) { + rocksdb::Iterator *it = export_state->it; + size_t batch_counter = 0; + std::string().swap(res->body); + + if (!it->Valid()) { + LOG(ERROR) << "No click events found in db."; + req->last_chunk_aggregate = true; + res->final = true; + res->set_404(); + stream_response(req, res); + return false; + } + + while (it->Valid() && it->key().ToString().compare(0, click_event_prefix.size(), click_event_prefix) == 0) { + res->body += it->value().ToString(); + it->Next(); + + // append a new line character if there is going to be one more record to send + if (it->Valid() && + it->key().ToString().compare(0, click_event_prefix.size(), click_event_prefix) == 0) { + res->body += "\n"; + req->last_chunk_aggregate = false; + res->final = false; + } else { + req->last_chunk_aggregate = true; + res->final = true; + } + + batch_counter++; + if (batch_counter == export_state->export_batch_size) { + break; + } + } + } else { + req->last_chunk_aggregate = true; + res->final = true; + } + + res->content_type_header = "text/plain; charset=utf-8"; + res->status_code = 200; + + stream_response(req, res); + return true; } diff --git a/src/embedder_manager.cpp b/src/embedder_manager.cpp index 3d6137c6..f61661ba 100644 --- a/src/embedder_manager.cpp +++ b/src/embedder_manager.cpp @@ -15,7 +15,13 @@ Option EmbedderManager::validate_and_init_model(const nlohmann::json& mode return validate_and_init_remote_model(model_config, num_dims); } else { LOG(INFO) << "Validating and initializing local model: " << model_name; - return validate_and_init_local_model(model_config, num_dims); + auto op = validate_and_init_local_model(model_config, num_dims); + if(op.ok()) { + LOG(INFO) << "Finished initializing local model: " << model_name; + } else { + LOG(ERROR) << "Failed to initialize local model " << model_name << ", error: " << op.error(); + } + return op; } } diff --git a/src/index.cpp b/src/index.cpp index 74b35cac..0287f443 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -786,8 +786,10 @@ void Index::index_field_in_memory(const field& afield, std::vector } else if(afield.type == field_types::FLOAT) { float raw_val = document[afield.name].get(); - auto fhash = reinterpret_cast(raw_val); - facet_value_id_t facet_value_id(StringUtils::float_to_str(raw_val), fhash); + const std::string& float_str_val = StringUtils::float_to_str(raw_val); + float normalized_raw_val = std::stof(float_str_val); + auto fhash = reinterpret_cast(normalized_raw_val); + facet_value_id_t facet_value_id(float_str_val, fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } @@ -1387,7 +1389,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, if(a_facet.get_range(std::stoll(doc_val), range_pair)) { const auto& range_id = range_pair.first; facet_count_t& facet_count = a_facet.result_map[range_id]; - facet_count.count = kv.second.count; + facet_count.count += kv.second.count; } } else { facet_count_t& facet_count = a_facet.value_result_map[kv.first]; diff --git a/src/main/typesense_server.cpp b/src/main/typesense_server.cpp index f4993434..a52db200 100644 --- a/src/main/typesense_server.cpp +++ b/src/main/typesense_server.cpp @@ -82,7 +82,7 @@ void master_server_routes() { server->post("/analytics/events", post_create_event); //collection based query click events - server->get("/analytics/click_events", get_click_events); + server->get("/analytics/click_events", get_click_events, false, true); server->post("/analytics/click_events", post_create_event); server->post("/analytics/click_events/replicate", post_replicate_events); server->get("/analytics/query_hits_counts", get_query_hits_counts); diff --git a/src/string_utils.cpp b/src/string_utils.cpp index a9296cd5..109c5d09 100644 --- a/src/string_utils.cpp +++ b/src/string_utils.cpp @@ -594,3 +594,7 @@ size_t StringUtils::split_facet(const std::string &s, std::vector & std::wstring_convert, char32_t> utf8conv; return utf8conv.from_bytes(bytes).size(); }*/ + +size_t StringUtils::get_occurence_count(const std::string &str, char symbol) { + return std::count(str.begin(), str.end(), symbol); +} \ No newline at end of file diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index db927485..dcc8f136 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -109,6 +109,7 @@ void init_cmdline_options(cmdline::parser & options, int argc, char **argv) { options.add("cache-num-entries", '\0', "Number of entries to cache.", false, 1000); options.add("analytics-flush-interval", '\0', "Frequency of persisting analytics data to disk (in seconds).", false, 3600); options.add("housekeeping-interval", '\0', "Frequency of housekeeping background job (in seconds).", false, 1800); + options.add("db-compaction-interval", '\0', "Frequency of RocksDB compaction (in seconds).", false, 604800); // DEPRECATED options.add("listen-address", 'h', "[DEPRECATED: use `api-address`] Address to which Typesense API service binds.", false, "0.0.0.0"); @@ -394,7 +395,7 @@ int run_server(const Config & config, const std::string & version, void (*master ThreadPool replication_thread_pool(num_threads); // primary DB used for storing the documents: we will not use WAL since Raft provides that - Store store(db_dir); + Store store(db_dir, 24*60*60, 1024, true, config.get_db_compaction_interval()); // meta DB for storing house keeping things Store meta_store(meta_dir, 24*60*60, 1024, false); @@ -402,7 +403,7 @@ int run_server(const Config & config, const std::string & version, void (*master //analytics DB for storing query click events std::unique_ptr analytics_store = nullptr; if(!analytics_dir.empty()) { - analytics_store.reset(new Store(analytics_dir, 24 * 60 * 60, 1024, false)); + analytics_store.reset(new Store(analytics_dir, 24 * 60 * 60, 1024, true, config.get_db_compaction_interval())); } curl_global_init(CURL_GLOBAL_SSL); diff --git a/test/analytics_manager_test.cpp b/test/analytics_manager_test.cpp index 641da345..750cbbf8 100644 --- a/test/analytics_manager_test.cpp +++ b/test/analytics_manager_test.cpp @@ -767,4 +767,178 @@ TEST_F(AnalyticsManagerTest, EventsExpiryPartial) { ASSERT_EQ("management", resp[0]["q"]); ASSERT_EQ(13, resp[0]["user_id"]); ASSERT_EQ(834, resp[0]["hits_count"]); +} + +TEST_F(AnalyticsManagerTest, PopularityScore) { + //reset click event rate limit + analyticsManager.resetRateLimit(); + + nlohmann::json products_schema = R"({ + "name": "products", + "fields": [ + {"name": "title", "type": "string"}, + {"name": "popularity", "type": "int32"} + ] + })"_json; + + Collection* products_coll = collectionManager.create_collection(products_schema).get(); + + nlohmann::json doc; + doc["popularity"] = 0; + + doc["id"] = "0"; + doc["title"] = "Cool trousers"; + ASSERT_TRUE(products_coll->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["title"] = "Funky trousers"; + ASSERT_TRUE(products_coll->add(doc.dump()).ok()); + + doc["id"] = "2"; + doc["title"] = "Casual shorts"; + ASSERT_TRUE(products_coll->add(doc.dump()).ok()); + + doc["id"] = "3"; + doc["title"] = "Trendy shorts"; + ASSERT_TRUE(products_coll->add(doc.dump()).ok()); + + doc["id"] = "4"; + doc["title"] = "Formal pants"; + ASSERT_TRUE(products_coll->add(doc.dump()).ok()); + + nlohmann::json analytics_rule = R"({ + "name": "product_popularity", + "type": "popular_clicks", + "params": { + "source": { + "collections": ["products"] + }, + "destination": { + "collection": "products", + "counter_field": "popularity" + } + } + })"_json; + + auto create_op = analyticsManager.create_rule(analytics_rule, false, true); + ASSERT_TRUE(create_op.ok()); + + std::shared_ptr req = std::make_shared(); + std::shared_ptr res = std::make_shared(nullptr); + + nlohmann::json event1 = R"({ + "type": "query_click", + "data": { + "q": "trousers", + "collection": "products", + "doc_id": "1", + "position": 2, + "user_id": "13" + } + })"_json; + + req->body = event1.dump(); + ASSERT_TRUE(post_create_event(req, res)); + + nlohmann::json event2 = R"({ + "type": "query_click", + "data": { + "q": "shorts", + "collection": "products", + "doc_id": "3", + "position": 4, + "user_id": "11" + } + })"_json; + + req->body = event2.dump(); + ASSERT_TRUE(post_create_event(req, res)); + + ASSERT_TRUE(post_create_event(req, res)); + + auto popular_clicks = analyticsManager.get_popular_clicks(); + ASSERT_EQ(1, popular_clicks.size()); + ASSERT_EQ("popularity", popular_clicks["products"].counter_field); + ASSERT_EQ(2, popular_clicks["products"].docid_counts.size()); + ASSERT_EQ(1, popular_clicks["products"].docid_counts["1"]); + ASSERT_EQ(2, popular_clicks["products"].docid_counts["3"]); + + //trigger persistance event + for(const auto& popular_clicks_it : popular_clicks) { + auto coll = popular_clicks_it.first; + nlohmann::json doc; + auto counter_field = popular_clicks_it.second.counter_field; + req->params["collection"] = "products"; + req->params["action"] = "update"; + for(const auto& popular_click : popular_clicks_it.second.docid_counts) { + doc["id"] = popular_click.first; + doc[counter_field] = popular_click.second; + req->body = doc.dump(); + post_import_documents(req, res); + } + } + + sort_fields = {sort_by("popularity", "DESC")}; + auto results = products_coll->search("*", {}, "", {}, + sort_fields, {0}, 10, 1, FREQUENCY,{false}, + Index::DROP_TOKENS_THRESHOLD,spp::sparse_hash_set(), + spp::sparse_hash_set()).get(); + + ASSERT_EQ(5, results["hits"].size()); + + ASSERT_EQ("3", results["hits"][0]["document"]["id"]); + ASSERT_EQ(2, results["hits"][0]["document"]["popularity"]); + ASSERT_EQ("Trendy shorts", results["hits"][0]["document"]["title"]); + + ASSERT_EQ("1", results["hits"][1]["document"]["id"]); + ASSERT_EQ(1, results["hits"][1]["document"]["popularity"]); + ASSERT_EQ("Funky trousers", results["hits"][1]["document"]["title"]); +} + +TEST_F(AnalyticsManagerTest, PopularityScoreValidation) { + nlohmann::json products_schema = R"({ + "name": "books", + "fields": [ + {"name": "title", "type": "string"}, + {"name": "popularity", "type": "int32"} + ] + })"_json; + + Collection* products_coll = collectionManager.create_collection(products_schema).get(); + + nlohmann::json analytics_rule = R"({ + "name": "books_popularity", + "type": "popular_clicks", + "params": { + "source": { + "collections": ["books"] + }, + "destination": { + "collection": "popular_books", + "counter_field": "popularity" + } + } + })"_json; + + auto create_op = analyticsManager.create_rule(analytics_rule, false, true); + ASSERT_FALSE(create_op.ok()); + ASSERT_EQ("Collection `popular_books` not found.", create_op.error()); + + analytics_rule = R"({ + "name": "books_popularity", + "type": "popular_clicks", + "params": { + "source": { + "collections": ["books"] + }, + "destination": { + "collection": "books", + "counter_field": "popularity_score" + } + } + })"_json; + + create_op = analyticsManager.create_rule(analytics_rule, false, true); + ASSERT_FALSE(create_op.ok()); + ASSERT_EQ("counter_field `popularity_score` not found in destination collection.", create_op.error()); } \ No newline at end of file diff --git a/test/collection_all_fields_test.cpp b/test/collection_all_fields_test.cpp index ae9f0d98..fab4adb3 100644 --- a/test/collection_all_fields_test.cpp +++ b/test/collection_all_fields_test.cpp @@ -1591,7 +1591,7 @@ TEST_F(CollectionAllFieldsTest, FieldNameMatchingRegexpShouldNotBeIndexedInNonAu } TEST_F(CollectionAllFieldsTest, EmbedFromFieldJSONInvalidField) { - EmbedderManager::set_model_dir("/tmp/typensense_test/models"); + EmbedderManager::set_model_dir("/tmp/typesense_test/models"); nlohmann::json field_json; field_json["name"] = "embedding"; field_json["type"] = "float[]"; diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index 6fc020f7..7edbb3d2 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -2980,3 +2980,49 @@ TEST_F(CollectionFacetingTest, RangeFacetTestWithGroupBy) { collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionFacetingTest, RangeFacetAlphanumericLabels) { + std::vector fields = {field("monuments", field_types::STRING, false), + field("year", field_types::INT32, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", + {},{}).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["monuments"] = "Statue Of Unity"; + doc["year"] = 2018; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["monuments"] = "Taj Mahal"; + doc["year"] = 1653; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "2"; + doc["monuments"] = "Mysore Palace"; + doc["year"] = 1897; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "3"; + doc["monuments"] = "Chennakesava Temple"; + doc["year"] = 1117; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"year(10thAD:[1000,1500], 15thAD:[1500,2000], 20thAD:[2000, ])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true).get(); + + ASSERT_EQ(3, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(2, results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("15thAD", results["facet_counts"][0]["counts"][0]["value"]); + ASSERT_EQ(1, results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_EQ("20thAD", results["facet_counts"][0]["counts"][1]["value"]); + ASSERT_EQ(1, results["facet_counts"][0]["counts"][2]["count"]); + ASSERT_EQ("10thAD", results["facet_counts"][0]["counts"][2]["value"]); +} \ No newline at end of file diff --git a/test/collection_optimized_faceting_test.cpp b/test/collection_optimized_faceting_test.cpp index 34199464..57c19a3b 100644 --- a/test/collection_optimized_faceting_test.cpp +++ b/test/collection_optimized_faceting_test.cpp @@ -2648,3 +2648,50 @@ TEST_F(CollectionOptimizedFacetingTest, StringFacetsCountListRemoveTest) { ASSERT_EQ("The Shawshank Redemption", results["facet_counts"][0]["counts"][0]["value"]); ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"]); } + +TEST_F(CollectionOptimizedFacetingTest, RangeFacetAlphanumericLabels) { + std::vector fields = {field("monuments", field_types::STRING, false), + field("year", field_types::INT32, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", + {},{}).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["monuments"] = "Statue Of Unity"; + doc["year"] = 2018; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["monuments"] = "Taj Mahal"; + doc["year"] = 1653; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "2"; + doc["monuments"] = "Mysore Palace"; + doc["year"] = 1897; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "3"; + doc["monuments"] = "Chennakesava Temple"; + doc["year"] = 1117; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"year(10thAD:[1000,1500], 15thAD:[1500,2000], 20thAD:[2000, ])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000*1000, 4, 7, fallback, 4, {off}, INT16_MAX, INT16_MAX, + 2, 2, false, "", true, 0, max_score, 100, 0, 0, VALUE).get(); + + ASSERT_EQ(3, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(2, results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("15thAD", results["facet_counts"][0]["counts"][0]["value"]); + ASSERT_EQ(1, results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_EQ("20thAD", results["facet_counts"][0]["counts"][1]["value"]); + ASSERT_EQ(1, results["facet_counts"][0]["counts"][2]["count"]); + ASSERT_EQ("10thAD", results["facet_counts"][0]["counts"][2]["value"]); +} \ No newline at end of file diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index ebceed38..6ea244b4 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -3728,7 +3728,7 @@ TEST_F(CollectionOverrideTest, WildcardTagRuleThatMatchesAllQueries) { // includes instead of filter_by coll1->remove_override("ov-1"); - override_json1 = R"({ + auto override_json2 = R"({ "id": "ov-1", "rule": { "tags": ["*"] @@ -3738,9 +3738,10 @@ TEST_F(CollectionOverrideTest, WildcardTagRuleThatMatchesAllQueries) { ] })"_json; - op = override_t::parse(override_json1, "ov-1", override1); + override_t override2; + op = override_t::parse(override_json2, "ov-2", override2); ASSERT_TRUE(op.ok()); - coll1->add_override(override1); + coll1->add_override(override2); results = coll1->search("foobar", {"name"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY, @@ -3907,3 +3908,98 @@ TEST_F(CollectionOverrideTest, MetadataValidation) { collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionOverrideTest, WildcardSearchOverride) { + Collection* coll1; + + std::vector fields = {field("name", field_types::STRING, false), + field("category", field_types::STRING, true),}; + + coll1 = collectionManager.get_collection("coll1").get(); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 1, fields, "").get(); + } + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["name"] = "queryA"; + doc1["category"] = "kids"; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["name"] = "queryA"; + doc2["category"] = "kitchen"; + + nlohmann::json doc3; + doc3["id"] = "2"; + doc3["name"] = "Clay Toy"; + doc3["category"] = "home"; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + ASSERT_TRUE(coll1->add(doc3.dump()).ok()); + + std::vector sort_fields = {sort_by("_text_match", "DESC")}; + + nlohmann::json override_json1 = R"({ + "id": "ov-1", + "rule": { + "query": "*", + "match": "exact" + }, + "filter_by": "category: kids" + })"_json; + + override_t override1; + auto op = override_t::parse(override_json1, "ov-1", override1); + ASSERT_TRUE(op.ok()); + coll1->add_override(override1); + + std::string override_tags = ""; + auto results = coll1->search("*", {}, "", + {}, sort_fields, {2}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 10000, + 4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0, + 0, HASH, 30000, 2, "", {}, {}, "right_to_left", + true, true, false, -1, "", override_tags).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + + // includes instead of filter_by + coll1->remove_override("ov-1"); + + override_t override2; + auto override_json2 = R"({ + "id": "ov-2", + "rule": { + "query": "*", + "match": "exact" + }, + "includes": [ + {"id": "1", "position": 1} + ] + })"_json; + + op = override_t::parse(override_json2, "ov-2", override2); + ASSERT_TRUE(op.ok()); + coll1->add_override(override2); + + results = coll1->search("*", {}, "", + {}, sort_fields, {2}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 10000, + 4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0, + 0, HASH, 30000, 2, "", {}, {}, "right_to_left", + true, true, false, -1, "", override_tags).get(); + + ASSERT_EQ(3, results["hits"].size()); + ASSERT_EQ("1", results["hits"][0]["document"]["id"].get()); + + collectionManager.drop_collection("coll1"); +} diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp index deb6ad4f..1dee1d2e 100644 --- a/test/collection_sorting_test.cpp +++ b/test/collection_sorting_test.cpp @@ -1988,7 +1988,7 @@ TEST_F(CollectionSortingTest, OptionalFilteringViaSortingWildcard) { "name": "coll1", "fields": [ {"name": "title", "type": "string" }, - {"name": "brand", "type": "string" }, + {"name": "brand", "type": "string", "infix": true }, {"name": "points", "type": "int32" } ] } @@ -2082,6 +2082,20 @@ TEST_F(CollectionSortingTest, OptionalFilteringViaSortingWildcard) { ASSERT_EQ(expected_ids[i], results["hits"][i]["document"]["id"].get()); } + // Score associated with the first match is assigned to the document. + sort_fields = { + sort_by({"brand:nike", "brand:adidas", "points: 1"}, {3, 2, 5}, "DESC"), + sort_by("points", "DESC"), + }; + + results = coll1->search("*", {"title"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY, {true}, 10).get(); + ASSERT_EQ(6, results["hits"].size()); + + expected_ids = {"3", "0", "4", "2", "1", "5"}; + for(size_t i = 0; i < expected_ids.size(); i++) { + ASSERT_EQ(expected_ids[i], results["hits"][i]["document"]["id"].get()); + } + // bad syntax for eval query sort_fields = { sort_by({"brandnike || points:0"}, {1}, "DESC"), @@ -2107,6 +2121,25 @@ TEST_F(CollectionSortingTest, OptionalFilteringViaSortingWildcard) { ASSERT_FALSE(search_op.ok()); ASSERT_EQ("The eval expression in sort_by is empty.", search_op.error()); + req_params = { + {"collection", "coll1"}, + {"q", "a"}, + {"query_by", "brand"}, + {"sort_by", "_eval(brand:puma):desc, _text_match:desc"}, + {"infix", "always"} + }; + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + results = nlohmann::json::parse(json_res); + + ASSERT_EQ(4, results["hits"].size()); // 3 Adidas 1 Puma documents + // Because of `_eval`, Puma document will be on top even when having a lower text match score than Adidas documents. + expected_ids = {"5", "4", "2", "1"}; + for(size_t i = 0; i < expected_ids.size(); i++) { + ASSERT_EQ(expected_ids[i], results["hits"][i]["document"]["id"].get()); + } + // more bad syntax! sort_fields = { sort_by(")", "DESC"), diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 51b5a58c..05361729 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -2179,6 +2179,10 @@ TEST_F(CollectionSpecificMoreTest, PhraseMatchAcrossArrayElements) { auto res = coll1->search(R"("state of the art)", {"texts"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set()).get(); + ASSERT_EQ(1, res["hits"].size()); + + res = coll1->search(R"("state of the art")", {"texts"}, "", {}, {}, {0}, 10, 1, + FREQUENCY, {true}, 10, spp::sparse_hash_set()).get(); ASSERT_EQ(0, res["hits"].size()); } diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index 38764660..388eb883 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -2996,3 +2996,59 @@ TEST_F(CollectionSpecificTest, DontHighlightPunctuation) { collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionSpecificTest, ExactMatchWithoutClosingSymbol) { + std::vector fields = {field("title", field_types::STRING, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + std::vector> records = { + {"Hampi"}, + {"Mahabalipuram"}, + {"Taj Mahal"}, + {"Mysore Palace"} + }; + + for(size_t i=0; iadd(doc.dump()).ok()); + } + + std::map req_params = { + {"collection", "coll1"}, + {"q", "\"Hamp"}, + {"query_by", "title"}, + }; + nlohmann::json embedded_params; + std::string json_res; + auto now_ts = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + + auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + + nlohmann::json result = nlohmann::json::parse(json_res); + ASSERT_EQ(1, result["hits"].size()); + ASSERT_EQ("0", result["hits"][0]["document"]["id"]); + ASSERT_EQ("Hampi", result["hits"][0]["document"]["title"]); + + req_params = { + {"collection", "coll1"}, + {"q", "\"Mah"}, + {"query_by", "title"}, + }; + now_ts = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + + result = nlohmann::json::parse(json_res); + ASSERT_EQ(2, result["hits"].size()); + ASSERT_EQ("2", result["hits"][0]["document"]["id"]); + ASSERT_EQ("Taj Mahal", result["hits"][0]["document"]["title"]); + ASSERT_EQ("1", result["hits"][1]["document"]["id"]); + ASSERT_EQ("Mahabalipuram", result["hits"][1]["document"]["title"]); +} \ No newline at end of file diff --git a/test/core_api_utils_test.cpp b/test/core_api_utils_test.cpp index d2734ec3..cdb39a01 100644 --- a/test/core_api_utils_test.cpp +++ b/test/core_api_utils_test.cpp @@ -7,27 +7,33 @@ #include "raft_server.h" #include "conversation_model_manager.h" #include "conversation_manager.h" +#include class CoreAPIUtilsTest : public ::testing::Test { protected: - Store *store; + Store *store, *analytics_store; CollectionManager & collectionManager = CollectionManager::get_instance(); std::atomic quit = false; std::vector query_fields; std::vector sort_fields; + AnalyticsManager& analyticsManager = AnalyticsManager::get_instance(); + void setupCollection() { std::string state_dir_path = "/tmp/typesense_test/core_api_utils"; + std::string analytics_db_path = "/tmp/typesense_test/analytics_db2"; LOG(INFO) << "Truncating and creating: " << state_dir_path; system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); store = new Store(state_dir_path); + analytics_store = new Store(analytics_db_path); collectionManager.init(store, 1.0, "auth_key", quit); collectionManager.load(8, 1000); ConversationModelManager::init(store); ConversationManager::get_instance().init(store); + analyticsManager.init(store, analytics_store); } virtual void SetUp() { @@ -37,6 +43,7 @@ protected: virtual void TearDown() { collectionManager.dispose(); delete store; + delete analytics_store; } }; @@ -1495,4 +1502,119 @@ TEST_F(CoreAPIUtilsTest, TestInvalidConversationModels) { ASSERT_EQ(400, resp->status_code); ASSERT_EQ("Property `model_name` is not provided or not a string.", nlohmann::json::parse(resp->body)["message"]); +} + +TEST_F(CoreAPIUtilsTest, GetClickEvents) { + //reset analytics store + analyticsManager.resetRateLimit(); + analyticsManager.resetAnalyticsStore(); + + nlohmann::json schema = R"({ + "name": "titles", + "fields": [ + {"name": "name", "type": "string" }, + {"name": "points", "type": "int32" } + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* titles = op.get(); + + std::shared_ptr req = std::make_shared(); + std::shared_ptr res = std::make_shared(nullptr); + + // no events in db + get_click_events(req, res); + ASSERT_EQ("{\"message\": \"Not Found\"}", res->body); + + //add some events + nlohmann::json event1 = R"({ + "type": "query_click", + "data": { + "q": "technology", + "collection": "titles", + "doc_id": "21", + "position": 2, + "user_id": "13" + } + })"_json; + + req->body = event1.dump(); + ASSERT_TRUE(post_create_event(req, res)); + + nlohmann::json event2 = R"({ + "type": "query_click", + "data": { + "q": "technology", + "collection": "titles", + "doc_id": "12", + "position": 1, + "user_id": "13" + } + })"_json; + req->body = event2.dump(); + ASSERT_TRUE(post_create_event(req, res)); + + nlohmann::json event3 = R"({ + "type": "query_click", + "data": { + "q": "technology", + "collection": "titles", + "doc_id": "52", + "position": 5, + "user_id": "13" + } + })"_json; + req->body = event3.dump(); + ASSERT_TRUE(post_create_event(req, res)); + + event1["collection_id"] = "0"; + event1["timestamp"] = 1521512521; + event1["event_type"] = "click_events"; + event2["collection_id"] = "0"; + event2["timestamp"] = 1521514354; + event2["event_type"] = "click_events"; + event3["collection_id"] = "0"; + event3["timestamp"] = 1521515382; + event3["event_type"] = "click_events"; + + + nlohmann::json click_events = nlohmann::json::array(); + click_events.push_back(event1); + click_events.push_back(event2); + click_events.push_back(event3); + + req->body = click_events.dump(); + ASSERT_TRUE(post_replicate_events(req, res)); + + //get click events + req->data = nullptr; + get_click_events(req, res); + + std::vector res_strs; + StringUtils::split(res->body, res_strs, "\n"); + + auto result = nlohmann::json::array(); + result.push_back(nlohmann::json::parse(res_strs[0])); + result.push_back(nlohmann::json::parse(res_strs[1])); + result.push_back(nlohmann::json::parse(res_strs[2])); + + ASSERT_EQ("0", result[0]["collection_id"]); + ASSERT_EQ("13", result[0]["data"]["user_id"]); + ASSERT_EQ("21", result[0]["data"]["doc_id"]); + ASSERT_EQ(2, result[0]["data"]["position"]); + ASSERT_EQ("technology", result[0]["data"]["q"]); + + ASSERT_EQ("0", result[1]["collection_id"]); + ASSERT_EQ("13", result[1]["data"]["user_id"]); + ASSERT_EQ("12", result[1]["data"]["doc_id"]); + ASSERT_EQ(1, result[1]["data"]["position"]); + ASSERT_EQ("technology", result[1]["data"]["q"]); + + ASSERT_EQ("0", result[2]["collection_id"]); + ASSERT_EQ("13", result[2]["data"]["user_id"]); + ASSERT_EQ("52", result[2]["data"]["doc_id"]); + ASSERT_EQ(5, result[2]["data"]["position"]); + ASSERT_EQ("technology", result[2]["data"]["q"]); } \ No newline at end of file