From 2ecd42795e71fc0e3379a48f2b793293794b99cb Mon Sep 17 00:00:00 2001 From: krunal Date: Tue, 12 Dec 2023 14:29:16 +0530 Subject: [PATCH 01/20] adding popularity score --- include/analytics_manager.h | 13 ++++ src/analytics_manager.cpp | 71 +++++++++++++++++- test/analytics_manager_test.cpp | 126 ++++++++++++++++++++++++++++++++ 3 files changed, 209 insertions(+), 1 deletion(-) diff --git a/include/analytics_manager.h b/include/analytics_manager.h index 9d9e9fd3..281e03c3 100644 --- a/include/analytics_manager.h +++ b/include/analytics_manager.h @@ -46,6 +46,11 @@ struct click_event_t { } }; +struct popular_clicks_t { + std::string counter_field; + std::map docid_counts; +}; + struct query_hits_count_t { std::string query; uint64_t timestamp; @@ -137,6 +142,9 @@ private: // suggestion collection => nohits queries std::unordered_map nohits_queries; + // collection => popular clicks + std::unordered_map popular_clicks; + //query collection => click events std::unordered_map> query_collection_click_events; @@ -163,6 +171,7 @@ public: static constexpr const char* QUERY_HITS_COUNT = "$QH"; static constexpr const char* POPULAR_QUERIES_TYPE = "popular_queries"; static constexpr const char* NOHITS_QUERIES_TYPE = "nohits_queries"; + static constexpr const char* POPULAR_CLICKS_TYPE = "popular_clicks"; static AnalyticsManager& get_instance() { static AnalyticsManager instance; @@ -200,8 +209,12 @@ public: void persist_query_hits_click_events(ReplicationState *raft_server, uint64_t prev_persistence_s); + void persist_popular_clicks(ReplicationState *raft_server, uint64_t prev_persistence_s); + nlohmann::json get_click_events(); + std::unordered_map get_popular_clicks(); + Option write_events_to_store(nlohmann::json& event_jsons); void add_nohits_query(const std::string& query_collection, diff --git a/src/analytics_manager.cpp b/src/analytics_manager.cpp index 74c3b1df..5f5d8c4c 100644 --- a/src/analytics_manager.cpp +++ b/src/analytics_manager.cpp @@ -43,7 +43,8 @@ Option AnalyticsManager::create_rule(nlohmann::json& payload, bool upsert, return Option(400, "Bad or missing params."); } - if(payload["type"] == POPULAR_QUERIES_TYPE || payload["type"] == NOHITS_QUERIES_TYPE) { + if(payload["type"] == POPULAR_QUERIES_TYPE || payload["type"] == NOHITS_QUERIES_TYPE + || payload["type"] == POPULAR_CLICKS_TYPE) { return create_queries_index(payload, upsert, write_to_disk); } @@ -84,6 +85,15 @@ Option AnalyticsManager::create_queries_index(nlohmann::json &payload, boo return Option(400, "Must contain a valid destination collection."); } + std::string counter_field; + + if(params["destination"].contains("counter_field")) { + if (!params["destination"]["counter_field"].is_string()) { + return Option(400, "Must contain a valid counter_field."); + } + counter_field = params["destination"]["counter_field"].get(); + } + const std::string& suggestion_collection = params["destination"]["collection"].get(); suggestion_config_t suggestion_config; suggestion_config.name = suggestion_config_name; @@ -98,6 +108,15 @@ Option AnalyticsManager::create_queries_index(nlohmann::json &payload, boo if (!upsert && nohits_queries.count(suggestion_collection) != 0) { return Option(400, "There's already another configuration for this destination collection."); } + } else if(payload["type"] == POPULAR_CLICKS_TYPE) { + if (!upsert && popular_clicks.count(suggestion_collection) != 0) { + return Option(400, "There's already another configuration for this destination collection."); + } + + auto coll_schema = CollectionManager::get_instance().get_collection(suggestion_collection)->get_schema(); + if(coll_schema.find(counter_field) == coll_schema.end()) { + return Option(404, "counter_field not found in destination collection."); + } } for(const auto& coll: params["source"]["collections"]) { @@ -131,6 +150,8 @@ Option AnalyticsManager::create_queries_index(nlohmann::json &payload, boo } else if(payload["type"] == NOHITS_QUERIES_TYPE) { QueryAnalytics *noresultsQueries = new QueryAnalytics(limit); nohits_queries.emplace(suggestion_collection, noresultsQueries); + } else if(payload["type"] == POPULAR_CLICKS_TYPE) { + popular_clicks.emplace(suggestion_collection, popular_clicks_t{counter_field, {}}); } if(write_to_disk) { @@ -278,6 +299,13 @@ Option AnalyticsManager::add_click_event(const std::string &query_collecti click_event_t click_event(query, now_ts_useconds, user_id, doc_id, position); click_events_vec.emplace_back(click_event); + auto popular_clicks_it = popular_clicks.find(query_collection); + if(popular_clicks_it != popular_clicks.end()) { + popular_clicks_it->second.docid_counts[doc_id]++; + } else { + LOG(ERROR) << "collection " << query_collection << " not found in analytics rule."; + } + return Option(true); } @@ -346,6 +374,7 @@ void AnalyticsManager::run(ReplicationState* raft_server) { checkEventsExpiry(); persist_query_events(raft_server, prev_persistence_s); persist_query_hits_click_events(raft_server, prev_persistence_s); + persist_popular_clicks(raft_server, prev_persistence_s); prev_persistence_s = std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch()).count(); @@ -508,6 +537,41 @@ void AnalyticsManager::persist_query_hits_click_events(ReplicationState *raft_se } } +void AnalyticsManager::persist_popular_clicks(ReplicationState *raft_server, uint64_t prev_persistence_s) { + auto send_http_response = [&](const std::string& import_payload, const std::string& collection) { + if (import_payload.empty()) { + return; + } + + std::string leader_url = raft_server->get_leader_url(); + if (!leader_url.empty()) { + const std::string &base_url = leader_url + "collections/" + collection; + std::string res; + + const std::string &update_url = base_url + "/documents/import?action=update"; + std::map res_headers; + long status_code = HttpClient::post_response(update_url, import_payload, + res, res_headers, {}, 10 * 1000, true); + + if (status_code != 200) { + LOG(ERROR) << "Error while sending popular_clicks events to leader. " + << "Status code: " << status_code << ", response: " << res; + } + } + }; + + for(const auto& popular_clicks_it : popular_clicks) { + auto coll = popular_clicks_it.first; + nlohmann::json doc; + auto counter_field = popular_clicks_it.second.counter_field; + for(const auto& popular_click : popular_clicks_it.second.docid_counts) { + doc["id"] = popular_click.first; + doc[counter_field] = popular_click.second; + send_http_response(doc, coll); + } + } +} + void AnalyticsManager::stop() { quit = true; cv.notify_all(); @@ -544,6 +608,11 @@ std::unordered_map AnalyticsManager::get_nohits_qu return nohits_queries; } +std::unordered_map AnalyticsManager::get_popular_clicks() { + std::unique_lock lk(mutex); + return popular_clicks; +} + nlohmann::json AnalyticsManager::get_click_events() { std::unique_lock lk(mutex); std::vector click_event_jsons; diff --git a/test/analytics_manager_test.cpp b/test/analytics_manager_test.cpp index 641da345..97aa0edb 100644 --- a/test/analytics_manager_test.cpp +++ b/test/analytics_manager_test.cpp @@ -767,4 +767,130 @@ TEST_F(AnalyticsManagerTest, EventsExpiryPartial) { ASSERT_EQ("management", resp[0]["q"]); ASSERT_EQ(13, resp[0]["user_id"]); ASSERT_EQ(834, resp[0]["hits_count"]); +} + +TEST_F(AnalyticsManagerTest, PopularityScore) { + //reset click event rate limit + analyticsManager.resetRateLimit(); + + nlohmann::json products_schema = R"({ + "name": "products", + "fields": [ + {"name": "title", "type": "string"}, + {"name": "popularity", "type": "int32"} + ] + })"_json; + + Collection* products_coll = collectionManager.create_collection(products_schema).get(); + + nlohmann::json doc; + doc["popularity"] = 0; + + doc["id"] = "0"; + doc["title"] = "Cool trousers"; + ASSERT_TRUE(products_coll->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["title"] = "Funky trousers"; + ASSERT_TRUE(products_coll->add(doc.dump()).ok()); + + doc["id"] = "2"; + doc["title"] = "Casual shorts"; + ASSERT_TRUE(products_coll->add(doc.dump()).ok()); + + doc["id"] = "3"; + doc["title"] = "Trendy shorts"; + ASSERT_TRUE(products_coll->add(doc.dump()).ok()); + + doc["id"] = "4"; + doc["title"] = "Formal pants"; + ASSERT_TRUE(products_coll->add(doc.dump()).ok()); + + nlohmann::json analytics_rule = R"({ + "name": "product_popularity", + "type": "popular_clicks", + "params": { + "source": { + "collections": ["products"] + }, + "destination": { + "collection": "products", + "counter_field": "popularity" + } + } + })"_json; + + auto create_op = analyticsManager.create_rule(analytics_rule, false, true); + ASSERT_TRUE(create_op.ok()); + + std::shared_ptr req = std::make_shared(); + std::shared_ptr res = std::make_shared(nullptr); + + nlohmann::json event1 = R"({ + "type": "query_click", + "data": { + "q": "trousers", + "collection": "products", + "doc_id": "1", + "position": 2, + "user_id": "13" + } + })"_json; + + req->body = event1.dump(); + ASSERT_TRUE(post_create_event(req, res)); + + nlohmann::json event2 = R"({ + "type": "query_click", + "data": { + "q": "shorts", + "collection": "products", + "doc_id": "3", + "position": 4, + "user_id": "11" + } + })"_json; + + req->body = event2.dump(); + ASSERT_TRUE(post_create_event(req, res)); + + ASSERT_TRUE(post_create_event(req, res)); + + auto popular_clicks = analyticsManager.get_popular_clicks(); + ASSERT_EQ(1, popular_clicks.size()); + ASSERT_EQ("popularity", popular_clicks["products"].counter_field); + ASSERT_EQ(2, popular_clicks["products"].docid_counts.size()); + ASSERT_EQ(1, popular_clicks["products"].docid_counts["1"]); + ASSERT_EQ(2, popular_clicks["products"].docid_counts["3"]); + + //trigger persistance event + for(const auto& popular_clicks_it : popular_clicks) { + auto coll = popular_clicks_it.first; + nlohmann::json doc; + auto counter_field = popular_clicks_it.second.counter_field; + req->params["collection"] = "products"; + req->params["action"] = "update"; + for(const auto& popular_click : popular_clicks_it.second.docid_counts) { + doc["id"] = popular_click.first; + doc[counter_field] = popular_click.second; + req->body = doc.dump(); + post_import_documents(req, res); + } + } + + sort_fields = {sort_by("popularity", "DESC")}; + auto results = products_coll->search("*", {}, "", {}, + sort_fields, {0}, 10, 1, FREQUENCY,{false}, + Index::DROP_TOKENS_THRESHOLD,spp::sparse_hash_set(), + spp::sparse_hash_set()).get(); + + ASSERT_EQ(5, results["hits"].size()); + + ASSERT_EQ("3", results["hits"][0]["document"]["id"]); + ASSERT_EQ(2, results["hits"][0]["document"]["popularity"]); + ASSERT_EQ("Trendy shorts", results["hits"][0]["document"]["title"]); + + ASSERT_EQ("1", results["hits"][1]["document"]["id"]); + ASSERT_EQ(1, results["hits"][1]["document"]["popularity"]); + ASSERT_EQ("Funky trousers", results["hits"][1]["document"]["title"]); } \ No newline at end of file From 54ac166ee1314bc6aac40fba34662817f084109b Mon Sep 17 00:00:00 2001 From: krunal Date: Tue, 12 Dec 2023 15:58:04 +0530 Subject: [PATCH 02/20] add collection check and test --- include/collection.h | 2 ++ src/analytics_manager.cpp | 10 ++++--- src/collection.cpp | 5 ++++ test/analytics_manager_test.cpp | 48 +++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 3 deletions(-) diff --git a/include/collection.h b/include/collection.h index bc949e61..d8b6309e 100644 --- a/include/collection.h +++ b/include/collection.h @@ -396,6 +396,8 @@ public: std::vector get_fields(); + bool contains_field(const std::string&); + std::unordered_map get_dynamic_fields(); tsl::htrie_map get_schema(); diff --git a/src/analytics_manager.cpp b/src/analytics_manager.cpp index 5f5d8c4c..b9e7e0b6 100644 --- a/src/analytics_manager.cpp +++ b/src/analytics_manager.cpp @@ -113,9 +113,13 @@ Option AnalyticsManager::create_queries_index(nlohmann::json &payload, boo return Option(400, "There's already another configuration for this destination collection."); } - auto coll_schema = CollectionManager::get_instance().get_collection(suggestion_collection)->get_schema(); - if(coll_schema.find(counter_field) == coll_schema.end()) { - return Option(404, "counter_field not found in destination collection."); + auto coll = CollectionManager::get_instance().get_collection(suggestion_collection).get(); + if(coll != nullptr) { + if (!coll->contains_field(counter_field)) { + return Option(404, "counter_field `" + counter_field + "` not found in destination collection."); + } + } else { + return Option(404, "Collection `" + suggestion_collection + "` not found."); } } diff --git a/src/collection.cpp b/src/collection.cpp index 45b0b0c4..063d22b9 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -4336,6 +4336,11 @@ std::vector Collection::get_fields() { return fields; } +bool Collection::contains_field(const std::string &field) { + std::shared_lock lock(mutex); + return search_schema.find(field) != search_schema.end(); +} + std::unordered_map Collection::get_dynamic_fields() { std::shared_lock lock(mutex); return dynamic_fields; diff --git a/test/analytics_manager_test.cpp b/test/analytics_manager_test.cpp index 97aa0edb..750cbbf8 100644 --- a/test/analytics_manager_test.cpp +++ b/test/analytics_manager_test.cpp @@ -893,4 +893,52 @@ TEST_F(AnalyticsManagerTest, PopularityScore) { ASSERT_EQ("1", results["hits"][1]["document"]["id"]); ASSERT_EQ(1, results["hits"][1]["document"]["popularity"]); ASSERT_EQ("Funky trousers", results["hits"][1]["document"]["title"]); +} + +TEST_F(AnalyticsManagerTest, PopularityScoreValidation) { + nlohmann::json products_schema = R"({ + "name": "books", + "fields": [ + {"name": "title", "type": "string"}, + {"name": "popularity", "type": "int32"} + ] + })"_json; + + Collection* products_coll = collectionManager.create_collection(products_schema).get(); + + nlohmann::json analytics_rule = R"({ + "name": "books_popularity", + "type": "popular_clicks", + "params": { + "source": { + "collections": ["books"] + }, + "destination": { + "collection": "popular_books", + "counter_field": "popularity" + } + } + })"_json; + + auto create_op = analyticsManager.create_rule(analytics_rule, false, true); + ASSERT_FALSE(create_op.ok()); + ASSERT_EQ("Collection `popular_books` not found.", create_op.error()); + + analytics_rule = R"({ + "name": "books_popularity", + "type": "popular_clicks", + "params": { + "source": { + "collections": ["books"] + }, + "destination": { + "collection": "books", + "counter_field": "popularity_score" + } + } + })"_json; + + create_op = analyticsManager.create_rule(analytics_rule, false, true); + ASSERT_FALSE(create_op.ok()); + ASSERT_EQ("counter_field `popularity_score` not found in destination collection.", create_op.error()); } \ No newline at end of file From 35ac3b2980eb708057d8cfc386bd10d8d2f896cd Mon Sep 17 00:00:00 2001 From: krunal Date: Tue, 12 Dec 2023 17:50:16 +0530 Subject: [PATCH 03/20] fixing bug --- src/analytics_manager.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/analytics_manager.cpp b/src/analytics_manager.cpp index b9e7e0b6..ad6003ac 100644 --- a/src/analytics_manager.cpp +++ b/src/analytics_manager.cpp @@ -543,10 +543,6 @@ void AnalyticsManager::persist_query_hits_click_events(ReplicationState *raft_se void AnalyticsManager::persist_popular_clicks(ReplicationState *raft_server, uint64_t prev_persistence_s) { auto send_http_response = [&](const std::string& import_payload, const std::string& collection) { - if (import_payload.empty()) { - return; - } - std::string leader_url = raft_server->get_leader_url(); if (!leader_url.empty()) { const std::string &base_url = leader_url + "collections/" + collection; @@ -571,7 +567,7 @@ void AnalyticsManager::persist_popular_clicks(ReplicationState *raft_server, uin for(const auto& popular_click : popular_clicks_it.second.docid_counts) { doc["id"] = popular_click.first; doc[counter_field] = popular_click.second; - send_http_response(doc, coll); + send_http_response(doc.dump(), coll); } } } From 6bc60adbaef559f2c18711dfe0b2a465f8bdecd3 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 13 Dec 2023 12:12:33 +0530 Subject: [PATCH 04/20] Support overriding wildcard query. --- src/collection.cpp | 4 ++ test/collection_all_fields_test.cpp | 2 +- test/collection_override_test.cpp | 102 +++++++++++++++++++++++++++- 3 files changed, 104 insertions(+), 4 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index 8d3fd554..b523ecb0 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2155,6 +2155,10 @@ Option Collection::search(std::string raw_query, parse_search_query(query, q_include_tokens, field_query_tokens[0].q_exclude_tokens, field_query_tokens[0].q_phrases, "", false, stopwords_set); + + process_filter_overrides(filter_overrides, q_include_tokens, token_order, filter_tree_root, + included_ids, excluded_ids, override_metadata); + for(size_t i = 0; i < q_include_tokens.size(); i++) { auto& q_include_token = q_include_tokens[i]; field_query_tokens[0].q_include_tokens.emplace_back(i, q_include_token, (i == q_include_tokens.size() - 1), diff --git a/test/collection_all_fields_test.cpp b/test/collection_all_fields_test.cpp index ae9f0d98..fab4adb3 100644 --- a/test/collection_all_fields_test.cpp +++ b/test/collection_all_fields_test.cpp @@ -1591,7 +1591,7 @@ TEST_F(CollectionAllFieldsTest, FieldNameMatchingRegexpShouldNotBeIndexedInNonAu } TEST_F(CollectionAllFieldsTest, EmbedFromFieldJSONInvalidField) { - EmbedderManager::set_model_dir("/tmp/typensense_test/models"); + EmbedderManager::set_model_dir("/tmp/typesense_test/models"); nlohmann::json field_json; field_json["name"] = "embedding"; field_json["type"] = "float[]"; diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index 8a97bb18..6f78041b 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -3728,7 +3728,7 @@ TEST_F(CollectionOverrideTest, WildcardTagRuleThatMatchesAllQueries) { // includes instead of filter_by coll1->remove_override("ov-1"); - override_json1 = R"({ + auto override_json2 = R"({ "id": "ov-1", "rule": { "tags": ["*"] @@ -3738,9 +3738,10 @@ TEST_F(CollectionOverrideTest, WildcardTagRuleThatMatchesAllQueries) { ] })"_json; - op = override_t::parse(override_json1, "ov-1", override1); + override_t override2; + op = override_t::parse(override_json2, "ov-2", override2); ASSERT_TRUE(op.ok()); - coll1->add_override(override1); + coll1->add_override(override2); results = coll1->search("foobar", {"name"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY, @@ -3907,3 +3908,98 @@ TEST_F(CollectionOverrideTest, MetadataValidation) { collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionOverrideTest, WildcardSearchOverride) { + Collection* coll1; + + std::vector fields = {field("name", field_types::STRING, false), + field("category", field_types::STRING, true),}; + + coll1 = collectionManager.get_collection("coll1").get(); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 1, fields, "").get(); + } + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["name"] = "queryA"; + doc1["category"] = "kids"; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["name"] = "queryA"; + doc2["category"] = "kitchen"; + + nlohmann::json doc3; + doc3["id"] = "2"; + doc3["name"] = "Clay Toy"; + doc3["category"] = "home"; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + ASSERT_TRUE(coll1->add(doc3.dump()).ok()); + + std::vector sort_fields = {sort_by("_text_match", "DESC")}; + + nlohmann::json override_json1 = R"({ + "id": "ov-1", + "rule": { + "query": "*", + "match": "exact" + }, + "filter_by": "category: kids" + })"_json; + + override_t override1; + auto op = override_t::parse(override_json1, "ov-1", override1); + ASSERT_TRUE(op.ok()); + coll1->add_override(override1); + + std::string override_tags = ""; + auto results = coll1->search("*", {}, "", + {}, sort_fields, {2}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 10000, + 4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0, + 0, HASH, 30000, 2, "", {}, {}, "right_to_left", + true, true, false, -1, "", override_tags).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + + // includes instead of filter_by + coll1->remove_override("ov-1"); + + override_t override2; + auto override_json2 = R"({ + "id": "ov-2", + "rule": { + "query": "*", + "match": "exact" + }, + "includes": [ + {"id": "1", "position": 1} + ] + })"_json; + + op = override_t::parse(override_json2, "ov-2", override2); + ASSERT_TRUE(op.ok()); + coll1->add_override(override2); + + results = coll1->search("*", {}, "", + {}, sort_fields, {2}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 10000, + 4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0, + 0, HASH, 30000, 2, "", {}, {}, "right_to_left", + true, true, false, -1, "", override_tags).get(); + + ASSERT_EQ(3, results["hits"].size()); + ASSERT_EQ("1", results["hits"][0]["document"]["id"].get()); + + collectionManager.drop_collection("coll1"); +} From 6d40aa29e1fd0413f7ed83b53b4c2db7d92512a3 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 14 Dec 2023 18:28:35 +0530 Subject: [PATCH 05/20] Log model init status. --- src/embedder_manager.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/embedder_manager.cpp b/src/embedder_manager.cpp index 3d6137c6..f61661ba 100644 --- a/src/embedder_manager.cpp +++ b/src/embedder_manager.cpp @@ -15,7 +15,13 @@ Option EmbedderManager::validate_and_init_model(const nlohmann::json& mode return validate_and_init_remote_model(model_config, num_dims); } else { LOG(INFO) << "Validating and initializing local model: " << model_name; - return validate_and_init_local_model(model_config, num_dims); + auto op = validate_and_init_local_model(model_config, num_dims); + if(op.ok()) { + LOG(INFO) << "Finished initializing local model: " << model_name; + } else { + LOG(ERROR) << "Failed to initialize local model " << model_name << ", error: " << op.error(); + } + return op; } } From ec95099b491442557ba5b29af1653de5681f71a4 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 14 Dec 2023 18:28:54 +0530 Subject: [PATCH 06/20] Handle float facet value floating point precision. --- src/index.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index 74b35cac..a4d83f63 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -786,8 +786,10 @@ void Index::index_field_in_memory(const field& afield, std::vector } else if(afield.type == field_types::FLOAT) { float raw_val = document[afield.name].get(); - auto fhash = reinterpret_cast(raw_val); - facet_value_id_t facet_value_id(StringUtils::float_to_str(raw_val), fhash); + const std::string& float_str_val = StringUtils::float_to_str(raw_val); + float normalized_raw_val = std::stof(float_str_val); + auto fhash = reinterpret_cast(normalized_raw_val); + facet_value_id_t facet_value_id(float_str_val, fhash); fvalue_to_seq_ids[facet_value_id].push_back(seq_id); seq_id_to_fvalues[seq_id].push_back(facet_value_id); } From 707957559b8e51be4e2521c1dee265f49ad2d3ed Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 18 Dec 2023 10:21:59 +0530 Subject: [PATCH 07/20] Try purging bazel cache to fix build. --- .github/workflows/tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 280e69e9..b406c1f3 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -56,6 +56,10 @@ jobs: rm bazel-cache.tar.gz exit 0 + - name: Bazel clean + run: | + bazel clean --expunge + - name: Build protobuf deps run: | bazel build @com_google_protobuf//:protobuf_headers From 456f6c449ac9ab28471ca7f93dd15b787357bf5d Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Mon, 18 Dec 2023 10:27:07 +0530 Subject: [PATCH 08/20] Add test case for filter scoring. --- test/collection_sorting_test.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp index deb6ad4f..f6143c53 100644 --- a/test/collection_sorting_test.cpp +++ b/test/collection_sorting_test.cpp @@ -2082,6 +2082,20 @@ TEST_F(CollectionSortingTest, OptionalFilteringViaSortingWildcard) { ASSERT_EQ(expected_ids[i], results["hits"][i]["document"]["id"].get()); } + // Score associated with the first match is assigned to the document. + sort_fields = { + sort_by({"brand:nike", "brand:adidas", "points: 1"}, {3, 2, 5}, "DESC"), + sort_by("points", "DESC"), + }; + + results = coll1->search("*", {"title"}, "", {}, sort_fields, {2}, 10, 1, FREQUENCY, {true}, 10).get(); + ASSERT_EQ(6, results["hits"].size()); + + expected_ids = {"3", "0", "4", "2", "1", "5"}; + for(size_t i = 0; i < expected_ids.size(); i++) { + ASSERT_EQ(expected_ids[i], results["hits"][i]["document"]["id"].get()); + } + // bad syntax for eval query sort_fields = { sort_by({"brandnike || points:0"}, {1}, "DESC"), From 738d562a3659bd22e0d458c300a65487a72035fd Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 18 Dec 2023 12:15:15 +0530 Subject: [PATCH 09/20] Revert "Try purging bazel cache to fix build." This reverts commit 707957559b8e51be4e2521c1dee265f49ad2d3ed. --- .github/workflows/tests.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b406c1f3..280e69e9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -56,10 +56,6 @@ jobs: rm bazel-cache.tar.gz exit 0 - - name: Bazel clean - run: | - bazel clean --expunge - - name: Build protobuf deps run: | bazel build @com_google_protobuf//:protobuf_headers From 7d387c09733ef9f20ab9d890e01aaa36c5eb685e Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 18 Dec 2023 12:28:51 +0530 Subject: [PATCH 10/20] Add .bazelversion --- .bazelversion | 1 + 1 file changed, 1 insertion(+) create mode 100644 .bazelversion diff --git a/.bazelversion b/.bazelversion new file mode 100644 index 00000000..7cbea073 --- /dev/null +++ b/.bazelversion @@ -0,0 +1 @@ +5.2.0 \ No newline at end of file From 7e765e30fb3e6fc6c9a7ce0628094adde14302bf Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Mon, 18 Dec 2023 16:15:19 +0530 Subject: [PATCH 11/20] Add `_eval` sorting test case. --- test/collection_sorting_test.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp index f6143c53..40882e38 100644 --- a/test/collection_sorting_test.cpp +++ b/test/collection_sorting_test.cpp @@ -1988,7 +1988,7 @@ TEST_F(CollectionSortingTest, OptionalFilteringViaSortingWildcard) { "name": "coll1", "fields": [ {"name": "title", "type": "string" }, - {"name": "brand", "type": "string" }, + {"name": "brand", "type": "string", "infix": true }, {"name": "points", "type": "int32" } ] } @@ -2121,6 +2121,24 @@ TEST_F(CollectionSortingTest, OptionalFilteringViaSortingWildcard) { ASSERT_FALSE(search_op.ok()); ASSERT_EQ("The eval expression in sort_by is empty.", search_op.error()); + req_params = { + {"collection", "coll1"}, + {"q", "a"}, + {"query_by", "brand"}, + {"sort_by", "_eval(brand:puma):desc, _text_match:desc"}, + {"infix", "always"} + }; + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + ASSERT_TRUE(search_op.ok()); + results = nlohmann::json::parse(json_res); + + ASSERT_EQ(4, results["hits"].size()); // 3 Adidas 1 Puma documents + expected_ids = {"5", "4", "2", "1"}; + for(size_t i = 0; i < expected_ids.size(); i++) { + ASSERT_EQ(expected_ids[i], results["hits"][i]["document"]["id"].get()); + } + // more bad syntax! sort_fields = { sort_by(")", "DESC"), From daa8e28d00a0cef581681b84af1292cb679f99ce Mon Sep 17 00:00:00 2001 From: Harpreet Sangar Date: Mon, 18 Dec 2023 16:25:09 +0530 Subject: [PATCH 12/20] Add comment. --- test/collection_sorting_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp index 40882e38..1dee1d2e 100644 --- a/test/collection_sorting_test.cpp +++ b/test/collection_sorting_test.cpp @@ -2134,6 +2134,7 @@ TEST_F(CollectionSortingTest, OptionalFilteringViaSortingWildcard) { results = nlohmann::json::parse(json_res); ASSERT_EQ(4, results["hits"].size()); // 3 Adidas 1 Puma documents + // Because of `_eval`, Puma document will be on top even when having a lower text match score than Adidas documents. expected_ids = {"5", "4", "2", "1"}; for(size_t i = 0; i < expected_ids.size(); i++) { ASSERT_EQ(expected_ids[i], results["hits"][i]["document"]["id"].get()); From e37b9bf77598fa2d605ebe3fb4058aa723dfb861 Mon Sep 17 00:00:00 2001 From: krunal Date: Mon, 18 Dec 2023 16:36:12 +0530 Subject: [PATCH 13/20] make get_click_events as export --- include/analytics_manager.h | 2 ++ src/analytics_manager.cpp | 4 +++ src/core_api.cpp | 65 +++++++++++++++++++++++++++++++++-- src/main/typesense_server.cpp | 2 +- 4 files changed, 69 insertions(+), 4 deletions(-) diff --git a/include/analytics_manager.h b/include/analytics_manager.h index 281e03c3..08c44010 100644 --- a/include/analytics_manager.h +++ b/include/analytics_manager.h @@ -200,6 +200,8 @@ public: void dispose(); + Store* get_analytics_store(); + void persist_query_events(ReplicationState *raft_server, uint64_t prev_persistence_s); std::unordered_map get_popular_queries(); diff --git a/src/analytics_manager.cpp b/src/analytics_manager.cpp index ad6003ac..6db4978b 100644 --- a/src/analytics_manager.cpp +++ b/src/analytics_manager.cpp @@ -598,6 +598,10 @@ void AnalyticsManager::init(Store* store, Store* analytics_store) { this->analytics_store = analytics_store; } +Store* AnalyticsManager::get_analytics_store() { + return this->analytics_store; +} + std::unordered_map AnalyticsManager::get_popular_queries() { std::unique_lock lk(mutex); return popular_queries; diff --git a/src/core_api.cpp b/src/core_api.cpp index 8c640b82..d75214b5 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -2804,10 +2804,69 @@ bool put_conversation_model(const std::shared_ptr& req, const std::sha res->set_200(model.dump()); return true; } -bool get_click_events(const std::shared_ptr& req, const std::shared_ptr& res) { - auto click_events = AnalyticsManager::get_instance().get_click_events(); - res->set_200(click_events.dump()); +bool get_click_events(const std::shared_ptr& req, const std::shared_ptr& res) { + auto analytics_store = AnalyticsManager::get_instance().get_analytics_store(); + if(analytics_store) { + export_state_t *export_state = nullptr; + auto click_event_prefix = std::string(AnalyticsManager::CLICK_EVENT) + "_"; + if(req->data == nullptr) { + export_state = new export_state_t(); + req->data = export_state; + + export_state->iter_upper_bound_key = std::string(AnalyticsManager::CLICK_EVENT) + "`"; + export_state->iter_upper_bound = new rocksdb::Slice(export_state->iter_upper_bound_key); + export_state->it = analytics_store->scan(click_event_prefix, export_state->iter_upper_bound); + } else { + export_state = dynamic_cast(req->data); + } + + if (export_state->it != nullptr) { + rocksdb::Iterator *it = export_state->it; + size_t batch_counter = 0; + std::string().swap(res->body); + + if(!it->Valid()) { + LOG(ERROR) << "No click events found in db."; + req->last_chunk_aggregate = true; + res->final = true; + res->set_404(); + stream_response(req, res); + return false; + } + + while (it->Valid() && it->key().ToString().compare(0, click_event_prefix.size(), click_event_prefix) == 0) { + res->body += it->value().ToString(); + it->Next(); + + // append a new line character if there is going to be one more record to send + if (it->Valid() && + it->key().ToString().compare(0, click_event_prefix.size(), click_event_prefix) == 0) { + res->body += "\n"; + req->last_chunk_aggregate = false; + res->final = false; + } else { + req->last_chunk_aggregate = true; + res->final = true; + } + + batch_counter++; + if (batch_counter == export_state->export_batch_size) { + break; + } + } + } else { + req->last_chunk_aggregate = true; + res->final = true; + } + + res->content_type_header = "text/plain; charset=utf-8"; + res->status_code = 200; + + stream_response(req, res); + } else { + LOG(ERROR) << "Analytics store not initialized."; + } return true; } diff --git a/src/main/typesense_server.cpp b/src/main/typesense_server.cpp index f4993434..a52db200 100644 --- a/src/main/typesense_server.cpp +++ b/src/main/typesense_server.cpp @@ -82,7 +82,7 @@ void master_server_routes() { server->post("/analytics/events", post_create_event); //collection based query click events - server->get("/analytics/click_events", get_click_events); + server->get("/analytics/click_events", get_click_events, false, true); server->post("/analytics/click_events", post_create_event); server->post("/analytics/click_events/replicate", post_replicate_events); server->get("/analytics/query_hits_counts", get_query_hits_counts); From d87a48363377d51e7c5928ec58a704f2074e0ab7 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 19 Dec 2023 12:17:26 +0530 Subject: [PATCH 14/20] Parameterize db compaction interval. --- include/store.h | 5 +++-- include/tsconfig.h | 19 +++++++++++++++++++ src/typesense_server_utils.cpp | 5 +++-- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/include/store.h b/include/store.h index ae50d1ab..9cf7a1d3 100644 --- a/include/store.h +++ b/include/store.h @@ -84,7 +84,8 @@ public: Store(const std::string & state_dir_path, const size_t wal_ttl_secs = 24*60*60, - const size_t wal_size_mb = 1024, bool disable_wal = true): state_dir_path(state_dir_path) { + const size_t wal_size_mb = 1024, bool disable_wal = true, + const size_t db_compaction_interval = 604800): state_dir_path(state_dir_path) { // Optimize RocksDB options.IncreaseParallelism(); options.OptimizeLevelStyleCompaction(); @@ -94,7 +95,7 @@ public: options.max_write_buffer_number = 2; options.merge_operator.reset(new UInt64AddOperator); options.compression = rocksdb::CompressionType::kSnappyCompression; - options.periodic_compaction_seconds = 604800; + options.periodic_compaction_seconds = db_compaction_interval; options.max_log_file_size = 4*1048576; options.keep_log_file_num = 5; diff --git a/include/tsconfig.h b/include/tsconfig.h index 67a924e8..da12ffe0 100644 --- a/include/tsconfig.h +++ b/include/tsconfig.h @@ -72,6 +72,8 @@ private: uint32_t housekeeping_interval; + uint32_t db_compaction_interval; + protected: Config() { @@ -100,6 +102,7 @@ protected: this->enable_search_analytics = false; this->analytics_flush_interval = 3600; // in seconds this->housekeeping_interval = 1800; // in seconds + this->db_compaction_interval = 604800; // in seconds } Config(Config const&) { @@ -309,6 +312,10 @@ public: return this->housekeeping_interval; } + size_t get_db_compaction_interval() const { + return this->db_compaction_interval; + } + size_t get_thread_pool_size() const { return this->thread_pool_size; } @@ -449,6 +456,10 @@ public: this->housekeeping_interval = std::stoi(get_env("TYPESENSE_HOUSEKEEPING_INTERVAL")); } + if(!get_env("TYPESENSE_DB_COMPACTION_INTERVAL").empty()) { + this->db_compaction_interval = std::stoi(get_env("TYPESENSE_DB_COMPACTION_INTERVAL")); + } + if(!get_env("TYPESENSE_THREAD_POOL_SIZE").empty()) { this->thread_pool_size = std::stoi(get_env("TYPESENSE_THREAD_POOL_SIZE")); } @@ -620,6 +631,10 @@ public: this->housekeeping_interval = (int) reader.GetInteger("server", "housekeeping-interval", 1800); } + if(reader.Exists("server", "db-compaction-interval")) { + this->db_compaction_interval = (int) reader.GetInteger("server", "db-compaction-interval", 1800); + } + if(reader.Exists("server", "thread-pool-size")) { this->thread_pool_size = (int) reader.GetInteger("server", "thread-pool-size", 0); } @@ -782,6 +797,10 @@ public: this->housekeeping_interval = options.get("housekeeping-interval"); } + if(options.exist("db-compaction-interval")) { + this->db_compaction_interval = options.get("db-compaction-interval"); + } + if(options.exist("thread-pool-size")) { this->thread_pool_size = options.get("thread-pool-size"); } diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index db927485..dcc8f136 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -109,6 +109,7 @@ void init_cmdline_options(cmdline::parser & options, int argc, char **argv) { options.add("cache-num-entries", '\0', "Number of entries to cache.", false, 1000); options.add("analytics-flush-interval", '\0', "Frequency of persisting analytics data to disk (in seconds).", false, 3600); options.add("housekeeping-interval", '\0', "Frequency of housekeeping background job (in seconds).", false, 1800); + options.add("db-compaction-interval", '\0', "Frequency of RocksDB compaction (in seconds).", false, 604800); // DEPRECATED options.add("listen-address", 'h', "[DEPRECATED: use `api-address`] Address to which Typesense API service binds.", false, "0.0.0.0"); @@ -394,7 +395,7 @@ int run_server(const Config & config, const std::string & version, void (*master ThreadPool replication_thread_pool(num_threads); // primary DB used for storing the documents: we will not use WAL since Raft provides that - Store store(db_dir); + Store store(db_dir, 24*60*60, 1024, true, config.get_db_compaction_interval()); // meta DB for storing house keeping things Store meta_store(meta_dir, 24*60*60, 1024, false); @@ -402,7 +403,7 @@ int run_server(const Config & config, const std::string & version, void (*master //analytics DB for storing query click events std::unique_ptr analytics_store = nullptr; if(!analytics_dir.empty()) { - analytics_store.reset(new Store(analytics_dir, 24 * 60 * 60, 1024, false)); + analytics_store.reset(new Store(analytics_dir, 24 * 60 * 60, 1024, true, config.get_db_compaction_interval())); } curl_global_init(CURL_GLOBAL_SSL); From c28ac365f2cd70f543969be4fea73cacc5e48b4f Mon Sep 17 00:00:00 2001 From: Jason Bosco Date: Mon, 18 Dec 2023 16:06:34 -0600 Subject: [PATCH 15/20] Use cache from target branch in PR if available # Conflicts: # .github/workflows/tests.yml --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 280e69e9..af855337 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -45,6 +45,7 @@ jobs: workflow_conclusion: "" if_no_artifact_found: warn skip_unpack: true + branch: ${{ github.base_ref || github.head_ref || github.ref_name }} - name: Uncompress bazel cache run: | From 6fda5c98216ba34e22959c6d2984ebc308859978 Mon Sep 17 00:00:00 2001 From: krunal Date: Tue, 19 Dec 2023 18:00:10 +0530 Subject: [PATCH 16/20] add test and refactor --- src/core_api.cpp | 100 ++++++++++++++-------------- test/core_api_utils_test.cpp | 124 ++++++++++++++++++++++++++++++++++- 2 files changed, 174 insertions(+), 50 deletions(-) diff --git a/src/core_api.cpp b/src/core_api.cpp index d75214b5..058dc02b 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -2807,66 +2807,68 @@ bool put_conversation_model(const std::shared_ptr& req, const std::sha bool get_click_events(const std::shared_ptr& req, const std::shared_ptr& res) { auto analytics_store = AnalyticsManager::get_instance().get_analytics_store(); - if(analytics_store) { - export_state_t *export_state = nullptr; - auto click_event_prefix = std::string(AnalyticsManager::CLICK_EVENT) + "_"; - if(req->data == nullptr) { - export_state = new export_state_t(); - req->data = export_state; + if (!analytics_store) { + LOG(ERROR) << "Analytics store not initialized."; + return true; + } - export_state->iter_upper_bound_key = std::string(AnalyticsManager::CLICK_EVENT) + "`"; - export_state->iter_upper_bound = new rocksdb::Slice(export_state->iter_upper_bound_key); - export_state->it = analytics_store->scan(click_event_prefix, export_state->iter_upper_bound); - } else { - export_state = dynamic_cast(req->data); - } + export_state_t *export_state = nullptr; + auto click_event_prefix = std::string(AnalyticsManager::CLICK_EVENT) + "_"; + if (req->data == nullptr) { + export_state = new export_state_t(); + req->data = export_state; - if (export_state->it != nullptr) { - rocksdb::Iterator *it = export_state->it; - size_t batch_counter = 0; - std::string().swap(res->body); + export_state->iter_upper_bound_key = std::string(AnalyticsManager::CLICK_EVENT) + "`"; + export_state->iter_upper_bound = new rocksdb::Slice(export_state->iter_upper_bound_key); + export_state->it = analytics_store->scan(click_event_prefix, export_state->iter_upper_bound); + } else { + export_state = dynamic_cast(req->data); + } - if(!it->Valid()) { - LOG(ERROR) << "No click events found in db."; - req->last_chunk_aggregate = true; - res->final = true; - res->set_404(); - stream_response(req, res); - return false; - } + if (export_state->it != nullptr) { + rocksdb::Iterator *it = export_state->it; + size_t batch_counter = 0; + std::string().swap(res->body); - while (it->Valid() && it->key().ToString().compare(0, click_event_prefix.size(), click_event_prefix) == 0) { - res->body += it->value().ToString(); - it->Next(); - - // append a new line character if there is going to be one more record to send - if (it->Valid() && - it->key().ToString().compare(0, click_event_prefix.size(), click_event_prefix) == 0) { - res->body += "\n"; - req->last_chunk_aggregate = false; - res->final = false; - } else { - req->last_chunk_aggregate = true; - res->final = true; - } - - batch_counter++; - if (batch_counter == export_state->export_batch_size) { - break; - } - } - } else { + if (!it->Valid()) { + LOG(ERROR) << "No click events found in db."; req->last_chunk_aggregate = true; res->final = true; + res->set_404(); + stream_response(req, res); + return false; } - res->content_type_header = "text/plain; charset=utf-8"; - res->status_code = 200; + while (it->Valid() && it->key().ToString().compare(0, click_event_prefix.size(), click_event_prefix) == 0) { + res->body += it->value().ToString(); + it->Next(); - stream_response(req, res); + // append a new line character if there is going to be one more record to send + if (it->Valid() && + it->key().ToString().compare(0, click_event_prefix.size(), click_event_prefix) == 0) { + res->body += "\n"; + req->last_chunk_aggregate = false; + res->final = false; + } else { + req->last_chunk_aggregate = true; + res->final = true; + } + + batch_counter++; + if (batch_counter == export_state->export_batch_size) { + break; + } + } } else { - LOG(ERROR) << "Analytics store not initialized."; + req->last_chunk_aggregate = true; + res->final = true; } + + res->content_type_header = "text/plain; charset=utf-8"; + res->status_code = 200; + + stream_response(req, res); + return true; } diff --git a/test/core_api_utils_test.cpp b/test/core_api_utils_test.cpp index 9ad3a006..3556160b 100644 --- a/test/core_api_utils_test.cpp +++ b/test/core_api_utils_test.cpp @@ -7,27 +7,33 @@ #include "raft_server.h" #include "conversation_model_manager.h" #include "conversation_manager.h" +#include class CoreAPIUtilsTest : public ::testing::Test { protected: - Store *store; + Store *store, *analytics_store; CollectionManager & collectionManager = CollectionManager::get_instance(); std::atomic quit = false; std::vector query_fields; std::vector sort_fields; + AnalyticsManager& analyticsManager = AnalyticsManager::get_instance(); + void setupCollection() { std::string state_dir_path = "/tmp/typesense_test/core_api_utils"; + std::string analytics_db_path = "/tmp/typesense_test/analytics_db2"; LOG(INFO) << "Truncating and creating: " << state_dir_path; system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); store = new Store(state_dir_path); + analytics_store = new Store(analytics_db_path); collectionManager.init(store, 1.0, "auth_key", quit); collectionManager.load(8, 1000); ConversationModelManager::init(store); ConversationManager::init(store); + analyticsManager.init(store, analytics_store); } virtual void SetUp() { @@ -37,6 +43,7 @@ protected: virtual void TearDown() { collectionManager.dispose(); delete store; + delete analytics_store; } }; @@ -1495,4 +1502,119 @@ TEST_F(CoreAPIUtilsTest, TestInvalidConversationModels) { ASSERT_EQ(400, resp->status_code); ASSERT_EQ("Property `model_name` is not provided or not a string.", nlohmann::json::parse(resp->body)["message"]); +} + +TEST_F(CoreAPIUtilsTest, GetClickEvents) { + //reset analytics store + analyticsManager.resetRateLimit(); + analyticsManager.resetAnalyticsStore(); + + nlohmann::json schema = R"({ + "name": "titles", + "fields": [ + {"name": "name", "type": "string" }, + {"name": "points", "type": "int32" } + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* titles = op.get(); + + std::shared_ptr req = std::make_shared(); + std::shared_ptr res = std::make_shared(nullptr); + + // no events in db + get_click_events(req, res); + ASSERT_EQ("{\"message\": \"Not Found\"}", res->body); + + //add some events + nlohmann::json event1 = R"({ + "type": "query_click", + "data": { + "q": "technology", + "collection": "titles", + "doc_id": "21", + "position": 2, + "user_id": "13" + } + })"_json; + + req->body = event1.dump(); + ASSERT_TRUE(post_create_event(req, res)); + + nlohmann::json event2 = R"({ + "type": "query_click", + "data": { + "q": "technology", + "collection": "titles", + "doc_id": "12", + "position": 1, + "user_id": "13" + } + })"_json; + req->body = event2.dump(); + ASSERT_TRUE(post_create_event(req, res)); + + nlohmann::json event3 = R"({ + "type": "query_click", + "data": { + "q": "technology", + "collection": "titles", + "doc_id": "52", + "position": 5, + "user_id": "13" + } + })"_json; + req->body = event3.dump(); + ASSERT_TRUE(post_create_event(req, res)); + + event1["collection_id"] = "0"; + event1["timestamp"] = 1521512521; + event1["event_type"] = "click_events"; + event2["collection_id"] = "0"; + event2["timestamp"] = 1521514354; + event2["event_type"] = "click_events"; + event3["collection_id"] = "0"; + event3["timestamp"] = 1521515382; + event3["event_type"] = "click_events"; + + + nlohmann::json click_events = nlohmann::json::array(); + click_events.push_back(event1); + click_events.push_back(event2); + click_events.push_back(event3); + + req->body = click_events.dump(); + ASSERT_TRUE(post_replicate_events(req, res)); + + //get click events + req->data = nullptr; + get_click_events(req, res); + + std::vector res_strs; + StringUtils::split(res->body, res_strs, "\n"); + + auto result = nlohmann::json::array(); + result.push_back(nlohmann::json::parse(res_strs[0])); + result.push_back(nlohmann::json::parse(res_strs[1])); + result.push_back(nlohmann::json::parse(res_strs[2])); + + ASSERT_EQ("0", result[0]["collection_id"]); + ASSERT_EQ("13", result[0]["data"]["user_id"]); + ASSERT_EQ("21", result[0]["data"]["doc_id"]); + ASSERT_EQ(2, result[0]["data"]["position"]); + ASSERT_EQ("technology", result[0]["data"]["q"]); + + ASSERT_EQ("0", result[1]["collection_id"]); + ASSERT_EQ("13", result[1]["data"]["user_id"]); + ASSERT_EQ("12", result[1]["data"]["doc_id"]); + ASSERT_EQ(1, result[1]["data"]["position"]); + ASSERT_EQ("technology", result[1]["data"]["q"]); + + ASSERT_EQ("0", result[2]["collection_id"]); + ASSERT_EQ("13", result[2]["data"]["user_id"]); + ASSERT_EQ("52", result[2]["data"]["doc_id"]); + ASSERT_EQ(5, result[2]["data"]["position"]); + ASSERT_EQ("technology", result[2]["data"]["q"]); } \ No newline at end of file From cf5d00dd4c04b540d6cad161c576b96966988a59 Mon Sep 17 00:00:00 2001 From: krunal Date: Thu, 21 Dec 2023 11:56:13 +0530 Subject: [PATCH 17/20] add test for alphanumeric labels for range facet --- test/collection_faceting_test.cpp | 46 ++++++++++++++++++++ test/collection_optimized_faceting_test.cpp | 47 +++++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index 6fc020f7..7edbb3d2 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -2980,3 +2980,49 @@ TEST_F(CollectionFacetingTest, RangeFacetTestWithGroupBy) { collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionFacetingTest, RangeFacetAlphanumericLabels) { + std::vector fields = {field("monuments", field_types::STRING, false), + field("year", field_types::INT32, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", + {},{}).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["monuments"] = "Statue Of Unity"; + doc["year"] = 2018; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["monuments"] = "Taj Mahal"; + doc["year"] = 1653; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "2"; + doc["monuments"] = "Mysore Palace"; + doc["year"] = 1897; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "3"; + doc["monuments"] = "Chennakesava Temple"; + doc["year"] = 1117; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"year(10thAD:[1000,1500], 15thAD:[1500,2000], 20thAD:[2000, ])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true).get(); + + ASSERT_EQ(3, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(2, results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("15thAD", results["facet_counts"][0]["counts"][0]["value"]); + ASSERT_EQ(1, results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_EQ("20thAD", results["facet_counts"][0]["counts"][1]["value"]); + ASSERT_EQ(1, results["facet_counts"][0]["counts"][2]["count"]); + ASSERT_EQ("10thAD", results["facet_counts"][0]["counts"][2]["value"]); +} \ No newline at end of file diff --git a/test/collection_optimized_faceting_test.cpp b/test/collection_optimized_faceting_test.cpp index 34199464..57c19a3b 100644 --- a/test/collection_optimized_faceting_test.cpp +++ b/test/collection_optimized_faceting_test.cpp @@ -2648,3 +2648,50 @@ TEST_F(CollectionOptimizedFacetingTest, StringFacetsCountListRemoveTest) { ASSERT_EQ("The Shawshank Redemption", results["facet_counts"][0]["counts"][0]["value"]); ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"]); } + +TEST_F(CollectionOptimizedFacetingTest, RangeFacetAlphanumericLabels) { + std::vector fields = {field("monuments", field_types::STRING, false), + field("year", field_types::INT32, true),}; + Collection* coll1 = collectionManager.create_collection( + "coll1", 1, fields, "", 0, "", + {},{}).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["monuments"] = "Statue Of Unity"; + doc["year"] = 2018; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "1"; + doc["monuments"] = "Taj Mahal"; + doc["year"] = 1653; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "2"; + doc["monuments"] = "Mysore Palace"; + doc["year"] = 1897; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["id"] = "3"; + doc["monuments"] = "Chennakesava Temple"; + doc["year"] = 1117; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("*", {}, + "", {"year(10thAD:[1000,1500], 15thAD:[1500,2000], 20thAD:[2000, ])"}, + {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, + "", "", {}, 1000, + true, false, true, "", true, 6000*1000, 4, 7, fallback, 4, {off}, INT16_MAX, INT16_MAX, + 2, 2, false, "", true, 0, max_score, 100, 0, 0, VALUE).get(); + + ASSERT_EQ(3, results["facet_counts"][0]["counts"].size()); + ASSERT_EQ(2, results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("15thAD", results["facet_counts"][0]["counts"][0]["value"]); + ASSERT_EQ(1, results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_EQ("20thAD", results["facet_counts"][0]["counts"][1]["value"]); + ASSERT_EQ(1, results["facet_counts"][0]["counts"][2]["count"]); + ASSERT_EQ("10thAD", results["facet_counts"][0]["counts"][2]["value"]); +} \ No newline at end of file From 058b8be621e9fe4afe15eabcaaaab9bea486022f Mon Sep 17 00:00:00 2001 From: krunal Date: Thu, 21 Dec 2023 11:56:58 +0530 Subject: [PATCH 18/20] add alphanumeric range facet labels & minor bug fix --- src/collection.cpp | 2 +- src/index.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index c53677d7..abef4b13 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -5850,7 +5850,7 @@ bool Collection::get_enable_nested_fields() { Option Collection::parse_facet(const std::string& facet_field, std::vector& facets) const { const std::regex base_pattern(".+\\(.*\\)"); - const std::regex range_pattern("[[a-z A-Z]+:\\[([+-]?([0-9]*[.])?[0-9]*)\\,\\s*([+-]?([0-9]*[.])?[0-9]*)\\]"); + const std::regex range_pattern("[[0-9]*[a-z A-Z]+[0-9]*:\\[([+-]?([0-9]*[.])?[0-9]*)\\,\\s*([+-]?([0-9]*[.])?[0-9]*)\\]"); const std::string _alpha = "_alpha"; if ((facet_field.find(":") != std::string::npos) diff --git a/src/index.cpp b/src/index.cpp index a4d83f63..0287f443 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1389,7 +1389,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, if(a_facet.get_range(std::stoll(doc_val), range_pair)) { const auto& range_id = range_pair.first; facet_count_t& facet_count = a_facet.result_map[range_id]; - facet_count.count = kv.second.count; + facet_count.count += kv.second.count; } } else { facet_count_t& facet_count = a_facet.value_result_map[kv.first]; From 7126aa80244eb84651e8c8724235cc9db8ef19a4 Mon Sep 17 00:00:00 2001 From: krunal Date: Thu, 21 Dec 2023 17:39:59 +0530 Subject: [PATCH 19/20] fix open quotes search query bug --- include/string_utils.h | 2 + src/collection.cpp | 3 +- src/string_utils.cpp | 4 ++ test/collection_specific_more_test.cpp | 4 ++ test/collection_specific_test.cpp | 56 ++++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 1 deletion(-) diff --git a/include/string_utils.h b/include/string_utils.h index 74cfc537..11f16967 100644 --- a/include/string_utils.h +++ b/include/string_utils.h @@ -335,4 +335,6 @@ struct StringUtils { static Option tokenize_filter_query(const std::string& filter_query, std::queue& tokens); static Option split_include_fields(const std::string& include_fields, std::vector& tokens); + + static size_t get_occurence_count(const std::string& str, char symbol); }; diff --git a/src/collection.cpp b/src/collection.cpp index abef4b13..62e003f3 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -3257,6 +3257,7 @@ void Collection::parse_search_query(const std::string &query, std::vector phrase; + bool is_open_quotes = StringUtils::get_occurence_count(query, '"') & 1; auto symbols_to_index_has_minus = std::find(symbols_to_index.begin(), symbols_to_index.end(), '-') != symbols_to_index.end(); @@ -3271,7 +3272,7 @@ void Collection::parse_search_query(const std::string &query, std::vector 1) { + if(!is_open_quotes && token[0] == '"' && token.size() > 1) { phrase_search_op_prior = true; token = token.substr(1); } diff --git a/src/string_utils.cpp b/src/string_utils.cpp index a9296cd5..109c5d09 100644 --- a/src/string_utils.cpp +++ b/src/string_utils.cpp @@ -594,3 +594,7 @@ size_t StringUtils::split_facet(const std::string &s, std::vector & std::wstring_convert, char32_t> utf8conv; return utf8conv.from_bytes(bytes).size(); }*/ + +size_t StringUtils::get_occurence_count(const std::string &str, char symbol) { + return std::count(str.begin(), str.end(), symbol); +} \ No newline at end of file diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 51b5a58c..05361729 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -2179,6 +2179,10 @@ TEST_F(CollectionSpecificMoreTest, PhraseMatchAcrossArrayElements) { auto res = coll1->search(R"("state of the art)", {"texts"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set()).get(); + ASSERT_EQ(1, res["hits"].size()); + + res = coll1->search(R"("state of the art")", {"texts"}, "", {}, {}, {0}, 10, 1, + FREQUENCY, {true}, 10, spp::sparse_hash_set()).get(); ASSERT_EQ(0, res["hits"].size()); } diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index 38764660..388eb883 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -2996,3 +2996,59 @@ TEST_F(CollectionSpecificTest, DontHighlightPunctuation) { collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionSpecificTest, ExactMatchWithoutClosingSymbol) { + std::vector fields = {field("title", field_types::STRING, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + std::vector> records = { + {"Hampi"}, + {"Mahabalipuram"}, + {"Taj Mahal"}, + {"Mysore Palace"} + }; + + for(size_t i=0; iadd(doc.dump()).ok()); + } + + std::map req_params = { + {"collection", "coll1"}, + {"q", "\"Hamp"}, + {"query_by", "title"}, + }; + nlohmann::json embedded_params; + std::string json_res; + auto now_ts = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + + auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + + nlohmann::json result = nlohmann::json::parse(json_res); + ASSERT_EQ(1, result["hits"].size()); + ASSERT_EQ("0", result["hits"][0]["document"]["id"]); + ASSERT_EQ("Hampi", result["hits"][0]["document"]["title"]); + + req_params = { + {"collection", "coll1"}, + {"q", "\"Mah"}, + {"query_by", "title"}, + }; + now_ts = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + + search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts); + + result = nlohmann::json::parse(json_res); + ASSERT_EQ(2, result["hits"].size()); + ASSERT_EQ("2", result["hits"][0]["document"]["id"]); + ASSERT_EQ("Taj Mahal", result["hits"][0]["document"]["title"]); + ASSERT_EQ("1", result["hits"][1]["document"]["id"]); + ASSERT_EQ("Mahabalipuram", result["hits"][1]["document"]["title"]); +} \ No newline at end of file From c12e911f6c3e59be920bf5fe94505e2d9a80b151 Mon Sep 17 00:00:00 2001 From: krunal Date: Fri, 22 Dec 2023 21:07:08 +0530 Subject: [PATCH 20/20] updated patch by @kishorenc --- src/collection.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index 62e003f3..e08e8c59 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -3257,7 +3257,6 @@ void Collection::parse_search_query(const std::string &query, std::vector phrase; - bool is_open_quotes = StringUtils::get_occurence_count(query, '"') & 1; auto symbols_to_index_has_minus = std::find(symbols_to_index.begin(), symbols_to_index.end(), '-') != symbols_to_index.end(); @@ -3272,7 +3271,7 @@ void Collection::parse_search_query(const std::string &query, std::vector 1) { + if(token[0] == '"' && token.size() > 1) { phrase_search_op_prior = true; token = token.substr(1); } @@ -3333,7 +3332,7 @@ void Collection::parse_search_query(const std::string &query, std::vector