From 10c1f4c5c1b87fc85b21352553a023e408c2721b Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 8 Aug 2023 11:39:18 +0530 Subject: [PATCH 01/16] Fix schema validation of non-optional, null nested values. --- include/field.h | 7 +- src/field.cpp | 53 ++++++++--- test/collection_nested_fields_test.cpp | 125 +++++++++++++++++++++++++ 3 files changed, 169 insertions(+), 16 deletions(-) diff --git a/include/field.h b/include/field.h index 6b4ee937..d34d32ae 100644 --- a/include/field.h +++ b/include/field.h @@ -23,6 +23,7 @@ namespace field_types { static const std::string INT64 = "int64"; static const std::string FLOAT = "float"; static const std::string BOOL = "bool"; + static const std::string NIL = "nil"; static const std::string GEOPOINT = "geopoint"; static const std::string STRING_ARRAY = "string[]"; static const std::string INT32_ARRAY = "int32[]"; @@ -429,19 +430,19 @@ struct field { std::vector& fields_vec); static bool flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_array, bool has_obj_array, - const field& the_field, const std::string& flat_name, + bool is_update, const field& the_field, const std::string& flat_name, const std::unordered_map& dyn_fields, std::unordered_map& flattened_fields); static Option flatten_field(nlohmann::json& doc, nlohmann::json& obj, const field& the_field, std::vector& path_parts, size_t path_index, bool has_array, - bool has_obj_array, + bool has_obj_array, bool is_update, const std::unordered_map& dyn_fields, std::unordered_map& flattened_fields); static Option flatten_doc(nlohmann::json& document, const tsl::htrie_map& nested_fields, const std::unordered_map& dyn_fields, - bool missing_is_ok, std::vector& flattened_fields); + bool is_update, std::vector& flattened_fields); static void compact_nested_fields(tsl::htrie_map& nested_fields); }; diff --git a/src/field.cpp b/src/field.cpp index bccd8da5..7d5e399c 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -824,18 +824,41 @@ Option field::json_field_to_field(bool enable_nested_fields, nlohmann::jso } bool field::flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_array, bool has_obj_array, - const field& the_field, const std::string& flat_name, + bool is_update, const field& the_field, const std::string& flat_name, const std::unordered_map& dyn_fields, std::unordered_map& flattened_fields) { if(value.is_object()) { has_obj_array = has_array; - for(const auto& kv: value.items()) { - flatten_obj(doc, kv.value(), has_array, has_obj_array, the_field, flat_name + "." + kv.key(), - dyn_fields, flattened_fields); + auto it = value.begin(); + while(it != value.end()) { + const std::string& child_field_name = flat_name + "." + it.key(); + if(it.value().is_null()) { + if(has_array) { + doc[child_field_name].push_back(nullptr); + } else { + doc[child_field_name] = nullptr; + } + + field flattened_field; + flattened_field.name = child_field_name; + flattened_field.type = field_types::NIL; + flattened_fields[child_field_name] = flattened_field; + + if(!is_update) { + // update code path requires and takes care of null values + it = value.erase(it); + } else { + it++; + } + } else { + flatten_obj(doc, it.value(), has_array, has_obj_array, is_update, the_field, child_field_name, + dyn_fields, flattened_fields); + it++; + } } } else if(value.is_array()) { for(const auto& kv: value.items()) { - flatten_obj(doc, kv.value(), true, has_obj_array, the_field, flat_name, dyn_fields, flattened_fields); + flatten_obj(doc, kv.value(), true, has_obj_array, is_update, the_field, flat_name, dyn_fields, flattened_fields); } } else { // must be a primitive if(doc.count(flat_name) != 0 && flattened_fields.find(flat_name) == flattened_fields.end()) { @@ -891,7 +914,7 @@ bool field::flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_arr Option field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, const field& the_field, std::vector& path_parts, size_t path_index, - bool has_array, bool has_obj_array, + bool has_array, bool has_obj_array, bool is_update, const std::unordered_map& dyn_fields, std::unordered_map& flattened_fields) { if(path_index == path_parts.size()) { @@ -946,7 +969,8 @@ Option field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, cons if(detected_type == the_field.type || is_numericaly_valid) { if(the_field.is_object()) { - flatten_obj(doc, obj, has_array, has_obj_array, the_field, the_field.name, dyn_fields, flattened_fields); + flatten_obj(doc, obj, has_array, has_obj_array, is_update, the_field, the_field.name, + dyn_fields, flattened_fields); } else { if(doc.count(the_field.name) != 0 && flattened_fields.find(the_field.name) == flattened_fields.end()) { return Option(true); @@ -989,7 +1013,7 @@ Option field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, cons for(auto& ele: it.value()) { has_obj_array = has_obj_array || ele.is_object(); Option op = flatten_field(doc, ele, the_field, path_parts, path_index + 1, has_array, - has_obj_array, dyn_fields, flattened_fields); + has_obj_array, is_update, dyn_fields, flattened_fields); if(!op.ok()) { return op; } @@ -997,7 +1021,7 @@ Option field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, cons return Option(true); } else { return flatten_field(doc, it.value(), the_field, path_parts, path_index + 1, has_array, has_obj_array, - dyn_fields, flattened_fields); + is_update, dyn_fields, flattened_fields); } } { return Option(404, "Field `" + the_field.name + "` not found."); @@ -1007,7 +1031,7 @@ Option field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, cons Option field::flatten_doc(nlohmann::json& document, const tsl::htrie_map& nested_fields, const std::unordered_map& dyn_fields, - bool missing_is_ok, std::vector& flattened_fields) { + bool is_update, std::vector& flattened_fields) { std::unordered_map flattened_fields_map; @@ -1021,12 +1045,12 @@ Option field::flatten_doc(nlohmann::json& document, } auto op = flatten_field(document, document, nested_field, field_parts, 0, false, false, - dyn_fields, flattened_fields_map); + is_update, dyn_fields, flattened_fields_map); if(op.ok()) { continue; } - if(op.code() == 404 && (missing_is_ok || nested_field.optional)) { + if(op.code() == 404 && (is_update || nested_field.optional)) { continue; } else { return op; @@ -1036,7 +1060,10 @@ Option field::flatten_doc(nlohmann::json& document, document[".flat"] = nlohmann::json::array(); for(auto& kv: flattened_fields_map) { document[".flat"].push_back(kv.second.name); - flattened_fields.push_back(kv.second); + if(kv.second.type != field_types::NIL) { + // not a real field so we won't add it + flattened_fields.push_back(kv.second); + } } return Option(true); diff --git a/test/collection_nested_fields_test.cpp b/test/collection_nested_fields_test.cpp index a2eef13d..6def5d1a 100644 --- a/test/collection_nested_fields_test.cpp +++ b/test/collection_nested_fields_test.cpp @@ -2560,6 +2560,131 @@ TEST_F(CollectionNestedFieldsTest, NullValuesWithExplicitSchema) { auto results = coll1->search("jack", {"name.first"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(2, results["hits"][0]["document"].size()); // id, name + ASSERT_EQ(1, results["hits"][0]["document"]["name"].size()); // name.first + ASSERT_EQ("Jack", results["hits"][0]["document"]["name"]["first"].get()); +} + +TEST_F(CollectionNestedFieldsTest, EmplaceWithNullValueOnRequiredField) { + nlohmann::json schema = R"({ + "name": "coll1", + "enable_nested_fields": true, + "fields": [ + {"name":"currency", "type":"object"}, + {"name":"currency.eu", "type":"int32", "optional": false} + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection *coll1 = op.get(); + + auto doc1 = R"({ + "id": "0", + "currency": { + "eu": 12000 + } + })"_json; + + auto add_op = coll1->add(doc1.dump(), CREATE); + ASSERT_TRUE(add_op.ok()); + + // now update with null value -- should not be allowed + auto update_doc = R"({ + "id": "0", + "currency": { + "eu": null + } + })"_json; + + auto update_op = coll1->add(update_doc.dump(), EMPLACE); + ASSERT_FALSE(update_op.ok()); + ASSERT_EQ("Field `currency.eu` must be an int32.", update_op.error()); +} + +TEST_F(CollectionNestedFieldsTest, EmplaceWithNullValueOnOptionalField) { + nlohmann::json schema = R"({ + "name": "coll1", + "enable_nested_fields": true, + "fields": [ + {"name":"currency", "type":"object"}, + {"name":"currency.eu", "type":"int32", "optional": true} + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection *coll1 = op.get(); + + auto doc1 = R"({ + "id": "0", + "currency": { + "eu": 12000 + } + })"_json; + + auto add_op = coll1->add(doc1.dump(), CREATE); + ASSERT_TRUE(add_op.ok()); + + // now update with null value -- should be allowed since field is optional + auto update_doc = R"({ + "id": "0", + "currency": { + "eu": null + } + })"_json; + + auto update_op = coll1->add(update_doc.dump(), EMPLACE); + ASSERT_TRUE(update_op.ok()); + + // try to fetch the document to see the stored value + auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); + ASSERT_EQ(1, results["found"].get()); + ASSERT_EQ(2, results["hits"][0]["document"].size()); // id, currency + ASSERT_EQ(0, results["hits"][0]["document"]["currency"].size()); +} + +TEST_F(CollectionNestedFieldsTest, EmplaceWithMissingArrayValueOnOptionalField) { + nlohmann::json schema = R"({ + "name": "coll1", + "enable_nested_fields": true, + "fields": [ + {"name":"currency", "type":"object[]"}, + {"name":"currency.eu", "type":"int32[]", "optional": true} + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection *coll1 = op.get(); + + auto doc1 = R"({ + "id": "0", + "currency": [ + {"eu": 12000}, + {"us": 10000} + ] + })"_json; + + auto add_op = coll1->add(doc1.dump(), CREATE); + ASSERT_TRUE(add_op.ok()); + + // now update with null value -- should be allowed since field is optional + auto update_doc = R"({ + "id": "0", + "currency": [ + {"us": 10000} + ] + })"_json; + + auto update_op = coll1->add(update_doc.dump(), EMPLACE); + ASSERT_TRUE(update_op.ok()); + + // try to fetch the document to see the stored value + auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); + ASSERT_EQ(1, results["found"].get()); + ASSERT_EQ(2, results["hits"][0]["document"].size()); // id, currency + ASSERT_EQ(1, results["hits"][0]["document"]["currency"].size()); + ASSERT_EQ(10000, results["hits"][0]["document"]["currency"][0]["us"].get()); } TEST_F(CollectionNestedFieldsTest, UpdateNestedDocument) { From b3f248bd934935d6b34ccd2439b19a639ed1fb86 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 8 Aug 2023 20:24:15 +0530 Subject: [PATCH 02/16] Handle emplace + null values. --- src/validator.cpp | 2 +- test/collection_nested_fields_test.cpp | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/validator.cpp b/src/validator.cpp index 51a7d19c..f814c923 100644 --- a/src/validator.cpp +++ b/src/validator.cpp @@ -626,7 +626,7 @@ Option validator_t::validate_index_in_memory(nlohmann::json& document, continue; } - if((a_field.optional || op == UPDATE || op == EMPLACE) && document.count(field_name) == 0) { + if((a_field.optional || op == UPDATE || (op == EMPLACE && is_update)) && document.count(field_name) == 0) { continue; } diff --git a/test/collection_nested_fields_test.cpp b/test/collection_nested_fields_test.cpp index 6def5d1a..98a94f37 100644 --- a/test/collection_nested_fields_test.cpp +++ b/test/collection_nested_fields_test.cpp @@ -2578,6 +2578,19 @@ TEST_F(CollectionNestedFieldsTest, EmplaceWithNullValueOnRequiredField) { ASSERT_TRUE(op.ok()); Collection *coll1 = op.get(); + auto doc_with_null = R"({ + "id": "0", + "currency": { + "eu": null + } + })"_json; + + auto add_op = coll1->add(doc_with_null.dump(), EMPLACE); + ASSERT_FALSE(add_op.ok()); + + add_op = coll1->add(doc_with_null.dump(), CREATE); + ASSERT_FALSE(add_op.ok()); + auto doc1 = R"({ "id": "0", "currency": { @@ -2585,7 +2598,7 @@ TEST_F(CollectionNestedFieldsTest, EmplaceWithNullValueOnRequiredField) { } })"_json; - auto add_op = coll1->add(doc1.dump(), CREATE); + add_op = coll1->add(doc1.dump(), CREATE); ASSERT_TRUE(add_op.ok()); // now update with null value -- should not be allowed From 379604cad167659d9441e760da98840ea0eef9dc Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Tue, 8 Aug 2023 18:34:07 +0300 Subject: [PATCH 03/16] Fix wrong hybrid search text match score --- include/topster.h | 4 ++ src/collection.cpp | 4 +- src/index.cpp | 3 ++ test/collection_specific_more_test.cpp | 52 ++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 2 deletions(-) diff --git a/include/topster.h b/include/topster.h index b0b8f125..c7378f2b 100644 --- a/include/topster.h +++ b/include/topster.h @@ -31,6 +31,10 @@ struct KV { this->scores[0] = scores[0]; this->scores[1] = scores[1]; this->scores[2] = scores[2]; + + if(match_score_index >= 0) { + this->text_match_score = scores[match_score_index]; + } } KV() = default; diff --git a/src/collection.cpp b/src/collection.cpp index e60ffe41..88880b51 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1957,10 +1957,10 @@ Option Collection::search(std::string raw_query, if(field_order_kv->match_score_index == CURATED_RECORD_IDENTIFIER) { wrapper_doc["curated"] = true; } else if(field_order_kv->match_score_index >= 0) { - wrapper_doc["text_match"] = field_order_kv->scores[field_order_kv->match_score_index]; + wrapper_doc["text_match"] = field_order_kv->text_match_score; wrapper_doc["text_match_info"] = nlohmann::json::object(); populate_text_match_info(wrapper_doc["text_match_info"], - field_order_kv->scores[field_order_kv->match_score_index], match_type); + field_order_kv->text_match_score, match_type); if(!vector_query.field_name.empty()) { wrapper_doc["hybrid_search_info"] = nlohmann::json::object(); wrapper_doc["hybrid_search_info"]["rank_fusion_score"] = Index::int64_t_to_float(field_order_kv->scores[field_order_kv->match_score_index]); diff --git a/src/index.cpp b/src/index.cpp index 7192faba..05444b73 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -3204,6 +3204,7 @@ Option Index::search(std::vector& field_query_tokens, cons auto result = result_it->second; // old_score + (1 / rank_of_document) * WEIGHT) result->vector_distance = vec_result.second; + result->text_match_score = result->scores[result->match_score_index]; int64_t match_score = float_to_int64_t( (int64_t_to_float(result->scores[result->match_score_index])) + ((1.0 / (res_index + 1)) * VECTOR_SEARCH_WEIGHT)); @@ -3225,6 +3226,7 @@ Option Index::search(std::vector& field_query_tokens, cons int64_t match_score_index = -1; compute_sort_scores(sort_fields_std, sort_order, field_values, geopoint_indices, doc_id, 0, match_score, scores, match_score_index, vec_result.second); KV kv(searched_queries.size(), doc_id, doc_id, match_score_index, scores); + kv.text_match_score = 0; kv.vector_distance = vec_result.second; topster->add(&kv); vec_search_ids.push_back(doc_id); @@ -4154,6 +4156,7 @@ void Index::search_across_fields(const std::vector& query_tokens, KV kv(searched_queries.size(), seq_id, distinct_id, match_score_index, scores); if(match_score_index != -1) { kv.scores[match_score_index] = aggregated_score; + kv.text_match_score = aggregated_score; } int ret = topster->add(&kv); diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index e3f82f69..3e5b43a7 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -2530,4 +2530,56 @@ TEST_F(CollectionSpecificMoreTest, ApproxFilterMatchCount) { delete filter_tree_root; collectionManager.drop_collection("Collection"); +} + +TEST_F(CollectionSpecificMoreTest, HybridSearchTextMatchInfo) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_id", "type": "string"}, + {"name": "product_name", "type": "string", "infix": true}, + {"name": "product_description", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_description"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + std::vector documents = { + R"({ + "product_id": "product_a", + "product_name": "shampoo", + "product_description": "Our new moisturizing shampoo is perfect for those with dry or damaged hair." + })"_json, + R"({ + "product_id": "product_b", + "product_name": "soap", + "product_description": "Introducing our all-natural, organic soap bar made with essential oils and botanical ingredients." + })"_json + }; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + for (auto const &json: documents) { + auto add_op = collection_create_op.get()->add(json.dump()); + ASSERT_TRUE(add_op.ok()); + } + + auto coll1 = collection_create_op.get(); + auto results = coll1->search("natural products", {"product_name", "embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(2, results["hits"].size()); + + // It's a hybrid search with only vector match + ASSERT_EQ("0", results["hits"][0]["text_match_info"]["score"].get()); + ASSERT_EQ("0", results["hits"][1]["text_match_info"]["score"].get()); + + ASSERT_EQ(0, results["hits"][0]["text_match_info"]["fields_matched"].get()); + ASSERT_EQ(0, results["hits"][1]["text_match_info"]["fields_matched"].get()); + + ASSERT_EQ(0, results["hits"][0]["text_match_info"]["tokens_matched"].get()); + ASSERT_EQ(0, results["hits"][1]["text_match_info"]["tokens_matched"].get()); } \ No newline at end of file From a99929f05fa09b13e060c3cc1caede29f61917b1 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 9 Aug 2023 19:02:36 +0530 Subject: [PATCH 04/16] Address change in streaming behavior of h2o on http/2. _req->proceed_req is not 1 when http2 is used for chunks that follow the first chunk. --- src/http_server.cpp | 10 ++++------ src/raft_server.cpp | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/http_server.cpp b/src/http_server.cpp index 9fbfb2a5..50391ed8 100644 --- a/src/http_server.cpp +++ b/src/http_server.cpp @@ -569,13 +569,11 @@ int HttpServer::async_req_cb(void *ctx, int is_end_stream) { bool async_req = custom_generator->rpath->async_req; bool is_http_v1 = (0x101 <= request->_req->version && request->_req->version < 0x200); - /* - LOG(INFO) << "async_req_cb, chunk.len=" << chunk.len + /*LOG(INFO) << "async_req_cb, chunk.len=" << chunk.len << ", is_http_v1: " << is_http_v1 - << ", request->req->entity.len=" << request->req->entity.len - << ", content_len: " << request->req->content_length - << ", is_end_stream=" << is_end_stream; - */ + << ", request->req->entity.len=" << request->_req->entity.len + << ", content_len: " << request->_req->content_length + << ", is_end_stream=" << is_end_stream;*/ // disallow specific curl clients from using import call via http2 // detects: https://github.com/curl/curl/issues/1410 diff --git a/src/raft_server.cpp b/src/raft_server.cpp index 48584937..f33a6518 100644 --- a/src/raft_server.cpp +++ b/src/raft_server.cpp @@ -254,7 +254,7 @@ void ReplicationState::write_to_leader(const std::shared_ptr& request, // Handle no leader scenario LOG(ERROR) << "Rejecting write: could not find a leader."; - if(request->_req->proceed_req && response->proxied_stream) { + if(response->proxied_stream) { // streaming in progress: ensure graceful termination (cannot start response again) LOG(ERROR) << "Terminating streaming request gracefully."; response->is_alive = false; @@ -267,7 +267,7 @@ void ReplicationState::write_to_leader(const std::shared_ptr& request, return message_dispatcher->send_message(HttpServer::STREAM_RESPONSE_MESSAGE, req_res); } - if (request->_req->proceed_req && response->proxied_stream) { + if (response->proxied_stream) { // indicates async request body of in-flight request //LOG(INFO) << "Inflight proxied request, returning control to caller, body_size=" << request->body.size(); request->notify(); From 64ec0fea41b2840ec91b33989cf2a1331807c5f4 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 09:52:34 +0300 Subject: [PATCH 05/16] Fix search results of semantic search --- src/collection.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index 88880b51..1fbce5cf 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1073,7 +1073,7 @@ Option Collection::extract_field_name(const std::string& field_name, return Option(true); } -Option Collection::search(std::string raw_query, +Option Collection::search(std::string raw_query, const std::vector& raw_search_fields, const std::string & filter_query, const std::vector& facet_fields, const std::vector & sort_fields, const std::vector& num_typos, @@ -1201,6 +1201,7 @@ Option Collection::search(std::string raw_query, std::vector processed_search_fields; std::vector query_by_weights; size_t num_embed_fields = 0; + std::string query = raw_query; for(size_t i = 0; i < raw_search_fields.size(); i++) { const std::string& field_name = raw_search_fields[i]; @@ -1289,6 +1290,11 @@ Option Collection::search(std::string raw_query, } } + // Set query to * if it is semantic search + if(!vector_query.field_name.empty() && processed_search_fields.empty()) { + query = "*"; + } + if(!vector_query.field_name.empty() && vector_query.values.empty() && num_embed_fields == 0) { std::string error = "Vector query could not find any embedded fields."; return Option(400, error); @@ -1444,7 +1450,7 @@ Option Collection::search(std::string raw_query, size_t max_hits = DEFAULT_TOPSTER_SIZE; // ensure that `max_hits` never exceeds number of documents in collection - if(search_fields.size() <= 1 || raw_query == "*") { + if(search_fields.size() <= 1 || query == "*") { max_hits = std::min(std::max(fetch_size, max_hits), get_num_documents()); } else { max_hits = std::min(std::max(fetch_size, max_hits), get_num_documents()); @@ -1477,7 +1483,6 @@ Option Collection::search(std::string raw_query, StringUtils::split(hidden_hits_str, hidden_hits, ","); std::vector filter_overrides; - std::string query = raw_query; bool filter_curated_hits = false; std::string curated_sort_by; curate_results(query, filter_query, enable_overrides, pre_segmented_query, pinned_hits, hidden_hits, @@ -1520,6 +1525,10 @@ Option Collection::search(std::string raw_query, bool is_group_by_query = group_by_fields.size() > 0; bool is_vector_query = !vector_query.field_name.empty(); + LOG(INFO) << "is_wildcard_query: " << is_wildcard_query; + LOG(INFO) << "is_group_by_query: " << is_group_by_query; + LOG(INFO) << "is_vector_query: " << is_vector_query; + if(curated_sort_by.empty()) { auto sort_validation_op = validate_and_standardize_sort_fields(sort_fields, sort_fields_std, is_wildcard_query, is_vector_query, is_group_by_query); From c4919bb358688fb2e830d27afef67f6d9a4221ce Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 09:53:26 +0300 Subject: [PATCH 06/16] Remove logs --- src/collection.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index 1fbce5cf..fdc20482 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1525,10 +1525,6 @@ Option Collection::search(std::string raw_query, bool is_group_by_query = group_by_fields.size() > 0; bool is_vector_query = !vector_query.field_name.empty(); - LOG(INFO) << "is_wildcard_query: " << is_wildcard_query; - LOG(INFO) << "is_group_by_query: " << is_group_by_query; - LOG(INFO) << "is_vector_query: " << is_vector_query; - if(curated_sort_by.empty()) { auto sort_validation_op = validate_and_standardize_sort_fields(sort_fields, sort_fields_std, is_wildcard_query, is_vector_query, is_group_by_query); From d1692501fa846c431b3cf4abe4775d187ab5911b Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 14:22:14 +0300 Subject: [PATCH 07/16] Fix text embedding field detection --- src/collection.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/collection.cpp b/src/collection.cpp index fdc20482..8699e51c 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1043,7 +1043,7 @@ Option Collection::extract_field_name(const std::string& field_name, for(auto kv = prefix_it.first; kv != prefix_it.second; ++kv) { bool exact_key_match = (kv.key().size() == field_name.size()); bool exact_primitive_match = exact_key_match && !kv.value().is_object(); - bool text_embedding = kv.value().type == field_types::FLOAT_ARRAY && kv.value().embed.count(fields::from) != 0; + bool text_embedding = kv.value().type == field_types::FLOAT_ARRAY && kv.value().num_dim > 0; if(extract_only_string_fields && !kv.value().is_string() && !text_embedding) { if(exact_primitive_match && !is_wildcard) { From 278c29b3ea0c8cdda1edd7b9b9e21f9633d11e57 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 14:22:25 +0300 Subject: [PATCH 08/16] Add tests --- test/collection_specific_more_test.cpp | 113 +++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 3e5b43a7..7d44351c 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -2582,4 +2582,117 @@ TEST_F(CollectionSpecificMoreTest, HybridSearchTextMatchInfo) { ASSERT_EQ(0, results["hits"][0]["text_match_info"]["tokens_matched"].get()); ASSERT_EQ(0, results["hits"][1]["text_match_info"]["tokens_matched"].get()); +} + + +TEST_F(CollectionSpecificMoreTest, SemanticSearchReturnOnlyVectorDistance) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("moisturizer", {"embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(1, results["hits"].size()); + + // Return only vector distance + ASSERT_EQ(0, results["hits"][0].count("text_match_info")); + ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); + ASSERT_EQ(1, results["hits"][0].count("vector_distance")); +} + +TEST_F(CollectionSpecificMoreTest, KeywordSearchReturnOnlyTextMatchInfo) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("moisturizer", {"product_name"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + + ASSERT_EQ(1, results["hits"].size()); + + // Return only text match info + ASSERT_EQ(0, results["hits"][0].count("vector_distance")); + ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); + ASSERT_EQ(1, results["hits"][0].count("text_match_info")); +} + +TEST_F(CollectionSpecificMoreTest, HybridSearchReturnAllInfo) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + + auto results = coll1->search("moisturizer", {"product_name", "embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(1, results["hits"].size()); + + // Return all info + ASSERT_EQ(1, results["hits"][0].count("vector_distance")); + ASSERT_EQ(1, results["hits"][0].count("text_match_info")); + ASSERT_EQ(1, results["hits"][0].count("hybrid_search_info")); } \ No newline at end of file From 7096ad0c253f8d37897204393f57bcc802f49ddf Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 14:29:27 +0300 Subject: [PATCH 09/16] Remove log --- src/field.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/field.cpp b/src/field.cpp index 8c97bd2b..dc82a021 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -1095,8 +1095,6 @@ Option field::validate_and_init_embed_fields(const std::vector>()) { auto embed_field = std::find_if(fields_json.begin(), fields_json.end(), [&field_name](const nlohmann::json& x) { return x["name"].get() == field_name; From 093442857a4c38a0e2129b5309b12493c9035020 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 14:31:58 +0300 Subject: [PATCH 10/16] Move tests --- test/collection_specific_more_test.cpp | 111 ------------------------ test/collection_vector_search_test.cpp | 114 ++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 112 deletions(-) diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 7d44351c..fbc78e22 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -2585,114 +2585,3 @@ TEST_F(CollectionSpecificMoreTest, HybridSearchTextMatchInfo) { } -TEST_F(CollectionSpecificMoreTest, SemanticSearchReturnOnlyVectorDistance) { - auto schema_json = - R"({ - "name": "Products", - "fields": [ - {"name": "product_name", "type": "string", "infix": true}, - {"name": "category", "type": "string"}, - {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} - ] - })"_json; - - - TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); - - auto collection_create_op = collectionManager.create_collection(schema_json); - ASSERT_TRUE(collection_create_op.ok()); - auto coll1 = collection_create_op.get(); - - auto add_op = coll1->add(R"({ - "product_name": "moisturizer", - "category": "beauty" - })"_json.dump()); - - ASSERT_TRUE(add_op.ok()); - - auto results = coll1->search("moisturizer", {"embedding"}, - "", {}, {}, {2}, 10, - 1, FREQUENCY, {true}, - 0, spp::sparse_hash_set()).get(); - - ASSERT_EQ(1, results["hits"].size()); - - // Return only vector distance - ASSERT_EQ(0, results["hits"][0].count("text_match_info")); - ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); - ASSERT_EQ(1, results["hits"][0].count("vector_distance")); -} - -TEST_F(CollectionSpecificMoreTest, KeywordSearchReturnOnlyTextMatchInfo) { - auto schema_json = - R"({ - "name": "Products", - "fields": [ - {"name": "product_name", "type": "string", "infix": true}, - {"name": "category", "type": "string"}, - {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} - ] - })"_json; - - - TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); - - auto collection_create_op = collectionManager.create_collection(schema_json); - ASSERT_TRUE(collection_create_op.ok()); - auto coll1 = collection_create_op.get(); - auto add_op = coll1->add(R"({ - "product_name": "moisturizer", - "category": "beauty" - })"_json.dump()); - ASSERT_TRUE(add_op.ok()); - - auto results = coll1->search("moisturizer", {"product_name"}, - "", {}, {}, {2}, 10, - 1, FREQUENCY, {true}, - 0, spp::sparse_hash_set()).get(); - - - ASSERT_EQ(1, results["hits"].size()); - - // Return only text match info - ASSERT_EQ(0, results["hits"][0].count("vector_distance")); - ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); - ASSERT_EQ(1, results["hits"][0].count("text_match_info")); -} - -TEST_F(CollectionSpecificMoreTest, HybridSearchReturnAllInfo) { - auto schema_json = - R"({ - "name": "Products", - "fields": [ - {"name": "product_name", "type": "string", "infix": true}, - {"name": "category", "type": "string"}, - {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} - ] - })"_json; - - TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); - - auto collection_create_op = collectionManager.create_collection(schema_json); - ASSERT_TRUE(collection_create_op.ok()); - auto coll1 = collection_create_op.get(); - - auto add_op = coll1->add(R"({ - "product_name": "moisturizer", - "category": "beauty" - })"_json.dump()); - ASSERT_TRUE(add_op.ok()); - - - auto results = coll1->search("moisturizer", {"product_name", "embedding"}, - "", {}, {}, {2}, 10, - 1, FREQUENCY, {true}, - 0, spp::sparse_hash_set()).get(); - - ASSERT_EQ(1, results["hits"].size()); - - // Return all info - ASSERT_EQ(1, results["hits"][0].count("vector_distance")); - ASSERT_EQ(1, results["hits"][0].count("text_match_info")); - ASSERT_EQ(1, results["hits"][0].count("hybrid_search_info")); -} \ No newline at end of file diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index bfb20be5..390b9fa7 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -775,7 +775,7 @@ TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) { ASSERT_EQ(2, search_res["found"].get()); ASSERT_EQ(2, search_res["hits"].size()); - ASSERT_FLOAT_EQ(0.04620, search_res["hits"][0]["vector_distance"].get()); + ASSERT_FLOAT_EQ(0.046207964, search_res["hits"][0]["vector_distance"].get()); ASSERT_FLOAT_EQ(0.1213316321, search_res["hits"][1]["vector_distance"].get()); // to pass k param @@ -1031,4 +1031,116 @@ TEST_F(CollectionVectorTest, EmbedFromOptionalNullField) { add_op = coll->add(doc.dump()); ASSERT_TRUE(add_op.ok()); +} + +TEST_F(CollectionVectorTest, SemanticSearchReturnOnlyVectorDistance) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("moisturizer", {"embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(1, results["hits"].size()); + + // Return only vector distance + ASSERT_EQ(0, results["hits"][0].count("text_match_info")); + ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); + ASSERT_EQ(1, results["hits"][0].count("vector_distance")); +} + +TEST_F(CollectionVectorTest, KeywordSearchReturnOnlyTextMatchInfo) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("moisturizer", {"product_name"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + + ASSERT_EQ(1, results["hits"].size()); + + // Return only text match info + ASSERT_EQ(0, results["hits"][0].count("vector_distance")); + ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); + ASSERT_EQ(1, results["hits"][0].count("text_match_info")); +} + +TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + + auto results = coll1->search("moisturizer", {"product_name", "embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(1, results["hits"].size()); + + // Return all info + ASSERT_EQ(1, results["hits"][0].count("vector_distance")); + ASSERT_EQ(1, results["hits"][0].count("text_match_info")); + ASSERT_EQ(1, results["hits"][0].count("hybrid_search_info")); } \ No newline at end of file From 722cd3446d07072d54c2820f5065f92301413689 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 10 Aug 2023 18:38:12 +0530 Subject: [PATCH 11/16] Parsing vector float values in try. --- src/index.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index bc1845de..4a843349 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -867,9 +867,8 @@ void Index::index_field_in_memory(const field& afield, std::vector continue; } - const std::vector& float_vals = record.doc[afield.name].get>(); - try { + const std::vector& float_vals = record.doc[afield.name].get>(); if(afield.vec_dist == cosine) { std::vector normalized_vals(afield.num_dim); hnsw_index_t::normalize_vector(float_vals, normalized_vals); From f33163ff164bdaf7c1218e4b48f394f592e3781a Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 10 Aug 2023 20:30:40 +0530 Subject: [PATCH 12/16] Fix regression in partial update of record with embedding. --- src/validator.cpp | 2 +- test/collection_vector_search_test.cpp | 36 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/validator.cpp b/src/validator.cpp index f814c923..f8c23ee9 100644 --- a/src/validator.cpp +++ b/src/validator.cpp @@ -716,7 +716,7 @@ Option validator_t::validate_embed_fields(const nlohmann::json& document, } } } - if(all_optional_and_null && !field.optional) { + if(all_optional_and_null && !field.optional && !is_update) { return Option(400, "No valid fields found to create embedding for `" + field.name + "`, please provide at least one valid field or make the embedding field optional."); } } diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index 2f55cd34..b55bb085 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -1033,6 +1033,42 @@ TEST_F(CollectionVectorTest, EmbedFromOptionalNullField) { ASSERT_TRUE(add_op.ok()); } +TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) { + nlohmann::json schema = R"({ + "name": "objects", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "about", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + nlohmann::json object; + object["id"] = "0"; + object["name"] = "butter"; + object["about"] = "about butter"; + + auto add_op = coll->add(object.dump(), CREATE); + ASSERT_TRUE(add_op.ok()); + + nlohmann::json update_object; + update_object["id"] = "0"; + update_object["about"] = "something about butter"; + auto update_op = coll->add(update_object.dump(), EMPLACE); + ASSERT_TRUE(update_op.ok()); + + // action = update + update_object["about"] = "something about butter 2"; + update_op = coll->add(update_object.dump(), UPDATE); + ASSERT_TRUE(update_op.ok()); +} + TEST_F(CollectionVectorTest, SkipEmbeddingOpWhenValueExists) { nlohmann::json schema = R"({ "name": "objects", From c7e5285618074a8b3ba418382ede1b7e2cdeeef7 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 10 Aug 2023 20:50:57 +0530 Subject: [PATCH 13/16] Add additional test. --- test/collection_vector_search_test.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index b55bb085..c15161ae 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -1069,6 +1069,32 @@ TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) { ASSERT_TRUE(update_op.ok()); } +TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) { + auto schema = R"({ + "name": "objects", + "fields": [ + {"name": "name", "type": "string", "optional": true}, + {"name": "about", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + nlohmann::json object; + object["id"] = "0"; + object["about"] = "about butter"; + + auto add_op = coll->add(object.dump(), EMPLACE); + ASSERT_FALSE(add_op.ok()); + ASSERT_EQ("No valid fields found to create embedding for `embedding`, please provide at least one valid field " + "or make the embedding field optional.", add_op.error()); +} + TEST_F(CollectionVectorTest, SkipEmbeddingOpWhenValueExists) { nlohmann::json schema = R"({ "name": "objects", From bbf67e1979f2045a3f147eabb224021c0fea44a4 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 11 Aug 2023 09:25:57 +0530 Subject: [PATCH 14/16] Fix counter increment for query aggregation. --- src/index.cpp | 4 +- test/collection_specific_more_test.cpp | 70 ++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index 4a843349..4d81126a 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -434,6 +434,8 @@ void Index::validate_and_preprocess(Index *index, std::vector& ite continue; } + handle_doc_ops(search_schema, index_rec.doc, index_rec.old_doc); + if(do_validation) { Option validation_op = validator_t::validate_index_in_memory(index_rec.doc, index_rec.seq_id, default_sorting_field, @@ -471,7 +473,6 @@ void Index::validate_and_preprocess(Index *index, std::vector& ite } } } else { - handle_doc_ops(search_schema, index_rec.doc, index_rec.old_doc); if(generate_embeddings) { records_to_embed.push_back(&index_rec); } @@ -6260,7 +6261,6 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map()); } +TEST_F(CollectionSpecificMoreTest, IncrementingCount) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "title", "type": "string"}, + {"name": "count", "type": "int32"} + ] + })"_json; + + Collection* coll1 = collectionManager.create_collection(schema).get(); + + // brand new document: create + upsert + emplace should work + + nlohmann::json doc; + doc["id"] = "0"; + doc["title"] = "Foo"; + doc["$operations"]["increment"]["count"] = 1; + ASSERT_TRUE(coll1->add(doc.dump(), CREATE).ok()); + + doc.clear(); + doc["id"] = "1"; + doc["title"] = "Bar"; + doc["$operations"]["increment"]["count"] = 1; + ASSERT_TRUE(coll1->add(doc.dump(), EMPLACE).ok()); + + doc.clear(); + doc["id"] = "2"; + doc["title"] = "Taz"; + doc["$operations"]["increment"]["count"] = 1; + ASSERT_TRUE(coll1->add(doc.dump(), UPSERT).ok()); + + auto res = coll1->search("*", {}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 5, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10).get(); + + ASSERT_EQ(3, res["hits"].size()); + ASSERT_EQ(1, res["hits"][0]["document"]["count"].get()); + ASSERT_EQ(1, res["hits"][1]["document"]["count"].get()); + ASSERT_EQ(1, res["hits"][2]["document"]["count"].get()); + + // should support updates + + doc.clear(); + doc["id"] = "0"; + doc["title"] = "Foo"; + doc["$operations"]["increment"]["count"] = 3; + ASSERT_TRUE(coll1->add(doc.dump(), UPSERT).ok()); + + doc.clear(); + doc["id"] = "1"; + doc["title"] = "Bar"; + doc["$operations"]["increment"]["count"] = 3; + ASSERT_TRUE(coll1->add(doc.dump(), EMPLACE).ok()); + + doc.clear(); + doc["id"] = "2"; + doc["title"] = "Bar"; + doc["$operations"]["increment"]["count"] = 3; + ASSERT_TRUE(coll1->add(doc.dump(), UPDATE).ok()); + + res = coll1->search("*", {}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 5, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10).get(); + + ASSERT_EQ(3, res["hits"].size()); + ASSERT_EQ(4, res["hits"][0]["document"]["count"].get()); + ASSERT_EQ(4, res["hits"][1]["document"]["count"].get()); + ASSERT_EQ(4, res["hits"][2]["document"]["count"].get()); +} + TEST_F(CollectionSpecificMoreTest, HighlightOnFieldNameWithDot) { nlohmann::json schema = R"({ "name": "coll1", From dafde32ce0a1a0f8813c78670ab14da4346b5057 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Mon, 14 Aug 2023 23:25:18 +0300 Subject: [PATCH 15/16] Fix KV constructor parameters --- include/topster.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/topster.h b/include/topster.h index c7378f2b..a16b4440 100644 --- a/include/topster.h +++ b/include/topster.h @@ -24,7 +24,7 @@ struct KV { // to be used only in final aggregation uint64_t* query_indices = nullptr; - KV(uint16_t queryIndex, uint64_t key, uint64_t distinct_key, uint8_t match_score_index, const int64_t *scores, + KV(uint16_t queryIndex, uint64_t key, uint64_t distinct_key, int8_t match_score_index, const int64_t *scores, reference_filter_result_t* reference_filter_result = nullptr): match_score_index(match_score_index), query_index(queryIndex), array_index(0), key(key), distinct_key(distinct_key), reference_filter_result(reference_filter_result) { From e1e890279c0b9ffff93b8e0cd49f9796484ed247 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 15 Aug 2023 09:54:36 +0530 Subject: [PATCH 16/16] Refactor credential hiding logic. --- src/collection.cpp | 9 ++-- test/collection_vector_search_test.cpp | 66 ++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index 04625c8c..8883f4ab 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -261,10 +261,6 @@ nlohmann::json Collection::get_summary_json() const { field_json[fields::reference] = coll_field.reference; } - if(!coll_field.embed.empty()) { - field_json[fields::embed] = coll_field.embed; - } - fields_arr.push_back(field_json); } @@ -4936,9 +4932,10 @@ void Collection::hide_credential(nlohmann::json& json, const std::string& creden // hide api key with * except first 5 chars std::string credential_name_str = json[credential_name]; if(credential_name_str.size() > 5) { - json[credential_name] = credential_name_str.replace(5, credential_name_str.size() - 5, credential_name_str.size() - 5, '*'); + size_t num_chars_to_replace = credential_name_str.size() - 5; + json[credential_name] = credential_name_str.replace(5, num_chars_to_replace, num_chars_to_replace, '*'); } else { - json[credential_name] = credential_name_str.replace(0, credential_name_str.size(), credential_name_str.size(), '*'); + json[credential_name] = "***********"; } } } diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index c15161ae..059e4437 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -1033,6 +1033,72 @@ TEST_F(CollectionVectorTest, EmbedFromOptionalNullField) { ASSERT_TRUE(add_op.ok()); } +TEST_F(CollectionVectorTest, HideCredential) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name"], + "model_config": { + "model_name": "ts/e5-small", + "api_key": "ax-abcdef12345", + "access_token": "ax-abcdef12345", + "refresh_token": "ax-abcdef12345", + "client_id": "ax-abcdef12345", + "client_secret": "ax-abcdef12345", + "project_id": "ax-abcdef12345" + }}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + auto coll_summary = coll1->get_summary_json(); + + ASSERT_EQ("ax-ab*********", coll_summary["fields"][1]["embed"]["model_config"]["api_key"].get()); + ASSERT_EQ("ax-ab*********", coll_summary["fields"][1]["embed"]["model_config"]["access_token"].get()); + ASSERT_EQ("ax-ab*********", coll_summary["fields"][1]["embed"]["model_config"]["refresh_token"].get()); + ASSERT_EQ("ax-ab*********", coll_summary["fields"][1]["embed"]["model_config"]["client_id"].get()); + ASSERT_EQ("ax-ab*********", coll_summary["fields"][1]["embed"]["model_config"]["client_secret"].get()); + ASSERT_EQ("ax-ab*********", coll_summary["fields"][1]["embed"]["model_config"]["project_id"].get()); + + // small api key + + schema_json = + R"({ + "name": "Products2", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name"], + "model_config": { + "model_name": "ts/e5-small", + "api_key": "ax1", + "access_token": "ax1", + "refresh_token": "ax1", + "client_id": "ax1", + "client_secret": "ax1", + "project_id": "ax1" + }}} + ] + })"_json; + + collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll2 = collection_create_op.get(); + coll_summary = coll2->get_summary_json(); + + ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["api_key"].get()); + ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["access_token"].get()); + ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["refresh_token"].get()); + ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["client_id"].get()); + ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["client_secret"].get()); + ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["project_id"].get()); +} + TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) { nlohmann::json schema = R"({ "name": "objects",