From 64ec0fea41b2840ec91b33989cf2a1331807c5f4 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 09:52:34 +0300 Subject: [PATCH 1/6] Fix search results of semantic search --- src/collection.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index 88880b51..1fbce5cf 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1073,7 +1073,7 @@ Option Collection::extract_field_name(const std::string& field_name, return Option(true); } -Option Collection::search(std::string raw_query, +Option Collection::search(std::string raw_query, const std::vector& raw_search_fields, const std::string & filter_query, const std::vector& facet_fields, const std::vector & sort_fields, const std::vector& num_typos, @@ -1201,6 +1201,7 @@ Option Collection::search(std::string raw_query, std::vector processed_search_fields; std::vector query_by_weights; size_t num_embed_fields = 0; + std::string query = raw_query; for(size_t i = 0; i < raw_search_fields.size(); i++) { const std::string& field_name = raw_search_fields[i]; @@ -1289,6 +1290,11 @@ Option Collection::search(std::string raw_query, } } + // Set query to * if it is semantic search + if(!vector_query.field_name.empty() && processed_search_fields.empty()) { + query = "*"; + } + if(!vector_query.field_name.empty() && vector_query.values.empty() && num_embed_fields == 0) { std::string error = "Vector query could not find any embedded fields."; return Option(400, error); @@ -1444,7 +1450,7 @@ Option Collection::search(std::string raw_query, size_t max_hits = DEFAULT_TOPSTER_SIZE; // ensure that `max_hits` never exceeds number of documents in collection - if(search_fields.size() <= 1 || raw_query == "*") { + if(search_fields.size() <= 1 || query == "*") { max_hits = std::min(std::max(fetch_size, max_hits), get_num_documents()); } else { max_hits = std::min(std::max(fetch_size, max_hits), get_num_documents()); @@ -1477,7 +1483,6 @@ Option Collection::search(std::string raw_query, StringUtils::split(hidden_hits_str, hidden_hits, ","); std::vector filter_overrides; - std::string query = raw_query; bool filter_curated_hits = false; std::string curated_sort_by; curate_results(query, filter_query, enable_overrides, pre_segmented_query, pinned_hits, hidden_hits, @@ -1520,6 +1525,10 @@ Option Collection::search(std::string raw_query, bool is_group_by_query = group_by_fields.size() > 0; bool is_vector_query = !vector_query.field_name.empty(); + LOG(INFO) << "is_wildcard_query: " << is_wildcard_query; + LOG(INFO) << "is_group_by_query: " << is_group_by_query; + LOG(INFO) << "is_vector_query: " << is_vector_query; + if(curated_sort_by.empty()) { auto sort_validation_op = validate_and_standardize_sort_fields(sort_fields, sort_fields_std, is_wildcard_query, is_vector_query, is_group_by_query); From c4919bb358688fb2e830d27afef67f6d9a4221ce Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 09:53:26 +0300 Subject: [PATCH 2/6] Remove logs --- src/collection.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index 1fbce5cf..fdc20482 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1525,10 +1525,6 @@ Option Collection::search(std::string raw_query, bool is_group_by_query = group_by_fields.size() > 0; bool is_vector_query = !vector_query.field_name.empty(); - LOG(INFO) << "is_wildcard_query: " << is_wildcard_query; - LOG(INFO) << "is_group_by_query: " << is_group_by_query; - LOG(INFO) << "is_vector_query: " << is_vector_query; - if(curated_sort_by.empty()) { auto sort_validation_op = validate_and_standardize_sort_fields(sort_fields, sort_fields_std, is_wildcard_query, is_vector_query, is_group_by_query); From d1692501fa846c431b3cf4abe4775d187ab5911b Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 14:22:14 +0300 Subject: [PATCH 3/6] Fix text embedding field detection --- src/collection.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/collection.cpp b/src/collection.cpp index fdc20482..8699e51c 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1043,7 +1043,7 @@ Option Collection::extract_field_name(const std::string& field_name, for(auto kv = prefix_it.first; kv != prefix_it.second; ++kv) { bool exact_key_match = (kv.key().size() == field_name.size()); bool exact_primitive_match = exact_key_match && !kv.value().is_object(); - bool text_embedding = kv.value().type == field_types::FLOAT_ARRAY && kv.value().embed.count(fields::from) != 0; + bool text_embedding = kv.value().type == field_types::FLOAT_ARRAY && kv.value().num_dim > 0; if(extract_only_string_fields && !kv.value().is_string() && !text_embedding) { if(exact_primitive_match && !is_wildcard) { From 278c29b3ea0c8cdda1edd7b9b9e21f9633d11e57 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 14:22:25 +0300 Subject: [PATCH 4/6] Add tests --- test/collection_specific_more_test.cpp | 113 +++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 3e5b43a7..7d44351c 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -2582,4 +2582,117 @@ TEST_F(CollectionSpecificMoreTest, HybridSearchTextMatchInfo) { ASSERT_EQ(0, results["hits"][0]["text_match_info"]["tokens_matched"].get()); ASSERT_EQ(0, results["hits"][1]["text_match_info"]["tokens_matched"].get()); +} + + +TEST_F(CollectionSpecificMoreTest, SemanticSearchReturnOnlyVectorDistance) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("moisturizer", {"embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(1, results["hits"].size()); + + // Return only vector distance + ASSERT_EQ(0, results["hits"][0].count("text_match_info")); + ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); + ASSERT_EQ(1, results["hits"][0].count("vector_distance")); +} + +TEST_F(CollectionSpecificMoreTest, KeywordSearchReturnOnlyTextMatchInfo) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("moisturizer", {"product_name"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + + ASSERT_EQ(1, results["hits"].size()); + + // Return only text match info + ASSERT_EQ(0, results["hits"][0].count("vector_distance")); + ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); + ASSERT_EQ(1, results["hits"][0].count("text_match_info")); +} + +TEST_F(CollectionSpecificMoreTest, HybridSearchReturnAllInfo) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + + auto results = coll1->search("moisturizer", {"product_name", "embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(1, results["hits"].size()); + + // Return all info + ASSERT_EQ(1, results["hits"][0].count("vector_distance")); + ASSERT_EQ(1, results["hits"][0].count("text_match_info")); + ASSERT_EQ(1, results["hits"][0].count("hybrid_search_info")); } \ No newline at end of file From 7096ad0c253f8d37897204393f57bcc802f49ddf Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 14:29:27 +0300 Subject: [PATCH 5/6] Remove log --- src/field.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/field.cpp b/src/field.cpp index 8c97bd2b..dc82a021 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -1095,8 +1095,6 @@ Option field::validate_and_init_embed_fields(const std::vector>()) { auto embed_field = std::find_if(fields_json.begin(), fields_json.end(), [&field_name](const nlohmann::json& x) { return x["name"].get() == field_name; From 093442857a4c38a0e2129b5309b12493c9035020 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 10 Aug 2023 14:31:58 +0300 Subject: [PATCH 6/6] Move tests --- test/collection_specific_more_test.cpp | 111 ------------------------ test/collection_vector_search_test.cpp | 114 ++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 112 deletions(-) diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp index 7d44351c..fbc78e22 100644 --- a/test/collection_specific_more_test.cpp +++ b/test/collection_specific_more_test.cpp @@ -2585,114 +2585,3 @@ TEST_F(CollectionSpecificMoreTest, HybridSearchTextMatchInfo) { } -TEST_F(CollectionSpecificMoreTest, SemanticSearchReturnOnlyVectorDistance) { - auto schema_json = - R"({ - "name": "Products", - "fields": [ - {"name": "product_name", "type": "string", "infix": true}, - {"name": "category", "type": "string"}, - {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} - ] - })"_json; - - - TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); - - auto collection_create_op = collectionManager.create_collection(schema_json); - ASSERT_TRUE(collection_create_op.ok()); - auto coll1 = collection_create_op.get(); - - auto add_op = coll1->add(R"({ - "product_name": "moisturizer", - "category": "beauty" - })"_json.dump()); - - ASSERT_TRUE(add_op.ok()); - - auto results = coll1->search("moisturizer", {"embedding"}, - "", {}, {}, {2}, 10, - 1, FREQUENCY, {true}, - 0, spp::sparse_hash_set()).get(); - - ASSERT_EQ(1, results["hits"].size()); - - // Return only vector distance - ASSERT_EQ(0, results["hits"][0].count("text_match_info")); - ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); - ASSERT_EQ(1, results["hits"][0].count("vector_distance")); -} - -TEST_F(CollectionSpecificMoreTest, KeywordSearchReturnOnlyTextMatchInfo) { - auto schema_json = - R"({ - "name": "Products", - "fields": [ - {"name": "product_name", "type": "string", "infix": true}, - {"name": "category", "type": "string"}, - {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} - ] - })"_json; - - - TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); - - auto collection_create_op = collectionManager.create_collection(schema_json); - ASSERT_TRUE(collection_create_op.ok()); - auto coll1 = collection_create_op.get(); - auto add_op = coll1->add(R"({ - "product_name": "moisturizer", - "category": "beauty" - })"_json.dump()); - ASSERT_TRUE(add_op.ok()); - - auto results = coll1->search("moisturizer", {"product_name"}, - "", {}, {}, {2}, 10, - 1, FREQUENCY, {true}, - 0, spp::sparse_hash_set()).get(); - - - ASSERT_EQ(1, results["hits"].size()); - - // Return only text match info - ASSERT_EQ(0, results["hits"][0].count("vector_distance")); - ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); - ASSERT_EQ(1, results["hits"][0].count("text_match_info")); -} - -TEST_F(CollectionSpecificMoreTest, HybridSearchReturnAllInfo) { - auto schema_json = - R"({ - "name": "Products", - "fields": [ - {"name": "product_name", "type": "string", "infix": true}, - {"name": "category", "type": "string"}, - {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} - ] - })"_json; - - TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); - - auto collection_create_op = collectionManager.create_collection(schema_json); - ASSERT_TRUE(collection_create_op.ok()); - auto coll1 = collection_create_op.get(); - - auto add_op = coll1->add(R"({ - "product_name": "moisturizer", - "category": "beauty" - })"_json.dump()); - ASSERT_TRUE(add_op.ok()); - - - auto results = coll1->search("moisturizer", {"product_name", "embedding"}, - "", {}, {}, {2}, 10, - 1, FREQUENCY, {true}, - 0, spp::sparse_hash_set()).get(); - - ASSERT_EQ(1, results["hits"].size()); - - // Return all info - ASSERT_EQ(1, results["hits"][0].count("vector_distance")); - ASSERT_EQ(1, results["hits"][0].count("text_match_info")); - ASSERT_EQ(1, results["hits"][0].count("hybrid_search_info")); -} \ No newline at end of file diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index bfb20be5..390b9fa7 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -775,7 +775,7 @@ TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) { ASSERT_EQ(2, search_res["found"].get()); ASSERT_EQ(2, search_res["hits"].size()); - ASSERT_FLOAT_EQ(0.04620, search_res["hits"][0]["vector_distance"].get()); + ASSERT_FLOAT_EQ(0.046207964, search_res["hits"][0]["vector_distance"].get()); ASSERT_FLOAT_EQ(0.1213316321, search_res["hits"][1]["vector_distance"].get()); // to pass k param @@ -1031,4 +1031,116 @@ TEST_F(CollectionVectorTest, EmbedFromOptionalNullField) { add_op = coll->add(doc.dump()); ASSERT_TRUE(add_op.ok()); +} + +TEST_F(CollectionVectorTest, SemanticSearchReturnOnlyVectorDistance) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("moisturizer", {"embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(1, results["hits"].size()); + + // Return only vector distance + ASSERT_EQ(0, results["hits"][0].count("text_match_info")); + ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); + ASSERT_EQ(1, results["hits"][0].count("vector_distance")); +} + +TEST_F(CollectionVectorTest, KeywordSearchReturnOnlyTextMatchInfo) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("moisturizer", {"product_name"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + + ASSERT_EQ(1, results["hits"].size()); + + // Return only text match info + ASSERT_EQ(0, results["hits"][0].count("vector_distance")); + ASSERT_EQ(0, results["hits"][0].count("hybrid_search_info")); + ASSERT_EQ(1, results["hits"][0].count("text_match_info")); +} + +TEST_F(CollectionVectorTest, HybridSearchReturnAllInfo) { + auto schema_json = + R"({ + "name": "Products", + "fields": [ + {"name": "product_name", "type": "string", "infix": true}, + {"name": "category", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["product_name", "category"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + auto coll1 = collection_create_op.get(); + + auto add_op = coll1->add(R"({ + "product_name": "moisturizer", + "category": "beauty" + })"_json.dump()); + ASSERT_TRUE(add_op.ok()); + + + auto results = coll1->search("moisturizer", {"product_name", "embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(1, results["hits"].size()); + + // Return all info + ASSERT_EQ(1, results["hits"][0].count("vector_distance")); + ASSERT_EQ(1, results["hits"][0].count("text_match_info")); + ASSERT_EQ(1, results["hits"][0].count("hybrid_search_info")); } \ No newline at end of file