From a7faafa4a534bc83dd7348bfe2e1e406e7c5eefb Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 28 Nov 2023 19:26:31 +0530 Subject: [PATCH 1/4] Fix long facet value removal test. --- test/facet_index_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/facet_index_test.cpp b/test/facet_index_test.cpp index 4da5060c..c4a3757b 100644 --- a/test/facet_index_test.cpp +++ b/test/facet_index_test.cpp @@ -43,7 +43,7 @@ TEST(FacetIndexTest, FacetValueDeletionOfLongString) { longval += "a"; } - facet_value_id_t longfval(longval, 1); + facet_value_id_t longfval(longval.substr(0, 100), 1); fvalue_to_seq_ids[longfval] = {0, 1, 2}; seq_id_to_fvalues[0] = {longfval}; @@ -55,14 +55,14 @@ TEST(FacetIndexTest, FacetValueDeletionOfLongString) { doc["brand"] = longval; findex.insert("brand", fvalue_to_seq_ids, seq_id_to_fvalues, true); - ASSERT_EQ(3, findex.facet_val_num_ids("brand", longval)); + ASSERT_EQ(3, findex.facet_val_num_ids("brand", longval.substr(0, 100))); findex.remove(doc, brandf, 0); findex.remove(doc, brandf, 1); - ASSERT_EQ(1, findex.facet_val_num_ids("brand", longval)); + ASSERT_EQ(1, findex.facet_val_num_ids("brand", longval.substr(0, 100))); findex.remove(doc, brandf, 2); - ASSERT_FALSE(findex.facet_value_exists("brand", longval)); + ASSERT_FALSE(findex.facet_value_exists("brand", longval.substr(0, 100))); } TEST(FacetIndexTest, FacetValueDeletionFloat) { From 127d0b49e8d3a6d60b7bbcaea29c4ab123d58ff9 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Wed, 29 Nov 2023 02:39:54 +0300 Subject: [PATCH 2/4] Fix handling invalid images --- src/image_embedder.cpp | 20 ++++++++++++++++++-- src/image_processor.cpp | 3 +-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/image_embedder.cpp b/src/image_embedder.cpp index a347811d..0b6b5213 100644 --- a/src/image_embedder.cpp +++ b/src/image_embedder.cpp @@ -10,7 +10,10 @@ embedding_res_t CLIPImageEmbedder::embed(const std::string& encoded_image) { auto processed_image_op = image_processor_.process_image(encoded_image); if (!processed_image_op.ok()) { - return embedding_res_t(processed_image_op.code(), processed_image_op.error()); + nlohmann::json error_json; + error_json["error"] = processed_image_op.error(); + results[i] = embedding_res_t(processed_image_op.code(), error_json); + return embedding_res_t(processed_image_op.code(), error_json); } auto processed_image = processed_image_op.get(); @@ -58,7 +61,9 @@ std::vector CLIPImageEmbedder::batch_embed(const std::vector CLIPImageEmbedder::batch_embed(const std::vector result_vector(inputs.size()); + for (int i = 0; i < inputs.size(); i++) { + result_vector[i] = results[i]; + } + + return result_vector; + } + // create input tensor std::vector input_shape = {static_cast(processed_images.size()), 3, 224, 224}; std::vector input_names = {"input_ids", "pixel_values", "attention_mask"}; diff --git a/src/image_processor.cpp b/src/image_processor.cpp index 707031d9..e9a942b3 100644 --- a/src/image_processor.cpp +++ b/src/image_processor.cpp @@ -36,8 +36,7 @@ Option CLIPImageProcessor::process_image(const std::string& i LOG(INFO) << "Running image processor"; try { output_tensors = session_->Run(Ort::RunOptions{nullptr}, input_names.data(), &input_tensor, 1, output_names.data(), output_names.size()); - } catch (const std::exception& e) { - LOG(INFO) << "Error while running image processor: " << e.what(); + } catch (...) { return Option(400, "Error while processing image"); } From 42511a05be3f0af2448af10678d60ed86c75385d Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Wed, 29 Nov 2023 02:52:08 +0300 Subject: [PATCH 3/4] Add test --- src/image_embedder.cpp | 1 - test/collection_vector_search_test.cpp | 30 +++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/image_embedder.cpp b/src/image_embedder.cpp index 0b6b5213..4ac5c75c 100644 --- a/src/image_embedder.cpp +++ b/src/image_embedder.cpp @@ -12,7 +12,6 @@ embedding_res_t CLIPImageEmbedder::embed(const std::string& encoded_image) { if (!processed_image_op.ok()) { nlohmann::json error_json; error_json["error"] = processed_image_op.error(); - results[i] = embedding_res_t(processed_image_op.code(), error_json); return embedding_res_t(processed_image_op.code(), error_json); } diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index fc1cf4e4..243a0594 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -2987,7 +2987,6 @@ TEST_F(CollectionVectorTest, TestImageEmbedding) { auto coll = collection_create_op.get(); - LOG(INFO) << "Adding image to collection"; auto add_op = coll->add(R"({ "name": "dog", @@ -3045,4 +3044,33 @@ TEST_F(CollectionVectorTest, TryAddingMultipleImageFieldToEmbedFrom) { ASSERT_FALSE(collection_create_op.ok()); ASSERT_EQ(collection_create_op.error(), "Only one field can be used in the `embed.from` property of an embed field when embedding from an image field."); +} + +TEST_F(CollectionVectorTest, TestInvalidImage) { + auto schema_json = + R"({ + "name": "Images", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "image", "type": "image", "store": false}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["image"], "model_config": {"model_name": "ts/clip-vit-b-p32"}}} + ] + })"_json; + + EmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema_json); + ASSERT_TRUE(collection_create_op.ok()); + + auto coll = collection_create_op.get(); + + auto add_op = coll->add(R"({ + "name": "teddy bear", + "image": "invalid" + })"_json.dump()); + + ASSERT_FALSE(add_op.ok()); + + ASSERT_EQ(add_op.error(), "Error while processing image"); + } \ No newline at end of file From 40ac195be7aa5d51afc67aa0bbae2c0d817d865d Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 29 Nov 2023 10:25:54 +0530 Subject: [PATCH 4/4] Merge conflict resolution. --- .github/workflows/tests.yml | 4 +- include/index.h | 20 ++++++- src/index.cpp | 3 +- src/text_embedder_remote.cpp | 10 ++-- test/collection_vector_search_test.cpp | 80 +++++++++++++++++++++++++- 5 files changed, 106 insertions(+), 11 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 649da61a..280e69e9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,15 +38,17 @@ jobs: uses: bazelbuild/setup-bazelisk@v2 - name: Download bazel cache - uses: dawidd6/action-download-artifact@v2 + uses: jasonbosco/action-download-artifact@709b71d3729e8980f52a5a2a9ec04261060945c1 with: name: bazel-cache search_artifacts: true workflow_conclusion: "" if_no_artifact_found: warn + skip_unpack: true - name: Uncompress bazel cache run: | + unzip bazel-cache.zip mkdir -p ~/.cache/bazel tar_file="bazel-cache.tar.gz" && \ [ -f "$tar_file" ] && \ diff --git a/include/index.h b/include/index.h index 1e8d4f6b..04f8d297 100644 --- a/include/index.h +++ b/include/index.h @@ -276,12 +276,26 @@ struct index_record { class VectorFilterFunctor: public hnswlib::BaseFilterFunctor { filter_result_iterator_t* const filter_result_iterator; + const uint32_t* excluded_ids = nullptr; + const uint32_t excluded_ids_length = 0; + public: - explicit VectorFilterFunctor(filter_result_iterator_t* const filter_result_iterator) : - filter_result_iterator(filter_result_iterator) {} + + explicit VectorFilterFunctor(filter_result_iterator_t* const filter_result_iterator, + const uint32_t* excluded_ids = nullptr, const uint32_t excluded_ids_length = 0) : + filter_result_iterator(filter_result_iterator), + excluded_ids(excluded_ids), excluded_ids_length(excluded_ids_length) {} bool operator()(hnswlib::labeltype id) override { - if (filter_result_iterator->approx_filter_ids_length == 0) { + if (filter_result_iterator->approx_filter_ids_length == 0 && excluded_ids_length == 0) { + return true; + } + + if(excluded_ids_length > 0 && excluded_ids && std::binary_search(excluded_ids, excluded_ids + excluded_ids_length, id)) { + return false; + } + + if(filter_result_iterator->approx_filter_ids_length == 0) { return true; } diff --git a/src/index.cpp b/src/index.cpp index d22a0ae2..f0825726 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2542,6 +2542,7 @@ Option Index::search(std::vector& field_query_tokens, cons k++; } + VectorFilterFunctor filterFunctor(filter_result_iterator, excluded_result_ids, excluded_result_ids_size); auto& field_vector_index = vector_index.at(vector_query.field_name); std::vector> dist_results; @@ -2946,7 +2947,7 @@ Option Index::search(std::vector& field_query_tokens, cons const float VECTOR_SEARCH_WEIGHT = vector_query.alpha; const float TEXT_MATCH_WEIGHT = 1.0 - VECTOR_SEARCH_WEIGHT; - VectorFilterFunctor filterFunctor(filter_result_iterator); + VectorFilterFunctor filterFunctor(filter_result_iterator, excluded_result_ids, excluded_result_ids_size); auto& field_vector_index = vector_index.at(vector_query.field_name); std::vector> dist_labels; diff --git a/src/text_embedder_remote.cpp b/src/text_embedder_remote.cpp index b07aa9e1..1fe5e3bb 100644 --- a/src/text_embedder_remote.cpp +++ b/src/text_embedder_remote.cpp @@ -106,7 +106,7 @@ Option OpenAIEmbedder::is_model_valid(const nlohmann::json& model_config, if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) { return Option(400, "OpenAI API error: " + res); } - return Option(400, "OpenAI API error: " + nlohmann::json::parse(res)["error"]["message"].get()); + return Option(400, "OpenAI API error: " + json_res["error"]["message"].get()); } nlohmann::json models_json; @@ -152,7 +152,7 @@ Option OpenAIEmbedder::is_model_valid(const nlohmann::json& model_config, if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) { return Option(400, "OpenAI API error: " + embedding_res); } - return Option(400, "OpenAI API error: " + nlohmann::json::parse(res)["error"]["message"].get()); + return Option(400, "OpenAI API error: " + json_res["error"]["message"].get()); } std::vector embedding; try { @@ -337,7 +337,7 @@ Option GoogleEmbedder::is_model_valid(const nlohmann::json& model_config, return Option(400, "Google API error: " + res); } - return Option(400, "Google API error: " + nlohmann::json::parse(res)["error"]["message"].get()); + return Option(400, "Google API error: " + json_res["error"]["message"].get()); } try { @@ -477,7 +477,7 @@ Option GCPEmbedder::is_model_valid(const nlohmann::json& model_config, siz if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) { return Option(400, "GCP API error: " + res); } - return Option(400, "GCP API error: " + nlohmann::json::parse(res)["error"]["message"].get()); + return Option(400, "GCP API error: " + json_res["error"]["message"].get()); } nlohmann::json res_json; try { @@ -680,7 +680,7 @@ Option GCPEmbedder::generate_access_token(const std::string& refres if(res_code == 408) { return Option(408, "GCP API timeout."); } - return Option(400, "GCP API error: " + nlohmann::json::parse(res)["error"]["message"].get()); + return Option(400, "GCP API error: " + json_res["error"]["message"].get()); } nlohmann::json res_json; try { diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index 243a0594..755cb6ee 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -3026,6 +3026,83 @@ TEST_F(CollectionVectorTest, TestImageEmbedding) { ASSERT_EQ(results2["hits"][1]["document"]["id"], "0"); } +TEST_F(CollectionVectorTest, TestHybridSearchHiddenHits) { + nlohmann::json schema = R"({ + "name": "test", + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "embedding", + "type": "float[]", + "embed": { + "from": [ + "name" + ], + "model_config": { + "model_name": "ts/e5-small" + } + } + } + ] + })"_json; + + EmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema); + ASSERT_TRUE(collection_create_op.ok()); + + auto coll = collection_create_op.get(); + + auto add_op = coll->add(R"({ + "name": "soccer", + "id": "0" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + add_op = coll->add(R"({ + "name": "guitar", + "id": "1" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + add_op = coll->add(R"({ + "name": "typesense", + "id": "2" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + add_op = coll->add(R"({ + "name": "potato", + "id": "3" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + auto results = coll->search("sports", {"name", "embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(4, results["hits"].size()); + ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); + + + // do hybrid search with hidden_hits + auto hybrid_results = coll->search("sports", {"name", "embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 1, "", "0").get(); + + ASSERT_EQ(3, hybrid_results["hits"].size()); + ASSERT_FALSE(hybrid_results["hits"][0]["document"]["id"] == 0); +} + TEST_F(CollectionVectorTest, TryAddingMultipleImageFieldToEmbedFrom) { auto schema_json = R"({ @@ -3045,6 +3122,7 @@ TEST_F(CollectionVectorTest, TryAddingMultipleImageFieldToEmbedFrom) { ASSERT_EQ(collection_create_op.error(), "Only one field can be used in the `embed.from` property of an embed field when embedding from an image field."); } +<<<<<<< Updated upstream TEST_F(CollectionVectorTest, TestInvalidImage) { auto schema_json = @@ -3073,4 +3151,4 @@ TEST_F(CollectionVectorTest, TestInvalidImage) { ASSERT_EQ(add_op.error(), "Error while processing image"); -} \ No newline at end of file +}