From 2b154226cab4af7528c1c065d93163940c93ce2c Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Wed, 22 Nov 2023 22:28:15 +0300 Subject: [PATCH 01/11] Fix hybrid search with filters --- src/index.cpp | 12 ++++ test/collection_vector_search_test.cpp | 78 ++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/src/index.cpp b/src/index.cpp index f883c127..8be15837 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -3206,6 +3206,18 @@ Option Index::search(std::vector& field_query_tokens, cons // For hybrid search, we need to give weight to text match and vector search const float VECTOR_SEARCH_WEIGHT = vector_query.alpha; const float TEXT_MATCH_WEIGHT = 1.0 - VECTOR_SEARCH_WEIGHT; + + bool no_filters_provided = (filter_tree_root == nullptr && filter_result.count == 0); + + // list of all document ids + if (no_filters_provided) { + filter_result.count = seq_ids->num_ids(); + filter_result.docs = seq_ids->uncompress(); + } + + curate_filtered_ids(curated_ids, excluded_result_ids, + excluded_result_ids_size, filter_result.docs, filter_result.count, curated_ids_sorted); + collate_included_ids({}, included_ids_map, curated_topster, searched_queries); VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count); auto& field_vector_index = vector_index.at(vector_query.field_name); diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index 72244efe..299b001c 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -2822,4 +2822,82 @@ TEST_F(CollectionVectorTest, TestSemanticSearchAfterUpdate) { ASSERT_TRUE(result.ok()); ASSERT_EQ(1, result.get()["hits"].size()); ASSERT_EQ("potato", result.get()["hits"][0]["document"]["name"]); +} + + +TEST_F(CollectionVectorTest, TestHybridSearchHiddenHits) { + nlohmann::json schema = R"({ + "name": "test", + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "embedding", + "type": "float[]", + "embed": { + "from": [ + "name" + ], + "model_config": { + "model_name": "ts/e5-small" + } + } + } + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto collection_create_op = collectionManager.create_collection(schema); + ASSERT_TRUE(collection_create_op.ok()); + + auto coll = collection_create_op.get(); + + auto add_op = coll->add(R"({ + "name": "soccer", + "id": "0" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + add_op = coll->add(R"({ + "name": "guitar", + "id": "1" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + add_op = coll->add(R"({ + "name": "typesense", + "id": "2" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + add_op = coll->add(R"({ + "name": "potato", + "id": "3" + })"_json.dump()); + + ASSERT_TRUE(add_op.ok()); + + auto results = coll->search("sports", {"name", "embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set()).get(); + + ASSERT_EQ(4, results["hits"].size()); + ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); + + + // do hybrid search with hidden_hits + auto hybrid_results = coll->search("sports", {"name", "embedding"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 0, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 1, "", "0").get(); + + ASSERT_EQ(3, hybrid_results["hits"].size()); + ASSERT_FALSE(hybrid_results["hits"][0]["document"]["id"] == 0); } \ No newline at end of file From a8b936bee85b0ef01dbdc639f0a41b984fee12f1 Mon Sep 17 00:00:00 2001 From: Jason Bosco Date: Wed, 22 Nov 2023 17:28:15 -0600 Subject: [PATCH 02/11] Try manually unzipping, to prevent 2GB file size error --- .github/workflows/tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 649da61a..0800ecc1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -44,9 +44,11 @@ jobs: search_artifacts: true workflow_conclusion: "" if_no_artifact_found: warn + skip_unpack: true - name: Uncompress bazel cache run: | + unzip bazel-cache.zip mkdir -p ~/.cache/bazel tar_file="bazel-cache.tar.gz" && \ [ -f "$tar_file" ] && \ From a2c5d24802d2e49884d4d51736baa66201e95997 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Thu, 23 Nov 2023 22:23:15 +0300 Subject: [PATCH 03/11] Refactor ```VectorFilterFunctor``` to include ```excluded_ids``` --- include/index.h | 15 +++++++++++++-- src/index.cpp | 16 ++-------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/include/index.h b/include/index.h index 1d895c02..8020c23f 100644 --- a/include/index.h +++ b/include/index.h @@ -269,11 +269,22 @@ class VectorFilterFunctor: public hnswlib::BaseFilterFunctor { const uint32_t* filter_ids = nullptr; const uint32_t filter_ids_length = 0; + const uint32_t* excluded_ids = nullptr; + const uint32_t excluded_ids_length = 0; + public: - explicit VectorFilterFunctor(const uint32_t* filter_ids, const uint32_t filter_ids_length) : - filter_ids(filter_ids), filter_ids_length(filter_ids_length) {} + explicit VectorFilterFunctor(const uint32_t* filter_ids, const uint32_t filter_ids_length, const uint32_t* excluded_ids = nullptr, const uint32_t excluded_ids_length = 0) : + filter_ids(filter_ids), filter_ids_length(filter_ids_length), excluded_ids(excluded_ids), excluded_ids_length(excluded_ids_length) {} bool operator()(hnswlib::labeltype id) override { + if(filter_ids_length == 0 && excluded_ids_length == 0) { + return true; + } + + if(excluded_ids_length > 0 && excluded_ids && std::binary_search(excluded_ids, excluded_ids + excluded_ids_length, id)) { + return false; + } + if(filter_ids_length == 0) { return true; } diff --git a/src/index.cpp b/src/index.cpp index 8be15837..90e7131b 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2901,7 +2901,7 @@ Option Index::search(std::vector& field_query_tokens, cons k++; } - VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count); + VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count, excluded_result_ids, excluded_result_ids_size); auto& field_vector_index = vector_index.at(vector_query.field_name); std::vector> dist_labels; @@ -3206,20 +3206,8 @@ Option Index::search(std::vector& field_query_tokens, cons // For hybrid search, we need to give weight to text match and vector search const float VECTOR_SEARCH_WEIGHT = vector_query.alpha; const float TEXT_MATCH_WEIGHT = 1.0 - VECTOR_SEARCH_WEIGHT; - - bool no_filters_provided = (filter_tree_root == nullptr && filter_result.count == 0); - // list of all document ids - if (no_filters_provided) { - filter_result.count = seq_ids->num_ids(); - filter_result.docs = seq_ids->uncompress(); - } - - curate_filtered_ids(curated_ids, excluded_result_ids, - excluded_result_ids_size, filter_result.docs, filter_result.count, curated_ids_sorted); - collate_included_ids({}, included_ids_map, curated_topster, searched_queries); - - VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count); + VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count, excluded_result_ids, excluded_result_ids_size); auto& field_vector_index = vector_index.at(vector_query.field_name); std::vector> dist_labels; // use k as 100 by default for ensuring results stability in pagination From 09de5fff202734d6e0d7a14ae8690a54120397c1 Mon Sep 17 00:00:00 2001 From: Jason Bosco Date: Thu, 23 Nov 2023 17:09:51 -0600 Subject: [PATCH 04/11] Upgrade action-download-artifact to hopefully fix CI --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0800ecc1..4c7cf196 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,7 +38,7 @@ jobs: uses: bazelbuild/setup-bazelisk@v2 - name: Download bazel cache - uses: dawidd6/action-download-artifact@v2 + uses: dawidd6/action-download-artifact@v2.28.0 with: name: bazel-cache search_artifacts: true From dd215ffa49240d1bb3b0312ea0c61530755ec26b Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Mon, 27 Nov 2023 02:20:10 +0300 Subject: [PATCH 05/11] Fix error messages for remote embedders --- src/text_embedder_remote.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/text_embedder_remote.cpp b/src/text_embedder_remote.cpp index 27409c2d..feb1900f 100644 --- a/src/text_embedder_remote.cpp +++ b/src/text_embedder_remote.cpp @@ -106,7 +106,7 @@ Option OpenAIEmbedder::is_model_valid(const nlohmann::json& model_config, if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) { return Option(400, "OpenAI API error: " + res); } - return Option(400, "OpenAI API error: " + nlohmann::json::parse(res)["error"]["message"].get()); + return Option(400, "OpenAI API error: " + json_res["error"]["message"].get()); } nlohmann::json models_json; @@ -152,7 +152,7 @@ Option OpenAIEmbedder::is_model_valid(const nlohmann::json& model_config, if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) { return Option(400, "OpenAI API error: " + embedding_res); } - return Option(400, "OpenAI API error: " + nlohmann::json::parse(res)["error"]["message"].get()); + return Option(400, "OpenAI API error: " + json_res["error"]["message"].get()); } std::vector embedding; try { @@ -337,7 +337,7 @@ Option GoogleEmbedder::is_model_valid(const nlohmann::json& model_config, return Option(400, "Google API error: " + res); } - return Option(400, "Google API error: " + nlohmann::json::parse(res)["error"]["message"].get()); + return Option(400, "Google API error: " + json_res["error"]["message"].get()); } try { @@ -477,7 +477,7 @@ Option GCPEmbedder::is_model_valid(const nlohmann::json& model_config, siz if(json_res.count("error") == 0 || json_res["error"].count("message") == 0) { return Option(400, "GCP API error: " + res); } - return Option(400, "GCP API error: " + nlohmann::json::parse(res)["error"]["message"].get()); + return Option(400, "GCP API error: " + json_res["error"]["message"].get()); } nlohmann::json res_json; try { @@ -680,7 +680,7 @@ Option GCPEmbedder::generate_access_token(const std::string& refres if(res_code == 408) { return Option(408, "GCP API timeout."); } - return Option(400, "GCP API error: " + nlohmann::json::parse(res)["error"]["message"].get()); + return Option(400, "GCP API error: " + json_res["error"]["message"].get()); } nlohmann::json res_json; try { From 34f748d9b06adb189951f91f1d11e0de683b9d9f Mon Sep 17 00:00:00 2001 From: Jason Bosco Date: Sun, 26 Nov 2023 19:28:33 -0600 Subject: [PATCH 06/11] Attempt to fix bazel cache loading issue for files larger than 2GB --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4c7cf196..4c602a25 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,7 +38,7 @@ jobs: uses: bazelbuild/setup-bazelisk@v2 - name: Download bazel cache - uses: dawidd6/action-download-artifact@v2.28.0 + uses: jasonbosco/action-download-artifact@c4d5f373a5d7da32fe23a91e414baa3e81eeef42 with: name: bazel-cache search_artifacts: true From 59628c91b9a9adcf11ea4fd6b42759bc306f8911 Mon Sep 17 00:00:00 2001 From: Jason Bosco Date: Sun, 26 Nov 2023 19:33:43 -0600 Subject: [PATCH 07/11] Attempt 2 --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4c602a25..c0d13967 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,7 +38,7 @@ jobs: uses: bazelbuild/setup-bazelisk@v2 - name: Download bazel cache - uses: jasonbosco/action-download-artifact@c4d5f373a5d7da32fe23a91e414baa3e81eeef42 + uses: jasonbosco/action-download-artifact@d7ab2dcce12fbef7a1565790bac7cf24a319b066 with: name: bazel-cache search_artifacts: true From 98f599719ea4ab3528211eab2ec7c0c000c39b9d Mon Sep 17 00:00:00 2001 From: Jason Bosco Date: Sun, 26 Nov 2023 19:35:18 -0600 Subject: [PATCH 08/11] Round 3 --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c0d13967..280e69e9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,7 +38,7 @@ jobs: uses: bazelbuild/setup-bazelisk@v2 - name: Download bazel cache - uses: jasonbosco/action-download-artifact@d7ab2dcce12fbef7a1565790bac7cf24a319b066 + uses: jasonbosco/action-download-artifact@709b71d3729e8980f52a5a2a9ec04261060945c1 with: name: bazel-cache search_artifacts: true From 9ad9caecc94fbe84a5e07ec679ae021089f1312e Mon Sep 17 00:00:00 2001 From: Jason Bosco Date: Sun, 26 Nov 2023 19:38:20 -0600 Subject: [PATCH 09/11] Trigger CI From 721d4ed7dc86900087ed3de6d83a97b7b460651a Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 25 Nov 2023 21:36:08 +0530 Subject: [PATCH 10/11] Reduce noise of per-collection housekeeping log. --- src/housekeeper.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/housekeeper.cpp b/src/housekeeper.cpp index 07150e11..370a3074 100644 --- a/src/housekeeper.cpp +++ b/src/housekeeper.cpp @@ -31,7 +31,10 @@ void HouseKeeper::run() { } coll->do_housekeeping(); - LOG(INFO) << "Ran housekeeping."; + } + + if(!coll_names.empty()) { + LOG(INFO) << "Ran housekeeping for " << coll_names.size() << " collections."; } prev_persistence_s = std::chrono::duration_cast( From c71245ca1410cb653a917b6cc2cf560a8f4a5ea3 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 30 Nov 2023 14:08:39 +0530 Subject: [PATCH 11/11] Fix bug with deeply nested optional array of obj field. --- src/field.cpp | 4 +- test/collection_nested_fields_test.cpp | 75 ++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/src/field.cpp b/src/field.cpp index ea99b72f..8c613e0c 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -1022,9 +1022,11 @@ Option field::flatten_field(nlohmann::json& doc, nlohmann::json& obj, cons return flatten_field(doc, it.value(), the_field, path_parts, path_index + 1, has_array, has_obj_array, is_update, dyn_fields, flattened_fields); } - } { + } else if(!the_field.optional) { return Option(404, "Field `" + the_field.name + "` not found."); } + + return Option(true); } Option field::flatten_doc(nlohmann::json& document, diff --git a/test/collection_nested_fields_test.cpp b/test/collection_nested_fields_test.cpp index 1c4bfb24..bcab0f73 100644 --- a/test/collection_nested_fields_test.cpp +++ b/test/collection_nested_fields_test.cpp @@ -3056,6 +3056,81 @@ TEST_F(CollectionNestedFieldsTest, HighlightArrayOfObjects) { ASSERT_EQ(1, results["hits"][0]["highlight"]["details"][2].size()); } +TEST_F(CollectionNestedFieldsTest, DeepNestedOptionalArrayValue) { + nlohmann::json schema = R"({ + "name": "coll1", + "enable_nested_fields": true, + "fields": [ + { + "facet": false, + "index": true, + "infix": false, + "locale": "", + "name": "items.name", + "optional": true, + "sort": false, + "type": "string[]" + }, + { + "facet": false, + "index": true, + "infix": false, + "locale": "", + "name": "items.description", + "optional": true, + "sort": false, + "type": "string[]" + }, + { + "facet": false, + "index": true, + "infix": false, + "locale": "", + "name": "items.nested_items.name", + "optional": true, + "sort": false, + "type": "string[]" + } + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll1 = op.get(); + + auto doc1 = R"({ + "items": [ + { + "description": "random description.", + "name": "foobar", + "nested_items": [ + { + "isAvailable": true + }, + { + "description": "nested description here", + "isAvailable": true, + "name": "naruto" + }, + { + "description": "description again", + "isAvailable": true, + "name": "dragon ball" + } + ] + } + ] + })"_json; + + auto add_op = coll1->add(doc1.dump(), CREATE); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("naruto", {"items.nested_items.name"}, "", {}, {}, {0}, 10, 1, FREQUENCY, + {true}, 1, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4).get(); + ASSERT_EQ(1, results["found"].get()); +} + TEST_F(CollectionNestedFieldsTest, FloatInsideNestedObject) { nlohmann::json schema = R"({ "name": "coll1",