From cb25df1389cf1a806eb855f5b751cdb4abc83f9b Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 4 Aug 2023 07:24:32 +0530 Subject: [PATCH 1/9] CI: enable verbose test output. --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cfd155b1..ce392fba 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -50,7 +50,7 @@ jobs: run: bazel build //:typesense-server - name: Run tests - run: bazel test //:typesense-test + run: bazel test --test_output=all -- //:typesense-test # Source: https://github.com/actions/upload-artifact/issues/92#issuecomment-1080347032 - run: echo "BAZEL_BIN_FULL_PATH=$(readlink -f bazel-bin)" >> $GITHUB_ENV From bfb81173b94f7cef980eeb27a643cd0df50788e4 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 4 Aug 2023 08:39:52 +0530 Subject: [PATCH 2/9] Fix float assertion. --- test/collection_vector_search_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index bfb20be5..d28d9c03 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -775,8 +775,8 @@ TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) { ASSERT_EQ(2, search_res["found"].get()); ASSERT_EQ(2, search_res["hits"].size()); - ASSERT_FLOAT_EQ(0.04620, search_res["hits"][0]["vector_distance"].get()); - ASSERT_FLOAT_EQ(0.1213316321, search_res["hits"][1]["vector_distance"].get()); + ASSERT_NEAR(0.04620, search_res["hits"][0]["vector_distance"].get(), 0.0001); + ASSERT_NEAR(0.12133, search_res["hits"][1]["vector_distance"].get(), 0.0001); // to pass k param vec_query = "embedding:([], k: 1)"; From 3ba9b48eff29229510b8383121ccd8b1abbb4d15 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 4 Aug 2023 14:43:26 +0530 Subject: [PATCH 3/9] Use embedding in auto embedding field if already present. --- include/validator.h | 2 +- src/index.cpp | 7 +++ src/validator.cpp | 24 +++++++-- test/collection_vector_search_test.cpp | 72 +++++++++++++++++++++++++- 4 files changed, 100 insertions(+), 5 deletions(-) diff --git a/include/validator.h b/include/validator.h index b4952cf6..d3898dba 100644 --- a/include/validator.h +++ b/include/validator.h @@ -71,6 +71,6 @@ public: static Option validate_embed_fields(const nlohmann::json& document, const tsl::htrie_map& embedding_fields, const tsl::htrie_map & search_schema, - const bool& error_if_field_not_found); + const bool& is_update); }; \ No newline at end of file diff --git a/src/index.cpp b/src/index.cpp index 69df9301..a87f245c 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -504,6 +504,7 @@ void Index::validate_and_preprocess(Index *index, std::vector& ite index_rec.index_failure(400, e.what()); } } + if(generate_embeddings) { batch_embed_fields(records_to_embed, embedding_fields, search_schema, remote_embedding_batch_size); } @@ -6499,6 +6500,12 @@ void Index::batch_embed_fields(std::vector& records, if(document == nullptr) { continue; } + + if(document->contains(field.name) && !record->is_update) { + // embedding already exists (could be a restore from export) + continue; + } + std::string text = indexing_prefix; const auto& embed_from = field.embed[fields::from].get>(); for(const auto& field_name : embed_from) { diff --git a/src/validator.cpp b/src/validator.cpp index 5fe9bab3..51a7d19c 100644 --- a/src/validator.cpp +++ b/src/validator.cpp @@ -654,7 +654,7 @@ Option validator_t::validate_index_in_memory(nlohmann::json& document, if(validate_embedding_fields) { // validate embedding fields - auto validate_embed_op = validate_embed_fields(document, embedding_fields, search_schema, !is_update); + auto validate_embed_op = validate_embed_fields(document, embedding_fields, search_schema, is_update); if(!validate_embed_op.ok()) { return Option<>(validate_embed_op.code(), validate_embed_op.error()); } @@ -667,8 +667,26 @@ Option validator_t::validate_index_in_memory(nlohmann::json& document, Option validator_t::validate_embed_fields(const nlohmann::json& document, const tsl::htrie_map& embedding_fields, const tsl::htrie_map & search_schema, - const bool& error_if_field_not_found) { + const bool& is_update) { for(const auto& field : embedding_fields) { + if(document.contains(field.name) && !is_update) { + const auto& field_vec = document[field.name]; + if(!field_vec.is_array() || field_vec.empty() || !field_vec[0].is_number() || + field_vec.size() != field.num_dim) { + return Option(400, "Field `" + field.name + "` contains an invalid embedding."); + } + + auto it = field_vec.begin(); + while(it != field_vec.end()) { + if(!it.value().is_number()) { + return Option(400, "Field `" + field.name + "` contains invalid float values."); + } + it++; + } + + continue; + } + const auto& embed_from = field.embed[fields::from].get>(); // flag to check if all fields to embed from are optional and null bool all_optional_and_null = true; @@ -679,7 +697,7 @@ Option validator_t::validate_embed_fields(const nlohmann::json& document, return Option(400, "Field `" + field.name + "` has invalid fields to create embeddings from."); } if(doc_field_it == document.end()) { - if(error_if_field_not_found && !schema_field_it->optional) { + if(!is_update && !schema_field_it->optional) { return Option(400, "Field `" + field_name + "` is needed to create embedding."); } else { continue; diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp index d28d9c03..3ffe288e 100644 --- a/test/collection_vector_search_test.cpp +++ b/test/collection_vector_search_test.cpp @@ -1031,4 +1031,74 @@ TEST_F(CollectionVectorTest, EmbedFromOptionalNullField) { add_op = coll->add(doc.dump()); ASSERT_TRUE(add_op.ok()); -} \ No newline at end of file +} + +TEST_F(CollectionVectorTest, SkipEmbeddingOpWhenValueExists) { + nlohmann::json schema = R"({ + "name": "objects", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + nlohmann::json model_config = R"({ + "model_name": "ts/e5-small" + })"_json; + + // will be roughly 0.1110895648598671,-0.11710234731435776,-0.5319093465805054, ... + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + // document with explicit embedding vector + nlohmann::json doc; + doc["name"] = "FOO"; + + std::vector vec; + for(size_t i = 0; i < 384; i++) { + vec.push_back(0.345); + } + + doc["embedding"] = vec; + + auto add_op = coll->add(doc.dump()); + ASSERT_TRUE(add_op.ok()); + + // get the vector back + auto res = coll->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, + Index::DROP_TOKENS_THRESHOLD).get(); + + // let's check the first few vectors + auto stored_vec = res["hits"][0]["document"]["embedding"]; + ASSERT_NEAR(0.345, stored_vec[0], 0.01); + ASSERT_NEAR(0.345, stored_vec[1], 0.01); + ASSERT_NEAR(0.345, stored_vec[2], 0.01); + ASSERT_NEAR(0.345, stored_vec[3], 0.01); + ASSERT_NEAR(0.345, stored_vec[4], 0.01); + + // what happens when vector contains invalid value, like string + doc["embedding"] = "foo"; //{0.11, 0.11}; + add_op = coll->add(doc.dump()); + ASSERT_FALSE(add_op.ok()); + ASSERT_EQ("Field `embedding` contains an invalid embedding.", add_op.error()); + + // when dims don't match + doc["embedding"] = {0.11, 0.11}; + add_op = coll->add(doc.dump()); + ASSERT_FALSE(add_op.ok()); + ASSERT_EQ("Field `embedding` contains an invalid embedding.", add_op.error()); + + // invalid array value + doc["embedding"].clear(); + for(size_t i = 0; i < 384; i++) { + doc["embedding"].push_back(0.01); + } + doc["embedding"][5] = "foo"; + add_op = coll->add(doc.dump()); + ASSERT_FALSE(add_op.ok()); + ASSERT_EQ("Field `embedding` contains invalid float values.", add_op.error()); +} From ef23f45429804c05c32d1b4a84b0f4574906e392 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 4 Aug 2023 15:08:25 +0530 Subject: [PATCH 4/9] Disable tests for now. --- .github/workflows/tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ce392fba..f3c71bd1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,6 +1,7 @@ name: tests -on: [push, pull_request] +#on: [push, pull_request] +on: workflow_dispatch # Cancel previous running if a new push is made # Source: https://stackoverflow.com/a/72408109/123545 From d2cf4b02019e93251856f0e1df6c0cdefb06850d Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 5 Aug 2023 12:10:03 +0530 Subject: [PATCH 5/9] Fix nested field with regex name in schema. --- src/field.cpp | 2 +- test/collection_nested_fields_test.cpp | 36 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/field.cpp b/src/field.cpp index ca7d5149..bccd8da5 100644 --- a/src/field.cpp +++ b/src/field.cpp @@ -852,7 +852,7 @@ bool field::flatten_obj(nlohmann::json& doc, nlohmann::json& value, bool has_arr continue; } - if(std::regex_match(flat_name, std::regex(flat_name))) { + if(std::regex_match(flat_name, std::regex(dynamic_field.name))) { detected_type = dynamic_field.type; found_dynamic_field = true; break; diff --git a/test/collection_nested_fields_test.cpp b/test/collection_nested_fields_test.cpp index 20490d4a..a2eef13d 100644 --- a/test/collection_nested_fields_test.cpp +++ b/test/collection_nested_fields_test.cpp @@ -2867,6 +2867,42 @@ TEST_F(CollectionNestedFieldsTest, FloatInsideNestedObject) { ASSERT_TRUE(add_op.ok()); } +TEST_F(CollectionNestedFieldsTest, NestedFieldWithRegexName) { + nlohmann::json schema = R"({ + "name": "coll1", + "enable_nested_fields": true, + "fields": [ + {"name":"titles", "type":"object"}, + {"name": "titles\\..*", "type":"string"}, + {"name":"start_date", "type":"object"}, + {"name":"start_date\\..*", "type":"int32", "facet":true, "optional":true} + ] + })"_json; + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection *coll1 = op.get(); + + auto doc1 = R"({ + "titles": { + "en": "Foobar baz" + }, + "start_date": { + "year": 2020, + "month": 2, + "day": 3 + } + })"_json; + + auto add_op = coll1->add(doc1.dump(), CREATE); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("foobar", {"titles.en"}, "start_date.year: 2020", {}, {}, {2}, 10, + 1, FREQUENCY, {true}).get(); + + ASSERT_EQ(1, results["found"].get()); +} + TEST_F(CollectionNestedFieldsTest, HighlightOnFlatFieldWithSnippeting) { std::vector fields = {field("title", field_types::STRING, false), field("body", field_types::STRING, false)}; From cf592caf3c332255ffeaf904bc26546241a64ec5 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 5 Aug 2023 15:14:30 +0530 Subject: [PATCH 6/9] Sentencepiece: use build tmp dir instead of install dir. --- bazel/sentencepiece.BUILD | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bazel/sentencepiece.BUILD b/bazel/sentencepiece.BUILD index 6958221c..05967725 100644 --- a/bazel/sentencepiece.BUILD +++ b/bazel/sentencepiece.BUILD @@ -30,9 +30,9 @@ cmake( install = False, cache_entries = { 'SPM_USE_BUILTIN_PROTOBUF': 'OFF', - 'Protobuf_LIBRARY': '$INSTALLDIR/../../com_google_protobuf/libprotobuf.a', - 'Protobuf_LITE_LIBRARY': '$INSTALLDIR/../../com_google_protobuf/libprotobuf-lite.a', - 'Protobuf_PROTOC_EXECUTABLE': '$INSTALLDIR/../../com_google_protobuf/protoc', + 'Protobuf_LIBRARY': '$$BUILD_TMPDIR$$/../../com_google_protobuf/libprotobuf.a', + 'Protobuf_LITE_LIBRARY': '$$BUILD_TMPDIR$$/../../com_google_protobuf/libprotobuf-lite.a', + 'Protobuf_PROTOC_EXECUTABLE': '$$BUILD_TMPDIR$$/../../com_google_protobuf/protoc', 'Protobuf_INCLUDE_DIR': '$EXT_BUILD_ROOT/external/com_google_protobuf/src', 'CMAKE_POLICY_DEFAULT_CMP0111':'OLD' }, @@ -44,7 +44,7 @@ cmake( ], tags = ["no-sandbox"], postfix_script= """ -echo "Intstalling sentencepiece" -cp $BUILD_TMPDIR/src/libsentencepiece.a $INSTALLDIR/lib/libsentencepiece.a -""" + echo "Installing sentencepiece" + cp $BUILD_TMPDIR/src/libsentencepiece.a $INSTALLDIR/lib/libsentencepiece.a + """ ) From d327074c5482252eae46617f4770ff155ad7ee24 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 5 Aug 2023 16:25:03 +0530 Subject: [PATCH 7/9] CI: Use artifacts for bazel cache --- .github/workflows/tests.yml | 53 ++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f3c71bd1..1051cfb3 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,7 +1,6 @@ name: tests -#on: [push, pull_request] -on: workflow_dispatch +on: [push, pull_request] # Cancel previous running if a new push is made # Source: https://stackoverflow.com/a/72408109/123545 @@ -21,40 +20,62 @@ jobs: run: | sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test sudo apt-get update - + sudo apt-get install -y g++-10 make git zlib1g-dev m4 - + # Define the compiler sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 30 sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 30 - + sudo update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 30 sudo update-alternatives --set cc /usr/bin/gcc - + sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30 sudo update-alternatives --set c++ /usr/bin/g++ - name: Set up Bazel uses: bazelbuild/setup-bazelisk@v2 - - name: Restore bazel cache - uses: actions/cache@v3 + - name: Download bazel cache + uses: dawidd6/action-download-artifact@v2 with: - path: | - ~/.cache/bazel - # Source: https://github.com/actions/cache/blob/67b839edb68371cc5014f6cea11c9aa77238de78/examples.md#linux-2 - key: ${{ runner.os }}-bazel-${{ hashFiles('.bazelversion', '.bazelrc', 'WORKSPACE', 'WORKSPACE.bazel', 'MODULE.bazel') }} - restore-keys: | - ${{ runner.os }}-bazel- + name: bazel-cache + search_artifacts: true + workflow_conclusion: "" + if_no_artifact_found: warn + + - name: Uncompress bazel cache + run: | + mkdir -p ~/.cache/bazel + tar_file="bazel-cache.tar.gz" && \ + [ -f "$tar_file" ] && \ + tar -xzvf "$tar_file" -C ~/.cache/bazel && \ + rm bazel-cache.tar.gz + exit 0 - name: Build Typesense run: bazel build //:typesense-server - name: Run tests - run: bazel test --test_output=all -- //:typesense-test + run: bazel test //:typesense-test + + - name: Compress bazel cache + if: always() + run: | + tar -czvf bazel-cache.tar.gz -C ~/.cache/bazel . + + - name: Save bazel cache + uses: actions/upload-artifact@v3 + if: always() + with: + name: bazel-cache + path: bazel-cache.tar.gz + if-no-files-found: warn + retention-days: 10 # Source: https://github.com/actions/upload-artifact/issues/92#issuecomment-1080347032 - - run: echo "BAZEL_BIN_FULL_PATH=$(readlink -f bazel-bin)" >> $GITHUB_ENV + - name: Set BAZEL_BIN_FULL_PATH + run: echo "BAZEL_BIN_FULL_PATH=$(readlink -f bazel-bin)" >> $GITHUB_ENV - name: Save build artifacts uses: actions/upload-artifact@v3 with: From 406cafe4c795a78ff4578cfbb92ce51946ca4aed Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 5 Aug 2023 16:33:32 +0530 Subject: [PATCH 8/9] Disable CI again. --- .github/workflows/tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1051cfb3..e9802186 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,6 +1,7 @@ name: tests -on: [push, pull_request] +on: workflow_dispatch +#on: [push, pull_request] # Cancel previous running if a new push is made # Source: https://stackoverflow.com/a/72408109/123545 From 6c9a000bd8e3db2de65432a7793e6b39494f1d51 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 5 Aug 2023 16:58:02 +0530 Subject: [PATCH 9/9] Build protobuf first before building Typesense. --- .github/workflows/tests.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e9802186..649da61a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,7 +1,7 @@ name: tests -on: workflow_dispatch -#on: [push, pull_request] +#on: workflow_dispatch +on: [push, pull_request] # Cancel previous running if a new push is made # Source: https://stackoverflow.com/a/72408109/123545 @@ -54,6 +54,13 @@ jobs: rm bazel-cache.tar.gz exit 0 + - name: Build protobuf deps + run: | + bazel build @com_google_protobuf//:protobuf_headers + bazel build @com_google_protobuf//:protobuf_lite + bazel build @com_google_protobuf//:protobuf + bazel build @com_google_protobuf//:protoc + - name: Build Typesense run: bazel build //:typesense-server