diff --git a/include/index.h b/include/index.h index 6dffc38e..5656cc2f 100644 --- a/include/index.h +++ b/include/index.h @@ -532,7 +532,7 @@ private: static void handle_doc_ops(const tsl::htrie_map& search_schema, nlohmann::json& update_doc, const nlohmann::json& old_doc); - static void get_doc_changes(const index_operation_t op, const tsl::htrie_map& search_schema, + static void get_doc_changes(const index_operation_t op, const tsl::htrie_map& embedding_fields, nlohmann::json &update_doc, const nlohmann::json &old_doc, nlohmann::json &new_doc, nlohmann::json &del_doc); diff --git a/src/index.cpp b/src/index.cpp index 35f5c839..c05b8c5d 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -454,7 +454,7 @@ void Index::validate_and_preprocess(Index *index, std::vector& ite if(index_rec.is_update) { // scrub string fields to reduce delete ops - get_doc_changes(index_rec.operation, search_schema, index_rec.doc, index_rec.old_doc, + get_doc_changes(index_rec.operation, embedding_fields, index_rec.doc, index_rec.old_doc, index_rec.new_doc, index_rec.del_doc); if(generate_embeddings) { @@ -6258,7 +6258,7 @@ void Index::handle_doc_ops(const tsl::htrie_map& search_schema, } } -void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map& search_schema, +void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map& embedding_fields, nlohmann::json& update_doc, const nlohmann::json& old_doc, nlohmann::json& new_doc, nlohmann::json& del_doc) { @@ -6271,7 +6271,12 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map vec = {0.12, 0.45, 0.64}; + + nlohmann::json doc; + doc["id"] = "0"; + doc["title"] = "Title"; + doc["points"] = 100; + doc["vec"] = vec; + + auto add_op = coll1->add(doc.dump()); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([0.12, 0.44, 0.55])").get(); + + ASSERT_EQ(1, results["found"].get()); + + + // upsert unchanged doc + add_op = coll1->add(doc.dump(), index_operation_t::UPSERT); + ASSERT_TRUE(add_op.ok()); + + results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, {}, {}, {}, 0, + "", "", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback, + 4, {off}, 32767, 32767, 2, + false, true, "vec:([0.12, 0.44, 0.55])").get(); + + ASSERT_EQ(1, results["found"].get()); +} + TEST_F(CollectionVectorTest, NumVectorGreaterThanNumDim) { nlohmann::json schema = R"({ "name": "coll1", @@ -692,6 +741,88 @@ TEST_F(CollectionVectorTest, VectorWithNullValue) { nlohmann::json::parse(json_lines[1])["error"].get()); } +TEST_F(CollectionVectorTest, EmbeddedVectorUnchangedUpsert) { + nlohmann::json schema = R"({ + "name": "coll1", + "fields": [ + {"name": "title", "type": "string"}, + {"name": "points", "type": "int32"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["title"], + "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + Collection* coll1 = collectionManager.create_collection(schema).get(); + + nlohmann::json doc; + doc["id"] = "0"; + doc["title"] = "Title"; + doc["points"] = 100; + + auto add_op = coll1->add(doc.dump()); + ASSERT_TRUE(add_op.ok()); + + auto results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set()).get(); + + ASSERT_EQ(1, results["found"].get()); + auto embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_EQ(384, embedding.size()); + + // upsert unchanged doc + doc.clear(); + doc["id"] = "0"; + doc["title"] = "Title"; + doc["points"] = 100; + + add_op = coll1->add(doc.dump(), index_operation_t::UPSERT); + ASSERT_TRUE(add_op.ok()); + + results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set()).get(); + ASSERT_EQ(1, results["found"].get()); + embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_EQ(384, embedding.size()); + + // update + + doc.clear(); + doc["id"] = "0"; + doc["title"] = "Title"; + doc["points"] = 100; + + add_op = coll1->add(doc.dump(), index_operation_t::UPDATE); + ASSERT_TRUE(add_op.ok()); + + results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set()).get(); + ASSERT_EQ(1, results["found"].get()); + embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_EQ(384, embedding.size()); + + // emplace + + doc.clear(); + doc["id"] = "0"; + doc["title"] = "Title"; + doc["points"] = 100; + + add_op = coll1->add(doc.dump(), index_operation_t::EMPLACE); + ASSERT_TRUE(add_op.ok()); + + results = coll1->search("title", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set()).get(); + ASSERT_EQ(1, results["found"].get()); + embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_EQ(384, embedding.size()); +} + TEST_F(CollectionVectorTest, HybridSearchWithExplicitVector) { nlohmann::json schema = R"({ "name": "objects", @@ -1099,7 +1230,67 @@ TEST_F(CollectionVectorTest, HideCredential) { ASSERT_EQ("***********", coll_summary["fields"][1]["embed"]["model_config"]["project_id"].get()); } -TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) { +TEST_F(CollectionVectorTest, UpdateOfFieldReferencedByEmbedding) { + nlohmann::json schema = R"({ + "name": "objects", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], + "model_config": {"model_name": "ts/e5-small"}}} + ] + })"_json; + + TextEmbedderManager::set_model_dir("/tmp/typesense_test/models"); + + auto op = collectionManager.create_collection(schema); + ASSERT_TRUE(op.ok()); + Collection* coll = op.get(); + + nlohmann::json object; + object["id"] = "0"; + object["name"] = "butter"; + + auto add_op = coll->add(object.dump(), CREATE); + ASSERT_TRUE(add_op.ok()); + + auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + auto original_embedding = results["hits"][0]["document"]["embedding"].get>(); + + nlohmann::json update_object; + update_object["id"] = "0"; + update_object["name"] = "ghee"; + auto update_op = coll->add(update_object.dump(), EMPLACE); + ASSERT_TRUE(update_op.ok()); + + results = coll->search("ghee", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + auto updated_embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_NE(original_embedding, updated_embedding); + + // action = update + update_object["name"] = "milk"; + update_op = coll->add(update_object.dump(), UPDATE); + ASSERT_TRUE(update_op.ok()); + + results = coll->search("milk", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + updated_embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_NE(original_embedding, updated_embedding); + + // action = upsert + update_object["name"] = "cheese"; + update_op = coll->add(update_object.dump(), UPSERT); + ASSERT_TRUE(update_op.ok()); + + results = coll->search("cheese", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + updated_embedding = results["hits"][0]["document"]["embedding"].get>(); + ASSERT_NE(original_embedding, updated_embedding); +} + +TEST_F(CollectionVectorTest, UpdateOfFieldNotReferencedByEmbedding) { + // test updates to a field that's not referred by an embedding field nlohmann::json schema = R"({ "name": "objects", "fields": [ @@ -1123,16 +1314,34 @@ TEST_F(CollectionVectorTest, UpdateOfCollWithNonOptionalEmbeddingField) { auto add_op = coll->add(object.dump(), CREATE); ASSERT_TRUE(add_op.ok()); + auto results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + nlohmann::json update_object; update_object["id"] = "0"; update_object["about"] = "something about butter"; auto update_op = coll->add(update_object.dump(), EMPLACE); ASSERT_TRUE(update_op.ok()); + results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + // action = update update_object["about"] = "something about butter 2"; update_op = coll->add(update_object.dump(), UPDATE); ASSERT_TRUE(update_op.ok()); + + results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); + + // action = upsert + update_object["name"] = "butter"; + update_object["about"] = "something about butter 3"; + update_op = coll->add(update_object.dump(), UPSERT); + ASSERT_TRUE(update_op.ok()); + + results = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}).get(); + ASSERT_EQ(1, results["found"].get()); } TEST_F(CollectionVectorTest, FreshEmplaceWithOptionalEmbeddingReferencedField) {