diff --git a/src/index.cpp b/src/index.cpp
index 28135bc5..96d32488 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -454,6 +454,9 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
 
     size_t num_indexed = 0;
 
+    // ensures that document IDs are not repeated within the same batch
+    std::set<std::string> batch_doc_ids;
+
     for(auto & index_rec: iter_batch) {
         if(!index_rec.indexed.ok()) {
             // some records could have been invalidated upstream
@@ -461,6 +464,14 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_
         }
 
         if(index_rec.operation != DELETE) {
+            const std::string& doc_id = index_rec.doc["id"].get<std::string>();
+            if(batch_doc_ids.find(doc_id) != batch_doc_ids.end()) {
+                index_rec.index_failure(400, "Document with `id` " + doc_id + " already exists in the import batch.");
+                continue;
+            }
+
+            batch_doc_ids.emplace(doc_id);
+
             Option<uint32_t> validation_op = validate_index_in_memory(index_rec.doc, index_rec.seq_id,
                                                                       default_sorting_field,
                                                                       search_schema, facet_schema,
diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp
index b504bb0f..01a4fbc0 100644
--- a/test/collection_specific_test.cpp
+++ b/test/collection_specific_test.cpp
@@ -1395,3 +1395,68 @@ TEST_F(CollectionSpecificTest, ZeroWeightedFieldCannotPrioritizeExactMatch) {
 
     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionSpecificTest, ImportDocumentWithRepeatingIDInTheSameBatch) {
+    std::vector<field> fields = {field("name", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["name"] = "Levis";
+    doc1["points"] = 3;
+
+    nlohmann::json doc2;
+    doc2["id"] = "0";
+    doc2["name"] = "Amazing from Levis";
+    doc2["points"] = 5;
+
+    std::vector<std::string> import_records;
+    import_records.push_back(doc1.dump());
+    import_records.push_back(doc2.dump());
+
+    nlohmann::json document;
+    nlohmann::json import_response = coll1->add_many(import_records, document);
+
+    ASSERT_FALSE(import_response["success"].get<bool>());
+    ASSERT_EQ(1, import_response["num_imported"].get<int>());
+
+    ASSERT_TRUE(nlohmann::json::parse(import_records[0])["success"].get<bool>());
+    ASSERT_FALSE(nlohmann::json::parse(import_records[1])["success"].get<bool>());
+    ASSERT_EQ("Document with `id` 0 already exists in the import batch.",
+              nlohmann::json::parse(import_records[1])["error"].get<std::string>());
+
+    auto results = coll1->search("levis", {"name"},
+                                 "", {}, {}, {0}, 10,
+                                 1, FREQUENCY, {false},
+                                 2, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                 "<mark>", "</mark>", {0},
+                                 1000, true).get();
+
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("Levis", results["hits"][0]["document"]["name"].get<std::string>());
+
+    // repeated ID is rejected even if the first ID is not indexed due to some error
+    import_records.clear();
+    doc1.erase("name");
+    doc1["id"] = "100";
+    doc2["id"] = "100";
+
+    import_records.push_back(doc1.dump());
+    import_records.push_back(doc2.dump());
+
+    import_response = coll1->add_many(import_records, document);
+
+    ASSERT_FALSE(import_response["success"].get<bool>());
+    ASSERT_EQ(0, import_response["num_imported"].get<int>());
+
+    ASSERT_FALSE(nlohmann::json::parse(import_records[0])["success"].get<bool>());
+    ASSERT_FALSE(nlohmann::json::parse(import_records[1])["success"].get<bool>());
+    ASSERT_EQ("Document with `id` 100 already exists in the import batch.",
+              nlohmann::json::parse(import_records[1])["error"].get<std::string>());
+
+    collectionManager.drop_collection("coll1");
+}