Ensure that an import batch cannot contain duplicate doc IDs.

Kishore Nallan 2021-09-07 17:02:58 +05:30
parent ba67efb7da
commit c0fce41c3b
2 changed files with 76 additions and 0 deletions


@@ -454,6 +454,9 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_batch,
     size_t num_indexed = 0;
 
+    // ensures that document IDs are not repeated within the same batch
+    std::set<std::string> batch_doc_ids;
+
     for(auto & index_rec: iter_batch) {
         if(!index_rec.indexed.ok()) {
             // some records could have been invalidated upstream
@@ -461,6 +464,14 @@ size_t Index::batch_memory_index(Index *index, std::vector<index_record> & iter_batch,
         }
 
         if(index_rec.operation != DELETE) {
             const std::string& doc_id = index_rec.doc["id"].get<std::string>();
 
+            if(batch_doc_ids.find(doc_id) != batch_doc_ids.end()) {
+                index_rec.index_failure(400, "Document with `id` " + doc_id + " already exists in the import batch.");
+                continue;
+            }
+
+            batch_doc_ids.emplace(doc_id);
+
             Option<uint32_t> validation_op = validate_index_in_memory(index_rec.doc, index_rec.seq_id,
                                                                       default_sorting_field,
                                                                       search_schema, facet_schema,
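The guard above is a first-wins dedup over the batch: the first record carrying a given ID proceeds to validation, and every later record with the same ID is failed with a 400 before it touches the index. A minimal standalone sketch of the same pattern, with simplified types (plain strings and prints stand in for index_record and index_failure, which are not reproduced here):

    #include <iostream>
    #include <set>
    #include <string>
    #include <vector>

    int main() {
        // simplified stand-in for an import batch: just the doc IDs
        std::vector<std::string> batch = {"0", "1", "0", "2", "1"};

        std::set<std::string> batch_doc_ids;  // IDs already seen in this batch

        for(const auto& doc_id : batch) {
            if(batch_doc_ids.find(doc_id) != batch_doc_ids.end()) {
                // in the real code this is index_rec.index_failure(400, ...)
                std::cout << "rejected: Document with `id` " << doc_id
                          << " already exists in the import batch.\n";
                continue;
            }

            batch_doc_ids.emplace(doc_id);
            std::cout << "accepted: " << doc_id << "\n";
        }
    }

Because the check happens before validation, a duplicate is rejected even when the first occurrence of the ID later fails to index for its own reasons, which the second half of the test below exercises.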


@@ -1395,3 +1395,68 @@ TEST_F(CollectionSpecificTest, ZeroWeightedFieldCannotPrioritizeExactMatch) {
 
     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionSpecificTest, ImportDocumentWithRepeatingIDInTheSameBatch) {
+    std::vector<field> fields = {field("name", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc1;
+    doc1["id"] = "0";
+    doc1["name"] = "Levis";
+    doc1["points"] = 3;
+
+    nlohmann::json doc2;
+    doc2["id"] = "0";
+    doc2["name"] = "Amazing from Levis";
+    doc2["points"] = 5;
+
+    std::vector<std::string> import_records;
+    import_records.push_back(doc1.dump());
+    import_records.push_back(doc2.dump());
+
+    nlohmann::json document;
+    nlohmann::json import_response = coll1->add_many(import_records, document);
+
+    ASSERT_FALSE(import_response["success"].get<bool>());
+    ASSERT_EQ(1, import_response["num_imported"].get<int>());
+
+    ASSERT_TRUE(nlohmann::json::parse(import_records[0])["success"].get<bool>());
+    ASSERT_FALSE(nlohmann::json::parse(import_records[1])["success"].get<bool>());
+    ASSERT_EQ("Document with `id` 0 already exists in the import batch.",
+              nlohmann::json::parse(import_records[1])["error"].get<std::string>());
+
+    auto results = coll1->search("levis", {"name"},
+                                 "", {}, {}, {0}, 10,
+                                 1, FREQUENCY, {false},
+                                 2, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
+                                 "<mark>", "</mark>", {0},
+                                 1000, true).get();
+
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("Levis", results["hits"][0]["document"]["name"].get<std::string>());
+
+    // repeated ID is rejected even if the first ID is not indexed due to some error
+    import_records.clear();
+    doc1.erase("name");
+    doc1["id"] = "100";
+    doc2["id"] = "100";
+
+    import_records.push_back(doc1.dump());
+    import_records.push_back(doc2.dump());
+
+    import_response = coll1->add_many(import_records, document);
+
+    ASSERT_FALSE(import_response["success"].get<bool>());
+    ASSERT_EQ(0, import_response["num_imported"].get<int>());
+
+    ASSERT_FALSE(nlohmann::json::parse(import_records[0])["success"].get<bool>());
+    ASSERT_FALSE(nlohmann::json::parse(import_records[1])["success"].get<bool>());
+    ASSERT_EQ("Document with `id` 100 already exists in the import batch.",
+              nlohmann::json::parse(import_records[1])["error"].get<std::string>());
+
+    collectionManager.drop_collection("coll1");
+}
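As the test shows, add_many reports failures in two places: an overall response carrying success and num_imported, and a per-record result written back (as a JSON string) into each slot of the request vector. A hedged sketch of how a caller might consume those per-record results; summarize_import is a hypothetical helper for illustration, not part of the codebase:

    #include <nlohmann/json.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical helper: after add_many() has overwritten each entry of
    // `import_records` with a per-record result, count successes and print errors.
    void summarize_import(const std::vector<std::string>& import_records) {
        size_t imported = 0;

        for(size_t i = 0; i < import_records.size(); i++) {
            auto res = nlohmann::json::parse(import_records[i]);

            if(res["success"].get<bool>()) {
                imported++;
            } else {
                // e.g. "Document with `id` 0 already exists in the import batch."
                std::cout << "record " << i << " failed: "
                          << res["error"].get<std::string>() << "\n";
            }
        }

        std::cout << imported << "/" << import_records.size() << " records imported\n";
    }

Because results are positional, the caller can correlate each failure with the exact input record that caused it, which is how the test pins the duplicate-ID error to import_records[1].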