Fix edge case in updating empty array strings.

This commit is contained in:
Kishore Nallan 2023-07-01 13:46:03 +05:30
parent e60094c88b
commit c066120fb1
4 changed files with 171 additions and 134 deletions

View File

@ -631,9 +631,6 @@ public:
const std::vector<char>& local_token_separators,
const std::vector<char>& local_symbols_to_index);
static void scrub_reindex_doc(const tsl::htrie_map<char, field>& search_schema,
nlohmann::json& update_doc, nlohmann::json& del_doc, const nlohmann::json& old_doc);
static void tokenize_string_field(const nlohmann::json& document,
const field& search_field, std::vector<std::string>& tokens,
const std::string& locale,

View File

@ -454,7 +454,6 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite
// scrub string fields to reduce delete ops
get_doc_changes(index_rec.operation, search_schema, index_rec.doc, index_rec.old_doc,
index_rec.new_doc, index_rec.del_doc);
scrub_reindex_doc(search_schema, index_rec.doc, index_rec.del_doc, index_rec.old_doc);
if(generate_embeddings) {
for(auto& field: index_rec.doc.items()) {
@ -6236,11 +6235,19 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<cha
if(op == UPSERT) {
new_doc = update_doc;
// since UPSERT could replace a doc with lesser fields, we have to add those missing fields to del_doc
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
continue;
}
if(!update_doc.contains(it.key())) {
del_doc[it.key()] = it.value();
}
}
} else {
new_doc = old_doc;
handle_doc_ops(search_schema, update_doc, old_doc);
new_doc = old_doc;
new_doc.merge_patch(update_doc);
if(old_doc.contains(".flat")) {
@ -6251,87 +6258,33 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<cha
}
}
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
continue;
}
if(op == UPSERT && !update_doc.contains(it.key())) {
del_doc[it.key()] = it.value();
}
}
auto it = update_doc.begin();
for(; it != update_doc.end(); ) {
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
while(it != update_doc.end()) {
if(it.value().is_object() || (it.value().is_array() && !it.value().empty() && it.value()[0].is_object())) {
++it;
continue;
}
// if the update doc contains a field that exists in old, we record that (for delete + reindex)
bool field_exists_in_old_doc = (old_doc.count(it.key()) != 0);
if(field_exists_in_old_doc) {
// key exists in the stored doc, so it must be reindexed
// we need to check for this because a field can be optional
del_doc[it.key()] = old_doc[it.key()];
}
// adds new key or overrides existing key from `old_doc`
if(it.value().is_null()) {
// null values should not indexed
// null values should not be indexed
new_doc.erase(it.key());
del_doc[it.key()] = old_doc[it.key()];
it = update_doc.erase(it);
} else {
++it;
}
}
}
void Index::scrub_reindex_doc(const tsl::htrie_map<char, field>& search_schema,
nlohmann::json& update_doc,
nlohmann::json& del_doc,
const nlohmann::json& old_doc) {
/*LOG(INFO) << "update_doc: " << update_doc;
LOG(INFO) << "old_doc: " << old_doc;
LOG(INFO) << "del_doc: " << del_doc;*/
// del_doc contains fields that exist in both update doc and old doc
// But we will only remove fields that are different
std::vector<std::string> unchanged_keys;
for(auto it = del_doc.cbegin(); it != del_doc.cend(); it++) {
const std::string& field_name = it.key();
const auto& search_field_it = search_schema.find(field_name);
if(search_field_it == search_schema.end()) {
continue;
}
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
continue;
}
const auto search_field = search_field_it.value(); // copy, don't use reference!
// compare values between old and update docs:
// if they match, we will remove them from both del and update docs
if(update_doc.contains(search_field.name)) {
if(update_doc[search_field.name].is_null()) {
// we don't allow null values to be stored or indexed but need to be removed from stored doc
update_doc.erase(search_field.name);
}
else if(update_doc[search_field.name] == old_doc[search_field.name]) {
unchanged_keys.push_back(field_name);
if(old_doc.contains(it.key())) {
if(old_doc[it.key()] == it.value()) {
// unchanged so should not be part of update doc
it = update_doc.erase(it);
continue;
} else {
// delete this old value from index
del_doc[it.key()] = old_doc[it.key()];
}
}
}
for(const auto& unchanged_key: unchanged_keys) {
del_doc.erase(unchanged_key);
update_doc.erase(unchanged_key);
it++;
}
}

View File

@ -1225,6 +1225,153 @@ TEST_F(CollectionSpecificMoreTest, UpsertUpdateEmplaceShouldAllRemoveIndex) {
ASSERT_EQ(1, results["found"].get<size_t>());
}
TEST_F(CollectionSpecificMoreTest, UpdateWithEmptyArray) {
    // Replacing a string[] field with an empty array must remove all of the
    // previously indexed tokens, both via UPDATE and via UPSERT.
    nlohmann::json schema = R"({
        "name": "coll1",
        "fields": [
            {"name": "tags", "type": "string[]"}
        ]
    })"_json;

    auto create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(create_op.ok());
    Collection* coll = create_op.get();

    auto first_doc = R"({
        "id": "0",
        "tags": ["alpha", "beta", "gamma"]
    })"_json;
    ASSERT_TRUE(coll->add(first_doc.dump(), CREATE).ok());

    auto second_doc = R"({
        "id": "1",
        "tags": ["one", "two"]
    })"_json;
    ASSERT_TRUE(coll->add(second_doc.dump(), CREATE).ok());

    // clear the array via UPDATE: old tokens of doc 0 must no longer match
    auto partial_doc = R"({
        "id": "0",
        "tags": []
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPDATE).ok());

    auto res = coll->search("alpha", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());

    // clear the array via UPSERT: same expectation for doc 1
    partial_doc = R"({
        "id": "1",
        "tags": []
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPSERT).ok());

    res = coll->search("one", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());
}
TEST_F(CollectionSpecificMoreTest, UpdateArrayWithNullValue) {
    // Setting an optional string[] field to null must drop the field and
    // un-index its old tokens, both via UPDATE and via UPSERT.
    nlohmann::json schema = R"({
        "name": "coll1",
        "fields": [
            {"name": "tags", "type": "string[]", "optional": true}
        ]
    })"_json;

    auto create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(create_op.ok());
    Collection* coll = create_op.get();

    auto first_doc = R"({
        "id": "0",
        "tags": ["alpha", "beta", "gamma"]
    })"_json;
    ASSERT_TRUE(coll->add(first_doc.dump(), CREATE).ok());

    auto second_doc = R"({
        "id": "1",
        "tags": ["one", "two"]
    })"_json;
    ASSERT_TRUE(coll->add(second_doc.dump(), CREATE).ok());

    // null out the field via UPDATE: doc 0 must no longer match
    auto partial_doc = R"({
        "id": "0",
        "tags": null
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPDATE).ok());

    auto res = coll->search("alpha", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());

    // null out the field via UPSERT: same expectation for doc 1
    partial_doc = R"({
        "id": "1",
        "tags": null
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPSERT).ok());

    res = coll->search("one", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());
}
TEST_F(CollectionSpecificMoreTest, ReplaceArrayElement) {
    // Shrinking a string[] field must un-index the removed element, both via
    // UPDATE and via UPSERT.
    nlohmann::json schema = R"({
        "name": "coll1",
        "fields": [
            {"name": "tags", "type": "string[]"}
        ]
    })"_json;

    auto create_op = collectionManager.create_collection(schema);
    ASSERT_TRUE(create_op.ok());
    Collection* coll = create_op.get();

    auto first_doc = R"({
        "id": "0",
        "tags": ["alpha", "beta", "gamma"]
    })"_json;
    ASSERT_TRUE(coll->add(first_doc.dump(), CREATE).ok());

    auto second_doc = R"({
        "id": "1",
        "tags": ["one", "two", "three"]
    })"_json;
    ASSERT_TRUE(coll->add(second_doc.dump(), CREATE).ok());

    // drop "beta" via UPDATE: the removed token must no longer match
    auto partial_doc = R"({
        "id": "0",
        "tags": ["alpha", "gamma"]
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPDATE).ok());

    auto res = coll->search("beta", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());

    // drop "two" via UPSERT: same expectation for doc 1
    partial_doc = R"({
        "id": "1",
        "tags": ["one", "three"]
    })"_json;
    ASSERT_TRUE(coll->add(partial_doc.dump(), UPSERT).ok());

    res = coll->search("two", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, res["found"].get<size_t>());
}
TEST_F(CollectionSpecificMoreTest, UnorderedWeightingOfFields) {
nlohmann::json schema = R"({
"name": "coll1",

View File

@ -3,66 +3,6 @@
#include <vector>
#include <s2/s2loop.h>
TEST(IndexTest, ScrubReindexDoc) {
    // scrub_reindex_doc() removes, from both del_doc and update_doc, any
    // schema field whose value is identical in old_doc and update_doc; fields
    // outside the schema and changed fields are left alone.
    tsl::htrie_map<char, field> search_schema;
    search_schema.emplace("title", field("title", field_types::STRING, false));
    search_schema.emplace("points", field("points", field_types::INT32, false));
    search_schema.emplace("cast", field("cast", field_types::STRING_ARRAY, false));
    search_schema.emplace("movie", field("movie", field_types::BOOL, false));

    ThreadPool pool(4);
    Index index("index", 1, nullptr, nullptr, &pool, search_schema, {}, {});

    nlohmann::json old_doc;
    old_doc["id"] = "1";
    old_doc["title"] = "One more thing.";
    old_doc["points"] = 100;
    old_doc["cast"] = {"John Wick", "Jeremy Renner"};
    old_doc["movie"] = true;

    // case 1: every field unchanged — only "id" (not in schema) survives
    nlohmann::json full_update, full_del;
    full_update = old_doc;
    full_del = old_doc;
    index.scrub_reindex_doc(search_schema, full_update, full_del, old_doc);
    ASSERT_EQ(1, full_del.size());
    ASSERT_STREQ("1", full_del["id"].get<std::string>().c_str());

    // case 2: subset of fields; "points" is unchanged (scrubbed) while
    // "cast" differs from old_doc, so it stays alongside "id"
    nlohmann::json partial_update, partial_del;
    partial_update["id"] = "1";
    partial_update["points"] = 100;
    partial_update["cast"] = {"Jack"};
    partial_del = partial_update;
    index.scrub_reindex_doc(search_schema, partial_update, partial_del, old_doc);
    ASSERT_EQ(2, partial_del.size());
    ASSERT_STREQ("1", partial_del["id"].get<std::string>().c_str());
    std::vector<std::string> cast_values = partial_del["cast"].get<std::vector<std::string>>();
    ASSERT_EQ(1, cast_values.size());
    ASSERT_STREQ("Jack", cast_values[0].c_str());

    // case 3: fields absent from the search schema ("foo") are untouched,
    // and a changed schema field ("title") is kept
    nlohmann::json extra_update, extra_del;
    extra_update["id"] = "1";
    extra_update["title"] = "The Lawyer";
    extra_update["foo"] = "Bar";
    extra_del = extra_update;
    index.scrub_reindex_doc(search_schema, extra_update, extra_del, old_doc);
    ASSERT_EQ(3, extra_del.size());
    ASSERT_STREQ("1", extra_del["id"].get<std::string>().c_str());
    ASSERT_STREQ("The Lawyer", extra_del["title"].get<std::string>().c_str());
    ASSERT_STREQ("Bar", extra_del["foo"].get<std::string>().c_str());

    pool.shutdown();
}
/*TEST(IndexTest, PointInPolygon180thMeridian) {
// somewhere in far eastern russia
GeoCoord verts[3] = {