mirror of
https://github.com/typesense/typesense.git
synced 2025-05-16 11:28:44 +08:00
Fix edge case in updating empty array strings.
This commit is contained in:
parent
e60094c88b
commit
c066120fb1
@ -631,9 +631,6 @@ public:
|
||||
const std::vector<char>& local_token_separators,
|
||||
const std::vector<char>& local_symbols_to_index);
|
||||
|
||||
static void scrub_reindex_doc(const tsl::htrie_map<char, field>& search_schema,
|
||||
nlohmann::json& update_doc, nlohmann::json& del_doc, const nlohmann::json& old_doc);
|
||||
|
||||
static void tokenize_string_field(const nlohmann::json& document,
|
||||
const field& search_field, std::vector<std::string>& tokens,
|
||||
const std::string& locale,
|
||||
|
@ -454,7 +454,6 @@ void Index::validate_and_preprocess(Index *index, std::vector<index_record>& ite
|
||||
// scrub string fields to reduce delete ops
|
||||
get_doc_changes(index_rec.operation, search_schema, index_rec.doc, index_rec.old_doc,
|
||||
index_rec.new_doc, index_rec.del_doc);
|
||||
scrub_reindex_doc(search_schema, index_rec.doc, index_rec.del_doc, index_rec.old_doc);
|
||||
|
||||
if(generate_embeddings) {
|
||||
for(auto& field: index_rec.doc.items()) {
|
||||
@ -6236,11 +6235,19 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<cha
|
||||
|
||||
if(op == UPSERT) {
|
||||
new_doc = update_doc;
|
||||
// since UPSERT could replace a doc with lesser fields, we have to add those missing fields to del_doc
|
||||
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
|
||||
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(!update_doc.contains(it.key())) {
|
||||
del_doc[it.key()] = it.value();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
new_doc = old_doc;
|
||||
|
||||
handle_doc_ops(search_schema, update_doc, old_doc);
|
||||
|
||||
new_doc = old_doc;
|
||||
new_doc.merge_patch(update_doc);
|
||||
|
||||
if(old_doc.contains(".flat")) {
|
||||
@ -6251,87 +6258,33 @@ void Index::get_doc_changes(const index_operation_t op, const tsl::htrie_map<cha
|
||||
}
|
||||
}
|
||||
|
||||
for(auto it = old_doc.begin(); it != old_doc.end(); ++it) {
|
||||
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(op == UPSERT && !update_doc.contains(it.key())) {
|
||||
del_doc[it.key()] = it.value();
|
||||
}
|
||||
}
|
||||
|
||||
auto it = update_doc.begin();
|
||||
for(; it != update_doc.end(); ) {
|
||||
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
|
||||
while(it != update_doc.end()) {
|
||||
if(it.value().is_object() || (it.value().is_array() && !it.value().empty() && it.value()[0].is_object())) {
|
||||
++it;
|
||||
continue;
|
||||
}
|
||||
|
||||
// if the update doc contains a field that exists in old, we record that (for delete + reindex)
|
||||
bool field_exists_in_old_doc = (old_doc.count(it.key()) != 0);
|
||||
if(field_exists_in_old_doc) {
|
||||
// key exists in the stored doc, so it must be reindexed
|
||||
// we need to check for this because a field can be optional
|
||||
del_doc[it.key()] = old_doc[it.key()];
|
||||
}
|
||||
|
||||
// adds new key or overrides existing key from `old_doc`
|
||||
if(it.value().is_null()) {
|
||||
// null values should not indexed
|
||||
// null values should not be indexed
|
||||
new_doc.erase(it.key());
|
||||
del_doc[it.key()] = old_doc[it.key()];
|
||||
it = update_doc.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Index::scrub_reindex_doc(const tsl::htrie_map<char, field>& search_schema,
|
||||
nlohmann::json& update_doc,
|
||||
nlohmann::json& del_doc,
|
||||
const nlohmann::json& old_doc) {
|
||||
|
||||
/*LOG(INFO) << "update_doc: " << update_doc;
|
||||
LOG(INFO) << "old_doc: " << old_doc;
|
||||
LOG(INFO) << "del_doc: " << del_doc;*/
|
||||
|
||||
// del_doc contains fields that exist in both update doc and old doc
|
||||
// But we will only remove fields that are different
|
||||
|
||||
std::vector<std::string> unchanged_keys;
|
||||
|
||||
for(auto it = del_doc.cbegin(); it != del_doc.cend(); it++) {
|
||||
const std::string& field_name = it.key();
|
||||
|
||||
const auto& search_field_it = search_schema.find(field_name);
|
||||
if(search_field_it == search_schema.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(it.value().is_object() || (it.value().is_array() && (it.value().empty() || it.value()[0].is_object()))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto search_field = search_field_it.value(); // copy, don't use reference!
|
||||
|
||||
// compare values between old and update docs:
|
||||
// if they match, we will remove them from both del and update docs
|
||||
|
||||
if(update_doc.contains(search_field.name)) {
|
||||
if(update_doc[search_field.name].is_null()) {
|
||||
// we don't allow null values to be stored or indexed but need to be removed from stored doc
|
||||
update_doc.erase(search_field.name);
|
||||
}
|
||||
else if(update_doc[search_field.name] == old_doc[search_field.name]) {
|
||||
unchanged_keys.push_back(field_name);
|
||||
if(old_doc.contains(it.key())) {
|
||||
if(old_doc[it.key()] == it.value()) {
|
||||
// unchanged so should not be part of update doc
|
||||
it = update_doc.erase(it);
|
||||
continue;
|
||||
} else {
|
||||
// delete this old value from index
|
||||
del_doc[it.key()] = old_doc[it.key()];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(const auto& unchanged_key: unchanged_keys) {
|
||||
del_doc.erase(unchanged_key);
|
||||
update_doc.erase(unchanged_key);
|
||||
it++;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1225,6 +1225,153 @@ TEST_F(CollectionSpecificMoreTest, UpsertUpdateEmplaceShouldAllRemoveIndex) {
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
}
|
||||
|
||||
TEST_F(CollectionSpecificMoreTest, UpdateWithEmptyArray) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "coll1",
|
||||
"fields": [
|
||||
{"name": "tags", "type": "string[]"}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
ASSERT_TRUE(op.ok());
|
||||
Collection* coll1 = op.get();
|
||||
|
||||
auto doc1 = R"({
|
||||
"id": "0",
|
||||
"tags": ["alpha", "beta", "gamma"]
|
||||
})"_json;
|
||||
|
||||
ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok());
|
||||
|
||||
auto doc2 = R"({
|
||||
"id": "1",
|
||||
"tags": ["one", "two"]
|
||||
})"_json;
|
||||
|
||||
ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok());
|
||||
|
||||
// via update
|
||||
|
||||
auto doc_update = R"({
|
||||
"id": "0",
|
||||
"tags": []
|
||||
})"_json;
|
||||
ASSERT_TRUE(coll1->add(doc_update.dump(), UPDATE).ok());
|
||||
|
||||
auto results = coll1->search("alpha", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
|
||||
ASSERT_EQ(0, results["found"].get<size_t>());
|
||||
|
||||
// via upsert
|
||||
|
||||
doc_update = R"({
|
||||
"id": "1",
|
||||
"tags": []
|
||||
})"_json;
|
||||
ASSERT_TRUE(coll1->add(doc_update.dump(), UPSERT).ok());
|
||||
|
||||
results = coll1->search("one", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
|
||||
ASSERT_EQ(0, results["found"].get<size_t>());
|
||||
}
|
||||
|
||||
TEST_F(CollectionSpecificMoreTest, UpdateArrayWithNullValue) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "coll1",
|
||||
"fields": [
|
||||
{"name": "tags", "type": "string[]", "optional": true}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
ASSERT_TRUE(op.ok());
|
||||
Collection* coll1 = op.get();
|
||||
|
||||
auto doc1 = R"({
|
||||
"id": "0",
|
||||
"tags": ["alpha", "beta", "gamma"]
|
||||
})"_json;
|
||||
|
||||
ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok());
|
||||
|
||||
auto doc2 = R"({
|
||||
"id": "1",
|
||||
"tags": ["one", "two"]
|
||||
})"_json;
|
||||
|
||||
ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok());
|
||||
|
||||
// via update
|
||||
|
||||
auto doc_update = R"({
|
||||
"id": "0",
|
||||
"tags": null
|
||||
})"_json;
|
||||
ASSERT_TRUE(coll1->add(doc_update.dump(), UPDATE).ok());
|
||||
|
||||
auto results = coll1->search("alpha", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
|
||||
ASSERT_EQ(0, results["found"].get<size_t>());
|
||||
|
||||
// via upsert
|
||||
|
||||
doc_update = R"({
|
||||
"id": "1",
|
||||
"tags": null
|
||||
})"_json;
|
||||
ASSERT_TRUE(coll1->add(doc_update.dump(), UPSERT).ok());
|
||||
|
||||
results = coll1->search("one", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
|
||||
ASSERT_EQ(0, results["found"].get<size_t>());
|
||||
}
|
||||
|
||||
TEST_F(CollectionSpecificMoreTest, ReplaceArrayElement) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "coll1",
|
||||
"fields": [
|
||||
{"name": "tags", "type": "string[]"}
|
||||
]
|
||||
})"_json;
|
||||
|
||||
auto op = collectionManager.create_collection(schema);
|
||||
ASSERT_TRUE(op.ok());
|
||||
Collection* coll1 = op.get();
|
||||
|
||||
auto doc1 = R"({
|
||||
"id": "0",
|
||||
"tags": ["alpha", "beta", "gamma"]
|
||||
})"_json;
|
||||
|
||||
ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok());
|
||||
|
||||
auto doc2 = R"({
|
||||
"id": "1",
|
||||
"tags": ["one", "two", "three"]
|
||||
})"_json;
|
||||
|
||||
ASSERT_TRUE(coll1->add(doc2.dump(), CREATE).ok());
|
||||
|
||||
// via update
|
||||
|
||||
auto doc_update = R"({
|
||||
"id": "0",
|
||||
"tags": ["alpha", "gamma"]
|
||||
})"_json;
|
||||
ASSERT_TRUE(coll1->add(doc_update.dump(), UPDATE).ok());
|
||||
|
||||
auto results = coll1->search("beta", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
|
||||
ASSERT_EQ(0, results["found"].get<size_t>());
|
||||
|
||||
// via upsert
|
||||
|
||||
doc_update = R"({
|
||||
"id": "1",
|
||||
"tags": ["one", "three"]
|
||||
})"_json;
|
||||
ASSERT_TRUE(coll1->add(doc_update.dump(), UPSERT).ok());
|
||||
|
||||
results = coll1->search("two", {"tags"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
|
||||
ASSERT_EQ(0, results["found"].get<size_t>());
|
||||
}
|
||||
|
||||
TEST_F(CollectionSpecificMoreTest, UnorderedWeightingOfFields) {
|
||||
nlohmann::json schema = R"({
|
||||
"name": "coll1",
|
||||
|
@ -3,66 +3,6 @@
|
||||
#include <vector>
|
||||
#include <s2/s2loop.h>
|
||||
|
||||
TEST(IndexTest, ScrubReindexDoc) {
|
||||
tsl::htrie_map<char, field> search_schema;
|
||||
search_schema.emplace("title", field("title", field_types::STRING, false));
|
||||
search_schema.emplace("points", field("points", field_types::INT32, false));
|
||||
search_schema.emplace("cast", field("cast", field_types::STRING_ARRAY, false));
|
||||
search_schema.emplace("movie", field("movie", field_types::BOOL, false));
|
||||
|
||||
ThreadPool pool(4);
|
||||
|
||||
Index index("index", 1, nullptr, nullptr, &pool, search_schema, {}, {});
|
||||
nlohmann::json old_doc;
|
||||
old_doc["id"] = "1";
|
||||
old_doc["title"] = "One more thing.";
|
||||
old_doc["points"] = 100;
|
||||
old_doc["cast"] = {"John Wick", "Jeremy Renner"};
|
||||
old_doc["movie"] = true;
|
||||
|
||||
// all fields remain same
|
||||
|
||||
nlohmann::json update_doc1, del_doc1;
|
||||
update_doc1 = old_doc;
|
||||
del_doc1 = old_doc;
|
||||
|
||||
index.scrub_reindex_doc(search_schema, update_doc1, del_doc1, old_doc);
|
||||
ASSERT_EQ(1, del_doc1.size());
|
||||
ASSERT_STREQ("1", del_doc1["id"].get<std::string>().c_str());
|
||||
|
||||
// when only some fields are updated
|
||||
|
||||
nlohmann::json update_doc2, del_doc2;
|
||||
update_doc2["id"] = "1";
|
||||
update_doc2["points"] = 100;
|
||||
update_doc2["cast"] = {"Jack"};
|
||||
|
||||
del_doc2 = update_doc2;
|
||||
|
||||
index.scrub_reindex_doc(search_schema, update_doc2, del_doc2, old_doc);
|
||||
ASSERT_EQ(2, del_doc2.size());
|
||||
ASSERT_STREQ("1", del_doc2["id"].get<std::string>().c_str());
|
||||
std::vector<std::string> cast = del_doc2["cast"].get<std::vector<std::string>>();
|
||||
ASSERT_EQ(1, cast.size());
|
||||
ASSERT_STREQ("Jack", cast[0].c_str());
|
||||
|
||||
// containing fields not part of search schema
|
||||
|
||||
nlohmann::json update_doc3, del_doc3;
|
||||
update_doc3["id"] = "1";
|
||||
update_doc3["title"] = "The Lawyer";
|
||||
update_doc3["foo"] = "Bar";
|
||||
|
||||
del_doc3 = update_doc3;
|
||||
index.scrub_reindex_doc(search_schema, update_doc3, del_doc3, old_doc);
|
||||
ASSERT_EQ(3, del_doc3.size());
|
||||
ASSERT_STREQ("1", del_doc3["id"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("The Lawyer", del_doc3["title"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("Bar", del_doc3["foo"].get<std::string>().c_str());
|
||||
|
||||
pool.shutdown();
|
||||
}
|
||||
|
||||
/*TEST(IndexTest, PointInPolygon180thMeridian) {
|
||||
// somewhere in far eastern russia
|
||||
GeoCoord verts[3] = {
|
||||
|
Loading…
x
Reference in New Issue
Block a user