diff --git a/include/posting.h b/include/posting.h index 998b1f85..fe49c544 100644 --- a/include/posting.h +++ b/include/posting.h @@ -41,12 +41,13 @@ struct compact_posting_list_t { class posting_t { private: - static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64; static void to_expanded_plists(const std::vector& raw_posting_lists, std::vector& plists, std::vector& expanded_plists); public: + static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64; + static constexpr size_t MAX_BLOCK_ELEMENTS = 256; struct block_intersector_t { std::vector plists; diff --git a/src/art.cpp b/src/art.cpp index c3261b70..cba9cb04 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -432,9 +432,16 @@ static art_leaf* make_leaf(const unsigned char *key, uint32_t key_len, art_docum uint32_t ids[1] = {document->id}; uint32_t offset_index[1] = {0}; - compact_posting_list_t* list = compact_posting_list_t::create(1, ids, offset_index, document->offsets.size(), - &document->offsets[0]); - l->values = SET_COMPACT_POSTING(list); + + if((2 + document->offsets.size()) <= posting_t::COMPACT_LIST_THRESHOLD_LENGTH) { + compact_posting_list_t* list = compact_posting_list_t::create(1, ids, offset_index, document->offsets.size(), + &document->offsets[0]); + l->values = SET_COMPACT_POSTING(list); + } else { + posting_list_t* pl = new posting_list_t(posting_t::MAX_BLOCK_ELEMENTS); + pl->upsert(document->id, document->offsets); + l->values = pl; + } memcpy(l->key, key, key_len); add_document_to_leaf(document, l); diff --git a/src/posting.cpp b/src/posting.cpp index 8b0c8bb3..a0270dc7 100644 --- a/src/posting.cpp +++ b/src/posting.cpp @@ -156,7 +156,7 @@ compact_posting_list_t* compact_posting_list_t::create(uint32_t num_ids, const u } posting_list_t* compact_posting_list_t::to_full_posting_list() const { - posting_list_t* pl = new posting_list_t(256); + posting_list_t* pl = new posting_list_t(posting_t::MAX_BLOCK_ELEMENTS); size_t i = 0; while(i < length) { @@ -244,9 +244,23 @@ void 
posting_t::upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& off compact_posting_list_t* list = (compact_posting_list_t*) RAW_POSTING_PTR(obj); int64_t extra_capacity_required = list->upsert(id, offsets); - if(extra_capacity_required != 0) { + if(extra_capacity_required == 0) { + // upsert succeeded + return; + } + + if((list->capacity + extra_capacity_required) > COMPACT_LIST_THRESHOLD_LENGTH) { + // we have to convert to a full posting list + posting_list_t* full_list = list->to_full_posting_list(); + free(list); + obj = full_list; + } + + else { // grow the container by 30% - size_t new_capacity = (list->capacity + extra_capacity_required) * 1.3; + size_t new_capacity = std::min<size_t>((list->capacity + extra_capacity_required) * 1.3, + COMPACT_LIST_THRESHOLD_LENGTH); + size_t new_capacity_bytes = sizeof(compact_posting_list_t) + (new_capacity * sizeof(uint32_t)); auto new_list = (compact_posting_list_t *) realloc(list, new_capacity_bytes); if(new_list == nullptr) { @@ -258,20 +272,14 @@ void posting_t::upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& off obj = SET_COMPACT_POSTING(list); list->upsert(id, offsets); - } - if(list->length > COMPACT_LIST_THRESHOLD_LENGTH) { - // we will store anything over this threshold as a full posting list - posting_list_t* full_list = list->to_full_posting_list(); - free(list); - obj = full_list; - return; + return ; } - - } else { - posting_list_t* list = (posting_list_t*)(obj); - list->upsert(id, offsets); } + + // either `obj` is already a full list or was converted to a full list above + posting_list_t* list = (posting_list_t*)(obj); + list->upsert(id, offsets); } void posting_t::erase(void*& obj, uint32_t id) { @@ -465,7 +473,6 @@ void posting_t::block_intersector_t::split_lists(size_t concurrency, // [3, 5] [6] if(i == 0) { - auto& plist = this->plists[i]; p_start_block = start_block; p_end_block = curr_block->next; } else { diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index 
fb9cd523..001fdf41 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -1657,3 +1657,43 @@ TEST_F(CollectionSpecificTest, CustomNumTyposConfiguration) { collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionSpecificTest, RepeatingStringArrayTokens) { + std::vector<std::string> tags; + + // when the first document containing a token already cannot fit compact posting list + + for(size_t i = 0; i < 200; i++) { + tags.emplace_back("spools"); + } + + std::vector<field> fields = {field("tags", field_types::STRING_ARRAY, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + nlohmann::json doc; + doc["tags"] = tags; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("spools", {"tags"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {false}).get(); + ASSERT_EQ(1, results["hits"].size()); + + // when the second document containing a token cannot fit compact posting list + tags = {"foobar"}; + doc["tags"] = tags; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + for(size_t i = 0; i < 200; i++) { + tags.emplace_back("foobar"); + } + + doc["tags"] = tags; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + results = coll1->search("foobar", {"tags"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {false}).get(); + ASSERT_EQ(2, results["hits"].size()); + + collectionManager.drop_collection("coll1"); +}