Fix conversion to full posting list.

Ensure the compact list length never overflows the maximum capacity threshold.
Kishore Nallan 2021-10-18 18:07:06 +05:30
parent 0f36b15d5f
commit 20c7705e61
4 changed files with 74 additions and 19 deletions
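
The change, in short: when an upsert into a compact posting list needs more room, the list now either grows in place (by roughly 30%, clamped so capacity never exceeds COMPACT_LIST_THRESHOLD_LENGTH) or is converted to a full posting_list_t whose blocks hold up to MAX_BLOCK_ELEMENTS ids. The following is a minimal standalone sketch of that decision rule only, assuming nothing beyond what the diff below shows; next_compact_capacity is a hypothetical helper invented for illustration and is not part of the codebase.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Constants mirroring the values made public on posting_t in this commit.
constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64;
constexpr size_t MAX_BLOCK_ELEMENTS = 256;

// Hypothetical helper: given the current capacity of a compact posting list and the
// extra capacity an upsert requires, return the new compact capacity, or 0 to signal
// "convert to a full posting list" (which is then built with MAX_BLOCK_ELEMENTS-sized blocks).
size_t next_compact_capacity(size_t capacity, size_t extra_capacity_required) {
    if(capacity + extra_capacity_required > COMPACT_LIST_THRESHOLD_LENGTH) {
        return 0;  // too large for the compact form: caller switches to posting_list_t
    }

    // grow by ~30%, but clamp to the threshold so capacity (and hence length)
    // can never overflow it
    return std::min<size_t>((capacity + extra_capacity_required) * 1.3,
                            COMPACT_LIST_THRESHOLD_LENGTH);
}

int main() {
    printf("%zu\n", next_compact_capacity(40, 10));  // 50 * 1.3 = 65 -> clamped to 64
    printf("%zu\n", next_compact_capacity(60, 10));  // 70 > 64 -> 0 (convert to full list)
    return 0;
}

Clamping the grown capacity to the threshold is what prevents the overflow described in the commit message; anything that would exceed the threshold goes straight to a full posting list instead.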

View File

@@ -41,12 +41,13 @@ struct compact_posting_list_t {
 class posting_t {
 private:
-    static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64;
     static void to_expanded_plists(const std::vector<void*>& raw_posting_lists, std::vector<posting_list_t*>& plists,
                                    std::vector<posting_list_t*>& expanded_plists);

 public:
+    static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64;
+    static constexpr size_t MAX_BLOCK_ELEMENTS = 256;

     struct block_intersector_t {
         std::vector<posting_list_t*> plists;

View File

@@ -432,9 +432,16 @@ static art_leaf* make_leaf(const unsigned char *key, uint32_t key_len, art_docum
     uint32_t ids[1] = {document->id};
     uint32_t offset_index[1] = {0};
-    compact_posting_list_t* list = compact_posting_list_t::create(1, ids, offset_index, document->offsets.size(),
-                                                                   &document->offsets[0]);
-    l->values = SET_COMPACT_POSTING(list);
+    if((2 + document->offsets.size()) <= posting_t::COMPACT_LIST_THRESHOLD_LENGTH) {
+        compact_posting_list_t* list = compact_posting_list_t::create(1, ids, offset_index, document->offsets.size(),
+                                                                      &document->offsets[0]);
+        l->values = SET_COMPACT_POSTING(list);
+    } else {
+        posting_list_t* pl = new posting_list_t(posting_t::MAX_BLOCK_ELEMENTS);
+        pl->upsert(document->id, document->offsets);
+        l->values = pl;
+    }

     memcpy(l->key, key, key_len);
     add_document_to_leaf(document, l);

View File

@@ -156,7 +156,7 @@ compact_posting_list_t* compact_posting_list_t::create(uint32_t num_ids, const u
 }

 posting_list_t* compact_posting_list_t::to_full_posting_list() const {
-    posting_list_t* pl = new posting_list_t(256);
+    posting_list_t* pl = new posting_list_t(posting_t::MAX_BLOCK_ELEMENTS);

     size_t i = 0;
     while(i < length) {
@@ -244,9 +244,23 @@ void posting_t::upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& off
         compact_posting_list_t* list = (compact_posting_list_t*) RAW_POSTING_PTR(obj);
         int64_t extra_capacity_required = list->upsert(id, offsets);

-        if(extra_capacity_required != 0) {
+        if(extra_capacity_required == 0) {
+            // upsert succeeded
+            return;
+        }
+
+        if((list->capacity + extra_capacity_required) > COMPACT_LIST_THRESHOLD_LENGTH) {
+            // we have to convert to a full posting list
+            posting_list_t* full_list = list->to_full_posting_list();
+            free(list);
+            obj = full_list;
+        }
+        else {
             // grow the container by 30%
-            size_t new_capacity = (list->capacity + extra_capacity_required) * 1.3;
+            size_t new_capacity = std::min<size_t>((list->capacity + extra_capacity_required) * 1.3,
+                                                   COMPACT_LIST_THRESHOLD_LENGTH);
             size_t new_capacity_bytes = sizeof(compact_posting_list_t) + (new_capacity * sizeof(uint32_t));
             auto new_list = (compact_posting_list_t *) realloc(list, new_capacity_bytes);
             if(new_list == nullptr) {
@@ -258,20 +258,14 @@ void posting_t::upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& off
             obj = SET_COMPACT_POSTING(list);
             list->upsert(id, offsets);
-        }
-
-        if(list->length > COMPACT_LIST_THRESHOLD_LENGTH) {
-            // we will store anything over this threshold as a full posting list
-            posting_list_t* full_list = list->to_full_posting_list();
-            free(list);
-            obj = full_list;
-            return;
+            return ;
         }
-    } else {
-        posting_list_t* list = (posting_list_t*)(obj);
-        list->upsert(id, offsets);
     }
+
+    // either `obj` is already a full list or was converted to a full list above
+    posting_list_t* list = (posting_list_t*)(obj);
+    list->upsert(id, offsets);
 }

 void posting_t::erase(void*& obj, uint32_t id) {
@@ -465,7 +473,6 @@ void posting_t::block_intersector_t::split_lists(size_t concurrency,
         // [3, 5] [6]

         if(i == 0) {
-            auto& plist = this->plists[i];
             p_start_block = start_block;
             p_end_block = curr_block->next;
         } else {

View File

@@ -1657,3 +1657,43 @@ TEST_F(CollectionSpecificTest, CustomNumTyposConfiguration) {
     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionSpecificTest, RepeatingStringArrayTokens) {
+    std::vector<std::string> tags;
+
+    // when the first document containing a token already cannot fit compact posting list
+    for(size_t i = 0; i < 200; i++) {
+        tags.emplace_back("spools");
+    }
+
+    std::vector<field> fields = {field("tags", field_types::STRING_ARRAY, false),};
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();
+
+    nlohmann::json doc;
+    doc["tags"] = tags;
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    auto results = coll1->search("spools", {"tags"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(1, results["hits"].size());
+
+    // when the second document containing a token cannot fit compact posting list
+    tags = {"foobar"};
+    doc["tags"] = tags;
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    for(size_t i = 0; i < 200; i++) {
+        tags.emplace_back("foobar");
+    }
+
+    doc["tags"] = tags;
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    results = coll1->search("foobar", {"tags"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(2, results["hits"].size());
+
+    collectionManager.drop_collection("coll1");
+}