Mirror of https://github.com/typesense/typesense.git (synced 2025-05-19 13:12:22 +08:00)
Fix conversion to full posting list.
Ensure length never overflows max capacity threshold.
This commit is contained in:
parent 0f36b15d5f
commit 20c7705e61
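The gist of the fix, as a standalone sketch: when an upsert needs more room in a compact posting list, the list now grows by 30% but is capped at COMPACT_LIST_THRESHOLD_LENGTH, and anything that would exceed that threshold is converted to a full posting_list_t built from blocks of MAX_BLOCK_ELEMENTS ids. The two constants below come from the diff; choose_growth, action_t, and the driver are illustrative names only, not Typesense APIs.

// Sketch of the capacity decision this commit makes in posting_t::upsert().
// The constants match the diff; everything else is illustrative.
#include <algorithm>
#include <cstddef>
#include <cstdio>

static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64;  // max slots in a compact posting list
static constexpr size_t MAX_BLOCK_ELEMENTS = 256;            // block size of a full posting list

enum class action_t { keep_compact, convert_to_full };

// Decide what happens when a compact list with `capacity` uint32_t slots needs
// `extra_capacity_required` more slots to absorb an upsert.
static action_t choose_growth(size_t capacity, size_t extra_capacity_required, size_t& new_capacity) {
    if(capacity + extra_capacity_required > COMPACT_LIST_THRESHOLD_LENGTH) {
        // beyond the threshold: the data moves into a full posting list
        new_capacity = MAX_BLOCK_ELEMENTS;
        return action_t::convert_to_full;
    }

    // otherwise grow the compact container by 30%, but never past the threshold,
    // so its length can never overflow the max capacity
    new_capacity = std::min<size_t>((capacity + extra_capacity_required) * 1.3,
                                    COMPACT_LIST_THRESHOLD_LENGTH);
    return action_t::keep_compact;
}

int main() {
    size_t new_capacity = 0;

    // small list: stays compact; 16 * 1.3 = 20.8, truncated to 20
    choose_growth(10, 6, new_capacity);
    std::printf("small:  keep compact, new capacity = %zu\n", new_capacity);

    // near the limit: min(62 * 1.3, 64) caps at 64 instead of 80
    choose_growth(56, 6, new_capacity);
    std::printf("capped: keep compact, new capacity = %zu\n", new_capacity);

    // 60 + 10 = 70 > 64: convert to a full posting list
    auto action = choose_growth(60, 10, new_capacity);
    std::printf("large:  %s\n", action == action_t::convert_to_full ? "convert to full" : "keep compact");
}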
@@ -41,12 +41,13 @@ struct compact_posting_list_t {

 class posting_t {
 private:
-    static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64;
-
     static void to_expanded_plists(const std::vector<void*>& raw_posting_lists, std::vector<posting_list_t*>& plists,
                                    std::vector<posting_list_t*>& expanded_plists);

 public:
+    static constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64;
+    static constexpr size_t MAX_BLOCK_ELEMENTS = 256;
+
     struct block_intersector_t {
         std::vector<posting_list_t*> plists;
src/art.cpp (13 changed lines)
@@ -432,9 +432,16 @@ static art_leaf* make_leaf(const unsigned char *key, uint32_t key_len, art_docum

     uint32_t ids[1] = {document->id};
     uint32_t offset_index[1] = {0};
-    compact_posting_list_t* list = compact_posting_list_t::create(1, ids, offset_index, document->offsets.size(),
-                                                                  &document->offsets[0]);
-    l->values = SET_COMPACT_POSTING(list);
+
+    if((2 + document->offsets.size()) <= posting_t::COMPACT_LIST_THRESHOLD_LENGTH) {
+        compact_posting_list_t* list = compact_posting_list_t::create(1, ids, offset_index, document->offsets.size(),
+                                                                      &document->offsets[0]);
+        l->values = SET_COMPACT_POSTING(list);
+    } else {
+        posting_list_t* pl = new posting_list_t(posting_t::MAX_BLOCK_ELEMENTS);
+        pl->upsert(document->id, document->offsets);
+        l->values = pl;
+    }

     memcpy(l->key, key, key_len);
     add_document_to_leaf(document, l);
@@ -156,7 +156,7 @@ compact_posting_list_t* compact_posting_list_t::create(uint32_t num_ids, const u
 }

 posting_list_t* compact_posting_list_t::to_full_posting_list() const {
-    posting_list_t* pl = new posting_list_t(256);
+    posting_list_t* pl = new posting_list_t(posting_t::MAX_BLOCK_ELEMENTS);

     size_t i = 0;
     while(i < length) {
@@ -244,9 +244,23 @@ void posting_t::upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& off
         compact_posting_list_t* list = (compact_posting_list_t*) RAW_POSTING_PTR(obj);
         int64_t extra_capacity_required = list->upsert(id, offsets);

-        if(extra_capacity_required != 0) {
+        if(extra_capacity_required == 0) {
+            // upsert succeeded
+            return;
+        }
+
+        if((list->capacity + extra_capacity_required) > COMPACT_LIST_THRESHOLD_LENGTH) {
+            // we have to convert to a full posting list
+            posting_list_t* full_list = list->to_full_posting_list();
+            free(list);
+            obj = full_list;
+        }
+
+        else {
             // grow the container by 30%
-            size_t new_capacity = (list->capacity + extra_capacity_required) * 1.3;
+            size_t new_capacity = std::min<size_t>((list->capacity + extra_capacity_required) * 1.3,
+                                                   COMPACT_LIST_THRESHOLD_LENGTH);
             size_t new_capacity_bytes = sizeof(compact_posting_list_t) + (new_capacity * sizeof(uint32_t));
             auto new_list = (compact_posting_list_t *) realloc(list, new_capacity_bytes);
             if(new_list == nullptr) {
@@ -258,20 +272,14 @@ void posting_t::upsert(void*& obj, uint32_t id, const std::vector<uint32_t>& off
             obj = SET_COMPACT_POSTING(list);

             list->upsert(id, offsets);
-        }
-
-        if(list->length > COMPACT_LIST_THRESHOLD_LENGTH) {
-            // we will store anything over this threshold as a full posting list
-            posting_list_t* full_list = list->to_full_posting_list();
-            free(list);
-            obj = full_list;
-            return;
+            return ;
         }
-
-    } else {
-        posting_list_t* list = (posting_list_t*)(obj);
-        list->upsert(id, offsets);
     }
+
+    // either `obj` is already a full list or was converted to a full list above
+    posting_list_t* list = (posting_list_t*)(obj);
+    list->upsert(id, offsets);
 }

 void posting_t::erase(void*& obj, uint32_t id) {
@@ -465,7 +473,6 @@ void posting_t::block_intersector_t::split_lists(size_t concurrency,
         // [3, 5] [6]

         if(i == 0) {
-            auto& plist = this->plists[i];
             p_start_block = start_block;
             p_end_block = curr_block->next;
         } else {
@@ -1657,3 +1657,43 @@ TEST_F(CollectionSpecificTest, CustomNumTyposConfiguration) {

     collectionManager.drop_collection("coll1");
 }
+
+TEST_F(CollectionSpecificTest, RepeatingStringArrayTokens) {
+    std::vector<std::string> tags;
+
+    // when the first document containing a token already cannot fit compact posting list
+    for(size_t i = 0; i < 200; i++) {
+        tags.emplace_back("spools");
+    }
+
+    std::vector<field> fields = {field("tags", field_types::STRING_ARRAY, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();
+
+    nlohmann::json doc;
+    doc["tags"] = tags;
+
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    auto results = coll1->search("spools", {"tags"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(1, results["hits"].size());
+
+    // when the second document containing a token cannot fit compact posting list
+    tags = {"foobar"};
+    doc["tags"] = tags;
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    for(size_t i = 0; i < 200; i++) {
+        tags.emplace_back("foobar");
+    }
+
+    doc["tags"] = tags;
+
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    results = coll1->search("foobar", {"tags"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {false}).get();
+    ASSERT_EQ(2, results["hits"].size());
+
+    collectionManager.drop_collection("coll1");
+}
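A back-of-the-envelope check for the test above, assuming each repeated "spools" value contributes at least one token offset (a sketch, not Typesense code): with 200 repetitions, the `2 + document->offsets.size()` check that make_leaf() now performs evaluates to at least 202, well past the 64-slot compact-list threshold, so the very first document already takes the full posting list path.

// Sketch only: mirrors the threshold check added to make_leaf() in this commit.
#include <cassert>
#include <cstddef>

int main() {
    constexpr size_t COMPACT_LIST_THRESHOLD_LENGTH = 64;  // from posting.h in this commit
    constexpr size_t min_offsets = 200;                   // >= one offset per repeated tag value

    // 2 + 200 = 202 > 64 -> the leaf starts life as a full posting_list_t
    assert(2 + min_offsets > COMPACT_LIST_THRESHOLD_LENGTH);
    return 0;
}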