Array append/remove perf improvements.

This commit is contained in:
Kishore Nallan 2020-10-20 06:55:58 +05:30
parent 0508700ca3
commit bccc3ac362
6 changed files with 296 additions and 23 deletions

View File

@ -52,5 +52,7 @@ public:
bool insert(size_t index, uint32_t value);
void remove_value(uint32_t value);
void remove_values(uint32_t *sorted_values, uint32_t sorted_values_length);
};

View File

@ -1911,7 +1911,6 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), key, key_len);
if(leaf != nullptr) {
uint32_t seq_id_values[1] = {seq_id};
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
if(doc_index == leaf->values->ids.getLength()) {
@ -1928,7 +1927,7 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);
leaf->values->offsets.remove_index(start_offset, end_offset);
leaf->values->ids.remove_values(seq_id_values, 1);
leaf->values->ids.remove_value(seq_id);
/*len = leaf->values->offset_index.getLength();
for(auto i=0; i<len; i++) {

View File

@ -23,24 +23,21 @@ size_t sorted_array::append(uint32_t value) {
if(value < max) {
// we will have to re-encode the whole sequence again
uint32_t* arr = uncompress(length+1);
size_t i = 0;
while(i < length+1) {
if(value < arr[i]) {
break;
}
i++;
}
for(size_t j=length; j>i; j--) {
// find the index of the element which is >= to `value`
uint32_t found_val;
uint32_t gte_index = for_lower_bound_search(in, length, value, &found_val);
for(size_t j=length; j>gte_index; j--) {
arr[j] = arr[j-1];
}
arr[i] = value;
arr[gte_index] = value;
load(arr, length+1);
delete [] arr;
return i;
return gte_index;
} else {
uint32_t size_required = sorted_append_size_required(value, length+1);
size_t min_expected_size = size_required + FOR_ELE_SIZE;
@ -104,7 +101,11 @@ uint32_t sorted_array::indexOf(uint32_t value) {
uint32_t actual;
uint32_t index = for_lower_bound_search(in, length, value, &actual);
if(actual == value) return index;
if(actual == value) {
return index;
}
return length;
}
@ -193,6 +194,28 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint
binary_search_indices(values, head, tail, low_index, high_index, base, bits, indices);
}
// Removes `value` from the sorted array if it is present; otherwise leaves the array untouched.
//
// The sequence is uncompressed, the matching element is dropped by shifting the tail one
// position to the left, and the shortened sequence is handed back to `load()`.
void sorted_array::remove_value(uint32_t value) {
    // Nothing can be removed from an empty array. Bail out before searching, so that the
    // lower bound search is never asked to look through a zero-length sequence.
    if(length == 0) {
        return ;
    }

    // A lower bound search returns the first element in the sequence that is >= `value`
    // So, `found_val` will be either equal or greater than `value`.
    // `found_val` starts out as something that can never equal `value`, so a search that
    // does not write to it is treated as "not found".
    uint32_t found_val = ~value;
    uint32_t found_index = for_lower_bound_search(in, length, value, &found_val);

    // The index check is defensive: it keeps the shift below within the bounds of the array.
    if(found_val != value || found_index >= length) {
        return ;
    }

    uint32_t *curr_array = uncompress();

    // Close the gap left by the removed element, unless it was the last one.
    const size_t num_trailing = length - found_index - 1;
    if(num_trailing > 0) {
        memmove(&curr_array[found_index], &curr_array[found_index+1], sizeof(uint32_t) * num_trailing);
    }

    // `length` is guaranteed to be non-zero here, see the guard at the top.
    size_t new_length = length - 1;
    load(curr_array, new_length);

    // the uncompressed copy is owned by this function
    delete [] curr_array;
}
void sorted_array::remove_values(uint32_t *sorted_values, uint32_t sorted_values_length) {
uint32_t *curr_array = uncompress();

View File

@ -14,6 +14,9 @@ protected:
CollectionManager & collectionManager = CollectionManager::get_instance();
std::vector<sort_by> sort_fields;
// used for generating random text
std::vector<std::string> words;
void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/collection";
LOG(INFO) << "Truncating and creating: " << state_dir_path;
@ -48,6 +51,12 @@ protected:
}
infile.close();
std::ifstream words_file(std::string(ROOT_DIR)+"test/resources/common100_english.txt");
std::stringstream strstream;
strstream << words_file.rdbuf();
words_file.close();
StringUtils::split(strstream.str(), words, "\n");
}
virtual void SetUp() {
@ -59,6 +68,18 @@ protected:
collectionManager.dispose();
delete store;
}
// Generates a string of `num_words` space separated words, each picked at random from `words`.
// Returns an empty string when no words have been loaded.
std::string get_text(size_t num_words) {
    // Without any words there is nothing to pick from (and `rand() % 0` would be undefined).
    if(words.empty()) {
        return "";
    }

    // Seed the generator only once: re-seeding with the current time on every call makes all
    // calls that happen within the same second return exactly the same text.
    static bool seeded = false;
    if(!seeded) {
        srand((unsigned) time(nullptr));
        seeded = true;
    }

    std::vector<std::string> strs;
    strs.reserve(num_words);

    for(size_t i = 0 ; i < num_words ; i++ ) {
        // index by the number of words actually loaded instead of assuming exactly 100 entries
        size_t word_index = rand() % words.size();
        strs.push_back(words[word_index]);
    }

    return StringUtils::join(strs, " ");
}
};
TEST_F(CollectionTest, VerifyCountOfDocuments) {
@ -1410,6 +1431,90 @@ TEST_F(CollectionTest, ImportDocumentsUpsert) {
ASSERT_EQ(70, results["hits"][0]["document"]["points"].get<uint32_t>());
}
// Imports records that lack the optional `title` field, then upserts a randomly generated
// `title` for every record twice in a row. Both upserts are timed, but the timings are only
// available through the (commented out) log statements.
TEST_F(CollectionTest, ImportDocumentsUpsertOptional) {
    size_t NUM_RECORDS = 1000;

    std::vector<field> fields = {
        field("title", field_types::STRING_ARRAY, false, true),
        field("points", field_types::INT32, false)
    };

    Collection *coll1 = collectionManager.get_collection("coll1");
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    // builds one upsert payload per record, each carrying four freshly generated titles
    auto build_title_upserts = [&]() {
        std::vector<std::string> upserts;
        for(size_t id = 0; id < NUM_RECORDS; id++) {
            nlohmann::json updoc;
            updoc["id"] = std::to_string(id);
            updoc["title"] = {
                get_text(10),
                get_text(10),
                get_text(10),
                get_text(10),
            };
            upserts.push_back(updoc.dump());
        }
        return upserts;
    };

    // import records without title
    std::vector<std::string> records;
    for(size_t id = 0; id < NUM_RECORDS; id++) {
        nlohmann::json doc;
        doc["id"] = std::to_string(id);
        doc["points"] = id;
        records.push_back(doc.dump());
    }

    nlohmann::json document;
    nlohmann::json import_response = coll1->add_many(records, document, false);
    ASSERT_TRUE(import_response["success"].get<bool>());
    ASSERT_EQ(1000, import_response["num_imported"].get<int>());

    // upsert documents with title
    records = build_title_upserts();

    auto begin = std::chrono::high_resolution_clock::now();
    import_response = coll1->add_many(records, document, true);
    auto elapsed = std::chrono::high_resolution_clock::now() - begin;
    auto time_micros = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();

    //LOG(INFO) << "Time taken for first upsert: " << time_micros;

    ASSERT_TRUE(import_response["success"].get<bool>());
    ASSERT_EQ(1000, import_response["num_imported"].get<int>());

    // run upsert again with title override
    records = build_title_upserts();

    begin = std::chrono::high_resolution_clock::now();
    import_response = coll1->add_many(records, document, true);
    elapsed = std::chrono::high_resolution_clock::now() - begin;
    time_micros = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();

    //LOG(INFO) << "Time taken for second upsert: " << time_micros;

    ASSERT_TRUE(import_response["success"].get<bool>());
    ASSERT_EQ(1000, import_response["num_imported"].get<int>());
}
TEST_F(CollectionTest, ImportDocuments) {
Collection *coll_mul_fields;

View File

@ -0,0 +1,100 @@
the
of
to
and
a
in
is
it
you
that
he
was
for
on
are
with
as
I
his
they
be
at
one
have
this
from
or
had
by
not
word
but
what
some
we
can
out
other
were
all
there
when
up
use
your
how
said
an
each
she
which
do
their
time
if
will
way
about
many
then
them
write
would
like
so
these
her
long
make
thing
see
him
two
has
look
more
day
could
go
come
did
number
sound
no
most
people
my
over
know
water
than
call
first
who
may
down
side
been
now
find

View File

@ -12,7 +12,8 @@ TEST(SortedArrayTest, Append) {
EXPECT_EQ(arr.indexOf(100), 0); // when not found must be equal to length (0 in this case)
for(uint32_t i=0; i < SIZE; i++) {
arr.append(i);
size_t appended_index = arr.append(i);
ASSERT_EQ(i, appended_index);
}
EXPECT_EQ(arr.getLength(), SIZE);
@ -28,7 +29,8 @@ TEST(SortedArrayTest, Append) {
EXPECT_EQ(arr.indexOf(SIZE+1), SIZE);
sorted_array arr_small;
arr_small.append(100);
size_t appended_index = arr_small.append(100);
EXPECT_EQ(0, appended_index);
EXPECT_EQ(arr_small.getLength(), 1);
EXPECT_EQ(arr_small.at(0), 100);
}
@ -36,18 +38,34 @@ TEST(SortedArrayTest, Append) {
TEST(SortedArrayTest, AppendOutOfOrder) {
sorted_array arr;
for(size_t i=5; i<=10; i++) {
arr.append(i);
size_t appended_index = arr.append(i);
ASSERT_EQ(i-5, appended_index);
}
EXPECT_EQ(6, arr.getLength());
arr.append(1);
arr.append(3);
arr.append(2);
arr.append(4);
arr.append(11);
arr.append(14);
arr.append(12);
int appended_index = -1;
appended_index = arr.append(1);
ASSERT_EQ(0, appended_index);
appended_index = arr.append(3);
ASSERT_EQ(1, appended_index);
appended_index = arr.append(2);
ASSERT_EQ(1, appended_index);
appended_index = arr.append(4);
ASSERT_EQ(3, appended_index);
appended_index = arr.append(11);
ASSERT_EQ(10, appended_index);
appended_index = arr.append(14);
ASSERT_EQ(11, appended_index);
appended_index = arr.append(12);
ASSERT_EQ(11, appended_index);
EXPECT_EQ(13, arr.getLength());
}
@ -136,6 +154,32 @@ TEST(SortedArrayTest, Uncompress) {
delete[] raw_sorted_arr;
}
// Fills an array with 0..SIZE-1, removes five values (first, last and three in between) and
// checks that the array shrank by five and that none of the removed values is still stored.
TEST(SortedArrayTest, RemoveValue) {
    const size_t SIZE = 10*1000;
    sorted_array arr;

    for(size_t val = 0; val < SIZE; val++) {
        arr.append(val);
    }

    const uint32_t removed[5] = {0, 100, 1000, 2000, SIZE-1};
    for(uint32_t gone : removed) {
        arr.remove_value(gone);
    }

    const size_t remaining = SIZE-5;
    ASSERT_EQ(arr.getLength(), remaining);

    // every surviving element must differ from each of the removed values
    for(size_t pos = 0; pos < remaining; pos++) {
        const uint32_t value = arr.at(pos);
        for(uint32_t gone : removed) {
            ASSERT_FALSE(value == gone);
        }
    }
}
TEST(SortedArrayTest, RemoveValues) {
sorted_array arr;