mirror of
https://github.com/typesense/typesense.git
synced 2025-05-17 20:22:32 +08:00
Array append/remove perf improvements.
This commit is contained in:
parent
0508700ca3
commit
bccc3ac362
@ -52,5 +52,7 @@ public:
|
||||
|
||||
bool insert(size_t index, uint32_t value);
|
||||
|
||||
void remove_value(uint32_t value);
|
||||
|
||||
void remove_values(uint32_t *sorted_values, uint32_t sorted_values_length);
|
||||
};
|
@ -1911,7 +1911,6 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
|
||||
|
||||
art_leaf* leaf = (art_leaf *) art_search(search_index.at(field_name), key, key_len);
|
||||
if(leaf != nullptr) {
|
||||
uint32_t seq_id_values[1] = {seq_id};
|
||||
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
|
||||
|
||||
if(doc_index == leaf->values->ids.getLength()) {
|
||||
@ -1928,7 +1927,7 @@ Option<uint32_t> Index::remove(const uint32_t seq_id, const nlohmann::json & doc
|
||||
remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);
|
||||
|
||||
leaf->values->offsets.remove_index(start_offset, end_offset);
|
||||
leaf->values->ids.remove_values(seq_id_values, 1);
|
||||
leaf->values->ids.remove_value(seq_id);
|
||||
|
||||
/*len = leaf->values->offset_index.getLength();
|
||||
for(auto i=0; i<len; i++) {
|
||||
|
@ -23,24 +23,21 @@ size_t sorted_array::append(uint32_t value) {
|
||||
if(value < max) {
|
||||
// we will have to re-encode the whole sequence again
|
||||
uint32_t* arr = uncompress(length+1);
|
||||
size_t i = 0;
|
||||
while(i < length+1) {
|
||||
if(value < arr[i]) {
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
for(size_t j=length; j>i; j--) {
|
||||
// find the index of the element which is >= to `value`
|
||||
uint32_t found_val;
|
||||
uint32_t gte_index = for_lower_bound_search(in, length, value, &found_val);
|
||||
|
||||
for(size_t j=length; j>gte_index; j--) {
|
||||
arr[j] = arr[j-1];
|
||||
}
|
||||
|
||||
arr[i] = value;
|
||||
arr[gte_index] = value;
|
||||
|
||||
load(arr, length+1);
|
||||
delete [] arr;
|
||||
|
||||
return i;
|
||||
return gte_index;
|
||||
} else {
|
||||
uint32_t size_required = sorted_append_size_required(value, length+1);
|
||||
size_t min_expected_size = size_required + FOR_ELE_SIZE;
|
||||
@ -104,7 +101,11 @@ uint32_t sorted_array::indexOf(uint32_t value) {
|
||||
|
||||
uint32_t actual;
|
||||
uint32_t index = for_lower_bound_search(in, length, value, &actual);
|
||||
if(actual == value) return index;
|
||||
|
||||
if(actual == value) {
|
||||
return index;
|
||||
}
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
@ -193,6 +194,28 @@ void sorted_array::indexOf(const uint32_t *values, const size_t values_len, uint
|
||||
binary_search_indices(values, head, tail, low_index, high_index, base, bits, indices);
|
||||
}
|
||||
|
||||
void sorted_array::remove_value(uint32_t value) {
|
||||
// A lower bound search returns the first element in the sequence that is >= `value`
|
||||
// So, `found_val` will be either equal or greater than `value`
|
||||
uint32_t found_val;
|
||||
uint32_t found_index = for_lower_bound_search(in, length, value, &found_val);
|
||||
|
||||
if(found_val != value) {
|
||||
return ;
|
||||
}
|
||||
|
||||
uint32_t *curr_array = uncompress();
|
||||
|
||||
if(found_index + 1 < length) {
|
||||
memmove(&curr_array[found_index], &curr_array[found_index+1], sizeof(uint32_t) * (length - found_index - 1));
|
||||
}
|
||||
|
||||
size_t new_length = (length == 0) ? 0 : (length - 1);
|
||||
load(curr_array, new_length);
|
||||
|
||||
delete [] curr_array;
|
||||
}
|
||||
|
||||
void sorted_array::remove_values(uint32_t *sorted_values, uint32_t sorted_values_length) {
|
||||
uint32_t *curr_array = uncompress();
|
||||
|
||||
|
@ -14,6 +14,9 @@ protected:
|
||||
CollectionManager & collectionManager = CollectionManager::get_instance();
|
||||
std::vector<sort_by> sort_fields;
|
||||
|
||||
// used for generating random text
|
||||
std::vector<std::string> words;
|
||||
|
||||
void setupCollection() {
|
||||
std::string state_dir_path = "/tmp/typesense_test/collection";
|
||||
LOG(INFO) << "Truncating and creating: " << state_dir_path;
|
||||
@ -48,6 +51,12 @@ protected:
|
||||
}
|
||||
|
||||
infile.close();
|
||||
|
||||
std::ifstream words_file(std::string(ROOT_DIR)+"test/resources/common100_english.txt");
|
||||
std::stringstream strstream;
|
||||
strstream << words_file.rdbuf();
|
||||
words_file.close();
|
||||
StringUtils::split(strstream.str(), words, "\n");
|
||||
}
|
||||
|
||||
virtual void SetUp() {
|
||||
@ -59,6 +68,18 @@ protected:
|
||||
collectionManager.dispose();
|
||||
delete store;
|
||||
}
|
||||
|
||||
std::string get_text(size_t num_words) {
|
||||
time_t t;
|
||||
srand((unsigned) time(&t));
|
||||
std::vector<std::string> strs;
|
||||
|
||||
for(size_t i = 0 ; i < num_words ; i++ ) {
|
||||
int word_index = rand() % 100;
|
||||
strs.push_back(words[word_index]);
|
||||
}
|
||||
return StringUtils::join(strs, " ");
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(CollectionTest, VerifyCountOfDocuments) {
|
||||
@ -1410,6 +1431,90 @@ TEST_F(CollectionTest, ImportDocumentsUpsert) {
|
||||
ASSERT_EQ(70, results["hits"][0]["document"]["points"].get<uint32_t>());
|
||||
}
|
||||
|
||||
|
||||
// Imports 1000 documents that carry only `id` and `points`, then upserts a randomly
// generated `title` array onto every document twice. Each batch is expected to succeed
// and report all 1000 documents as imported.
TEST_F(CollectionTest, ImportDocumentsUpsertOptional) {
    const size_t NUM_RECORDS = 1000;

    std::vector<field> fields = {
        field("title", field_types::STRING_ARRAY, false, true),
        field("points", field_types::INT32, false)
    };

    Collection *coll1 = collectionManager.get_collection("coll1");
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    std::vector<std::string> records;

    // import records without title
    for(size_t id = 0; id < NUM_RECORDS; id++) {
        nlohmann::json doc;
        doc["id"] = std::to_string(id);
        doc["points"] = id;
        records.push_back(doc.dump());
    }

    nlohmann::json document;
    nlohmann::json import_response = coll1->add_many(records, document, false);
    ASSERT_TRUE(import_response["success"].get<bool>());
    ASSERT_EQ(1000, import_response["num_imported"].get<int>());

    // Replaces the contents of `records` with one upsert payload per document id,
    // each carrying four freshly generated 10-word titles.
    auto fill_title_records = [&]() {
        records.clear();

        for(size_t id = 0; id < NUM_RECORDS; id++) {
            nlohmann::json updoc;
            updoc["id"] = std::to_string(id);
            updoc["title"] = {
                get_text(10),
                get_text(10),
                get_text(10),
                get_text(10),
            };
            records.push_back(updoc.dump());
        }
    };

    // upsert documents with title
    fill_title_records();

    auto begin = std::chrono::high_resolution_clock::now();
    import_response = coll1->add_many(records, document, true);
    auto elapsed = std::chrono::high_resolution_clock::now() - begin;
    auto time_micros = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();

    //LOG(INFO) << "Time taken for first upsert: " << time_micros;

    ASSERT_TRUE(import_response["success"].get<bool>());
    ASSERT_EQ(1000, import_response["num_imported"].get<int>());

    // run upsert again with title override
    fill_title_records();

    begin = std::chrono::high_resolution_clock::now();
    import_response = coll1->add_many(records, document, true);
    elapsed = std::chrono::high_resolution_clock::now() - begin;
    time_micros = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();

    //LOG(INFO) << "Time taken for second upsert: " << time_micros;

    ASSERT_TRUE(import_response["success"].get<bool>());
    ASSERT_EQ(1000, import_response["num_imported"].get<int>());
}
|
||||
|
||||
TEST_F(CollectionTest, ImportDocuments) {
|
||||
Collection *coll_mul_fields;
|
||||
|
||||
|
100
test/resources/common100_english.txt
Normal file
100
test/resources/common100_english.txt
Normal file
@ -0,0 +1,100 @@
|
||||
the
|
||||
of
|
||||
to
|
||||
and
|
||||
a
|
||||
in
|
||||
is
|
||||
it
|
||||
you
|
||||
that
|
||||
he
|
||||
was
|
||||
for
|
||||
on
|
||||
are
|
||||
with
|
||||
as
|
||||
I
|
||||
his
|
||||
they
|
||||
be
|
||||
at
|
||||
one
|
||||
have
|
||||
this
|
||||
from
|
||||
or
|
||||
had
|
||||
by
|
||||
not
|
||||
word
|
||||
but
|
||||
what
|
||||
some
|
||||
we
|
||||
can
|
||||
out
|
||||
other
|
||||
were
|
||||
all
|
||||
there
|
||||
when
|
||||
up
|
||||
use
|
||||
your
|
||||
how
|
||||
said
|
||||
an
|
||||
each
|
||||
she
|
||||
which
|
||||
do
|
||||
their
|
||||
time
|
||||
if
|
||||
will
|
||||
way
|
||||
about
|
||||
many
|
||||
then
|
||||
them
|
||||
write
|
||||
would
|
||||
like
|
||||
so
|
||||
these
|
||||
her
|
||||
long
|
||||
make
|
||||
thing
|
||||
see
|
||||
him
|
||||
two
|
||||
has
|
||||
look
|
||||
more
|
||||
day
|
||||
could
|
||||
go
|
||||
come
|
||||
did
|
||||
number
|
||||
sound
|
||||
no
|
||||
most
|
||||
people
|
||||
my
|
||||
over
|
||||
know
|
||||
water
|
||||
than
|
||||
call
|
||||
first
|
||||
who
|
||||
may
|
||||
down
|
||||
side
|
||||
been
|
||||
now
|
||||
find
|
@ -12,7 +12,8 @@ TEST(SortedArrayTest, Append) {
|
||||
EXPECT_EQ(arr.indexOf(100), 0); // when not found must be equal to length (0 in this case)
|
||||
|
||||
for(uint32_t i=0; i < SIZE; i++) {
|
||||
arr.append(i);
|
||||
size_t appended_index = arr.append(i);
|
||||
ASSERT_EQ(i, appended_index);
|
||||
}
|
||||
|
||||
EXPECT_EQ(arr.getLength(), SIZE);
|
||||
@ -28,7 +29,8 @@ TEST(SortedArrayTest, Append) {
|
||||
EXPECT_EQ(arr.indexOf(SIZE+1), SIZE);
|
||||
|
||||
sorted_array arr_small;
|
||||
arr_small.append(100);
|
||||
size_t appended_index = arr_small.append(100);
|
||||
EXPECT_EQ(0, appended_index);
|
||||
EXPECT_EQ(arr_small.getLength(), 1);
|
||||
EXPECT_EQ(arr_small.at(0), 100);
|
||||
}
|
||||
@ -36,18 +38,34 @@ TEST(SortedArrayTest, Append) {
|
||||
TEST(SortedArrayTest, AppendOutOfOrder) {
|
||||
sorted_array arr;
|
||||
for(size_t i=5; i<=10; i++) {
|
||||
arr.append(i);
|
||||
size_t appended_index = arr.append(i);
|
||||
ASSERT_EQ(i-5, appended_index);
|
||||
}
|
||||
|
||||
EXPECT_EQ(6, arr.getLength());
|
||||
|
||||
arr.append(1);
|
||||
arr.append(3);
|
||||
arr.append(2);
|
||||
arr.append(4);
|
||||
arr.append(11);
|
||||
arr.append(14);
|
||||
arr.append(12);
|
||||
int appended_index = -1;
|
||||
|
||||
appended_index = arr.append(1);
|
||||
ASSERT_EQ(0, appended_index);
|
||||
|
||||
appended_index = arr.append(3);
|
||||
ASSERT_EQ(1, appended_index);
|
||||
|
||||
appended_index = arr.append(2);
|
||||
ASSERT_EQ(1, appended_index);
|
||||
|
||||
appended_index = arr.append(4);
|
||||
ASSERT_EQ(3, appended_index);
|
||||
|
||||
appended_index = arr.append(11);
|
||||
ASSERT_EQ(10, appended_index);
|
||||
|
||||
appended_index = arr.append(14);
|
||||
ASSERT_EQ(11, appended_index);
|
||||
|
||||
appended_index = arr.append(12);
|
||||
ASSERT_EQ(11, appended_index);
|
||||
|
||||
EXPECT_EQ(13, arr.getLength());
|
||||
}
|
||||
@ -136,6 +154,32 @@ TEST(SortedArrayTest, Uncompress) {
|
||||
delete[] raw_sorted_arr;
|
||||
}
|
||||
|
||||
// Fills a sorted_array with the values 0..SIZE-1, removes five of them (the first, the
// last and three in between) one at a time, then verifies that the length dropped by
// five and that none of the removed values can still be read back.
TEST(SortedArrayTest, RemoveValue) {
    const size_t SIZE = 10*1000;

    sorted_array arr;
    for(size_t v = 0; v < SIZE; v++) {
        arr.append(v);
    }

    uint32_t values[5] = {0, 100, 1000, 2000, SIZE-1};

    for(uint32_t to_remove : values) {
        arr.remove_value(to_remove);
    }

    ASSERT_EQ(arr.getLength(), SIZE-5);

    // scan every remaining slot and make sure no removed value survived
    for(size_t pos = 0; pos < SIZE-5; pos++) {
        uint32_t value = arr.at(pos);

        ASSERT_FALSE(value == 0);
        ASSERT_FALSE(value == 100);
        ASSERT_FALSE(value == 1000);
        ASSERT_FALSE(value == 2000);
        ASSERT_FALSE(value == SIZE-1);
    }
}
|
||||
|
||||
TEST(SortedArrayTest, RemoveValues) {
|
||||
sorted_array arr;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user