From 2051ac6c2f7b73fc5eeddb9116c51545447d990b Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 4 Feb 2017 11:29:05 +0530 Subject: [PATCH] Refactor forarray - split into individual classes. --- CMakeLists.txt | 2 +- TODO.md | 2 +- include/array.h | 30 +++++ include/{forarray.h => array_base.h} | 27 +--- include/art.h | 9 +- include/sorted_array.h | 36 +++++ src/array.cpp | 75 +++++++++++ src/array_base.cpp | 15 +++ src/art.cpp | 6 +- src/collection.cpp | 8 +- src/forarray.cpp | 161 ---------------------- src/sorted_array.cpp | 86 ++++++++++++ test/array_test.cpp | 114 ++++++++++++++++ test/forarray_test.cpp | 194 --------------------------- test/sorted_array_test.cpp | 91 +++++++++++++ 15 files changed, 465 insertions(+), 391 deletions(-) create mode 100644 include/array.h rename include/{forarray.h => array_base.h} (69%) create mode 100644 include/sorted_array.h create mode 100644 src/array.cpp create mode 100644 src/array_base.cpp delete mode 100644 src/forarray.cpp create mode 100644 src/sorted_array.cpp create mode 100644 test/array_test.cpp delete mode 100644 test/forarray_test.cpp create mode 100644 test/sorted_array_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 52dfd61c..c18a7f95 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,7 +38,7 @@ link_directories(${CMAKE_SOURCE_DIR}/external/${ROCKSDB_NAME}) add_executable(typesense-server ${HEADER_FILES} ${SRC_FILES} src/main/server.cpp) add_executable(search ${HEADER_FILES} ${SRC_FILES} src/main/main.cpp) add_executable(benchmark ${HEADER_FILES} ${SRC_FILES} src/main/benchmark.cpp) -add_executable(typesense_test test/forarray_test.cpp test/art_test.cpp +add_executable(typesense_test test/array_test.cpp test/sorted_array_test.cpp test/art_test.cpp test/collection_test.cpp test/collection_manager_test.cpp test/topster_test.cpp ${SRC_FILES}) target_compile_definitions(typesense-server PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/") diff --git a/TODO.md b/TODO.md index 7955c222..921f4b96 100644 --- a/TODO.md +++ b/TODO.md @@ -37,7 +37,7 @@ - Minimum results should be a variable instead of blindly going with max_results - Benchmark with -ffast-math - Space sensitivity -- Use bitmap index instead of forarray for doc list +- Use bitmap index instead of compressed array for doc list - Throw errors when schema is broken - Assumption that all tokens match for scoring is no longer true - Primary_rank_scores and secondary_rank_scores hashmaps should be combined diff --git a/include/array.h b/include/array.h new file mode 100644 index 00000000..c390705a --- /dev/null +++ b/include/array.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "array_base.h" + +class array: public array_base { +private: + uint32_t inline unsorted_append_size_required(uint32_t value, uint32_t new_length) { + uint32_t m = std::min(min, value); + uint32_t M = std::max(max, value); + uint32_t bnew = required_bits(M - m); + return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew); + } + +public: + uint32_t at(uint32_t index); + + bool contains(uint32_t value); + + uint32_t indexOf(uint32_t value); + + bool append(uint32_t value); + + void remove_index(uint32_t start_index, uint32_t end_index); +}; \ No newline at end of file diff --git a/include/forarray.h b/include/array_base.h similarity index 69% rename from include/forarray.h rename to include/array_base.h index 14a51564..a0ad124c 100644 --- a/include/forarray.h +++ b/include/array_base.h @@ -11,8 +11,8 @@ #define FOR_ELE_SIZE sizeof(uint32_t) #define METADATA_OVERHEAD 5 -class forarray { -private: +class array_base { +protected: uint8_t* in; uint32_t size_bytes = 0; // allocated size uint32_t length_bytes = 0; // actual size @@ -39,38 +39,19 @@ private: } public: - forarray(const uint32_t n=2) { + array_base(const uint32_t n=2) { size_bytes = METADATA_OVERHEAD + (n * FOR_ELE_SIZE); in = new uint8_t[size_bytes]; memset(in, 0, size_bytes); } - ~forarray() { + ~array_base() { delete[] in; in = nullptr; } - // FIXME: this should be a constructor instead of a setter - void load_sorted(const uint32_t *sorted_array, const uint32_t array_length); - - // returns false if malloc fails - bool append_sorted(uint32_t value); - - bool append_unsorted(uint32_t value); - - uint32_t at(uint32_t index); - - // FIXME: contains and indexOf are meant only for sorted arrays - bool contains(uint32_t value); - - uint32_t indexOf(uint32_t value); - uint32_t* uncompress(); - void remove_index_unsorted(uint32_t start_index, uint32_t end_index); - - void remove_values_sorted(uint32_t *sorted_values, uint32_t values_length); - uint32_t getSizeInBytes(); uint32_t getLength(); diff --git a/include/art.h b/include/art.h index cab6dec6..05ae078c 100644 --- a/include/art.h +++ b/include/art.h @@ -4,7 +4,8 @@ #include #include #include -#include "forarray.h" +#include "array.h" +#include "sorted_array.h" #define IGNORE_PRINTF 1 @@ -87,9 +88,9 @@ typedef struct { * Container for holding the documents that belong to a leaf. */ typedef struct { - forarray ids; - forarray offset_index; - forarray offsets; + sorted_array ids; + sorted_array offset_index; + array offsets; } art_values; /* diff --git a/include/sorted_array.h b/include/sorted_array.h new file mode 100644 index 00000000..d41c9640 --- /dev/null +++ b/include/sorted_array.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "array_base.h" + +class sorted_array: public array_base { +private: + + uint32_t inline sorted_append_size_required(uint32_t value, uint32_t new_length) { + uint32_t m = std::min(min, value); + uint32_t M = std::max(max, value); + uint32_t bnew = required_bits(M - m); + return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew); + } + +public: + + // FIXME: this should be a constructor instead of a setter + void load(const uint32_t *sorted_array, const uint32_t array_length); + + uint32_t at(uint32_t index); + + bool contains(uint32_t value); + + uint32_t indexOf(uint32_t value); + + // returns false if malloc fails + bool append(uint32_t value); + + void remove_values(uint32_t *sorted_values, uint32_t values_length); +}; \ No newline at end of file diff --git a/src/array.cpp b/src/array.cpp new file mode 100644 index 00000000..bc901ff7 --- /dev/null +++ b/src/array.cpp @@ -0,0 +1,75 @@ +#include "array.h" + +uint32_t array::at(uint32_t index) { + return for_select(in, index); +} + +bool array::contains(uint32_t value) { + uint32_t index = for_linear_search(in, length, value); + return index != length; +} + +uint32_t array::indexOf(uint32_t value) { + return for_linear_search(in, length, value); +} + +bool array::append(uint32_t value) { + uint32_t size_required = unsorted_append_size_required(value, length+1); + + if(size_required+FOR_ELE_SIZE > size_bytes) { + // grow the array first + size_t new_size = (size_t) (size_required * FOR_GROWTH_FACTOR); + uint8_t *new_location = (uint8_t *) realloc(in, new_size); + if(new_location == NULL) { + abort(); + } + in = new_location; + size_bytes = (uint32_t) new_size; + } + + uint32_t new_length_bytes = for_append_unsorted(in, length, value); + if(new_length_bytes == 0) { + abort(); + } + + if(value < min) min = value; + if(value > max) max = value; + + length_bytes = new_length_bytes; + length++; + + return true; +} + +void array::remove_index(uint32_t start_index, uint32_t end_index) { + uint32_t *curr_array = uncompress(); + + uint32_t *new_array = new uint32_t[length]; + uint32_t new_index = 0; + uint32_t curr_index = 0; + + min = std::numeric_limits::max(); + max = std::numeric_limits::min(); + + while(curr_index < length) { + if(curr_index < start_index || curr_index >= end_index) { + new_array[new_index++] = curr_array[curr_index]; + if(curr_array[curr_index] < min) min = curr_array[curr_index]; + if(curr_array[curr_index] > max) max = curr_array[curr_index]; + } + curr_index++; + } + + uint32_t size_required = (uint32_t) (unsorted_append_size_required(max, new_index) * FOR_GROWTH_FACTOR); + uint8_t *out = new uint8_t[size_required]; + uint32_t actual_size = for_compress_unsorted(new_array, out, new_index); + + delete[] curr_array; + delete[] new_array; + delete[] in; + + in = out; + length = new_index; + size_bytes = size_required; + length_bytes = actual_size; +} \ No newline at end of file diff --git a/src/array_base.cpp b/src/array_base.cpp new file mode 100644 index 00000000..4a9a29ff --- /dev/null +++ b/src/array_base.cpp @@ -0,0 +1,15 @@ +#include "array_base.h" + +uint32_t* array_base::uncompress() { + uint32_t *out = new uint32_t[length]; + for_uncompress(in, out, length); + return out; +} + +uint32_t array_base::getSizeInBytes() { + return size_bytes; +} + +uint32_t array_base::getLength() { + return length; +} diff --git a/src/art.cpp b/src/art.cpp index fa3f012c..07dbcf36 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -416,12 +416,12 @@ art_leaf* art_maximum(art_tree *t) { static void add_document_to_leaf(const art_document *document, art_leaf *leaf) { leaf->max_score = MAX(leaf->max_score, document->score); - leaf->values->ids.append_sorted(document->id); + leaf->values->ids.append(document->id); uint32_t curr_index = leaf->values->offsets.getLength(); - leaf->values->offset_index.append_sorted(curr_index); + leaf->values->offset_index.append(curr_index); for(uint32_t i=0; ioffsets_len; i++) { - leaf->values->offsets.append_unsorted(document->offsets[i]); + leaf->values->offsets.append(document->offsets[i]); } } diff --git a/src/collection.cpp b/src/collection.cpp index 8b31de71..d20028ef 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -520,7 +520,7 @@ inline std::vector Collection::next_suggestion(const std::vectorvalues->offset_index, doc_indices, 1); - leaf->values->offsets.remove_index_unsorted(start_offset, end_offset); - leaf->values->ids.remove_values_sorted(seq_id_values, 1); + leaf->values->offsets.remove_index(start_offset, end_offset); + leaf->values->ids.remove_values(seq_id_values, 1); /*len = leaf->values->offset_index.getLength(); for(auto i=0; i 1 ? sorted_array[array_length-1] : min; - - uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR); - uint8_t *out = new uint8_t[size_required]; - uint32_t actual_size = for_compress_sorted(sorted_array, out, array_length); - - delete[] in; - in = nullptr; - - in = out; - length = array_length; - size_bytes = size_required; - length_bytes = actual_size; -} - -bool forarray::append_sorted(uint32_t value) { - uint32_t size_required = sorted_append_size_required(value, length+1); - - if(size_required+FOR_ELE_SIZE > size_bytes) { - // grow the array first - size_t new_size = (size_t) (size_required * FOR_GROWTH_FACTOR); - uint8_t *new_location = (uint8_t *) realloc(in, new_size); - if(new_location == NULL) { - abort(); - } - in = new_location; - size_bytes = (uint32_t) new_size; - } - - uint32_t new_length_bytes = for_append_sorted(in, length, value); - if(new_length_bytes == 0) return false; - - length_bytes = new_length_bytes; - length++; - - if(value < min) min = value; - if(value > max) max = value; - - return true; -} - -bool forarray::append_unsorted(uint32_t value) { - uint32_t size_required = unsorted_append_size_required(value, length+1); - - if(size_required+FOR_ELE_SIZE > size_bytes) { - // grow the array first - size_t new_size = (size_t) (size_required * FOR_GROWTH_FACTOR); - uint8_t *new_location = (uint8_t *) realloc(in, new_size); - if(new_location == NULL) { - abort(); - } - in = new_location; - size_bytes = (uint32_t) new_size; - } - - uint32_t new_length_bytes = for_append_unsorted(in, length, value); - if(new_length_bytes == 0) { - abort(); - } - - if(value < min) min = value; - if(value > max) max = value; - - length_bytes = new_length_bytes; - length++; - - return true; -} - -uint32_t forarray::at(uint32_t index) { - return for_select(in, index); -} - -bool forarray::contains(uint32_t value) { - uint32_t actual; - for_lower_bound_search(in, length, value, &actual); - return actual == value; -} - -uint32_t forarray::indexOf(uint32_t value) { - uint32_t actual; - uint32_t index = for_lower_bound_search(in, length, value, &actual); - if(actual == value) return index; - return length; -} - -uint32_t* forarray::uncompress() { - uint32_t *out = new uint32_t[length]; - for_uncompress(in, out, length); - return out; -} - -void forarray::remove_index_unsorted(uint32_t start_index, uint32_t end_index) { - uint32_t *curr_array = uncompress(); - - uint32_t *new_array = new uint32_t[length]; - uint32_t new_index = 0; - uint32_t curr_index = 0; - - min = std::numeric_limits::max(); - max = std::numeric_limits::min(); - - while(curr_index < length) { - if(curr_index < start_index || curr_index >= end_index) { - new_array[new_index++] = curr_array[curr_index]; - if(curr_array[curr_index] < min) min = curr_array[curr_index]; - if(curr_array[curr_index] > max) max = curr_array[curr_index]; - } - curr_index++; - } - - uint32_t size_required = (uint32_t) (unsorted_append_size_required(max, new_index) * FOR_GROWTH_FACTOR); - uint8_t *out = new uint8_t[size_required]; - uint32_t actual_size = for_compress_unsorted(new_array, out, new_index); - - delete[] curr_array; - delete[] new_array; - delete[] in; - - in = out; - length = new_index; - size_bytes = size_required; - length_bytes = actual_size; -} - -void forarray::remove_values_sorted(uint32_t *sorted_values, uint32_t values_length) { - uint32_t *curr_array = uncompress(); - - uint32_t *new_array = new uint32_t[length]; - uint32_t new_index = 0; - uint32_t curr_index = 0; - uint32_t sorted_values_index = 0; - - while(curr_index < length) { - if(sorted_values_index < values_length && curr_array[curr_index] >= sorted_values[sorted_values_index]) { - // skip copying - if(curr_array[curr_index] == sorted_values[sorted_values_index]) { - curr_index++; - } - sorted_values_index++; - } else { - new_array[new_index++] = curr_array[curr_index++]; - } - } - - load_sorted(new_array, new_index); - delete[] curr_array; - delete[] new_array; -} - -uint32_t forarray::getSizeInBytes() { - return size_bytes; -} - -uint32_t forarray::getLength() { - return length; -} diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp new file mode 100644 index 00000000..36a606d7 --- /dev/null +++ b/src/sorted_array.cpp @@ -0,0 +1,86 @@ +#include "sorted_array.h" + +void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_length) { + min = sorted_array[0]; + max = array_length > 1 ? sorted_array[array_length-1] : min; + + uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR); + uint8_t *out = new uint8_t[size_required]; + uint32_t actual_size = for_compress_sorted(sorted_array, out, array_length); + + delete[] in; + in = nullptr; + + in = out; + length = array_length; + size_bytes = size_required; + length_bytes = actual_size; +} + +bool sorted_array::append(uint32_t value) { + uint32_t size_required = sorted_append_size_required(value, length+1); + + if(size_required+FOR_ELE_SIZE > size_bytes) { + // grow the array first + size_t new_size = (size_t) (size_required * FOR_GROWTH_FACTOR); + uint8_t *new_location = (uint8_t *) realloc(in, new_size); + if(new_location == NULL) { + abort(); + } + in = new_location; + size_bytes = (uint32_t) new_size; + } + + uint32_t new_length_bytes = for_append_sorted(in, length, value); + if(new_length_bytes == 0) return false; + + length_bytes = new_length_bytes; + length++; + + if(value < min) min = value; + if(value > max) max = value; + + return true; +} + +uint32_t sorted_array::at(uint32_t index) { + return for_select(in, index); +} + +bool sorted_array::contains(uint32_t value) { + uint32_t actual; + for_lower_bound_search(in, length, value, &actual); + return actual == value; +} + +uint32_t sorted_array::indexOf(uint32_t value) { + uint32_t actual; + uint32_t index = for_lower_bound_search(in, length, value, &actual); + if(actual == value) return index; + return length; +} + +void sorted_array::remove_values(uint32_t *sorted_values, uint32_t values_length) { + uint32_t *curr_array = uncompress(); + + uint32_t *new_array = new uint32_t[length]; + uint32_t new_index = 0; + uint32_t curr_index = 0; + uint32_t sorted_values_index = 0; + + while(curr_index < length) { + if(sorted_values_index < values_length && curr_array[curr_index] >= sorted_values[sorted_values_index]) { + // skip copying + if(curr_array[curr_index] == sorted_values[sorted_values_index]) { + curr_index++; + } + sorted_values_index++; + } else { + new_array[new_index++] = curr_array[curr_index++]; + } + } + + load(new_array, new_index); + delete[] curr_array; + delete[] new_array; +} \ No newline at end of file diff --git a/test/array_test.cpp b/test/array_test.cpp new file mode 100644 index 00000000..d4eebd15 --- /dev/null +++ b/test/array_test.cpp @@ -0,0 +1,114 @@ +#include +#include "array.h" +#include + +TEST(ArrayTest, Append) { + array arr; + int SIZE = 10 * 1000; + + EXPECT_EQ(arr.getLength(), 0); + + // First try inserting sorted ints + + for(uint32_t i=0; i < SIZE; i++) { + arr.append(i); + } + + EXPECT_EQ(arr.getLength(), SIZE); + + for(uint32_t i=0; i < SIZE; i++) { + EXPECT_EQ(arr.at(i), i); + EXPECT_EQ(arr.indexOf(i), i); + EXPECT_EQ(arr.contains(i), true); + } + + EXPECT_EQ(arr.contains(SIZE), false); + EXPECT_EQ(arr.indexOf(SIZE), SIZE); + EXPECT_EQ(arr.indexOf(SIZE+1), SIZE); + + // Insert in unsorted fashion + array arr2; + + std::vector unsorted; + + for(uint32_t i=0; i < SIZE; i++) { + uint32_t r = (uint32_t) rand(); + unsorted.push_back(r); + arr2.append(r); + } + + EXPECT_EQ(arr2.getLength(), SIZE); + + for(uint32_t i=0; i < SIZE; i++) { + uint32_t value = unsorted.at(i); + EXPECT_EQ(arr2.at(i), value); + } +} + +TEST(ArrayTest, Uncompress) { + const size_t SIZE = 10*1000; + + array unsorted_arr; + std::vector unsorted; + + for(size_t i=0; i unsorted; + + for(size_t i=0; i -#include "forarray.h" -#include - -TEST(ForarrayTest, AppendSorted) { - forarray arr; - const int SIZE = 10 * 1000; - - EXPECT_EQ(arr.getLength(), 0); - - for(uint32_t i=0; i < SIZE; i++) { - arr.append_sorted(i); - } - - EXPECT_EQ(arr.getLength(), SIZE); - - for(uint32_t i=0; i < SIZE; i++) { - EXPECT_EQ(arr.at(i), i); - EXPECT_EQ(arr.indexOf(i), i); - EXPECT_EQ(arr.contains(i), true); - } - - forarray arr_small; - arr_small.append_sorted(100); - EXPECT_EQ(arr_small.getLength(), 1); - EXPECT_EQ(arr_small.at(0), 100); -} - -TEST(ForarrayTest, AppendUnsorted) { - forarray arr; - int SIZE = 10 * 1000; - - EXPECT_EQ(arr.getLength(), 0); - - // First try inserting sorted ints - - for(uint32_t i=0; i < SIZE; i++) { - arr.append_unsorted(i); - } - - EXPECT_EQ(arr.getLength(), SIZE); - - for(uint32_t i=0; i < SIZE; i++) { - EXPECT_EQ(arr.at(i), i); - EXPECT_EQ(arr.indexOf(i), i); - EXPECT_EQ(arr.contains(i), true); - } - - // Insert in unsorted fashion - forarray arr2; - - std::vector unsorted; - - for(uint32_t i=0; i < SIZE; i++) { - uint32_t r = (uint32_t) rand(); - unsorted.push_back(r); - arr2.append_unsorted(r); - } - - EXPECT_EQ(arr2.getLength(), SIZE); - - for(uint32_t i=0; i < SIZE; i++) { - uint32_t value = unsorted.at(i); - EXPECT_EQ(arr2.at(i), value); - } -} - -TEST(ForarrayTest, LoadSorted) { - forarray arr; - - // To ensure that previous contents are erased - arr.append_sorted(100); - arr.append_sorted(200); - - const size_t SIZE = 10*1000; - uint32_t *array = new uint32_t[SIZE]; - - for(size_t i=0; i unsorted; - - for(size_t i=0; i unsorted; - - for(size_t i=0; i +#include "sorted_array.h" +#include + +TEST(SortedArrayTest, Append) { + sorted_array arr; + const int SIZE = 10 * 1000; + + EXPECT_EQ(arr.getLength(), 0); + + for(uint32_t i=0; i < SIZE; i++) { + arr.append(i); + } + + EXPECT_EQ(arr.getLength(), SIZE); + + for(uint32_t i=0; i < SIZE; i++) { + EXPECT_EQ(arr.at(i), i); + EXPECT_EQ(arr.indexOf(i), i); + EXPECT_EQ(arr.contains(i), true); + } + + EXPECT_EQ(arr.contains(SIZE), false); + EXPECT_EQ(arr.indexOf(SIZE), SIZE); + EXPECT_EQ(arr.indexOf(SIZE+1), SIZE); + + sorted_array arr_small; + arr_small.append(100); + EXPECT_EQ(arr_small.getLength(), 1); + EXPECT_EQ(arr_small.at(0), 100); +} + +TEST(SortedArrayTest, Load) { + sorted_array arr; + + // To ensure that previous contents are erased + arr.append(100); + arr.append(200); + + const size_t SIZE = 10*1000; + uint32_t *array = new uint32_t[SIZE]; + + for(size_t i=0; i