From 5b2407433f46d6c6243b8a9849920cf79181733b Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 5 Jun 2020 20:28:33 +0530 Subject: [PATCH 01/38] Refactor topster to support grouping. --- include/topster.h | 244 ++++++++++++++++++++++++----------------- src/index.cpp | 6 +- src/main/benchmark.cpp | 4 +- test/topster_test.cpp | 6 +- 4 files changed, 156 insertions(+), 104 deletions(-) diff --git a/include/topster.h b/include/topster.h index 6ebadece..a065bb01 100644 --- a/include/topster.h +++ b/include/topster.h @@ -5,16 +5,26 @@ #include #include #include -#include -#include struct KV { uint8_t field_id; uint16_t query_index; uint16_t array_index; uint64_t key; + uint64_t distinct_key; uint64_t match_score; - int64_t scores[3]; // match score + 2 custom attributes + int64_t scores[3]{}; // match score + 2 custom attributes + + KV(uint8_t fieldId, uint16_t queryIndex, uint16_t arrayIndex, uint64_t key, uint64_t distinct_key, + uint64_t match_score, const int64_t *scores): + field_id(fieldId), query_index(queryIndex), array_index(arrayIndex), key(key), + distinct_key(distinct_key), match_score(match_score) { + this->scores[0] = scores[0]; + this->scores[1] = scores[1]; + this->scores[2] = scores[2]; + } + + KV() {} }; /* @@ -25,12 +35,18 @@ struct Topster { uint32_t size; KV *data; - KV* *kvs; + KV** kvs; + spp::sparse_hash_map kv_map; - spp::sparse_hash_map keys; + KV* min_kv; + spp::sparse_hash_map group_kv_map; + size_t distinct; - explicit Topster(size_t capacity): MAX_SIZE(capacity), size(0) { - // we allocate data first to get contiguous memory block whose indices are then assigned to `kvs` + explicit Topster(size_t capacity): Topster(capacity, 0) { + } + + explicit Topster(size_t capacity, size_t distinct): MAX_SIZE(capacity), size(0), distinct(distinct) { + // we allocate data first to get a memory block whose indices are then assigned to `kvs` // we use separate **kvs for easier pointer swaps data = new KV[capacity]; kvs = new KV*[capacity]; @@ -38,15 +54,23 @@ struct Topster { for(size_t i=0; iarray_index = a_index; } - static inline void replace_key_values(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index, - const uint64_t &match_score, const int64_t *scores, uint32_t start, - KV* *kvs, spp::sparse_hash_map& keys) { - kvs[start]->key = key; - kvs[start]->field_id = field_id; - kvs[start]->query_index = query_index; - kvs[start]->array_index = start; - kvs[start]->match_score = match_score; - kvs[start]->scores[0] = scores[0]; - kvs[start]->scores[1] = scores[1]; - kvs[start]->scores[2] = scores[2]; - - keys.erase(kvs[start]->key); - keys[key] = kvs[start]; + static inline void copyMe(KV* a, KV* b) { + size_t b_index = b->array_index; + *b = *a; + b->array_index = b_index; } - void add(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index, const uint64_t &match_score, - const int64_t scores[3]) { - if (size >= MAX_SIZE) { - if(!is_greater(kvs[0], scores)) { - // when incoming value is less than the smallest in the heap, ignore - return; + bool add(KV* kv) { + //LOG(INFO) << "kv_map size: " << kv_map.size() << " -- kvs[0]: " << kvs[0]->match_score; + /*for(auto kv: kv_map) { + LOG(INFO) << "kv key: " << kv.first << " => " << kv.second->match_score; + }*/ + + bool less_than_min_heap = (size >= MAX_SIZE) && is_smaller_equal(kv, kvs[0]); + size_t heap_down_index = 0; + + if(!distinct && less_than_min_heap) { + // for non-distinct, if incoming value is smaller than min-heap ignore + return false; + } + + if(distinct) { + const auto& found_it 
= group_kv_map.find(kv->distinct_key); + bool is_duplicate_key = (found_it != group_kv_map.end()); + + if(!is_duplicate_key && less_than_min_heap) { + // for distinct, if a non duplicate kv is < than min heap we also ignore + return false; } - uint32_t start = 0; - - // When the key already exists and has a greater score, ignore. Otherwise, we have to replace. - // NOTE: we don't consider primary and secondary attrs here because they will be the same for a given key. - if(keys.count(key) != 0) { - const KV* existing = keys.at(key); - if(match_score <= existing->match_score) { - return ; + if(is_duplicate_key) { + // if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift down + Topster* group_topster = found_it->second; + uint16_t old_min_heap_array_index = group_topster->min_kv->array_index; + bool added = group_topster->add(kv); + if(!added) { + return false; } - // replace and sift down - start = existing->array_index; - } + // if added, guaranteed to be larger than old_min_heap_ele + copyMe(kv, group_topster->min_kv); + heap_down_index = old_min_heap_array_index; + } else { + // we have to replace min heap element + // create fresh topster for this distinct group key since it does not exist - replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys); + Topster* group_topster = new Topster(distinct, 0); + group_topster->add(kv); + copyMe(kv, group_topster->min_kv); - // sift down to maintain heap property - while ((2*start+1) < MAX_SIZE) { - uint32_t next = (2 * start + 1); - if (next+1 < MAX_SIZE && is_greater_kv(kvs[next], kvs[next+1])) { - next++; - } - - if (is_greater_kv(kvs[start], kvs[next])) { - swapMe(&kvs[start], &kvs[next]); + if(size < MAX_SIZE) { + // we just copy to end of array + heap_down_index = size; + size++; } else { - break; + // kv is guaranteed to be > current min heap (group_topster.kvs[0]) + heap_down_index = 0; + + // remove current min heap group key from map + delete group_kv_map[kvs[heap_down_index]->distinct_key]; + group_kv_map.erase(kvs[heap_down_index]->distinct_key); } - start = next; + // add new group key to map + group_kv_map.emplace(kv->distinct_key, group_topster); } - } else { - uint32_t start = size; - bool key_found = false; - // When the key already exists and has a greater score, ignore. Otherwise, we have to replace - if(keys.count(key) != 0) { - const KV* existing = keys.at(key); - if(match_score <= existing->match_score) { - return ; + } else { // not distinct + //LOG(INFO) << "Searching for key: " << kv->key; + + const auto& found_it = kv_map.find(kv->key); + bool is_duplicate_key = (found_it != kv_map.end()); + + if(is_duplicate_key) { + // Need to check if kv is greater than existing duplicate kv. 
+ KV* existing_kv = found_it->second; + //LOG(INFO) << "existing_kv: " << existing_kv->key << " -> " << existing_kv->match_score; + + if(is_smaller_equal(kv, existing_kv)) { + return false; } - // replace and sift down - start = existing->array_index; - key_found = true; + // replace existing kv and sift down + heap_down_index = existing_kv->array_index; + kv_map.erase(kvs[heap_down_index]->key); + + // kv will be swapped into heap_down_index + kv_map.emplace(kv->key, kvs[heap_down_index]); } - replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys); - - if(key_found) { - // need to sift down if it's a replace - while ((2*start+1) < size) { - uint32_t next = (2 * start + 1); - if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) { - next++; - } - - if (is_greater_kv(kvs[start], kvs[next])) { - swapMe(&kvs[start], &kvs[next]); - } else { - break; - } - - start = next; - } - - return ; - } - - while(start > 0) { - uint32_t parent = (start-1)/2; - if (is_greater_kv(kvs[parent], kvs[start])) { - swapMe(&kvs[start], &kvs[parent]); - start = parent; + else { + if(size < MAX_SIZE) { + // we just copy to end of array + heap_down_index = size; + size++; } else { - break; + // kv is guaranteed to be > min heap. + // we have to replace min heap element since array is full + heap_down_index = 0; + kv_map.erase(kvs[heap_down_index]->key); } - } - if(keys.count(key) != 0) { - size++; + // kv will be swapped into heap_down_index pointer + kv_map.emplace(kv->key, kvs[heap_down_index]); } } - } - static bool is_greater(const struct KV* i, const int64_t scores[3]) { - return std::tie(scores[0], scores[1], scores[2]) > - std::tie(i->scores[0], i->scores[1], i->scores[2]); + // we have to replace the existing element in the heap and sift down + copyMe(kv, kvs[heap_down_index]); + + if(size < MAX_SIZE) { + heap_down_index = 0; + } + + // sift down to maintain heap property + while ((2*heap_down_index+1) < size) { + uint32_t next = (2 * heap_down_index + 1); // left child + if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) { + // for min heap we compare with the minimum of children + next++; // right child (2n + 2) + } + + if (is_greater_kv(kvs[heap_down_index], kvs[next])) { + swapMe(&kvs[heap_down_index], &kvs[next]); + } else { + break; + } + + heap_down_index = next; + } + + return true; } static bool is_greater_kv(const struct KV* i, const struct KV* j) { @@ -178,6 +221,11 @@ struct Topster { std::tie(j->scores[0], j->scores[1], j->scores[2], j->key); } + static bool is_smaller_equal(const struct KV* i, const struct KV* j) { + return std::tie(i->scores[0], i->scores[1], i->scores[2]) <= + std::tie(j->scores[0], j->scores[1], j->scores[2]); + } + static bool is_greater_kv_value(const struct KV & i, const struct KV & j) { return std::tie(i.scores[0], i.scores[1], i.scores[2], i.key) > std::tie(j.scores[0], j.scores[1], j.scores[2], j.key); diff --git a/src/index.cpp b/src/index.cpp index 71dd092b..96df1f78 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1135,7 +1135,8 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f scores[1] = int64_t(1); scores[2] = int64_t(1); - curated_topster.add(seq_id, field_id, searched_queries.size(), match_score, scores); + KV kv(field_id, searched_queries.size(), 0, seq_id, seq_id, match_score, scores); + curated_topster.add(&kv); searched_queries.push_back(override_query); } @@ -1540,7 +1541,8 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } } - topster.add(seq_id, 
field_id, query_index, match_score, scores);
+        KV kv(field_id, query_index, 0, seq_id, seq_id, match_score, scores);
+        topster.add(&kv);
     }
 
     //long long int timeNanos = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count();
diff --git a/src/main/benchmark.cpp b/src/main/benchmark.cpp
index 793eb00a..3000c26e 100644
--- a/src/main/benchmark.cpp
+++ b/src/main/benchmark.cpp
@@ -46,7 +46,7 @@ void benchmark_hn_titles(char* file_path) {
     Store *store = new Store("/tmp/typesense-data");
     CollectionManager & collectionManager = CollectionManager::get_instance();
-    collectionManager.init(store, 4, "abcd", "1234");
+    collectionManager.init(store, 4, "abcd");
     collectionManager.load();
     Collection *collection = collectionManager.get_collection("hnstories_direct");
@@ -116,7 +116,7 @@ void benchmark_reactjs_pages(char* file_path) {
     Store *store = new Store("/tmp/typesense-data");
     CollectionManager & collectionManager = CollectionManager::get_instance();
-    collectionManager.init(store, 4, "abcd", "1234");
+    collectionManager.init(store, 4, "abcd");
     collectionManager.load();
     Collection *collection = collectionManager.get_collection("reactjs_pages");
diff --git a/test/topster_test.cpp b/test/topster_test.cpp
index 1e9438a8..a685fcb0 100644
--- a/test/topster_test.cpp
+++ b/test/topster_test.cpp
@@ -36,7 +36,8 @@ TEST(TopsterTest, MaxIntValues) {
         scores[1] = data[i].primary_attr;
         scores[2] = data[i].secondary_attr;
-        topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
+        KV kv(data[i].field_id, data[i].query_index, 0, data[i].key, data[i].key, data[i].match_score, scores);
+        topster.add(&kv);
     }
 
     topster.sort();
@@ -87,7 +88,8 @@ TEST(TopsterTest, MaxFloatValues) {
         scores[1] = Index::float_to_in64_t(data[i].primary_attr);
         scores[2] = data[i].secondary_attr;
-        topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
+        KV kv(data[i].field_id, data[i].query_index, 0, data[i].key, data[i].key, data[i].match_score, scores);
+        topster.add(&kv);
     }
 
     topster.sort();
 
From 8f458640fd4b036e1e84a37a58f3405e9a9389cb Mon Sep 17 00:00:00 2001
From: kishorenc
Date: Sat, 6 Jun 2020 12:58:47 +0530
Subject: [PATCH 02/38] Choose sift down/up based on array size.
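
With a min-heap kept in a plain array, the direction of the repair pass after
an insert depends on where the new element lands. A duplicate key overwrites a
slot somewhere inside the array with a guaranteed-larger value, so the heap
property can only break towards the children: sift down, regardless of size. A
brand-new key is appended at index `size`, so it can only break towards the
parent: sift up. When the array is already full, the root kvs[0] is overwritten
and sifted down. A minimal standalone sketch of the two repair passes this
patch switches between (simplified: plain int64_t values stand in for KV
entries, and a raw `<` stands in for the tuple comparator):

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Repair downwards after heap[i] was overwritten with a larger value.
    void sift_down(std::vector<int64_t>& heap, size_t i) {
        while (2 * i + 1 < heap.size()) {
            size_t child = 2 * i + 1;                    // left child
            if (child + 1 < heap.size() && heap[child + 1] < heap[child]) {
                child++;                                 // smaller right child (2i + 2)
            }
            if (heap[child] >= heap[i]) break;           // heap property restored
            std::swap(heap[i], heap[child]);
            i = child;
        }
    }

    // Repair upwards after a value was appended at i = heap.size() - 1.
    void sift_up(std::vector<int64_t>& heap, size_t i) {
        while (i > 0 && heap[i] < heap[(i - 1) / 2]) {
            std::swap(heap[i], heap[(i - 1) / 2]);       // rise above larger parent
            i = (i - 1) / 2;
        }
    }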
--- include/topster.h | 114 ++++++++++++++++++++++++++---------------- src/index.cpp | 4 +- test/topster_test.cpp | 4 +- 3 files changed, 75 insertions(+), 47 deletions(-) diff --git a/include/topster.h b/include/topster.h index a065bb01..e564a0e2 100644 --- a/include/topster.h +++ b/include/topster.h @@ -15,9 +15,9 @@ struct KV { uint64_t match_score; int64_t scores[3]{}; // match score + 2 custom attributes - KV(uint8_t fieldId, uint16_t queryIndex, uint16_t arrayIndex, uint64_t key, uint64_t distinct_key, + KV(uint8_t fieldId, uint16_t queryIndex, uint64_t key, uint64_t distinct_key, uint64_t match_score, const int64_t *scores): - field_id(fieldId), query_index(queryIndex), array_index(arrayIndex), key(key), + field_id(fieldId), query_index(queryIndex), array_index(0), key(key), distinct_key(distinct_key), match_score(match_score) { this->scores[0] = scores[0]; this->scores[1] = scores[1]; @@ -36,9 +36,10 @@ struct Topster { KV *data; KV** kvs; + spp::sparse_hash_map kv_map; - KV* min_kv; + KV* group_min_kv; spp::sparse_hash_map group_kv_map; size_t distinct; @@ -61,13 +62,13 @@ struct Topster { kvs[i] = &data[i]; } - min_kv = new KV(); + group_min_kv = new KV(); } ~Topster() { delete[] data; delete[] kvs; - delete min_kv; + delete group_min_kv; for(auto& kv: group_kv_map) { delete kv.second; } @@ -95,14 +96,20 @@ struct Topster { LOG(INFO) << "kv key: " << kv.first << " => " << kv.second->match_score; }*/ + /*if(kv->key == 5) { + LOG(INFO) << "Key is 5"; + }*/ + bool less_than_min_heap = (size >= MAX_SIZE) && is_smaller_equal(kv, kvs[0]); - size_t heap_down_index = 0; + size_t heap_op_index = 0; if(!distinct && less_than_min_heap) { // for non-distinct, if incoming value is smaller than min-heap ignore return false; } + bool SIFT_DOWN = true; + if(distinct) { const auto& found_it = group_kv_map.find(kv->distinct_key); bool is_duplicate_key = (found_it != group_kv_map.end()); @@ -115,34 +122,34 @@ struct Topster { if(is_duplicate_key) { // if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift down Topster* group_topster = found_it->second; - uint16_t old_min_heap_array_index = group_topster->min_kv->array_index; + uint16_t old_min_heap_array_index = group_min_kv->array_index; bool added = group_topster->add(kv); if(!added) { return false; } // if added, guaranteed to be larger than old_min_heap_ele - copyMe(kv, group_topster->min_kv); - heap_down_index = old_min_heap_array_index; + copyMe(kv, group_min_kv); + heap_op_index = old_min_heap_array_index; } else { - // we have to replace min heap element // create fresh topster for this distinct group key since it does not exist Topster* group_topster = new Topster(distinct, 0); group_topster->add(kv); - copyMe(kv, group_topster->min_kv); + copyMe(kv, group_min_kv); if(size < MAX_SIZE) { // we just copy to end of array - heap_down_index = size; + heap_op_index = size; size++; } else { // kv is guaranteed to be > current min heap (group_topster.kvs[0]) - heap_down_index = 0; + // so we have to replace min heap element (kvs[0]) + heap_op_index = 0; // remove current min heap group key from map - delete group_kv_map[kvs[heap_down_index]->distinct_key]; - group_kv_map.erase(kvs[heap_down_index]->distinct_key); + delete group_kv_map[kvs[heap_op_index]->distinct_key]; + group_kv_map.erase(kvs[heap_op_index]->distinct_key); } // add new group key to map @@ -155,6 +162,13 @@ struct Topster { const auto& found_it = kv_map.find(kv->key); bool is_duplicate_key = (found_it != kv_map.end()); + /* + is_duplicate_key: SIFT_DOWN 
regardless of `size`. + Else: + Do SIFT_UP if size < max_size + Else SIFT_DOWN + */ + if(is_duplicate_key) { // Need to check if kv is greater than existing duplicate kv. KV* existing_kv = found_it->second; @@ -164,53 +178,67 @@ struct Topster { return false; } + SIFT_DOWN = true; + // replace existing kv and sift down - heap_down_index = existing_kv->array_index; - kv_map.erase(kvs[heap_down_index]->key); + heap_op_index = existing_kv->array_index; + kv_map.erase(kvs[heap_op_index]->key); - // kv will be swapped into heap_down_index - kv_map.emplace(kv->key, kvs[heap_down_index]); - } + // kv will be swapped into heap_op_index + kv_map.emplace(kv->key, kvs[heap_op_index]); - else { + } else { // not duplicate + if(size < MAX_SIZE) { // we just copy to end of array - heap_down_index = size; + SIFT_DOWN = false; + heap_op_index = size; size++; } else { // kv is guaranteed to be > min heap. // we have to replace min heap element since array is full - heap_down_index = 0; - kv_map.erase(kvs[heap_down_index]->key); + SIFT_DOWN = true; + heap_op_index = 0; + kv_map.erase(kvs[heap_op_index]->key); } - // kv will be swapped into heap_down_index pointer - kv_map.emplace(kv->key, kvs[heap_down_index]); + // kv will be swapped into heap_op_index pointer + kv_map.emplace(kv->key, kvs[heap_op_index]); } } // we have to replace the existing element in the heap and sift down - copyMe(kv, kvs[heap_down_index]); + copyMe(kv, kvs[heap_op_index]); - if(size < MAX_SIZE) { - heap_down_index = 0; - } + // sift up/down to maintain heap property - // sift down to maintain heap property - while ((2*heap_down_index+1) < size) { - uint32_t next = (2 * heap_down_index + 1); // left child - if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) { - // for min heap we compare with the minimum of children - next++; // right child (2n + 2) + if(SIFT_DOWN) { + while ((2 * heap_op_index + 1) < size) { + uint32_t next = (2 * heap_op_index + 1); // left child + if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) { + // for min heap we compare with the minimum of children + next++; // right child (2n + 2) + } + + if (is_greater_kv(kvs[heap_op_index], kvs[next])) { + swapMe(&kvs[heap_op_index], &kvs[next]); + } else { + break; + } + + heap_op_index = next; } - - if (is_greater_kv(kvs[heap_down_index], kvs[next])) { - swapMe(&kvs[heap_down_index], &kvs[next]); - } else { - break; + } else { + // SIFT UP + while(heap_op_index > 0) { + uint32_t parent = (heap_op_index - 1) / 2; + if (is_greater_kv(kvs[parent], kvs[heap_op_index])) { + swapMe(&kvs[heap_op_index], &kvs[parent]); + heap_op_index = parent; + } else { + break; + } } - - heap_down_index = next; } return true; diff --git a/src/index.cpp b/src/index.cpp index 96df1f78..f49097f4 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1135,7 +1135,7 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f scores[1] = int64_t(1); scores[2] = int64_t(1); - KV kv(field_id, searched_queries.size(), 0, seq_id, seq_id, match_score, scores); + KV kv(field_id, searched_queries.size(), seq_id, seq_id, match_score, scores); curated_topster.add(&kv); searched_queries.push_back(override_query); @@ -1541,7 +1541,7 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } } - KV kv(field_id, query_index, 0, seq_id, seq_id, match_score, scores); + KV kv(field_id, query_index, seq_id, seq_id, match_score, scores); topster.add(&kv); } diff --git a/test/topster_test.cpp b/test/topster_test.cpp index a685fcb0..3c4b4521 100644 --- 
a/test/topster_test.cpp +++ b/test/topster_test.cpp @@ -36,7 +36,7 @@ TEST(TopsterTest, MaxIntValues) { scores[1] = data[i].primary_attr; scores[2] = data[i].secondary_attr; - KV kv(data[i].field_id, data[i].query_index, 0, data[i].key, data[i].key, data[i].match_score, scores); + KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores); topster.add(&kv); } @@ -88,7 +88,7 @@ TEST(TopsterTest, MaxFloatValues) { scores[1] = Index::float_to_in64_t(data[i].primary_attr); scores[2] = data[i].secondary_attr; - KV kv(data[i].field_id, data[i].query_index, 0, data[i].key, data[i].key, data[i].match_score, scores); + KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores); topster.add(&kv); } From b97c37215a8fac8491078357a042a0b08c15458d Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sat, 6 Jun 2020 15:02:50 +0530 Subject: [PATCH 03/38] Basic distinct test is passing. --- include/topster.h | 108 +++++++++++++++++++++++------------------- test/topster_test.cpp | 60 +++++++++++++++++++++++ 2 files changed, 119 insertions(+), 49 deletions(-) diff --git a/include/topster.h b/include/topster.h index e564a0e2..ab6d752b 100644 --- a/include/topster.h +++ b/include/topster.h @@ -7,12 +7,12 @@ #include struct KV { - uint8_t field_id; - uint16_t query_index; - uint16_t array_index; - uint64_t key; - uint64_t distinct_key; - uint64_t match_score; + uint8_t field_id{}; + uint16_t query_index{}; + uint16_t array_index{}; + uint64_t key{}; + uint64_t distinct_key{}; + uint64_t match_score{}; int64_t scores[3]{}; // match score + 2 custom attributes KV(uint8_t fieldId, uint16_t queryIndex, uint64_t key, uint64_t distinct_key, @@ -24,7 +24,7 @@ struct KV { this->scores[2] = scores[2]; } - KV() {} + KV() = default; }; /* @@ -37,9 +37,9 @@ struct Topster { KV *data; KV** kvs; + // For distinct, stores the min heap kv of each group_kv_map topster value spp::sparse_hash_map kv_map; - KV* group_min_kv; spp::sparse_hash_map group_kv_map; size_t distinct; @@ -61,14 +61,11 @@ struct Topster { data[i].match_score = 0; kvs[i] = &data[i]; } - - group_min_kv = new KV(); } ~Topster() { delete[] data; delete[] kvs; - delete group_min_kv; for(auto& kv: group_kv_map) { delete kv.second; } @@ -84,22 +81,12 @@ struct Topster { (*b)->array_index = a_index; } - static inline void copyMe(KV* a, KV* b) { - size_t b_index = b->array_index; - *b = *a; - b->array_index = b_index; - } - bool add(KV* kv) { //LOG(INFO) << "kv_map size: " << kv_map.size() << " -- kvs[0]: " << kvs[0]->match_score; /*for(auto kv: kv_map) { LOG(INFO) << "kv key: " << kv.first << " => " << kv.second->match_score; }*/ - /*if(kv->key == 5) { - LOG(INFO) << "Key is 5"; - }*/ - bool less_than_min_heap = (size >= MAX_SIZE) && is_smaller_equal(kv, kvs[0]); size_t heap_op_index = 0; @@ -115,45 +102,61 @@ struct Topster { bool is_duplicate_key = (found_it != group_kv_map.end()); if(!is_duplicate_key && less_than_min_heap) { - // for distinct, if a non duplicate kv is < than min heap we also ignore + // for distinct, if a non duplicate kv is < than min heap we ignore return false; } if(is_duplicate_key) { - // if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift down + // if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift Topster* group_topster = found_it->second; - uint16_t old_min_heap_array_index = group_min_kv->array_index; + KV old_min_heap_kv = *kv_map[kv->distinct_key]; bool added = group_topster->add(kv); + if(!added) { return false; 
} - // if added, guaranteed to be larger than old_min_heap_ele - copyMe(kv, group_min_kv); - heap_op_index = old_min_heap_array_index; - } else { - // create fresh topster for this distinct group key since it does not exist + // if new kv score is greater than previous min heap score we sift dowm, otherwise sift up + SIFT_DOWN = is_greater_kv(kv, &old_min_heap_kv); + // new kv is different from old_min_heap_kv so we have to sift heap + heap_op_index = old_min_heap_kv.array_index; + + // erase current min heap key from kv_map + kv_map.erase(old_min_heap_kv.distinct_key); + + // kv will be copied into the pointer at heap_op_index + kv_map.emplace(kv->distinct_key, kvs[heap_op_index]); + } else { + // kv is guaranteed to be > current min heap: kvs[0] + // create fresh topster for this distinct group key since it does not exist Topster* group_topster = new Topster(distinct, 0); group_topster->add(kv); - copyMe(kv, group_min_kv); - - if(size < MAX_SIZE) { - // we just copy to end of array - heap_op_index = size; - size++; - } else { - // kv is guaranteed to be > current min heap (group_topster.kvs[0]) - // so we have to replace min heap element (kvs[0]) - heap_op_index = 0; - - // remove current min heap group key from map - delete group_kv_map[kvs[heap_op_index]->distinct_key]; - group_kv_map.erase(kvs[heap_op_index]->distinct_key); - } // add new group key to map group_kv_map.emplace(kv->distinct_key, group_topster); + + // find heap operation index for updating kvs + + if(size < MAX_SIZE) { + // there is enough space in heap we just copy to end + SIFT_DOWN = false; + heap_op_index = size; + size++; + } else { + SIFT_DOWN = true; + + // max size is reached so we are forced to replace current min heap element (kvs[0]) + heap_op_index = 0; + + // remove current min heap group key from maps + delete group_kv_map[kvs[heap_op_index]->distinct_key]; + group_kv_map.erase(kvs[heap_op_index]->distinct_key); + kv_map.erase(kvs[heap_op_index]->distinct_key); + } + + // kv will be copied into the pointer at heap_op_index + kv_map.emplace(kv->distinct_key, kvs[heap_op_index]); } } else { // not distinct @@ -184,11 +187,10 @@ struct Topster { heap_op_index = existing_kv->array_index; kv_map.erase(kvs[heap_op_index]->key); - // kv will be swapped into heap_op_index + // kv will be copied into the pointer at heap_op_index kv_map.emplace(kv->key, kvs[heap_op_index]); - } else { // not duplicate - + if(size < MAX_SIZE) { // we just copy to end of array SIFT_DOWN = false; @@ -202,13 +204,14 @@ struct Topster { kv_map.erase(kvs[heap_op_index]->key); } - // kv will be swapped into heap_op_index pointer + // kv will be copied into the pointer at heap_op_index kv_map.emplace(kv->key, kvs[heap_op_index]); } } // we have to replace the existing element in the heap and sift down - copyMe(kv, kvs[heap_op_index]); + kv->array_index = heap_op_index; + *kvs[heap_op_index] = *kv; // sift up/down to maintain heap property @@ -262,6 +265,9 @@ struct Topster { // topster must be sorted before iterated upon to remove dead array entries void sort() { std::stable_sort(kvs, kvs+size, is_greater_kv); + for(auto &group_topster: group_kv_map) { + group_topster.second->sort(); + } } void clear(){ @@ -272,6 +278,10 @@ struct Topster { return kvs[index]->key; } + uint64_t getDistinctKeyAt(uint32_t index) { + return kvs[index]->distinct_key; + } + KV* getKV(uint32_t index) { return kvs[index]; } diff --git a/test/topster_test.cpp b/test/topster_test.cpp index 3c4b4521..104e941e 100644 --- a/test/topster_test.cpp +++ 
b/test/topster_test.cpp @@ -99,4 +99,64 @@ TEST(TopsterTest, MaxFloatValues) { for(uint32_t i = 0; i < topster.size; i++) { EXPECT_EQ(ids[i], topster.getKeyAt(i)); } +} + +TEST(TopsterTest, DistinctIntValues) { + Topster dist_topster(5, 2); + + struct { + uint8_t field_id; + uint16_t query_index; + uint64_t distinct_key; + uint64_t match_score; + int64_t primary_attr; + int64_t secondary_attr; + } data[14] = { + {1, 0, 1, 11, 20, 30}, + {1, 0, 1, 12, 20, 32}, + {1, 0, 2, 4, 20, 30}, + {1, 2, 3, 7, 20, 30}, + {1, 0, 4, 14, 20, 30}, + {1, 1, 5, 9, 20, 30}, + {1, 1, 5, 10, 20, 32}, + {1, 1, 5, 9, 20, 30}, + {1, 0, 6, 6, 20, 30}, + {1, 2, 7, 6, 22, 30}, + {1, 2, 7, 6, 22, 30}, + {1, 1, 8, 9, 20, 30}, + {1, 0, 9, 8, 20, 30}, + {1, 3, 10, 5, 20, 30}, + }; + + for(int i = 0; i < 14; i++) { + int64_t scores[3]; + scores[0] = int64_t(data[i].match_score); + scores[1] = data[i].primary_attr; + scores[2] = data[i].secondary_attr; + + KV kv(data[i].field_id, data[i].query_index, i+100, data[i].distinct_key, data[i].match_score, scores); + dist_topster.add(&kv); + } + + dist_topster.sort(); + + std::vector distinct_ids = {4, 1, 5, 8, 9}; + + for(uint32_t i = 0; i < dist_topster.size; i++) { + EXPECT_EQ(distinct_ids[i], dist_topster.getDistinctKeyAt(i)); + + if(distinct_ids[i] == 1) { + EXPECT_EQ(12, (int) dist_topster.getKV(i)->match_score); + EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size); + EXPECT_EQ(12, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score); + EXPECT_EQ(11, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score); + } + + if(distinct_ids[i] == 5) { + EXPECT_EQ(10, (int) dist_topster.getKV(i)->match_score); + EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size); + EXPECT_EQ(10, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score); + EXPECT_EQ(9, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score); + } + } } \ No newline at end of file From c452fa0db17cb94fe0d4e898d4a38f5437046f96 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sat, 6 Jun 2020 20:50:32 +0530 Subject: [PATCH 04/38] Make iteration of curated IDs respect max hits. --- src/index.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/index.cpp b/src/index.cpp index f49097f4..9ce1b396 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1110,7 +1110,11 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f leaf_to_indices.emplace(token_leaf, indices); } - for(size_t j=0; j>> array_token_positions; From 8ffb9c5154e478f84055fabc9539079d277d7149 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 7 Jun 2020 17:36:27 +0530 Subject: [PATCH 05/38] Reuse KV pointers instead of copying as objects. 
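
Results are now collected as KV pointers instead of KV copies: raw_result_kvs
and override_result_kvs point straight into the Topster arrays owned by each
index's search_args, which is why the params can only be freed once the
response JSON has been assembled. A sketch of the resulting ownership chain
(names as in this patch):

    // search_args owns topster / curated_topster (deleted in ~search_args()).
    // Each Topster owns its KV data[] block, and kvs[] points into data[].
    // raw_result_kvs / override_result_kvs hold KV* into those blocks, so the
    // pointers stay valid only until the params are released, which
    // collection.cpp now does after the result object has been built:
    for (Index* index : indices) {
        delete index->search_params;   // frees the topsters and every KV they own
    }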
--- include/collection.h | 2 +- include/index.h | 35 +++++++++++++++-------- include/topster.h | 5 ---- src/collection.cpp | 67 ++++++++++++++++++++++++-------------------- src/index.cpp | 62 ++++++++++++++++++++-------------------- 5 files changed, 91 insertions(+), 80 deletions(-) diff --git a/include/collection.h b/include/collection.h index c1ebc55e..4807d091 100644 --- a/include/collection.h +++ b/include/collection.h @@ -147,7 +147,7 @@ private: std::string get_seq_id_key(uint32_t seq_id); void highlight_result(const field &search_field, const std::vector> &searched_queries, - const KV &field_order_kv, const nlohmann::json &document, + const KV* field_order_kv, const nlohmann::json &document, StringUtils & string_utils, size_t snippet_threshold, bool highlighted_fully, highlight_t &highlight); diff --git a/include/index.h b/include/index.h index 9f2d2edd..994aa6e5 100644 --- a/include/index.h +++ b/include/index.h @@ -32,17 +32,18 @@ struct search_args { facet_query_t facet_query; int num_typos; size_t max_facet_values; - size_t max_hits; size_t per_page; size_t page; token_ordering token_order; bool prefix; size_t drop_tokens_threshold; size_t typo_tokens_threshold; - std::vector raw_result_kvs; size_t all_result_ids_len; std::vector> searched_queries; - std::vector override_result_kvs; + Topster* topster; + Topster* curated_topster; + std::vector raw_result_kvs; + std::vector override_result_kvs; Option outcome; search_args(): outcome(0) { @@ -56,12 +57,20 @@ struct search_args { size_t drop_tokens_threshold, size_t typo_tokens_threshold): query(query), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), facet_query(facet_query), num_typos(num_typos), - max_facet_values(max_facet_values), max_hits(max_hits), per_page(per_page), + max_facet_values(max_facet_values), per_page(per_page), page(page), token_order(token_order), prefix(prefix), drop_tokens_threshold(drop_tokens_threshold), typo_tokens_threshold(typo_tokens_threshold), all_result_ids_len(0), outcome(0) { + const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory + topster = new Topster(topster_size); + curated_topster = new Topster(topster_size); } + + ~search_args() { + delete topster; + delete curated_topster; + }; }; struct index_record { @@ -155,7 +164,7 @@ private: const std::string & field, uint32_t *filter_ids, size_t filter_ids_length, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, - Topster & topster, uint32_t** all_result_ids, + Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY, const bool prefix = false, const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD, @@ -164,7 +173,7 @@ private: void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, const std::vector & sort_fields, std::vector & token_to_candidates, const token_ordering token_order, std::vector> & searched_queries, - Topster & topster, uint32_t** all_result_ids, + Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold); @@ -198,7 +207,7 @@ private: void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id, const std::vector & included_ids, - Topster & curated_topster, std::vector> & searched_queries); + Topster* 
curated_topster, std::vector> & searched_queries); uint64_t facet_token_hash(const field & a_field, const std::string &token); @@ -220,10 +229,12 @@ public: facet_query_t & facet_query, const std::vector & included_ids, const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, - const size_t max_hits, const size_t per_page, const size_t page, const token_ordering token_order, - const bool prefix, const size_t drop_tokens_threshold, std::vector & raw_result_kvs, + Topster* topster, Topster* curated_topster, + const size_t per_page, const size_t page, const token_ordering token_order, + const bool prefix, const size_t drop_tokens_threshold, size_t & all_result_ids_len, std::vector> & searched_queries, - std::vector & override_result_kvs, const size_t typo_tokens_threshold); + std::vector & raw_result_kvs, std::vector & override_result_kvs, + const size_t typo_tokens_threshold); Option remove(const uint32_t seq_id, nlohmann::json & document); @@ -235,7 +246,7 @@ public: std::vector>> &array_token_positions); void score_results(const std::vector & sort_fields, const uint16_t & query_index, const uint8_t & field_id, - const uint32_t total_cost, Topster &topster, const std::vector & query_suggestion, + const uint32_t total_cost, Topster* topster, const std::vector & query_suggestion, const uint32_t *result_ids, const size_t result_size) const; static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field); @@ -278,7 +289,7 @@ public: bool processed; // prevents spurious wake up of the main thread bool terminate; // used for interrupting the thread during tear down - search_args search_params; + search_args* search_params; static void populate_array_token_positions(std::vector>> & array_token_positions, const art_leaf *token_leaf, uint32_t doc_index); diff --git a/include/topster.h b/include/topster.h index ab6d752b..bbc7f590 100644 --- a/include/topster.h +++ b/include/topster.h @@ -257,11 +257,6 @@ struct Topster { std::tie(j->scores[0], j->scores[1], j->scores[2]); } - static bool is_greater_kv_value(const struct KV & i, const struct KV & j) { - return std::tie(i.scores[0], i.scores[1], i.scores[2], i.key) > - std::tie(j.scores[0], j.scores[1], j.scores[2], j.key); - } - // topster must be sorted before iterated upon to remove dead array entries void sort() { std::stable_sort(kvs, kvs+size, is_greater_kv); diff --git a/src/collection.cpp b/src/collection.cpp index 2e7b30f2..eee23762 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -608,15 +608,15 @@ Option Collection::search(const std::string & query, const std:: const size_t max_hits = std::min((page * per_page), get_num_documents()); std::vector> searched_queries; // search queries used for generating the results - std::vector raw_result_kvs; - std::vector override_result_kvs; + std::vector raw_result_kvs; + std::vector override_result_kvs; size_t total_found = 0; // send data to individual index threads size_t index_id = 0; for(Index* index: indices) { - index->search_params = search_args(query, search_fields, filters, facets, + index->search_params = new search_args(query, search_fields, filters, facets, index_to_included_ids[index_id], index_to_excluded_ids[index_id], sort_fields_std, facet_query, num_typos, max_facet_values, max_hits, per_page, page, token_order, prefix, @@ -639,9 +639,9 @@ Option Collection::search(const std::string & query, const std:: index->cv.wait(lk, [index]{return index->processed;}); } - 
if(!index->search_params.outcome.ok()) { - index_search_op = Option(index->search_params.outcome.code(), - index->search_params.outcome.error()); + if(!index->search_params->outcome.ok()) { + index_search_op = Option(index->search_params->outcome.code(), + index->search_params->outcome.error()); } if(!index_search_op.ok()) { @@ -649,21 +649,21 @@ Option Collection::search(const std::string & query, const std:: continue; } - for(auto & field_order_kv: index->search_params.raw_result_kvs) { - field_order_kv.query_index += searched_queries.size(); + for(auto & field_order_kv: index->search_params->raw_result_kvs) { + field_order_kv->query_index += searched_queries.size(); raw_result_kvs.push_back(field_order_kv); } - for(auto & field_order_kv: index->search_params.override_result_kvs) { - field_order_kv.query_index += searched_queries.size(); + for(auto & field_order_kv: index->search_params->override_result_kvs) { + field_order_kv->query_index += searched_queries.size(); override_result_kvs.push_back(field_order_kv); } - searched_queries.insert(searched_queries.end(), index->search_params.searched_queries.begin(), - index->search_params.searched_queries.end()); + searched_queries.insert(searched_queries.end(), index->search_params->searched_queries.begin(), + index->search_params->searched_queries.end()); - for(size_t fi = 0; fi < index->search_params.facets.size(); fi++) { - auto & this_facet = index->search_params.facets[fi]; + for(size_t fi = 0; fi < index->search_params->facets.size(); fi++) { + auto & this_facet = index->search_params->facets[fi]; auto & acc_facet = facets[fi]; for(auto & facet_kv: this_facet.result_map) { @@ -690,7 +690,7 @@ Option Collection::search(const std::string & query, const std:: } } - total_found += index->search_params.all_result_ids_len; + total_found += index->search_params->all_result_ids_len; } if(!index_search_op.ok()) { @@ -698,13 +698,13 @@ Option Collection::search(const std::string & query, const std:: } // All fields are sorted descending - std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_value); + std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv); - // Sort based on position in overriden list + // Sort based on position in overridden list std::sort( override_result_kvs.begin(), override_result_kvs.end(), - [&id_pos_map](const KV & a, const KV & b) -> bool { - return id_pos_map[a.key] < id_pos_map[b.key]; + [&id_pos_map](const KV* a, const KV* b) -> bool { + return id_pos_map[a->key] < id_pos_map[b->key]; } ); @@ -713,15 +713,15 @@ Option Collection::search(const std::string & query, const std:: result["hits"] = nlohmann::json::array(); result["found"] = total_found; - std::vector result_kvs; + std::vector result_kvs; size_t override_kv_index = 0; size_t raw_results_index = 0; // merge raw results and override results while(override_kv_index < override_result_kvs.size() && raw_results_index < raw_result_kvs.size()) { if(override_kv_index < override_result_kvs.size() && - id_pos_map.count(override_result_kvs[override_kv_index].key) != 0 && - result_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index].key]) { + id_pos_map.count(override_result_kvs[override_kv_index]->key) != 0 && + result_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index]->key]) { result_kvs.push_back(override_result_kvs[override_kv_index]); override_kv_index++; } else { @@ -746,7 +746,7 @@ Option Collection::search(const std::string & query, const std:: // construct results array for(long 
result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) { const auto & field_order_kv = result_kvs[result_kvs_index]; - const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv.key); + const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key); nlohmann::json document; const Option & document_op = get_document_from_store(seq_id_key, document); @@ -818,8 +818,8 @@ Option Collection::search(const std::string & query, const std:: prune_document(document, include_fields, exclude_fields); wrapper_doc["document"] = document; - wrapper_doc["text_match"] = field_order_kv.match_score; - //wrapper_doc["seq_id"] = (uint32_t) field_order_kv.key; + wrapper_doc["text_match"] = field_order_kv->match_score; + //wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; result["hits"].push_back(wrapper_doc); } @@ -918,6 +918,11 @@ Option Collection::search(const std::string & query, const std:: result["facet_counts"].push_back(facet_result); } + // free search params + for(Index* index: indices) { + delete index->search_params; + } + result["request_params"] = nlohmann::json::object();; result["request_params"]["per_page"] = per_page; result["request_params"]["q"] = query; @@ -966,7 +971,7 @@ void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t void Collection::highlight_result(const field &search_field, const std::vector> &searched_queries, - const KV & field_order_kv, const nlohmann::json & document, + const KV* field_order_kv, const nlohmann::json & document, StringUtils & string_utils, size_t snippet_threshold, bool highlighted_fully, highlight_t & highlight) { @@ -974,15 +979,15 @@ void Collection::highlight_result(const field &search_field, spp::sparse_hash_map leaf_to_indices; std::vector query_suggestion; - for (const art_leaf *token_leaf : searched_queries[field_order_kv.query_index]) { + for (const art_leaf *token_leaf : searched_queries[field_order_kv->query_index]) { // Must search for the token string fresh on that field for the given document since `token_leaf` // is from the best matched field and need not be present in other fields of a document. 
- Index* index = indices[field_order_kv.key % num_indices]; + Index* index = indices[field_order_kv->key % num_indices]; art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len); if(actual_leaf != nullptr) { query_suggestion.push_back(actual_leaf); std::vector positions; - uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv.key); + uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key); auto doc_indices = new uint32_t[1]; doc_indices[0] = doc_index; leaf_to_indices.emplace(actual_leaf, doc_indices); @@ -1008,8 +1013,8 @@ void Collection::highlight_result(const field &search_field, continue; } - const Match & this_match = Match::match(field_order_kv.key, token_positions); - uint64_t this_match_score = this_match.get_match_score(1, field_order_kv.field_id); + const Match & this_match = Match::match(field_order_kv->key, token_positions); + uint64_t this_match_score = this_match.get_match_score(1, field_order_kv->field_id); match_indices.emplace_back(this_match, this_match_score, array_index); } diff --git a/src/index.cpp b/src/index.cpp index 9ce1b396..1ff8f5bb 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -829,7 +829,7 @@ void Index::drop_facets(std::vector & facets, const std::vector void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, const std::vector & sort_fields, std::vector & token_candidates_vec, const token_ordering token_order, - std::vector> & searched_queries, Topster & topster, + std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold) { const long long combination_limit = 10; @@ -1055,13 +1055,15 @@ void Index::run_search() { } // after the wait, we own the lock. 
- search(search_params.outcome, search_params.query, search_params.search_fields, - search_params.filters, search_params.facets, search_params.facet_query, search_params.included_ids, - search_params.excluded_ids, search_params.sort_fields_std, search_params.num_typos, - search_params.max_hits, search_params.per_page, search_params.page, search_params.token_order, - search_params.prefix, search_params.drop_tokens_threshold, search_params.raw_result_kvs, - search_params.all_result_ids_len, search_params.searched_queries, search_params.override_result_kvs, - search_params.typo_tokens_threshold); + search(search_params->outcome, search_params->query, search_params->search_fields, + search_params->filters, search_params->facets, search_params->facet_query, search_params->included_ids, + search_params->excluded_ids, search_params->sort_fields_std, search_params->num_typos, + search_params->topster, search_params->curated_topster, + search_params->per_page, search_params->page, search_params->token_order, + search_params->prefix, search_params->drop_tokens_threshold, + search_params->all_result_ids_len, search_params->searched_queries, + search_params->raw_result_kvs, search_params->override_result_kvs, + search_params->typo_tokens_threshold); // hand control back to main thread processed = true; @@ -1075,7 +1077,7 @@ void Index::run_search() { void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id, const std::vector & included_ids, - Topster & curated_topster, + Topster* curated_topster, std::vector> & searched_queries) { if(included_ids.size() == 0) { @@ -1110,9 +1112,9 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f leaf_to_indices.emplace(token_leaf, indices); } - // curated_topster.MAX_SIZE is initialized based on max_hits. + // curated_topster->MAX_SIZE is initialized based on max_hits. // Even if override has more IDs, we should restrict to max hits. 
- size_t iter_size = std::min((size_t)curated_topster.MAX_SIZE, included_ids.size()); + size_t iter_size = std::min((size_t)curated_topster->MAX_SIZE, included_ids.size()); for(size_t j=0; jadd(&kv); searched_queries.push_back(override_query); } @@ -1153,13 +1155,15 @@ void Index::search(Option & outcome, std::vector & facets, facet_query_t & facet_query, const std::vector & included_ids, const std::vector & excluded_ids, - const std::vector & sort_fields_std, const int num_typos, const size_t max_hits, + const std::vector & sort_fields_std, const int num_typos, + Topster* topster, + Topster* curated_topster, const size_t per_page, const size_t page, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, - std::vector & raw_result_kvs, size_t & all_result_ids_len, std::vector> & searched_queries, - std::vector & override_result_kvs, + std::vector & raw_result_kvs, + std::vector & override_result_kvs, const size_t typo_tokens_threshold) { const size_t num_results = (page * per_page); @@ -1179,10 +1183,6 @@ void Index::search(Option & outcome, //auto begin = std::chrono::high_resolution_clock::now(); uint32_t* all_result_ids = nullptr; - const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory - Topster topster(topster_size); - Topster curated_topster(topster_size); - if(query == "*") { const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0); const std::string & field = search_fields[0]; @@ -1212,8 +1212,8 @@ void Index::search(Option & outcome, do_facets(facets, facet_query, &included_ids[0], included_ids.size()); // must be sorted before iterated upon to remove "empty" array entries - topster.sort(); - curated_topster.sort(); + topster->sort(); + curated_topster->sort(); std::set ids_to_remove(included_ids.begin(), included_ids.end()); ids_to_remove.insert(excluded_ids.begin(), excluded_ids.end()); @@ -1221,26 +1221,26 @@ void Index::search(Option & outcome, std::vector dropped_ids; // loop through topster and remove elements from included and excluded id lists - for(uint32_t t = 0; t < topster.size && t < num_results; t++) { - KV* kv = topster.getKV(t); + for(uint32_t t = 0; t < topster->size && t < num_results; t++) { + KV* kv = topster->getKV(t); if(ids_to_remove.count(kv->key) != 0) { dropped_ids.push_back((uint32_t)kv->key); } else { - raw_result_kvs.push_back(*kv); + raw_result_kvs.push_back(kv); } } - for(uint32_t t = 0; t < curated_topster.size && t < num_results; t++) { - KV* kv = curated_topster.getKV(t); - override_result_kvs.push_back(*kv); + for(uint32_t t = 0; t < curated_topster->size && t < num_results; t++) { + KV* kv = curated_topster->getKV(t); + override_result_kvs.push_back(kv); } // for the ids that are dropped, remove their corresponding facet components from facet results drop_facets(facets, dropped_ids); all_result_ids_len -= dropped_ids.size(); - all_result_ids_len += curated_topster.size; + all_result_ids_len += curated_topster->size; delete [] filter_ids; delete [] all_result_ids; @@ -1264,7 +1264,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co uint32_t *filter_ids, size_t filter_ids_length, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, - Topster & topster, uint32_t** all_result_ids, size_t & all_result_ids_len, + Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, 
const size_t typo_tokens_threshold) { std::vector tokens; @@ -1438,7 +1438,7 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect } void Index::score_results(const std::vector & sort_fields, const uint16_t & query_index, - const uint8_t & field_id, const uint32_t total_cost, Topster & topster, + const uint8_t & field_id, const uint32_t total_cost, Topster* topster, const std::vector &query_suggestion, const uint32_t *result_ids, const size_t result_size) const { @@ -1546,7 +1546,7 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } KV kv(field_id, query_index, seq_id, seq_id, match_score, scores); - topster.add(&kv); + topster->add(&kv); } //long long int timeNanos = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); From a6a2000ddce6dbf5d74a9986f0cb8f0b0c9bd3ed Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 10 Jun 2020 07:00:32 +0530 Subject: [PATCH 06/38] Use proc meminfo for available memory on Linux. --- include/system_metrics.h | 17 +++++++++++++++++ src/system_metrics.cpp | 8 ++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/include/system_metrics.h b/include/system_metrics.h index fda58fdd..6ea58dee 100644 --- a/include/system_metrics.h +++ b/include/system_metrics.h @@ -118,6 +118,23 @@ private: } } + static unsigned long linux_get_mem_available_bytes() { + std::string token; + std::ifstream file("/proc/meminfo"); + while(file >> token) { + if(token == "MemAvailable:") { + unsigned long mem_kb; + if(file >> mem_kb) { + return mem_kb * 1000; + } else { + return 0; + } + } + } + + return 0; // nothing found + } + public: void get(const std::string & data_dir_path, nlohmann::json& result); diff --git a/src/system_metrics.cpp b/src/system_metrics.cpp index a0c6d01e..ac5412b7 100644 --- a/src/system_metrics.cpp +++ b/src/system_metrics.cpp @@ -29,7 +29,7 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result getrusage(RUSAGE_SELF, &r_usage); result["memory_used_process_bytes"] = r_usage.ru_maxrss * 1000; - uint64_t memory_free_bytes = 0; + uint64_t memory_available_bytes = 0; uint64_t memory_total_bytes = 0; #ifdef __APPLE__ @@ -42,7 +42,7 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result if (KERN_SUCCESS == host_page_size(mach_port, &mach_page_size) && KERN_SUCCESS == host_statistics64(mach_port, HOST_VM_INFO, (host_info64_t)&vm_stats, &count)) { - memory_free_bytes = (int64_t)(vm_stats.free_count) * (int64_t)mach_page_size; + memory_available_bytes = (int64_t)(vm_stats.free_count) * (int64_t)mach_page_size; } uint64_t pages = sysconf(_SC_PHYS_PAGES); @@ -51,11 +51,11 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result #elif __linux__ struct sysinfo sys_info; sysinfo(&sys_info); - memory_free_bytes = sys_info.freeram; + memory_available_bytes = linux_get_mem_available_bytes(); memory_total_bytes = sys_info.totalram; #endif - result["memory_free_bytes"] = memory_free_bytes; + result["memory_available_bytes"] = memory_available_bytes; result["memory_total_bytes"] = memory_total_bytes; // CPU METRICS From b7dcb7367bc69945bc36ac0b28bf9756fd8a651a Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 10 Jun 2020 18:26:17 +0530 Subject: [PATCH 07/38] Expose group by params. 
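
The group parameters flow from the HTTP layer (core_api.cpp) through
search_args into a Topster constructed with distinct = group_limit, and a group
key is a single 64-bit value, so grouping on multiple fields needs the per-field
hashes folded into one. The hash_combine() helper added to index.h is the
building block for that; a hypothetical sketch of how a document's distinct_key
could be derived with it (the group_field_hashes container and the surrounding
wiring are illustrative only, not part of this patch):

    // Fold the facet hash of every group-by field value into one 64-bit key.
    uint64_t distinct_key = 1;
    for (uint64_t field_value_hash : group_field_hashes) {  // hypothetical input
        distinct_key = hash_combine(distinct_key, field_value_hash);
    }

    // The key rides along on the KV, placing the document into its group:
    KV kv(field_id, query_index, seq_id, distinct_key, match_score, scores);
    topster->add(&kv);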
--- TODO.md | 1 + include/collection.h | 4 +- include/index.h | 17 +++- include/topster.h | 17 ++-- src/collection.cpp | 223 +++++++++++++++++++++++++++---------------- src/core_api.cpp | 27 +++++- src/index.cpp | 66 ++++++++++--- 7 files changed, 250 insertions(+), 105 deletions(-) diff --git a/TODO.md b/TODO.md index 69298247..96f2a7e3 100644 --- a/TODO.md +++ b/TODO.md @@ -97,6 +97,7 @@ - ~~Have a LOG(ERROR) level~~ - ~~Handle SIGTERM which is sent when process is killed~~ - ~~Use snappy compression for storage~~ +- Test for overriding result on second page - atleast 1 token match for proceeding with drop tokens - support wildcard query with filters - API for optimizing on disk storage diff --git a/include/collection.h b/include/collection.h index 4807d091..48aa4938 100644 --- a/include/collection.h +++ b/include/collection.h @@ -235,7 +235,9 @@ public: const std::string & highlight_full_fields = "", size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD, const std::map& pinned_hits={}, - const std::vector& hidden_hits={}); + const std::vector& hidden_hits={}, + const std::vector& group_by_fields={}, + const size_t group_limit = 0); Option get(const std::string & id); diff --git a/include/index.h b/include/index.h index 994aa6e5..b142c9a3 100644 --- a/include/index.h +++ b/include/index.h @@ -38,11 +38,13 @@ struct search_args { bool prefix; size_t drop_tokens_threshold; size_t typo_tokens_threshold; + std::vector group_by_fields; + size_t group_limit; size_t all_result_ids_len; std::vector> searched_queries; Topster* topster; Topster* curated_topster; - std::vector raw_result_kvs; + std::vector> raw_result_kvs; std::vector override_result_kvs; Option outcome; @@ -54,16 +56,18 @@ struct search_args { std::vector facets, std::vector included_ids, std::vector excluded_ids, std::vector sort_fields_std, facet_query_t facet_query, int num_typos, size_t max_facet_values, size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix, - size_t drop_tokens_threshold, size_t typo_tokens_threshold): + size_t drop_tokens_threshold, size_t typo_tokens_threshold, + const std::vector& group_by_fields, size_t group_limit): query(query), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), facet_query(facet_query), num_typos(num_typos), max_facet_values(max_facet_values), per_page(per_page), page(page), token_order(token_order), prefix(prefix), drop_tokens_threshold(drop_tokens_threshold), typo_tokens_threshold(typo_tokens_threshold), + group_by_fields(group_by_fields), group_limit(group_limit), all_result_ids_len(0), outcome(0) { const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory - topster = new Topster(topster_size); + topster = new Topster(topster_size, group_limit); curated_topster = new Topster(topster_size); } @@ -213,6 +217,11 @@ private: void compute_facet_stats(facet &a_facet, int64_t raw_value, const std::string & field_type); + // reference: https://stackoverflow.com/a/27952689/131050 + uint64_t hash_combine(uint64_t lhs, uint64_t rhs) const { + lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2); + return lhs; + } public: Index() = delete; @@ -233,7 +242,7 @@ public: const size_t per_page, const size_t page, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, size_t & all_result_ids_len, std::vector> & searched_queries, - std::vector & raw_result_kvs, 
std::vector & override_result_kvs, + std::vector> & raw_result_kvs, std::vector & override_result_kvs, const size_t typo_tokens_threshold); Option remove(const uint32_t seq_id, nlohmann::json & document); diff --git a/include/topster.h b/include/topster.h index bbc7f590..685e6d62 100644 --- a/include/topster.h +++ b/include/topster.h @@ -117,7 +117,7 @@ struct Topster { } // if new kv score is greater than previous min heap score we sift dowm, otherwise sift up - SIFT_DOWN = is_greater_kv(kv, &old_min_heap_kv); + SIFT_DOWN = is_greater(kv, &old_min_heap_kv); // new kv is different from old_min_heap_kv so we have to sift heap heap_op_index = old_min_heap_kv.array_index; @@ -218,12 +218,12 @@ struct Topster { if(SIFT_DOWN) { while ((2 * heap_op_index + 1) < size) { uint32_t next = (2 * heap_op_index + 1); // left child - if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) { + if (next+1 < size && is_greater(kvs[next], kvs[next + 1])) { // for min heap we compare with the minimum of children next++; // right child (2n + 2) } - if (is_greater_kv(kvs[heap_op_index], kvs[next])) { + if (is_greater(kvs[heap_op_index], kvs[next])) { swapMe(&kvs[heap_op_index], &kvs[next]); } else { break; @@ -235,7 +235,7 @@ struct Topster { // SIFT UP while(heap_op_index > 0) { uint32_t parent = (heap_op_index - 1) / 2; - if (is_greater_kv(kvs[parent], kvs[heap_op_index])) { + if (is_greater(kvs[parent], kvs[heap_op_index])) { swapMe(&kvs[heap_op_index], &kvs[parent]); heap_op_index = parent; } else { @@ -247,7 +247,7 @@ struct Topster { return true; } - static bool is_greater_kv(const struct KV* i, const struct KV* j) { + static bool is_greater(const struct KV* i, const struct KV* j) { return std::tie(i->scores[0], i->scores[1], i->scores[2], i->key) > std::tie(j->scores[0], j->scores[1], j->scores[2], j->key); } @@ -257,9 +257,14 @@ struct Topster { std::tie(j->scores[0], j->scores[1], j->scores[2]); } + static bool is_greater_kv_group(const std::vector& i, const std::vector& j) { + return std::tie(i[0]->scores[0], i[0]->scores[1], i[0]->scores[2], i[0]->key) > + std::tie(j[0]->scores[0], j[0]->scores[1], j[0]->scores[2], j[0]->key); + } + // topster must be sorted before iterated upon to remove dead array entries void sort() { - std::stable_sort(kvs, kvs+size, is_greater_kv); + std::stable_sort(kvs, kvs + size, is_greater); for(auto &group_topster: group_kv_map) { group_topster.second->sort(); } diff --git a/src/collection.cpp b/src/collection.cpp index eee23762..96a1afbc 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -349,7 +349,9 @@ Option Collection::search(const std::string & query, const std:: const std::string & highlight_full_fields, size_t typo_tokens_threshold, const std::map& pinned_hits, - const std::vector& hidden_hits) { + const std::vector& hidden_hits, + const std::vector& group_by_fields, + const size_t group_limit) { std::vector included_ids; std::vector excluded_ids; @@ -385,6 +387,22 @@ Option Collection::search(const std::string & query, const std:: } } + // validate group by fields + for(const std::string & field_name: group_by_fields) { + if(search_schema.count(field_name) == 0) { + std::string error = "Could not find a field named `" + field_name + "` in the schema."; + return Option(404, error); + } + + field search_field = search_schema.at(field_name); + + // if field is a string field then it must be a facet field as well + if(search_field.is_string() && !search_field.is_facet()) { + std::string error = "Field `" + field_name + "` should be a facet field."; + 
return Option(400, error); + } + } + // validate filter fields std::vector filter_blocks; StringUtils::split(simple_filter_query, filter_blocks, "&&"); @@ -492,7 +510,7 @@ Option Collection::search(const std::string & query, const std:: } // for a wildcard query, if filter is not specified, use default_sorting_field as a catch-all - if(query == "*" && filters.size() == 0) { + if(query == "*" && filters.empty()) { field f = search_schema.at(default_sorting_field); std::string max_value = f.is_float() ? std::to_string(std::numeric_limits::max()) : std::to_string(std::numeric_limits::max()); @@ -608,7 +626,7 @@ Option Collection::search(const std::string & query, const std:: const size_t max_hits = std::min((page * per_page), get_num_documents()); std::vector> searched_queries; // search queries used for generating the results - std::vector raw_result_kvs; + std::vector> raw_result_kvs; std::vector override_result_kvs; size_t total_found = 0; @@ -617,10 +635,11 @@ Option Collection::search(const std::string & query, const std:: size_t index_id = 0; for(Index* index: indices) { index->search_params = new search_args(query, search_fields, filters, facets, - index_to_included_ids[index_id], index_to_excluded_ids[index_id], - sort_fields_std, facet_query, num_typos, max_facet_values, max_hits, - per_page, page, token_order, prefix, - drop_tokens_threshold, typo_tokens_threshold); + index_to_included_ids[index_id], index_to_excluded_ids[index_id], + sort_fields_std, facet_query, num_typos, max_facet_values, max_hits, + per_page, page, token_order, prefix, + drop_tokens_threshold, typo_tokens_threshold, + group_by_fields, group_limit); { std::lock_guard lk(index->m); index->ready = true; @@ -649,9 +668,9 @@ Option Collection::search(const std::string & query, const std:: continue; } - for(auto & field_order_kv: index->search_params->raw_result_kvs) { - field_order_kv->query_index += searched_queries.size(); - raw_result_kvs.push_back(field_order_kv); + for(const std::vector & kv_group: index->search_params->raw_result_kvs) { + kv_group[0]->query_index += searched_queries.size(); + raw_result_kvs.push_back(kv_group); } for(auto & field_order_kv: index->search_params->override_result_kvs) { @@ -697,8 +716,35 @@ Option Collection::search(const std::string & query, const std:: return index_search_op; } + Topster* aggr_topster = nullptr; + + if(group_limit > 0) { + // group by query requires another round of topster-ing + + // needs to be atleast 1 since scoring is mandatory + const size_t topster_size = std::max((size_t)1, max_hits); + aggr_topster = new Topster(topster_size, group_limit); + + for(const auto& kv_group: raw_result_kvs) { + for(KV* kv: kv_group) { + aggr_topster->add(kv); + } + } + + aggr_topster->sort(); + + raw_result_kvs.clear(); + raw_result_kvs.shrink_to_fit(); + + for(auto &group_topster_entry: aggr_topster->group_kv_map) { + Topster* group_topster = group_topster_entry.second; + const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + raw_result_kvs.emplace_back(group_kvs); + } + } + // All fields are sorted descending - std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv); + std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_group); // Sort based on position in overridden list std::sort( @@ -708,12 +754,7 @@ Option Collection::search(const std::string & query, const std:: } ); - nlohmann::json result = nlohmann::json::object(); - - result["hits"] = nlohmann::json::array(); - result["found"] = 
total_found; - - std::vector result_kvs; + std::vector> result_group_kvs; size_t override_kv_index = 0; size_t raw_results_index = 0; @@ -721,107 +762,123 @@ Option Collection::search(const std::string & query, const std:: while(override_kv_index < override_result_kvs.size() && raw_results_index < raw_result_kvs.size()) { if(override_kv_index < override_result_kvs.size() && id_pos_map.count(override_result_kvs[override_kv_index]->key) != 0 && - result_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index]->key]) { - result_kvs.push_back(override_result_kvs[override_kv_index]); + result_group_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index]->key]) { + result_group_kvs.push_back({override_result_kvs[override_kv_index]}); override_kv_index++; } else { - result_kvs.push_back(raw_result_kvs[raw_results_index]); + result_group_kvs.push_back(raw_result_kvs[raw_results_index]); raw_results_index++; } } while(override_kv_index < override_result_kvs.size()) { - result_kvs.push_back(override_result_kvs[override_kv_index]); + result_group_kvs.push_back({override_result_kvs[override_kv_index]}); override_kv_index++; } while(raw_results_index < raw_result_kvs.size()) { - result_kvs.push_back(raw_result_kvs[raw_results_index]); + result_group_kvs.push_back(raw_result_kvs[raw_results_index]); raw_results_index++; } const long start_result_index = (page - 1) * per_page; - const long end_result_index = std::min(max_hits, result_kvs.size()) - 1; // could be -1 when max_hits is 0 + const long end_result_index = std::min(max_hits, result_group_kvs.size()) - 1; // could be -1 when max_hits is 0 + + nlohmann::json result = nlohmann::json::object(); + + result["found"] = total_found; + + std::string hits_key = (group_limit > 1) ? "grouped_hits" : "hits"; + result[hits_key] = nlohmann::json::array(); // construct results array for(long result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) { - const auto & field_order_kv = result_kvs[result_kvs_index]; - const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key); + const std::vector & kv_group = result_group_kvs[result_kvs_index]; - nlohmann::json document; - const Option & document_op = get_document_from_store(seq_id_key, document); + nlohmann::json group_hits_array = nlohmann::json::array(); + nlohmann::json& hits_array = (group_limit > 1) ? group_hits_array : result["hits"]; - if(!document_op.ok()) { - LOG(ERROR) << "Document fetch error. " << document_op.error(); - continue; - } + for(const KV* field_order_kv: kv_group) { + const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key); - nlohmann::json wrapper_doc; - wrapper_doc["highlights"] = nlohmann::json::array(); - std::vector highlights; - StringUtils string_utils; + nlohmann::json document; + const Option & document_op = get_document_from_store(seq_id_key, document); - // find out if fields have to be highlighted fully - std::vector fields_highlighted_fully_vec; - spp::sparse_hash_set fields_highlighted_fully; - StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ","); - - for(std::string & highlight_full_field: fields_highlighted_fully_vec) { - StringUtils::trim(highlight_full_field); - fields_highlighted_fully.emplace(highlight_full_field); - } - - for(const std::string & field_name: search_fields) { - // should not pick excluded field for highlighting - if(exclude_fields.count(field_name) > 0) { + if(!document_op.ok()) { + LOG(ERROR) << "Document fetch error. 
" << document_op.error(); continue; } - field search_field = search_schema.at(field_name); - if(query != "*" && (search_field.type == field_types::STRING || - search_field.type == field_types::STRING_ARRAY)) { + nlohmann::json wrapper_doc; + wrapper_doc["highlights"] = nlohmann::json::array(); + std::vector highlights; + StringUtils string_utils; - bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end()); - highlight_t highlight; - highlight_result(search_field, searched_queries, field_order_kv, document, - string_utils, snippet_threshold, highlighted_fully, highlight); + // find out if fields have to be highlighted fully + std::vector fields_highlighted_fully_vec; + spp::sparse_hash_set fields_highlighted_fully; + StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ","); - if(!highlight.snippets.empty()) { - highlights.push_back(highlight); - } + for(std::string & highlight_full_field: fields_highlighted_fully_vec) { + StringUtils::trim(highlight_full_field); + fields_highlighted_fully.emplace(highlight_full_field); } - } - std::sort(highlights.begin(), highlights.end()); - - for(const auto & highlight: highlights) { - nlohmann::json h_json = nlohmann::json::object(); - h_json["field"] = highlight.field; - bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end()); - - if(!highlight.indices.empty()) { - h_json["indices"] = highlight.indices; - h_json["snippets"] = highlight.snippets; - if(highlight_fully) { - h_json["values"] = highlight.values; + for(const std::string & field_name: search_fields) { + // should not pick excluded field for highlighting + if(exclude_fields.count(field_name) > 0) { + continue; } - } else { - h_json["snippet"] = highlight.snippets[0]; - if(highlight_fully) { - h_json["value"] = highlight.values[0]; + field search_field = search_schema.at(field_name); + if(query != "*" && (search_field.type == field_types::STRING || + search_field.type == field_types::STRING_ARRAY)) { + + bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end()); + highlight_t highlight; + highlight_result(search_field, searched_queries, field_order_kv, document, + string_utils, snippet_threshold, highlighted_fully, highlight); + + if(!highlight.snippets.empty()) { + highlights.push_back(highlight); + } } } - wrapper_doc["highlights"].push_back(h_json); + std::sort(highlights.begin(), highlights.end()); + + for(const auto & highlight: highlights) { + nlohmann::json h_json = nlohmann::json::object(); + h_json["field"] = highlight.field; + bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end()); + + if(!highlight.indices.empty()) { + h_json["indices"] = highlight.indices; + h_json["snippets"] = highlight.snippets; + if(highlight_fully) { + h_json["values"] = highlight.values; + } + + } else { + h_json["snippet"] = highlight.snippets[0]; + if(highlight_fully) { + h_json["value"] = highlight.values[0]; + } + } + + wrapper_doc["highlights"].push_back(h_json); + } + + prune_document(document, include_fields, exclude_fields); + wrapper_doc["document"] = document; + wrapper_doc["text_match"] = field_order_kv->match_score; + //wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; + hits_array.push_back(wrapper_doc); } - prune_document(document, include_fields, exclude_fields); - wrapper_doc["document"] = document; - wrapper_doc["text_match"] = field_order_kv->match_score; - 
//wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; - - result["hits"].push_back(wrapper_doc); + if(group_limit > 1) { + result["grouped_hits"].push_back(group_hits_array); + } } result["facet_counts"] = nlohmann::json::array(); @@ -923,6 +980,8 @@ Option Collection::search(const std::string & query, const std:: delete index->search_params; } + delete aggr_topster; + result["request_params"] = nlohmann::json::object();; result["request_params"]["per_page"] = per_page; result["request_params"]["q"] = query; diff --git a/src/core_api.cpp b/src/core_api.cpp index 56efeec7..c987443e 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -235,6 +235,9 @@ bool get_search(http_req & req, http_res & res) { const char *FACET_QUERY = "facet_query"; const char *MAX_FACET_VALUES = "max_facet_values"; + const char *GROUP_BY = "group_by"; + const char *GROUP_LIMIT = "group_limit"; + const char *PER_PAGE = "per_page"; const char *PAGE = "page"; const char *CALLBACK = "callback"; @@ -314,6 +317,18 @@ bool get_search(http_req & req, http_res & res) { req.params[EXCLUDE_FIELDS] = ""; } + if(req.params.count(GROUP_BY) == 0) { + req.params[GROUP_BY] = ""; + } + + if(req.params.count(GROUP_LIMIT) == 0) { + if(req.params[GROUP_BY] != "") { + req.params[GROUP_LIMIT] = "3"; + } else { + req.params[GROUP_LIMIT] = "0"; + } + } + if(!StringUtils::is_uint64_t(req.params[DROP_TOKENS_THRESHOLD])) { res.set_400("Parameter `" + std::string(DROP_TOKENS_THRESHOLD) + "` must be an unsigned integer."); return false; @@ -349,6 +364,11 @@ bool get_search(http_req & req, http_res & res) { return false; } + if(!StringUtils::is_uint64_t(req.params[GROUP_LIMIT])) { + res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer."); + return false; + } + std::string filter_str = req.params.count(FILTER) != 0 ? 
req.params[FILTER] : ""; std::vector search_fields; @@ -366,6 +386,9 @@ bool get_search(http_req & req, http_res & res) { spp::sparse_hash_set include_fields(include_fields_vec.begin(), include_fields_vec.end()); spp::sparse_hash_set exclude_fields(exclude_fields_vec.begin(), exclude_fields_vec.end()); + std::vector group_by_fields; + StringUtils::split(req.params[GROUP_BY], group_by_fields, ","); + std::vector sort_fields; if(req.params.count(SORT_BY) != 0) { std::vector sort_field_strs; @@ -455,7 +478,9 @@ bool get_search(http_req & req, http_res & res) { req.params[HIGHLIGHT_FULL_FIELDS], typo_tokens_threshold, pinned_hits, - hidden_hits + hidden_hits, + group_by_fields, + static_cast(std::stoi(req.params[GROUP_LIMIT])) ); uint64_t timeMillis = std::chrono::duration_cast( diff --git a/src/index.cpp b/src/index.cpp index 1ff8f5bb..a93509d7 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1162,12 +1162,10 @@ void Index::search(Option & outcome, const bool prefix, const size_t drop_tokens_threshold, size_t & all_result_ids_len, std::vector> & searched_queries, - std::vector & raw_result_kvs, + std::vector> & raw_result_kvs, std::vector & override_result_kvs, const size_t typo_tokens_threshold) { - const size_t num_results = (page * per_page); - // process the filters uint32_t* filter_ids = nullptr; @@ -1221,17 +1219,33 @@ void Index::search(Option & outcome, std::vector dropped_ids; // loop through topster and remove elements from included and excluded id lists - for(uint32_t t = 0; t < topster->size && t < num_results; t++) { - KV* kv = topster->getKV(t); - if(ids_to_remove.count(kv->key) != 0) { - dropped_ids.push_back((uint32_t)kv->key); - } else { - raw_result_kvs.push_back(kv); + if(topster->distinct) { + for(auto &group_topster_entry: topster->group_kv_map) { + Topster* group_topster = group_topster_entry.second; + for(uint32_t t = 0; t < group_topster->size; t++) { + KV* kv = group_topster->getKV(t); + if(ids_to_remove.count(kv->key) != 0) { + dropped_ids.push_back((uint32_t)kv->key); + } + } + + const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + raw_result_kvs.emplace_back(group_kvs); + } + } else { + for(uint32_t t = 0; t < topster->size; t++) { + KV* kv = topster->getKV(t); + + if(ids_to_remove.count(kv->key) != 0) { + dropped_ids.push_back((uint32_t)kv->key); + } else { + raw_result_kvs.push_back({kv}); + } } } - for(uint32_t t = 0; t < curated_topster->size && t < num_results; t++) { + for(uint32_t t = 0; t < curated_topster->size; t++) { KV* kv = curated_topster->getKV(t); override_result_kvs.push_back(kv); } @@ -1471,6 +1485,16 @@ void Index::score_results(const std::vector & sort_fields, const uint16 Match single_token_match = Match(1, 0, 0, empty_offset_diffs); const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost, field_id); + std::unordered_map facet_to_id; + + if(search_params->group_limit > 0) { + size_t i_facet = 0; + for(const auto & facet: facet_schema) { + facet_to_id[facet.first] = i_facet; + i_facet++; + } + } + for(size_t i=0; i & sort_fields, const uint16 } } - KV kv(field_id, query_index, seq_id, seq_id, match_score, scores); + uint64_t distinct_id = seq_id; + + if(search_params->group_limit != 0) { + distinct_id = 1; // some constant initial value + + // calculate hash from group_by_fields + for(const auto& field: search_params->group_by_fields) { + if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) { + continue; + } + + size_t facet_id = facet_to_id[field]; + const 
std::vector& fhashes = facet_index_v2.at(seq_id)[facet_id]; + + for(const auto& hash: fhashes) { + distinct_id = hash_combine(distinct_id, hash); + } + } + } + + KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores); topster->add(&kv); } From 07456a5c6a0a7b86a584a2264f3aa9a75069cf91 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 10 Jun 2020 21:59:56 +0530 Subject: [PATCH 08/38] Handle deletion of records with optional fields. --- src/index.cpp | 4 ++++ test/collection_test.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/index.cpp b/src/index.cpp index a93509d7..74a96af4 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1756,6 +1756,10 @@ void Index::remove_and_shift_offset_index(sorted_array &offset_index, const uint Option Index::remove(const uint32_t seq_id, nlohmann::json & document) { for(auto & name_field: search_schema) { + if(name_field.second.optional && document.count(name_field.first) == 0) { + continue; + } + // Go through all the field names and find the keys+values so that they can be removed from in-memory index std::vector tokens; if(name_field.second.type == field_types::STRING) { diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 6066da0a..cac9d715 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -2251,6 +2251,10 @@ TEST_F(CollectionTest, OptionalFields) { auto res_op = coll1->search("*", {"title"}, "", {}, sort_fields, 0, 10, 1, FREQUENCY, false); ASSERT_FALSE(res_op.ok()); ASSERT_STREQ("Cannot sort by `average` as it is defined as an optional field.", res_op.error().c_str()); + + // try deleting a record having optional field + Option remove_op = coll1->remove("1"); + ASSERT_TRUE(remove_op.ok()); // default sorting field should not be declared optional fields = { From 10c22c174aa0782f21b1cd68f4cd62ae7b4ef715 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 14 Jun 2020 08:14:03 +0530 Subject: [PATCH 09/38] Consider grouping when generating facet counts. 
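
When results are grouped, a facet value must be counted once per distinct group rather than once per matching document. To support this, facet_count_t gains a `groups` container and do_facets() records the group id of each hit instead of incrementing the count; a document's group id is obtained by folding the facet hashes of its group_by fields through hash_combine(). Below is a minimal stand-alone sketch of the idea — the struct and function names are simplified stand-ins, not the actual index code:

    #include <cstdint>
    #include <unordered_set>
    #include <vector>

    // boost-style combiner, same formula as Index::hash_combine
    // (https://stackoverflow.com/a/27952689/131050)
    uint64_t hash_combine(uint64_t lhs, uint64_t rhs) {
        lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2);
        return lhs;
    }

    // a document's group id: fold the facet hashes of its group_by field
    // values, starting from the same constant seed (1) as score_results()
    uint64_t distinct_id(const std::vector<uint64_t>& group_field_hashes) {
        uint64_t id = 1;
        for (uint64_t h : group_field_hashes) {
            id = hash_combine(id, h);
        }
        return id;
    }

    // simplified stand-in for facet_count_t
    struct facet_value_count {
        std::unordered_set<uint64_t> groups; // group ids seen for this value
        uint32_t count = 0;                  // plain per-document count
    };

    void record_hit(facet_value_count& fc, uint64_t doc_group_id, bool grouping) {
        if (grouping) {
            fc.groups.insert(doc_group_id); // reported later as groups.size()
        } else {
            fc.count += 1;
        }
    }

Turning the group sets into final counts (and aggregating them across index shards) is completed in a later commit of this series.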
--- TODO.md | 2 + include/field.h | 1 + include/index.h | 9 +- src/array_utils.cpp | 2 +- src/collection.cpp | 18 ++- src/index.cpp | 192 +++++++++++++----------------- test/collection_override_test.cpp | 4 +- 7 files changed, 108 insertions(+), 120 deletions(-) diff --git a/TODO.md b/TODO.md index 96f2a7e3..b031a788 100644 --- a/TODO.md +++ b/TODO.md @@ -98,6 +98,8 @@ - ~~Handle SIGTERM which is sent when process is killed~~ - ~~Use snappy compression for storage~~ - Test for overriding result on second page +- Fix exclude_scalar early returns +- Fix result ids length during grouped overrides - atleast 1 token match for proceeding with drop tokens - support wildcard query with filters - API for optimizing on disk storage diff --git a/include/field.h b/include/field.h index 95c1fcbe..6255aed5 100644 --- a/include/field.h +++ b/include/field.h @@ -146,6 +146,7 @@ struct token_pos_cost_t { struct facet_count_t { uint32_t count; + spp::sparse_hash_map groups; // used for faceting grouped results // used to fetch the actual document and value for representation uint32_t doc_id; diff --git a/include/index.h b/include/index.h index b142c9a3..4baa1eed 100644 --- a/include/index.h +++ b/include/index.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "string_utils.h" struct token_candidates { @@ -162,10 +163,9 @@ private: void do_facets(std::vector & facets, facet_query_t & facet_query, const uint32_t* result_ids, size_t results_size); - void drop_facets(std::vector & facets, const std::vector & ids); - void search_field(const uint8_t & field_id, const std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, @@ -175,8 +175,9 @@ private: const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD); void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, const std::vector & sort_fields, std::vector & token_to_candidates, - const token_ordering token_order, std::vector> & searched_queries, + std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold); @@ -306,5 +307,7 @@ public: int get_bounded_typo_cost(const size_t max_cost, const size_t token_len) const; static int64_t float_to_in64_t(float n); + + uint64_t get_distinct_id(const std::unordered_map &facet_to_id, const uint32_t seq_id) const; }; diff --git a/src/array_utils.cpp b/src/array_utils.cpp index 9d93fa46..62fe2d87 100644 --- a/src/array_utils.cpp +++ b/src/array_utils.cpp @@ -114,7 +114,7 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA, return 0; } - if(B == nullptr) { + if(lenB == 0 || B == nullptr) { *out = new uint32_t[lenA]; memcpy(*out, A, lenA * sizeof(uint32_t)); return lenA; diff --git a/src/collection.cpp b/src/collection.cpp index af446907..6e50fbac 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -419,9 +419,9 @@ Option Collection::search(const std::string & query, const std:: field search_field = search_schema.at(field_name); - // if field is a string field then it must be a facet field as well - if(search_field.is_string() && !search_field.is_facet()) { - std::string error = "Field `" + field_name + "` should be a facet field."; + // must be a facet field + if(!search_field.is_facet()) { + 
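+                // grouping derives a document's distinct_id from the facet hash index,
+                // so a group_by field must be faceted even when it is not a string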
std::string error = "Group by field `" + field_name + "` should be a facet field."; return Option(400, error); } } @@ -711,6 +711,18 @@ Option Collection::search(const std::string & query, const std:: for(auto & facet_kv: this_facet.result_map) { size_t count = 0; + + // for grouping we have to aggregate group counts to a count value + /*if(search_params->group_limit) { + // for every facet + for(auto& a_facet: facets) { + // for every facet value + for(auto& fvalue: a_facet.result_map) { + fvalue.second.count = fvalue.second.groups.size(); + } + } + }*/ + if(acc_facet.result_map.count(facet_kv.first) == 0) { // not found, so set it count = facet_kv.second.count; diff --git a/src/index.cpp b/src/index.cpp index 2fe6e5e6..5c0f8546 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -633,7 +633,7 @@ void Index::compute_facet_stats(facet &a_facet, int64_t raw_value, const std::st void Index::do_facets(std::vector & facets, facet_query_t & facet_query, const uint32_t* result_ids, size_t results_size) { - std::map facet_to_index; + std::unordered_map facet_to_index; size_t i_facet = 0; for(const auto & facet: facet_schema) { @@ -661,6 +661,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, std::vector query_tokens; StringUtils::split(facet_query.query, query_tokens, " "); + // for non-string fields, `faceted_name` returns their aliased stringified field name art_tree *t = search_index.at(facet_field.faceted_name()); for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) { @@ -703,7 +704,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, const std::vector & fhashes = facet_index_v2[doc_seq_id][facet_id]; int array_pos = 0; - int fvalue_found = 0; + bool fvalue_found = false; std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens) spp::sparse_hash_map query_token_positions; size_t field_token_index = -1; @@ -717,9 +718,9 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, // ftoken_hash is the raw value for numeric fields compute_facet_stats(a_facet, ftoken_hash, facet_field.type); + // not using facet query or this particular facet value is found in facet filter if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) { - // not using facet query or this particular facet value is found in facet filter - fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value + fvalue_found = true; if(use_facet_query) { // map token index to query index (used for highlighting later on) @@ -736,41 +737,38 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, } } - //std::cout << "j: " << j << std::endl; - // 0 indicates separator, while the second condition checks for non-array string if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && j == fhashes.size() - 1)) { - if(!use_facet_query || fvalue_found != 0) { + if(!use_facet_query || fvalue_found) { const std::string & fvalue_str = fvaluestream.str(); - uint64_t fhash = 0; - if(facet_field.is_string()) { - fhash = facet_token_hash(facet_field, fvalue_str); - } else { - fhash = std::atoi(fvalue_str.c_str()); - } + uint64_t fhash = facet_token_hash(facet_field, fvalue_str); if(a_facet.result_map.count(fhash) == 0) { - a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0, + a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_map(), + doc_seq_id, 0, spp::sparse_hash_map()}; } - a_facet.result_map[fhash].count 
+= 1; a_facet.result_map[fhash].doc_id = doc_seq_id; a_facet.result_map[fhash].array_pos = array_pos; + if(search_params->group_limit) { + uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id); + if(a_facet.result_map[fhash].groups.count(distinct_id) == 0) { + a_facet.result_map[fhash].groups.emplace(distinct_id, 0); + } + a_facet.result_map[fhash].groups[distinct_id] += 1; + } else { + a_facet.result_map[fhash].count += 1; + } + if(use_facet_query) { a_facet.result_map[fhash].query_token_pos = query_token_positions; - - /*if(j == 11) { - for(auto xx: query_token_positions) { - std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl; - } - }*/ } } array_pos++; - fvalue_found = 0; + fvalue_found = false; std::stringstream().swap(fvaluestream); spp::sparse_hash_map().swap(query_token_positions); field_token_index = -1; @@ -781,55 +779,10 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, } } -void Index::drop_facets(std::vector & facets, const std::vector & ids) { - std::map facet_to_index; - - size_t i_facet = 0; - for(const auto & facet: facet_schema) { - facet_to_index[facet.first] = i_facet; - i_facet++; - } - - for(auto & a_facet: facets) { - const field & facet_field = facet_schema.at(a_facet.field_name); - - // assumed that facet fields have already been validated upstream - for(const uint32_t doc_seq_id: ids) { - if(facet_index_v2.count(doc_seq_id) != 0) { - // FORMAT OF VALUES - // String: h1 h2 h3 - // String array: h1 h2 h3 0 h1 0 h1 h2 0 - const std::vector & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]]; - std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens) - - for(size_t j = 0; j < fhashes.size(); j++) { - if(fhashes[j] != FACET_ARRAY_DELIMETER) { - int64_t ftoken_hash = fhashes[j]; - fvaluestream << ftoken_hash; - } - - if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && - j == fhashes.size() - 1)) { - const std::string & fvalue_str = fvaluestream.str(); - uint64_t fhash = facet_token_hash(facet_field, fvalue_str); - - if(a_facet.result_map.count(fhash) != 0) { - a_facet.result_map[fhash].count -= 1; - if(a_facet.result_map[fhash].count == 0) { - a_facet.result_map.erase(fhash); - } - } - std::stringstream().swap(fvaluestream); - } - } - } - } - } -} - void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, const std::vector & sort_fields, - std::vector & token_candidates_vec, const token_ordering token_order, + std::vector & token_candidates_vec, std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold) { @@ -874,6 +827,15 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si continue; } + if(!curated_ids.empty()) { + uint32_t *excluded_result_ids = nullptr; + result_size = ArrayUtils::exclude_scalar(result_ids, result_size, &curated_ids[0], + curated_ids.size(), &excluded_result_ids); + + delete [] result_ids; + result_ids = excluded_result_ids; + } + if(filter_ids != nullptr) { // intersect once again with filter ids uint32_t* filtered_result_ids = nullptr; @@ -1176,7 +1138,14 @@ void Index::search(Option & outcome, return ; } - const uint32_t filter_ids_length = op_filter_ids_length.get(); + uint32_t filter_ids_length = op_filter_ids_length.get(); + + // we will be removing all curated IDs from organic results 
before running topster + std::set curated_ids(included_ids.begin(), included_ids.end()); + curated_ids.insert(excluded_ids.begin(), excluded_ids.end()); + + std::vector curated_ids_sorted(curated_ids.begin(), curated_ids.end()); + std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end()); // Order of `fields` are used to sort results //auto begin = std::chrono::high_resolution_clock::now(); @@ -1186,11 +1155,22 @@ void Index::search(Option & outcome, const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0); const std::string & field = search_fields[0]; + if(!curated_ids.empty()) { + uint32_t *excluded_result_ids = nullptr; + filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0], + curated_ids.size(), &excluded_result_ids); + + delete [] filter_ids; + filter_ids = excluded_result_ids; + } + score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {}, filter_ids, filter_ids_length); collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); - do_facets(facets, facet_query, filter_ids, filter_ids_length); + all_result_ids_len = filter_ids_length; + all_result_ids = filter_ids; + filter_ids = nullptr; } else { const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM); for(size_t i = 0; i < num_search_fields; i++) { @@ -1199,50 +1179,33 @@ void Index::search(Option & outcome, const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i); // Order of `fields` are used to sort results const std::string & field = search_fields[i]; - search_field(field_id, query, field, filter_ids, filter_ids_length, facets, sort_fields_std, + search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std, num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len, token_order, prefix, drop_tokens_threshold, typo_tokens_threshold); collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); } } - do_facets(facets, facet_query, all_result_ids, all_result_ids_len); } + do_facets(facets, facet_query, all_result_ids, all_result_ids_len); do_facets(facets, facet_query, &included_ids[0], included_ids.size()); // must be sorted before iterated upon to remove "empty" array entries topster->sort(); curated_topster->sort(); - std::set ids_to_remove(included_ids.begin(), included_ids.end()); - ids_to_remove.insert(excluded_ids.begin(), excluded_ids.end()); - - std::vector dropped_ids; - // loop through topster and remove elements from included and excluded id lists if(topster->distinct) { for(auto &group_topster_entry: topster->group_kv_map) { Topster* group_topster = group_topster_entry.second; - for(uint32_t t = 0; t < group_topster->size; t++) { - KV* kv = group_topster->getKV(t); - if(ids_to_remove.count(kv->key) != 0) { - dropped_ids.push_back((uint32_t)kv->key); - } - } - const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); raw_result_kvs.emplace_back(group_kvs); } } else { for(uint32_t t = 0; t < topster->size; t++) { KV* kv = topster->getKV(t); - - if(ids_to_remove.count(kv->key) != 0) { - dropped_ids.push_back((uint32_t)kv->key); - } else { - raw_result_kvs.push_back({kv}); - } + raw_result_kvs.push_back({kv}); } } @@ -1252,9 +1215,6 @@ void Index::search(Option & outcome, } // for the ids that are dropped, remove their corresponding facet components from facet results - drop_facets(facets, dropped_ids); - - all_result_ids_len -= 
dropped_ids.size(); all_result_ids_len += curated_topster->size; delete [] filter_ids; @@ -1277,6 +1237,7 @@ void Index::search(Option & outcome, */ void Index::search_field(const uint8_t & field_id, const std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, @@ -1392,8 +1353,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) { // If all tokens were found, go ahead and search for candidates with what we have so far - search_candidates(field_id, filter_ids, filter_ids_length, sort_fields, token_candidates_vec, - token_order, searched_queries, topster, all_result_ids, all_result_ids_len, + search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec, + searched_queries, topster, all_result_ids, all_result_ids_len, typo_tokens_threshold); } @@ -1426,7 +1387,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co truncated_query += " " + token_count_pairs.at(i).first; } - return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos, + return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids, + facets, sort_fields, num_typos, searched_queries, topster, all_result_ids, all_result_ids_len, token_order, prefix); } @@ -1573,21 +1535,7 @@ void Index::score_results(const std::vector & sort_fields, const uint16 uint64_t distinct_id = seq_id; if(search_params->group_limit != 0) { - distinct_id = 1; // some constant initial value - - // calculate hash from group_by_fields - for(const auto& field: search_params->group_by_fields) { - if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) { - continue; - } - - size_t facet_id = facet_to_id[field]; - const std::vector& fhashes = facet_index_v2.at(seq_id)[facet_id]; - - for(const auto& hash: fhashes) { - distinct_id = hash_combine(distinct_id, hash); - } - } + distinct_id = get_distinct_id(facet_to_id, seq_id); } KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores); @@ -1603,6 +1551,26 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } } +uint64_t Index::get_distinct_id(const std::unordered_map &facet_to_id, + const uint32_t seq_id) const { + uint64_t distinct_id = 1; // some constant initial value + + // calculate hash from group_by_fields + for(const auto& field: search_params->group_by_fields) { + if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) { + continue; + } + + size_t facet_id = facet_to_id.at(field); + const std::vector& fhashes = facet_index_v2.at(seq_id)[facet_id]; + + for(const auto& hash: fhashes) { + distinct_id = hash_combine(distinct_id, hash); + } + } + return distinct_id; +} + void Index::populate_token_positions(const std::vector &query_suggestion, spp::sparse_hash_map &leaf_to_indices, size_t result_index, diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index a94bfbc0..49824142 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -219,6 +219,8 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { spp::sparse_hash_set(), 
spp::sparse_hash_set(), 10, "starring: scott").get(); + ASSERT_EQ(9, results["found"].get()); + // "count" would be `2` without exclusion ASSERT_EQ("Scott Glenn", results["facet_counts"][0]["counts"][0]["highlighted"].get()); ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get()); @@ -233,7 +235,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "starring: scott").get(); - ASSERT_EQ(10, results["found"].get()); + ASSERT_EQ(9, results["found"].get()); ASSERT_EQ(0, results["hits"].size()); coll_mul_fields->remove_override("exclude-rule"); From 3b68a815159176855808e8ff812c904ac55591f9 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 14 Jun 2020 15:20:31 +0530 Subject: [PATCH 10/38] Ensure timeouts are cleared before destroying evloop. --- src/http_server.cpp | 31 ++----------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/src/http_server.cpp b/src/http_server.cpp index 37b6aae0..8058e32d 100644 --- a/src/http_server.cpp +++ b/src/http_server.cpp @@ -186,20 +186,6 @@ std::string HttpServer::get_version() { void HttpServer::clear_timeouts(const std::vector & timers, bool trigger_callback) { for(h2o_timer_t* timer: timers) { h2o_timer_unlink(timer); - /*while (!h2o_linklist_is_empty(&timer->_link)) { - h2o_timer_t *entry = H2O_STRUCT_FROM_MEMBER(h2o_timer_t, _link, timer->_link.next); - if(entry == nullptr) { - continue; - } - - if(trigger_callback) { - entry->cb(entry); - } - - //entry->expire_at = 0; - h2o_linklist_unlink(&entry->_link); - h2o_timer_unlink(timer); - }*/ } } @@ -476,26 +462,13 @@ void HttpServer::on(const std::string & message, bool (*handler)(void*)) { HttpServer::~HttpServer() { delete message_dispatcher; - // remove all timeouts defined in: https://github.com/h2o/h2o/blob/v2.2.2/lib/core/context.c#L142 - /*std::vector timeouts = { - &ctx.zero_timeout, - &ctx.one_sec_timeout, - &ctx.hundred_ms_timeout, - &ctx.handshake_timeout, - &ctx.http1.req_timeout, - &ctx.http2.idle_timeout, - &ctx.http2.graceful_shutdown_timeout, - &ctx.proxy.io_timeout - }; - - clear_timeouts(timeouts); - */ - if(ssl_refresh_timer.timer.expire_at != 0) { // avoid callback since it recreates timeout clear_timeouts({&ssl_refresh_timer.timer}, false); } + h2o_timerwheel_run(ctx.loop->_timeouts, 9999999999999); + h2o_context_dispose(&ctx); free(ctx.globalconf->server_name.base); From 1c398fac7ef48338f8fa2b98c5db763c0536c8a7 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 14 Jun 2020 15:21:54 +0530 Subject: [PATCH 11/38] Don't require query fields for wildcard query. 
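
A `q=*` query does not search any specific field, so there is nothing for `query_by` to qualify. The parameter check in get_search() is therefore skipped when the query is `*`, and a request as bare as the following now validates (collection name illustrative):

    GET /collections/companies/documents/search?q=*

`query_by` remains mandatory for every non-wildcard query.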
--- src/core_api.cpp | 2 +- test/collection_test.cpp | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/core_api.cpp b/src/core_api.cpp index 8ba15bc2..18c96629 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -252,7 +252,7 @@ bool get_search(http_req & req, http_res & res) { return false; } - if(req.params.count(QUERY_BY) == 0) { + if(req.params.count(QUERY_BY) == 0 && req.params[QUERY] != "*") { res.set_400(std::string("Parameter `") + QUERY_BY + "` is required."); return false; } diff --git a/test/collection_test.cpp b/test/collection_test.cpp index f5c9bf09..dc904ce5 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -455,6 +455,12 @@ TEST_F(CollectionTest, WildcardQuery) { std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } + + // wildcard query should not require a search field + results_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false).get(); + ASSERT_TRUE(results_op.ok()); + ASSERT_EQ(3, results["hits"].size()); + ASSERT_EQ(25, results["found"].get()); } TEST_F(CollectionTest, PrefixSearching) { From c5010a6a5f282c1a8deec3b70a741d9c551e8a9b Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 14 Jun 2020 17:16:01 +0530 Subject: [PATCH 12/38] Results count should match group size for group query. --- TODO.md | 5 +++-- include/field.h | 2 +- include/index.h | 21 ++++++++++------- src/array_utils.cpp | 4 +++- src/art.cpp | 2 -- src/collection.cpp | 55 ++++++++++++++++++++++++++++----------------- src/index.cpp | 53 ++++++++++++++++++++++--------------------- 7 files changed, 82 insertions(+), 60 deletions(-) diff --git a/TODO.md b/TODO.md index b031a788..adeea108 100644 --- a/TODO.md +++ b/TODO.md @@ -97,9 +97,10 @@ - ~~Have a LOG(ERROR) level~~ - ~~Handle SIGTERM which is sent when process is killed~~ - ~~Use snappy compression for storage~~ +- ~~Fix exclude_scalar early returns~~ +- ~~Fix result ids length during grouped overrides~~ +- Fix override grouping (collate_included_ids) - Test for overriding result on second page -- Fix exclude_scalar early returns -- Fix result ids length during grouped overrides - atleast 1 token match for proceeding with drop tokens - support wildcard query with filters - API for optimizing on disk storage diff --git a/include/field.h b/include/field.h index 6255aed5..44761e23 100644 --- a/include/field.h +++ b/include/field.h @@ -146,7 +146,7 @@ struct token_pos_cost_t { struct facet_count_t { uint32_t count; - spp::sparse_hash_map groups; // used for faceting grouped results + spp::sparse_hash_set groups; // used for faceting grouped results // used to fetch the actual document and value for representation uint32_t doc_id; diff --git a/include/index.h b/include/index.h index 4baa1eed..65cf951e 100644 --- a/include/index.h +++ b/include/index.h @@ -42,6 +42,7 @@ struct search_args { std::vector group_by_fields; size_t group_limit; size_t all_result_ids_len; + spp::sparse_hash_set groups_processed; std::vector> searched_queries; Topster* topster; Topster* curated_topster; @@ -168,9 +169,9 @@ private: const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, - Topster* topster, uint32_t** all_result_ids, - size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY, - const bool prefix = false, + Topster* topster, spp::sparse_hash_set& groups_processed, + uint32_t** all_result_ids, size_t & all_result_ids_len, + const token_ordering token_order = 
FREQUENCY, const bool prefix = false, const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD, const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD); @@ -178,7 +179,8 @@ private: const std::vector& curated_ids, const std::vector & sort_fields, std::vector & token_to_candidates, std::vector> & searched_queries, - Topster* topster, uint32_t** all_result_ids, + Topster* topster, spp::sparse_hash_set& groups_processed, + uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold); @@ -210,9 +212,9 @@ private: void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted, const uint32_t indices_length); - void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::vector & included_ids, - Topster* curated_topster, std::vector> & searched_queries); + void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, + const std::vector & included_ids, + Topster* curated_topster, std::vector> & searched_queries); uint64_t facet_token_hash(const field & a_field, const std::string &token); @@ -242,7 +244,9 @@ public: Topster* topster, Topster* curated_topster, const size_t per_page, const size_t page, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, - size_t & all_result_ids_len, std::vector> & searched_queries, + size_t & all_result_ids_len, + spp::sparse_hash_set& groups_processed, + std::vector> & searched_queries, std::vector> & raw_result_kvs, std::vector & override_result_kvs, const size_t typo_tokens_threshold); @@ -257,6 +261,7 @@ public: void score_results(const std::vector & sort_fields, const uint16_t & query_index, const uint8_t & field_id, const uint32_t total_cost, Topster* topster, const std::vector & query_suggestion, + spp::sparse_hash_set& groups_processed, const uint32_t *result_ids, const size_t result_size) const; static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field); diff --git a/src/array_utils.cpp b/src/array_utils.cpp index 62fe2d87..5a88dffd 100644 --- a/src/array_utils.cpp +++ b/src/array_utils.cpp @@ -107,10 +107,12 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA, size_t indexA = 0, indexB = 0, res_index = 0; if(A == nullptr && B == nullptr) { - return 0; + *out = nullptr; + return 0; } if(A == nullptr) { + *out = nullptr; return 0; } diff --git a/src/art.cpp b/src/art.cpp index 7ac945c0..1da84cf1 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -1384,8 +1384,6 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, art_fuzzy_recurse(0, 0, t->root, -1, term, term_len, irow, jrow, min_cost, max_cost, prefix, nodes); } - PROCESS_NODES: - if(token_order == FREQUENCY) { std::sort(nodes.begin(), nodes.end(), compare_art_node_frequency); } else { diff --git a/src/collection.cpp b/src/collection.cpp index 6e50fbac..24547c62 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -653,6 +653,7 @@ Option Collection::search(const std::string & query, const std:: std::vector override_result_kvs; size_t total_found = 0; + spp::sparse_hash_set groups_processed; // used to calculate total_found for grouped query // send data to individual index threads size_t index_id = 0; @@ -709,28 +710,22 @@ Option Collection::search(const std::string & query, const std:: auto & acc_facet = facets[fi]; for(auto & facet_kv: this_facet.result_map) { - size_t count = 0; - - - 
// for grouping we have to aggregate group counts to a count value - /*if(search_params->group_limit) { - // for every facet - for(auto& a_facet: facets) { - // for every facet value - for(auto& fvalue: a_facet.result_map) { - fvalue.second.count = fvalue.second.groups.size(); - } - } - }*/ - - if(acc_facet.result_map.count(facet_kv.first) == 0) { - // not found, so set it - count = facet_kv.second.count; + if(index->search_params->group_limit) { + // we have to add all group sets + acc_facet.result_map[facet_kv.first].groups.insert( + facet_kv.second.groups.begin(), facet_kv.second.groups.end() + ); } else { - count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count; + size_t count = 0; + if(acc_facet.result_map.count(facet_kv.first) == 0) { + // not found, so set it + count = facet_kv.second.count; + } else { + count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count; + } + acc_facet.result_map[facet_kv.first].count = count; } - acc_facet.result_map[facet_kv.first].count = count; acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id; acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos; acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos; @@ -744,7 +739,25 @@ Option Collection::search(const std::string & query, const std:: } } - total_found += index->search_params->all_result_ids_len; + if(group_limit) { + groups_processed.insert( + index->search_params->groups_processed.begin(), + index->search_params->groups_processed.end() + ); + } else { + total_found += index->search_params->all_result_ids_len; + } + } + + // for grouping we have to aggregate group set sizes to a count value + if(group_limit) { + for(auto& acc_facet: facets) { + for(auto& facet_kv: acc_facet.result_map) { + facet_kv.second.count = facet_kv.second.groups.size(); + } + } + + total_found = groups_processed.size(); } if(!index_search_op.ok()) { @@ -753,7 +766,7 @@ Option Collection::search(const std::string & query, const std:: Topster* aggr_topster = nullptr; - if(group_limit > 0) { + if(group_limit) { // group by query requires another round of topster-ing // needs to be atleast 1 since scoring is mandatory diff --git a/src/index.cpp b/src/index.cpp index 5c0f8546..fcbbc478 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -744,7 +744,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, uint64_t fhash = facet_token_hash(facet_field, fvalue_str); if(a_facet.result_map.count(fhash) == 0) { - a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_map(), + a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_set(), doc_seq_id, 0, spp::sparse_hash_map()}; } @@ -754,10 +754,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, if(search_params->group_limit) { uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id); - if(a_facet.result_map[fhash].groups.count(distinct_id) == 0) { - a_facet.result_map[fhash].groups.emplace(distinct_id, 0); - } - a_facet.result_map[fhash].groups[distinct_id] += 1; + a_facet.result_map[fhash].groups.emplace(distinct_id); } else { a_facet.result_map[fhash].count += 1; } @@ -784,6 +781,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si const std::vector & sort_fields, std::vector & token_candidates_vec, std::vector> & searched_queries, Topster* topster, + spp::sparse_hash_set& groups_processed, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t 
typo_tokens_threshold) { const long long combination_limit = 10; @@ -850,7 +848,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si // go through each matching document id and calculate match score score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion, - filtered_result_ids, filtered_results_size); + groups_processed, filtered_result_ids, filtered_results_size); delete[] filtered_result_ids; delete[] result_ids; @@ -862,7 +860,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si *all_result_ids = new_all_result_ids; score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion, - result_ids, result_size); + groups_processed, result_ids, result_size); delete[] result_ids; } @@ -1024,7 +1022,8 @@ void Index::run_search() { search_params->topster, search_params->curated_topster, search_params->per_page, search_params->page, search_params->token_order, search_params->prefix, search_params->drop_tokens_threshold, - search_params->all_result_ids_len, search_params->searched_queries, + search_params->all_result_ids_len, search_params->groups_processed, + search_params->searched_queries, search_params->raw_result_kvs, search_params->override_result_kvs, search_params->typo_tokens_threshold); @@ -1038,12 +1037,12 @@ void Index::run_search() { } } -void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::vector & included_ids, - Topster* curated_topster, - std::vector> & searched_queries) { +void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, + const std::vector & included_ids, + Topster* curated_topster, + std::vector> & searched_queries) { - if(included_ids.size() == 0) { + if(included_ids.empty()) { return; } @@ -1106,9 +1105,9 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f KV kv(field_id, searched_queries.size(), seq_id, seq_id, match_score, scores); curated_topster->add(&kv); - - searched_queries.push_back(override_query); } + + searched_queries.push_back(override_query); } void Index::search(Option & outcome, @@ -1124,7 +1123,8 @@ void Index::search(Option & outcome, const size_t per_page, const size_t page, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, size_t & all_result_ids_len, - std::vector> & searched_queries, + spp::sparse_hash_set& groups_processed, + std::vector>& searched_queries, std::vector> & raw_result_kvs, std::vector & override_result_kvs, const size_t typo_tokens_threshold) { @@ -1140,7 +1140,7 @@ void Index::search(Option & outcome, uint32_t filter_ids_length = op_filter_ids_length.get(); - // we will be removing all curated IDs from organic results before running topster + // we will be removing all curated IDs from organic result ids before running topster std::set curated_ids(included_ids.begin(), included_ids.end()); curated_ids.insert(excluded_ids.begin(), excluded_ids.end()); @@ -1165,8 +1165,8 @@ void Index::search(Option & outcome, } score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {}, - filter_ids, filter_ids_length); - collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); + groups_processed, filter_ids, filter_ids_length); + collate_included_ids(query, field, field_id, included_ids, curated_topster, 
searched_queries); all_result_ids_len = filter_ids_length; all_result_ids = filter_ids; @@ -1180,9 +1180,9 @@ void Index::search(Option & outcome, const std::string & field = search_fields[i]; search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std, - num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len, + num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len, token_order, prefix, drop_tokens_threshold, typo_tokens_threshold); - collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); + collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries); } } } @@ -1214,7 +1214,7 @@ void Index::search(Option & outcome, override_result_kvs.push_back(kv); } - // for the ids that are dropped, remove their corresponding facet components from facet results + // add curated IDs to result count all_result_ids_len += curated_topster->size; delete [] filter_ids; @@ -1240,7 +1240,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, - Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, + Topster* topster, spp::sparse_hash_set& groups_processed, + uint32_t** all_result_ids, size_t & all_result_ids_len, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) { std::vector tokens; @@ -1354,7 +1355,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) { // If all tokens were found, go ahead and search for candidates with what we have so far search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec, - searched_queries, topster, all_result_ids, all_result_ids_len, + searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, typo_tokens_threshold); } @@ -1389,7 +1390,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids, facets, sort_fields, num_typos, - searched_queries, topster, all_result_ids, all_result_ids_len, + searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, token_order, prefix); } } @@ -1417,6 +1418,7 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect void Index::score_results(const std::vector & sort_fields, const uint16_t & query_index, const uint8_t & field_id, const uint32_t total_cost, Topster* topster, const std::vector &query_suggestion, + spp::sparse_hash_set& groups_processed, const uint32_t *result_ids, const size_t result_size) const { spp::sparse_hash_map leaf_to_indices; @@ -1536,6 +1538,7 @@ void Index::score_results(const std::vector & sort_fields, const uint16 if(search_params->group_limit != 0) { distinct_id = get_distinct_id(facet_to_id, seq_id); + groups_processed.emplace(distinct_id); } KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores); From 3f6f13baf1c7c6c39e6618167fd2735595c5c8a2 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 15 Jun 2020 19:20:00 +0530 Subject: [PATCH 13/38] Support for grouping overrides. 
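
With grouping, one result position can hold several documents, so pinned_hits changes shape from `id -> position` to `position -> [ids]`, and populate_overrides() now resolves all precedence up front: hidden hits are excluded first, an override's drop_hits are excluded before its add_hits are considered, and an id is included only if it has not already been excluded. The curated topster is also constructed with group_limit so that collate_included_ids() groups pinned documents the same way organic hits are grouped. A condensed sketch of the precedence rules follows — doc-id resolution and override matching are omitted and names are simplified:

    #include <cstdint>
    #include <map>
    #include <set>
    #include <vector>

    // positions map to lists of ids so grouped results can pin
    // several documents to a single slot
    using include_map_t = std::map<size_t, std::vector<uint32_t>>;

    void resolve_overrides(const include_map_t& pinned_hits,
                           const std::vector<uint32_t>& hidden_hits,
                           include_map_t& include_ids,
                           std::vector<uint32_t>& excluded_ids) {
        // hidden hits take precedence over everything below
        std::set<uint32_t> excluded_set(hidden_hits.begin(), hidden_hits.end());
        excluded_ids.assign(excluded_set.begin(), excluded_set.end());

        // a pinned id is honoured only if it was not excluded already;
        // override add_hits follow the same rule in the real code
        for (const auto& pos_ids : pinned_hits) {
            for (uint32_t seq_id : pos_ids.second) {
                if (excluded_set.count(seq_id) == 0) {
                    include_ids[pos_ids.first].push_back(seq_id);
                }
            }
        }
    }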
--- TODO.md | 4 +- include/collection.h | 8 +- include/index.h | 16 ++-- src/collection.cpp | 129 +++++++++++++++++++----------- src/core_api.cpp | 5 +- src/index.cpp | 86 +++++++++----------- test/collection_override_test.cpp | 23 +++++- 7 files changed, 159 insertions(+), 112 deletions(-) diff --git a/TODO.md b/TODO.md index adeea108..8c3bed2d 100644 --- a/TODO.md +++ b/TODO.md @@ -99,8 +99,8 @@ - ~~Use snappy compression for storage~~ - ~~Fix exclude_scalar early returns~~ - ~~Fix result ids length during grouped overrides~~ -- Fix override grouping (collate_included_ids) -- Test for overriding result on second page +- ~~Fix override grouping (collate_included_ids)~~ +- ~~Test for overriding result on second page~~ - atleast 1 token match for proceeding with drop tokens - support wildcard query with filters - API for optimizing on disk storage diff --git a/include/collection.h b/include/collection.h index 4e06131d..648ca384 100644 --- a/include/collection.h +++ b/include/collection.h @@ -155,10 +155,10 @@ private: void remove_document(nlohmann::json & document, const uint32_t seq_id, bool remove_from_store); void populate_overrides(std::string query, - const std::map& pinned_hits, + const std::map>& pinned_hits, const std::vector& hidden_hits, - std::map & id_pos_map, - std::vector & included_ids, std::vector & excluded_ids); + std::map>& include_ids, + std::vector & excluded_ids); static bool facet_count_compare(const std::pair& a, const std::pair& b) { @@ -236,7 +236,7 @@ public: const size_t snippet_threshold = 30, const std::string & highlight_full_fields = "", size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD, - const std::map& pinned_hits={}, + const std::map>& pinned_hits={}, const std::vector& hidden_hits={}, const std::vector& group_by_fields={}, const size_t group_limit = 0); diff --git a/include/index.h b/include/index.h index 65cf951e..f2eb5fcc 100644 --- a/include/index.h +++ b/include/index.h @@ -27,7 +27,7 @@ struct search_args { std::vector search_fields; std::vector filters; std::vector facets; - std::vector included_ids; + std::map> included_ids; std::vector excluded_ids; std::vector sort_fields_std; facet_query_t facet_query; @@ -47,7 +47,7 @@ struct search_args { Topster* topster; Topster* curated_topster; std::vector> raw_result_kvs; - std::vector override_result_kvs; + std::vector> override_result_kvs; Option outcome; search_args(): outcome(0) { @@ -55,7 +55,7 @@ struct search_args { } search_args(std::string query, std::vector search_fields, std::vector filters, - std::vector facets, std::vector included_ids, std::vector excluded_ids, + std::vector facets, std::map> included_ids, std::vector excluded_ids, std::vector sort_fields_std, facet_query_t facet_query, int num_typos, size_t max_facet_values, size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix, size_t drop_tokens_threshold, size_t typo_tokens_threshold, @@ -70,7 +70,7 @@ struct search_args { const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory topster = new Topster(topster_size, group_limit); - curated_topster = new Topster(topster_size); + curated_topster = new Topster(topster_size, group_limit); } ~search_args() { @@ -213,7 +213,7 @@ private: const uint32_t indices_length); void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::vector & included_ids, + const std::map> & included_ids_map, Topster* curated_topster, std::vector> & 
searched_queries); uint64_t facet_token_hash(const field & a_field, const std::string &token); @@ -239,7 +239,8 @@ public: void search(Option & outcome, const std::string & query, const std::vector & search_fields, const std::vector & filters, std::vector & facets, facet_query_t & facet_query, - const std::vector & included_ids, const std::vector & excluded_ids, + const std::map> & included_ids_map, + const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, Topster* topster, Topster* curated_topster, const size_t per_page, const size_t page, const token_ordering token_order, @@ -247,7 +248,8 @@ public: size_t & all_result_ids_len, spp::sparse_hash_set& groups_processed, std::vector> & searched_queries, - std::vector> & raw_result_kvs, std::vector & override_result_kvs, + std::vector> & raw_result_kvs, + std::vector> & override_result_kvs, const size_t typo_tokens_threshold); Option remove(const uint32_t seq_id, nlohmann::json & document); diff --git a/src/collection.cpp b/src/collection.cpp index 24547c62..a51a7e29 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -305,54 +305,70 @@ void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash } void Collection::populate_overrides(std::string query, - const std::map& pinned_hits, + const std::map>& pinned_hits, const std::vector& hidden_hits, - std::map & id_pos_map, - std::vector & included_ids, + std::map>& include_ids, std::vector & excluded_ids) { StringUtils::tolowercase(query); + std::set excluded_set; + + // If pinned or hidden hits are provided, they take precedence over overrides + + // have to ensure that hidden hits take precedence over included hits + if(!hidden_hits.empty()) { + for(const auto & hit: hidden_hits) { + Option seq_id_op = doc_id_to_seq_id(hit); + if(seq_id_op.ok()) { + excluded_ids.push_back(seq_id_op.get()); + excluded_set.insert(seq_id_op.get()); + } + } + } for(const auto & override_kv: overrides) { const auto & override = override_kv.second; if( (override.rule.match == override_t::MATCH_EXACT && override.rule.query == query) || (override.rule.match == override_t::MATCH_CONTAINS && query.find(override.rule.query) != std::string::npos) ) { - for(const auto & hit: override.add_hits) { - Option seq_id_op = doc_id_to_seq_id(hit.doc_id); - if(seq_id_op.ok()) { - included_ids.push_back(seq_id_op.get()); - id_pos_map[seq_id_op.get()] = hit.position; - } - } + // have to ensure that dropped hits take precedence over added hits for(const auto & hit: override.drop_hits) { Option seq_id_op = doc_id_to_seq_id(hit.doc_id); if(seq_id_op.ok()) { excluded_ids.push_back(seq_id_op.get()); + excluded_set.insert(seq_id_op.get()); } } + + for(const auto & hit: override.add_hits) { + Option seq_id_op = doc_id_to_seq_id(hit.doc_id); + if(!seq_id_op.ok()) { + continue; + } + uint32_t seq_id = seq_id_op.get(); + bool excluded = (excluded_set.count(seq_id) != 0); + if(!excluded) { + include_ids[hit.position].push_back(seq_id); + } + } + + break; } } - // If pinned or hidden hits are provided, they take precedence over overrides - if(!pinned_hits.empty()) { - for(const auto & hit: pinned_hits) { - Option seq_id_op = doc_id_to_seq_id(hit.first); - if(seq_id_op.ok()) { - included_ids.push_back(seq_id_op.get()); - id_pos_map[seq_id_op.get()] = hit.second; - } - } - } - - if(!hidden_hits.empty()) { - for(const auto & hit: hidden_hits) { - Option seq_id_op = doc_id_to_seq_id(hit); - if(seq_id_op.ok()) { - included_ids.erase(std::remove(included_ids.begin(), included_ids.end(), 
seq_id_op.get()), included_ids.end()); - id_pos_map.erase(seq_id_op.get()); - excluded_ids.push_back(seq_id_op.get()); + for(const auto& pos_ids: pinned_hits) { + size_t pos = pos_ids.first; + for(const std::string& id: pos_ids.second) { + Option seq_id_op = doc_id_to_seq_id(id); + if(!seq_id_op.ok()) { + continue; + } + uint32_t seq_id = seq_id_op.get(); + bool excluded = (excluded_set.count(seq_id) != 0); + if(!excluded) { + include_ids[pos].push_back(seq_id); + } } } } @@ -371,22 +387,40 @@ Option Collection::search(const std::string & query, const std:: const size_t snippet_threshold, const std::string & highlight_full_fields, size_t typo_tokens_threshold, - const std::map& pinned_hits, + const std::map>& pinned_hits, const std::vector& hidden_hits, const std::vector& group_by_fields, const size_t group_limit) { - std::vector included_ids; std::vector excluded_ids; - std::map id_pos_map; - populate_overrides(query, pinned_hits, hidden_hits, id_pos_map, included_ids, excluded_ids); + std::map> include_ids; // position => list of IDs + populate_overrides(query, pinned_hits, hidden_hits, include_ids, excluded_ids); - std::map> index_to_included_ids; + /*for(auto kv: include_ids) { + LOG(INFO) << "key: " << kv.first; + for(auto val: kv.second) { + LOG(INFO) << val; + } + } + + LOG(INFO) << "Excludes:"; + + for(auto id: excluded_ids) { + LOG(INFO) << id; + }*/ + + //LOG(INFO) << "include_ids size: " << include_ids.size(); + //LOG(INFO) << "Pos 1: " << include_ids[1][0]; + + std::map>> index_to_included_ids; std::map> index_to_excluded_ids; - for(auto seq_id: included_ids) { - auto index_id = (seq_id % num_indices); - index_to_included_ids[index_id].push_back(seq_id); + for(const auto& pos_ids: include_ids) { + size_t position = pos_ids.first; + for(auto seq_id: pos_ids.second) { + auto index_id = (seq_id % num_indices); + index_to_included_ids[index_id][position].push_back(seq_id); + } } for(auto seq_id: excluded_ids) { @@ -650,7 +684,7 @@ Option Collection::search(const std::string & query, const std:: std::vector> searched_queries; // search queries used for generating the results std::vector> raw_result_kvs; - std::vector override_result_kvs; + std::vector> override_result_kvs; size_t total_found = 0; spp::sparse_hash_set groups_processed; // used to calculate total_found for grouped query @@ -697,9 +731,9 @@ Option Collection::search(const std::string & query, const std:: raw_result_kvs.push_back(kv_group); } - for(auto & field_order_kv: index->search_params->override_result_kvs) { - field_order_kv->query_index += searched_queries.size(); - override_result_kvs.push_back(field_order_kv); + for(const std::vector & kv_group: index->search_params->override_result_kvs) { + kv_group[0]->query_index += searched_queries.size(); + override_result_kvs.push_back(kv_group); } searched_queries.insert(searched_queries.end(), index->search_params->searched_queries.begin(), @@ -797,8 +831,8 @@ Option Collection::search(const std::string & query, const std:: // Sort based on position in overridden list std::sort( override_result_kvs.begin(), override_result_kvs.end(), - [&id_pos_map](const KV* a, const KV* b) -> bool { - return id_pos_map[a->key] < id_pos_map[b->key]; + [](const std::vector& a, std::vector& b) -> bool { + return a[0]->distinct_key < b[0]->distinct_key; } ); @@ -808,11 +842,12 @@ Option Collection::search(const std::string & query, const std:: // merge raw results and override results while(override_kv_index < override_result_kvs.size() && raw_results_index < raw_result_kvs.size()) { - 
if(override_kv_index < override_result_kvs.size() && - id_pos_map.count(override_result_kvs[override_kv_index]->key) != 0 && - result_group_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index]->key]) { - result_group_kvs.push_back({override_result_kvs[override_kv_index]}); - override_kv_index++; + size_t result_position = result_group_kvs.size() + 1; + uint64_t override_position = override_result_kvs[override_kv_index][0]->distinct_key; + + if(result_position == override_position) { + result_group_kvs.push_back(override_result_kvs[override_kv_index]); + override_kv_index++; } else { result_group_kvs.push_back(raw_result_kvs[raw_results_index]); raw_results_index++; diff --git a/src/core_api.cpp b/src/core_api.cpp index 18c96629..03f89a2f 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -390,7 +390,8 @@ bool get_search(http_req & req, http_res & res) { } } - std::map pinned_hits; + std::map> pinned_hits; + if(req.params.count(PINNED_HITS) != 0) { std::vector pinned_hits_strs; StringUtils::split(req.params[PINNED_HITS], pinned_hits_strs, ","); @@ -415,7 +416,7 @@ bool get_search(http_req & req, http_res & res) { return false; } - pinned_hits.emplace(expression_parts[0], position); + pinned_hits[position].emplace_back(expression_parts[0]); } } diff --git a/src/index.cpp b/src/index.cpp index fcbbc478..6fdee499 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1038,11 +1038,11 @@ void Index::run_search() { } void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::vector & included_ids, + const std::map> & included_ids_map, Topster* curated_topster, std::vector> & searched_queries) { - if(included_ids.empty()) { + if(included_ids_map.empty()) { return; } @@ -1061,50 +1061,28 @@ void Index::collate_included_ids(const std::string & query, const std::string & art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len, 0, 0, 1, token_ordering::MAX_SCORE, false, leaves); - if(leaves.size() > 0) { + if(!leaves.empty()) { override_query.push_back(leaves[0]); } } - spp::sparse_hash_map leaf_to_indices; + for(const auto& pos_ids: included_ids_map) { + const size_t pos = pos_ids.first; - for (art_leaf *token_leaf : override_query) { - uint32_t *indices = new uint32_t[included_ids.size()]; - token_leaf->values->ids.indexOf(&included_ids[0], included_ids.size(), indices); - leaf_to_indices.emplace(token_leaf, indices); - } + for(size_t i = 0; i < pos_ids.second.size(); i++) { + uint32_t seq_id = pos_ids.second[i]; - // curated_topster->MAX_SIZE is initialized based on max_hits. - // Even if override has more IDs, we should restrict to max hits. 
- size_t iter_size = std::min((size_t)curated_topster->MAX_SIZE, included_ids.size()); + uint64_t distinct_id = pos; // position is the group distinct key + uint64_t match_score = (64000 - i); // index within a group is the match score - for(size_t j=0; j>> array_token_positions; - populate_token_positions(override_query, leaf_to_indices, j, array_token_positions); - - uint64_t match_score = 0; - - for(const std::vector> & token_positions: array_token_positions) { - if(token_positions.empty()) { - continue; - } - const Match & match = Match::match(seq_id, token_positions); - uint64_t this_match_score = match.get_match_score(0, field_id); - - if(this_match_score > match_score) { - match_score = this_match_score; - } + KV kv(field_id, searched_queries.size(), seq_id, distinct_id, match_score, scores); + curated_topster->add(&kv); } - - int64_t scores[3]; - scores[0] = int64_t(match_score); - scores[1] = int64_t(1); - scores[2] = int64_t(1); - - KV kv(field_id, searched_queries.size(), seq_id, seq_id, match_score, scores); - curated_topster->add(&kv); } searched_queries.push_back(override_query); @@ -1115,7 +1093,7 @@ void Index::search(Option & outcome, const std::vector & search_fields, const std::vector & filters, std::vector & facets, facet_query_t & facet_query, - const std::vector & included_ids, + const std::map> & included_ids_map, const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, Topster* topster, @@ -1126,7 +1104,7 @@ void Index::search(Option & outcome, spp::sparse_hash_set& groups_processed, std::vector>& searched_queries, std::vector> & raw_result_kvs, - std::vector & override_result_kvs, + std::vector> & override_result_kvs, const size_t typo_tokens_threshold) { // process the filters @@ -1141,7 +1119,16 @@ void Index::search(Option & outcome, uint32_t filter_ids_length = op_filter_ids_length.get(); // we will be removing all curated IDs from organic result ids before running topster - std::set curated_ids(included_ids.begin(), included_ids.end()); + std::set curated_ids; + std::vector included_ids; + + for(const auto& pos_ids: included_ids_map) { + for(const uint32_t id: pos_ids.second) { + curated_ids.insert(id); + included_ids.push_back(id); + } + } + curated_ids.insert(excluded_ids.begin(), excluded_ids.end()); std::vector curated_ids_sorted(curated_ids.begin(), curated_ids.end()); @@ -1166,7 +1153,7 @@ void Index::search(Option & outcome, score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {}, groups_processed, filter_ids, filter_ids_length); - collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries); + collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries); all_result_ids_len = filter_ids_length; all_result_ids = filter_ids; @@ -1182,7 +1169,7 @@ void Index::search(Option & outcome, search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std, num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len, token_order, prefix, drop_tokens_threshold, typo_tokens_threshold); - collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries); + collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries); } } } @@ -1202,16 +1189,23 @@ void Index::search(Option & outcome, const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); 
raw_result_kvs.emplace_back(group_kvs); } + + for(auto &curated_topster_entry: curated_topster->group_kv_map) { + Topster* group_topster = curated_topster_entry.second; + const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + override_result_kvs.emplace_back(group_kvs); + } + } else { for(uint32_t t = 0; t < topster->size; t++) { KV* kv = topster->getKV(t); raw_result_kvs.push_back({kv}); } - } - for(uint32_t t = 0; t < curated_topster->size; t++) { - KV* kv = curated_topster->getKV(t); - override_result_kvs.push_back(kv); + for(uint32_t t = 0; t < curated_topster->size; t++) { + KV* kv = curated_topster->getKV(t); + override_result_kvs.push_back({kv}); + } } // add curated IDs to result count diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index 49824142..eaf153f7 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -256,10 +256,9 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { } TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { - std::map pinned_hits; - std::vector hidden_hits; - pinned_hits["13"] = 1; - pinned_hits["4"] = 2; + std::map> pinned_hits; + pinned_hits[1] = {"13"}; + pinned_hits[2] = {"4"}; // basic pinning @@ -279,6 +278,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { // both pinning and hiding + std::vector hidden_hits; hidden_hits = {"11", "16"}; results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, @@ -291,6 +291,21 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { ASSERT_STREQ("4", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("6", results["hits"][2]["document"]["id"].get().c_str()); + // paginating such that pinned hits appear on second page + pinned_hits.clear(); + pinned_hits[4] = {"13"}; + pinned_hits[5] = {"4"}; + + results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "starring: will", 30, + "", 10, + pinned_hits, hidden_hits).get(); + + ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("13", results["hits"][1]["document"]["id"].get().c_str()); + // take precedence over override rules nlohmann::json override_json_include = { From ef405167902474342b6b719ba35a271081acb6c4 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Thu, 18 Jun 2020 06:17:10 +0530 Subject: [PATCH 14/38] Fixing an issue with grouping across indices. 
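Documents are partitioned across indices by `seq_id % num_indices`, so two
documents that belong to the same group can surface from different per-index
topsters, leaving one logical group split across the merged result list. The
fix re-adds every per-index KV into a single collection-level topster (the
new aggregate_topster below) so groups merge on their distinct key before the
final sort. A simplified model of that merge; SimpleKV and the plain map are
trimmed stand-ins for the real KV and Topster::group_kv_map:

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <vector>

    struct SimpleKV {
        uint64_t key;           // document seq_id
        uint64_t distinct_key;  // hash of the group_by field values
        uint64_t match_score;
    };

    int main() {
        // KVs as they arrive from two different per-index topsters;
        // seq_ids 0 and 1 share the group with distinct key 42
        const std::vector<SimpleKV> index0_kvs = {{0, 42, 10}, {4, 99, 30}};
        const std::vector<SimpleKV> index1_kvs = {{1, 42, 20}};

        // collection-level aggregation: bucket the KVs by distinct key
        std::map<uint64_t, std::vector<SimpleKV>> group_kv_map;
        for(const auto& kvs: {index0_kvs, index1_kvs}) {
            for(const auto& kv: kvs) {
                group_kv_map[kv.distinct_key].push_back(kv);
            }
        }

        // group 42 now holds members from both indices
        for(const auto& entry: group_kv_map) {
            std::cout << "group " << entry.first << " has "
                      << entry.second.size() << " hit(s)\n";
        }
    }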
---
 include/collection.h              |   4 +
 include/topster.h                 |   5 ++
 src/collection.cpp                | 120 ++++++++++++++++++------------
 src/index.cpp                     |  28 -------
 test/collection_grouping_test.cpp |  71 ++++++++++++++++++
 test/collection_override_test.cpp |  38 +++++++++-
 test/collection_test.cpp          |   1 -
 7 files changed, 189 insertions(+), 78 deletions(-)
 create mode 100644 test/collection_grouping_test.cpp

diff --git a/include/collection.h b/include/collection.h
index 648ca384..730bad63 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -282,5 +282,9 @@ public:
 
     void facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document,
                                std::string &value);
+
+    void aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const;
+
+    void populate_result_kvs(Topster *topster, std::vector> &result_kvs) const;
 };
 
diff --git a/include/topster.h b/include/topster.h
index 685e6d62..2e585a6b 100644
--- a/include/topster.h
+++ b/include/topster.h
@@ -69,6 +69,11 @@ struct Topster {
         for(auto& kv: group_kv_map) {
             delete kv.second;
         }
+
+        data = nullptr;
+        kvs = nullptr;
+
+        group_kv_map.clear();
     }
 
     static inline void swapMe(KV** a, KV** b) {
diff --git a/src/collection.cpp b/src/collection.cpp
index a51a7e29..ee23aefe 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -407,19 +407,30 @@ Option Collection::search(const std::string & query, const std::
 
     for(auto id: excluded_ids) {
         LOG(INFO) << id;
-    }*/
+    }
 
-    //LOG(INFO) << "include_ids size: " << include_ids.size();
-    //LOG(INFO) << "Pos 1: " << include_ids[1][0];
+    LOG(INFO) << "include_ids size: " << include_ids.size();
+    for(auto& group: include_ids) {
+        for(uint32_t& seq_id: group.second) {
+            LOG(INFO) << "seq_id: " << seq_id;
+        }
+
+        LOG(INFO) << "----";
+    }
+    */
 
     std::map>> index_to_included_ids;
     std::map> index_to_excluded_ids;
 
     for(const auto& pos_ids: include_ids) {
         size_t position = pos_ids.first;
+        size_t ids_per_pos = std::max(size_t(1), group_limit);
 
-        for(auto seq_id: pos_ids.second) {
+        for(size_t i = 0; i < std::min(ids_per_pos, pos_ids.second.size()); i++) {
+            auto seq_id = pos_ids.second[i];
             auto index_id = (seq_id % num_indices);
             index_to_included_ids[index_id][position].push_back(seq_id);
+            //LOG(INFO) << "Adding seq_id " << seq_id << " to index_id " << index_id;
         }
     }
 
     for(auto seq_id: excluded_ids) {
@@ -709,6 +720,12 @@ Option Collection::search(const std::string & query, const std::
 
     Option index_search_op({});  // stores the last error across all index threads
 
+    // for grouping we have to re-aggregate
+
+    const size_t topster_size = std::max((size_t)1, max_hits);
+    Topster topster(topster_size, group_limit);
+    Topster curated_topster(topster_size, group_limit);
+
     for(Index* index: indices) {
         // wait for the worker
         {
@@ -726,15 +743,8 @@ Option Collection::search(const std::string & query, const std::
             continue;
         }
 
-        for(const std::vector & kv_group: index->search_params->raw_result_kvs) {
-            kv_group[0]->query_index += searched_queries.size();
-            raw_result_kvs.push_back(kv_group);
-        }
-
-        for(const std::vector & kv_group: index->search_params->override_result_kvs) {
-            kv_group[0]->query_index += searched_queries.size();
-            override_result_kvs.push_back(kv_group);
-        }
+        aggregate_topster(searched_queries.size(), topster, index->search_params->topster);
+        aggregate_topster(searched_queries.size(), curated_topster, index->search_params->curated_topster);
 
         searched_queries.insert(searched_queries.end(), index->search_params->searched_queries.begin(),
                                 index->search_params->searched_queries.end());
@@ -783,46 +793,27 @@
Option Collection::search(const std::string & query, const std:: } } + if(!index_search_op.ok()) { + return index_search_op; + } + + topster.sort(); + curated_topster.sort(); + + populate_result_kvs(&topster, raw_result_kvs); + populate_result_kvs(&curated_topster, override_result_kvs); + // for grouping we have to aggregate group set sizes to a count value if(group_limit) { + LOG(INFO) << "override_result_kvs size: " << override_result_kvs.size(); + for(auto& acc_facet: facets) { for(auto& facet_kv: acc_facet.result_map) { facet_kv.second.count = facet_kv.second.groups.size(); } } - total_found = groups_processed.size(); - } - - if(!index_search_op.ok()) { - return index_search_op; - } - - Topster* aggr_topster = nullptr; - - if(group_limit) { - // group by query requires another round of topster-ing - - // needs to be atleast 1 since scoring is mandatory - const size_t topster_size = std::max((size_t)1, max_hits); - aggr_topster = new Topster(topster_size, group_limit); - - for(const auto& kv_group: raw_result_kvs) { - for(KV* kv: kv_group) { - aggr_topster->add(kv); - } - } - - aggr_topster->sort(); - - raw_result_kvs.clear(); - raw_result_kvs.shrink_to_fit(); - - for(auto &group_topster_entry: aggr_topster->group_kv_map) { - Topster* group_topster = group_topster_entry.second; - const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); - raw_result_kvs.emplace_back(group_kvs); - } + total_found = groups_processed.size() + override_result_kvs.size(); } // All fields are sorted descending @@ -1063,8 +1054,6 @@ Option Collection::search(const std::string & query, const std:: delete index->search_params; } - delete aggr_topster; - result["request_params"] = nlohmann::json::object();; result["request_params"]["per_page"] = per_page; result["request_params"]["q"] = query; @@ -1075,6 +1064,43 @@ Option Collection::search(const std::string & query, const std:: return result; } + +void Collection::populate_result_kvs(Topster *topster, std::vector> &result_kvs) const { + if(topster->distinct) { + for(auto &group_topster_entry: topster->group_kv_map) { + Topster* group_topster = group_topster_entry.second; + const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + result_kvs.emplace_back(group_kvs); + } + + } else { + for(uint32_t t = 0; t < topster->size; t++) { + KV* kv = topster->getKV(t); + result_kvs.push_back({kv}); + } + } +} + +void Collection::aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const { + if(index_topster->distinct) { + for(auto &group_topster_entry: index_topster->group_kv_map) { + Topster* group_topster = group_topster_entry.second; + const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + for(KV* kv: group_kvs) { + kv->query_index += query_index; + topster.add(kv); + } + } + + } else { + for(uint32_t t = 0; t < index_topster->size; t++) { + KV* kv = index_topster->getKV(t); + kv->query_index += query_index; + topster.add(kv); + } + } +} + void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document, std::string &value) { diff --git a/src/index.cpp b/src/index.cpp index 6fdee499..ce1fecbc 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1181,34 +1181,6 @@ void Index::search(Option & outcome, topster->sort(); curated_topster->sort(); - // loop through topster and remove elements from included and excluded id lists - - if(topster->distinct) { - for(auto &group_topster_entry: 
topster->group_kv_map) { - Topster* group_topster = group_topster_entry.second; - const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); - raw_result_kvs.emplace_back(group_kvs); - } - - for(auto &curated_topster_entry: curated_topster->group_kv_map) { - Topster* group_topster = curated_topster_entry.second; - const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); - override_result_kvs.emplace_back(group_kvs); - } - - } else { - for(uint32_t t = 0; t < topster->size; t++) { - KV* kv = topster->getKV(t); - raw_result_kvs.push_back({kv}); - } - - for(uint32_t t = 0; t < curated_topster->size; t++) { - KV* kv = curated_topster->getKV(t); - override_result_kvs.push_back({kv}); - } - } - - // add curated IDs to result count all_result_ids_len += curated_topster->size; delete [] filter_ids; diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp new file mode 100644 index 00000000..b6d99948 --- /dev/null +++ b/test/collection_grouping_test.cpp @@ -0,0 +1,71 @@ +#include +#include +#include +#include +#include +#include +#include "collection.h" + +class CollectionGroupingTest : public ::testing::Test { +protected: + Store *store; + CollectionManager & collectionManager = CollectionManager::get_instance(); + + std::vector query_fields; + std::vector sort_fields; + + void setupCollection() { + std::string state_dir_path = "/tmp/typesense_test/collection_sorting"; + LOG(INFO) << "Truncating and creating: " << state_dir_path; + system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); + + store = new Store(state_dir_path); + collectionManager.init(store, 4, "auth_key"); + collectionManager.load(); + } + + virtual void SetUp() { + setupCollection(); + } + + virtual void TearDown() { + delete store; + } +}; + +TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { + Collection *coll1; + + std::vector fields = { + field("title", field_types::STRING, false), + field("description", field_types::STRING, true, true), + field("max", field_types::INT32, false), + field("scores", field_types::INT64_ARRAY, true, true), + field("average", field_types::FLOAT, false, true), + field("is_valid", field_types::BOOL, false, true), + }; + + coll1 = collectionManager.get_collection("coll1"); + if(coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", fields, "max").get(); + } + + std::ifstream infile(std::string(ROOT_DIR)+"test/optional_fields.jsonl"); + + std::string json_line; + + while (std::getline(infile, json_line)) { + auto add_op = coll1->add(json_line); + if(!add_op.ok()) { + std::cout << add_op.error() << std::endl; + } + ASSERT_TRUE(add_op.ok()); + } + + infile.close(); + + // first must be able to fetch all records (i.e. 
all must have been index) + + auto res = coll1->search("*", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get(); + ASSERT_EQ(6, res["found"].get()); +} \ No newline at end of file diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index eaf153f7..8aca5069 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -18,7 +18,7 @@ protected: system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); store = new Store(state_dir_path); - collectionManager.init(store, 1, "auth_key"); + collectionManager.init(store, 4, "auth_key"); collectionManager.load(); std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl"); @@ -248,7 +248,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "").get(); - ASSERT_EQ(1, results["found"].get()); + ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); @@ -342,4 +342,38 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { ASSERT_EQ(8, results["found"].get()); ASSERT_STREQ("8", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("6", results["hits"][1]["document"]["id"].get().c_str()); +} + +TEST_F(CollectionOverrideTest, PinnedHitsGrouping) { + std::map> pinned_hits; + pinned_hits[1] = {"6", "8"}; + pinned_hits[2] = {"1"}; + pinned_hits[3] = {"13", "4"}; + + // without any grouping parameter, only the first ID in a position should be picked + // and other IDs should appear in their original positions + + auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "starring: will", 30, + "", 10, + pinned_hits, {}).get(); + + ASSERT_EQ(10, results["found"].get()); + ASSERT_STREQ("6", results["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); + ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get().c_str()); + ASSERT_STREQ("11", results["hits"][3]["document"]["id"].get().c_str()); + + // with grouping + + results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "starring: will", 30, + "", 10, + pinned_hits, {}, {"cast"}, 2).get(); + + ASSERT_EQ(8, results["found"].get()); } \ No newline at end of file diff --git a/test/collection_test.cpp b/test/collection_test.cpp index dc904ce5..51865d89 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -2265,7 +2265,6 @@ TEST_F(CollectionTest, OptionalFields) { // try fetching the schema (should contain optional field) nlohmann::json coll_summary = coll1->get_summary_json(); - LOG(INFO) << coll_summary; ASSERT_STREQ("title", coll_summary["fields"][0]["name"].get().c_str()); ASSERT_STREQ("string", coll_summary["fields"][0]["type"].get().c_str()); ASSERT_FALSE(coll_summary["fields"][0]["facet"].get()); From ae6ea0ba6c8076824a0b5422f842c3af2f32b41c Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 19 Jun 2020 06:14:37 +0530 Subject: [PATCH 15/38] Use malloc/free consistently for managing array buffer. 
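The buffer behind `in` is released with free() on some paths (sorted_array::load
already called free(in)) but was being reallocated with new[], and
array::remove_index paired the blocks with the mismatched deallocator; mixing
the two allocators is undefined behavior. Allocation now goes through malloc so
the buffer is managed by a single allocator for its whole life. The rotate
pattern in isolation, with an illustrative size and allocation checks omitted
for brevity (sizeof *out is 1 for uint8_t; the multiplication just mirrors the
patched code):

    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    int main() {
        const uint32_t size_required = 64;

        // allocate the compressed buffer with malloc, never new[] ...
        uint8_t *in = (uint8_t *) malloc(size_required * sizeof *in);
        memset(in, 0, size_required);

        // ... so that growing the buffer can free() the old block safely
        uint8_t *out = (uint8_t *) malloc(2 * size_required * sizeof *out);
        memcpy(out, in, size_required);
        free(in);
        in = out;

        free(in);
        return 0;
    }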
--- src/array.cpp | 4 ++-- src/sorted_array.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/array.cpp b/src/array.cpp index bc901ff7..7a009d2c 100644 --- a/src/array.cpp +++ b/src/array.cpp @@ -61,12 +61,12 @@ void array::remove_index(uint32_t start_index, uint32_t end_index) { } uint32_t size_required = (uint32_t) (unsorted_append_size_required(max, new_index) * FOR_GROWTH_FACTOR); - uint8_t *out = new uint8_t[size_required]; + uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out); uint32_t actual_size = for_compress_unsorted(new_array, out, new_index); delete[] curr_array; delete[] new_array; - delete[] in; + free(in); in = out; length = new_index; diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp index 228f6ba8..60f0a962 100644 --- a/src/sorted_array.cpp +++ b/src/sorted_array.cpp @@ -6,7 +6,7 @@ void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_lengt max = array_length > 1 ? sorted_array[array_length-1] : min; uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR); - uint8_t *out = new uint8_t[size_required]; + uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out); uint32_t actual_size = for_compress_sorted(sorted_array, out, array_length); free(in); From e9557bf233b191936151c9ca2e8217c7fe9d3203 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 19 Jun 2020 07:18:21 +0530 Subject: [PATCH 16/38] Tear down test suite state properly. --- test/collection_faceting_test.cpp | 3 ++- test/collection_grouping_test.cpp | 18 ++++++++---------- test/collection_manager_test.cpp | 1 + test/collection_override_test.cpp | 1 + test/collection_sorting_test.cpp | 1 + test/collection_test.cpp | 1 + 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index 5db5bac6..2df584c4 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -15,7 +15,7 @@ protected: std::vector sort_fields; void setupCollection() { - std::string state_dir_path = "/tmp/typesense_test/collection_sorting"; + std::string state_dir_path = "/tmp/typesense_test/collection_faceting"; LOG(INFO) << "Truncating and creating: " << state_dir_path; system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); @@ -29,6 +29,7 @@ protected: } virtual void TearDown() { + collectionManager.dispose(); delete store; } }; diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index b6d99948..8a9a3849 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -11,11 +11,8 @@ protected: Store *store; CollectionManager & collectionManager = CollectionManager::get_instance(); - std::vector query_fields; - std::vector sort_fields; - void setupCollection() { - std::string state_dir_path = "/tmp/typesense_test/collection_sorting"; + std::string state_dir_path = "/tmp/typesense_test/collection_grouping"; LOG(INFO) << "Truncating and creating: " << state_dir_path; system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); @@ -29,12 +26,13 @@ protected: } virtual void TearDown() { + collectionManager.dispose(); delete store; } }; TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { - Collection *coll1; + Collection *coll_group; std::vector fields = { field("title", field_types::STRING, false), @@ -45,9 +43,9 @@ TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { field("is_valid", field_types::BOOL, false, true), 
}; - coll1 = collectionManager.get_collection("coll1"); - if(coll1 == nullptr) { - coll1 = collectionManager.create_collection("coll1", fields, "max").get(); + coll_group = collectionManager.get_collection("coll_group"); + if(coll_group == nullptr) { + coll_group = collectionManager.create_collection("coll_group", fields, "max").get(); } std::ifstream infile(std::string(ROOT_DIR)+"test/optional_fields.jsonl"); @@ -55,7 +53,7 @@ TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { std::string json_line; while (std::getline(infile, json_line)) { - auto add_op = coll1->add(json_line); + auto add_op = coll_group->add(json_line); if(!add_op.ok()) { std::cout << add_op.error() << std::endl; } @@ -66,6 +64,6 @@ TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { // first must be able to fetch all records (i.e. all must have been index) - auto res = coll1->search("*", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get(); + auto res = coll_group->search("*", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get(); ASSERT_EQ(6, res["found"].get()); } \ No newline at end of file diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index f2b1f03a..4c8ecfbb 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -40,6 +40,7 @@ protected: virtual void TearDown() { collectionManager.drop_collection("collection1"); + collectionManager.dispose(); delete store; } }; diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index 8aca5069..30d223da 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -49,6 +49,7 @@ protected: virtual void TearDown() { collectionManager.drop_collection("coll_mul_fields"); + collectionManager.dispose(); delete store; } }; diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp index 93caff5b..bd983a05 100644 --- a/test/collection_sorting_test.cpp +++ b/test/collection_sorting_test.cpp @@ -29,6 +29,7 @@ protected: } virtual void TearDown() { + collectionManager.dispose(); delete store; } }; diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 51865d89..ec5c3649 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -56,6 +56,7 @@ protected: virtual void TearDown() { collectionManager.drop_collection("collection"); + collectionManager.dispose(); delete store; } }; From 1216bcb3ccdbd2ce652264aacd666d7ad4040fc1 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 19 Jun 2020 18:57:49 +0530 Subject: [PATCH 17/38] Wrap grouped hits in a dictionary structure. --- src/collection.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index ee23aefe..29238a88 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -869,8 +869,18 @@ Option Collection::search(const std::string & query, const std:: for(long result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) { const std::vector & kv_group = result_group_kvs[result_kvs_index]; - nlohmann::json group_hits_array = nlohmann::json::array(); - nlohmann::json& hits_array = (group_limit > 1) ? 
group_hits_array : result["hits"]; + nlohmann::json group_hits; + if(group_limit > 1) { + group_hits["hits"] = nlohmann::json::array(); + std::vector group_keys; + for(const auto& group_key: group_by_fields) { + group_keys.push_back(group_key); + } + + group_hits["group_key"] = StringUtils::join(group_keys, ":"); + } + + nlohmann::json& hits_array = (group_limit > 1) ? group_hits["hits"] : result["hits"]; for(const KV* field_order_kv: kv_group) { const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key); @@ -951,7 +961,7 @@ Option Collection::search(const std::string & query, const std:: } if(group_limit > 1) { - result["grouped_hits"].push_back(group_hits_array); + result["grouped_hits"].push_back(group_hits); } } From 7fa6d5c888e5b8efdb99912f491d0a86591a5ee8 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 19 Jun 2020 21:18:43 +0530 Subject: [PATCH 18/38] Fix ordering in grouped override IDs. --- include/index.h | 8 ++++---- src/collection.cpp | 10 +++++----- src/index.cpp | 30 +++++++++++++++++------------- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/include/index.h b/include/index.h index f2eb5fcc..36cae022 100644 --- a/include/index.h +++ b/include/index.h @@ -27,7 +27,7 @@ struct search_args { std::vector search_fields; std::vector filters; std::vector facets; - std::map> included_ids; + std::map> included_ids; std::vector excluded_ids; std::vector sort_fields_std; facet_query_t facet_query; @@ -55,7 +55,7 @@ struct search_args { } search_args(std::string query, std::vector search_fields, std::vector filters, - std::vector facets, std::map> included_ids, std::vector excluded_ids, + std::vector facets, std::map> included_ids, std::vector excluded_ids, std::vector sort_fields_std, facet_query_t facet_query, int num_typos, size_t max_facet_values, size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix, size_t drop_tokens_threshold, size_t typo_tokens_threshold, @@ -213,7 +213,7 @@ private: const uint32_t indices_length); void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::map> & included_ids_map, + const std::map> & included_ids_map, Topster* curated_topster, std::vector> & searched_queries); uint64_t facet_token_hash(const field & a_field, const std::string &token); @@ -239,7 +239,7 @@ public: void search(Option & outcome, const std::string & query, const std::vector & search_fields, const std::vector & filters, std::vector & facets, facet_query_t & facet_query, - const std::map> & included_ids_map, + const std::map> & included_ids_map, const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, Topster* topster, Topster* curated_topster, diff --git a/src/collection.cpp b/src/collection.cpp index 29238a88..0f7336f7 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -419,17 +419,17 @@ Option Collection::search(const std::string & query, const std:: } */ - std::map>> index_to_included_ids; + std::map>> index_to_included_ids; std::map> index_to_excluded_ids; for(const auto& pos_ids: include_ids) { - size_t position = pos_ids.first; + size_t outer_pos = pos_ids.first; size_t ids_per_pos = std::max(size_t(1), group_limit); - for(size_t i = 0; i < std::min(ids_per_pos, pos_ids.second.size()); i++) { - auto seq_id = pos_ids.second[i]; + for(size_t inner_pos = 0; inner_pos < std::min(ids_per_pos, pos_ids.second.size()); inner_pos++) { + auto seq_id = pos_ids.second[inner_pos]; auto index_id = (seq_id % 
num_indices); - index_to_included_ids[index_id][position].push_back(seq_id); + index_to_included_ids[index_id][outer_pos][inner_pos] = seq_id; //LOG(INFO) << "Adding seq_id " << seq_id << " to index_id " << index_id; } } diff --git a/src/index.cpp b/src/index.cpp index ce1fecbc..4e50fb6d 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1017,8 +1017,9 @@ void Index::run_search() { // after the wait, we own the lock. search(search_params->outcome, search_params->query, search_params->search_fields, - search_params->filters, search_params->facets, search_params->facet_query, search_params->included_ids, - search_params->excluded_ids, search_params->sort_fields_std, search_params->num_typos, + search_params->filters, search_params->facets, search_params->facet_query, + search_params->included_ids, search_params->excluded_ids, + search_params->sort_fields_std, search_params->num_typos, search_params->topster, search_params->curated_topster, search_params->per_page, search_params->page, search_params->token_order, search_params->prefix, search_params->drop_tokens_threshold, @@ -1038,7 +1039,7 @@ void Index::run_search() { } void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::map> & included_ids_map, + const std::map> & included_ids_map, Topster* curated_topster, std::vector> & searched_queries) { @@ -1067,13 +1068,16 @@ void Index::collate_included_ids(const std::string & query, const std::string & } for(const auto& pos_ids: included_ids_map) { - const size_t pos = pos_ids.first; + const size_t outer_pos = pos_ids.first; - for(size_t i = 0; i < pos_ids.second.size(); i++) { - uint32_t seq_id = pos_ids.second[i]; + for(const auto& index_seq_id: pos_ids.second) { + uint32_t inner_pos = index_seq_id.first; + uint32_t seq_id = index_seq_id.second; - uint64_t distinct_id = pos; // position is the group distinct key - uint64_t match_score = (64000 - i); // index within a group is the match score + uint64_t distinct_id = outer_pos; // outer pos is the group distinct key + uint64_t match_score = (64000 - inner_pos); // inner pos within a group is the match score + + LOG(INFO) << "seq_id: " << seq_id << " - " << match_score; int64_t scores[3]; scores[0] = match_score; @@ -1093,7 +1097,7 @@ void Index::search(Option & outcome, const std::vector & search_fields, const std::vector & filters, std::vector & facets, facet_query_t & facet_query, - const std::map> & included_ids_map, + const std::map> & included_ids_map, const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, Topster* topster, @@ -1122,10 +1126,10 @@ void Index::search(Option & outcome, std::set curated_ids; std::vector included_ids; - for(const auto& pos_ids: included_ids_map) { - for(const uint32_t id: pos_ids.second) { - curated_ids.insert(id); - included_ids.push_back(id); + for(const auto& outer_pos_ids: included_ids_map) { + for(const auto& inner_pos_seq_id: outer_pos_ids.second) { + curated_ids.insert(inner_pos_seq_id.second); + included_ids.push_back(inner_pos_seq_id.second); } } From de28d33f24366986a6120303e579363ec59c18fe Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 21 Jun 2020 22:00:08 +0530 Subject: [PATCH 19/38] More tests for grouping. 
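These tests exercise grouping basics, compound group keys, pagination, and the
override/pinning path. The ordering they assert for pinned groups relies on the
encoding from the previous patch: a pinned slot's outer position becomes the
group's distinct key, and the i-th ID inside the slot gets a match score of
64000 - i, so sorting a group's members by score descending restores the order
in which the IDs were pinned. A standalone illustration of that encoding, with
CuratedKV as a trimmed stand-in for the real KV:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct CuratedKV {
        uint32_t seq_id;
        uint64_t distinct_id;   // outer position of the pinned slot
        uint64_t match_score;   // 64000 - inner position within the slot
    };

    int main() {
        // pinned_hits[1] = {6, 8}: one slot, two IDs in a fixed order
        const std::vector<uint32_t> pinned_ids = {6, 8};

        std::vector<CuratedKV> group;
        for(size_t inner_pos = 0; inner_pos < pinned_ids.size(); inner_pos++) {
            group.push_back({pinned_ids[inner_pos], 1, 64000 - inner_pos});
        }

        // sorting by match score descending recovers the pinned order: 6, then 8
        std::sort(group.begin(), group.end(),
                  [](const CuratedKV& a, const CuratedKV& b) {
                      return a.match_score > b.match_score;
                  });

        for(const auto& kv: group) {
            std::cout << "seq_id " << kv.seq_id << "\n";
        }
    }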
--- src/collection.cpp | 8 +- src/index.cpp | 2 +- test/collection_grouping_test.cpp | 289 +++++++++++++++++++++++++++--- test/collection_override_test.cpp | 12 ++ test/group_documents.jsonl | 12 ++ 5 files changed, 290 insertions(+), 33 deletions(-) create mode 100644 test/group_documents.jsonl diff --git a/src/collection.cpp b/src/collection.cpp index 0f7336f7..1c244855 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -351,8 +351,6 @@ void Collection::populate_overrides(std::string query, include_ids[hit.position].push_back(seq_id); } } - - break; } } @@ -805,8 +803,6 @@ Option Collection::search(const std::string & query, const std:: // for grouping we have to aggregate group set sizes to a count value if(group_limit) { - LOG(INFO) << "override_result_kvs size: " << override_result_kvs.size(); - for(auto& acc_facet: facets) { for(auto& facet_kv: acc_facet.result_map) { facet_kv.second.count = facet_kv.second.groups.size(); @@ -877,7 +873,7 @@ Option Collection::search(const std::string & query, const std:: group_keys.push_back(group_key); } - group_hits["group_key"] = StringUtils::join(group_keys, ":"); + group_hits["group_key"] = StringUtils::join(group_keys, ","); } nlohmann::json& hits_array = (group_limit > 1) ? group_hits["hits"] : result["hits"]; @@ -1391,7 +1387,7 @@ Option Collection::add_override(const override_t & override) { return Option(500, "Error while storing the override on disk."); } - overrides[override.id] = override; + overrides.emplace(override.id, override); return Option(200); } diff --git a/src/index.cpp b/src/index.cpp index 4e50fb6d..6384228f 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1077,7 +1077,7 @@ void Index::collate_included_ids(const std::string & query, const std::string & uint64_t distinct_id = outer_pos; // outer pos is the group distinct key uint64_t match_score = (64000 - inner_pos); // inner pos within a group is the match score - LOG(INFO) << "seq_id: " << seq_id << " - " << match_score; + // LOG(INFO) << "seq_id: " << seq_id << " - " << match_score; int64_t scores[3]; scores[0] = match_score; diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index 8a9a3849..f3e0e6a2 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -10,6 +10,7 @@ class CollectionGroupingTest : public ::testing::Test { protected: Store *store; CollectionManager & collectionManager = CollectionManager::get_instance(); + Collection *coll_group; void setupCollection() { std::string state_dir_path = "/tmp/typesense_test/collection_grouping"; @@ -19,6 +20,33 @@ protected: store = new Store(state_dir_path); collectionManager.init(store, 4, "auth_key"); collectionManager.load(); + + std::vector fields = { + field("title", field_types::STRING, false), + field("brand", field_types::STRING, true, true), + field("size", field_types::INT32, true, false), + field("colors", field_types::STRING_ARRAY, true, false), + field("rating", field_types::FLOAT, true, false) + }; + + coll_group = collectionManager.get_collection("coll_group"); + if(coll_group == nullptr) { + coll_group = collectionManager.create_collection("coll_group", fields, "rating").get(); + } + + std::ifstream infile(std::string(ROOT_DIR)+"test/group_documents.jsonl"); + + std::string json_line; + + while (std::getline(infile, json_line)) { + auto add_op = coll_group->add(json_line); + if(!add_op.ok()) { + std::cout << add_op.error() << std::endl; + } + ASSERT_TRUE(add_op.ok()); + } + + infile.close(); } virtual void SetUp() { @@ -31,39 
+59,248 @@ protected: } }; -TEST_F(CollectionGroupingTest, GroupingOnOptionalIntegerArray) { - Collection *coll_group; +TEST_F(CollectionGroupingTest, GroupingBasics) { + // group by size (int32) + auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"size"}, 2).get(); - std::vector fields = { - field("title", field_types::STRING, false), - field("description", field_types::STRING, true, true), - field("max", field_types::INT32, false), - field("scores", field_types::INT64_ARRAY, true, true), - field("average", field_types::FLOAT, false, true), - field("is_valid", field_types::BOOL, false, true), + ASSERT_EQ(3, res["found"].get()); + ASSERT_EQ(3, res["grouped_hits"].size()); + ASSERT_STREQ("size", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("1", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("2", res["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get().c_str()); + + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); + ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); + + // group by rating (float) and sort by size + std::vector sort_size = {sort_by("size", "DESC")}; + res = coll_group->search("*", {}, "", {"brand"}, sort_size, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "brand: omeg", 30, + "", 10, + {}, {}, {"rating"}, 2).get(); + + // 7 unique ratings + ASSERT_EQ(7, res["found"].get()); + ASSERT_EQ(7, res["grouped_hits"].size()); + ASSERT_STREQ("rating", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_EQ(12, res["grouped_hits"][0]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("8", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + + ASSERT_EQ(12, res["grouped_hits"][1]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("6", 
res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get()); + + ASSERT_EQ(11, res["grouped_hits"][1]["hits"][1]["document"]["size"].get()); + ASSERT_STREQ("1", res["grouped_hits"][1]["hits"][1]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get()); + + ASSERT_EQ(10, res["grouped_hits"][5]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("9", res["grouped_hits"][5]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.1, res["grouped_hits"][5]["hits"][0]["document"]["rating"].get()); + + ASSERT_EQ(10, res["grouped_hits"][6]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("0", res["grouped_hits"][6]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][6]["hits"][0]["document"]["rating"].get()); + + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); +} + +TEST_F(CollectionGroupingTest, GroupingCompoundKey) { + // group by size+brand (int32, string) + auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"size", "brand"}, 2).get(); + + ASSERT_EQ(10, res["found"].get()); + ASSERT_EQ(10, res["grouped_hits"].size()); + ASSERT_STREQ("size,brand", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size()); + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + + ASSERT_EQ(1, res["grouped_hits"][1]["hits"].size()); + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + + ASSERT_EQ(2, res["grouped_hits"][2]["hits"].size()); + ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("3", res["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get().c_str()); + + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); + ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); + + // pagination with page=2, per_page=2 + res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 2, 2, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, 
{"size", "brand"}, 2).get(); + + + // 3rd result from previous assertion will be in the first position + ASSERT_EQ(2, res["grouped_hits"][0]["hits"].size()); + ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("3", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("0", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); + + // total count and facet counts should be the same + ASSERT_EQ(10, res["found"].get()); + ASSERT_EQ(2, res["grouped_hits"].size()); + ASSERT_STREQ("size,brand", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get().c_str()); + + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); + ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); +} + +TEST_F(CollectionGroupingTest, GroupingWithSingleDistinct) { + auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"brand"}, 1).get(); + + ASSERT_EQ(5, res["found"].get()); + ASSERT_EQ(5, res["hits"].size()); + + ASSERT_STREQ("4", res["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("2", res["hits"][1]["document"]["id"].get().c_str()); + ASSERT_STREQ("8", res["hits"][2]["document"]["id"].get().c_str()); + ASSERT_STREQ("10", res["hits"][3]["document"]["id"].get().c_str()); // unbranded + ASSERT_STREQ("9", res["hits"][4]["document"]["id"].get().c_str()); + + // facet counts should each be 1, including unbranded + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + + for(size_t i=0; i < 4; i++) { + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][i]["count"]); + } +} + +TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) { + nlohmann::json override_json_include = { + {"id", "include-rule"}, + { + "rule", { + {"query", "shirt"}, + {"match", override_t::MATCH_EXACT} + } + } }; - coll_group = collectionManager.get_collection("coll_group"); - if(coll_group == nullptr) { - coll_group = collectionManager.create_collection("coll_group", fields, "max").get(); - } + override_json_include["includes"] = nlohmann::json::array(); + override_json_include["includes"][0] = nlohmann::json::object(); + override_json_include["includes"][0]["id"] = "11"; + override_json_include["includes"][0]["position"] = 1; - std::ifstream infile(std::string(ROOT_DIR)+"test/optional_fields.jsonl"); + override_json_include["includes"][1] = nlohmann::json::object(); + override_json_include["includes"][1]["id"] = "10"; + override_json_include["includes"][1]["position"] = 1; - std::string json_line; - - while (std::getline(infile, json_line)) { - auto add_op = coll_group->add(json_line); - if(!add_op.ok()) { - std::cout << add_op.error() << std::endl; + nlohmann::json override_json_exclude = { + {"id", "exclude-rule"}, + { + "rule", 
{ + {"query", "shirt"}, + {"match", override_t::MATCH_EXACT} + } } - ASSERT_TRUE(add_op.ok()); - } + }; + override_json_exclude["excludes"] = nlohmann::json::array(); + override_json_exclude["excludes"][0] = nlohmann::json::object(); + override_json_exclude["excludes"][0]["id"] = "2"; - infile.close(); + override_t override1(override_json_include); + override_t override2(override_json_exclude); + Option ov1_op = coll_group->add_override(override1); + Option ov2_op = coll_group->add_override(override2); - // first must be able to fetch all records (i.e. all must have been index) + ASSERT_TRUE(ov1_op.ok()); + ASSERT_TRUE(ov2_op.ok()); - auto res = coll_group->search("*", {"title"}, "", {}, {}, 0, 10, 1, FREQUENCY, false).get(); - ASSERT_EQ(6, res["found"].get()); + auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"colors"}, 2).get(); + + ASSERT_EQ(4, res["found"].get()); + ASSERT_EQ(4, res["grouped_hits"].size()); + ASSERT_STREQ("colors", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_STREQ("11", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("10", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("5", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("4", res["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_EQ(1, res["grouped_hits"][3]["hits"].size()); + ASSERT_STREQ("8", res["grouped_hits"][3]["hits"][0]["document"]["id"].get().c_str()); } \ No newline at end of file diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index 30d223da..a36fb85a 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -377,4 +377,16 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) { pinned_hits, {}, {"cast"}, 2).get(); ASSERT_EQ(8, results["found"].get()); + + ASSERT_STREQ("cast", results["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_STREQ("6", results["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("8", results["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("1", results["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + + ASSERT_STREQ("13", results["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("4", results["grouped_hits"][2]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("11", results["grouped_hits"][3]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("16", results["grouped_hits"][4]["hits"][0]["document"]["id"].get().c_str()); } \ No newline at end of file diff --git a/test/group_documents.jsonl b/test/group_documents.jsonl new file mode 100644 index 00000000..5d673ff0 --- /dev/null +++ b/test/group_documents.jsonl @@ -0,0 +1,12 @@ +{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 10, "colors": ["white", "blue"], "rating": 4.5} +{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 11, "colors": ["white", "blue"], "rating": 4.3} +{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 12, "colors": ["white", "blue"], "rating": 4.6} +{"title": "Omega Casual Poplin Shirt", 
"brand": "Omega", "size": 10, "colors": ["blue"], "rating": 4.6} +{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 10, "colors": ["white", "blue"], "rating": 4.8} +{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 11, "colors": ["blue"], "rating": 4.8} +{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 12, "colors": ["white", "blue"], "rating": 4.3} +{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 10, "colors": ["white", "blue"], "rating": 4.3} +{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 12, "colors": ["white", "red"], "rating": 4.4} +{"title": "Zeta Casual Shirt", "brand": "Zeta", "size": 10, "colors": ["white", "blue"], "rating": 4.1} +{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 4.3} +{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 3.3} \ No newline at end of file From 933cbe9bb0502e7c4e7b06f2f9782ca0e465aa1f Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 22 Jun 2020 07:29:20 +0530 Subject: [PATCH 20/38] More tests. --- cmake/Glog.cmake | 0 src/collection.cpp | 4 ++++ src/core_api.cpp | 5 ----- test/collection_grouping_test.cpp | 14 ++++++++++++++ test/collection_sorting_test.cpp | 1 + test/collection_test.cpp | 8 +++++++- 6 files changed, 26 insertions(+), 6 deletions(-) create mode 100644 cmake/Glog.cmake diff --git a/cmake/Glog.cmake b/cmake/Glog.cmake new file mode 100644 index 00000000..e69de29b diff --git a/src/collection.cpp b/src/collection.cpp index 1c244855..fbe0a376 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -390,6 +390,10 @@ Option Collection::search(const std::string & query, const std:: const std::vector& group_by_fields, const size_t group_limit) { + if(query != "*" && search_fields.empty()) { + return Option(400, "No search fields specified for the query."); + } + std::vector excluded_ids; std::map> include_ids; // position => list of IDs populate_overrides(query, pinned_hits, hidden_hits, include_ids, excluded_ids); diff --git a/src/core_api.cpp b/src/core_api.cpp index 03f89a2f..a52b1a7a 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -252,11 +252,6 @@ bool get_search(http_req & req, http_res & res) { return false; } - if(req.params.count(QUERY_BY) == 0 && req.params[QUERY] != "*") { - res.set_400(std::string("Parameter `") + QUERY_BY + "` is required."); - return false; - } - if(req.params.count(MAX_FACET_VALUES) == 0) { req.params[MAX_FACET_VALUES] = "10"; } diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index f3e0e6a2..b1fcd8b6 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -303,4 +303,18 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) { ASSERT_EQ(1, res["grouped_hits"][3]["hits"].size()); ASSERT_STREQ("8", res["grouped_hits"][3]["hits"][0]["document"]["id"].get().c_str()); + + // assert facet counts + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][2]["value"].get().c_str()); + + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); + ASSERT_STREQ("Zeta", 
res["facet_counts"][0]["counts"][3]["value"].get().c_str()); } \ No newline at end of file diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp index bd983a05..e552f307 100644 --- a/test/collection_sorting_test.cpp +++ b/test/collection_sorting_test.cpp @@ -249,6 +249,7 @@ TEST_F(CollectionSortingTest, ThreeSortFieldsLimit) { sort_by("min", "DESC"), }; + query_fields = {"title"}; auto res_op = coll1->search("the", query_fields, "", {}, sort_fields_desc, 0, 10, 1, FREQUENCY, false); ASSERT_FALSE(res_op.ok()); diff --git a/test/collection_test.cpp b/test/collection_test.cpp index ec5c3649..12748830 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -458,10 +458,16 @@ TEST_F(CollectionTest, WildcardQuery) { } // wildcard query should not require a search field - results_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false).get(); + results_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false); ASSERT_TRUE(results_op.ok()); + results = results_op.get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); + + // non-wildcard query should require a search field + results_op = collection->search("the", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false); + ASSERT_FALSE(results_op.ok()); + ASSERT_STREQ("No search fields specified for the query.", results_op.error().c_str()); } TEST_F(CollectionTest, PrefixSearching) { From e314ec23e6c6bd13cbc6a7337c40cb958e5b52e1 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 22 Jun 2020 18:17:52 +0530 Subject: [PATCH 21/38] Move to glog. --- .circleci/config.yml | 2 +- CMakeLists.txt | 3 ++- cmake/Modules/FindGlog.cmake | 40 ++++++++++++++++++++++++++++++ cmake/braft.cmake | 2 +- cmake/brpc.cmake | 2 +- docker-build.sh | 8 +++--- docker/development.Dockerfile | 26 ++++++++++++------- docker/patches/brpc_cmakelists.txt | 4 +-- include/logger.h | 2 +- src/main/typesense_server.cpp | 6 ----- src/raft_server.cpp | 4 +-- src/typesense_server_utils.cpp | 21 ++++++---------- 12 files changed, 79 insertions(+), 41 deletions(-) create mode 100644 cmake/Modules/FindGlog.cmake diff --git a/.circleci/config.yml b/.circleci/config.yml index 63d2d996..8609ef77 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 jobs: build: docker: - - image: typesense/typesense-development:22-MAR-2020-5 + - image: typesense/typesense-development:22-JUNE-2020-1 environment: - PROJECT_DIR: /typesense - TYPESENSE_VERSION: $CIRCLE_BRANCH-$CIRCLE_SHA1 diff --git a/CMakeLists.txt b/CMakeLists.txt index b8c4e9f8..358f2502 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ FIND_PACKAGE(ICU REQUIRED) FIND_PACKAGE(Protobuf REQUIRED) FIND_PACKAGE(LevelDB REQUIRED) FIND_PACKAGE(gflags REQUIRED) +FIND_PACKAGE(glog REQUIRED) message("OpenSSL library: ${OPENSSL_LIBRARIES}") @@ -152,7 +153,7 @@ set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIE set(CORE_LIBS h2o-evloop iconv ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES} ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} pthread dl ${STD_LIB}) -set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) +set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${GLOG_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) target_link_libraries(typesense-core ${CORE_LIBS}) target_link_libraries(typesense-server ${CORE_LIBS}) diff --git a/cmake/Modules/FindGlog.cmake 
b/cmake/Modules/FindGlog.cmake new file mode 100644 index 00000000..688e30ce --- /dev/null +++ b/cmake/Modules/FindGlog.cmake @@ -0,0 +1,40 @@ +# - Try to find Glog +# +# The following variables are optionally searched for defaults +# GLOG_ROOT_DIR: Base directory where all GLOG components are found +# +# The following are set after configuration is done: +# GLOG_FOUND +# GLOG_INCLUDE_DIRS +# GLOG_LIBRARIES + +include(FindPackageHandleStandardArgs) + +if (NOT DEFINED GLOG_ROOT) + message("set GLOG_ROOT========================") + set (GLOG_ROOT /usr /usr/local /usr/include/) +endif (NOT DEFINED GLOG_ROOT) + +#set(GLOG_ROOT_DIR "" CACHE PATH "Folder contains Google glog") + +find_path(GLOG_INCLUDE_DIR glog/logging.h + PATHS + ${GLOG_ROOT_DIR} + PATH_SUFFIXES + src) + +find_library(GLOG_LIBRARY glog libglog + PATHS + ${GLOG_ROOT_DIR} + PATH_SUFFIXES + .libs + lib + lib64) + +find_package_handle_standard_args(GLOG DEFAULT_MSG + GLOG_INCLUDE_DIR GLOG_LIBRARY) + +if(GLOG_FOUND) + set(GLOG_INCLUDE_DIRS ${GLOG_INCLUDE_DIR}) + set(GLOG_LIBRARIES ${GLOG_LIBRARY}) +endif() \ No newline at end of file diff --git a/cmake/braft.cmake b/cmake/braft.cmake index 082f95c9..b72a1e25 100644 --- a/cmake/braft.cmake +++ b/cmake/braft.cmake @@ -1,4 +1,4 @@ -set(BRAFT_VERSION 0a9ec3f) +set(BRAFT_VERSION fb27e63) set(BRAFT_NAME braft-${BRAFT_VERSION}) set(BRAFT_TAR_PATH ${DEP_ROOT_DIR}/${BRAFT_NAME}.tar.gz) diff --git a/cmake/brpc.cmake b/cmake/brpc.cmake index 4c97adf1..9ea1927a 100644 --- a/cmake/brpc.cmake +++ b/cmake/brpc.cmake @@ -1,4 +1,4 @@ -set(BRPC_VERSION 23c66e3) +set(BRPC_VERSION 2f8fc37d) set(BRPC_NAME brpc-${BRPC_VERSION}) set(BRPC_TAR_PATH ${DEP_ROOT_DIR}/${BRPC_NAME}.tar.gz) diff --git a/docker-build.sh b/docker-build.sh index 16bd3f26..94164a05 100755 --- a/docker-build.sh +++ b/docker-build.sh @@ -22,13 +22,13 @@ if [[ "$@" == *"--depclean"* ]]; then fi #echo "Creating development image..." -#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:latest $PROJECT_DIR/docker +#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:22-JUNE-2020-1 $PROJECT_DIR/docker echo "Building Typesense $TYPESENSE_VERSION..." -docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ +docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ -DCMAKE_BUILD_TYPE=Release -H/typesense -B/typesense/$BUILD_DIR -docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development make typesense-server -C/typesense/$BUILD_DIR +docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 make typesense-server -C/typesense/$BUILD_DIR if [[ "$@" == *"--build-deploy-image"* ]]; then echo "Creating deployment image for Typesense $TYPESENSE_VERSION server ..." 
@@ -60,7 +60,7 @@ if [[ "$@" == *"--package-libs"* ]]; then fi # #if [[ "$@" == *"--create-deb-upload"* ]]; then -# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ +# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ # -DCMAKE_BUILD_TYPE=Debug -H/typesense -B/typesense/$BUILD_DIR #fi diff --git a/docker/development.Dockerfile b/docker/development.Dockerfile index 790a3246..8b6d0cc4 100644 --- a/docker/development.Dockerfile +++ b/docker/development.Dockerfile @@ -55,25 +55,33 @@ ADD https://github.com/protocolbuffers/protobuf/releases/download/v3.11.4/protob RUN tar -C /opt -xf /opt/protobuf-cpp-3.11.4.tar.gz && chown -R root:root /opt/protobuf-3.11.4 RUN cd /opt/protobuf-3.11.4 && ./configure --disable-shared && make -j8 && make check && make install && rm -rf /usr/local/lib/*.so* -ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz.tar.gz -RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz.tar.gz +ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz +RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz RUN mkdir -p /opt/leveldb-1.22/build && cd /opt/leveldb-1.22/build && cmake -DCMAKE_BUILD_TYPE=Release .. && \ cmake --build . && make install && rm -rf /usr/local/lib/*.so* +ADD https://github.com/google/glog/archive/v0.4.0.tar.gz /opt/glog-0.4.0.tar.gz +RUN tar -C /opt -xf /opt/glog-0.4.0.tar.gz +RUN mkdir -p /opt/glog-0.4.0/build && cd /opt/glog-0.4.0/build && \ + cmake -DBUILD_TESTING=0 -DWITH_GFLAGS=ON -DWITH_UNWIND=OFF .. && \ + cmake --build . && make install && rm -rf /usr/local/lib/*.so* + ADD https://github.com/apache/incubator-brpc/archive/0.9.7-rc03.tar.gz /opt/brpc-0.9.7-rc03.tar.gz RUN tar -C /opt -xf /opt/brpc-0.9.7-rc03.tar.gz COPY patches/brpc_cmakelists.txt /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt RUN chown root:root /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/build && cd /opt/incubator-brpc-0.9.7-rc03/build && \ - cmake -DWITH_DEBUG_SYMBOLS=OFF .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ + cmake -DWITH_DEBUG_SYMBOLS=OFF -DWITH_GLOG=ON .. && \ + make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ rm -rf /opt/incubator-brpc-0.9.7-rc03/build/output/bin -ADD https://github.com/baidu/braft/archive/v1.1.0.tar.gz /opt/braft-v1.1.0.tar.gz -RUN tar -C /opt -xf /opt/braft-v1.1.0.tar.gz -COPY patches/braft_cmakelists.txt /opt/braft-1.1.0/src/CMakeLists.txt -RUN chown root:root /opt/braft-1.1.0/src/CMakeLists.txt -RUN mkdir -p /opt/braft-1.1.0/build && cd /opt/braft-1.1.0/build && \ - cmake -DWITH_DEBUG_SYMBOLS=ON .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* +ADD https://github.com/baidu/braft/archive/v1.1.1.tar.gz /opt/braft-v1.1.1.tar.gz +RUN tar -C /opt -xf /opt/braft-v1.1.1.tar.gz +COPY patches/braft_cmakelists.txt /opt/braft-1.1.1/src/CMakeLists.txt +RUN chown root:root /opt/braft-1.1.1/src/CMakeLists.txt +RUN mkdir -p /opt/braft-1.1.1/build && cd /opt/braft-1.1.1/build && \ + cmake -DWITH_DEBUG_SYMBOLS=ON -DBRPC_WITH_GLOG=ON .. 
&& make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ + rm -rf /opt/braft-1.1.1/build/output/bin ENV CC /usr/local/gcc-6.4.0/bin/gcc ENV CXX /usr/local/gcc-6.4.0/bin/g++ diff --git a/docker/patches/brpc_cmakelists.txt b/docker/patches/brpc_cmakelists.txt index 03c05a11..687b3b7d 100644 --- a/docker/patches/brpc_cmakelists.txt +++ b/docker/patches/brpc_cmakelists.txt @@ -35,8 +35,8 @@ add_library(brpc-static STATIC $ $ $) -if(BRPC_WITH_THRIFT) - target_link_libraries(brpc-static thrift) +if(BRPC_WITH_GLOG) + target_link_libraries(brpc-static ${GLOG_LIB}) endif() SET_TARGET_PROPERTIES(brpc-static PROPERTIES OUTPUT_NAME brpc CLEAN_DIRECT_OUTPUT 1) diff --git a/include/logger.h b/include/logger.h index c8a9e62e..01adc553 100644 --- a/include/logger.h +++ b/include/logger.h @@ -2,4 +2,4 @@ #include #include -#include +#include \ No newline at end of file diff --git a/src/main/typesense_server.cpp b/src/main/typesense_server.cpp index b76942e3..5a3ef7d5 100644 --- a/src/main/typesense_server.cpp +++ b/src/main/typesense_server.cpp @@ -55,13 +55,7 @@ void replica_server_routes() { server->get("/health", get_health); } -namespace logging { - DECLARE_bool(log_year); -} - int main(int argc, char **argv) { - logging::FLAGS_log_year = true; - Config config; cmdline::parser options; diff --git a/src/raft_server.cpp b/src/raft_server.cpp index 70a0b472..dc1483e9 100644 --- a/src/raft_server.cpp +++ b/src/raft_server.cpp @@ -297,7 +297,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) { return -1; } - LOG(TRACE) << "rm " << store->get_state_dir_path() << " success"; + LOG(INFO) << "rm " << store->get_state_dir_path() << " success"; std::string snapshot_path = reader->get_path(); snapshot_path.append(std::string("/") + db_snapshot_name); @@ -308,7 +308,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) { return -1; } - LOG(TRACE) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success"; + LOG(INFO) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success"; return init_db(); } diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index 1a940518..938e2a45 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -133,35 +133,31 @@ int init_logger(Config & config, const std::string & server_version) { signal(SIGILL, catch_crash); signal(SIGSEGV, catch_crash); - logging::LoggingSettings log_settings; - // we can install new signal handlers only after overriding above signal(SIGINT, catch_interrupt); signal(SIGTERM, catch_interrupt); + google::InitGoogleLogging("typesense"); + std::string log_dir = config.get_log_dir(); - std::string log_path; if(log_dir.empty()) { // use console logger if log dir is not specified - log_settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG; + FLAGS_logtostderr = true; } else { if(!directory_exists(log_dir)) { std::cerr << "Typesense failed to start. " << "Log directory " << log_dir << " does not exist."; return 1; } - log_settings.logging_dest = logging::LOG_TO_FILE; - log_path = log_dir + "/" + "typesense.log"; - log_settings.log_file = log_path.c_str(); + std::cout << "Log directory is configured as: " << log_dir << std::endl; + std::string log_path_prefix = log_dir + "/" + "typesense-log-"; + google::SetLogDestination(google::INFO, log_path_prefix.c_str()); - LOG(INFO) << "Starting Typesense " << server_version << ". 
Log directory is configured as: " - << log_dir << std::endl; + // flush log levels above -1 immediately (INFO=0) + FLAGS_logbuflevel = -1; } - logging::InitLogging(log_settings); - logging::SetMinLogLevel(0); - return 0; } @@ -275,7 +271,6 @@ int start_raft_server(ReplicationState& replication_state, const std::string& st } int run_server(const Config & config, const std::string & version, void (*master_server_routes)()) { - LOG(INFO) << "Starting Typesense " << version << std::flush; quit_raft_service = false; From abc3ae2a4f1ae65511877a6a29c6a8c0bfd22873 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 22 Jun 2020 18:25:17 +0530 Subject: [PATCH 22/38] Temporary clean build. --- .circleci/config.yml | 7 +++++-- CMakeLists.txt | 2 ++ cmake/Modules/FindGlog.cmake | 1 - 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8609ef77..6bae9b30 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -16,10 +16,13 @@ jobs: keys: - external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }} - external-Linux-cache-{{ .Branch }} - - external-Linux-cache + - external-Linux-cache + - run: + name: debug + command: ls -latr /usr/local/lib/ - run: name: build - command: $PROJECT_DIR/build.sh + command: $PROJECT_DIR/build.sh --clean - save_cache: key: external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }} paths: diff --git a/CMakeLists.txt b/CMakeLists.txt index 358f2502..63abad81 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,6 +153,8 @@ set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIE set(CORE_LIBS h2o-evloop iconv ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES} ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} pthread dl ${STD_LIB}) +message("GLOG library is: ${GLOG_LIBRARIES}") + set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${GLOG_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) target_link_libraries(typesense-core ${CORE_LIBS}) diff --git a/cmake/Modules/FindGlog.cmake b/cmake/Modules/FindGlog.cmake index 688e30ce..1962427f 100644 --- a/cmake/Modules/FindGlog.cmake +++ b/cmake/Modules/FindGlog.cmake @@ -11,7 +11,6 @@ include(FindPackageHandleStandardArgs) if (NOT DEFINED GLOG_ROOT) - message("set GLOG_ROOT========================") set (GLOG_ROOT /usr /usr/local /usr/include/) endif (NOT DEFINED GLOG_ROOT) From b9b3c58b2cd2a753ff2c05653f01aee6c3613a62 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 22 Jun 2020 19:32:10 +0530 Subject: [PATCH 23/38] Fix build. 
--- .circleci/config.yml | 2 +- CMakeLists.txt | 6 +++--- cmake/Modules/FindGlog.cmake | 39 ------------------------------------ 3 files changed, 4 insertions(+), 43 deletions(-) delete mode 100644 cmake/Modules/FindGlog.cmake diff --git a/.circleci/config.yml b/.circleci/config.yml index 6bae9b30..a5d2dc2f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -22,7 +22,7 @@ jobs: command: ls -latr /usr/local/lib/ - run: name: build - command: $PROJECT_DIR/build.sh --clean + command: $PROJECT_DIR/build.sh - save_cache: key: external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }} paths: diff --git a/CMakeLists.txt b/CMakeLists.txt index 63abad81..c65f4e47 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,8 +88,10 @@ link_directories(${DEP_ROOT_DIR}/${FOR_NAME}) link_directories(${DEP_ROOT_DIR}/${H2O_NAME}/build) link_directories(${DEP_ROOT_DIR}/${ROCKSDB_NAME}) link_directories(${DEP_ROOT_DIR}/${ICONV_NAME}/lib/.libs) +if (APPLE) link_directories(${DEP_ROOT_DIR}/${BRPC_NAME}/lib) link_directories(${DEP_ROOT_DIR}/${BRAFT_NAME}/lib) +endif() # Write dependency libraries to a file file(WRITE ${DEP_ROOT_DIR}/libs.txt "") @@ -153,9 +155,7 @@ set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIE set(CORE_LIBS h2o-evloop iconv ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES} ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} pthread dl ${STD_LIB}) -message("GLOG library is: ${GLOG_LIBRARIES}") - -set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${GLOG_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) +set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} glog ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) target_link_libraries(typesense-core ${CORE_LIBS}) target_link_libraries(typesense-server ${CORE_LIBS}) diff --git a/cmake/Modules/FindGlog.cmake b/cmake/Modules/FindGlog.cmake deleted file mode 100644 index 1962427f..00000000 --- a/cmake/Modules/FindGlog.cmake +++ /dev/null @@ -1,39 +0,0 @@ -# - Try to find Glog -# -# The following variables are optionally searched for defaults -# GLOG_ROOT_DIR: Base directory where all GLOG components are found -# -# The following are set after configuration is done: -# GLOG_FOUND -# GLOG_INCLUDE_DIRS -# GLOG_LIBRARIES - -include(FindPackageHandleStandardArgs) - -if (NOT DEFINED GLOG_ROOT) - set (GLOG_ROOT /usr /usr/local /usr/include/) -endif (NOT DEFINED GLOG_ROOT) - -#set(GLOG_ROOT_DIR "" CACHE PATH "Folder contains Google glog") - -find_path(GLOG_INCLUDE_DIR glog/logging.h - PATHS - ${GLOG_ROOT_DIR} - PATH_SUFFIXES - src) - -find_library(GLOG_LIBRARY glog libglog - PATHS - ${GLOG_ROOT_DIR} - PATH_SUFFIXES - .libs - lib - lib64) - -find_package_handle_standard_args(GLOG DEFAULT_MSG - GLOG_INCLUDE_DIR GLOG_LIBRARY) - -if(GLOG_FOUND) - set(GLOG_INCLUDE_DIRS ${GLOG_INCLUDE_DIR}) - set(GLOG_LIBRARIES ${GLOG_LIBRARY}) -endif() \ No newline at end of file From 17a6194d6e115a87e4996204170f6da065e83b07 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Mon, 22 Jun 2020 19:53:12 +0530 Subject: [PATCH 24/38] Fix a compile warning. 
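The likely culprit: `scores` was declared uninitialized in Index::score_results, yet all three slots are read when results are ranked, while only the slots backed by a configured sort field are ever written. The `= {0}` aggregate initializer value-initializes every element to the default score of 0. A minimal sketch of the pattern (hypothetical function name and values, not the real score_results signature):

    #include <cstdint>
    #include <cstdio>

    int64_t sum_scores(size_t num_sort_fields) {
        int64_t scores[3] = {0};  // without `= {0}`, the unwritten tail holds garbage
        for(size_t i = 0; i < num_sort_fields && i < 3; i++) {
            scores[i] = 100;      // stand-in for the real sort field values
        }
        // all three slots are read, mirroring how the whole array is consumed
        // even when fewer than three sort fields are configured
        return scores[0] + scores[1] + scores[2];
    }

    int main() {
        std::printf("%lld\n", (long long) sum_scores(1));  // prints 100
    }
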
--- .circleci/config.yml | 3 --- src/index.cpp | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a5d2dc2f..765bc4ae 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -17,9 +17,6 @@ jobs: - external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }} - external-Linux-cache-{{ .Branch }} - external-Linux-cache - - run: - name: debug - command: ls -latr /usr/local/lib/ - run: name: build command: $PROJECT_DIR/build.sh diff --git a/src/index.cpp b/src/index.cpp index 6384228f..66b0a299 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1463,7 +1463,7 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } const int64_t default_score = 0; - int64_t scores[3]; + int64_t scores[3] = {0}; // avoiding loop if(sort_fields.size() > 0) { From 57a43a0b210c74f449dc00d36119bd6e44a78b97 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Tue, 23 Jun 2020 06:56:58 +0530 Subject: [PATCH 25/38] Upgrade glog to avail static log file name feature. --- .circleci/config.yml | 2 +- docker-build.sh | 8 ++++---- docker/development.Dockerfile | 15 ++++++++------- src/typesense_server_utils.cpp | 13 +++++++++---- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 765bc4ae..a55fe2d4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 jobs: build: docker: - - image: typesense/typesense-development:22-JUNE-2020-1 + - image: typesense/typesense-development:23-JUNE-2020-1 environment: - PROJECT_DIR: /typesense - TYPESENSE_VERSION: $CIRCLE_BRANCH-$CIRCLE_SHA1 diff --git a/docker-build.sh b/docker-build.sh index 94164a05..e2ab7d71 100755 --- a/docker-build.sh +++ b/docker-build.sh @@ -22,13 +22,13 @@ if [[ "$@" == *"--depclean"* ]]; then fi #echo "Creating development image..." -#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:22-JUNE-2020-1 $PROJECT_DIR/docker +#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:23-JUNE-2020-1 $PROJECT_DIR/docker echo "Building Typesense $TYPESENSE_VERSION..." -docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ +docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ -DCMAKE_BUILD_TYPE=Release -H/typesense -B/typesense/$BUILD_DIR -docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 make typesense-server -C/typesense/$BUILD_DIR +docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 make typesense-server -C/typesense/$BUILD_DIR if [[ "$@" == *"--build-deploy-image"* ]]; then echo "Creating deployment image for Typesense $TYPESENSE_VERSION server ..." 
@@ -60,7 +60,7 @@ if [[ "$@" == *"--package-libs"* ]]; then fi # #if [[ "$@" == *"--create-deb-upload"* ]]; then -# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:22-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ +# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ # -DCMAKE_BUILD_TYPE=Debug -H/typesense -B/typesense/$BUILD_DIR #fi diff --git a/docker/development.Dockerfile b/docker/development.Dockerfile index 8b6d0cc4..ad2acb33 100644 --- a/docker/development.Dockerfile +++ b/docker/development.Dockerfile @@ -60,9 +60,10 @@ RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz RUN mkdir -p /opt/leveldb-1.22/build && cd /opt/leveldb-1.22/build && cmake -DCMAKE_BUILD_TYPE=Release .. && \ cmake --build . && make install && rm -rf /usr/local/lib/*.so* -ADD https://github.com/google/glog/archive/v0.4.0.tar.gz /opt/glog-0.4.0.tar.gz -RUN tar -C /opt -xf /opt/glog-0.4.0.tar.gz -RUN mkdir -p /opt/glog-0.4.0/build && cd /opt/glog-0.4.0/build && \ +ADD https://github.com/google/glog/archive/0a2e593.tar.gz /opt/glog-0a2e593.tar.gz +RUN tar -C /opt -xf /opt/glog-0a2e593.tar.gz +RUN mkdir -p /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \ + cd /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \ cmake -DBUILD_TESTING=0 -DWITH_GFLAGS=ON -DWITH_UNWIND=OFF .. && \ cmake --build . && make install && rm -rf /usr/local/lib/*.so* @@ -70,18 +71,18 @@ ADD https://github.com/apache/incubator-brpc/archive/0.9.7-rc03.tar.gz /opt/brpc RUN tar -C /opt -xf /opt/brpc-0.9.7-rc03.tar.gz COPY patches/brpc_cmakelists.txt /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt RUN chown root:root /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt -RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/build && cd /opt/incubator-brpc-0.9.7-rc03/build && \ +RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/bld && cd /opt/incubator-brpc-0.9.7-rc03/bld && \ cmake -DWITH_DEBUG_SYMBOLS=OFF -DWITH_GLOG=ON .. && \ make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ - rm -rf /opt/incubator-brpc-0.9.7-rc03/build/output/bin + rm -rf /opt/incubator-brpc-0.9.7-rc03/bld/output/bin ADD https://github.com/baidu/braft/archive/v1.1.1.tar.gz /opt/braft-v1.1.1.tar.gz RUN tar -C /opt -xf /opt/braft-v1.1.1.tar.gz COPY patches/braft_cmakelists.txt /opt/braft-1.1.1/src/CMakeLists.txt RUN chown root:root /opt/braft-1.1.1/src/CMakeLists.txt -RUN mkdir -p /opt/braft-1.1.1/build && cd /opt/braft-1.1.1/build && \ +RUN mkdir -p /opt/braft-1.1.1/bld && cd /opt/braft-1.1.1/bld && \ cmake -DWITH_DEBUG_SYMBOLS=ON -DBRPC_WITH_GLOG=ON .. 
&& make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ - rm -rf /opt/braft-1.1.1/build/output/bin + rm -rf /opt/braft-1.1.1/bld/output/bin ENV CC /usr/local/gcc-6.4.0/bin/gcc ENV CXX /usr/local/gcc-6.4.0/bin/g++ diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index 938e2a45..15fb9d67 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -150,12 +150,17 @@ int init_logger(Config & config, const std::string & server_version) { return 1; } - std::cout << "Log directory is configured as: " << log_dir << std::endl; - std::string log_path_prefix = log_dir + "/" + "typesense-log-"; - google::SetLogDestination(google::INFO, log_path_prefix.c_str()); - // flush log levels above -1 immediately (INFO=0) FLAGS_logbuflevel = -1; + + // available only on glog master (ensures that log file name is constant) + FLAGS_timestamp_in_logfile_name = false; + + std::string log_path = log_dir + "/" + "typesense.log"; + google::SetLogDestination(google::INFO, log_path.c_str()); + google::SetLogSymlink(google::INFO, ""); + + std::cout << "Log directory is configured as: " << log_dir << std::endl; } return 0; From a7fad633eb70abc7e9d65b1647a1d46387c0add2 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 24 Jun 2020 07:57:22 +0530 Subject: [PATCH 26/38] Use mt19937_64 directly for consistency. --- src/main/main.cpp | 2 +- src/string_utils.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/main.cpp b/src/main/main.cpp index 80606fe1..fc5ad311 100644 --- a/src/main/main.cpp +++ b/src/main/main.cpp @@ -21,7 +21,7 @@ int main(int argc, char* argv[]) { Store *store = new Store(state_dir_path); CollectionManager & collectionManager = CollectionManager::get_instance(); - collectionManager.init(store, 4, "abcd", "123"); + collectionManager.init(store, 4, "abcd"); collectionManager.load(); std::vector fields_to_index = { diff --git a/src/string_utils.cpp b/src/string_utils.cpp index 2782c2b6..6de06f4c 100644 --- a/src/string_utils.cpp +++ b/src/string_utils.cpp @@ -1,8 +1,8 @@ #include "string_utils.h" #include #include -#include #include +#include std::string lower_and_no_special_chars(const std::string & str) { std::stringstream ss; @@ -57,14 +57,14 @@ std::string StringUtils::randstring(size_t length, uint64_t seed) { "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - thread_local static std::mt19937 rg(seed); - thread_local static std::uniform_int_distribution pick(0, sizeof(chrs) - 2); + thread_local static std::mt19937_64 mt_rand(seed); std::string s; s.reserve(length); while(length--) { - s += chrs[pick(rg)]; + size_t index = (mt_rand() % (sizeof(chrs) - 1)); + s += chrs[index]; } return s; From 8e1338626e488630781c1b4e7a035b3a6a1afc76 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 24 Jun 2020 19:51:07 +0530 Subject: [PATCH 27/38] Log all levels to a single file. 
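glog logs a message both to its own severity's file and to the files of all lower severities, so the INFO destination already receives WARNING, ERROR and FATAL lines; giving the higher severities an empty destination only suppresses the redundant per-level files. A self-contained sketch of the configuration (the hard-coded path stands in for the path derived from --log-dir, and FLAGS_timestamp_in_logfile_name assumes the glog master snapshot from the earlier upgrade commit):

    #include <glog/logging.h>

    int main() {
        google::InitGoogleLogging("typesense");

        FLAGS_logbuflevel = -1;                   // flush every severity immediately
        FLAGS_timestamp_in_logfile_name = false;  // keep the file name constant

        // one file for all levels: INFO's file already contains WARNING,
        // ERROR and FATAL; the blank destinations disable the extra files
        google::SetLogDestination(google::INFO, "/tmp/typesense.log");
        google::SetLogDestination(google::WARNING, "");
        google::SetLogDestination(google::ERROR, "");
        google::SetLogDestination(google::FATAL, "");

        LOG(WARNING) << "lands in /tmp/typesense.log, not in a separate file";
        return 0;
    }
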
--- src/typesense_server_utils.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index 15fb9d67..d0ae17da 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -157,8 +157,14 @@ int init_logger(Config & config, const std::string & server_version) { FLAGS_timestamp_in_logfile_name = false; std::string log_path = log_dir + "/" + "typesense.log"; + + // will log level INFO and up to the given log file google::SetLogDestination(google::INFO, log_path.c_str()); - google::SetLogSymlink(google::INFO, ""); + + // don't create separate log files for each level + google::SetLogDestination(google::WARNING, ""); + google::SetLogDestination(google::ERROR, ""); + google::SetLogDestination(google::FATAL, ""); std::cout << "Log directory is configured as: " << log_dir << std::endl; } From ba80f06001453a4af50c4ba65c860d43e6115ce4 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Wed, 24 Jun 2020 20:34:12 +0530 Subject: [PATCH 28/38] Generate whole key without relying on seed based generation. --- include/http_data.h | 16 +++++----------- include/string_utils.h | 2 +- src/core_api.cpp | 2 +- src/http_server.cpp | 7 +++++++ src/string_utils.cpp | 8 ++++---- src/typesense_server_utils.cpp | 5 ++++- 6 files changed, 22 insertions(+), 18 deletions(-) diff --git a/include/http_data.h b/include/http_data.h index 3a917956..cd306204 100644 --- a/include/http_data.h +++ b/include/http_data.h @@ -117,16 +117,16 @@ struct http_req { uint64_t route_hash; std::map params; std::string body; - uint64_t seed; + std::string metadata; - http_req(): route_hash(1), seed(random_uint64_t()) { + http_req(): route_hash(1) { } http_req(h2o_req_t* _req, const std::string & http_method, uint64_t route_hash, const std::map & params, std::string body): _req(_req), http_method(http_method), route_hash(route_hash), params(params), - body(body), seed(random_uint64_t()) { + body(body) { } @@ -136,7 +136,7 @@ struct http_req { nlohmann::json content = nlohmann::json::parse(serialized_content); route_hash = content["route_hash"]; body = content["body"]; - seed = content["seed"]; + metadata = content.count("metadata") != 0 ? content["metadata"] : ""; for (nlohmann::json::iterator it = content["params"].begin(); it != content["params"].end(); ++it) { params.emplace(it.key(), it.value()); @@ -150,16 +150,10 @@ struct http_req { content["route_hash"] = route_hash; content["params"] = params; content["body"] = body; - content["seed"] = seed; + content["metadata"] = metadata; return content.dump(); } - - uint64_t random_uint64_t() { - thread_local std::mt19937 rg(std::random_device{}()); - thread_local std::uniform_int_distribution pick(0, std::numeric_limits::max()); - return pick(rg); - } }; struct request_response { diff --git a/include/string_utils.h b/include/string_utils.h index 5a19b3ca..62005f7e 100644 --- a/include/string_utils.h +++ b/include/string_utils.h @@ -234,7 +234,7 @@ struct StringUtils { return hash != std::numeric_limits::max() ? 
hash : (std::numeric_limits::max()-1); } - static std::string randstring(size_t length, uint64_t seed); + static std::string randstring(size_t length); static std::string hmac(const std::string& key, const std::string& msg); }; \ No newline at end of file diff --git a/src/core_api.cpp b/src/core_api.cpp index a52b1a7a..7d5294d3 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -894,7 +894,7 @@ bool post_create_key(http_req &req, http_res &res) { return false; } - const std::string &rand_key = StringUtils::randstring(AuthManager::KEY_LEN, req.seed); + const std::string &rand_key = req.metadata; api_key_t api_key( rand_key, diff --git a/src/http_server.cpp b/src/http_server.cpp index 8058e32d..2b8b6dc1 100644 --- a/src/http_server.cpp +++ b/src/http_server.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "raft_server.h" #include "logger.h" @@ -371,6 +372,12 @@ int HttpServer::catch_all_handler(h2o_handler_t *_self, h2o_req_t *req) { } // routes match and is an authenticated request + // do any additional pre-request middleware operations here + if(rpath->action == "keys:create") { + // we enrich incoming request with a random API key here so that leader and replicas will use the same key + request->metadata = StringUtils::randstring(AuthManager::KEY_LEN); + } + // for writes, we defer to replication_state if(http_method != "GET") { self->http_server->get_replication_state()->write(request, response); diff --git a/src/string_utils.cpp b/src/string_utils.cpp index 6de06f4c..302831dd 100644 --- a/src/string_utils.cpp +++ b/src/string_utils.cpp @@ -52,19 +52,19 @@ void StringUtils::unicode_normalize(std::string & str) const { str.assign(lower_and_no_special_chars(out.str())); } -std::string StringUtils::randstring(size_t length, uint64_t seed) { +std::string StringUtils::randstring(size_t length) { static auto& chrs = "0123456789" "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - thread_local static std::mt19937_64 mt_rand(seed); + thread_local std::mt19937 rg(std::random_device{}()); + thread_local std::uniform_int_distribution pick(0, sizeof(chrs) - 2); std::string s; s.reserve(length); while(length--) { - size_t index = (mt_rand() % (sizeof(chrs) - 1)); - s += chrs[index]; + s += chrs[pick(rg)]; } return s; diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index d0ae17da..9a56416f 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -158,9 +158,12 @@ int init_logger(Config & config, const std::string & server_version) { std::string log_path = log_dir + "/" + "typesense.log"; - // will log level INFO and up to the given log file + // will log levels INFO **and above** to the given log file google::SetLogDestination(google::INFO, log_path.c_str()); + // don't create symlink for INFO log + google::SetLogSymlink(google::INFO, ""); + // don't create separate log files for each level google::SetLogDestination(google::WARNING, ""); google::SetLogDestination(google::ERROR, ""); From 0762f4ae296c7c2e5d1ccd40c754fb991d00dcee Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 28 Jun 2020 16:05:31 +0530 Subject: [PATCH 29/38] Group key should be actual values. 
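Previously, `group_key` was the joined list of field names (e.g. "size,brand"), which is identical for every group in a response; it now holds the grouped document's actual field values, with absent optional fields skipped. A self-contained sketch of the new shape, using sample values that mirror test/group_documents.jsonl:

    #include <nlohmann/json.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        // top-ranked document of a group
        nlohmann::json document = {{"brand", "Beta"}, {"size", 11}, {"rating", 4.8}};
        std::vector<std::string> group_by_fields = {"size", "brand"};

        nlohmann::json group_key = nlohmann::json::array();
        for(const auto& field_name : group_by_fields) {
            if(document.count(field_name) != 0) {  // absent optional fields are skipped
                group_key.push_back(document[field_name]);
            }
        }

        // old response: "group_key": "size,brand"  (same string for every group)
        // new response: "group_key": [11,"Beta"]   (identifies the group)
        std::cout << group_key << std::endl;       // prints [11,"Beta"]
    }
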
--- cmake/Glog.cmake | 0 include/logger.h | 2 -- include/typesense_server_utils.h | 2 +- src/collection.cpp | 23 ++++++++++-------- src/main/typesense_server.cpp | 2 +- test/collection_grouping_test.cpp | 40 +++++++++++++++++++++---------- test/collection_override_test.cpp | 6 ++++- 7 files changed, 48 insertions(+), 27 deletions(-) delete mode 100644 cmake/Glog.cmake diff --git a/cmake/Glog.cmake b/cmake/Glog.cmake deleted file mode 100644 index e69de29b..00000000 diff --git a/include/logger.h b/include/logger.h index 01adc553..5eee4cdf 100644 --- a/include/logger.h +++ b/include/logger.h @@ -1,5 +1,3 @@ #pragma once -#include -#include #include \ No newline at end of file diff --git a/include/typesense_server_utils.h b/include/typesense_server_utils.h index 4b0af691..34f0cff4 100644 --- a/include/typesense_server_utils.h +++ b/include/typesense_server_utils.h @@ -1,10 +1,10 @@ #pragma once +#include "logger.h" #include #include #include #include "config.h" -#include "logger.h" #include "store.h" #include "collection_manager.h" #include diff --git a/src/collection.cpp b/src/collection.cpp index fbe0a376..402892e5 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -862,7 +862,7 @@ Option Collection::search(const std::string & query, const std:: result["found"] = total_found; - std::string hits_key = (group_limit > 1) ? "grouped_hits" : "hits"; + std::string hits_key = group_limit ? "grouped_hits" : "hits"; result[hits_key] = nlohmann::json::array(); // construct results array @@ -870,17 +870,11 @@ Option Collection::search(const std::string & query, const std:: const std::vector & kv_group = result_group_kvs[result_kvs_index]; nlohmann::json group_hits; - if(group_limit > 1) { + if(group_limit) { group_hits["hits"] = nlohmann::json::array(); - std::vector group_keys; - for(const auto& group_key: group_by_fields) { - group_keys.push_back(group_key); - } - - group_hits["group_key"] = StringUtils::join(group_keys, ","); } - nlohmann::json& hits_array = (group_limit > 1) ? group_hits["hits"] : result["hits"]; + nlohmann::json& hits_array = group_limit ? 
group_hits["hits"] : result["hits"]; for(const KV* field_order_kv: kv_group) { const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key); @@ -960,7 +954,16 @@ Option Collection::search(const std::string & query, const std:: hits_array.push_back(wrapper_doc); } - if(group_limit > 1) { + if(group_limit) { + const auto& document = group_hits["hits"][0]["document"]; + + group_hits["group_key"] = nlohmann::json::array(); + for(const auto& field_name: group_by_fields) { + if(document.count(field_name) != 0) { + group_hits["group_key"].push_back(document[field_name]); + } + } + result["grouped_hits"].push_back(group_hits); } } diff --git a/src/main/typesense_server.cpp b/src/main/typesense_server.cpp index 5a3ef7d5..fd9ed722 100644 --- a/src/main/typesense_server.cpp +++ b/src/main/typesense_server.cpp @@ -1,6 +1,6 @@ +#include "typesense_server_utils.h" #include "core_api.h" #include "config.h" -#include "typesense_server_utils.h" void master_server_routes() { // collection management diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index b1fcd8b6..1c565bd5 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -70,9 +70,10 @@ TEST_F(CollectionGroupingTest, GroupingBasics) { ASSERT_EQ(3, res["found"].get()); ASSERT_EQ(3, res["grouped_hits"].size()); - ASSERT_STREQ("size", res["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get()); ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + ASSERT_EQ(11, res["grouped_hits"][0]["hits"][0]["document"]["size"].get()); ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get()); ASSERT_STREQ("1", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); @@ -112,7 +113,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) { // 7 unique ratings ASSERT_EQ(7, res["found"].get()); ASSERT_EQ(7, res["grouped_hits"].size()); - ASSERT_STREQ("rating", res["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["group_key"][0].get()); ASSERT_EQ(12, res["grouped_hits"][0]["hits"][0]["document"]["size"].get()); ASSERT_STREQ("8", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); @@ -151,7 +152,14 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { ASSERT_EQ(10, res["found"].get()); ASSERT_EQ(10, res["grouped_hits"].size()); - ASSERT_STREQ("size,brand", res["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get()); + + ASSERT_STREQ("Beta", res["grouped_hits"][0]["group_key"][1].get().c_str()); + + // optional field should have no value in the group key component + ASSERT_EQ(1, res["grouped_hits"][5]["group_key"].size()); + ASSERT_STREQ("10", res["grouped_hits"][5]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("11", res["grouped_hits"][5]["hits"][1]["document"]["id"].get().c_str()); ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size()); ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); @@ -199,7 +207,8 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { // total count and facet counts should be the same ASSERT_EQ(10, res["found"].get()); ASSERT_EQ(2, res["grouped_hits"].size()); - ASSERT_STREQ("size,brand", res["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_EQ(10, res["grouped_hits"][0]["group_key"][0].get()); 
+ ASSERT_STREQ("Omega", res["grouped_hits"][0]["group_key"][1].get().c_str()); ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); @@ -215,7 +224,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); } -TEST_F(CollectionGroupingTest, GroupingWithSingleDistinct) { +TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) { auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), @@ -224,13 +233,18 @@ TEST_F(CollectionGroupingTest, GroupingWithSingleDistinct) { {}, {}, {"brand"}, 1).get(); ASSERT_EQ(5, res["found"].get()); - ASSERT_EQ(5, res["hits"].size()); + ASSERT_EQ(5, res["grouped_hits"].size()); - ASSERT_STREQ("4", res["hits"][0]["document"]["id"].get().c_str()); - ASSERT_STREQ("2", res["hits"][1]["document"]["id"].get().c_str()); - ASSERT_STREQ("8", res["hits"][2]["document"]["id"].get().c_str()); - ASSERT_STREQ("10", res["hits"][3]["document"]["id"].get().c_str()); // unbranded - ASSERT_STREQ("9", res["hits"][4]["document"]["id"].get().c_str()); + // all hits array must be of size 1 + for(auto i=0; i<5; i++) { + ASSERT_EQ(1, res["grouped_hits"][i]["hits"].size()); + } + + ASSERT_STREQ("4", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("2", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("10", res["grouped_hits"][3]["hits"][0]["document"]["id"].get().c_str()); // unbranded + ASSERT_STREQ("9", res["grouped_hits"][4]["hits"][0]["document"]["id"].get().c_str()); // facet counts should each be 1, including unbranded ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); @@ -290,7 +304,9 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) { ASSERT_EQ(4, res["found"].get()); ASSERT_EQ(4, res["grouped_hits"].size()); - ASSERT_STREQ("colors", res["grouped_hits"][0]["group_key"].get().c_str()); + + ASSERT_EQ(1, res["grouped_hits"][0]["group_key"][0].size()); + ASSERT_STREQ("white", res["grouped_hits"][0]["group_key"][0][0].get().c_str()); ASSERT_STREQ("11", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("10", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index a36fb85a..e79e2b42 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -378,7 +378,11 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) { ASSERT_EQ(8, results["found"].get()); - ASSERT_STREQ("cast", results["grouped_hits"][0]["group_key"].get().c_str()); + ASSERT_EQ(1, results["grouped_hits"][0]["group_key"].size()); + ASSERT_EQ(2, results["grouped_hits"][0]["group_key"][0].size()); + ASSERT_STREQ("Chris Evans", results["grouped_hits"][0]["group_key"][0][0].get().c_str()); + ASSERT_STREQ("Scarlett Johansson", results["grouped_hits"][0]["group_key"][0][1].get().c_str()); + ASSERT_STREQ("6", results["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("8", results["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); From 2102f2323f810b88e9965bba0870edb0558255ff Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 28 Jun 2020 17:22:12 +0530 Subject: [PATCH 30/38] Identify curated 
results in response. --- src/collection.cpp | 10 +++++++++- test/collection_override_test.cpp | 11 +++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/collection.cpp b/src/collection.cpp index 402892e5..bdfd1801 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -837,6 +837,7 @@ Option Collection::search(const std::string & query, const std:: uint64_t override_position = override_result_kvs[override_kv_index][0]->distinct_key; if(result_position == override_position) { + override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result result_group_kvs.push_back(override_result_kvs[override_kv_index]); override_kv_index++; } else { @@ -846,6 +847,7 @@ Option Collection::search(const std::string & query, const std:: } while(override_kv_index < override_result_kvs.size()) { + override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result result_group_kvs.push_back({override_result_kvs[override_kv_index]}); override_kv_index++; } @@ -947,10 +949,16 @@ Option Collection::search(const std::string & query, const std:: wrapper_doc["highlights"].push_back(h_json); } + //wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; + prune_document(document, include_fields, exclude_fields); wrapper_doc["document"] = document; wrapper_doc["text_match"] = field_order_kv->match_score; - //wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; + + if(field_order_kv->match_score == 0) { + wrapper_doc["curated"] = true; + } + hits_array.push_back(wrapper_doc); } diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index e79e2b42..e08f32d2 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -122,6 +122,11 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeExactQueryMatch) { ASSERT_STREQ("3", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get().c_str()); + // curated results should be marked as such + ASSERT_EQ(true, results["hits"][0]["curated"].get()); + ASSERT_EQ(true, results["hits"][1]["curated"].get()); + ASSERT_EQ(0, results["hits"][2].count("curated")); + coll_mul_fields->remove_override("exclude-rule"); coll_mul_fields->remove_override("include-rule"); @@ -367,6 +372,12 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) { ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get().c_str()); ASSERT_STREQ("11", results["hits"][3]["document"]["id"].get().c_str()); + // pinned hits should be marked as curated + ASSERT_EQ(true, results["hits"][0]["curated"].get()); + ASSERT_EQ(true, results["hits"][1]["curated"].get()); + ASSERT_EQ(true, results["hits"][2]["curated"].get()); + ASSERT_EQ(0, results["hits"][3].count("curated")); + // with grouping results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, From 8c42d5da7bf32c305d3b145317c91d7cb675863e Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 28 Jun 2020 21:52:30 +0530 Subject: [PATCH 31/38] Fix url of h2o github archive. 
--- cmake/H2O.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/H2O.cmake b/cmake/H2O.cmake index 679088e8..572e51d7 100644 --- a/cmake/H2O.cmake +++ b/cmake/H2O.cmake @@ -6,7 +6,7 @@ set(H2O_TAR_PATH ${DEP_ROOT_DIR}/v${H2O_NAME}.tar.gz) if(NOT EXISTS ${H2O_TAR_PATH}) message(STATUS "Downloading ${H2O_NAME}...") - file(DOWNLOAD https://github.com/h2o/h2o/archive/${H2O_VERSION}.tar.gz ${H2O_TAR_PATH}) + file(DOWNLOAD https://github.com/h2o/h2o/archive/v${H2O_VERSION}.tar.gz ${H2O_TAR_PATH}) endif() if(NOT EXISTS ${DEP_ROOT_DIR}/${H2O_NAME}) From 0cdd58e86ce77d92b6ed49683bb57d4a5cd5786b Mon Sep 17 00:00:00 2001 From: kishorenc Date: Tue, 30 Jun 2020 06:58:06 +0530 Subject: [PATCH 32/38] Validate group limit & other numerical parameters of search. --- include/string_utils.h | 14 ++++++++++++-- src/collection.cpp | 4 ++++ src/core_api.cpp | 26 +++++++++++++------------- test/collection_grouping_test.cpp | 21 +++++++++++++++++++++ test/string_utils_test.cpp | 5 +++++ 5 files changed, 55 insertions(+), 15 deletions(-) diff --git a/include/string_utils.h b/include/string_utils.h index 62005f7e..1b12da8b 100644 --- a/include/string_utils.h +++ b/include/string_utils.h @@ -151,8 +151,18 @@ struct StringUtils { } char * p ; - strtoull(s.c_str(), &p, 10); - return (*p == 0); + unsigned long long ull = strtoull(s.c_str(), &p, 10); + return (*p == 0) && ull <= std::numeric_limits::max(); + } + + static bool is_uint32_t(const std::string &s) { + if(s.empty()) { + return false; + } + + char * p ; + unsigned long ul = strtoul(s.c_str(), &p, 10); + return (*p == 0) && ul <= std::numeric_limits::max(); } static void toupper(std::string& str) { diff --git a/src/collection.cpp b/src/collection.cpp index bdfd1801..2c12f690 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -394,6 +394,10 @@ Option Collection::search(const std::string & query, const std:: return Option(400, "No search fields specified for the query."); } + if(group_limit == 0 || group_limit >= 100) { + return Option(400, "Value of `group_limit` is invalid."); + } + std::vector excluded_ids; std::map> include_ids; // position => list of IDs populate_overrides(query, pinned_hits, hidden_hits, include_ids, excluded_ids); diff --git a/src/core_api.cpp b/src/core_api.cpp index 7d5294d3..06952a80 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -301,42 +301,42 @@ bool get_search(http_req & req, http_res & res) { } } - if(!StringUtils::is_uint64_t(req.params[DROP_TOKENS_THRESHOLD])) { + if(!StringUtils::is_uint32_t(req.params[DROP_TOKENS_THRESHOLD])) { res.set_400("Parameter `" + std::string(DROP_TOKENS_THRESHOLD) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[TYPO_TOKENS_THRESHOLD])) { + if(!StringUtils::is_uint32_t(req.params[TYPO_TOKENS_THRESHOLD])) { res.set_400("Parameter `" + std::string(TYPO_TOKENS_THRESHOLD) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[NUM_TYPOS])) { + if(!StringUtils::is_uint32_t(req.params[NUM_TYPOS])) { res.set_400("Parameter `" + std::string(NUM_TYPOS) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[PER_PAGE])) { + if(!StringUtils::is_uint32_t(req.params[PER_PAGE])) { res.set_400("Parameter `" + std::string(PER_PAGE) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[PAGE])) { + if(!StringUtils::is_uint32_t(req.params[PAGE])) { res.set_400("Parameter `" + std::string(PAGE) + "` must be an 
unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[MAX_FACET_VALUES])) { + if(!StringUtils::is_uint32_t(req.params[MAX_FACET_VALUES])) { res.set_400("Parameter `" + std::string(MAX_FACET_VALUES) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[SNIPPET_THRESHOLD])) { + if(!StringUtils::is_uint32_t(req.params[SNIPPET_THRESHOLD])) { res.set_400("Parameter `" + std::string(SNIPPET_THRESHOLD) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[GROUP_LIMIT])) { + if(!StringUtils::is_uint32_t(req.params[GROUP_LIMIT])) { res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer."); return false; } @@ -441,19 +441,19 @@ bool get_search(http_req & req, http_res & res) { Option result_op = collection->search(req.params[QUERY], search_fields, filter_str, facet_fields, sort_fields, std::stoi(req.params[NUM_TYPOS]), - static_cast(std::stoi(req.params[PER_PAGE])), - static_cast(std::stoi(req.params[PAGE])), + static_cast(std::stol(req.params[PER_PAGE])), + static_cast(std::stol(req.params[PAGE])), token_order, prefix, drop_tokens_threshold, include_fields, exclude_fields, - static_cast(std::stoi(req.params[MAX_FACET_VALUES])), + static_cast(std::stol(req.params[MAX_FACET_VALUES])), req.params[FACET_QUERY], - static_cast(std::stoi(req.params[SNIPPET_THRESHOLD])), + static_cast(std::stol(req.params[SNIPPET_THRESHOLD])), req.params[HIGHLIGHT_FULL_FIELDS], typo_tokens_threshold, pinned_hits, hidden_hits, group_by_fields, - static_cast(std::stoi(req.params[GROUP_LIMIT])) + static_cast(std::stol(req.params[GROUP_LIMIT])) ); uint64_t timeMillis = std::chrono::duration_cast( diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index 1c565bd5..63118a4e 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -222,6 +222,27 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); + + // respect min and max grouping limit (greater than 0 and less than 99) + auto res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "brand: omeg", 30, + "", 10, + {}, {}, {"rating"}, 100); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Value of `group_limit` is invalid.", res_op.error().c_str()); + + res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "brand: omeg", 30, + "", 10, + {}, {}, {"rating"}, 0); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Value of `group_limit` is invalid.", res_op.error().c_str()); } TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) { diff --git a/test/string_utils_test.cpp b/test/string_utils_test.cpp index 4f8c2400..a2d64147 100644 --- a/test/string_utils_test.cpp +++ b/test/string_utils_test.cpp @@ -54,3 +54,8 @@ TEST(StringUtilsTest, HMAC) { std::string digest1 = StringUtils::hmac("KeyVal", "{\"filter_by\": \"user_id:1080\"}"); ASSERT_STREQ("IvjqWNZ5M5ElcvbMoXj45BxkQrZG4ZKEaNQoRioCx2s=", digest1.c_str()); } + +TEST(StringUtilsTest, UInt32Validation) { + std::string big_num = "99999999999999999999999999999999"; + ASSERT_FALSE(StringUtils::is_uint32_t(big_num)); +} From 2e0d8179f7da747ae1f23ec61c7f8a00ce8ffe18 
From 2e0d8179f7da747ae1f23ec61c7f8a00ce8ffe18 Mon Sep 17 00:00:00 2001
From: kishorenc
Date: Tue, 30 Jun 2020 06:59:15 +0530
Subject: [PATCH 33/38] Change typesense memory usage metric name.

---
 src/system_metrics.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/system_metrics.cpp b/src/system_metrics.cpp
index ac5412b7..eb79c42b 100644
--- a/src/system_metrics.cpp
+++ b/src/system_metrics.cpp
@@ -27,7 +27,7 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result
     rusage r_usage;
     getrusage(RUSAGE_SELF, &r_usage);

-    result["memory_used_process_bytes"] = r_usage.ru_maxrss * 1000;
+    result["typesense_memory_used_bytes"] = r_usage.ru_maxrss * 1000;

     uint64_t memory_available_bytes = 0;
     uint64_t memory_total_bytes = 0;

From 6377d1b58c8fee58c6b8b42d98adc49b4fd4b495 Mon Sep 17 00:00:00 2001
From: kishorenc
Date: Tue, 30 Jun 2020 18:08:44 +0530
Subject: [PATCH 34/38] Fix tests.

---
 src/collection.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/collection.cpp b/src/collection.cpp
index 2c12f690..3e5a031e 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -394,7 +394,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
         return Option<nlohmann::json>(400, "No search fields specified for the query.");
     }

-    if(group_limit == 0 || group_limit >= 100) {
+    if(!group_by_fields.empty() && (group_limit == 0 || group_limit >= 100)) {
         return Option<nlohmann::json>(400, "Value of `group_limit` is invalid.");
     }

From 0f38242e09e94bd33af2b4795ff8eea762e0fee3 Mon Sep 17 00:00:00 2001
From: kishorenc
Date: Wed, 1 Jul 2020 08:09:00 +0530
Subject: [PATCH 35/38] Clearer error message about group limit range.

---
 include/collection.h              | 2 ++
 src/collection.cpp                | 5 +++--
 test/collection_grouping_test.cpp | 4 ++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/collection.h b/include/collection.h
index 730bad63..69f15832 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -273,6 +273,8 @@ public:
     const size_t PER_PAGE_MAX = 250;

+    const size_t GROUP_LIMIT_MAX = 99;
+
     // Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
     static constexpr const char* COLLECTION_META_PREFIX = "$CM";
     static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
diff --git a/src/collection.cpp b/src/collection.cpp
index 3e5a031e..e3724a11 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -394,8 +394,9 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
         return Option<nlohmann::json>(400, "No search fields specified for the query.");
     }

-    if(!group_by_fields.empty() && (group_limit == 0 || group_limit >= 100)) {
-        return Option<nlohmann::json>(400, "Value of `group_limit` is invalid.");
+    if(!group_by_fields.empty() && (group_limit == 0 || group_limit > GROUP_LIMIT_MAX)) {
+        return Option<nlohmann::json>(400, "Value of `group_limit` must be between 1 and " +
+                                           std::to_string(GROUP_LIMIT_MAX) + ".");
     }

     std::vector<uint32_t> excluded_ids;
diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp
index 63118a4e..9d02e0a8 100644
--- a/test/collection_grouping_test.cpp
+++ b/test/collection_grouping_test.cpp
@@ -232,7 +232,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
                                      {}, {}, {"rating"}, 100);

     ASSERT_FALSE(res_op.ok());
-    ASSERT_STREQ("Value of `group_limit` is invalid.", res_op.error().c_str());
+    ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str());

     res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
                                 false, Index::DROP_TOKENS_THRESHOLD,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
                                 "", 10,
@@ -242,7 +242,7 @@
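The two collection.cpp fixes above first scope the group_limit check to grouped searches, then pin the accepted range to [1, GROUP_LIMIT_MAX]. A standalone restatement of that rule, not the exact Collection::search() code:

```
#include <cstddef>
#include <string>
#include <vector>

// Mirrors the constant added to include/collection.h in this patch.
static const size_t GROUP_LIMIT_MAX = 99;

static bool validate_group_limit(const std::vector<std::string>& group_by_fields,
                                 size_t group_limit, std::string& error) {
    // the limit only applies when grouping has actually been requested
    if (!group_by_fields.empty() &&
        (group_limit == 0 || group_limit > GROUP_LIMIT_MAX)) {
        error = "Value of `group_limit` must be between 1 and " +
                std::to_string(GROUP_LIMIT_MAX) + ".";
        return false;
    }
    return true;
}
```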
TEST_F(CollectionGroupingTest, GroupingCompoundKey) { {}, {}, {"rating"}, 0); ASSERT_FALSE(res_op.ok()); - ASSERT_STREQ("Value of `group_limit` is invalid.", res_op.error().c_str()); + ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str()); } TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) { From e64805320b7d85dc0ba6c4dc3ef785284622bb2a Mon Sep 17 00:00:00 2001 From: kishorenc Date: Fri, 3 Jul 2020 06:10:21 +0530 Subject: [PATCH 36/38] Address some warnings. --- src/raft_server.cpp | 1 + src/string_utils.cpp | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/raft_server.cpp b/src/raft_server.cpp index dc1483e9..73bd1a63 100644 --- a/src/raft_server.cpp +++ b/src/raft_server.cpp @@ -34,6 +34,7 @@ int ReplicationState::start(const butil::EndPoint & peering_endpoint, const int node_options.fsm = this; node_options.node_owns_fsm = false; node_options.snapshot_interval_s = snapshot_interval_s; + node_options.filter_before_copy_remote = false; std::string prefix = "local://" + raft_dir; node_options.log_uri = prefix + "/" + log_dir_name; node_options.raft_meta_uri = prefix + "/" + meta_dir_name; diff --git a/src/string_utils.cpp b/src/string_utils.cpp index 302831dd..041f7f07 100644 --- a/src/string_utils.cpp +++ b/src/string_utils.cpp @@ -19,6 +19,10 @@ std::string lower_and_no_special_chars(const std::string & str) { } void StringUtils::unicode_normalize(std::string & str) const { + if(str.empty()) { + return ; + } + std::stringstream out; for (char *s = &str[0]; *s;) { @@ -49,7 +53,7 @@ void StringUtils::unicode_normalize(std::string & str) const { } } - str.assign(lower_and_no_special_chars(out.str())); + str = lower_and_no_special_chars(out.str()); } std::string StringUtils::randstring(size_t length) { From 60a1454d8d59312066b5e53eee3ad6d7caa7d0de Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sat, 4 Jul 2020 08:17:49 +0530 Subject: [PATCH 37/38] Cmake: remove remote brpc & braft builds for mac. --- CMakeLists.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c65f4e47..b5afc665 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,11 +54,6 @@ include(cmake/GoogleTest.cmake) include(cmake/TestResources.cmake) include(cmake/Iconv.cmake) -if (APPLE) - include(cmake/brpc.cmake) - include(cmake/braft.cmake) -endif() - FILE(GLOB SRC_FILES src/*.cpp) FILE(GLOB TEST_FILES test/*.cpp) From 14d1604d493639ad2162b217755c849a17cc5aab Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sat, 4 Jul 2020 08:21:29 +0530 Subject: [PATCH 38/38] Update README. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d4f7cbfe..c0463bbd 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Here's a quick example showcasing how you can create a collection, index a docum Let's begin by starting the Typesense server via Docker: ``` -docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.13.0 --data-dir /data --api-key=Hu52dwsas2AdxdE +docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.14.0 --data-dir /data --api-key=Hu52dwsas2AdxdE ``` Install the Python client for Typesense (we have [clients](https://typesense.org/api/#api-clients) for other languages too): @@ -146,7 +146,7 @@ works without turning many knobs. **Speed is great, but what about the memory footprint?** -A fresh Typesense server will take less than 5 MB of memory. 
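The string_utils.cpp change in patch 36 above adds an early return for empty input and replaces assign() with plain assignment. A simplified sketch of that flow; lower_and_no_special_chars appears only by name in the diff, so its body here (keep ASCII alphanumerics, lowercase them) is an assumption:

```
#include <cctype>
#include <sstream>
#include <string>

// Assumed stand-in: the diff shows only this function's signature.
static std::string lower_and_no_special_chars(const std::string& str) {
    std::string out;
    for (unsigned char c : str) {
        if (std::isalnum(c)) {
            out += static_cast<char>(std::tolower(c));
        }
    }
    return out;
}

static void unicode_normalize_sketch(std::string& str) {
    if (str.empty()) {
        return;  // the guard added by the patch: nothing to normalize
    }
    std::stringstream out;
    out << str;  // the real code transliterates UTF-8 sequences here
    str = lower_and_no_special_chars(out.str());  // plain assignment, as patched
}
```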
As you start indexing documents, the memory use will +A fresh Typesense server will consume about 30 MB of memory. As you start indexing documents, the memory use will increase correspondingly. How much it increases depends on the number and type of fields you index. We've strived to keep the in-memory data structures lean. To give you a rough idea: when 1 million
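To observe the kind of footprint the README describes, the typesense_memory_used_bytes metric from patch 33 in this series can be reproduced in isolation. A minimal sketch; note that ru_maxrss is reported in kilobytes on Linux (the * 1000 scaling in the patch is an approximation, 1024 would be binary-exact) and in bytes on macOS, which this sketch does not handle:

```
#include <sys/resource.h>

#include <cstdint>
#include <iostream>

int main() {
    rusage r_usage{};
    if (getrusage(RUSAGE_SELF, &r_usage) != 0) {
        return 1;
    }
    // peak resident set size of this process, scaled as in the patch
    const uint64_t memory_used_bytes =
        static_cast<uint64_t>(r_usage.ru_maxrss) * 1000;
    std::cout << "typesense_memory_used_bytes: " << memory_used_bytes << "\n";
    return 0;
}
```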