From d2798ac00b00956d3bcf5366a10f6970e6bc7b33 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 19 Nov 2021 19:32:00 +0530 Subject: [PATCH 01/20] Log times for operations. --- src/art.cpp | 2 ++ src/index.cpp | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/art.cpp b/src/art.cpp index 40017d0c..311e1993 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -1024,6 +1024,8 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r } } + LOG(INFO) << "leaf results.size: " << results.size(); + printf("OUTSIDE art_topk_iter: results size: %d\n", results.size()); return 0; } diff --git a/src/index.cpp b/src/index.cpp index d77bf416..40580704 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1212,7 +1212,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, const std::vector& filters, const bool enable_short_circuit) const { - //auto begin = std::chrono::high_resolution_clock::now(); + auto begin = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < filters.size(); i++) { const filter & a_filter = filters[i]; @@ -1552,10 +1552,10 @@ void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, } } - /*long long int timeMillis = + long long int timeMillis = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); - LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms";*/ + LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms"; } @@ -2881,11 +2881,18 @@ void Index::search_field(const uint8_t & field_id, // prefix should apply only for last token const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; + auto begin = std::chrono::high_resolution_clock::now(); + // need less candidates for filtered searches since we already only pick tokens with results art_fuzzy_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token_len, costs[token_index], costs[token_index], num_fuzzy_candidates, token_order, prefix_search, filter_ids, filter_ids_length, leaves, unique_tokens); + auto timeMillis = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - begin).count(); + + LOG(INFO) << "Time taken for fuzzy search: " << timeMillis << "ms"; + if(!leaves.empty()) { token_cost_cache.emplace(token_cost_hash, leaves); for(auto leaf: leaves) { From 4cb6d7926a411025eaca45cbef1869cd97508179 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 19 Nov 2021 22:19:25 +0530 Subject: [PATCH 02/20] Log multi-field aggregation time. 
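Both this commit and the previous one bracket hot sections with the same wall-clock timing pattern. A minimal standalone sketch of that pattern (illustration only, not part of the diff):

    #include <chrono>
    #include <iostream>

    int main() {
        auto begin = std::chrono::high_resolution_clock::now();
        // ... section under measurement ...
        long long time_millis = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::high_resolution_clock::now() - begin).count();
        std::cout << "Time taken: " << time_millis << "ms" << std::endl;
    }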
--- src/index.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/index.cpp b/src/index.cpp index 40580704..3fc4bc1e 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2175,6 +2175,8 @@ void Index::search(std::vector& field_query_tokens, } } + auto begin0 = std::chrono::high_resolution_clock::now(); + for(auto& seq_id_kvs: topster_ids) { const uint64_t seq_id = seq_id_kvs.first; auto& kvs = seq_id_kvs.second; // each `kv` can be from a different field @@ -2358,6 +2360,11 @@ void Index::search(std::vector& field_query_tokens, kvs[0]->scores[kvs[0]->match_score_index] = aggregated_score; topster->add(kvs[0]); } + + auto timeMillis0 = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - begin0).count(); + + LOG(INFO) << "Time taken for multi-field aggregation: " << timeMillis0 << "ms"; } //LOG(INFO) << "topster size: " << topster->size; From 0626ca8cf661f0975526543c24d34e2dded43ebd Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 20 Nov 2021 11:59:46 +0530 Subject: [PATCH 03/20] Speed up search inside multi-field aggregation. --- src/index.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/index.cpp b/src/index.cpp index 3fc4bc1e..418dc98b 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2266,18 +2266,14 @@ void Index::search(std::vector& field_query_tokens, for(size_t token_index=0; token_index < field_query_tokens[i].q_include_tokens.size(); token_index++) { const auto& token = field_query_tokens[i].q_include_tokens[token_index]; + const art_leaf* leaf = (art_leaf *) art_search(search_index.at(field), (const unsigned char*) token.c_str(), + token.length()+1); - std::vector leaves; - const bool prefix_search = field_prefix && (token_index == field_query_tokens[i].q_include_tokens.size()-1); - const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; - art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len, - 0, 0, 1, token_order, prefix_search, nullptr, 0, leaves); - - if(leaves.empty()) { + if(!leaf) { continue; } - if(!posting_t::contains(leaves[0]->values, seq_id)) { + if(!posting_t::contains(leaf->values, seq_id)) { continue; } From ef47f54d111339c9aceb430035aa4844fbee034c Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sun, 21 Nov 2021 12:51:21 +0530 Subject: [PATCH 04/20] Improve fuzzy search filtering perf. 
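The heart of this change is in compact_posting_list_t::contains_atleast_one(): the sorted filter-id array is now probed with std::lower_bound instead of being advanced one element at a time. A simplified standalone sketch of that intersection test (illustration only; the real function also walks the interleaved id/offset layout of the compact posting list):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Both inputs are sorted ascending. Returns true if they share at least one id.
    bool contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size,
                              const std::vector<uint32_t>& posting_ids) {
        size_t target_index = 0;
        for(uint32_t existing_id : posting_ids) {
            size_t found_index = std::lower_bound(target_ids + target_index,
                                                  target_ids + target_ids_size,
                                                  existing_id) - target_ids;
            if(found_index == target_ids_size) {
                // every remaining target id is smaller than existing_id (and all
                // later posting ids are even larger), so no match is possible
                return false;
            }
            if(target_ids[found_index] == existing_id) {
                return true;
            }
            target_index = found_index; // resume the next probe from here
        }
        return false;
    }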
--- src/art.cpp | 10 +++++++++- src/posting.cpp | 24 +++++++++++++++--------- test/posting_list_test.cpp | 6 ++++++ 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/art.cpp b/src/art.cpp index 311e1993..bb373813 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -950,6 +950,8 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r q.push(root); + size_t num_large_lists = 0; + while(!q.empty() && results.size() < max_results*4) { art_node *n = (art_node *) q.top(); q.pop(); @@ -974,6 +976,10 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r results.push_back(l); } else { // we will push leaf only if filter matches with leaf IDs + if(!IS_COMPACT_POSTING(l->values)) { + num_large_lists++; + } + bool found_atleast_one = posting_t::contains_atleast_one(l->values, filter_ids, filter_ids_length); if(found_atleast_one) { results.push_back(l); @@ -1024,7 +1030,9 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r } } - LOG(INFO) << "leaf results.size: " << results.size(); + LOG(INFO) << "leaf results.size: " << results.size() + << ", filter_ids_length: " << filter_ids_length + << ", num_large_lists: " << num_large_lists; printf("OUTSIDE art_topk_iter: results size: %d\n", results.size()); return 0; diff --git a/src/posting.cpp b/src/posting.cpp index 4ec4ab00..14484af0 100644 --- a/src/posting.cpp +++ b/src/posting.cpp @@ -221,17 +221,23 @@ bool compact_posting_list_t::contains_atleast_one(const uint32_t* target_ids, si size_t num_existing_offsets = id_offsets[i]; size_t existing_id = id_offsets[i + num_existing_offsets + 1]; - if(existing_id == target_ids[target_ids_index]) { - return true; + // Returns iterator to the first element that is >= to value or last if no such element is found. + size_t found_index = std::lower_bound(target_ids + target_ids_index, + target_ids + target_ids_size, existing_id) - target_ids; + + if(found_index == target_ids_size) { + // all elements are lesser than lowest value (existing_id), so we can stop looking + return false; + } else { + if(target_ids[found_index] == existing_id) { + return true; + } + + // adjust lower bound to found_index+1 whose value is >= `existing_id` + target_ids_index = found_index; } - if(target_ids[target_ids_index] < existing_id) { - while(target_ids_index < target_ids_size && target_ids[target_ids_index] < existing_id) { - target_ids_index++; - } - } else { - i += num_existing_offsets + 2; - } + i += num_existing_offsets + 2; } return false; diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp index d28db670..09c567ef 100644 --- a/test/posting_list_test.cpp +++ b/test/posting_list_test.cpp @@ -1325,6 +1325,12 @@ TEST_F(PostingListTest, CompactPostingListContainsAtleastOne) { ASSERT_TRUE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids3[0], target_ids3.size())); ASSERT_FALSE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids4[0], target_ids4.size())); + std::vector target_ids5 = {2, 3}; + ASSERT_TRUE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids5[0], target_ids5.size())); + + std::vector target_ids6 = {0, 1, 2}; + ASSERT_FALSE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids6[0], target_ids6.size())); + posting_t::destroy_list(obj); } From 55ad455804efc21ae48622278009c3a04b8fa44e Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 22 Nov 2021 12:29:11 +0530 Subject: [PATCH 05/20] Improve intersected filtering perf. 
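For context, take_id() decides whether a candidate document id survives the exclusion and filter lists, both of which are sorted id arrays. This commit replaces the stateful linear cursors with std::binary_search; a self-contained simplification of the new check (illustration only, raw arrays instead of result_iter_state_t):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    bool take_id(const uint32_t* excluded_ids, size_t excluded_size,
                 const uint32_t* filter_ids, size_t filter_size, uint32_t id) {
        if(excluded_size != 0 &&
           std::binary_search(excluded_ids, excluded_ids + excluded_size, id)) {
            return false; // id is explicitly excluded
        }
        if(filter_size != 0) {
            // id must also be present in the filter results
            return std::binary_search(filter_ids, filter_ids + filter_size, id);
        }
        return true; // no filter in effect
    }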
--- src/posting_list.cpp | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/src/posting_list.cpp b/src/posting_list.cpp index 0fcb771d..823dd028 100644 --- a/src/posting_list.cpp +++ b/src/posting_list.cpp @@ -666,37 +666,18 @@ void posting_list_t::intersect(const std::vector& posting_lists bool posting_list_t::take_id(result_iter_state_t& istate, uint32_t id) { // decide if this result id should be excluded if(istate.excluded_result_ids_size != 0) { - while(istate.excluded_result_ids_index < istate.excluded_result_ids_size && - istate.excluded_result_ids[istate.excluded_result_ids_index] < id) { - istate.excluded_result_ids_index++; - } - - if(istate.excluded_result_ids_index < istate.excluded_result_ids_size && - id == istate.excluded_result_ids[istate.excluded_result_ids_index]) { - istate.excluded_result_ids_index++; + if (std::binary_search(istate.excluded_result_ids, + istate.excluded_result_ids + istate.excluded_result_ids_size, id)) { return false; } } - bool id_found_in_filter = true; - // decide if this result be matched with filter results if(istate.filter_ids_length != 0) { - id_found_in_filter = false; - - // e.g. [1, 3] vs [2, 3] - - while(istate.filter_ids_index < istate.filter_ids_length && istate.filter_ids[istate.filter_ids_index] < id) { - istate.filter_ids_index++; - } - - if(istate.filter_ids_index < istate.filter_ids_length && istate.filter_ids[istate.filter_ids_index] == id) { - istate.filter_ids_index++; - id_found_in_filter = true; - } + return std::binary_search(istate.filter_ids, istate.filter_ids + istate.filter_ids_length, id); } - return id_found_in_filter; + return true; } bool posting_list_t::get_offsets(const std::vector& its, From 97b1981fcdda9540a6f7e017843774537c0fb9f1 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 22 Nov 2021 15:18:37 +0530 Subject: [PATCH 06/20] Buffer IDs during exhaustive search. 
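During exhaustive search many small batches of candidate ids arrive, and OR-merging every batch into the growing result set individually is wasteful. The pattern introduced here collects ids into a buffer and folds them in with a single union once the buffer grows large. A sketch of that flush step (illustration only; the actual code merges raw arrays via ArrayUtils::or_scalar, shown here with std::set_union instead):

    #include <algorithm>
    #include <cstdint>
    #include <iterator>
    #include <vector>

    // all_result_ids is kept sorted and unique; id_buff holds unsorted candidates.
    void flush_id_buffer(std::vector<uint32_t>& id_buff, std::vector<uint32_t>& all_result_ids) {
        std::sort(id_buff.begin(), id_buff.end());
        id_buff.erase(std::unique(id_buff.begin(), id_buff.end()), id_buff.end());

        std::vector<uint32_t> merged;
        merged.reserve(all_result_ids.size() + id_buff.size());
        std::set_union(all_result_ids.begin(), all_result_ids.end(),
                       id_buff.begin(), id_buff.end(), std::back_inserter(merged));
        all_result_ids.swap(merged);
        id_buff.clear();
    }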
--- include/index.h | 3 ++- include/posting.h | 8 ++++--- src/index.cpp | 46 ++++++++++++++++++++++++++++++++++------ test/collection_test.cpp | 2 +- 4 files changed, 47 insertions(+), 12 deletions(-) diff --git a/include/index.h b/include/index.h index 9c8d0fc4..1309abea 100644 --- a/include/index.h +++ b/include/index.h @@ -510,7 +510,8 @@ private: bool prioritize_exact_match, bool exhaustive_search, size_t concurrency, - std::set& query_hashes) const; + std::set& query_hashes, + std::vector& id_buff) const; void do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, const std::vector& filters, const bool enable_short_circuit) const; diff --git a/include/posting.h b/include/posting.h index c5c3ef66..e8ad4be3 100644 --- a/include/posting.h +++ b/include/posting.h @@ -65,9 +65,11 @@ public: to_expanded_plists(raw_posting_lists, plists, expanded_plists); - std::sort(this->plists.begin(), this->plists.end(), [](posting_list_t* a, posting_list_t* b) { - return a->num_blocks() < b->num_blocks(); - }); + if(plists.size() > 1) { + std::sort(this->plists.begin(), this->plists.end(), [](posting_list_t* a, posting_list_t* b) { + return a->num_blocks() < b->num_blocks(); + }); + } } ~block_intersector_t() { diff --git a/src/index.cpp b/src/index.cpp index 418dc98b..1a677731 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1075,7 +1075,8 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, bool prioritize_exact_match, const bool exhaustive_search, const size_t concurrency, - std::set& query_hashes) const { + std::set& query_hashes, + std::vector& id_buff) const { auto product = []( long long a, token_candidates & b ) { return a*b.candidates.size(); }; long long int N = std::accumulate(token_candidates_vec.begin(), token_candidates_vec.end(), 1LL, product); @@ -1180,11 +1181,15 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, for(size_t i = 0; i < concurrency; i++) { // empty vec can happen if not all threads produce results if (!result_id_vecs[i].empty()) { - uint32_t* new_all_result_ids = nullptr; - all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &result_id_vecs[i][0], - result_id_vecs[i].size(), &new_all_result_ids); - delete[] *all_result_ids; - *all_result_ids = new_all_result_ids; + if(exhaustive_search) { + id_buff.insert(id_buff.end(), result_id_vecs[i].begin(), result_id_vecs[i].end()); + } else { + uint32_t* new_all_result_ids = nullptr; + all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &result_id_vecs[i][0], + result_id_vecs[i].size(), &new_all_result_ids); + delete[] *all_result_ids; + *all_result_ids = new_all_result_ids; + } num_result_ids += result_id_vecs[i].size(); @@ -1200,6 +1205,20 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, } } + if(id_buff.size() > 100000) { + // prevents too many ORs during exhaustive searching + std::sort(id_buff.begin(), id_buff.end()); + id_buff.erase(std::unique( id_buff.begin(), id_buff.end() ), id_buff.end()); + + uint32_t* new_all_result_ids = nullptr; + all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &id_buff[0], + id_buff.size(), &new_all_result_ids); + delete[] *all_result_ids; + *all_result_ids = new_all_result_ids; + num_result_ids += id_buff.size(); + id_buff.clear(); + } + if(num_result_ids == 0) { continue; } @@ -2940,13 +2959,26 @@ void Index::search_field(const uint8_t & field_id, } if(!token_candidates_vec.empty()) { + std::vector id_buff; + // 
If atleast one token is found, go ahead and search for candidates search_candidates(field_id, the_field.is_array(), filter_ids, filter_ids_length, exclude_token_ids, exclude_token_ids_size, curated_ids, sort_fields, token_candidates_vec, searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, field_num_results, typo_tokens_threshold, group_limit, group_by_fields, query_tokens, - prioritize_exact_match, combination_limit, concurrency, query_hashes); + prioritize_exact_match, combination_limit, concurrency, query_hashes, id_buff); + + if(id_buff.size() > 1) { + std::sort(id_buff.begin(), id_buff.end()); + id_buff.erase(std::unique( id_buff.begin(), id_buff.end() ), id_buff.end()); + } + + uint32_t* new_all_result_ids = nullptr; + all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &id_buff[0], + id_buff.size(), &new_all_result_ids); + delete[] *all_result_ids; + *all_result_ids = new_all_result_ids; } resume_typo_loop: diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 63ceba4f..09229ed0 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -1939,7 +1939,7 @@ TEST_F(CollectionTest, DeletionOfDocumentArrayFields) { token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10).get(); - ASSERT_EQ(1, res["found"]); + ASSERT_EQ(1, res["found"].get()); Option rem_op = coll1->remove("100"); From 72b40252325ed50542bcf2a880c80d93fe33a479 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 23 Nov 2021 08:56:29 +0530 Subject: [PATCH 07/20] Remove stray logs. --- src/art.cpp | 4 ++-- src/index.cpp | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/art.cpp b/src/art.cpp index bb373813..36ff086d 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -1030,9 +1030,9 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r } } - LOG(INFO) << "leaf results.size: " << results.size() + /*LOG(INFO) << "leaf results.size: " << results.size() << ", filter_ids_length: " << filter_ids_length - << ", num_large_lists: " << num_large_lists; + << ", num_large_lists: " << num_large_lists;*/ printf("OUTSIDE art_topk_iter: results size: %d\n", results.size()); return 0; diff --git a/src/index.cpp b/src/index.cpp index 1a677731..03faac11 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1231,7 +1231,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, const std::vector& filters, const bool enable_short_circuit) const { - auto begin = std::chrono::high_resolution_clock::now(); + //auto begin = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < filters.size(); i++) { const filter & a_filter = filters[i]; @@ -1571,10 +1571,10 @@ void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, } } - long long int timeMillis = + /*long long int timeMillis = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); - LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms"; + LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms";*/ } @@ -2194,7 +2194,7 @@ void Index::search(std::vector& field_query_tokens, } } - auto begin0 = std::chrono::high_resolution_clock::now(); + //auto begin0 = std::chrono::high_resolution_clock::now(); for(auto& seq_id_kvs: topster_ids) { const uint64_t seq_id = seq_id_kvs.first; @@ -2376,10 +2376,10 @@ void Index::search(std::vector& 
field_query_tokens, topster->add(kvs[0]); } - auto timeMillis0 = std::chrono::duration_cast( + /*auto timeMillis0 = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin0).count(); - LOG(INFO) << "Time taken for multi-field aggregation: " << timeMillis0 << "ms"; + LOG(INFO) << "Time taken for multi-field aggregation: " << timeMillis0 << "ms";*/ } //LOG(INFO) << "topster size: " << topster->size; @@ -2903,17 +2903,17 @@ void Index::search_field(const uint8_t & field_id, // prefix should apply only for last token const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; - auto begin = std::chrono::high_resolution_clock::now(); + //auto begin = std::chrono::high_resolution_clock::now(); // need less candidates for filtered searches since we already only pick tokens with results art_fuzzy_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token_len, costs[token_index], costs[token_index], num_fuzzy_candidates, token_order, prefix_search, filter_ids, filter_ids_length, leaves, unique_tokens); - auto timeMillis = std::chrono::duration_cast( + /*auto timeMillis = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin).count(); - LOG(INFO) << "Time taken for fuzzy search: " << timeMillis << "ms"; + LOG(INFO) << "Time taken for fuzzy search: " << timeMillis << "ms";*/ if(!leaves.empty()) { token_cost_cache.emplace(token_cost_hash, leaves); From f45416be0c20eb075712cfa4ccb946d9a7ff58f8 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 23 Nov 2021 18:12:38 +0530 Subject: [PATCH 08/20] Only last token should prefix search during drop tokens. --- include/index.h | 5 ++++ src/index.cpp | 24 ++++++++++------- test/collection_specific_test.cpp | 44 ++++++++++++++++++++++++++++--- 3 files changed, 59 insertions(+), 14 deletions(-) diff --git a/include/index.h b/include/index.h index 1309abea..effe2bc8 100644 --- a/include/index.h +++ b/include/index.h @@ -29,6 +29,11 @@ using array_mapped_facet_t = std::array; struct token_t { size_t position; std::string value; + bool prefix; + + token_t(size_t position, const std::string& value, bool prefix): position(position), value(value), prefix(prefix) { + + } }; struct token_candidates { diff --git a/src/index.cpp b/src/index.cpp index 03faac11..19885cba 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1923,7 +1923,8 @@ bool Index::check_for_overrides(const token_ordering& token_order, const string& std::vector window_tokens; std::set window_tokens_set; for (size_t i = start_index; i < start_index + window_len; i++) { - window_tokens.push_back({i, tokens[i]}); + bool is_prefix = (i == (start_index + window_len - 1)); + window_tokens.emplace_back(i, tokens[i], is_prefix); window_tokens_set.emplace(tokens[i]); } @@ -2100,14 +2101,16 @@ void Index::search(std::vector& field_query_tokens, for(size_t i = 0; i < num_search_fields; i++) { std::vector q_include_pos_tokens; for(size_t j=0; j < field_query_tokens[i].q_include_tokens.size(); j++) { - q_include_pos_tokens.push_back({j, field_query_tokens[i].q_include_tokens[j]}); + bool is_prefix = (j == field_query_tokens[i].q_include_tokens.size()-1); + q_include_pos_tokens.emplace_back(j, field_query_tokens[i].q_include_tokens[j], is_prefix); } std::vector> q_pos_synonyms; for(const auto& q_syn_vec: field_query_tokens[i].q_synonyms) { std::vector q_pos_syn; for(size_t j=0; j < q_syn_vec.size(); j++) { - q_pos_syn.push_back({j, q_syn_vec[j]}); + bool is_prefix = (j == q_syn_vec.size()-1); + 
q_pos_syn.emplace_back(j, q_syn_vec[j], is_prefix); } q_pos_synonyms.emplace_back(q_pos_syn); } @@ -2543,8 +2546,9 @@ void Index::compute_facet_infos(const std::vector& facets, facet_query_t& std::vector search_tokens, qtokens; for (size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) { - search_tokens.emplace_back(token_t{qtoken_index, query_tokens[qtoken_index]}); - qtokens.emplace_back(token_t{qtoken_index, query_tokens[qtoken_index]}); + bool is_prefix = (qtoken_index == query_tokens.size()-1); + search_tokens.emplace_back(qtoken_index, query_tokens[qtoken_index], is_prefix); + qtokens.emplace_back(qtoken_index, query_tokens[qtoken_index], is_prefix); } std::vector> searched_queries; @@ -2893,14 +2897,14 @@ void Index::search_field(const uint8_t & field_id, const std::string token_cost_hash = token + std::to_string(costs[token_index]); std::vector leaves; - //LOG(INFO) << "Searching for field: " << field << ", token:" << token << " - cost: " << costs[token_index]; + const bool prefix_search = prefix && search_tokens[token_index].prefix; - const bool prefix_search = prefix && (token_index == search_tokens.size()-1); + /*LOG(INFO) << "Searching for field: " << the_field.name << ", token:" + << token << " - cost: " << costs[token_index] << ", prefix_search: " << prefix_search;*/ if(token_cost_cache.count(token_cost_hash) != 0) { leaves = token_cost_cache[token_cost_hash]; } else { - // prefix should apply only for last token const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; //auto begin = std::chrono::high_resolution_clock::now(); @@ -3008,13 +3012,13 @@ void Index::search_field(const uint8_t & field_id, // drop from right size_t end_index = (query_tokens.size() - 1) - num_tokens_dropped; for(size_t i=0; i <= end_index; i++) { - truncated_tokens.push_back({query_tokens[i].position, query_tokens[i].value}); + truncated_tokens.emplace_back(query_tokens[i].position, query_tokens[i].value, query_tokens[i].prefix); } } else { // drop from left size_t start_index = (num_tokens_dropped - mid_index); for(size_t i=start_index; i tags; - // when the first document containing a token already cannot fit compact posting list std::vector fields = {field("title", field_types::STRING, false),}; @@ -1727,8 +1725,46 @@ TEST_F(CollectionSpecificTest, HighlightOnPrefixRegression) { auto results = coll1->search("and", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); - LOG(INFO) << results; - collectionManager.drop_collection("coll1"); } +TEST_F(CollectionSpecificTest, DroppedTokensShouldNotBeUsedForPrefixSearch) { + std::vector fields = {field("title", field_types::STRING, false), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["title"] = "Dog Shoemaker"; + doc1["points"] = 100; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["title"] = "Shoe and Sock"; + doc2["points"] = 200; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + + auto results = coll1->search("shoe cat", {"title"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("1", results["hits"][0]["document"]["id"].get()); + + results = coll1->search("cat shoe", 
{"title"}, + "", {}, {}, {2}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(2, results["hits"].size()); + ASSERT_EQ("1", results["hits"][0]["document"]["id"].get()); + ASSERT_EQ("0", results["hits"][1]["document"]["id"].get()); +} From d67d0d6a65d7c96cc7587f5398d1762761bb9952 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 23 Nov 2021 18:16:08 +0530 Subject: [PATCH 09/20] [RPM] Ensure typesense service is stopped before starting. Work around for a systemd bug: https://gitlab.com/gitlab-org/ci-cd/docker-machine/-/issues/41#note_688450126 --- debian-pkg/typesense-server/DEBIAN/postinst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/debian-pkg/typesense-server/DEBIAN/postinst b/debian-pkg/typesense-server/DEBIAN/postinst index 52c70f1d..a71c14c6 100755 --- a/debian-pkg/typesense-server/DEBIAN/postinst +++ b/debian-pkg/typesense-server/DEBIAN/postinst @@ -4,6 +4,7 @@ API_KEY=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 48 | head -n 1) sed -i "s/\$API_KEY/$API_KEY/g" /etc/typesense/typesense-server.ini +systemctl stop typesense-server systemctl enable typesense-server systemctl daemon-reload -systemctl start typesense-server \ No newline at end of file +systemctl start typesense-server From 69345d37ced62d51cc9bbc9623897bad7600fac5 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 24 Nov 2021 12:02:08 +0530 Subject: [PATCH 10/20] Revert "[RPM] Ensure typesense service is stopped before starting." This reverts commit 514824b76b3221998f2d35816a7878ad9c9ffd66. --- debian-pkg/typesense-server/DEBIAN/postinst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/debian-pkg/typesense-server/DEBIAN/postinst b/debian-pkg/typesense-server/DEBIAN/postinst index a71c14c6..52c70f1d 100755 --- a/debian-pkg/typesense-server/DEBIAN/postinst +++ b/debian-pkg/typesense-server/DEBIAN/postinst @@ -4,7 +4,6 @@ API_KEY=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 48 | head -n 1) sed -i "s/\$API_KEY/$API_KEY/g" /etc/typesense/typesense-server.ini -systemctl stop typesense-server systemctl enable typesense-server systemctl daemon-reload -systemctl start typesense-server +systemctl start typesense-server \ No newline at end of file From a21628207ea23be2cee3780bf3b111b371673d35 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Wed, 24 Nov 2021 20:20:21 +0530 Subject: [PATCH 11/20] Refactor grouping with sorting. 
--- include/topster.h | 73 ++++---------------- src/collection.cpp | 23 +++++-- test/collection_grouping_test.cpp | 108 +++++++++++++++++++++++++++++- 3 files changed, 137 insertions(+), 67 deletions(-) diff --git a/include/topster.h b/include/topster.h index f86a7b4b..4accd374 100644 --- a/include/topster.h +++ b/include/topster.h @@ -105,10 +105,9 @@ struct Topster { KV *data; KV** kvs; - // For distinct, stores the min heap kv of each group_kv_map topster value std::unordered_map kv_map; - std::unordered_map group_kv_map; + spp::sparse_hash_map group_kv_map; size_t distinct; explicit Topster(size_t capacity): Topster(capacity, 0) { @@ -172,64 +171,17 @@ struct Topster { bool SIFT_DOWN = true; if(distinct) { - const auto& found_it = group_kv_map.find(kv->distinct_key); - bool is_duplicate_key = (found_it != group_kv_map.end()); - - if(!is_duplicate_key && less_than_min_heap) { - // for distinct, if a non duplicate kv is < than min heap we ignore - return false; - } - - if(is_duplicate_key) { - // if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift - Topster* group_topster = found_it->second; - KV old_min_heap_kv = *kv_map[kv->distinct_key]; - bool added = group_topster->add(kv); - - if(!added) { - return false; - } - - // if new kv score is greater than previous min heap score we sift down, otherwise sift up - SIFT_DOWN = is_greater(kv, &old_min_heap_kv); - - // new kv is different from old_min_heap_kv so we have to sift heap - heap_op_index = old_min_heap_kv.array_index; - - // erase current min heap key from kv_map - kv_map.erase(old_min_heap_kv.distinct_key); - + // Grouping cannot be a streaming operation, so aggregate the KVs associated with every group. + auto kvs_it = group_kv_map.find(kv->distinct_key); + if(kvs_it != group_kv_map.end()) { + kvs_it->second->add(kv); } else { - // kv is guaranteed to be > current min heap: kvs[0] - // create fresh topster for this distinct group key since it does not exist - Topster* group_topster = new Topster(distinct, 0); - group_topster->add(kv); - - // add new group key to map - group_kv_map.emplace(kv->distinct_key, group_topster); - - // find heap operation index for updating kvs - - if(size < MAX_SIZE) { - // there is enough space in heap we just copy to end - SIFT_DOWN = false; - heap_op_index = size; - size++; - } else { - SIFT_DOWN = true; - - // max size is reached so we are forced to replace current min heap element (kvs[0]) - heap_op_index = 0; - - // remove current min heap group key from maps - delete group_kv_map[kvs[heap_op_index]->distinct_key]; - group_kv_map.erase(kvs[heap_op_index]->distinct_key); - kv_map.erase(kvs[heap_op_index]->distinct_key); - } + Topster* g_topster = new Topster(distinct, 0); + g_topster->add(kv); + group_kv_map.insert({kv->distinct_key, g_topster}); } - - // kv will be copied into the pointer at heap_op_index - kv_map.emplace(kv->distinct_key, kvs[heap_op_index]); + + return true; } else { // not distinct //LOG(INFO) << "Searching for key: " << kv->key; @@ -334,9 +286,8 @@ struct Topster { // topster must be sorted before iterated upon to remove dead array entries void sort() { - std::stable_sort(kvs, kvs + size, is_greater); - for(auto &group_topster: group_kv_map) { - group_topster.second->sort(); + if(!distinct) { + std::stable_sort(kvs, kvs + size, is_greater); } } diff --git a/src/collection.cpp b/src/collection.cpp index f875bb68..f690e5f3 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1384,12 +1384,27 @@ void Collection::parse_search_query(const std::string 
&query, std::vector> &result_kvs) { if(topster->distinct) { - for(auto &group_topster_entry: topster->group_kv_map) { - Topster* group_topster = group_topster_entry.second; - const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); - result_kvs.emplace_back(group_kvs); + // we have to pick top-K groups + Topster gtopster(topster->MAX_SIZE); + + for(auto& group_topster: topster->group_kv_map) { + group_topster.second->sort(); + if(group_topster.second->size != 0) { + KV* kv_head = group_topster.second->getKV(0); + gtopster.add(kv_head); + } } + gtopster.sort(); + + for(size_t i = 0; i < gtopster.size; i++) { + KV* kv = gtopster.getKV(i); + const std::vector group_kvs( + topster->group_kv_map[kv->distinct_key]->kvs, + topster->group_kv_map[kv->distinct_key]->kvs+topster->group_kv_map[kv->distinct_key]->size + ); + result_kvs.emplace_back(group_kvs); + } } else { for(uint32_t t = 0; t < topster->size; t++) { KV* kv = topster->getKV(t); diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp index 09111c88..c5321b9e 100644 --- a/test/collection_grouping_test.cpp +++ b/test/collection_grouping_test.cpp @@ -154,8 +154,8 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) { ASSERT_EQ(10, res["found"].get()); ASSERT_EQ(10, res["grouped_hits"].size()); - ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get()); + ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get()); ASSERT_STREQ("Beta", res["grouped_hits"][0]["group_key"][1].get().c_str()); // optional field should have no value in the group key component @@ -428,4 +428,108 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) { ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); -} \ No newline at end of file +} + +TEST_F(CollectionGroupingTest, GroupOrderIndependence) { + Collection *coll1; + + std::vector fields = {field("group", field_types::STRING, true), + field("points", field_types::INT32, false),}; + + coll1 = collectionManager.get_collection("coll1").get(); + if(coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + } + + nlohmann::json doc; + + for(size_t i = 0; i < 256; i++) { + int64_t points = 100 + i; + doc["id"] = std::to_string(i); + doc["group"] = std::to_string(i); + doc["points"] = points; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + } + + // doc id "255" will have points of 255 + // try to insert doc id "256" with group "256" but having lesser points than all records + + doc["id"] = "256"; + doc["group"] = "256"; + doc["points"] = 50; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + // insert doc id "257" of same group "256" with greatest point + + doc["id"] = "257"; + doc["group"] = "256"; + doc["points"] = 500; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + // when we search by grouping records, sorting descending on points, both records of group "256" should show up + + std::vector sort_fields = {sort_by("points", "DESC")}; + + auto res = coll1->search("*", {}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, + {}, {}, {"group"}, 10).get(); + + ASSERT_EQ(1, res["grouped_hits"][0]["group_key"].size()); + ASSERT_STREQ("256", res["grouped_hits"][0]["group_key"][0].get().c_str()); + ASSERT_EQ(2, res["grouped_hits"][0]["hits"].size()); +} + +TEST_F(CollectionGroupingTest, 
UseHighestValueInGroupForOrdering) { + Collection *coll1; + + std::vector fields = {field("group", field_types::STRING, true), + field("points", field_types::INT32, false),}; + + coll1 = collectionManager.get_collection("coll1").get(); + if(coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + } + + nlohmann::json doc; + + for(size_t i = 0; i < 250; i++) { + int64_t points = 100 + i; + doc["id"] = std::to_string(i); + doc["group"] = std::to_string(i); + doc["points"] = points; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + } + + // points: 100 -> 349 + + // group with highest point is "249" with 349 points + // insert another document for that group with 50 points + doc["id"] = "250"; + doc["group"] = "249"; + doc["points"] = 50; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + // now insert another new group whose points is greater than 50 + doc["id"] = "251"; + doc["group"] = "1000"; + doc["points"] = 60; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + std::vector sort_fields = {sort_by("points", "DESC")}; + + auto res = coll1->search("*", {}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, + {false}, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 5, + "", 10, + {}, {}, {"group"}, 10).get(); + + ASSERT_EQ(1, res["grouped_hits"][0]["group_key"].size()); + ASSERT_STREQ("249", res["grouped_hits"][0]["group_key"][0].get().c_str()); + ASSERT_EQ(2, res["grouped_hits"][0]["hits"].size()); +} From b088bad83bdf8b77da2ba5ca78f4cc7ecdeff1c0 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Thu, 25 Nov 2021 19:05:41 +0530 Subject: [PATCH 12/20] Use gzip compression for DEB. --- debian-pkg/generate_deb_rpm.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/debian-pkg/generate_deb_rpm.sh b/debian-pkg/generate_deb_rpm.sh index 42a7b191..f5c67231 100755 --- a/debian-pkg/generate_deb_rpm.sh +++ b/debian-pkg/generate_deb_rpm.sh @@ -35,7 +35,8 @@ rm -rf /tmp/typesense-server-$TS_VERSION /tmp/typesense-server-$TS_VERSION.tar.g sed -i "s/\$VERSION/$TS_VERSION/g" `find /tmp/typesense-deb-build -maxdepth 10 -type f` -dpkg -b /tmp/typesense-deb-build/typesense-server "/tmp/typesense-deb-build/typesense-server-${TS_VERSION}-amd64.deb" +dpkg-deb -Zgzip -z6 \ + -b /tmp/typesense-deb-build/typesense-server "/tmp/typesense-deb-build/typesense-server-${TS_VERSION}-amd64.deb" # Generate RPM From 8ec653c3e6ee8f0342bfe8c4bd7ce47eaad5ba8e Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 26 Nov 2021 15:11:23 +0530 Subject: [PATCH 13/20] Check for connection status before responding. 
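A generic sketch of the guard this commit adds around streamed responses (illustration only; the struct and callback names here are hypothetical, not the actual http_server types): a chunk is dispatched only while the client connection is still alive, and after the first chunk each dispatch waits for the previous one to be flushed.

    #include <atomic>
    #include <functional>

    struct streamed_response_t {
        std::atomic<bool> is_alive{true};
        bool first_chunk_sent = false;
        void wait_for_previous_chunk() { /* block until the prior chunk is flushed */ }
    };

    void stream_chunk(streamed_response_t& res, const std::function<void()>& dispatch) {
        if(!res.is_alive) {
            return; // client went away (or this is a raft log replay): don't respond
        }
        if(res.first_chunk_sent) {
            res.wait_for_previous_chunk(); // keep chunks strictly ordered
        }
        res.first_chunk_sent = true;
        dispatch();
    }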
--- src/batched_indexer.cpp | 8 +++++--- src/core_api.cpp | 6 ++++++ src/http_server.cpp | 13 ++++++++----- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/batched_indexer.cpp b/src/batched_indexer.cpp index 2fdb760c..e149487e 100644 --- a/src/batched_indexer.cpp +++ b/src/batched_indexer.cpp @@ -255,9 +255,11 @@ void BatchedIndexer::run() { const std::string& req_key_prefix = get_req_prefix_key(it->second.start_ts); store->delete_range(req_key_prefix, req_key_prefix + StringUtils::serialize_uint32_t(UINT32_MAX)); - it->second.res->final = true; - async_req_res_t* async_req_res = new async_req_res_t(it->second.req, it->second.res, true); - server->get_message_dispatcher()->send_message(HttpServer::STREAM_RESPONSE_MESSAGE, async_req_res); + if(it->second.res->is_alive) { + it->second.res->final = true; + async_req_res_t* async_req_res = new async_req_res_t(it->second.req, it->second.res, true); + server->get_message_dispatcher()->send_message(HttpServer::STREAM_RESPONSE_MESSAGE, async_req_res); + } it = req_res_map.erase(it); } else { diff --git a/src/core_api.cpp b/src/core_api.cpp index cdf67adf..ada733fc 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -34,9 +34,15 @@ bool handle_authentication(std::map& req_params, const void stream_response(const std::shared_ptr& req, const std::shared_ptr& res) { if(!res->is_alive) { + // underlying request is dead or this is a raft log playback return ; } + if(req->_req->res.status != 0) { + // not the first response chunk, so wait for previous chunk to finish + res->wait(); + } + auto req_res = new async_req_res_t(req, res, true); server->get_message_dispatcher()->send_message(HttpServer::STREAM_RESPONSE_MESSAGE, req_res); } diff --git a/src/http_server.cpp b/src/http_server.cpp index a62cd477..1a9d4a5d 100644 --- a/src/http_server.cpp +++ b/src/http_server.cpp @@ -767,6 +767,7 @@ void HttpServer::stream_response(stream_response_state_t& state) { if(state.res_body.len == 0 && state.send_state != H2O_SEND_STATE_FINAL) { // without this guard, http streaming will break + state.generator->proceed(state.generator, req); return; } @@ -908,13 +909,15 @@ bool HttpServer::on_request_proceed_message(void *data) { // This callback will run concurrently to batch indexer's run() so care must be taken to protect access // to variables that are written to by the batch indexer, which for now is only: last_chunk_aggregate (atomic) deferred_req_res_t* req_res = static_cast(data); - auto stream_state = (req_res->req->last_chunk_aggregate) ? H2O_SEND_STATE_FINAL : H2O_SEND_STATE_IN_PROGRESS; + if(req_res->res->is_alive) { + auto stream_state = (req_res->req->last_chunk_aggregate) ? H2O_SEND_STATE_FINAL : H2O_SEND_STATE_IN_PROGRESS; - size_t written = req_res->req->chunk_len; - req_res->req->chunk_len = 0; + size_t written = req_res->req->chunk_len; + req_res->req->chunk_len = 0; - if(req_res->req->_req && req_res->req->_req->proceed_req) { - req_res->req->_req->proceed_req(req_res->req->_req, written, stream_state); + if(req_res->req->_req && req_res->req->_req->proceed_req) { + req_res->req->_req->proceed_req(req_res->req->_req, written, stream_state); + } } if(req_res->destroy_after_use) { From b14458efd4874fe9e6599da6ae24876a58ec37e5 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 26 Nov 2021 19:35:50 +0530 Subject: [PATCH 14/20] Handle updates of repeating records within the same batch. 
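The rule added in add_many(): when a document id repeats within one import batch, everything queued before it is indexed first and the repeated document starts a new sub-batch, so the later occurrence sees the earlier one as a normal update instead of being rejected. A simplified standalone sketch of that splitting rule (illustration only; index_range is a hypothetical callback standing in for batch_index):

    #include <cstddef>
    #include <functional>
    #include <set>
    #include <string>
    #include <vector>

    void import_in_sub_batches(const std::vector<std::string>& doc_ids,
                               const std::function<void(size_t, size_t)>& index_range) {
        std::set<std::string> batch_doc_ids;
        size_t batch_start = 0;

        for(size_t i = 0; i < doc_ids.size(); i++) {
            if(batch_doc_ids.count(doc_ids[i]) != 0) {
                index_range(batch_start, i); // flush docs [batch_start, i)
                batch_doc_ids.clear();
                batch_start = i;
            }
            batch_doc_ids.insert(doc_ids[i]);
        }

        if(batch_start < doc_ids.size()) {
            index_range(batch_start, doc_ids.size()); // flush the tail
        }
    }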
--- src/collection.cpp | 40 +++---- test/collection_specific_test.cpp | 182 +++++++++++++++++++++++++++++- 2 files changed, 192 insertions(+), 30 deletions(-) diff --git a/src/collection.cpp b/src/collection.cpp index f690e5f3..627454a8 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -223,22 +223,28 @@ nlohmann::json Collection::add_many(std::vector& json_lines, nlohma // NOTE: we overwrite the input json_lines with result to avoid memory pressure record.is_update = false; + bool repeated_doc = false; if(!doc_seq_id_op.ok()) { record.index_failure(doc_seq_id_op.code(), doc_seq_id_op.error()); } else { + const std::string& doc_id = record.doc["id"].get(); + repeated_doc = (batch_doc_ids.find(doc_id) != batch_doc_ids.end()); + + if(repeated_doc) { + // when a document repeats, we send the batch until this document so that we can deal with conflicts + i--; + goto do_batched_index; + } + record.is_update = !doc_seq_id_op.get().is_new; + if(record.is_update) { get_document_from_store(get_seq_id_key(seq_id), record.old_doc); - } else { - const std::string& doc_id = record.doc["id"].get(); - if(batch_doc_ids.find(doc_id) != batch_doc_ids.end()) { - record.index_failure(400, "Document with `id` " + doc_id + " already exists in the batch."); - } else { - batch_doc_ids.emplace(doc_id); - } } + batch_doc_ids.insert(doc_id); + // if `fallback_field_type` or `dynamic_fields` is enabled, update schema first before indexing if(!fallback_field_type.empty() || !dynamic_fields.empty()) { Option schema_change_op = check_and_update_schema(record.doc, dirty_values); @@ -248,25 +254,11 @@ nlohmann::json Collection::add_many(std::vector& json_lines, nlohma } } - /* - // check for memory threshold before allowing subsequent batches - if(is_exceeding_memory_threshold()) { - exceeds_memory_limit = true; - } - - if(exceeds_memory_limit) { - nlohmann::json index_res; - index_res["error"] = "Max memory ratio exceeded."; - index_res["success"] = false; - index_res["document"] = json_line; - json_lines[i] = index_res.dump(); - record.index_failure(500, "Max memory ratio exceeded."); - } - */ - index_records.emplace_back(std::move(record)); - if((i+1) % index_batch_size == 0 || i == json_lines.size()-1) { + do_batched_index: + + if((i+1) % index_batch_size == 0 || i == json_lines.size()-1 || repeated_doc) { batch_index(index_records, json_lines, num_indexed); // to return the document for the single doc add cases diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index c148b659..3c11518f 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -1452,7 +1452,7 @@ TEST_F(CollectionSpecificTest, ImportDocumentWithRepeatingIDInTheSameBatch) { ASSERT_TRUE(nlohmann::json::parse(import_records[0])["success"].get()); ASSERT_FALSE(nlohmann::json::parse(import_records[1])["success"].get()); - ASSERT_EQ("Document with `id` 0 already exists in the batch.", + ASSERT_EQ("A document with id 0 already exists.", nlohmann::json::parse(import_records[1])["error"].get()); auto results = coll1->search("levis", {"name"}, @@ -1485,7 +1485,7 @@ TEST_F(CollectionSpecificTest, ImportDocumentWithRepeatingIDInTheSameBatch) { ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(2, import_response["num_imported"].get()); - // repeated ID is rejected even if the first ID is not indexed due to some error + // repeated ID is NOT rejected if the first ID is not indexed due to some error import_records.clear(); doc1.erase("name"); doc1["id"] = "100"; @@ -1496,17 +1496,187 @@ 
TEST_F(CollectionSpecificTest, ImportDocumentWithRepeatingIDInTheSameBatch) { import_response = coll1->add_many(import_records, document); + LOG(INFO) << import_response; + ASSERT_FALSE(import_response["success"].get()); - ASSERT_EQ(0, import_response["num_imported"].get()); + ASSERT_EQ(1, import_response["num_imported"].get()); ASSERT_FALSE(nlohmann::json::parse(import_records[0])["success"].get()); - ASSERT_FALSE(nlohmann::json::parse(import_records[1])["success"].get()); - ASSERT_EQ("Document with `id` 100 already exists in the batch.", - nlohmann::json::parse(import_records[1])["error"].get()); + ASSERT_EQ("Field `name` has been declared in the schema, but is not found in the document.", + nlohmann::json::parse(import_records[0])["error"].get()); + + ASSERT_TRUE(nlohmann::json::parse(import_records[1])["success"].get()); collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionSpecificTest, UpdateOfTwoDocsWithSameIdWithinSameBatch) { + std::vector fields = {field("last_chance", field_types::BOOL, false, true), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + + // second update should reflect the result of first update + std::vector updates = { + R"({"id": "0", "last_chance": false})", + R"({"id": "0", "points": 200})", + }; + + nlohmann::json update_doc; + auto import_response = coll1->add_many(updates, update_doc, UPDATE); + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(2, import_response["num_imported"].get()); + + auto results = coll1->search("*", {}, + "", {}, {}, {0}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + LOG(INFO) << results; + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + ASSERT_TRUE(results["hits"][0]["document"].contains("last_chance")); + ASSERT_EQ(false, results["hits"][0]["document"]["last_chance"].get()); + + ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); + ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); +} + +TEST_F(CollectionSpecificTest, UpsertOfTwoDocsWithSameIdWithinSameBatch) { + std::vector fields = {field("last_chance", field_types::BOOL, false, true), + field("points", field_types::INT32, false, true),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + + // first upsert removes both fields, so second upsert should only insert "points" + std::vector upserts = { + R"({"id": "0", "last_chance": true})", + R"({"id": "0", "points": 200})", + }; + + nlohmann::json update_doc; + auto import_response = coll1->add_many(upserts, update_doc, UPSERT); + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(2, import_response["num_imported"].get()); + + auto results = coll1->search("*", {}, + "", {}, {}, {0}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + ASSERT_TRUE(results["hits"][0]["document"].contains("points")); + 
ASSERT_FALSE(results["hits"][0]["document"].contains("last_chance")); + ASSERT_EQ(200, results["hits"][0]["document"]["points"].get()); + + ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); + ASSERT_EQ(0, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); + + // update without doc id + + upserts = { + R"({"last_chance": true})", + }; + + import_response = coll1->add_many(upserts, update_doc, UPDATE); + ASSERT_FALSE(import_response["success"].get()); + ASSERT_EQ(0, import_response["num_imported"].get()); +} + +TEST_F(CollectionSpecificTest, UpdateUpsertOfDocWithMissingFields) { + std::vector fields = {field("last_chance", field_types::BOOL, false, true), + field("points", field_types::INT32, false, true),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["last_chance"] = true; + doc1["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + + // upsert doc with missing fields: should be removed from index + std::vector upserts = { + R"({"id": "0"})" + }; + + nlohmann::json update_doc; + auto import_response = coll1->add_many(upserts, update_doc, UPSERT); + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(1, import_response["num_imported"].get()); + + auto results = coll1->search("*", {}, + "", {}, {}, {0}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + ASSERT_EQ(1, results["hits"][0]["document"].size()); + + ASSERT_EQ(0, coll1->_get_index()->_get_numerical_index().at("points")->size()); + ASSERT_EQ(0, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); + + // put the original doc back + ASSERT_TRUE(coll1->add(doc1.dump(), UPSERT).ok()); + + results = coll1->search("*", {}, + "", {}, {}, {0}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ(3, results["hits"][0]["document"].size()); + + // update doc with missing fields: existing fields should NOT be removed + + upserts = { + R"({"id": "0"})" + }; + + import_response = coll1->add_many(upserts, update_doc, UPDATE); + ASSERT_TRUE(import_response["success"].get()); + ASSERT_EQ(1, import_response["num_imported"].get()); + + results = coll1->search("*", {}, + "", {}, {}, {0}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, + "", "", {}, 1000, true).get(); + + ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + ASSERT_EQ(3, results["hits"][0]["document"].size()); + + ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); + ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); +} + TEST_F(CollectionSpecificTest, FacetParallelizationVerification) { std::vector fields = {field("name", field_types::STRING, false), field("category", field_types::STRING, true), From cec0779bb227dc7d071d033c296a28596a59e84a Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 26 Nov 2021 21:08:24 +0530 Subject: [PATCH 15/20] Add coll drops. 
--- test/collection_specific_test.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index 3c11518f..09cd6c4d 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -1550,6 +1550,8 @@ TEST_F(CollectionSpecificTest, UpdateOfTwoDocsWithSameIdWithinSameBatch) { ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); + + collectionManager.drop_collection("coll1"); } TEST_F(CollectionSpecificTest, UpsertOfTwoDocsWithSameIdWithinSameBatch) { @@ -1600,6 +1602,8 @@ TEST_F(CollectionSpecificTest, UpsertOfTwoDocsWithSameIdWithinSameBatch) { import_response = coll1->add_many(upserts, update_doc, UPDATE); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(0, import_response["num_imported"].get()); + + collectionManager.drop_collection("coll1"); } TEST_F(CollectionSpecificTest, UpdateUpsertOfDocWithMissingFields) { @@ -1675,6 +1679,8 @@ TEST_F(CollectionSpecificTest, UpdateUpsertOfDocWithMissingFields) { ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); + + collectionManager.drop_collection("coll1"); } TEST_F(CollectionSpecificTest, FacetParallelizationVerification) { @@ -1937,4 +1943,6 @@ TEST_F(CollectionSpecificTest, DroppedTokensShouldNotBeUsedForPrefixSearch) { ASSERT_EQ(2, results["hits"].size()); ASSERT_EQ("1", results["hits"][0]["document"]["id"].get()); ASSERT_EQ("0", results["hits"][1]["document"]["id"].get()); + + collectionManager.drop_collection("coll1"); } From d9b969aab5a5cdb523024782b7514befaaac1e49 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 30 Nov 2021 14:21:12 +0530 Subject: [PATCH 16/20] Sort queued requests by timestamp before restore. --- src/batched_indexer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/batched_indexer.cpp b/src/batched_indexer.cpp index e149487e..04e5d1e1 100644 --- a/src/batched_indexer.cpp +++ b/src/batched_indexer.cpp @@ -385,16 +385,16 @@ void BatchedIndexer::load_state(const nlohmann::json& state) { queue_ids.push_back(queue_id); std::unique_lock qlk(qmutuxes[queue_id].mcv); queues[queue_id].emplace_back(req->start_ts); - qmutuxes[queue_id].cv.notify_one(); } num_reqs_restored++; } - // need to sort on `start_ts` to preserve original order + // need to sort on `start_ts` to preserve original order before notifying queues for(auto queue_id: queue_ids) { std::unique_lock lk(qmutuxes[queue_id].mcv); std::sort(queues[queue_id].begin(), queues[queue_id].end()); + qmutuxes[queue_id].cv.notify_one(); } LOG(INFO) << "Restored " << num_reqs_restored << " in-flight requests from snapshot."; From 6ded5c7556f75ad2edfaa80e38d48429a8c916be Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 30 Nov 2021 15:56:25 +0530 Subject: [PATCH 17/20] Fix posting list deletion edge case. 
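The edge case: erasing from a block that then holds fewer than half its capacity pulls ids from the next block, and moving too many simply flips the imbalance onto the next block (sizes 1,5 becoming 5,1). The fix caps the refill at half the block capacity and also refills an emptied root block. A simplified sketch of the refill rule on plain vectors (illustration only):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    void refill_from_next(std::vector<uint32_t>& block, std::vector<uint32_t>& next_block,
                          size_t block_capacity) {
        if(block.size() >= block_capacity / 2 || next_block.empty()) {
            return; // block is still healthy enough, or nothing to pull from
        }

        size_t num_to_move = (block.size() + next_block.size() <= block_capacity)
                                ? next_block.size()   // next block fits entirely: merge it
                                : block_capacity / 2; // otherwise move only half the capacity

        block.insert(block.end(), next_block.begin(), next_block.begin() + num_to_move);
        next_block.erase(next_block.begin(), next_block.begin() + num_to_move);
    }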
--- src/posting_list.cpp | 35 +++++++---- test/posting_list_test.cpp | 117 +++++++++++++++++++++++++++++-------- 2 files changed, 117 insertions(+), 35 deletions(-) diff --git a/src/posting_list.cpp b/src/posting_list.cpp index 823dd028..c888225e 100644 --- a/src/posting_list.cpp +++ b/src/posting_list.cpp @@ -186,7 +186,9 @@ bool posting_list_t::block_t::contains(uint32_t id) { /* posting_list_t operations */ posting_list_t::posting_list_t(uint16_t max_block_elements): BLOCK_MAX_ELEMENTS(max_block_elements) { - + if(max_block_elements <= 1) { + throw std::invalid_argument("max_block_elements must be > 1"); + } } posting_list_t::~posting_list_t() { @@ -423,7 +425,7 @@ void posting_list_t::upsert(const uint32_t id, const std::vector& offs } else { const auto it = id_block_map.lower_bound(id); upsert_block = (it == id_block_map.end()) ? id_block_map.rbegin()->second : it->second; - before_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1); + before_upsert_last_id = upsert_block->ids.last(); } // happy path: upsert_block is not full @@ -431,7 +433,7 @@ void posting_list_t::upsert(const uint32_t id, const std::vector& offs uint32_t num_inserted = upsert_block->upsert(id, offsets); ids_length += num_inserted; - last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1); + last_id_t after_upsert_last_id = upsert_block->ids.last(); if(before_upsert_last_id != after_upsert_last_id) { id_block_map.erase(before_upsert_last_id); id_block_map.emplace(after_upsert_last_id, upsert_block); @@ -451,12 +453,12 @@ void posting_list_t::upsert(const uint32_t id, const std::vector& offs // evenly divide elements between both blocks split_block(upsert_block, new_block); - last_id_t after_upsert_last_id = upsert_block->ids.at(upsert_block->size() - 1); + last_id_t after_upsert_last_id = upsert_block->ids.last(); id_block_map.erase(before_upsert_last_id); id_block_map.emplace(after_upsert_last_id, upsert_block); } - last_id_t after_new_block_id = new_block->ids.at(new_block->size() - 1); + last_id_t after_new_block_id = new_block->ids.last(); id_block_map.emplace(after_new_block_id, new_block); new_block->next = upsert_block->next; @@ -485,6 +487,17 @@ void posting_list_t::erase(const uint32_t id) { // since we will be deleting the empty node, set the previous node's next pointer to null std::prev(it)->second->next = nullptr; delete erase_block; + } else { + // The root block cannot be empty if there are other blocks so we will pull some contents from next block + // This is only an issue for blocks with max size of 2 + if(root_block.next != nullptr) { + auto next_block_last_id = erase_block->next->ids.last(); + merge_adjacent_blocks(erase_block, erase_block->next, erase_block->next->size()/2); + id_block_map.erase(next_block_last_id); + + id_block_map.emplace(erase_block->next->ids.last(), erase_block->next); + id_block_map.emplace(erase_block->ids.last(), erase_block); + } } id_block_map.erase(before_last_id); @@ -493,7 +506,7 @@ void posting_list_t::erase(const uint32_t id) { } if(new_ids_length >= BLOCK_MAX_ELEMENTS/2 || erase_block->next == nullptr) { - last_id_t after_last_id = erase_block->ids.at(new_ids_length-1); + last_id_t after_last_id = erase_block->ids.last(); if(before_last_id != after_last_id) { id_block_map.erase(before_last_id); id_block_map.emplace(after_last_id, erase_block); @@ -505,7 +518,7 @@ void posting_list_t::erase(const uint32_t id) { // block is less than 50% of max capacity and contains a next node which we can refill from auto next_block = 
erase_block->next; - last_id_t next_block_last_id = next_block->ids.at(next_block->ids.getLength()-1); + last_id_t next_block_last_id = next_block->ids.last(); if(erase_block->size() + next_block->size() <= BLOCK_MAX_ELEMENTS) { // we can merge the contents of next block with `erase_block` and delete the next block @@ -515,13 +528,15 @@ void posting_list_t::erase(const uint32_t id) { id_block_map.erase(next_block_last_id); } else { - // only part of the next block can be moved over - size_t num_block2_ids = BLOCK_MAX_ELEMENTS - erase_block->size(); + // Only part of the next block can be moved over. + // We will move only 50% of max elements to ensure that we don't end up "flipping" adjacent blocks: + // 1, 5 -> 5, 1 + size_t num_block2_ids = BLOCK_MAX_ELEMENTS/2; merge_adjacent_blocks(erase_block, next_block, num_block2_ids); // NOTE: we don't have to update `id_block_map` for `next_block` as last element doesn't change } - last_id_t after_last_id = erase_block->ids.at(erase_block->ids.getLength()-1); + last_id_t after_last_id = erase_block->ids.last(); if(before_last_id != after_last_id) { id_block_map.erase(before_last_id); id_block_map.emplace(after_last_id, erase_block); diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp index 09c567ef..b9ecec3d 100644 --- a/test/posting_list_test.cpp +++ b/test/posting_list_test.cpp @@ -447,13 +447,13 @@ TEST_F(PostingListTest, RemovalsOnLaterBlocks) { // only part of the next node contents can be moved over when we delete 8 since (1 + 5) > 5 pl.erase(8); - // [0..4], [9], [10..14] => [0..4], [9,10,11,12,13], [14] + // [0..4], [9], [10..14] => [0..4], [9,10,11], [12,13,14] ASSERT_EQ(3, pl.num_blocks()); ASSERT_EQ(11, pl.num_ids()); - ASSERT_EQ(5, pl.get_root()->next->size()); - ASSERT_EQ(1, pl.get_root()->next->next->size()); - ASSERT_EQ(13, pl.get_root()->next->ids.last()); + ASSERT_EQ(3, pl.get_root()->next->size()); + ASSERT_EQ(3, pl.get_root()->next->next->size()); + ASSERT_EQ(11, pl.get_root()->next->ids.last()); ASSERT_EQ(14, pl.get_root()->next->next->ids.last()); for(size_t i = 0; i < pl.get_root()->next->offset_index.getLength(); i++) { @@ -616,27 +616,6 @@ TEST_F(PostingListTest, SplittingOfListsSimple) { std::vector> partial_its_vec(4); intersector.split_lists(4, partial_its_vec); - /*for(size_t i = 0; i < partial_its_vec.size(); i++) { - auto& partial_its = partial_its_vec[i]; - - if (partial_its.empty()) { - continue; - } - - LOG(INFO) << "Vec " << i; - - for (auto& it: partial_its) { - while (it.valid()) { - LOG(INFO) << it.id(); - it.next(); - } - - LOG(INFO) << "---"; - } - } - - return ;*/ - std::vector>> split_ids = { {{0, 2}, {1, 3}, {2, 3}}, {{3, 20}, {1, 3, 5, 10, 20}, {2, 3, 5, 7, 20}} @@ -1384,6 +1363,94 @@ TEST_F(PostingListTest, BlockIntersectionOnMixedLists) { free(list1); } +TEST_F(PostingListTest, InsertAndEraseSequence) { + std::vector offsets = {0, 1, 3}; + posting_list_t pl(5); + + pl.upsert(0, offsets); + pl.upsert(2, offsets); + pl.upsert(4, offsets); + pl.upsert(6, offsets); + pl.upsert(8, offsets); + + // this will cause a split of the root block + pl.upsert(3, offsets); // 0,2,3 | 4,6,8 + pl.erase(0); // 2,3 | 4,6,8 + pl.upsert(5, offsets); // 2,3 | 4,5,6,8 + pl.upsert(7, offsets); // 2,3 | 4,5,6,7,8 + pl.upsert(10, offsets); // 2,3 | 4,5,6,7,8 | 10 + + // this will cause adjacent block refill + pl.erase(2); // 3,4,5,6,7 | 8 | 10 + + // deletes second block + pl.erase(8); + + // remove all elements + pl.erase(3); + pl.erase(4); + pl.erase(5); + pl.erase(6); + pl.erase(7); + pl.erase(10); + + 
ASSERT_EQ(0, pl.num_ids());
+}
+
+TEST_F(PostingListTest, InsertAndEraseSequenceWithBlockSizeTwo) {
+    std::vector offsets = {0, 1, 3};
+    posting_list_t pl(2);
+
+    pl.upsert(2, offsets);
+    pl.upsert(3, offsets);
+    pl.upsert(1, offsets); // smaller id: the full root block must split
+
+    // 1 | 2,3
+
+    pl.erase(1);
+
+    ASSERT_EQ(1, pl.get_root()->size());
+    ASSERT_EQ(2, pl.num_blocks());
+
+    pl.erase(3);
+    pl.erase(2);
+
+    ASSERT_EQ(0, pl.get_root()->size());
+}
+
+TEST_F(PostingListTest, PostingListMustHaveAtleast1Element) {
+    try {
+        std::vector offsets = {0, 1, 3};
+        posting_list_t pl(1);
+        FAIL() << "Expected std::invalid_argument";
+    }
+    catch(std::invalid_argument const & err) {
+        EXPECT_EQ(err.what(), std::string("max_block_elements must be > 1"));
+    } catch(...) {
+        FAIL() << "Expected std::invalid_argument";
+    }
+}
+
+TEST_F(PostingListTest, DISABLED_RandInsertAndErase) {
+    std::vector offsets = {0, 1, 3};
+    posting_list_t pl(5);
+
+    time_t t;
+    srand((unsigned) time(&t));
+
+    for(size_t i = 0; i < 10000; i++) {
+        LOG(INFO) << "i: " << i;
+        uint32_t add_id = rand() % 15;
+        pl.upsert(add_id, offsets);
+
+        uint32_t del_id = rand() % 15;
+        LOG(INFO) << "add: " << add_id << ", erase: " << del_id;
+        pl.erase(del_id);
+    }
+
+    LOG(INFO) << "Num ids: " << pl.num_ids() << ", num blocks: " << pl.num_blocks();
+}
+
 TEST_F(PostingListTest, DISABLED_Benchmark) {
     std::vector offsets = {0, 1, 3};
     posting_list_t pl(4096);

From 1bd028e951f1d8dee4a1de780ce8c71abc77b213 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Tue, 30 Nov 2021 16:51:41 +0530
Subject: [PATCH 18/20] Fix warning.

---
 src/posting_list.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/posting_list.cpp b/src/posting_list.cpp
index c888225e..a3dc949a 100644
--- a/src/posting_list.cpp
+++ b/src/posting_list.cpp
@@ -269,7 +269,8 @@ void posting_list_t::merge_adjacent_blocks(posting_list_t::block_t* block1, post
     size_t new_block1_offsets_size = block1->offsets.getLength() + num_block2_offsets_to_move;
     uint32_t* new_block1_offsets = new uint32_t[new_block1_offsets_size];

-    uint32_t min = offsets1[0], max = offsets1[0];
+    uint32_t min = block1->offsets.getLength() != 0 ? offsets1[0] : 0;
+    uint32_t max = min;

     // we have to manually copy over so we can find the new min and max
     for(size_t i = 0; i < block1->offsets.getLength(); i++) {

From 11f919c4f8ea0cf9ba2fe4c0fc20ae23baf0c1a6 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Tue, 30 Nov 2021 19:35:41 +0530
Subject: [PATCH 19/20] Use set for batch indexer request restore.
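Restoring a snapshot can queue many requests for the same collection, and the old std::vector collected one queue id per request, so the follow-up loop re-sorted the same queue and re-notified its worker once per restored request. A std::set keeps each queue id only once, so every queue is sorted on start_ts and its worker woken exactly once.

Condensed sketch of the restore flow after this change (locking around the queues is omitted for brevity; restored_reqs is a hypothetical stand-in for iterating state["req_res_map"]):

    std::set<uint64_t> queue_ids;
    for(const auto& req: restored_reqs) {
        const std::string& coll_name = get_collection_name(req);
        uint64_t queue_id = StringUtils::hash_wy(coll_name.c_str(), coll_name.size()) % num_threads;
        queue_ids.insert(queue_id);                    // duplicates collapse here
        queues[queue_id].emplace_back(req->start_ts);
    }

    for(auto queue_id: queue_ids) {
        std::sort(queues[queue_id].begin(), queues[queue_id].end());  // preserve original order
        qmutuxes[queue_id].cv.notify_one();            // wake each queue's worker once
    }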
--- src/batched_indexer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/batched_indexer.cpp b/src/batched_indexer.cpp index 04e5d1e1..4d25f976 100644 --- a/src/batched_indexer.cpp +++ b/src/batched_indexer.cpp @@ -354,7 +354,7 @@ void BatchedIndexer::load_state(const nlohmann::json& state) { queued_writes = state["queued_writes"].get(); size_t num_reqs_restored = 0; - std::vector queue_ids; + std::set queue_ids; for(auto& kv: state["req_res_map"].items()) { std::shared_ptr req = std::make_shared(); @@ -382,7 +382,7 @@ void BatchedIndexer::load_state(const nlohmann::json& state) { const std::string& coll_name = get_collection_name(req); uint64_t queue_id = StringUtils::hash_wy(coll_name.c_str(), coll_name.size()) % num_threads; - queue_ids.push_back(queue_id); + queue_ids.insert(queue_id); std::unique_lock qlk(qmutuxes[queue_id].mcv); queues[queue_id].emplace_back(req->start_ts); } From 339300dcd00b896e2f1b1f1a39e0a13d5748e7d5 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 30 Nov 2021 20:20:19 +0530 Subject: [PATCH 20/20] Lowercase unicode while tokenization. --- src/tokenizer.cpp | 2 +- test/collection_specific_test.cpp | 30 ++++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 1d868e01..745e47e8 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -96,7 +96,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_inde LOG(ERROR) << "Unicode error during parsing: " << errcode; } } else { - token = unicode_text.tempSubString(prev_position, length).toUTF8String(word); + token = unicode_text.toLower().tempSubString(prev_position, length).toUTF8String(word); } if(!token.empty()) { diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index 09cd6c4d..6ea594c4 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -1541,15 +1541,28 @@ TEST_F(CollectionSpecificTest, UpdateOfTwoDocsWithSameIdWithinSameBatch) { spp::sparse_hash_set(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0, "", "", {}, 1000, true).get(); - LOG(INFO) << results; + collectionManager.drop_collection("coll1"); +} - ASSERT_EQ(1, results["hits"].size()); - ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); - ASSERT_TRUE(results["hits"][0]["document"].contains("last_chance")); - ASSERT_EQ(false, results["hits"][0]["document"]["last_chance"].get()); +TEST_F(CollectionSpecificTest, CyrillicText) { + // when the first document containing a token already cannot fit compact posting list - ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size()); - ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size()); + std::vector fields = {field("title", field_types::STRING, false, false, true, "sr"),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get(); + + nlohmann::json doc; + doc["title"] = "Test Тест"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + doc["title"] = "TEST ТЕСТ"; + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + + auto results = coll1->search("тест", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); + + ASSERT_EQ(2, results["hits"].size()); + ASSERT_EQ("1", results["hits"][0]["document"]["id"].get()); + ASSERT_EQ("0", results["hits"][1]["document"]["id"].get()); collectionManager.drop_collection("coll1"); } @@ -1779,9 +1792,10 @@ TEST_F(CollectionSpecificTest, VerbatimMatchShouldConsiderTokensMatchedAcrossAll 
ASSERT_EQ("3", results["hits"][0]["document"]["id"].get()); ASSERT_EQ("2", results["hits"][1]["document"]["id"].get()); + ASSERT_EQ(2, results["hits"].size()); + collectionManager.drop_collection("coll1"); } - TEST_F(CollectionSpecificTest, CustomNumTyposConfiguration) { // dropped tokens on a single field cannot be deemed as verbatim match