diff --git a/include/index.h b/include/index.h index 9c8d0fc4..1309abea 100644 --- a/include/index.h +++ b/include/index.h @@ -510,7 +510,8 @@ private: bool prioritize_exact_match, bool exhaustive_search, size_t concurrency, - std::set& query_hashes) const; + std::set& query_hashes, + std::vector& id_buff) const; void do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, const std::vector& filters, const bool enable_short_circuit) const; diff --git a/include/posting.h b/include/posting.h index c5c3ef66..e8ad4be3 100644 --- a/include/posting.h +++ b/include/posting.h @@ -65,9 +65,11 @@ public: to_expanded_plists(raw_posting_lists, plists, expanded_plists); - std::sort(this->plists.begin(), this->plists.end(), [](posting_list_t* a, posting_list_t* b) { - return a->num_blocks() < b->num_blocks(); - }); + if(plists.size() > 1) { + std::sort(this->plists.begin(), this->plists.end(), [](posting_list_t* a, posting_list_t* b) { + return a->num_blocks() < b->num_blocks(); + }); + } } ~block_intersector_t() { diff --git a/src/art.cpp b/src/art.cpp index 40017d0c..bb373813 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -950,6 +950,8 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r q.push(root); + size_t num_large_lists = 0; + while(!q.empty() && results.size() < max_results*4) { art_node *n = (art_node *) q.top(); q.pop(); @@ -974,6 +976,10 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r results.push_back(l); } else { // we will push leaf only if filter matches with leaf IDs + if(!IS_COMPACT_POSTING(l->values)) { + num_large_lists++; + } + bool found_atleast_one = posting_t::contains_atleast_one(l->values, filter_ids, filter_ids_length); if(found_atleast_one) { results.push_back(l); @@ -1024,6 +1030,10 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r } } + LOG(INFO) << "leaf results.size: " << results.size() + << ", filter_ids_length: " << filter_ids_length + << ", num_large_lists: " << num_large_lists; + printf("OUTSIDE art_topk_iter: results size: %d\n", results.size()); return 0; } diff --git a/src/index.cpp b/src/index.cpp index 99b72e8a..1a677731 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1075,7 +1075,8 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, bool prioritize_exact_match, const bool exhaustive_search, const size_t concurrency, - std::set& query_hashes) const { + std::set& query_hashes, + std::vector& id_buff) const { auto product = []( long long a, token_candidates & b ) { return a*b.candidates.size(); }; long long int N = std::accumulate(token_candidates_vec.begin(), token_candidates_vec.end(), 1LL, product); @@ -1180,11 +1181,15 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, for(size_t i = 0; i < concurrency; i++) { // empty vec can happen if not all threads produce results if (!result_id_vecs[i].empty()) { - uint32_t* new_all_result_ids = nullptr; - all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &result_id_vecs[i][0], - result_id_vecs[i].size(), &new_all_result_ids); - delete[] *all_result_ids; - *all_result_ids = new_all_result_ids; + if(exhaustive_search) { + id_buff.insert(id_buff.end(), result_id_vecs[i].begin(), result_id_vecs[i].end()); + } else { + uint32_t* new_all_result_ids = nullptr; + all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &result_id_vecs[i][0], + result_id_vecs[i].size(), &new_all_result_ids); + delete[] *all_result_ids; + *all_result_ids = new_all_result_ids; + } num_result_ids += result_id_vecs[i].size(); @@ -1200,6 +1205,20 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, } } + if(id_buff.size() > 100000) { + // prevents too many ORs during exhaustive searching + std::sort(id_buff.begin(), id_buff.end()); + id_buff.erase(std::unique( id_buff.begin(), id_buff.end() ), id_buff.end()); + + uint32_t* new_all_result_ids = nullptr; + all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &id_buff[0], + id_buff.size(), &new_all_result_ids); + delete[] *all_result_ids; + *all_result_ids = new_all_result_ids; + num_result_ids += id_buff.size(); + id_buff.clear(); + } + if(num_result_ids == 0) { continue; } @@ -1212,7 +1231,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array, void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, const std::vector& filters, const bool enable_short_circuit) const { - //auto begin = std::chrono::high_resolution_clock::now(); + auto begin = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < filters.size(); i++) { const filter & a_filter = filters[i]; @@ -1552,10 +1571,10 @@ void Index::do_filtering(uint32_t*& filter_ids, uint32_t& filter_ids_length, } } - /*long long int timeMillis = + long long int timeMillis = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); - LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms";*/ + LOG(INFO) << "Time taken for filtering: " << timeMillis << "ms"; } @@ -2175,6 +2194,8 @@ void Index::search(std::vector& field_query_tokens, } } + auto begin0 = std::chrono::high_resolution_clock::now(); + for(auto& seq_id_kvs: topster_ids) { const uint64_t seq_id = seq_id_kvs.first; auto& kvs = seq_id_kvs.second; // each `kv` can be from a different field @@ -2354,6 +2375,11 @@ void Index::search(std::vector& field_query_tokens, kvs[0]->scores[kvs[0]->match_score_index] = aggregated_score; topster->add(kvs[0]); } + + auto timeMillis0 = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - begin0).count(); + + LOG(INFO) << "Time taken for multi-field aggregation: " << timeMillis0 << "ms"; } //LOG(INFO) << "topster size: " << topster->size; @@ -2877,11 +2903,18 @@ void Index::search_field(const uint8_t & field_id, // prefix should apply only for last token const size_t token_len = prefix_search ? (int) token.length() : (int) token.length() + 1; + auto begin = std::chrono::high_resolution_clock::now(); + // need less candidates for filtered searches since we already only pick tokens with results art_fuzzy_search(search_index.at(field_name), (const unsigned char *) token.c_str(), token_len, costs[token_index], costs[token_index], num_fuzzy_candidates, token_order, prefix_search, filter_ids, filter_ids_length, leaves, unique_tokens); + auto timeMillis = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - begin).count(); + + LOG(INFO) << "Time taken for fuzzy search: " << timeMillis << "ms"; + if(!leaves.empty()) { token_cost_cache.emplace(token_cost_hash, leaves); for(auto leaf: leaves) { @@ -2926,13 +2959,26 @@ void Index::search_field(const uint8_t & field_id, } if(!token_candidates_vec.empty()) { + std::vector id_buff; + // If atleast one token is found, go ahead and search for candidates search_candidates(field_id, the_field.is_array(), filter_ids, filter_ids_length, exclude_token_ids, exclude_token_ids_size, curated_ids, sort_fields, token_candidates_vec, searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, field_num_results, typo_tokens_threshold, group_limit, group_by_fields, query_tokens, - prioritize_exact_match, combination_limit, concurrency, query_hashes); + prioritize_exact_match, combination_limit, concurrency, query_hashes, id_buff); + + if(id_buff.size() > 1) { + std::sort(id_buff.begin(), id_buff.end()); + id_buff.erase(std::unique( id_buff.begin(), id_buff.end() ), id_buff.end()); + } + + uint32_t* new_all_result_ids = nullptr; + all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, &id_buff[0], + id_buff.size(), &new_all_result_ids); + delete[] *all_result_ids; + *all_result_ids = new_all_result_ids; } resume_typo_loop: diff --git a/src/posting.cpp b/src/posting.cpp index 4ec4ab00..14484af0 100644 --- a/src/posting.cpp +++ b/src/posting.cpp @@ -221,17 +221,23 @@ bool compact_posting_list_t::contains_atleast_one(const uint32_t* target_ids, si size_t num_existing_offsets = id_offsets[i]; size_t existing_id = id_offsets[i + num_existing_offsets + 1]; - if(existing_id == target_ids[target_ids_index]) { - return true; + // Returns iterator to the first element that is >= to value or last if no such element is found. + size_t found_index = std::lower_bound(target_ids + target_ids_index, + target_ids + target_ids_size, existing_id) - target_ids; + + if(found_index == target_ids_size) { + // all elements are lesser than lowest value (existing_id), so we can stop looking + return false; + } else { + if(target_ids[found_index] == existing_id) { + return true; + } + + // adjust lower bound to found_index+1 whose value is >= `existing_id` + target_ids_index = found_index; } - if(target_ids[target_ids_index] < existing_id) { - while(target_ids_index < target_ids_size && target_ids[target_ids_index] < existing_id) { - target_ids_index++; - } - } else { - i += num_existing_offsets + 2; - } + i += num_existing_offsets + 2; } return false; diff --git a/src/posting_list.cpp b/src/posting_list.cpp index 0fcb771d..823dd028 100644 --- a/src/posting_list.cpp +++ b/src/posting_list.cpp @@ -666,37 +666,18 @@ void posting_list_t::intersect(const std::vector& posting_lists bool posting_list_t::take_id(result_iter_state_t& istate, uint32_t id) { // decide if this result id should be excluded if(istate.excluded_result_ids_size != 0) { - while(istate.excluded_result_ids_index < istate.excluded_result_ids_size && - istate.excluded_result_ids[istate.excluded_result_ids_index] < id) { - istate.excluded_result_ids_index++; - } - - if(istate.excluded_result_ids_index < istate.excluded_result_ids_size && - id == istate.excluded_result_ids[istate.excluded_result_ids_index]) { - istate.excluded_result_ids_index++; + if (std::binary_search(istate.excluded_result_ids, + istate.excluded_result_ids + istate.excluded_result_ids_size, id)) { return false; } } - bool id_found_in_filter = true; - // decide if this result be matched with filter results if(istate.filter_ids_length != 0) { - id_found_in_filter = false; - - // e.g. [1, 3] vs [2, 3] - - while(istate.filter_ids_index < istate.filter_ids_length && istate.filter_ids[istate.filter_ids_index] < id) { - istate.filter_ids_index++; - } - - if(istate.filter_ids_index < istate.filter_ids_length && istate.filter_ids[istate.filter_ids_index] == id) { - istate.filter_ids_index++; - id_found_in_filter = true; - } + return std::binary_search(istate.filter_ids, istate.filter_ids + istate.filter_ids_length, id); } - return id_found_in_filter; + return true; } bool posting_list_t::get_offsets(const std::vector& its, diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 63ceba4f..09229ed0 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -1939,7 +1939,7 @@ TEST_F(CollectionTest, DeletionOfDocumentArrayFields) { token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10).get(); - ASSERT_EQ(1, res["found"]); + ASSERT_EQ(1, res["found"].get()); Option rem_op = coll1->remove("100"); diff --git a/test/posting_list_test.cpp b/test/posting_list_test.cpp index d28db670..09c567ef 100644 --- a/test/posting_list_test.cpp +++ b/test/posting_list_test.cpp @@ -1325,6 +1325,12 @@ TEST_F(PostingListTest, CompactPostingListContainsAtleastOne) { ASSERT_TRUE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids3[0], target_ids3.size())); ASSERT_FALSE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids4[0], target_ids4.size())); + std::vector target_ids5 = {2, 3}; + ASSERT_TRUE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids5[0], target_ids5.size())); + + std::vector target_ids6 = {0, 1, 2}; + ASSERT_FALSE(COMPACT_POSTING_PTR(obj)->contains_atleast_one(&target_ids6[0], target_ids6.size())); + posting_t::destroy_list(obj); }