Optimize wildcard query sans filter/sort/facets.

This commit is contained in:
Kishore Nallan 2022-09-10 18:21:13 +05:30
parent 0e39736327
commit 832b519633
6 changed files with 113 additions and 14 deletions

View File

@ -34,19 +34,23 @@ public:
class iterator_t {
private:
block_t* curr_block;
uint32_t curr_index;
int64_t curr_index;
block_t* end_block;
std::map<last_id_t, block_t*>* id_block_map;
bool reverse;
public:
// uncompressed data structure for performance
uint32_t* ids = nullptr;
explicit iterator_t(block_t* start, block_t* end);
explicit iterator_t(block_t* start, block_t* end, std::map<last_id_t, block_t*>* id_block_map, bool reverse);
iterator_t(iterator_t&& rhs) noexcept;
~iterator_t();
[[nodiscard]] bool valid() const;
void next();
void previous();
void skip_to(uint32_t id);
[[nodiscard]] uint32_t id() const;
[[nodiscard]] inline uint32_t index() const;
@ -130,6 +134,8 @@ public:
iterator_t new_iterator(block_t* start_block = nullptr, block_t* end_block = nullptr);
iterator_t new_rev_iterator();
static void merge(const std::vector<id_list_t*>& id_lists, std::vector<uint32_t>& result_ids);
static void intersect(const std::vector<id_list_t*>& id_lists, std::vector<uint32_t>& result_ids);

View File

@ -743,7 +743,7 @@ Option<bool> Collection::validate_and_standardize_sort_fields(const std::vector<
}
}
if(!found_match_score && sort_fields.size() < 3) {
if(!found_match_score && !is_wildcard_query && sort_fields.size() < 3) {
sort_fields_std.emplace_back(sort_field_const::text_match, sort_field_const::desc);
}

View File

@ -30,16 +30,25 @@ uint32_t id_list_t::block_t::erase(const uint32_t id) {
/* iterator_t operations */
id_list_t::iterator_t::iterator_t(id_list_t::block_t* start, id_list_t::block_t* end):
curr_block(start), curr_index(0), end_block(end) {
id_list_t::iterator_t::iterator_t(id_list_t::block_t* start, id_list_t::block_t* end,
std::map<last_id_t, block_t*>* id_block_map, bool reverse):
curr_block(start), curr_index(0), end_block(end), id_block_map(id_block_map), reverse(reverse) {
if(curr_block != end_block) {
ids = curr_block->ids.uncompress();
if(reverse) {
curr_index = curr_block->ids.getLength()-1;
}
}
}
bool id_list_t::iterator_t::valid() const {
return (curr_block != end_block) && (curr_index < curr_block->size());
if(reverse) {
return (curr_block != end_block) && (curr_index >= 0);
} else {
return (curr_block != end_block) && (curr_index < curr_block->size());
}
}
void id_list_t::iterator_t::next() {
@ -57,6 +66,25 @@ void id_list_t::iterator_t::next() {
}
}
/**
 * Moves the iterator one position backwards.
 *
 * While the position stays inside the current block only `curr_index` is decremented.
 * When it moves before the first element of the block, the preceding block is looked up
 * through `id_block_map` and the iterator is positioned on that block's last element.
 * If there is no preceding block (or it cannot be determined), `curr_block` is set to
 * `end_block`, which makes `valid()` return false.
 *
 * Calling this on an already exhausted iterator (`curr_block == end_block`) is a no-op.
 */
void id_list_t::iterator_t::previous() {
    // An exhausted iterator has no block to step back from; `curr_block` may even be null here.
    if(curr_block == end_block) {
        return;
    }

    curr_index--;

    if(curr_index >= 0) {
        // still within the current block
        return;
    }

    // Iterators can be constructed without a block map (a null pointer is accepted by the
    // constructor). Without the map, or without uncompressed ids for a non-empty block, the
    // previous block cannot be located, so iteration ends instead of dereferencing null.
    if(id_block_map == nullptr || ids == nullptr || curr_block->size() == 0) {
        curr_block = end_block;
        return;
    }

    // since block stores only the next pointer, we have to use `id_block_map` for reverse iteration:
    // the current block is found via its last id and the entry just before it is the previous block
    const auto last_ele = ids[curr_block->size()-1];
    auto it = id_block_map->find(last_ele);

    if(it == id_block_map->end() || it == id_block_map->begin()) {
        // current block is unknown to the map, or it is already the first block
        curr_block = end_block;
        return;
    }

    --it;
    curr_block = it->second;
    curr_index = curr_block->size()-1;

    // replace the uncompressed buffer of the old block with that of the new current block
    delete [] ids;
    ids = curr_block->ids.uncompress();
}
uint32_t id_list_t::iterator_t::id() const {
return ids[curr_index];
}
@ -103,10 +131,13 @@ id_list_t::iterator_t::iterator_t(iterator_t&& rhs) noexcept {
curr_index = rhs.curr_index;
end_block = rhs.end_block;
ids = rhs.ids;
id_block_map = rhs.id_block_map;
reverse = rhs.reverse;
rhs.curr_block = nullptr;
rhs.end_block = nullptr;
rhs.ids = nullptr;
rhs.id_block_map = nullptr;
}
/* id_list_t operations */
@ -457,7 +488,17 @@ bool id_list_t::equals2(std::vector<id_list_t::iterator_t>& its) {
id_list_t::iterator_t id_list_t::new_iterator(block_t* start_block, block_t* end_block) {
start_block = (start_block == nullptr) ? &root_block : start_block;
return id_list_t::iterator_t(start_block, end_block);
return id_list_t::iterator_t(start_block, end_block, &id_block_map, false);
}
/**
 * Creates an iterator that walks the list backwards.
 *
 * The iterator starts on the block stored under the greatest key of `id_block_map` and uses
 * a null end block. When the map is empty, both the start and the end block are null.
 */
id_list_t::iterator_t id_list_t::new_rev_iterator() {
    block_t* const last_block = id_block_map.empty() ? nullptr : id_block_map.rbegin()->second;
    return id_list_t::iterator_t(last_block, nullptr, &id_block_map, true);
}
void id_list_t::advance_all(std::vector<id_list_t::iterator_t>& its) {

View File

@ -412,7 +412,7 @@ void ids_t::block_intersector_t::split_lists(size_t concurrency,
}
}
partial_its.emplace_back(p_start_block, p_end_block);
partial_its.emplace_back(p_start_block, p_end_block, nullptr, false);
}
start_block = curr_block->next;

View File

@ -2487,9 +2487,43 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
if (is_wildcard_query) {
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
const std::string& field = the_fields[0].name;
bool no_filters_provided = (filters.empty() && filter_ids_length == 0);
if(no_filters_provided && facets.empty() && curated_ids.empty() && vector_query.field_name.empty() &&
sort_fields_std.size() == 1 && sort_fields_std[0].name == sort_field_const::seq_id &&
sort_fields_std[0].order == sort_field_const::desc) {
// optimize for this path specifically
std::vector<uint32_t> result_ids;
auto it = seq_ids->new_rev_iterator();
while(it.valid()) {
uint32_t seq_id = it.id();
uint64_t distinct_id = seq_id;
if(group_limit != 0) {
distinct_id = get_distinct_id(group_by_fields, seq_id);
groups_processed.emplace(distinct_id);
}
int64_t scores[3] = {0};
scores[0] = seq_id;
int64_t match_score_index = -1;
result_ids.push_back(seq_id);
KV kv(field_id, searched_queries.size(), 0, seq_id, distinct_id, match_score_index, scores);
topster->add(&kv);
if(result_ids.size() == page * per_page) {
break;
}
it.previous();
}
all_result_ids_len = seq_ids->num_ids();
goto process_search_results;
}
// if filters were not provided, use the seq_ids index to generate the list of all document ids
if(filters.empty() && filter_ids_length == 0) {
if(no_filters_provided) {
filter_ids_length = seq_ids->num_ids();
filter_ids = seq_ids->uncompress();
}
@ -2710,6 +2744,8 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
//LOG(INFO) << "topster size: " << topster->size;
process_search_results:
delete [] exclude_token_ids;
delete [] excluded_result_ids;

View File

@ -1280,14 +1280,30 @@ TEST_F(CollectionSpecificMoreTest, WildcardSearchWithNoSortingField) {
Collection* coll1 = collectionManager.create_collection(schema).get();
nlohmann::json doc;
doc["title"] = "Sample Title";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
// search on empty collection
auto res_op = coll1->search("*", {}, "", {}, {}, {2}, 10, 1,
FREQUENCY, {true});
ASSERT_TRUE(res_op.ok());
auto res = res_op.get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ(0, res["hits"].size());
ASSERT_EQ(0, res["found"].get<size_t>());
nlohmann::json doc;
doc["title"] = "Sample Title 1";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
doc["title"] = "Sample Title 2";
ASSERT_TRUE(coll1->add(doc.dump()).ok());
res_op = coll1->search("*", {}, "", {}, {}, {2}, 10, 1,
FREQUENCY, {true});
ASSERT_TRUE(res_op.ok());
res = res_op.get();
ASSERT_EQ(2, res["hits"].size());
ASSERT_EQ(2, res["found"].get<size_t>());
ASSERT_EQ("1", res["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", res["hits"][1]["document"]["id"].get<std::string>());
}