Implement intersection count for ids_t.

This commit is contained in:
Kishore Nallan 2023-07-01 21:26:07 +05:30
parent 72a2bf41e0
commit 2cffc015e7
5 changed files with 51 additions and 42 deletions

View File

@ -154,6 +154,8 @@ public:
uint32_t* uncompress();
void uncompress(std::vector<uint32_t>& data);
size_t intersect_count(const uint32_t* res_ids, size_t res_ids_len);
};
template<class T>

View File

@ -35,7 +35,7 @@ struct compact_id_list_t {
[[nodiscard]] uint32_t num_ids() const;
bool contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size);
size_t intersect_count(const uint32_t* res_ids, size_t res_ids_len);
};
class ids_t {
@ -90,8 +90,6 @@ public:
static bool contains(const void* obj, uint32_t id);
static bool contains_atleast_one(const void* obj, const uint32_t* target_ids, size_t target_ids_size);
static void merge(const std::vector<void*>& id_lists, std::vector<uint32_t>& result_ids);
static void intersect(const std::vector<void*>& id_lists, std::vector<uint32_t>& result_ids);
@ -100,6 +98,8 @@ public:
static void uncompress(void*& obj, std::vector<uint32_t>& ids);
static size_t intersect_count(void*& obj, const uint32_t* result_ids, size_t result_ids_len);
static void to_expanded_id_lists(const std::vector<void*>& raw_id_lists, std::vector<id_list_t*>& id_lists,
std::vector<id_list_t*>& expanded_id_lists);
};

View File

@ -165,7 +165,6 @@ size_t facet_index_t::intersect(const std::string& field, const uint32_t* result
size_t max_facets = is_wildcard_no_filter_query ? std::min((size_t)max_facet_count, counter_list.size()) :
std::min((size_t)2 * max_facet_count, counter_list.size());
std::vector<uint32_t> id_list;
for(const auto& facet_count : counter_list) {
//LOG(INFO) << "checking ids in facet_value " << facet_count.facet_value << " having total count "
// << facet_count.count << ", is_wildcard_no_filter_query: " << is_wildcard_no_filter_query;
@ -178,13 +177,7 @@ size_t facet_index_t::intersect(const std::string& field, const uint32_t* result
if(!ids) {
continue;
}
ids_t::uncompress(ids, id_list);
uint32_t* out = nullptr;
count = ArrayUtils::and_scalar(id_list.data(), id_list.size(), result_ids, result_ids_len, &out);
delete[] out;
id_list.clear();
count = ids_t::intersect_count(ids, result_ids, result_ids_len);
}
if(count) {

View File

@ -656,3 +656,24 @@ uint32_t* id_list_t::uncompress() {
return arr;
}
// Counts the ids of this list that are also present in `res_ids`.
//
// res_ids / res_ids_len: candidate ids to match against. The array is searched with
// std::lower_bound, so it has to be sorted in ascending order.
// Returns the number of ids found on both sides.
size_t id_list_t::intersect_count(const uint32_t *res_ids, size_t res_ids_len) {
    const uint32_t* const res_end = res_ids + res_ids_len;
    size_t matches = 0;
    size_t res_pos = 0;

    for(auto list_it = new_iterator(); list_it.valid() && res_pos < res_ids_len; ) {
        const uint32_t wanted = res_ids[res_pos];

        if(list_it.id() == wanted) {
            // id is on both sides: count it and move both cursors forward
            matches++;
            list_it.next();
            res_pos++;
            continue;
        }

        if(list_it.id() < wanted) {
            // the list is behind the candidates: ask the iterator to skip to the candidate id
            list_it.skip_to(wanted);
            continue;
        }

        // the candidates are behind the list: binary search the remaining candidates for the
        // first one that is >= the current list id (res_ids_len when there is none, which
        // ends the loop)
        res_pos = std::lower_bound(res_ids + res_pos, res_end, list_it.id()) - res_ids;
    }

    return matches;
}

View File

@ -145,34 +145,27 @@ bool compact_id_list_t::contains(uint32_t id) {
return false;
}
// NOTE(review): this span reads like a diff hunk rendered without its +/- markers. The
// contains_atleast_one() signature and the target_ids-based statements appear to be the removed
// body, while the intersect_count() signature and the res_ids-based statements appear to be the
// added body. Confirm against the repository before editing any of these lines.
bool compact_id_list_t::contains_atleast_one(const uint32_t* target_ids, size_t target_ids_size) {
size_t compact_id_list_t::intersect_count(const uint32_t* res_ids, size_t res_ids_len) {
size_t count = 0;
size_t i = 0;
size_t target_ids_index = 0;
size_t res_index = 0;
while(i < length && target_ids_index < target_ids_size) {
size_t num_existing_offsets = ids[i];
size_t existing_id = ids[i + num_existing_offsets + 1];
while(i < length && res_index < res_ids_len) {
size_t curr_id = ids[i];
// Returns iterator to the first element that is >= to value or last if no such element is found.
size_t found_index = std::lower_bound(target_ids + target_ids_index,
target_ids + target_ids_size, existing_id) - target_ids;
if(found_index == target_ids_size) {
// all elements are lesser than lowest value (existing_id), so we can stop looking
return false;
if(curr_id < res_ids[res_index]) {
i++;
} else if(curr_id > res_ids[res_index]) {
// returns index that is >= to value or last if no such element is found.
res_index = std::lower_bound(res_ids + res_index, res_ids + res_ids_len, curr_id) - res_ids;
} else {
if(target_ids[found_index] == existing_id) {
return true;
}
// adjust lower bound to found_index+1 whose value is >= `existing_id`
target_ids_index = found_index;
i++;
res_index++;
count++;
}
i += num_existing_offsets + 2;
}
return false;
return count;
}
/* posting operations */
@ -289,16 +282,6 @@ bool ids_t::contains(const void* obj, uint32_t id) {
}
}
bool ids_t::contains_atleast_one(const void* obj, const uint32_t* target_ids, size_t target_ids_size) {
if(IS_COMPACT_IDS(obj)) {
compact_id_list_t* list = COMPACT_IDS_PTR(obj);
return list->contains_atleast_one(target_ids, target_ids_size);
} else {
id_list_t* list = (id_list_t*)(obj);
return list->contains_atleast_one(target_ids, target_ids_size);
}
}
void ids_t::merge(const std::vector<void*>& raw_posting_lists, std::vector<uint32_t>& result_ids) {
// we will have to convert the compact posting list (if any) to full form
std::vector<id_list_t*> id_lists;
@ -382,6 +365,16 @@ void ids_t::uncompress(void*& obj, std::vector<uint32_t>& ids) {
}
}
size_t ids_t::intersect_count(void*& obj, const uint32_t* result_ids, size_t result_ids_len) {
if(IS_COMPACT_IDS(obj)) {
compact_id_list_t* list = COMPACT_IDS_PTR(obj);
return list->intersect_count(result_ids, result_ids_len);
} else {
id_list_t* list = (id_list_t*)(obj);
return list->intersect_count(result_ids, result_ids_len);
}
}
void ids_t::block_intersector_t::split_lists(size_t concurrency,
std::vector<std::vector<id_list_t::iterator_t>>& partial_its_vec) {
const size_t num_blocks = this->id_lists[0]->num_blocks();