Fix bug with offset/pagination affecting vector search.

This commit is contained in:
Kishore Nallan 2023-07-02 20:43:31 +05:30
parent 2cffc015e7
commit a7adc1c3ca
7 changed files with 60 additions and 28 deletions

View File

@ -463,7 +463,7 @@ public:
const text_match_type_t match_type = max_score,
const size_t facet_sample_percent = 100,
const size_t facet_sample_threshold = 0,
const size_t page_offset = UINT32_MAX,
const size_t page_offset = 0,
facet_index_type_t facet_index_type = HASH,
const size_t vector_query_hits = 250) const;

View File

@ -152,7 +152,7 @@ struct search_args {
filter_node_t* filter_tree_root, std::vector<facet>& facets,
std::vector<std::pair<uint32_t, uint32_t>>& included_ids, std::vector<uint32_t> excluded_ids,
std::vector<sort_by>& sort_fields_std, facet_query_t facet_query, const std::vector<uint32_t>& num_typos,
size_t max_facet_values, size_t max_hits, size_t per_page, size_t page, token_ordering token_order,
size_t max_facet_values, size_t max_hits, size_t per_page, size_t offset, token_ordering token_order,
const std::vector<bool>& prefixes, size_t drop_tokens_threshold, size_t typo_tokens_threshold,
const std::vector<std::string>& group_by_fields, size_t group_limit,
const string& default_sorting_field, bool prioritize_exact_match,

View File

@ -1393,7 +1393,17 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
return Option<nlohmann::json>(422, message);
}
size_t offset = (page != 0) ? (per_page * (page - 1)) : page_offset;
size_t offset = 0;
if(page == 0 && page_offset != 0) {
// if only offset is set, use that
offset = page_offset;
} else {
// if both are set or none set, use page value (default is 1)
size_t actual_page = (page == 0) ? 1 : page;
offset = (per_page * (actual_page - 1));
}
size_t fetch_size = offset + per_page;
if(fetch_size > limit_hits) {

View File

@ -783,7 +783,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
std::vector<sort_by> sort_fields;
size_t per_page = 10;
size_t page = 0;
size_t offset = UINT32_MAX;
size_t offset = 0;
token_ordering token_order = NOT_SET;
std::string vector_query;
@ -978,14 +978,6 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
per_page = 0;
}
if(!req_params[PAGE].empty() && page == 0 && offset == UINT32_MAX) {
return Option<bool>(422, "Parameter `page` must be an integer of value greater than 0.");
}
if(req_params[PAGE].empty() && req_params[OFFSET].empty()) {
page = 1;
}
include_fields.insert(include_fields_vec.begin(), include_fields_vec.end());
exclude_fields.insert(exclude_fields_vec.begin(), exclude_fields_vec.end());
@ -1097,10 +1089,10 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
result["search_time_ms"] = timeMillis;
}
if(page != 0) {
result["page"] = page;
} else {
if(page == 0 && offset != 0) {
result["offset"] = offset;
} else {
result["page"] = (page == 0) ? 1 : page;
}
results_json_str = result.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore);

View File

@ -2397,6 +2397,10 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
(filter_id_count >= vector_query.flat_search_cutoff && filter_result_iterator->is_valid)) {
dist_labels.clear();
if(no_filters_provided) {
filter_result_iterator->approx_filter_ids_length = 0;
}
VectorFilterFunctor filterFunctor(filter_result_iterator);
if(field_vector_index->distance_type == cosine) {
@ -2407,6 +2411,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
dist_labels = field_vector_index->vecdex->searchKnnCloserFirst(vector_query.values.data(), k, &filterFunctor);
}
}
filter_result_iterator->reset();
std::vector<uint32_t> nearest_ids;
@ -2657,6 +2662,10 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
constexpr float TEXT_MATCH_WEIGHT = 0.7;
constexpr float VECTOR_SEARCH_WEIGHT = 1.0 - TEXT_MATCH_WEIGHT;
if(no_filters_provided) {
filter_result_iterator->approx_filter_ids_length = 0;
}
VectorFilterFunctor filterFunctor(filter_result_iterator);
auto& field_vector_index = vector_index.at(vector_query.field_name);
std::vector<std::pair<float, size_t>> dist_labels;

View File

@ -1055,11 +1055,11 @@ TEST_F(CollectionTest, KeywordQueryReturnsResultsBasedOnPerPageParam) {
ASSERT_EQ(422, res_op.code());
ASSERT_STREQ("Only upto 250 hits can be fetched per page.", res_op.error().c_str());
// when page number is not valid
res_op = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 10, 0,
FREQUENCY, {true}, 1000, empty, empty, 10);
ASSERT_FALSE(res_op.ok());
ASSERT_EQ(422, res_op.code());
// when page number is zero, use the first page
results = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 3, 0,
FREQUENCY, {true}, 1000, empty, empty, 10).get();
ASSERT_EQ(3, results["hits"].size());
ASSERT_EQ(6, results["found"].get<int>());
// do pagination
@ -3027,11 +3027,11 @@ TEST_F(CollectionTest, WildcardQueryReturnsResultsBasedOnPerPageParam) {
ASSERT_EQ(422, res_op.code());
ASSERT_STREQ("Only upto 250 hits can be fetched per page.", res_op.error().c_str());
// when page number is not valid
res_op = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 0,
FREQUENCY, {false}, 1000, empty, empty, 10);
ASSERT_FALSE(res_op.ok());
ASSERT_EQ(422, res_op.code());
// when page number is 0, just fetch first page
results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 0,
FREQUENCY, {false}, 1000, empty, empty, 10).get();
ASSERT_EQ(10, results["hits"].size());
ASSERT_EQ(25, results["found"].get<int>());
// do pagination

View File

@ -768,7 +768,7 @@ TEST_F(CoreAPIUtilsTest, SearchPagination) {
ASSERT_EQ(400, results["code"].get<size_t>());
ASSERT_EQ("Parameter `offset` must be an unsigned integer.", results["error"].get<std::string>());
// when page is 0 and no offset is sent
// when page is 0 and offset is NOT sent, we will treat as page=1
search.clear();
req->params.clear();
body["searches"] = nlohmann::json::array();
@ -782,8 +782,29 @@ TEST_F(CoreAPIUtilsTest, SearchPagination) {
post_multi_search(req, res);
results = nlohmann::json::parse(res->body)["results"][0];
ASSERT_EQ(422, results["code"].get<size_t>());
ASSERT_EQ("Parameter `page` must be an integer of value greater than 0.", results["error"].get<std::string>());
ASSERT_EQ(10, results["hits"].size());
ASSERT_EQ(1, results["page"].get<size_t>());
ASSERT_EQ(0, results.count("offset"));
// when both page and offset are sent, use page
search.clear();
req->params.clear();
body["searches"] = nlohmann::json::array();
search["collection"] = "coll1";
search["q"] = "title";
search["page"] = "2";
search["offset"] = "30";
search["query_by"] = "name";
search["sort_by"] = "points:desc";
body["searches"].push_back(search);
req->body = body.dump();
post_multi_search(req, res);
results = nlohmann::json::parse(res->body)["results"][0];
ASSERT_EQ(10, results["hits"].size());
ASSERT_EQ(2, results["page"].get<size_t>());
ASSERT_EQ(0, results.count("offset"));
}
TEST_F(CoreAPIUtilsTest, ExportWithFilter) {