From a7adc1c3ca1af9477653f3d2c29c5ac5ae1a80b2 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sun, 2 Jul 2023 20:43:31 +0530 Subject: [PATCH] Fix bug with offset/pagination affecting vector search. --- include/collection.h | 2 +- include/index.h | 2 +- src/collection.cpp | 12 +++++++++++- src/collection_manager.cpp | 16 ++++------------ src/index.cpp | 9 +++++++++ test/collection_test.cpp | 20 ++++++++++---------- test/core_api_utils_test.cpp | 27 ++++++++++++++++++++++++--- 7 files changed, 60 insertions(+), 28 deletions(-) diff --git a/include/collection.h b/include/collection.h index 571d0b1c..9fc304fc 100644 --- a/include/collection.h +++ b/include/collection.h @@ -463,7 +463,7 @@ public: const text_match_type_t match_type = max_score, const size_t facet_sample_percent = 100, const size_t facet_sample_threshold = 0, - const size_t page_offset = UINT32_MAX, + const size_t page_offset = 0, facet_index_type_t facet_index_type = HASH, const size_t vector_query_hits = 250) const; diff --git a/include/index.h b/include/index.h index cfe210de..d8863c99 100644 --- a/include/index.h +++ b/include/index.h @@ -152,7 +152,7 @@ struct search_args { filter_node_t* filter_tree_root, std::vector& facets, std::vector>& included_ids, std::vector excluded_ids, std::vector& sort_fields_std, facet_query_t facet_query, const std::vector& num_typos, - size_t max_facet_values, size_t max_hits, size_t per_page, size_t page, token_ordering token_order, + size_t max_facet_values, size_t max_hits, size_t per_page, size_t offset, token_ordering token_order, const std::vector& prefixes, size_t drop_tokens_threshold, size_t typo_tokens_threshold, const std::vector& group_by_fields, size_t group_limit, const string& default_sorting_field, bool prioritize_exact_match, diff --git a/src/collection.cpp b/src/collection.cpp index ff28887d..6d644b31 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1393,7 +1393,17 @@ Option Collection::search(std::string raw_query, return Option(422, message); } - size_t offset = (page != 0) ? (per_page * (page - 1)) : page_offset; + size_t offset = 0; + + if(page == 0 && page_offset != 0) { + // if only offset is set, use that + offset = page_offset; + } else { + // if both are set or none set, use page value (default is 1) + size_t actual_page = (page == 0) ? 1 : page; + offset = (per_page * (actual_page - 1)); + } + size_t fetch_size = offset + per_page; if(fetch_size > limit_hits) { diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 5323e1b0..f8d32b4e 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -783,7 +783,7 @@ Option CollectionManager::do_search(std::map& re std::vector sort_fields; size_t per_page = 10; size_t page = 0; - size_t offset = UINT32_MAX; + size_t offset = 0; token_ordering token_order = NOT_SET; std::string vector_query; @@ -978,14 +978,6 @@ Option CollectionManager::do_search(std::map& re per_page = 0; } - if(!req_params[PAGE].empty() && page == 0 && offset == UINT32_MAX) { - return Option(422, "Parameter `page` must be an integer of value greater than 0."); - } - - if(req_params[PAGE].empty() && req_params[OFFSET].empty()) { - page = 1; - } - include_fields.insert(include_fields_vec.begin(), include_fields_vec.end()); exclude_fields.insert(exclude_fields_vec.begin(), exclude_fields_vec.end()); @@ -1097,10 +1089,10 @@ Option CollectionManager::do_search(std::map& re result["search_time_ms"] = timeMillis; } - if(page != 0) { - result["page"] = page; - } else { + if(page == 0 && offset != 0) { result["offset"] = offset; + } else { + result["page"] = (page == 0) ? 1 : page; } results_json_str = result.dump(-1, ' ', false, nlohmann::detail::error_handler_t::ignore); diff --git a/src/index.cpp b/src/index.cpp index 8b0f9de3..50f7a19b 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2397,6 +2397,10 @@ Option Index::search(std::vector& field_query_tokens, cons (filter_id_count >= vector_query.flat_search_cutoff && filter_result_iterator->is_valid)) { dist_labels.clear(); + if(no_filters_provided) { + filter_result_iterator->approx_filter_ids_length = 0; + } + VectorFilterFunctor filterFunctor(filter_result_iterator); if(field_vector_index->distance_type == cosine) { @@ -2407,6 +2411,7 @@ Option Index::search(std::vector& field_query_tokens, cons dist_labels = field_vector_index->vecdex->searchKnnCloserFirst(vector_query.values.data(), k, &filterFunctor); } } + filter_result_iterator->reset(); std::vector nearest_ids; @@ -2657,6 +2662,10 @@ Option Index::search(std::vector& field_query_tokens, cons constexpr float TEXT_MATCH_WEIGHT = 0.7; constexpr float VECTOR_SEARCH_WEIGHT = 1.0 - TEXT_MATCH_WEIGHT; + if(no_filters_provided) { + filter_result_iterator->approx_filter_ids_length = 0; + } + VectorFilterFunctor filterFunctor(filter_result_iterator); auto& field_vector_index = vector_index.at(vector_query.field_name); std::vector> dist_labels; diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 787241dc..4da66c4a 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -1055,11 +1055,11 @@ TEST_F(CollectionTest, KeywordQueryReturnsResultsBasedOnPerPageParam) { ASSERT_EQ(422, res_op.code()); ASSERT_STREQ("Only upto 250 hits can be fetched per page.", res_op.error().c_str()); - // when page number is not valid - res_op = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 10, 0, - FREQUENCY, {true}, 1000, empty, empty, 10); - ASSERT_FALSE(res_op.ok()); - ASSERT_EQ(422, res_op.code()); + // when page number is zero, use the first page + results = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 3, 0, + FREQUENCY, {true}, 1000, empty, empty, 10).get(); + ASSERT_EQ(3, results["hits"].size()); + ASSERT_EQ(6, results["found"].get()); // do pagination @@ -3027,11 +3027,11 @@ TEST_F(CollectionTest, WildcardQueryReturnsResultsBasedOnPerPageParam) { ASSERT_EQ(422, res_op.code()); ASSERT_STREQ("Only upto 250 hits can be fetched per page.", res_op.error().c_str()); - // when page number is not valid - res_op = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 0, - FREQUENCY, {false}, 1000, empty, empty, 10); - ASSERT_FALSE(res_op.ok()); - ASSERT_EQ(422, res_op.code()); + // when page number is 0, just fetch first page + results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 0, + FREQUENCY, {false}, 1000, empty, empty, 10).get(); + ASSERT_EQ(10, results["hits"].size()); + ASSERT_EQ(25, results["found"].get()); // do pagination diff --git a/test/core_api_utils_test.cpp b/test/core_api_utils_test.cpp index 95af7546..d24b44bf 100644 --- a/test/core_api_utils_test.cpp +++ b/test/core_api_utils_test.cpp @@ -768,7 +768,7 @@ TEST_F(CoreAPIUtilsTest, SearchPagination) { ASSERT_EQ(400, results["code"].get()); ASSERT_EQ("Parameter `offset` must be an unsigned integer.", results["error"].get()); - // when page is 0 and no offset is sent + // when page is 0 and offset is NOT sent, we will treat as page=1 search.clear(); req->params.clear(); body["searches"] = nlohmann::json::array(); @@ -782,8 +782,29 @@ TEST_F(CoreAPIUtilsTest, SearchPagination) { post_multi_search(req, res); results = nlohmann::json::parse(res->body)["results"][0]; - ASSERT_EQ(422, results["code"].get()); - ASSERT_EQ("Parameter `page` must be an integer of value greater than 0.", results["error"].get()); + ASSERT_EQ(10, results["hits"].size()); + ASSERT_EQ(1, results["page"].get()); + ASSERT_EQ(0, results.count("offset")); + + // when both page and offset are sent, use page + search.clear(); + req->params.clear(); + body["searches"] = nlohmann::json::array(); + search["collection"] = "coll1"; + search["q"] = "title"; + search["page"] = "2"; + search["offset"] = "30"; + search["query_by"] = "name"; + search["sort_by"] = "points:desc"; + body["searches"].push_back(search); + req->body = body.dump(); + + post_multi_search(req, res); + results = nlohmann::json::parse(res->body)["results"][0]; + ASSERT_EQ(10, results["hits"].size()); + ASSERT_EQ(2, results["page"].get()); + ASSERT_EQ(0, results.count("offset")); + } TEST_F(CoreAPIUtilsTest, ExportWithFilter) {