Allow the maximum number of hits returned to be configurable.

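Raising the limit has a performance impact, because every search now allocates its Topster buffers sized to `max_hits` instead of a fixed 512 entries, but that cost may be acceptable for most users and is now left to the caller's discretion via the new `max_hits` search parameter. The HTTP API default of 500 results stays to maintain backward compatibility (note that the C++ default argument of `Collection::search` is 512).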
2025-05-18 04:32:38 +08:00 · 2020-02-10 20:54:38 +05:30 · 2020-02-10 20:54:38 +05:30 · fd285b6fbe
commit fd285b6fbe
parent b899b86a96
7 changed files with 50 additions and 40 deletions
--- a/include/collection.h
+++ b/include/collection.h
@ -211,7 +211,7 @@ public:
                          const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD,
                          const spp::sparse_hash_set<std::string> include_fields = spp::sparse_hash_set<std::string>(),
                          const spp::sparse_hash_set<std::string> exclude_fields = spp::sparse_hash_set<std::string>(),
-                          const size_t max_facet_values=10);
+                          const size_t max_facet_values=10, size_t max_hits=512);

    Option<nlohmann::json> get(const std::string & id);

@ -234,11 +234,10 @@ public:
    void par_index_in_memory(std::vector<std::vector<index_record>> & iter_batch,
                             batch_index_result & result);

-    static void prune_document(nlohmann::json &document, const spp::sparse_hash_set<std::string> include_fields,
-                               const spp::sparse_hash_set<std::string> exclude_fields);
+    static void prune_document(nlohmann::json &document, const spp::sparse_hash_set<std::string> & include_fields,
+                               const spp::sparse_hash_set<std::string> & exclude_fields);

    static const int MAX_SEARCH_TOKENS = 10;
-    static const int MAX_RESULTS = 500;

    // strings under this length will be fully highlighted, instead of showing a snippet of relevant portion
    enum {SNIPPET_STR_ABOVE_LEN = 30};
--- a/include/index.h
+++ b/include/index.h
@ -31,6 +31,7 @@ struct search_args {
    std::vector<sort_by> sort_fields_std;
    int num_typos;
    size_t max_facet_values;
+    size_t max_hits;
    size_t per_page;
    size_t page;
    token_ordering token_order;
@ -49,10 +50,11 @@ struct search_args {
    search_args(std::string query, std::vector<std::string> search_fields, std::vector<filter> filters,
                std::vector<facet> facets, std::vector<uint32_t> included_ids, std::vector<uint32_t> excluded_ids,
                std::vector<sort_by> sort_fields_std, int num_typos, size_t max_facet_values,
-                size_t per_page, size_t page, token_ordering token_order, bool prefix, size_t drop_tokens_threshold):
+                size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix,
+                size_t drop_tokens_threshold):
            query(query), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids),
            excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), num_typos(num_typos),
-            max_facet_values(max_facet_values), per_page(per_page),
+            max_facet_values(max_facet_values), max_hits(max_hits), per_page(per_page),
            page(page), token_order(token_order), prefix(prefix), drop_tokens_threshold(drop_tokens_threshold),
            all_result_ids_len(0), outcome(0) {

@ -146,15 +148,15 @@ private:
                      std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
                      const int num_typos, const size_t num_results,
                      std::vector<std::vector<art_leaf*>> & searched_queries,
-                      Topster<512> & topster, uint32_t** all_result_ids,
+                      Topster & topster, uint32_t** all_result_ids,
                      size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY,
                      const bool prefix = false, const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD);

    void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
                           const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
                           const token_ordering token_order, std::vector<std::vector<art_leaf*>> & searched_queries,
-                           Topster<512> & topster, uint32_t** all_result_ids,
-                           size_t & all_result_ids_len, const size_t & max_results, const bool prefix);
+                           Topster & topster, uint32_t** all_result_ids,
+                           size_t & all_result_ids_len, const size_t & max_results);

    void insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
                    const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const;
@ -186,7 +188,7 @@ private:

    void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
                             const std::vector<uint32_t> & included_ids,
-                             Topster<512> & curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
+                             Topster & curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);

 public:
    Index() = delete;
@ -202,7 +204,7 @@ public:
                          const std::vector<filter> & filters, std::vector<facet> & facets,
                          const std::vector<uint32_t> & included_ids, const std::vector<uint32_t> & excluded_ids,
                          const std::vector<sort_by> & sort_fields_std, const int num_typos,
-                          const size_t per_page, const size_t page, const token_ordering token_order,
+                          const size_t max_hits, const size_t per_page, const size_t page, const token_ordering token_order,
                          const bool prefix, const size_t drop_tokens_threshold, std::vector<KV> & raw_result_kvs,
                          size_t & all_result_ids_len, std::vector<std::vector<art_leaf*>> & searched_queries,
                          std::vector<KV> & override_result_kvs);
@ -217,7 +219,7 @@ public:
                                         std::vector<std::vector<std::vector<uint16_t>>> &array_token_positions);

    void score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index, const uint8_t & field_id,
-                       const uint32_t total_cost, Topster<512> &topster, const std::vector<art_leaf *> & query_suggestion,
+                       const uint32_t total_cost, Topster &topster, const std::vector<art_leaf *> & query_suggestion,
                       const uint32_t *result_ids, const size_t result_size) const;

    static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);
--- a/include/topster.h
+++ b/include/topster.h
@ -21,19 +21,20 @@ struct KV {
 /*
 * Remembers the max-K elements seen so far using a min-heap
 */
-template <size_t MAX_SIZE=512>
 struct Topster {
+    const uint32_t MAX_SIZE;
    KV *data;
    uint32_t size;

    spp::sparse_hash_map<uint64_t, KV*> keys;

-    KV *kvs[MAX_SIZE];
+    KV* *kvs;

-    Topster(): size(0){
-        data = new KV[MAX_SIZE];
+    explicit Topster(size_t capacity): MAX_SIZE(capacity), size(0) {
+        kvs = new KV*[capacity];
+        data = new KV[capacity];

-        for(size_t i=0; i<MAX_SIZE; i++) {
+        for(size_t i=0; i<capacity; i++) {
            data[i].field_id = 0;
            data[i].query_index = 0;
            data[i].key = 0;
@ -185,7 +186,7 @@ struct Topster {

    // topster must be sorted before iterated upon to remove dead array entries
    void sort() {
-        std::stable_sort(std::begin(kvs), std::begin(kvs) + size, is_greater_kv);
+        std::stable_sort(kvs, kvs+size, is_greater_kv);
    }

    void clear(){
--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -264,8 +264,8 @@ void Collection::par_index_in_memory(std::vector<std::vector<index_record>> & it
    }
 }

-void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash_set<std::string> include_fields,
-                                const spp::sparse_hash_set<std::string> exclude_fields) {
+void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash_set<std::string>& include_fields,
+                                const spp::sparse_hash_set<std::string>& exclude_fields) {
    auto it = document.begin();
    for(; it != document.end(); ) {
        if(exclude_fields.count(it.key()) != 0 || (include_fields.size() != 0 && include_fields.count(it.key()) == 0)) {
@ -311,7 +311,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
                                  const size_t drop_tokens_threshold,
                                  const spp::sparse_hash_set<std::string> include_fields,
                                  const spp::sparse_hash_set<std::string> exclude_fields,
-                                  const size_t max_facet_values) {
+                                  const size_t max_facet_values, const size_t max_hits) {

    std::vector<uint32_t> included_ids;
    std::vector<uint32_t> excluded_ids;
@ -492,8 +492,8 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s

    const size_t num_results = (page * per_page);

-    if(num_results > MAX_RESULTS) {
-        std::string message = "Only the first " + std::to_string(MAX_RESULTS) + " results are available.";
+    if(num_results > max_hits) {
+        std::string message = "Only the first " + std::to_string(max_hits) + " results are available.";
        return Option<nlohmann::json>(422, message);
    }

@ -508,8 +508,8 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
    for(Index* index: indices) {
        index->search_params = search_args(query, search_fields, filters, facets,
                                           index_to_included_ids[index_id], index_to_excluded_ids[index_id],
-                                           sort_fields_std, num_typos, max_facet_values, per_page, page,
-                                           token_order, prefix, drop_tokens_threshold);
+                                           sort_fields_std, num_typos, max_facet_values, max_hits,
+                                           per_page, page, token_order, prefix, drop_tokens_threshold);
        {
            std::lock_guard<std::mutex> lk(index->m);
            index->ready = true;
@ -577,7 +577,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
    }

    // All fields are sorted descending
-    std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster<>::is_greater_kv_value);
+    std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_value);

    // Sort based on position in overriden list
    std::sort(
--- a/src/core_api.cpp
+++ b/src/core_api.cpp
@ -176,6 +176,7 @@ void get_search(http_req & req, http_res & res) {
    const char *FACET_BY = "facet_by";
    const char *MAX_FACET_VALUES = "max_facet_values";

+    const char *MAX_HITS = "max_hits";
    const char *PER_PAGE = "per_page";
    const char *PAGE = "page";
    const char *CALLBACK = "callback";
@ -207,6 +208,10 @@ void get_search(http_req & req, http_res & res) {
        req.params[MAX_FACET_VALUES] = "10";
    }

+    if(req.params.count(MAX_HITS) == 0) {
+        req.params[MAX_HITS] = "500";
+    }
+
    if(req.params.count(PER_PAGE) == 0) {
        req.params[PER_PAGE] = "10";
    }
@ -301,7 +306,8 @@ void get_search(http_req & req, http_res & res) {
                                                          static_cast<size_t>(std::stoi(req.params[PAGE])),
                                                          token_order, prefix, drop_tokens_threshold,
                                                          include_fields, exclude_fields,
-                                                          static_cast<size_t>(std::stoi(req.params[MAX_FACET_VALUES])));
+                                                          static_cast<size_t>(std::stoi(req.params[MAX_FACET_VALUES])),
+                                                          static_cast<size_t>(std::stoi(req.params[MAX_HITS])));

    uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
                               std::chrono::high_resolution_clock::now() - begin).count();
--- a/src/index.cpp
+++ b/src/index.cpp
@ -559,9 +559,9 @@ void Index::drop_facets(std::vector<facet> & facets, const std::vector<uint32_t>
 void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
                              const std::vector<sort_by> & sort_fields,
                              std::vector<token_candidates> & token_candidates_vec, const token_ordering token_order,
-                              std::vector<std::vector<art_leaf*>> & searched_queries, Topster<512> & topster,
+                              std::vector<std::vector<art_leaf*>> & searched_queries, Topster & topster,
                              uint32_t** all_result_ids, size_t & all_result_ids_len,
-                              const size_t & max_results, const bool prefix) {
+                              const size_t & max_results) {
    const long long combination_limit = 10;

    auto product = []( long long a, token_candidates & b ) { return a*b.candidates.size(); };
@ -795,7 +795,7 @@ void Index::run_search() {
        search(search_params.outcome, search_params.query, search_params.search_fields,
               search_params.filters, search_params.facets, search_params.included_ids,
               search_params.excluded_ids, search_params.sort_fields_std, search_params.num_typos,
-               search_params.per_page, search_params.page, search_params.token_order,
+               search_params.max_hits, search_params.per_page, search_params.page, search_params.token_order,
               search_params.prefix, search_params.drop_tokens_threshold, search_params.raw_result_kvs,
               search_params.all_result_ids_len, search_params.searched_queries, search_params.override_result_kvs);

@ -811,7 +811,7 @@ void Index::run_search() {

 void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
                                const std::vector<uint32_t> & included_ids,
-                                Topster<512> & curated_topster,
+                                Topster & curated_topster,
                                std::vector<std::vector<art_leaf*>> & searched_queries) {

    if(included_ids.size() == 0) {
@ -873,11 +873,13 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f
    }
 }

-void Index::search(Option<uint32_t> & outcome, std::string query, const std::vector<std::string> & search_fields,
+void Index::search(Option<uint32_t> & outcome,
+                   std::string query,
+                   const std::vector<std::string> & search_fields,
                   const std::vector<filter> & filters, std::vector<facet> & facets,
                   const std::vector<uint32_t> & included_ids,
                   const std::vector<uint32_t> & excluded_ids,
-                   const std::vector<sort_by> & sort_fields_std, const int num_typos,
+                   const std::vector<sort_by> & sort_fields_std, const int num_typos, const size_t max_hits,
                   const size_t per_page, const size_t page, const token_ordering token_order,
                   const bool prefix, const size_t drop_tokens_threshold,
                   std::vector<KV> & raw_result_kvs,
@ -902,8 +904,8 @@ void Index::search(Option<uint32_t> & outcome, std::string query, const std::vec
    //auto begin = std::chrono::high_resolution_clock::now();
    uint32_t* all_result_ids = nullptr;

-    Topster<512> topster;
-    Topster<512> curated_topster;
+    Topster topster(max_hits);
+    Topster curated_topster(max_hits);

    if(query == "*") {
        const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
@ -986,7 +988,7 @@ void Index::search_field(const uint8_t & field_id, std::string & query, const st
                         uint32_t *filter_ids, size_t filter_ids_length,
                         std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
                         const size_t num_results, std::vector<std::vector<art_leaf*>> & searched_queries,
-                         Topster<512> &topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
+                         Topster & topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
                         const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold) {
    std::vector<std::string> tokens;
    StringUtils::split(query, tokens, " ");
@ -1103,7 +1105,7 @@ void Index::search_field(const uint8_t & field_id, std::string & query, const st
            // If all tokens were found, go ahead and search for candidates with what we have so far
            search_candidates(field_id, filter_ids, filter_ids_length, facets, sort_fields, token_candidates_vec,
                              token_order, searched_queries, topster, all_result_ids, all_result_ids_len,
-                              Index::SEARCH_LIMIT_NUM, prefix);
+                              Index::SEARCH_LIMIT_NUM);

            if (all_result_ids_len >= Index::SEARCH_LIMIT_NUM) {
                // If we don't find enough results, we continue outerloop (looking at tokens with greater cost)
@ -1154,7 +1156,7 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect
 }

 void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index, const uint8_t & field_id,
-                          const uint32_t total_cost, Topster<512> & topster,
+                          const uint32_t total_cost, Topster & topster,
                          const std::vector<art_leaf *> &query_suggestion,
                          const uint32_t *result_ids, const size_t result_size) const {

--- a/test/topster_test.cpp
+++ b/test/topster_test.cpp
@ -3,7 +3,7 @@
 #include "match_score.h"

 TEST(TopsterTest, MaxIntValues) {
-    Topster<5> topster;
+    Topster topster(5);

    struct {
        uint8_t field_id;
@ -52,7 +52,7 @@ TEST(TopsterTest, MaxIntValues) {
 }

 TEST(TopsterTest, MaxFloatValues) {
-    Topster<5> topster;
+    Topster topster(5);

    struct {
        uint8_t field_id;