Results count should match group size for group query.

This commit is contained in:
kishorenc 2020-06-14 17:16:01 +05:30
parent 1c398fac7e
commit c5010a6a5f
7 changed files with 82 additions and 60 deletions

View File

@ -97,9 +97,10 @@
- ~~Have a LOG(ERROR) level~~
- ~~Handle SIGTERM which is sent when process is killed~~
- ~~Use snappy compression for storage~~
- ~~Fix exclude_scalar early returns~~
- ~~Fix result ids length during grouped overrides~~
- Fix override grouping (collate_included_ids)
- Test for overriding result on second page
- Fix exclude_scalar early returns
- Fix result ids length during grouped overrides
- at least 1 token match for proceeding with drop tokens
- support wildcard query with filters
- API for optimizing on disk storage

View File

@ -146,7 +146,7 @@ struct token_pos_cost_t {
struct facet_count_t {
uint32_t count;
spp::sparse_hash_map<uint64_t, uint32_t> groups; // used for faceting grouped results
spp::sparse_hash_set<uint64_t> groups; // used for faceting grouped results
// used to fetch the actual document and value for representation
uint32_t doc_id;

View File

@ -42,6 +42,7 @@ struct search_args {
std::vector<std::string> group_by_fields;
size_t group_limit;
size_t all_result_ids_len;
spp::sparse_hash_set<uint64_t> groups_processed;
std::vector<std::vector<art_leaf*>> searched_queries;
Topster* topster;
Topster* curated_topster;
@ -168,9 +169,9 @@ private:
const std::vector<uint32_t>& curated_ids,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
const int num_typos, std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, uint32_t** all_result_ids,
size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY,
const bool prefix = false,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const token_ordering token_order = FREQUENCY, const bool prefix = false,
const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD,
const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD);
@ -178,7 +179,8 @@ private:
const std::vector<uint32_t>& curated_ids,
const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, uint32_t** all_result_ids,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids,
size_t & all_result_ids_len,
const size_t typo_tokens_threshold);
@ -210,9 +212,9 @@ private:
void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
const uint32_t indices_length);
void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
uint64_t facet_token_hash(const field & a_field, const std::string &token);
@ -242,7 +244,9 @@ public:
Topster* topster, Topster* curated_topster,
const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold,
size_t & all_result_ids_len, std::vector<std::vector<art_leaf*>> & searched_queries,
size_t & all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed,
std::vector<std::vector<art_leaf*>> & searched_queries,
std::vector<std::vector<KV*>> & raw_result_kvs, std::vector<KV*> & override_result_kvs,
const size_t typo_tokens_threshold);
@ -257,6 +261,7 @@ public:
void score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index, const uint8_t & field_id,
const uint32_t total_cost, Topster* topster, const std::vector<art_leaf *> & query_suggestion,
spp::sparse_hash_set<uint64_t>& groups_processed,
const uint32_t *result_ids, const size_t result_size) const;
static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);

View File

@ -107,10 +107,12 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA,
size_t indexA = 0, indexB = 0, res_index = 0;
if(A == nullptr && B == nullptr) {
return 0;
*out = nullptr;
return 0;
}
if(A == nullptr) {
*out = nullptr;
return 0;
}

View File

@ -1384,8 +1384,6 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
art_fuzzy_recurse(0, 0, t->root, -1, term, term_len, irow, jrow, min_cost, max_cost, prefix, nodes);
}
PROCESS_NODES:
if(token_order == FREQUENCY) {
std::sort(nodes.begin(), nodes.end(), compare_art_node_frequency);
} else {

View File

@ -653,6 +653,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
std::vector<KV*> override_result_kvs;
size_t total_found = 0;
spp::sparse_hash_set<uint64_t> groups_processed; // used to calculate total_found for grouped query
// send data to individual index threads
size_t index_id = 0;
@ -709,28 +710,22 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
auto & acc_facet = facets[fi];
for(auto & facet_kv: this_facet.result_map) {
size_t count = 0;
// for grouping we have to aggregate group counts to a count value
/*if(search_params->group_limit) {
// for every facet
for(auto& a_facet: facets) {
// for every facet value
for(auto& fvalue: a_facet.result_map) {
fvalue.second.count = fvalue.second.groups.size();
}
}
}*/
if(acc_facet.result_map.count(facet_kv.first) == 0) {
// not found, so set it
count = facet_kv.second.count;
if(index->search_params->group_limit) {
// we have to add all group sets
acc_facet.result_map[facet_kv.first].groups.insert(
facet_kv.second.groups.begin(), facet_kv.second.groups.end()
);
} else {
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
size_t count = 0;
if(acc_facet.result_map.count(facet_kv.first) == 0) {
// not found, so set it
count = facet_kv.second.count;
} else {
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
}
acc_facet.result_map[facet_kv.first].count = count;
}
acc_facet.result_map[facet_kv.first].count = count;
acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos;
@ -744,7 +739,25 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
}
}
total_found += index->search_params->all_result_ids_len;
if(group_limit) {
groups_processed.insert(
index->search_params->groups_processed.begin(),
index->search_params->groups_processed.end()
);
} else {
total_found += index->search_params->all_result_ids_len;
}
}
// for grouping we have to aggregate group set sizes to a count value
if(group_limit) {
for(auto& acc_facet: facets) {
for(auto& facet_kv: acc_facet.result_map) {
facet_kv.second.count = facet_kv.second.groups.size();
}
}
total_found = groups_processed.size();
}
if(!index_search_op.ok()) {
@ -753,7 +766,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
Topster* aggr_topster = nullptr;
if(group_limit > 0) {
if(group_limit) {
// group by query requires another round of topster-ing
// needs to be at least 1 since scoring is mandatory

View File

@ -744,7 +744,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
if(a_facet.result_map.count(fhash) == 0) {
a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_map<uint64_t, uint32_t>(),
a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_set<uint64_t>(),
doc_seq_id, 0,
spp::sparse_hash_map<uint32_t, token_pos_cost_t>()};
}
@ -754,10 +754,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
if(search_params->group_limit) {
uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id);
if(a_facet.result_map[fhash].groups.count(distinct_id) == 0) {
a_facet.result_map[fhash].groups.emplace(distinct_id, 0);
}
a_facet.result_map[fhash].groups[distinct_id] += 1;
a_facet.result_map[fhash].groups.emplace(distinct_id);
} else {
a_facet.result_map[fhash].count += 1;
}
@ -784,6 +781,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
const std::vector<sort_by> & sort_fields,
std::vector<token_candidates> & token_candidates_vec,
std::vector<std::vector<art_leaf*>> & searched_queries, Topster* topster,
spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const size_t typo_tokens_threshold) {
const long long combination_limit = 10;
@ -850,7 +848,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
// go through each matching document id and calculate match score
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
filtered_result_ids, filtered_results_size);
groups_processed, filtered_result_ids, filtered_results_size);
delete[] filtered_result_ids;
delete[] result_ids;
@ -862,7 +860,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
*all_result_ids = new_all_result_ids;
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
result_ids, result_size);
groups_processed, result_ids, result_size);
delete[] result_ids;
}
@ -1024,7 +1022,8 @@ void Index::run_search() {
search_params->topster, search_params->curated_topster,
search_params->per_page, search_params->page, search_params->token_order,
search_params->prefix, search_params->drop_tokens_threshold,
search_params->all_result_ids_len, search_params->searched_queries,
search_params->all_result_ids_len, search_params->groups_processed,
search_params->searched_queries,
search_params->raw_result_kvs, search_params->override_result_kvs,
search_params->typo_tokens_threshold);
@ -1038,12 +1037,12 @@ void Index::run_search() {
}
}
void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster* curated_topster,
std::vector<std::vector<art_leaf*>> & searched_queries) {
void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster* curated_topster,
std::vector<std::vector<art_leaf*>> & searched_queries) {
if(included_ids.size() == 0) {
if(included_ids.empty()) {
return;
}
@ -1106,9 +1105,9 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f
KV kv(field_id, searched_queries.size(), seq_id, seq_id, match_score, scores);
curated_topster->add(&kv);
searched_queries.push_back(override_query);
}
searched_queries.push_back(override_query);
}
void Index::search(Option<uint32_t> & outcome,
@ -1124,7 +1123,8 @@ void Index::search(Option<uint32_t> & outcome,
const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold,
size_t & all_result_ids_len,
std::vector<std::vector<art_leaf*>> & searched_queries,
spp::sparse_hash_set<uint64_t>& groups_processed,
std::vector<std::vector<art_leaf*>>& searched_queries,
std::vector<std::vector<KV*>> & raw_result_kvs,
std::vector<KV*> & override_result_kvs,
const size_t typo_tokens_threshold) {
@ -1140,7 +1140,7 @@ void Index::search(Option<uint32_t> & outcome,
uint32_t filter_ids_length = op_filter_ids_length.get();
// we will be removing all curated IDs from organic results before running topster
// we will be removing all curated IDs from organic result ids before running topster
std::set<uint32_t> curated_ids(included_ids.begin(), included_ids.end());
curated_ids.insert(excluded_ids.begin(), excluded_ids.end());
@ -1165,8 +1165,8 @@ void Index::search(Option<uint32_t> & outcome,
}
score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
filter_ids, filter_ids_length);
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
groups_processed, filter_ids, filter_ids_length);
collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
all_result_ids_len = filter_ids_length;
all_result_ids = filter_ids;
@ -1180,9 +1180,9 @@ void Index::search(Option<uint32_t> & outcome,
const std::string & field = search_fields[i];
search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len,
num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len,
token_order, prefix, drop_tokens_threshold, typo_tokens_threshold);
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
collate_included_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
}
}
}
@ -1214,7 +1214,7 @@ void Index::search(Option<uint32_t> & outcome,
override_result_kvs.push_back(kv);
}
// for the ids that are dropped, remove their corresponding facet components from facet results
// add curated IDs to result count
all_result_ids_len += curated_topster->size;
delete [] filter_ids;
@ -1240,7 +1240,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
const std::vector<uint32_t>& curated_ids,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const token_ordering token_order, const bool prefix,
const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) {
std::vector<std::string> tokens;
@ -1354,7 +1355,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) {
// If all tokens were found, go ahead and search for candidates with what we have so far
search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec,
searched_queries, topster, all_result_ids, all_result_ids_len,
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
typo_tokens_threshold);
}
@ -1389,7 +1390,7 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids,
facets, sort_fields, num_typos,
searched_queries, topster, all_result_ids, all_result_ids_len,
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
token_order, prefix);
}
}
@ -1417,6 +1418,7 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect
void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
const uint8_t & field_id, const uint32_t total_cost, Topster* topster,
const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_set<uint64_t>& groups_processed,
const uint32_t *result_ids, const size_t result_size) const {
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
@ -1536,6 +1538,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
if(search_params->group_limit != 0) {
distinct_id = get_distinct_id(facet_to_id, seq_id);
groups_processed.emplace(distinct_id);
}
KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores);