Consider grouping when generating facet counts.

kishorenc 2020-06-14 08:14:03 +05:30
parent e234f23cec
commit 10c22c174a
7 changed files with 108 additions and 120 deletions

View File

@ -98,6 +98,8 @@
- ~~Handle SIGTERM which is sent when process is killed~~
- ~~Use snappy compression for storage~~
- Test for overriding result on second page
- Fix exclude_scalar early returns
- Fix result ids length during grouped overrides
- At least 1 token match for proceeding with drop tokens
- Support wildcard query with filters
- API for optimizing on disk storage

View File

@ -146,6 +146,7 @@ struct token_pos_cost_t {
struct facet_count_t {
uint32_t count;
spp::sparse_hash_map<uint64_t, uint32_t> groups; // used for faceting grouped results
// used to fetch the actual document and value for representation
uint32_t doc_id;
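
For orientation, here is a minimal sketch of how the full struct plausibly looks after this change, inferred from the aggregate initializer used later in Index::do_facets (the array_pos and query_token_pos members and the field order are assumptions based on that call site, not a verbatim copy of the header):

struct facet_count_t {
    uint32_t count;
    // group hash -> count, used for faceting grouped results
    spp::sparse_hash_map<uint64_t, uint32_t> groups;
    // used to fetch the actual document and value for representation
    uint32_t doc_id;
    uint32_t array_pos;
    // query token index -> position/cost, used for facet query highlighting
    spp::sparse_hash_map<uint32_t, token_pos_cost_t> query_token_pos;
};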

View File

@ -13,6 +13,7 @@
#include <json.hpp>
#include <field.h>
#include <option.h>
#include <set>
#include "string_utils.h"
struct token_candidates {
@ -162,10 +163,9 @@ private:
void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const uint32_t* result_ids, size_t results_size);
void drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids);
void search_field(const uint8_t & field_id, const std::string & query,
const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
const int num_typos, std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, uint32_t** all_result_ids,
@ -175,8 +175,9 @@ private:
const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD);
void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
const token_ordering token_order, std::vector<std::vector<art_leaf*>> & searched_queries,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, uint32_t** all_result_ids,
size_t & all_result_ids_len,
const size_t typo_tokens_threshold);
@ -306,5 +307,7 @@ public:
int get_bounded_typo_cost(const size_t max_cost, const size_t token_len) const;
static int64_t float_to_in64_t(float n);
uint64_t get_distinct_id(const std::unordered_map<std::string, size_t> &facet_to_id, const uint32_t seq_id) const;
};
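
To see how the changed signatures fit together, here is a condensed, hedged view of the call chain after this change (argument lists abbreviated from the call sites shown further down in this diff):

// Index::search now builds a sorted list of curated (included + excluded) IDs
// and threads it through the per-field search:
search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted,
             facets, sort_fields_std, num_typos, searched_queries, topster,
             &all_result_ids, all_result_ids_len, token_order, prefix,
             drop_tokens_threshold, typo_tokens_threshold);

// search_field forwards curated_ids to search_candidates, which strips them from
// the organic result ids via ArrayUtils::exclude_scalar before scoring:
search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields,
                  token_candidates_vec, searched_queries, topster,
                  all_result_ids, all_result_ids_len, typo_tokens_threshold);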

View File

@ -114,7 +114,7 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA,
return 0;
}
if(B == nullptr) {
if(lenB == 0 || B == nullptr) {
*out = new uint32_t[lenA];
memcpy(*out, A, lenA * sizeof(uint32_t));
return lenA;
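
The added lenB == 0 guard matters when the exclusion list is empty but its pointer is not null: the function should simply hand back a copy of A. A hedged usage sketch based on the signature above (names are illustrative):

std::vector<uint32_t> ids = {3, 7, 9};
std::vector<uint32_t> exclusions;              // nothing curated/excluded this time
uint32_t* out = nullptr;

// with the fix, an empty exclusion list takes the early return and copies A,
// instead of relying solely on the B == nullptr check
size_t out_len = ArrayUtils::exclude_scalar(ids.data(), ids.size(),
                                            exclusions.data(), exclusions.size(), &out);
// out_len == 3 and out holds {3, 7, 9}; the caller releases it with delete []
delete [] out;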

View File

@ -419,9 +419,9 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
field search_field = search_schema.at(field_name);
// if field is a string field then it must be a facet field as well
if(search_field.is_string() && !search_field.is_facet()) {
std::string error = "Field `" + field_name + "` should be a facet field.";
// must be a facet field
if(!search_field.is_facet()) {
std::string error = "Group by field `" + field_name + "` should be a facet field.";
return Option<nlohmann::json>(400, error);
}
}
@ -711,6 +711,18 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
for(auto & facet_kv: this_facet.result_map) {
size_t count = 0;
// for grouping we have to aggregate group counts to a count value
/*if(search_params->group_limit) {
// for every facet
for(auto& a_facet: facets) {
// for every facet value
for(auto& fvalue: a_facet.result_map) {
fvalue.second.count = fvalue.second.groups.size();
}
}
}*/
if(acc_facet.result_map.count(facet_kv.first) == 0) {
// not found, so set it
count = facet_kv.second.count;
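
The commented-out block above points at the intended aggregation: when grouping is on, a facet value's count is the number of distinct groups rather than the number of matching documents. A hedged sketch of that idea, assuming the requested group limit is visible at this point as a group_limit value (hypothetical name here):

if(group_limit != 0) {
    // one count per distinct group hash recorded for this facet value
    count = facet_kv.second.groups.size();
} else {
    // plain per-document count
    count = facet_kv.second.count;
}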

View File

@ -633,7 +633,7 @@ void Index::compute_facet_stats(facet &a_facet, int64_t raw_value, const std::st
void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const uint32_t* result_ids, size_t results_size) {
std::map<std::string, size_t> facet_to_index;
std::unordered_map<std::string, size_t> facet_to_index;
size_t i_facet = 0;
for(const auto & facet: facet_schema) {
@ -661,6 +661,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
std::vector<std::string> query_tokens;
StringUtils::split(facet_query.query, query_tokens, " ");
// for non-string fields, `faceted_name` returns their aliased stringified field name
art_tree *t = search_index.at(facet_field.faceted_name());
for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
@ -703,7 +704,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_id];
int array_pos = 0;
int fvalue_found = 0;
bool fvalue_found = false;
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
spp::sparse_hash_map<uint32_t, token_pos_cost_t> query_token_positions;
size_t field_token_index = -1;
@ -717,9 +718,9 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
// ftoken_hash is the raw value for numeric fields
compute_facet_stats(a_facet, ftoken_hash, facet_field.type);
// not using facet query or this particular facet value is found in facet filter
if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) {
// not using facet query or this particular facet value is found in facet filter
fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value
fvalue_found = true;
if(use_facet_query) {
// map token index to query index (used for highlighting later on)
@ -736,41 +737,38 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
}
}
//std::cout << "j: " << j << std::endl;
// 0 indicates separator, while the second condition checks for non-array string
if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && j == fhashes.size() - 1)) {
if(!use_facet_query || fvalue_found != 0) {
if(!use_facet_query || fvalue_found) {
const std::string & fvalue_str = fvaluestream.str();
uint64_t fhash = 0;
if(facet_field.is_string()) {
fhash = facet_token_hash(facet_field, fvalue_str);
} else {
fhash = std::atoi(fvalue_str.c_str());
}
uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
if(a_facet.result_map.count(fhash) == 0) {
a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0,
a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_map<uint64_t, uint32_t>(),
doc_seq_id, 0,
spp::sparse_hash_map<uint32_t, token_pos_cost_t>()};
}
a_facet.result_map[fhash].count += 1;
a_facet.result_map[fhash].doc_id = doc_seq_id;
a_facet.result_map[fhash].array_pos = array_pos;
if(search_params->group_limit) {
uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id);
if(a_facet.result_map[fhash].groups.count(distinct_id) == 0) {
a_facet.result_map[fhash].groups.emplace(distinct_id, 0);
}
a_facet.result_map[fhash].groups[distinct_id] += 1;
} else {
a_facet.result_map[fhash].count += 1;
}
if(use_facet_query) {
a_facet.result_map[fhash].query_token_pos = query_token_positions;
/*if(j == 11) {
for(auto xx: query_token_positions) {
std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl;
}
}*/
}
}
array_pos++;
fvalue_found = 0;
fvalue_found = false;
std::stringstream().swap(fvaluestream);
spp::sparse_hash_map<uint32_t, token_pos_cost_t>().swap(query_token_positions);
field_token_index = -1;
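
Since the delimiter handling above is easy to misread, here is a worked illustration of the hash layout being parsed (hash values are made up):

// string-array facet value ["charlie chaplin", "keaton"]:
//
//   fhashes = { 11, 22, 0, 33, 0 }
//               ^^  ^^     ^^
//               entry 1    entry 2
//   each array entry is followed by FACET_ARRAY_DELIMETER (0)
//
// a plain (non-array) string field is stored without a trailing 0, which is
// why a value is also closed out at the last index when
// fhashes.back() != FACET_ARRAY_DELIMETER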
@ -781,55 +779,10 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
}
}
void Index::drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids) {
std::map<std::string, size_t> facet_to_index;
size_t i_facet = 0;
for(const auto & facet: facet_schema) {
facet_to_index[facet.first] = i_facet;
i_facet++;
}
for(auto & a_facet: facets) {
const field & facet_field = facet_schema.at(a_facet.field_name);
// assumed that facet fields have already been validated upstream
for(const uint32_t doc_seq_id: ids) {
if(facet_index_v2.count(doc_seq_id) != 0) {
// FORMAT OF VALUES
// String: h1 h2 h3
// String array: h1 h2 h3 0 h1 0 h1 h2 0
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
for(size_t j = 0; j < fhashes.size(); j++) {
if(fhashes[j] != FACET_ARRAY_DELIMETER) {
int64_t ftoken_hash = fhashes[j];
fvaluestream << ftoken_hash;
}
if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER &&
j == fhashes.size() - 1)) {
const std::string & fvalue_str = fvaluestream.str();
uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
if(a_facet.result_map.count(fhash) != 0) {
a_facet.result_map[fhash].count -= 1;
if(a_facet.result_map[fhash].count == 0) {
a_facet.result_map.erase(fhash);
}
}
std::stringstream().swap(fvaluestream);
}
}
}
}
}
}
void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
const std::vector<sort_by> & sort_fields,
std::vector<token_candidates> & token_candidates_vec, const token_ordering token_order,
std::vector<token_candidates> & token_candidates_vec,
std::vector<std::vector<art_leaf*>> & searched_queries, Topster* topster,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const size_t typo_tokens_threshold) {
@ -874,6 +827,15 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
continue;
}
if(!curated_ids.empty()) {
uint32_t *excluded_result_ids = nullptr;
result_size = ArrayUtils::exclude_scalar(result_ids, result_size, &curated_ids[0],
curated_ids.size(), &excluded_result_ids);
delete [] result_ids;
result_ids = excluded_result_ids;
}
if(filter_ids != nullptr) {
// intersect once again with filter ids
uint32_t* filtered_result_ids = nullptr;
@ -1176,7 +1138,14 @@ void Index::search(Option<uint32_t> & outcome,
return ;
}
const uint32_t filter_ids_length = op_filter_ids_length.get();
uint32_t filter_ids_length = op_filter_ids_length.get();
// we will be removing all curated IDs from organic results before running topster
std::set<uint32_t> curated_ids(included_ids.begin(), included_ids.end());
curated_ids.insert(excluded_ids.begin(), excluded_ids.end());
std::vector<uint32_t> curated_ids_sorted(curated_ids.begin(), curated_ids.end());
std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end());
// Order of `fields` is used to sort results
//auto begin = std::chrono::high_resolution_clock::now();
@ -1186,11 +1155,22 @@ void Index::search(Option<uint32_t> & outcome,
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
const std::string & field = search_fields[0];
if(!curated_ids.empty()) {
uint32_t *excluded_result_ids = nullptr;
filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0],
curated_ids.size(), &excluded_result_ids);
delete [] filter_ids;
filter_ids = excluded_result_ids;
}
score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
filter_ids, filter_ids_length);
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
do_facets(facets, facet_query, filter_ids, filter_ids_length);
all_result_ids_len = filter_ids_length;
all_result_ids = filter_ids;
filter_ids = nullptr;
} else {
const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM);
for(size_t i = 0; i < num_search_fields; i++) {
@ -1199,50 +1179,33 @@ void Index::search(Option<uint32_t> & outcome,
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i); // Order of `fields` is used to sort results
const std::string & field = search_fields[i];
search_field(field_id, query, field, filter_ids, filter_ids_length, facets, sort_fields_std,
search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len,
token_order, prefix, drop_tokens_threshold, typo_tokens_threshold);
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
}
}
do_facets(facets, facet_query, all_result_ids, all_result_ids_len);
}
do_facets(facets, facet_query, all_result_ids, all_result_ids_len);
do_facets(facets, facet_query, &included_ids[0], included_ids.size());
// must be sorted before being iterated upon to remove "empty" array entries
topster->sort();
curated_topster->sort();
std::set<uint32_t> ids_to_remove(included_ids.begin(), included_ids.end());
ids_to_remove.insert(excluded_ids.begin(), excluded_ids.end());
std::vector<uint32_t> dropped_ids;
// loop through topster and remove elements from included and excluded id lists
if(topster->distinct) {
for(auto &group_topster_entry: topster->group_kv_map) {
Topster* group_topster = group_topster_entry.second;
for(uint32_t t = 0; t < group_topster->size; t++) {
KV* kv = group_topster->getKV(t);
if(ids_to_remove.count(kv->key) != 0) {
dropped_ids.push_back((uint32_t)kv->key);
}
}
const std::vector<KV*> group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size);
raw_result_kvs.emplace_back(group_kvs);
}
} else {
for(uint32_t t = 0; t < topster->size; t++) {
KV* kv = topster->getKV(t);
if(ids_to_remove.count(kv->key) != 0) {
dropped_ids.push_back((uint32_t)kv->key);
} else {
raw_result_kvs.push_back({kv});
}
raw_result_kvs.push_back({kv});
}
}
@ -1252,9 +1215,6 @@ void Index::search(Option<uint32_t> & outcome,
}
// for the ids that are dropped, remove their corresponding facet components from facet results
drop_facets(facets, dropped_ids);
all_result_ids_len -= dropped_ids.size();
all_result_ids_len += curated_topster->size;
delete [] filter_ids;
@ -1277,6 +1237,7 @@ void Index::search(Option<uint32_t> & outcome,
*/
void Index::search_field(const uint8_t & field_id, const std::string & query, const std::string & field,
uint32_t *filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
@ -1392,8 +1353,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) {
// If all tokens were found, go ahead and search for candidates with what we have so far
search_candidates(field_id, filter_ids, filter_ids_length, sort_fields, token_candidates_vec,
token_order, searched_queries, topster, all_result_ids, all_result_ids_len,
search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec,
searched_queries, topster, all_result_ids, all_result_ids_len,
typo_tokens_threshold);
}
@ -1426,7 +1387,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
truncated_query += " " + token_count_pairs.at(i).first;
}
return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,
return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids,
facets, sort_fields, num_typos,
searched_queries, topster, all_result_ids, all_result_ids_len,
token_order, prefix);
}
@ -1573,21 +1535,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
uint64_t distinct_id = seq_id;
if(search_params->group_limit != 0) {
distinct_id = 1; // some constant initial value
// calculate hash from group_by_fields
for(const auto& field: search_params->group_by_fields) {
if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) {
continue;
}
size_t facet_id = facet_to_id[field];
const std::vector<uint64_t>& fhashes = facet_index_v2.at(seq_id)[facet_id];
for(const auto& hash: fhashes) {
distinct_id = hash_combine(distinct_id, hash);
}
}
distinct_id = get_distinct_id(facet_to_id, seq_id);
}
KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores);
@ -1603,6 +1551,26 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
}
}
uint64_t Index::get_distinct_id(const std::unordered_map<std::string, size_t> &facet_to_id,
const uint32_t seq_id) const {
uint64_t distinct_id = 1; // some constant initial value
// calculate hash from group_by_fields
for(const auto& field: search_params->group_by_fields) {
if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) {
continue;
}
size_t facet_id = facet_to_id.at(field);
const std::vector<uint64_t>& fhashes = facet_index_v2.at(seq_id)[facet_id];
for(const auto& hash: fhashes) {
distinct_id = hash_combine(distinct_id, hash);
}
}
return distinct_id;
}
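
With this helper in place, score_results and do_facets derive the group key the same way. A condensed usage sketch (the facet_to_id construction mirrors the loops already shown in this file):

std::unordered_map<std::string, size_t> facet_to_id;
size_t i_facet = 0;
for(const auto& kv: facet_schema) {
    facet_to_id[kv.first] = i_facet++;
}

// documents whose group_by field hashes combine to the same value share a
// distinct_id, both for the grouped topster (group_kv_map) and for the
// per-facet-value groups map
uint64_t distinct_id = get_distinct_id(facet_to_id, seq_id);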
void Index::populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
size_t result_index,

View File

@ -219,6 +219,8 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: scott").get();
ASSERT_EQ(9, results["found"].get<size_t>());
// "count" would be `2` without exclusion
ASSERT_EQ("<mark>Scott</mark> Glenn", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>());
ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
@ -233,7 +235,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: scott").get();
ASSERT_EQ(10, results["found"].get<size_t>());
ASSERT_EQ(9, results["found"].get<size_t>());
ASSERT_EQ(0, results["hits"].size());
coll_mul_fields->remove_override("exclude-rule");