From 10c22c174aa0782f21b1cd68f4cd62ae7b4ef715 Mon Sep 17 00:00:00 2001
From: kishorenc <kishorenc@gmail.com>
Date: Sun, 14 Jun 2020 08:14:03 +0530
Subject: [PATCH] Consider grouping when generating facet counts.

---
 TODO.md                           |   2 +
 include/field.h                   |   1 +
 include/index.h                   |   9 +-
 src/array_utils.cpp               |   2 +-
 src/collection.cpp                |  18 ++-
 src/index.cpp                     | 192 +++++++++++++-----------------
 test/collection_override_test.cpp |   4 +-
 7 files changed, 108 insertions(+), 120 deletions(-)
diff --git a/TODO.md b/TODO.md
index 96f2a7e3..b031a788 100644
--- a/TODO.md
+++ b/TODO.md
@@ -98,6 +98,8 @@
 - ~~Handle SIGTERM which is sent when process is killed~~
 - ~~Use snappy compression for storage~~
 - Test for overriding result on second page
+- Fix exclude_scalar early returns
+- Fix result ids length during grouped overrides
 - atleast 1 token match for proceeding with drop tokens
 - support wildcard query with filters
 - API for optimizing on disk storage
diff --git a/include/field.h b/include/field.h
index 95c1fcbe..6255aed5 100644
--- a/include/field.h
+++ b/include/field.h
@@ -146,6 +146,7 @@ struct token_pos_cost_t {
 
 struct facet_count_t {
     uint32_t count;
+    spp::sparse_hash_map<uint64_t, uint32_t> groups;  // used for faceting grouped results
 
     // used to fetch the actual document and value for representation
     uint32_t doc_id;
diff --git a/include/index.h b/include/index.h
index b142c9a3..4baa1eed 100644
--- a/include/index.h
+++ b/include/index.h
@@ -13,6 +13,7 @@
 #include <json.hpp>
 #include <field.h>
 #include <option.h>
+#include <set>
 #include "string_utils.h"
 
 struct token_candidates {
@@ -162,10 +163,9 @@ private:
     void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                    const uint32_t* result_ids, size_t results_size);
 
-    void drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids);
-
     void search_field(const uint8_t & field_id, const std::string & query,
                       const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
+                      const std::vector<uint32_t>& curated_ids,
                       std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
                       const int num_typos, std::vector<std::vector<art_leaf*>> & searched_queries,
                       Topster* topster, uint32_t** all_result_ids,
@@ -175,8 +175,9 @@ private:
                       const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD);
 
     void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length,
+                           const std::vector<uint32_t>& curated_ids,
                            const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
-                           const token_ordering token_order, std::vector<std::vector<art_leaf*>> & searched_queries,
+                           std::vector<std::vector<art_leaf*>> & searched_queries,
                            Topster* topster, uint32_t** all_result_ids,
                            size_t & all_result_ids_len,
                            const size_t typo_tokens_threshold);
@@ -306,5 +307,7 @@ public:
     int get_bounded_typo_cost(const size_t max_cost, const size_t token_len) const;
 
     static int64_t float_to_in64_t(float n);
+
+    uint64_t get_distinct_id(const std::unordered_map<std::string, size_t> &facet_to_id, const uint32_t seq_id) const;
 };
 
diff --git a/src/array_utils.cpp b/src/array_utils.cpp
index 9d93fa46..62fe2d87 100644
--- a/src/array_utils.cpp
+++ b/src/array_utils.cpp
@@ -114,7 +114,7 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA,
     return 0;
   }
 
-  if(B == nullptr) {
+  if(lenB == 0 || B == nullptr) {
     *out = new uint32_t[lenA];
     memcpy(*out, A, lenA * sizeof(uint32_t));
     return lenA;
diff --git a/src/collection.cpp b/src/collection.cpp
index af446907..6e50fbac 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -419,9 +419,9 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
 
         field search_field = search_schema.at(field_name);
 
-        // if field is a string field then it must be a facet field as well
-        if(search_field.is_string() && !search_field.is_facet()) {
-            std::string error = "Field `" + field_name + "` should be a facet field.";
+        // must be a facet field
+        if(!search_field.is_facet()) {
+            std::string error = "Group by field `" + field_name + "` should be a facet field.";
             return Option<nlohmann::json>(400, error);
         }
     }
@@ -711,6 +711,18 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
             for(auto & facet_kv: this_facet.result_map) {
                 size_t count = 0;
 
+
+                // for grouping we have to aggregate group counts to a count value
+                /*if(search_params->group_limit) {
+                    // for every facet
+                    for(auto& a_facet: facets) {
+                        // for every facet value
+                        for(auto& fvalue: a_facet.result_map) {
+                            fvalue.second.count = fvalue.second.groups.size();
+                        }
+                    }
+                }*/
+
                 if(acc_facet.result_map.count(facet_kv.first) == 0) {
                     // not found, so set it
                     count = facet_kv.second.count;
diff --git a/src/index.cpp b/src/index.cpp
index 2fe6e5e6..5c0f8546 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -633,7 +633,7 @@ void Index::compute_facet_stats(facet &a_facet, int64_t raw_value, const std::st
 void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                       const uint32_t* result_ids, size_t results_size) {
 
-    std::map<std::string, size_t> facet_to_index;
+    std::unordered_map<std::string, size_t> facet_to_index;
 
     size_t i_facet = 0;
     for(const auto & facet: facet_schema) {
@@ -661,6 +661,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
             std::vector<std::string> query_tokens;
             StringUtils::split(facet_query.query, query_tokens, " ");
 
+            // for non-string fields, `faceted_name` returns their aliased stringified field name
             art_tree *t = search_index.at(facet_field.faceted_name());
 
             for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
@@ -703,7 +704,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                 const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_id];
 
                 int array_pos = 0;
-                int fvalue_found = 0;
+                bool fvalue_found = false;
                 std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
                 spp::sparse_hash_map<uint32_t, token_pos_cost_t> query_token_positions;
                 size_t field_token_index = -1;
@@ -717,9 +718,9 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                         // ftoken_hash is the raw value for numeric fields
                         compute_facet_stats(a_facet, ftoken_hash, facet_field.type);
 
+                        // not using facet query or this particular facet value is found in facet filter
                         if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) {
-                            // not using facet query or this particular facet value is found in facet filter
-                            fvalue_found |= 1;  // bitwise to ensure only one count for a multi-token facet value
+                            fvalue_found = true;
 
                             if(use_facet_query) {
                                 // map token index to query index (used for highlighting later on)
@@ -736,41 +737,38 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                         }
                     }
 
-                    //std::cout << "j: " << j << std::endl;
-
                     // 0 indicates separator, while the second condition checks for non-array string
                     if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && j == fhashes.size() - 1)) {
-                        if(!use_facet_query || fvalue_found != 0) {
+                        if(!use_facet_query || fvalue_found) {
                             const std::string & fvalue_str = fvaluestream.str();
-                            uint64_t fhash = 0;
-                            if(facet_field.is_string()) {
-                                fhash = facet_token_hash(facet_field, fvalue_str);
-                            } else {
-                                fhash = std::atoi(fvalue_str.c_str());
-                            }
+                            uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
 
                             if(a_facet.result_map.count(fhash) == 0) {
-                                a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0,
+                                a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_map<uint64_t, uint32_t>(),
+                                                                          doc_seq_id, 0,
                                                                           spp::sparse_hash_map<uint32_t, token_pos_cost_t>()};
                             }
 
-                            a_facet.result_map[fhash].count += 1;
                             a_facet.result_map[fhash].doc_id = doc_seq_id;
                             a_facet.result_map[fhash].array_pos = array_pos;
 
+                            if(search_params->group_limit) {
+                                uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id);
+                                if(a_facet.result_map[fhash].groups.count(distinct_id) == 0) {
+                                    a_facet.result_map[fhash].groups.emplace(distinct_id, 0);
+                                }
+                                a_facet.result_map[fhash].groups[distinct_id] += 1;
+                            } else {
+                                a_facet.result_map[fhash].count += 1;
+                            }
+
                             if(use_facet_query) {
                                 a_facet.result_map[fhash].query_token_pos = query_token_positions;
-
-                                /*if(j == 11) {
-                                    for(auto xx: query_token_positions) {
-                                        std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl;
-                                    }
-                                }*/
                             }
                         }
 
                         array_pos++;
-                        fvalue_found = 0;
+                        fvalue_found = false;
                         std::stringstream().swap(fvaluestream);
                         spp::sparse_hash_map<uint32_t, token_pos_cost_t>().swap(query_token_positions);
                         field_token_index = -1;
@@ -781,55 +779,10 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
     }
 }
 
-void Index::drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids) {
-    std::map<std::string, size_t> facet_to_index;
-
-    size_t i_facet = 0;
-    for(const auto & facet: facet_schema) {
-        facet_to_index[facet.first] = i_facet;
-        i_facet++;
-    }
-
-    for(auto & a_facet: facets) {
-        const field & facet_field = facet_schema.at(a_facet.field_name);
-
-        // assumed that facet fields have already been validated upstream
-        for(const uint32_t doc_seq_id: ids) {
-            if(facet_index_v2.count(doc_seq_id) != 0) {
-                // FORMAT OF VALUES
-                // String: h1 h2 h3
-                // String array: h1 h2 h3 0 h1 0 h1 h2 0
-                const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
-                std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
-
-                for(size_t j = 0; j < fhashes.size(); j++) {
-                    if(fhashes[j] != FACET_ARRAY_DELIMETER) {
-                        int64_t ftoken_hash = fhashes[j];
-                        fvaluestream << ftoken_hash;
-                    }
-
-                    if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER &&
-                       j == fhashes.size() - 1)) {
-                        const std::string & fvalue_str = fvaluestream.str();
-                        uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
-
-                        if(a_facet.result_map.count(fhash) != 0) {
-                            a_facet.result_map[fhash].count -= 1;
-                            if(a_facet.result_map[fhash].count == 0) {
-                                a_facet.result_map.erase(fhash);
-                            }
-                        }
-                        std::stringstream().swap(fvaluestream);
-                    }
-                }
-            }
-        }
-    }
-}
-
 void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length,
+                              const std::vector<uint32_t>& curated_ids,
                               const std::vector<sort_by> & sort_fields,
-                              std::vector<token_candidates> & token_candidates_vec, const token_ordering token_order,
+                              std::vector<token_candidates> & token_candidates_vec,
                               std::vector<std::vector<art_leaf*>> & searched_queries, Topster* topster,
                               uint32_t** all_result_ids, size_t & all_result_ids_len,
                               const size_t typo_tokens_threshold) {
@@ -874,6 +827,15 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
             continue;
         }
 
+        if(!curated_ids.empty()) {
+            uint32_t *excluded_result_ids = nullptr;
+            result_size = ArrayUtils::exclude_scalar(result_ids, result_size, &curated_ids[0],
+                                                     curated_ids.size(), &excluded_result_ids);
+
+            delete [] result_ids;
+            result_ids = excluded_result_ids;
+        }
+
         if(filter_ids != nullptr) {
             // intersect once again with filter ids
             uint32_t* filtered_result_ids = nullptr;
@@ -1176,7 +1138,14 @@ void Index::search(Option<uint32_t> & outcome,
         return ;
     }
 
-    const uint32_t filter_ids_length = op_filter_ids_length.get();
+    uint32_t filter_ids_length = op_filter_ids_length.get();
+
+    // we will be removing all curated IDs from organic results before running topster
+    std::set<uint32_t> curated_ids(included_ids.begin(), included_ids.end());
+    curated_ids.insert(excluded_ids.begin(), excluded_ids.end());
+
+    std::vector<uint32_t> curated_ids_sorted(curated_ids.begin(), curated_ids.end());
+    std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end());
 
     // Order of `fields` are used to sort results
     //auto begin = std::chrono::high_resolution_clock::now();
@@ -1186,11 +1155,22 @@ void Index::search(Option<uint32_t> & outcome,
         const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
         const std::string & field = search_fields[0];
 
+        if(!curated_ids.empty()) {
+            uint32_t *excluded_result_ids = nullptr;
+            filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0],
+                                                     curated_ids.size(), &excluded_result_ids);
+
+            delete [] filter_ids;
+            filter_ids = excluded_result_ids;
+        }
+
         score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
                       filter_ids, filter_ids_length);
         collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
-        do_facets(facets, facet_query, filter_ids, filter_ids_length);
+
         all_result_ids_len = filter_ids_length;
+        all_result_ids = filter_ids;
+        filter_ids = nullptr;
     } else {
         const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM);
         for(size_t i = 0; i < num_search_fields; i++) {
@@ -1199,50 +1179,33 @@ void Index::search(Option<uint32_t> & outcome,
                 const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i); // Order of `fields` are used to sort results
                 const std::string & field = search_fields[i];
 
-                search_field(field_id, query, field, filter_ids, filter_ids_length, facets, sort_fields_std,
+                search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
                              num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len,
                              token_order, prefix, drop_tokens_threshold, typo_tokens_threshold);
                 collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
             }
         }
-        do_facets(facets, facet_query, all_result_ids, all_result_ids_len);
     }
 
+    do_facets(facets, facet_query, all_result_ids, all_result_ids_len);
     do_facets(facets, facet_query, &included_ids[0], included_ids.size());
 
     // must be sorted before iterated upon to remove "empty" array entries
     topster->sort();
     curated_topster->sort();
 
-    std::set<uint32_t> ids_to_remove(included_ids.begin(), included_ids.end());
-    ids_to_remove.insert(excluded_ids.begin(), excluded_ids.end());
-
-    std::vector<uint32_t> dropped_ids;
-
     // loop through topster and remove elements from included and excluded id lists
 
     if(topster->distinct) {
         for(auto &group_topster_entry: topster->group_kv_map) {
             Topster* group_topster = group_topster_entry.second;
-            for(uint32_t t = 0; t < group_topster->size; t++) {
-                KV* kv = group_topster->getKV(t);
-                if(ids_to_remove.count(kv->key) != 0) {
-                    dropped_ids.push_back((uint32_t)kv->key);
-                }
-            }
-
             const std::vector<KV*> group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size);
             raw_result_kvs.emplace_back(group_kvs);
         }
     } else {
         for(uint32_t t = 0; t < topster->size; t++) {
             KV* kv = topster->getKV(t);
-
-            if(ids_to_remove.count(kv->key) != 0) {
-                dropped_ids.push_back((uint32_t)kv->key);
-            } else {
-                raw_result_kvs.push_back({kv});
-            }
+            raw_result_kvs.push_back({kv});
         }
     }
 
@@ -1252,9 +1215,6 @@ void Index::search(Option<uint32_t> & outcome,
     }
 
     // for the ids that are dropped, remove their corresponding facet components from facet results
-    drop_facets(facets, dropped_ids);
-
-    all_result_ids_len -= dropped_ids.size();
     all_result_ids_len += curated_topster->size;
 
     delete [] filter_ids;
@@ -1277,6 +1237,7 @@ void Index::search(Option<uint32_t> & outcome,
 */
 void Index::search_field(const uint8_t & field_id, const std::string & query, const std::string & field,
                          uint32_t *filter_ids, size_t filter_ids_length,
+                         const std::vector<uint32_t>& curated_ids,
                          std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
                          std::vector<std::vector<art_leaf*>> & searched_queries,
                          Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
@@ -1392,8 +1353,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
 
         if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) {
             // If all tokens were found, go ahead and search for candidates with what we have so far
-            search_candidates(field_id, filter_ids, filter_ids_length, sort_fields, token_candidates_vec,
-                              token_order, searched_queries, topster, all_result_ids, all_result_ids_len,
+            search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec,
+                              searched_queries, topster, all_result_ids, all_result_ids_len,
                               typo_tokens_threshold);
         }
 
@@ -1426,7 +1387,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
             truncated_query += " " + token_count_pairs.at(i).first;
         }
 
-        return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,
+        return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids,
+                            facets, sort_fields, num_typos,
                             searched_queries, topster, all_result_ids, all_result_ids_len,
                             token_order, prefix);
     }
@@ -1573,21 +1535,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
         uint64_t distinct_id = seq_id;
 
         if(search_params->group_limit != 0) {
-            distinct_id = 1; // some constant initial value
-
-            // calculate hash from group_by_fields
-            for(const auto& field: search_params->group_by_fields) {
-                if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) {
-                    continue;
-                }
-
-                size_t facet_id = facet_to_id[field];
-                const std::vector<uint64_t>& fhashes = facet_index_v2.at(seq_id)[facet_id];
-
-                for(const auto& hash: fhashes) {
-                    distinct_id = hash_combine(distinct_id, hash);
-                }
-            }
+            distinct_id = get_distinct_id(facet_to_id, seq_id);
         }
 
         KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores);
@@ -1603,6 +1551,26 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
     }
 }
 
+uint64_t Index::get_distinct_id(const std::unordered_map<std::string, size_t> &facet_to_id,
+                                const uint32_t seq_id) const {
+    uint64_t distinct_id = 1; // some constant initial value
+
+    // calculate hash from group_by_fields
+    for(const auto& field: search_params->group_by_fields) {
+        if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) {
+            continue;
+        }
+
+        size_t facet_id = facet_to_id.at(field);
+        const std::vector<uint64_t>& fhashes = facet_index_v2.at(seq_id)[facet_id];
+
+        for(const auto& hash: fhashes) {
+            distinct_id = hash_combine(distinct_id, hash);
+        }
+    }
+    return distinct_id;
+}
+
 void Index::populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
                                      spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
                                      size_t result_index,
diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp
index a94bfbc0..49824142 100644
--- a/test/collection_override_test.cpp
+++ b/test/collection_override_test.cpp
@@ -219,6 +219,8 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
                                       spp::sparse_hash_set<std::string>(),
                                       spp::sparse_hash_set<std::string>(), 10, "starring: scott").get();
 
+    ASSERT_EQ(9, results["found"].get<size_t>());
+
     // "count" would be `2` without exclusion
     ASSERT_EQ("<mark>Scott</mark> Glenn", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>());
     ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
@@ -233,7 +235,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
                                       spp::sparse_hash_set<std::string>(),
                                       spp::sparse_hash_set<std::string>(), 10, "starring: scott").get();
 
-    ASSERT_EQ(10, results["found"].get<size_t>());
+    ASSERT_EQ(9, results["found"].get<size_t>());
     ASSERT_EQ(0, results["hits"].size());
 
     coll_mul_fields->remove_override("exclude-rule");