Facet implementation.

This commit is contained in:
Kishore Nallan 2017-03-12 21:30:51 +05:30
parent 96921be016
commit 4776b41dc1
11 changed files with 239 additions and 85 deletions

View File

@ -19,8 +19,8 @@
- ~~Fix documents.jsonl path in tests~~
- ~~Multi field search tests~~
- ~~storage key prefix should include collection name~~
- Index and search on multi-valued field
- range search for art_int
- ~~Index and search on multi-valued field~~
- ~~range search for art_int~~
- Proper score field for ranking tokens
- Support nested fields via "."
- ~~Restore records as well on restart (like for meta)~~
@ -31,8 +31,8 @@
- ~~Assumption that all tokens match for scoring is no longer true~~
- Handle searching for non-existing fields gracefully
- Intersection without unpacking
- Filters
- Facets
- ~~Filters~~
- ~~Facets~~
- Iterator
- Highlight
- Support search operators like +, - etc.

View File

@ -248,7 +248,7 @@ int art_iter_prefix(art_tree *t, const unsigned char *prefix, int prefix_len, ar
int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost,
const int max_words, const token_ordering token_order, const bool prefix, std::vector<art_leaf *> &results);
static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
std::vector<art_leaf *> &results);
void encode_int32(int32_t n, unsigned char *chars);

View File

@ -44,12 +44,14 @@ private:
Option<uint32_t> do_filtering(uint32_t** filter_ids_out, const std::string & simple_filter_str);
void search(uint32_t* filter_ids, size_t filter_ids_length, std::string & query, const std::string & field,
const int num_typos, const size_t num_results, Topster<100> & topster, size_t & num_found,
const token_ordering token_order = FREQUENCY, const bool prefix = false);
void do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size);
void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, int & token_rank,
std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
void search(uint32_t *filter_ids, size_t filter_ids_length, std::vector<facet> &facets, std::string &query,
const std::string &field, const int num_typos, const size_t num_results, Topster<100> &topster,
size_t &num_found, const token_ordering token_order = FREQUENCY, const bool prefix = false);
void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
size_t & total_results, size_t & num_found, const size_t & max_results);
void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id) const;
@ -93,9 +95,9 @@ public:
std::string add(std::string json_str);
nlohmann::json search(std::string query, const std::vector<std::string> fields, const std::string & simple_filter_str,
const int num_typos, const size_t num_results, const token_ordering token_order = FREQUENCY,
const bool prefix = false);
nlohmann::json search(std::string query, const std::vector<std::string> fields, const std::string &simple_filter_query,
std::vector<facet> & facets, const int num_typos, const size_t num_results,
const token_ordering token_order = FREQUENCY, const bool prefix = false);
void remove(std::string id);

View File

@ -62,4 +62,13 @@ struct filter {
return Option<NUM_COMPARATOR>(400, "Numerical field has an invalid comparator.");
}
};
// Represents a single facet request and its aggregated result:
// `field_name` identifies the schema field to facet on (validated upstream to be
// a STRING or STRING_ARRAY field), while `result_map` accumulates
// facet-value -> document-count entries as matching results are processed.
struct facet {
    const std::string field_name;              // field being faceted on (immutable for the facet's lifetime)
    std::map<std::string, size_t> result_map;  // facet value -> number of matching documents

    // Take the name by const reference: the original signature passed
    // `const std::string` by value, which copied the string into the
    // parameter before copying it again into the member.
    facet(const std::string & field_name): field_name(field_name) {
    }
};

View File

@ -900,7 +900,7 @@ static uint32_t get_score(art_node* child) {
return child->max_token_count;
}
static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
std::vector<art_leaf *> &results) {
printf("INSIDE art_topk_iter: root->type: %d\n", root->type);

View File

@ -195,9 +195,40 @@ void Collection::index_int64_array_field(const std::vector<int64_t> & values, co
}
}
void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, int & token_rank,
std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
size_t & total_results, size_t & num_found, const size_t & max_results) {
// Computes facet counts for each requested facet over the given result set.
// For every facet field, walks that field's art index, pulls the top leaves,
// and intersects each leaf's document-id list with `result_ids`; the size of
// each intersection becomes the count for that facet value in `result_map`.
// NOTE(review): only the top 10 leaves (by MAX_SCORE) are fetched, so facet
// values outside that top-10 are silently omitted — confirm this is intended.
void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size) {
for(auto & a_facet: facets) {
// assumed that facet fields have already been validated upstream
const field & facet_field = schema.at(a_facet.field_name);
// loop through the field, get all keys and intersect those ids with result ids
if(index_map.count(facet_field.name) != 0) {
art_tree *t = index_map.at(facet_field.name);
std::vector<art_leaf *> leaves;
art_topk_iter(t->root, MAX_SCORE, 10, leaves);
for(const art_leaf* leaf: leaves) {
// assumes uncompress() returns a heap array owned by the caller — freed below; TODO confirm
const uint32_t* facet_ids = leaf->values->ids.uncompress();
size_t facet_ids_size = leaf->values->ids.getLength();
// intersection can never exceed the smaller of the two input lists
uint32_t* facet_results = new uint32_t[std::min(facet_ids_size, results_size)];
const size_t facet_results_size = Intersection::scalar(result_ids, results_size,
facet_ids, facet_ids_size, facet_results);
const std::string facet_value((const char *)leaf->key, leaf->key_len-1); // drop trailing null
// insert() leaves an existing entry untouched — leaf keys are presumably unique, so no aggregation is needed; verify
a_facet.result_map.insert(std::pair<std::string, size_t>(facet_value, facet_results_size));
delete [] facet_ids;
delete [] facet_results;
}
}
}
}
void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves,
Topster<100> & topster, size_t & total_results, size_t & num_found,
const size_t & max_results) {
const size_t combination_limit = 10;
auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product);
@ -227,6 +258,8 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
size_t filtered_results_size =
Intersection::scalar(filter_ids, filter_ids_length, result_ids, result_size, filtered_result_ids);
do_facets(facets, filtered_result_ids, filtered_results_size);
// go through each matching document id and calculate match score
score_results(topster, token_rank, query_suggestion, filtered_result_ids, filtered_results_size);
num_found += filtered_results_size;
@ -234,6 +267,8 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
delete[] filtered_result_ids;
delete[] result_ids;
} else {
do_facets(facets, result_ids, result_size);
score_results(topster, token_rank, query_suggestion, result_ids, result_size);
num_found += result_size;
delete[] result_ids;
@ -391,15 +426,28 @@ Option<uint32_t> Collection::do_filtering(uint32_t** filter_ids_out, const std::
}
nlohmann::json Collection::search(std::string query, const std::vector<std::string> fields,
const std::string & simple_filter_str,
const int num_typos, const size_t num_results,
const token_ordering token_order, const bool prefix) {
const std::string & simple_filter_query, std::vector<facet> & facets,
const int num_typos, const size_t num_results, const token_ordering token_order,
const bool prefix) {
size_t num_found = 0;
nlohmann::json result = nlohmann::json::object();
// validate facet fields
for(const facet & a_facet: facets) {
if(schema.count(a_facet.field_name) == 0) {
result["error"] = "Could not find a facet field named `" + a_facet.field_name + "` in the schema.";
return result;
}
field facet_field = schema.at(a_facet.field_name);
if(facet_field.type != field_types::STRING && facet_field.type != field_types::STRING_ARRAY) {
result["error"] = "Facet field `" + a_facet.field_name + "` should be a string or a string array.";
return result;
}
}
// process the filters first
uint32_t* filter_ids = nullptr;
Option<uint32_t> op_filter_ids_length = do_filtering(&filter_ids, simple_filter_str);
Option<uint32_t> op_filter_ids_length = do_filtering(&filter_ids, simple_filter_query);
if(!op_filter_ids_length.ok()) {
result["error"] = op_filter_ids_length.error();
return result;
@ -415,9 +463,9 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
Topster<100> topster;
const std::string & field = fields[i];
// proceed to query search only when no filters are provided or when filtering produces results
if(simple_filter_str.size() == 0 || filter_ids_length > 0) {
search(filter_ids, filter_ids_length, query, field, num_typos, num_results,
topster, num_found, token_order, prefix);
if(simple_filter_query.size() == 0 || filter_ids_length > 0) {
search(filter_ids, filter_ids_length, facets, query, field, num_typos, num_results, topster, num_found,
token_order, prefix);
topster.sort();
}
@ -449,6 +497,16 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
result["found"] = num_found;
result["facets"] = nlohmann::json::array();
// populate facets
for(const facet & a_facet: facets) {
nlohmann::json facet_result = nlohmann::json::object();
facet_result["field_name"] = a_facet.field_name;
facet_result["counts"] = a_facet.result_map;
result["facets"].push_back(facet_result);
}
long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
//!std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
//!store->print_memory_usage();
@ -464,9 +522,9 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
4. Intersect the lists to find docs that match each phrase
5. Sort the docs based on some ranking criteria
*/
void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::string & query,
const std::string & field, const int num_typos, const size_t num_results,
Topster<100> & topster, size_t & num_found, const token_ordering token_order, const bool prefix) {
void Collection::search(uint32_t *filter_ids, size_t filter_ids_length, std::vector<facet> &facets, std::string &query,
const std::string &field, const int num_typos, const size_t num_results, Topster<100> &topster,
size_t &num_found, const token_ordering token_order, const bool prefix) {
std::vector<std::string> tokens;
StringUtils::tokenize(query, tokens, " ", true);
@ -527,8 +585,14 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str
leaves = token_cost_cache[token_cost_hash];
} else {
int token_len = prefix ? (int) token.length() : (int) token.length() + 1;
if(token_rank == 2) {
std::cout << "\n";
}
art_fuzzy_search(index_map.at(field), (const unsigned char *) token.c_str(), token_len,
costs[token_index], costs[token_index], 3, token_order, prefix, leaves);
if(!leaves.empty()) {
token_cost_cache.emplace(token_cost_hash, leaves);
}
@ -562,9 +626,8 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str
}
if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) {
// If a) all tokens were found, or b) Some were skipped because they don't exist within max_cost,
// go ahead and search for candidates with what we have so far
search_candidates(filter_ids, filter_ids_length, token_rank, token_leaves, topster,
// If all tokens were found, go ahead and search for candidates with what we have so far
search_candidates(filter_ids, filter_ids_length, facets, token_rank, token_leaves, topster,
total_results, num_found, max_results);
if (total_results >= max_results) {
@ -598,7 +661,7 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str
}
}
return search(filter_ids, filter_ids_length, truncated_query, field, num_typos, num_results, topster,
return search(filter_ids, filter_ids_length, facets, truncated_query, field, num_typos, num_results, topster,
num_found, token_order, prefix);
}
}
@ -614,7 +677,7 @@ void Collection::log_leaves(const int cost, const std::string &token, const std:
}
}
void Collection::score_results(Topster<100> &topster, const int & token_rank,
void Collection::score_results(Topster<100> & topster, const int & token_rank,
const std::vector<art_leaf *> &query_suggestion, const uint32_t *result_ids,
const size_t result_size) const {

View File

@ -48,7 +48,7 @@ int main(int argc, char* argv[]) {
while(counter < 3000) {
auto i = counter % 5;
auto results = collection->search(queries[i], search_fields, {}, 1, 100);
auto results = collection->search(queries[i], search_fields, {}, { }, 1, 100, MAX_SCORE, 0);
results_total += results.size();
counter++;
}

View File

@ -71,7 +71,7 @@ int main(int argc, char* argv[]) {
auto begin = std::chrono::high_resolution_clock::now();
std::vector<std::string> search_fields = {"title"};
collection->search("the", search_fields, {}, 1, 100);
collection->search("the", search_fields, {}, { }, 1, 100, MAX_SCORE, 0);
long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
cout << "Time taken: " << timeMillis << "us" << endl;
return 0;

View File

@ -99,8 +99,8 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) {
std::vector<std::string> search_fields = {"title"};
nlohmann::json result = collection->search(query_map["q"], search_fields, filter_str, std::stoi(query_map[NUM_TYPOS]),
100, token_order, false);
nlohmann::json result = collection->search(query_map["q"], search_fields, filter_str, { },
std::stoi(query_map[NUM_TYPOS]), 100, token_order, false);
std::string json_str = result.dump();
//std::cout << "JSON:" << json_str << std::endl;
struct rusage r_usage;

View File

@ -47,7 +47,9 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
infile.close();
std::vector<std::string> search_fields = {"starring", "title"};
nlohmann::json results = collection1->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
std::vector<facet> facets;
nlohmann::json results = collection1->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
spp::sparse_hash_map<std::string, field> schema = collection1->get_schema();
@ -64,7 +66,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
ASSERT_EQ(rank_fields, collection1->get_rank_fields());
ASSERT_EQ(schema.size(), collection1->get_schema().size());
results = collection1->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
results = collection1->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
}

View File

@ -54,7 +54,8 @@ protected:
};
TEST_F(CollectionTest, ExactSearchShouldBeStable) {
nlohmann::json results = collection->search("the", search_fields, {}, 0, 10);
std::vector<facet> facets;
nlohmann::json results = collection->search("the", search_fields, "", facets, 0, 10);
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(7, results["found"].get<int>());
@ -70,7 +71,8 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
}
TEST_F(CollectionTest, ExactPhraseSearch) {
nlohmann::json results = collection->search("rocket launch", search_fields, {}, 0, 10);
std::vector<facet> facets;
nlohmann::json results = collection->search("rocket launch", search_fields, "", facets, 0, 10);
ASSERT_EQ(5, results["hits"].size());
/*
@ -92,7 +94,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
}
// Check pagination
results = collection->search("rocket launch", search_fields, {}, 0, 3);
results = collection->search("rocket launch", search_fields, "", facets, 0, 3);
ASSERT_EQ(3, results["hits"].size());
for(size_t i = 0; i < 3; i++) {
nlohmann::json result = results["hits"].at(i);
@ -104,7 +106,8 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
// Tokens that are not found in the index should be skipped
nlohmann::json results = collection->search("DoesNotExist from", search_fields, {}, 0, 10);
std::vector<facet> facets;
nlohmann::json results = collection->search("DoesNotExist from", search_fields, "", facets, 0, 10);
ASSERT_EQ(2, results["hits"].size());
std::vector<std::string> ids = {"2", "17"};
@ -117,7 +120,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
}
// with non-zero cost
results = collection->search("DoesNotExist from", search_fields, {}, 1, 10);
results = collection->search("DoesNotExist from", search_fields, "", facets, 1, 10);
ASSERT_EQ(2, results["hits"].size());
for(size_t i = 0; i < results["hits"].size(); i++) {
@ -128,7 +131,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
}
// with 2 indexed words
results = collection->search("from DoesNotExist insTruments", search_fields, {}, 1, 10);
results = collection->search("from DoesNotExist insTruments", search_fields, "", facets, 1, 10);
ASSERT_EQ(2, results["hits"].size());
ids = {"2", "17"};
@ -140,16 +143,17 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
}
results.clear();
results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 0, 10);
results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, "", facets, 0, 10);
ASSERT_EQ(0, results["hits"].size());
results.clear();
results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 2, 10);
results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, "", facets, 2, 10);
ASSERT_EQ(0, results["hits"].size());
}
TEST_F(CollectionTest, PartialPhraseSearch) {
nlohmann::json results = collection->search("rocket research", search_fields, {}, 0, 10);
std::vector<facet> facets;
nlohmann::json results = collection->search("rocket research", search_fields, "", facets, 0, 10);
ASSERT_EQ(4, results["hits"].size());
std::vector<std::string> ids = {"1", "8", "16", "17"};
@ -163,7 +167,8 @@ TEST_F(CollectionTest, PartialPhraseSearch) {
}
TEST_F(CollectionTest, QueryWithTypo) {
nlohmann::json results = collection->search("kind biologcal", search_fields, {}, 2, 3);
std::vector<facet> facets;
nlohmann::json results = collection->search("kind biologcal", search_fields, "", facets, 2, 3);
ASSERT_EQ(3, results["hits"].size());
std::vector<std::string> ids = {"19", "20", "21"};
@ -176,7 +181,7 @@ TEST_F(CollectionTest, QueryWithTypo) {
}
results.clear();
results = collection->search("fer thx", search_fields, {}, 1, 3);
results = collection->search("fer thx", search_fields, "", facets, 1, 3);
ids = {"1", "10", "13"};
ASSERT_EQ(3, results["hits"].size());
@ -190,7 +195,8 @@ TEST_F(CollectionTest, QueryWithTypo) {
}
TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
nlohmann::json results = collection->search("loox", search_fields, {}, 1, 2, MAX_SCORE, false);
std::vector<facet> facets;
nlohmann::json results = collection->search("loox", search_fields, "", facets, 1, 2, MAX_SCORE, false);
ASSERT_EQ(2, results["hits"].size());
std::vector<std::string> ids = {"22", "23"};
@ -201,7 +207,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = collection->search("loox", search_fields, {}, 1, 3, FREQUENCY, false);
results = collection->search("loox", search_fields, "", facets, 1, 3, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"3", "12", "24"};
@ -213,19 +219,19 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
}
// Check pagination
results = collection->search("loox", search_fields, {}, 1, 1, FREQUENCY, false);
results = collection->search("loox", search_fields, "", facets, 1, 1, FREQUENCY, false);
ASSERT_EQ(3, results["found"].get<int>());
ASSERT_EQ(1, results["hits"].size());
std::string solo_id = results["hits"].at(0)["id"];
ASSERT_STREQ("3", solo_id.c_str());
results = collection->search("loox", search_fields, {}, 1, 2, FREQUENCY, false);
results = collection->search("loox", search_fields, "", facets, 1, 2, FREQUENCY, false);
ASSERT_EQ(3, results["found"].get<int>());
ASSERT_EQ(2, results["hits"].size());
// Check total ordering
results = collection->search("loox", search_fields, {}, 1, 10, FREQUENCY, false);
results = collection->search("loox", search_fields, "", facets, 1, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
ids = {"3", "12", "24", "22", "23"};
@ -236,7 +242,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = collection->search("loox", search_fields, {}, 1, 10, MAX_SCORE, false);
results = collection->search("loox", search_fields, "", facets, 1, 10, MAX_SCORE, false);
ASSERT_EQ(5, results["hits"].size());
ids = {"22", "23", "3", "12", "24"};
@ -250,7 +256,8 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
TEST_F(CollectionTest, TextContainingAnActualTypo) {
// A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
nlohmann::json results = collection->search("ISX what", search_fields, {}, 1, 4, FREQUENCY, false);
std::vector<facet> facets;
nlohmann::json results = collection->search("ISX what", search_fields, "", facets, 1, 4, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
std::vector<std::string> ids = {"19", "6", "21", "8"};
@ -263,7 +270,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
}
// Record containing exact token match should appear first
results = collection->search("ISX", search_fields, {}, 1, 10, FREQUENCY, false);
results = collection->search("ISX", search_fields, "", facets, 1, 10, FREQUENCY, false);
ASSERT_EQ(8, results["hits"].size());
ids = {"20", "19", "6", "3", "21", "4", "10", "8"};
@ -277,7 +284,8 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
}
TEST_F(CollectionTest, PrefixSearching) {
nlohmann::json results = collection->search("ex", search_fields, {}, 0, 10, FREQUENCY, true);
std::vector<facet> facets;
nlohmann::json results = collection->search("ex", search_fields, "", facets, 0, 10, FREQUENCY, true);
ASSERT_EQ(2, results["hits"].size());
std::vector<std::string> ids = {"12", "6"};
@ -288,7 +296,7 @@ TEST_F(CollectionTest, PrefixSearching) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = collection->search("ex", search_fields, {}, 0, 10, MAX_SCORE, true);
results = collection->search("ex", search_fields, "", facets, 0, 10, MAX_SCORE, true);
ASSERT_EQ(2, results["hits"].size());
ids = {"6", "12"};
@ -322,7 +330,8 @@ TEST_F(CollectionTest, MultipleFields) {
infile.close();
search_fields = {"title", "starring"};
nlohmann::json results = coll_mul_fields->search("Will", search_fields, {}, 0, 10, FREQUENCY, false);
std::vector<facet> facets;
nlohmann::json results = coll_mul_fields->search("Will", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
std::vector<std::string> ids = {"3", "2", "1", "0"};
@ -337,7 +346,7 @@ TEST_F(CollectionTest, MultipleFields) {
// when "starring" takes higher priority than "title"
search_fields = {"starring", "title"};
results = coll_mul_fields->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"15", "14", "12", "13"};
@ -350,11 +359,11 @@ TEST_F(CollectionTest, MultipleFields) {
}
search_fields = {"starring", "title", "cast"};
results = coll_mul_fields->search("ben affleck", search_fields, {}, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("ben affleck", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
search_fields = {"cast"};
results = coll_mul_fields->search("chris", search_fields, {}, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("chris", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"6", "1", "7"};
@ -366,7 +375,7 @@ TEST_F(CollectionTest, MultipleFields) {
}
search_fields = {"cast"};
results = coll_mul_fields->search("chris pine", search_fields, {}, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("chris pine", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"7", "6", "1"};
@ -402,7 +411,8 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
// Plain search with no filters - results should be sorted by rank fields
search_fields = {"name"};
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", 0, 10, FREQUENCY, false);
std::vector<facet> facets;
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
std::vector<std::string> ids = {"3", "1", "4", "0", "2"};
@ -415,7 +425,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// Searching on an int32 field
results = coll_array_fields->search("Jeremy", search_fields, "age:>24", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age:>24", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"3", "1", "4"};
@ -427,14 +437,14 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = coll_array_fields->search("Jeremy", search_fields, "age:>=24", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age:>=24", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
results = coll_array_fields->search("Jeremy", search_fields, "age:24", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age:24", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
// Searching a number against an int32 array field
results = coll_array_fields->search("Jeremy", search_fields, "years:>2002", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "years:>2002", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"1", "0", "2"};
@ -445,7 +455,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = coll_array_fields->search("Jeremy", search_fields, "years:<1989", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "years:<1989", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
ids = {"3"};
@ -457,7 +467,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// multiple filters
results = coll_array_fields->search("Jeremy", search_fields, "years:<2005 && years:>1987", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "years:<2005 && years:>1987", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
ids = {"4"};
@ -469,7 +479,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// multiple search values (works like SQL's IN operator) against a single int field
results = coll_array_fields->search("Jeremy", search_fields, "age:[21, 24, 63]", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age:[21, 24, 63]", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"3", "0", "2"};
@ -481,7 +491,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// multiple search values against an int32 array field - also use extra padding between symbols
results = coll_array_fields->search("Jeremy", search_fields, "years : [ 2015, 1985 , 1999]", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "years : [ 2015, 1985 , 1999]", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"3", "1", "4", "0"};
@ -493,7 +503,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// searching on an int64 array field - also ensure that padded space causes no issues
results = coll_array_fields->search("Jeremy", search_fields, "timestamps : > 475205222", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "timestamps : > 475205222", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"1", "4", "0", "2"};
@ -506,7 +516,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// when filters don't match any record, no results should be returned
results = coll_array_fields->search("Jeremy", search_fields, "timestamps:<1", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "timestamps:<1", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
collectionManager.drop_collection("coll_array_fields");
@ -535,7 +545,8 @@ TEST_F(CollectionTest, FilterOnTextFields) {
infile.close();
search_fields = {"name"};
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tags: gold", 0, 10, FREQUENCY, false);
std::vector<facet> facets;
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tags: gold", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
std::vector<std::string> ids = {"1", "4", "0", "2"};
@ -547,7 +558,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = coll_array_fields->search("Jeremy", search_fields, "tags : bronze", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "tags : bronze", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(2, results["hits"].size());
ids = {"4", "2"};
@ -560,7 +571,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
}
// search with a list of tags, also testing extra padding of space
results = coll_array_fields->search("Jeremy", search_fields, "tags: [bronze, silver]", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "tags: [bronze, silver]", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"3", "4", "0", "2"};
@ -573,7 +584,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
}
// should be exact matches (no normalization or fuzzy searching should happen)
results = coll_array_fields->search("Jeremy", search_fields, "tags: BRONZE", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "tags: BRONZE", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
collectionManager.drop_collection("coll_array_fields");
@ -604,28 +615,95 @@ TEST_F(CollectionTest, HandleBadlyFormedFilterQuery) {
infile.close();
search_fields = {"name"};
std::vector<facet> facets;
// when filter field does not exist in the schema
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tagzz: gold", 0, 10, FREQUENCY, false);
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tagzz: gold", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// searching using a string for a numeric field
results = coll_array_fields->search("Jeremy", search_fields, "age: abcdef", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age: abcdef", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// searching using a string for a numeric array field
results = coll_array_fields->search("Jeremy", search_fields, "timestamps: abcdef", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "timestamps: abcdef", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// malformed k:v syntax
results = coll_array_fields->search("Jeremy", search_fields, "timestamps abcdef", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "timestamps abcdef", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// just empty spaces
results = coll_array_fields->search("Jeremy", search_fields, " ", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, " ", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// wrapping number with quotes
results = coll_array_fields->search("Jeremy", search_fields, "age: '21'", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age: '21'", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
}
// Verifies facet counting end-to-end: indexes numeric_array_documents.jsonl,
// then checks facet value counts (a) with a single string-array facet and no
// filter, (b) with two facets including a tokenized string field, and
// (c) with a facet combined with a numeric filter that narrows the result set.
TEST_F(CollectionTest, FacetCounts) {
Collection *coll_array_fields;
std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl");
std::vector<field> fields = {field("name", field_types::STRING), field("age", field_types::INT32),
field("years", field_types::INT32_ARRAY),
field("timestamps", field_types::INT64_ARRAY),
field("tags", field_types::STRING_ARRAY)};
std::vector<std::string> rank_fields = {"age"};
// reuse the collection if an earlier test already created it
coll_array_fields = collectionManager.get_collection("coll_array_fields");
if(coll_array_fields == nullptr) {
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, rank_fields);
}
// index every line of the fixture file as one document
std::string json_line;
while (std::getline(infile, json_line)) {
coll_array_fields->add(json_line);
}
infile.close();
search_fields = {"name"};
std::vector<facet> facets = {facet("tags")};
// single facet with no filters
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facets"].size());
// each facet object carries exactly two keys: field_name and counts
ASSERT_EQ(2, results["facets"][0].size());
ASSERT_EQ("tags", results["facets"][0]["field_name"]);
ASSERT_EQ(4, (int) results["facets"][0]["counts"]["gold"]);
ASSERT_EQ(3, (int) results["facets"][0]["counts"]["silver"]);
ASSERT_EQ(2, (int) results["facets"][0]["counts"]["bronze"]);
// two facets (tags + the plain-string "name" field), still with no filter query
facets.clear();
facets.push_back(facet("tags"));
facets.push_back(facet("name"));
results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(2, results["facets"].size());
ASSERT_EQ("tags", results["facets"][0]["field_name"]);
ASSERT_EQ("name", results["facets"][1]["field_name"]);
// text is tokenized and standardized
ASSERT_EQ(5, (int) results["facets"][1]["counts"]["howard"]);
ASSERT_EQ(5, (int) results["facets"][1]["counts"]["jeremy"]);
// facet with filters
facets.clear();
facets.push_back(facet("tags"));
results = coll_array_fields->search("Jeremy", search_fields, "age: >24", facets, 0, 10, FREQUENCY, false);
// filter narrows hits from 5 to 3, and facet counts shrink accordingly
ASSERT_EQ(3, results["hits"].size());
ASSERT_EQ(1, results["facets"].size());
ASSERT_EQ("tags", results["facets"][0]["field_name"]);
ASSERT_EQ(2, (int) results["facets"][0]["counts"]["gold"]);
ASSERT_EQ(2, (int) results["facets"][0]["counts"]["silver"]);
ASSERT_EQ(1, (int) results["facets"][0]["counts"]["bronze"]);
}