From 10c22c174aa0782f21b1cd68f4cd62ae7b4ef715 Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sun, 14 Jun 2020 08:14:03 +0530 Subject: [PATCH] Consider grouping when generating facet counts. --- TODO.md | 2 + include/field.h | 1 + include/index.h | 9 +- src/array_utils.cpp | 2 +- src/collection.cpp | 18 ++- src/index.cpp | 192 +++++++++++++----------------- test/collection_override_test.cpp | 4 +- 7 files changed, 108 insertions(+), 120 deletions(-) diff --git a/TODO.md b/TODO.md index 96f2a7e3..b031a788 100644 --- a/TODO.md +++ b/TODO.md @@ -98,6 +98,8 @@ - ~~Handle SIGTERM which is sent when process is killed~~ - ~~Use snappy compression for storage~~ - Test for overriding result on second page +- Fix exclude_scalar early returns +- Fix result ids length during grouped overrides - atleast 1 token match for proceeding with drop tokens - support wildcard query with filters - API for optimizing on disk storage diff --git a/include/field.h b/include/field.h index 95c1fcbe..6255aed5 100644 --- a/include/field.h +++ b/include/field.h @@ -146,6 +146,7 @@ struct token_pos_cost_t { struct facet_count_t { uint32_t count; + spp::sparse_hash_map groups; // used for faceting grouped results // used to fetch the actual document and value for representation uint32_t doc_id; diff --git a/include/index.h b/include/index.h index b142c9a3..4baa1eed 100644 --- a/include/index.h +++ b/include/index.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "string_utils.h" struct token_candidates { @@ -162,10 +163,9 @@ private: void do_facets(std::vector & facets, facet_query_t & facet_query, const uint32_t* result_ids, size_t results_size); - void drop_facets(std::vector & facets, const std::vector & ids); - void search_field(const uint8_t & field_id, const std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, @@ -175,8 +175,9 @@ private: const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD); void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, const std::vector & sort_fields, std::vector & token_to_candidates, - const token_ordering token_order, std::vector> & searched_queries, + std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold); @@ -306,5 +307,7 @@ public: int get_bounded_typo_cost(const size_t max_cost, const size_t token_len) const; static int64_t float_to_in64_t(float n); + + uint64_t get_distinct_id(const std::unordered_map &facet_to_id, const uint32_t seq_id) const; }; diff --git a/src/array_utils.cpp b/src/array_utils.cpp index 9d93fa46..62fe2d87 100644 --- a/src/array_utils.cpp +++ b/src/array_utils.cpp @@ -114,7 +114,7 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA, return 0; } - if(B == nullptr) { + if(lenB == 0 || B == nullptr) { *out = new uint32_t[lenA]; memcpy(*out, A, lenA * sizeof(uint32_t)); return lenA; diff --git a/src/collection.cpp b/src/collection.cpp index af446907..6e50fbac 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -419,9 +419,9 @@ Option Collection::search(const std::string & query, const std:: field search_field = search_schema.at(field_name); - // if field is a string field then it must be a facet field as well - if(search_field.is_string() && !search_field.is_facet()) { - std::string error = "Field `" + field_name + "` should be a facet field."; + // must be a facet field + if(!search_field.is_facet()) { + std::string error = "Group by field `" + field_name + "` should be a facet field."; return Option(400, error); } } @@ -711,6 +711,18 @@ Option Collection::search(const std::string & query, const std:: for(auto & facet_kv: this_facet.result_map) { size_t count = 0; + + // for grouping we have to aggregate group counts to a count value + /*if(search_params->group_limit) { + // for every facet + for(auto& a_facet: facets) { + // for every facet value + for(auto& fvalue: a_facet.result_map) { + fvalue.second.count = fvalue.second.groups.size(); + } + } + }*/ + if(acc_facet.result_map.count(facet_kv.first) == 0) { // not found, so set it count = facet_kv.second.count; diff --git a/src/index.cpp b/src/index.cpp index 2fe6e5e6..5c0f8546 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -633,7 +633,7 @@ void Index::compute_facet_stats(facet &a_facet, int64_t raw_value, const std::st void Index::do_facets(std::vector & facets, facet_query_t & facet_query, const uint32_t* result_ids, size_t results_size) { - std::map facet_to_index; + std::unordered_map facet_to_index; size_t i_facet = 0; for(const auto & facet: facet_schema) { @@ -661,6 +661,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, std::vector query_tokens; StringUtils::split(facet_query.query, query_tokens, " "); + // for non-string fields, `faceted_name` returns their aliased stringified field name art_tree *t = search_index.at(facet_field.faceted_name()); for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) { @@ -703,7 +704,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, const std::vector & fhashes = facet_index_v2[doc_seq_id][facet_id]; int array_pos = 0; - int fvalue_found = 0; + bool fvalue_found = false; std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens) spp::sparse_hash_map query_token_positions; size_t field_token_index = -1; @@ -717,9 +718,9 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, // ftoken_hash is the raw value for numeric fields compute_facet_stats(a_facet, ftoken_hash, facet_field.type); + // not using facet query or this particular facet value is found in facet filter if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) { - // not using facet query or this particular facet value is found in facet filter - fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value + fvalue_found = true; if(use_facet_query) { // map token index to query index (used for highlighting later on) @@ -736,41 +737,38 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, } } - //std::cout << "j: " << j << std::endl; - // 0 indicates separator, while the second condition checks for non-array string if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && j == fhashes.size() - 1)) { - if(!use_facet_query || fvalue_found != 0) { + if(!use_facet_query || fvalue_found) { const std::string & fvalue_str = fvaluestream.str(); - uint64_t fhash = 0; - if(facet_field.is_string()) { - fhash = facet_token_hash(facet_field, fvalue_str); - } else { - fhash = std::atoi(fvalue_str.c_str()); - } + uint64_t fhash = facet_token_hash(facet_field, fvalue_str); if(a_facet.result_map.count(fhash) == 0) { - a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0, + a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_map(), + doc_seq_id, 0, spp::sparse_hash_map()}; } - a_facet.result_map[fhash].count += 1; a_facet.result_map[fhash].doc_id = doc_seq_id; a_facet.result_map[fhash].array_pos = array_pos; + if(search_params->group_limit) { + uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id); + if(a_facet.result_map[fhash].groups.count(distinct_id) == 0) { + a_facet.result_map[fhash].groups.emplace(distinct_id, 0); + } + a_facet.result_map[fhash].groups[distinct_id] += 1; + } else { + a_facet.result_map[fhash].count += 1; + } + if(use_facet_query) { a_facet.result_map[fhash].query_token_pos = query_token_positions; - - /*if(j == 11) { - for(auto xx: query_token_positions) { - std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl; - } - }*/ } } array_pos++; - fvalue_found = 0; + fvalue_found = false; std::stringstream().swap(fvaluestream); spp::sparse_hash_map().swap(query_token_positions); field_token_index = -1; @@ -781,55 +779,10 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, } } -void Index::drop_facets(std::vector & facets, const std::vector & ids) { - std::map facet_to_index; - - size_t i_facet = 0; - for(const auto & facet: facet_schema) { - facet_to_index[facet.first] = i_facet; - i_facet++; - } - - for(auto & a_facet: facets) { - const field & facet_field = facet_schema.at(a_facet.field_name); - - // assumed that facet fields have already been validated upstream - for(const uint32_t doc_seq_id: ids) { - if(facet_index_v2.count(doc_seq_id) != 0) { - // FORMAT OF VALUES - // String: h1 h2 h3 - // String array: h1 h2 h3 0 h1 0 h1 h2 0 - const std::vector & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]]; - std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens) - - for(size_t j = 0; j < fhashes.size(); j++) { - if(fhashes[j] != FACET_ARRAY_DELIMETER) { - int64_t ftoken_hash = fhashes[j]; - fvaluestream << ftoken_hash; - } - - if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && - j == fhashes.size() - 1)) { - const std::string & fvalue_str = fvaluestream.str(); - uint64_t fhash = facet_token_hash(facet_field, fvalue_str); - - if(a_facet.result_map.count(fhash) != 0) { - a_facet.result_map[fhash].count -= 1; - if(a_facet.result_map[fhash].count == 0) { - a_facet.result_map.erase(fhash); - } - } - std::stringstream().swap(fvaluestream); - } - } - } - } - } -} - void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, const std::vector & sort_fields, - std::vector & token_candidates_vec, const token_ordering token_order, + std::vector & token_candidates_vec, std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold) { @@ -874,6 +827,15 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si continue; } + if(!curated_ids.empty()) { + uint32_t *excluded_result_ids = nullptr; + result_size = ArrayUtils::exclude_scalar(result_ids, result_size, &curated_ids[0], + curated_ids.size(), &excluded_result_ids); + + delete [] result_ids; + result_ids = excluded_result_ids; + } + if(filter_ids != nullptr) { // intersect once again with filter ids uint32_t* filtered_result_ids = nullptr; @@ -1176,7 +1138,14 @@ void Index::search(Option & outcome, return ; } - const uint32_t filter_ids_length = op_filter_ids_length.get(); + uint32_t filter_ids_length = op_filter_ids_length.get(); + + // we will be removing all curated IDs from organic results before running topster + std::set curated_ids(included_ids.begin(), included_ids.end()); + curated_ids.insert(excluded_ids.begin(), excluded_ids.end()); + + std::vector curated_ids_sorted(curated_ids.begin(), curated_ids.end()); + std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end()); // Order of `fields` are used to sort results //auto begin = std::chrono::high_resolution_clock::now(); @@ -1186,11 +1155,22 @@ void Index::search(Option & outcome, const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0); const std::string & field = search_fields[0]; + if(!curated_ids.empty()) { + uint32_t *excluded_result_ids = nullptr; + filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0], + curated_ids.size(), &excluded_result_ids); + + delete [] filter_ids; + filter_ids = excluded_result_ids; + } + score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {}, filter_ids, filter_ids_length); collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); - do_facets(facets, facet_query, filter_ids, filter_ids_length); + all_result_ids_len = filter_ids_length; + all_result_ids = filter_ids; + filter_ids = nullptr; } else { const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM); for(size_t i = 0; i < num_search_fields; i++) { @@ -1199,50 +1179,33 @@ void Index::search(Option & outcome, const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i); // Order of `fields` are used to sort results const std::string & field = search_fields[i]; - search_field(field_id, query, field, filter_ids, filter_ids_length, facets, sort_fields_std, + search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std, num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len, token_order, prefix, drop_tokens_threshold, typo_tokens_threshold); collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); } } - do_facets(facets, facet_query, all_result_ids, all_result_ids_len); } + do_facets(facets, facet_query, all_result_ids, all_result_ids_len); do_facets(facets, facet_query, &included_ids[0], included_ids.size()); // must be sorted before iterated upon to remove "empty" array entries topster->sort(); curated_topster->sort(); - std::set ids_to_remove(included_ids.begin(), included_ids.end()); - ids_to_remove.insert(excluded_ids.begin(), excluded_ids.end()); - - std::vector dropped_ids; - // loop through topster and remove elements from included and excluded id lists if(topster->distinct) { for(auto &group_topster_entry: topster->group_kv_map) { Topster* group_topster = group_topster_entry.second; - for(uint32_t t = 0; t < group_topster->size; t++) { - KV* kv = group_topster->getKV(t); - if(ids_to_remove.count(kv->key) != 0) { - dropped_ids.push_back((uint32_t)kv->key); - } - } - const std::vector group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); raw_result_kvs.emplace_back(group_kvs); } } else { for(uint32_t t = 0; t < topster->size; t++) { KV* kv = topster->getKV(t); - - if(ids_to_remove.count(kv->key) != 0) { - dropped_ids.push_back((uint32_t)kv->key); - } else { - raw_result_kvs.push_back({kv}); - } + raw_result_kvs.push_back({kv}); } } @@ -1252,9 +1215,6 @@ void Index::search(Option & outcome, } // for the ids that are dropped, remove their corresponding facet components from facet results - drop_facets(facets, dropped_ids); - - all_result_ids_len -= dropped_ids.size(); all_result_ids_len += curated_topster->size; delete [] filter_ids; @@ -1277,6 +1237,7 @@ void Index::search(Option & outcome, */ void Index::search_field(const uint8_t & field_id, const std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, Topster* topster, uint32_t** all_result_ids, size_t & all_result_ids_len, @@ -1392,8 +1353,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) { // If all tokens were found, go ahead and search for candidates with what we have so far - search_candidates(field_id, filter_ids, filter_ids_length, sort_fields, token_candidates_vec, - token_order, searched_queries, topster, all_result_ids, all_result_ids_len, + search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec, + searched_queries, topster, all_result_ids, all_result_ids_len, typo_tokens_threshold); } @@ -1426,7 +1387,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co truncated_query += " " + token_count_pairs.at(i).first; } - return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos, + return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids, + facets, sort_fields, num_typos, searched_queries, topster, all_result_ids, all_result_ids_len, token_order, prefix); } @@ -1573,21 +1535,7 @@ void Index::score_results(const std::vector & sort_fields, const uint16 uint64_t distinct_id = seq_id; if(search_params->group_limit != 0) { - distinct_id = 1; // some constant initial value - - // calculate hash from group_by_fields - for(const auto& field: search_params->group_by_fields) { - if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) { - continue; - } - - size_t facet_id = facet_to_id[field]; - const std::vector& fhashes = facet_index_v2.at(seq_id)[facet_id]; - - for(const auto& hash: fhashes) { - distinct_id = hash_combine(distinct_id, hash); - } - } + distinct_id = get_distinct_id(facet_to_id, seq_id); } KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores); @@ -1603,6 +1551,26 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } } +uint64_t Index::get_distinct_id(const std::unordered_map &facet_to_id, + const uint32_t seq_id) const { + uint64_t distinct_id = 1; // some constant initial value + + // calculate hash from group_by_fields + for(const auto& field: search_params->group_by_fields) { + if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) { + continue; + } + + size_t facet_id = facet_to_id.at(field); + const std::vector& fhashes = facet_index_v2.at(seq_id)[facet_id]; + + for(const auto& hash: fhashes) { + distinct_id = hash_combine(distinct_id, hash); + } + } + return distinct_id; +} + void Index::populate_token_positions(const std::vector &query_suggestion, spp::sparse_hash_map &leaf_to_indices, size_t result_index, diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index a94bfbc0..49824142 100644 --- a/test/collection_override_test.cpp +++ b/test/collection_override_test.cpp @@ -219,6 +219,8 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "starring: scott").get(); + ASSERT_EQ(9, results["found"].get()); + // "count" would be `2` without exclusion ASSERT_EQ("Scott Glenn", results["facet_counts"][0]["counts"][0]["highlighted"].get()); ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get()); @@ -233,7 +235,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "starring: scott").get(); - ASSERT_EQ(10, results["found"].get()); + ASSERT_EQ(9, results["found"].get()); ASSERT_EQ(0, results["hits"].size()); coll_mul_fields->remove_override("exclude-rule");