From 1369a473e45c7231a71504d0dbc6eaf499fe174c Mon Sep 17 00:00:00 2001 From: kishorenc Date: Sat, 22 Feb 2020 18:39:52 +0530 Subject: [PATCH] Facet query partial highlighting + tests. --- include/field.h | 12 +++- include/index.h | 4 +- src/collection.cpp | 35 +++++++++-- src/index.cpp | 68 ++++++++++++++-------- test/collection_test.cpp | 122 ++++++++++++++++++++++++++++++++++++++- 5 files changed, 206 insertions(+), 35 deletions(-) diff --git a/include/field.h b/include/field.h index d11eab46..389e9eb0 100644 --- a/include/field.h +++ b/include/field.h @@ -120,11 +120,19 @@ struct sort_by { } }; +struct token_pos_cost_t { + size_t pos; + uint32_t cost; +}; + struct facet_count_t { uint32_t count; - uint32_t doc_id; // used to fetch the actual document and the value from store + + // used to fetch the actual document and value for representation + uint32_t doc_id; uint32_t array_pos; - spp::sparse_hash_map token_query_pos; + + spp::sparse_hash_map query_token_pos; }; struct facet { diff --git a/include/index.h b/include/index.h index 6e03265a..a95d17b3 100644 --- a/include/index.h +++ b/include/index.h @@ -140,7 +140,7 @@ private: Option do_filtering(uint32_t** filter_ids_out, const std::vector & filters); - void do_facets(std::vector & facets, const facet_query_t & facet_query, + void do_facets(std::vector & facets, facet_query_t & facet_query, const uint32_t* result_ids, size_t results_size); void drop_facets(std::vector & facets, const std::vector & ids); @@ -203,7 +203,7 @@ public: void search(Option & outcome, std::string query, const std::vector & search_fields, const std::vector & filters, std::vector & facets, - const facet_query_t & facet_query, + facet_query_t & facet_query, const std::vector & included_ids, const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, const size_t max_hits, const size_t per_page, const size_t page, const token_ordering token_order, diff --git a/src/collection.cpp b/src/collection.cpp index 3524214d..6fbe185b 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -460,12 +460,24 @@ Option Collection::search(const std::string & query, const std:: if(!simple_facet_query.empty() && simple_facet_query.find(':') == std::string::npos) { std::string error = "Facet query must be in the `facet_field: value` format."; - return Option(404, error); + return Option(400, error); } StringUtils::split(simple_facet_query, facet_query_vec, ":"); if(!facet_query_vec.empty()) { + if(facet_fields.empty()) { + std::string error = "The `facet_query` parameter is supplied without a `facet_by` parameter."; + return Option(400, error); + } + + // facet query field must be part of facet fields requested facet_query = { StringUtils::trim(facet_query_vec[0]), StringUtils::trim(facet_query_vec[1]) }; + if(std::find(facet_fields.begin(), facet_fields.end(), facet_query.field_name) == facet_fields.end()) { + std::string error = "Facet query refers to a facet field `" + facet_query.field_name + "` " + + "that is not part of `facet_by` parameter."; + return Option(400, error); + } + if(facet_schema.count(facet_query.field_name) == 0) { std::string error = "Could not find a facet field named `" + facet_query.field_name + "` in the schema."; return Option(404, error); @@ -582,7 +594,7 @@ Option Collection::search(const std::string & query, const std:: acc_facet.result_map[facet_kv.first].count = count; acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id; acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos; - acc_facet.result_map[facet_kv.first].token_query_pos = facet_kv.second.token_query_pos; + acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos; } } @@ -743,13 +755,28 @@ Option Collection::search(const std::string & query, const std:: StringUtils::split(value, tokens, " "); std::stringstream highlightedss; + /*std::cout << "array_pos: " << facet_count.array_pos << std::endl; + + for(auto xx: facet_count.query_token_pos) { + std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl; + } + + std::cout << "doc id: " << facet_count.doc_id << " i: " << i << std::endl; + std::cout << "doc: " << document << std::endl;*/ + + // invert query_pos -> token_pos + spp::sparse_hash_map token_query_pos; + for(auto qtoken_pos: facet_count.query_token_pos) { + token_query_pos.emplace(qtoken_pos.second.pos, qtoken_pos.first); + } + for(size_t i = 0; i < tokens.size(); i++) { if(i != 0) { highlightedss << " "; } - if(facet_count.token_query_pos.count(i) != 0) { - size_t highlight_len = facet_query_tokens[facet_count.token_query_pos[i]].size(); + if(token_query_pos.count(i) != 0) { + size_t highlight_len = facet_query_tokens[token_query_pos[i]].size(); const std::string & unmarked = tokens[i].substr(highlight_len, std::string::npos); highlightedss << "" + tokens[i].substr(0, highlight_len) + "" + unmarked; } else { diff --git a/src/index.cpp b/src/index.cpp index a6266cb5..4dc469d4 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -530,8 +530,9 @@ void Index::index_float_array_field(const std::vector & values, const uin } } -void Index::do_facets(std::vector & facets, const facet_query_t & facet_query, +void Index::do_facets(std::vector & facets, facet_query_t & facet_query, const uint32_t* result_ids, size_t results_size) { + std::map facet_to_index; size_t i_facet = 0; @@ -542,7 +543,7 @@ void Index::do_facets(std::vector & facets, const facet_query_t & facet_q // assumed that facet fields have already been validated upstream for(auto & a_facet: facets) { - spp::sparse_hash_map fhash_qtoken_pos; // facet hash => token position in the query + spp::sparse_hash_map fhash_qtoken_pos; // facet hash => token position in the query bool use_facet_query = false; if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) { @@ -552,8 +553,8 @@ void Index::do_facets(std::vector & facets, const facet_query_t & facet_q for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) { auto & q = query_tokens[qtoken_index]; - StringUtils::trim(q); - int bounded_cost = get_bounded_typo_cost(2, q.size()); + string_utils.unicode_normalize(q); + int bounded_cost = (q.size() < 3) ? 0 : 1; std::vector leaves; art_fuzzy_search(search_index.at(a_facet.field_name), (const unsigned char *) q.c_str(), q.size(),0, bounded_cost, 10000, @@ -562,12 +563,15 @@ void Index::do_facets(std::vector & facets, const facet_query_t & facet_q const auto & leaf = leaves[i]; // calculate hash without terminating null char uint64_t hash = StringUtils::hash_wy(leaf->key, leaf->key_len-1); - fhash_qtoken_pos.emplace(hash, qtoken_index); - printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash); + token_pos_cost_t token_pos_cost = {qtoken_index, 0}; + fhash_qtoken_pos.emplace(hash, token_pos_cost); + //printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash); } } } + size_t facet_id = facet_to_index[a_facet.field_name]; + for(size_t i = 0; i < results_size; i++) { uint32_t doc_seq_id = result_ids[i]; @@ -575,11 +579,12 @@ void Index::do_facets(std::vector & facets, const facet_query_t & facet_q // FORMAT OF VALUES // String: h1 h2 h3 // String array: h1 h2 h3 0 h1 0 h1 h2 0 - const std::vector & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]]; - int array_pos = -1; + const std::vector & fhashes = facet_index_v2[doc_seq_id][facet_id]; + + int array_pos = 0; int fvalue_found = 0; std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens) - spp::sparse_hash_map token_query_positions; + spp::sparse_hash_map query_token_positions; size_t field_token_index = -1; for(size_t j = 0; j < fhashes.size(); j++) { @@ -588,30 +593,36 @@ void Index::do_facets(std::vector & facets, const facet_query_t & facet_q fvaluestream << ftoken_hash; field_token_index++; - if(use_facet_query && fhash_qtoken_pos.find(ftoken_hash) == fhash_qtoken_pos.end()) { - // this particular facet value is not found in facet filter, so ignore - continue; - } + if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) { + // not using facet query or this particular facet value is found in facet filter + fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value - fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value + if(use_facet_query) { + // map token index to query index (used for highlighting later on) + token_pos_cost_t qtoken_pos = fhash_qtoken_pos[ftoken_hash]; - if(use_facet_query) { - // map token index to query index (used for highlighting later on) - token_query_positions.emplace(field_token_index, fhash_qtoken_pos[ftoken_hash]); + // if the query token has already matched another token in the string + // we will replace the position only if the cost is lower + if(query_token_positions.find(qtoken_pos.pos) == query_token_positions.end() || + query_token_positions[qtoken_pos.pos].cost >= qtoken_pos.cost ) { + token_pos_cost_t ftoken_pos_cost = {field_token_index, qtoken_pos.cost}; + query_token_positions.emplace(qtoken_pos.pos, ftoken_pos_cost); + } + } } } + //std::cout << "j: " << j << std::endl; + // 0 indicates separator, while the second condition checks for non-array string if(fhashes[j] == 0 || (fhashes.back() != 0 && j == fhashes.size() - 1)) { if(!use_facet_query || fvalue_found != 0) { - array_pos++; - const std::string & fvalue_str = fvaluestream.str(); uint64_t fhash = StringUtils::hash_wy(fvalue_str.c_str(), fvalue_str.size()); if(a_facet.result_map.count(fhash) == 0) { - a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, {}, - spp::sparse_hash_map()}; + a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0, + spp::sparse_hash_map()}; } a_facet.result_map[fhash].count += 1; @@ -619,14 +630,21 @@ void Index::do_facets(std::vector & facets, const facet_query_t & facet_q a_facet.result_map[fhash].array_pos = array_pos; if(use_facet_query) { - a_facet.result_map[fhash].token_query_pos = token_query_positions; + a_facet.result_map[fhash].query_token_pos = query_token_positions; + + /*if(j == 11) { + for(auto xx: query_token_positions) { + std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl; + } + }*/ } } + array_pos++; fvalue_found = 0; std::stringstream().swap(fvaluestream); - spp::sparse_hash_map().swap(token_query_positions); - field_token_index = 0; + spp::sparse_hash_map().swap(query_token_positions); + field_token_index = -1; } } } @@ -991,7 +1009,7 @@ void Index::search(Option & outcome, std::string query, const std::vector & search_fields, const std::vector & filters, - std::vector & facets, const facet_query_t & facet_query, + std::vector & facets, facet_query_t & facet_query, const std::vector & included_ids, const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, const size_t max_hits, diff --git a/test/collection_test.cpp b/test/collection_test.cpp index e3395ad6..6e7b0875 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -1960,7 +1960,7 @@ TEST_F(CollectionTest, FacetCounts) { ASSERT_STREQ("Facet query must be in the `facet_field: value` format.", res_op.error().c_str()); // unknown facet field - res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY, + res_op = coll_array_fields->search("*", query_fields, "", {"foobar"}, sort_fields, 0, 10, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, 500, "foobar: baz"); @@ -1968,9 +1968,124 @@ TEST_F(CollectionTest, FacetCounts) { ASSERT_FALSE(res_op.ok()); ASSERT_STREQ("Could not find a facet field named `foobar` in the schema.", res_op.error().c_str()); + // when facet query is given but no facet fields are specified, must return an error message + res_op = coll_array_fields->search("*", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, 500, "tags: foo"); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("The `facet_query` parameter is supplied without a `facet_by` parameter.", res_op.error().c_str()); + + // given facet query field must be part of facet fields requested + res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, 500, "name_facet: jeremy"); + + ASSERT_FALSE(res_op.ok()); + ASSERT_STREQ("Facet query refers to a facet field `name_facet` that is not part of `facet_by` parameter.", res_op.error().c_str()); + collectionManager.drop_collection("coll_array_fields"); } +TEST_F(CollectionTest, FacetCountsHighlighting) { + Collection *coll1; + + std::vector fields = {field("categories", field_types::STRING_ARRAY, true), + field("points", field_types::INT32, false)}; + + std::vector sort_fields = { sort_by("points", "DESC") }; + + coll1 = collectionManager.get_collection("coll1"); + if(coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", fields, "points").get(); + } + + nlohmann::json doc; + doc["id"] = "100"; + doc["categories"] = {"Cell Phones", "Cell Phone Accessories", "Cell Phone Cases & Clips"}; + doc["points"] = 25; + + coll1->add(doc.dump()); + + std::vector facets = {"categories"}; + + nlohmann::json results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, 500, "categories:cell").get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(3, results["facet_counts"][0]["counts"].size()); + + ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["highlighted"].get().c_str()); + + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][2]["value"].get().c_str()); + ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][2]["highlighted"].get().c_str()); + + coll1->remove("100"); + + doc["categories"] = {"Cell Phones", "Unlocked Cell Phones", "All Unlocked Cell Phones" }; + coll1->add(doc.dump()); + + results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, 500, "categories:cell").get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("All Unlocked Cell Phones", results["facet_counts"][0]["counts"][1]["value"].get().c_str()); + ASSERT_STREQ("All Unlocked Cell Phones", results["facet_counts"][0]["counts"][1]["highlighted"].get().c_str()); + + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Unlocked Cell Phones", results["facet_counts"][0]["counts"][2]["value"].get().c_str()); + ASSERT_STREQ("Unlocked Cell Phones", results["facet_counts"][0]["counts"][2]["highlighted"].get().c_str()); + + coll1->remove("100"); + doc["categories"] = {"Cell Phones", "Cell Phone Accessories", "Cell Phone Cases & Clips"}; + coll1->add(doc.dump()); + + results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, 500, "categories:acces").get(); + + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + + // ensure that query is NOT case sensitive + + results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1, + token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, 500, "categories:ACCES").get(); + + ASSERT_EQ(1, results["facet_counts"].size()); + ASSERT_EQ(1, results["facet_counts"][0]["counts"].size()); + ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); + + collectionManager.drop_collection("coll1"); +} + TEST_F(CollectionTest, SortingOrder) { Collection *coll_mul_fields; @@ -2352,13 +2467,16 @@ TEST_F(CollectionTest, DeletionOfADocument) { results = collection_for_del->search("cryogenic", query_fields, "", {}, sort_fields, 0, 5, 1, FREQUENCY, false).get(); ASSERT_EQ(0, results["hits"].size()); + ASSERT_EQ(0, results["found"]); results = collection_for_del->search("archives", query_fields, "", {}, sort_fields, 0, 5, 1, FREQUENCY, false).get(); ASSERT_EQ(1, results["hits"].size()); + ASSERT_EQ(1, results["found"]); collection_for_del->remove("foo"); // custom id record results = collection_for_del->search("martian", query_fields, "", {}, sort_fields, 0, 5, 1, FREQUENCY, false).get(); ASSERT_EQ(0, results["hits"].size()); + ASSERT_EQ(0, results["found"]); // delete all records for(int id = 0; id <= 25; id++) { @@ -2533,4 +2651,4 @@ TEST_F(CollectionTest, PruneFieldsFromDocument) { document = get_prune_doc(); Collection::prune_document(document, spp::sparse_hash_set(), {"notfound"}); ASSERT_EQ(4, document.size()); -} \ No newline at end of file +}