Facet query partial highlighting + tests.

This commit is contained in:
kishorenc 2020-02-22 18:39:52 +05:30
parent 38bac77c43
commit 1369a473e4
5 changed files with 206 additions and 35 deletions

View File

@ -120,11 +120,19 @@ struct sort_by {
}
};
struct token_pos_cost_t {
size_t pos;
uint32_t cost;
};
struct facet_count_t {
uint32_t count;
uint32_t doc_id; // used to fetch the actual document and the value from store
// used to fetch the actual document and value for representation
uint32_t doc_id;
uint32_t array_pos;
spp::sparse_hash_map<uint32_t, uint32_t> token_query_pos;
spp::sparse_hash_map<uint32_t, token_pos_cost_t> query_token_pos;
};
struct facet {

View File

@ -140,7 +140,7 @@ private:
Option<uint32_t> do_filtering(uint32_t** filter_ids_out, const std::vector<filter> & filters);
void do_facets(std::vector<facet> & facets, const facet_query_t & facet_query,
void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const uint32_t* result_ids, size_t results_size);
void drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids);
@ -203,7 +203,7 @@ public:
void search(Option<uint32_t> & outcome, std::string query, const std::vector<std::string> & search_fields,
const std::vector<filter> & filters, std::vector<facet> & facets,
const facet_query_t & facet_query,
facet_query_t & facet_query,
const std::vector<uint32_t> & included_ids, const std::vector<uint32_t> & excluded_ids,
const std::vector<sort_by> & sort_fields_std, const int num_typos,
const size_t max_hits, const size_t per_page, const size_t page, const token_ordering token_order,

View File

@ -460,12 +460,24 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
if(!simple_facet_query.empty() && simple_facet_query.find(':') == std::string::npos) {
std::string error = "Facet query must be in the `facet_field: value` format.";
return Option<nlohmann::json>(404, error);
return Option<nlohmann::json>(400, error);
}
StringUtils::split(simple_facet_query, facet_query_vec, ":");
if(!facet_query_vec.empty()) {
if(facet_fields.empty()) {
std::string error = "The `facet_query` parameter is supplied without a `facet_by` parameter.";
return Option<nlohmann::json>(400, error);
}
// facet query field must be part of facet fields requested
facet_query = { StringUtils::trim(facet_query_vec[0]), StringUtils::trim(facet_query_vec[1]) };
if(std::find(facet_fields.begin(), facet_fields.end(), facet_query.field_name) == facet_fields.end()) {
std::string error = "Facet query refers to a facet field `" + facet_query.field_name + "` " +
"that is not part of `facet_by` parameter.";
return Option<nlohmann::json>(400, error);
}
if(facet_schema.count(facet_query.field_name) == 0) {
std::string error = "Could not find a facet field named `" + facet_query.field_name + "` in the schema.";
return Option<nlohmann::json>(404, error);
@ -582,7 +594,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
acc_facet.result_map[facet_kv.first].count = count;
acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
acc_facet.result_map[facet_kv.first].token_query_pos = facet_kv.second.token_query_pos;
acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos;
}
}
@ -743,13 +755,28 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
StringUtils::split(value, tokens, " ");
std::stringstream highlightedss;
/*std::cout << "array_pos: " << facet_count.array_pos << std::endl;
for(auto xx: facet_count.query_token_pos) {
std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl;
}
std::cout << "doc id: " << facet_count.doc_id << " i: " << i << std::endl;
std::cout << "doc: " << document << std::endl;*/
// invert query_pos -> token_pos
spp::sparse_hash_map<uint32_t, uint32_t> token_query_pos;
for(auto qtoken_pos: facet_count.query_token_pos) {
token_query_pos.emplace(qtoken_pos.second.pos, qtoken_pos.first);
}
for(size_t i = 0; i < tokens.size(); i++) {
if(i != 0) {
highlightedss << " ";
}
if(facet_count.token_query_pos.count(i) != 0) {
size_t highlight_len = facet_query_tokens[facet_count.token_query_pos[i]].size();
if(token_query_pos.count(i) != 0) {
size_t highlight_len = facet_query_tokens[token_query_pos[i]].size();
const std::string & unmarked = tokens[i].substr(highlight_len, std::string::npos);
highlightedss << "<mark>" + tokens[i].substr(0, highlight_len) + "</mark>" + unmarked;
} else {

View File

@ -530,8 +530,9 @@ void Index::index_float_array_field(const std::vector<float> & values, const uin
}
}
void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_query,
void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const uint32_t* result_ids, size_t results_size) {
std::map<std::string, size_t> facet_to_index;
size_t i_facet = 0;
@ -542,7 +543,7 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
// assumed that facet fields have already been validated upstream
for(auto & a_facet: facets) {
spp::sparse_hash_map<int64_t, uint32_t> fhash_qtoken_pos; // facet hash => token position in the query
spp::sparse_hash_map<int64_t, token_pos_cost_t> fhash_qtoken_pos; // facet hash => token position in the query
bool use_facet_query = false;
if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) {
@ -552,8 +553,8 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
auto & q = query_tokens[qtoken_index];
StringUtils::trim(q);
int bounded_cost = get_bounded_typo_cost(2, q.size());
string_utils.unicode_normalize(q);
int bounded_cost = (q.size() < 3) ? 0 : 1;
std::vector<art_leaf*> leaves;
art_fuzzy_search(search_index.at(a_facet.field_name), (const unsigned char *) q.c_str(),
q.size(),0, bounded_cost, 10000,
@ -562,12 +563,15 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
const auto & leaf = leaves[i];
// calculate hash without terminating null char
uint64_t hash = StringUtils::hash_wy(leaf->key, leaf->key_len-1);
fhash_qtoken_pos.emplace(hash, qtoken_index);
printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash);
token_pos_cost_t token_pos_cost = {qtoken_index, 0};
fhash_qtoken_pos.emplace(hash, token_pos_cost);
//printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash);
}
}
}
size_t facet_id = facet_to_index[a_facet.field_name];
for(size_t i = 0; i < results_size; i++) {
uint32_t doc_seq_id = result_ids[i];
@ -575,11 +579,12 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
// FORMAT OF VALUES
// String: h1 h2 h3
// String array: h1 h2 h3 0 h1 0 h1 h2 0
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
int array_pos = -1;
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_id];
int array_pos = 0;
int fvalue_found = 0;
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
spp::sparse_hash_map<uint32_t, uint32_t> token_query_positions;
spp::sparse_hash_map<uint32_t, token_pos_cost_t> query_token_positions;
size_t field_token_index = -1;
for(size_t j = 0; j < fhashes.size(); j++) {
@ -588,30 +593,36 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
fvaluestream << ftoken_hash;
field_token_index++;
if(use_facet_query && fhash_qtoken_pos.find(ftoken_hash) == fhash_qtoken_pos.end()) {
// this particular facet value is not found in facet filter, so ignore
continue;
}
if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) {
// not using facet query or this particular facet value is found in facet filter
fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value
fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value
if(use_facet_query) {
// map token index to query index (used for highlighting later on)
token_pos_cost_t qtoken_pos = fhash_qtoken_pos[ftoken_hash];
if(use_facet_query) {
// map token index to query index (used for highlighting later on)
token_query_positions.emplace(field_token_index, fhash_qtoken_pos[ftoken_hash]);
// if the query token has already matched another token in the string
// we will replace the position only if the cost is lower
if(query_token_positions.find(qtoken_pos.pos) == query_token_positions.end() ||
query_token_positions[qtoken_pos.pos].cost >= qtoken_pos.cost ) {
token_pos_cost_t ftoken_pos_cost = {field_token_index, qtoken_pos.cost};
query_token_positions.emplace(qtoken_pos.pos, ftoken_pos_cost);
}
}
}
}
//std::cout << "j: " << j << std::endl;
// 0 indicates separator, while the second condition checks for non-array string
if(fhashes[j] == 0 || (fhashes.back() != 0 && j == fhashes.size() - 1)) {
if(!use_facet_query || fvalue_found != 0) {
array_pos++;
const std::string & fvalue_str = fvaluestream.str();
uint64_t fhash = StringUtils::hash_wy(fvalue_str.c_str(), fvalue_str.size());
if(a_facet.result_map.count(fhash) == 0) {
a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, {},
spp::sparse_hash_map<uint32_t, uint32_t>()};
a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0,
spp::sparse_hash_map<uint32_t, token_pos_cost_t>()};
}
a_facet.result_map[fhash].count += 1;
@ -619,14 +630,21 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
a_facet.result_map[fhash].array_pos = array_pos;
if(use_facet_query) {
a_facet.result_map[fhash].token_query_pos = token_query_positions;
a_facet.result_map[fhash].query_token_pos = query_token_positions;
/*if(j == 11) {
for(auto xx: query_token_positions) {
std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl;
}
}*/
}
}
array_pos++;
fvalue_found = 0;
std::stringstream().swap(fvaluestream);
spp::sparse_hash_map<uint32_t, uint32_t>().swap(token_query_positions);
field_token_index = 0;
spp::sparse_hash_map<uint32_t, token_pos_cost_t>().swap(query_token_positions);
field_token_index = -1;
}
}
}
@ -991,7 +1009,7 @@ void Index::search(Option<uint32_t> & outcome,
std::string query,
const std::vector<std::string> & search_fields,
const std::vector<filter> & filters,
std::vector<facet> & facets, const facet_query_t & facet_query,
std::vector<facet> & facets, facet_query_t & facet_query,
const std::vector<uint32_t> & included_ids,
const std::vector<uint32_t> & excluded_ids,
const std::vector<sort_by> & sort_fields_std, const int num_typos, const size_t max_hits,

View File

@ -1960,7 +1960,7 @@ TEST_F(CollectionTest, FacetCounts) {
ASSERT_STREQ("Facet query must be in the `facet_field: value` format.", res_op.error().c_str());
// unknown facet field
res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
res_op = coll_array_fields->search("*", query_fields, "", {"foobar"}, sort_fields, 0, 10, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "foobar: baz");
@ -1968,9 +1968,124 @@ TEST_F(CollectionTest, FacetCounts) {
ASSERT_FALSE(res_op.ok());
ASSERT_STREQ("Could not find a facet field named `foobar` in the schema.", res_op.error().c_str());
// when facet query is given but no facet fields are specified, must return an error message
res_op = coll_array_fields->search("*", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "tags: foo");
ASSERT_FALSE(res_op.ok());
ASSERT_STREQ("The `facet_query` parameter is supplied without a `facet_by` parameter.", res_op.error().c_str());
// given facet query field must be part of facet fields requested
res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "name_facet: jeremy");
ASSERT_FALSE(res_op.ok());
ASSERT_STREQ("Facet query refers to a facet field `name_facet` that is not part of `facet_by` parameter.", res_op.error().c_str());
collectionManager.drop_collection("coll_array_fields");
}
TEST_F(CollectionTest, FacetCountsHighlighting) {
Collection *coll1;
std::vector<field> fields = {field("categories", field_types::STRING_ARRAY, true),
field("points", field_types::INT32, false)};
std::vector<sort_by> sort_fields = { sort_by("points", "DESC") };
coll1 = collectionManager.get_collection("coll1");
if(coll1 == nullptr) {
coll1 = collectionManager.create_collection("coll1", fields, "points").get();
}
nlohmann::json doc;
doc["id"] = "100";
doc["categories"] = {"Cell Phones", "Cell Phone Accessories", "Cell Phone Cases & Clips"};
doc["points"] = 25;
coll1->add(doc.dump());
std::vector<std::string> facets = {"categories"};
nlohmann::json results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "categories:cell").get();
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(3, results["facet_counts"][0]["counts"].size());
ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Cell</mark> Phones", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Cell</mark> Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Cell</mark> Phone Accessories", results["facet_counts"][0]["counts"][2]["highlighted"].get<std::string>().c_str());
coll1->remove("100");
doc["categories"] = {"Cell Phones", "Unlocked Cell Phones", "All Unlocked Cell Phones" };
coll1->add(doc.dump());
results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "categories:cell").get();
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Cell</mark> Phones", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("All Unlocked Cell Phones", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("All Unlocked <mark>Cell</mark> Phones", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Unlocked Cell Phones", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_STREQ("Unlocked <mark>Cell</mark> Phones", results["facet_counts"][0]["counts"][2]["highlighted"].get<std::string>().c_str());
coll1->remove("100");
doc["categories"] = {"Cell Phones", "Cell Phone Accessories", "Cell Phone Cases & Clips"};
coll1->add(doc.dump());
results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "categories:acces").get();
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("Cell Phone <mark>Acces</mark>sories", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
// ensure that query is NOT case sensitive
results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "categories:ACCES").get();
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("Cell Phone <mark>Acces</mark>sories", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionTest, SortingOrder) {
Collection *coll_mul_fields;
@ -2352,13 +2467,16 @@ TEST_F(CollectionTest, DeletionOfADocument) {
results = collection_for_del->search("cryogenic", query_fields, "", {}, sort_fields, 0, 5, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
ASSERT_EQ(0, results["found"]);
results = collection_for_del->search("archives", query_fields, "", {}, sort_fields, 0, 5, 1, FREQUENCY, false).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ(1, results["found"]);
collection_for_del->remove("foo"); // custom id record
results = collection_for_del->search("martian", query_fields, "", {}, sort_fields, 0, 5, 1, FREQUENCY, false).get();
ASSERT_EQ(0, results["hits"].size());
ASSERT_EQ(0, results["found"]);
// delete all records
for(int id = 0; id <= 25; id++) {
@ -2533,4 +2651,4 @@ TEST_F(CollectionTest, PruneFieldsFromDocument) {
document = get_prune_doc();
Collection::prune_document(document, spp::sparse_hash_set<std::string>(), {"notfound"});
ASSERT_EQ(4, document.size());
}
}