WIP: Highlight facet query results

This commit is contained in:
kishorenc 2020-02-21 18:34:02 +05:30
parent 654811f4a3
commit 712d962cac
5 changed files with 156 additions and 51 deletions

View File

@ -154,14 +154,14 @@ private:
void populate_overrides(std::string query, std::map<uint32_t, size_t> & id_pos_map,
std::vector<uint32_t> & included_ids, std::vector<uint32_t> & excluded_ids);
static bool facet_count_compare(const std::pair<uint64_t, facet_count>& a,
const std::pair<uint64_t, facet_count>& b) {
static bool facet_count_compare(const std::pair<uint64_t, facet_count_t>& a,
const std::pair<uint64_t, facet_count_t>& b) {
return std::tie(a.second.count, a.first) > std::tie(b.second.count, a.first);
}
static bool facet_count_str_compare(const std::pair<std::string, size_t>& a,
const std::pair<std::string, size_t>& b) {
return a.second > b.second;
// Comparator that places the facet value with the larger count first.
static bool facet_count_str_compare(const facet_value_t& a,
                                    const facet_value_t& b) {
    return b.count < a.count;
}
public:

View File

@ -120,15 +120,16 @@ struct sort_by {
}
};
struct facet_count {
// Aggregated information about a single facet value, stored as the mapped type of facet::result_map.
struct facet_count_t {
// number of times this facet value has been counted
uint32_t count;
uint32_t doc_id; // used to fetch the actual document and the value from store
// position of the value inside the document's array field (read for string array facets)
uint32_t array_pos;
// token index within the facet value => token index within the facet query (used for highlighting)
spp::sparse_hash_map<uint32_t, uint32_t> token_query_pos;
};
struct facet {
const std::string field_name;
std::map<uint64_t, facet_count> result_map;
std::map<uint64_t, facet_count_t> result_map;
facet(const std::string & field_name): field_name(field_name) {
@ -138,4 +139,10 @@ struct facet {
// A query used to restrict the values of one facet field.
struct facet_query_t {
// name of the facet field the query applies to
std::string field_name;
// query text; it is split on spaces into tokens by the code that consumes it
std::string query;
};
// A facet value prepared for the search response.
struct facet_value_t {
// the facet value as read from the stored document
std::string value;
// the same value with matched token prefixes wrapped in <mark> tags
std::string highlighted;
// count reported for this facet value
uint32_t count;
};

View File

@ -582,6 +582,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
acc_facet.result_map[facet_kv.first].count = count;
acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
acc_facet.result_map[facet_kv.first].token_query_pos = facet_kv.second.token_query_pos;
}
}
@ -699,7 +700,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
facet_result["field_name"] = a_facet.field_name;
facet_result["counts"] = nlohmann::json::array();
std::vector<std::pair<uint64_t, facet_count>> facet_hash_counts;
std::vector<std::pair<uint64_t, facet_count_t>> facet_hash_counts;
for (const auto & itr : a_facet.result_map) {
facet_hash_counts.emplace_back(itr);
}
@ -709,14 +710,19 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
std::nth_element(facet_hash_counts.begin(), facet_hash_counts.begin() + max_facets,
facet_hash_counts.end(), Collection::facet_count_compare);
std::vector<std::pair<std::string, size_t>> facet_counts;
std::vector<std::string> facet_query_tokens;
StringUtils::split(facet_query.query, facet_query_tokens, " ");
std::vector<facet_value_t> facet_values;
for(size_t i = 0; i < max_facets; i++) {
// remap facet value hash with actual string
auto & kv = facet_hash_counts[i];
auto & facet_count = kv.second;
// fetch actual facet value from representative doc id
const std::string& seq_id_key = get_seq_id_key((uint32_t) kv.second.doc_id);
const std::string& seq_id_key = get_seq_id_key((uint32_t) facet_count.doc_id);
nlohmann::json document;
const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
@ -725,23 +731,45 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
continue;
}
std::string facet_value;
std::string value;
if(facet_schema.at(a_facet.field_name).type == field_types::STRING) {
facet_value = document[a_facet.field_name];
value = document[a_facet.field_name];
} else if(facet_schema.at(a_facet.field_name).type == field_types::STRING_ARRAY) {
facet_value = document[a_facet.field_name][kv.second.array_pos];
value = document[a_facet.field_name][facet_count.array_pos];
}
facet_counts.emplace_back(std::make_pair(facet_value, kv.second.count));
std::vector<std::string> tokens;
StringUtils::split(value, tokens, " ");
std::stringstream highlightedss;
for(size_t i = 0; i < tokens.size(); i++) {
if(i != 0) {
highlightedss << " ";
}
if(facet_count.token_query_pos.count(i) != 0) {
size_t highlight_len = facet_query_tokens[facet_count.token_query_pos[i]].size();
const std::string & unmarked = tokens[i].substr(highlight_len, std::string::npos);
highlightedss << "<mark>" + tokens[i].substr(0, highlight_len) + "</mark>" + unmarked;
} else {
highlightedss << tokens[i];
}
}
facet_value_t facet_value = {value, highlightedss.str(), facet_count.count};
facet_values.emplace_back(facet_value);
}
std::stable_sort(facet_counts.begin(), facet_counts.end(), Collection::facet_count_str_compare);
std::stable_sort(facet_values.begin(), facet_values.end(), Collection::facet_count_str_compare);
for(const auto & facet_count: facet_counts) {
for(const auto & facet_count: facet_values) {
nlohmann::json facet_value_count = nlohmann::json::object();
facet_value_count["value"] = facet_count.first;
facet_value_count["count"] = facet_count.second;
const std::string & value = facet_count.value;
facet_value_count["value"] = value;
facet_value_count["highlighted"] = facet_count.highlighted;
facet_value_count["count"] = facet_count.count;
facet_result["counts"].push_back(facet_value_count);
}

View File

@ -449,7 +449,7 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
insert_doc(score, t, seq_id, token_to_offsets);
if(facet_id >= 0) {
facet_index_v2[seq_id][facet_id].resize(facet_index_v2[seq_id][facet_id].size());
facet_index_v2[seq_id][facet_id].shrink_to_fit();
}
}
@ -496,7 +496,7 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
}
if(facet_id >= 0) {
facet_index_v2[seq_id][facet_id].resize(facet_index_v2[seq_id][facet_id].size());
facet_index_v2[seq_id][facet_id].shrink_to_fit();
}
insert_doc(score, t, seq_id, token_positions);
@ -539,29 +539,31 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
facet_to_index[facet.first] = i_facet;
i_facet++;
}
// assumed that facet fields have already been validated upstream
for(auto & a_facet: facets) {
// assumed that facet fields have already been validated upstream
spp::sparse_hash_set<int64_t> facet_filter_set;
spp::sparse_hash_map<int64_t, uint32_t> fhash_qtoken_pos; // facet hash => token position in the query
bool use_facet_query = false;
if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) {
use_facet_query = true;
std::vector<std::string> facet_queries;
StringUtils::split(facet_query.query, facet_queries, " ");
std::vector<std::string> query_tokens;
StringUtils::split(facet_query.query, query_tokens, " ");
for(auto & q: facet_queries) {
for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
auto & q = query_tokens[qtoken_index];
StringUtils::trim(q);
int bounded_cost = get_bounded_typo_cost(2, q.size());
std::vector<art_leaf*> leaves;
art_fuzzy_search(search_index.at(a_facet.field_name), (const unsigned char *) q.c_str(),
q.size(),0, bounded_cost, 10000,
token_ordering::MAX_SCORE, true, leaves);
for(const auto & leaf: leaves) {
for(size_t i = 0; i < leaves.size(); i++) {
const auto & leaf = leaves[i];
// calculate hash without terminating null char
uint64_t hash = StringUtils::hash_wy(leaf->key, leaf->key_len-1);
facet_filter_set.insert(hash);
//printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash);
fhash_qtoken_pos.emplace(hash, qtoken_index);
printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash);
}
}
}
@ -573,21 +575,58 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
// FORMAT OF VALUES
// String: h1 h2 h3
// String array: h1 h2 h3 0 h1 0 h1 h2 0
const std::vector<uint64_t> & values = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
int array_pos = -1;
int fvalue_found = 0;
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
spp::sparse_hash_map<uint32_t, uint32_t> token_query_positions;
size_t field_token_index = -1;
for(size_t j = 0; j < values.size(); j++) {
if(values[j] == 0 || (values.back() != 0 && j == values.size()-1)) {
array_pos++;
size_t value_index = (j == 0 ? j : j - 1);
for(size_t j = 0; j < fhashes.size(); j++) {
if(fhashes[j] != 0) {
uint64_t ftoken_hash = fhashes[j];
fvaluestream << ftoken_hash;
field_token_index++;
if(use_facet_query && facet_filter_set.find(values[value_index]) == facet_filter_set.end()) {
if(use_facet_query && fhash_qtoken_pos.find(ftoken_hash) == fhash_qtoken_pos.end()) {
// this particular facet value is not found in facet filter, so ignore
continue;
}
a_facet.result_map[values[value_index]].count += 1;
a_facet.result_map[values[value_index]].doc_id = doc_seq_id;
a_facet.result_map[values[value_index]].array_pos = array_pos;
fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value
if(use_facet_query) {
// map token index to query index (used for highlighting later on)
token_query_positions.emplace(field_token_index, fhash_qtoken_pos[ftoken_hash]);
}
}
// 0 indicates separator, while the second condition checks for non-array string
if(fhashes[j] == 0 || (fhashes.back() != 0 && j == fhashes.size() - 1)) {
    // Every completed value occupies one slot of the field's array, whether or not it matched
    // the facet query. The position must therefore advance unconditionally, otherwise the
    // recorded array_pos points at the wrong element whenever an earlier value was filtered out.
    array_pos++;

    if(!use_facet_query || fvalue_found != 0) {
        // hash of the concatenated token hashes identifies the whole (multi-token) facet value
        const std::string & fvalue_str = fvaluestream.str();
        uint64_t fhash = StringUtils::hash_wy(fvalue_str.c_str(), fvalue_str.size());

        // operator[] value-initializes a missing entry, so a new value starts with count 0
        facet_count_t & fcount = a_facet.result_map[fhash];
        fcount.count += 1;
        fcount.doc_id = doc_seq_id;
        fcount.array_pos = array_pos;
        if(use_facet_query) {
            fcount.token_query_pos = token_query_positions;
        }
    }

    // reset the per-value state before the next value of the array is processed
    fvalue_found = 0;
    std::stringstream().swap(fvaluestream);
    spp::sparse_hash_map<uint32_t, uint32_t>().swap(token_query_positions);
    // Same sentinel as the initial value: the counter is pre-incremented for each token, so
    // resetting to 0 would number the tokens of every subsequent value starting from 1.
    field_token_index = -1;
}
}
}
@ -611,19 +650,26 @@ void Index::drop_facets(std::vector<facet> & facets, const std::vector<uint32_t>
// FORMAT OF VALUES
// String: h1 h2 h3
// String array: h1 h2 h3 0 h1 0 h1 h2 0
const std::vector<uint64_t> & values = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
int array_pos = -1;
for(size_t j = 0; j < fhashes.size(); j++) {
if(fhashes[j] != 0) {
uint64_t ftoken_hash = fhashes[j];
fvaluestream << ftoken_hash;
}
for(size_t j = 0; j < values.size(); j++) {
if(values[j] == 0 || (values.back() != 0 && j == values.size()-1)) {
array_pos++;
size_t value_index = (j == 0 ? j : j - 1);
if(fhashes[j] == 0 || (fhashes.back() != 0 && j == fhashes.size() - 1)) {
const std::string & fvalue_str = fvaluestream.str();
uint64_t fhash = StringUtils::hash_wy(fvalue_str.c_str(), fvalue_str.size());
a_facet.result_map[values[value_index]].count -= 1;
if(a_facet.result_map[values[value_index]].count == 0) {
a_facet.result_map.erase(values[value_index]);
if(a_facet.result_map.count(fhash) != 0) {
a_facet.result_map[fhash].count -= 1;
if(a_facet.result_map[fhash].count == 0) {
a_facet.result_map.erase(fhash);
}
}
std::stringstream().swap(fvaluestream);
}
}
}

View File

@ -1869,8 +1869,8 @@ TEST_F(CollectionTest, FacetCounts) {
ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
// facet with wildcard query
facets.clear();
@ -1888,8 +1888,8 @@ TEST_F(CollectionTest, FacetCounts) {
ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
// facet with facet filter query (allows typo correction!)
results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
@ -1915,6 +1915,30 @@ TEST_F(CollectionTest, FacetCounts) {
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
// facet with facet filter query matching first token of an array
results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "tags: fine").get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
// facet with facet filter query matching second token of an array
results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, 500, "tags: pltinum").get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
// facet query that does not match any indexed value
results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,