mirror of
https://github.com/typesense/typesense.git
synced 2025-05-17 12:12:35 +08:00
WIP: Highlight facet query results
This commit is contained in:
parent
654811f4a3
commit
712d962cac
@ -154,14 +154,14 @@ private:
|
||||
void populate_overrides(std::string query, std::map<uint32_t, size_t> & id_pos_map,
|
||||
std::vector<uint32_t> & included_ids, std::vector<uint32_t> & excluded_ids);
|
||||
|
||||
static bool facet_count_compare(const std::pair<uint64_t, facet_count>& a,
|
||||
const std::pair<uint64_t, facet_count>& b) {
|
||||
static bool facet_count_compare(const std::pair<uint64_t, facet_count_t>& a,
|
||||
const std::pair<uint64_t, facet_count_t>& b) {
|
||||
return std::tie(a.second.count, a.first) > std::tie(b.second.count, a.first);
|
||||
}
|
||||
|
||||
static bool facet_count_str_compare(const std::pair<std::string, size_t>& a,
|
||||
const std::pair<std::string, size_t>& b) {
|
||||
return a.second > b.second;
|
||||
static bool facet_count_str_compare(const facet_value_t& a,
|
||||
const facet_value_t& b) {
|
||||
return a.count > b.count;
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -120,15 +120,16 @@ struct sort_by {
|
||||
}
|
||||
};
|
||||
|
||||
struct facet_count {
|
||||
// Aggregated state for a single facet value. Instances live in `facet::result_map`,
// keyed by the uint64_t hash computed for the facet value.
struct facet_count_t {
|
||||
// running tally for this facet value (incremented in Index::do_facets, decremented in Index::drop_facets)
uint32_t count;
|
||||
uint32_t doc_id; // used to fetch the actual document and the value from store
|
||||
// position of the value inside a string array field; Collection::search indexes
// document[field_name][array_pos] with it to recover the original string
uint32_t array_pos;
|
||||
// token index within the facet value => index of the matching token in the facet query;
// only assigned when a facet query is in use, and read back when building the highlight
spp::sparse_hash_map<uint32_t, uint32_t> token_query_pos;
|
||||
};
|
||||
|
||||
struct facet {
|
||||
const std::string field_name;
|
||||
std::map<uint64_t, facet_count> result_map;
|
||||
std::map<uint64_t, facet_count_t> result_map;
|
||||
|
||||
facet(const std::string & field_name): field_name(field_name) {
|
||||
|
||||
@ -138,4 +139,10 @@ struct facet {
|
||||
// A facet query: restricts the values of one facet field to those matching `query`.
struct facet_query_t {
|
||||
// name of the facet field the query applies to (compared against facet::field_name)
std::string field_name;
|
||||
// raw query text; callers split it on " " into individual query tokens
std::string query;
|
||||
};
|
||||
|
||||
// A resolved facet value as emitted in the search response ("value", "highlighted", "count").
// NOTE: initialised positionally ({value, highlighted, count}), so member order matters.
struct facet_value_t {
|
||||
// facet string as read from the representative document
std::string value;
|
||||
// `value` with the matched leading part of each matching token wrapped in <mark>...</mark>
std::string highlighted;
|
||||
// count copied from the corresponding facet_count_t
uint32_t count;
|
||||
};
|
@ -582,6 +582,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
acc_facet.result_map[facet_kv.first].count = count;
|
||||
acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
|
||||
acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
|
||||
acc_facet.result_map[facet_kv.first].token_query_pos = facet_kv.second.token_query_pos;
|
||||
}
|
||||
}
|
||||
|
||||
@ -699,7 +700,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
facet_result["field_name"] = a_facet.field_name;
|
||||
facet_result["counts"] = nlohmann::json::array();
|
||||
|
||||
std::vector<std::pair<uint64_t, facet_count>> facet_hash_counts;
|
||||
std::vector<std::pair<uint64_t, facet_count_t>> facet_hash_counts;
|
||||
for (const auto & itr : a_facet.result_map) {
|
||||
facet_hash_counts.emplace_back(itr);
|
||||
}
|
||||
@ -709,14 +710,19 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
std::nth_element(facet_hash_counts.begin(), facet_hash_counts.begin() + max_facets,
|
||||
facet_hash_counts.end(), Collection::facet_count_compare);
|
||||
|
||||
std::vector<std::pair<std::string, size_t>> facet_counts;
|
||||
|
||||
std::vector<std::string> facet_query_tokens;
|
||||
StringUtils::split(facet_query.query, facet_query_tokens, " ");
|
||||
|
||||
std::vector<facet_value_t> facet_values;
|
||||
|
||||
for(size_t i = 0; i < max_facets; i++) {
|
||||
// remap facet value hash with actual string
|
||||
auto & kv = facet_hash_counts[i];
|
||||
auto & facet_count = kv.second;
|
||||
|
||||
// fetch actual facet value from representative doc id
|
||||
const std::string& seq_id_key = get_seq_id_key((uint32_t) kv.second.doc_id);
|
||||
const std::string& seq_id_key = get_seq_id_key((uint32_t) facet_count.doc_id);
|
||||
nlohmann::json document;
|
||||
const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
|
||||
|
||||
@ -725,23 +731,45 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string facet_value;
|
||||
std::string value;
|
||||
|
||||
if(facet_schema.at(a_facet.field_name).type == field_types::STRING) {
|
||||
facet_value = document[a_facet.field_name];
|
||||
value = document[a_facet.field_name];
|
||||
} else if(facet_schema.at(a_facet.field_name).type == field_types::STRING_ARRAY) {
|
||||
facet_value = document[a_facet.field_name][kv.second.array_pos];
|
||||
value = document[a_facet.field_name][facet_count.array_pos];
|
||||
}
|
||||
|
||||
facet_counts.emplace_back(std::make_pair(facet_value, kv.second.count));
|
||||
std::vector<std::string> tokens;
|
||||
StringUtils::split(value, tokens, " ");
|
||||
std::stringstream highlightedss;
|
||||
|
||||
for(size_t i = 0; i < tokens.size(); i++) {
|
||||
if(i != 0) {
|
||||
highlightedss << " ";
|
||||
}
|
||||
|
||||
if(facet_count.token_query_pos.count(i) != 0) {
|
||||
size_t highlight_len = facet_query_tokens[facet_count.token_query_pos[i]].size();
|
||||
const std::string & unmarked = tokens[i].substr(highlight_len, std::string::npos);
|
||||
highlightedss << "<mark>" + tokens[i].substr(0, highlight_len) + "</mark>" + unmarked;
|
||||
} else {
|
||||
highlightedss << tokens[i];
|
||||
}
|
||||
}
|
||||
|
||||
facet_value_t facet_value = {value, highlightedss.str(), facet_count.count};
|
||||
facet_values.emplace_back(facet_value);
|
||||
}
|
||||
|
||||
std::stable_sort(facet_counts.begin(), facet_counts.end(), Collection::facet_count_str_compare);
|
||||
std::stable_sort(facet_values.begin(), facet_values.end(), Collection::facet_count_str_compare);
|
||||
|
||||
for(const auto & facet_count: facet_counts) {
|
||||
for(const auto & facet_count: facet_values) {
|
||||
nlohmann::json facet_value_count = nlohmann::json::object();
|
||||
facet_value_count["value"] = facet_count.first;
|
||||
facet_value_count["count"] = facet_count.second;
|
||||
const std::string & value = facet_count.value;
|
||||
|
||||
facet_value_count["value"] = value;
|
||||
facet_value_count["highlighted"] = facet_count.highlighted;
|
||||
facet_value_count["count"] = facet_count.count;
|
||||
facet_result["counts"].push_back(facet_value_count);
|
||||
}
|
||||
|
||||
|
104
src/index.cpp
104
src/index.cpp
@ -449,7 +449,7 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
|
||||
insert_doc(score, t, seq_id, token_to_offsets);
|
||||
|
||||
if(facet_id >= 0) {
|
||||
facet_index_v2[seq_id][facet_id].resize(facet_index_v2[seq_id][facet_id].size());
|
||||
facet_index_v2[seq_id][facet_id].shrink_to_fit();
|
||||
}
|
||||
}
|
||||
|
||||
@ -496,7 +496,7 @@ void Index::index_string_array_field(const std::vector<std::string> & strings, c
|
||||
}
|
||||
|
||||
if(facet_id >= 0) {
|
||||
facet_index_v2[seq_id][facet_id].resize(facet_index_v2[seq_id][facet_id].size());
|
||||
facet_index_v2[seq_id][facet_id].shrink_to_fit();
|
||||
}
|
||||
|
||||
insert_doc(score, t, seq_id, token_positions);
|
||||
@ -539,29 +539,31 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
|
||||
facet_to_index[facet.first] = i_facet;
|
||||
i_facet++;
|
||||
}
|
||||
|
||||
|
||||
// assumed that facet fields have already been validated upstream
|
||||
for(auto & a_facet: facets) {
|
||||
// assumed that facet fields have already been validated upstream
|
||||
spp::sparse_hash_set<int64_t> facet_filter_set;
|
||||
spp::sparse_hash_map<int64_t, uint32_t> fhash_qtoken_pos; // facet hash => token position in the query
|
||||
bool use_facet_query = false;
|
||||
|
||||
if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) {
|
||||
use_facet_query = true;
|
||||
std::vector<std::string> facet_queries;
|
||||
StringUtils::split(facet_query.query, facet_queries, " ");
|
||||
std::vector<std::string> query_tokens;
|
||||
StringUtils::split(facet_query.query, query_tokens, " ");
|
||||
|
||||
for(auto & q: facet_queries) {
|
||||
for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
|
||||
auto & q = query_tokens[qtoken_index];
|
||||
StringUtils::trim(q);
|
||||
int bounded_cost = get_bounded_typo_cost(2, q.size());
|
||||
std::vector<art_leaf*> leaves;
|
||||
art_fuzzy_search(search_index.at(a_facet.field_name), (const unsigned char *) q.c_str(),
|
||||
q.size(),0, bounded_cost, 10000,
|
||||
token_ordering::MAX_SCORE, true, leaves);
|
||||
for(const auto & leaf: leaves) {
|
||||
for(size_t i = 0; i < leaves.size(); i++) {
|
||||
const auto & leaf = leaves[i];
|
||||
// calculate hash without terminating null char
|
||||
uint64_t hash = StringUtils::hash_wy(leaf->key, leaf->key_len-1);
|
||||
facet_filter_set.insert(hash);
|
||||
//printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash);
|
||||
fhash_qtoken_pos.emplace(hash, qtoken_index);
|
||||
printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -573,21 +575,58 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
|
||||
// FORMAT OF VALUES
|
||||
// String: h1 h2 h3
|
||||
// String array: h1 h2 h3 0 h1 0 h1 h2 0
|
||||
const std::vector<uint64_t> & values = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
|
||||
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
|
||||
int array_pos = -1;
|
||||
int fvalue_found = 0;
|
||||
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
|
||||
spp::sparse_hash_map<uint32_t, uint32_t> token_query_positions;
|
||||
size_t field_token_index = -1;
|
||||
|
||||
for(size_t j = 0; j < values.size(); j++) {
|
||||
if(values[j] == 0 || (values.back() != 0 && j == values.size()-1)) {
|
||||
array_pos++;
|
||||
size_t value_index = (j == 0 ? j : j - 1);
|
||||
for(size_t j = 0; j < fhashes.size(); j++) {
|
||||
if(fhashes[j] != 0) {
|
||||
uint64_t ftoken_hash = fhashes[j];
|
||||
fvaluestream << ftoken_hash;
|
||||
field_token_index++;
|
||||
|
||||
if(use_facet_query && facet_filter_set.find(values[value_index]) == facet_filter_set.end()) {
|
||||
if(use_facet_query && fhash_qtoken_pos.find(ftoken_hash) == fhash_qtoken_pos.end()) {
|
||||
// this particular facet value is not found in facet filter, so ignore
|
||||
continue;
|
||||
}
|
||||
a_facet.result_map[values[value_index]].count += 1;
|
||||
a_facet.result_map[values[value_index]].doc_id = doc_seq_id;
|
||||
a_facet.result_map[values[value_index]].array_pos = array_pos;
|
||||
|
||||
fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value
|
||||
|
||||
if(use_facet_query) {
|
||||
// map token index to query index (used for highlighting later on)
|
||||
token_query_positions.emplace(field_token_index, fhash_qtoken_pos[ftoken_hash]);
|
||||
}
|
||||
}
|
||||
|
||||
// 0 indicates separator, while the second condition checks for non-array string
|
||||
if(fhashes[j] == 0 || (fhashes.back() != 0 && j == fhashes.size() - 1)) {
|
||||
if(!use_facet_query || fvalue_found != 0) {
|
||||
array_pos++;
|
||||
|
||||
const std::string & fvalue_str = fvaluestream.str();
|
||||
uint64_t fhash = StringUtils::hash_wy(fvalue_str.c_str(), fvalue_str.size());
|
||||
|
||||
if(a_facet.result_map.count(fhash) == 0) {
|
||||
a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, {},
|
||||
spp::sparse_hash_map<uint32_t, uint32_t>()};
|
||||
}
|
||||
|
||||
a_facet.result_map[fhash].count += 1;
|
||||
a_facet.result_map[fhash].doc_id = doc_seq_id;
|
||||
a_facet.result_map[fhash].array_pos = array_pos;
|
||||
|
||||
if(use_facet_query) {
|
||||
a_facet.result_map[fhash].token_query_pos = token_query_positions;
|
||||
}
|
||||
}
|
||||
|
||||
fvalue_found = 0;
|
||||
std::stringstream().swap(fvaluestream);
|
||||
spp::sparse_hash_map<uint32_t, uint32_t>().swap(token_query_positions);
|
||||
field_token_index = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -611,19 +650,26 @@ void Index::drop_facets(std::vector<facet> & facets, const std::vector<uint32_t>
|
||||
// FORMAT OF VALUES
|
||||
// String: h1 h2 h3
|
||||
// String array: h1 h2 h3 0 h1 0 h1 h2 0
|
||||
const std::vector<uint64_t> & values = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
|
||||
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
|
||||
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
|
||||
|
||||
int array_pos = -1;
|
||||
for(size_t j = 0; j < fhashes.size(); j++) {
|
||||
if(fhashes[j] != 0) {
|
||||
uint64_t ftoken_hash = fhashes[j];
|
||||
fvaluestream << ftoken_hash;
|
||||
}
|
||||
|
||||
for(size_t j = 0; j < values.size(); j++) {
|
||||
if(values[j] == 0 || (values.back() != 0 && j == values.size()-1)) {
|
||||
array_pos++;
|
||||
size_t value_index = (j == 0 ? j : j - 1);
|
||||
if(fhashes[j] == 0 || (fhashes.back() != 0 && j == fhashes.size() - 1)) {
|
||||
const std::string & fvalue_str = fvaluestream.str();
|
||||
uint64_t fhash = StringUtils::hash_wy(fvalue_str.c_str(), fvalue_str.size());
|
||||
|
||||
a_facet.result_map[values[value_index]].count -= 1;
|
||||
if(a_facet.result_map[values[value_index]].count == 0) {
|
||||
a_facet.result_map.erase(values[value_index]);
|
||||
if(a_facet.result_map.count(fhash) != 0) {
|
||||
a_facet.result_map[fhash].count -= 1;
|
||||
if(a_facet.result_map[fhash].count == 0) {
|
||||
a_facet.result_map.erase(fhash);
|
||||
}
|
||||
}
|
||||
std::stringstream().swap(fvaluestream);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1869,8 +1869,8 @@ TEST_F(CollectionTest, FacetCounts) {
|
||||
|
||||
ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
|
||||
|
||||
// facet with wildcard query
|
||||
facets.clear();
|
||||
@ -1888,8 +1888,8 @@ TEST_F(CollectionTest, FacetCounts) {
|
||||
|
||||
ASSERT_STREQ("silver", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("gold", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("bronze", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
|
||||
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
|
||||
|
||||
// facet with facet filter query (allows typo correction!)
|
||||
results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
|
||||
@ -1915,6 +1915,30 @@ TEST_F(CollectionTest, FacetCounts) {
|
||||
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
|
||||
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
||||
|
||||
// facet with facet filter query matching first token of an array
|
||||
results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
|
||||
false, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, 500, "tags: fine").get();
|
||||
|
||||
ASSERT_EQ(5, results["hits"].size());
|
||||
ASSERT_EQ(1, results["facet_counts"].size());
|
||||
ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
|
||||
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
|
||||
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
||||
|
||||
// facet with facet filter query matching second token of an array
|
||||
results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
|
||||
false, Index::DROP_TOKENS_THRESHOLD,
|
||||
spp::sparse_hash_set<std::string>(),
|
||||
spp::sparse_hash_set<std::string>(), 10, 500, "tags: pltinum").get();
|
||||
|
||||
ASSERT_EQ(5, results["hits"].size());
|
||||
ASSERT_EQ(1, results["facet_counts"].size());
|
||||
ASSERT_STREQ("tags", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
|
||||
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
|
||||
ASSERT_STREQ("FINE PLATINUM", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
||||
|
||||
// facet query that does not match any indexed value
|
||||
results = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
|
||||
false, Index::DROP_TOKENS_THRESHOLD,
|
||||
|
Loading…
x
Reference in New Issue
Block a user