Highlight the best-matched string in an array.

An ARRAY_SEPARATOR delimiter is used to demarcate the offsets of tokens that come from different indices of an array. A plain string field is treated like a single-element array field, but without needing a delimiter.
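To make the encoding concrete, here is a minimal standalone sketch (flatten_offsets and the sample data are hypothetical, not the actual indexing code): for a field value of ["plain truth", "truth wins"], the token "truth" occurs at position 1 of element 0 and position 0 of element 1, so its flattened offset list becomes {1, SEP, 0, SEP}.

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

// SEP mirrors Index::ARRAY_SEPARATOR (UINT16_MAX).
static const uint16_t SEP = UINT16_MAX;

// Hypothetical helper: flatten a token's per-element offsets into one list,
// appending a separator after every array element (even empty ones).
std::vector<uint16_t> flatten_offsets(const std::map<size_t, std::vector<uint16_t>> & per_element,
                                      size_t array_size) {
    std::vector<uint16_t> flattened;
    for(size_t i = 0; i < array_size; i++) {
        auto it = per_element.find(i);
        if(it != per_element.end()) {
            flattened.insert(flattened.end(), it->second.begin(), it->second.end());
        }
        flattened.push_back(SEP);
    }
    return flattened;
}

int main() {
    // Token "truth" in ["plain truth", "truth wins"]: element 0 @ pos 1, element 1 @ pos 0.
    std::vector<uint16_t> offsets = flatten_offsets({{0, {1}}, {1, {0}}}, 2);
    assert((offsets == std::vector<uint16_t>{1, SEP, 0, SEP}));
}
```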
This commit is contained in:
Kishore Nallan 2018-04-20 16:44:03 +05:30
parent dea9df233f
commit b669a47c29
7 changed files with 269 additions and 64 deletions

View File

@@ -100,6 +100,8 @@
- NOT operator support
- Log operations
- Parameterize replica's MAX_UPDATES_TO_SEND
- 64K token limit
- > INT32_MAX validation for float field
- highlight of string arrays?
- test for token ranking on float field

View File

@@ -84,10 +84,6 @@ private:
void do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size);
void populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
size_t result_index, std::vector<std::vector<uint16_t>> &token_positions) const;
void search_field(std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
const int num_typos, const size_t num_results,
@@ -102,6 +98,9 @@ private:
Topster<512> & topster, size_t & total_results, uint32_t** all_result_ids,
size_t & all_result_ids_len, const size_t & max_results, const bool prefix);
void insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const;
void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id,
const bool verbatim) const;
@@ -147,6 +146,11 @@ public:
Option<uint32_t> remove(const uint32_t seq_id, nlohmann::json & document);
static void populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
size_t result_index,
std::vector<std::vector<std::vector<uint16_t>>> &array_token_positions);
void score_results(const std::vector<sort_by> & sort_fields, const int & query_index, const uint32_t total_cost,
Topster<512> &topster, const std::vector<art_leaf *> & query_suggestion,
const uint32_t *result_ids, const size_t result_size) const;
@@ -162,6 +166,8 @@ public:
// strings under this length will be fully highlighted, instead of showing a snippet of the relevant portion
enum {SNIPPET_STR_ABOVE_LEN = 30};
enum {ARRAY_SEPARATOR = UINT16_MAX};
// Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
static constexpr const char* COLLECTION_META_PREFIX = "$CM";
static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
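(Reserving UINT16_MAX as the separator is safe because token offsets are stored as uint16_t: the maximum representable value serves as a sentinel rather than a real position, which presumably relates to the 64K token limit noted in the TODO above.)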

View File

@@ -35,7 +35,7 @@ struct Match {
uint16_t start_offset;
char offset_diffs[16];
Match() {
Match(): words_present(0), distance(0), start_offset(0) {
}
@@ -44,6 +44,14 @@ struct Match {
memcpy(offset_diffs, offset_diffs_stacked, 16);
}
// Construct a single match score from individual components (for multi-field sort)
inline uint64_t get_match_score(const uint32_t total_cost) const {
uint64_t match_score = ((int64_t)(words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(distance));
return match_score;
}
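As a quick illustration of the ordering this packing produces — a standalone sketch where pack_score is a hypothetical restatement of the expression above — matching more query words always outranks a cheaper (lower typo cost) match, and cost outranks distance:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical restatement of get_match_score(): words_present occupies the
// highest bits, then inverted typo cost, then distance in the lowest byte.
uint64_t pack_score(uint8_t words_present, uint8_t total_cost, uint8_t distance) {
    return ((uint64_t) words_present << 24) |
           ((uint64_t) (255 - total_cost) << 16) |
           ((uint64_t) distance);
}

int main() {
    // 3 words matched with 2 typos beats 2 words matched exactly...
    assert(pack_score(3, 2, 10) > pack_score(2, 0, 0));
    // ...and at equal words_present, the lower typo cost wins.
    assert(pack_score(2, 0, 5) > pack_score(2, 1, 0));
}
```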
static void print_token_offsets(std::vector<std::vector<uint16_t>> &token_offsets) {
for(auto offsets: token_offsets) {
for(auto offset: offsets) {
@@ -54,7 +62,8 @@
}
static inline void addTopOfHeapToWindow(TokenOffsetHeap &heap, std::queue<TokenOffset> &window,
std::vector<std::vector<uint16_t>> &token_offsets, uint16_t *token_offset) {
const std::vector<std::vector<uint16_t>> &token_offsets,
uint16_t *token_offset) {
TokenOffset top = heap.top();
heap.pop();
window.push(top);
@@ -90,7 +99,7 @@ struct Match {
* We use a priority queue to read the offset vectors in a sorted manner, slide a window of a given size, and
* compute the max_match and min_displacement of target tokens across the windows.
*/
static Match match(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
static Match match(uint32_t doc_id, const std::vector<std::vector<uint16_t>> &token_offsets) {
std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
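To make the windowing idea concrete, here is a simplified, self-contained sketch (assuming a window size of 10; the real match() also tracks the displacement between tokens, which this sketch omits):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <set>
#include <utility>
#include <vector>

// Count the most distinct query tokens that co-occur within a fixed window.
// token_offsets[token_id] holds the sorted positions of that token in the field.
size_t best_tokens_in_window(const std::vector<std::vector<uint16_t>> & token_offsets,
                             uint16_t window_size = 10) {
    std::vector<std::pair<uint16_t, size_t>> merged;  // (offset, token_id)
    for(size_t token_id = 0; token_id < token_offsets.size(); token_id++) {
        for(uint16_t offset : token_offsets[token_id]) {
            merged.emplace_back(offset, token_id);
        }
    }
    std::sort(merged.begin(), merged.end());

    size_t best = 0;
    for(size_t i = 0; i < merged.size(); i++) {
        std::set<size_t> tokens_in_window;
        for(size_t j = i; j < merged.size() &&
                          merged[j].first - merged[i].first < window_size; j++) {
            tokens_in_window.insert(merged[j].second);
        }
        best = std::max(best, tokens_in_window.size());
    }
    return best;
}

int main() {
    // "truth" at offsets {1, 9} and "about" at offset {2} fit in one window.
    std::cout << best_tokens_in_window({{1, 9}, {2}}) << std::endl;  // prints 2
}
```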

View File

@@ -545,14 +545,9 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
const std::string & field_name = search_fields[search_fields.size() - field_order_kv.first];
field search_field = search_schema.at(field_name);
// only string fields are supported for now
if(search_field.type == field_types::STRING) {
std::vector<std::string> tokens;
StringUtils::split(document[field_name], tokens, " ");
// positions in the document of each token in the query
std::vector<std::vector<uint16_t>> token_positions;
if(search_field.type == field_types::STRING || search_field.type == field_types::STRING_ARRAY) {
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
for (const art_leaf *token_leaf : searched_queries[field_order_kv.second.query_index]) {
std::vector<uint16_t> positions;
uint32_t doc_index = token_leaf->values->ids.indexOf(field_order_kv.second.key);
@@ -560,20 +555,42 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
continue;
}
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
token_leaf->values->offsets.getLength() :
token_leaf->values->offset_index.at(doc_index+1);
while(start_offset < end_offset) {
positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
start_offset++;
}
token_positions.push_back(positions);
uint32_t *indices = new uint32_t[1];
indices[0] = doc_index;
leaf_to_indices.emplace(token_leaf, indices);
}
Match match = Match::match(field_order_kv.second.key, token_positions);
// positions in the field of each token in the query
std::vector<std::vector<std::vector<uint16_t>>> array_token_positions;
Index::populate_token_positions(searched_queries[field_order_kv.second.query_index],
leaf_to_indices, 0, array_token_positions);
Match match;
uint64_t match_score = 0;
size_t matched_array_index = 0;
for(size_t array_index = 0; array_index < array_token_positions.size(); array_index++) {
const std::vector<std::vector<uint16_t>> & token_positions = array_token_positions[array_index];
if(token_positions.empty()) {
continue;
}
const Match & this_match = Match::match(field_order_kv.second.key, token_positions);
uint64_t this_match_score = this_match.get_match_score(1);
if(this_match_score > match_score) {
match_score = this_match_score;
match = this_match;
matched_array_index = array_index;
}
}
std::vector<std::string> tokens;
if(search_field.type == field_types::STRING) {
StringUtils::split(document[field_name], tokens, " ");
} else {
StringUtils::split(document[field_name][matched_array_index], tokens, " ");
}
// unpack `match.offset_diffs` into `token_indices`
std::vector<size_t> token_indices;
@@ -609,6 +626,11 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
wrapper_doc["highlight"] = nlohmann::json::object();
wrapper_doc["highlight"][field_name] = snippet_stream.str();
for (auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) {
delete [] it->second;
it->second = nullptr;
}
}
result["hits"].push_back(wrapper_doc);

View File

@@ -225,6 +225,7 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
token_to_offsets[text].push_back(0);
} else {
StringUtils::split(text, tokens, " ");
for(uint32_t i=0; i<tokens.size(); i++) {
auto & token = tokens[i];
string_utils.unicode_normalize(token);
@@ -232,6 +233,11 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
}
}
insert_doc(score, t, seq_id, token_to_offsets);
}
void Index::insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const {
for(auto & kv: token_to_offsets) {
art_document art_doc;
art_doc.id = seq_id;
@@ -263,9 +269,33 @@ void Index::index_string_field(const std::string & text, const uint32_t score, a
void Index::index_string_array_field(const std::vector<std::string> & strings, const uint32_t score, art_tree *t,
uint32_t seq_id, const bool verbatim) const {
for(const std::string & str: strings) {
index_string_field(str, score, t, seq_id, verbatim);
std::unordered_map<std::string, std::unordered_map<size_t, std::vector<uint32_t>>> token_array_positions;
for(size_t array_index = 0; array_index < strings.size(); array_index++) {
const std::string & str = strings[array_index];
std::vector<std::string> tokens;
StringUtils::split(str, tokens, " ");
for(uint32_t i=0; i<tokens.size(); i++) {
auto & token = tokens[i];
string_utils.unicode_normalize(token);
token_array_positions[token][array_index].push_back(i);
}
}
std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
for(const auto & kv: token_array_positions) {
for(size_t array_index = 0; array_index < strings.size(); array_index++) {
token_to_offsets[kv.first].insert(token_to_offsets[kv.first].end(),
token_array_positions[kv.first][array_index].begin(),
token_array_positions[kv.first][array_index].end());
token_to_offsets[kv.first].push_back(ARRAY_SEPARATOR);
}
}
insert_doc(score, t, seq_id, token_to_offsets);
}
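Walking through the flattening with the first document of the new test fixture below ({"tags": ["the truth", "about forever", "truth about"]}): the token "truth" appears at position 1 of element 0 and position 0 of element 2, so its stored offsets become 1, ARRAY_SEPARATOR, ARRAY_SEPARATOR, 0, ARRAY_SEPARATOR — one separator closing each of the three elements, including element 1 where the token never occurs.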
void Index::index_int32_array_field(const std::vector<int32_t> & values, const uint32_t score, art_tree *t,
@@ -850,9 +880,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
char empty_offset_diffs[16];
std::fill_n(empty_offset_diffs, 16, 0);
Match single_token_match = Match(1, 0, 0, empty_offset_diffs);
const uint64_t single_token_match_score = ((int64_t)(single_token_match.words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(single_token_match.distance));
const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost);
for(size_t i=0; i<result_size; i++) {
const uint32_t seq_id = result_ids[i];
@@ -862,14 +890,28 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
if(query_suggestion.size() == 1) {
match_score = single_token_match_score;
} else {
std::vector<std::vector<uint16_t>> token_positions;
populate_token_positions(query_suggestion, leaf_to_indices, i, token_positions);
const Match & match = Match::match(seq_id, token_positions);
std::vector<std::vector<std::vector<uint16_t>>> array_token_positions;
populate_token_positions(query_suggestion, leaf_to_indices, i, array_token_positions);
// Construct a single match score from individual components (for multi-field sort)
match_score = ((int64_t)(match.words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(match.distance));
for(const std::vector<std::vector<uint16_t>> & token_positions: array_token_positions) {
if(token_positions.size() == 0) {
continue;
}
const Match & match = Match::match(seq_id, token_positions);
uint64_t this_match_score = match.get_match_score(total_cost);
if(this_match_score > match_score) {
match_score = this_match_score;
}
/*std::ostringstream os;
os << name << ", total_cost: " << (255 - total_cost)
<< ", words_present: " << match.words_present
<< ", match_score: " << match_score
<< ", match.distance: " << match.distance
<< ", seq_id: " << seq_id << std::endl;
std::cout << os.str();*/
}
}
const int64_t default_score = 0;
@@ -889,15 +931,6 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
const number_t & primary_rank_value = primary_rank_score * primary_rank_factor;
const number_t & secondary_rank_value = secondary_rank_score * secondary_rank_factor;
topster.add(seq_id, query_index, match_score, primary_rank_value, secondary_rank_value);
/*
std::ostringstream os;
os << name << ", total_cost: " << (255 - total_cost)
<< ", words_present: " << match.words_present << ", match_score: " << match_score
<< ", match.distance: " << match.distance
<< ", seq_id: " << seq_id << std::endl;
LOG(INFO) << os.str();
*/
}
//long long int timeNanos = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
@@ -910,28 +943,82 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
}
void Index::populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
size_t result_index, std::vector<std::vector<uint16_t>> &token_positions) const {
// for each token in the query, find the positions that it appears in this document
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
size_t result_index,
std::vector<std::vector<std::vector<uint16_t>>> &array_token_positions) {
// array_token_positions:
// for every element in a potential array, for every token in query suggestion, get the positions
// first let's ascertain the size of the array
size_t array_size = 0;
for (const art_leaf *token_leaf : query_suggestion) {
uint32_t doc_index = leaf_to_indices.at(token_leaf)[result_index];
if(doc_index == token_leaf->values->ids.getLength()) {
continue;
}
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
token_leaf->values->offsets.getLength() :
token_leaf->values->offset_index.at(doc_index+1);
while(start_offset < end_offset) {
uint16_t pos = (uint16_t) token_leaf->values->offsets.at(start_offset);
if(pos == ARRAY_SEPARATOR) {
array_size++;
}
start_offset++;
}
if(array_size == 0) {
// for plain string fields that don't use an ARRAY_SEPARATOR
array_size = 1;
}
break;
}
// initialize array_token_positions
array_token_positions = std::vector<std::vector<std::vector<uint16_t>>>(array_size);
// for each token in the query, find the positions that it appears in the array
for (const art_leaf *token_leaf : query_suggestion) {
uint32_t doc_index = leaf_to_indices.at(token_leaf)[result_index];
if(doc_index == token_leaf->values->ids.getLength()) {
continue;
}
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
token_leaf->values->offsets.getLength() :
token_leaf->values->offset_index.at(doc_index+1);
size_t array_index = 0;
std::vector<uint16_t> positions;
while(start_offset < end_offset) {
uint16_t pos = (uint16_t) token_leaf->values->offsets.at(start_offset);
start_offset++;
if(pos == ARRAY_SEPARATOR) {
if(positions.size() != 0) {
array_token_positions[array_index].push_back(positions);
positions.clear();
}
array_index++;
continue;
}
positions.push_back(pos);
}
if(positions.size() != 0) {
// for plain string fields that don't use an ARRAY_SEPARATOR
array_token_positions[array_index].push_back(positions);
}
}
}
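Continuing the worked example from index_string_array_field: decoding 1, ARRAY_SEPARATOR, ARRAY_SEPARATOR, 0, ARRAY_SEPARATOR for the token "truth" first counts three separators (array_size = 3), then yields array_token_positions[0] = {{1}}, array_token_positions[1] = {} (no occurrence), and array_token_positions[2] = {{0}}, so every array element can be scored by Match::match independently.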
inline std::vector<art_leaf *> Index::next_suggestion(const std::vector<token_candidates> &token_candidates_vec,

View File

@@ -0,0 +1,3 @@
{"title": "The Truth About Forever", "tags": ["the truth", "about forever", "truth about"], "points": 100}
{"title": "Plain Truth", "tags": ["plain", "truth", "plain truth"], "points": 40}
{"title": "Temple of the Winds", "tags": ["temple", "of", "temple of"], "points": 87}

View File

@@ -487,6 +487,82 @@ TEST_F(CollectionTest, PrefixSearching) {
ASSERT_EQ("16", results["hits"].at(0)["document"]["id"]);
}
TEST_F(CollectionTest, ArrayStringFieldHighlight) {
Collection *coll_array_text;
std::ifstream infile(std::string(ROOT_DIR) + "test/array_text_documents.jsonl");
std::vector<field> fields = {
field("title", field_types::STRING, false),
field("tags", field_types::STRING_ARRAY, false),
field("points", field_types::INT32, false)
};
coll_array_text = collectionManager.get_collection("coll_array_text");
if (coll_array_text == nullptr) {
coll_array_text = collectionManager.create_collection("coll_array_text", fields, "points").get();
}
std::string json_line;
while (std::getline(infile, json_line)) {
coll_array_text->add(json_line);
}
infile.close();
query_fields = {"tags"};
std::vector<std::string> facets;
nlohmann::json results = coll_array_text->search("truth about", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, 0).get();
ASSERT_EQ(1, results["hits"].size());
std::vector<std::string> ids = {"0"};
for (size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
std::string result_id = result["document"]["id"];
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
ASSERT_STREQ(results["hits"][0]["highlight"]["tags"].get<std::string>().c_str(), "<mark>truth</mark> <mark>about</mark>");
results = coll_array_text->search("forever truth", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, 0).get();
ASSERT_EQ(1, results["hits"].size());
ids = {"0"};
for (size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
std::string result_id = result["document"]["id"];
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
ASSERT_STREQ(results["hits"][0]["highlight"]["tags"].get<std::string>().c_str(), "the <mark>truth</mark>");
results = coll_array_text->search("truth", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, 0).get();
ASSERT_EQ(2, results["hits"].size());
ids = {"0", "1"};
for (size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
std::string result_id = result["document"]["id"];
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = coll_array_text->search("asdadasd", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
false, 0).get();
ASSERT_EQ(0, results["hits"].size());
collectionManager.drop_collection("coll_array_text");
}
TEST_F(CollectionTest, MultipleFields) {
Collection *coll_mul_fields;