Track best-matched token offsets needed for highlighting.

- We store the best-matched token offset positions in the Topster KV entries
- The offset diffs are packed compactly via a union, each offset stored as a small delta from the first (see the sketch below)
Kishore Nallan 2017-06-09 13:22:24 -05:00
parent 24711d3c5c
commit 1d5146f7ff
7 changed files with 106 additions and 60 deletions
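
To make the packing concrete before the diffs, here is a minimal standalone sketch (assumptions: it reuses the TokenOffsetDiffs union and the delta loop that the match_score.h changes below introduce; the main driver and printed output are illustrative only):

```cpp
#include <cstdint>
#include <cstdio>

// Mirror of the union added in match_score.h: deltas are written byte-wise,
// while `packed` offers a scalar view that can be copied into a Topster KV.
union TokenOffsetDiffs {
    int16_t packed;
    char bytes[16];
};

int main() {
    // best-matched window: the query tokens sit at offsets 567, 568 and 570
    uint16_t min_token_offset[3] = {567, 568, 570};

    uint16_t start_offset = min_token_offset[0];   // only this offset is kept in full
    TokenOffsetDiffs offset_diffs{};
    for(size_t i = 1; i < 3; i++) {
        // every other offset shrinks to a one-byte delta from the first
        offset_diffs.bytes[i - 1] = (char)(min_token_offset[i] - min_token_offset[0]);
    }

    // highlighting can later rebuild 567, 567 + 1 = 568 and 567 + 3 = 570
    printf("start=%u diffs=%d,%d\n", (unsigned) start_offset,
           (int) offset_diffs.bytes[0], (int) offset_diffs.bytes[1]);
    return 0;
}
```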


@@ -69,7 +69,8 @@ add_executable(typesense-server ${SRC_FILES} src/main/typesense_server.cpp)
 add_executable(search ${SRC_FILES} src/main/main.cpp)
 add_executable(benchmark ${SRC_FILES} src/main/benchmark.cpp)
 add_executable(typesense_test ${SRC_FILES} test/array_test.cpp test/sorted_array_test.cpp test/art_test.cpp
-                test/collection_test.cpp test/collection_manager_test.cpp test/topster_test.cpp)
+                test/collection_test.cpp test/collection_manager_test.cpp
+                test/topster_test.cpp test/match_score_test.cpp)
 target_compile_definitions(typesense-server PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
 target_compile_definitions(search PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")


@@ -149,7 +149,7 @@ public:
     Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id);

-    enum {MAX_SEARCH_TOKENS = 20};
+    enum {MAX_SEARCH_TOKENS = 10};
     enum {MAX_RESULTS = 100};

     // Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store


@@ -14,8 +14,12 @@
 #define TokenOffsetHeap std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset>

-struct MatchScore {
-    struct TokenOffset {
+union TokenOffsetDiffs {
+    int16_t packed;
+    char bytes[16];
+};
+
+struct TokenOffset {
     uint8_t token_id;          // token identifier
     uint16_t offset;           // token's offset in the text
     uint16_t offset_index;     // index of the offset in the vector
@@ -23,7 +27,13 @@ struct MatchScore {
     bool operator() (const TokenOffset& a, const TokenOffset& b) {
         return a.offset > b.offset;
     }
-    };
+};
+
+struct MatchScore {
+    uint16_t words_present;
+    uint16_t distance;
+    uint16_t start_offset;
+    int16_t offset_diffs_packed;

     static void print_token_offsets(std::vector<std::vector<uint16_t>> &token_offsets) {
         for(auto offsets: token_offsets) {
@@ -48,8 +58,12 @@ struct MatchScore {
         }
     }

-    uint16_t words_present;
-    uint16_t distance;
+    static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
+                                   TokenOffsetDiffs & offset_diffs) {
+        for(size_t i = 1; i < num_tokens; i++) {
+            offset_diffs.bytes[i-1] = (char)(min_token_offset[i] - min_token_offset[0]);
+        }
+    }

     /*
     * Given *sorted offsets* of each target token in a *single* document, generates a score that indicates:
@@ -60,8 +74,8 @@ struct MatchScore {
     * compute the max_match and min_displacement of target tokens across the windows.
     */
     static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
-        const size_t WINDOW_SIZE = Collection::MAX_SEARCH_TOKENS;
-        const uint16_t MAX_DISPLACEMENT = Collection::MAX_SEARCH_TOKENS;
+        const size_t WINDOW_SIZE = 10;
+        const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();

         std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
@@ -75,8 +89,11 @@ struct MatchScore {
         uint16_t min_displacement = MAX_DISPLACEMENT;

         std::queue<TokenOffset> window;
-        uint16_t token_offset[Collection::MAX_SEARCH_TOKENS] = { };
-        std::fill_n(token_offset, Collection::MAX_SEARCH_TOKENS, MAX_DISPLACEMENT);
+        uint16_t token_offset[WINDOW_SIZE] = { };
+        std::fill_n(token_offset, WINDOW_SIZE, MAX_DISPLACEMENT);
+
+        // used to store token offsets of the best-matched window
+        uint16_t min_token_offset[WINDOW_SIZE];

         do {
             if(window.empty()) {
@@ -121,6 +138,8 @@ struct MatchScore {
                     max_match = num_match;
                     if(displacement != 0 && displacement < min_displacement) {
                         min_displacement = displacement;
+                        // record the token positions (for highlighting)
+                        memcpy(min_token_offset, token_offset, token_offsets.size()*sizeof(uint16_t));
                     }
                 }
@@ -129,6 +148,11 @@ struct MatchScore {
             window.pop();
         } while(!heap.empty());

-        return MatchScore{max_match, min_displacement};
+        // do run-length encoding of the min token positions/offsets
+        TokenOffsetDiffs offset_diffs;
+        uint16_t start_offset = min_token_offset[0];
+        pack_token_offsets(min_token_offset, token_offsets.size(), offset_diffs);
+
+        return MatchScore{max_match, min_displacement, start_offset, offset_diffs.packed};
     }
 };
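
For orientation, the windowing idea in match_score is easier to see stripped of the heap bookkeeping. Below is a simplified, self-contained sketch (an illustrative rewrite under assumptions, not the implementation above: it brute-forces candidate windows instead of merging the per-token offset lists through a priority queue, and it omits the min_token_offset tracking that feeds highlighting):

```cpp
#include <cstdint>
#include <cstdio>
#include <algorithm>
#include <utility>
#include <vector>

struct SimpleMatch {
    uint16_t words_present;   // most distinct query tokens covered by one window
    uint16_t displacement;    // spread of the best such window
};

// Brute-force stand-in for MatchScore::match_score: merge every (offset, token)
// pair, then scan windows for maximum token coverage and minimum spread.
static SimpleMatch simple_match(const std::vector<std::vector<uint16_t>> &token_offsets) {
    std::vector<std::pair<uint16_t, size_t>> merged;   // (offset, token_id)
    for(size_t t = 0; t < token_offsets.size(); t++) {
        for(uint16_t off : token_offsets[t]) {
            merged.push_back({off, t});
        }
    }
    std::sort(merged.begin(), merged.end());           // scan in offset order

    uint16_t max_match = 0;
    uint16_t min_displacement = UINT16_MAX;

    for(size_t i = 0; i < merged.size(); i++) {
        std::vector<int> counts(token_offsets.size(), 0);
        uint16_t num_match = 0;
        for(size_t j = i; j < merged.size(); j++) {
            if(counts[merged[j].second]++ == 0) {
                num_match++;                           // first sighting of this token
            }
            uint16_t displacement = merged[j].first - merged[i].first;
            // prefer covering more tokens; break ties on the tighter window
            if(num_match > max_match ||
               (num_match == max_match && displacement < min_displacement)) {
                max_match = num_match;
                min_displacement = displacement;
            }
            if(num_match == token_offsets.size()) {
                break;                                 // this start covers every token
            }
        }
    }

    return SimpleMatch{max_match, min_displacement};
}

int main() {
    // token A at offsets {1, 8}, token B at {2, 10}: offsets 1-2 cover both
    // tokens with a spread of 1, so that window should win.
    std::vector<std::vector<uint16_t>> token_offsets = {{1, 8}, {2, 10}};
    SimpleMatch m = simple_match(token_offsets);
    printf("words_present=%u displacement=%u\n",
           (unsigned) m.words_present, (unsigned) m.displacement);
    return 0;
}
```

The committed version achieves a similar effect in a single bounded pass over a heap-merged stream, which is also what lets it remember the winning window's token offsets for packing.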


@@ -5,6 +5,7 @@
 #include <cstdio>
 #include <algorithm>
 #include <sparsepp.h>
+#include <match_score.h>

 /*
 * Remembers the max-K elements seen so far using a min-heap
@@ -12,6 +13,8 @@
 template <size_t MAX_SIZE=100>
 struct Topster {
     struct KV {
+        uint16_t start_offset;
+        TokenOffsetDiffs offset_diffs;
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
@@ -32,7 +35,8 @@ struct Topster {
         b = c;
     }

-    void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr, const int64_t &secondary_attr){
+    void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr,
+             const int64_t &secondary_attr, const uint16_t &start_offset, const int16_t &offset_diffs_packed){
         if (size >= MAX_SIZE) {
             if(!is_greater(data[0], match_score, primary_attr, secondary_attr)) {
                 // when incoming value is less than the smallest in the heap, ignore
@@ -51,6 +55,8 @@ struct Topster {
             data[0].match_score = match_score;
             data[0].primary_attr = primary_attr;
             data[0].secondary_attr = secondary_attr;
+            data[0].start_offset = start_offset;
+            data[0].offset_diffs.packed = offset_diffs_packed;

             uint32_t i = 0;
             // sift to maintain heap property
@@ -80,6 +86,8 @@ struct Topster {
         data[size].match_score = match_score;
         data[size].primary_attr = primary_attr;
         data[size].secondary_attr = secondary_attr;
+        data[size].start_offset = start_offset;
+        data[size].offset_diffs.packed = offset_diffs_packed;
         size++;

         for (uint32_t i = size - 1; i > 0;) {
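
A usage sketch of the widened add() signature (assumptions: the keys, scores, attributes, and offsets are made-up values, and Topster<3> is an arbitrarily small capacity chosen for illustration):

```cpp
#include "topster.h"   // pulls in match_score.h for TokenOffsetDiffs

int main() {
    Topster<3> topster;

    // Highlight info for a hit whose best window starts at offset 567,
    // with the remaining tokens 1 and 3 positions after the first.
    TokenOffsetDiffs offset_diffs{};
    offset_diffs.bytes[0] = 1;
    offset_diffs.bytes[1] = 3;

    //          key, match_score, primary, secondary, start_offset, offset_diffs_packed
    topster.add(101, 25,          20,      30,        567,          offset_diffs.packed);
    topster.add(102, 40,          20,      30,        12,           0);

    topster.sort();
    return 0;
}
```

One quirk worth noting in this revision: bytes is 16 wide, but only the two bytes aliased by the int16_t packed view travel through add() into the KV entry, so diffs beyond the second token would not survive the trip.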


@@ -862,13 +862,10 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
     const int max_candidate_rank = 250;
     spp::sparse_hash_map<art_leaf*, uint32_t*> leaf_to_indices;

-    if(query_suggestion.size() != 1) {
-        // won't be needing positional ranking when there is only 1 token in the query
-        for (art_leaf *token_leaf : query_suggestion) {
-            uint32_t *indices = new uint32_t[result_size];
-            token_leaf->values->ids.indexOf(result_ids, result_size, indices);
-            leaf_to_indices.emplace(token_leaf, indices);
-        }
-    }
+    for (art_leaf *token_leaf : query_suggestion) {
+        uint32_t *indices = new uint32_t[result_size];
+        token_leaf->values->ids.indexOf(result_ids, result_size, indices);
+        leaf_to_indices.emplace(token_leaf, indices);
+    }

     spp::sparse_hash_map<uint32_t, int64_t> * primary_rank_scores = nullptr;
@@ -897,35 +894,29 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
         uint32_t seq_id = result_ids[i];
         std::vector<std::vector<uint16_t>> token_positions;

-        MatchScore mscore;
-
-        if(query_suggestion.size() == 1) {
-            mscore = MatchScore{1, 1};
-        } else {
-            // for each token in the query, find the positions that it appears in this document
-            for (art_leaf *token_leaf : query_suggestion) {
-                std::vector<uint16_t> positions;
-                int doc_index = leaf_to_indices.at(token_leaf)[i];
-                if(doc_index == token_leaf->values->ids.getLength()) {
-                    continue;
-                }
-                uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
-                uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
-                                      token_leaf->values->offsets.getLength() :
-                                      token_leaf->values->offset_index.at(doc_index+1);
-                while(start_offset < end_offset) {
-                    positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
-                    start_offset++;
-                }
-                token_positions.push_back(positions);
-            }
-
-            mscore = MatchScore::match_score(seq_id, token_positions);
+        // for each token in the query, find the positions that it appears in this document
+        for (art_leaf *token_leaf : query_suggestion) {
+            std::vector<uint16_t> positions;
+            int doc_index = leaf_to_indices.at(token_leaf)[i];
+            if(doc_index == token_leaf->values->ids.getLength()) {
+                continue;
+            }
+            uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
+            uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
+                                  token_leaf->values->offsets.getLength() :
+                                  token_leaf->values->offset_index.at(doc_index+1);
+            while(start_offset < end_offset) {
+                positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
+                start_offset++;
+            }
+            token_positions.push_back(positions);
         }

+        MatchScore mscore = MatchScore::match_score(seq_id, token_positions);
+
         int candidate_rank_score = max_candidate_rank - candidate_rank;

         // Construct a single match_score from individual components (for multi-field sort)
@@ -938,11 +929,12 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
         int64_t secondary_rank_score = (secondary_rank_scores && secondary_rank_scores->count(seq_id) > 0) ?
                                        secondary_rank_scores->at(seq_id) : 0;

         topster.add(seq_id, match_score,
-                    primary_rank_factor * primary_rank_score,
-                    secondary_rank_factor * secondary_rank_score);
+                    primary_rank_factor * primary_rank_score, secondary_rank_factor * secondary_rank_score,
+                    mscore.start_offset, mscore.offset_diffs_packed);

-        /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", match_score: "
-                   << match_score << ", primary_rank_score: " << primary_rank_score << ", seq_id: " << seq_id << std::endl;*/
+        /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", words_present: " << mscore.words_present
+                   << ", match_score: " << match_score << ", primary_rank_score: " << primary_rank_score
+                   << ", seq_id: " << seq_id << std::endl;*/
     }

     for (auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) {
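
The commit only writes the packed form; the highlighting pass that consumes it is outside this diff. A decoder would presumably invert pack_token_offsets along these lines (the unpack_token_offsets name and signature are hypothetical, invented for illustration):

```cpp
#include <cstdint>
#include <vector>

// Same union as the one match_score.h introduces above.
union TokenOffsetDiffs {
    int16_t packed;
    char bytes[16];
};

// Hypothetical helper, not part of this commit: rebuild absolute token
// offsets from a KV entry's start_offset plus the packed one-byte deltas.
std::vector<uint16_t> unpack_token_offsets(uint16_t start_offset,
                                           const TokenOffsetDiffs &offset_diffs,
                                           size_t num_tokens) {
    std::vector<uint16_t> offsets;
    offsets.push_back(start_offset);   // the first offset is stored verbatim
    for(size_t i = 1; i < num_tokens; i++) {
        offsets.push_back((uint16_t)(start_offset + offset_diffs.bytes[i - 1]));
    }
    return offsets;
}
```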

test/match_score_test.cpp (new file)

@@ -0,0 +1,17 @@
+#include <gtest/gtest.h>
+#include <match_score.h>
+
+TEST(MatchScoreTest, ShouldPackTokenOffsets) {
+    uint16_t min_token_offset1[3] = {567, 568, 570};
+    TokenOffsetDiffs offset_diffs;
+    MatchScore::pack_token_offsets(min_token_offset1, 3, offset_diffs);
+
+    ASSERT_EQ(1, offset_diffs.bytes[0]);
+    ASSERT_EQ(3, offset_diffs.bytes[1]);
+
+    uint16_t min_token_offset2[3] = {0, 1, 2};
+    MatchScore::pack_token_offsets(min_token_offset2, 3, offset_diffs);
+
+    ASSERT_EQ(1, offset_diffs.bytes[0]);
+    ASSERT_EQ(2, offset_diffs.bytes[1]);
+}
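
The expected bytes follow directly from the delta scheme: for {567, 568, 570} the diffs from the first offset are 568 - 567 = 1 and 570 - 567 = 3; for {0, 1, 2} they are 1 and 2.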


@@ -1,29 +1,33 @@
 #include <gtest/gtest.h>
 #include "topster.h"
+#include "match_score.h"

 TEST(TopsterTest, StoreMaxValuesWithoutRepetition) {
     Topster<5> topster;

     struct {
+        uint16_t start_offset;
+        TokenOffsetDiffs offset_diffs;
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
         int64_t secondary_attr;
     } data[10] = {
-        {1, 10, 20, 30},
-        {2, 4, 20, 30},
-        {3, 7, 20, 30},
-        {4, 11, 20, 30},
-        {5, 9, 20, 30},
-        {6, 6, 20, 30},
-        {7, 6, 22, 30},
-        {8, 9, 20, 30},
-        {9, 8, 20, 30},
-        {10, 5, 20, 30},
+        {10, {.packed = 10 }, 1, 10, 20, 30},
+        {0, {.packed = 10 }, 2, 4, 20, 30},
+        {2, {.packed = 10 }, 3, 7, 20, 30},
+        {11, {.packed = 10 }, 4, 11, 20, 30},
+        {78, {.packed = 10 }, 5, 9, 20, 30},
+        {246, {.packed = 10 }, 6, 6, 20, 30},
+        {0, {.packed = 10 }, 7, 6, 22, 30},
+        {20, {.packed = 10 }, 8, 9, 20, 30},
+        {22, {.packed = 10 }, 9, 8, 20, 30},
+        {77, {.packed = 10 }, 10, 5, 20, 30},
     };

     for(int i = 0; i < 10; i++) {
-        topster.add(data[i].key, data[i].match_score, data[i].primary_attr, data[i].secondary_attr);
+        topster.add(data[i].key, data[i].match_score, data[i].primary_attr, data[i].secondary_attr,
+                    data[i].start_offset, data[i].offset_diffs.packed);
     }

     topster.sort();