Mirror of https://github.com/typesense/typesense.git
Track best-matched token offsets needed for highlighting.
- We store the best matched token offset positions in Topster KV
- Using run-length encoding (via unions) to pack the offset diffs intelligently
parent 24711d3c5c
commit 1d5146f7ff
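Before the per-file hunks, here is a minimal standalone sketch of the packing scheme this commit introduces (identifiers mirror the diff below; the sample values are illustrative, not from the commit). Only the first matched offset travels as start_offset, and each later token is reduced to a one-byte delta from it, which the union exposes back as a single int16_t for compact storage in the Topster KV. Two caveats: strictly speaking this is delta encoding rather than run-length encoding, and reading packed after writing bytes relies on union type-punning, which compilers accept in practice but C++ technically leaves undefined.

#include <cstdint>
#include <cstdio>

// Mirrors the union added in match_score.h below.
union TokenOffsetDiffs {
    int16_t packed;   // the two bytes that actually travel inside Topster's KV
    char bytes[16];   // per-token deltas from the first matched offset
};

int main() {
    // offsets of 3 query tokens in the best-matched window (sample values)
    uint16_t min_token_offset[3] = {567, 568, 570};

    TokenOffsetDiffs diffs;
    for (int i = 1; i < 3; i++) {
        // each later token becomes a byte-sized delta from token 0
        diffs.bytes[i - 1] = (char)(min_token_offset[i] - min_token_offset[0]);
    }

    // highlighting later only needs min_token_offset[0] plus these deltas
    printf("deltas: %d %d\n", diffs.bytes[0], diffs.bytes[1]);  // prints: deltas: 1 3
    return 0;
}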
CMakeLists.txt
@@ -69,7 +69,8 @@ add_executable(typesense-server ${SRC_FILES} src/main/typesense_server.cpp)
 add_executable(search ${SRC_FILES} src/main/main.cpp)
 add_executable(benchmark ${SRC_FILES} src/main/benchmark.cpp)
 add_executable(typesense_test ${SRC_FILES} test/array_test.cpp test/sorted_array_test.cpp test/art_test.cpp
-        test/collection_test.cpp test/collection_manager_test.cpp test/topster_test.cpp)
+        test/collection_test.cpp test/collection_manager_test.cpp
+        test/topster_test.cpp test/match_score_test.cpp)
 
 target_compile_definitions(typesense-server PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
 target_compile_definitions(search PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/")
include/collection.h
@@ -149,7 +149,7 @@ public:
 
     Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id);
 
-    enum {MAX_SEARCH_TOKENS = 20};
+    enum {MAX_SEARCH_TOKENS = 10};
     enum {MAX_RESULTS = 100};
 
     // Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
include/match_score.h
@@ -14,8 +14,12 @@
 
 #define TokenOffsetHeap std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset>
 
-struct MatchScore {
-  struct TokenOffset {
+union TokenOffsetDiffs {
+    int16_t packed;
+    char bytes[16];
+};
+
+struct TokenOffset {
     uint8_t token_id;         // token identifier
     uint16_t offset;          // token's offset in the text
     uint16_t offset_index;    // index of the offset in the vector
@@ -23,7 +27,13 @@ struct MatchScore {
     bool operator() (const TokenOffset& a, const TokenOffset& b) {
         return a.offset > b.offset;
     }
-  };
+};
+
+struct MatchScore {
+    uint16_t words_present;
+    uint16_t distance;
+    uint16_t start_offset;
+    int16_t offset_diffs_packed;
 
     static void print_token_offsets(std::vector<std::vector<uint16_t>> &token_offsets) {
         for(auto offsets: token_offsets) {
@@ -48,8 +58,12 @@ struct MatchScore {
         }
     }
 
-    uint16_t words_present;
-    uint16_t distance;
+    static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens,
+                                   TokenOffsetDiffs & offset_diffs) {
+        for(size_t i = 1; i < num_tokens; i++) {
+            offset_diffs.bytes[i-1] = (char)(min_token_offset[i] - min_token_offset[0]);
+        }
+    }
 
     /*
     * Given *sorted offsets* of each target token in a *single* document, generates a score that indicates:
@@ -60,8 +74,8 @@ struct MatchScore {
     * compute the max_match and min_displacement of target tokens across the windows.
     */
    static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
-        const size_t WINDOW_SIZE = Collection::MAX_SEARCH_TOKENS;
-        const uint16_t MAX_DISPLACEMENT = Collection::MAX_SEARCH_TOKENS;
+        const size_t WINDOW_SIZE = 10;
+        const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();
 
        std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
 
@@ -75,8 +89,11 @@ struct MatchScore {
        uint16_t min_displacement = MAX_DISPLACEMENT;
 
        std::queue<TokenOffset> window;
-        uint16_t token_offset[Collection::MAX_SEARCH_TOKENS] = { };
-        std::fill_n(token_offset, Collection::MAX_SEARCH_TOKENS, MAX_DISPLACEMENT);
+        uint16_t token_offset[WINDOW_SIZE] = { };
+        std::fill_n(token_offset, WINDOW_SIZE, MAX_DISPLACEMENT);
+
+        // used to store token offsets of the best-matched window
+        uint16_t min_token_offset[WINDOW_SIZE];
 
        do {
            if(window.empty()) {
@@ -121,6 +138,8 @@ struct MatchScore {
                max_match = num_match;
                if(displacement != 0 && displacement < min_displacement) {
                    min_displacement = displacement;
+                    // record the token positions (for highlighting)
+                    memcpy(min_token_offset, token_offset, token_offsets.size()*sizeof(uint16_t));
                }
            }
 
@@ -129,6 +148,11 @@ struct MatchScore {
            window.pop();
        } while(!heap.empty());
 
-        return MatchScore{max_match, min_displacement};
+        // do run-length encoding of the min token positions/offsets
+        TokenOffsetDiffs offset_diffs;
+        uint16_t start_offset = min_token_offset[0];
+        pack_token_offsets(min_token_offset, token_offsets.size(), offset_diffs);
+
+        return MatchScore{max_match, min_displacement, start_offset, offset_diffs.packed};
    }
};
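Two things are worth noting in the hunks above: WINDOW_SIZE and MAX_DISPLACEMENT no longer reference Collection::MAX_SEARCH_TOKENS, which decouples this header from collection.h (whose constant drops from 20 to 10 in the earlier hunk), and match_score() now returns the best window's start_offset plus packed deltas. A hypothetical caller, not part of the commit, could recover absolute positions for highlighting as sketched below. Note that only the first two delta bytes survive the round trip through the int16_t packed member, so this fully reconstructs at most three token positions.

#include <cstdint>
#include <vector>
#include <match_score.h>  // the header as patched above

int main() {
    // sorted positions of each query token within one document (sample data)
    std::vector<std::vector<uint16_t>> token_offsets = {
        {10, 41, 80},   // positions of token 0
        {11, 97}        // positions of token 1
    };

    MatchScore ms = MatchScore::match_score(0 /*doc_id*/, token_offsets);

    // re-derive absolute positions of the best-matched window from the start
    // offset plus packed deltas; for this input the adjacent pair 10/11 should win
    TokenOffsetDiffs diffs;
    diffs.packed = ms.offset_diffs_packed;
    uint16_t token0_pos = ms.start_offset;                   // expected: 10
    uint16_t token1_pos = ms.start_offset + diffs.bytes[0];  // expected: 10 + 1 == 11
    (void) token0_pos; (void) token1_pos;
    return 0;
}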
include/topster.h
@@ -5,6 +5,7 @@
 #include <cstdio>
 #include <algorithm>
 #include <sparsepp.h>
+#include <match_score.h>
 
 /*
 * Remembers the max-K elements seen so far using a min-heap
@@ -12,6 +13,8 @@
 template <size_t MAX_SIZE=100>
 struct Topster {
     struct KV {
+        uint16_t start_offset;
+        TokenOffsetDiffs offset_diffs;
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
@@ -32,7 +35,8 @@ struct Topster {
         b = c;
     }
 
-    void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr, const int64_t &secondary_attr){
+    void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr,
+             const int64_t &secondary_attr, const uint16_t &start_offset, const int16_t &offset_diffs_packed){
         if (size >= MAX_SIZE) {
             if(!is_greater(data[0], match_score, primary_attr, secondary_attr)) {
                 // when incoming value is less than the smallest in the heap, ignore
@@ -51,6 +55,8 @@ struct Topster {
             data[0].match_score = match_score;
             data[0].primary_attr = primary_attr;
             data[0].secondary_attr = secondary_attr;
+            data[0].start_offset = start_offset;
+            data[0].offset_diffs.packed = offset_diffs_packed;
             uint32_t i = 0;
 
             // sift to maintain heap property
@@ -80,6 +86,8 @@ struct Topster {
         data[size].match_score = match_score;
         data[size].primary_attr = primary_attr;
         data[size].secondary_attr = secondary_attr;
+        data[size].start_offset = start_offset;
+        data[size].offset_diffs.packed = offset_diffs_packed;
         size++;
 
         for (uint32_t i = size - 1; i > 0;) {
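A short sketch of the widened add() call, matching the updated test at the bottom of this page. The values are illustrative only, and whether 0x0201 maps to byte deltas {1, 2} depends on the machine being little-endian.

#include "topster.h"   // as patched above; pulls in match_score.h for TokenOffsetDiffs

int main() {
    Topster<100> topster;

    // key, match_score, primary/secondary sort attrs, plus the two new
    // highlighting fields that now ride along in each KV entry
    uint16_t start_offset = 42;
    int16_t offset_diffs_packed = 0x0201;   // bytes {1, 2} on a little-endian machine

    topster.add(7 /*seq_id*/, 10 /*match_score*/, 20, 30,
                start_offset, offset_diffs_packed);
    topster.sort();
    return 0;
}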
src/collection.cpp
@@ -862,13 +862,10 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
     const int max_candidate_rank = 250;
     spp::sparse_hash_map<art_leaf*, uint32_t*> leaf_to_indices;
 
-    if(query_suggestion.size() != 1) {
-        // won't be needing positional ranking when there is only 1 token in the query
-        for (art_leaf *token_leaf : query_suggestion) {
-            uint32_t *indices = new uint32_t[result_size];
-            token_leaf->values->ids.indexOf(result_ids, result_size, indices);
-            leaf_to_indices.emplace(token_leaf, indices);
-        }
+    for (art_leaf *token_leaf : query_suggestion) {
+        uint32_t *indices = new uint32_t[result_size];
+        token_leaf->values->ids.indexOf(result_ids, result_size, indices);
+        leaf_to_indices.emplace(token_leaf, indices);
     }
 
     spp::sparse_hash_map<uint32_t, int64_t> * primary_rank_scores = nullptr;
@@ -897,35 +894,29 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
         uint32_t seq_id = result_ids[i];
         std::vector<std::vector<uint16_t>> token_positions;
 
-        MatchScore mscore;
-
-        if(query_suggestion.size() == 1) {
-            mscore = MatchScore{1, 1};
-        } else {
-            // for each token in the query, find the positions that it appears in this document
-            for (art_leaf *token_leaf : query_suggestion) {
-                std::vector<uint16_t> positions;
-                int doc_index = leaf_to_indices.at(token_leaf)[i];
-                if(doc_index == token_leaf->values->ids.getLength()) {
-                    continue;
-                }
-
-                uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
-                uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
-                                      token_leaf->values->offsets.getLength() :
-                                      token_leaf->values->offset_index.at(doc_index+1);
-
-                while(start_offset < end_offset) {
-                    positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
-                    start_offset++;
-                }
-
-                token_positions.push_back(positions);
-            }
-
-            mscore = MatchScore::match_score(seq_id, token_positions);
-        }
+        // for each token in the query, find the positions that it appears in this document
+        for (art_leaf *token_leaf : query_suggestion) {
+            std::vector<uint16_t> positions;
+            int doc_index = leaf_to_indices.at(token_leaf)[i];
+            if(doc_index == token_leaf->values->ids.getLength()) {
+                continue;
+            }
+
+            uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
+            uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
+                                  token_leaf->values->offsets.getLength() :
+                                  token_leaf->values->offset_index.at(doc_index+1);
+
+            while(start_offset < end_offset) {
+                positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
+                start_offset++;
+            }
+
+            token_positions.push_back(positions);
+        }
+
+        MatchScore mscore = MatchScore::match_score(seq_id, token_positions);
 
         int candidate_rank_score = max_candidate_rank - candidate_rank;
 
         // Construct a single match_score from individual components (for multi-field sort)
@@ -938,11 +929,12 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
         int64_t secondary_rank_score = (secondary_rank_scores && secondary_rank_scores->count(seq_id) > 0) ?
                                        secondary_rank_scores->at(seq_id) : 0;
         topster.add(seq_id, match_score,
-                    primary_rank_factor * primary_rank_score,
-                    secondary_rank_factor * secondary_rank_score);
+                    primary_rank_factor * primary_rank_score, secondary_rank_factor * secondary_rank_score,
+                    mscore.start_offset, mscore.offset_diffs_packed);
 
-        /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", match_score: "
-                     << match_score << ", primary_rank_score: " << primary_rank_score << ", seq_id: " << seq_id << std::endl;*/
+        /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", words_present: " << mscore.words_present
+                     << ", match_score: " << match_score << ", primary_rank_score: " << primary_rank_score
+                     << ", seq_id: " << seq_id << std::endl;*/
     }
 
     for (auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) {
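The consuming side, reading these fields back out of the Topster KV to highlight matched tokens, is not part of this commit. A hypothetical helper would reverse pack_token_offsets, keeping in mind that only bytes[0] and bytes[1] are recoverable from the int16_t:

#include <cstddef>
#include <cstdint>
#include <vector>
#include <match_score.h>  // for TokenOffsetDiffs, as patched above

// Hypothetical helper (not in this commit): rebuild absolute token offsets
// from a KV entry's start_offset and packed deltas, for highlighting.
std::vector<uint16_t> unpack_offsets(uint16_t start_offset, int16_t offset_diffs_packed,
                                     size_t num_tokens) {
    TokenOffsetDiffs diffs;
    diffs.packed = offset_diffs_packed;   // restores bytes[0] and bytes[1] only

    std::vector<uint16_t> offsets = {start_offset};
    for (size_t i = 1; i < num_tokens && i <= 2; i++) {
        offsets.push_back((uint16_t)(start_offset + diffs.bytes[i - 1]));
    }
    return offsets;
}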
test/match_score_test.cpp (new file)
@@ -0,0 +1,17 @@
+#include <gtest/gtest.h>
+#include <match_score.h>
+
+TEST(MatchScoreTest, ShouldPackTokenOffsets) {
+    uint16_t min_token_offset1[3] = {567, 568, 570};
+    TokenOffsetDiffs offset_diffs;
+    MatchScore::pack_token_offsets(min_token_offset1, 3, offset_diffs);
+
+    ASSERT_EQ(1, offset_diffs.bytes[0]);
+    ASSERT_EQ(3, offset_diffs.bytes[1]);
+
+    uint16_t min_token_offset2[3] = {0, 1, 2};
+    MatchScore::pack_token_offsets(min_token_offset2, 3, offset_diffs);
+
+    ASSERT_EQ(1, offset_diffs.bytes[0]);
+    ASSERT_EQ(2, offset_diffs.bytes[1]);
+}
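(Checking the arithmetic: the first case packs deltas 568 - 567 = 1 and 570 - 567 = 3; the second packs 1 - 0 = 1 and 2 - 0 = 2, matching the assertions.)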
test/topster_test.cpp
@@ -1,29 +1,33 @@
 #include <gtest/gtest.h>
 #include "topster.h"
+#include "match_score.h"
 
 TEST(TopsterTest, StoreMaxValuesWithoutRepetition) {
     Topster<5> topster;
 
     struct {
+        uint16_t start_offset;
+        TokenOffsetDiffs offset_diffs;
         uint64_t key;
         uint64_t match_score;
         int64_t primary_attr;
         int64_t secondary_attr;
     } data[10] = {
-        {1, 10, 20, 30},
-        {2, 4, 20, 30},
-        {3, 7, 20, 30},
-        {4, 11, 20, 30},
-        {5, 9, 20, 30},
-        {6, 6, 20, 30},
-        {7, 6, 22, 30},
-        {8, 9, 20, 30},
-        {9, 8, 20, 30},
-        {10, 5, 20, 30},
+        {10, {.packed = 10 }, 1, 10, 20, 30},
+        {0, {.packed = 10 }, 2, 4, 20, 30},
+        {2, {.packed = 10 }, 3, 7, 20, 30},
+        {11, {.packed = 10 }, 4, 11, 20, 30},
+        {78, {.packed = 10 }, 5, 9, 20, 30},
+        {246, {.packed = 10 }, 6, 6, 20, 30},
+        {0, {.packed = 10 }, 7, 6, 22, 30},
+        {20, {.packed = 10 }, 8, 9, 20, 30},
+        {22, {.packed = 10 }, 9, 8, 20, 30},
+        {77, {.packed = 10 }, 10, 5, 20, 30},
     };
 
     for(int i = 0; i < 10; i++) {
-        topster.add(data[i].key, data[i].match_score, data[i].primary_attr, data[i].secondary_attr);
+        topster.add(data[i].key, data[i].match_score, data[i].primary_attr, data[i].secondary_attr,
+                    data[i].start_offset, data[i].offset_diffs.packed);
     }
 
     topster.sort();