mirror of https://github.com/typesense/typesense.git

commit f6612cb34e
parent 2b7059de37

Refactor scoring loop.
@@ -28,17 +28,17 @@ struct TokenOffset {
     }
 };
 
-struct MatchScore {
+struct Match {
     uint16_t words_present;
     uint16_t distance;
     uint16_t start_offset;
     char offset_diffs[16];
 
-    MatchScore() {
+    Match() {
 
     }
 
-    MatchScore(uint16_t words_present, uint16_t distance, uint16_t start_offset, char *offset_diffs_stacked):
+    Match(uint16_t words_present, uint16_t distance, uint16_t start_offset, char *offset_diffs_stacked):
         words_present(words_present), distance(distance), start_offset(start_offset) {
         memcpy(offset_diffs, offset_diffs_stacked, 16);
     }
@@ -89,7 +89,7 @@ struct MatchScore {
      * We use a priority queue to read the offset vectors in a sorted manner, slide a window of a given size, and
      * compute the max_match and min_displacement of target tokens across the windows.
      */
-    static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
+    static Match match(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
         std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
 
         for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
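The comment in this hunk is the heart of the algorithm: per-token offset lists are merged in sorted order through a min-heap, and a fixed-size window is slid across the merged stream to find the densest cluster of query tokens. Below is a minimal sketch of that heap-merge, not the library's code: the `WINDOW_SIZE` value, the simplified `SketchTokenOffset` struct, and the window bookkeeping are all assumptions, since the real loop body lies outside this hunk.

```cpp
// Minimal sketch of the heap-merge behind match(): pop offsets in ascending
// order; each popped offset starts a candidate window in which the real code
// counts distinct tokens (max_match) and their spread (min_displacement).
#include <cstdint>
#include <cstdio>
#include <queue>
#include <vector>

struct SketchTokenOffset {
    uint8_t token_id;   // which query token this offset belongs to
    uint16_t offset;    // position of that token within the document field
    // greater-than comparison turns std::priority_queue into a min-heap
    bool operator()(const SketchTokenOffset& a, const SketchTokenOffset& b) const {
        return a.offset > b.offset;
    }
};

static void sketch_match(const std::vector<std::vector<uint16_t>>& token_offsets) {
    const uint16_t WINDOW_SIZE = 10;  // assumed window width
    std::priority_queue<SketchTokenOffset, std::vector<SketchTokenOffset>,
                        SketchTokenOffset> heap;
    std::vector<size_t> next_index(token_offsets.size(), 0);

    // seed the heap with the first offset of every token
    for(uint8_t token_id = 0; token_id < token_offsets.size(); token_id++) {
        if(!token_offsets[token_id].empty()) {
            heap.push({token_id, token_offsets[token_id][0]});
            next_index[token_id] = 1;
        }
    }

    // read the merged offset stream in sorted order, one window start per pop
    while(!heap.empty()) {
        SketchTokenOffset smallest = heap.top();
        heap.pop();
        printf("window starts at offset %u (token %u), width %u\n",
               (unsigned) smallest.offset, (unsigned) smallest.token_id,
               (unsigned) WINDOW_SIZE);
        uint8_t tid = smallest.token_id;
        if(next_index[tid] < token_offsets[tid].size()) {
            heap.push({tid, token_offsets[tid][next_index[tid]++]});
        }
    }
}

int main() {
    // two query tokens with their offsets inside one document field
    sketch_match({{4, 50}, {6, 52}});
    return 0;
}
```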
@@ -179,6 +179,6 @@ struct MatchScore {
         }
 
         pack_token_offsets(min_token_offset, token_offsets.size(), token_start_offset, packed_offset_diffs);
-        return MatchScore(max_match, min_displacement, token_start_offset, packed_offset_diffs);
+        return Match(max_match, min_displacement, token_start_offset, packed_offset_diffs);
     }
 };
@@ -542,14 +542,14 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
                 token_positions.push_back(positions);
             }
 
-            MatchScore mscore = MatchScore::match_score(field_order_kv.second.key, token_positions);
+            Match match = Match::match(field_order_kv.second.key, token_positions);
 
-            // unpack `mscore.offset_diffs` into `token_indices`
+            // unpack `match.offset_diffs` into `token_indices`
             std::vector<size_t> token_indices;
-            char num_tokens_found = mscore.offset_diffs[0];
+            char num_tokens_found = match.offset_diffs[0];
             for(size_t i = 1; i <= num_tokens_found; i++) {
-                if(mscore.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
-                    size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
+                if(match.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
+                    size_t token_index = (size_t)(match.start_offset + match.offset_diffs[i]);
                     token_indices.push_back(token_index);
                 }
             }
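The unpacking loop above relies on the compact layout produced by pack_token_offsets: byte 0 holds the number of tokens in the best window, and each following byte is that token's offset relative to start_offset, with std::numeric_limits<int8_t>::max() marking a token that fell outside the window. A standalone illustration of decoding that layout, with values chosen to mirror the tests further below:

```cpp
// Decoding the offset_diffs layout used above. This is an illustration, not
// library code: offset_diffs[0] = token count, offset_diffs[i] = (absolute
// offset of token i - start_offset), int8_t max = token absent from window.
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
    const uint16_t start_offset = 567;
    // packed form for tokens at absolute offsets {567, <absent>, 570}
    char offset_diffs[16] = {3, 0, std::numeric_limits<int8_t>::max(), 3};

    std::vector<size_t> token_indices;
    char num_tokens_found = offset_diffs[0];
    for(size_t i = 1; i <= (size_t) num_tokens_found; i++) {
        if(offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
            token_indices.push_back((size_t)(start_offset + offset_diffs[i]));
        }
    }

    for(size_t index : token_indices) {
        printf("token found at offset %zu\n", index);  // prints 567 and 570
    }
    return 0;
}
```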
@@ -840,25 +840,30 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
     //auto begin = std::chrono::high_resolution_clock::now();
 
+    char empty_offset_diffs[16];
+    std::fill_n(empty_offset_diffs, 16, 0);
+    Match single_token_match = Match(1, 0, 0, empty_offset_diffs);
+    const uint64_t single_token_match_score = ((int64_t)(single_token_match.words_present) << 24) |
+                                              ((int64_t)(255 - total_cost) << 16) |
+                                              ((int64_t)(MAX_SEARCH_TOKENS - single_token_match.distance));
+
     for(auto i=0; i<result_size; i++) {
         const uint32_t seq_id = result_ids[i];
-        MatchScore mscore;
+        uint64_t match_score = 0;
 
         if(query_suggestion.size() == 1) {
             // short circuit to speed up single token searches (use dummy offsets for now)
-            char offset_diffs[16];
-            std::fill_n(offset_diffs, 16, 0);
-            mscore = MatchScore(1, 0, 0, offset_diffs);
+            match_score = single_token_match_score;
         } else {
             std::vector<std::vector<uint16_t>> token_positions;
             populate_token_positions(query_suggestion, leaf_to_indices, i, token_positions);
-            mscore = MatchScore::match_score(seq_id, token_positions);
-        }
+            const Match & match = Match::match(seq_id, token_positions);
 
-        // Construct a single match_score from individual components (for multi-field sort)
-        const uint64_t match_score = ((int64_t)(mscore.words_present) << 24) |
-                                     ((int64_t)(255 - total_cost) << 16) |
-                                     ((int64_t)(MAX_SEARCH_TOKENS - mscore.distance));
+            // Construct a single match score from individual components (for multi-field sort)
+            match_score = ((int64_t)(match.words_present) << 24) |
+                          ((int64_t)(255 - total_cost) << 16) |
+                          ((int64_t)(MAX_SEARCH_TOKENS - match.distance));
+        }
 
         const int64_t default_score = 0;
         number_t primary_rank_score = default_score;
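Both the hoisted single_token_match_score and the per-result score computed in the else branch share one layout: words_present sits above bit 24, typo quality (255 - total_cost) in bits 16-23, and the proximity component in the low bits, so a plain integer comparison ranks by those criteria in that priority order. A worked example of the packing; the value assigned to MAX_SEARCH_TOKENS here is an assumption:

```cpp
// Worked example of the score packing used above: more words matched always
// beats fewer typos, which always beats proximity, because each component
// occupies a more significant bit range.
#include <cstdint>
#include <cstdio>

static uint64_t pack_score(uint16_t words_present, int total_cost, uint16_t distance) {
    const int64_t MAX_SEARCH_TOKENS = 10;  // assumed constant value
    return ((int64_t)(words_present) << 24) |
           ((int64_t)(255 - total_cost) << 16) |
           ((int64_t)(MAX_SEARCH_TOKENS - distance));
}

int main() {
    // two words matched with one typo outranks one word matched exactly
    uint64_t two_words_one_typo = pack_score(2, 1, 0);
    uint64_t one_word_exact     = pack_score(1, 0, 0);
    printf("%d\n", two_words_one_typo > one_word_exact);  // prints 1
    return 0;
}
```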
@@ -880,8 +885,8 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
 
         /*std::ostringstream os;
         os << name << ", total_cost: " << (255 - total_cost)
-           << ", words_present: " << mscore.words_present << ", match_score: " << match_score
-           << ", primary_rank_score: " << primary_rank_score.intval << ", distance: " << (MAX_SEARCH_TOKENS - mscore.distance)
+           << ", words_present: " << match.words_present << ", match_score: " << match_score
+           << ", primary_rank_score: " << primary_rank_score.intval << ", distance: " << (MAX_SEARCH_TOKENS - match.distance)
            << ", seq_id: " << seq_id << std::endl;
         std::cout << os.str();*/
     }
@@ -1,11 +1,11 @@
 #include <gtest/gtest.h>
 #include <match_score.h>
 
-TEST(MatchScoreTest, ShouldPackTokenOffsets) {
+TEST(MatchTest, ShouldPackTokenOffsets) {
     uint16_t min_token_offset1[3] = {567, 568, 570};
     char offset_diffs[16];
 
-    MatchScore::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs);
+    Match::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs);
 
     ASSERT_EQ(3, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
@@ -13,7 +13,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     ASSERT_EQ(3, offset_diffs[3]);
 
     uint16_t min_token_offset2[3] = {0, 1, 2};
-    MatchScore::pack_token_offsets(min_token_offset2, 3, 0, offset_diffs);
+    Match::pack_token_offsets(min_token_offset2, 3, 0, offset_diffs);
 
     ASSERT_EQ(3, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
@@ -21,14 +21,14 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     ASSERT_EQ(2, offset_diffs[3]);
 
     uint16_t min_token_offset3[1] = {123};
-    MatchScore::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs);
+    Match::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs);
 
     ASSERT_EQ(1, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
 
     // a token might not have an offset because it might not be in the best matching window
     uint16_t min_token_offset4[3] = {0, MAX_DISPLACEMENT, 2};
-    MatchScore::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs);
+    Match::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs);
 
     ASSERT_EQ(3, offset_diffs[0]);
     ASSERT_EQ(0, offset_diffs[1]);
@@ -36,7 +36,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
     ASSERT_EQ(2, offset_diffs[3]);
 
     uint16_t min_token_offset5[3] = {MAX_DISPLACEMENT, 2, 4};
-    MatchScore::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs);
+    Match::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs);
 
     ASSERT_EQ(3, offset_diffs[0]);
     ASSERT_EQ(std::numeric_limits<int8_t>::max(), offset_diffs[1]);
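These assertions pin down the contract of pack_token_offsets well enough to sketch an implementation consistent with them. The real one lives in match_score.h; the value given to MAX_DISPLACEMENT below is an assumption, as is the sketch_ naming.

```cpp
// A minimal pack_token_offsets consistent with the test assertions above:
// byte 0 stores the token count, each following byte stores the token's
// offset relative to start_offset, and MAX_DISPLACEMENT (a token with no
// offset in the best window) packs as the int8_t max sentinel.
#include <cstdint>
#include <limits>

const uint16_t MAX_DISPLACEMENT = std::numeric_limits<uint16_t>::max();  // assumed value

static void sketch_pack_token_offsets(const uint16_t* min_token_offset, size_t num_tokens,
                                      uint16_t start_offset, char* offset_diffs) {
    offset_diffs[0] = (char) num_tokens;
    for(size_t i = 0; i < num_tokens; i++) {
        if(min_token_offset[i] == MAX_DISPLACEMENT) {
            offset_diffs[i + 1] = std::numeric_limits<int8_t>::max();
        } else {
            offset_diffs[i + 1] = (char)(min_token_offset[i] - start_offset);
        }
    }
}

int main() {
    char offset_diffs[16];
    uint16_t offsets[3] = {567, 568, 570};
    sketch_pack_token_offsets(offsets, 3, 567, offset_diffs);
    // offset_diffs now begins {3, 0, 1, 3}, matching the first test case
    return 0;
}
```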