Refactor scoring loop.

This commit is contained in:
Kishore Nallan 2017-12-30 21:14:31 +05:30
parent 2b7059de37
commit f6612cb34e
4 changed files with 34 additions and 29 deletions

View File

@@ -28,17 +28,17 @@ struct TokenOffset {
}
};
struct MatchScore {
struct Match {
uint16_t words_present;
uint16_t distance;
uint16_t start_offset;
char offset_diffs[16];
MatchScore() {
Match() {
}
MatchScore(uint16_t words_present, uint16_t distance, uint16_t start_offset, char *offset_diffs_stacked):
Match(uint16_t words_present, uint16_t distance, uint16_t start_offset, char *offset_diffs_stacked):
words_present(words_present), distance(distance), start_offset(start_offset) {
memcpy(offset_diffs, offset_diffs_stacked, 16);
}
@@ -89,7 +89,7 @@ struct MatchScore {
* We use a priority queue to read the offset vectors in a sorted manner, slide a window of a given size, and
* compute the max_match and min_displacement of target tokens across the windows.
*/
static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
static Match match(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
@@ -179,6 +179,6 @@ struct MatchScore {
}
pack_token_offsets(min_token_offset, token_offsets.size(), token_start_offset, packed_offset_diffs);
return MatchScore(max_match, min_displacement, token_start_offset, packed_offset_diffs);
return Match(max_match, min_displacement, token_start_offset, packed_offset_diffs);
}
};

View File

@@ -542,14 +542,14 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
token_positions.push_back(positions);
}
MatchScore mscore = MatchScore::match_score(field_order_kv.second.key, token_positions);
Match match = Match::match(field_order_kv.second.key, token_positions);
// unpack `mscore.offset_diffs` into `token_indices`
// unpack `match.offset_diffs` into `token_indices`
std::vector<size_t> token_indices;
char num_tokens_found = mscore.offset_diffs[0];
char num_tokens_found = match.offset_diffs[0];
for(size_t i = 1; i <= num_tokens_found; i++) {
if(mscore.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
if(match.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
size_t token_index = (size_t)(match.start_offset + match.offset_diffs[i]);
token_indices.push_back(token_index);
}
}

View File

@@ -840,25 +840,30 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
//auto begin = std::chrono::high_resolution_clock::now();
char empty_offset_diffs[16];
std::fill_n(empty_offset_diffs, 16, 0);
Match single_token_match = Match(1, 0, 0, empty_offset_diffs);
const uint64_t single_token_match_score = ((int64_t)(single_token_match.words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(MAX_SEARCH_TOKENS - single_token_match.distance));
for(auto i=0; i<result_size; i++) {
const uint32_t seq_id = result_ids[i];
MatchScore mscore;
uint64_t match_score = 0;
if(query_suggestion.size() == 1) {
// short circuit to speed up single token searches (use dummy offsets for now)
char offset_diffs[16];
std::fill_n(offset_diffs, 16, 0);
mscore = MatchScore(1, 0, 0, offset_diffs);
match_score = single_token_match_score;
} else {
std::vector<std::vector<uint16_t>> token_positions;
populate_token_positions(query_suggestion, leaf_to_indices, i, token_positions);
mscore = MatchScore::match_score(seq_id, token_positions);
}
const Match & match = Match::match(seq_id, token_positions);
// Construct a single match_score from individual components (for multi-field sort)
const uint64_t match_score = ((int64_t)(mscore.words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(MAX_SEARCH_TOKENS - mscore.distance));
// Construct a single match score from individual components (for multi-field sort)
match_score = ((int64_t)(match.words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(MAX_SEARCH_TOKENS - match.distance));
}
const int64_t default_score = 0;
number_t primary_rank_score = default_score;
@@ -880,8 +885,8 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
/*std::ostringstream os;
os << name << ", total_cost: " << (255 - total_cost)
<< ", words_present: " << mscore.words_present << ", match_score: " << match_score
<< ", primary_rank_score: " << primary_rank_score.intval << ", distance: " << (MAX_SEARCH_TOKENS - mscore.distance)
<< ", words_present: " << match.words_present << ", match_score: " << match
<< ", primary_rank_score: " << primary_rank_score.intval << ", distance: " << (MAX_SEARCH_TOKENS - match.distance)
<< ", seq_id: " << seq_id << std::endl;
std::cout << os.str();*/
}

View File

@@ -1,11 +1,11 @@
#include <gtest/gtest.h>
#include <match_score.h>
TEST(MatchScoreTest, ShouldPackTokenOffsets) {
TEST(MatchTest, ShouldPackTokenOffsets) {
uint16_t min_token_offset1[3] = {567, 568, 570};
char offset_diffs[16];
MatchScore::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs);
Match::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs);
ASSERT_EQ(3, offset_diffs[0]);
ASSERT_EQ(0, offset_diffs[1]);
@@ -13,7 +13,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
ASSERT_EQ(3, offset_diffs[3]);
uint16_t min_token_offset2[3] = {0, 1, 2};
MatchScore::pack_token_offsets(min_token_offset2, 3, 0, offset_diffs);
Match::pack_token_offsets(min_token_offset2, 3, 0, offset_diffs);
ASSERT_EQ(3, offset_diffs[0]);
ASSERT_EQ(0, offset_diffs[1]);
@@ -21,14 +21,14 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
ASSERT_EQ(2, offset_diffs[3]);
uint16_t min_token_offset3[1] = {123};
MatchScore::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs);
Match::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs);
ASSERT_EQ(1, offset_diffs[0]);
ASSERT_EQ(0, offset_diffs[1]);
// a token might not have an offset because it might not be in the best matching window
uint16_t min_token_offset4[3] = {0, MAX_DISPLACEMENT, 2};
MatchScore::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs);
Match::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs);
ASSERT_EQ(3, offset_diffs[0]);
ASSERT_EQ(0, offset_diffs[1]);
@@ -36,7 +36,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
ASSERT_EQ(2, offset_diffs[3]);
uint16_t min_token_offset5[3] = {MAX_DISPLACEMENT, 2, 4};
MatchScore::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs);
Match::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs);
ASSERT_EQ(3, offset_diffs[0]);
ASSERT_EQ(std::numeric_limits<int8_t>::max(), offset_diffs[1]);