From f6612cb34eb5c74722ea69c3ec56df1d1c9b8449 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 30 Dec 2017 21:14:31 +0530 Subject: [PATCH] Refactor scoring loop. --- include/match_score.h | 10 +++++----- src/collection.cpp | 10 +++++----- src/index.cpp | 31 ++++++++++++++++++------------- test/match_score_test.cpp | 12 ++++++------ 4 files changed, 34 insertions(+), 29 deletions(-) diff --git a/include/match_score.h b/include/match_score.h index 4ecc0594..00d55ac2 100644 --- a/include/match_score.h +++ b/include/match_score.h @@ -28,17 +28,17 @@ struct TokenOffset { } }; -struct MatchScore { +struct Match { uint16_t words_present; uint16_t distance; uint16_t start_offset; char offset_diffs[16]; - MatchScore() { + Match() { } - MatchScore(uint16_t words_present, uint16_t distance, uint16_t start_offset, char *offset_diffs_stacked): + Match(uint16_t words_present, uint16_t distance, uint16_t start_offset, char *offset_diffs_stacked): words_present(words_present), distance(distance), start_offset(start_offset) { memcpy(offset_diffs, offset_diffs_stacked, 16); } @@ -89,7 +89,7 @@ struct MatchScore { * We use a priority queue to read the offset vectors in a sorted manner, slide a window of a given size, and * compute the max_match and min_displacement of target tokens across the windows. */ - static MatchScore match_score(uint32_t doc_id, std::vector> &token_offsets) { + static Match match(uint32_t doc_id, std::vector> &token_offsets) { std::priority_queue, TokenOffset> heap; for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) { @@ -179,6 +179,6 @@ struct MatchScore { } pack_token_offsets(min_token_offset, token_offsets.size(), token_start_offset, packed_offset_diffs); - return MatchScore(max_match, min_displacement, token_start_offset, packed_offset_diffs); + return Match(max_match, min_displacement, token_start_offset, packed_offset_diffs); } }; diff --git a/src/collection.cpp b/src/collection.cpp index e2c8fbaf..6bcfe60d 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -542,14 +542,14 @@ Option Collection::search(std::string query, const std::vector token_indices; - char num_tokens_found = mscore.offset_diffs[0]; + char num_tokens_found = match.offset_diffs[0]; for(size_t i = 1; i <= num_tokens_found; i++) { - if(mscore.offset_diffs[i] != std::numeric_limits::max()) { - size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]); + if(match.offset_diffs[i] != std::numeric_limits::max()) { + size_t token_index = (size_t)(match.start_offset + match.offset_diffs[i]); token_indices.push_back(token_index); } } diff --git a/src/index.cpp b/src/index.cpp index a0478c26..9ea98208 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -840,25 +840,30 @@ void Index::score_results(const std::vector & sort_fields, const int & //auto begin = std::chrono::high_resolution_clock::now(); + char empty_offset_diffs[16]; + std::fill_n(empty_offset_diffs, 16, 0); + Match single_token_match = Match(1, 0, 0, empty_offset_diffs); + const uint64_t single_token_match_score = ((int64_t)(single_token_match.words_present) << 24) | + ((int64_t)(255 - total_cost) << 16) | + ((int64_t)(MAX_SEARCH_TOKENS - single_token_match.distance)); + for(auto i=0; i> token_positions; populate_token_positions(query_suggestion, leaf_to_indices, i, token_positions); - mscore = MatchScore::match_score(seq_id, token_positions); - } + const Match & match = Match::match(seq_id, token_positions); - // Construct a single match_score from individual components (for multi-field sort) - const uint64_t match_score = ((int64_t)(mscore.words_present) << 24) | - ((int64_t)(255 - total_cost) << 16) | - ((int64_t)(MAX_SEARCH_TOKENS - mscore.distance)); + // Construct a single match score from individual components (for multi-field sort) + match_score = ((int64_t)(match.words_present) << 24) | + ((int64_t)(255 - total_cost) << 16) | + ((int64_t)(MAX_SEARCH_TOKENS - match.distance)); + } const int64_t default_score = 0; number_t primary_rank_score = default_score; @@ -880,8 +885,8 @@ void Index::score_results(const std::vector & sort_fields, const int & /*std::ostringstream os; os << name << ", total_cost: " << (255 - total_cost) - << ", words_present: " << mscore.words_present << ", match_score: " << match_score - << ", primary_rank_score: " << primary_rank_score.intval << ", distance: " << (MAX_SEARCH_TOKENS - mscore.distance) + << ", words_present: " << match.words_present << ", match_score: " << match + << ", primary_rank_score: " << primary_rank_score.intval << ", distance: " << (MAX_SEARCH_TOKENS - match.distance) << ", seq_id: " << seq_id << std::endl; std::cout << os.str();*/ } diff --git a/test/match_score_test.cpp b/test/match_score_test.cpp index 7a84f305..93cd7e69 100644 --- a/test/match_score_test.cpp +++ b/test/match_score_test.cpp @@ -1,11 +1,11 @@ #include #include -TEST(MatchScoreTest, ShouldPackTokenOffsets) { +TEST(MatchTest, ShouldPackTokenOffsets) { uint16_t min_token_offset1[3] = {567, 568, 570}; char offset_diffs[16]; - MatchScore::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs); + Match::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs); ASSERT_EQ(3, offset_diffs[0]); ASSERT_EQ(0, offset_diffs[1]); @@ -13,7 +13,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) { ASSERT_EQ(3, offset_diffs[3]); uint16_t min_token_offset2[3] = {0, 1, 2}; - MatchScore::pack_token_offsets(min_token_offset2, 3, 0, offset_diffs); + Match::pack_token_offsets(min_token_offset2, 3, 0, offset_diffs); ASSERT_EQ(3, offset_diffs[0]); ASSERT_EQ(0, offset_diffs[1]); @@ -21,14 +21,14 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) { ASSERT_EQ(2, offset_diffs[3]); uint16_t min_token_offset3[1] = {123}; - MatchScore::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs); + Match::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs); ASSERT_EQ(1, offset_diffs[0]); ASSERT_EQ(0, offset_diffs[1]); // a token might not have an offset because it might not be in the best matching window uint16_t min_token_offset4[3] = {0, MAX_DISPLACEMENT, 2}; - MatchScore::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs); + Match::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs); ASSERT_EQ(3, offset_diffs[0]); ASSERT_EQ(0, offset_diffs[1]); @@ -36,7 +36,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) { ASSERT_EQ(2, offset_diffs[3]); uint16_t min_token_offset5[3] = {MAX_DISPLACEMENT, 2, 4}; - MatchScore::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs); + Match::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs); ASSERT_EQ(3, offset_diffs[0]); ASSERT_EQ(std::numeric_limits::max(), offset_diffs[1]);