Refactor scoring loop.

This commit is contained in:
Kishore Nallan 2017-12-30 21:14:31 +05:30
parent 2b7059de37
commit f6612cb34e
4 changed files with 34 additions and 29 deletions

View File

@@ -28,17 +28,17 @@ struct TokenOffset {
}
};
struct MatchScore {
struct Match {
uint16_t words_present;
uint16_t distance;
uint16_t start_offset;
char offset_diffs[16];
MatchScore() {
Match() {
}
MatchScore(uint16_t words_present, uint16_t distance, uint16_t start_offset, char *offset_diffs_stacked):
Match(uint16_t words_present, uint16_t distance, uint16_t start_offset, char *offset_diffs_stacked):
words_present(words_present), distance(distance), start_offset(start_offset) {
memcpy(offset_diffs, offset_diffs_stacked, 16);
}
@@ -89,7 +89,7 @@ struct MatchScore {
* We use a priority queue to read the offset vectors in a sorted manner, slide a window of a given size, and
* compute the max_match and min_displacement of target tokens across the windows.
*/
static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
static Match match(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
@@ -179,6 +179,6 @@ struct MatchScore {
}
pack_token_offsets(min_token_offset, token_offsets.size(), token_start_offset, packed_offset_diffs);
return MatchScore(max_match, min_displacement, token_start_offset, packed_offset_diffs);
return Match(max_match, min_displacement, token_start_offset, packed_offset_diffs);
}
};

View File

@@ -542,14 +542,14 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
token_positions.push_back(positions);
}
MatchScore mscore = MatchScore::match_score(field_order_kv.second.key, token_positions);
Match match = Match::match(field_order_kv.second.key, token_positions);
// unpack `mscore.offset_diffs` into `token_indices`
// unpack `match.offset_diffs` into `token_indices`
std::vector<size_t> token_indices;
char num_tokens_found = mscore.offset_diffs[0];
char num_tokens_found = match.offset_diffs[0];
for(size_t i = 1; i <= num_tokens_found; i++) {
if(mscore.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
size_t token_index = (size_t)(mscore.start_offset + mscore.offset_diffs[i]);
if(match.offset_diffs[i] != std::numeric_limits<int8_t>::max()) {
size_t token_index = (size_t)(match.start_offset + match.offset_diffs[i]);
token_indices.push_back(token_index);
}
}

View File

@@ -840,25 +840,30 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
//auto begin = std::chrono::high_resolution_clock::now();
char empty_offset_diffs[16];
std::fill_n(empty_offset_diffs, 16, 0);
Match single_token_match = Match(1, 0, 0, empty_offset_diffs);
const uint64_t single_token_match_score = ((int64_t)(single_token_match.words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(MAX_SEARCH_TOKENS - single_token_match.distance));
for(auto i=0; i<result_size; i++) {
const uint32_t seq_id = result_ids[i];
MatchScore mscore;
uint64_t match_score = 0;
if(query_suggestion.size() == 1) {
// short circuit to speed up single token searches (use dummy offsets for now)
char offset_diffs[16];
std::fill_n(offset_diffs, 16, 0);
mscore = MatchScore(1, 0, 0, offset_diffs);
match_score = single_token_match_score;
} else {
std::vector<std::vector<uint16_t>> token_positions;
populate_token_positions(query_suggestion, leaf_to_indices, i, token_positions);
mscore = MatchScore::match_score(seq_id, token_positions);
}
const Match & match = Match::match(seq_id, token_positions);
// Construct a single match_score from individual components (for multi-field sort)
const uint64_t match_score = ((int64_t)(mscore.words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(MAX_SEARCH_TOKENS - mscore.distance));
// Construct a single match score from individual components (for multi-field sort)
match_score = ((int64_t)(match.words_present) << 24) |
((int64_t)(255 - total_cost) << 16) |
((int64_t)(MAX_SEARCH_TOKENS - match.distance));
}
const int64_t default_score = 0;
number_t primary_rank_score = default_score;
@@ -880,8 +885,8 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const int &
/*std::ostringstream os;
os << name << ", total_cost: " << (255 - total_cost)
<< ", words_present: " << mscore.words_present << ", match_score: " << match_score
<< ", primary_rank_score: " << primary_rank_score.intval << ", distance: " << (MAX_SEARCH_TOKENS - mscore.distance)
<< ", words_present: " << match.words_present << ", match_score: " << match
<< ", primary_rank_score: " << primary_rank_score.intval << ", distance: " << (MAX_SEARCH_TOKENS - match.distance)
<< ", seq_id: " << seq_id << std::endl;
std::cout << os.str();*/
}

View File

@@ -1,11 +1,11 @@
#include <gtest/gtest.h>
#include <match_score.h>
TEST(MatchScoreTest, ShouldPackTokenOffsets) {
TEST(MatchTest, ShouldPackTokenOffsets) {
uint16_t min_token_offset1[3] = {567, 568, 570};
char offset_diffs[16];
MatchScore::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs);
Match::pack_token_offsets(min_token_offset1, 3, 567, offset_diffs);
ASSERT_EQ(3, offset_diffs[0]);
ASSERT_EQ(0, offset_diffs[1]);
@@ -13,7 +13,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
ASSERT_EQ(3, offset_diffs[3]);
uint16_t min_token_offset2[3] = {0, 1, 2};
MatchScore::pack_token_offsets(min_token_offset2, 3, 0, offset_diffs);
Match::pack_token_offsets(min_token_offset2, 3, 0, offset_diffs);
ASSERT_EQ(3, offset_diffs[0]);
ASSERT_EQ(0, offset_diffs[1]);
@@ -21,14 +21,14 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
ASSERT_EQ(2, offset_diffs[3]);
uint16_t min_token_offset3[1] = {123};
MatchScore::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs);
Match::pack_token_offsets(min_token_offset3, 1, 123, offset_diffs);
ASSERT_EQ(1, offset_diffs[0]);
ASSERT_EQ(0, offset_diffs[1]);
// a token might not have an offset because it might not be in the best matching window
uint16_t min_token_offset4[3] = {0, MAX_DISPLACEMENT, 2};
MatchScore::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs);
Match::pack_token_offsets(min_token_offset4, 3, 0, offset_diffs);
ASSERT_EQ(3, offset_diffs[0]);
ASSERT_EQ(0, offset_diffs[1]);
@@ -36,7 +36,7 @@ TEST(MatchScoreTest, ShouldPackTokenOffsets) {
ASSERT_EQ(2, offset_diffs[3]);
uint16_t min_token_offset5[3] = {MAX_DISPLACEMENT, 2, 4};
MatchScore::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs);
Match::pack_token_offsets(min_token_offset5, 3, 2, offset_diffs);
ASSERT_EQ(3, offset_diffs[0]);
ASSERT_EQ(std::numeric_limits<int8_t>::max(), offset_diffs[1]);