From 1d5146f7ff6cf8a61fcf9f24cd7977738fc7059d Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Fri, 9 Jun 2017 13:22:24 -0500 Subject: [PATCH] Track best-matched token offsets needed for highlighting. - We store the best matched token offset positions in Topster KV - Using run-length encoding (via unions) to pack the offset diffs intelligently --- CMakeLists.txt | 3 +- include/collection.h | 2 +- include/match_score.h | 44 +++++++++++++++++++++------ include/topster.h | 10 +++++- src/collection.cpp | 64 +++++++++++++++++---------------------- test/match_score_test.cpp | 17 +++++++++++ test/topster_test.cpp | 26 +++++++++------- 7 files changed, 106 insertions(+), 60 deletions(-) create mode 100644 test/match_score_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1eca6791..07376ac9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,8 @@ add_executable(typesense-server ${SRC_FILES} src/main/typesense_server.cpp) add_executable(search ${SRC_FILES} src/main/main.cpp) add_executable(benchmark ${SRC_FILES} src/main/benchmark.cpp) add_executable(typesense_test ${SRC_FILES} test/array_test.cpp test/sorted_array_test.cpp test/art_test.cpp - test/collection_test.cpp test/collection_manager_test.cpp test/topster_test.cpp) + test/collection_test.cpp test/collection_manager_test.cpp + test/topster_test.cpp test/match_score_test.cpp) target_compile_definitions(typesense-server PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/") target_compile_definitions(search PRIVATE ROOT_DIR="${CMAKE_SOURCE_DIR}/") diff --git a/include/collection.h b/include/collection.h index d3ef4a9c..ea13b0a1 100644 --- a/include/collection.h +++ b/include/collection.h @@ -149,7 +149,7 @@ public: Option index_in_memory(const nlohmann::json & document, uint32_t seq_id); - enum {MAX_SEARCH_TOKENS = 20}; + enum {MAX_SEARCH_TOKENS = 10}; enum {MAX_RESULTS = 100}; // Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store diff --git a/include/match_score.h b/include/match_score.h index ccbb7eea..d81dbe86 100644 --- a/include/match_score.h +++ b/include/match_score.h @@ -14,8 +14,12 @@ #define TokenOffsetHeap std::priority_queue, TokenOffset> -struct MatchScore { - struct TokenOffset { +union TokenOffsetDiffs { + int16_t packed; + char bytes[16]; +}; + +struct TokenOffset { uint8_t token_id; // token identifier uint16_t offset; // token's offset in the text uint16_t offset_index; // index of the offset in the vector @@ -23,7 +27,13 @@ struct MatchScore { bool operator() (const TokenOffset& a, const TokenOffset& b) { return a.offset > b.offset; } - }; +}; + +struct MatchScore { + uint16_t words_present; + uint16_t distance; + uint16_t start_offset; + int16_t offset_diffs_packed; static void print_token_offsets(std::vector> &token_offsets) { for(auto offsets: token_offsets) { @@ -48,8 +58,12 @@ struct MatchScore { } } - uint16_t words_present; - uint16_t distance; + static void pack_token_offsets(const uint16_t* min_token_offset, const size_t num_tokens, + TokenOffsetDiffs & offset_diffs) { + for(size_t i = 1; i < num_tokens; i++) { + offset_diffs.bytes[i-1] = (char)(min_token_offset[i] - min_token_offset[0]); + } + } /* * Given *sorted offsets* of each target token in a *single* document, generates a score that indicates: @@ -60,8 +74,8 @@ struct MatchScore { * compute the max_match and min_displacement of target tokens across the windows. */ static MatchScore match_score(uint32_t doc_id, std::vector> &token_offsets) { - const size_t WINDOW_SIZE = Collection::MAX_SEARCH_TOKENS; - const uint16_t MAX_DISPLACEMENT = Collection::MAX_SEARCH_TOKENS; + const size_t WINDOW_SIZE = 10; + const uint16_t MAX_DISPLACEMENT = std::numeric_limits::max(); std::priority_queue, TokenOffset> heap; @@ -75,8 +89,11 @@ struct MatchScore { uint16_t min_displacement = MAX_DISPLACEMENT; std::queue window; - uint16_t token_offset[Collection::MAX_SEARCH_TOKENS] = { }; - std::fill_n(token_offset, Collection::MAX_SEARCH_TOKENS, MAX_DISPLACEMENT); + uint16_t token_offset[WINDOW_SIZE] = { }; + std::fill_n(token_offset, WINDOW_SIZE, MAX_DISPLACEMENT); + + // used to store token offsets of the best-matched window + uint16_t min_token_offset[WINDOW_SIZE]; do { if(window.empty()) { @@ -121,6 +138,8 @@ struct MatchScore { max_match = num_match; if(displacement != 0 && displacement < min_displacement) { min_displacement = displacement; + // record the token positions (for highlighting) + memcpy(min_token_offset, token_offset, token_offsets.size()*sizeof(uint16_t)); } } @@ -129,6 +148,11 @@ struct MatchScore { window.pop(); } while(!heap.empty()); - return MatchScore{max_match, min_displacement}; + // do run-length encoding of the min token positions/offsets + TokenOffsetDiffs offset_diffs; + uint16_t start_offset = min_token_offset[0]; + pack_token_offsets(min_token_offset, token_offsets.size(), offset_diffs); + + return MatchScore{max_match, min_displacement, start_offset, offset_diffs.packed}; } }; diff --git a/include/topster.h b/include/topster.h index c6d61eda..a1ec92eb 100644 --- a/include/topster.h +++ b/include/topster.h @@ -5,6 +5,7 @@ #include #include #include +#include /* * Remembers the max-K elements seen so far using a min-heap @@ -12,6 +13,8 @@ template struct Topster { struct KV { + uint16_t start_offset; + TokenOffsetDiffs offset_diffs; uint64_t key; uint64_t match_score; int64_t primary_attr; @@ -32,7 +35,8 @@ struct Topster { b = c; } - void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr, const int64_t &secondary_attr){ + void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr, + const int64_t &secondary_attr, const uint16_t &start_offset, const int16_t &offset_diffs_packed){ if (size >= MAX_SIZE) { if(!is_greater(data[0], match_score, primary_attr, secondary_attr)) { // when incoming value is less than the smallest in the heap, ignore @@ -51,6 +55,8 @@ struct Topster { data[0].match_score = match_score; data[0].primary_attr = primary_attr; data[0].secondary_attr = secondary_attr; + data[0].start_offset = start_offset; + data[0].offset_diffs.packed = offset_diffs_packed; uint32_t i = 0; // sift to maintain heap property @@ -80,6 +86,8 @@ struct Topster { data[size].match_score = match_score; data[size].primary_attr = primary_attr; data[size].secondary_attr = secondary_attr; + data[size].start_offset = start_offset; + data[size].offset_diffs.packed = offset_diffs_packed; size++; for (uint32_t i = size - 1; i > 0;) { diff --git a/src/collection.cpp b/src/collection.cpp index c06fca87..f8814ca1 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -862,13 +862,10 @@ void Collection::score_results(const std::vector & sort_fields, cons const int max_candidate_rank = 250; spp::sparse_hash_map leaf_to_indices; - if(query_suggestion.size() != 1) { - // won't be needing positional ranking when there is only 1 token in the query - for (art_leaf *token_leaf : query_suggestion) { - uint32_t *indices = new uint32_t[result_size]; - token_leaf->values->ids.indexOf(result_ids, result_size, indices); - leaf_to_indices.emplace(token_leaf, indices); - } + for (art_leaf *token_leaf : query_suggestion) { + uint32_t *indices = new uint32_t[result_size]; + token_leaf->values->ids.indexOf(result_ids, result_size, indices); + leaf_to_indices.emplace(token_leaf, indices); } spp::sparse_hash_map * primary_rank_scores = nullptr; @@ -897,35 +894,29 @@ void Collection::score_results(const std::vector & sort_fields, cons uint32_t seq_id = result_ids[i]; std::vector> token_positions; - MatchScore mscore; - - if(query_suggestion.size() == 1) { - mscore = MatchScore{1, 1}; - } else { - // for each token in the query, find the positions that it appears in this document - for (art_leaf *token_leaf : query_suggestion) { - std::vector positions; - int doc_index = leaf_to_indices.at(token_leaf)[i]; - if(doc_index == token_leaf->values->ids.getLength()) { - continue; - } - - uint32_t start_offset = token_leaf->values->offset_index.at(doc_index); - uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ? - token_leaf->values->offsets.getLength() : - token_leaf->values->offset_index.at(doc_index+1); - - while(start_offset < end_offset) { - positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset)); - start_offset++; - } - - token_positions.push_back(positions); + // for each token in the query, find the positions that it appears in this document + for (art_leaf *token_leaf : query_suggestion) { + std::vector positions; + int doc_index = leaf_to_indices.at(token_leaf)[i]; + if(doc_index == token_leaf->values->ids.getLength()) { + continue; } - mscore = MatchScore::match_score(seq_id, token_positions); + uint32_t start_offset = token_leaf->values->offset_index.at(doc_index); + uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ? + token_leaf->values->offsets.getLength() : + token_leaf->values->offset_index.at(doc_index+1); + + while(start_offset < end_offset) { + positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset)); + start_offset++; + } + + token_positions.push_back(positions); } + MatchScore mscore = MatchScore::match_score(seq_id, token_positions); + int candidate_rank_score = max_candidate_rank - candidate_rank; // Construct a single match_score from individual components (for multi-field sort) @@ -938,11 +929,12 @@ void Collection::score_results(const std::vector & sort_fields, cons int64_t secondary_rank_score = (secondary_rank_scores && secondary_rank_scores->count(seq_id) > 0) ? secondary_rank_scores->at(seq_id) : 0; topster.add(seq_id, match_score, - primary_rank_factor * primary_rank_score, - secondary_rank_factor * secondary_rank_score); + primary_rank_factor * primary_rank_score, secondary_rank_factor * secondary_rank_score, + mscore.start_offset, mscore.offset_diffs_packed); - /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", match_score: " - << match_score << ", primary_rank_score: " << primary_rank_score << ", seq_id: " << seq_id << std::endl;*/ + /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", words_present: " << mscore.words_present + << ", match_score: " << match_score << ", primary_rank_score: " << primary_rank_score + << ", seq_id: " << seq_id << std::endl;*/ } for (auto it = leaf_to_indices.begin(); it != leaf_to_indices.end(); it++) { diff --git a/test/match_score_test.cpp b/test/match_score_test.cpp new file mode 100644 index 00000000..57c1db12 --- /dev/null +++ b/test/match_score_test.cpp @@ -0,0 +1,17 @@ +#include +#include + +TEST(MatchScoreTest, ShouldPackTokenOffsets) { + uint16_t min_token_offset1[3] = {567, 568, 570}; + TokenOffsetDiffs offset_diffs; + MatchScore::pack_token_offsets(min_token_offset1, 3, offset_diffs); + + ASSERT_EQ(1, offset_diffs.bytes[0]); + ASSERT_EQ(3, offset_diffs.bytes[1]); + + uint16_t min_token_offset2[3] = {0, 1, 2}; + MatchScore::pack_token_offsets(min_token_offset2, 3, offset_diffs); + + ASSERT_EQ(1, offset_diffs.bytes[0]); + ASSERT_EQ(2, offset_diffs.bytes[1]); +} \ No newline at end of file diff --git a/test/topster_test.cpp b/test/topster_test.cpp index 582a159e..608c65a5 100644 --- a/test/topster_test.cpp +++ b/test/topster_test.cpp @@ -1,29 +1,33 @@ #include #include "topster.h" +#include "match_score.h" TEST(TopsterTest, StoreMaxValuesWithoutRepetition) { Topster<5> topster; struct { + uint16_t start_offset; + TokenOffsetDiffs offset_diffs; uint64_t key; uint64_t match_score; int64_t primary_attr; int64_t secondary_attr; } data[10] = { - {1, 10, 20, 30}, - {2, 4, 20, 30}, - {3, 7, 20, 30}, - {4, 11, 20, 30}, - {5, 9, 20, 30}, - {6, 6, 20, 30}, - {7, 6, 22, 30}, - {8, 9, 20, 30}, - {9, 8, 20, 30}, - {10, 5, 20, 30}, + {10, {.packed = 10 }, 1, 10, 20, 30}, + {0, {.packed = 10 }, 2, 4, 20, 30}, + {2, {.packed = 10 }, 3, 7, 20, 30}, + {11, {.packed = 10 }, 4, 11, 20, 30}, + {78, {.packed = 10 }, 5, 9, 20, 30}, + {246, {.packed = 10 }, 6, 6, 20, 30}, + {0, {.packed = 10 }, 7, 6, 22, 30}, + {20, {.packed = 10 }, 8, 9, 20, 30}, + {22, {.packed = 10 }, 9, 8, 20, 30}, + {77, {.packed = 10 }, 10, 5, 20, 30}, }; for(int i = 0; i < 10; i++) { - topster.add(data[i].key, data[i].match_score, data[i].primary_attr, data[i].secondary_attr); + topster.add(data[i].key, data[i].match_score, data[i].primary_attr, data[i].secondary_attr, + data[i].start_offset, data[i].offset_diffs.packed); } topster.sort();