Parameterize the number of tokens that surround a highlight.

This commit is contained in:
Kishore Nallan 2020-10-06 19:04:35 +05:30
parent a0f6bd4347
commit 7ced978520
6 changed files with 72 additions and 33 deletions

View File

@ -155,7 +155,9 @@ private:
void highlight_result(const field &search_field, const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV* field_order_kv, const nlohmann::json &document,
StringUtils & string_utils, size_t snippet_threshold,
StringUtils & string_utils,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
bool highlighted_fully,
highlight_t &highlight);
@ -242,6 +244,7 @@ public:
size_t max_facet_values=10,
const std::string & simple_facet_query = "",
const size_t snippet_threshold = 30,
const size_t highlight_affix_num_tokens = 4,
const std::string & highlight_full_fields = "",
size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD,
const std::map<size_t, std::vector<std::string>>& pinned_hits={},

View File

@ -485,6 +485,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
const size_t max_facet_values,
const std::string & simple_facet_query,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
const std::string & highlight_full_fields,
size_t typo_tokens_threshold,
const std::map<size_t, std::vector<std::string>>& pinned_hits,
@ -1059,7 +1060,8 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
highlight_t highlight;
highlight_result(search_field, searched_queries, field_order_kv, document,
string_utils, snippet_threshold, highlighted_fully, highlight);
string_utils, snippet_threshold, highlight_affix_num_tokens,
highlighted_fully, highlight);
if(!highlight.snippets.empty()) {
highlights.push_back(highlight);
@ -1305,7 +1307,9 @@ void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t
void Collection::highlight_result(const field &search_field,
const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV* field_order_kv, const nlohmann::json & document,
StringUtils & string_utils, size_t snippet_threshold,
StringUtils & string_utils,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
bool highlighted_fully,
highlight_t & highlight) {
@ -1395,12 +1399,15 @@ void Collection::highlight_result(const field &search_field,
auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
size_t prefix_length = highlight_affix_num_tokens;
size_t suffix_length = highlight_affix_num_tokens + 1;
// For longer strings, pick surrounding tokens within `highlight_affix_num_tokens` of min_index and max_index for the snippet
const size_t start_index = (tokens.size() <= snippet_threshold) ? 0 :
std::max(0, (int)(*(minmax.first) - 4));
std::max(0, (int)(*(minmax.first) - prefix_length));
const size_t end_index = (tokens.size() <= snippet_threshold) ? tokens.size() :
std::min((int)tokens.size(), (int)(*(minmax.second) + 5));
std::min((int)tokens.size(), (int)(*(minmax.second) + suffix_length));
std::stringstream snippet_stream;
for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) {

View File

@ -254,6 +254,9 @@ bool get_search(http_req & req, http_res & res) {
// strings under this length will be fully highlighted, instead of showing a snippet of relevant portion
const char *SNIPPET_THRESHOLD = "snippet_threshold";
// the number of tokens that should surround the highlighted text
const char *HIGHLIGHT_AFFIX_NUM_TOKENS = "highlight_affix_num_tokens";
// list of fields which will be highlighted fully without snippeting
const char *HIGHLIGHT_FULL_FIELDS = "highlight_full_fields";
@ -290,6 +293,10 @@ bool get_search(http_req & req, http_res & res) {
req.params[SNIPPET_THRESHOLD] = "30";
}
if(req.params.count(HIGHLIGHT_AFFIX_NUM_TOKENS) == 0) {
req.params[HIGHLIGHT_AFFIX_NUM_TOKENS] = "4";
}
if(req.params.count(HIGHLIGHT_FULL_FIELDS) == 0) {
req.params[HIGHLIGHT_FULL_FIELDS] = "";
}
@ -362,6 +369,11 @@ bool get_search(http_req & req, http_res & res) {
return false;
}
if(!StringUtils::is_uint32_t(req.params[HIGHLIGHT_AFFIX_NUM_TOKENS])) {
res.set_400("Parameter `" + std::string(HIGHLIGHT_AFFIX_NUM_TOKENS) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint32_t(req.params[GROUP_LIMIT])) {
res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer.");
return false;
@ -474,6 +486,7 @@ bool get_search(http_req & req, http_res & res) {
static_cast<size_t>(std::stol(req.params[MAX_FACET_VALUES])),
req.params[FACET_QUERY],
static_cast<size_t>(std::stol(req.params[SNIPPET_THRESHOLD])),
static_cast<size_t>(std::stol(req.params[HIGHLIGHT_AFFIX_NUM_TOKENS])),
req.params[HIGHLIGHT_FULL_FIELDS],
typo_tokens_threshold,
pinned_hits,

View File

@ -64,7 +64,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size"}, 2).get();
@ -107,7 +107,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
res = coll_group->search("*", {}, "", {"brand"}, sort_size, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 2).get();
@ -147,7 +147,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
@ -194,7 +194,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
@ -230,7 +230,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
auto res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 100);
@ -240,7 +240,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 0);
@ -252,7 +252,7 @@ TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"brand"}, 1).get();
@ -322,7 +322,7 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) {
auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"colors"}, 2).get();

View File

@ -271,7 +271,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}).get();
@ -289,7 +289,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, hidden_hits).get();
@ -305,7 +305,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, hidden_hits).get();
@ -341,7 +341,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
{}, {hidden_hits}).get();
@ -362,7 +362,7 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}).get();
@ -383,7 +383,7 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}, {"cast"}, 2).get();

View File

@ -558,14 +558,14 @@ TEST_F(CollectionTest, TypoTokensThreshold) {
// Query expansion should happen only based on the `typo_tokens_threshold` value
auto results = collection->search("launch", {"title"}, "", {}, sort_fields, 2, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "", 0).get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "", 0).get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(5, results["found"].get<size_t>());
results = collection->search("launch", {"title"}, "", {}, sort_fields, 2, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "", 10).get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "", 10).get();
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(7, results["found"].get<size_t>());
@ -2210,6 +2210,22 @@ TEST_F(CollectionTest, SearchHighlightShouldFollowThreshold) {
ASSERT_STREQ("fox jumped over the <mark>lazy</mark> dog and ran straight",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
// specify the number of surrounding tokens to return
size_t highlight_affix_num_tokens = 2;
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, highlight_affix_num_tokens).get();
ASSERT_STREQ("over the <mark>lazy</mark> dog and",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
highlight_affix_num_tokens = 0;
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, highlight_affix_num_tokens).get();
ASSERT_STREQ("<mark>lazy</mark>",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}
@ -2238,7 +2254,7 @@ TEST_F(CollectionTest, UpdateDocument) {
auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.",
@ -2251,13 +2267,13 @@ TEST_F(CollectionTest, UpdateDocument) {
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(0, res["hits"].size());
res = coll1->search("quick", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("The quick brown fox.", res["hits"][0]["document"]["title"].get<std::string>().c_str());
@ -2277,14 +2293,14 @@ TEST_F(CollectionTest, UpdateDocument) {
// check for old tag
res = coll1->search("NEWS", {"tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(0, res["hits"].size());
// now check for new tag and also try faceting on that field
res = coll1->search("SENTENCE", {"tags"}, "", {"tags"}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("SENTENCE", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
@ -2299,7 +2315,7 @@ TEST_F(CollectionTest, UpdateDocument) {
res = coll1->search("*", {"tags"}, "points: > 90", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ(99, res["hits"][0]["document"]["points"].get<size_t>());
@ -2313,7 +2329,7 @@ TEST_F(CollectionTest, UpdateDocument) {
res = coll1->search("*", {"tags"}, "points: > 101", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ(105, res["hits"][0]["document"]["points"].get<size_t>());
@ -2346,7 +2362,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2355,14 +2371,14 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
// should not return value key when highlight_full_fields is not specified
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "").get();
ASSERT_EQ(2, res["hits"][0]["highlights"][0].size());
// query multiple fields
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title, tags").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(2, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2375,7 +2391,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
spp::sparse_hash_set<std::string> excluded_fields = {"tags"};
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
excluded_fields, 10, "", 5, "title, tags").get();
excluded_fields, 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(1, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2385,7 +2401,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
excluded_fields = {"tags", "title"};
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
excluded_fields, 10, "", 5, "title, tags").get();
excluded_fields, 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(0, res["hits"][0]["highlights"].size());
collectionManager.drop_collection("coll1");