Parameterize the number of tokens that surround a highlight.

This commit is contained in:
Kishore Nallan 2020-10-06 19:04:35 +05:30
parent a0f6bd4347
commit 7ced978520
6 changed files with 72 additions and 33 deletions

View File

@ -155,7 +155,9 @@ private:
void highlight_result(const field &search_field, const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV* field_order_kv, const nlohmann::json &document,
StringUtils & string_utils, size_t snippet_threshold,
StringUtils & string_utils,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
bool highlighted_fully,
highlight_t &highlight);
@ -242,6 +244,7 @@ public:
size_t max_facet_values=10,
const std::string & simple_facet_query = "",
const size_t snippet_threshold = 30,
const size_t highlight_affix_num_tokens = 4,
const std::string & highlight_full_fields = "",
size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD,
const std::map<size_t, std::vector<std::string>>& pinned_hits={},

View File

@ -485,6 +485,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
const size_t max_facet_values,
const std::string & simple_facet_query,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
const std::string & highlight_full_fields,
size_t typo_tokens_threshold,
const std::map<size_t, std::vector<std::string>>& pinned_hits,
@ -1059,7 +1060,8 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
highlight_t highlight;
highlight_result(search_field, searched_queries, field_order_kv, document,
string_utils, snippet_threshold, highlighted_fully, highlight);
string_utils, snippet_threshold, highlight_affix_num_tokens,
highlighted_fully, highlight);
if(!highlight.snippets.empty()) {
highlights.push_back(highlight);
@ -1305,7 +1307,9 @@ void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t
void Collection::highlight_result(const field &search_field,
const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV* field_order_kv, const nlohmann::json & document,
StringUtils & string_utils, size_t snippet_threshold,
StringUtils & string_utils,
const size_t snippet_threshold,
const size_t highlight_affix_num_tokens,
bool highlighted_fully,
highlight_t & highlight) {
@ -1395,12 +1399,15 @@ void Collection::highlight_result(const field &search_field,
auto minmax = std::minmax_element(token_indices.begin(), token_indices.end());
size_t prefix_length = highlight_affix_num_tokens;
size_t suffix_length = highlight_affix_num_tokens + 1;
// For longer strings, pick surrounding tokens within `highlight_affix_num_tokens` of min_index and max_index for the snippet
const size_t start_index = (tokens.size() <= snippet_threshold) ? 0 :
std::max(0, (int)(*(minmax.first) - 4));
std::max(0, (int)(*(minmax.first) - prefix_length));
const size_t end_index = (tokens.size() <= snippet_threshold) ? tokens.size() :
std::min((int)tokens.size(), (int)(*(minmax.second) + 5));
std::min((int)tokens.size(), (int)(*(minmax.second) + suffix_length));
std::stringstream snippet_stream;
for(size_t snippet_index = start_index; snippet_index < end_index; snippet_index++) {

View File

@ -254,6 +254,9 @@ bool get_search(http_req & req, http_res & res) {
// strings under this length will be fully highlighted, instead of showing a snippet of relevant portion
const char *SNIPPET_THRESHOLD = "snippet_threshold";
// the number of tokens that should surround the highlighted text
const char *HIGHLIGHT_AFFIX_NUM_TOKENS = "highlight_affix_num_tokens";
// list of fields which will be highlighted fully without snippeting
const char *HIGHLIGHT_FULL_FIELDS = "highlight_full_fields";
@ -290,6 +293,10 @@ bool get_search(http_req & req, http_res & res) {
req.params[SNIPPET_THRESHOLD] = "30";
}
if(req.params.count(HIGHLIGHT_AFFIX_NUM_TOKENS) == 0) {
req.params[HIGHLIGHT_AFFIX_NUM_TOKENS] = "4";
}
if(req.params.count(HIGHLIGHT_FULL_FIELDS) == 0) {
req.params[HIGHLIGHT_FULL_FIELDS] = "";
}
@ -362,6 +369,11 @@ bool get_search(http_req & req, http_res & res) {
return false;
}
if(!StringUtils::is_uint32_t(req.params[HIGHLIGHT_AFFIX_NUM_TOKENS])) {
res.set_400("Parameter `" + std::string(HIGHLIGHT_AFFIX_NUM_TOKENS) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint32_t(req.params[GROUP_LIMIT])) {
res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer.");
return false;
@ -474,6 +486,7 @@ bool get_search(http_req & req, http_res & res) {
static_cast<size_t>(std::stol(req.params[MAX_FACET_VALUES])),
req.params[FACET_QUERY],
static_cast<size_t>(std::stol(req.params[SNIPPET_THRESHOLD])),
static_cast<size_t>(std::stol(req.params[HIGHLIGHT_AFFIX_NUM_TOKENS])),
req.params[HIGHLIGHT_FULL_FIELDS],
typo_tokens_threshold,
pinned_hits,

View File

@ -64,7 +64,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size"}, 2).get();
@ -107,7 +107,7 @@ TEST_F(CollectionGroupingTest, GroupingBasics) {
res = coll_group->search("*", {}, "", {"brand"}, sort_size, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 2).get();
@ -147,7 +147,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
@ -194,7 +194,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
@ -230,7 +230,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
auto res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 100);
@ -240,7 +240,7 @@ TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30, 5,
"", 10,
{}, {}, {"rating"}, 0);
@ -252,7 +252,7 @@ TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"brand"}, 1).get();
@ -322,7 +322,7 @@ TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) {
auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10,
{}, {}, {"colors"}, 2).get();

View File

@ -271,7 +271,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}).get();
@ -289,7 +289,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, hidden_hits).get();
@ -305,7 +305,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, hidden_hits).get();
@ -341,7 +341,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
{}, {hidden_hits}).get();
@ -362,7 +362,7 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}).get();
@ -383,7 +383,7 @@ TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30, 5,
"", 10,
pinned_hits, {}, {"cast"}, 2).get();

View File

@ -558,14 +558,14 @@ TEST_F(CollectionTest, TypoTokensThreshold) {
// Query expansion should happen only based on the `typo_tokens_threshold` value
auto results = collection->search("launch", {"title"}, "", {}, sort_fields, 2, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "", 0).get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "", 0).get();
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(5, results["found"].get<size_t>());
results = collection->search("launch", {"title"}, "", {}, sort_fields, 2, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "", 10).get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "", 10).get();
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(7, results["found"].get<size_t>());
@ -2210,6 +2210,22 @@ TEST_F(CollectionTest, SearchHighlightShouldFollowThreshold) {
ASSERT_STREQ("fox jumped over the <mark>lazy</mark> dog and ran straight",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
// specify the number of surrounding tokens to return
size_t highlight_affix_num_tokens = 2;
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, highlight_affix_num_tokens).get();
ASSERT_STREQ("over the <mark>lazy</mark> dog and",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
highlight_affix_num_tokens = 0;
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, highlight_affix_num_tokens).get();
ASSERT_STREQ("<mark>lazy</mark>",
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}
@ -2238,7 +2254,7 @@ TEST_F(CollectionTest, UpdateDocument) {
auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.",
@ -2251,13 +2267,13 @@ TEST_F(CollectionTest, UpdateDocument) {
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(0, res["hits"].size());
res = coll1->search("quick", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("The quick brown fox.", res["hits"][0]["document"]["title"].get<std::string>().c_str());
@ -2277,14 +2293,14 @@ TEST_F(CollectionTest, UpdateDocument) {
// check for old tag
res = coll1->search("NEWS", {"tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(0, res["hits"].size());
// now check for new tag and also try faceting on that field
res = coll1->search("SENTENCE", {"tags"}, "", {"tags"}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_STREQ("SENTENCE", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
@ -2299,7 +2315,7 @@ TEST_F(CollectionTest, UpdateDocument) {
res = coll1->search("*", {"tags"}, "points: > 90", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ(99, res["hits"][0]["document"]["points"].get<size_t>());
@ -2313,7 +2329,7 @@ TEST_F(CollectionTest, UpdateDocument) {
res = coll1->search("*", {"tags"}, "points: > 101", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ(105, res["hits"][0]["document"]["points"].get<size_t>());
@ -2346,7 +2362,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
ASSERT_EQ(1, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2355,14 +2371,14 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
// should not return value key when highlight_full_fields is not specified
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "").get();
ASSERT_EQ(2, res["hits"][0]["highlights"][0].size());
// query multiple fields
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 5, "title, tags").get();
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(2, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2375,7 +2391,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
spp::sparse_hash_set<std::string> excluded_fields = {"tags"};
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
excluded_fields, 10, "", 5, "title, tags").get();
excluded_fields, 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(1, res["hits"][0]["highlights"].size());
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
@ -2385,7 +2401,7 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) {
excluded_fields = {"tags", "title"};
res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, 0, 10, 1,
token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
excluded_fields, 10, "", 5, "title, tags").get();
excluded_fields, 10, "", 5, 5, "title, tags").get();
ASSERT_EQ(0, res["hits"][0]["highlights"].size());
collectionManager.drop_collection("coll1");