diff --git a/src/collection.cpp b/src/collection.cpp
index 1904a880..2677b153 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -1397,6 +1397,7 @@ void Collection::highlight_result(const field &search_field,
 
     std::vector<uint32_t*> leaf_to_indices;
     std::vector<art_leaf*> query_suggestion;
+    std::set<std::string> query_suggestion_tokens;
 
     for (const art_leaf *token_leaf : searched_queries[field_order_kv->query_index]) {
         // Must search for the token string fresh on that field for the given document since `token_leaf`
@@ -1408,6 +1409,8 @@ void Collection::highlight_result(const field &search_field,
 
         if(actual_leaf != nullptr) {
             query_suggestion.push_back(actual_leaf);
+            std::string token(reinterpret_cast<char*>(actual_leaf->key), actual_leaf->key_len-1);
+            query_suggestion_tokens.insert(token);
             std::vector<uint16_t> positions;
             uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);
             auto doc_indices = new uint32_t[1];
@@ -1416,20 +1419,26 @@ void Collection::highlight_result(const field &search_field,
         }
     }
 
-    if(query_suggestion.empty()) {
-        // can happen for compound query matched across 2 fields: try to use original query tokens
+    if(query_suggestion.size() != q_tokens.size()) {
+        // can happen for compound query matched across 2 fields when some tokens are dropped
         for(const std::string& q_token: q_tokens) {
+            if(query_suggestion_tokens.count(q_token) != 0) {
+                continue;
+            }
+
             Index* index = indices[field_order_kv->key % num_memory_shards];
             art_leaf *actual_leaf = index->get_token_leaf(search_field.name,
                                                           reinterpret_cast<const unsigned char *>(q_token.c_str()),
                                                           q_token.size() + 1);
             if(actual_leaf != nullptr) {
-                query_suggestion.push_back(actual_leaf);
                 std::vector<uint16_t> positions;
                 uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);
-                auto doc_indices = new uint32_t[1];
-                doc_indices[0] = doc_index;
-                leaf_to_indices.push_back(doc_indices);
+                if(doc_index != actual_leaf->values->ids.getLength()) {
+                    auto doc_indices = new uint32_t[1];
+                    doc_indices[0] = doc_index;
+                    leaf_to_indices.push_back(doc_indices);
+                    query_suggestion.push_back(actual_leaf);
+                }
             }
         }
     }
@@ -1477,6 +1486,15 @@ void Collection::highlight_result(const field &search_field,
         const auto& match_index = match_indices[index];
         const Match& match = match_index.match;
 
+        size_t last_valid_offset = 0;
+        for (auto token_offset : match.offsets) {
+            if(token_offset.offset != MAX_DISPLACEMENT) {
+                last_valid_offset = token_offset.offset;
+            } else {
+                break;
+            }
+        }
+
         const std::string& text = (search_field.type == field_types::STRING) ? document[search_field.name] :
                                   document[search_field.name][match_index.index];
         Tokenizer tokenizer(text, true, false, search_field.locale);
@@ -1524,14 +1542,14 @@ void Collection::highlight_result(const field &search_field,
                }
            }
 
-           if(raw_token_index == match.offsets.back().offset + highlight_affix_num_tokens) {
+           if(raw_token_index == last_valid_offset + highlight_affix_num_tokens) {
                // register end of highlight snippet
                snippet_end_offset = tok_end;
            }
 
-           if(raw_token_index > match.offsets.back().offset + highlight_affix_num_tokens &&
-              raw_token_index == snippet_threshold - 1) {
-               // since we have already crossed snippeting threshold, we can break now
+           if(raw_token_index == snippet_threshold - 1 &&
+              raw_token_index >= last_valid_offset + highlight_affix_num_tokens) {
+               // since we have already crossed snippet threshold, we can break now
                break;
            }
        }
@@ -1541,7 +1559,7 @@ void Collection::highlight_result(const field &search_field,
        }
 
        if(raw_token_index + 1 < snippet_threshold) {
-           // fully highlight field whose token size is less than given snippeth threshold
+           // fully highlight field whose token size is less than given snippet threshold
           snippet_start_offset = 0;
           snippet_end_offset = text.size() - 1;
        }
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index 21f877a1..2879f6f2 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -3156,6 +3156,21 @@ TEST_F(CollectionTest, MultiFieldHighlighting) {
     ASSERT_EQ("Easily replenish your cell phone with this wireless charger.",
               results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
 
+    results = coll1->search("John With Denver",
+                            {"description"}, "", {}, {}, 0, 10, 1, FREQUENCY,
+                            true, 1, spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
+                            "<mark>", "</mark>", {1}).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
+
+    ASSERT_EQ(1, results["hits"][0]["highlights"].size());
+    ASSERT_EQ("description", results["hits"][0]["highlights"][0]["field"].get<std::string>());
+    ASSERT_EQ("<mark>John</mark> <mark>Denver</mark>",
+              results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
+
     results = coll1->search("Annies song John Denver",
                             {"name","description"}, "", {}, {}, 0, 10, 1, FREQUENCY,
                             true, 1, spp::sparse_hash_set<std::string>(),