Allow field highlighting independent of query_by fields.

2025-05-19 21:22:25 +08:00 · 2021-06-17 18:24:11 +05:30 · 2021-06-17 18:24:11 +05:30 · dbeb00debe
commit dbeb00debe
parent 72a240888e
4 changed files with 124 additions and 12 deletions
--- a/include/collection.h
+++ b/include/collection.h
@ -534,7 +534,8 @@ public:
                                  size_t limit_hits=UINT32_MAX,
                                  bool prioritize_exact_match=true,
                                  bool pre_segmented_query=false,
-                                  bool enable_overrides=true) const;
+                                  bool enable_overrides=true,
+                                  const std::string& highlight_fields="") const;

    Option<bool> get_filter_ids(const std::string & simple_filter_query,
                                std::vector<std::pair<size_t, uint32_t*>>& index_ids);
--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -497,7 +497,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
                                  const std::string & simple_facet_query,
                                  const size_t snippet_threshold,
                                  const size_t highlight_affix_num_tokens,
-                                  const std::string & highlight_full_fields,
+                                  const std::string& highlight_full_fields,
                                  size_t typo_tokens_threshold,
                                  const std::string& pinned_hits_str,
                                  const std::string& hidden_hits_str,
@ -509,7 +509,8 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
                                  size_t limit_hits,
                                  bool prioritize_exact_match,
                                  bool pre_segmented_query,
-                                  bool enable_overrides) const {
+                                  bool enable_overrides,
+                                  const std::string& highlight_fields) const {

    std::shared_lock lock(mutex);

@ -1089,18 +1090,35 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
            spp::sparse_hash_set<std::string> fields_highlighted_fully;
            StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ",");

+            std::vector<std::string> fields_highlighted_vec;
+            std::vector<size_t> fields_highlighted_indices;
+            if(highlight_fields.empty()) {
+                for(size_t i = 0; i < search_fields.size(); i++) {
+                    const auto& field_name = search_fields[i];
+                    // should not pick excluded field for highlighting
+                    if(exclude_fields.count(field_name) > 0) {
+                        continue;
+                    }
+
+                    fields_highlighted_vec.emplace_back(field_name);
+                    fields_highlighted_indices.push_back(i);
+                }
+            } else {
+                if(query != "*") {
+                    StringUtils::split(highlight_fields, fields_highlighted_vec, ",");
+                    for(size_t i = 0; i < fields_highlighted_vec.size(); i++) {
+                        fields_highlighted_indices.push_back(0);
+                    }
+                }
+            }
+
            for(std::string & highlight_full_field: fields_highlighted_fully_vec) {
                fields_highlighted_fully.emplace(highlight_full_field);
            }

-            for(size_t i = 0; i < search_fields.size(); i++) {
-                const std::string& field_name = search_fields[i];
-                const std::vector<std::string>& q_tokens = field_query_tokens[i].q_include_tokens;
-
-                // should not pick excluded field for highlighting
-                if(exclude_fields.count(field_name) > 0) {
-                    continue;
-                }
+            for(size_t i = 0; i < fields_highlighted_vec.size(); i++) {
+                const std::string& field_name = fields_highlighted_vec[i];
+                const std::vector<std::string>& q_tokens = field_query_tokens[fields_highlighted_indices[i]].q_include_tokens;

                field search_field = search_schema.at(field_name);
                if(query != "*" && (search_field.type == field_types::STRING ||
--- a/src/collection_manager.cpp
+++ b/src/collection_manager.cpp
@ -495,6 +495,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re

    // list of fields which will be highlighted fully without snippeting
    const char *HIGHLIGHT_FULL_FIELDS = "highlight_full_fields";
+    const char *HIGHLIGHT_FIELDS = "highlight_fields";

    const char *HIGHLIGHT_START_TAG = "highlight_start_tag";
    const char *HIGHLIGHT_END_TAG = "highlight_end_tag";
@ -546,6 +547,10 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
        req_params[HIGHLIGHT_FULL_FIELDS] = "";
    }

+    if(req_params.count(HIGHLIGHT_FIELDS) == 0) {
+        req_params[HIGHLIGHT_FIELDS] = "";
+    }
+
    if(req_params.count(HIGHLIGHT_START_TAG) == 0) {
        req_params[HIGHLIGHT_START_TAG] = "<mark>";
    }
@ -768,7 +773,8 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
                                                          static_cast<size_t>(std::stol(req_params[LIMIT_HITS])),
                                                          prioritize_exact_match,
                                                          pre_segmented_query,
-                                                          enable_overrides
+                                                          enable_overrides,
+                                                          req_params[HIGHLIGHT_FIELDS]
                                                        );

    uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
--- a/test/collection_specific_test.cpp
+++ b/test/collection_specific_test.cpp
@ -54,5 +54,92 @@ TEST_F(CollectionSpecificTest, SearchTextWithHyphen) {
    ASSERT_EQ(1, results["hits"].size());

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
+    collectionManager.drop_collection("coll1");
+}
+
+TEST_F(CollectionSpecificTest, ExplicitHighlightFieldsConfig) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("description", field_types::STRING, false),
+                                 field("author", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+
+    nlohmann::json doc;
+    doc["id"] = "0";
+    doc["title"] = "The quick brown fox was too fast.";
+    doc["description"] = "A story about a brown fox who was fast.";
+    doc["author"] = "David Pernell";
+    doc["points"] = 100;
+
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    auto results = coll1->search("brown fox pernell", {"title"}, "", {}, {}, {2}, 10,
+                                 1, FREQUENCY, {false}, 1, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 1, {}, {}, {}, 0,
+                                 "<mark>", "</mark>", {1}, 10000, true, false, true, "description,author").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ(2, results["hits"][0]["highlights"].size());
+
+    ASSERT_EQ("description", results["hits"][0]["highlights"][0]["field"].get<std::string>());
+    ASSERT_EQ("A story about a <mark>brown</mark> <mark>fox</mark> who was fast.", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
+
+    ASSERT_EQ("author", results["hits"][0]["highlights"][1]["field"].get<std::string>());
+    ASSERT_EQ("David <mark>Pernell</mark>", results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
+
+    // an excluded field ("description") is dropped from the returned document but is still highlighted when it is listed explicitly in the highlight fields
+
+    results = coll1->search("brown fox pernell", {"title"}, "", {}, {}, {2}, 10,
+                            1, FREQUENCY, {false}, 1, spp::sparse_hash_set<std::string>(),
+                            {"description"}, 10, "", 30, 4, "", 1, {}, {}, {}, 0,
+                            "<mark>", "</mark>", {1}, 10000, true, false, true, "description,author").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ(2, results["hits"][0]["highlights"].size());
+    ASSERT_FALSE(results["hits"][0]["document"].contains("description"));
+
+    ASSERT_EQ("description", results["hits"][0]["highlights"][0]["field"].get<std::string>());
+    ASSERT_EQ("author", results["hits"][0]["highlights"][1]["field"].get<std::string>());
+
+    // the query token occurs only in "author", which is not the field selected for highlighting ("description"), so no highlights are expected
+
+    results = coll1->search("pernell", {"title", "author"}, "", {}, {}, {2}, 10,
+                            1, FREQUENCY, {false}, 1, spp::sparse_hash_set<std::string>(),
+                            {"description"}, 10, "", 30, 4, "", 1, {}, {}, {}, 0,
+                            "<mark>", "</mark>", {1,1}, 10000, true, false, true, "description").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ(0, results["hits"][0]["highlights"].size());
+
+    // wildcard query with search field names: explicit highlight fields must still yield no highlights
+
+    results = coll1->search("*", {"title", "author"}, "", {}, {}, {2}, 10,
+                            1, FREQUENCY, {false}, 1, spp::sparse_hash_set<std::string>(),
+                            {"description"}, 10, "", 30, 4, "", 1, {}, {}, {}, 0,
+                            "<mark>", "</mark>", {1,1}, 10000, true, false, true, "description,author").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ(0, results["hits"][0]["highlights"].size());
+
+    // wildcard query without search field names: must likewise yield no highlights
+
+    results = coll1->search("*", {}, "", {}, {}, {2}, 10,
+                            1, FREQUENCY, {false}, 1, spp::sparse_hash_set<std::string>(),
+                            {"description"}, 10, "", 30, 4, "", 1, {}, {}, {}, 0,
+                            "<mark>", "</mark>", {1,1}, 10000, true, false, true, "description,author").get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ(0, results["hits"][0]["highlights"].size());
+
     collectionManager.drop_collection("coll1");
 }