From 38d44a7c8ae9f51e96ba7b06d1bbb8c472f77668 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 26 Jul 2021 13:18:42 +0530 Subject: [PATCH 1/6] Highlight field value that is a prefix of the query. --- include/index.h | 3 ++- src/collection.cpp | 16 ++++++----- src/index.cpp | 9 ++++--- test/collection_specific_test.cpp | 45 ++++++++++++++++++++++++++++--- test/collection_test.cpp | 6 ++++- 5 files changed, 64 insertions(+), 15 deletions(-) diff --git a/include/index.h b/include/index.h index 37bb45e0..b10bea9d 100644 --- a/include/index.h +++ b/include/index.h @@ -396,7 +396,8 @@ public: static void transform_for_180th_meridian(GeoCoord& point, double offset); - art_leaf* get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len); + void get_token_leaves(const std::string & field_name, const unsigned char* token, uint32_t token_len, + std::vector& leaves); // the following methods are not synchronized because their parent calls are synchronized diff --git a/src/collection.cpp b/src/collection.cpp index 0624cb44..d754d66c 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1501,11 +1501,12 @@ void Collection::highlight_result(const field &search_field, // Must search for the token string fresh on that field for the given document since `token_leaf` // is from the best matched field and need not be present in other fields of a document. Index* index = indices[field_order_kv->key % num_memory_shards]; - art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len); - //LOG(INFO) << "field: " << search_field.name << ", key: " << token_leaf->key; + std::vector leaves; + index->get_token_leaves(search_field.name, &token_leaf->key[0], token_leaf->key_len, leaves); - if(actual_leaf != nullptr) { + for(const auto actual_leaf: leaves) { + //LOG(INFO) << "field: " << search_field.name << ", key: " << token_leaf->key; query_suggestion.push_back(actual_leaf); std::string token(reinterpret_cast(actual_leaf->key), actual_leaf->key_len-1); query_suggestion_tokens.insert(token); @@ -1525,10 +1526,11 @@ void Collection::highlight_result(const field &search_field, } Index* index = indices[field_order_kv->key % num_memory_shards]; - art_leaf *actual_leaf = index->get_token_leaf(search_field.name, - reinterpret_cast(q_token.c_str()), - q_token.size() + 1); - if(actual_leaf != nullptr) { + std::vector leaves; + index->get_token_leaves(search_field.name, reinterpret_cast(q_token.c_str()), + q_token.size() + 1, leaves); + + for(const auto actual_leaf: leaves) { std::vector positions; uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key); if(doc_index != actual_leaf->values->ids.getLength()) { diff --git a/src/index.cpp b/src/index.cpp index 6f6f4b7d..1b8227d2 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2639,10 +2639,13 @@ void Index::tokenize_string_field(const nlohmann::json& document, const field& s } } -art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) { +void Index::get_token_leaves(const std::string & field_name, const unsigned char* token, uint32_t token_len, + std::vector& leaves) { std::shared_lock lock(mutex); - const art_tree *t = search_index.at(field_name); - return (art_leaf*) art_search(t, token, (int) token_len); + art_tree *t = search_index.at(field_name); + token_len = (token_len == 0) ? 0 : token_len-1; + art_fuzzy_search(t, token, token_len, 0, 0, 2, token_ordering::MAX_SCORE, + true, nullptr, 0, leaves); } const spp::sparse_hash_map &Index::_get_search_index() const { diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index 786211f1..d316b230 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -606,10 +606,49 @@ TEST_F(CollectionSpecificTest, TokensSpreadAcrossFields) { spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {4, 1}).get(); - LOG(INFO) << results; - ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); ASSERT_EQ("1", results["hits"][1]["document"]["id"].get()); collectionManager.drop_collection("coll1"); -} \ No newline at end of file +} + +TEST_F(CollectionSpecificTest, HighlightSecondaryFieldWithPrefixMatch) { + std::vector fields = {field("title", field_types::STRING, false), + field("description", field_types::STRING, false), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["title"] = "Functions and Equations"; + doc1["description"] = "Use a function to solve an equation."; + doc1["points"] = 100; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["title"] = "Function of effort"; + doc2["description"] = "Learn all about it."; + doc2["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + + auto results = coll1->search("function", {"title", "description"}, "", {}, {}, {0}, 10, + 1, FREQUENCY, {true, true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, + "", "", {1, 1}).get(); + + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + + ASSERT_EQ(2, results["hits"][0]["highlights"].size()); + + ASSERT_EQ("Functions and Equations", + results["hits"][0]["highlights"][0]["snippet"].get()); + + ASSERT_EQ("Use a function to solve an equation.", + results["hits"][0]["highlights"][1]["snippet"].get()); + + collectionManager.drop_collection("coll1"); +} diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 0e9221a2..d82982d0 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -3317,7 +3317,7 @@ TEST_F(CollectionTest, MultiFieldHighlighting) { ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); - ASSERT_EQ(2, results["hits"][0]["highlights"].size()); + ASSERT_EQ(3, results["hits"][0]["highlights"].size()); ASSERT_EQ("name", results["hits"][0]["highlights"][0]["field"].get()); ASSERT_EQ("Best Wireless Vehicle Charger", results["hits"][0]["highlights"][0]["snippet"].get()); @@ -3326,6 +3326,10 @@ TEST_F(CollectionTest, MultiFieldHighlighting) { ASSERT_EQ("Easily replenish your cell phone with this wireless charger.", results["hits"][0]["highlights"][1]["snippet"].get()); + ASSERT_EQ("categories", results["hits"][0]["highlights"][2]["field"].get()); + ASSERT_EQ("Car Chargers", + results["hits"][0]["highlights"][2]["snippets"][0].get()); + results = coll1->search("John With Denver", {"description"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(), From e45f18785f143b3933bd529ba2cce77977cf9c07 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 26 Jul 2021 19:44:10 +0530 Subject: [PATCH 2/6] Ignore id field present in schema. --- include/field.h | 12 ++++++++++++ test/collection_specific_test.cpp | 24 ++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/include/field.h b/include/field.h index 8d9cbf83..e7200da8 100644 --- a/include/field.h +++ b/include/field.h @@ -5,6 +5,7 @@ #include "art.h" #include "option.h" #include "string_utils.h" +#include "logger.h" #include "json.hpp" namespace field_types { @@ -192,6 +193,10 @@ struct field { bool found_default_sorting_field = false; for(const field & field: fields) { + if(field.name == "id") { + continue; + } + nlohmann::json field_val; field_val[fields::name] = field.name; field_val[fields::type] = field.type; @@ -263,6 +268,13 @@ struct field { size_t num_auto_detect_fields = 0; for(nlohmann::json & field_json: fields_json) { + if(field_json["name"] == "id") { + // No field should exist with the name "id" as it is reserved for internal use + // We cannot throw an error here anymore since that will break backward compatibility! + LOG(WARNING) << "Collection schema cannot contain a field with name `id`. Ignoring field."; + continue; + } + if(!field_json.is_object() || field_json.count(fields::name) == 0 || field_json.count(fields::type) == 0 || !field_json.at(fields::name).is_string() || !field_json.at(fields::type).is_string()) { diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index d316b230..f36be260 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -652,3 +652,27 @@ TEST_F(CollectionSpecificTest, HighlightSecondaryFieldWithPrefixMatch) { collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionSpecificTest, GuardAgainstIdFieldInSchema) { + // The "id" field, if defined in the schema should be ignored + + std::vector fields = {field("title", field_types::STRING, false), + field("id", field_types::STRING, false), + field("points", field_types::INT32, false),}; + + nlohmann::json schema; + schema["name"] = "books"; + schema["fields"] = nlohmann::json::array(); + schema["fields"][0]["name"] = "title"; + schema["fields"][0]["type"] = "string"; + schema["fields"][1]["name"] = "id"; + schema["fields"][1]["type"] = "string"; + schema["fields"][2]["name"] = "points"; + schema["fields"][2]["type"] = "int32"; + + Collection* coll1 = collectionManager.create_collection(schema).get(); + + ASSERT_EQ(0, coll1->get_schema().count("id")); + + collectionManager.drop_collection("coll1"); +} From b4c222064c6bf96f5af6d727d732eea748c4e5b2 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Mon, 26 Jul 2021 19:44:38 +0530 Subject: [PATCH 3/6] Handle bad data in ingestion text gracefully. --- src/collection.cpp | 3 ++- test/collection_specific_test.cpp | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/collection.cpp b/src/collection.cpp index d754d66c..aaf0d11f 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -337,7 +337,8 @@ void Collection::batch_index(std::vector> &index_batch res["code"] = index_record.indexed.code(); } - json_out[index_record.position] = res.dump(); + json_out[index_record.position] = res.dump(-1, ' ', false, + nlohmann::detail::error_handler_t::ignore); } } } diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index f36be260..4fc657e7 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -676,3 +676,16 @@ TEST_F(CollectionSpecificTest, GuardAgainstIdFieldInSchema) { collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionSpecificTest, HandleBadCharactersInStringGracefully) { + std::vector fields = {field("title", field_types::STRING, false), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + std::string doc_str = "不推荐。\",\"price\":10.12,\"ratings\":5}"; + + auto add_op = coll1->add(doc_str); + ASSERT_FALSE(add_op.ok()); + + collectionManager.drop_collection("coll1"); +} From 13cb7b936493f694a296e62e0860dd17e72bc7bc Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 27 Jul 2021 17:57:49 +0530 Subject: [PATCH 4/6] Revert "Highlight field value that is a prefix of the query." This reverts commit 545027a59bc55b24c2fece112b4fa6a655a1f79e. # Conflicts: # test/collection_specific_test.cpp --- include/index.h | 3 +-- src/collection.cpp | 16 ++++++------ src/index.cpp | 9 +++---- test/collection_specific_test.cpp | 41 ------------------------------- test/collection_test.cpp | 6 +---- 5 files changed, 12 insertions(+), 63 deletions(-) diff --git a/include/index.h b/include/index.h index b10bea9d..37bb45e0 100644 --- a/include/index.h +++ b/include/index.h @@ -396,8 +396,7 @@ public: static void transform_for_180th_meridian(GeoCoord& point, double offset); - void get_token_leaves(const std::string & field_name, const unsigned char* token, uint32_t token_len, - std::vector& leaves); + art_leaf* get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len); // the following methods are not synchronized because their parent calls are synchronized diff --git a/src/collection.cpp b/src/collection.cpp index aaf0d11f..3373ab30 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1502,12 +1502,11 @@ void Collection::highlight_result(const field &search_field, // Must search for the token string fresh on that field for the given document since `token_leaf` // is from the best matched field and need not be present in other fields of a document. Index* index = indices[field_order_kv->key % num_memory_shards]; + art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len); - std::vector leaves; - index->get_token_leaves(search_field.name, &token_leaf->key[0], token_leaf->key_len, leaves); + //LOG(INFO) << "field: " << search_field.name << ", key: " << token_leaf->key; - for(const auto actual_leaf: leaves) { - //LOG(INFO) << "field: " << search_field.name << ", key: " << token_leaf->key; + if(actual_leaf != nullptr) { query_suggestion.push_back(actual_leaf); std::string token(reinterpret_cast(actual_leaf->key), actual_leaf->key_len-1); query_suggestion_tokens.insert(token); @@ -1527,11 +1526,10 @@ void Collection::highlight_result(const field &search_field, } Index* index = indices[field_order_kv->key % num_memory_shards]; - std::vector leaves; - index->get_token_leaves(search_field.name, reinterpret_cast(q_token.c_str()), - q_token.size() + 1, leaves); - - for(const auto actual_leaf: leaves) { + art_leaf *actual_leaf = index->get_token_leaf(search_field.name, + reinterpret_cast(q_token.c_str()), + q_token.size() + 1); + if(actual_leaf != nullptr) { std::vector positions; uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key); if(doc_index != actual_leaf->values->ids.getLength()) { diff --git a/src/index.cpp b/src/index.cpp index 1b8227d2..6f6f4b7d 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2639,13 +2639,10 @@ void Index::tokenize_string_field(const nlohmann::json& document, const field& s } } -void Index::get_token_leaves(const std::string & field_name, const unsigned char* token, uint32_t token_len, - std::vector& leaves) { +art_leaf* Index::get_token_leaf(const std::string & field_name, const unsigned char* token, uint32_t token_len) { std::shared_lock lock(mutex); - art_tree *t = search_index.at(field_name); - token_len = (token_len == 0) ? 0 : token_len-1; - art_fuzzy_search(t, token, token_len, 0, 0, 2, token_ordering::MAX_SCORE, - true, nullptr, 0, leaves); + const art_tree *t = search_index.at(field_name); + return (art_leaf*) art_search(t, token, (int) token_len); } const spp::sparse_hash_map &Index::_get_search_index() const { diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index 4fc657e7..c1855479 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -612,47 +612,6 @@ TEST_F(CollectionSpecificTest, TokensSpreadAcrossFields) { collectionManager.drop_collection("coll1"); } -TEST_F(CollectionSpecificTest, HighlightSecondaryFieldWithPrefixMatch) { - std::vector fields = {field("title", field_types::STRING, false), - field("description", field_types::STRING, false), - field("points", field_types::INT32, false),}; - - Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); - - nlohmann::json doc1; - doc1["id"] = "0"; - doc1["title"] = "Functions and Equations"; - doc1["description"] = "Use a function to solve an equation."; - doc1["points"] = 100; - - nlohmann::json doc2; - doc2["id"] = "1"; - doc2["title"] = "Function of effort"; - doc2["description"] = "Learn all about it."; - doc2["points"] = 100; - - ASSERT_TRUE(coll1->add(doc1.dump()).ok()); - ASSERT_TRUE(coll1->add(doc2.dump()).ok()); - - auto results = coll1->search("function", {"title", "description"}, "", {}, {}, {0}, 10, - 1, FREQUENCY, {true, true}, - 10, spp::sparse_hash_set(), - spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, - "", "", {1, 1}).get(); - - ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); - - ASSERT_EQ(2, results["hits"][0]["highlights"].size()); - - ASSERT_EQ("Functions and Equations", - results["hits"][0]["highlights"][0]["snippet"].get()); - - ASSERT_EQ("Use a function to solve an equation.", - results["hits"][0]["highlights"][1]["snippet"].get()); - - collectionManager.drop_collection("coll1"); -} - TEST_F(CollectionSpecificTest, GuardAgainstIdFieldInSchema) { // The "id" field, if defined in the schema should be ignored diff --git a/test/collection_test.cpp b/test/collection_test.cpp index d82982d0..0e9221a2 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -3317,7 +3317,7 @@ TEST_F(CollectionTest, MultiFieldHighlighting) { ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); - ASSERT_EQ(3, results["hits"][0]["highlights"].size()); + ASSERT_EQ(2, results["hits"][0]["highlights"].size()); ASSERT_EQ("name", results["hits"][0]["highlights"][0]["field"].get()); ASSERT_EQ("Best Wireless Vehicle Charger", results["hits"][0]["highlights"][0]["snippet"].get()); @@ -3326,10 +3326,6 @@ TEST_F(CollectionTest, MultiFieldHighlighting) { ASSERT_EQ("Easily replenish your cell phone with this wireless charger.", results["hits"][0]["highlights"][1]["snippet"].get()); - ASSERT_EQ("categories", results["hits"][0]["highlights"][2]["field"].get()); - ASSERT_EQ("Car Chargers", - results["hits"][0]["highlights"][2]["snippets"][0].get()); - results = coll1->search("John With Denver", {"description"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(), From 331db4f27e7917dd025e3f89218f3d824b374788 Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Tue, 27 Jul 2021 19:57:56 +0530 Subject: [PATCH 5/6] Add precision option to geo field sorting. --- include/field.h | 10 +++- src/collection.cpp | 70 ++++++++++++++--------- src/index.cpp | 5 ++ test/collection_sorting_test.cpp | 98 +++++++++++++++++++++++++++++++- 4 files changed, 152 insertions(+), 31 deletions(-) diff --git a/include/field.h b/include/field.h index e7200da8..03d82ec8 100644 --- a/include/field.h +++ b/include/field.h @@ -475,6 +475,7 @@ namespace sort_field_const { static const std::string seq_id = "_seq_id"; static const std::string exclude_radius = "exclude_radius"; + static const std::string precision = "precision"; } struct sort_by { @@ -484,14 +485,16 @@ struct sort_by { // geo related fields int64_t geopoint; uint32_t exclude_radius; + uint32_t geo_precision; sort_by(const std::string & name, const std::string & order): - name(name), order(order), geopoint(0), exclude_radius(0) { + name(name), order(order), geopoint(0), exclude_radius(0), geo_precision(0) { } - sort_by(const std::string &name, const std::string &order, int64_t geopoint, uint32_t exclude_radius) : - name(name), order(order), geopoint(geopoint), exclude_radius(exclude_radius) { + sort_by(const std::string &name, const std::string &order, int64_t geopoint, + uint32_t exclude_radius, uint32_t geo_precision) : + name(name), order(order), geopoint(geopoint), exclude_radius(exclude_radius), geo_precision(geo_precision) { } @@ -500,6 +503,7 @@ struct sort_by { order = other.order; geopoint = other.geopoint; exclude_radius = other.exclude_radius; + geo_precision = other.geo_precision; return *this; } }; diff --git a/src/collection.cpp b/src/collection.cpp index 3373ab30..44590a26 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -753,36 +753,54 @@ Option Collection::search(const std::string & query, const std:: if(geo_parts.size() == 3) { // try to parse the exclude radius option - if(!StringUtils::begins_with(geo_parts[2], sort_field_const::exclude_radius)) { - return Option(400, error); - } + bool is_exclude_option = false; - std::vector exclude_parts; - StringUtils::split(geo_parts[2], exclude_parts, ":"); - - if(exclude_parts.size() != 2) { - return Option(400, error); - } - - std::vector exclude_value_parts; - StringUtils::split(exclude_parts[1], exclude_value_parts, " "); - - if(exclude_value_parts.size() != 2) { - return Option(400, error); - } - - if(!StringUtils::is_float(exclude_value_parts[0])) { - return Option(400, error); - } - - if(exclude_value_parts[1] == "km") { - sort_field_std.exclude_radius = std::stof(exclude_value_parts[0]) * 1000; - } else if(exclude_value_parts[1] == "mi") { - sort_field_std.exclude_radius = std::stof(exclude_value_parts[0]) * 1609.34; + if(StringUtils::begins_with(geo_parts[2], sort_field_const::exclude_radius)) { + is_exclude_option = true; + } else if(StringUtils::begins_with(geo_parts[2], sort_field_const::precision)) { + is_exclude_option = false; } else { - return Option(400, "Sort field's exclude radius " + return Option(400, error); + } + + std::vector param_parts; + StringUtils::split(geo_parts[2], param_parts, ":"); + + if(param_parts.size() != 2) { + return Option(400, error); + } + + std::vector param_value_parts; + StringUtils::split(param_parts[1], param_value_parts, " "); + + if(param_value_parts.size() != 2) { + return Option(400, error); + } + + if(!StringUtils::is_float(param_value_parts[0])) { + return Option(400, error); + } + + int32_t value_meters; + + if(param_value_parts[1] == "km") { + value_meters = std::stof(param_value_parts[0]) * 1000; + } else if(param_value_parts[1] == "mi") { + value_meters = std::stof(param_value_parts[0]) * 1609.34; + } else { + return Option(400, "Sort field's parameter " "unit must be either `km` or `mi`."); } + + if(value_meters <= 0) { + return Option(400, "Sort field's parameter must be a positive number."); + } + + if(is_exclude_option) { + sort_field_std.exclude_radius = value_meters; + } else { + sort_field_std.geo_precision = value_meters; + } } double lat = std::stod(geo_parts[0]); diff --git a/src/index.cpp b/src/index.cpp index 6f6f4b7d..a36a7adc 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -2145,6 +2145,11 @@ void Index::score_results(const std::vector & sort_fields, const uint16 dist = 0; } + if(sort_fields[i].geo_precision > 0) { + dist = dist + sort_fields[i].geo_precision - 1 - + (dist + sort_fields[i].geo_precision - 1) % sort_fields[i].geo_precision; + } + geopoint_distances[i].emplace(seq_id, dist); } diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp index d85686a5..5c2283b0 100644 --- a/test/collection_sorting_test.cpp +++ b/test/collection_sorting_test.cpp @@ -756,7 +756,101 @@ TEST_F(CollectionSortingTest, GeoPointSortingWithExcludeRadius) { {}, geo_sort_fields, {0}, 10, 1, FREQUENCY); ASSERT_FALSE(res_op.ok()); - ASSERT_EQ("Sort field's exclude radius unit must be either `km` or `mi`.", res_op.error()); + ASSERT_EQ("Sort field's parameter unit must be either `km` or `mi`.", res_op.error()); + + geo_sort_fields = { sort_by("loc(32.24348, 77.1893, exclude_radius: -10 km)", "ASC") }; + res_op = coll1->search("*", {}, "loc: (32.24348, 77.1893, 20 km)", + {}, geo_sort_fields, {0}, 10, 1, FREQUENCY); + + ASSERT_FALSE(res_op.ok()); + ASSERT_EQ("Sort field's parameter must be a positive number.", res_op.error()); collectionManager.drop_collection("coll1"); -} \ No newline at end of file +} + +TEST_F(CollectionSortingTest, GeoPointSortingWithPrecision) { + Collection* coll1; + + std::vector fields = {field("title", field_types::STRING, false), + field("loc", field_types::GEOPOINT, false), + field("points", field_types::INT32, false),}; + + coll1 = collectionManager.get_collection("coll1").get(); + if (coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + } + + std::vector> records = { + {"Tibetan Colony", "32.24678, 77.19239"}, + {"Civil Hospital", "32.23959, 77.18763"}, + {"Johnson Lodge", "32.24751, 77.18814"}, + + {"Lion King Rock", "32.24493, 77.17038"}, + {"Jai Durga Handloom", "32.25749, 77.17583"}, + {"Panduropa", "32.26059, 77.21798"}, + + {"Police Station", "32.23743, 77.18639"}, + {"Panduropa Post", "32.26263, 77.2196"}, + }; + + for (size_t i = 0; i < records.size(); i++) { + nlohmann::json doc; + + std::vector lat_lng; + StringUtils::split(records[i][1], lat_lng, ", "); + + double lat = std::stod(lat_lng[0]); + double lng = std::stod(lat_lng[1]); + + doc["id"] = std::to_string(i); + doc["title"] = records[i][0]; + doc["loc"] = {lat, lng}; + doc["points"] = i; + + ASSERT_TRUE(coll1->add(doc.dump()).ok()); + } + + std::vector geo_sort_fields = { + sort_by("loc(32.24348, 77.1893, precision: 0.9 km)", "ASC"), + sort_by("points", "DESC"), + }; + + auto results = coll1->search("*", + {}, "loc: (32.24348, 77.1893, 20 km)", + {}, geo_sort_fields, {0}, 10, 1, FREQUENCY).get(); + + ASSERT_EQ(8, results["found"].get()); + + std::vector expected_ids = { + "6", "2", "1", "0", "3", "4", "7", "5" + }; + + for (size_t i = 0; i < expected_ids.size(); i++) { + ASSERT_STREQ(expected_ids[i].c_str(), results["hits"][i]["document"]["id"].get().c_str()); + } + + // badly formatted precision + + geo_sort_fields = { sort_by("loc(32.24348, 77.1893, precision 1 km)", "ASC") }; + auto res_op = coll1->search("*", {}, "loc: (32.24348, 77.1893, 20 km)", + {}, geo_sort_fields, {0}, 10, 1, FREQUENCY); + + ASSERT_FALSE(res_op.ok()); + ASSERT_EQ("Bad syntax for geopoint sorting field `loc`", res_op.error()); + + geo_sort_fields = { sort_by("loc(32.24348, 77.1893, precision: 1 meter)", "ASC") }; + res_op = coll1->search("*", {}, "loc: (32.24348, 77.1893, 20 km)", + {}, geo_sort_fields, {0}, 10, 1, FREQUENCY); + + ASSERT_FALSE(res_op.ok()); + ASSERT_EQ("Sort field's parameter unit must be either `km` or `mi`.", res_op.error()); + + geo_sort_fields = { sort_by("loc(32.24348, 77.1893, precision: -10 km)", "ASC") }; + res_op = coll1->search("*", {}, "loc: (32.24348, 77.1893, 20 km)", + {}, geo_sort_fields, {0}, 10, 1, FREQUENCY); + + ASSERT_FALSE(res_op.ok()); + ASSERT_EQ("Sort field's parameter must be a positive number.", res_op.error()); + + collectionManager.drop_collection("coll1"); +} From b2c12a9b2cef557e57a51aabe708a154ffff91cf Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sat, 31 Jul 2021 08:59:49 +0530 Subject: [PATCH 6/6] Fix more edge cases in highlighting. --- include/topster.h | 66 +++++++++++++++++++++++++++ src/collection.cpp | 76 +++++++++++++++++++++++-------- src/index.cpp | 16 +++++-- test/collection_locale_test.cpp | 2 +- test/collection_manager_test.cpp | 1 - test/collection_specific_test.cpp | 72 +++++++++++++++++++++++++++++ test/collection_test.cpp | 67 +++++++++++++++------------ 7 files changed, 247 insertions(+), 53 deletions(-) diff --git a/include/topster.h b/include/topster.h index 5550fbd0..19ba746c 100644 --- a/include/topster.h +++ b/include/topster.h @@ -16,6 +16,9 @@ struct KV { uint64_t distinct_key{}; int64_t scores[3]{}; // match score + 2 custom attributes + // to be used only in final aggregation + uint64_t* query_indices = nullptr; + KV(uint8_t field_id, uint16_t queryIndex, uint32_t token_bits, uint64_t key, uint64_t distinct_key, uint8_t match_score_index, const int64_t *scores): field_id(field_id), match_score_index(match_score_index), @@ -27,6 +30,69 @@ struct KV { } KV() = default; + + KV(KV& kv) = default; + + KV(KV&& kv) noexcept : field_id(kv.field_id), match_score_index(kv.match_score_index), + query_index(kv.query_index), array_index(kv.array_index), token_bits(kv.token_bits), + key(kv.key), distinct_key(kv.distinct_key) { + + scores[0] = kv.scores[0]; + scores[1] = kv.scores[1]; + scores[2] = kv.scores[2]; + + query_indices = kv.query_indices; + kv.query_indices = nullptr; + } + + KV& operator=(KV&& kv) noexcept { + if (this != &kv) { + field_id = kv.field_id; + match_score_index = kv.match_score_index; + query_index = kv.query_index; + array_index = kv.array_index; + token_bits = kv.token_bits; + key = kv.key; + distinct_key = kv.distinct_key; + + scores[0] = kv.scores[0]; + scores[1] = kv.scores[1]; + scores[2] = kv.scores[2]; + + delete[] query_indices; + query_indices = kv.query_indices; + kv.query_indices = nullptr; + } + + return *this; + } + + KV& operator=(KV& kv) noexcept { + if (this != &kv) { + field_id = kv.field_id; + match_score_index = kv.match_score_index; + query_index = kv.query_index; + array_index = kv.array_index; + token_bits = kv.token_bits; + key = kv.key; + distinct_key = kv.distinct_key; + + scores[0] = kv.scores[0]; + scores[1] = kv.scores[1]; + scores[2] = kv.scores[2]; + + delete[] query_indices; + query_indices = kv.query_indices; + kv.query_indices = nullptr; + } + + return *this; + } + + ~KV() { + delete [] query_indices; + query_indices = nullptr; + } }; /* diff --git a/src/collection.cpp b/src/collection.cpp index 44590a26..640e744b 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -1405,12 +1405,22 @@ void Collection::aggregate_topster(size_t query_index, Topster& agg_topster, Top Topster* group_topster = group_topster_entry.second; for(const auto& map_kv: group_topster->kv_map) { map_kv.second->query_index += query_index; + if(map_kv.second->query_indices != nullptr) { + for(size_t i = 0; i < map_kv.second->query_indices[0]; i++) { + map_kv.second->query_indices[i+1] += query_index; + } + } agg_topster.add(map_kv.second); } } } else { for(const auto& map_kv: index_topster->kv_map) { map_kv.second->query_index += query_index; + if(map_kv.second->query_indices != nullptr) { + for(size_t i = 0; i < map_kv.second->query_indices[0]; i++) { + map_kv.second->query_indices[i+1] += query_index; + } + } agg_topster.add(map_kv.second); } } @@ -1516,25 +1526,38 @@ void Collection::highlight_result(const field &search_field, std::vector query_suggestion; std::set query_suggestion_tokens; - for (const art_leaf *token_leaf : searched_queries[field_order_kv->query_index]) { - // Must search for the token string fresh on that field for the given document since `token_leaf` - // is from the best matched field and need not be present in other fields of a document. - Index* index = indices[field_order_kv->key % num_memory_shards]; - art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len); + size_t qindex = 0; - //LOG(INFO) << "field: " << search_field.name << ", key: " << token_leaf->key; + do { + auto searched_query = + (field_order_kv->query_indices == nullptr) ? searched_queries[field_order_kv->query_index] : + searched_queries[field_order_kv->query_indices[qindex + 1]]; + + for (art_leaf* token_leaf : searched_query) { + // Must search for the token string fresh on that field for the given document since `token_leaf` + // is from the best matched field and need not be present in other fields of a document. + Index* index = indices[field_order_kv->key % num_memory_shards]; + art_leaf* actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len); + + if(actual_leaf == nullptr) { + continue; + } - if(actual_leaf != nullptr) { - query_suggestion.push_back(actual_leaf); - std::string token(reinterpret_cast(actual_leaf->key), actual_leaf->key_len-1); - query_suggestion_tokens.insert(token); - std::vector positions; uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key); - auto doc_indices = new uint32_t[1]; - doc_indices[0] = doc_index; - leaf_to_indices.push_back(doc_indices); + if(doc_index != actual_leaf->values->ids.getLength()) { + query_suggestion.push_back(actual_leaf); + std::string token(reinterpret_cast(actual_leaf->key), actual_leaf->key_len - 1); + //LOG(INFO) << "field: " << search_field.name << ", key: " << token; + query_suggestion_tokens.insert(token); + auto doc_indices = new uint32_t[1]; + doc_indices[0] = doc_index; + leaf_to_indices.push_back(doc_indices); + } } - } + + qindex++; + } while(field_order_kv->query_indices != nullptr && qindex < field_order_kv->query_indices[0]); + if(query_suggestion.size() != q_tokens.size()) { // can happen for compound query matched across 2 fields when some tokens are dropped @@ -1543,6 +1566,8 @@ void Collection::highlight_result(const field &search_field, continue; } + query_suggestion_tokens.insert(q_token); + Index* index = indices[field_order_kv->key % num_memory_shards]; art_leaf *actual_leaf = index->get_token_leaf(search_field.name, reinterpret_cast(q_token.c_str()), @@ -1638,9 +1663,10 @@ void Collection::highlight_result(const field &search_field, highlight.matched_tokens.emplace_back(); std::vector& matched_tokens = highlight.matched_tokens.back(); + bool found_first_match = false; while(tokenizer.next(raw_token, raw_token_index, tok_start, tok_end)) { - if(token_offsets.empty()) { + if(!found_first_match) { if(snippet_start_window.size() == highlight_affix_num_tokens + 1) { snippet_start_window.pop_front(); } @@ -1648,7 +1674,10 @@ void Collection::highlight_result(const field &search_field, snippet_start_window.push_back(tok_start); } - if (token_hits.count(raw_token) != 0 || + bool token_already_found = token_hits.count(raw_token) != 0; + + // ensures that the `snippet_start_offset` is always from a matched token, and not from query suggestion + if ((found_first_match && token_already_found) || (match_offset_index < match.offsets.size() && match.offsets[match_offset_index].offset == raw_token_index)) { @@ -1661,9 +1690,15 @@ void Collection::highlight_result(const field &search_field, } while(match_offset_index < match.offsets.size() && match.offsets[match_offset_index - 1].offset == match.offsets[match_offset_index].offset); - if(token_offsets.size() == 1) { + if(!found_first_match) { snippet_start_offset = snippet_start_window.front(); } + + found_first_match = true; + + } else if(query_suggestion_tokens.find(raw_token) != query_suggestion_tokens.end()) { + token_offsets.emplace(tok_start, tok_end); + token_hits.insert(raw_token); } if(raw_token_index == last_valid_offset + highlight_affix_num_tokens) { @@ -1693,6 +1728,11 @@ void Collection::highlight_result(const field &search_field, auto offset_it = token_offsets.begin(); std::stringstream highlighted_text; + // tokens from query might occur before actual snippet start offset: we skip that + while(offset_it != token_offsets.end() && offset_it->first < snippet_start_offset) { + offset_it++; + } + for(size_t i = snippet_start_offset; i <= snippet_end_offset; i++) { if(offset_it != token_offsets.end()) { if (i == offset_it->first) { diff --git a/src/index.cpp b/src/index.cpp index a36a7adc..ee8debdf 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -995,6 +995,11 @@ void Index::search_candidates(const uint8_t & field_id, field_num_results += filtered_results_size; + /*if(filtered_results_size != 0) { + LOG(INFO) << size_t(field_id) << " - " << log_query.str() << ", filtered_results_size: " << filtered_results_size + << ", popcount: " << (__builtin_popcount(token_bits) - 1); + }*/ + delete[] filtered_result_ids; delete[] result_ids; } else { @@ -1695,13 +1700,18 @@ void Index::search(const std::vector& field_query_tokens, auto& kvs = seq_id_kvs.second; // each `kv` can be from a different field std::sort(kvs.begin(), kvs.end(), Topster::is_greater); + kvs[0]->query_indices = new uint64_t[kvs.size() + 1]; + kvs[0]->query_indices[0] = kvs.size(); - // LOG(INFO) << "DOC ID: " << seq_id << ", score: " << kvs[0]->scores[kvs[0]->match_score_index]; + //LOG(INFO) << "DOC ID: " << seq_id << ", score: " << kvs[0]->scores[kvs[0]->match_score_index]; // to calculate existing aggregate scores across best matching fields spp::sparse_hash_map existing_field_kvs; - for(const auto kv: kvs) { - existing_field_kvs.emplace(kv->field_id, kv); + for(size_t kv_i = 0; kv_i < kvs.size(); kv_i++) { + existing_field_kvs.emplace(kvs[kv_i]->field_id, kvs[kv_i]); + kvs[0]->query_indices[kv_i+1] = kvs[kv_i]->query_index; + /*LOG(INFO) << "kv_i: " << kv_i << ", kvs[kv_i]->query_index: " << kvs[kv_i]->query_index << ", " + << "searched_query: " << searched_queries[kvs[kv_i]->query_index][0];*/ } uint32_t token_bits = (uint32_t(1) << 31); // top most bit set to guarantee atleast 1 bit set diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp index c6f5b63a..75fe60b1 100644 --- a/test/collection_locale_test.cpp +++ b/test/collection_locale_test.cpp @@ -263,7 +263,7 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) { ASSERT_EQ("ติดกับดักรายได้ปานกลาง", results["hits"][0]["highlights"][0]["snippet"].get()); - ASSERT_EQ("ข้อมูลรายคนหรือรายบริษัทในการเชื่อมโยงส่วนได้ส่วนเสีย", + ASSERT_EQ("ข้อมูลรายคนหรือรายบริษัทในการเชื่อมโยงส่วนได้ส่วนเสีย", results["hits"][1]["highlights"][0]["snippet"].get()); } diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index 8b3f507a..ac87bfc4 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -232,7 +232,6 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) { std::vector facets; nlohmann::json results = collection1->search("thomas", search_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); - LOG(INFO) << results; ASSERT_EQ(4, results["hits"].size()); std::unordered_map schema = collection1->get_schema(); diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp index c1855479..bf74f410 100644 --- a/test/collection_specific_test.cpp +++ b/test/collection_specific_test.cpp @@ -648,3 +648,75 @@ TEST_F(CollectionSpecificTest, HandleBadCharactersInStringGracefully) { collectionManager.drop_collection("coll1"); } + +TEST_F(CollectionSpecificTest, HighlightSecondaryFieldWithPrefixMatch) { + std::vector fields = {field("title", field_types::STRING, false), + field("description", field_types::STRING, false), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["title"] = "Functions and Equations"; + doc1["description"] = "Use a function to solve an equation."; + doc1["points"] = 100; + + nlohmann::json doc2; + doc2["id"] = "1"; + doc2["title"] = "Function of effort"; + doc2["description"] = "Learn all about it."; + doc2["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + ASSERT_TRUE(coll1->add(doc2.dump()).ok()); + + auto results = coll1->search("function", {"title", "description"}, "", {}, {}, {0}, 10, + 1, FREQUENCY, {true, true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, + "", "", {1, 1}).get(); + + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + + ASSERT_EQ(2, results["hits"][0]["highlights"].size()); + + ASSERT_EQ("Functions and Equations", + results["hits"][0]["highlights"][0]["snippet"].get()); + + ASSERT_EQ("Use a function to solve an equation.", + results["hits"][0]["highlights"][1]["snippet"].get()); + + collectionManager.drop_collection("coll1"); +} + +TEST_F(CollectionSpecificTest, HighlightWithDropTokens) { + std::vector fields = {field("description", field_types::STRING, false), + field("points", field_types::INT32, false),}; + + Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + + nlohmann::json doc1; + doc1["id"] = "0"; + doc1["description"] = "HPE Aruba AP-575 802.11ax Wireless Access Point - TAA Compliant - 2.40 GHz, " + "5 GHz - MIMO Technology - 1 x Network (RJ-45) - Gigabit Ethernet - Bluetooth 5"; + doc1["points"] = 100; + + ASSERT_TRUE(coll1->add(doc1.dump()).ok()); + + auto results = coll1->search("HPE Aruba AP-575 Technology Gigabit Bluetooth 5", {"description"}, "", {}, {}, {0}, 10, + 1, FREQUENCY, {true}, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "description", 40, {}, {}, {}, 0, + "", "").get(); + + ASSERT_EQ(1, results["hits"][0]["highlights"].size()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); + + ASSERT_EQ("HPE Aruba AP-575 802.11ax Wireless Access Point - " + "TAA Compliant - 2.40 GHz, 5 GHz - MIMO Technology - 1 x Network (RJ-45) - " + "Gigabit Ethernet - Bluetooth 5", + results["hits"][0]["highlights"][0]["snippet"].get()); + + collectionManager.drop_collection("coll1"); +} diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 0e9221a2..8f573e6e 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -746,23 +746,23 @@ TEST_F(CollectionTest, ArrayStringFieldHighlight) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - ASSERT_EQ(4, results["hits"][0]["highlights"][0].size()); - ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get().c_str(), "tags"); - ASSERT_EQ(2, results["hits"][0]["highlights"][0]["snippets"].size()); - ASSERT_STREQ("truth", results["hits"][0]["highlights"][0]["snippets"][0].get().c_str()); - ASSERT_STREQ("plain truth", results["hits"][0]["highlights"][0]["snippets"][1].get().c_str()); - ASSERT_EQ(2, results["hits"][0]["highlights"][0]["matched_tokens"].size()); - ASSERT_STREQ("truth", results["hits"][0]["highlights"][0]["matched_tokens"][0][0].get().c_str()); - ASSERT_STREQ("truth", results["hits"][0]["highlights"][0]["matched_tokens"][1][0].get().c_str()); - ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"].size()); - ASSERT_EQ(1, results["hits"][0]["highlights"][0]["indices"][0]); - ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"][1]); + ASSERT_EQ(3, results["hits"][0]["highlights"][0].size()); + ASSERT_STREQ("title", results["hits"][0]["highlights"][0]["field"].get().c_str()); + ASSERT_STREQ("Plain Truth", results["hits"][0]["highlights"][0]["snippet"].get().c_str()); + ASSERT_EQ(1, results["hits"][0]["highlights"][0]["matched_tokens"].size()); + ASSERT_STREQ("Truth", results["hits"][0]["highlights"][0]["matched_tokens"][0].get().c_str()); - ASSERT_EQ(3, results["hits"][0]["highlights"][1].size()); - ASSERT_STREQ("title", results["hits"][0]["highlights"][1]["field"].get().c_str()); - ASSERT_STREQ("Plain Truth", results["hits"][0]["highlights"][1]["snippet"].get().c_str()); - ASSERT_EQ(1, results["hits"][0]["highlights"][1]["matched_tokens"].size()); - ASSERT_STREQ("Truth", results["hits"][0]["highlights"][1]["matched_tokens"][0].get().c_str()); + ASSERT_EQ(4, results["hits"][0]["highlights"][1].size()); + ASSERT_STREQ(results["hits"][0]["highlights"][1]["field"].get().c_str(), "tags"); + ASSERT_EQ(2, results["hits"][0]["highlights"][1]["snippets"].size()); + ASSERT_STREQ("truth", results["hits"][0]["highlights"][1]["snippets"][0].get().c_str()); + ASSERT_STREQ("plain truth", results["hits"][0]["highlights"][1]["snippets"][1].get().c_str()); + ASSERT_EQ(2, results["hits"][0]["highlights"][1]["matched_tokens"].size()); + ASSERT_STREQ("truth", results["hits"][0]["highlights"][1]["matched_tokens"][0][0].get().c_str()); + ASSERT_STREQ("truth", results["hits"][0]["highlights"][1]["matched_tokens"][1][0].get().c_str()); + ASSERT_EQ(2, results["hits"][0]["highlights"][1]["indices"].size()); + ASSERT_EQ(1, results["hits"][0]["highlights"][1]["indices"][0]); + ASSERT_EQ(2, results["hits"][0]["highlights"][1]["indices"][1]); ASSERT_EQ(3, results["hits"][1]["highlights"][0].size()); ASSERT_STREQ("title", results["hits"][1]["highlights"][0]["field"].get().c_str()); @@ -2456,15 +2456,15 @@ TEST_F(CollectionTest, SearchHighlightFieldFully) { spp::sparse_hash_set(), 10, "", 5, 5, "title, tags").get(); ASSERT_EQ(2, res["hits"][0]["highlights"].size()); - ASSERT_EQ("LAZY", res["hits"][0]["highlights"][0]["values"][0].get()); ASSERT_EQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.", - res["hits"][0]["highlights"][1]["value"].get()); + res["hits"][0]["highlights"][0]["value"].get()); + ASSERT_EQ(1, res["hits"][0]["highlights"][0]["matched_tokens"].size()); + ASSERT_STREQ("lazy", res["hits"][0]["highlights"][0]["matched_tokens"][0].get().c_str()); - ASSERT_EQ(1, res["hits"][0]["highlights"][1]["matched_tokens"].size()); - ASSERT_STREQ("lazy", res["hits"][0]["highlights"][1]["matched_tokens"][0].get().c_str()); - - ASSERT_EQ(1, res["hits"][0]["highlights"][0]["values"][0].size()); - ASSERT_STREQ("LAZY", res["hits"][0]["highlights"][0]["values"][0].get().c_str()); + ASSERT_EQ(1, res["hits"][0]["highlights"][1]["values"].size()); + ASSERT_EQ("LAZY", res["hits"][0]["highlights"][1]["values"][0].get()); + ASSERT_EQ(1, res["hits"][0]["highlights"][1]["snippets"].size()); + ASSERT_EQ("LAZY", res["hits"][0]["highlights"][1]["snippets"][0].get()); // excluded fields should not be returned in highlights section spp::sparse_hash_set excluded_fields = {"tags"}; @@ -3146,13 +3146,17 @@ TEST_F(CollectionTest, MultiFieldRelevance5) { ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("2", results["hits"][2]["document"]["id"].get().c_str()); - ASSERT_EQ(1, results["hits"][0]["highlights"].size()); - ASSERT_EQ("country", results["hits"][0]["highlights"][0]["field"].get()); - ASSERT_EQ("Canada", results["hits"][0]["highlights"][0]["snippet"].get()); + ASSERT_EQ(2, results["hits"][0]["highlights"].size()); + ASSERT_EQ("field_a", results["hits"][0]["highlights"][0]["field"].get()); + ASSERT_EQ("Canadia", results["hits"][0]["highlights"][0]["snippet"].get()); + ASSERT_EQ("country", results["hits"][0]["highlights"][1]["field"].get()); + ASSERT_EQ("Canada", results["hits"][0]["highlights"][1]["snippet"].get()); - ASSERT_EQ(1, results["hits"][1]["highlights"].size()); - ASSERT_EQ("company_name", results["hits"][1]["highlights"][0]["field"].get()); - ASSERT_EQ("Canaida Corp", results["hits"][1]["highlights"][0]["snippet"].get()); + ASSERT_EQ(2, results["hits"][1]["highlights"].size()); + ASSERT_EQ("field_a", results["hits"][1]["highlights"][0]["field"].get()); + ASSERT_EQ("Canadoo", results["hits"][1]["highlights"][0]["snippet"].get()); + ASSERT_EQ("company_name", results["hits"][1]["highlights"][1]["field"].get()); + ASSERT_EQ("Canaida Corp", results["hits"][1]["highlights"][1]["snippet"].get()); ASSERT_EQ(1, results["hits"][2]["highlights"].size()); ASSERT_EQ("field_a", results["hits"][2]["highlights"][0]["field"].get()); @@ -3317,7 +3321,7 @@ TEST_F(CollectionTest, MultiFieldHighlighting) { ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); - ASSERT_EQ(2, results["hits"][0]["highlights"].size()); + ASSERT_EQ(3, results["hits"][0]["highlights"].size()); ASSERT_EQ("name", results["hits"][0]["highlights"][0]["field"].get()); ASSERT_EQ("Best Wireless Vehicle Charger", results["hits"][0]["highlights"][0]["snippet"].get()); @@ -3326,6 +3330,9 @@ TEST_F(CollectionTest, MultiFieldHighlighting) { ASSERT_EQ("Easily replenish your cell phone with this wireless charger.", results["hits"][0]["highlights"][1]["snippet"].get()); + ASSERT_EQ("categories", results["hits"][0]["highlights"][2]["field"].get()); + ASSERT_EQ("Car Chargers", results["hits"][0]["highlights"][2]["snippets"][0].get()); + results = coll1->search("John With Denver", {"description"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(),