Facet query partial highlighting + tests.

2025-05-17 12:12:35 +08:00 · 2020-02-22 18:39:52 +05:30 · 2020-02-22 18:39:52 +05:30 · 1369a473e4
commit 1369a473e4
parent 38bac77c43
5 changed files with 206 additions and 35 deletions
--- a/include/field.h
+++ b/include/field.h
@ -120,11 +120,19 @@ struct sort_by {
    }
 };

+struct token_pos_cost_t {
+    size_t pos;
+    uint32_t cost;
+};
+
 struct facet_count_t {
    uint32_t count;
-    uint32_t doc_id;    // used to fetch the actual document and the value from store
+
+    // used to fetch the actual document and value for representation
+    uint32_t doc_id;
    uint32_t array_pos;
-    spp::sparse_hash_map<uint32_t, uint32_t> token_query_pos;
+
+    spp::sparse_hash_map<uint32_t, token_pos_cost_t> query_token_pos;
 };

 struct facet {
--- a/include/index.h
+++ b/include/index.h
@ -140,7 +140,7 @@ private:

    Option<uint32_t> do_filtering(uint32_t** filter_ids_out, const std::vector<filter> & filters);

-    void do_facets(std::vector<facet> & facets, const facet_query_t & facet_query,
+    void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                   const uint32_t* result_ids, size_t results_size);

    void drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids);
@ -203,7 +203,7 @@ public:

    void search(Option<uint32_t> & outcome, std::string query, const std::vector<std::string> & search_fields,
                          const std::vector<filter> & filters, std::vector<facet> & facets,
-                          const facet_query_t & facet_query,
+                          facet_query_t & facet_query,
                          const std::vector<uint32_t> & included_ids, const std::vector<uint32_t> & excluded_ids,
                          const std::vector<sort_by> & sort_fields_std, const int num_typos,
                          const size_t max_hits, const size_t per_page, const size_t page, const token_ordering token_order,
--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -460,12 +460,24 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::

    if(!simple_facet_query.empty() && simple_facet_query.find(':') == std::string::npos) {
        std::string error = "Facet query must be in the `facet_field: value` format.";
-        return Option<nlohmann::json>(404, error);
+        return Option<nlohmann::json>(400, error);
    }

    StringUtils::split(simple_facet_query, facet_query_vec, ":");
    if(!facet_query_vec.empty()) {
+        if(facet_fields.empty()) {
+            std::string error = "The `facet_query` parameter is supplied without a `facet_by` parameter.";
+            return Option<nlohmann::json>(400, error);
+        }
+
+        // facet query field must be part of facet fields requested
        facet_query = { StringUtils::trim(facet_query_vec[0]), StringUtils::trim(facet_query_vec[1]) };
+        if(std::find(facet_fields.begin(), facet_fields.end(), facet_query.field_name) == facet_fields.end()) {
+            std::string error = "Facet query refers to a facet field `" + facet_query.field_name + "` " +
+                                "that is not part of `facet_by` parameter.";
+            return Option<nlohmann::json>(400, error);
+        }
+
        if(facet_schema.count(facet_query.field_name) == 0) {
            std::string error = "Could not find a facet field named `" + facet_query.field_name + "` in the schema.";
            return Option<nlohmann::json>(404, error);
@ -582,7 +594,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
                acc_facet.result_map[facet_kv.first].count = count;
                acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
                acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
-                acc_facet.result_map[facet_kv.first].token_query_pos = facet_kv.second.token_query_pos;
+                acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos;
            }
        }

@ -743,13 +755,28 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
            StringUtils::split(value, tokens, " ");
            std::stringstream highlightedss;

+            /*std::cout << "array_pos: " << facet_count.array_pos << std::endl;
+
+            for(auto xx: facet_count.query_token_pos) {
+                std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl;
+            }
+
+            std::cout << "doc id: " << facet_count.doc_id << " i: " << i << std::endl;
+            std::cout << "doc: " << document << std::endl;*/
+
+            // invert query_pos -> token_pos
+            spp::sparse_hash_map<uint32_t, uint32_t> token_query_pos;
+            for(auto qtoken_pos: facet_count.query_token_pos) {
+                token_query_pos.emplace(qtoken_pos.second.pos, qtoken_pos.first);
+            }
+
            for(size_t i = 0; i < tokens.size(); i++) {
                if(i != 0) {
                    highlightedss << " ";
                }

-                if(facet_count.token_query_pos.count(i) != 0) {
-                    size_t highlight_len = facet_query_tokens[facet_count.token_query_pos[i]].size();
+                if(token_query_pos.count(i) != 0) {
+                    size_t highlight_len = facet_query_tokens[token_query_pos[i]].size();
                    const std::string & unmarked = tokens[i].substr(highlight_len, std::string::npos);
                    highlightedss << "<mark>" + tokens[i].substr(0, highlight_len) + "</mark>" + unmarked;
                } else {
--- a/src/index.cpp
+++ b/src/index.cpp
@ -530,8 +530,9 @@ void Index::index_float_array_field(const std::vector<float> & values, const uin
    }
 }

-void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_query,
+void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                      const uint32_t* result_ids, size_t results_size) {
+
    std::map<std::string, size_t> facet_to_index;

    size_t i_facet = 0;
@ -542,7 +543,7 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q

    // assumed that facet fields have already been validated upstream
    for(auto & a_facet: facets) {
-        spp::sparse_hash_map<int64_t, uint32_t> fhash_qtoken_pos;  // facet hash => token position in the query
+        spp::sparse_hash_map<int64_t, token_pos_cost_t> fhash_qtoken_pos;  // facet hash => token position in the query
        bool use_facet_query = false;

        if(a_facet.field_name == facet_query.field_name && !facet_query.query.empty()) {
@ -552,8 +553,8 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q

            for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
                auto & q = query_tokens[qtoken_index];
-                StringUtils::trim(q);
-                int bounded_cost = get_bounded_typo_cost(2, q.size());
+                string_utils.unicode_normalize(q);
+                int bounded_cost = (q.size() < 3) ? 0 : 1;
                std::vector<art_leaf*> leaves;
                art_fuzzy_search(search_index.at(a_facet.field_name), (const unsigned char *) q.c_str(),
                                 q.size(),0, bounded_cost, 10000,
@ -562,12 +563,15 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
                    const auto & leaf = leaves[i];
                    // calculate hash without terminating null char
                    uint64_t hash = StringUtils::hash_wy(leaf->key, leaf->key_len-1);
-                    fhash_qtoken_pos.emplace(hash, qtoken_index);
-                    printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash);
+                    token_pos_cost_t token_pos_cost = {qtoken_index, 0};
+                    fhash_qtoken_pos.emplace(hash, token_pos_cost);
+                    //printf("%.*s - %llu\n", leaf->key_len, leaf->key, hash);
                }
            }
        }

+        size_t facet_id = facet_to_index[a_facet.field_name];
+
        for(size_t i = 0; i < results_size; i++) {
            uint32_t doc_seq_id = result_ids[i];

@ -575,11 +579,12 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
                // FORMAT OF VALUES
                // String: h1 h2 h3
                // String array: h1 h2 h3 0 h1 0 h1 h2 0
-                const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
-                int array_pos = -1;
+                const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_id];
+
+                int array_pos = 0;
                int fvalue_found = 0;
                std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
-                spp::sparse_hash_map<uint32_t, uint32_t> token_query_positions;
+                spp::sparse_hash_map<uint32_t, token_pos_cost_t> query_token_positions;
                size_t field_token_index = -1;

                for(size_t j = 0; j < fhashes.size(); j++) {
@ -588,30 +593,36 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
                        fvaluestream << ftoken_hash;
                        field_token_index++;

-                        if(use_facet_query && fhash_qtoken_pos.find(ftoken_hash) == fhash_qtoken_pos.end()) {
-                            // this particular facet value is not found in facet filter, so ignore
-                            continue;
-                        }
+                        if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) {
+                            // not using facet query or this particular facet value is found in facet filter
+                            fvalue_found |= 1;  // bitwise to ensure only one count for a multi-token facet value

-                        fvalue_found |= 1;  // bitwise to ensure only one count for a multi-token facet value
+                            if(use_facet_query) {
+                                // map token index to query index (used for highlighting later on)
+                                token_pos_cost_t qtoken_pos = fhash_qtoken_pos[ftoken_hash];

-                        if(use_facet_query) {
-                            // map token index to query index (used for highlighting later on)
-                            token_query_positions.emplace(field_token_index, fhash_qtoken_pos[ftoken_hash]);
+                                // if the query token has already matched another token in the string
+                                // we will replace the position only if the cost is lower
+                                if(query_token_positions.find(qtoken_pos.pos) == query_token_positions.end() ||
+                                   query_token_positions[qtoken_pos.pos].cost >= qtoken_pos.cost ) {
+                                    token_pos_cost_t ftoken_pos_cost = {field_token_index, qtoken_pos.cost};
+                                    query_token_positions.emplace(qtoken_pos.pos, ftoken_pos_cost);
+                                }
+                            }
                        }
                    }

+                    //std::cout << "j: " << j << std::endl;
+
                    // 0 indicates separator, while the second condition checks for non-array string
                    if(fhashes[j] == 0 || (fhashes.back() != 0 && j == fhashes.size() - 1)) {
                        if(!use_facet_query || fvalue_found != 0) {
-                            array_pos++;
-
                            const std::string & fvalue_str = fvaluestream.str();
                            uint64_t fhash = StringUtils::hash_wy(fvalue_str.c_str(), fvalue_str.size());

                            if(a_facet.result_map.count(fhash) == 0) {
-                                a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, {},
-                                                                          spp::sparse_hash_map<uint32_t, uint32_t>()};
+                                a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0,
+                                                                          spp::sparse_hash_map<uint32_t, token_pos_cost_t>()};
                            }

                            a_facet.result_map[fhash].count += 1;
@ -619,14 +630,21 @@ void Index::do_facets(std::vector<facet> & facets, const facet_query_t & facet_q
                            a_facet.result_map[fhash].array_pos = array_pos;

                            if(use_facet_query) {
-                                a_facet.result_map[fhash].token_query_pos = token_query_positions;
+                                a_facet.result_map[fhash].query_token_pos = query_token_positions;
+
+                                /*if(j == 11) {
+                                    for(auto xx: query_token_positions) {
+                                        std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl;
+                                    }
+                                }*/
                            }
                        }

+                        array_pos++;
                        fvalue_found = 0;
                        std::stringstream().swap(fvaluestream);
-                        spp::sparse_hash_map<uint32_t, uint32_t>().swap(token_query_positions);
-                        field_token_index = 0;
+                        spp::sparse_hash_map<uint32_t, token_pos_cost_t>().swap(query_token_positions);
+                        field_token_index = -1;
                    }
                }
            }
@ -991,7 +1009,7 @@ void Index::search(Option<uint32_t> & outcome,
                   std::string query,
                   const std::vector<std::string> & search_fields,
                   const std::vector<filter> & filters,
-                   std::vector<facet> & facets, const facet_query_t & facet_query,
+                   std::vector<facet> & facets, facet_query_t & facet_query,
                   const std::vector<uint32_t> & included_ids,
                   const std::vector<uint32_t> & excluded_ids,
                   const std::vector<sort_by> & sort_fields_std, const int num_typos, const size_t max_hits,
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@ -1960,7 +1960,7 @@ TEST_F(CollectionTest, FacetCounts) {
    ASSERT_STREQ("Facet query must be in the `facet_field: value` format.", res_op.error().c_str());

    // unknown facet field
-    res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
+    res_op = coll_array_fields->search("*", query_fields, "", {"foobar"}, sort_fields, 0, 10, 1, FREQUENCY,
                                       false, Index::DROP_TOKENS_THRESHOLD,
                                       spp::sparse_hash_set<std::string>(),
                                       spp::sparse_hash_set<std::string>(), 10, 500, "foobar: baz");
@ -1968,9 +1968,124 @@ TEST_F(CollectionTest, FacetCounts) {
    ASSERT_FALSE(res_op.ok());
    ASSERT_STREQ("Could not find a facet field named `foobar` in the schema.", res_op.error().c_str());

+    // when facet query is given but no facet fields are specified, must return an error message
+    res_op = coll_array_fields->search("*", query_fields, "", {}, sort_fields, 0, 10, 1, FREQUENCY,
+                                       false, Index::DROP_TOKENS_THRESHOLD,
+                                       spp::sparse_hash_set<std::string>(),
+                                       spp::sparse_hash_set<std::string>(), 10, 500, "tags: foo");
+
+    ASSERT_FALSE(res_op.ok());
+    ASSERT_STREQ("The `facet_query` parameter is supplied without a `facet_by` parameter.", res_op.error().c_str());
+
+    // given facet query field must be part of facet fields requested
+    res_op = coll_array_fields->search("*", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY,
+                                       false, Index::DROP_TOKENS_THRESHOLD,
+                                       spp::sparse_hash_set<std::string>(),
+                                       spp::sparse_hash_set<std::string>(), 10, 500, "name_facet: jeremy");
+
+    ASSERT_FALSE(res_op.ok());
+    ASSERT_STREQ("Facet query refers to a facet field `name_facet` that is not part of `facet_by` parameter.", res_op.error().c_str());
+
    collectionManager.drop_collection("coll_array_fields");
 }

+TEST_F(CollectionTest, FacetCountsHighlighting) {
+    Collection *coll1;
+
+    std::vector<field> fields = {field("categories", field_types::STRING_ARRAY, true),
+                                 field("points", field_types::INT32, false)};
+
+    std::vector<sort_by> sort_fields = { sort_by("points", "DESC") };
+
+    coll1 = collectionManager.get_collection("coll1");
+    if(coll1 == nullptr) {
+        coll1 = collectionManager.create_collection("coll1", fields, "points").get();
+    }
+
+    nlohmann::json doc;
+    doc["id"] = "100";
+    doc["categories"] = {"Cell Phones", "Cell Phone Accessories", "Cell Phone Cases & Clips"};
+    doc["points"] = 25;
+
+    coll1->add(doc.dump());
+
+    std::vector<std::string> facets = {"categories"};
+
+    nlohmann::json results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1,
+                                           token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
+                                           spp::sparse_hash_set<std::string>(), 10, 500, "categories:cell").get();
+
+    ASSERT_EQ(1, results["facet_counts"].size());
+    ASSERT_EQ(3, results["facet_counts"][0]["counts"].size());
+
+    ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
+    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
+    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("<mark>Cell</mark> Phones", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
+
+    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);
+    ASSERT_STREQ("Cell Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("<mark>Cell</mark> Phone Cases & Clips", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
+
+    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][2]["count"]);
+    ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("<mark>Cell</mark> Phone Accessories", results["facet_counts"][0]["counts"][2]["highlighted"].get<std::string>().c_str());
+
+    coll1->remove("100");
+
+    doc["categories"] = {"Cell Phones", "Unlocked Cell Phones", "All Unlocked Cell Phones" };
+    coll1->add(doc.dump());
+
+    results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1,
+                            token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, 500, "categories:cell").get();
+
+    ASSERT_EQ(1, results["facet_counts"].size());
+    ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
+    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
+    ASSERT_STREQ("Cell Phones", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("<mark>Cell</mark> Phones", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
+
+    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][1]["count"]);
+    ASSERT_STREQ("All Unlocked Cell Phones", results["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("All Unlocked <mark>Cell</mark> Phones", results["facet_counts"][0]["counts"][1]["highlighted"].get<std::string>().c_str());
+
+    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
+    ASSERT_STREQ("Unlocked Cell Phones", results["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("Unlocked <mark>Cell</mark> Phones", results["facet_counts"][0]["counts"][2]["highlighted"].get<std::string>().c_str());
+
+    coll1->remove("100");
+    doc["categories"] = {"Cell Phones", "Cell Phone Accessories", "Cell Phone Cases & Clips"};
+    coll1->add(doc.dump());
+
+    results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1,
+                            token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, 500, "categories:acces").get();
+
+
+    ASSERT_EQ(1, results["facet_counts"].size());
+    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
+    ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
+    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
+    ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phone <mark>Acces</mark>sories", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
+
+    // ensure that query is NOT case sensitive
+
+    results = coll1->search("phone", {"categories"}, "", facets, sort_fields, 0, 10, 1,
+                            token_ordering::FREQUENCY, true, 10, spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, 500, "categories:ACCES").get();
+
+    ASSERT_EQ(1, results["facet_counts"].size());
+    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
+    ASSERT_STREQ("categories", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
+    ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
+    ASSERT_STREQ("Cell Phone Accessories", results["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+    ASSERT_STREQ("Cell Phone <mark>Acces</mark>sories", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
+
+    collectionManager.drop_collection("coll1");
+}
+
 TEST_F(CollectionTest, SortingOrder) {
    Collection *coll_mul_fields;

@ -2352,13 +2467,16 @@ TEST_F(CollectionTest, DeletionOfADocument) {

    results = collection_for_del->search("cryogenic", query_fields, "", {}, sort_fields, 0, 5, 1, FREQUENCY, false).get();
    ASSERT_EQ(0, results["hits"].size());
+    ASSERT_EQ(0, results["found"]);

    results = collection_for_del->search("archives", query_fields, "", {}, sort_fields, 0, 5, 1, FREQUENCY, false).get();
    ASSERT_EQ(1, results["hits"].size());
+    ASSERT_EQ(1, results["found"]);

    collection_for_del->remove("foo");   // custom id record
    results = collection_for_del->search("martian", query_fields, "", {}, sort_fields, 0, 5, 1, FREQUENCY, false).get();
    ASSERT_EQ(0, results["hits"].size());
+    ASSERT_EQ(0, results["found"]);

    // delete all records
    for(int id = 0; id <= 25; id++) {
@ -2533,4 +2651,4 @@ TEST_F(CollectionTest, PruneFieldsFromDocument) {
    document = get_prune_doc();
    Collection::prune_document(document, spp::sparse_hash_set<std::string>(), {"notfound"});
    ASSERT_EQ(4, document.size());
-}
+}