Handle repeated facet values in arrays during searching.
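
Repeated values within a document's array field used to be de-duplicated at
indexing time via a facet_hash_set in tokenize_string_array_with_facets. That
set is removed, so one facet hash is now recorded per array element, and
duplicates are instead skipped at query time in do_facets, which counts each
distinct facet hash at most once per document.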

Kishore Nallan 2023-08-03 15:23:30 +05:30
parent 876e998cfe
commit 956d596e43
3 changed files with 25 additions and 13 deletions

View File

@@ -1727,13 +1727,6 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
     if(!facet_query.query.empty()) {
-        // identify facet hash tokens
-        for(const auto& the_facet: facets) {
-            if(the_facet.field_name == facet_query.field_name) {
-                //the_facet.hash_tokens
-                break;
-            }
-        }
-
         auto fq_field = search_schema.at(facet_query.field_name);
         bool is_cyrillic = Tokenizer::is_cyrillic(fq_field.locale);
         bool normalise = is_cyrillic ? false : true;
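
The block removed above was dead code: the loop only located the matching
facet and broke out, with its one meaningful statement commented out, so
deleting it does not change the behaviour of Collection::search.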

View File

@@ -1056,8 +1056,6 @@ void Index::tokenize_string_array_with_facets(const std::vector<std::string>& st
                                               std::unordered_map<std::string, std::vector<uint32_t>>& token_to_offsets,
                                               std::vector<uint64_t>& facet_hashes) {
-    std::set<uint64_t> facet_hash_set; // required to deal with repeating phrases
-
     for(size_t array_index = 0; array_index < strings.size(); array_index++) {
         const std::string& str = strings[array_index];
         std::set<std::string> token_set; // required to deal with repeating tokens
@@ -1091,9 +1089,8 @@ void Index::tokenize_string_array_with_facets(const std::vector<std::string>& st
             }
         }

-        if(is_facet && facet_hash_set.count(facet_hash) == 0) {
+        if(is_facet) {
             facet_hashes.push_back(facet_hash);
-            facet_hash_set.insert(facet_hash);
         }

         if(token_set.empty()) {
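
With the per-document facet_hash_set gone, indexing now records one hash per
array element, duplicates included. A minimal standalone sketch of that
behaviour (std::hash stands in for Typesense's internal facet hashing, which
is an assumption here, as is the hash_facet_values name):

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// One hash per array element: ["Foo", "Foo", "Bazinga"] now yields three
// entries in facet_hashes (two of them equal) instead of two.
std::vector<uint64_t> hash_facet_values(const std::vector<std::string>& strings) {
    std::vector<uint64_t> facet_hashes;
    for(const std::string& str : strings) {
        uint64_t facet_hash = std::hash<std::string>{}(str); // stand-in hash
        facet_hashes.push_back(facet_hash); // no facet_hash_set guard any more
    }
    return facet_hashes;
}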
@@ -1226,11 +1223,19 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
                 RETURN_CIRCUIT_BREAKER
             }

+            std::set<uint64_t> unique_facet_hashes;
+
             for(size_t j = 0; j < facet_hash_count; j++) {
                 if(facet_field.is_array()) {
                     fhash = facet_map_it->second.hashes[j];
                 }

+                if(unique_facet_hashes.count(fhash) == 0) {
+                    unique_facet_hashes.insert(fhash);
+                } else {
+                    continue;
+                }
+
                 if(should_compute_stats) {
                     compute_facet_stats(a_facet, fhash, facet_field.type);
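
The counting side of the fix, as a self-contained sketch: each document's
hashes are walked with a fresh unique_facet_hashes set, so a value repeated
within one document is counted once, while the same value across documents
still accumulates. Everything here except the unique_facet_hashes guard is
illustrative scaffolding, not code from the repository:

#include <cstdint>
#include <cstdio>
#include <map>
#include <set>
#include <vector>

// Count each distinct facet hash at most once per document.
std::map<uint64_t, size_t> count_facets(const std::vector<std::vector<uint64_t>>& docs) {
    std::map<uint64_t, size_t> counts;
    for(const auto& doc_hashes : docs) {
        std::set<uint64_t> unique_facet_hashes; // reset for every document
        for(uint64_t fhash : doc_hashes) {
            if(!unique_facet_hashes.insert(fhash).second) {
                continue; // repeated value within the same document
            }
            counts[fhash]++;
        }
    }
    return counts;
}

int main() {
    // doc1 = {1}, doc2 = {1, 1, 2}  ->  hash 1 counted twice, hash 2 once,
    // mirroring Foo:2 and Bazinga:1 in the test below.
    for(const auto& [fhash, count] : count_facets({{1}, {1, 1, 2}})) {
        std::printf("%llu -> %zu\n", (unsigned long long) fhash, count);
    }
}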

View File

@@ -1111,7 +1111,7 @@ TEST_F(CollectionFacetingTest, FacetByArrayField) {
     })"_json;

     auto doc2 = R"({
-        "data": ["Foo", "Foo"]
+        "data": ["Foo", "Foo", "Bazinga"]
     })"_json;

     ASSERT_TRUE(coll1->add(doc1.dump(), CREATE).ok());
@@ -1124,9 +1124,23 @@ TEST_F(CollectionFacetingTest, FacetByArrayField) {
     ASSERT_EQ(2, results["found"].get<size_t>());
     ASSERT_EQ(1, results["facet_counts"].size());
     ASSERT_EQ("data", results["facet_counts"][0]["field_name"]);
-    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
+    ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());

     ASSERT_EQ(2, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
     ASSERT_EQ("Foo", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());

+    ASSERT_EQ(1, results["facet_counts"][0]["counts"][1]["count"].get<size_t>());
+    ASSERT_EQ("Bazinga", results["facet_counts"][0]["counts"][1]["value"].get<std::string>());
+
+    results = coll1->search("*", {}, "", {"data"}, {}, {0}, 10, 1,
+                            token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "data:baz", 30, 4).get();
+
+    ASSERT_EQ(2, results["found"].get<size_t>());
+    ASSERT_EQ(1, results["facet_counts"].size());
+    ASSERT_EQ("data", results["facet_counts"][0]["field_name"]);
+    ASSERT_EQ(1, results["facet_counts"][0]["counts"].size());
+
+    ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
+    ASSERT_EQ("Bazinga", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());
 }

 TEST_F(CollectionFacetingTest, FacetParseTest){
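
Taken together, the FacetByArrayField assertions above check both sides of
the fix: the wildcard search expects two buckets (Foo counted twice, i.e.
once per document despite the repeated value in doc2, and Bazinga counted
once), while the facet query "data:baz" drills down to the Bazinga bucket
alone. The count of 2 for Foo implies that doc1, whose body is not shown in
this hunk, also contains "Foo".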