Use symbols_to_index and token_separators for facet query parsing.

2025-05-18 12:42:50 +08:00 · 2023-11-07 21:00:35 +05:30 · 2023-11-07 21:00:35 +05:30 · ec7c54d31c
commit ec7c54d31c
parent 8be74519a7
4 changed files with 40 additions and 4 deletions
--- a/include/facet_index.h
+++ b/include/facet_index.h
@ -135,6 +135,7 @@ public:

    size_t intersect(facet& a_facet, const field& facet_field,
                     bool has_facet_query, const std::vector<std::vector<std::string>>& fvalue_searched_tokens,
+                     const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators, // given to the Tokenizer that splits string facet values
                     const uint32_t* result_ids, size_t result_id_len,
                     size_t max_facet_count, std::map<std::string, docid_count_t>& found,
                     bool is_wildcard_no_filter_query, const std::string& sort_order = "");
--- a/src/facet_index.cpp
+++ b/src/facet_index.cpp
@ -260,6 +260,7 @@ size_t facet_index_t::get_facet_count(const std::string& field_name) {
 //returns the count of matching seq_ids from result array
 size_t facet_index_t::intersect(facet& a_facet, const field& facet_field,
                                bool has_facet_query, const std::vector<std::vector<std::string>>& fvalue_searched_tokens,
+                                const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators,
                                const uint32_t* result_ids, size_t result_ids_len,
                                size_t max_facet_count, std::map<std::string, docid_count_t>& found,
                                bool is_wildcard_no_filter_query, const std::string& sort_order) {
@ -288,7 +289,8 @@ size_t facet_index_t::intersect(facet& a_facet, const field& facet_field,
            auto facet_str = facet_count_it->facet_value;
            std::vector<std::string> facet_tokens;
            if(facet_field.is_string()) {
-                Tokenizer(facet_str, true, false, facet_field.locale).tokenize(facet_tokens);
+                Tokenizer(facet_str, true, false, facet_field.locale,
+                          symbols_to_index, token_separators).tokenize(facet_tokens);
            } else {
                facet_tokens.push_back(facet_str);
            }
@ -306,7 +308,6 @@ size_t facet_index_t::intersect(facet& a_facet, const field& facet_field,
                    if(!facet_tokens_found) {
                        found_all_search_tokens = false;
                    }
-
                }

                if (found_all_search_tokens) {
--- a/src/index.cpp
+++ b/src/index.cpp
@ -1344,8 +1344,9 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
            std::string sort_order = a_facet.is_sort_by_alpha ? a_facet.sort_order : "";

            facet_index_v4->intersect(a_facet, facet_field,use_facet_query,
-                                      facet_infos[findex].fvalue_searched_tokens, result_ids,
-                                      results_size, max_facet_count, facet_results,
+                                      facet_infos[findex].fvalue_searched_tokens,
+                                      symbols_to_index, token_separators,
+                                      result_ids, results_size, max_facet_count, facet_results,
                                      is_wildcard_no_filter_query, sort_order);

            for(const auto& kv : facet_results) {
--- a/test/collection_optimized_faceting_test.cpp
+++ b/test/collection_optimized_faceting_test.cpp
@ -1423,6 +1423,39 @@ TEST_F(CollectionOptimizedFacetingTest, FacetQueryTest) {
    ASSERT_EQ("<mark>a</mark>mazon <mark>green</mark>", results["facet_counts"][0]["counts"][0]["highlighted"]);
 }

+TEST_F(CollectionOptimizedFacetingTest, FacetQueryWithSymbols) { // checks facet counts/highlights when the facet query text contains a configured symbol
+    nlohmann::json schema = R"({
+        "name": "coll1",
+        "fields": [
+            {"name": "title", "type": "string", "facet": true}
+        ],
+        "symbols_to_index": ["[", "]"],
+        "token_separators": ["[", "]"]
+    })"_json; // "[" and "]" are listed under both symbols_to_index and token_separators
+
+    Collection* coll1 = collectionManager.create_collection(schema).get();
+
+    std::vector<std::string> titles = {"Article 4", "Article 4[7]", "Article 4[11]", "Article 4[22][a]"}; // one title without brackets, three with
+
+    for(size_t i = 0; i < titles.size(); i++) { // one document per title
+        nlohmann::json doc;
+        doc["title"] = titles[i];
+        ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    }
+
+    auto results = coll1->search("*", {},
+                                 "", {"title"}, {}, {2}, 1, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 5, "title:article 4[", 30, 4, "", 20, {}, {}, {}, 0,
+                                 "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                                 4, {off}, 3, 3, 2, 2, false, "", true, 0, max_score, 100, 0, 4294967295UL, VALUE).get(); // NOTE(review): "title:article 4[" is presumably the facet query argument (per the test name) - confirm against Collection::search
+
+    ASSERT_EQ(1, results["facet_counts"].size());
+    ASSERT_EQ(3, results["facet_counts"][0]["counts"].size()); // the three bracketed titles asserted below; plain "Article 4" is not among them
+    ASSERT_EQ("<mark>Article</mark> <mark>4[</mark>7]", results["facet_counts"][0]["counts"][0]["highlighted"]); // "[" is expected inside the same highlight as "4"
+    ASSERT_EQ("<mark>Article</mark> <mark>4[</mark>11]", results["facet_counts"][0]["counts"][1]["highlighted"]);
+    ASSERT_EQ("<mark>Article</mark> <mark>4[</mark>22][a]", results["facet_counts"][0]["counts"][2]["highlighted"]); // only the first "[" is highlighted; the rest of the value stays unmarked
+}
+
 TEST_F(CollectionOptimizedFacetingTest, StringLengthTest) {
    std::vector<field> fields = {
            field("tags", field_types::STRING_ARRAY, true),