Support exclude filtering for string facets.

2025-05-19 13:12:22 +08:00 · 2021-05-07 13:32:03 +05:30 · 2021-05-07 13:32:03 +05:30 · e6a11f74fc
commit e6a11f74fc
parent 25f6fe0614
4 changed files with 103 additions and 10 deletions
--- a/include/art.h
+++ b/include/art.h
@ -133,6 +133,7 @@ enum NUM_COMPARATOR {
    LESS_THAN,
    LESS_THAN_EQUALS,
    EQUALS,
+    NOT_EQUALS,
    CONTAINS,
    GREATER_THAN,
    GREATER_THAN_EQUALS,
--- a/src/collection.cpp
+++ b/src/collection.cpp
@ -1967,6 +1967,9 @@ Option<bool> Collection::parse_geopoint_filter_value(std::string& raw_value,

 Option<bool> Collection::parse_filter_query(const std::string& simple_filter_query,
                                                      std::vector<filter>& filters) const {
+
+    std::vector<filter> exclude_filters;  // to ensure that they go last in the list of filters
+
    std::vector<std::string> filter_blocks;
    StringUtils::split(simple_filter_query, filter_blocks, "&&");

@ -2141,6 +2144,15 @@ Option<bool> Collection::parse_filter_query(const std::string& simple_filter_que
                // string filter should be evaluated in strict "equals" mode
                str_comparator = EQUALS;
                while(raw_value[++filter_value_index] == ' ');
+            } else if(raw_value[0] == '-') {
+                if(!_field.facet) {
+                    // EXCLUDE filtering on string is possible only on facet fields
+                    return Option<bool>(400, "To perform exclude filtering, filter field `" +
+                                             _field.name + "` must be a facet field.");
+                }
+
+                str_comparator = NOT_EQUALS;
+                while(raw_value[++filter_value_index] == ' ');
            }

            if(raw_value[filter_value_index] == '[' && raw_value[raw_value.size() - 1] == ']') {
@ -2155,9 +2167,15 @@ Option<bool> Collection::parse_filter_query(const std::string& simple_filter_que
                                "`: Unidentified field data type, see docs for supported data types.");
        }

-        filters.push_back(f);
+        if(f.comparators.size() > 0 && f.comparators.front() == NOT_EQUALS) {
+            exclude_filters.push_back(f);
+        } else {
+            filters.push_back(f);
+        }
    }

+    filters.insert( filters.end(), exclude_filters.begin(), exclude_filters.end() );
+
    return Option<bool>(true);
 }

--- a/src/index.cpp
+++ b/src/index.cpp
@ -1252,7 +1252,7 @@ uint32_t Index::do_filtering(uint32_t** filter_ids_out, const std::vector<filter
                    }
                }

-                if(a_filter.comparators[0] == EQUALS && f.is_facet()) {
+                if((a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS) && f.is_facet()) {
                    // need to do exact match (unlike CONTAINS) by using the facet index
                    // field being a facet is already enforced upstream
                    uint32_t* exact_strt_ids = new uint32_t[strt_ids_size];
@ -1305,14 +1305,36 @@ uint32_t Index::do_filtering(uint32_t** filter_ids_out, const std::vector<filter
                    strt_ids_size = exact_strt_size;
                }

-                // Otherwise, we just ensure that given record contains tokens in the filter query
-                // (NOT implemented) if the query is wrapped by double quotes, ensure phrase match
-                // bool exact_match = (filter_value.front() == '"' && filter_value.back() == '"');
-                uint32_t* out = nullptr;
-                ids_size = ArrayUtils::or_scalar(ids, ids_size, strt_ids, strt_ids_size, &out);
-                delete[] strt_ids;
-                delete[] ids;
-                ids = out;
+                if(a_filter.comparators[0] == NOT_EQUALS && f.is_facet()) {
+                    // exclude records from existing IDs (from previous filters or ALL records)
+                    // upstream will guarantee that NOT_EQUALS is placed right at the end of filters list
+                    if(ids == nullptr) {
+                        if(filter_ids == nullptr) {
+                            ids = seq_ids.uncompress();
+                            ids_size = seq_ids.getLength();
+                        } else {
+                            ids = filter_ids;
+                            ids_size = filter_ids_length;
+                        }
+                    }
+
+                    uint32_t* excluded_strt_ids = new uint32_t[strt_ids_size];
+                    size_t excluded_strt_size = 0;
+                    excluded_strt_size = ArrayUtils::exclude_scalar(ids, ids_size, strt_ids,
+                                                                    strt_ids_size, &excluded_strt_ids);
+
+                    delete [] ids;
+                    ids = excluded_strt_ids;
+                    ids_size = excluded_strt_size;
+
+                } else {
+                    // Otherwise, we just ensure that given record contains tokens in the filter query
+                    uint32_t* out = nullptr;
+                    ids_size = ArrayUtils::or_scalar(ids, ids_size, strt_ids, strt_ids_size, &out);
+                    delete[] strt_ids;
+                    delete[] ids;
+                    ids = out;
+                }
            }

            result_ids = ids;
--- a/test/collection_filtering_test.cpp
+++ b/test/collection_filtering_test.cpp
@ -1289,5 +1289,57 @@ TEST_F(CollectionFilteringTest, NumericalFilteringWithArray) {
    ASSERT_EQ(4, results["found"].get<size_t>());
    ASSERT_EQ(4, results["hits"].size());

+    collectionManager.drop_collection("coll1");
+}
+
+TEST_F(CollectionFilteringTest, NegationOperatorBasics) {  // exercises the `-` (exclude) filter prefix on the string field `artist`
+    Collection *coll1;
+
+    std::vector<field> fields = {field("title", field_types::STRING, false),
+                                 field("artist", field_types::STRING, true),  // only field built with `true` - presumably the facet flag; confirm against the field constructor
+                                 field("points", field_types::INT32, false),};
+
+    coll1 = collectionManager.get_collection("coll1").get();
+    if(coll1 == nullptr) {
+        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();  // created only when "coll1" does not already exist
+    }
+
+    std::vector<std::vector<std::string>> records = {  // each row is {title, artist}
+        {"Taylor Swift Karaoke: reputation", "Taylor Swift"},
+        {"Beat it", "Michael Jackson"},  // the exact artist value targeted by the exclude filters below
+        {"Style", "Taylor Swift"},
+        {"Thriller", "Michael Joseph Jackson"},  // shares tokens with "Michael Jackson" but is a different value; the assertions expect it to survive
+    };
+
+    for(size_t i=0; i<records.size(); i++) {
+        nlohmann::json doc;
+
+        doc["id"] = std::to_string(i);  // id is the row's index in `records`
+        doc["title"] = records[i][0];
+        doc["artist"] = records[i][1];
+        doc["points"] = i;  // points also equals the row index
+
+        ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    }
+
+    auto results = coll1->search("*", {"artist"}, "artist:- Michael Jackson", {}, {}, 0, 10, 1, FREQUENCY, true, 10).get();  // exclude a single artist value
+
+    ASSERT_EQ(3, results["found"].get<size_t>());  // only record 1 is expected to be dropped
+
+    ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());  // NOTE(review): hits are expected in descending id/points order - presumably sorted by `points`; confirm
+    ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("0", results["hits"][2]["document"]["id"].get<std::string>().c_str());
+
+    results = coll1->search("*", {"artist"}, "artist:- Michael Jackson && points: >0", {}, {}, 0, 10, 1, FREQUENCY, true, 10).get();  // exclusion combined with a numeric filter
+    ASSERT_EQ(2, results["found"].get<size_t>());  // record 0 has points == 0, so it is expected to be filtered out as well
+    ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
+
+    // negation operation on multiple values
+
+    results = coll1->search("*", {"artist"}, "artist:- [Michael Jackson, Taylor Swift]", {}, {}, 0, 10, 1, FREQUENCY, true, 10).get();  // bracketed list: every listed value is excluded
+    ASSERT_EQ(1, results["found"].get<size_t>());  // only record 3 ("Michael Joseph Jackson") is expected to remain
+    ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());
+
    collectionManager.drop_collection("coll1");  // drop "coll1" at the end of the test
 }