Refactor string filter iteration.

2025-05-23 23:30:42 +08:00 · 2023-04-27 14:40:53 +05:30 · 2023-04-27 14:40:53 +05:30 · 9aa226a461
commit 9aa226a461
parent 5c3333058d
2 changed files with 69 additions and 112 deletions
--- a/include/filter_result_iterator.h
+++ b/include/filter_result_iterator.h
@ -85,7 +85,6 @@ struct filter_result_t {
    static void or_filter_results(const filter_result_t& a, const filter_result_t& b, filter_result_t& result);
 };

-
 class filter_result_iterator_t {
 private:
    std::string collection_name;
@ -106,6 +105,7 @@ private:
    /// for each token.
    ///
    /// Multiple filter values: Multiple tokens: posting list iterator
+    std::vector<std::vector<posting_list_t*>> posting_lists;
    std::vector<std::vector<posting_list_t::iterator_t>> posting_list_iterators;
    std::vector<posting_list_t*> expanded_plists;

@ -121,11 +121,11 @@ private:
    /// Advance all the token iterators that are at seq_id.
    void advance_string_filter_token_iterators();

-    /// Finds the next match for a filter on string field.
-    void doc_matching_string_filter(bool field_is_array);
+    /// Finds the first match for a filter on string field.
+    void get_string_filter_first_match(const bool& field_is_array);

-    /// Returns true when doc and reference hold valid values. Used in conjunction with next() and skip_to(id).
-    [[nodiscard]] bool valid();
+    /// Finds the next match for a filter on string field.
+    void get_string_filter_next_match(const bool& field_is_array);

 public:
    uint32_t seq_id = 0;
--- a/src/filter_result_iterator.cpp
+++ b/src/filter_result_iterator.cpp
@ -291,7 +291,7 @@ void filter_result_iterator_t::advance_string_filter_token_iterators() {
    }
 }

-void filter_result_iterator_t::doc_matching_string_filter(bool field_is_array) {
+void filter_result_iterator_t::get_string_filter_next_match(const bool& field_is_array) {
    // If none of the filter value iterators are valid, mark this node as invalid.
    bool one_is_valid = false;

@ -412,7 +412,7 @@ void filter_result_iterator_t::next() {
            do {
                previous_match = seq_id;
                advance_string_filter_token_iterators();
-                doc_matching_string_filter(f.is_array());
+                get_string_filter_next_match(f.is_array());
            } while (is_valid && previous_match + 1 == seq_id);

            if (!is_valid) {
@ -433,7 +433,7 @@ void filter_result_iterator_t::next() {
        }

        advance_string_filter_token_iterators();
-        doc_matching_string_filter(f.is_array());
+        get_string_filter_next_match(f.is_array());

        return;
    }
@ -474,6 +474,50 @@ void apply_not_equals(uint32_t*&& all_ids,
    result_ids_len = to_include_ids_len;
 }

+void filter_result_iterator_t::get_string_filter_first_match(const bool& field_is_array) {
+    get_string_filter_next_match(field_is_array);
+
+    if (filter_node->filter_exp.apply_not_equals) {
+        // filter didn't match any id. So by applying not equals, every id in the index is a match.
+        if (!is_valid) {
+            is_valid = true;
+            seq_id = 0;
+            result_index = index->seq_ids->last_id() + 1;
+            return;
+        }
+
+        // [0, seq_id) are a match for not equals.
+        if (seq_id > 0) {
+            result_index = seq_id;
+            seq_id = 0;
+            return;
+        }
+
+        // Keep ignoring the consecutive matches.
+        uint32_t previous_match;
+        do {
+            previous_match = seq_id;
+            advance_string_filter_token_iterators();
+            get_string_filter_next_match(field_is_array);
+        } while (is_valid && previous_match + 1 == seq_id);
+
+        if (!is_valid) {
+            // filter matched all the ids in the index. So for not equals, there's no match.
+            if (previous_match >= index->seq_ids->last_id()) {
+                return;
+            }
+
+            is_valid = true;
+            result_index = index->seq_ids->last_id() + 1;
+            seq_id = previous_match + 1;
+            return;
+        }
+
+        result_index = seq_id;
+        seq_id = previous_match + 1;
+    }
+}
+
 void filter_result_iterator_t::init() {
    if (filter_node == nullptr) {
        return;
@ -807,7 +851,7 @@ void filter_result_iterator_t::init() {
        art_tree* t = index->search_index.at(a_filter.field_name);

        for (const std::string& filter_value : a_filter.values) {
-            std::vector<void*> posting_lists;
+            std::vector<void*> raw_posting_lists;

            // there could be multiple tokens in a filter value, which we have to treat as ANDs
            // e.g. country: South Africa
@ -826,119 +870,29 @@ void filter_result_iterator_t::init() {
                    continue;
                }

-                posting_lists.push_back(leaf->values);
+                raw_posting_lists.push_back(leaf->values);
            }

-            if (posting_lists.size() != str_tokens.size()) {
+            if (raw_posting_lists.size() != str_tokens.size()) {
                continue;
            }

            std::vector<posting_list_t*> plists;
-            posting_t::to_expanded_plists(posting_lists, plists, expanded_plists);
+            posting_t::to_expanded_plists(raw_posting_lists, plists, expanded_plists);

+            posting_lists.push_back(plists);
            posting_list_iterators.emplace_back(std::vector<posting_list_t::iterator_t>());
-
            for (auto const& plist: plists) {
                posting_list_iterators.back().push_back(plist->new_iterator());
            }
        }

-        doc_matching_string_filter(f.is_array());
-
-        if (filter_node->filter_exp.apply_not_equals) {
-            // filter didn't match any id. So by applying not equals, every id in the index is a match.
-            if (!is_valid) {
-                is_valid = true;
-                seq_id = 0;
-                result_index = index->seq_ids->last_id() + 1;
-                return;
-            }
-
-            // [0, seq_id) are a match for not equals.
-            if (seq_id > 0) {
-                result_index = seq_id;
-                seq_id = 0;
-                return;
-            }
-
-            // Keep ignoring the consecutive matches.
-            uint32_t previous_match;
-            do {
-                previous_match = seq_id;
-                advance_string_filter_token_iterators();
-                doc_matching_string_filter(f.is_array());
-            } while (is_valid && previous_match + 1 == seq_id);
-
-            if (!is_valid) {
-                // filter matched all the ids in the index. So for not equals, there's no match.
-                if (previous_match >= index->seq_ids->last_id()) {
-                    return;
-                }
-
-                is_valid = true;
-                result_index = index->seq_ids->last_id() + 1;
-                seq_id = previous_match + 1;
-                return;
-            }
-
-            result_index = seq_id;
-            seq_id = previous_match + 1;
-        }
+        get_string_filter_first_match(f.is_array());

        return;
    }
 }

-bool filter_result_iterator_t::valid() {
-    if (!is_valid) {
-        return false;
-    }
-
-    if (filter_node->isOperator) {
-        if (filter_node->filter_operator == AND) {
-            is_valid = left_it->valid() && right_it->valid();
-            return is_valid;
-        } else {
-            is_valid = left_it->valid() || right_it->valid();
-            return is_valid;
-        }
-    }
-
-    if (is_filter_result_initialized) {
-        is_valid = result_index < filter_result.count;
-        return is_valid;
-    }
-
-    const filter a_filter = filter_node->filter_exp;
-
-    if (!index->field_is_indexed(a_filter.field_name)) {
-        is_valid = false;
-        return is_valid;
-    }
-
-    field f = index->search_schema.at(a_filter.field_name);
-
-    if (f.is_string()) {
-        if (filter_node->filter_exp.apply_not_equals) {
-            return seq_id < result_index;
-        }
-
-        bool one_is_valid = false;
-        for (auto& filter_value_tokens: posting_list_iterators) {
-            posting_list_t::intersect(filter_value_tokens, one_is_valid);
-
-            if (one_is_valid) {
-                break;
-            }
-        }
-
-        is_valid = one_is_valid;
-        return is_valid;
-    }
-
-    return false;
-}
-
 void filter_result_iterator_t::skip_to(uint32_t id) {
    if (!is_valid) {
        return;
@ -1003,7 +957,7 @@ void filter_result_iterator_t::skip_to(uint32_t id) {
                do {
                    previous_match = seq_id;
                    advance_string_filter_token_iterators();
-                    doc_matching_string_filter(f.is_array());
+                    get_string_filter_next_match(f.is_array());
                } while (is_valid && previous_match + 1 == seq_id);
            } while (is_valid && seq_id <= id);

@ -1047,7 +1001,7 @@ void filter_result_iterator_t::skip_to(uint32_t id) {
            }
        }

-        doc_matching_string_filter(f.is_array());
+        get_string_filter_next_match(f.is_array());
        return;
    }
 }
@ -1190,13 +1144,16 @@ void filter_result_iterator_t::reset() {
    field f = index->search_schema.at(a_filter.field_name);

    if (f.is_string()) {
-        posting_list_iterators.clear();
-        for(auto expanded_plist: expanded_plists) {
-            delete expanded_plist;
-        }
-        expanded_plists.clear();
+        for (uint32_t i = 0; i < posting_lists.size(); i++) {
+            auto const& plists = posting_lists[i];

-        init();
+            posting_list_iterators[i].clear();
+            for (auto const& plist: plists) {
+                posting_list_iterators[i].push_back(plist->new_iterator());
+            }
+        }
+
+        get_string_filter_first_match(f.is_array());
        return;
    }
 }