Exact prefix value filter. (#1763)

* Exact prefix match on `string` field.

* Exact prefix match on `string[]` field. (See the illustration below.)
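
For illustration, with values from the tests below: `name:= Steve R*` matches
"Steve Rogers" and "Steve Runner foo" but not "foo Steve Runner", since the
field value itself must begin with `Steve R`. The pre-existing `name: Steve R*`
form also matches "foo Steve Runner" (word-level prefix match).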

---------

Co-authored-by: Kishore Nallan <kishorenc@gmail.com>
Harpreet Sangar, 2024-06-07 14:57:34 +05:30, committed by GitHub
parent 25762a7c69
commit 4fee4dc286
6 changed files with 983 additions and 26 deletions

include/filter_result_iterator.h

@@ -280,6 +280,10 @@ private:
/// Sample filter: [>10, !15].
std::unordered_set<uint32_t> numerical_not_iterator_index;
/// String filter can specify prefix value match.
/// Sample filter: [Chris P*].
std::unordered_set<uint32_t> string_prefix_filter_index;
bool delete_filter_node = false;
std::unique_ptr<filter_result_iterator_timeout_info> timeout_info;
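
For orientation, a minimal sketch (condensed from the `init()` hunk below) of how
this set is populated; the real code records the `posting_lists` position of each
value, which can differ from the raw value index when values are skipped:

// Sketch: remember which filter values requested prefix matching.
for (uint32_t i = 0; i < a_filter.values.size(); i++) {
    const auto& value = a_filter.values[i];
    bool is_prefix_match = value.size() > 1 && value.back() == '*';
    if (is_prefix_match) {
        string_prefix_filter_index.insert(i); // simplified: raw value index
    }
}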

include/posting_list.h

@@ -203,10 +203,19 @@ public:
static bool is_single_token_verbatim_match(const posting_list_t::iterator_t& it, bool field_is_array);
static bool is_single_token_prefix_match(const posting_list_t::iterator_t& it, bool field_is_array);
static void get_prefix_matches(std::vector<iterator_t>& its, const bool field_is_array,
const uint32_t* ids, const uint32_t num_ids,
uint32_t*& prefix_ids, size_t& num_prefix_ids);
static void get_exact_matches(std::vector<iterator_t>& its, bool field_is_array,
const uint32_t* ids, const uint32_t num_ids,
uint32_t*& exact_ids, size_t& num_exact_ids);
static bool has_prefix_match(std::vector<posting_list_t::iterator_t>& posting_list_iterators,
const bool field_is_array);
static bool has_exact_match(std::vector<posting_list_t::iterator_t>& posting_list_iterators,
const bool field_is_array);
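
A usage sketch mirroring the `compute_iterators()` call site further down
(identifiers hypothetical): intersect the token posting lists first, then keep
only candidate ids whose field value prefix-matches every token.

std::vector<uint32_t> candidate_ids;
posting_list_t::intersect(p_list, candidate_ids); // ids containing all tokens
uint32_t* prefix_ids = new uint32_t[candidate_ids.size()];
size_t num_prefix_ids = 0;
std::unique_ptr<uint32_t[]> guard(prefix_ids); // caller owns the out-buffer
posting_list_t::get_prefix_matches(token_iterators, field_is_array,
                                   candidate_ids.data(), candidate_ids.size(),
                                   prefix_ids, num_prefix_ids);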

src/filter_result_iterator.cpp

@@ -355,7 +355,7 @@ void filter_result_iterator_t::get_string_filter_next_match(const bool& field_is
uint32_t lowest_id = UINT32_MAX;
if (filter_node->filter_exp.comparators[0] == EQUALS || filter_node->filter_exp.comparators[0] == NOT_EQUALS) {
bool exact_match_found = false;
bool match_found = false;
switch (posting_list_iterators.size()) {
case 1:
while(true) {
@@ -366,28 +366,32 @@ void filter_result_iterator_t::get_string_filter_next_match(const bool& field_is
break;
}
if (posting_list_t::has_exact_match(posting_list_iterators[0], field_is_array)) {
exact_match_found = true;
break;
} else {
// Keep advancing token iterators until an exact match is found.
for (auto& iter: posting_list_iterators[0]) {
if (!iter.valid()) {
break;
}
match_found = string_prefix_filter_index.count(0) == 0 ?
posting_list_t::has_exact_match(posting_list_iterators[0], field_is_array) :
posting_list_t::has_prefix_match(posting_list_iterators[0], field_is_array);
iter.next();
if (match_found) {
break;
}
// Keep advancing token iterators until a match is found.
for (auto& iter: posting_list_iterators[0]) {
if (!iter.valid()) {
break;
}
iter.next();
}
}
if (one_is_valid && exact_match_found) {
if (one_is_valid && match_found) {
lowest_id = posting_list_iterators[0][0].id();
}
break;
default :
for (auto& filter_value_tokens : posting_list_iterators) {
for (uint32_t i = 0; i < posting_list_iterators.size(); i++) {
auto& filter_value_tokens = posting_list_iterators[i];
bool tokens_iter_is_valid;
while(true) {
// Perform AND between tokens of a filter value.
@@ -397,24 +401,27 @@ void filter_result_iterator_t::get_string_filter_next_match(const bool& field_is
break;
}
if (posting_list_t::has_exact_match(filter_value_tokens, field_is_array)) {
exact_match_found = true;
break;
} else {
// Keep advancing token iterators until an exact match is found.
for (auto &iter: filter_value_tokens) {
if (!iter.valid()) {
break;
}
match_found = string_prefix_filter_index.count(i) == 0 ?
posting_list_t::has_exact_match(filter_value_tokens, field_is_array) :
posting_list_t::has_prefix_match(filter_value_tokens, field_is_array);
iter.next();
if (match_found) {
break;
}
// Keep advancing token iterators until a match is found.
for (auto &iter: filter_value_tokens) {
if (!iter.valid()) {
break;
}
iter.next();
}
}
one_is_valid = tokens_iter_is_valid || one_is_valid;
if (tokens_iter_is_valid && exact_match_found && filter_value_tokens[0].id() < lowest_id) {
if (tokens_iter_is_valid && match_found && filter_value_tokens[0].id() < lowest_id) {
lowest_id = filter_value_tokens[0].id();
}
}
@@ -1360,7 +1367,8 @@ void filter_result_iterator_t::init() {
} else if (f.is_string()) {
art_tree* t = index->search_index.at(a_filter.field_name);
for (std::string filter_value : a_filter.values) {
for (uint32_t i = 0; i < a_filter.values.size(); i++) {
auto filter_value = a_filter.values[i];
auto is_prefix_match = filter_value.size() > 1 && filter_value[filter_value.size() - 1] == '*';
if (is_prefix_match) {
filter_value.erase(filter_value.size() - 1);
@@ -1469,6 +1477,7 @@ void filter_result_iterator_t::init() {
continue;
}
string_prefix_filter_index.insert(posting_lists.size());
posting_lists.push_back(plists);
posting_list_iterators.emplace_back(std::vector<posting_list_t::iterator_t>());
for (auto const& plist: plists) {
@@ -2485,7 +2494,33 @@ void filter_result_iterator_t::compute_iterators() {
for (uint32_t i = 0; i < posting_lists.size(); i++) {
auto& p_list = posting_lists[i];
if (a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS) {
if (string_prefix_filter_index.count(i) != 0 &&
(a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS)) {
// Exact prefix match, needs intersection + prefix matching
std::vector<uint32_t> result_id_vec;
posting_list_t::intersect(p_list, result_id_vec);
if (result_id_vec.empty()) {
continue;
}
// Need to prefix-match the intersected ids.
uint32_t* prefix_str_ids = new uint32_t[result_id_vec.size()];
size_t prefix_str_ids_size = 0;
std::unique_ptr<uint32_t[]> prefix_str_ids_guard(prefix_str_ids);
posting_list_t::get_prefix_matches(posting_list_iterators[i], f.is_array(),
result_id_vec.data(), result_id_vec.size(),
prefix_str_ids, prefix_str_ids_size);
if (prefix_str_ids_size == 0) {
continue;
}
for (size_t pi = 0; pi < prefix_str_ids_size; pi++) {
f_id_buff.push_back(prefix_str_ids[pi]);
}
} else if (a_filter.comparators[0] == EQUALS || a_filter.comparators[0] == NOT_EQUALS) {
// needs intersection + exact matching (unlike CONTAINS)
std::vector<uint32_t> result_id_vec;
posting_list_t::intersect(p_list, result_id_vec);

src/posting_list.cpp

@@ -1111,6 +1111,173 @@ bool posting_list_t::contains_atleast_one(const uint32_t* target_ids, size_t tar
return false;
}
bool posting_list_t::is_single_token_prefix_match(const posting_list_t::iterator_t& it, bool field_is_array) {
block_t* curr_block = it.block();
uint32_t curr_index = it.index();
if (curr_block == nullptr || curr_index == UINT32_MAX) {
return false;
}
uint32_t* offsets = it.offsets;
uint32_t start_offset = it.offset_index[curr_index];
// If the field value starts with the token, it's a match.
return offsets[start_offset] == 1;
}
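A hedged reading of the check above, with offsets illustrated from the test
documents further down:

// Offsets store 1-based token positions within a field value, e.g.
//   "Steve Jobs"  -> steve at offset 1, jobs at offset 2
//   "Adam Stator" -> adam at offset 1,  stator at offset 2
// For `name:= S*`, the ART tree yields iterators for `steve` and `stator`;
// offsets[start_offset] == 1 holds only for "Steve Jobs", whose value
// actually begins with the matched token.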
void posting_list_t::get_prefix_matches(std::vector<iterator_t>& its, const bool field_is_array,
const uint32_t* ids, const uint32_t num_ids,
uint32_t*& prefix_ids, size_t& num_prefix_ids) {
size_t prefix_id_index = 0;
if (its.size() == 1) {
for (size_t i = 0; i < num_ids; i++) {
auto const& id = ids[i];
its[0].skip_to(id);
if (is_single_token_prefix_match(its[0], field_is_array)) {
prefix_ids[prefix_id_index++] = id;
}
}
} else {
if (!field_is_array) {
for (size_t i = 0; i < num_ids; i++) {
uint32_t id = ids[i];
bool is_match = true;
for (int j = its.size()-1; j >= 0; j--) {
posting_list_t::iterator_t& it = its[j];
it.skip_to(id);
block_t* curr_block = it.block();
uint32_t curr_index = it.index();
if (curr_block == nullptr || curr_index == UINT32_MAX) {
is_match = false;
break;
}
uint32_t* offsets = it.offsets;
uint32_t start_offset_index = it.offset_index[curr_index];
uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
curr_block->offsets.getLength() :
it.offset_index[curr_index + 1];
// looping handles duplicate query tokens, e.g. "hip hip hurray hurray"
while (start_offset_index < end_offset_index) {
uint32_t offset = offsets[start_offset_index];
start_offset_index++;
if (offset == (j + 1)) {
// we have found a matching index, no need to look further for this token
is_match = true;
break;
}
if (offset > (j + 1)) {
is_match = false;
break;
}
}
if (!is_match) {
break;
}
}
if (is_match) {
prefix_ids[prefix_id_index++] = id;
}
}
}
else {
// field is an array
struct token_index_meta_t {
std::bitset<32> token_index;
};
for (size_t i = 0; i < num_ids; i++) {
uint32_t id = ids[i];
std::map<size_t, token_index_meta_t> array_index_to_token_index;
bool premature_exit = false;
for (int j = its.size()-1; j >= 0; j--) {
posting_list_t::iterator_t& it = its[j];
it.skip_to(id);
block_t* curr_block = it.block();
uint32_t curr_index = it.index();
if (curr_block == nullptr || curr_index == UINT32_MAX) {
premature_exit = true;
break;
}
uint32_t* offsets = it.offsets;
uint32_t start_offset_index = it.offset_index[curr_index];
uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
curr_block->offsets.getLength() :
it.offset_index[curr_index + 1];
int prev_pos = -1;
bool found_matching_index = false;
size_t num_matching_index = 0;
while (start_offset_index < end_offset_index) {
int pos = offsets[start_offset_index];
start_offset_index++;
if (pos == prev_pos) { // indicates end of array index
size_t array_index = (size_t) offsets[start_offset_index];
if (found_matching_index) {
array_index_to_token_index[array_index].token_index.set(j+1);
}
start_offset_index++; // skip current value which is the array index or flag for last index
prev_pos = -1;
found_matching_index = false;
continue;
}
if (pos == (j + 1)) {
// we have found a matching index
found_matching_index = true;
num_matching_index++;
}
prev_pos = pos;
}
if (num_matching_index == 0) {
// not even a single matching index found: can never be a prefix match
premature_exit = true;
break;
}
}
if (!premature_exit) {
// iterate array index to token index to check if at least 1 array position contains all tokens
for (auto& kv: array_index_to_token_index) {
if (kv.second.token_index.count() == its.size()) {
prefix_ids[prefix_id_index++] = id;
break;
}
}
}
}
}
}
num_prefix_ids = prefix_id_index;
}
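To make the non-array branch concrete, a hedged trace using test values below
(filter `names:= Steve R*`, token j=0 is `steve`, token j=1 a prefix expansion
of `r*` such as `runner`):

// "Steve Runner foo": steve at 1 (j=0 wants offset 1), runner at 2
//                     (j=1 wants offset 2) -> prefix match.
// "foo Steve Runner": steve first appears at offset 2 > j+1 = 1, so the
//                     id is rejected without checking further tokens.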
void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool field_is_array,
const uint32_t* ids, const uint32_t num_ids,
uint32_t*& exact_ids, size_t& num_exact_ids) {
@@ -1292,6 +1459,123 @@ void posting_list_t::get_exact_matches(std::vector<iterator_t>& its, const bool
num_exact_ids = exact_id_index;
}
bool posting_list_t::has_prefix_match(std::vector<posting_list_t::iterator_t>& posting_list_iterators,
const bool field_is_array) {
if (posting_list_iterators.empty()) {
return false;
}
if (posting_list_iterators.size() == 1) {
return is_single_token_prefix_match(posting_list_iterators[0], field_is_array);
}
if (!field_is_array) {
for (uint32_t i = 0; i < posting_list_iterators.size(); i++) {
posting_list_t::iterator_t& it = posting_list_iterators[i];
block_t* curr_block = it.block();
uint32_t curr_index = it.index();
if (curr_block == nullptr || curr_index == UINT32_MAX) {
return false;
}
uint32_t* offsets = it.offsets;
uint32_t start_offset_index = it.offset_index[curr_index];
uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
curr_block->offsets.getLength() :
it.offset_index[curr_index + 1];
// looping handles duplicate query tokens, e.g. "hip hip hurray hurray"
while (start_offset_index < end_offset_index) {
uint32_t offset = offsets[start_offset_index];
start_offset_index++;
if (offset == (i + 1)) {
// we have found a matching index, no need to look further for this token.
break;
}
if (offset > (i + 1)) {
return false;
}
}
}
}
else {
// field is an array
struct token_index_meta_t {
std::bitset<32> token_index;
};
std::map<size_t, token_index_meta_t> array_index_to_token_index;
for (int i = posting_list_iterators.size() - 1; i >= 0; i--) {
posting_list_t::iterator_t& it = posting_list_iterators[i];
block_t* curr_block = it.block();
uint32_t curr_index = it.index();
if (curr_block == nullptr || curr_index == UINT32_MAX) {
return false;
}
uint32_t* offsets = it.offsets;
uint32_t start_offset_index = it.offset_index[curr_index];
uint32_t end_offset_index = (curr_index == curr_block->size() - 1) ?
curr_block->offsets.getLength() :
it.offset_index[curr_index + 1];
int prev_pos = -1;
bool found_matching_index = false;
size_t num_matching_index = 0;
while (start_offset_index < end_offset_index) {
int pos = offsets[start_offset_index];
start_offset_index++;
if (pos == prev_pos) { // indicates end of array index
size_t array_index = (size_t) offsets[start_offset_index];
if (found_matching_index) {
array_index_to_token_index[array_index].token_index.set(i + 1);
}
start_offset_index++; // skip current value which is the array index or flag for last index
prev_pos = -1;
found_matching_index = false;
continue;
}
if (pos == (i + 1)) {
// we have found a matching index
found_matching_index = true;
num_matching_index++;
}
prev_pos = pos;
}
if (num_matching_index == 0) {
// not even a single matching index found: can never be a prefix match
return false;
}
}
// iterate array index to token index to check if at least 1 array position contains all tokens
for (auto& kv: array_index_to_token_index) {
if (kv.second.token_index.count() == posting_list_iterators.size()) {
return true;
}
}
}
return true;
}
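A hedged reading of the offset stream that the array branch above decodes
(encoding inferred from the loop): positions of one array element are listed in
order, a repeated position marks the element's end, and the next value is that
element's array index.

// Illustrative stream for token `steve` in names ["Adam", "Steve Rogers"]:
//   offsets = [1, 1, 1]
//              |  |  '-- array index 1 ("Steve Rogers")
//              |  '----- repeated position: end-of-element marker
//              '-------- steve at position 1 within that element
// Each token records the array indices where it sits at offset (i + 1); an id
// passes only if a single array index accumulates all tokens.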
bool posting_list_t::has_exact_match(std::vector<posting_list_t::iterator_t>& posting_list_iterators,
const bool field_is_array) {
if(posting_list_iterators.size() == 1) {

test/collection_filtering_test.cpp

@@ -2563,6 +2563,368 @@ TEST_F(CollectionFilteringTest, PrefixFilterOnTextFields) {
std::string id = ids.at(i);
ASSERT_EQ(id, result_id);
}
auto schema_json =
R"({
"name": "Names",
"fields": [
{"name": "name", "type": "string", "optional": true},
{"name": "names", "type": "string[]", "optional": true}
]
})"_json;
std::vector<nlohmann::json> documents = {
R"({
"name": "Steve Jobs"
})"_json,
R"({
"name": "Adam Stator"
})"_json,
};
auto collection_create_op = collectionManager.create_collection(schema_json);
ASSERT_TRUE(collection_create_op.ok());
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
std::map<std::string, std::string> req_params = {
{"collection", "Names"},
{"q", "*"},
{"query_by", "name"},
{"filter_by", "name:= S*"}
};
nlohmann::json embedded_params;
std::string json_res;
auto now_ts = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch()).count();
auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
auto res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(1, res_obj["found"].get<size_t>());
ASSERT_EQ(1, res_obj["hits"].size());
ASSERT_EQ("Steve Jobs", res_obj["hits"][0]["document"].at("name"));
req_params = {
{"collection", "Names"},
{"q", "*"},
{"query_by", "name"},
{"filter_by", "name: S*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Adam Stator", res_obj["hits"][0]["document"].at("name"));
ASSERT_EQ("Steve Jobs", res_obj["hits"][1]["document"].at("name"));
documents = {
R"({
"name": "Steve Reiley"
})"_json,
R"({
"name": "Storm"
})"_json,
R"({
"name": "Steve Rogers"
})"_json,
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "name"},
{"filter_by", "name:= St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(4, res_obj["found"].get<size_t>());
ASSERT_EQ(4, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
ASSERT_EQ("Storm", res_obj["hits"][1]["document"].at("name"));
ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"].at("name"));
ASSERT_EQ("Steve Jobs", res_obj["hits"][3]["document"].at("name"));
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "name"},
{"filter_by", "name: St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(5, res_obj["found"].get<size_t>());
ASSERT_EQ(5, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
ASSERT_EQ("Storm", res_obj["hits"][1]["document"].at("name"));
ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"].at("name"));
ASSERT_EQ("Adam Stator", res_obj["hits"][3]["document"].at("name"));
ASSERT_EQ("Steve Jobs", res_obj["hits"][4]["document"].at("name"));
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "name"},
{"filter_by", "name:= Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"].at("name"));
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "name"},
{"filter_by", "name: Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"].at("name"));
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"].at("name"));
documents = {
R"({
"names": []
})"_json,
R"({
"names": ["Steve Jobs"]
})"_json,
R"({
"names": ["Adam Stator"]
})"_json
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names:= St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(1, res_obj["found"].get<size_t>());
ASSERT_EQ(1, res_obj["hits"].size());
ASSERT_EQ("Steve Jobs", res_obj["hits"][0]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names: St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Adam Stator", res_obj["hits"][0]["document"]["names"][0]);
ASSERT_EQ("Steve Jobs", res_obj["hits"][1]["document"]["names"][0]);
documents = {
R"({
"names": ["Steve Reiley"]
})"_json,
R"({
"names": ["Storm"]
})"_json,
R"({
"names": ["Adam", "Steve Rogers"]
})"_json,
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names:= St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(4, res_obj["found"].get<size_t>());
ASSERT_EQ(4, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Storm", res_obj["hits"][1]["document"]["names"][0]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"]["names"][0]);
ASSERT_EQ("Steve Jobs", res_obj["hits"][3]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names: St*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(5, res_obj["found"].get<size_t>());
ASSERT_EQ(5, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Storm", res_obj["hits"][1]["document"]["names"][0]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"]["names"][0]);
ASSERT_EQ("Adam Stator", res_obj["hits"][3]["document"]["names"][0]);
ASSERT_EQ("Steve Jobs", res_obj["hits"][4]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names:= Steve*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(3, res_obj["found"].get<size_t>());
ASSERT_EQ(3, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"]["names"][0]);
ASSERT_EQ("Steve Jobs", res_obj["hits"][2]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names: Steve*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(3, res_obj["found"].get<size_t>());
ASSERT_EQ(3, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"]["names"][0]);
ASSERT_EQ("Steve Jobs", res_obj["hits"][2]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names:= Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names: Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(2, res_obj["found"].get<size_t>());
ASSERT_EQ(2, res_obj["hits"].size());
ASSERT_EQ("Steve Rogers", res_obj["hits"][0]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][1]["document"]["names"][0]);
documents = {
R"({
"names": ["Steve Runner foo"]
})"_json,
R"({
"names": ["foo Steve Runner"]
})"_json,
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names:= Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(3, res_obj["found"].get<size_t>());
ASSERT_EQ(3, res_obj["hits"].size());
ASSERT_EQ("Steve Runner foo", res_obj["hits"][0]["document"]["names"][0]);
ASSERT_EQ("Steve Rogers", res_obj["hits"][1]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][2]["document"]["names"][0]);
req_params = {
{"collection", "Names"},
{"q", "s"},
{"query_by", "names"},
{"filter_by", "names: Steve R*"}
};
search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
ASSERT_TRUE(search_op.ok());
res_obj = nlohmann::json::parse(json_res);
ASSERT_EQ(4, res_obj["found"].get<size_t>());
ASSERT_EQ(4, res_obj["hits"].size());
ASSERT_EQ("foo Steve Runner", res_obj["hits"][0]["document"]["names"][0]);
ASSERT_EQ("Steve Runner foo", res_obj["hits"][1]["document"]["names"][0]);
ASSERT_EQ("Steve Rogers", res_obj["hits"][2]["document"]["names"][1]);
ASSERT_EQ("Steve Reiley", res_obj["hits"][3]["document"]["names"][0]);
}
TEST_F(CollectionFilteringTest, FilterOnStemmedField) {

test/filter_test.cpp

@@ -1503,3 +1503,266 @@ TEST_F(FilterTest, NumericFilterIterator) {
delete filter_tree_root;
}
TEST_F(FilterTest, PrefixStringFilter) {
auto schema_json =
R"({
"name": "Names",
"fields": [
{"name": "name", "type": "string"}
]
})"_json;
std::vector<nlohmann::json> documents = {
R"({
"name": "Steve Jobs"
})"_json,
R"({
"name": "Adam Stator"
})"_json,
};
auto collection_create_op = collectionManager.create_collection(schema_json);
ASSERT_TRUE(collection_create_op.ok());
Collection* coll = collection_create_op.get();
for (auto const &json: documents) {
auto add_op = coll->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
const std::string doc_id_prefix = std::to_string(coll->get_collection_id()) + "_" + Collection::DOC_ID_PREFIX + "_";
filter_node_t* filter_tree_root = nullptr;
search_stop_us = UINT64_MAX; // `Index::fuzzy_search_fields` checks for timeout.
Option<bool> filter_op = filter::parse_filter_query("name:= S*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto computed_exact_prefix_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(computed_exact_prefix_test.init_status().ok());
ASSERT_TRUE(computed_exact_prefix_test._get_is_filter_result_initialized());
std::vector<int> expected = {0};
for (auto const& i : expected) {
ASSERT_EQ(filter_result_iterator_t::valid, computed_exact_prefix_test.validity);
ASSERT_EQ(i, computed_exact_prefix_test.seq_id);
computed_exact_prefix_test.next();
}
ASSERT_EQ(filter_result_iterator_t::invalid, computed_exact_prefix_test.validity);
delete filter_tree_root;
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name: S*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto computed_contains_prefix_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(computed_contains_prefix_test.init_status().ok());
ASSERT_TRUE(computed_contains_prefix_test._get_is_filter_result_initialized());
expected = {0, 1};
for (auto const& i : expected) {
ASSERT_EQ(filter_result_iterator_t::valid, computed_contains_prefix_test.validity);
ASSERT_EQ(i, computed_contains_prefix_test.seq_id);
computed_contains_prefix_test.next();
}
ASSERT_EQ(filter_result_iterator_t::invalid, computed_contains_prefix_test.validity);
delete filter_tree_root;
documents = {
R"({
"name": "Steve Reiley"
})"_json,
R"({
"name": "Storm"
})"_json,
R"({
"name": "Steve Rogers"
})"_json,
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name:= S*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto iter_exact_prefix_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(iter_exact_prefix_test.init_status().ok());
ASSERT_FALSE(iter_exact_prefix_test._get_is_filter_result_initialized());
std::vector<uint32_t> validate_ids = {0, 1, 2, 3, 4, 5};
std::vector<uint32_t> seq_ids = {2, 2, 3, 4, 4, 4};
std::vector<uint32_t> equals_match_seq_ids = {0, 2, 2, 3, 4, 4};
std::vector<bool> equals_iterator_valid = {true, true, true, true, true, false};
expected = {1, 0, 1, 1, 1, -1};
for (uint32_t i = 0; i < validate_ids.size(); i++) {
if (i < 5) {
ASSERT_EQ(filter_result_iterator_t::valid, iter_exact_prefix_test.validity);
} else {
ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test.validity);
}
ASSERT_EQ(expected[i], iter_exact_prefix_test.is_valid(validate_ids[i]));
ASSERT_EQ(equals_match_seq_ids[i], iter_exact_prefix_test._get_equals_iterator_id());
ASSERT_EQ(equals_iterator_valid[i], iter_exact_prefix_test._get_is_equals_iterator_valid());
if (expected[i] == 1) {
iter_exact_prefix_test.next();
}
ASSERT_EQ(seq_ids[i], iter_exact_prefix_test.seq_id);
}
ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test.validity);
delete filter_tree_root;
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name: S*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto iter_contains_prefix_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(iter_contains_prefix_test.init_status().ok());
ASSERT_FALSE(iter_contains_prefix_test._get_is_filter_result_initialized());
validate_ids = {0, 1, 2, 3, 4, 5};
seq_ids = {1, 2, 3, 4, 4, 4};
equals_match_seq_ids = {0, 1, 2, 3, 4, 4};
equals_iterator_valid = {true, true, true, true, true, false};
expected = {1, 1, 1, 1, 1, -1};
for (uint32_t i = 0; i < validate_ids.size(); i++) {
if (i < 5) {
ASSERT_EQ(filter_result_iterator_t::valid, iter_contains_prefix_test.validity);
} else {
ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test.validity);
}
ASSERT_EQ(expected[i], iter_contains_prefix_test.is_valid(validate_ids[i]));
ASSERT_EQ(equals_match_seq_ids[i], iter_contains_prefix_test._get_equals_iterator_id());
ASSERT_EQ(equals_iterator_valid[i], iter_contains_prefix_test._get_is_equals_iterator_valid());
if (expected[i] == 1) {
iter_contains_prefix_test.next();
}
ASSERT_EQ(seq_ids[i], iter_contains_prefix_test.seq_id);
}
ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test.validity);
delete filter_tree_root;
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name:= Steve R*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto computed_exact_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(computed_exact_prefix_test_2.init_status().ok());
ASSERT_TRUE(computed_exact_prefix_test_2._get_is_filter_result_initialized());
expected = {2, 4};
for (auto const& i : expected) {
ASSERT_EQ(filter_result_iterator_t::valid, computed_exact_prefix_test_2.validity);
ASSERT_EQ(i, computed_exact_prefix_test_2.seq_id);
computed_exact_prefix_test_2.next();
}
ASSERT_EQ(filter_result_iterator_t::invalid, computed_exact_prefix_test_2.validity);
delete filter_tree_root;
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name: Steve R*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto computed_contains_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(computed_contains_prefix_test_2.init_status().ok());
ASSERT_TRUE(computed_contains_prefix_test_2._get_is_filter_result_initialized());
expected = {2, 4};
for (auto const& i : expected) {
ASSERT_EQ(filter_result_iterator_t::valid, computed_contains_prefix_test_2.validity);
ASSERT_EQ(i, computed_contains_prefix_test_2.seq_id);
computed_contains_prefix_test_2.next();
}
ASSERT_EQ(filter_result_iterator_t::invalid, computed_contains_prefix_test_2.validity);
delete filter_tree_root;
documents = {
R"({
"name": "Steve Runner foo"
})"_json,
R"({
"name": "foo Steve Runner"
})"_json,
};
for (auto const &json: documents) {
auto add_op = collection_create_op.get()->add(json.dump());
ASSERT_TRUE(add_op.ok());
}
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name:= Steve R*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto iter_exact_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(iter_exact_prefix_test_2.init_status().ok());
ASSERT_FALSE(iter_exact_prefix_test_2._get_is_filter_result_initialized());
validate_ids = {0, 1, 2, 3, 4, 5, 6, 7};
seq_ids = {2, 2, 4, 4, 5, 5, 5, 5};
equals_match_seq_ids = {2, 2, 2, 4, 4, 5, 5, 5};
equals_iterator_valid = {true, true, true, true, true, true, false, false};
expected = {0, 0, 1, 0, 1, 1, -1, -1};
for (uint32_t i = 0; i < validate_ids.size(); i++) {
if (i < 6) {
ASSERT_EQ(filter_result_iterator_t::valid, iter_exact_prefix_test_2.validity);
} else {
ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test_2.validity);
}
ASSERT_EQ(expected[i], iter_exact_prefix_test_2.is_valid(validate_ids[i]));
ASSERT_EQ(equals_match_seq_ids[i], iter_exact_prefix_test_2._get_equals_iterator_id());
ASSERT_EQ(equals_iterator_valid[i], iter_exact_prefix_test_2._get_is_equals_iterator_valid());
if (expected[i] == 1) {
iter_exact_prefix_test_2.next();
}
ASSERT_EQ(seq_ids[i], iter_exact_prefix_test_2.seq_id);
}
ASSERT_EQ(filter_result_iterator_t::invalid, iter_exact_prefix_test_2.validity);
delete filter_tree_root;
filter_tree_root = nullptr;
filter_op = filter::parse_filter_query("name: Steve R*", coll->get_schema(), store, doc_id_prefix,
filter_tree_root);
ASSERT_TRUE(filter_op.ok());
auto iter_contains_prefix_test_2 = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root);
ASSERT_TRUE(iter_contains_prefix_test_2.init_status().ok());
ASSERT_FALSE(iter_contains_prefix_test_2._get_is_filter_result_initialized());
validate_ids = {0, 1, 2, 3, 4, 5, 6, 7};
seq_ids = {2, 2, 4, 4, 5, 6, 6, 6};
equals_match_seq_ids = {2, 2, 2, 4, 4, 5, 6, 6};
equals_iterator_valid = {true, true, true, true, true, true, true, false};
expected = {0, 0, 1, 0, 1, 1, 1, -1};
for (uint32_t i = 0; i < validate_ids.size(); i++) {
if (i < 7) {
ASSERT_EQ(filter_result_iterator_t::valid, iter_contains_prefix_test_2.validity);
} else {
ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test_2.validity);
}
ASSERT_EQ(expected[i], iter_contains_prefix_test_2.is_valid(validate_ids[i]));
ASSERT_EQ(equals_match_seq_ids[i], iter_contains_prefix_test_2._get_equals_iterator_id());
ASSERT_EQ(equals_iterator_valid[i], iter_contains_prefix_test_2._get_is_equals_iterator_valid());
if (expected[i] == 1) {
iter_contains_prefix_test_2.next();
}
ASSERT_EQ(seq_ids[i], iter_contains_prefix_test_2.seq_id);
}
ASSERT_EQ(filter_result_iterator_t::invalid, iter_contains_prefix_test_2.validity);
delete filter_tree_root;
}