diff --git a/include/filter_result_iterator.h b/include/filter_result_iterator.h index f5ae22a6..46ec11bc 100644 --- a/include/filter_result_iterator.h +++ b/include/filter_result_iterator.h @@ -120,6 +120,9 @@ private: /// Performs OR on the subtrees of operator. void or_filter_iterators(); + /// Advance all the token iterators that are at seq_id. + void advance_string_filter_token_iterators(); + /// Finds the next match for a filter on string field. void doc_matching_string_filter(bool field_is_array); diff --git a/include/id_list.h b/include/id_list.h index 3b0ef7ae..ad890119 100644 --- a/include/id_list.h +++ b/include/id_list.h @@ -126,6 +126,8 @@ public: uint32_t first_id(); + uint32_t last_id(); + block_t* block_of(uint32_t id); bool contains(uint32_t id); diff --git a/src/filter_result_iterator.cpp b/src/filter_result_iterator.cpp index f5bd8b5f..f7217c0a 100644 --- a/src/filter_result_iterator.cpp +++ b/src/filter_result_iterator.cpp @@ -265,6 +265,18 @@ void filter_result_iterator_t::or_filter_iterators() { is_valid = false; } +void filter_result_iterator_t::advance_string_filter_token_iterators() { + for (uint32_t i = 0; i < posting_list_iterators.size(); i++) { + auto& filter_value_tokens = posting_list_iterators[i]; + + if (filter_value_tokens[0].valid() && filter_value_tokens[0].id() == seq_id) { + for (auto& iter: filter_value_tokens) { + iter.next(); + } + } + } +} + void filter_result_iterator_t::doc_matching_string_filter(bool field_is_array) { // If none of the filter value iterators are valid, mark this node as invalid. bool one_is_valid = false; @@ -384,18 +396,38 @@ void filter_result_iterator_t::next() { field f = index->search_schema.at(a_filter.field_name); if (f.is_string()) { - // Advance all the filter values that are at doc. Then find the next doc. - for (uint32_t i = 0; i < posting_list_iterators.size(); i++) { - auto& filter_value_tokens = posting_list_iterators[i]; - - if (filter_value_tokens[0].valid() && filter_value_tokens[0].id() == seq_id) { - for (auto& iter: filter_value_tokens) { - iter.next(); - } + if (filter_node->filter_exp.apply_not_equals) { + if (++seq_id < result_index) { + return; } + + uint32_t previous_match; + do { + previous_match = seq_id; + advance_string_filter_token_iterators(); + doc_matching_string_filter(f.is_array()); + } while (is_valid && previous_match + 1 == seq_id); + + if (!is_valid) { + // We've reached the end of the index, no possible matches pending. + if (previous_match >= index->seq_ids->last_id()) { + return; + } + + is_valid = true; + result_index = index->seq_ids->last_id() + 1; + seq_id = previous_match + 1; + return; + } + + result_index = seq_id; + seq_id = previous_match + 1; + return; } + advance_string_filter_token_iterators(); doc_matching_string_filter(f.is_array()); + return; } } @@ -509,6 +541,47 @@ void filter_result_iterator_t::init() { } doc_matching_string_filter(f.is_array()); + + if (filter_node->filter_exp.apply_not_equals) { + // filter didn't match any id. So by applying not equals, every id in the index is a match. + if (!is_valid) { + is_valid = true; + seq_id = 0; + result_index = index->seq_ids->last_id() + 1; + return; + } + + // [0, seq_id) are a match for not equals. + if (seq_id > 0) { + result_index = seq_id; + seq_id = 0; + return; + } + + // Keep ignoring the consecutive matches. + uint32_t previous_match; + do { + previous_match = seq_id; + advance_string_filter_token_iterators(); + doc_matching_string_filter(f.is_array()); + } while (is_valid && previous_match + 1 == seq_id); + + if (!is_valid) { + // filter matched all the ids in the index. So for not equals, there's no match. + if (previous_match >= index->seq_ids->last_id()) { + return; + } + + is_valid = true; + result_index = index->seq_ids->last_id() + 1; + seq_id = previous_match + 1; + return; + } + + result_index = seq_id; + seq_id = previous_match + 1; + } + return; } } @@ -543,6 +616,10 @@ bool filter_result_iterator_t::valid() { field f = index->search_schema.at(a_filter.field_name); if (f.is_string()) { + if (filter_node->filter_exp.apply_not_equals) { + return seq_id < result_index; + } + bool one_is_valid = false; for (auto& filter_value_tokens: posting_list_iterators) { posting_list_t::intersect(filter_value_tokens, one_is_valid); @@ -618,6 +695,41 @@ void filter_result_iterator_t::skip_to(uint32_t id) { field f = index->search_schema.at(a_filter.field_name); if (f.is_string()) { + if (filter_node->filter_exp.apply_not_equals) { + if (id < seq_id) { + return; + } + + if (id < result_index) { + seq_id = id; + return; + } + + seq_id = result_index; + uint32_t previous_match; + do { + previous_match = seq_id; + advance_string_filter_token_iterators(); + doc_matching_string_filter(f.is_array()); + } while (is_valid && previous_match + 1 == seq_id && seq_id >= id); + + if (!is_valid) { + // filter matched all the ids in the index. So for not equals, there's no match. + if (previous_match >= index->seq_ids->last_id()) { + return; + } + + is_valid = true; + seq_id = previous_match + 1; + result_index = index->seq_ids->last_id() + 1; + return; + } + + result_index = seq_id; + seq_id = previous_match + 1; + return; + } + // Skip all the token iterators and find a new match. for (auto& filter_value_tokens : posting_list_iterators) { for (auto& token: filter_value_tokens) { @@ -670,23 +782,6 @@ int filter_result_iterator_t::valid(uint32_t id) { } } - if (filter_node->filter_exp.apply_not_equals) { - // Even when iterator becomes invalid, we keep it marked as valid since we are evaluating not equals. - if (!valid()) { - is_valid = true; - return 1; - } - - skip_to(id); - - if (!is_valid) { - is_valid = true; - return 1; - } - - return seq_id != id ? 1 : 0; - } - skip_to(id); return is_valid ? (seq_id == id ? 1 : 0) : -1; } diff --git a/src/id_list.cpp b/src/id_list.cpp index 4b308603..82712368 100644 --- a/src/id_list.cpp +++ b/src/id_list.cpp @@ -338,6 +338,14 @@ uint32_t id_list_t::first_id() { return root_block.ids.at(0); } +uint32_t id_list_t::last_id() { + if(id_block_map.empty()) { + return 0; + } + + return id_block_map.rbegin()->first; +} + id_list_t::block_t* id_list_t::block_of(uint32_t id) { const auto it = id_block_map.lower_bound(id); if(it == id_block_map.end()) { diff --git a/test/filter_test.cpp b/test/filter_test.cpp index d125e814..6cab88b5 100644 --- a/test/filter_test.cpp +++ b/test/filter_test.cpp @@ -162,23 +162,23 @@ TEST_F(FilterTest, FilterTreeIterator) { } ASSERT_FALSE(iter_exact_match_multi_test.valid()); -// delete filter_tree_root; -// filter_tree_root = nullptr; -// filter_op = filter::parse_filter_query("tags:!= gold", coll->get_schema(), store, doc_id_prefix, -// filter_tree_root); -// ASSERT_TRUE(filter_op.ok()); -// -// auto iter_not_equals_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root); -// ASSERT_TRUE(iter_not_equals_test.init_status().ok()); -// -// std::vector expected = {1, 3}; -// for (auto const& i : expected) { -// ASSERT_TRUE(iter_not_equals_test.valid()); -// ASSERT_EQ(i, iter_not_equals_test.seq_id); -// iter_not_equals_test.next(); -// } -// -// ASSERT_FALSE(iter_not_equals_test.valid()); + delete filter_tree_root; + filter_tree_root = nullptr; + filter_op = filter::parse_filter_query("tags:!= gold", coll->get_schema(), store, doc_id_prefix, + filter_tree_root); + ASSERT_TRUE(filter_op.ok()); + + auto iter_not_equals_test = filter_result_iterator_t(coll->get_name(), coll->_get_index(), filter_tree_root); + ASSERT_TRUE(iter_not_equals_test.init_status().ok()); + + expected = {1, 3}; + for (auto const& i : expected) { + ASSERT_TRUE(iter_not_equals_test.valid()); + ASSERT_EQ(i, iter_not_equals_test.seq_id); + iter_not_equals_test.next(); + } + + ASSERT_FALSE(iter_not_equals_test.valid()); delete filter_tree_root; filter_tree_root = nullptr; @@ -287,8 +287,8 @@ TEST_F(FilterTest, FilterTreeIterator) { filter_tree_root); ASSERT_TRUE(iter_validate_ids_not_equals_filter_test.init_status().ok()); - validate_ids = {0, 1, 2, 3, 4, 5, 6, 7, 100}; - expected = {0, 1, 0, 1, 0, 1, 1, 1, 1}; + validate_ids = {0, 1, 2, 3, 4, 5, 6}; + expected = {0, 1, 0, 1, 0, 1, -1}; for (uint32_t i = 0; i < validate_ids.size(); i++) { ASSERT_EQ(expected[i], iter_validate_ids_not_equals_filter_test.valid(validate_ids[i])); } @@ -422,7 +422,7 @@ TEST_F(FilterTest, FilterTreeIterator) { for (uint32_t i = 0; i < and_result_length; i++) { ASSERT_EQ(expected[i], and_result[i]); } - ASSERT_FALSE(iter_and_test.valid()); + ASSERT_FALSE(iter_and_scalar_test.valid()); delete and_result; delete filter_tree_root;