Handle curation + filtering for infix.

This commit is contained in:
Kishore Nallan 2022-02-26 14:43:39 +05:30
parent 31c8a030c7
commit b910786deb
3 changed files with 95 additions and 12 deletions

View File

@ -816,8 +816,10 @@ public:
const std::vector<std::string>& group_by_fields, const size_t max_extra_prefix,
const size_t max_extra_suffix, const infix_t& field_infix, const uint8_t field_id,
const string& field_name, const std::vector<token_t>& query_tokens, Topster* actual_topster,
size_t field_num_results, size_t& all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed, uint32_t*& all_result_ids) const;
const uint32_t *filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids_sorted,
size_t field_num_results, uint32_t*& all_result_ids, size_t& all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed) const;
void do_synonym_search(const std::vector<filter>& filters,
const std::map<size_t, std::map<size_t, uint32_t>>& included_ids_map,

View File

@ -2613,8 +2613,8 @@ void Index::search_fields(const std::vector<filter>& filters,
do_infix_search(sort_fields_std, searched_queries, group_limit, group_by_fields,
max_extra_prefix, max_extra_suffix, field_infix, field_id, field_name, query_tokens,
actual_topster, field_num_results, all_result_ids_len, groups_processed,
all_result_ids);
actual_topster, filter_ids, filter_ids_length,
curated_ids_sorted, field_num_results, all_result_ids, all_result_ids_len, groups_processed);
} else if(actual_filter_ids_length != 0) {
// indicates phrase match query
curate_filtered_ids(filters, curated_ids, exclude_token_ids,
@ -2787,8 +2787,11 @@ void Index::do_infix_search(const std::vector<sort_by>& sort_fields_std,
const std::vector<std::string>& group_by_fields, const size_t max_extra_prefix,
const size_t max_extra_suffix, const infix_t& field_infix, const uint8_t field_id,
const string& field_name, const std::vector<token_t>& query_tokens, Topster* actual_topster,
size_t field_num_results, size_t& all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed, uint32_t*& all_result_ids) const {
const uint32_t *filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids_sorted,
size_t field_num_results, uint32_t*& all_result_ids, size_t& all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed) const {
if(field_infix == always || (field_infix == fallback && field_num_results == 0)) {
std::vector<uint32_t> infix_ids;
search_infix(query_tokens[0].value, field_name, infix_ids, max_extra_prefix, max_extra_suffix);
@ -2800,21 +2803,49 @@ void Index::do_infix_search(const std::vector<sort_by>& sort_fields_std,
populate_sort_mapping(sort_order, geopoint_indices, sort_fields_std, field_values);
uint32_t token_bits = 255;
for(auto seq_id: infix_ids) {
std::sort(infix_ids.begin(), infix_ids.end());
infix_ids.erase(std::unique( infix_ids.begin(), infix_ids.end() ), infix_ids.end());
uint32_t *raw_infix_ids = nullptr;
size_t raw_infix_ids_length = 0;
if(curated_ids_sorted.size() != 0) {
raw_infix_ids_length = ArrayUtils::exclude_scalar(&infix_ids[0], infix_ids.size(), &curated_ids_sorted[0],
curated_ids_sorted.size(), &raw_infix_ids);
infix_ids.clear();
} else {
raw_infix_ids = &infix_ids[0];
raw_infix_ids_length = infix_ids.size();
}
if(filter_ids_length != 0) {
uint32_t *filtered_raw_infix_ids = nullptr;
raw_infix_ids_length = ArrayUtils::and_scalar(filter_ids, filter_ids_length, raw_infix_ids,
raw_infix_ids_length, &filtered_raw_infix_ids);
if(raw_infix_ids != &infix_ids[0]) {
delete [] raw_infix_ids;
}
raw_infix_ids = filtered_raw_infix_ids;
}
for(size_t i = 0; i < raw_infix_ids_length; i++) {
auto seq_id = raw_infix_ids[i];
score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, false, 2,
actual_topster, {}, groups_processed, seq_id, sort_order, field_values,
geopoint_indices, group_limit, group_by_fields, token_bits,
false, false, {});
}
std::sort(infix_ids.begin(), infix_ids.end());
infix_ids.erase(std::unique( infix_ids.begin(), infix_ids.end() ), infix_ids.end());
uint32_t* new_all_result_ids = nullptr;
all_result_ids_len = ArrayUtils::or_scalar(all_result_ids, all_result_ids_len, &infix_ids[0],
infix_ids.size(), &new_all_result_ids);
all_result_ids_len = ArrayUtils::or_scalar(all_result_ids, all_result_ids_len, raw_infix_ids,
raw_infix_ids_length, &new_all_result_ids);
delete[] all_result_ids;
all_result_ids = new_all_result_ids;
if(raw_infix_ids != &infix_ids[0]) {
delete [] raw_infix_ids;
}
}
}
}

View File

@ -112,6 +112,56 @@ TEST_F(CollectionInfixSearchTest, InfixBasics) {
collectionManager.drop_collection("coll1");
}
// Verifies that infix search results honour a filter expression, both on its
// own and when combined with an id passed through the curation argument.
TEST_F(CollectionInfixSearchTest, InfixWithFiltering) {
    std::vector<field> schema_fields = {field("title", field_types::STRING, false, false, true, "", -1, 1),
                                        field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, schema_fields, "points").get();

    // Builds a document carrying the three keys this test indexes.
    const auto make_doc = [](const std::string& id, const std::string& title, int points) {
        nlohmann::json doc;
        doc["id"] = id;
        doc["title"] = title;
        doc["points"] = points;
        return doc;
    };

    // Both titles contain the infix "37IN8"; only the points value tells them apart.
    ASSERT_TRUE(coll1->add(make_doc("0", "GH100037IN8900X", 100).dump()).ok());
    ASSERT_TRUE(coll1->add(make_doc("1", "XH100037IN8900X", 200).dump()).ok());

    // Infix mode `always` with a filter on points == 200: only document "1" qualifies.
    auto filtered_results = coll1->search("37IN8",
                                          {"title"}, "points: 200", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
                                          spp::sparse_hash_set<std::string>(),
                                          spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, "", "", {}, 0,
                                          "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, true,
                                          4, {always}).get();

    ASSERT_EQ(1, filtered_results["found"].get<size_t>());
    ASSERT_EQ(1, filtered_results["hits"].size());
    const std::string filtered_hit_id = filtered_results["hits"][0]["document"]["id"].get<std::string>();
    ASSERT_STREQ("1", filtered_hit_id.c_str());

    // filtering + exclusion via curation
    // A third matching document has points 300, so "points:>= 200" would admit ids "1" and "2".
    ASSERT_TRUE(coll1->add(make_doc("2", "RH100037IN8900X", 300).dump()).ok());

    // The "2" argument (presumably the hidden/excluded ids list -- confirm against
    // Collection::search) is expected to drop document "2", leaving only "1".
    auto curated_results = coll1->search("37IN8", {"title"}, "points:>= 200", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
                                         spp::sparse_hash_set<std::string>(),
                                         spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, "", "2", {}, 0,
                                         "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, true,
                                         4, {always}).get();

    ASSERT_EQ(1, curated_results["found"].get<size_t>());
    ASSERT_EQ(1, curated_results["hits"].size());
    const std::string curated_hit_id = curated_results["hits"][0]["document"]["id"].get<std::string>();
    ASSERT_STREQ("1", curated_hit_id.c_str());

    collectionManager.drop_collection("coll1");
}
TEST_F(CollectionInfixSearchTest, RespectPrefixAndSuffixLimits) {
std::vector<field> fields = {field("title", field_types::STRING, false, false, true, "", -1, 1),
field("points", field_types::INT32, false),};