Support wildcard queries: a query of `*` skips matching against the search text and considers all records for filtering and sorting.

This commit is contained in:
Kishore Nallan 2018-05-11 21:13:11 +05:30
parent 95112a8086
commit 2d7e75caa5
6 changed files with 115 additions and 24 deletions

View File

@ -80,7 +80,7 @@ void post_create_collection(http_req & req, http_res & res) {
if(!req_json[DEFAULT_SORTING_FIELD].is_string()) {
return res.send_400(std::string("`") + DEFAULT_SORTING_FIELD +
"` should be a string. It should be the name of an unsigned integer field.");
"` should be a string. It should be the name of an int32/float field.");
}
if(collectionManager.get_collection(req_json["name"]) != nullptr) {

View File

@ -118,7 +118,7 @@ Option<uint32_t> Collection::validate_index_in_memory(const nlohmann::json &docu
}
if(!document[default_sorting_field].is_number_integer() && !document[default_sorting_field].is_number_float()) {
return Option<>(400, "Default sorting field `" + default_sorting_field + "` must be a number.");
return Option<>(400, "Default sorting field `" + default_sorting_field + "` must be of type int32 or float.");
}
if(document[default_sorting_field].is_number_integer() &&
@ -391,6 +391,15 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
filters.push_back(f);
}
// for a wildcard query, if filter is not specified, use default_sorting_field as a catch-all
if(query == "*" && filters.size() == 0) {
field f = search_schema.at(default_sorting_field);
std::string max_value = f.is_float() ? std::to_string(std::numeric_limits<float>::max()) :
std::to_string(std::numeric_limits<int32_t>::max());
filter catch_all_filter = {f.name, {max_value}, LESS_THAN_EQUALS};
filters.push_back(catch_all_filter);
}
// validate facet fields
for(const std::string & field_name: facet_fields) {
if(facet_schema.count(field_name) == 0) {
@ -553,7 +562,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
const std::string & field_name = search_fields[Index::FIELD_LIMIT_NUM - field_order_kv.field_id];
field search_field = search_schema.at(field_name);
if(search_field.type == field_types::STRING || search_field.type == field_types::STRING_ARRAY) {
if(query != "*" && (search_field.type == field_types::STRING || search_field.type == field_types::STRING_ARRAY)) {
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
for (const art_leaf *token_leaf : searched_queries[field_order_kv.query_index]) {
@ -640,13 +649,13 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
delete [] it->second;
it->second = nullptr;
}
prune_document(document, include_fields, exclude_fields);
wrapper_doc["document"] = document;
//wrapper_doc["match_score"] = field_order_kv.match_score;
//wrapper_doc["seq_id"] = (uint32_t) field_order_kv.key;
}
prune_document(document, include_fields, exclude_fields);
wrapper_doc["document"] = document;
//wrapper_doc["match_score"] = field_order_kv.match_score;
//wrapper_doc["seq_id"] = (uint32_t) field_order_kv.key;
result["hits"].push_back(wrapper_doc);
}

View File

@ -156,6 +156,12 @@ Option<Collection*> CollectionManager::create_collection(const std::string name,
field_val[fields::type] = field.type;
field_val[fields::facet] = field.facet;
fields_json.push_back(field_val);
if(field.name == default_sorting_field && !(field.type == field_types::INT32 ||
field.type == field_types::FLOAT)) {
return Option<Collection*>(400, "Default sorting field `" + default_sorting_field + "` must be of type int32 "
"or float.");
}
}
collection_meta[COLLECTION_NAME_KEY] = name;

View File

@ -622,15 +622,22 @@ void Index::search(Option<uint32_t> & outcome, std::string query, const std::vec
Topster<512> topster;
const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM);
for(size_t i = 0; i < num_search_fields; i++) {
const std::string & field = search_fields[i];
// proceed to query search only when no filters are provided or when filtering produces results
if(filters.size() == 0 || filter_ids_length > 0) {
uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i);
search_field(field_id, query, field, filter_ids, filter_ids_length, facets, sort_fields_std,
num_typos, num_results, searched_queries, topster, &all_result_ids, all_result_ids_len,
token_order, prefix, drop_tokens_threshold);
if(query == "*") {
uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
filter_ids, filter_ids_length);
all_result_ids_len = filter_ids_length;
} else {
const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM);
for(size_t i = 0; i < num_search_fields; i++) {
const std::string & field = search_fields[i];
// proceed to query search only when no filters are provided or when filtering produces results
if(filters.size() == 0 || filter_ids_length > 0) {
uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i);
search_field(field_id, query, field, filter_ids, filter_ids_length, facets, sort_fields_std,
num_typos, num_results, searched_queries, topster, &all_result_ids, all_result_ids_len,
token_order, prefix, drop_tokens_threshold);
}
}
}
@ -897,7 +904,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
uint64_t match_score = 0;
if(query_suggestion.size() == 1) {
if(query_suggestion.size() <= 1) {
match_score = single_token_match_score;
} else {
std::vector<std::vector<std::vector<uint16_t>>> array_token_positions;

View File

@ -768,7 +768,7 @@ TEST(ArtTest, test_encode_int32) {
}
}
TEST(ArtTest, test_int32_range_hundreds) {
TEST(ArtTest, test_int32_overlap) {
art_tree t;
art_tree_init(&t);
@ -776,7 +776,7 @@ TEST(ArtTest, test_int32_range_hundreds) {
const int CHAR_LEN = 8;
unsigned char chars[CHAR_LEN];
std::vector<const art_leaf*> results;
std::vector<const art_leaf *> results;
std::vector<std::vector<uint32_t>> values = {{2014, 2015, 2016}, {2015, 2016}, {2016},
{1981, 1985}, {1999, 2000, 2001, 2002}};
@ -793,7 +793,19 @@ TEST(ArtTest, test_int32_range_hundreds) {
ASSERT_TRUE(res == 0);
ASSERT_EQ(3, results.size());
return ;
res = art_tree_destroy(&t);
ASSERT_TRUE(res == 0);
}
TEST(ArtTest, test_int32_range_hundreds) {
art_tree t;
art_tree_init(&t);
art_document doc = get_document(1);
const int CHAR_LEN = 8;
unsigned char chars[CHAR_LEN];
std::vector<const art_leaf*> results;
for(uint32_t i = 100; i < 110; i++) {
encode_int32(i, chars);
@ -802,8 +814,7 @@ TEST(ArtTest, test_int32_range_hundreds) {
encode_int32(106, chars);
res = art_int32_search(&t, 106, EQUALS, results);
int res = art_int32_search(&t, 106, EQUALS, results);
ASSERT_TRUE(res == 0);
ASSERT_EQ(1, results.size());
results.clear();
@ -832,6 +843,35 @@ TEST(ArtTest, test_int32_range_hundreds) {
ASSERT_TRUE(res == 0);
}
// Inserts many documents whose int32 keys repeat heavily and verifies that a
// range search still accounts for every inserted document id.
TEST(ArtTest, test_int32_duplicates) {
    art_tree t;
    art_tree_init(&t);

    art_document doc = get_document(1);
    const int CHAR_LEN = 8;
    unsigned char chars[CHAR_LEN];

    // 10000 documents are spread over only 119 distinct values (1900..2018),
    // so most keys are inserted many times with different document ids.
    for(size_t i = 0; i < 10000; i++) {
        doc.id = i;
        int value = 1900 + (rand() % static_cast<int>(2018 - 1900 + 1));
        encode_int32(value, chars);
        art_insert(&t, (unsigned char*)chars, CHAR_LEN, &doc, 1);
    }

    // every inserted value is greater than 0, so the search should reach all of them
    std::vector<const art_leaf*> results;
    int res = art_int32_search(&t, 0, GREATER_THAN, results);
    ASSERT_TRUE(res == 0);

    // sum the number of ids attached to each returned leaf
    size_t counter = 0;
    for(const art_leaf* leaf: results) {
        counter += leaf->values->ids.getLength();
    }

    ASSERT_EQ(10000, counter);
    results.clear();

    // release the tree, as the other tests in this file do
    res = art_tree_destroy(&t);
    ASSERT_TRUE(res == 0);
}
TEST(ArtTest, test_int32_negative) {
art_tree t;
art_tree_init(&t);

View File

@ -417,6 +417,21 @@ TEST_F(CollectionTest, Pagination) {
}
}
// Exercises the "*" wildcard query, once with an explicit filter and once without any filter.
TEST_F(CollectionTest, WildcardQuery) {
    nlohmann::json results = collection->search("*", query_fields, "points:>0", {}, sort_fields, 0, 3, 1, FREQUENCY,
                                                false).get();
    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<uint32_t>());

    // when no filter is specified, fall back on default sorting field based catch-all filter
    Option<nlohmann::json> results_op = collection->search("*", query_fields, "", {}, sort_fields, 0, 3, 1, FREQUENCY,
                                                           false);
    ASSERT_TRUE(results_op.ok());

    // Assert on the response of the filter-less search itself; previously the stale
    // `results` of the first search was re-checked, leaving this path unverified.
    // NOTE(review): expects the catch-all to match the same 25 records as `points:>0` —
    // confirm against the fixture data.
    results = results_op.get();
    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<uint32_t>());
}
TEST_F(CollectionTest, PrefixSearching) {
std::vector<std::string> facets;
nlohmann::json results = collection->search("ex", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY, true).get();
@ -1537,6 +1552,20 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {
collectionManager.drop_collection("coll_array_fields");
}
// Creating a collection must fail when the default sorting field is not an int32 or float field.
TEST_F(CollectionTest, DefaultSortingFieldMustBeInt32OrFloat) {
    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("tags", field_types::STRING_ARRAY, true),
                                 field("age", field_types::INT32, false),
                                 field("average", field_types::INT32, false) };

    // "name" is declared as a string field above, so it is not a valid default sorting field
    Option<Collection*> collection_op = collectionManager.create_collection("sample_collection", fields, "name");

    EXPECT_FALSE(collection_op.ok());
    EXPECT_EQ("Default sorting field `name` must be of type int32 or float.", collection_op.error());

    // clean up in case the collection was created despite the expectations above
    collectionManager.drop_collection("sample_collection");
}
TEST_F(CollectionTest, IndexingWithBadData) {
// should not crash when document to-be-indexed doesn't match schema
Collection *sample_collection;
@ -1588,7 +1617,7 @@ TEST_F(CollectionTest, IndexingWithBadData) {
doc_str = "{\"name\": \"foo\", \"age\": \"34\", \"tags\": [], \"average\": 34 }";
const Option<nlohmann::json> & bad_default_sorting_field_op1 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_default_sorting_field_op1.ok());
ASSERT_STREQ("Default sorting field `age` must be a number.", bad_default_sorting_field_op1.error().c_str());
ASSERT_STREQ("Default sorting field `age` must be of type int32 or float.", bad_default_sorting_field_op1.error().c_str());
doc_str = "{\"name\": \"foo\", \"age\": 343234324234233234, \"tags\": [], \"average\": 34 }";
const Option<nlohmann::json> & bad_default_sorting_field_op2 = sample_collection->add(doc_str);