Facet implementation.

This commit is contained in:
Kishore Nallan 2017-03-12 21:30:51 +05:30
parent 96921be016
commit 4776b41dc1
11 changed files with 239 additions and 85 deletions

View File

@ -19,8 +19,8 @@
- ~~Fix documents.jsonl path in tests~~
- ~~Multi field search tests~~
- ~~storage key prefix should include collection name~~
- Index and search on multi-valued field
- range search for art_int
- ~~Index and search on multi-valued field~~
- ~~range search for art_int~~
- Proper score field for ranking tokens
- Support nested fields via "."
- ~~Restore records as well on restart (like for meta)~~
@ -31,8 +31,8 @@
- ~~Assumption that all tokens match for scoring is no longer true~~
- Handle searching for non-existing fields gracefully
- Intersection without unpacking
- Filters
- Facets
- ~~Filters~~
- ~~Facets~~
- Iterator
- Highlight
- Support search operators like +, - etc.

View File

@ -248,7 +248,7 @@ int art_iter_prefix(art_tree *t, const unsigned char *prefix, int prefix_len, ar
int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost,
const int max_words, const token_ordering token_order, const bool prefix, std::vector<art_leaf *> &results);
static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
std::vector<art_leaf *> &results);
void encode_int32(int32_t n, unsigned char *chars);

View File

@ -44,12 +44,14 @@ private:
Option<uint32_t> do_filtering(uint32_t** filter_ids_out, const std::string & simple_filter_str);
void search(uint32_t* filter_ids, size_t filter_ids_length, std::string & query, const std::string & field,
const int num_typos, const size_t num_results, Topster<100> & topster, size_t & num_found,
const token_ordering token_order = FREQUENCY, const bool prefix = false);
void do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size);
void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, int & token_rank,
std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
void search(uint32_t *filter_ids, size_t filter_ids_length, std::vector<facet> &facets, std::string &query,
const std::string &field, const int num_typos, const size_t num_results, Topster<100> &topster,
size_t &num_found, const token_ordering token_order = FREQUENCY, const bool prefix = false);
void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
size_t & total_results, size_t & num_found, const size_t & max_results);
void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id) const;
@ -93,9 +95,9 @@ public:
std::string add(std::string json_str);
nlohmann::json search(std::string query, const std::vector<std::string> fields, const std::string & simple_filter_str,
const int num_typos, const size_t num_results, const token_ordering token_order = FREQUENCY,
const bool prefix = false);
nlohmann::json search(std::string query, const std::vector<std::string> fields, const std::string &simple_filter_query,
std::vector<facet> & facets, const int num_typos, const size_t num_results,
const token_ordering token_order = FREQUENCY, const bool prefix = false);
void remove(std::string id);

View File

@ -62,4 +62,13 @@ struct filter {
return Option<NUM_COMPARATOR>(400, "Numerical field has an invalid comparator.");
}
};
// Represents a single facet request and its aggregated result:
// `field_name` identifies the schema field to facet on (validated upstream to be
// a STRING or STRING_ARRAY field), while `result_map` accumulates
// facet-value -> document-count entries as matching results are processed.
struct facet {
    const std::string field_name;              // field being faceted on (immutable for the facet's lifetime)
    std::map<std::string, size_t> result_map;  // facet value -> number of matching documents

    // Take the name by const reference: the original signature passed
    // `const std::string` by value, which copied the string into the
    // parameter before copying it again into the member.
    facet(const std::string & field_name): field_name(field_name) {
    }
};

View File

@ -900,7 +900,7 @@ static uint32_t get_score(art_node* child) {
return child->max_token_count;
}
static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
std::vector<art_leaf *> &results) {
printf("INSIDE art_topk_iter: root->type: %d\n", root->type);

View File

@ -195,9 +195,40 @@ void Collection::index_int64_array_field(const std::vector<int64_t> & values, co
}
}
void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, int & token_rank,
std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
size_t & total_results, size_t & num_found, const size_t & max_results) {
// Computes facet counts for each requested facet over the given result set.
// For every facet field, walks that field's art index, pulls the top leaves,
// and intersects each leaf's document-id list with `result_ids`; the size of
// each intersection becomes the count for that facet value in `result_map`.
// NOTE(review): only the top 10 leaves (by MAX_SCORE) are fetched, so facet
// values outside that top-10 are silently omitted — confirm this is intended.
void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size) {
for(auto & a_facet: facets) {
// assumed that facet fields have already been validated upstream
const field & facet_field = schema.at(a_facet.field_name);
// loop through the field, get all keys and intersect those ids with result ids
if(index_map.count(facet_field.name) != 0) {
art_tree *t = index_map.at(facet_field.name);
std::vector<art_leaf *> leaves;
art_topk_iter(t->root, MAX_SCORE, 10, leaves);
for(const art_leaf* leaf: leaves) {
// assumes uncompress() returns a heap array owned by the caller — freed below; TODO confirm
const uint32_t* facet_ids = leaf->values->ids.uncompress();
size_t facet_ids_size = leaf->values->ids.getLength();
// intersection can never exceed the smaller of the two input lists
uint32_t* facet_results = new uint32_t[std::min(facet_ids_size, results_size)];
const size_t facet_results_size = Intersection::scalar(result_ids, results_size,
facet_ids, facet_ids_size, facet_results);
const std::string facet_value((const char *)leaf->key, leaf->key_len-1); // drop trailing null
// insert() leaves an existing entry untouched — leaf keys are presumably unique, so no aggregation is needed; verify
a_facet.result_map.insert(std::pair<std::string, size_t>(facet_value, facet_results_size));
delete [] facet_ids;
delete [] facet_results;
}
}
}
}
void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves,
Topster<100> & topster, size_t & total_results, size_t & num_found,
const size_t & max_results) {
const size_t combination_limit = 10;
auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product);
@ -227,6 +258,8 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
size_t filtered_results_size =
Intersection::scalar(filter_ids, filter_ids_length, result_ids, result_size, filtered_result_ids);
do_facets(facets, filtered_result_ids, filtered_results_size);
// go through each matching document id and calculate match score
score_results(topster, token_rank, query_suggestion, filtered_result_ids, filtered_results_size);
num_found += filtered_results_size;
@ -234,6 +267,8 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
delete[] filtered_result_ids;
delete[] result_ids;
} else {
do_facets(facets, result_ids, result_size);
score_results(topster, token_rank, query_suggestion, result_ids, result_size);
num_found += result_size;
delete[] result_ids;
@ -391,15 +426,28 @@ Option<uint32_t> Collection::do_filtering(uint32_t** filter_ids_out, const std::
}
nlohmann::json Collection::search(std::string query, const std::vector<std::string> fields,
const std::string & simple_filter_str,
const int num_typos, const size_t num_results,
const token_ordering token_order, const bool prefix) {
const std::string & simple_filter_query, std::vector<facet> & facets,
const int num_typos, const size_t num_results, const token_ordering token_order,
const bool prefix) {
size_t num_found = 0;
nlohmann::json result = nlohmann::json::object();
// validate facet fields
for(const facet & a_facet: facets) {
if(schema.count(a_facet.field_name) == 0) {
result["error"] = "Could not find a facet field named `" + a_facet.field_name + "` in the schema.";
return result;
}
field facet_field = schema.at(a_facet.field_name);
if(facet_field.type != field_types::STRING && facet_field.type != field_types::STRING_ARRAY) {
result["error"] = "Facet field `" + a_facet.field_name + "` should be a string or a string array.";
return result;
}
}
// process the filters first
uint32_t* filter_ids = nullptr;
Option<uint32_t> op_filter_ids_length = do_filtering(&filter_ids, simple_filter_str);
Option<uint32_t> op_filter_ids_length = do_filtering(&filter_ids, simple_filter_query);
if(!op_filter_ids_length.ok()) {
result["error"] = op_filter_ids_length.error();
return result;
@ -415,9 +463,9 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
Topster<100> topster;
const std::string & field = fields[i];
// proceed to query search only when no filters are provided or when filtering produces results
if(simple_filter_str.size() == 0 || filter_ids_length > 0) {
search(filter_ids, filter_ids_length, query, field, num_typos, num_results,
topster, num_found, token_order, prefix);
if(simple_filter_query.size() == 0 || filter_ids_length > 0) {
search(filter_ids, filter_ids_length, facets, query, field, num_typos, num_results, topster, num_found,
token_order, prefix);
topster.sort();
}
@ -449,6 +497,16 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
result["found"] = num_found;
result["facets"] = nlohmann::json::array();
// populate facets
for(const facet & a_facet: facets) {
nlohmann::json facet_result = nlohmann::json::object();
facet_result["field_name"] = a_facet.field_name;
facet_result["counts"] = a_facet.result_map;
result["facets"].push_back(facet_result);
}
long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
//!std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
//!store->print_memory_usage();
@ -464,9 +522,9 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
4. Intersect the lists to find docs that match each phrase
5. Sort the docs based on some ranking criteria
*/
void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::string & query,
const std::string & field, const int num_typos, const size_t num_results,
Topster<100> & topster, size_t & num_found, const token_ordering token_order, const bool prefix) {
void Collection::search(uint32_t *filter_ids, size_t filter_ids_length, std::vector<facet> &facets, std::string &query,
const std::string &field, const int num_typos, const size_t num_results, Topster<100> &topster,
size_t &num_found, const token_ordering token_order, const bool prefix) {
std::vector<std::string> tokens;
StringUtils::tokenize(query, tokens, " ", true);
@ -527,8 +585,14 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str
leaves = token_cost_cache[token_cost_hash];
} else {
int token_len = prefix ? (int) token.length() : (int) token.length() + 1;
if(token_rank == 2) {
std::cout << "\n";
}
art_fuzzy_search(index_map.at(field), (const unsigned char *) token.c_str(), token_len,
costs[token_index], costs[token_index], 3, token_order, prefix, leaves);
if(!leaves.empty()) {
token_cost_cache.emplace(token_cost_hash, leaves);
}
@ -562,9 +626,8 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str
}
if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) {
// If a) all tokens were found, or b) Some were skipped because they don't exist within max_cost,
// go ahead and search for candidates with what we have so far
search_candidates(filter_ids, filter_ids_length, token_rank, token_leaves, topster,
// If all tokens were found, go ahead and search for candidates with what we have so far
search_candidates(filter_ids, filter_ids_length, facets, token_rank, token_leaves, topster,
total_results, num_found, max_results);
if (total_results >= max_results) {
@ -598,7 +661,7 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str
}
}
return search(filter_ids, filter_ids_length, truncated_query, field, num_typos, num_results, topster,
return search(filter_ids, filter_ids_length, facets, truncated_query, field, num_typos, num_results, topster,
num_found, token_order, prefix);
}
}
@ -614,7 +677,7 @@ void Collection::log_leaves(const int cost, const std::string &token, const std:
}
}
void Collection::score_results(Topster<100> &topster, const int & token_rank,
void Collection::score_results(Topster<100> & topster, const int & token_rank,
const std::vector<art_leaf *> &query_suggestion, const uint32_t *result_ids,
const size_t result_size) const {

View File

@ -48,7 +48,7 @@ int main(int argc, char* argv[]) {
while(counter < 3000) {
auto i = counter % 5;
auto results = collection->search(queries[i], search_fields, {}, 1, 100);
auto results = collection->search(queries[i], search_fields, {}, { }, 1, 100, MAX_SCORE, 0);
results_total += results.size();
counter++;
}

View File

@ -71,7 +71,7 @@ int main(int argc, char* argv[]) {
auto begin = std::chrono::high_resolution_clock::now();
std::vector<std::string> search_fields = {"title"};
collection->search("the", search_fields, {}, 1, 100);
collection->search("the", search_fields, {}, { }, 1, 100, MAX_SCORE, 0);
long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
cout << "Time taken: " << timeMillis << "us" << endl;
return 0;

View File

@ -99,8 +99,8 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) {
std::vector<std::string> search_fields = {"title"};
nlohmann::json result = collection->search(query_map["q"], search_fields, filter_str, std::stoi(query_map[NUM_TYPOS]),
100, token_order, false);
nlohmann::json result = collection->search(query_map["q"], search_fields, filter_str, { },
std::stoi(query_map[NUM_TYPOS]), 100, token_order, false);
std::string json_str = result.dump();
//std::cout << "JSON:" << json_str << std::endl;
struct rusage r_usage;

View File

@ -47,7 +47,9 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
infile.close();
std::vector<std::string> search_fields = {"starring", "title"};
nlohmann::json results = collection1->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
std::vector<facet> facets;
nlohmann::json results = collection1->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
spp::sparse_hash_map<std::string, field> schema = collection1->get_schema();
@ -64,7 +66,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
ASSERT_EQ(rank_fields, collection1->get_rank_fields());
ASSERT_EQ(schema.size(), collection1->get_schema().size());
results = collection1->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
results = collection1->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
}

View File

@ -54,7 +54,8 @@ protected:
};
TEST_F(CollectionTest, ExactSearchShouldBeStable) {
nlohmann::json results = collection->search("the", search_fields, {}, 0, 10);
std::vector<facet> facets;
nlohmann::json results = collection->search("the", search_fields, "", facets, 0, 10);
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(7, results["found"].get<int>());
@ -70,7 +71,8 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
}
TEST_F(CollectionTest, ExactPhraseSearch) {
nlohmann::json results = collection->search("rocket launch", search_fields, {}, 0, 10);
std::vector<facet> facets;
nlohmann::json results = collection->search("rocket launch", search_fields, "", facets, 0, 10);
ASSERT_EQ(5, results["hits"].size());
/*
@ -92,7 +94,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
}
// Check pagination
results = collection->search("rocket launch", search_fields, {}, 0, 3);
results = collection->search("rocket launch", search_fields, "", facets, 0, 3);
ASSERT_EQ(3, results["hits"].size());
for(size_t i = 0; i < 3; i++) {
nlohmann::json result = results["hits"].at(i);
@ -104,7 +106,8 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
// Tokens that are not found in the index should be skipped
nlohmann::json results = collection->search("DoesNotExist from", search_fields, {}, 0, 10);
std::vector<facet> facets;
nlohmann::json results = collection->search("DoesNotExist from", search_fields, "", facets, 0, 10);
ASSERT_EQ(2, results["hits"].size());
std::vector<std::string> ids = {"2", "17"};
@ -117,7 +120,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
}
// with non-zero cost
results = collection->search("DoesNotExist from", search_fields, {}, 1, 10);
results = collection->search("DoesNotExist from", search_fields, "", facets, 1, 10);
ASSERT_EQ(2, results["hits"].size());
for(size_t i = 0; i < results["hits"].size(); i++) {
@ -128,7 +131,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
}
// with 2 indexed words
results = collection->search("from DoesNotExist insTruments", search_fields, {}, 1, 10);
results = collection->search("from DoesNotExist insTruments", search_fields, "", facets, 1, 10);
ASSERT_EQ(2, results["hits"].size());
ids = {"2", "17"};
@ -140,16 +143,17 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
}
results.clear();
results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 0, 10);
results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, "", facets, 0, 10);
ASSERT_EQ(0, results["hits"].size());
results.clear();
results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 2, 10);
results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, "", facets, 2, 10);
ASSERT_EQ(0, results["hits"].size());
}
TEST_F(CollectionTest, PartialPhraseSearch) {
nlohmann::json results = collection->search("rocket research", search_fields, {}, 0, 10);
std::vector<facet> facets;
nlohmann::json results = collection->search("rocket research", search_fields, "", facets, 0, 10);
ASSERT_EQ(4, results["hits"].size());
std::vector<std::string> ids = {"1", "8", "16", "17"};
@ -163,7 +167,8 @@ TEST_F(CollectionTest, PartialPhraseSearch) {
}
TEST_F(CollectionTest, QueryWithTypo) {
nlohmann::json results = collection->search("kind biologcal", search_fields, {}, 2, 3);
std::vector<facet> facets;
nlohmann::json results = collection->search("kind biologcal", search_fields, "", facets, 2, 3);
ASSERT_EQ(3, results["hits"].size());
std::vector<std::string> ids = {"19", "20", "21"};
@ -176,7 +181,7 @@ TEST_F(CollectionTest, QueryWithTypo) {
}
results.clear();
results = collection->search("fer thx", search_fields, {}, 1, 3);
results = collection->search("fer thx", search_fields, "", facets, 1, 3);
ids = {"1", "10", "13"};
ASSERT_EQ(3, results["hits"].size());
@ -190,7 +195,8 @@ TEST_F(CollectionTest, QueryWithTypo) {
}
TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
nlohmann::json results = collection->search("loox", search_fields, {}, 1, 2, MAX_SCORE, false);
std::vector<facet> facets;
nlohmann::json results = collection->search("loox", search_fields, "", facets, 1, 2, MAX_SCORE, false);
ASSERT_EQ(2, results["hits"].size());
std::vector<std::string> ids = {"22", "23"};
@ -201,7 +207,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = collection->search("loox", search_fields, {}, 1, 3, FREQUENCY, false);
results = collection->search("loox", search_fields, "", facets, 1, 3, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"3", "12", "24"};
@ -213,19 +219,19 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
}
// Check pagination
results = collection->search("loox", search_fields, {}, 1, 1, FREQUENCY, false);
results = collection->search("loox", search_fields, "", facets, 1, 1, FREQUENCY, false);
ASSERT_EQ(3, results["found"].get<int>());
ASSERT_EQ(1, results["hits"].size());
std::string solo_id = results["hits"].at(0)["id"];
ASSERT_STREQ("3", solo_id.c_str());
results = collection->search("loox", search_fields, {}, 1, 2, FREQUENCY, false);
results = collection->search("loox", search_fields, "", facets, 1, 2, FREQUENCY, false);
ASSERT_EQ(3, results["found"].get<int>());
ASSERT_EQ(2, results["hits"].size());
// Check total ordering
results = collection->search("loox", search_fields, {}, 1, 10, FREQUENCY, false);
results = collection->search("loox", search_fields, "", facets, 1, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
ids = {"3", "12", "24", "22", "23"};
@ -236,7 +242,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = collection->search("loox", search_fields, {}, 1, 10, MAX_SCORE, false);
results = collection->search("loox", search_fields, "", facets, 1, 10, MAX_SCORE, false);
ASSERT_EQ(5, results["hits"].size());
ids = {"22", "23", "3", "12", "24"};
@ -250,7 +256,8 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
TEST_F(CollectionTest, TextContainingAnActualTypo) {
// A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
nlohmann::json results = collection->search("ISX what", search_fields, {}, 1, 4, FREQUENCY, false);
std::vector<facet> facets;
nlohmann::json results = collection->search("ISX what", search_fields, "", facets, 1, 4, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
std::vector<std::string> ids = {"19", "6", "21", "8"};
@ -263,7 +270,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
}
// Record containing exact token match should appear first
results = collection->search("ISX", search_fields, {}, 1, 10, FREQUENCY, false);
results = collection->search("ISX", search_fields, "", facets, 1, 10, FREQUENCY, false);
ASSERT_EQ(8, results["hits"].size());
ids = {"20", "19", "6", "3", "21", "4", "10", "8"};
@ -277,7 +284,8 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
}
TEST_F(CollectionTest, PrefixSearching) {
nlohmann::json results = collection->search("ex", search_fields, {}, 0, 10, FREQUENCY, true);
std::vector<facet> facets;
nlohmann::json results = collection->search("ex", search_fields, "", facets, 0, 10, FREQUENCY, true);
ASSERT_EQ(2, results["hits"].size());
std::vector<std::string> ids = {"12", "6"};
@ -288,7 +296,7 @@ TEST_F(CollectionTest, PrefixSearching) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = collection->search("ex", search_fields, {}, 0, 10, MAX_SCORE, true);
results = collection->search("ex", search_fields, "", facets, 0, 10, MAX_SCORE, true);
ASSERT_EQ(2, results["hits"].size());
ids = {"6", "12"};
@ -322,7 +330,8 @@ TEST_F(CollectionTest, MultipleFields) {
infile.close();
search_fields = {"title", "starring"};
nlohmann::json results = coll_mul_fields->search("Will", search_fields, {}, 0, 10, FREQUENCY, false);
std::vector<facet> facets;
nlohmann::json results = coll_mul_fields->search("Will", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
std::vector<std::string> ids = {"3", "2", "1", "0"};
@ -337,7 +346,7 @@ TEST_F(CollectionTest, MultipleFields) {
// when "starring" takes higher priority than "title"
search_fields = {"starring", "title"};
results = coll_mul_fields->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"15", "14", "12", "13"};
@ -350,11 +359,11 @@ TEST_F(CollectionTest, MultipleFields) {
}
search_fields = {"starring", "title", "cast"};
results = coll_mul_fields->search("ben affleck", search_fields, {}, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("ben affleck", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
search_fields = {"cast"};
results = coll_mul_fields->search("chris", search_fields, {}, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("chris", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"6", "1", "7"};
@ -366,7 +375,7 @@ TEST_F(CollectionTest, MultipleFields) {
}
search_fields = {"cast"};
results = coll_mul_fields->search("chris pine", search_fields, {}, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("chris pine", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"7", "6", "1"};
@ -402,7 +411,8 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
// Plain search with no filters - results should be sorted by rank fields
search_fields = {"name"};
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", 0, 10, FREQUENCY, false);
std::vector<facet> facets;
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
std::vector<std::string> ids = {"3", "1", "4", "0", "2"};
@ -415,7 +425,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// Searching on an int32 field
results = coll_array_fields->search("Jeremy", search_fields, "age:>24", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age:>24", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"3", "1", "4"};
@ -427,14 +437,14 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = coll_array_fields->search("Jeremy", search_fields, "age:>=24", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age:>=24", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
results = coll_array_fields->search("Jeremy", search_fields, "age:24", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age:24", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
// Searching a number against an int32 array field
results = coll_array_fields->search("Jeremy", search_fields, "years:>2002", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "years:>2002", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"1", "0", "2"};
@ -445,7 +455,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = coll_array_fields->search("Jeremy", search_fields, "years:<1989", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "years:<1989", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
ids = {"3"};
@ -457,7 +467,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// multiple filters
results = coll_array_fields->search("Jeremy", search_fields, "years:<2005 && years:>1987", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "years:<2005 && years:>1987", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
ids = {"4"};
@ -469,7 +479,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// multiple search values (works like SQL's IN operator) against a single int field
results = coll_array_fields->search("Jeremy", search_fields, "age:[21, 24, 63]", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age:[21, 24, 63]", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"3", "0", "2"};
@ -481,7 +491,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// multiple search values against an int32 array field - also use extra padding between symbols
results = coll_array_fields->search("Jeremy", search_fields, "years : [ 2015, 1985 , 1999]", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "years : [ 2015, 1985 , 1999]", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"3", "1", "4", "0"};
@ -493,7 +503,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// searching on an int64 array field - also ensure that padded space causes no issues
results = coll_array_fields->search("Jeremy", search_fields, "timestamps : > 475205222", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "timestamps : > 475205222", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"1", "4", "0", "2"};
@ -506,7 +516,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// when filters don't match any record, no results should be returned
results = coll_array_fields->search("Jeremy", search_fields, "timestamps:<1", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "timestamps:<1", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
collectionManager.drop_collection("coll_array_fields");
@ -535,7 +545,8 @@ TEST_F(CollectionTest, FilterOnTextFields) {
infile.close();
search_fields = {"name"};
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tags: gold", 0, 10, FREQUENCY, false);
std::vector<facet> facets;
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tags: gold", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
std::vector<std::string> ids = {"1", "4", "0", "2"};
@ -547,7 +558,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = coll_array_fields->search("Jeremy", search_fields, "tags : bronze", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "tags : bronze", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(2, results["hits"].size());
ids = {"4", "2"};
@ -560,7 +571,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
}
// search with a list of tags, also testing extra padding of space
results = coll_array_fields->search("Jeremy", search_fields, "tags: [bronze, silver]", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "tags: [bronze, silver]", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"3", "4", "0", "2"};
@ -573,7 +584,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
}
// should be exact matches (no normalization or fuzzy searching should happen)
results = coll_array_fields->search("Jeremy", search_fields, "tags: BRONZE", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "tags: BRONZE", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
collectionManager.drop_collection("coll_array_fields");
@ -604,28 +615,95 @@ TEST_F(CollectionTest, HandleBadlyFormedFilterQuery) {
infile.close();
search_fields = {"name"};
std::vector<facet> facets;
// when filter field does not exist in the schema
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tagzz: gold", 0, 10, FREQUENCY, false);
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tagzz: gold", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// searching using a string for a numeric field
results = coll_array_fields->search("Jeremy", search_fields, "age: abcdef", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age: abcdef", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// searching using a string for a numeric array field
results = coll_array_fields->search("Jeremy", search_fields, "timestamps: abcdef", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "timestamps: abcdef", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// malformed k:v syntax
results = coll_array_fields->search("Jeremy", search_fields, "timestamps abcdef", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "timestamps abcdef", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// just empty spaces
results = coll_array_fields->search("Jeremy", search_fields, " ", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, " ", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// wrapping number with quotes
results = coll_array_fields->search("Jeremy", search_fields, "age: '21'", 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", search_fields, "age: '21'", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
}
// Verifies facet counting end-to-end: indexes numeric_array_documents.jsonl,
// then checks facet value counts (a) with a single string-array facet and no
// filter, (b) with two facets including a tokenized string field, and
// (c) with a facet combined with a numeric filter that narrows the result set.
TEST_F(CollectionTest, FacetCounts) {
Collection *coll_array_fields;
std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl");
std::vector<field> fields = {field("name", field_types::STRING), field("age", field_types::INT32),
field("years", field_types::INT32_ARRAY),
field("timestamps", field_types::INT64_ARRAY),
field("tags", field_types::STRING_ARRAY)};
std::vector<std::string> rank_fields = {"age"};
// reuse the collection if an earlier test already created it
coll_array_fields = collectionManager.get_collection("coll_array_fields");
if(coll_array_fields == nullptr) {
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, rank_fields);
}
// index every line of the fixture file as one document
std::string json_line;
while (std::getline(infile, json_line)) {
coll_array_fields->add(json_line);
}
infile.close();
search_fields = {"name"};
std::vector<facet> facets = {facet("tags")};
// single facet with no filters
nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facets"].size());
// each facet object carries exactly two keys: field_name and counts
ASSERT_EQ(2, results["facets"][0].size());
ASSERT_EQ("tags", results["facets"][0]["field_name"]);
ASSERT_EQ(4, (int) results["facets"][0]["counts"]["gold"]);
ASSERT_EQ(3, (int) results["facets"][0]["counts"]["silver"]);
ASSERT_EQ(2, (int) results["facets"][0]["counts"]["bronze"]);
// two facets (tags + the plain-string "name" field), still with no filter query
facets.clear();
facets.push_back(facet("tags"));
facets.push_back(facet("name"));
results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(2, results["facets"].size());
ASSERT_EQ("tags", results["facets"][0]["field_name"]);
ASSERT_EQ("name", results["facets"][1]["field_name"]);
// text is tokenized and standardized
ASSERT_EQ(5, (int) results["facets"][1]["counts"]["howard"]);
ASSERT_EQ(5, (int) results["facets"][1]["counts"]["jeremy"]);
// facet with filters
facets.clear();
facets.push_back(facet("tags"));
results = coll_array_fields->search("Jeremy", search_fields, "age: >24", facets, 0, 10, FREQUENCY, false);
// filter narrows hits from 5 to 3, and facet counts shrink accordingly
ASSERT_EQ(3, results["hits"].size());
ASSERT_EQ(1, results["facets"].size());
ASSERT_EQ("tags", results["facets"][0]["field_name"]);
ASSERT_EQ(2, (int) results["facets"][0]["counts"]["gold"]);
ASSERT_EQ(2, (int) results["facets"][0]["counts"]["silver"]);
ASSERT_EQ(1, (int) results["facets"][0]["counts"]["bronze"]);
}