Mirror of https://github.com/typesense/typesense.git
Facet implementation.

parent 96921be016
commit 4776b41dc1
TODO.md (8 changed lines)
@@ -19,8 +19,8 @@
 - ~~Fix documents.jsonl path in tests~~
 - ~~Multi field search tests~~
 - ~~storage key prefix should include collection name~~
-- Index and search on multi-valued field
-- range search for art_int
+- ~~Index and search on multi-valued field~~
+- ~~range search for art_int~~
 - Proper score field for ranking tokens
 - Support nested fields via "."
 - ~~Restore records as well on restart (like for meta)~~
@@ -31,8 +31,8 @@
 - ~~Assumption that all tokens match for scoring is no longer true~~
 - Handle searching for non-existing fields gracefully
 - Intersection without unpacking
-- Filters
-- Facets
+- ~~Filters~~
+- ~~Facets~~
 - Iterator
 - Highlight
 - Support search operators like +, - etc.
@@ -248,7 +248,7 @@ int art_iter_prefix(art_tree *t, const unsigned char *prefix, int prefix_len, ar
 int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost,
                      const int max_words, const token_ordering token_order, const bool prefix, std::vector<art_leaf *> &results);

-static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
+int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
                  std::vector<art_leaf *> &results);

 void encode_int32(int32_t n, unsigned char *chars);
@@ -44,12 +44,14 @@ private:

     Option<uint32_t> do_filtering(uint32_t** filter_ids_out, const std::string & simple_filter_str);

-    void search(uint32_t* filter_ids, size_t filter_ids_length, std::string & query, const std::string & field,
-                const int num_typos, const size_t num_results, Topster<100> & topster, size_t & num_found,
-                const token_ordering token_order = FREQUENCY, const bool prefix = false);
+    void do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size);

-    void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, int & token_rank,
-                           std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
+    void search(uint32_t *filter_ids, size_t filter_ids_length, std::vector<facet> &facets, std::string &query,
+                const std::string &field, const int num_typos, const size_t num_results, Topster<100> &topster,
+                size_t &num_found, const token_ordering token_order = FREQUENCY, const bool prefix = false);
+
+    void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
+                           int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
                            size_t & total_results, size_t & num_found, const size_t & max_results);

     void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id) const;
@@ -93,9 +95,9 @@ public:

     std::string add(std::string json_str);

-    nlohmann::json search(std::string query, const std::vector<std::string> fields, const std::string & simple_filter_str,
-                          const int num_typos, const size_t num_results, const token_ordering token_order = FREQUENCY,
-                          const bool prefix = false);
+    nlohmann::json search(std::string query, const std::vector<std::string> fields, const std::string &simple_filter_query,
+                          std::vector<facet> & facets, const int num_typos, const size_t num_results,
+                          const token_ordering token_order = FREQUENCY, const bool prefix = false);

     void remove(std::string id);
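For orientation, a minimal caller sketch against the new public signature; the collection pointer, query and facet field here are illustrative, borrowed from the tests later in this commit:

    std::vector<std::string> search_fields = {"title"};
    std::vector<facet> facets = {facet("tags")};  // request value counts for the `tags` field

    // empty filter string, up to 1 typo, top 10 results, rank by frequency, no prefix matching
    nlohmann::json result = collection->search("rocket", search_fields, "", facets,
                                               1, 10, FREQUENCY, false);
    // result["facets"] now holds one entry per requested facet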
@@ -62,4 +62,13 @@ struct filter {

         return Option<NUM_COMPARATOR>(400, "Numerical field has an invalid comparator.");
     }
 };
+
+struct facet {
+    const std::string field_name;
+    std::map<std::string, size_t> result_map;
+
+    facet(const std::string field_name): field_name(field_name) {
+
+    }
+};
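A facet is just a field name plus a value-to-count map that Collection::do_facets() fills in. A minimal sketch of its lifecycle (the counts are illustrative):

    facet tags_facet("tags");            // field_name is const: a facet stays bound to one field
    tags_facet.result_map["gold"] = 4;   // do_facets() inserts one entry per distinct field value
    tags_facet.result_map["silver"] = 3;

Since result_map is a std::map, entries come out ordered by facet value, not by count.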
@@ -900,7 +900,7 @@ static uint32_t get_score(art_node* child) {
     return child->max_token_count;
 }

-static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
+int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results,
                  std::vector<art_leaf *> &results) {
     printf("INSIDE art_topk_iter: root->type: %d\n", root->type);
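(The static qualifier is dropped here, matching the art.h change above, evidently so that Collection::do_facets in collection.cpp can call art_topk_iter to enumerate a field's keys.)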
@@ -195,9 +195,40 @@ void Collection::index_int64_array_field(const std::vector<int64_t> & values, co
     }
 }

-void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, int & token_rank,
-                                   std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
-                                   size_t & total_results, size_t & num_found, const size_t & max_results) {
+void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size) {
+    for(auto & a_facet: facets) {
+        // assumed that facet fields have already been validated upstream
+        const field & facet_field = schema.at(a_facet.field_name);
+
+        // loop through the field, get all keys and intersect those ids with result ids
+        if(index_map.count(facet_field.name) != 0) {
+            art_tree *t = index_map.at(facet_field.name);
+            std::vector<art_leaf *> leaves;
+
+            art_topk_iter(t->root, MAX_SCORE, 10, leaves);
+
+            for(const art_leaf* leaf: leaves) {
+                const uint32_t* facet_ids = leaf->values->ids.uncompress();
+                size_t facet_ids_size = leaf->values->ids.getLength();
+
+                uint32_t* facet_results = new uint32_t[std::min(facet_ids_size, results_size)];
+                const size_t facet_results_size = Intersection::scalar(result_ids, results_size,
+                                                                       facet_ids, facet_ids_size, facet_results);
+
+                const std::string facet_value((const char *)leaf->key, leaf->key_len-1); // drop trailing null
+                a_facet.result_map.insert(std::pair<std::string, size_t>(facet_value, facet_results_size));
+
+                delete [] facet_ids;
+                delete [] facet_results;
+            }
+        }
+    }
+}
+
+void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
+                                   int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves,
+                                   Topster<100> & topster, size_t & total_results, size_t & num_found,
+                                   const size_t & max_results) {
     const size_t combination_limit = 10;
     auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
     long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product);
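The core of do_facets() above: each distinct value of a facet field is an ART leaf holding a sorted list of document ids, and the count for that value is the size of the intersection of that list with the (sorted) result ids. A self-contained sketch of the same idea, with std::set_intersection standing in for the library's Intersection::scalar:

    #include <algorithm>
    #include <cstdint>
    #include <iterator>
    #include <map>
    #include <string>
    #include <vector>

    // For each facet value, count how many ids in the result set carry it.
    // Both the result ids and each value's id list must be sorted ascending.
    std::map<std::string, size_t> count_facet_values(
            const std::vector<uint32_t> & result_ids,
            const std::map<std::string, std::vector<uint32_t>> & ids_by_value) {
        std::map<std::string, size_t> counts;
        for(const auto & entry : ids_by_value) {
            std::vector<uint32_t> common;
            std::set_intersection(result_ids.begin(), result_ids.end(),
                                  entry.second.begin(), entry.second.end(),
                                  std::back_inserter(common));
            counts[entry.first] = common.size();
        }
        return counts;
    }

Note that do_facets() asks art_topk_iter for only the top 10 leaves per field, so a facet's counts cover at most 10 distinct values.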
@@ -227,6 +258,8 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
             size_t filtered_results_size =
                     Intersection::scalar(filter_ids, filter_ids_length, result_ids, result_size, filtered_result_ids);

+            do_facets(facets, filtered_result_ids, filtered_results_size);
+
             // go through each matching document id and calculate match score
             score_results(topster, token_rank, query_suggestion, filtered_result_ids, filtered_results_size);
             num_found += filtered_results_size;
@@ -234,6 +267,8 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
             delete[] filtered_result_ids;
             delete[] result_ids;
         } else {
+            do_facets(facets, result_ids, result_size);
+
             score_results(topster, token_rank, query_suggestion, result_ids, result_size);
             num_found += result_size;
             delete[] result_ids;
@@ -391,15 +426,28 @@ Option<uint32_t> Collection::do_filtering(uint32_t** filter_ids_out, const std::
 }

 nlohmann::json Collection::search(std::string query, const std::vector<std::string> fields,
-                                  const std::string & simple_filter_str,
-                                  const int num_typos, const size_t num_results,
-                                  const token_ordering token_order, const bool prefix) {
+                                  const std::string & simple_filter_query, std::vector<facet> & facets,
+                                  const int num_typos, const size_t num_results, const token_ordering token_order,
+                                  const bool prefix) {
     size_t num_found = 0;
     nlohmann::json result = nlohmann::json::object();

+    // validate facet fields
+    for(const facet & a_facet: facets) {
+        if(schema.count(a_facet.field_name) == 0) {
+            result["error"] = "Could not find a facet field named `" + a_facet.field_name + "` in the schema.";
+            return result;
+        }
+        field facet_field = schema.at(a_facet.field_name);
+        if(facet_field.type != field_types::STRING && facet_field.type != field_types::STRING_ARRAY) {
+            result["error"] = "Facet field `" + a_facet.field_name + "` should be a string or a string array.";
+            return result;
+        }
+    }
+
     // process the filters first
     uint32_t* filter_ids = nullptr;
-    Option<uint32_t> op_filter_ids_length = do_filtering(&filter_ids, simple_filter_str);
+    Option<uint32_t> op_filter_ids_length = do_filtering(&filter_ids, simple_filter_query);
     if(!op_filter_ids_length.ok()) {
         result["error"] = op_filter_ids_length.error();
         return result;
@@ -415,9 +463,9 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
         Topster<100> topster;
         const std::string & field = fields[i];
         // proceed to query search only when no filters are provided or when filtering produces results
-        if(simple_filter_str.size() == 0 || filter_ids_length > 0) {
-            search(filter_ids, filter_ids_length, query, field, num_typos, num_results,
-                   topster, num_found, token_order, prefix);
+        if(simple_filter_query.size() == 0 || filter_ids_length > 0) {
+            search(filter_ids, filter_ids_length, facets, query, field, num_typos, num_results, topster, num_found,
+                   token_order, prefix);
             topster.sort();
         }
@@ -449,6 +497,16 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri

     result["found"] = num_found;

+    result["facets"] = nlohmann::json::array();
+
+    // populate facets
+    for(const facet & a_facet: facets) {
+        nlohmann::json facet_result = nlohmann::json::object();
+        facet_result["field_name"] = a_facet.field_name;
+        facet_result["counts"] = a_facet.result_map;
+        result["facets"].push_back(facet_result);
+    }
+
     long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
     //!std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
     //!store->print_memory_usage();
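With the loop above, the search response gains a top-level facets array. Its shape, with counts borrowed from the FacetCounts test at the end of this commit:

    "facets": [
        {"field_name": "tags", "counts": {"bronze": 2, "gold": 4, "silver": 3}}
    ]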
@@ -464,9 +522,9 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
    4. Intersect the lists to find docs that match each phrase
    5. Sort the docs based on some ranking criteria
 */
-void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::string & query,
-                        const std::string & field, const int num_typos, const size_t num_results,
-                        Topster<100> & topster, size_t & num_found, const token_ordering token_order, const bool prefix) {
+void Collection::search(uint32_t *filter_ids, size_t filter_ids_length, std::vector<facet> &facets, std::string &query,
+                        const std::string &field, const int num_typos, const size_t num_results, Topster<100> &topster,
+                        size_t &num_found, const token_ordering token_order, const bool prefix) {
     std::vector<std::string> tokens;
     StringUtils::tokenize(query, tokens, " ", true);
@@ -527,8 +585,14 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str
             leaves = token_cost_cache[token_cost_hash];
         } else {
             int token_len = prefix ? (int) token.length() : (int) token.length() + 1;
+
+            if(token_rank == 2) {
+                std::cout << "\n";
+            }
+
             art_fuzzy_search(index_map.at(field), (const unsigned char *) token.c_str(), token_len,
                              costs[token_index], costs[token_index], 3, token_order, prefix, leaves);
+
             if(!leaves.empty()) {
                 token_cost_cache.emplace(token_cost_hash, leaves);
             }
@@ -562,9 +626,8 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str
         }

         if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) {
-            // If a) all tokens were found, or b) Some were skipped because they don't exist within max_cost,
-            // go ahead and search for candidates with what we have so far
-            search_candidates(filter_ids, filter_ids_length, token_rank, token_leaves, topster,
+            // If all tokens were found, go ahead and search for candidates with what we have so far
+            search_candidates(filter_ids, filter_ids_length, facets, token_rank, token_leaves, topster,
                               total_results, num_found, max_results);

             if (total_results >= max_results) {
@@ -598,7 +661,7 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str
             }
         }

-        return search(filter_ids, filter_ids_length, truncated_query, field, num_typos, num_results, topster,
+        return search(filter_ids, filter_ids_length, facets, truncated_query, field, num_typos, num_results, topster,
                       num_found, token_order, prefix);
     }
 }
@@ -614,7 +677,7 @@ void Collection::log_leaves(const int cost, const std::string &token, const std:
     }
 }

-void Collection::score_results(Topster<100> &topster, const int & token_rank,
+void Collection::score_results(Topster<100> & topster, const int & token_rank,
                                const std::vector<art_leaf *> &query_suggestion, const uint32_t *result_ids,
                                const size_t result_size) const {
@@ -48,7 +48,7 @@ int main(int argc, char* argv[]) {

     while(counter < 3000) {
         auto i = counter % 5;
-        auto results = collection->search(queries[i], search_fields, {}, 1, 100);
+        auto results = collection->search(queries[i], search_fields, {}, { }, 1, 100, MAX_SCORE, 0);
         results_total += results.size();
         counter++;
     }
@@ -71,7 +71,7 @@ int main(int argc, char* argv[]) {

     auto begin = std::chrono::high_resolution_clock::now();
     std::vector<std::string> search_fields = {"title"};
-    collection->search("the", search_fields, {}, 1, 100);
+    collection->search("the", search_fields, {}, { }, 1, 100, MAX_SCORE, 0);
     long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
     cout << "Time taken: " << timeMillis << "us" << endl;
     return 0;
@@ -99,8 +99,8 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) {

     std::vector<std::string> search_fields = {"title"};

-    nlohmann::json result = collection->search(query_map["q"], search_fields, filter_str, std::stoi(query_map[NUM_TYPOS]),
-                                               100, token_order, false);
+    nlohmann::json result = collection->search(query_map["q"], search_fields, filter_str, { },
+                                               std::stoi(query_map[NUM_TYPOS]), 100, token_order, false);
     std::string json_str = result.dump();
     //std::cout << "JSON:" << json_str << std::endl;
     struct rusage r_usage;
@@ -47,7 +47,9 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
     infile.close();

     std::vector<std::string> search_fields = {"starring", "title"};
-    nlohmann::json results = collection1->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
+    std::vector<facet> facets;
+
+    nlohmann::json results = collection1->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());

     spp::sparse_hash_map<std::string, field> schema = collection1->get_schema();
@@ -64,7 +66,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
     ASSERT_EQ(rank_fields, collection1->get_rank_fields());
     ASSERT_EQ(schema.size(), collection1->get_schema().size());

-    results = collection1->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
+    results = collection1->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());
 }
@@ -54,7 +54,8 @@
 };

 TEST_F(CollectionTest, ExactSearchShouldBeStable) {
-    nlohmann::json results = collection->search("the", search_fields, {}, 0, 10);
+    std::vector<facet> facets;
+    nlohmann::json results = collection->search("the", search_fields, "", facets, 0, 10);
     ASSERT_EQ(7, results["hits"].size());
     ASSERT_EQ(7, results["found"].get<int>());
@@ -70,7 +71,8 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
 }

 TEST_F(CollectionTest, ExactPhraseSearch) {
-    nlohmann::json results = collection->search("rocket launch", search_fields, {}, 0, 10);
+    std::vector<facet> facets;
+    nlohmann::json results = collection->search("rocket launch", search_fields, "", facets, 0, 10);
     ASSERT_EQ(5, results["hits"].size());

     /*
@@ -92,7 +94,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
     }

     // Check pagination
-    results = collection->search("rocket launch", search_fields, {}, 0, 3);
+    results = collection->search("rocket launch", search_fields, "", facets, 0, 3);
     ASSERT_EQ(3, results["hits"].size());
     for(size_t i = 0; i < 3; i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -104,7 +106,8 @@ TEST_F(CollectionTest, ExactPhraseSearch) {

 TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     // Tokens that are not found in the index should be skipped
-    nlohmann::json results = collection->search("DoesNotExist from", search_fields, {}, 0, 10);
+    std::vector<facet> facets;
+    nlohmann::json results = collection->search("DoesNotExist from", search_fields, "", facets, 0, 10);
     ASSERT_EQ(2, results["hits"].size());

     std::vector<std::string> ids = {"2", "17"};
@@ -117,7 +120,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }

     // with non-zero cost
-    results = collection->search("DoesNotExist from", search_fields, {}, 1, 10);
+    results = collection->search("DoesNotExist from", search_fields, "", facets, 1, 10);
     ASSERT_EQ(2, results["hits"].size());

     for(size_t i = 0; i < results["hits"].size(); i++) {
@@ -128,7 +131,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }

     // with 2 indexed words
-    results = collection->search("from DoesNotExist insTruments", search_fields, {}, 1, 10);
+    results = collection->search("from DoesNotExist insTruments", search_fields, "", facets, 1, 10);
     ASSERT_EQ(2, results["hits"].size());
     ids = {"2", "17"};

@@ -140,16 +143,17 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }

     results.clear();
-    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 0, 10);
+    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, "", facets, 0, 10);
     ASSERT_EQ(0, results["hits"].size());

     results.clear();
-    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 2, 10);
+    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, "", facets, 2, 10);
     ASSERT_EQ(0, results["hits"].size());
 }

 TEST_F(CollectionTest, PartialPhraseSearch) {
-    nlohmann::json results = collection->search("rocket research", search_fields, {}, 0, 10);
+    std::vector<facet> facets;
+    nlohmann::json results = collection->search("rocket research", search_fields, "", facets, 0, 10);
     ASSERT_EQ(4, results["hits"].size());

     std::vector<std::string> ids = {"1", "8", "16", "17"};
@@ -163,7 +167,8 @@ TEST_F(CollectionTest, PartialPhraseSearch) {
 }

 TEST_F(CollectionTest, QueryWithTypo) {
-    nlohmann::json results = collection->search("kind biologcal", search_fields, {}, 2, 3);
+    std::vector<facet> facets;
+    nlohmann::json results = collection->search("kind biologcal", search_fields, "", facets, 2, 3);
     ASSERT_EQ(3, results["hits"].size());

     std::vector<std::string> ids = {"19", "20", "21"};
@@ -176,7 +181,7 @@ TEST_F(CollectionTest, QueryWithTypo) {
     }

     results.clear();
-    results = collection->search("fer thx", search_fields, {}, 1, 3);
+    results = collection->search("fer thx", search_fields, "", facets, 1, 3);
     ids = {"1", "10", "13"};

     ASSERT_EQ(3, results["hits"].size());
@@ -190,7 +195,8 @@ TEST_F(CollectionTest, QueryWithTypo) {
 }

 TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
-    nlohmann::json results = collection->search("loox", search_fields, {}, 1, 2, MAX_SCORE, false);
+    std::vector<facet> facets;
+    nlohmann::json results = collection->search("loox", search_fields, "", facets, 1, 2, MAX_SCORE, false);
     ASSERT_EQ(2, results["hits"].size());
     std::vector<std::string> ids = {"22", "23"};

@@ -201,7 +207,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = collection->search("loox", search_fields, {}, 1, 3, FREQUENCY, false);
+    results = collection->search("loox", search_fields, "", facets, 1, 3, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());
     ids = {"3", "12", "24"};

@@ -213,19 +219,19 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
     }

     // Check pagination
-    results = collection->search("loox", search_fields, {}, 1, 1, FREQUENCY, false);
+    results = collection->search("loox", search_fields, "", facets, 1, 1, FREQUENCY, false);
     ASSERT_EQ(3, results["found"].get<int>());
     ASSERT_EQ(1, results["hits"].size());
     std::string solo_id = results["hits"].at(0)["id"];
     ASSERT_STREQ("3", solo_id.c_str());

-    results = collection->search("loox", search_fields, {}, 1, 2, FREQUENCY, false);
+    results = collection->search("loox", search_fields, "", facets, 1, 2, FREQUENCY, false);
     ASSERT_EQ(3, results["found"].get<int>());
     ASSERT_EQ(2, results["hits"].size());

     // Check total ordering

-    results = collection->search("loox", search_fields, {}, 1, 10, FREQUENCY, false);
+    results = collection->search("loox", search_fields, "", facets, 1, 10, FREQUENCY, false);
     ASSERT_EQ(5, results["hits"].size());
     ids = {"3", "12", "24", "22", "23"};

@@ -236,7 +242,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = collection->search("loox", search_fields, {}, 1, 10, MAX_SCORE, false);
+    results = collection->search("loox", search_fields, "", facets, 1, 10, MAX_SCORE, false);
     ASSERT_EQ(5, results["hits"].size());
     ids = {"22", "23", "3", "12", "24"};
@@ -250,7 +256,8 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {

 TEST_F(CollectionTest, TextContainingAnActualTypo) {
     // A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
-    nlohmann::json results = collection->search("ISX what", search_fields, {}, 1, 4, FREQUENCY, false);
+    std::vector<facet> facets;
+    nlohmann::json results = collection->search("ISX what", search_fields, "", facets, 1, 4, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());

     std::vector<std::string> ids = {"19", "6", "21", "8"};
@@ -263,7 +270,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
     }

     // Record containing exact token match should appear first
-    results = collection->search("ISX", search_fields, {}, 1, 10, FREQUENCY, false);
+    results = collection->search("ISX", search_fields, "", facets, 1, 10, FREQUENCY, false);
     ASSERT_EQ(8, results["hits"].size());

     ids = {"20", "19", "6", "3", "21", "4", "10", "8"};
@@ -277,7 +284,8 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
 }

 TEST_F(CollectionTest, PrefixSearching) {
-    nlohmann::json results = collection->search("ex", search_fields, {}, 0, 10, FREQUENCY, true);
+    std::vector<facet> facets;
+    nlohmann::json results = collection->search("ex", search_fields, "", facets, 0, 10, FREQUENCY, true);
     ASSERT_EQ(2, results["hits"].size());
     std::vector<std::string> ids = {"12", "6"};

@@ -288,7 +296,7 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = collection->search("ex", search_fields, {}, 0, 10, MAX_SCORE, true);
+    results = collection->search("ex", search_fields, "", facets, 0, 10, MAX_SCORE, true);
     ASSERT_EQ(2, results["hits"].size());
     ids = {"6", "12"};
@@ -322,7 +330,8 @@ TEST_F(CollectionTest, MultipleFields) {
     infile.close();

     search_fields = {"title", "starring"};
-    nlohmann::json results = coll_mul_fields->search("Will", search_fields, {}, 0, 10, FREQUENCY, false);
+    std::vector<facet> facets;
+    nlohmann::json results = coll_mul_fields->search("Will", search_fields, "", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());

     std::vector<std::string> ids = {"3", "2", "1", "0"};
@@ -337,7 +346,7 @@ TEST_F(CollectionTest, MultipleFields) {
     // when "starring" takes higher priority than "title"

     search_fields = {"starring", "title"};
-    results = coll_mul_fields->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());

     ids = {"15", "14", "12", "13"};
@@ -350,11 +359,11 @@ TEST_F(CollectionTest, MultipleFields) {
     }

     search_fields = {"starring", "title", "cast"};
-    results = coll_mul_fields->search("ben affleck", search_fields, {}, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("ben affleck", search_fields, "", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(1, results["hits"].size());

     search_fields = {"cast"};
-    results = coll_mul_fields->search("chris", search_fields, {}, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("chris", search_fields, "", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());

     ids = {"6", "1", "7"};
@@ -366,7 +375,7 @@ TEST_F(CollectionTest, MultipleFields) {
     }

     search_fields = {"cast"};
-    results = coll_mul_fields->search("chris pine", search_fields, {}, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("chris pine", search_fields, "", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());

     ids = {"7", "6", "1"};
@@ -402,7 +411,8 @@ TEST_F(CollectionTest, FilterOnNumericFields) {

     // Plain search with no filters - results should be sorted by rank fields
     search_fields = {"name"};
-    nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", 0, 10, FREQUENCY, false);
+    std::vector<facet> facets;
+    nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(5, results["hits"].size());

     std::vector<std::string> ids = {"3", "1", "4", "0", "2"};
@@ -415,7 +425,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
     }

     // Searching on an int32 field
-    results = coll_array_fields->search("Jeremy", search_fields, "age:>24", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "age:>24", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());

     ids = {"3", "1", "4"};
@@ -427,14 +437,14 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = coll_array_fields->search("Jeremy", search_fields, "age:>=24", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "age:>=24", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());

-    results = coll_array_fields->search("Jeremy", search_fields, "age:24", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "age:24", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(1, results["hits"].size());

     // Searching a number against an int32 array field
-    results = coll_array_fields->search("Jeremy", search_fields, "years:>2002", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "years:>2002", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());

     ids = {"1", "0", "2"};
@@ -445,7 +455,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = coll_array_fields->search("Jeremy", search_fields, "years:<1989", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "years:<1989", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(1, results["hits"].size());

     ids = {"3"};
@@ -457,7 +467,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
     }

     // multiple filters
-    results = coll_array_fields->search("Jeremy", search_fields, "years:<2005 && years:>1987", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "years:<2005 && years:>1987", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(1, results["hits"].size());

     ids = {"4"};
@@ -469,7 +479,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
     }

     // multiple search values (works like SQL's IN operator) against a single int field
-    results = coll_array_fields->search("Jeremy", search_fields, "age:[21, 24, 63]", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "age:[21, 24, 63]", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());

     ids = {"3", "0", "2"};
@@ -481,7 +491,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
     }

     // multiple search values against an int32 array field - also use extra padding between symbols
-    results = coll_array_fields->search("Jeremy", search_fields, "years : [ 2015, 1985 , 1999]", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "years : [ 2015, 1985 , 1999]", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());

     ids = {"3", "1", "4", "0"};
@@ -493,7 +503,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
     }

     // searching on an int64 array field - also ensure that padded space causes no issues
-    results = coll_array_fields->search("Jeremy", search_fields, "timestamps : > 475205222", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "timestamps : > 475205222", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());

     ids = {"1", "4", "0", "2"};
@@ -506,7 +516,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
     }

     // when filters don't match any record, no results should be returned
-    results = coll_array_fields->search("Jeremy", search_fields, "timestamps:<1", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "timestamps:<1", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(0, results["hits"].size());

     collectionManager.drop_collection("coll_array_fields");
@@ -535,7 +545,8 @@ TEST_F(CollectionTest, FilterOnTextFields) {
     infile.close();

     search_fields = {"name"};
-    nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tags: gold", 0, 10, FREQUENCY, false);
+    std::vector<facet> facets;
+    nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tags: gold", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());

     std::vector<std::string> ids = {"1", "4", "0", "2"};
@@ -547,7 +558,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = coll_array_fields->search("Jeremy", search_fields, "tags : bronze", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "tags : bronze", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(2, results["hits"].size());

     ids = {"4", "2"};
@@ -560,7 +571,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
     }

     // search with a list of tags, also testing extra padding of space
-    results = coll_array_fields->search("Jeremy", search_fields, "tags: [bronze, silver]", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "tags: [bronze, silver]", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());

     ids = {"3", "4", "0", "2"};
@@ -573,7 +584,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
     }

     // should be exact matches (no normalization or fuzzy searching should happen)
-    results = coll_array_fields->search("Jeremy", search_fields, "tags: BRONZE", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "tags: BRONZE", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(0, results["hits"].size());

     collectionManager.drop_collection("coll_array_fields");
@@ -604,28 +615,95 @@ TEST_F(CollectionTest, HandleBadlyFormedFilterQuery) {
     infile.close();

     search_fields = {"name"};
+    std::vector<facet> facets;

     // when filter field does not exist in the schema
-    nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tagzz: gold", 0, 10, FREQUENCY, false);
+    nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tagzz: gold", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(0, results["hits"].size());

     // searching using a string for a numeric field
-    results = coll_array_fields->search("Jeremy", search_fields, "age: abcdef", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "age: abcdef", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(0, results["hits"].size());

     // searching using a string for a numeric array field
-    results = coll_array_fields->search("Jeremy", search_fields, "timestamps: abcdef", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "timestamps: abcdef", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(0, results["hits"].size());

     // malformed k:v syntax
-    results = coll_array_fields->search("Jeremy", search_fields, "timestamps abcdef", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "timestamps abcdef", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(0, results["hits"].size());

     // just empty spaces
-    results = coll_array_fields->search("Jeremy", search_fields, " ", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, " ", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(0, results["hits"].size());

     // wrapping number with quotes
-    results = coll_array_fields->search("Jeremy", search_fields, "age: '21'", 0, 10, FREQUENCY, false);
+    results = coll_array_fields->search("Jeremy", search_fields, "age: '21'", facets, 0, 10, FREQUENCY, false);
     ASSERT_EQ(0, results["hits"].size());
 }

+TEST_F(CollectionTest, FacetCounts) {
+    Collection *coll_array_fields;
+
+    std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl");
+    std::vector<field> fields = {field("name", field_types::STRING), field("age", field_types::INT32),
+                                 field("years", field_types::INT32_ARRAY),
+                                 field("timestamps", field_types::INT64_ARRAY),
+                                 field("tags", field_types::STRING_ARRAY)};
+    std::vector<std::string> rank_fields = {"age"};
+
+    coll_array_fields = collectionManager.get_collection("coll_array_fields");
+    if(coll_array_fields == nullptr) {
+        coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, rank_fields);
+    }
+
+    std::string json_line;
+
+    while (std::getline(infile, json_line)) {
+        coll_array_fields->add(json_line);
+    }
+
+    infile.close();
+
+    search_fields = {"name"};
+    std::vector<facet> facets = {facet("tags")};
+
+    // single facet with no filters
+    nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(5, results["hits"].size());
+
+    ASSERT_EQ(1, results["facets"].size());
+    ASSERT_EQ(2, results["facets"][0].size());
+    ASSERT_EQ("tags", results["facets"][0]["field_name"]);
+
+    ASSERT_EQ(4, (int) results["facets"][0]["counts"]["gold"]);
+    ASSERT_EQ(3, (int) results["facets"][0]["counts"]["silver"]);
+    ASSERT_EQ(2, (int) results["facets"][0]["counts"]["bronze"]);
+
+    // 2 facets, 1 text filter with no filters
+    facets.clear();
+    facets.push_back(facet("tags"));
+    facets.push_back(facet("name"));
+    results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(5, results["hits"].size());
+    ASSERT_EQ(2, results["facets"].size());
+
+    ASSERT_EQ("tags", results["facets"][0]["field_name"]);
+    ASSERT_EQ("name", results["facets"][1]["field_name"]);
+
+    // text is tokenized and standardized
+    ASSERT_EQ(5, (int) results["facets"][1]["counts"]["howard"]);
+    ASSERT_EQ(5, (int) results["facets"][1]["counts"]["jeremy"]);
+
+    // facet with filters
+    facets.clear();
+    facets.push_back(facet("tags"));
+    results = coll_array_fields->search("Jeremy", search_fields, "age: >24", facets, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(3, results["hits"].size());
+    ASSERT_EQ(1, results["facets"].size());
+
+    ASSERT_EQ("tags", results["facets"][0]["field_name"]);
+    ASSERT_EQ(2, (int) results["facets"][0]["counts"]["gold"]);
+    ASSERT_EQ(2, (int) results["facets"][0]["counts"]["silver"]);
+    ASSERT_EQ(1, (int) results["facets"][0]["counts"]["bronze"]);
+}