diff --git a/TODO.md b/TODO.md index c89bf6cf..e53459c7 100644 --- a/TODO.md +++ b/TODO.md @@ -19,8 +19,8 @@ - ~~Fix documents.jsonl path in tests~~ - ~~Multi field search tests~~ - ~~storage key prefix should include collection name~~ -- Index and search on multi-valued field -- range search for art_int +- ~~Index and search on multi-valued field~~ +- ~~range search for art_int~~ - Proper score field for ranking tokens - Support nested fields via "." - ~~Restore records as well on restart (like for meta)~~ @@ -31,8 +31,8 @@ - ~~Assumption that all tokens match for scoring is no longer true~~ - Handle searching for non-existing fields gracefully - Intersection without unpacking -- Filters -- Facets +- ~~Filters~~ +- ~~Facets~~ - Iterator - Highlight - Support search operators like +, - etc. diff --git a/include/art.h b/include/art.h index 05ae078c..288418a2 100644 --- a/include/art.h +++ b/include/art.h @@ -248,7 +248,7 @@ int art_iter_prefix(art_tree *t, const unsigned char *prefix, int prefix_len, ar int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, const int min_cost, const int max_cost, const int max_words, const token_ordering token_order, const bool prefix, std::vector<art_leaf *> &results); -static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results, +int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results, std::vector<art_leaf *> &results); void encode_int32(int32_t n, unsigned char *chars); diff --git a/include/collection.h b/include/collection.h index 92cf1e4d..0749782e 100644 --- a/include/collection.h +++ b/include/collection.h @@ -44,12 +44,14 @@ private: Option<uint32_t> do_filtering(uint32_t** filter_ids_out, const std::string & simple_filter_str); - void search(uint32_t* filter_ids, size_t filter_ids_length, std::string & query, const std::string & field, - const int num_typos, const size_t num_results, Topster<100> & topster, size_t & num_found, - const token_ordering token_order 
= FREQUENCY, const bool prefix = false); + void do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size); - void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, int & token_rank, - std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster, + void search(uint32_t *filter_ids, size_t filter_ids_length, std::vector<facet> &facets, std::string &query, + const std::string &field, const int num_typos, const size_t num_results, Topster<100> &topster, + size_t &num_found, const token_ordering token_order = FREQUENCY, const bool prefix = false); + + void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets, + int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster, size_t & total_results, size_t & num_found, const size_t & max_results); void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id) const; @@ -93,9 +95,9 @@ public: std::string add(std::string json_str); - nlohmann::json search(std::string query, const std::vector<std::string> fields, const std::string & simple_filter_str, - const int num_typos, const size_t num_results, const token_ordering token_order = FREQUENCY, - const bool prefix = false); + nlohmann::json search(std::string query, const std::vector<std::string> fields, const std::string &simple_filter_query, + std::vector<facet> & facets, const int num_typos, const size_t num_results, + const token_ordering token_order = FREQUENCY, const bool prefix = false); void remove(std::string id); diff --git a/include/field.h b/include/field.h index d88ff5b5..a3e8ed3c 100644 --- a/include/field.h +++ b/include/field.h @@ -62,4 +62,13 @@ struct filter { return Option<bool>(400, "Numerical field has an invalid comparator."); } +}; + +struct facet { + const std::string field_name; + std::map<std::string, size_t> result_map; + + facet(const std::string field_name): field_name(field_name) { + + } }; \ No newline at end of file diff --git a/src/art.cpp b/src/art.cpp index 07dbcf36..469d5bf8 100644 --- a/src/art.cpp +++ 
b/src/art.cpp @@ -900,7 +900,7 @@ static uint32_t get_score(art_node* child) { return child->max_token_count; } -static int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results, +int art_topk_iter(const art_node *root, token_ordering token_order, const int max_results, std::vector<art_leaf *> &results) { printf("INSIDE art_topk_iter: root->type: %d\n", root->type); diff --git a/src/collection.cpp b/src/collection.cpp index 76e05d7b..8492206e 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -195,9 +195,40 @@ void Collection::index_int64_array_field(const std::vector<int64_t> & values, co } } -void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, int & token_rank, - std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster, - size_t & total_results, size_t & num_found, const size_t & max_results) { +void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size) { + for(auto & a_facet: facets) { + // assumed that facet fields have already been validated upstream + const field & facet_field = schema.at(a_facet.field_name); + + // loop through the field, get all keys and intersect those ids with result ids + if(index_map.count(facet_field.name) != 0) { + art_tree *t = index_map.at(facet_field.name); + std::vector<art_leaf *> leaves; + + art_topk_iter(t->root, MAX_SCORE, 10, leaves); + + for(const art_leaf* leaf: leaves) { + const uint32_t* facet_ids = leaf->values->ids.uncompress(); + size_t facet_ids_size = leaf->values->ids.getLength(); + + uint32_t* facet_results = new uint32_t[std::min(facet_ids_size, results_size)]; + const size_t facet_results_size = Intersection::scalar(result_ids, results_size, + facet_ids, facet_ids_size, facet_results); + + const std::string facet_value((const char *)leaf->key, leaf->key_len-1); // drop trailing null + a_facet.result_map.insert(std::pair<std::string, size_t>(facet_value, facet_results_size)); + + delete [] facet_ids; + delete [] facet_results; + } + } + } +} + +void 
Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets, + int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves, + Topster<100> & topster, size_t & total_results, size_t & num_found, + const size_t & max_results) { const size_t combination_limit = 10; auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); }; long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product); @@ -227,6 +258,8 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt size_t filtered_results_size = Intersection::scalar(filter_ids, filter_ids_length, result_ids, result_size, filtered_result_ids); + do_facets(facets, filtered_result_ids, filtered_results_size); + // go through each matching document id and calculate match score score_results(topster, token_rank, query_suggestion, filtered_result_ids, filtered_results_size); num_found += filtered_results_size; @@ -234,6 +267,8 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt delete[] filtered_result_ids; delete[] result_ids; } else { + do_facets(facets, result_ids, result_size); + score_results(topster, token_rank, query_suggestion, result_ids, result_size); num_found += result_size; delete[] result_ids; } @@ -391,15 +426,28 @@ Option<uint32_t> Collection::do_filtering(uint32_t** filter_ids_out, const std:: } nlohmann::json Collection::search(std::string query, const std::vector<std::string> fields, - const std::string & simple_filter_str, - const int num_typos, const size_t num_results, - const token_ordering token_order, const bool prefix) { + const std::string & simple_filter_query, std::vector<facet> & facets, + const int num_typos, const size_t num_results, const token_ordering token_order, + const bool prefix) { size_t num_found = 0; nlohmann::json result = nlohmann::json::object(); + + // validate facet fields + for(const facet & a_facet: facets) { + if(schema.count(a_facet.field_name) == 0) { + result["error"] = "Could not find a 
facet field named `" + a_facet.field_name + "` in the schema."; + return result; + } + field facet_field = schema.at(a_facet.field_name); + if(facet_field.type != field_types::STRING && facet_field.type != field_types::STRING_ARRAY) { + result["error"] = "Facet field `" + a_facet.field_name + "` should be a string or a string array."; + return result; + } + } + // process the filters first uint32_t* filter_ids = nullptr; - Option<uint32_t> op_filter_ids_length = do_filtering(&filter_ids, simple_filter_str); + Option<uint32_t> op_filter_ids_length = do_filtering(&filter_ids, simple_filter_query); if(!op_filter_ids_length.ok()) { result["error"] = op_filter_ids_length.error(); return result; } @@ -415,9 +463,9 @@ nlohmann::json Collection::search(std::string query, const std::vector topster; const std::string & field = fields[i]; // proceed to query search only when no filters are provided or when filtering produces results - if(simple_filter_str.size() == 0 || filter_ids_length > 0) { - search(filter_ids, filter_ids_length, query, field, num_typos, num_results, - topster, num_found, token_order, prefix); + if(simple_filter_query.size() == 0 || filter_ids_length > 0) { + search(filter_ids, filter_ids_length, facets, query, field, num_typos, num_results, topster, num_found, + token_order, prefix); topster.sort(); } @@ -449,6 +497,16 @@ nlohmann::json Collection::search(std::string query, const std::vector(std::chrono::high_resolution_clock::now() - begin).count(); //!std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl; //!store->print_memory_usage(); @@ -464,9 +522,9 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::st -void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::string & query, const std::string & field, - const int num_typos, const size_t num_results, Topster<100> & topster, size_t & num_found, const token_ordering token_order, const bool prefix) { +void Collection::search(uint32_t *filter_ids, size_t filter_ids_length, std::vector<facet> &facets, std::string &query, + const std::string &field, const int num_typos, const size_t num_results, Topster<100> &topster, + size_t &num_found, const 
token_ordering token_order, const bool prefix) { std::vector<std::string> tokens; StringUtils::tokenize(query, tokens, " ", true); @@ -527,8 +585,14 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str leaves = token_cost_cache[token_cost_hash]; } else { int token_len = prefix ? (int) token.length() : (int) token.length() + 1; + + if(token_rank == 2) { + std::cout << "\n"; + } + art_fuzzy_search(index_map.at(field), (const unsigned char *) token.c_str(), token_len, costs[token_index], costs[token_index], 3, token_order, prefix, leaves); + if(!leaves.empty()) { token_cost_cache.emplace(token_cost_hash, leaves); } @@ -562,9 +626,8 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str } if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) { - // If a) all tokens were found, or b) Some were skipped because they don't exist within max_cost, - // go ahead and search for candidates with what we have so far + // If all tokens were found, go ahead and search for candidates with what we have so far search_candidates(filter_ids, filter_ids_length, facets, token_rank, token_leaves, topster, total_results, num_found, max_results); if (total_results >= max_results) { @@ -598,7 +661,7 @@ void Collection::search(uint32_t* filter_ids, size_t filter_ids_length, std::str } } - return search(filter_ids, filter_ids_length, truncated_query, field, num_typos, num_results, topster, + return search(filter_ids, filter_ids_length, facets, truncated_query, field, num_typos, num_results, topster, num_found, token_order, prefix); } } @@ -614,7 +677,7 @@ void Collection::log_leaves(const int cost, const std::string &token, const std: } } -void Collection::score_results(Topster<100> &topster, const int & token_rank, +void Collection::score_results(Topster<100> & topster, const int & token_rank, const std::vector<art_leaf *> &query_suggestion, const uint32_t 
*result_ids, const size_t result_size) const { diff --git a/src/main/benchmark.cpp b/src/main/benchmark.cpp index 447b551f..167c76bd 100644 --- a/src/main/benchmark.cpp +++ b/src/main/benchmark.cpp @@ -48,7 +48,7 @@ int main(int argc, char* argv[]) { while(counter < 3000) { auto i = counter % 5; - auto results = collection->search(queries[i], search_fields, {}, 1, 100); + auto results = collection->search(queries[i], search_fields, {}, { }, 1, 100, MAX_SCORE, 0); results_total += results.size(); counter++; } diff --git a/src/main/main.cpp b/src/main/main.cpp index e6a002d7..52d4f733 100644 --- a/src/main/main.cpp +++ b/src/main/main.cpp @@ -71,7 +71,7 @@ int main(int argc, char* argv[]) { auto begin = std::chrono::high_resolution_clock::now(); std::vector<std::string> search_fields = {"title"}; - collection->search("the", search_fields, {}, 1, 100); + collection->search("the", search_fields, {}, { }, 1, 100, MAX_SCORE, 0); long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count(); cout << "Time taken: " << timeMillis << "us" << endl; return 0; diff --git a/src/main/server.cpp b/src/main/server.cpp index 6482ba52..87c51a0e 100644 --- a/src/main/server.cpp +++ b/src/main/server.cpp @@ -99,8 +99,8 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) { std::vector<std::string> search_fields = {"title"}; - nlohmann::json result = collection->search(query_map["q"], search_fields, filter_str, std::stoi(query_map[NUM_TYPOS]), - 100, token_order, false); + nlohmann::json result = collection->search(query_map["q"], search_fields, filter_str, { }, + std::stoi(query_map[NUM_TYPOS]), 100, token_order, false); std::string json_str = result.dump(); //std::cout << "JSON:" << json_str << std::endl; struct rusage r_usage; diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index ab8e71cf..70bcadbe 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -47,7 +47,9 @@ 
TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) { infile.close(); std::vector search_fields = {"starring", "title"}; - nlohmann::json results = collection1->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false); + std::vector facets; + + nlohmann::json results = collection1->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); spp::sparse_hash_map schema = collection1->get_schema(); @@ -64,7 +66,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) { ASSERT_EQ(rank_fields, collection1->get_rank_fields()); ASSERT_EQ(schema.size(), collection1->get_schema().size()); - results = collection1->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false); + results = collection1->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); } diff --git a/test/collection_test.cpp b/test/collection_test.cpp index df0da6eb..23409ab6 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -54,7 +54,8 @@ protected: }; TEST_F(CollectionTest, ExactSearchShouldBeStable) { - nlohmann::json results = collection->search("the", search_fields, {}, 0, 10); + std::vector facets; + nlohmann::json results = collection->search("the", search_fields, "", facets, 0, 10); ASSERT_EQ(7, results["hits"].size()); ASSERT_EQ(7, results["found"].get()); @@ -70,7 +71,8 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) { } TEST_F(CollectionTest, ExactPhraseSearch) { - nlohmann::json results = collection->search("rocket launch", search_fields, {}, 0, 10); + std::vector facets; + nlohmann::json results = collection->search("rocket launch", search_fields, "", facets, 0, 10); ASSERT_EQ(5, results["hits"].size()); /* @@ -92,7 +94,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) { } // Check pagination - results = collection->search("rocket launch", search_fields, {}, 0, 3); + results = collection->search("rocket launch", search_fields, "", facets, 0, 3); ASSERT_EQ(3, 
results["hits"].size()); for(size_t i = 0; i < 3; i++) { nlohmann::json result = results["hits"].at(i); @@ -104,7 +106,8 @@ TEST_F(CollectionTest, ExactPhraseSearch) { TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) { // Tokens that are not found in the index should be skipped - nlohmann::json results = collection->search("DoesNotExist from", search_fields, {}, 0, 10); + std::vector facets; + nlohmann::json results = collection->search("DoesNotExist from", search_fields, "", facets, 0, 10); ASSERT_EQ(2, results["hits"].size()); std::vector ids = {"2", "17"}; @@ -117,7 +120,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) { } // with non-zero cost - results = collection->search("DoesNotExist from", search_fields, {}, 1, 10); + results = collection->search("DoesNotExist from", search_fields, "", facets, 1, 10); ASSERT_EQ(2, results["hits"].size()); for(size_t i = 0; i < results["hits"].size(); i++) { @@ -128,7 +131,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) { } // with 2 indexed words - results = collection->search("from DoesNotExist insTruments", search_fields, {}, 1, 10); + results = collection->search("from DoesNotExist insTruments", search_fields, "", facets, 1, 10); ASSERT_EQ(2, results["hits"].size()); ids = {"2", "17"}; @@ -140,16 +143,17 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) { } results.clear(); - results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 0, 10); + results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, "", facets, 0, 10); ASSERT_EQ(0, results["hits"].size()); results.clear(); - results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 2, 10); + results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, "", facets, 2, 10); ASSERT_EQ(0, results["hits"].size()); } TEST_F(CollectionTest, PartialPhraseSearch) { - nlohmann::json results = collection->search("rocket research", search_fields, 
{}, 0, 10); + std::vector facets; + nlohmann::json results = collection->search("rocket research", search_fields, "", facets, 0, 10); ASSERT_EQ(4, results["hits"].size()); std::vector ids = {"1", "8", "16", "17"}; @@ -163,7 +167,8 @@ TEST_F(CollectionTest, PartialPhraseSearch) { } TEST_F(CollectionTest, QueryWithTypo) { - nlohmann::json results = collection->search("kind biologcal", search_fields, {}, 2, 3); + std::vector facets; + nlohmann::json results = collection->search("kind biologcal", search_fields, "", facets, 2, 3); ASSERT_EQ(3, results["hits"].size()); std::vector ids = {"19", "20", "21"}; @@ -176,7 +181,7 @@ TEST_F(CollectionTest, QueryWithTypo) { } results.clear(); - results = collection->search("fer thx", search_fields, {}, 1, 3); + results = collection->search("fer thx", search_fields, "", facets, 1, 3); ids = {"1", "10", "13"}; ASSERT_EQ(3, results["hits"].size()); @@ -190,7 +195,8 @@ TEST_F(CollectionTest, QueryWithTypo) { } TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { - nlohmann::json results = collection->search("loox", search_fields, {}, 1, 2, MAX_SCORE, false); + std::vector facets; + nlohmann::json results = collection->search("loox", search_fields, "", facets, 1, 2, MAX_SCORE, false); ASSERT_EQ(2, results["hits"].size()); std::vector ids = {"22", "23"}; @@ -201,7 +207,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - results = collection->search("loox", search_fields, {}, 1, 3, FREQUENCY, false); + results = collection->search("loox", search_fields, "", facets, 1, 3, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"3", "12", "24"}; @@ -213,19 +219,19 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { } // Check pagination - results = collection->search("loox", search_fields, {}, 1, 1, FREQUENCY, false); + results = collection->search("loox", search_fields, "", facets, 1, 1, FREQUENCY, false); ASSERT_EQ(3, results["found"].get()); 
ASSERT_EQ(1, results["hits"].size()); std::string solo_id = results["hits"].at(0)["id"]; ASSERT_STREQ("3", solo_id.c_str()); - results = collection->search("loox", search_fields, {}, 1, 2, FREQUENCY, false); + results = collection->search("loox", search_fields, "", facets, 1, 2, FREQUENCY, false); ASSERT_EQ(3, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); // Check total ordering - results = collection->search("loox", search_fields, {}, 1, 10, FREQUENCY, false); + results = collection->search("loox", search_fields, "", facets, 1, 10, FREQUENCY, false); ASSERT_EQ(5, results["hits"].size()); ids = {"3", "12", "24", "22", "23"}; @@ -236,7 +242,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - results = collection->search("loox", search_fields, {}, 1, 10, MAX_SCORE, false); + results = collection->search("loox", search_fields, "", facets, 1, 10, MAX_SCORE, false); ASSERT_EQ(5, results["hits"].size()); ids = {"22", "23", "3", "12", "24"}; @@ -250,7 +256,8 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { TEST_F(CollectionTest, TextContainingAnActualTypo) { // A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens - nlohmann::json results = collection->search("ISX what", search_fields, {}, 1, 4, FREQUENCY, false); + std::vector facets; + nlohmann::json results = collection->search("ISX what", search_fields, "", facets, 1, 4, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); std::vector ids = {"19", "6", "21", "8"}; @@ -263,7 +270,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) { } // Record containing exact token match should appear first - results = collection->search("ISX", search_fields, {}, 1, 10, FREQUENCY, false); + results = collection->search("ISX", search_fields, "", facets, 1, 10, FREQUENCY, false); ASSERT_EQ(8, results["hits"].size()); ids = {"20", "19", "6", "3", "21", "4", "10", "8"}; @@ -277,7 +284,8 @@ 
TEST_F(CollectionTest, TextContainingAnActualTypo) { } TEST_F(CollectionTest, PrefixSearching) { - nlohmann::json results = collection->search("ex", search_fields, {}, 0, 10, FREQUENCY, true); + std::vector facets; + nlohmann::json results = collection->search("ex", search_fields, "", facets, 0, 10, FREQUENCY, true); ASSERT_EQ(2, results["hits"].size()); std::vector ids = {"12", "6"}; @@ -288,7 +296,7 @@ TEST_F(CollectionTest, PrefixSearching) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - results = collection->search("ex", search_fields, {}, 0, 10, MAX_SCORE, true); + results = collection->search("ex", search_fields, "", facets, 0, 10, MAX_SCORE, true); ASSERT_EQ(2, results["hits"].size()); ids = {"6", "12"}; @@ -322,7 +330,8 @@ TEST_F(CollectionTest, MultipleFields) { infile.close(); search_fields = {"title", "starring"}; - nlohmann::json results = coll_mul_fields->search("Will", search_fields, {}, 0, 10, FREQUENCY, false); + std::vector facets; + nlohmann::json results = coll_mul_fields->search("Will", search_fields, "", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); std::vector ids = {"3", "2", "1", "0"}; @@ -337,7 +346,7 @@ TEST_F(CollectionTest, MultipleFields) { // when "starring" takes higher priority than "title" search_fields = {"starring", "title"}; - results = coll_mul_fields->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false); + results = coll_mul_fields->search("thomas", search_fields, "", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); ids = {"15", "14", "12", "13"}; @@ -350,11 +359,11 @@ TEST_F(CollectionTest, MultipleFields) { } search_fields = {"starring", "title", "cast"}; - results = coll_mul_fields->search("ben affleck", search_fields, {}, 0, 10, FREQUENCY, false); + results = coll_mul_fields->search("ben affleck", search_fields, "", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(1, results["hits"].size()); search_fields = {"cast"}; - results = coll_mul_fields->search("chris", 
search_fields, {}, 0, 10, FREQUENCY, false); + results = coll_mul_fields->search("chris", search_fields, "", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"6", "1", "7"}; @@ -366,7 +375,7 @@ TEST_F(CollectionTest, MultipleFields) { } search_fields = {"cast"}; - results = coll_mul_fields->search("chris pine", search_fields, {}, 0, 10, FREQUENCY, false); + results = coll_mul_fields->search("chris pine", search_fields, "", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"7", "6", "1"}; @@ -402,7 +411,8 @@ TEST_F(CollectionTest, FilterOnNumericFields) { // Plain search with no filters - results should be sorted by rank fields search_fields = {"name"}; - nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", 0, 10, FREQUENCY, false); + std::vector facets; + nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(5, results["hits"].size()); std::vector ids = {"3", "1", "4", "0", "2"}; @@ -415,7 +425,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // Searching on an int32 field - results = coll_array_fields->search("Jeremy", search_fields, "age:>24", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "age:>24", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"3", "1", "4"}; @@ -427,14 +437,14 @@ TEST_F(CollectionTest, FilterOnNumericFields) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - results = coll_array_fields->search("Jeremy", search_fields, "age:>=24", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "age:>=24", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); - results = coll_array_fields->search("Jeremy", search_fields, "age:24", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "age:24", facets, 0, 10, FREQUENCY, false); 
ASSERT_EQ(1, results["hits"].size()); // Searching a number against an int32 array field - results = coll_array_fields->search("Jeremy", search_fields, "years:>2002", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "years:>2002", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"1", "0", "2"}; @@ -445,7 +455,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - results = coll_array_fields->search("Jeremy", search_fields, "years:<1989", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "years:<1989", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(1, results["hits"].size()); ids = {"3"}; @@ -457,7 +467,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // multiple filters - results = coll_array_fields->search("Jeremy", search_fields, "years:<2005 && years:>1987", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "years:<2005 && years:>1987", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(1, results["hits"].size()); ids = {"4"}; @@ -469,7 +479,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // multiple search values (works like SQL's IN operator) against a single int field - results = coll_array_fields->search("Jeremy", search_fields, "age:[21, 24, 63]", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "age:[21, 24, 63]", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"3", "0", "2"}; @@ -481,7 +491,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // multiple search values against an int32 array field - also use extra padding between symbols - results = coll_array_fields->search("Jeremy", search_fields, "years : [ 2015, 1985 , 1999]", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "years : [ 2015, 1985 , 1999]", facets, 0, 10, FREQUENCY, false); 
ASSERT_EQ(4, results["hits"].size()); ids = {"3", "1", "4", "0"}; @@ -493,7 +503,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // searching on an int64 array field - also ensure that padded space causes no issues - results = coll_array_fields->search("Jeremy", search_fields, "timestamps : > 475205222", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "timestamps : > 475205222", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); ids = {"1", "4", "0", "2"}; @@ -506,7 +516,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // when filters don't match any record, no results should be returned - results = coll_array_fields->search("Jeremy", search_fields, "timestamps:<1", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "timestamps:<1", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); collectionManager.drop_collection("coll_array_fields"); @@ -535,7 +545,8 @@ TEST_F(CollectionTest, FilterOnTextFields) { infile.close(); search_fields = {"name"}; - nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tags: gold", 0, 10, FREQUENCY, false); + std::vector facets; + nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tags: gold", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); std::vector ids = {"1", "4", "0", "2"}; @@ -547,7 +558,7 @@ TEST_F(CollectionTest, FilterOnTextFields) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - results = coll_array_fields->search("Jeremy", search_fields, "tags : bronze", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "tags : bronze", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(2, results["hits"].size()); ids = {"4", "2"}; @@ -560,7 +571,7 @@ TEST_F(CollectionTest, FilterOnTextFields) { } // search with a list of tags, also testing extra padding of space - results = 
coll_array_fields->search("Jeremy", search_fields, "tags: [bronze, silver]", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "tags: [bronze, silver]", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); ids = {"3", "4", "0", "2"}; @@ -573,7 +584,7 @@ TEST_F(CollectionTest, FilterOnTextFields) { } // should be exact matches (no normalization or fuzzy searching should happen) - results = coll_array_fields->search("Jeremy", search_fields, "tags: BRONZE", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "tags: BRONZE", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); collectionManager.drop_collection("coll_array_fields"); @@ -604,28 +615,95 @@ TEST_F(CollectionTest, HandleBadlyFormedFilterQuery) { infile.close(); search_fields = {"name"}; + std::vector facets; // when filter field does not exist in the schema - nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tagzz: gold", 0, 10, FREQUENCY, false); + nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "tagzz: gold", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); // searching using a string for a numeric field - results = coll_array_fields->search("Jeremy", search_fields, "age: abcdef", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "age: abcdef", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); // searching using a string for a numeric array field - results = coll_array_fields->search("Jeremy", search_fields, "timestamps: abcdef", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "timestamps: abcdef", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); // malformed k:v syntax - results = coll_array_fields->search("Jeremy", search_fields, "timestamps abcdef", 0, 10, FREQUENCY, false); + results = 
coll_array_fields->search("Jeremy", search_fields, "timestamps abcdef", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); // just empty spaces - results = coll_array_fields->search("Jeremy", search_fields, " ", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, " ", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); // wrapping number with quotes - results = coll_array_fields->search("Jeremy", search_fields, "age: '21'", 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", search_fields, "age: '21'", facets, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); +} + +TEST_F(CollectionTest, FacetCounts) { + Collection *coll_array_fields; + + std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl"); + std::vector fields = {field("name", field_types::STRING), field("age", field_types::INT32), + field("years", field_types::INT32_ARRAY), + field("timestamps", field_types::INT64_ARRAY), + field("tags", field_types::STRING_ARRAY)}; + std::vector rank_fields = {"age"}; + + coll_array_fields = collectionManager.get_collection("coll_array_fields"); + if(coll_array_fields == nullptr) { + coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, rank_fields); + } + + std::string json_line; + + while (std::getline(infile, json_line)) { + coll_array_fields->add(json_line); + } + + infile.close(); + + search_fields = {"name"}; + std::vector facets = {facet("tags")}; + + // single facet with no filters + nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false); + ASSERT_EQ(5, results["hits"].size()); + + ASSERT_EQ(1, results["facets"].size()); + ASSERT_EQ(2, results["facets"][0].size()); + ASSERT_EQ("tags", results["facets"][0]["field_name"]); + + ASSERT_EQ(4, (int) results["facets"][0]["counts"]["gold"]); + ASSERT_EQ(3, (int) 
results["facets"][0]["counts"]["silver"]); + ASSERT_EQ(2, (int) results["facets"][0]["counts"]["bronze"]); + + // 2 facets, 1 text filter with no filters + facets.clear(); + facets.push_back(facet("tags")); + facets.push_back(facet("name")); + results = coll_array_fields->search("Jeremy", search_fields, "", facets, 0, 10, FREQUENCY, false); + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(2, results["facets"].size()); + + ASSERT_EQ("tags", results["facets"][0]["field_name"]); + ASSERT_EQ("name", results["facets"][1]["field_name"]); + + // text is tokenized and standardized + ASSERT_EQ(5, (int) results["facets"][1]["counts"]["howard"]); + ASSERT_EQ(5, (int) results["facets"][1]["counts"]["jeremy"]); + + // facet with filters + facets.clear(); + facets.push_back(facet("tags")); + results = coll_array_fields->search("Jeremy", search_fields, "age: >24", facets, 0, 10, FREQUENCY, false); + ASSERT_EQ(3, results["hits"].size()); + ASSERT_EQ(1, results["facets"].size()); + + ASSERT_EQ("tags", results["facets"][0]["field_name"]); + ASSERT_EQ(2, (int) results["facets"][0]["counts"]["gold"]); + ASSERT_EQ(2, (int) results["facets"][0]["counts"]["silver"]); + ASSERT_EQ(1, (int) results["facets"][0]["counts"]["bronze"]); } \ No newline at end of file