Mirror of https://github.com/typesense/typesense.git (synced 2025-05-18 12:42:50 +08:00)
Search across multiple fields.
Need to write more tests.
This commit is contained in:
parent 54a60398ab
commit 2b6293650e
TODO.md

@@ -6,7 +6,7 @@
 - ~~Proper JSON as input~~
 - ~~Storing raw JSON input to RocksDB~~
-- ART for every indexed field
+- ~~ART for every indexed field~~
 - UTF-8 support for fuzzy search
 - Facets
 - Filters
@@ -19,10 +19,13 @@
 - only last token should be prefix searched
 - art int search should support signed ints
 - storage key prefix should include collection name
 - use art for indexing score as well
 - ISX what (score based on typo matches)
-- Mininum results should be a variable instead of blindly going with max_results
+- Minimum results should be a variable instead of blindly going with max_results
 - Benchmark with -ffast-math
 - Space sensitivity
 - Use bitmap index instead of forarray for doc list
+- ~~Search across multiple fields~~
+- Multi field search tests
+- Throw errors when schema is broken

 **API**
@@ -43,8 +43,13 @@ private:
                                                long long int n);

    void log_leaves(const int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const;

-    void search_candidates(std::vector<std::vector<art_leaf*>> & token_leaves, std::vector<Topster<100>::KV> & result_kvs,
-                           spp::sparse_hash_set<uint64_t> & dedup_seq_ids, size_t & total_results, const size_t & max_results);
+    std::vector<Topster<100>::KV> search(std::string & query, const std::string & field, const int num_typos, const size_t num_results,
+                                         std::vector<Topster<100>::KV> & result_kvs, spp::sparse_hash_set<uint64_t> & result_set,
+                                         const token_ordering token_order = FREQUENCY, const bool prefix = false);
+
+    void search_candidates(int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves,
+                           std::vector<Topster<100>::KV> & result_kvs, spp::sparse_hash_set<uint64_t> & dedup_seq_ids,
+                           size_t & total_results, const size_t & max_results);

    void index_string_field(const std::string &field_name, art_tree *t, const nlohmann::json &document, uint32_t seq_id) const;

@@ -56,12 +61,12 @@ public:
               const std::vector<std::string> rank_fields);
    ~Collection();
    std::string add(std::string json_str);
-    std::vector<nlohmann::json> search(std::string query, const int num_typos, const size_t num_results,
-                                       const token_ordering token_order = FREQUENCY, const bool prefix = false);
+    std::vector<nlohmann::json> search(std::string query, const std::vector<std::string> fields, const int num_typos,
+                                       const size_t num_results, const token_ordering token_order = FREQUENCY,
+                                       const bool prefix = false);
    void remove(std::string id);
-    void score_results(Topster<100> &topster, const std::vector<art_leaf *> &query_suggestion,
-                       const uint32_t *result_ids,
-                       const size_t result_size) const;
+    void score_results(Topster<100> &topster, const int & token_rank, const std::vector<art_leaf *> &query_suggestion,
+                       const uint32_t *result_ids, const size_t result_size) const;

    enum {MAX_SEARCH_TOKENS = 20};
    enum {MAX_RESULTS = 100};
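The public `search` entry point now takes an ordered list of fields. A minimal usage sketch, not part of this commit, assuming a `collection` built the way the tests below build one (a `title` string field plus a `points` rank field):

```cpp
#include <iostream>
#include <string>
#include <vector>
#include "collection.h"  // assumed project header declaring Collection

// Sketch only: `collection` is assumed to be constructed elsewhere, e.g.
// new Collection("/tmp/typesense_test/collection", "collection", fields, rank_fields)
void run_query(Collection* collection) {
    // Fields listed earlier are given a higher priority when scores tie
    std::vector<std::string> search_fields = {"title"};

    // query, fields, num_typos, num_results (token_order and prefix use defaults)
    std::vector<nlohmann::json> results = collection->search("rocket launch", search_fields, 1, 10);

    for(const nlohmann::json & doc : results) {
        std::cout << doc["id"] << std::endl;
    }
}
```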
@@ -143,7 +143,7 @@ void Collection::index_string_field(const std::string &field_name, art_tree *t,
     }
 }

-void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_leaves,
+void Collection::search_candidates(int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves,
                                    std::vector<Topster<100>::KV> & result_kvs, spp::sparse_hash_set<uint64_t> & dedup_seq_ids,
                                    size_t & total_results, const size_t & max_results) {
     const size_t combination_limit = 10;
@@ -153,6 +153,7 @@ void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_l
     for(long long n=0; n<N && n<combination_limit; ++n) {
         // every element in `query_suggestion` contains a token and its associated hits
         std::vector<art_leaf *> query_suggestion = next_suggestion(token_leaves, n);
+        token_rank++;

         /*std:: cout << "\nSuggestion: ";
         for(auto suggestion_leaf: query_suggestion) {
@@ -178,7 +179,7 @@ void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_l

         // go through each matching document id and calculate match score
         Topster<100> topster;
-        score_results(topster, query_suggestion, result_ids, result_size);
+        score_results(topster, token_rank, query_suggestion, result_ids, result_size);
         delete[] result_ids;
         topster.sort();

@@ -191,10 +192,58 @@ void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_l
             }
         }

-        if(total_results >= max_results) break;
+        if(total_results >= max_results) {
+            break;
+        }
     }
 }

+std::vector<nlohmann::json> Collection::search(std::string query, const std::vector<std::string> fields,
+                                               const int num_typos, const size_t num_results,
+                                               const token_ordering token_order, const bool prefix) {
+    // Order of `fields` are used to rank results
+    auto begin = std::chrono::high_resolution_clock::now();
+    std::vector<std::pair<int, Topster<100>::KV>> field_order_kvs;
+
+    for(int i = 0; i < fields.size(); i++) {
+        const std::string & field = fields[i];
+
+        // Container for holding the results
+        std::vector<Topster<100>::KV> result_kvs;
+
+        // To prevent duplicate results, while preserving order of result vector
+        spp::sparse_hash_set<uint64_t> result_set;
+
+        search(query, field, num_typos, num_results, result_kvs, result_set, token_order, prefix);
+        for(auto result_kv: result_kvs) {
+            field_order_kvs.push_back(std::make_pair(fields.size() - i, result_kv));
+        }
+    }
+
+    std::sort(field_order_kvs.begin(), field_order_kvs.end(),
+              [](const std::pair<int, Topster<100>::KV> & a, const std::pair<int, Topster<100>::KV> & b) {
+                  if(a.second.match_score != b.second.match_score) return a.second.match_score > b.second.match_score;
+                  if(a.second.primary_attr != b.second.primary_attr) return a.second.primary_attr > b.second.primary_attr;
+                  if(a.second.secondary_attr != b.second.secondary_attr) return a.second.secondary_attr > b.second.secondary_attr;
+                  if(a.first != b.first) return a.first > b.first;
+                  return a.second.key > b.second.key;
+              });
+
+    std::vector<nlohmann::json> results;
+
+    for(auto field_order_kv: field_order_kvs) {
+        std::string value;
+        store->get(get_seq_id_key((uint32_t) field_order_kv.second.key), value);
+        nlohmann::json document = nlohmann::json::parse(value);
+        results.push_back(document);
+    }
+
+    long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
+    std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
+    store->print_memory_usage();
+    return results;
+}
+
 /*
    1. Split the query into tokens
    2. Outer loop will generate bounded cartesian product with costs for each token
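The multi-field strategy above runs the single-field search once per field, tags every hit with a field-priority weight of `fields.size() - i` (so earlier fields weigh more), then globally sorts. Field priority only breaks ties after `match_score`, `primary_attr` and `secondary_attr` all agree. A self-contained sketch of that tie-breaking, using a simplified stand-in for `Topster<100>::KV` and illustrative values:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Simplified stand-in for Topster<100>::KV, for illustration only.
struct KV {
    uint64_t key;            // sequence id of the document
    uint64_t match_score;    // packed text-match score
    int64_t  primary_attr;   // first rank field (e.g. "points")
    int64_t  secondary_attr; // second rank field
};

int main() {
    // weight = fields.size() - field_index, so earlier fields weigh more
    std::vector<std::pair<int, KV>> kvs = {
        {1, {42, 900, 10, 0}},  // hit from the 2nd of 2 fields
        {2, {37, 900, 10, 0}},  // identical scores, but from the 1st field
    };

    std::sort(kvs.begin(), kvs.end(), [](const auto & a, const auto & b) {
        if(a.second.match_score != b.second.match_score) return a.second.match_score > b.second.match_score;
        if(a.second.primary_attr != b.second.primary_attr) return a.second.primary_attr > b.second.primary_attr;
        if(a.second.secondary_attr != b.second.secondary_attr) return a.second.secondary_attr > b.second.secondary_attr;
        if(a.first != b.first) return a.first > b.first;  // field priority breaks the tie
        return a.second.key > b.second.key;               // final order: larger doc id first
    });

    std::cout << kvs[0].second.key << std::endl;  // prints 37: the first-field hit wins the tie
}
```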
@@ -204,25 +253,22 @@ void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_l
    4. Intersect the lists to find docs that match each phrase
    5. Sort the docs based on some ranking criteria
 */
-std::vector<nlohmann::json> Collection::search(std::string query, const int num_typos, const size_t num_results,
-                                               const token_ordering token_order, const bool prefix) {
-    auto begin = std::chrono::high_resolution_clock::now();
-
+std::vector<Topster<100>::KV> Collection::search(std::string & query, const std::string & field,
+                                                 const int num_typos, const size_t num_results,
+                                                 std::vector<Topster<100>::KV> & result_kvs,
+                                                 spp::sparse_hash_set<uint64_t> & result_set,
+                                                 const token_ordering token_order, const bool prefix) {
     std::vector<std::string> tokens;
     StringUtils::tokenize(query, tokens, " ", true);

     const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
     const size_t max_results = std::min(num_results, (size_t) Collection::MAX_RESULTS);

-    size_t total_results = 0;
-    std::vector<Topster<100>::KV> result_kvs;
+    size_t total_results = result_kvs.size();

     // To prevent us from doing ART search repeatedly as we iterate through possible corrections
     spp::sparse_hash_map<std::string, std::vector<art_leaf*>> token_cost_cache;

-    // To prevent duplicate results, while preserving order of result vector
-    spp::sparse_hash_set<uint64_t> result_set;
-
     // Used to drop the least occurring token(s) for partial searches
     spp::sparse_hash_map<std::string, uint32_t> token_to_count;

@@ -239,8 +285,10 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
     }

     std::vector<std::vector<art_leaf*>> token_leaves;
+
     const size_t combination_limit = 10;
     auto product = []( long long a, std::vector<int>& b ) { return a*b.size(); };
+    int token_rank = 0;
     long long n = 0;
     long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);

@@ -256,7 +304,6 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_

         token_leaves.clear();
         int token_index = 0;
-        bool retry_with_larger_cost = false;

         while(token_index < tokens.size()) {
             // For each token, look up the generated cost for this iteration and search using that cost
@@ -264,13 +311,14 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
             const std::string token_cost_hash = token + std::to_string(costs[token_index]);

             std::vector<art_leaf*> leaves;
-            //std::cout << "\nSearching for: " << token << " - cost: " << costs[token_index] << std::endl;
+            /*std::cout << "\nSearching for: " << token << " - cost: " << costs[token_index] << ", token_rank: "
+                       << token_rank << std::endl;*/

             if(token_cost_cache.count(token_cost_hash) != 0) {
                 leaves = token_cost_cache[token_cost_hash];
             } else {
                 int token_len = prefix ? (int) token.length() : (int) token.length() + 1;
-                art_fuzzy_search(index_map.at("title"), (const unsigned char *) token.c_str(), token_len,
+                art_fuzzy_search(index_map.at(field), (const unsigned char *) token.c_str(), token_len,
                                  costs[token_index], costs[token_index], 3, token_order, prefix, leaves);
                 if(!leaves.empty()) {
                     token_cost_cache.emplace(token_cost_hash, leaves);
@@ -298,22 +346,16 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
                 n = -1;
                 N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);

-                // Don't look at remaining tokens if
-                // a) We've run out of tokens, or b) We're not at at max_cost for this token
-                // since we would see them again in a future iteration when we retry with a larger cost
-                if(token_index == -1 || costs[token_index] != max_cost) {
-                    retry_with_larger_cost = true;
-                    break;
-                }
+                break;
             }

             token_index++;
         }

-        if(token_leaves.size() != 0 && !retry_with_larger_cost) {
+        if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) {
             // If a) all tokens were found, or b) Some were skipped because they don't exist within max_cost,
             // go ahead and search for candidates with what we have so far
-            search_candidates(token_leaves, result_kvs, result_set, total_results, max_results);
+            search_candidates(token_rank, token_leaves, result_kvs, result_set, total_results, max_results);

             if (total_results >= max_results) {
                 // If we don't find enough results, we continue outerloop (looking at tokens with greater cost)
@@ -324,7 +366,8 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
         n++;
     }

-    if(result_kvs.size() == 0 && token_to_count.size() != 0) {
+    // When there are not enough overall results and atleast one token has results
+    if(result_kvs.size() < max_results && token_to_count.size() > 1) {
        // Drop certain token with least hits and try searching again
        std::string truncated_query;

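Reading the change: each candidate query is one combination of per-token corrections, enumerated in increasing typo-cost order, and the new `token_rank` counter simply numbers those combinations as they are tried; `score_results` later converts it to `max_token_rank - token_rank`, so earlier (cheaper) combinations score higher. A standalone sketch of the same bounded cartesian-product walk that `next_suggestion` performs, with hypothetical tokens and corrections:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Sketch: decode a single counter `n` into one pick per token (mixed-radix),
// the way next_suggestion() generates "the next combination" from token_leaves.
int main() {
    // hypothetical corrections per token, cheapest first
    std::vector<std::vector<std::string>> options = {
        {"rocket", "rockets"},           // candidates for token 1
        {"launch", "lunch", "larch"},    // candidates for token 2
    };

    long long N = 1;
    for(const auto & opts : options) N *= opts.size();

    const long long combination_limit = 10;
    int token_rank = 0;

    for(long long n = 0; n < N && n < combination_limit; n++) {
        token_rank++;  // earlier combinations later receive a higher rank score
        long long rem = n;
        std::cout << "rank " << token_rank << ":";
        for(const auto & opts : options) {
            std::cout << " " << opts[rem % opts.size()];
            rem /= opts.size();
        }
        std::cout << std::endl;
    }
}
```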
@@ -340,27 +383,15 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
         );

         for(uint32_t i = 0; i < token_count_pairs.size()-1; i++) {
-            if(token_to_count.count(tokens[i]) != 0) {
+            if(token_to_count.count(token_count_pairs[i].first) != 0) {
                 truncated_query += " " + token_count_pairs.at(i).first;
             }
         }

-        return search(truncated_query, num_typos, num_results);
+        return search(truncated_query, field, num_typos, num_results, result_kvs, result_set, token_order, prefix);
     }

-    std::vector<nlohmann::json> results;
-
-    for(auto result_kv: result_kvs) {
-        std::string value;
-        store->get(get_seq_id_key((uint32_t) result_kv.key), value);
-        nlohmann::json document = nlohmann::json::parse(value);
-        results.push_back(document);
-    }
-
-    long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
-    std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
-    store->print_memory_usage();
-    return results;
+    return result_kvs;
 }

 void Collection::log_leaves(const int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const {
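The fallback branch above drops the rarest token and recurses with the shortened query, now threading through the `field`, the accumulated `result_kvs` and the `result_set`; document fetching moves to the multi-field overload, so the per-field search just returns raw KVs. A standalone sketch of the token-dropping idea, with hypothetical hit counts (the real sort of `token_count_pairs` happens just above this hunk):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Sketch of the partial-search fallback: rank tokens by how many documents
// they hit, drop the rarest one, and re-run the query with the rest.
int main() {
    // hypothetical per-token hit counts
    std::vector<std::pair<std::string, uint32_t>> token_count_pairs = {
        {"rocket", 24}, {"research", 3}, {"the", 112},
    };

    // most frequent tokens first; the rarest token ends up last
    std::sort(token_count_pairs.begin(), token_count_pairs.end(),
              [](const auto & a, const auto & b) { return a.second > b.second; });

    std::string truncated_query;
    for(size_t i = 0; i < token_count_pairs.size() - 1; i++) {  // skip the last (rarest) token
        truncated_query += " " + token_count_pairs[i].first;
    }

    std::cout << "retrying with:" << truncated_query << std::endl;  // " the rocket"
}
```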
@@ -374,8 +405,12 @@ void Collection::log_leaves(const int cost, const std::string &token, const std:
     }
 }

-void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf *> &query_suggestion,
-                               const uint32_t *result_ids, const size_t result_size) const {
+void Collection::score_results(Topster<100> &topster, const int & token_rank,
+                               const std::vector<art_leaf *> &query_suggestion, const uint32_t *result_ids,
+                               const size_t result_size) const {
+
+    const int max_token_rank = 250;
+
     for(auto i=0; i<result_size; i++) {
         uint32_t doc_id = result_ids[i];
         std::vector<std::vector<uint16_t>> token_positions;
@@ -405,18 +440,23 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
             mscore = MatchScore::match_score(doc_id, token_positions);
         }

-        const uint64_t match_score = (uint64_t)(mscore.words_present * 32 + (MAX_SEARCH_TOKENS - mscore.distance));
+        int token_rank_score = max_token_rank - token_rank;
+
+        // Construct a single match_score from individual components (for multi-field sort)
+        const uint64_t match_score = (token_rank_score << 16) +
+                                     ((uint64_t)(mscore.words_present) << 8) +
+                                     (MAX_SEARCH_TOKENS - mscore.distance);

         int64_t primary_rank_score = primary_rank_scores.count(doc_id) > 0 ? primary_rank_scores.at(doc_id) : 0;
         int64_t secondary_rank_score = secondary_rank_scores.count(doc_id) > 0 ? secondary_rank_scores.at(doc_id) : 0;
         topster.add(doc_id, match_score, primary_rank_score, secondary_rank_score);
-        /*std::cout << "mscore.distance: " << (int) mscore.distance << ", match_score: "
+        /*std::cout << "token_rank_score: " << token_rank_score << ", match_score: "
                    << match_score << ", primary_rank_score: " << primary_rank_score << ", doc_id: " << doc_id << std::endl;*/
     }
 }

-inline std::vector<art_leaf *> Collection::next_suggestion(
-        const std::vector<std::vector<art_leaf *>> &token_leaves,
-        long long int n) {
+inline std::vector<art_leaf *> Collection::next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
+                                                           long long int n) {
     std::vector<art_leaf*> query_suggestion(token_leaves.size());

     // generate the next combination from `token_leaves` and store it in `query_suggestion`
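The reworked `match_score` packs three components into a single integer so that one numeric comparison sorts by token rank first, then word coverage, then proximity. A standalone sketch of the layout with illustrative values (`MAX_SEARCH_TOKENS` mirrors the constant in collection.h):

```cpp
#include <cstdint>
#include <iostream>

// Sketch of the composite match_score layout used for multi-field sorting:
// bits 16+ : token rank score (cheaper, earlier suggestion => larger value)
// bits 8-15: number of query words present in the document
// bits 0-7 : proximity component (MAX_SEARCH_TOKENS - distance)
const int MAX_SEARCH_TOKENS = 20;

uint64_t pack_match_score(int token_rank_score, uint8_t words_present, uint8_t distance) {
    return ((uint64_t) token_rank_score << 16) +
           ((uint64_t) words_present << 8) +
           (MAX_SEARCH_TOKENS - distance);
}

int main() {
    // a 2-word match from a cheap (early) suggestion...
    uint64_t a = pack_match_score(250, 2, 1);
    // ...beats the same match found only by a costlier suggestion
    uint64_t b = pack_match_score(248, 2, 1);
    // ...and word coverage dominates proximity
    uint64_t c = pack_match_score(250, 1, 0);

    std::cout << (a > b) << " " << (a > c) << std::endl;  // prints "1 1"
}
```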
@@ -56,7 +56,8 @@ int main() {
     collection->remove("foo");

     auto begin = std::chrono::high_resolution_clock::now();
-    collection->search("the", 1, 100);
+    std::vector<std::string> search_fields = {"title"};
+    collection->search("the", search_fields, 1, 100);
     long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
     cout << "Time taken: " << timeMillis << "us" << endl;
     delete collection;
@@ -84,7 +84,10 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) {
     printf("Query: %s\n", query_map["q"].c_str());
     auto begin = std::chrono::high_resolution_clock::now();

-    std::vector<nlohmann::json> results = collection->search(query_map["q"], std::stoi(query_map[NUM_TYPOS]), 100, token_order, false);
+    std::vector<std::string> search_fields = {"title"};
+
+    std::vector<nlohmann::json> results = collection->search(query_map["q"], search_fields,
+                                                             std::stoi(query_map[NUM_TYPOS]), 100, token_order, false);
     nlohmann::json json_array = nlohmann::json::array();
     for(nlohmann::json& result: results) {
         json_array.push_back(result);
@@ -7,11 +7,13 @@
 class CollectionTest : public ::testing::Test {
 protected:
     Collection *collection;
+    std::vector<std::string> search_fields;

     virtual void SetUp() {
         std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl");
         std::vector<field> fields = {field("title", field_type::STRING)};
         std::vector<std::string> rank_fields = {"points"};
+        search_fields = {"title"};
         collection = new Collection("/tmp/typesense_test/collection", "collection", fields, rank_fields);

         std::string json_line;
@@ -29,7 +31,7 @@ protected:
 };

 TEST_F(CollectionTest, ExactSearchShouldBeStable) {
-    std::vector<nlohmann::json> results = collection->search("the", 0, 10);
+    std::vector<nlohmann::json> results = collection->search("the", search_fields, 0, 10);
     ASSERT_EQ(7, results.size());

     // For two documents of the same score, the larger doc_id appears first
@@ -44,8 +46,8 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
 }

 TEST_F(CollectionTest, ExactPhraseSearch) {
-    std::vector<nlohmann::json> results = collection->search("rocket launch", 0, 10);
-    ASSERT_EQ(4, results.size());
+    std::vector<nlohmann::json> results = collection->search("rocket launch", search_fields, 0, 10);
+    ASSERT_EQ(5, results.size());

     /*
        Sort by (match, diff, score)
@@ -53,9 +55,10 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
        1: score: 15, diff: 4
        17: score: 8, diff: 4
        16: score: 10, diff: 5
+       13: score: 12, (single word match)
     */

-    std::vector<std::string> ids = {"8", "1", "17", "16"};
+    std::vector<std::string> ids = {"8", "1", "17", "16", "13"};

     for(size_t i = 0; i < results.size(); i++) {
         nlohmann::json result = results.at(i);
@@ -65,7 +68,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
     }

     // Check pagination
-    results = collection->search("rocket launch", 0, 3);
+    results = collection->search("rocket launch", search_fields, 0, 3);
     ASSERT_EQ(3, results.size());
     for(size_t i = 0; i < 3; i++) {
         nlohmann::json result = results.at(i);
@@ -77,7 +80,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {

 TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     // Tokens that are not found in the index should be skipped
-    std::vector<nlohmann::json> results = collection->search("DoesNotExist from", 0, 10);
+    std::vector<nlohmann::json> results = collection->search("DoesNotExist from", search_fields, 0, 10);
     ASSERT_EQ(2, results.size());

     std::vector<std::string> ids = {"2", "17"};
@@ -90,7 +93,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }

     // with non-zero cost
-    results = collection->search("DoesNotExist from", 1, 10);
+    results = collection->search("DoesNotExist from", search_fields, 1, 10);
     ASSERT_EQ(2, results.size());

     for(size_t i = 0; i < results.size(); i++) {
@@ -101,7 +104,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }

     // with 2 indexed words
-    results = collection->search("from DoesNotExist insTruments", 1, 10);
+    results = collection->search("from DoesNotExist insTruments", search_fields, 1, 10);
     ASSERT_EQ(2, results.size());
     ids = {"2", "17"};

@@ -113,16 +116,16 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }

     results.clear();
-    results = collection->search("DoesNotExist1 DoesNotExist2", 0, 10);
+    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, 0, 10);
     ASSERT_EQ(0, results.size());

     results.clear();
-    results = collection->search("DoesNotExist1 DoesNotExist2", 2, 10);
+    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, 2, 10);
     ASSERT_EQ(0, results.size());
 }

 TEST_F(CollectionTest, PartialPhraseSearch) {
-    std::vector<nlohmann::json> results = collection->search("rocket research", 0, 10);
+    std::vector<nlohmann::json> results = collection->search("rocket research", search_fields, 0, 10);
     ASSERT_EQ(4, results.size());

     std::vector<std::string> ids = {"1", "8", "16", "17"};
@@ -136,15 +139,23 @@ TEST_F(CollectionTest, PartialPhraseSearch) {
 }

 TEST_F(CollectionTest, QueryWithTypo) {
-    std::vector<nlohmann::json> results = collection->search("kind biologcal", 2, 10);
-    ASSERT_EQ(1, results.size());
+    std::vector<nlohmann::json> results = collection->search("kind biologcal", search_fields, 2, 3);
+    ASSERT_EQ(3, results.size());

-    std::string result_id = results.at(0)["id"];
-    ASSERT_STREQ("19", result_id.c_str());
+    std::vector<std::string> ids = {"19", "20", "21"};
+
+    for(size_t i = 0; i < results.size(); i++) {
+        nlohmann::json result = results.at(i);
+        std::string result_id = result["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }

     results.clear();
-    results = collection->search("fer thx", 1, 10);
-    std::vector<std::string> ids = {"1", "10", "13"};
+    results = collection->search("fer thx", search_fields, 1, 3);
+    ids = {"1", "10", "13"};

     ASSERT_EQ(3, results.size());

     for(size_t i = 0; i < results.size(); i++) {
         nlohmann::json result = results.at(i);
@@ -155,7 +166,7 @@ TEST_F(CollectionTest, QueryWithTypo) {
 }

 TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
-    std::vector<nlohmann::json> results = collection->search("loox", 1, 2, MAX_SCORE, false);
+    std::vector<nlohmann::json> results = collection->search("loox", search_fields, 1, 2, MAX_SCORE, false);
     ASSERT_EQ(2, results.size());
     std::vector<std::string> ids = {"22", "23"};

@@ -166,7 +177,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = collection->search("loox", 1, 3, FREQUENCY, false);
+    results = collection->search("loox", search_fields, 1, 3, FREQUENCY, false);
     ASSERT_EQ(3, results.size());
     ids = {"3", "12", "24"};

@@ -178,17 +189,17 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
     }

     // Check pagination
-    results = collection->search("loox", 1, 1, FREQUENCY, false);
+    results = collection->search("loox", search_fields, 1, 1, FREQUENCY, false);
     ASSERT_EQ(1, results.size());
     std::string solo_id = results.at(0)["id"];
     ASSERT_STREQ("3", solo_id.c_str());

-    results = collection->search("loox", 1, 2, FREQUENCY, false);
+    results = collection->search("loox", search_fields, 1, 2, FREQUENCY, false);
     ASSERT_EQ(2, results.size());

     // Check total ordering

-    results = collection->search("loox", 1, 10, FREQUENCY, false);
+    results = collection->search("loox", search_fields, 1, 10, FREQUENCY, false);
     ASSERT_EQ(5, results.size());
     ids = {"3", "12", "24", "22", "23"};

@@ -199,7 +210,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = collection->search("loox", 1, 10, MAX_SCORE, false);
+    results = collection->search("loox", search_fields, 1, 10, MAX_SCORE, false);
     ASSERT_EQ(5, results.size());
     ids = {"22", "23", "3", "12", "24"};

@@ -213,10 +224,23 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {

 TEST_F(CollectionTest, TextContainingAnActualTypo) {
     // A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
-    std::vector<nlohmann::json> results = collection->search("ISX what", 1, 10, FREQUENCY, false);
-    ASSERT_EQ(5, results.size());
+    std::vector<nlohmann::json> results = collection->search("ISX what", search_fields, 1, 4, FREQUENCY, false);
+    ASSERT_EQ(4, results.size());

-    std::vector<std::string> ids = {"20", "19", "6", "21", "8"};
+    std::vector<std::string> ids = {"19", "6", "21", "8"};

     for(size_t i = 0; i < results.size(); i++) {
         nlohmann::json result = results.at(i);
         std::string result_id = result["id"];
         std::string id = ids.at(i);
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

+    // Record containing exact token match should appear first
+    results = collection->search("ISX", search_fields, 1, 10, FREQUENCY, false);
+    ASSERT_EQ(8, results.size());
+
+    ids = {"20", "19", "6", "3", "21", "4", "10", "8"};
+
+    for(size_t i = 0; i < results.size(); i++) {
+        nlohmann::json result = results.at(i);
@@ -227,7 +251,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
 }

 TEST_F(CollectionTest, PrefixSearching) {
-    std::vector<nlohmann::json> results = collection->search("ex", 0, 10, FREQUENCY, true);
+    std::vector<nlohmann::json> results = collection->search("ex", search_fields, 0, 10, FREQUENCY, true);
     ASSERT_EQ(2, results.size());
     std::vector<std::string> ids = {"12", "6"};

@@ -238,7 +262,7 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = collection->search("ex", 0, 10, MAX_SCORE, true);
+    results = collection->search("ex", search_fields, 0, 10, MAX_SCORE, true);
     ASSERT_EQ(2, results.size());
     ids = {"6", "12"};