Do drop tokens at a global level.

Kishore Nallan 2022-03-23 16:40:08 +05:30
parent 6b743cfa48
commit 66cb71039f
16 changed files with 664 additions and 453 deletions


@ -18,108 +18,13 @@
#include <field.h>
#include <option.h>
#include "tokenizer.h"
#include "synonym_index.h"
struct doc_seq_id_t {
uint32_t seq_id;
bool is_new;
};
struct synonym_t {
std::string id;
std::vector<std::string> root;
std::vector<std::vector<std::string>> synonyms;
synonym_t() = default;
synonym_t(const std::string& id, const std::vector<std::string>& root,
const std::vector<std::vector<std::string>>& synonyms):
id(id), root(root), synonyms(synonyms) {
}
explicit synonym_t(const nlohmann::json& synonym) {
id = synonym["id"].get<std::string>();
if(synonym.count("root") != 0) {
root = synonym["root"].get<std::vector<std::string>>();
}
synonyms = synonym["synonyms"].get<std::vector<std::vector<std::string>>>();
}
nlohmann::json to_json() const {
nlohmann::json obj;
obj["id"] = id;
obj["root"] = root;
obj["synonyms"] = synonyms;
return obj;
}
nlohmann::json to_view_json() const {
nlohmann::json obj;
obj["id"] = id;
obj["root"] = StringUtils::join(root, " ");
obj["synonyms"] = nlohmann::json::array();
for(const auto& synonym: synonyms) {
obj["synonyms"].push_back(StringUtils::join(synonym, " "));
}
return obj;
}
static Option<bool> parse(const nlohmann::json& synonym_json, synonym_t& syn) {
if(synonym_json.count("id") == 0) {
return Option<bool>(400, "Missing `id` field.");
}
if(synonym_json.count("synonyms") == 0) {
return Option<bool>(400, "Could not find an array of `synonyms`");
}
if(synonym_json.count("root") != 0 && !synonym_json["root"].is_string()) {
return Option<bool>(400, "Key `root` should be a string.");
}
if (!synonym_json["synonyms"].is_array() || synonym_json["synonyms"].empty()) {
return Option<bool>(400, "Could not find an array of `synonyms`");
}
for(const auto& synonym: synonym_json["synonyms"]) {
if(!synonym.is_string() || synonym == "") {
return Option<bool>(400, "Could not find a valid string array of `synonyms`");
}
std::vector<std::string> tokens;
Tokenizer(synonym, true).tokenize(tokens);
syn.synonyms.push_back(tokens);
}
if(synonym_json.count("root") != 0) {
std::vector<std::string> tokens;
Tokenizer(synonym_json["root"], true).tokenize(tokens);
syn.root = tokens;
}
syn.id = synonym_json["id"];
return Option<bool>(true);
}
static uint64_t get_hash(const std::vector<std::string>& tokens) {
uint64_t hash = 1;
for(size_t i=0; i < tokens.size(); i++) {
auto& token = tokens[i];
uint64_t token_hash = StringUtils::hash_wy(token.c_str(), token.size());
if(i == 0) {
hash = token_hash;
} else {
hash = Index::hash_combine(hash, token_hash);
}
}
return hash;
}
};
struct highlight_field_t {
std::string name;
bool fully_highlighted;
@ -176,9 +81,6 @@ private:
std::map<std::string, override_t> overrides;
spp::sparse_hash_map<std::string, synonym_t> synonym_definitions;
spp::sparse_hash_map<uint64_t, std::vector<std::string>> synonym_index;
const std::string default_sorting_field;
const float max_memory_ratio;
@ -193,6 +95,8 @@ private:
Index* index;
SynonymIndex* synonym_index;
// methods
std::string get_doc_id_key(const std::string & doc_id) const;
@ -242,12 +146,6 @@ private:
static Option<bool> parse_pinned_hits(const std::string& pinned_hits_str,
std::map<size_t, std::vector<std::string>>& pinned_hits);
void synonym_reduction_internal(const std::vector<std::string>& tokens,
size_t start_window_size,
size_t start_index_pos,
std::set<uint64_t>& processed_syn_hashes,
std::vector<std::vector<std::string>>& results) const;
Index* init_index();
static std::vector<char> to_char_array(const std::vector<std::string>& strs);
@ -267,7 +165,6 @@ public:
static constexpr const char* COLLECTION_META_PREFIX = "$CM";
static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
static constexpr const char* COLLECTION_OVERRIDE_PREFIX = "$CO";
static constexpr const char* COLLECTION_SYNONYM_PREFIX = "$CY";
static constexpr const char* SEQ_ID_PREFIX = "$SI";
static constexpr const char* DOC_ID_PREFIX = "$DI";
@ -300,8 +197,6 @@ public:
static std::string get_override_key(const std::string & collection_name, const std::string & override_id);
static std::string get_synonym_key(const std::string & collection_name, const std::string & synonym_id);
std::string get_seq_id_collection_prefix() const;
std::string get_name() const;
@ -444,9 +339,6 @@ public:
// synonym operations
void synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const;
spp::sparse_hash_map<std::string, synonym_t> get_synonyms();
bool get_synonym(const std::string& id, synonym_t& synonym);
@ -455,6 +347,11 @@ public:
Option<bool> remove_synonym(const std::string & id);
void synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const;
// highlight ops
static void highlight_text(const string& highlight_start_tag, const string& highlight_end_tag, const string& last_raw_q_token,
const string& text, const std::map<size_t, size_t>& token_offsets,
const std::map<size_t, std::string>& prefix_start_offsets, size_t snippet_end_offset,


@ -24,6 +24,7 @@
#include "adi_tree.h"
#include "tsl/htrie_set.h"
#include "id_list.h"
#include "synonym_index.h"
static constexpr size_t ARRAY_FACET_DIM = 4;
using facet_map_t = spp::sparse_hash_map<uint32_t, facet_hash_values_t>;
@ -56,7 +57,7 @@ struct search_field_t {
};
struct query_tokens_t {
std::vector<std::string> q_include_tokens;
std::vector<token_t> q_include_tokens;
std::vector<std::vector<std::string>> q_exclude_tokens;
std::vector<std::vector<std::string>> q_phrases;
std::vector<std::vector<std::string>> q_synonyms;
@ -431,6 +432,8 @@ private:
const Store* store;
const SynonymIndex* synonym_index;
ThreadPool* thread_pool;
size_t num_documents;
@ -480,6 +483,7 @@ private:
long long int n,
std::vector<art_leaf *>& actual_query_suggestion,
std::vector<art_leaf *>& query_suggestion,
int syn_orig_num_tokens,
uint32_t& token_bits,
uint64& qhash);
@ -506,8 +510,7 @@ private:
static void aggregate_topster(Topster* agg_topster, Topster* index_topster);
void search_field(const uint8_t & field_id,
std::vector<token_t>& query_tokens,
std::vector<token_t>& search_tokens,
const std::vector<token_t>& query_tokens,
const uint32_t* exclude_token_ids,
size_t exclude_token_ids_size,
size_t& num_tokens_dropped,
@ -575,7 +578,7 @@ private:
std::unordered_map<std::string, std::vector<uint32_t>>& token_to_offsets,
std::vector<uint64_t>& facet_hashes);
void collate_included_ids(const std::vector<std::string>& q_included_tokens,
void collate_included_ids(const std::vector<token_t>& q_included_tokens,
const std::string & field, const uint8_t field_id,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries) const;
@ -635,18 +638,14 @@ public:
Index(const std::string& name,
const uint32_t collection_id,
const Store* store,
SynonymIndex* synonym_index,
ThreadPool* thread_pool,
const std::unordered_map<std::string, field>& search_schema,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators);
const std::vector<char>& symbols_to_index,
const std::vector<char>& token_separators);
~Index();
// reference: https://stackoverflow.com/a/27952689/131050
static uint64_t hash_combine(uint64_t combined, uint64_t hash) {
combined ^= hash + 0x517cc1b727220a95 + (combined << 6) + (combined >> 2);
return combined;
}
static void concat_topster_ids(Topster* topster, spp::sparse_hash_map<uint64_t, std::vector<KV*>>& topster_ids);
void score_results(const std::vector<sort_by> &sort_fields, const uint16_t &query_index, const uint8_t &field_id,
@ -819,7 +818,7 @@ public:
int field_num_typos,
bool field_prefix, const uint8_t field_id, const string& field_name,
const std::unordered_map<string, field>::const_iterator& field_it,
std::vector<token_t>& query_tokens, std::vector<token_t>& search_tokens,
std::vector<token_t>& query_tokens,
size_t num_tokens_dropped, Topster* actual_topster, size_t field_num_results,
std::vector<query_tokens_t>& field_query_tokens, size_t& all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed,


@ -295,6 +295,12 @@ struct StringUtils {
return hash != std::numeric_limits<uint64_t>::max() ? hash : (std::numeric_limits<uint64_t>::max()-1);
}
// reference: https://stackoverflow.com/a/27952689/131050
static uint64_t hash_combine(uint64_t combined, uint64_t hash) {
combined ^= hash + 0x517cc1b727220a95 + (combined << 6) + (combined >> 2);
return combined;
}
std::string unicode_nfkd(const std::string& text);
static std::string randstring(size_t length);
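
For context, a minimal standalone sketch (not part of this commit) of how the relocated StringUtils::hash_combine is chained over a token sequence, mirroring synonym_t::get_hash; std::hash<std::string> stands in for StringUtils::hash_wy here, and the helper name hash_tokens is made up.

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Same mixing step as StringUtils::hash_combine above.
static uint64_t hash_combine(uint64_t combined, uint64_t hash) {
    combined ^= hash + 0x517cc1b727220a95 + (combined << 6) + (combined >> 2);
    return combined;
}

// Folds per-token hashes into a single value, the way synonym_t::get_hash does.
// std::hash is only a stand-in for StringUtils::hash_wy.
static uint64_t hash_tokens(const std::vector<std::string>& tokens) {
    uint64_t hash = 1;
    for(std::size_t i = 0; i < tokens.size(); i++) {
        uint64_t token_hash = std::hash<std::string>{}(tokens[i]);
        hash = (i == 0) ? token_hash : hash_combine(hash, token_hash);
    }
    return hash;
}

// hash_tokens({"running", "shoes"}) != hash_tokens({"shoes", "running"}),
// so token order matters when a window of query tokens is looked up.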

include/synonym_index.h (new file, 139 lines)

@ -0,0 +1,139 @@
#pragma once
#include <set>
#include "sparsepp.h"
#include "json.hpp"
#include "string_utils.h"
#include "option.h"
#include "tokenizer.h"
#include "store.h"
struct synonym_t {
std::string id;
std::vector<std::string> root;
std::vector<std::vector<std::string>> synonyms;
synonym_t() = default;
synonym_t(const std::string& id, const std::vector<std::string>& root,
const std::vector<std::vector<std::string>>& synonyms):
id(id), root(root), synonyms(synonyms) {
}
explicit synonym_t(const nlohmann::json& synonym) {
id = synonym["id"].get<std::string>();
if(synonym.count("root") != 0) {
root = synonym["root"].get<std::vector<std::string>>();
}
synonyms = synonym["synonyms"].get<std::vector<std::vector<std::string>>>();
}
nlohmann::json to_json() const {
nlohmann::json obj;
obj["id"] = id;
obj["root"] = root;
obj["synonyms"] = synonyms;
return obj;
}
nlohmann::json to_view_json() const {
nlohmann::json obj;
obj["id"] = id;
obj["root"] = StringUtils::join(root, " ");
obj["synonyms"] = nlohmann::json::array();
for(const auto& synonym: synonyms) {
obj["synonyms"].push_back(StringUtils::join(synonym, " "));
}
return obj;
}
static Option<bool> parse(const nlohmann::json& synonym_json, synonym_t& syn) {
if(synonym_json.count("id") == 0) {
return Option<bool>(400, "Missing `id` field.");
}
if(synonym_json.count("synonyms") == 0) {
return Option<bool>(400, "Could not find an array of `synonyms`");
}
if(synonym_json.count("root") != 0 && !synonym_json["root"].is_string()) {
return Option<bool>(400, "Key `root` should be a string.");
}
if (!synonym_json["synonyms"].is_array() || synonym_json["synonyms"].empty()) {
return Option<bool>(400, "Could not find an array of `synonyms`");
}
for(const auto& synonym: synonym_json["synonyms"]) {
if(!synonym.is_string() || synonym == "") {
return Option<bool>(400, "Could not find a valid string array of `synonyms`");
}
std::vector<std::string> tokens;
Tokenizer(synonym, true).tokenize(tokens);
syn.synonyms.push_back(tokens);
}
if(synonym_json.count("root") != 0) {
std::vector<std::string> tokens;
Tokenizer(synonym_json["root"], true).tokenize(tokens);
syn.root = tokens;
}
syn.id = synonym_json["id"];
return Option<bool>(true);
}
static uint64_t get_hash(const std::vector<std::string>& tokens) {
uint64_t hash = 1;
for(size_t i=0; i < tokens.size(); i++) {
auto& token = tokens[i];
uint64_t token_hash = StringUtils::hash_wy(token.c_str(), token.size());
if(i == 0) {
hash = token_hash;
} else {
hash = StringUtils::hash_combine(hash, token_hash);
}
}
return hash;
}
};
class SynonymIndex {
private:
mutable std::shared_mutex mutex;
Store* store;
spp::sparse_hash_map<std::string, synonym_t> synonym_definitions;
spp::sparse_hash_map<uint64_t, std::vector<std::string>> synonym_index;
void synonym_reduction_internal(const std::vector<std::string>& tokens,
size_t start_window_size,
size_t start_index_pos,
std::set<uint64_t>& processed_syn_hashes,
std::vector<std::vector<std::string>>& results) const;
public:
static constexpr const char* COLLECTION_SYNONYM_PREFIX = "$CY";
SynonymIndex(Store* store): store(store) { }
static std::string get_synonym_key(const std::string & collection_name, const std::string & synonym_id);
void synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const;
spp::sparse_hash_map<std::string, synonym_t> get_synonyms();
bool get_synonym(const std::string& id, synonym_t& synonym);
Option<bool> add_synonym(const std::string & collection_name, const synonym_t& synonym);
Option<bool> remove_synonym(const std::string & collection_name, const std::string & id);
};
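
A hedged usage sketch of the new header (illustrative only): the wrapper function add_example_synonym, the collection name "products", the synonym id and JSON payload are made up, and the Store pointer is assumed to be already initialized.

// Illustrative only: the JSON shape accepted by synonym_t::parse and how a
// parsed definition is registered with the new SynonymIndex.
#include "synonym_index.h"

void add_example_synonym(Store* store) {   // `store`: an already-initialized Store*
    nlohmann::json syn_json = nlohmann::json::parse(R"({
        "id": "syn-sneakers",
        "root": "sneakers",
        "synonyms": ["running shoes", "trainers"]
    })");

    synonym_t syn;
    Option<bool> parse_op = synonym_t::parse(syn_json, syn);
    if(!parse_op.ok()) {
        return; // validation failed (missing `id`, bad `root`, empty `synonyms`, ...)
    }

    SynonymIndex synonym_index(store);
    // the definition is persisted under the "$CY_<collection>_<id>" key
    synonym_index.add_synonym("products", syn);
}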


@ -57,6 +57,7 @@ Collection::Collection(const std::string& name, const uint32_t collection_id, co
Collection::~Collection() {
std::unique_lock lock(mutex);
delete index;
delete synonym_index;
}
uint32_t Collection::get_next_seq_id() {
@ -974,28 +975,34 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query, const s
//LOG(INFO) << "Num indices used for querying: " << indices.size();
std::vector<query_tokens_t> field_query_tokens;
std::vector<std::string> q_tokens; // used for auxiliary highlighting
std::vector<std::string> q_include_tokens;
if(search_fields.size() == 0) {
// has to be a wildcard query
field_query_tokens.emplace_back(query_tokens_t{});
parse_search_query(query, field_query_tokens[0].q_include_tokens,
parse_search_query(query, q_include_tokens,
field_query_tokens[0].q_exclude_tokens, field_query_tokens[0].q_phrases, "",
false);
for(size_t i = 0; i < q_include_tokens.size(); i++) {
auto& q_include_token = q_include_tokens[i];
field_query_tokens[0].q_include_tokens.emplace_back(i, q_include_token, (i == q_include_tokens.size()-1));
}
} else {
field_query_tokens.emplace_back(query_tokens_t{});
const std::string & field_locale = search_schema.at(search_fields[0]).locale;
parse_search_query(query, field_query_tokens[0].q_include_tokens,
parse_search_query(query, q_include_tokens,
field_query_tokens[0].q_exclude_tokens,
field_query_tokens[0].q_phrases,
field_locale, pre_segmented_query);
// process filter overrides first, before synonyms (order is important)
index->process_filter_overrides(filter_overrides, field_query_tokens[0].q_include_tokens, token_order, filters);
index->process_filter_overrides(filter_overrides, q_include_tokens, token_order, filters);
// get synonyms
synonym_reduction(field_query_tokens[0].q_include_tokens, field_query_tokens[0].q_synonyms);
q_tokens = field_query_tokens[0].q_include_tokens;
for(size_t i = 0; i < q_include_tokens.size(); i++) {
auto& q_include_token = q_include_tokens[i];
q_tokens.push_back(q_include_token);
field_query_tokens[0].q_include_tokens.emplace_back(i, q_include_token, (i == q_include_tokens.size()-1));
}
for(auto& phrase: field_query_tokens[0].q_phrases) {
for(auto& token: phrase) {
@ -2344,10 +2351,6 @@ std::string Collection::get_override_key(const std::string & collection_name, co
return std::string(COLLECTION_OVERRIDE_PREFIX) + "_" + collection_name + "_" + override_id;
}
std::string Collection::get_synonym_key(const std::string & collection_name, const std::string & synonym_id) {
return std::string(COLLECTION_SYNONYM_PREFIX) + "_" + collection_name + "_" + synonym_id;
}
std::string Collection::get_seq_id_collection_prefix() const {
return std::to_string(collection_id) + "_" + std::string(SEQ_ID_PREFIX);
}
@ -2432,184 +2435,30 @@ Option<bool> Collection::parse_pinned_hits(const std::string& pinned_hits_str,
return Option<bool>(true);
}
void Collection::synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const {
if(synonym_definitions.empty()) {
return;
}
std::set<uint64_t> processed_syn_hashes;
synonym_reduction_internal(tokens, tokens.size(), 0, processed_syn_hashes, results);
}
Option<bool> Collection::add_synonym(const synonym_t& synonym) {
if(synonym_definitions.count(synonym.id) != 0) {
// first we have to delete existing entries so we can upsert
Option<bool> rem_op = remove_synonym(synonym.id);
if(!rem_op.ok()) {
return rem_op;
}
}
std::unique_lock write_lock(mutex);
synonym_definitions[synonym.id] = synonym;
if(!synonym.root.empty()) {
uint64_t root_hash = synonym_t::get_hash(synonym.root);
synonym_index[root_hash].emplace_back(synonym.id);
} else {
for(const auto & syn_tokens : synonym.synonyms) {
uint64_t syn_hash = synonym_t::get_hash(syn_tokens);
synonym_index[syn_hash].emplace_back(synonym.id);
}
}
write_lock.unlock();
bool inserted = store->insert(Collection::get_synonym_key(name, synonym.id), synonym.to_json().dump());
if(!inserted) {
return Option<bool>(500, "Error while storing the synonym on disk.");
}
return Option<bool>(true);
std::shared_lock lock(mutex);
return synonym_index->add_synonym(name, synonym);
}
bool Collection::get_synonym(const std::string& id, synonym_t& synonym) {
std::shared_lock lock(mutex);
if(synonym_definitions.count(id) != 0) {
synonym = synonym_definitions[id];
return true;
}
return false;
}
void Collection::synonym_reduction_internal(const std::vector<std::string>& tokens,
size_t start_window_size, size_t start_index_pos,
std::set<uint64_t>& processed_syn_hashes,
std::vector<std::vector<std::string>>& results) const {
bool recursed = false;
for(size_t window_len = start_window_size; window_len > 0; window_len--) {
for(size_t start_index = start_index_pos; start_index+window_len-1 < tokens.size(); start_index++) {
std::vector<uint64_t> syn_hashes;
uint64_t syn_hash = 1;
for(size_t i = start_index; i < start_index+window_len; i++) {
uint64_t token_hash = StringUtils::hash_wy(tokens[i].c_str(), tokens[i].size());
if(i == start_index) {
syn_hash = token_hash;
} else {
syn_hash = Index::hash_combine(syn_hash, token_hash);
}
syn_hashes.push_back(token_hash);
}
const auto& syn_itr = synonym_index.find(syn_hash);
if(syn_itr != synonym_index.end() && processed_syn_hashes.count(syn_hash) == 0) {
// tokens in this window match a synonym: reconstruct tokens and rerun synonym mapping against matches
const auto& syn_ids = syn_itr->second;
for(const auto& syn_id: syn_ids) {
const auto &syn_def = synonym_definitions.at(syn_id);
for (const auto &syn_def_tokens: syn_def.synonyms) {
std::vector<std::string> new_tokens;
for (size_t i = 0; i < start_index; i++) {
new_tokens.push_back(tokens[i]);
}
std::vector<uint64_t> syn_def_hashes;
uint64_t syn_def_hash = 1;
for (size_t i = 0; i < syn_def_tokens.size(); i++) {
const auto &syn_def_token = syn_def_tokens[i];
new_tokens.push_back(syn_def_token);
uint64_t token_hash = StringUtils::hash_wy(syn_def_token.c_str(),
syn_def_token.size());
if (i == 0) {
syn_def_hash = token_hash;
} else {
syn_def_hash = Index::hash_combine(syn_def_hash, token_hash);
}
syn_def_hashes.push_back(token_hash);
}
if (syn_def_hash == syn_hash) {
// skip over token matching itself in the group
continue;
}
for (size_t i = start_index + window_len; i < tokens.size(); i++) {
new_tokens.push_back(tokens[i]);
}
processed_syn_hashes.emplace(syn_def_hash);
processed_syn_hashes.emplace(syn_hash);
for (uint64_t h: syn_def_hashes) {
processed_syn_hashes.emplace(h);
}
for (uint64_t h: syn_hashes) {
processed_syn_hashes.emplace(h);
}
recursed = true;
synonym_reduction_internal(new_tokens, window_len, start_index, processed_syn_hashes, results);
}
}
}
}
// reset it because for the next window we have to start from scratch
start_index_pos = 0;
}
if(!recursed && !processed_syn_hashes.empty()) {
results.emplace_back(tokens);
}
return synonym_index->get_synonym(id, synonym);
}
Option<bool> Collection::remove_synonym(const std::string &id) {
std::unique_lock lock(mutex);
const auto& syn_iter = synonym_definitions.find(id);
std::shared_lock lock(mutex);
return synonym_index->remove_synonym(name, id);
}
if(syn_iter != synonym_definitions.end()) {
bool removed = store->remove(Collection::get_synonym_key(name, id));
if(!removed) {
return Option<bool>(500, "Error while deleting the synonym from disk.");
}
const auto& synonym = syn_iter->second;
if(!synonym.root.empty()) {
uint64_t root_hash = synonym_t::get_hash(synonym.root);
synonym_index.erase(root_hash);
} else {
for(const auto & syn_tokens : synonym.synonyms) {
uint64_t syn_hash = synonym_t::get_hash(syn_tokens);
synonym_index.erase(syn_hash);
}
}
synonym_definitions.erase(id);
return Option<bool>(true);
}
return Option<bool>(404, "Could not find that `id`.");
void Collection::synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const {
std::shared_lock lock(mutex);
return synonym_index->synonym_reduction(tokens, results);
}
spp::sparse_hash_map<std::string, synonym_t> Collection::get_synonyms() {
std::shared_lock lock(mutex);
return synonym_definitions;
return synonym_index->get_synonyms();
}
Option<bool> Collection::check_and_update_schema(nlohmann::json& document, const DIRTY_VALUES& dirty_values) {
@ -2773,9 +2622,12 @@ Index* Collection::init_index() {
search_schema.emplace(field.name, field);
}
synonym_index = new SynonymIndex(store);
return new Index(name+std::to_string(0),
collection_id,
store,
synonym_index,
CollectionManager::get_instance().get_thread_pool(),
search_schema,
symbols_to_index, token_separators);


@ -429,7 +429,7 @@ Option<nlohmann::json> CollectionManager::drop_collection(const std::string& col
// delete synonyms
const std::string& del_synonym_prefix =
std::string(Collection::COLLECTION_SYNONYM_PREFIX) + "_" + actual_coll_name + "_";
std::string(SynonymIndex::COLLECTION_SYNONYM_PREFIX) + "_" + actual_coll_name + "_";
iter = store->scan(del_synonym_prefix);
while(iter->Valid() && iter->key().starts_with(del_synonym_prefix)) {
store->remove(iter->key().ToString());
@ -1110,7 +1110,7 @@ Option<bool> CollectionManager::load_collection(const nlohmann::json &collection
// initialize synonyms
std::vector<std::string> collection_synonym_jsons;
cm.store->scan_fill(Collection::get_synonym_key(this_collection_name, ""), collection_synonym_jsons);
cm.store->scan_fill(SynonymIndex::get_synonym_key(this_collection_name, ""), collection_synonym_jsons);
for(const auto & collection_synonym_json: collection_synonym_jsons) {
nlohmann::json collection_synonym = nlohmann::json::parse(collection_synonym_json);


@ -38,10 +38,11 @@ spp::sparse_hash_map<uint32_t, int64_t> Index::seq_id_sentinel_value;
spp::sparse_hash_map<uint32_t, int64_t> Index::geo_sentinel_value;
spp::sparse_hash_map<uint32_t, int64_t> Index::str_sentinel_value;
Index::Index(const std::string& name, const uint32_t collection_id, const Store* store, ThreadPool* thread_pool,
Index::Index(const std::string& name, const uint32_t collection_id, const Store* store,
SynonymIndex* synonym_index, ThreadPool* thread_pool,
const std::unordered_map<std::string, field> & search_schema,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators):
name(name), collection_id(collection_id), store(store), thread_pool(thread_pool),
name(name), collection_id(collection_id), store(store), synonym_index(synonym_index), thread_pool(thread_pool),
search_schema(search_schema),
seq_ids(new id_list_t(256)), symbols_to_index(symbols_to_index), token_separators(token_separators) {
@ -1142,7 +1143,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array,
uint32_t token_bits = 0;
uint32_t total_cost = next_suggestion(token_candidates_vec, n, actual_query_suggestion,
query_suggestion, token_bits, qhash);
query_suggestion, syn_orig_num_tokens, token_bits, qhash);
if(query_hashes.find(qhash) != query_hashes.end()) {
// skip this query since it has already been processed before
@ -1659,7 +1660,7 @@ void Index::run_search(search_args* search_params) {
search_params->split_join_tokens);
}
void Index::collate_included_ids(const std::vector<std::string>& q_included_tokens,
void Index::collate_included_ids(const std::vector<token_t>& q_included_tokens,
const std::string & field, const uint8_t field_id,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
Topster* curated_topster,
@ -1673,15 +1674,15 @@ void Index::collate_included_ids(const std::vector<std::string>& q_included_toke
std::vector<art_leaf *> override_query;
for(const std::string& token: q_included_tokens) {
if(token == "*") {
for(const token_t& token: q_included_tokens) {
if(token.value == "*") {
continue;
}
const size_t token_len = token.size() + 1;
const size_t token_len = token.value.size() + 1;
std::vector<art_leaf*> leaves;
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.value.c_str(), token_len,
0, 0, 1, token_ordering::MAX_SCORE, false, nullptr, 0, leaves);
if(!leaves.empty()) {
@ -1931,8 +1932,6 @@ bool Index::check_for_overrides(const token_ordering& token_order, const string&
window_tokens_set.emplace(tokens[i]);
}
std::vector<token_t> search_tokens = window_tokens;
std::vector<facet> facets;
std::vector<std::vector<art_leaf*>> searched_queries;
Topster* topster = nullptr;
@ -1950,7 +1949,7 @@ bool Index::check_for_overrides(const token_ordering& token_order, const string&
continue;
}
search_field(0, window_tokens, search_tokens, nullptr, 0, num_toks_dropped, field_it->second, field_name,
search_field(0, window_tokens, nullptr, 0, num_toks_dropped, field_it->second, field_name,
nullptr, 0, {}, {}, -1, 2, searched_queries, topster, groups_processed,
&result_ids, result_ids_len, field_num_results, 0, group_by_fields,
false, 4, query_hashes, token_order, false, 0, 1, false, -1, 3, 7, 4);
@ -2142,7 +2141,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
std::vector<Topster*> ftopsters;
auto is_wildcard_query = !field_query_tokens.empty() && !field_query_tokens[0].q_include_tokens.empty() &&
field_query_tokens[0].q_include_tokens[0] == "*";
field_query_tokens[0].q_include_tokens[0].value == "*";
// for phrase query, parser will set field_query_tokens to "*", need to handle that
if (is_wildcard_query && field_query_tokens[0].q_phrases.empty()) {
@ -2162,6 +2161,10 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
// In multi-field searches, a record can be matched across different fields, so we use this for aggregation
spp::sparse_hash_map<uint64_t, std::vector<KV*>> topster_ids;
//begin = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < num_search_fields; i++) {
Topster* ftopster = new Topster(topster->MAX_SIZE, topster->distinct);
ftopsters.push_back(ftopster);
}
// We do progressive typo relaxation here so that results with minimal typos are fetched first,
// and further results are fetched only if `typo_tokens_threshold` is not satisfied.
@ -2180,6 +2183,65 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
}
}
// now try to drop tokens
size_t num_tokens_dropped = 0;
std::vector<token_t> orig_tokens = field_query_tokens[0].q_include_tokens;
std::vector<query_tokens_t> truncated_field_query_tokens = field_query_tokens;
while(exhaustive_search || all_result_ids_len < drop_tokens_threshold) {
// When at least two tokens from the query are available, we can drop one
std::vector<token_t> truncated_tokens;
if(orig_tokens.size() > 1 && num_tokens_dropped < 2*(orig_tokens.size()-1)) {
bool prefix_search = false;
if(num_tokens_dropped < orig_tokens.size()-1) {
// drop from right
size_t truncated_len = orig_tokens.size() - num_tokens_dropped - 1;
for(size_t i=0; i < truncated_len; i++) {
truncated_tokens.emplace_back(orig_tokens[i]);
}
} else {
// drop from left
prefix_search = true;
size_t start_index = (num_tokens_dropped + 1) - orig_tokens.size() + 1;
for(size_t i = start_index; i < orig_tokens.size(); i++) {
truncated_tokens.emplace_back(orig_tokens[i]);
}
}
num_tokens_dropped++;
for(size_t i = 0; i < num_search_fields; i++) {
truncated_field_query_tokens[i].q_include_tokens = truncated_tokens;
}
std::vector<bool> drop_token_prefixes;
for(const auto p: prefixes) {
drop_token_prefixes.push_back(p && prefix_search);
}
for(int min_typo = 0; min_typo <= 2; min_typo++) {
search_fields(filters, included_ids_map, sort_fields_std, min_typo, num_typos, topster,
curated_topster, token_order, drop_token_prefixes, drop_tokens_threshold, groups_processed,
searched_queries, typo_tokens_threshold, group_limit, group_by_fields,
prioritize_exact_match, exhaustive_search, concurrency, min_len_1typo, min_len_2typo,
max_candidates, infixes, max_extra_prefix, max_extra_suffix,
filter_ids, filter_ids_length, curated_ids, curated_ids_sorted, num_search_fields,
exclude_token_ids, exclude_token_ids_size, ftopsters, is_wildcard_query, false,
truncated_field_query_tokens, the_fields, all_result_ids_len, all_result_ids, topster_ids);
if (!exhaustive_search && all_result_ids_len >= typo_tokens_threshold) {
break;
}
}
} else {
break;
}
}
//auto begin0 = std::chrono::high_resolution_clock::now();
/*size_t total_q_tokens = field_query_tokens[0].q_include_tokens.size();
for(const auto& phrase: field_query_tokens[0].q_phrases) {
@ -2392,7 +2454,7 @@ void Index::aggregate_and_score_fields(const std::vector<query_tokens_t>& field_
// FIXME: must consider phrase tokens also
for(size_t token_index=0; token_index < field_query_tokens[i].q_include_tokens.size(); token_index++) {
const auto& token = field_query_tokens[i].q_include_tokens[token_index];
const auto& token = field_query_tokens[i].q_include_tokens[token_index].value;
const art_leaf* leaf = (art_leaf *) art_search(search_index.at(field), (const unsigned char*) token.c_str(),
token.length()+1);
@ -2494,11 +2556,7 @@ void Index::search_fields(const std::vector<filter>& filters,
continue;
}
std::vector<token_t> q_include_pos_tokens;
for(size_t j=0; j < field_query_tokens[i].q_include_tokens.size(); j++) {
bool is_prefix = (j == field_query_tokens[i].q_include_tokens.size()-1);
q_include_pos_tokens.emplace_back(j, field_query_tokens[i].q_include_tokens[j], is_prefix);
}
std::vector<token_t> q_include_pos_tokens = field_query_tokens[i].q_include_tokens;
// these are already validated upstream, but still playing safe
bool field_prefix = (i < prefixes.size()) ? prefixes[i] : prefixes[0];
@ -2514,15 +2572,12 @@ void Index::search_fields(const std::vector<filter>& filters,
}
std::vector<token_t> query_tokens = q_include_pos_tokens;
std::vector<token_t> search_tokens = q_include_pos_tokens;
size_t num_tokens_dropped = 0;
//LOG(INFO) << "searching field_name! " << field_name;
Topster* ftopster = new Topster(topster->MAX_SIZE, topster->distinct);
ftopsters.push_back(ftopster);
// Don't waste additional cycles for single field_name searches
Topster* actual_topster = (num_search_fields == 1) ? topster : ftopster;
Topster* actual_topster = (num_search_fields == 1) ? topster : ftopsters[i];
// tracks the number of results found for the current field_name
size_t field_num_results = 0;
@ -2531,7 +2586,7 @@ void Index::search_fields(const std::vector<filter>& filters,
int last_typo = int(min_typo) - 1;
if(!is_wildcard_query) {
search_field(field_id, query_tokens, search_tokens, exclude_token_ids, exclude_token_ids_size,
search_field(field_id, query_tokens, exclude_token_ids, exclude_token_ids_size,
num_tokens_dropped, field_it->second, field_name,
actual_filter_ids, actual_filter_ids_length, curated_ids_sorted, sort_fields_std,
last_typo, min_typo, searched_queries, actual_topster, groups_processed,
@ -2548,7 +2603,13 @@ void Index::search_fields(const std::vector<filter>& filters,
if(field_num_results == 0 && split_join_tokens) {
std::vector<std::vector<std::string>> space_resolved_queries;
resolve_space_as_typos(field_query_tokens[i].q_include_tokens, field_name,
std::vector<std::string> q_include_tokens;
for(auto& q_include_token: field_query_tokens[i].q_include_tokens) {
q_include_tokens.push_back(q_include_token.value);
}
resolve_space_as_typos(q_include_tokens, field_name,
space_resolved_queries);
// only one query is resolved for now, so just use that
@ -2562,9 +2623,8 @@ void Index::search_fields(const std::vector<filter>& filters,
}
query_tokens = q_include_pos_tokens;
search_tokens = q_include_pos_tokens;
search_field(field_id, query_tokens, search_tokens, exclude_token_ids, exclude_token_ids_size,
search_field(field_id, query_tokens, exclude_token_ids, exclude_token_ids_size,
num_tokens_dropped, field_it->second, field_name,
actual_filter_ids, actual_filter_ids_length, curated_ids_sorted, sort_fields_std,
last_typo, min_typo, searched_queries, actual_topster, groups_processed,
@ -2590,6 +2650,14 @@ void Index::search_fields(const std::vector<filter>& filters,
}
// do synonym based searches
// get synonyms
std::vector<std::string> q_include_tokens;
for(size_t j = 0; j < field_query_tokens[i].q_include_tokens.size(); j++) {
q_include_tokens.push_back(field_query_tokens[i].q_include_tokens[j].value);
}
synonym_index->synonym_reduction(q_include_tokens, field_query_tokens[i].q_synonyms);
// since typos are disabled, we will use drop_tokens_threshold for typo_tokens_threshold as well
// otherwise, we can't support dropping of tokens here.
do_synonym_search(filters, included_ids_map, sort_fields_std, curated_topster, token_order,
@ -2599,13 +2667,13 @@ void Index::search_fields(const std::vector<filter>& filters,
max_candidates, curated_ids, curated_ids_sorted, exclude_token_ids,
exclude_token_ids_size, i, actual_filter_ids_length, 0, field_prefix,
field_id, field_name, field_it,
query_tokens, search_tokens, num_tokens_dropped, actual_topster, field_num_results,
query_tokens, num_tokens_dropped, actual_topster, field_num_results,
field_query_tokens, all_result_ids_len, groups_processed, searched_queries,
all_result_ids,
actual_filter_ids, query_hashes);
// concat is done only for multi-field searches as `ftopster` will be empty for single-field search
concat_topster_ids(ftopster, topster_ids);
concat_topster_ids(ftopsters[i], topster_ids);
collate_included_ids(field_query_tokens[i].q_include_tokens, field_name, field_id, included_ids_map, curated_topster, searched_queries);
//LOG(INFO) << "topster_ids.size: " << topster_ids.size();
}
@ -2691,7 +2759,7 @@ void Index::do_synonym_search(const std::vector<filter>& filters,
size_t exclude_token_ids_size, size_t i, uint32_t actual_filter_ids_length,
int field_num_typos, bool field_prefix, const uint8_t field_id, const string& field_name,
const std::unordered_map<string, field>::const_iterator& field_it,
std::vector<token_t>& query_tokens, std::vector<token_t>& search_tokens,
std::vector<token_t>& query_tokens,
size_t num_tokens_dropped, Topster* actual_topster, size_t field_num_results,
std::vector<query_tokens_t>& field_query_tokens, size_t& all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed,
@ -2713,7 +2781,7 @@ void Index::do_synonym_search(const std::vector<filter>& filters,
for(const auto& syn_tokens: q_pos_synonyms) {
num_tokens_dropped = 0;
field_num_results = 0;
query_tokens = search_tokens = syn_tokens;
query_tokens = syn_tokens;
query_hashes.clear();
if(query_tokens.size() == 1 && query_tokens[0].value == "*") {
@ -2734,7 +2802,7 @@ void Index::do_synonym_search(const std::vector<filter>& filters,
all_result_ids, all_result_ids_len,
actual_filter_ids, actual_filter_ids_length, concurrency);
} else {
search_field(field_id, query_tokens, search_tokens, exclude_token_ids, exclude_token_ids_size, num_tokens_dropped,
search_field(field_id, query_tokens, exclude_token_ids, exclude_token_ids_size, num_tokens_dropped,
field_it->second, field_name, actual_filter_ids, actual_filter_ids_length, curated_ids_sorted, sort_fields_std,
-1, field_num_typos, searched_queries, actual_topster, groups_processed,
&all_result_ids, all_result_ids_len,
@ -2913,11 +2981,10 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
Tokenizer(facet_query.query, true, !facet_field.is_string(),
facet_field.locale, symbols_to_index, token_separators).tokenize(query_tokens);
std::vector<token_t> search_tokens, qtokens;
std::vector<token_t> qtokens;
for (size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
bool is_prefix = (qtoken_index == query_tokens.size()-1);
search_tokens.emplace_back(qtoken_index, query_tokens[qtoken_index], is_prefix);
qtokens.emplace_back(qtoken_index, query_tokens[qtoken_index], is_prefix);
}
@ -2930,7 +2997,7 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
std::set<uint64> query_hashes;
size_t num_toks_dropped = 0;
search_field(0, qtokens, search_tokens, nullptr, 0, num_toks_dropped,
search_field(0, qtokens, nullptr, 0, num_toks_dropped,
facet_field, facet_field.faceted_name(),
all_result_ids, all_result_ids_len, {}, {}, -1, facet_query_num_typos, searched_queries, topster,
groups_processed, &field_result_ids, field_result_ids_len, field_num_results, 0, group_by_fields,
@ -3187,8 +3254,7 @@ void Index::populate_sort_mapping(int* sort_order, std::vector<size_t>& geopoint
5. Sort the docs based on some ranking criteria
*/
void Index::search_field(const uint8_t & field_id,
std::vector<token_t>& query_tokens,
std::vector<token_t>& search_tokens,
const std::vector<token_t>& query_tokens,
const uint32_t* exclude_token_ids,
size_t exclude_token_ids_size,
size_t& num_tokens_dropped,
@ -3228,8 +3294,8 @@ void Index::search_field(const uint8_t & field_id,
std::vector<std::vector<int>> token_to_costs;
for(size_t stoken_index=0; stoken_index < search_tokens.size(); stoken_index++) {
const std::string& token = search_tokens[stoken_index].value;
for(size_t stoken_index=0; stoken_index < query_tokens.size(); stoken_index++) {
const std::string& token = query_tokens[stoken_index].value;
std::vector<int> all_costs;
// This ensures that we don't end up doing a cost of 1 for a single char etc.
@ -3279,13 +3345,13 @@ void Index::search_field(const uint8_t & field_id,
token_candidates_vec.clear();
size_t token_index = 0;
while(token_index < search_tokens.size()) {
while(token_index < query_tokens.size()) {
// For each token, look up the generated cost for this iteration and search using that cost
const std::string& token = search_tokens[token_index].value;
const std::string& token = query_tokens[token_index].value;
const std::string token_cost_hash = token + std::to_string(costs[token_index]);
std::vector<art_leaf*> leaves;
const bool prefix_search = prefix && search_tokens[token_index].prefix;
const bool prefix_search = prefix && query_tokens[token_index].prefix;
/*LOG(INFO) << "Searching for field: " << the_field.name << ", token:"
<< token << " - cost: " << costs[token_index] << ", prefix_search: " << prefix_search;*/
@ -3318,42 +3384,16 @@ void Index::search_field(const uint8_t & field_id,
if(!leaves.empty()) {
//log_leaves(costs[token_index], token, leaves);
token_candidates_vec.push_back(
token_candidates{search_tokens[token_index], costs[token_index], prefix_search, leaves});
} else {
// No result at `cost = costs[token_index]`. Remove `cost` for token and re-do combinations
auto it = std::find(token_to_costs[token_index].begin(), token_to_costs[token_index].end(), costs[token_index]);
if(it != token_to_costs[token_index].end()) {
token_to_costs[token_index].erase(it);
// when no more costs are left for this token
if(token_to_costs[token_index].empty()) {
// we can try to drop the token and search with remaining tokens
if(!exhaustive_search && field_num_results >= drop_tokens_threshold) {
// but if drop_tokens_threshold is breached, we are done
return ;
}
token_to_costs.erase(token_to_costs.begin()+token_index);
search_tokens.erase(search_tokens.begin()+token_index);
query_tokens.erase(query_tokens.begin()+token_index);
costs.erase(costs.begin()+token_index);
}
}
// Continue outerloop on new cost combination
n = -1;
N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
goto resume_typo_loop;
token_candidates{query_tokens[token_index], costs[token_index], prefix_search, leaves});
}
token_index++;
}
if(!token_candidates_vec.empty()) {
if(token_candidates_vec.size() == query_tokens.size()) {
std::vector<uint32_t> id_buff;
// If atleast one token is found, go ahead and search for candidates
// If all tokens are found, go ahead and search for candidates
search_candidates(field_id, the_field.is_array(), filter_ids, filter_ids_length,
exclude_token_ids, exclude_token_ids_size,
curated_ids, sort_fields, token_candidates_vec, searched_queries, topster,
@ -3374,8 +3414,6 @@ void Index::search_field(const uint8_t & field_id,
*all_result_ids = new_all_result_ids;
}
resume_typo_loop:
if(!exhaustive_search && field_num_results >= typo_tokens_threshold) {
// if typo threshold is breached, we are done
return ;
@ -3383,41 +3421,6 @@ void Index::search_field(const uint8_t & field_id,
n++;
}
// When atleast two tokens from the query are available so we can drop one
if(query_tokens.size() > 1 && num_tokens_dropped < query_tokens.size()) {
// Drop tokens from right until (len/2 + 1), and then from left until (len/2 + 1)
if(!exhaustive_search && field_num_results >= drop_tokens_threshold) {
// if drop_tokens_threshold is breached, we are done
return ;
}
std::vector<token_t> truncated_tokens;
num_tokens_dropped++;
if(num_tokens_dropped < query_tokens.size()) {
// drop from right
size_t end_index = query_tokens.size() - num_tokens_dropped - 1;
for(size_t i=0; i <= end_index; i++) {
truncated_tokens.emplace_back(query_tokens[i].position, query_tokens[i].value, query_tokens[i].prefix);
}
} else {
// drop from left
size_t start_index = (num_tokens_dropped - query_tokens.size() + 1);
for(size_t i=start_index; i<query_tokens.size(); i++) {
truncated_tokens.emplace_back(query_tokens[i].position, query_tokens[i].value, query_tokens[i].prefix);
}
}
return search_field(field_id, query_tokens, truncated_tokens, exclude_token_ids, exclude_token_ids_size,
num_tokens_dropped, the_field, field_name, filter_ids, filter_ids_length, curated_ids,
sort_fields, last_typo, max_typos, searched_queries, topster, groups_processed, all_result_ids,
all_result_ids_len, field_num_results, group_limit, group_by_fields,
prioritize_exact_match, concurrency, query_hashes,
token_order, prefix, drop_tokens_threshold, typo_tokens_threshold,
exhaustive_search, syn_orig_num_tokens, min_len_1typo, min_len_2typo, max_candidates);
}
}
int Index::get_bounded_typo_cost(const size_t max_cost, const size_t token_len,
@ -3683,7 +3686,7 @@ uint64_t Index::get_distinct_id(const std::vector<std::string>& group_by_fields,
const auto& facet_hashes = facet_hashes_it->second;
for(size_t i = 0; i < facet_hashes.size(); i++) {
distinct_id = hash_combine(distinct_id, facet_hashes.hashes[i]);
distinct_id = StringUtils::hash_combine(distinct_id, facet_hashes.hashes[i]);
}
}
@ -3694,6 +3697,7 @@ inline uint32_t Index::next_suggestion(const std::vector<token_candidates> &toke
long long int n,
std::vector<art_leaf *>& actual_query_suggestion,
std::vector<art_leaf *>& query_suggestion,
const int syn_orig_num_tokens,
uint32_t& token_bits,
uint64& qhash) {
uint32_t total_cost = 0;
@ -3717,13 +3721,20 @@ inline uint32_t Index::next_suggestion(const std::vector<token_candidates> &toke
token_bits |= 1UL << token_candidates_vec[i].token.position; // sets n-th bit
uintptr_t addr_val = (uintptr_t) query_suggestion[i];
qhash = Index::hash_combine(qhash, addr_val);
qhash = StringUtils::hash_combine(qhash, addr_val);
/*LOG(INFO) << "suggestion key: " << actual_query_suggestion[i]->key << ", token: "
<< token_candidates_vec[i].token.value << ", actual_cost: " << actual_cost;
LOG(INFO) << ".";*/
}
if(syn_orig_num_tokens != -1) {
token_bits = 0;
for(size_t i = 0; i < size_t(syn_orig_num_tokens); i++) {
token_bits |= 1UL << i;
}
}
return total_cost;
}
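
To make the new global drop-token order concrete, here is a small standalone sketch (not taken from the codebase) that reproduces just the truncation sequence of the loop added to Index::search above: tokens are dropped from the right first, then from the left. The sample query, the plain-string tokens and the main() wrapper are illustrative only; the real loop also stops once drop_tokens_threshold is satisfied.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> orig_tokens = {"popular", "nike", "shoes"};
    std::size_t num_tokens_dropped = 0;

    while(orig_tokens.size() > 1 && num_tokens_dropped < 2 * (orig_tokens.size() - 1)) {
        std::vector<std::string> truncated;
        if(num_tokens_dropped < orig_tokens.size() - 1) {
            // drop from the right
            std::size_t truncated_len = orig_tokens.size() - num_tokens_dropped - 1;
            truncated.assign(orig_tokens.begin(), orig_tokens.begin() + truncated_len);
        } else {
            // drop from the left (these sub-queries run with prefix search enabled
            // in the actual loop)
            std::size_t start_index = (num_tokens_dropped + 1) - orig_tokens.size() + 1;
            truncated.assign(orig_tokens.begin() + start_index, orig_tokens.end());
        }
        num_tokens_dropped++;
        for(const auto& t : truncated) std::cout << t << " ";
        std::cout << "\n";
    }
    // Prints:
    //   popular nike
    //   popular
    //   nike shoes
    //   shoes
}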

src/synonym_index.cpp (new file, 186 lines)

@ -0,0 +1,186 @@
#include "synonym_index.h"
void SynonymIndex::synonym_reduction_internal(const std::vector<std::string>& tokens,
size_t start_window_size, size_t start_index_pos,
std::set<uint64_t>& processed_syn_hashes,
std::vector<std::vector<std::string>>& results) const {
bool recursed = false;
for(size_t window_len = start_window_size; window_len > 0; window_len--) {
for(size_t start_index = start_index_pos; start_index+window_len-1 < tokens.size(); start_index++) {
std::vector<uint64_t> syn_hashes;
uint64_t syn_hash = 1;
for(size_t i = start_index; i < start_index+window_len; i++) {
uint64_t token_hash = StringUtils::hash_wy(tokens[i].c_str(), tokens[i].size());
if(i == start_index) {
syn_hash = token_hash;
} else {
syn_hash = StringUtils::hash_combine(syn_hash, token_hash);
}
syn_hashes.push_back(token_hash);
}
const auto& syn_itr = synonym_index.find(syn_hash);
if(syn_itr != synonym_index.end() && processed_syn_hashes.count(syn_hash) == 0) {
// tokens in this window match a synonym: reconstruct tokens and rerun synonym mapping against matches
const auto& syn_ids = syn_itr->second;
for(const auto& syn_id: syn_ids) {
const auto &syn_def = synonym_definitions.at(syn_id);
for (const auto &syn_def_tokens: syn_def.synonyms) {
std::vector<std::string> new_tokens;
for (size_t i = 0; i < start_index; i++) {
new_tokens.push_back(tokens[i]);
}
std::vector<uint64_t> syn_def_hashes;
uint64_t syn_def_hash = 1;
for (size_t i = 0; i < syn_def_tokens.size(); i++) {
const auto &syn_def_token = syn_def_tokens[i];
new_tokens.push_back(syn_def_token);
uint64_t token_hash = StringUtils::hash_wy(syn_def_token.c_str(),
syn_def_token.size());
if (i == 0) {
syn_def_hash = token_hash;
} else {
syn_def_hash = StringUtils::hash_combine(syn_def_hash, token_hash);
}
syn_def_hashes.push_back(token_hash);
}
if (syn_def_hash == syn_hash) {
// skip over token matching itself in the group
continue;
}
for (size_t i = start_index + window_len; i < tokens.size(); i++) {
new_tokens.push_back(tokens[i]);
}
processed_syn_hashes.emplace(syn_def_hash);
processed_syn_hashes.emplace(syn_hash);
for (uint64_t h: syn_def_hashes) {
processed_syn_hashes.emplace(h);
}
for (uint64_t h: syn_hashes) {
processed_syn_hashes.emplace(h);
}
recursed = true;
synonym_reduction_internal(new_tokens, window_len, start_index, processed_syn_hashes, results);
}
}
}
}
// reset it because for the next window we have to start from scratch
start_index_pos = 0;
}
if(!recursed && !processed_syn_hashes.empty()) {
results.emplace_back(tokens);
}
}
void SynonymIndex::synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const {
if(synonym_definitions.empty()) {
return;
}
std::set<uint64_t> processed_syn_hashes;
synonym_reduction_internal(tokens, tokens.size(), 0, processed_syn_hashes, results);
}
Option<bool> SynonymIndex::add_synonym(const std::string & collection_name, const synonym_t& synonym) {
if(synonym_definitions.count(synonym.id) != 0) {
// first we have to delete existing entries so we can upsert
Option<bool> rem_op = remove_synonym(collection_name, synonym.id);
if(!rem_op.ok()) {
return rem_op;
}
}
std::unique_lock write_lock(mutex);
synonym_definitions[synonym.id] = synonym;
if(!synonym.root.empty()) {
uint64_t root_hash = synonym_t::get_hash(synonym.root);
synonym_index[root_hash].emplace_back(synonym.id);
} else {
for(const auto & syn_tokens : synonym.synonyms) {
uint64_t syn_hash = synonym_t::get_hash(syn_tokens);
synonym_index[syn_hash].emplace_back(synonym.id);
}
}
write_lock.unlock();
bool inserted = store->insert(get_synonym_key(collection_name, synonym.id), synonym.to_json().dump());
if(!inserted) {
return Option<bool>(500, "Error while storing the synonym on disk.");
}
return Option<bool>(true);
}
bool SynonymIndex::get_synonym(const std::string& id, synonym_t& synonym) {
std::shared_lock lock(mutex);
if(synonym_definitions.count(id) != 0) {
synonym = synonym_definitions[id];
return true;
}
return false;
}
Option<bool> SynonymIndex::remove_synonym(const std::string & collection_name, const std::string &id) {
std::unique_lock lock(mutex);
const auto& syn_iter = synonym_definitions.find(id);
if(syn_iter != synonym_definitions.end()) {
bool removed = store->remove(get_synonym_key(collection_name, id));
if(!removed) {
return Option<bool>(500, "Error while deleting the synonym from disk.");
}
const auto& synonym = syn_iter->second;
if(!synonym.root.empty()) {
uint64_t root_hash = synonym_t::get_hash(synonym.root);
synonym_index.erase(root_hash);
} else {
for(const auto & syn_tokens : synonym.synonyms) {
uint64_t syn_hash = synonym_t::get_hash(syn_tokens);
synonym_index.erase(syn_hash);
}
}
synonym_definitions.erase(id);
return Option<bool>(true);
}
return Option<bool>(404, "Could not find that `id`.");
}
spp::sparse_hash_map<std::string, synonym_t> SynonymIndex::get_synonyms() {
std::shared_lock lock(mutex);
return synonym_definitions;
}
std::string SynonymIndex::get_synonym_key(const std::string & collection_name, const std::string & synonym_id) {
return std::string(COLLECTION_SYNONYM_PREFIX) + "_" + collection_name + "_" + synonym_id;
}
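
A brief illustrative example (not part of the commit) of what the relocated synonym_reduction yields for a one-way (root-based) synonym; the wrapper name reduce_example, the collection name "products" and the data are made up, and the Store pointer is assumed to be valid.

// Illustrative only: window-based replacement performed by synonym_reduction.
#include "synonym_index.h"

void reduce_example(Store* store) {        // `store`: an already-initialized Store*
    SynonymIndex syn_index(store);

    // root "smart phone" maps one way to "smartphone" and "mobile"
    synonym_t one_way("phone-syn", {"smart", "phone"}, {{"smartphone"}, {"mobile"}});
    syn_index.add_synonym("products", one_way);

    std::vector<std::vector<std::string>> results;
    syn_index.synonym_reduction({"buy", "smart", "phone"}, results);

    // The two-token window {"smart", "phone"} matches the stored root hash, so
    // `results` holds {"buy", "smartphone"} and {"buy", "mobile"}. The original
    // token sequence is not appended because a substitution was made.
}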


@ -2254,7 +2254,6 @@ TEST_F(CollectionFilteringTest, ExcludeMultipleTokens) {
{"title"}, "",
{}, {}, {0}, 10, 1, FREQUENCY).get();
LOG(INFO) << results;
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());


@ -300,17 +300,17 @@ TEST_F(CollectionGroupingTest, GroupingWithMultiFieldRelevance) {
ASSERT_STREQ("pop", results["grouped_hits"][0]["group_key"][0].get<std::string>().c_str());
ASSERT_EQ(2, results["grouped_hits"][0]["hits"].size());
ASSERT_STREQ("1", results["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("4", results["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("2", results["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("rock", results["grouped_hits"][1]["group_key"][0].get<std::string>().c_str());
ASSERT_EQ(2, results["grouped_hits"][1]["hits"].size());
ASSERT_STREQ("5", results["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("6", results["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("country", results["grouped_hits"][2]["group_key"][0].get<std::string>().c_str());
ASSERT_EQ(2, results["grouped_hits"][2]["hits"].size());
ASSERT_STREQ("8", results["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("3", results["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("3", results["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("8", results["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}


@ -170,7 +170,7 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiText) {
ASSERT_EQ("<mark>ลง</mark>ที่นั่นโดย<mark>รถไฟ</mark>", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
results = coll1->search("ลงรถไฟ downie",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());


@ -1040,12 +1040,12 @@ TEST_F(CollectionOverrideTest, DynamicFilteringExactMatchBasics) {
// should not apply filter for non-exact case
results = coll1->search("running shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(3, results["hits"].size());
results = coll1->search("adidas shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
@ -1200,7 +1200,7 @@ TEST_F(CollectionOverrideTest, DynamicFilteringMultiplePlaceholders) {
// not an exact match of rule (because of "light") so all results will be fetched, not just Air Jordan brand
auto results = coll1->search("Nike Air Jordan light yellow shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(3, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
@ -1209,7 +1209,7 @@ TEST_F(CollectionOverrideTest, DynamicFilteringMultiplePlaceholders) {
// query with tokens at the start that preceding the placeholders in the rule
results = coll1->search("New Nike Air Jordan yellow shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
@ -1381,16 +1381,16 @@ TEST_F(CollectionOverrideTest, DynamicFilteringWithNumericalFilter) {
// should not match the defined override
results = coll1->search("running adidas shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(4, results["hits"].size());
ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("2", results["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][2]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("2", results["hits"][2]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][3]["document"]["id"].get<std::string>());
results = coll1->search("adidas", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
@ -1475,14 +1475,16 @@ TEST_F(CollectionOverrideTest, DynamicFilteringExactMatch) {
ASSERT_EQ(4, results["hits"].size());
results = coll1->search("popular nike running shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(4, results["hits"].size());
results = coll1->search("popular nike shoes running", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(4, results["hits"].size());
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("2", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("3", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}


@ -293,6 +293,7 @@ TEST_F(CollectionSpecificTest, OrderMultiFieldFuzzyMatch) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 1}).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
@ -304,12 +305,50 @@ TEST_F(CollectionSpecificTest, OrderMultiFieldFuzzyMatch) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {2, 1}).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, TypoBeforeDropTokens) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["title"] = "Josh Wexler";
doc1["points"] = 500;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["title"] = "Josh Lipson";
doc2["points"] = 100;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
auto results = coll1->search("Josh Lixson", {"title"}, "", {}, {}, {2}, 10,
1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
results = coll1->search("Josh Lixson", {"title"}, "", {}, {}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, FieldWeighting) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("description", field_types::STRING, false),
@ -338,6 +377,7 @@ TEST_F(CollectionSpecificTest, FieldWeighting) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 4}).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
@ -372,10 +412,8 @@ TEST_F(CollectionSpecificTest, MultiFieldArrayRepeatingTokens) {
auto results = coll1->search("rv345 cisco 18", {"title", "description", "attrs"}, "", {}, {}, {1}, 10,
1, FREQUENCY, {true, true, true}).get();
LOG(INFO) << results;
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
@ -629,7 +667,7 @@ TEST_F(CollectionSpecificTest, DeleteOverridesAndSynonymsOnDiskDuringCollDrop) {
ASSERT_TRUE(stored_values.empty());
// synonyms should also have been deleted from the store
store->scan_fill(Collection::COLLECTION_SYNONYM_PREFIX, stored_values);
store->scan_fill(SynonymIndex::COLLECTION_SYNONYM_PREFIX, stored_values);
ASSERT_TRUE(stored_values.empty());
}
@ -877,16 +915,39 @@ TEST_F(CollectionSpecificTest, HighlightWithDropTokensAndPrefixSearch) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 1, {}, {}, {}, 0,
"<mark>", "</mark>").get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ(2, results["hits"][0]["highlights"].size());
ASSERT_EQ("<mark>Pandaabear</mark> <mark>Bas</mark>ic",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("<mark>Pandaabear</mark>",
results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
results = coll1->search("pandaabear bas", {"username", "name"},
"", {}, {}, {2, 2}, 10,
1, FREQUENCY, {true, true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 1, {}, {}, {}, 0,
"<mark>", "</mark>").get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ(2, results["hits"][1]["highlights"].size());
ASSERT_EQ(2, results["hits"][0]["highlights"].size());
ASSERT_EQ("<mark>Pandaabear</mark> <mark>Bas</mark>ic",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("<mark>Pandaabear</mark>",
results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
ASSERT_EQ(2, results["hits"][1]["highlights"].size());
ASSERT_EQ("<mark>Pandaabear</mark>",
results["hits"][1]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("Panda's <mark>Basement</mark>",
ASSERT_EQ("Panda's <mark>Bas</mark>ement",
results["hits"][1]["highlights"][1]["snippet"].get<std::string>());
results = coll1->search("pandaabear bas", {"username", "tags"},
@ -2534,6 +2595,15 @@ TEST_F(CollectionSpecificTest, DropTokensTillOneToken) {
auto results = coll1->search("harry malcolm roscow", {"title"},
"", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("2", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
// with drop tokens threshold of 1
results = coll1->search("harry malcolm roscow", {"title"},
"", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 1).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

View File

@ -329,8 +329,7 @@ TEST_F(CollectionSynonymsTest, OneWaySynonym) {
ASSERT_EQ(1, res["found"].get<uint32_t>());
}
TEST_F(CollectionSynonymsTest, VerbatimMatchShouldConsiderTokensMatchedAcrossAllFieldsWithSynonyms) {
TEST_F(CollectionSynonymsTest, SynonymQueryVariantWithDropTokens) {
std::vector<field> fields = {field("category", field_types::STRING_ARRAY, false),
field("location", field_types::STRING, false),
field("points", field_types::INT32, false),};
@ -377,9 +376,56 @@ TEST_F(CollectionSynonymsTest, VerbatimMatchShouldConsiderTokensMatchedAcrossAll
ASSERT_EQ(3, res["hits"].size());
// NOTE: "2" is ranked above "1" because synonym matches uses the root query's number of tokens for counting
// This means that "united states" == "us" so a single token match, same as match on "sneakers" in record 2.
ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", res["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("2", res["hits"][2]["document"]["id"].get<std::string>());
ASSERT_EQ("2", res["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("1", res["hits"][2]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
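// Worked example of the counting described in the NOTE above (the query is assumed to use the
// single-token side, "us"): expanding "us" into the synonym "united states" still credits a
// matching record with one token, because the root query contributed one token. That puts it
// on par with a record matching the single token "sneakers", so the remaining ranking criteria
// break the tie and "2" lands above "1".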
TEST_F(CollectionSynonymsTest, SynonymsTextMatchSameAsRootQuery) {
std::vector<field> fields = {field("name", field_types::STRING, false),
field("title", field_types::STRING, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
nlohmann::json syn_json = {
{"id", "syn-1"},
{"root", "ceo"},
{"synonyms", {"chief executive officer"} }
};
synonym_t synonym;
auto syn_op = synonym_t::parse(syn_json, synonym);
ASSERT_TRUE(syn_op.ok());
coll1->add_synonym(synonym);
nlohmann::json doc1;
doc1["id"] = "0";
doc1["name"] = "Dan Fisher";
doc1["title"] = "Chief Executive Officer";
doc1["points"] = 10;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["name"] = "Jack Sparrow";
doc2["title"] = "CEO";
doc2["points"] = 20;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
auto res = coll1->search("ceo", {"name", "title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(2, res["hits"].size());
ASSERT_EQ("1", res["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", res["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ(res["hits"][1]["text_match"].get<size_t>(), res["hits"][0]["text_match"].get<size_t>());
collectionManager.drop_collection("coll1");
}
@ -426,8 +472,8 @@ TEST_F(CollectionSynonymsTest, MultiWaySynonym) {
ASSERT_EQ(2, res["hits"].size());
ASSERT_EQ(2, res["found"].get<uint32_t>());
ASSERT_STREQ("<mark>Samuel</mark> L. <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> L. <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
// for now we don't support synonyms on ANY prefix

View File

@ -285,8 +285,8 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringMultiTokenSearch) {
// with 2 indexed words
results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, sort_fields, {1}, 10).get();
ASSERT_EQ(1, results["hits"].size());
ids = {"2"};
ASSERT_EQ(2, results["hits"].size());
ids = {"2", "17"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
@ -295,7 +295,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringMultiTokenSearch) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
// exhaustive search should throw more results
// exhaustive search should give the same results
results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, sort_fields, {1}, 10,
1, FREQUENCY, {true},
1, spp::sparse_hash_set<std::string>(),
@ -318,7 +318,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringMultiTokenSearch) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10).get();
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(9, results["hits"].size());
results.clear();
results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
@ -350,9 +350,9 @@ TEST_F(CollectionTest, PartialMultiTokenSearch) {
std::vector<std::string> facets;
nlohmann::json results = collection->search("rocket research", query_fields, "", facets,
sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(4, results["hits"].size());
ASSERT_EQ(6, results["hits"].size());
std::vector<std::string> ids = {"1", "8", "16", "17"};
std::vector<std::string> ids = {"19", "1", "10", "8", "16", "17"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
@ -3026,7 +3026,8 @@ TEST_F(CollectionTest, MultiFieldRelevance) {
}
auto results = coll1->search("Dustin Kensrue Down There by the Train",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
{true}, 10).get();
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_EQ(3, results["hits"].size());
@ -3069,7 +3070,8 @@ TEST_F(CollectionTest, MultiFieldRelevance) {
}
results = coll1->search("Dustin Kensrue Down There by the Train",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
{true}, 10).get();
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_EQ(3, results["hits"].size());
@ -3082,7 +3084,8 @@ TEST_F(CollectionTest, MultiFieldRelevance) {
// with exclude token syntax
results = coll1->search("-downie dustin kensrue down there by the train",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
{true}, 10).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());
@ -3125,7 +3128,7 @@ TEST_F(CollectionTest, MultiFieldRelevance2) {
}
auto results = coll1->search("on a jetplane",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());
@ -3479,11 +3482,12 @@ TEST_F(CollectionTest, ExactMatch) {
{"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY,
{true}, 10).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_EQ(3, results["hits"].size());
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][2]["document"]["id"].get<std::string>().c_str());
results = coll1->search("alpha", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10).get();

View File

@ -12,7 +12,7 @@ TEST(IndexTest, ScrubReindexDoc) {
ThreadPool pool(4);
Index index("index", 1, nullptr, &pool, search_schema, {}, {});
Index index("index", 1, nullptr, nullptr, &pool, search_schema, {}, {});
nlohmann::json old_doc;
old_doc["id"] = "1";
old_doc["title"] = "One more thing.";