Allow maximum hits returned to be configurable.

Raising the limit has a performance cost, but that may be acceptable for most users, so the trade-off is now left to their discretion. The default of 500 results is kept for backward compatibility.
This commit is contained in:
kishorenc 2020-02-10 20:54:38 +05:30
parent b899b86a96
commit fd285b6fbe
7 changed files with 50 additions and 40 deletions

View File

@ -211,7 +211,7 @@ public:
const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD,
const spp::sparse_hash_set<std::string> include_fields = spp::sparse_hash_set<std::string>(),
const spp::sparse_hash_set<std::string> exclude_fields = spp::sparse_hash_set<std::string>(),
const size_t max_facet_values=10);
const size_t max_facet_values=10, size_t max_hits=512);
Option<nlohmann::json> get(const std::string & id);
@ -234,11 +234,10 @@ public:
void par_index_in_memory(std::vector<std::vector<index_record>> & iter_batch,
batch_index_result & result);
static void prune_document(nlohmann::json &document, const spp::sparse_hash_set<std::string> include_fields,
const spp::sparse_hash_set<std::string> exclude_fields);
static void prune_document(nlohmann::json &document, const spp::sparse_hash_set<std::string> & include_fields,
const spp::sparse_hash_set<std::string> & exclude_fields);
static const int MAX_SEARCH_TOKENS = 10;
static const int MAX_RESULTS = 500;
// strings under this length will be fully highlighted, instead of showing a snippet of relevant portion
enum {SNIPPET_STR_ABOVE_LEN = 30};

View File

@ -31,6 +31,7 @@ struct search_args {
std::vector<sort_by> sort_fields_std;
int num_typos;
size_t max_facet_values;
size_t max_hits;
size_t per_page;
size_t page;
token_ordering token_order;
@ -49,10 +50,11 @@ struct search_args {
search_args(std::string query, std::vector<std::string> search_fields, std::vector<filter> filters,
std::vector<facet> facets, std::vector<uint32_t> included_ids, std::vector<uint32_t> excluded_ids,
std::vector<sort_by> sort_fields_std, int num_typos, size_t max_facet_values,
size_t per_page, size_t page, token_ordering token_order, bool prefix, size_t drop_tokens_threshold):
size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix,
size_t drop_tokens_threshold):
query(query), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids),
excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), num_typos(num_typos),
max_facet_values(max_facet_values), per_page(per_page),
max_facet_values(max_facet_values), max_hits(max_hits), per_page(per_page),
page(page), token_order(token_order), prefix(prefix), drop_tokens_threshold(drop_tokens_threshold),
all_result_ids_len(0), outcome(0) {
@ -146,15 +148,15 @@ private:
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
const int num_typos, const size_t num_results,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster<512> & topster, uint32_t** all_result_ids,
Topster & topster, uint32_t** all_result_ids,
size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY,
const bool prefix = false, const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD);
void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
const token_ordering token_order, std::vector<std::vector<art_leaf*>> & searched_queries,
Topster<512> & topster, uint32_t** all_result_ids,
size_t & all_result_ids_len, const size_t & max_results, const bool prefix);
Topster & topster, uint32_t** all_result_ids,
size_t & all_result_ids_len, const size_t & max_results);
void insert_doc(const uint32_t score, art_tree *t, uint32_t seq_id,
const std::unordered_map<std::string, std::vector<uint32_t>> &token_to_offsets) const;
@ -186,7 +188,7 @@ private:
void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster<512> & curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
Topster & curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
public:
Index() = delete;
@ -202,7 +204,7 @@ public:
const std::vector<filter> & filters, std::vector<facet> & facets,
const std::vector<uint32_t> & included_ids, const std::vector<uint32_t> & excluded_ids,
const std::vector<sort_by> & sort_fields_std, const int num_typos,
const size_t per_page, const size_t page, const token_ordering token_order,
const size_t max_hits, const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold, std::vector<KV> & raw_result_kvs,
size_t & all_result_ids_len, std::vector<std::vector<art_leaf*>> & searched_queries,
std::vector<KV> & override_result_kvs);
@ -217,7 +219,7 @@ public:
std::vector<std::vector<std::vector<uint16_t>>> &array_token_positions);
void score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index, const uint8_t & field_id,
const uint32_t total_cost, Topster<512> &topster, const std::vector<art_leaf *> & query_suggestion,
const uint32_t total_cost, Topster &topster, const std::vector<art_leaf *> & query_suggestion,
const uint32_t *result_ids, const size_t result_size) const;
static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);

View File

@ -21,19 +21,20 @@ struct KV {
/*
* Remembers the max-K elements seen so far using a min-heap
*/
template <size_t MAX_SIZE=512>
struct Topster {
const uint32_t MAX_SIZE;
KV *data;
uint32_t size;
spp::sparse_hash_map<uint64_t, KV*> keys;
KV *kvs[MAX_SIZE];
KV* *kvs;
Topster(): size(0){
data = new KV[MAX_SIZE];
explicit Topster(size_t capacity): MAX_SIZE(capacity), size(0) {
kvs = new KV*[capacity];
data = new KV[capacity];
for(size_t i=0; i<MAX_SIZE; i++) {
for(size_t i=0; i<capacity; i++) {
data[i].field_id = 0;
data[i].query_index = 0;
data[i].key = 0;
@ -185,7 +186,7 @@ struct Topster {
// topster must be sorted before being iterated upon to remove dead array entries
void sort() {
std::stable_sort(std::begin(kvs), std::begin(kvs) + size, is_greater_kv);
std::stable_sort(kvs, kvs+size, is_greater_kv);
}
void clear(){

View File

@ -264,8 +264,8 @@ void Collection::par_index_in_memory(std::vector<std::vector<index_record>> & it
}
}
void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash_set<std::string> include_fields,
const spp::sparse_hash_set<std::string> exclude_fields) {
void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash_set<std::string>& include_fields,
const spp::sparse_hash_set<std::string>& exclude_fields) {
auto it = document.begin();
for(; it != document.end(); ) {
if(exclude_fields.count(it.key()) != 0 || (include_fields.size() != 0 && include_fields.count(it.key()) == 0)) {
@ -311,7 +311,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
const size_t drop_tokens_threshold,
const spp::sparse_hash_set<std::string> include_fields,
const spp::sparse_hash_set<std::string> exclude_fields,
const size_t max_facet_values) {
const size_t max_facet_values, const size_t max_hits) {
std::vector<uint32_t> included_ids;
std::vector<uint32_t> excluded_ids;
@ -492,8 +492,8 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
const size_t num_results = (page * per_page);
if(num_results > MAX_RESULTS) {
std::string message = "Only the first " + std::to_string(MAX_RESULTS) + " results are available.";
if(num_results > max_hits) {
std::string message = "Only the first " + std::to_string(max_hits) + " results are available.";
return Option<nlohmann::json>(422, message);
}
@ -508,8 +508,8 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
for(Index* index: indices) {
index->search_params = search_args(query, search_fields, filters, facets,
index_to_included_ids[index_id], index_to_excluded_ids[index_id],
sort_fields_std, num_typos, max_facet_values, per_page, page,
token_order, prefix, drop_tokens_threshold);
sort_fields_std, num_typos, max_facet_values, max_hits,
per_page, page, token_order, prefix, drop_tokens_threshold);
{
std::lock_guard<std::mutex> lk(index->m);
index->ready = true;
@ -577,7 +577,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
}
// All fields are sorted descending
std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster<>::is_greater_kv_value);
std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_value);
// Sort based on position in overridden list
std::sort(

View File

@ -176,6 +176,7 @@ void get_search(http_req & req, http_res & res) {
const char *FACET_BY = "facet_by";
const char *MAX_FACET_VALUES = "max_facet_values";
const char *MAX_HITS = "max_hits";
const char *PER_PAGE = "per_page";
const char *PAGE = "page";
const char *CALLBACK = "callback";
@ -207,6 +208,10 @@ void get_search(http_req & req, http_res & res) {
req.params[MAX_FACET_VALUES] = "10";
}
if(req.params.count(MAX_HITS) == 0) {
req.params[MAX_HITS] = "500";
}
if(req.params.count(PER_PAGE) == 0) {
req.params[PER_PAGE] = "10";
}
@ -301,7 +306,8 @@ void get_search(http_req & req, http_res & res) {
static_cast<size_t>(std::stoi(req.params[PAGE])),
token_order, prefix, drop_tokens_threshold,
include_fields, exclude_fields,
static_cast<size_t>(std::stoi(req.params[MAX_FACET_VALUES])));
static_cast<size_t>(std::stoi(req.params[MAX_FACET_VALUES])),
static_cast<size_t>(std::stoi(req.params[MAX_HITS])));
uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - begin).count();

View File

@ -559,9 +559,9 @@ void Index::drop_facets(std::vector<facet> & facets, const std::vector<uint32_t>
void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
const std::vector<sort_by> & sort_fields,
std::vector<token_candidates> & token_candidates_vec, const token_ordering token_order,
std::vector<std::vector<art_leaf*>> & searched_queries, Topster<512> & topster,
std::vector<std::vector<art_leaf*>> & searched_queries, Topster & topster,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const size_t & max_results, const bool prefix) {
const size_t & max_results) {
const long long combination_limit = 10;
auto product = []( long long a, token_candidates & b ) { return a*b.candidates.size(); };
@ -795,7 +795,7 @@ void Index::run_search() {
search(search_params.outcome, search_params.query, search_params.search_fields,
search_params.filters, search_params.facets, search_params.included_ids,
search_params.excluded_ids, search_params.sort_fields_std, search_params.num_typos,
search_params.per_page, search_params.page, search_params.token_order,
search_params.max_hits, search_params.per_page, search_params.page, search_params.token_order,
search_params.prefix, search_params.drop_tokens_threshold, search_params.raw_result_kvs,
search_params.all_result_ids_len, search_params.searched_queries, search_params.override_result_kvs);
@ -811,7 +811,7 @@ void Index::run_search() {
void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster<512> & curated_topster,
Topster & curated_topster,
std::vector<std::vector<art_leaf*>> & searched_queries) {
if(included_ids.size() == 0) {
@ -873,11 +873,13 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f
}
}
void Index::search(Option<uint32_t> & outcome, std::string query, const std::vector<std::string> & search_fields,
void Index::search(Option<uint32_t> & outcome,
std::string query,
const std::vector<std::string> & search_fields,
const std::vector<filter> & filters, std::vector<facet> & facets,
const std::vector<uint32_t> & included_ids,
const std::vector<uint32_t> & excluded_ids,
const std::vector<sort_by> & sort_fields_std, const int num_typos,
const std::vector<sort_by> & sort_fields_std, const int num_typos, const size_t max_hits,
const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold,
std::vector<KV> & raw_result_kvs,
@ -902,8 +904,8 @@ void Index::search(Option<uint32_t> & outcome, std::string query, const std::vec
//auto begin = std::chrono::high_resolution_clock::now();
uint32_t* all_result_ids = nullptr;
Topster<512> topster;
Topster<512> curated_topster;
Topster topster(max_hits);
Topster curated_topster(max_hits);
if(query == "*") {
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
@ -986,7 +988,7 @@ void Index::search_field(const uint8_t & field_id, std::string & query, const st
uint32_t *filter_ids, size_t filter_ids_length,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
const size_t num_results, std::vector<std::vector<art_leaf*>> & searched_queries,
Topster<512> &topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
Topster & topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold) {
std::vector<std::string> tokens;
StringUtils::split(query, tokens, " ");
@ -1103,7 +1105,7 @@ void Index::search_field(const uint8_t & field_id, std::string & query, const st
// If all tokens were found, go ahead and search for candidates with what we have so far
search_candidates(field_id, filter_ids, filter_ids_length, facets, sort_fields, token_candidates_vec,
token_order, searched_queries, topster, all_result_ids, all_result_ids_len,
Index::SEARCH_LIMIT_NUM, prefix);
Index::SEARCH_LIMIT_NUM);
if (all_result_ids_len >= Index::SEARCH_LIMIT_NUM) {
// If we don't find enough results, we continue outerloop (looking at tokens with greater cost)
@ -1154,7 +1156,7 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect
}
void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index, const uint8_t & field_id,
const uint32_t total_cost, Topster<512> & topster,
const uint32_t total_cost, Topster & topster,
const std::vector<art_leaf *> &query_suggestion,
const uint32_t *result_ids, const size_t result_size) const {

View File

@ -3,7 +3,7 @@
#include "match_score.h"
TEST(TopsterTest, MaxIntValues) {
Topster<5> topster;
Topster topster(5);
struct {
uint8_t field_id;
@ -52,7 +52,7 @@ TEST(TopsterTest, MaxIntValues) {
}
TEST(TopsterTest, MaxFloatValues) {
Topster<5> topster;
Topster topster(5);
struct {
uint8_t field_id;