Mirror of https://github.com/typesense/typesense.git (synced 2025-05-17 20:22:32 +08:00)
Merge branch 'v0.24-nested' into v0.25

# Conflicts:
#	include/collection.h
#	src/collection.cpp
#	src/collection_manager.cpp
#	test/collection_faceting_test.cpp

This commit is contained in:
commit b8b9fb20b3
@@ -409,6 +409,8 @@ public:
                                       const size_t filter_curated_hits_option = 2,
                                       const bool prioritize_token_position = false,
                                       const std::string& vector_query_str = "",
+                                      const bool enable_highlight_v1 = true,
+                                      const uint64_t search_time_start_us = 0,
                                       const size_t facet_sample_percent = 100,
                                       const size_t facet_sample_threshold = 0) const;
@@ -177,12 +177,11 @@ public:

     static Option<bool> do_search(std::map<std::string, std::string>& req_params,
                                   nlohmann::json& embedded_params,
-                                  std::string& results_json_str);
+                                  std::string& results_json_str,
+                                  uint64_t start_ts);

     static bool parse_sort_by_str(std::string sort_by_str, std::vector<sort_by>& sort_fields);

-    static bool parse_vector_query_str(std::string vector_query_str, vector_query_t& vector_query);
-
     // symlinks
     Option<std::string> resolve_symlink(const std::string & symlink_name) const;
@@ -59,6 +59,8 @@ private:

     std::atomic<bool> skip_writes;

+    std::atomic<int> log_slow_searches_time_ms;
+
 protected:

     Config() {
@@ -80,6 +82,7 @@ protected:
         this->disk_used_max_percentage = 100;
         this->memory_used_max_percentage = 100;
         this->skip_writes = false;
+        this->log_slow_searches_time_ms = 30 * 1000;
     }

     Config(Config const&) {
@@ -142,6 +145,10 @@ public:
         this->log_slow_requests_time_ms = log_slow_requests_time_ms;
     }

+    void set_log_slow_searches_time_ms(int log_slow_searches_time_ms) {
+        this->log_slow_searches_time_ms = log_slow_searches_time_ms;
+    }
+
     void set_healthy_read_lag(size_t healthy_read_lag) {
         this->healthy_read_lag = healthy_read_lag;
     }
@@ -245,6 +252,10 @@ public:
         return this->log_slow_requests_time_ms;
     }

+    int get_log_slow_searches_time_ms() const {
+        return this->log_slow_searches_time_ms;
+    }
+
     size_t get_num_collections_parallel_load() const {
         return this->num_collections_parallel_load;
     }
@@ -364,6 +375,10 @@ public:
             this->log_slow_requests_time_ms = std::stoi(get_env("TYPESENSE_LOG_SLOW_REQUESTS_TIME_MS"));
         }

+        if(!get_env("TYPESENSE_LOG_SLOW_SEARCHES_TIME_MS").empty()) {
+            this->log_slow_searches_time_ms = std::stoi(get_env("TYPESENSE_LOG_SLOW_SEARCHES_TIME_MS"));
+        }
+
         if(!get_env("TYPESENSE_NUM_COLLECTIONS_PARALLEL_LOAD").empty()) {
             this->num_collections_parallel_load = std::stoi(get_env("TYPESENSE_NUM_COLLECTIONS_PARALLEL_LOAD"));
         }
@@ -513,6 +528,10 @@ public:
             this->log_slow_requests_time_ms = (int) reader.GetInteger("server", "log-slow-requests-time-ms", -1);
         }

+        if(reader.Exists("server", "log-slow-searches-time-ms")) {
+            this->log_slow_searches_time_ms = (int) reader.GetInteger("server", "log-slow-searches-time-ms", 30*1000);
+        }
+
         if(reader.Exists("server", "num-collections-parallel-load")) {
             this->num_collections_parallel_load = (int) reader.GetInteger("server", "num-collections-parallel-load", 0);
         }
@@ -643,6 +662,10 @@ public:
             this->log_slow_requests_time_ms = options.get<int>("log-slow-requests-time-ms");
         }

+        if(options.exist("log-slow-searches-time-ms")) {
+            this->log_slow_searches_time_ms = options.get<int>("log-slow-searches-time-ms");
+        }
+
         if(options.exist("num-collections-parallel-load")) {
             this->num_collections_parallel_load = options.get<uint32_t>("num-collections-parallel-load");
         }
@@ -24,6 +24,7 @@ struct export_state_t: public req_state_t {
     std::vector<size_t> offsets;
     std::set<std::string> include_fields;
     std::set<std::string> exclude_fields;
+    size_t export_batch_size = 100;
     std::string* res_body;

     bool filtered_export = false;
@@ -609,20 +609,6 @@ struct sort_by {
     }
 };

-struct vector_query_t {
-    std::string field_name;
-    size_t k = 0;
-    size_t flat_search_cutoff = 0;
-    std::vector<float> values;
-
-    void _reset() {
-        // used for testing only
-        field_name.clear();
-        k = 0;
-        values.clear();
-    }
-};
-
 class GeoPoint {
     constexpr static const double EARTH_RADIUS = 3958.75;
     constexpr static const double METER_CONVERT = 1609.00;
@@ -261,11 +261,13 @@ struct http_req {
                 chunk_len(0), body(body), body_index(0), data(nullptr), ready(false),
                 log_index(0), is_diposed(false), client_ip(client_ip) {

-        start_ts = std::chrono::duration_cast<std::chrono::microseconds>(
-                std::chrono::system_clock::now().time_since_epoch()).count();
-
         if(_req != nullptr) {
             const auto& tv = _req->processed_at.at;
             start_ts = (tv.tv_sec * 1000 * 1000) + tv.tv_usec;
             is_http_v1 = (_req->version < 0x200);
+        } else {
+            start_ts = std::chrono::duration_cast<std::chrono::microseconds>(
+                    std::chrono::system_clock::now().time_since_epoch()).count();
         }
     }
@@ -279,21 +281,40 @@ struct http_req {
                 std::chrono::system_clock::now().time_since_epoch()).count();
         uint64_t ms_since_start = (now - start_ts) / 1000;

-        std::string metric_identifier = http_method + " " + path_without_query;
+        const std::string metric_identifier = http_method + " " + path_without_query;
         AppMetrics::get_instance().increment_duration(metric_identifier, ms_since_start);
         AppMetrics::get_instance().increment_write_metrics(route_hash, ms_since_start);

-        if(config.get_log_slow_requests_time_ms() >= 0 && int(ms_since_start) >= config.get_log_slow_requests_time_ms()) {
+        bool log_slow_searches = config.get_log_slow_searches_time_ms() >= 0 &&
+                                 int(ms_since_start) >= config.get_log_slow_searches_time_ms() &&
+                                 (path_without_query == "/multi_search" ||
+                                  StringUtils::ends_with(path_without_query, "/documents/search"));
+
+        bool log_slow_requests = config.get_log_slow_requests_time_ms() >= 0 &&
+                                 int(ms_since_start) >= config.get_log_slow_requests_time_ms();
+
+        if(log_slow_searches || log_slow_requests) {
             // log slow request if logging is enabled
             std::string query_string = "?";
-            for(const auto& kv: params) {
-                if(kv.first != AUTH_HEADER) {
-                    query_string += kv.first + "=" + kv.second + "&";
+            bool is_multi_search_query = (path_without_query == "/multi_search");
+
+            if(is_multi_search_query) {
+                StringUtils::erase_char(body, '\n');
+            } else {
+                // ignore params map of multi_search since it is mutated for every search object in the POST body
+                for(const auto& kv: params) {
+                    if(kv.first != AUTH_HEADER) {
+                        query_string += kv.first + "=" + kv.second + "&";
+                    }
                 }
             }

             std::string full_url_path = metric_identifier + query_string;
-            LOG(INFO) << "SLOW REQUEST: " << "(" + std::to_string(ms_since_start) + " ms) "
-                      << client_ip << " " << full_url_path;
+
+            // NOTE: we log the `body` ONLY for multi-search query
+            LOG(INFO) << "event=slow_request, time=" << ms_since_start << " ms"
+                      << ", client_ip=" << client_ip << ", endpoint=" << full_url_path
+                      << ", body=" << (is_multi_search_query ? body : "");
         }
     }
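The hunk above splits slow-request logging into two independent thresholds: log_slow_requests_time_ms still applies to every endpoint, while the new log_slow_searches_time_ms applies only to /multi_search and */documents/search paths. A minimal sketch of how the two thresholds interact (the threshold values here are illustrative, and how the Config instance is obtained is outside this diff):

    // `config` is the server's Config instance
    config.set_log_slow_requests_time_ms(2000);   // any request slower than 2s is logged
    config.set_log_slow_searches_time_ms(500);    // search endpoints get a tighter 500ms threshold
    // an 800ms POST /multi_search is now logged as a slow search;
    // an 800ms request to a non-search endpoint is not logged at all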
@@ -27,6 +27,7 @@
 #include "id_list.h"
 #include "synonym_index.h"
 #include "override.h"
+#include "vector_query_ops.h"
 #include "hnswlib/hnswlib.h"

 static constexpr size_t ARRAY_FACET_DIM = 4;
@@ -68,9 +68,8 @@ bool or_iterator_t::intersect(std::vector<or_iterator_t>& its, result_iter_state_t& istate,

     while(its.size() == it_size && its[0].valid()) {
         num_processed++;
-        if (num_processed % 65536 == 0 &&
-            std::chrono::duration_cast<std::chrono::milliseconds>(
-                    std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) {
+        if (num_processed % 65536 == 0 && (std::chrono::duration_cast<std::chrono::microseconds>(
+            std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) {
             search_cutoff = true;
             break;
         }
@@ -100,9 +99,8 @@ bool or_iterator_t::intersect(std::vector<or_iterator_t>& its, result_iter_state_t& istate,

     while(its.size() == it_size && !at_end2(its)) {
         num_processed++;
-        if (num_processed % 65536 == 0 &&
-            std::chrono::duration_cast<std::chrono::milliseconds>(
-                    std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) {
+        if (num_processed % 65536 == 0 && (std::chrono::duration_cast<std::chrono::microseconds>(
+            std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) {
             search_cutoff = true;
             break;
         }
@@ -138,9 +136,8 @@ bool or_iterator_t::intersect(std::vector<or_iterator_t>& its, result_iter_state_t& istate,

     while(its.size() == it_size && !at_end(its)) {
         num_processed++;
-        if (num_processed % 65536 == 0 &&
-            std::chrono::duration_cast<std::chrono::milliseconds>(
-                    std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) {
+        if (num_processed % 65536 == 0 && (std::chrono::duration_cast<std::chrono::microseconds>(
+            std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) {
             search_cutoff = true;
             break;
         }
@@ -211,9 +211,8 @@ bool posting_list_t::block_intersect(std::vector<posting_list_t::iterator_t>& its,
         case 1:
             while(its[0].valid()) {
                 num_processed++;
-                if (num_processed % 65536 == 0 &&
-                    std::chrono::duration_cast<std::chrono::milliseconds>(
-                            std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) {
+                if (num_processed % 65536 == 0 && (std::chrono::duration_cast<std::chrono::microseconds>(
+                    std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) {
                     search_cutoff = true;
                     break;
                 }
@@ -228,9 +227,8 @@ bool posting_list_t::block_intersect(std::vector<posting_list_t::iterator_t>& its,
         case 2:
             while(!at_end2(its)) {
                 num_processed++;
-                if (num_processed % 65536 == 0 &&
-                    std::chrono::duration_cast<std::chrono::milliseconds>(
-                            std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) {
+                if (num_processed % 65536 == 0 && (std::chrono::duration_cast<std::chrono::microseconds>(
+                    std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) {
                     search_cutoff = true;
                     break;
                 }
@@ -249,9 +247,8 @@ bool posting_list_t::block_intersect(std::vector<posting_list_t::iterator_t>& its,
         default:
             while(!at_end(its)) {
                 num_processed++;
-                if (num_processed % 65536 == 0 &&
-                    std::chrono::duration_cast<std::chrono::milliseconds>(
-                            std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) {
+                if (num_processed % 65536 == 0 && (std::chrono::duration_cast<std::chrono::microseconds>(
+                    std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) {
                     search_cutoff = true;
                     break;
                 }
@@ -366,6 +366,8 @@ struct StringUtils {
     static void replace_all(std::string& subject, const std::string& search,
                             const std::string& replace);

+    static void erase_char(std::string& str, const char c);
+
     static std::string trim_curly_spaces(const std::string& str);

     static bool ends_with(std::string const &str, std::string const &ending);
@@ -4,6 +4,6 @@ extern thread_local int64_t write_log_index;

 // These are used for circuit breaking search requests
 // NOTE: if you fork off main search thread, care must be taken to initialize these from parent thread values
-extern thread_local std::chrono::high_resolution_clock::time_point search_begin;
-extern thread_local int64_t search_stop_ms;
+extern thread_local uint64_t search_begin_us;
+extern thread_local uint64_t search_stop_us;
 extern thread_local bool search_cutoff;
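Because search_begin_us and search_stop_us are thread_local, a worker thread spawned mid-search starts with zero-initialized copies; the NOTE above is why call sites snapshot the parent's values before forking, as the search_wildcard and search_infix hunks later in this diff do. A minimal sketch of the pattern (the thread pool is illustrative; the variable names match the declarations above):

    // capture the parent thread's values before handing work to another thread
    const auto parent_search_begin_us = search_begin_us;
    const auto parent_search_stop_us = search_stop_us;
    thread_pool->enqueue([parent_search_begin_us, parent_search_stop_us]() {
        search_begin_us = parent_search_begin_us;   // re-seed this thread's circuit-breaker clock
        search_stop_us = parent_search_stop_us;     // and its time budget
        // ... perform the partial search work ...
    });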
include/vector_query_ops.h (new file, 32 lines)
@@ -0,0 +1,32 @@
#pragma once

#include <string>
#include <vector>
#include "option.h"

class Collection;

struct vector_query_t {
    std::string field_name;
    size_t k = 0;
    size_t flat_search_cutoff = 0;
    std::vector<float> values;

    uint32_t seq_id = 0;
    bool query_doc_given = false;

    void _reset() {
        // used for testing only
        field_name.clear();
        k = 0;
        values.clear();
        seq_id = 0;
        query_doc_given = false;
    }
};

class VectorQueryOps {
public:
    static Option<bool> parse_vector_query_str(std::string vector_query_str, vector_query_t& vector_query,
                                               const Collection* coll);
};
@@ -394,7 +394,13 @@ bool AuthManager::add_item_to_params(std::map<std::string, std::string>& req_params,
         if(req_params.count(item.key()) == 0) {
             req_params[item.key()] = str_value;
         } else if(item.key() == "filter_by") {
-            req_params[item.key()] = "(" + req_params[item.key()] + ") && (" + str_value + ")";
+            if(!req_params[item.key()].empty() && !str_value.empty()) {
+                req_params[item.key()] = "(" + req_params[item.key()] + ") && (" + str_value + ")";
+            } else if(req_params[item.key()].empty() && !str_value.empty()) {
+                req_params[item.key()] = "(" + str_value + ")";
+            } else if(!req_params[item.key()].empty() && str_value.empty()) {
+                req_params[item.key()] = "(" + req_params[item.key()] + ")";
+            }
         } else if(overwrite) {
             req_params[item.key()] = str_value;
         }
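With the old one-liner, an empty embedded or request filter produced fragments like "() && (x)". The branches above only parenthesize the non-empty sides; for example (values illustrative):

    // merging a scoped-API-key filter into an existing request filter
    req_params["filter_by"] = "user_id: 100";   // already present in the request
    // str_value = "age: > 30"                  // embedded in the scoped key
    // result: "(user_id: 100) && (age: > 30)"
    // if req_params["filter_by"] were "", the result would be just "(age: > 30)"

The new CoreAPIUtilsTest cases near the end of this diff assert exactly this behavior.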
@@ -15,6 +15,7 @@
 #include "topster.h"
 #include "logger.h"
 #include "thread_local_vars.h"
+#include "vector_query_ops.h"

 const std::string override_t::MATCH_EXACT = "exact";
 const std::string override_t::MATCH_CONTAINS = "contains";
@@ -867,14 +868,18 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
                                           const size_t filter_curated_hits_option,
                                           const bool prioritize_token_position,
                                           const std::string& vector_query_str,
+                                          const bool enable_highlight_v1,
+                                          const uint64_t search_time_start_us,
                                           const size_t facet_sample_percent,
                                           const size_t facet_sample_threshold) const {

     std::shared_lock lock(mutex);

     // setup thread local vars
-    search_stop_ms = search_stop_millis;
-    search_begin = std::chrono::high_resolution_clock::now();
+    search_stop_us = search_stop_millis * 1000;
+    search_begin_us = (search_time_start_us != 0) ? search_time_start_us :
+                      std::chrono::duration_cast<std::chrono::microseconds>(
+                            std::chrono::system_clock::now().time_since_epoch()).count();
     search_cutoff = false;

     if(raw_query != "*" && raw_search_fields.empty()) {
@@ -927,8 +932,9 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
             return Option<nlohmann::json>(400, "Vector query is supported only on wildcard (q=*) searches.");
         }

-        if(!CollectionManager::parse_vector_query_str(vector_query_str, vector_query)) {
-            return Option<nlohmann::json>(400, "The `vector_query` parameter is malformed.");
+        auto parse_vector_op = VectorQueryOps::parse_vector_query_str(vector_query_str, vector_query, this);
+        if(!parse_vector_op.ok()) {
+            return Option<nlohmann::json>(400, parse_vector_op.error());
         }

         auto vector_field_it = search_schema.find(vector_query.field_name);
@@ -1491,7 +1497,11 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
         }

         nlohmann::json wrapper_doc;
-        wrapper_doc["highlights"] = nlohmann::json::array();
+
+        if(enable_highlight_v1) {
+            wrapper_doc["highlights"] = nlohmann::json::array();
+        }

         std::vector<highlight_t> highlights;
         StringUtils string_utils;
@@ -1562,34 +1572,36 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
             prune_doc(highlight_res, hfield_names, tsl::htrie_set<char>(), "");
         }

-        std::sort(highlights.begin(), highlights.end());
+        if(enable_highlight_v1) {
+            std::sort(highlights.begin(), highlights.end());

-        for(const auto & highlight: highlights) {
-            auto field_it = search_schema.find(highlight.field);
-            if(field_it == search_schema.end() || field_it->nested) {
-                // nested field highlighting will be available only in the new highlight structure.
-                continue;
-            }
-
-            nlohmann::json h_json = nlohmann::json::object();
-            h_json["field"] = highlight.field;
-
-            if(!highlight.indices.empty()) {
-                h_json["matched_tokens"] = highlight.matched_tokens;
-                h_json["indices"] = highlight.indices;
-                h_json["snippets"] = highlight.snippets;
-                if(!highlight.values.empty()) {
-                    h_json["values"] = highlight.values;
+            for(const auto & highlight: highlights) {
+                auto field_it = search_schema.find(highlight.field);
+                if(field_it == search_schema.end() || field_it->nested) {
+                    // nested field highlighting will be available only in the new highlight structure.
+                    continue;
                 }
-            } else {
-                h_json["matched_tokens"] = highlight.matched_tokens[0];
-                h_json["snippet"] = highlight.snippets[0];
-                if(!highlight.values.empty() && !highlight.values[0].empty()) {
-                    h_json["value"] = highlight.values[0];
+
+                nlohmann::json h_json = nlohmann::json::object();
+                h_json["field"] = highlight.field;
+
+                if(!highlight.indices.empty()) {
+                    h_json["matched_tokens"] = highlight.matched_tokens;
+                    h_json["indices"] = highlight.indices;
+                    h_json["snippets"] = highlight.snippets;
+                    if(!highlight.values.empty()) {
+                        h_json["values"] = highlight.values;
+                    }
+                } else {
+                    h_json["matched_tokens"] = highlight.matched_tokens[0];
+                    h_json["snippet"] = highlight.snippets[0];
+                    if(!highlight.values.empty() && !highlight.values[0].empty()) {
+                        h_json["value"] = highlight.values[0];
+                    }
                 }
-            }
-
-            wrapper_doc["highlights"].push_back(h_json);
+
+                wrapper_doc["highlights"].push_back(h_json);
+            }
         }

         //wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key;
@@ -1654,8 +1666,8 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query,
             facet_result["counts"] = nlohmann::json::array();

             std::vector<facet_value_t> facet_values;
-            std::vector<std::pair<int64_t, facet_count_t>> facet_hash_counts;
+            std::vector<std::pair<uint64_t, facet_count_t>> facet_hash_counts;

             for (const auto & kv : a_facet.result_map) {
                 facet_hash_counts.emplace_back(kv);
             }
@@ -630,7 +630,9 @@ Option<bool> add_unsigned_int_list_param(const std::string& param_name, const std::string& str_val,

 Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& req_params,
                                           nlohmann::json& embedded_params,
-                                          std::string& results_json_str) {
+                                          std::string& results_json_str,
+                                          uint64_t start_ts) {

     auto begin = std::chrono::high_resolution_clock::now();

     const char *NUM_TYPOS = "num_typos";
@@ -695,6 +697,8 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& req_params,
     const char *EXHAUSTIVE_SEARCH = "exhaustive_search";
     const char *SPLIT_JOIN_TOKENS = "split_join_tokens";

+    const char *ENABLE_HIGHLIGHT_V1 = "enable_highlight_v1";
+
     const char *FACET_SAMPLE_PERCENT = "facet_sample_percent";
     const char *FACET_SAMPLE_THRESHOLD = "facet_sample_threshold";
@@ -767,12 +771,13 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& req_params,
     size_t filter_curated_hits_option = 2;
     std::string highlight_fields;
     bool exhaustive_search = false;
-    size_t search_cutoff_ms = 3600000;
+    size_t search_cutoff_ms = 30 * 1000;
     enable_t split_join_tokens = fallback;
     size_t max_candidates = 0;
     std::vector<enable_t> infixes;
     size_t max_extra_prefix = INT16_MAX;
     size_t max_extra_suffix = INT16_MAX;
+    bool enable_highlight_v1 = true;

     size_t facet_sample_percent = 100;
     size_t facet_sample_threshold = 0;
@@ -817,6 +822,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& req_params,
         {PRE_SEGMENTED_QUERY, &pre_segmented_query},
         {EXHAUSTIVE_SEARCH, &exhaustive_search},
         {ENABLE_OVERRIDES, &enable_overrides},
+        {ENABLE_HIGHLIGHT_V1, &enable_highlight_v1},
     };

     std::unordered_map<std::string, std::vector<std::string>*> str_list_values = {
@@ -990,6 +996,8 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& req_params,
                                           filter_curated_hits_option,
                                           prioritize_token_position,
                                           vector_query,
+                                          enable_highlight_v1,
+                                          start_ts,
                                           facet_sample_percent,
                                           facet_sample_threshold
                                           );
@@ -1237,6 +1245,7 @@ Option<bool> CollectionManager::load_collection(const nlohmann::json &collection_meta,
     size_t num_found_docs = 0;
     size_t num_valid_docs = 0;
     size_t num_indexed_docs = 0;
+    size_t batch_doc_str_size = 0;

     auto begin = std::chrono::high_resolution_clock::now();
@@ -1245,14 +1254,17 @@ Option<bool> CollectionManager::load_collection(const nlohmann::json &collection_meta,
         const uint32_t seq_id = Collection::get_seq_id_from_key(iter->key().ToString());

         nlohmann::json document;
+        const std::string& doc_string = iter->value().ToString();

         try {
-            document = nlohmann::json::parse(iter->value().ToString());
+            document = nlohmann::json::parse(doc_string);
         } catch(const std::exception& e) {
             LOG(ERROR) << "JSON error: " << e.what();
             return Option<bool>(400, "Bad JSON.");
         }

+        batch_doc_str_size += doc_string.size();
+
         if(collection->get_enable_nested_fields()) {
             std::vector<field> flattened_fields;
             field::flatten_doc(document, collection->get_nested_fields(), true, flattened_fields);
@@ -1269,10 +1281,14 @@ Option<bool> CollectionManager::load_collection(const nlohmann::json &collection_meta,
         iter->Next();
         bool last_record = !(iter->Valid() && iter->key().starts_with(seq_id_prefix));

+        // if expected memory usage exceeds 250M, we index the accumulated set without caring about batch size
+        bool exceeds_batch_mem_threshold = ((batch_doc_str_size * 7) > (250 * 1014 * 1024));
+
         // batch must match atleast the number of shards
-        if((num_valid_docs % batch_size == 0) || last_record) {
+        if(exceeds_batch_mem_threshold || (num_valid_docs % batch_size == 0) || last_record) {
             size_t num_records = index_records.size();
             size_t num_indexed = collection->batch_index_in_memory(index_records);
+            batch_doc_str_size = 0;

             if(num_indexed != num_records) {
                 const Option<std::string> & index_error_op = get_first_index_error(index_records);
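The 7x multiplier approximates the in-memory expansion of the raw JSON during indexing, so the threshold trips once a batch's serialized documents reach roughly 250MB / 7, about 36MB, regardless of document count. (The constant reads 250 * 1014 * 1024 in the source; 1014 appears to be a typo for 1024, which would make the bound exactly 250MiB.)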
@@ -1413,112 +1429,3 @@ Option<Collection*> CollectionManager::clone_collection(const string& existing_name,

     return Option<Collection*>(new_coll);
 }
-
-bool CollectionManager::parse_vector_query_str(std::string vector_query_str, vector_query_t& vector_query) {
-    // FORMAT:
-    // field_name([0.34, 0.66, 0.12, 0.68], exact: false, k: 10)
-    size_t i = 0;
-    while(i < vector_query_str.size()) {
-        if(vector_query_str[i] != ':') {
-            vector_query.field_name += vector_query_str[i];
-            i++;
-        } else {
-            if(vector_query_str[i] != ':') {
-                // missing ":"
-                return false;
-            }
-
-            // field name is done
-            i++;
-
-            StringUtils::trim(vector_query.field_name);
-
-            while(i < vector_query_str.size() && vector_query_str[i] != '(') {
-                i++;
-            }
-
-            if(vector_query_str[i] != '(') {
-                // missing "("
-                return false;
-            }
-
-            i++;
-
-            while(i < vector_query_str.size() && vector_query_str[i] != '[') {
-                i++;
-            }
-
-            if(vector_query_str[i] != '[') {
-                // missing opening "["
-                return false;
-            }
-
-            i++;
-
-            std::string values_str;
-            while(i < vector_query_str.size() && vector_query_str[i] != ']') {
-                values_str += vector_query_str[i];
-                i++;
-            }
-
-            if(vector_query_str[i] != ']') {
-                // missing closing "]"
-                return false;
-            }
-
-            i++;
-
-            std::vector<std::string> svalues;
-            StringUtils::split(values_str, svalues, ",");
-
-            for(auto& svalue: svalues) {
-                if(!StringUtils::is_float(svalue)) {
-                    return false;
-                }
-
-                vector_query.values.push_back(std::stof(svalue));
-            }
-
-            if(i == vector_query_str.size()-1) {
-                // missing params
-                return true;
-            }
-
-            std::string param_str = vector_query_str.substr(i, (vector_query_str.size() - i));
-            std::vector<std::string> param_kvs;
-            StringUtils::split(param_str, param_kvs, ",");
-
-            for(auto& param_kv_str: param_kvs) {
-                if(param_kv_str.back() == ')') {
-                    param_kv_str.pop_back();
-                }
-
-                std::vector<std::string> param_kv;
-                StringUtils::split(param_kv_str, param_kv, ":");
-                if(param_kv.size() != 2) {
-                    return false;
-                }
-
-                if(param_kv[0] == "k") {
-                    if(!StringUtils::is_uint32_t(param_kv[1])) {
-                        return false;
-                    }
-
-                    vector_query.k = std::stoul(param_kv[1]);
-                }
-
-                if(param_kv[0] == "flat_search_cutoff") {
-                    if(!StringUtils::is_uint32_t(param_kv[1])) {
-                        return false;
-                    }
-
-                    vector_query.flat_search_cutoff = std::stoi(param_kv[1]);
-                }
-            }
-
-            return true;
-        }
-    }
-
-    return false;
-}
@@ -376,7 +376,8 @@ bool get_search(const std::shared_ptr<http_req>& req, const std::shared_ptr<http_res>& res) {
     }

     std::string results_json_str;
-    Option<bool> search_op = CollectionManager::do_search(req->params, req->embedded_params_vec[0], results_json_str);
+    Option<bool> search_op = CollectionManager::do_search(req->params, req->embedded_params_vec[0],
+                                                          results_json_str, req->start_ts);

     if(!search_op.ok()) {
         res->set(search_op.code(), search_op.error());
@@ -523,7 +524,8 @@ bool post_multi_search(const std::shared_ptr<http_req>& req, const std::shared_ptr<http_res>& res) {
         }

         std::string results_json_str;
-        Option<bool> search_op = CollectionManager::do_search(req->params, req->embedded_params_vec[i], results_json_str);
+        Option<bool> search_op = CollectionManager::do_search(req->params, req->embedded_params_vec[i],
+                                                              results_json_str, req->start_ts);

         if(search_op.ok()) {
             response["results"].push_back(nlohmann::json::parse(results_json_str));
@@ -588,6 +590,7 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::shared_ptr<http_res>& res) {
     const char* FILTER_BY = "filter_by";
     const char* INCLUDE_FIELDS = "include_fields";
     const char* EXCLUDE_FIELDS = "exclude_fields";
+    const char* BATCH_SIZE = "batch_size";

     export_state_t* export_state = nullptr;
@@ -617,6 +620,10 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::shared_ptr<http_res>& res) {
             export_state->exclude_fields = std::set<std::string>(exclude_fields_vec.begin(), exclude_fields_vec.end());
         }

+        if(req->params.count(BATCH_SIZE) != 0 && StringUtils::is_uint32_t(req->params[BATCH_SIZE])) {
+            export_state->export_batch_size = std::stoul(req->params[BATCH_SIZE]);
+        }
+
         if(simple_filter_query.empty()) {
             export_state->iter_upper_bound_key = collection->get_seq_id_collection_prefix() + "`"; // cannot inline this
             export_state->iter_upper_bound = new rocksdb::Slice(export_state->iter_upper_bound_key);
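With this parameter wired in, the chunk size of a streaming export can be tuned per request, e.g. appending `?batch_size=50` to the export endpoint's URL (collection name and value illustrative) makes each HTTP chunk carry at most 50 documents instead of the default 100 set in export_state_t.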
@@ -644,10 +651,12 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::shared_ptr<http_res>& res) {

     if(export_state->it != nullptr) {
         rocksdb::Iterator* it = export_state->it;
+        size_t batch_counter = 0;
+        res->body.clear();

-        if(it->Valid() && it->key().ToString().compare(0, seq_id_prefix.size(), seq_id_prefix) == 0) {
+        while(it->Valid() && it->key().ToString().compare(0, seq_id_prefix.size(), seq_id_prefix) == 0) {
             if(export_state->include_fields.empty() && export_state->exclude_fields.empty()) {
-                res->body = it->value().ToString();
+                res->body += it->value().ToString();
             } else {
                 nlohmann::json doc = nlohmann::json::parse(it->value().ToString());
                 nlohmann::json filtered_doc;
@@ -663,7 +672,7 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::shared_ptr<http_res>& res) {
                 }
             }

-            res->body = filtered_doc.dump();
+            res->body += filtered_doc.dump();
         }

         it->Next();
@@ -677,10 +686,15 @@ bool get_export_documents(const std::shared_ptr<http_req>& req, const std::shared_ptr<http_res>& res) {
                 req->last_chunk_aggregate = true;
                 res->final = true;
             }
+
+            batch_counter++;
+            if(batch_counter == export_state->export_batch_size) {
+                break;
+            }
         }
     } else {
         bool done;
-        stateful_export_docs(export_state, 100, done);
+        stateful_export_docs(export_state, export_state->export_batch_size, done);

         if(!done) {
             req->last_chunk_aggregate = false;
@@ -474,8 +474,6 @@ int HttpServer::catch_all_handler(h2o_handler_t *_h2o_handler, h2o_req_t *req) {
         }
     }

-
-
     std::shared_ptr<http_req> request = std::make_shared<http_req>(req, rpath->http_method, path_without_query,
                                                                    route_hash, query_map, embedded_params_vec,
                                                                    api_auth_key_sent, body, client_ip);
@@ -24,17 +24,17 @@
 #include <timsort.hpp>
 #include "logger.h"

-#define RETURN_CIRCUIT_BREAKER if(std::chrono::duration_cast<std::chrono::milliseconds>(\
-                std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) { \
-                    search_cutoff = true; \
-                    return ;\
-                }
+#define RETURN_CIRCUIT_BREAKER if((std::chrono::duration_cast<std::chrono::microseconds>( \
+                std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { \
+                    search_cutoff = true; \
+                    return ;\
+                }

-#define BREAK_CIRCUIT_BREAKER if(std::chrono::duration_cast<std::chrono::milliseconds>(\
-                std::chrono::high_resolution_clock::now() - search_begin).count() > search_stop_ms) { \
-                    search_cutoff = true; \
-                    break;\
-                }
+#define BREAK_CIRCUIT_BREAKER if((std::chrono::duration_cast<std::chrono::microseconds>( \
+                std::chrono::system_clock::now().time_since_epoch()).count() - search_begin_us) > search_stop_us) { \
+                    search_cutoff = true; \
+                    break;\
+                }

 spp::sparse_hash_map<uint32_t, int64_t> Index::text_match_sentinel_value;
 spp::sparse_hash_map<uint32_t, int64_t> Index::seq_id_sentinel_value;
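Both macros expand to an inline deadline check against the thread-local microsecond clock declared in thread_local_vars.h, so RETURN_CIRCUIT_BREAKER belongs in a void function body and BREAK_CIRCUIT_BREAKER inside a loop. A minimal sketch of the intended call pattern (the function and loop body are illustrative, not taken from this diff):

    void some_scoring_loop(const std::vector<uint32_t>& ids) {   // hypothetical caller
        for(uint32_t id : ids) {
            BREAK_CIRCUIT_BREAKER   // sets search_cutoff = true and breaks once the budget is spent
            // ... per-document scoring work ...
        }
    }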
@@ -1171,16 +1171,14 @@ void Index::tokenize_string_array_with_facets(const std::vector<std::string>& strings,
             }
         }

-        //LOG(INFO) << "Str: " << str << ", last_token: " << last_token;
+        if(is_facet) {
+            facet_hashes.push_back(facet_hash);
+        }

         if(token_set.empty()) {
             continue;
         }

-        if(is_facet) {
-            facet_hashes.push_back(facet_hash);
-        }
-
         for(auto& the_token: token_set) {
             // repeat last element to indicate end of offsets for this array index
             token_to_offsets[the_token].push_back(token_to_offsets[the_token].back());
@@ -2362,8 +2360,8 @@ void Index::search_infix(const std::string& query, const std::string& field_name,

     auto search_tree = search_index.at(field_name);

-    const auto parent_search_begin = search_begin;
-    const auto parent_search_stop_ms = search_stop_ms;
+    const auto parent_search_begin = search_begin_us;
+    const auto parent_search_stop_ms = search_stop_us;
     auto parent_search_cutoff = search_cutoff;

     for(auto infix_set: infix_sets) {
@@ -2371,7 +2369,7 @@ void Index::search_infix(const std::string& query, const std::string& field_name,
                                           &num_processed, &m_process, &cv_process,
                                           &parent_search_begin, &parent_search_stop_ms, &parent_search_cutoff]() {

-            search_begin = parent_search_begin;
+            search_begin_us = parent_search_begin;
             search_cutoff = parent_search_cutoff;
             auto op_search_stop_ms = parent_search_stop_ms/2;
@@ -2396,8 +2394,8 @@ void Index::search_infix(const std::string& query, const std::string& field_name,

             // check for search cutoff but only once every 2^10 docs to reduce overhead
             if(((num_iterated + 1) % (1 << 12)) == 0) {
-                if (std::chrono::duration_cast<std::chrono::milliseconds>(
-                        std::chrono::high_resolution_clock::now() - search_begin).count() > op_search_stop_ms) {
+                if ((std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().
+                    time_since_epoch()).count() - search_begin_us) > op_search_stop_ms) {
                     search_cutoff = true;
                     break;
                 }
@@ -2596,6 +2594,11 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::vector<search_field_t>& the_fields,

             for (const auto& dist_label : dist_labels) {
                 uint32 seq_id = dist_label.second;
+
+                if(vector_query.query_doc_given && vector_query.seq_id == seq_id) {
+                    continue;
+                }
+
                 uint64_t distinct_id = seq_id;
                 if (group_limit != 0) {
                     distinct_id = get_distinct_id(group_by_fields, seq_id);
@@ -4386,8 +4389,8 @@ void Index::search_wildcard(filter_node_t const* const& filter_tree_root,
     size_t num_queued = 0;
     size_t filter_index = 0;

-    const auto parent_search_begin = search_begin;
-    const auto parent_search_stop_ms = search_stop_ms;
+    const auto parent_search_begin = search_begin_us;
+    const auto parent_search_stop_ms = search_stop_us;
     auto parent_search_cutoff = search_cutoff;

     for(size_t thread_id = 0; thread_id < num_threads && filter_index < filter_ids_length; thread_id++) {
|
||||
batch_result_ids, batch_res_len,
|
||||
&num_processed, &m_process, &cv_process]() {
|
||||
|
||||
search_begin = parent_search_begin;
|
||||
search_stop_ms = parent_search_stop_ms;
|
||||
search_begin_us = parent_search_begin;
|
||||
search_stop_us = parent_search_stop_ms;
|
||||
search_cutoff = parent_search_cutoff;
|
||||
|
||||
size_t filter_index = 0;
|
||||
|
@@ -217,6 +217,10 @@ void StringUtils::replace_all(std::string& subject, const std::string& search, const std::string& replace) {
     }
 }

+void StringUtils::erase_char(std::string& str, const char c) {
+    str.erase(std::remove(str.begin(), str.end(), c), str.cend());
+}
+
 std::string StringUtils::trim_curly_spaces(const std::string& str) {
     std::string left_trimmed;
     int i = 0;
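erase_char is the standard erase-remove idiom: std::remove (from <algorithm>) shifts the kept characters forward and erase trims the tail in one pass. For example:

    std::string body = "line1\nline2\n";
    StringUtils::erase_char(body, '\n');   // body == "line1line2"

This is what the http_req slow-logging hunk earlier in this diff uses to flatten a multi_search POST body onto a single log line.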
@@ -262,9 +262,9 @@ nlohmann::json synonym_t::to_view_json() const {
     }

     if(!symbols.empty()) {
-        obj["symbols"] = nlohmann::json::array();
+        obj["symbols_to_index"] = nlohmann::json::array();
         for(char c: symbols) {
-            obj["symbols"].push_back(std::string(1, c));
+            obj["symbols_to_index"].push_back(std::string(1, c));
         }
     }
@@ -2,6 +2,6 @@
 #include "thread_local_vars.h"

 thread_local int64_t write_log_index = 0;
-thread_local std::chrono::high_resolution_clock::time_point search_begin;
-thread_local int64_t search_stop_ms;
+thread_local uint64_t search_begin_us;
+thread_local uint64_t search_stop_us;
 thread_local bool search_cutoff = false;
@@ -105,6 +105,8 @@ void init_cmdline_options(cmdline::parser & options, int argc, char **argv) {
     options.add<int>("memory-used-max-percentage", '\0', "Reject writes when memory usage exceeds this percentage. Default: 100 (never reject).", false, 100);
     options.add<bool>("skip-writes", '\0', "Skip all writes except config changes. Default: false.", false, false);

+    options.add<int>("log-slow-searches-time-ms", '\0', "When >= 0, searches that take longer than this duration are logged.", false, 30*1000);
+
     // DEPRECATED
     options.add<std::string>("listen-address", 'h', "[DEPRECATED: use `api-address`] Address to which Typesense API service binds.", false, "0.0.0.0");
     options.add<uint32_t>("listen-port", 'p', "[DEPRECATED: use `api-port`] Port on which Typesense API service listens.", false, 8108);
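Like its siblings, the new option can be supplied on the command line, e.g. `--log-slow-searches-time-ms=5000` to log searches slower than 5 seconds (threshold illustrative), or via the config file and TYPESENSE_LOG_SLOW_SEARCHES_TIME_MS environment variable handled in the config.h hunks above; the default stays at 30000 (30s).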
src/vector_query_ops.cpp (new file, 159 lines)
@@ -0,0 +1,159 @@
#include "vector_query_ops.h"
#include "string_utils.h"
#include "collection.h"

Option<bool> VectorQueryOps::parse_vector_query_str(std::string vector_query_str, vector_query_t& vector_query,
                                                    const Collection* coll) {
    // FORMAT:
    // field_name([0.34, 0.66, 0.12, 0.68], exact: false, k: 10)
    size_t i = 0;
    while(i < vector_query_str.size()) {
        if(vector_query_str[i] != ':') {
            vector_query.field_name += vector_query_str[i];
            i++;
        } else {
            if(vector_query_str[i] != ':') {
                // missing ":"
                return Option<bool>(400, "Malformed vector query string: `:` is missing.");
            }

            // field name is done
            i++;

            StringUtils::trim(vector_query.field_name);

            while(i < vector_query_str.size() && vector_query_str[i] != '(') {
                i++;
            }

            if(vector_query_str[i] != '(') {
                // missing "("
                return Option<bool>(400, "Malformed vector query string.");
            }

            i++;

            while(i < vector_query_str.size() && vector_query_str[i] != '[') {
                i++;
            }

            if(vector_query_str[i] != '[') {
                // missing opening "["
                return Option<bool>(400, "Malformed vector query string.");
            }

            i++;

            std::string values_str;
            while(i < vector_query_str.size() && vector_query_str[i] != ']') {
                values_str += vector_query_str[i];
                i++;
            }

            if(vector_query_str[i] != ']') {
                // missing closing "]"
                return Option<bool>(400, "Malformed vector query string.");
            }

            i++;

            std::vector<std::string> svalues;
            StringUtils::split(values_str, svalues, ",");

            for(auto& svalue: svalues) {
                if(!StringUtils::is_float(svalue)) {
                    return Option<bool>(400, "Malformed vector query string: one of the vector values is not a float.");
                }

                vector_query.values.push_back(std::stof(svalue));
            }

            if(i == vector_query_str.size()-1) {
                // missing params
                if(vector_query.values.empty()) {
                    // when query values are missing, atleast the `id` parameter must be present
                    return Option<bool>(400, "When a vector query value is empty, an `id` parameter must be present.");
                }

                return Option<bool>(true);
            }

            std::string param_str = vector_query_str.substr(i, (vector_query_str.size() - i));
            std::vector<std::string> param_kvs;
            StringUtils::split(param_str, param_kvs, ",");

            for(auto& param_kv_str: param_kvs) {
                if(param_kv_str.back() == ')') {
                    param_kv_str.pop_back();
                }

                std::vector<std::string> param_kv;
                StringUtils::split(param_kv_str, param_kv, ":");
                if(param_kv.size() != 2) {
                    return Option<bool>(400, "Malformed vector query string.");
                }

                if(param_kv[0] == "id") {
                    if(!vector_query.values.empty()) {
                        // cannot pass both vector values and id
                        return Option<bool>(400, "Malformed vector query string: cannot pass both vector query "
                                                 "and `id` parameter.");
                    }

                    Option<uint32_t> id_op = coll->doc_id_to_seq_id(param_kv[1]);
                    if(!id_op.ok()) {
                        return Option<bool>(400, "Document id referenced in vector query is not found.");
                    }

                    nlohmann::json document;
                    auto doc_op = coll->get_document_from_store(id_op.get(), document);
                    if(!doc_op.ok()) {
                        return Option<bool>(400, "Document id referenced in vector query is not found.");
                    }

                    if(!document.contains(vector_query.field_name) || !document[vector_query.field_name].is_array()) {
                        return Option<bool>(400, "Document referenced in vector query does not contain a valid "
                                                 "vector field.");
                    }

                    for(auto& fvalue: document[vector_query.field_name]) {
                        if(!fvalue.is_number_float()) {
                            return Option<bool>(400, "Document referenced in vector query does not contain a valid "
                                                     "vector field.");
                        }

                        vector_query.values.push_back(fvalue.get<float>());
                    }

                    vector_query.query_doc_given = true;
                    vector_query.seq_id = id_op.get();
                }

                if(param_kv[0] == "k") {
                    if(!StringUtils::is_uint32_t(param_kv[1])) {
                        return Option<bool>(400, "Malformed vector query string: `k` parameter must be an integer.");
                    }

                    vector_query.k = std::stoul(param_kv[1]);
                }

                if(param_kv[0] == "flat_search_cutoff") {
                    if(!StringUtils::is_uint32_t(param_kv[1])) {
                        return Option<bool>(400, "Malformed vector query string: "
                                                 "`flat_search_cutoff` parameter must be an integer.");
                    }

                    vector_query.flat_search_cutoff = std::stoi(param_kv[1]);
                }
            }

            if(!vector_query.query_doc_given && vector_query.values.empty()) {
                return Option<bool>(400, "When a vector query value is empty, an `id` parameter must be present.");
            }

            return Option<bool>(true);
        }
    }

    return Option<bool>(400, "Malformed vector query string.");
}
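Taken together with the header, the parser accepts either literal vector values or a document id, but not both. A short sketch of the accepted inputs, mirroring the grammar comment at the top of the file (the field name `vec`, document id and the `coll` pointer are illustrative):

    vector_query_t vq;
    // literal values with optional params:
    VectorQueryOps::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10)", vq, coll);   // ok
    // empty values + id: the vector is read from the stored document, which is then
    // skipped in the results via vq.seq_id / vq.query_doc_given (see the Index::search hunk above):
    VectorQueryOps::parse_vector_query_str("vec:([], id: 42)", vq, coll);                        // ok if doc 42 exists
    // passing both values and id is rejected, as is a missing '[':
    VectorQueryOps::parse_vector_query_str("vec:([0.1], id: 42)", vq, coll);                     // 400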
@@ -983,23 +983,23 @@ TEST_F(CollectionFacetingTest, FacetByNestedIntField) {

 TEST_F(CollectionFacetingTest, FacetParseTest){
     std::vector<field> fields = {
-            field("score", field_types::INT32, true),
-            field("grade", field_types::INT32, true),
-            field("rank", field_types::INT32, true),
+        field("score", field_types::INT32, true),
+        field("grade", field_types::INT32, true),
+        field("rank", field_types::INT32, true),
     };

     Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();

     std::vector<std::string> range_facet_fields {
-            "score(fail:[0, 40], pass:[40, 100])",
-            "grade(A:[80, 100], B:[60, 80], C:[40, 60])"
+        "score(fail:[0, 40], pass:[40, 100])",
+        "grade(A:[80, 100], B:[60, 80], C:[40, 60])"
     };
     std::vector<facet> range_facets;
     for(const std::string & facet_field: range_facet_fields) {
         coll1->parse_facet(facet_field, range_facets);
     }
     ASSERT_EQ(2, range_facets.size());

     ASSERT_STREQ("score", range_facets[0].field_name.c_str());
     ASSERT_TRUE(range_facets[0].is_range_query);
     ASSERT_GT(range_facets[0].facet_range_map.size(), 0);
@@ -1009,8 +1009,8 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
     ASSERT_GT(range_facets[1].facet_range_map.size(), 0);

     std::vector<std::string> normal_facet_fields {
-            "score",
-            "grade"
+        "score",
+        "grade"
     };
     std::vector<facet> normal_facets;
     for(const std::string & facet_field: normal_facet_fields) {
@@ -1022,18 +1022,18 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
     ASSERT_STREQ("grade", normal_facets[1].field_name.c_str());

     std::vector<std::string> mixed_facet_fields {
-            "score",
-            "grade(A:[80, 100], B:[60, 80], C:[40, 60])",
-            "rank"
+        "score",
+        "grade(A:[80, 100], B:[60, 80], C:[40, 60])",
+        "rank"
     };
     std::vector<facet> mixed_facets;
     for(const std::string & facet_field: mixed_facet_fields) {
         coll1->parse_facet(facet_field, mixed_facets);
     }
     ASSERT_EQ(3, mixed_facets.size());

     ASSERT_STREQ("score", mixed_facets[0].field_name.c_str());

     ASSERT_STREQ("grade", mixed_facets[1].field_name.c_str());
     ASSERT_TRUE(mixed_facets[1].is_range_query);
     ASSERT_GT(mixed_facets[1].facet_range_map.size(), 0);
@@ -1041,7 +1041,6 @@ TEST_F(CollectionFacetingTest, FacetParseTest){
     ASSERT_STREQ("rank", mixed_facets[2].field_name.c_str());
 }

-
 TEST_F(CollectionFacetingTest, RangeFacetTest) {
     std::vector<field> fields = {field("place", field_types::STRING, false),
                                  field("state", field_types::STRING, false),
@@ -1345,11 +1344,44 @@ TEST_F(CollectionFacetingTest, SampleFacetCounts) {
     // test for sample percent > 100

     auto res_op = coll1->search("*", {}, "", {"color"}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
-                               spp::sparse_hash_set<std::string>(),
-                               spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
-                               "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
-                               4, {off}, 3, 3, 2, 2, false, "", 200, 0);
+                                spp::sparse_hash_set<std::string>(),
+                                spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
+                                "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                                4, {off}, 3, 3, 2, 2, false, "", 200, 0);

     ASSERT_FALSE(res_op.ok());
     ASSERT_EQ("Value of `facet_sample_percent` must be less than 100.", res_op.error());
 }
+
+TEST_F(CollectionFacetingTest, FacetOnArrayFieldWithSpecialChars) {
+    std::vector<field> fields = {
+        field("tags", field_types::STRING_ARRAY, true),
+        field("points", field_types::INT32, true),
+    };
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();
+
+    nlohmann::json doc;
+    doc["tags"] = {"gamma"};
+    doc["points"] = 10;
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    doc["tags"] = {"alpha", "| . |", "beta", "gamma"};
+    doc["points"] = 10;
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    auto results = coll1->search("*", {},
+                                 "", {"tags"}, {}, {2}, 10, 1, FREQUENCY, {true}, 1).get();
+
+    ASSERT_EQ(1, results["facet_counts"].size());
+    ASSERT_EQ(4, results["facet_counts"][0]["counts"].size());
+
+    for(size_t i = 0; i < results["facet_counts"][0]["counts"].size(); i++) {
+        auto fvalue = results["facet_counts"][0]["counts"][i]["value"].get<std::string>();
+        if(fvalue == "gamma") {
+            ASSERT_EQ(2, results["facet_counts"][0]["counts"][i]["count"].get<size_t>());
+        } else {
+            ASSERT_EQ(1, results["facet_counts"][0]["counts"][i]["count"].get<size_t>());
+        }
+    }
+}
@@ -526,7 +526,10 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) {
     embedded_params["filter_by"] = "points: 200";

     std::string json_res;
-    auto search_op = collectionManager.do_search(req_params, embedded_params, json_res);
+    auto now_ts = std::chrono::duration_cast<std::chrono::microseconds>(
+            std::chrono::system_clock::now().time_since_epoch()).count();
+
+    auto search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
     ASSERT_TRUE(search_op.ok());

     nlohmann::json res_obj = nlohmann::json::parse(json_res);
@@ -540,7 +543,7 @@ TEST_F(CollectionManagerTest, VerifyEmbeddedParametersOfScopedAPIKey) {
     req_params["filter_by"] = "year: 1922";
     req_params["q"] = "*";

-    search_op = collectionManager.do_search(req_params, embedded_params, json_res);
+    search_op = collectionManager.do_search(req_params, embedded_params, json_res, now_ts);
     ASSERT_TRUE(search_op.ok());
     res_obj = nlohmann::json::parse(json_res);
@@ -989,43 +992,6 @@ TEST_F(CollectionManagerTest, ParseSortByClause) {
     ASSERT_FALSE(sort_by_parsed);
 }

-TEST_F(CollectionManagerTest, ParseVectorQueryString) {
-    vector_query_t vector_query;
-    bool parsed = CollectionManager::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10)", vector_query);
-    ASSERT_TRUE(parsed);
-    ASSERT_EQ("vec", vector_query.field_name);
-    ASSERT_EQ(10, vector_query.k);
-    std::vector<float> fvs = {0.34, 0.66, 0.12, 0.68};
-    ASSERT_EQ(fvs.size(), vector_query.values.size());
-    for(size_t i = 0; i < fvs.size(); i++) {
-        ASSERT_EQ(fvs[i], vector_query.values[i]);
-    }
-
-    vector_query._reset();
-    parsed = CollectionManager::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10)", vector_query);
-    ASSERT_TRUE(parsed);
-
-    vector_query._reset();
-    parsed = CollectionManager::parse_vector_query_str("vec:[0.34, 0.66, 0.12, 0.68], k: 10)", vector_query);
-    ASSERT_FALSE(parsed);
-
-    vector_query._reset();
-    parsed = CollectionManager::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10", vector_query);
-    ASSERT_TRUE(parsed);
-
-    vector_query._reset();
-    parsed = CollectionManager::parse_vector_query_str("vec:(0.34, 0.66, 0.12, 0.68, k: 10)", vector_query);
-    ASSERT_FALSE(parsed);
-
-    vector_query._reset();
-    parsed = CollectionManager::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], )", vector_query);
-    ASSERT_FALSE(parsed);
-
-    vector_query._reset();
-    parsed = CollectionManager::parse_vector_query_str("vec([0.34, 0.66, 0.12, 0.68])", vector_query);
-    ASSERT_FALSE(parsed);
-}
-
 TEST_F(CollectionManagerTest, Presets) {
     // try getting on a blank slate
     auto presets = collectionManager.get_presets();
|
@ -97,9 +97,9 @@ TEST_F(CollectionSynonymsTest, SynonymParsingFromJson) {
|
||||
ASSERT_STREQ("#", synonym_plus.synonyms[1][0].c_str());
|
||||
|
||||
nlohmann::json view_json = synonym_plus.to_view_json();
|
||||
ASSERT_EQ(2, view_json["symbols"].size());
|
||||
ASSERT_EQ("+", view_json["symbols"][0].get<std::string>());
|
||||
ASSERT_EQ("#", view_json["symbols"][1].get<std::string>());
|
||||
ASSERT_EQ(2, view_json["symbols_to_index"].size());
|
||||
ASSERT_EQ("+", view_json["symbols_to_index"][0].get<std::string>());
|
||||
ASSERT_EQ("#", view_json["symbols_to_index"][1].get<std::string>());
|
||||
|
||||
// when `id` is not given
|
||||
nlohmann::json syn_json_without_id = {
|
||||
|
@@ -144,6 +144,33 @@ TEST_F(CollectionVectorTest, BasicVectorQuerying) {
     ASSERT_FALSE(res_op.ok());
     ASSERT_EQ("Field `zec` does not have a vector query index.", res_op.error());

+    // pass `id` of existing doc instead of vector, query doc should be omitted from results
+    results = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                            spp::sparse_hash_set<std::string>(),
+                            spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                            "", 10, {}, {}, {}, 0,
+                            "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                            4, {off}, 32767, 32767, 2,
+                            false, true, "vec:([], id: 1)").get();
+
+    ASSERT_EQ(2, results["found"].get<size_t>());
+    ASSERT_EQ(2, results["hits"].size());
+
+    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
+
+    // when `id` does not exist, return appropriate error
+    res_op = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                           spp::sparse_hash_set<std::string>(),
+                           spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                           "", 10, {}, {}, {}, 0,
+                           "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
+                           4, {off}, 32767, 32767, 2,
+                           false, true, "vec:([], id: 100)");
+
+    ASSERT_FALSE(res_op.ok());
+    ASSERT_EQ("Document id referenced in vector query is not found.", res_op.error());
+
     // only supported with wildcard queries
     res_op = coll1->search("title", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
                            spp::sparse_hash_set<std::string>(),
@@ -199,6 +199,29 @@ TEST_F(CoreAPIUtilsTest, MultiSearchEmbeddedKeys) {
     // ensure that req params are appended to (embedded params are also rolled into req params)
     ASSERT_EQ("((user_id: 100) && (age: > 100)) && (foo: bar)", req->params["filter_by"]);

+    // when empty filter_by is present in req params, don't add ()
+    req->params["filter_by"] = "";
+    post_multi_search(req, res);
+    ASSERT_EQ("((age: > 100)) && (foo: bar)", req->params["filter_by"]);
+
+    // when empty filter_by in collection search params, don't add ()
+    req->params["filter_by"] = "user_id: 100";
+    search["filter_by"] = "";
+    body["searches"].clear();
+    body["searches"].push_back(search);
+    req->body = body.dump();
+    post_multi_search(req, res);
+    ASSERT_EQ("((user_id: 100)) && (foo: bar)", req->params["filter_by"]);
+
+    // when both are empty, don't add ()
+    req->params["filter_by"] = "";
+    search["filter_by"] = "";
+    body["searches"].clear();
+    body["searches"].push_back(search);
+    req->body = body.dump();
+    post_multi_search(req, res);
+    ASSERT_EQ("(foo: bar)", req->params["filter_by"]);
+
     // try setting max search limit
     req->embedded_params_vec[0]["limit_multi_searches"] = 0;
     ASSERT_FALSE(post_multi_search(req, res));
test/vector_query_ops_test.cpp (new file, 73 lines)
@@ -0,0 +1,73 @@
#include <gtest/gtest.h>
#include "vector_query_ops.h"

class VectorQueryOpsTest : public ::testing::Test {
protected:
    void setupCollection() {
    }

    virtual void SetUp() {
        setupCollection();
    }

    virtual void TearDown() {

    }
};

TEST_F(VectorQueryOpsTest, ParseVectorQueryString) {
    vector_query_t vector_query;
    auto parsed = VectorQueryOps::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10)", vector_query, nullptr);
    ASSERT_TRUE(parsed.ok());
    ASSERT_EQ("vec", vector_query.field_name);
    ASSERT_EQ(10, vector_query.k);
    std::vector<float> fvs = {0.34, 0.66, 0.12, 0.68};
    ASSERT_EQ(fvs.size(), vector_query.values.size());
    for (size_t i = 0; i < fvs.size(); i++) {
        ASSERT_EQ(fvs[i], vector_query.values[i]);
    }

    vector_query._reset();
    parsed = VectorQueryOps::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10)", vector_query, nullptr);
    ASSERT_TRUE(parsed.ok());

    vector_query._reset();
    parsed = VectorQueryOps::parse_vector_query_str("vec:([])", vector_query, nullptr);
    ASSERT_FALSE(parsed.ok());
    ASSERT_EQ("When a vector query value is empty, an `id` parameter must be present.", parsed.error());

    // cannot pass both vector and id
    vector_query._reset();
    parsed = VectorQueryOps::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], id: 10)", vector_query, nullptr);
    ASSERT_FALSE(parsed.ok());
    ASSERT_EQ("Malformed vector query string: cannot pass both vector query and `id` parameter.", parsed.error());

    vector_query._reset();
    parsed = VectorQueryOps::parse_vector_query_str("vec:([], k: 10)", vector_query, nullptr);
    ASSERT_FALSE(parsed.ok());
    ASSERT_EQ("When a vector query value is empty, an `id` parameter must be present.", parsed.error());

    vector_query._reset();
    parsed = VectorQueryOps::parse_vector_query_str("vec:[0.34, 0.66, 0.12, 0.68], k: 10)", vector_query, nullptr);
    ASSERT_FALSE(parsed.ok());
    ASSERT_EQ("Malformed vector query string.", parsed.error());

    vector_query._reset();
    parsed = VectorQueryOps::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], k: 10", vector_query, nullptr);
    ASSERT_TRUE(parsed.ok());

    vector_query._reset();
    parsed = VectorQueryOps::parse_vector_query_str("vec:(0.34, 0.66, 0.12, 0.68, k: 10)", vector_query, nullptr);
    ASSERT_FALSE(parsed.ok());
    ASSERT_EQ("Malformed vector query string.", parsed.error());

    vector_query._reset();
    parsed = VectorQueryOps::parse_vector_query_str("vec:([0.34, 0.66, 0.12, 0.68], )", vector_query, nullptr);
    ASSERT_FALSE(parsed.ok());
    ASSERT_EQ("Malformed vector query string.", parsed.error());

    vector_query._reset();
    parsed = VectorQueryOps::parse_vector_query_str("vec([0.34, 0.66, 0.12, 0.68])", vector_query, nullptr);
    ASSERT_FALSE(parsed.ok());
    ASSERT_EQ("Malformed vector query string.", parsed.error());
}