Merge branch 'distinct-grouping'

kishorenc 2020-07-04 13:50:34 +05:30
commit 3e82089423
42 changed files with 1474 additions and 607 deletions

View File

@ -2,7 +2,7 @@ version: 2
jobs:
build:
docker:
- image: typesense/typesense-development:22-MAR-2020-5
- image: typesense/typesense-development:23-JUNE-2020-1
environment:
- PROJECT_DIR: /typesense
- TYPESENSE_VERSION: $CIRCLE_BRANCH-$CIRCLE_SHA1
@ -16,7 +16,7 @@ jobs:
keys:
- external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }}
- external-Linux-cache-{{ .Branch }}
- external-Linux-cache
- external-Linux-cache
- run:
name: build
command: $PROJECT_DIR/build.sh

View File

@ -43,6 +43,7 @@ FIND_PACKAGE(ICU REQUIRED)
FIND_PACKAGE(Protobuf REQUIRED)
FIND_PACKAGE(LevelDB REQUIRED)
FIND_PACKAGE(gflags REQUIRED)
FIND_PACKAGE(glog REQUIRED)
message("OpenSSL library: ${OPENSSL_LIBRARIES}")
@ -53,11 +54,6 @@ include(cmake/GoogleTest.cmake)
include(cmake/TestResources.cmake)
include(cmake/Iconv.cmake)
if (APPLE)
include(cmake/brpc.cmake)
include(cmake/braft.cmake)
endif()
FILE(GLOB SRC_FILES src/*.cpp)
FILE(GLOB TEST_FILES test/*.cpp)
@ -87,8 +83,10 @@ link_directories(${DEP_ROOT_DIR}/${FOR_NAME})
link_directories(${DEP_ROOT_DIR}/${H2O_NAME}/build)
link_directories(${DEP_ROOT_DIR}/${ROCKSDB_NAME})
link_directories(${DEP_ROOT_DIR}/${ICONV_NAME}/lib/.libs)
if (APPLE)
link_directories(${DEP_ROOT_DIR}/${BRPC_NAME}/lib)
link_directories(${DEP_ROOT_DIR}/${BRAFT_NAME}/lib)
endif()
# Write dependency libraries to a file
file(WRITE ${DEP_ROOT_DIR}/libs.txt "")
@ -152,7 +150,7 @@ set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIE
set(CORE_LIBS h2o-evloop iconv ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES}
${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} pthread dl ${STD_LIB})
set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS})
set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} glog ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS})
target_link_libraries(typesense-core ${CORE_LIBS})
target_link_libraries(typesense-server ${CORE_LIBS})

View File

@ -37,7 +37,7 @@ Here's a quick example showcasing how you can create a collection, index a docum
Let's begin by starting the Typesense server via Docker:
```
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.13.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.14.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
```
Install the Python client for Typesense (we have [clients](https://typesense.org/api/#api-clients) for other languages too):
@ -146,7 +146,7 @@ works without turning many knobs.
**Speed is great, but what about the memory footprint?**
A fresh Typesense server will take less than 5 MB of memory. As you start indexing documents, the memory use will
A fresh Typesense server will consume about 30 MB of memory. As you start indexing documents, the memory use will
increase correspondingly. How much it increases depends on the number and type of fields you index.
We've strived to keep the in-memory data structures lean. To give you a rough idea: when 1 million

View File

@ -97,6 +97,10 @@
- ~~Have a LOG(ERROR) level~~
- ~~Handle SIGTERM which is sent when process is killed~~
- ~~Use snappy compression for storage~~
- ~~Fix exclude_scalar early returns~~
- ~~Fix result ids length during grouped overrides~~
- ~~Fix override grouping (collate_included_ids)~~
- ~~Test for overriding result on second page~~
- at least 1 token match for proceeding with drop tokens
- support wildcard query with filters
- API for optimizing on disk storage

View File

@ -1,4 +1,4 @@
set(BRAFT_VERSION 0a9ec3f)
set(BRAFT_VERSION fb27e63)
set(BRAFT_NAME braft-${BRAFT_VERSION})
set(BRAFT_TAR_PATH ${DEP_ROOT_DIR}/${BRAFT_NAME}.tar.gz)

View File

@ -1,4 +1,4 @@
set(BRPC_VERSION 23c66e3)
set(BRPC_VERSION 2f8fc37d)
set(BRPC_NAME brpc-${BRPC_VERSION})
set(BRPC_TAR_PATH ${DEP_ROOT_DIR}/${BRPC_NAME}.tar.gz)

View File

@ -22,13 +22,13 @@ if [[ "$@" == *"--depclean"* ]]; then
fi
#echo "Creating development image..."
#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:latest $PROJECT_DIR/docker
#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:23-JUNE-2020-1 $PROJECT_DIR/docker
echo "Building Typesense $TYPESENSE_VERSION..."
docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
-DCMAKE_BUILD_TYPE=Release -H/typesense -B/typesense/$BUILD_DIR
docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development make typesense-server -C/typesense/$BUILD_DIR
docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 make typesense-server -C/typesense/$BUILD_DIR
if [[ "$@" == *"--build-deploy-image"* ]]; then
echo "Creating deployment image for Typesense $TYPESENSE_VERSION server ..."
@ -60,7 +60,7 @@ if [[ "$@" == *"--package-libs"* ]]; then
fi
#
#if [[ "$@" == *"--create-deb-upload"* ]]; then
# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
# -DCMAKE_BUILD_TYPE=Debug -H/typesense -B/typesense/$BUILD_DIR
#fi

View File

@ -55,25 +55,34 @@ ADD https://github.com/protocolbuffers/protobuf/releases/download/v3.11.4/protob
RUN tar -C /opt -xf /opt/protobuf-cpp-3.11.4.tar.gz && chown -R root:root /opt/protobuf-3.11.4
RUN cd /opt/protobuf-3.11.4 && ./configure --disable-shared && make -j8 && make check && make install && rm -rf /usr/local/lib/*.so*
ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz.tar.gz
RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz.tar.gz
ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz
RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz
RUN mkdir -p /opt/leveldb-1.22/build && cd /opt/leveldb-1.22/build && cmake -DCMAKE_BUILD_TYPE=Release .. && \
cmake --build . && make install && rm -rf /usr/local/lib/*.so*
ADD https://github.com/google/glog/archive/0a2e593.tar.gz /opt/glog-0a2e593.tar.gz
RUN tar -C /opt -xf /opt/glog-0a2e593.tar.gz
RUN mkdir -p /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \
cd /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \
cmake -DBUILD_TESTING=0 -DWITH_GFLAGS=ON -DWITH_UNWIND=OFF .. && \
cmake --build . && make install && rm -rf /usr/local/lib/*.so*
ADD https://github.com/apache/incubator-brpc/archive/0.9.7-rc03.tar.gz /opt/brpc-0.9.7-rc03.tar.gz
RUN tar -C /opt -xf /opt/brpc-0.9.7-rc03.tar.gz
COPY patches/brpc_cmakelists.txt /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt
RUN chown root:root /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt
RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/build && cd /opt/incubator-brpc-0.9.7-rc03/build && \
cmake -DWITH_DEBUG_SYMBOLS=OFF .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* && \
rm -rf /opt/incubator-brpc-0.9.7-rc03/build/output/bin
RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/bld && cd /opt/incubator-brpc-0.9.7-rc03/bld && \
cmake -DWITH_DEBUG_SYMBOLS=OFF -DWITH_GLOG=ON .. && \
make -j8 && make install && rm -rf /usr/local/lib/*.so* && \
rm -rf /opt/incubator-brpc-0.9.7-rc03/bld/output/bin
ADD https://github.com/baidu/braft/archive/v1.1.0.tar.gz /opt/braft-v1.1.0.tar.gz
RUN tar -C /opt -xf /opt/braft-v1.1.0.tar.gz
COPY patches/braft_cmakelists.txt /opt/braft-1.1.0/src/CMakeLists.txt
RUN chown root:root /opt/braft-1.1.0/src/CMakeLists.txt
RUN mkdir -p /opt/braft-1.1.0/build && cd /opt/braft-1.1.0/build && \
cmake -DWITH_DEBUG_SYMBOLS=ON .. && make -j8 && make install && rm -rf /usr/local/lib/*.so*
ADD https://github.com/baidu/braft/archive/v1.1.1.tar.gz /opt/braft-v1.1.1.tar.gz
RUN tar -C /opt -xf /opt/braft-v1.1.1.tar.gz
COPY patches/braft_cmakelists.txt /opt/braft-1.1.1/src/CMakeLists.txt
RUN chown root:root /opt/braft-1.1.1/src/CMakeLists.txt
RUN mkdir -p /opt/braft-1.1.1/bld && cd /opt/braft-1.1.1/bld && \
cmake -DWITH_DEBUG_SYMBOLS=ON -DBRPC_WITH_GLOG=ON .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* && \
rm -rf /opt/braft-1.1.1/bld/output/bin
ENV CC /usr/local/gcc-6.4.0/bin/gcc
ENV CXX /usr/local/gcc-6.4.0/bin/g++

View File

@ -35,8 +35,8 @@ add_library(brpc-static STATIC $<TARGET_OBJECTS:BUTIL_LIB>
$<TARGET_OBJECTS:SOURCES_LIB>
$<TARGET_OBJECTS:PROTO_LIB>)
if(BRPC_WITH_THRIFT)
target_link_libraries(brpc-static thrift)
if(BRPC_WITH_GLOG)
target_link_libraries(brpc-static ${GLOG_LIB})
endif()
SET_TARGET_PROPERTIES(brpc-static PROPERTIES OUTPUT_NAME brpc CLEAN_DIRECT_OUTPUT 1)

View File

@ -147,7 +147,7 @@ private:
std::string get_seq_id_key(uint32_t seq_id);
void highlight_result(const field &search_field, const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV &field_order_kv, const nlohmann::json &document,
const KV* field_order_kv, const nlohmann::json &document,
StringUtils & string_utils, size_t snippet_threshold,
bool highlighted_fully,
highlight_t &highlight);
@ -155,10 +155,10 @@ private:
void remove_document(nlohmann::json & document, const uint32_t seq_id, bool remove_from_store);
void populate_overrides(std::string query,
const std::map<std::string, size_t>& pinned_hits,
const std::map<size_t, std::vector<std::string>>& pinned_hits,
const std::vector<std::string>& hidden_hits,
std::map<uint32_t, size_t> & id_pos_map,
std::vector<uint32_t> & included_ids, std::vector<uint32_t> & excluded_ids);
std::map<size_t, std::vector<uint32_t>>& include_ids,
std::vector<uint32_t> & excluded_ids);
static bool facet_count_compare(const std::pair<uint64_t, facet_count_t>& a,
const std::pair<uint64_t, facet_count_t>& b) {
@ -236,8 +236,10 @@ public:
const size_t snippet_threshold = 30,
const std::string & highlight_full_fields = "",
size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD,
const std::map<std::string, size_t>& pinned_hits={},
const std::vector<std::string>& hidden_hits={});
const std::map<size_t, std::vector<std::string>>& pinned_hits={},
const std::vector<std::string>& hidden_hits={},
const std::vector<std::string>& group_by_fields={},
const size_t group_limit = 0);
Option<nlohmann::json> get(const std::string & id);
@ -271,6 +273,8 @@ public:
const size_t PER_PAGE_MAX = 250;
const size_t GROUP_LIMIT_MAX = 99;
// Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
static constexpr const char* COLLECTION_META_PREFIX = "$CM";
static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
@ -280,5 +284,9 @@ public:
void facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document,
std::string &value);
void aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const;
void populate_result_kvs(Topster *topster, std::vector<std::vector<KV *>> &result_kvs) const;
};

View File

@ -146,6 +146,7 @@ struct token_pos_cost_t {
struct facet_count_t {
uint32_t count;
spp::sparse_hash_set<uint64_t> groups; // used for faceting grouped results
// used to fetch the actual document and value for representation
uint32_t doc_id;

View File

@ -117,16 +117,16 @@ struct http_req {
uint64_t route_hash;
std::map<std::string, std::string> params;
std::string body;
uint64_t seed;
std::string metadata;
http_req(): route_hash(1), seed(random_uint64_t()) {
http_req(): route_hash(1) {
}
http_req(h2o_req_t* _req, const std::string & http_method, uint64_t route_hash,
const std::map<std::string, std::string> & params, std::string body):
_req(_req), http_method(http_method), route_hash(route_hash), params(params),
body(body), seed(random_uint64_t()) {
body(body) {
}
@ -136,7 +136,7 @@ struct http_req {
nlohmann::json content = nlohmann::json::parse(serialized_content);
route_hash = content["route_hash"];
body = content["body"];
seed = content["seed"];
metadata = content.count("metadata") != 0 ? content["metadata"] : "";
for (nlohmann::json::iterator it = content["params"].begin(); it != content["params"].end(); ++it) {
params.emplace(it.key(), it.value());
@ -150,16 +150,10 @@ struct http_req {
content["route_hash"] = route_hash;
content["params"] = params;
content["body"] = body;
content["seed"] = seed;
content["metadata"] = metadata;
return content.dump();
}
uint64_t random_uint64_t() {
thread_local std::mt19937 rg(std::random_device{}());
thread_local std::uniform_int_distribution<uint64_t> pick(0, std::numeric_limits<uint64_t>::max());
return pick(rg);
}
};
struct request_response {

View File

@ -13,6 +13,7 @@
#include <json.hpp>
#include <field.h>
#include <option.h>
#include <set>
#include "string_utils.h"
struct token_candidates {
@ -26,23 +27,27 @@ struct search_args {
std::vector<std::string> search_fields;
std::vector<filter> filters;
std::vector<facet> facets;
std::vector<uint32_t> included_ids;
std::map<size_t, std::map<size_t, uint32_t>> included_ids;
std::vector<uint32_t> excluded_ids;
std::vector<sort_by> sort_fields_std;
facet_query_t facet_query;
int num_typos;
size_t max_facet_values;
size_t max_hits;
size_t per_page;
size_t page;
token_ordering token_order;
bool prefix;
size_t drop_tokens_threshold;
size_t typo_tokens_threshold;
std::vector<KV> raw_result_kvs;
std::vector<std::string> group_by_fields;
size_t group_limit;
size_t all_result_ids_len;
spp::sparse_hash_set<uint64_t> groups_processed;
std::vector<std::vector<art_leaf*>> searched_queries;
std::vector<KV> override_result_kvs;
Topster* topster;
Topster* curated_topster;
std::vector<std::vector<KV*>> raw_result_kvs;
std::vector<std::vector<KV*>> override_result_kvs;
Option<uint32_t> outcome;
search_args(): outcome(0) {
@ -50,18 +55,28 @@ struct search_args {
}
search_args(std::string query, std::vector<std::string> search_fields, std::vector<filter> filters,
std::vector<facet> facets, std::vector<uint32_t> included_ids, std::vector<uint32_t> excluded_ids,
std::vector<facet> facets, std::map<size_t, std::map<size_t, uint32_t>> included_ids, std::vector<uint32_t> excluded_ids,
std::vector<sort_by> sort_fields_std, facet_query_t facet_query, int num_typos, size_t max_facet_values,
size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix,
size_t drop_tokens_threshold, size_t typo_tokens_threshold):
size_t drop_tokens_threshold, size_t typo_tokens_threshold,
const std::vector<std::string>& group_by_fields, size_t group_limit):
query(query), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids),
excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), facet_query(facet_query), num_typos(num_typos),
max_facet_values(max_facet_values), max_hits(max_hits), per_page(per_page),
max_facet_values(max_facet_values), per_page(per_page),
page(page), token_order(token_order), prefix(prefix),
drop_tokens_threshold(drop_tokens_threshold), typo_tokens_threshold(typo_tokens_threshold),
group_by_fields(group_by_fields), group_limit(group_limit),
all_result_ids_len(0), outcome(0) {
const size_t topster_size = std::max((size_t)1, max_hits); // needs to be at least 1 since scoring is mandatory
topster = new Topster(topster_size, group_limit);
curated_topster = new Topster(topster_size, group_limit);
}
~search_args() {
delete topster;
delete curated_topster;
};
};
struct index_record {
@ -149,22 +164,23 @@ private:
void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const uint32_t* result_ids, size_t results_size);
void drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids);
void search_field(const uint8_t & field_id, const std::string & query,
const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
const int num_typos, std::vector<std::vector<art_leaf*>> & searched_queries,
Topster & topster, uint32_t** all_result_ids,
size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY,
const bool prefix = false,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const token_ordering token_order = FREQUENCY, const bool prefix = false,
const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD,
const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD);
void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
const token_ordering token_order, std::vector<std::vector<art_leaf*>> & searched_queries,
Topster & topster, uint32_t** all_result_ids,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids,
size_t & all_result_ids_len,
const size_t typo_tokens_threshold);
@ -196,14 +212,19 @@ private:
void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
const uint32_t indices_length);
void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster & curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
uint64_t facet_token_hash(const field & a_field, const std::string &token);
void compute_facet_stats(facet &a_facet, int64_t raw_value, const std::string & field_type);
// reference: https://stackoverflow.com/a/27952689/131050
uint64_t hash_combine(uint64_t lhs, uint64_t rhs) const {
lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2);
return lhs;
}
public:
Index() = delete;
@ -218,12 +239,18 @@ public:
void search(Option<uint32_t> & outcome, const std::string & query, const std::vector<std::string> & search_fields,
const std::vector<filter> & filters, std::vector<facet> & facets,
facet_query_t & facet_query,
const std::vector<uint32_t> & included_ids, const std::vector<uint32_t> & excluded_ids,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
const std::vector<uint32_t> & excluded_ids,
const std::vector<sort_by> & sort_fields_std, const int num_typos,
const size_t max_hits, const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold, std::vector<KV> & raw_result_kvs,
size_t & all_result_ids_len, std::vector<std::vector<art_leaf*>> & searched_queries,
std::vector<KV> & override_result_kvs, const size_t typo_tokens_threshold);
Topster* topster, Topster* curated_topster,
const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold,
size_t & all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed,
std::vector<std::vector<art_leaf*>> & searched_queries,
std::vector<std::vector<KV*>> & raw_result_kvs,
std::vector<std::vector<KV*>> & override_result_kvs,
const size_t typo_tokens_threshold);
Option<uint32_t> remove(const uint32_t seq_id, nlohmann::json & document);
@ -235,7 +262,8 @@ public:
std::vector<std::vector<std::vector<uint16_t>>> &array_token_positions);
void score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index, const uint8_t & field_id,
const uint32_t total_cost, Topster &topster, const std::vector<art_leaf *> & query_suggestion,
const uint32_t total_cost, Topster* topster, const std::vector<art_leaf *> & query_suggestion,
spp::sparse_hash_set<uint64_t>& groups_processed,
const uint32_t *result_ids, const size_t result_size) const;
static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);
@ -278,7 +306,7 @@ public:
bool processed; // prevents spurious wake up of the main thread
bool terminate; // used for interrupting the thread during tear down
search_args search_params;
search_args* search_params;
static void populate_array_token_positions(std::vector<std::vector<std::vector<uint16_t>>> & array_token_positions,
const art_leaf *token_leaf, uint32_t doc_index);
@ -286,5 +314,7 @@ public:
int get_bounded_typo_cost(const size_t max_cost, const size_t token_len) const;
static int64_t float_to_in64_t(float n);
uint64_t get_distinct_id(const std::unordered_map<std::string, size_t> &facet_to_id, const uint32_t seq_id) const;
};
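The `hash_combine` helper above (taken from the linked Stack Overflow answer) is what `get_distinct_id` builds on to reduce a document's group_by field values to a single 64-bit group key. Below is a minimal, self-contained sketch of that idea; `compute_distinct_id` and the use of `std::hash` on raw strings are illustrative assumptions for this example, not code from this commit:

```
// Sketch only: shows how hash_combine can fold several field-value hashes
// into one distinct group id. compute_distinct_id is a hypothetical helper.
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

uint64_t hash_combine(uint64_t lhs, uint64_t rhs) {
    lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2);
    return lhs;
}

uint64_t compute_distinct_id(const std::vector<std::string>& group_field_values) {
    uint64_t distinct_id = 1;
    for (const std::string& value : group_field_values) {
        distinct_id = hash_combine(distinct_id, std::hash<std::string>{}(value));
    }
    return distinct_id;
}

int main() {
    // documents sharing the same group_by values map to the same group id
    std::cout << compute_distinct_id({"red", "nike"}) << "\n";
    std::cout << compute_distinct_id({"red", "nike"}) << "\n";  // same id
    std::cout << compute_distinct_id({"blue", "nike"}) << "\n"; // different id
}
```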

View File

@ -1,5 +1,3 @@
#pragma once
#include <string>
#include <iostream>
#include <butil/logging.h>
#include <glog/logging.h>

View File

@ -151,8 +151,18 @@ struct StringUtils {
}
char * p ;
strtoull(s.c_str(), &p, 10);
return (*p == 0);
unsigned long long ull = strtoull(s.c_str(), &p, 10);
return (*p == 0) && ull <= std::numeric_limits<uint64_t>::max();
}
static bool is_uint32_t(const std::string &s) {
if(s.empty()) {
return false;
}
char * p ;
unsigned long ul = strtoul(s.c_str(), &p, 10);
return (*p == 0) && ul <= std::numeric_limits<uint32_t>::max();
}
static void toupper(std::string& str) {
@ -234,7 +244,7 @@ struct StringUtils {
return hash != std::numeric_limits<uint64_t>::max() ? hash : (std::numeric_limits<uint64_t>::max()-1);
}
static std::string randstring(size_t length, uint64_t seed);
static std::string randstring(size_t length);
static std::string hmac(const std::string& key, const std::string& msg);
};

View File

@ -118,6 +118,23 @@ private:
}
}
static unsigned long linux_get_mem_available_bytes() {
std::string token;
std::ifstream file("/proc/meminfo");
while(file >> token) {
if(token == "MemAvailable:") {
unsigned long mem_kb;
if(file >> mem_kb) {
return mem_kb * 1000;
} else {
return 0;
}
}
}
return 0; // nothing found
}
public:
void get(const std::string & data_dir_path, nlohmann::json& result);

View File

@ -5,16 +5,26 @@
#include <cstdio>
#include <algorithm>
#include <sparsepp.h>
#include <match_score.h>
#include <number.h>
struct KV {
uint8_t field_id;
uint16_t query_index;
uint16_t array_index;
uint64_t key;
uint64_t match_score;
int64_t scores[3]; // match score + 2 custom attributes
uint8_t field_id{};
uint16_t query_index{};
uint16_t array_index{};
uint64_t key{};
uint64_t distinct_key{};
uint64_t match_score{};
int64_t scores[3]{}; // match score + 2 custom attributes
KV(uint8_t fieldId, uint16_t queryIndex, uint64_t key, uint64_t distinct_key,
uint64_t match_score, const int64_t *scores):
field_id(fieldId), query_index(queryIndex), array_index(0), key(key),
distinct_key(distinct_key), match_score(match_score) {
this->scores[0] = scores[0];
this->scores[1] = scores[1];
this->scores[2] = scores[2];
}
KV() = default;
};
/*
@ -25,12 +35,19 @@ struct Topster {
uint32_t size;
KV *data;
KV* *kvs;
KV** kvs;
spp::sparse_hash_map<uint64_t, KV*> keys;
// For distinct, stores the min heap kv of each group_kv_map topster value
spp::sparse_hash_map<uint64_t, KV*> kv_map;
explicit Topster(size_t capacity): MAX_SIZE(capacity), size(0) {
// we allocate data first to get contiguous memory block whose indices are then assigned to `kvs`
spp::sparse_hash_map<uint64_t, Topster*> group_kv_map;
size_t distinct;
explicit Topster(size_t capacity): Topster(capacity, 0) {
}
explicit Topster(size_t capacity, size_t distinct): MAX_SIZE(capacity), size(0), distinct(distinct) {
// we allocate data first to get a memory block whose indices are then assigned to `kvs`
// we use separate **kvs for easier pointer swaps
data = new KV[capacity];
kvs = new KV*[capacity];
@ -38,15 +55,25 @@ struct Topster {
for(size_t i=0; i<capacity; i++) {
data[i].field_id = 0;
data[i].query_index = 0;
data[i].array_index = i;
data[i].key = 0;
data[i].distinct_key = 0;
data[i].match_score = 0;
kvs[i] = &data[i];
}
}
~Topster() {
delete [] data;
delete [] kvs;
delete[] data;
delete[] kvs;
for(auto& kv: group_kv_map) {
delete kv.second;
}
data = nullptr;
kvs = nullptr;
group_kv_map.clear();
}
static inline void swapMe(KV** a, KV** b) {
@ -59,133 +86,193 @@ struct Topster {
(*b)->array_index = a_index;
}
static inline void replace_key_values(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index,
const uint64_t &match_score, const int64_t *scores, uint32_t start,
KV* *kvs, spp::sparse_hash_map<uint64_t, KV*>& keys) {
kvs[start]->key = key;
kvs[start]->field_id = field_id;
kvs[start]->query_index = query_index;
kvs[start]->array_index = start;
kvs[start]->match_score = match_score;
kvs[start]->scores[0] = scores[0];
kvs[start]->scores[1] = scores[1];
kvs[start]->scores[2] = scores[2];
bool add(KV* kv) {
//LOG(INFO) << "kv_map size: " << kv_map.size() << " -- kvs[0]: " << kvs[0]->match_score;
/*for(auto kv: kv_map) {
LOG(INFO) << "kv key: " << kv.first << " => " << kv.second->match_score;
}*/
keys.erase(kvs[start]->key);
keys[key] = kvs[start];
}
bool less_than_min_heap = (size >= MAX_SIZE) && is_smaller_equal(kv, kvs[0]);
size_t heap_op_index = 0;
void add(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index, const uint64_t &match_score,
const int64_t scores[3]) {
if (size >= MAX_SIZE) {
if(!is_greater(kvs[0], scores)) {
// when incoming value is less than the smallest in the heap, ignore
return;
if(!distinct && less_than_min_heap) {
// for non-distinct, if the incoming value is smaller than the min heap, ignore it
return false;
}
bool SIFT_DOWN = true;
if(distinct) {
const auto& found_it = group_kv_map.find(kv->distinct_key);
bool is_duplicate_key = (found_it != group_kv_map.end());
if(!is_duplicate_key && less_than_min_heap) {
// for distinct, if a non-duplicate kv is smaller than the min heap, we ignore it
return false;
}
uint32_t start = 0;
if(is_duplicate_key) {
// if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift
Topster* group_topster = found_it->second;
KV old_min_heap_kv = *kv_map[kv->distinct_key];
bool added = group_topster->add(kv);
// When the key already exists and has a greater score, ignore. Otherwise, we have to replace.
// NOTE: we don't consider primary and secondary attrs here because they will be the same for a given key.
if(keys.count(key) != 0) {
const KV* existing = keys.at(key);
if(match_score <= existing->match_score) {
return ;
if(!added) {
return false;
}
// replace and sift down
start = existing->array_index;
}
// if new kv score is greater than previous min heap score we sift down, otherwise sift up
SIFT_DOWN = is_greater(kv, &old_min_heap_kv);
replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys);
// new kv is different from old_min_heap_kv so we have to sift heap
heap_op_index = old_min_heap_kv.array_index;
// sift down to maintain heap property
while ((2*start+1) < MAX_SIZE) {
uint32_t next = (2 * start + 1);
if (next+1 < MAX_SIZE && is_greater_kv(kvs[next], kvs[next+1])) {
next++;
}
// erase current min heap key from kv_map
kv_map.erase(old_min_heap_kv.distinct_key);
if (is_greater_kv(kvs[start], kvs[next])) {
swapMe(&kvs[start], &kvs[next]);
// kv will be copied into the pointer at heap_op_index
kv_map.emplace(kv->distinct_key, kvs[heap_op_index]);
} else {
// kv is guaranteed to be > current min heap: kvs[0]
// create fresh topster for this distinct group key since it does not exist
Topster* group_topster = new Topster(distinct, 0);
group_topster->add(kv);
// add new group key to map
group_kv_map.emplace(kv->distinct_key, group_topster);
// find heap operation index for updating kvs
if(size < MAX_SIZE) {
// there is enough space in the heap, so we just copy to the end
SIFT_DOWN = false;
heap_op_index = size;
size++;
} else {
break;
SIFT_DOWN = true;
// max size is reached so we are forced to replace current min heap element (kvs[0])
heap_op_index = 0;
// remove current min heap group key from maps
delete group_kv_map[kvs[heap_op_index]->distinct_key];
group_kv_map.erase(kvs[heap_op_index]->distinct_key);
kv_map.erase(kvs[heap_op_index]->distinct_key);
}
start = next;
// kv will be copied into the pointer at heap_op_index
kv_map.emplace(kv->distinct_key, kvs[heap_op_index]);
}
} else {
uint32_t start = size;
bool key_found = false;
// When the key already exists and has a greater score, ignore. Otherwise, we have to replace
if(keys.count(key) != 0) {
const KV* existing = keys.at(key);
if(match_score <= existing->match_score) {
return ;
} else { // not distinct
//LOG(INFO) << "Searching for key: " << kv->key;
const auto& found_it = kv_map.find(kv->key);
bool is_duplicate_key = (found_it != kv_map.end());
/*
is_duplicate_key: SIFT_DOWN regardless of `size`.
Else:
Do SIFT_UP if size < max_size
Else SIFT_DOWN
*/
if(is_duplicate_key) {
// Need to check if kv is greater than existing duplicate kv.
KV* existing_kv = found_it->second;
//LOG(INFO) << "existing_kv: " << existing_kv->key << " -> " << existing_kv->match_score;
if(is_smaller_equal(kv, existing_kv)) {
return false;
}
// replace and sift down
start = existing->array_index;
key_found = true;
}
SIFT_DOWN = true;
replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys);
// replace existing kv and sift down
heap_op_index = existing_kv->array_index;
kv_map.erase(kvs[heap_op_index]->key);
if(key_found) {
// need to sift down if it's a replace
while ((2*start+1) < size) {
uint32_t next = (2 * start + 1);
if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) {
next++;
}
// kv will be copied into the pointer at heap_op_index
kv_map.emplace(kv->key, kvs[heap_op_index]);
} else { // not duplicate
if (is_greater_kv(kvs[start], kvs[next])) {
swapMe(&kvs[start], &kvs[next]);
} else {
break;
}
start = next;
}
return ;
}
while(start > 0) {
uint32_t parent = (start-1)/2;
if (is_greater_kv(kvs[parent], kvs[start])) {
swapMe(&kvs[start], &kvs[parent]);
start = parent;
if(size < MAX_SIZE) {
// we just copy to end of array
SIFT_DOWN = false;
heap_op_index = size;
size++;
} else {
break;
// kv is guaranteed to be > min heap.
// we have to replace min heap element since array is full
SIFT_DOWN = true;
heap_op_index = 0;
kv_map.erase(kvs[heap_op_index]->key);
}
}
if(keys.count(key) != 0) {
size++;
// kv will be copied into the pointer at heap_op_index
kv_map.emplace(kv->key, kvs[heap_op_index]);
}
}
// we have to replace the existing element in the heap and sift down
kv->array_index = heap_op_index;
*kvs[heap_op_index] = *kv;
// sift up/down to maintain heap property
if(SIFT_DOWN) {
while ((2 * heap_op_index + 1) < size) {
uint32_t next = (2 * heap_op_index + 1); // left child
if (next+1 < size && is_greater(kvs[next], kvs[next + 1])) {
// for min heap we compare with the minimum of children
next++; // right child (2n + 2)
}
if (is_greater(kvs[heap_op_index], kvs[next])) {
swapMe(&kvs[heap_op_index], &kvs[next]);
} else {
break;
}
heap_op_index = next;
}
} else {
// SIFT UP
while(heap_op_index > 0) {
uint32_t parent = (heap_op_index - 1) / 2;
if (is_greater(kvs[parent], kvs[heap_op_index])) {
swapMe(&kvs[heap_op_index], &kvs[parent]);
heap_op_index = parent;
} else {
break;
}
}
}
return true;
}
static bool is_greater(const struct KV* i, const int64_t scores[3]) {
return std::tie(scores[0], scores[1], scores[2]) >
std::tie(i->scores[0], i->scores[1], i->scores[2]);
}
static bool is_greater_kv(const struct KV* i, const struct KV* j) {
static bool is_greater(const struct KV* i, const struct KV* j) {
return std::tie(i->scores[0], i->scores[1], i->scores[2], i->key) >
std::tie(j->scores[0], j->scores[1], j->scores[2], j->key);
}
static bool is_greater_kv_value(const struct KV & i, const struct KV & j) {
return std::tie(i.scores[0], i.scores[1], i.scores[2], i.key) >
std::tie(j.scores[0], j.scores[1], j.scores[2], j.key);
static bool is_smaller_equal(const struct KV* i, const struct KV* j) {
return std::tie(i->scores[0], i->scores[1], i->scores[2]) <=
std::tie(j->scores[0], j->scores[1], j->scores[2]);
}
static bool is_greater_kv_group(const std::vector<KV*>& i, const std::vector<KV*>& j) {
return std::tie(i[0]->scores[0], i[0]->scores[1], i[0]->scores[2], i[0]->key) >
std::tie(j[0]->scores[0], j[0]->scores[1], j[0]->scores[2], j[0]->key);
}
// topster must be sorted before being iterated upon, to remove dead array entries
void sort() {
std::stable_sort(kvs, kvs+size, is_greater_kv);
std::stable_sort(kvs, kvs + size, is_greater);
for(auto &group_topster: group_kv_map) {
group_topster.second->sort();
}
}
void clear(){
@ -196,6 +283,10 @@ struct Topster {
return kvs[index]->key;
}
uint64_t getDistinctKeyAt(uint32_t index) {
return kvs[index]->distinct_key;
}
KV* getKV(uint32_t index) {
return kvs[index];
}
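To recap the distinct path of `add()` above: every KV carries a `distinct_key`; KVs that share a key are collected into a per-group child Topster held in `group_kv_map`, while the outer min-heap ranks groups by their best KV and evicts the weakest group once `MAX_SIZE` is reached. The following is a simplified, standalone sketch of that grouped top-K idea; the names (`Hit`, `top_k_grouped`) are illustrative, and it omits the heap mechanics and the cap on the number of groups:

```
// Sketch: keep at most group_limit best-scored hits per distinct key.
#include <cstdint>
#include <algorithm>
#include <iostream>
#include <map>
#include <vector>

struct Hit {
    uint64_t key;          // document id
    uint64_t distinct_key; // hash of the group_by field values
    int64_t score;
};

std::map<uint64_t, std::vector<Hit>> top_k_grouped(const std::vector<Hit>& hits,
                                                   size_t group_limit) {
    std::map<uint64_t, std::vector<Hit>> groups;
    for (const Hit& h : hits) {
        auto& g = groups[h.distinct_key];
        g.push_back(h);
        std::sort(g.begin(), g.end(),
                  [](const Hit& a, const Hit& b) { return a.score > b.score; });
        if (g.size() > group_limit) {
            g.pop_back(); // evict the weakest hit of the group
        }
    }
    return groups;
}

int main() {
    std::vector<Hit> hits = {{1, 100, 50}, {2, 100, 70}, {3, 200, 60}, {4, 100, 40}};
    for (const auto& [dk, group] : top_k_grouped(hits, 2)) {
        std::cout << "group " << dk << ":";
        for (const Hit& h : group) std::cout << " doc=" << h.key << "(" << h.score << ")";
        std::cout << "\n";
    }
    // group 100 keeps docs 2 and 1; group 200 keeps doc 3
}
```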

View File

@ -1,10 +1,10 @@
#pragma once
#include "logger.h"
#include <string>
#include <iostream>
#include <cmdline.h>
#include "config.h"
#include "logger.h"
#include "store.h"
#include "collection_manager.h"
#include <csignal>

View File

@ -61,12 +61,12 @@ void array::remove_index(uint32_t start_index, uint32_t end_index) {
}
uint32_t size_required = (uint32_t) (unsorted_append_size_required(max, new_index) * FOR_GROWTH_FACTOR);
uint8_t *out = new uint8_t[size_required];
uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out);
uint32_t actual_size = for_compress_unsorted(new_array, out, new_index);
delete[] curr_array;
delete[] new_array;
delete[] in;
free(in);
in = out;
length = new_index;

View File

@ -107,14 +107,16 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA,
size_t indexA = 0, indexB = 0, res_index = 0;
if(A == nullptr && B == nullptr) {
return 0;
*out = nullptr;
return 0;
}
if(A == nullptr) {
*out = nullptr;
return 0;
}
if(B == nullptr) {
if(lenB == 0 || B == nullptr) {
*out = new uint32_t[lenA];
memcpy(*out, A, lenA * sizeof(uint32_t));
return lenA;

View File

@ -1384,8 +1384,6 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
art_fuzzy_recurse(0, 0, t->root, -1, term, term_len, irow, jrow, min_cost, max_cost, prefix, nodes);
}
PROCESS_NODES:
if(token_order == FREQUENCY) {
std::sort(nodes.begin(), nodes.end(), compare_art_node_frequency);
} else {

View File

@ -305,54 +305,68 @@ void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash
}
void Collection::populate_overrides(std::string query,
const std::map<std::string, size_t>& pinned_hits,
const std::map<size_t, std::vector<std::string>>& pinned_hits,
const std::vector<std::string>& hidden_hits,
std::map<uint32_t, size_t> & id_pos_map,
std::vector<uint32_t> & included_ids,
std::map<size_t, std::vector<uint32_t>>& include_ids,
std::vector<uint32_t> & excluded_ids) {
StringUtils::tolowercase(query);
std::set<uint32_t> excluded_set;
// If pinned or hidden hits are provided, they take precedence over overrides
// have to ensure that hidden hits take precedence over included hits
if(!hidden_hits.empty()) {
for(const auto & hit: hidden_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit);
if(seq_id_op.ok()) {
excluded_ids.push_back(seq_id_op.get());
excluded_set.insert(seq_id_op.get());
}
}
}
for(const auto & override_kv: overrides) {
const auto & override = override_kv.second;
if( (override.rule.match == override_t::MATCH_EXACT && override.rule.query == query) ||
(override.rule.match == override_t::MATCH_CONTAINS && query.find(override.rule.query) != std::string::npos) ) {
for(const auto & hit: override.add_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.doc_id);
if(seq_id_op.ok()) {
included_ids.push_back(seq_id_op.get());
id_pos_map[seq_id_op.get()] = hit.position;
}
}
// have to ensure that dropped hits take precedence over added hits
for(const auto & hit: override.drop_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.doc_id);
if(seq_id_op.ok()) {
excluded_ids.push_back(seq_id_op.get());
excluded_set.insert(seq_id_op.get());
}
}
for(const auto & hit: override.add_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.doc_id);
if(!seq_id_op.ok()) {
continue;
}
uint32_t seq_id = seq_id_op.get();
bool excluded = (excluded_set.count(seq_id) != 0);
if(!excluded) {
include_ids[hit.position].push_back(seq_id);
}
}
}
}
// If pinned or hidden hits are provided, they take precedence over overrides
if(!pinned_hits.empty()) {
for(const auto & hit: pinned_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.first);
if(seq_id_op.ok()) {
included_ids.push_back(seq_id_op.get());
id_pos_map[seq_id_op.get()] = hit.second;
}
}
}
if(!hidden_hits.empty()) {
for(const auto & hit: hidden_hits) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit);
if(seq_id_op.ok()) {
included_ids.erase(std::remove(included_ids.begin(), included_ids.end(), seq_id_op.get()), included_ids.end());
id_pos_map.erase(seq_id_op.get());
excluded_ids.push_back(seq_id_op.get());
for(const auto& pos_ids: pinned_hits) {
size_t pos = pos_ids.first;
for(const std::string& id: pos_ids.second) {
Option<uint32_t> seq_id_op = doc_id_to_seq_id(id);
if(!seq_id_op.ok()) {
continue;
}
uint32_t seq_id = seq_id_op.get();
bool excluded = (excluded_set.count(seq_id) != 0);
if(!excluded) {
include_ids[pos].push_back(seq_id);
}
}
}
}
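The net effect of `populate_overrides` is a fixed precedence order: hidden hits are resolved into an exclusion set first, and both override `add_hits` and `pinned_hits` consult that set before contributing to `include_ids`. Here is a minimal sketch of just that precedence rule, with simplified types and no doc-id-to-seq-id lookup:

```
// Sketch: hidden hits win over pinned hits. Ids and positions are made up.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <vector>

int main() {
    std::set<uint32_t> excluded_set = {42}; // built from hidden_hits first

    // pinned_hits: position => candidate seq ids
    std::map<size_t, std::vector<uint32_t>> pinned_hits = {{1, {42, 7}}};

    std::map<size_t, std::vector<uint32_t>> include_ids; // position => seq ids
    for (const auto& [pos, ids] : pinned_hits) {
        for (uint32_t seq_id : ids) {
            if (excluded_set.count(seq_id) == 0) { // hidden takes precedence
                include_ids[pos].push_back(seq_id);
            }
        }
    }

    for (const auto& [pos, ids] : include_ids) {
        for (uint32_t id : ids) std::cout << "pos " << pos << " -> " << id << "\n";
    }
    // prints only "pos 1 -> 7": id 42 was hidden, so pinning it has no effect
}
```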
@ -371,20 +385,60 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
const size_t snippet_threshold,
const std::string & highlight_full_fields,
size_t typo_tokens_threshold,
const std::map<std::string, size_t>& pinned_hits,
const std::vector<std::string>& hidden_hits) {
const std::map<size_t, std::vector<std::string>>& pinned_hits,
const std::vector<std::string>& hidden_hits,
const std::vector<std::string>& group_by_fields,
const size_t group_limit) {
if(query != "*" && search_fields.empty()) {
return Option<nlohmann::json>(400, "No search fields specified for the query.");
}
if(!group_by_fields.empty() && (group_limit == 0 || group_limit > GROUP_LIMIT_MAX)) {
return Option<nlohmann::json>(400, "Value of `group_limit` must be between 1 and " +
std::to_string(GROUP_LIMIT_MAX) + ".");
}
std::vector<uint32_t> included_ids;
std::vector<uint32_t> excluded_ids;
std::map<uint32_t, size_t> id_pos_map;
populate_overrides(query, pinned_hits, hidden_hits, id_pos_map, included_ids, excluded_ids);
std::map<size_t, std::vector<uint32_t>> include_ids; // position => list of IDs
populate_overrides(query, pinned_hits, hidden_hits, include_ids, excluded_ids);
std::map<uint32_t, std::vector<uint32_t>> index_to_included_ids;
/*for(auto kv: include_ids) {
LOG(INFO) << "key: " << kv.first;
for(auto val: kv.second) {
LOG(INFO) << val;
}
}
LOG(INFO) << "Excludes:";
for(auto id: excluded_ids) {
LOG(INFO) << id;
}
LOG(INFO) << "include_ids size: " << include_ids.size();
for(auto& group: include_ids) {
for(uint32_t& seq_id: group.second) {
LOG(INFO) << "seq_id: " << seq_id;
}
LOG(INFO) << "----";
}
*/
std::map<uint32_t, std::map<size_t, std::map<size_t, uint32_t>>> index_to_included_ids;
std::map<uint32_t, std::vector<uint32_t>> index_to_excluded_ids;
for(auto seq_id: included_ids) {
auto index_id = (seq_id % num_indices);
index_to_included_ids[index_id].push_back(seq_id);
for(const auto& pos_ids: include_ids) {
size_t outer_pos = pos_ids.first;
size_t ids_per_pos = std::max(size_t(1), group_limit);
for(size_t inner_pos = 0; inner_pos < std::min(ids_per_pos, pos_ids.second.size()); inner_pos++) {
auto seq_id = pos_ids.second[inner_pos];
auto index_id = (seq_id % num_indices);
index_to_included_ids[index_id][outer_pos][inner_pos] = seq_id;
//LOG(INFO) << "Adding seq_id " << seq_id << " to index_id " << index_id;
}
}
for(auto seq_id: excluded_ids) {
@ -408,6 +462,22 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
}
}
// validate group by fields
for(const std::string & field_name: group_by_fields) {
if(search_schema.count(field_name) == 0) {
std::string error = "Could not find a field named `" + field_name + "` in the schema.";
return Option<nlohmann::json>(404, error);
}
field search_field = search_schema.at(field_name);
// must be a facet field
if(!search_field.is_facet()) {
std::string error = "Group by field `" + field_name + "` should be a facet field.";
return Option<nlohmann::json>(400, error);
}
}
// validate filter fields
std::vector<std::string> filter_blocks;
StringUtils::split(simple_filter_query, filter_blocks, "&&");
@ -515,7 +585,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
}
// for a wildcard query, if filter is not specified, use default_sorting_field as a catch-all
if(query == "*" && filters.size() == 0) {
if(query == "*" && filters.empty()) {
field f = search_schema.at(default_sorting_field);
std::string max_value = f.is_float() ? std::to_string(std::numeric_limits<float>::max()) :
std::to_string(std::numeric_limits<int32_t>::max());
@ -631,19 +701,21 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
const size_t max_hits = std::min((page * per_page), get_num_documents());
std::vector<std::vector<art_leaf*>> searched_queries; // search queries used for generating the results
std::vector<KV> raw_result_kvs;
std::vector<KV> override_result_kvs;
std::vector<std::vector<KV*>> raw_result_kvs;
std::vector<std::vector<KV*>> override_result_kvs;
size_t total_found = 0;
spp::sparse_hash_set<uint64_t> groups_processed; // used to calculate total_found for grouped query
// send data to individual index threads
size_t index_id = 0;
for(Index* index: indices) {
index->search_params = search_args(query, search_fields, filters, facets,
index_to_included_ids[index_id], index_to_excluded_ids[index_id],
sort_fields_std, facet_query, num_typos, max_facet_values, max_hits,
per_page, page, token_order, prefix,
drop_tokens_threshold, typo_tokens_threshold);
index->search_params = new search_args(query, search_fields, filters, facets,
index_to_included_ids[index_id], index_to_excluded_ids[index_id],
sort_fields_std, facet_query, num_typos, max_facet_values, max_hits,
per_page, page, token_order, prefix,
drop_tokens_threshold, typo_tokens_threshold,
group_by_fields, group_limit);
{
std::lock_guard<std::mutex> lk(index->m);
index->ready = true;
@ -655,6 +727,12 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
Option<nlohmann::json> index_search_op({}); // stores the last error across all index threads
// for grouping we have to re-aggregate
const size_t topster_size = std::max((size_t)1, max_hits);
Topster topster(topster_size, group_limit);
Topster curated_topster(topster_size, group_limit);
for(Index* index: indices) {
// wait for the worker
{
@ -662,9 +740,9 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
index->cv.wait(lk, [index]{return index->processed;});
}
if(!index->search_params.outcome.ok()) {
index_search_op = Option<nlohmann::json>(index->search_params.outcome.code(),
index->search_params.outcome.error());
if(!index->search_params->outcome.ok()) {
index_search_op = Option<nlohmann::json>(index->search_params->outcome.code(),
index->search_params->outcome.error());
}
if(!index_search_op.ok()) {
@ -672,34 +750,33 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
continue;
}
for(auto & field_order_kv: index->search_params.raw_result_kvs) {
field_order_kv.query_index += searched_queries.size();
raw_result_kvs.push_back(field_order_kv);
}
aggregate_topster(searched_queries.size(), topster, index->search_params->topster);
aggregate_topster(searched_queries.size(), curated_topster, index->search_params->curated_topster);
for(auto & field_order_kv: index->search_params.override_result_kvs) {
field_order_kv.query_index += searched_queries.size();
override_result_kvs.push_back(field_order_kv);
}
searched_queries.insert(searched_queries.end(), index->search_params->searched_queries.begin(),
index->search_params->searched_queries.end());
searched_queries.insert(searched_queries.end(), index->search_params.searched_queries.begin(),
index->search_params.searched_queries.end());
for(size_t fi = 0; fi < index->search_params.facets.size(); fi++) {
auto & this_facet = index->search_params.facets[fi];
for(size_t fi = 0; fi < index->search_params->facets.size(); fi++) {
auto & this_facet = index->search_params->facets[fi];
auto & acc_facet = facets[fi];
for(auto & facet_kv: this_facet.result_map) {
size_t count = 0;
if(acc_facet.result_map.count(facet_kv.first) == 0) {
// not found, so set it
count = facet_kv.second.count;
if(index->search_params->group_limit) {
// we have to add all group sets
acc_facet.result_map[facet_kv.first].groups.insert(
facet_kv.second.groups.begin(), facet_kv.second.groups.end()
);
} else {
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
size_t count = 0;
if(acc_facet.result_map.count(facet_kv.first) == 0) {
// not found, so set it
count = facet_kv.second.count;
} else {
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
}
acc_facet.result_map[facet_kv.first].count = count;
}
acc_facet.result_map[facet_kv.first].count = count;
acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos;
@ -713,138 +790,195 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
}
}
total_found += index->search_params.all_result_ids_len;
if(group_limit) {
groups_processed.insert(
index->search_params->groups_processed.begin(),
index->search_params->groups_processed.end()
);
} else {
total_found += index->search_params->all_result_ids_len;
}
}
if(!index_search_op.ok()) {
return index_search_op;
}
// All fields are sorted descending
std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_value);
topster.sort();
curated_topster.sort();
// Sort based on position in overridden list
populate_result_kvs(&topster, raw_result_kvs);
populate_result_kvs(&curated_topster, override_result_kvs);
// for grouping we have to aggregate group set sizes to a count value
if(group_limit) {
for(auto& acc_facet: facets) {
for(auto& facet_kv: acc_facet.result_map) {
facet_kv.second.count = facet_kv.second.groups.size();
}
}
total_found = groups_processed.size() + override_result_kvs.size();
}
// All fields are sorted descending
std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_group);
// Sort based on position in overridden list
std::sort(
override_result_kvs.begin(), override_result_kvs.end(),
[&id_pos_map](const KV & a, const KV & b) -> bool {
return id_pos_map[a.key] < id_pos_map[b.key];
[](const std::vector<KV*>& a, std::vector<KV*>& b) -> bool {
return a[0]->distinct_key < b[0]->distinct_key;
}
);
nlohmann::json result = nlohmann::json::object();
result["hits"] = nlohmann::json::array();
result["found"] = total_found;
std::vector<KV> result_kvs;
std::vector<std::vector<KV*>> result_group_kvs;
size_t override_kv_index = 0;
size_t raw_results_index = 0;
// merge raw results and override results
while(override_kv_index < override_result_kvs.size() && raw_results_index < raw_result_kvs.size()) {
if(override_kv_index < override_result_kvs.size() &&
id_pos_map.count(override_result_kvs[override_kv_index].key) != 0 &&
result_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index].key]) {
result_kvs.push_back(override_result_kvs[override_kv_index]);
override_kv_index++;
size_t result_position = result_group_kvs.size() + 1;
uint64_t override_position = override_result_kvs[override_kv_index][0]->distinct_key;
if(result_position == override_position) {
override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result
result_group_kvs.push_back(override_result_kvs[override_kv_index]);
override_kv_index++;
} else {
result_kvs.push_back(raw_result_kvs[raw_results_index]);
result_group_kvs.push_back(raw_result_kvs[raw_results_index]);
raw_results_index++;
}
}
while(override_kv_index < override_result_kvs.size()) {
result_kvs.push_back(override_result_kvs[override_kv_index]);
override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result
result_group_kvs.push_back({override_result_kvs[override_kv_index]});
override_kv_index++;
}
while(raw_results_index < raw_result_kvs.size()) {
result_kvs.push_back(raw_result_kvs[raw_results_index]);
result_group_kvs.push_back(raw_result_kvs[raw_results_index]);
raw_results_index++;
}
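These three loops interleave curated and organic results: a curated entry is emitted whenever the next 1-based output position matches its pinned position (carried in `distinct_key` here), and leftovers from either list are appended afterwards. A standalone sketch of that positional merge, with hypothetical names (`curated`, `ranked`):

```
// Sketch: merge pinned results at fixed positions into a ranked list.
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
    // (requested 1-based position, id), sorted by position
    std::vector<std::pair<size_t, std::string>> curated = {{1, "C1"}, {3, "C2"}};
    std::vector<std::string> ranked = {"R1", "R2", "R3"};

    std::vector<std::string> merged;
    size_t ci = 0, ri = 0;
    while (ci < curated.size() && ri < ranked.size()) {
        if (merged.size() + 1 == curated[ci].first) {
            merged.push_back(curated[ci++].second); // pinned slot reached
        } else {
            merged.push_back(ranked[ri++]);
        }
    }
    while (ci < curated.size()) merged.push_back(curated[ci++].second);
    while (ri < ranked.size()) merged.push_back(ranked[ri++]);

    for (const auto& id : merged) std::cout << id << " ";
    std::cout << "\n"; // prints: C1 R1 C2 R2 R3
}
```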
const long start_result_index = (page - 1) * per_page;
const long end_result_index = std::min(max_hits, result_kvs.size()) - 1; // could be -1 when max_hits is 0
const long end_result_index = std::min(max_hits, result_group_kvs.size()) - 1; // could be -1 when max_hits is 0
nlohmann::json result = nlohmann::json::object();
result["found"] = total_found;
std::string hits_key = group_limit ? "grouped_hits" : "hits";
result[hits_key] = nlohmann::json::array();
// construct results array
for(long result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) {
const auto & field_order_kv = result_kvs[result_kvs_index];
const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv.key);
const std::vector<KV*> & kv_group = result_group_kvs[result_kvs_index];
nlohmann::json document;
const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
if(!document_op.ok()) {
LOG(ERROR) << "Document fetch error. " << document_op.error();
continue;
nlohmann::json group_hits;
if(group_limit) {
group_hits["hits"] = nlohmann::json::array();
}
nlohmann::json wrapper_doc;
wrapper_doc["highlights"] = nlohmann::json::array();
std::vector<highlight_t> highlights;
StringUtils string_utils;
nlohmann::json& hits_array = group_limit ? group_hits["hits"] : result["hits"];
// find out if fields have to be highlighted fully
std::vector<std::string> fields_highlighted_fully_vec;
spp::sparse_hash_set<std::string> fields_highlighted_fully;
StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ",");
for(const KV* field_order_kv: kv_group) {
const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key);
for(std::string & highlight_full_field: fields_highlighted_fully_vec) {
StringUtils::trim(highlight_full_field);
fields_highlighted_fully.emplace(highlight_full_field);
}
nlohmann::json document;
const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
for(const std::string & field_name: search_fields) {
// should not pick excluded field for highlighting
if(exclude_fields.count(field_name) > 0) {
if(!document_op.ok()) {
LOG(ERROR) << "Document fetch error. " << document_op.error();
continue;
}
field search_field = search_schema.at(field_name);
if(query != "*" && (search_field.type == field_types::STRING ||
search_field.type == field_types::STRING_ARRAY)) {
nlohmann::json wrapper_doc;
wrapper_doc["highlights"] = nlohmann::json::array();
std::vector<highlight_t> highlights;
StringUtils string_utils;
bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
highlight_t highlight;
highlight_result(search_field, searched_queries, field_order_kv, document,
string_utils, snippet_threshold, highlighted_fully, highlight);
// find out if fields have to be highlighted fully
std::vector<std::string> fields_highlighted_fully_vec;
spp::sparse_hash_set<std::string> fields_highlighted_fully;
StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ",");
if(!highlight.snippets.empty()) {
highlights.push_back(highlight);
}
for(std::string & highlight_full_field: fields_highlighted_fully_vec) {
StringUtils::trim(highlight_full_field);
fields_highlighted_fully.emplace(highlight_full_field);
}
}
std::sort(highlights.begin(), highlights.end());
for(const auto & highlight: highlights) {
nlohmann::json h_json = nlohmann::json::object();
h_json["field"] = highlight.field;
bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end());
if(!highlight.indices.empty()) {
h_json["indices"] = highlight.indices;
h_json["snippets"] = highlight.snippets;
if(highlight_fully) {
h_json["values"] = highlight.values;
for(const std::string & field_name: search_fields) {
// should not pick excluded field for highlighting
if(exclude_fields.count(field_name) > 0) {
continue;
}
} else {
h_json["snippet"] = highlight.snippets[0];
if(highlight_fully) {
h_json["value"] = highlight.values[0];
field search_field = search_schema.at(field_name);
if(query != "*" && (search_field.type == field_types::STRING ||
search_field.type == field_types::STRING_ARRAY)) {
bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
highlight_t highlight;
highlight_result(search_field, searched_queries, field_order_kv, document,
string_utils, snippet_threshold, highlighted_fully, highlight);
if(!highlight.snippets.empty()) {
highlights.push_back(highlight);
}
}
}
wrapper_doc["highlights"].push_back(h_json);
std::sort(highlights.begin(), highlights.end());
for(const auto & highlight: highlights) {
nlohmann::json h_json = nlohmann::json::object();
h_json["field"] = highlight.field;
bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end());
if(!highlight.indices.empty()) {
h_json["indices"] = highlight.indices;
h_json["snippets"] = highlight.snippets;
if(highlight_fully) {
h_json["values"] = highlight.values;
}
} else {
h_json["snippet"] = highlight.snippets[0];
if(highlight_fully) {
h_json["value"] = highlight.values[0];
}
}
wrapper_doc["highlights"].push_back(h_json);
}
//wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key;
prune_document(document, include_fields, exclude_fields);
wrapper_doc["document"] = document;
wrapper_doc["text_match"] = field_order_kv->match_score;
if(field_order_kv->match_score == 0) {
wrapper_doc["curated"] = true;
}
hits_array.push_back(wrapper_doc);
}
prune_document(document, include_fields, exclude_fields);
wrapper_doc["document"] = document;
wrapper_doc["text_match"] = field_order_kv.match_score;
//wrapper_doc["seq_id"] = (uint32_t) field_order_kv.key;
if(group_limit) {
const auto& document = group_hits["hits"][0]["document"];
result["hits"].push_back(wrapper_doc);
group_hits["group_key"] = nlohmann::json::array();
for(const auto& field_name: group_by_fields) {
if(document.count(field_name) != 0) {
group_hits["group_key"].push_back(document[field_name]);
}
}
result["grouped_hits"].push_back(group_hits);
}
}
result["facet_counts"] = nlohmann::json::array();
@ -941,6 +1075,11 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
result["facet_counts"].push_back(facet_result);
}
// free search params
for(Index* index: indices) {
delete index->search_params;
}
result["request_params"] = nlohmann::json::object();;
result["request_params"]["per_page"] = per_page;
result["request_params"]["q"] = query;
@ -951,6 +1090,43 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
return result;
}
void Collection::populate_result_kvs(Topster *topster, std::vector<std::vector<KV *>> &result_kvs) const {
if(topster->distinct) {
for(auto &group_topster_entry: topster->group_kv_map) {
Topster* group_topster = group_topster_entry.second;
const std::vector<KV*> group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size);
result_kvs.emplace_back(group_kvs);
}
} else {
for(uint32_t t = 0; t < topster->size; t++) {
KV* kv = topster->getKV(t);
result_kvs.push_back({kv});
}
}
}
void Collection::aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const {
if(index_topster->distinct) {
for(auto &group_topster_entry: index_topster->group_kv_map) {
Topster* group_topster = group_topster_entry.second;
const std::vector<KV*> group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size);
for(KV* kv: group_kvs) {
kv->query_index += query_index;
topster.add(kv);
}
}
} else {
for(uint32_t t = 0; t < index_topster->size; t++) {
KV* kv = index_topster->getKV(t);
kv->query_index += query_index;
topster.add(kv);
}
}
}
void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count,
const nlohmann::json &document, std::string &value) {
@ -989,7 +1165,7 @@ void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t
void Collection::highlight_result(const field &search_field,
const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV & field_order_kv, const nlohmann::json & document,
const KV* field_order_kv, const nlohmann::json & document,
StringUtils & string_utils, size_t snippet_threshold,
bool highlighted_fully,
highlight_t & highlight) {
@ -997,15 +1173,15 @@ void Collection::highlight_result(const field &search_field,
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
std::vector<art_leaf *> query_suggestion;
for (const art_leaf *token_leaf : searched_queries[field_order_kv.query_index]) {
for (const art_leaf *token_leaf : searched_queries[field_order_kv->query_index]) {
// Must search for the token string afresh on this field for the given document, since `token_leaf`
// comes from the best-matched field and need not be present in other fields of the document.
Index* index = indices[field_order_kv.key % num_indices];
Index* index = indices[field_order_kv->key % num_indices];
art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len);
if(actual_leaf != nullptr) {
query_suggestion.push_back(actual_leaf);
std::vector<uint16_t> positions;
uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv.key);
uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);
auto doc_indices = new uint32_t[1];
doc_indices[0] = doc_index;
leaf_to_indices.emplace(actual_leaf, doc_indices);
@ -1031,8 +1207,8 @@ void Collection::highlight_result(const field &search_field,
continue;
}
const Match & this_match = Match::match(field_order_kv.key, token_positions);
uint64_t this_match_score = this_match.get_match_score(1, field_order_kv.field_id);
const Match & this_match = Match::match(field_order_kv->key, token_positions);
uint64_t this_match_score = this_match.get_match_score(1, field_order_kv->field_id);
match_indices.emplace_back(this_match, this_match_score, array_index);
}
@ -1231,7 +1407,7 @@ Option<uint32_t> Collection::add_override(const override_t & override) {
return Option<uint32_t>(500, "Error while storing the override on disk.");
}
overrides[override.id] = override;
overrides.emplace(override.id, override);
return Option<uint32_t>(200);
}

View File

@ -212,6 +212,9 @@ bool get_search(http_req & req, http_res & res) {
const char *FACET_QUERY = "facet_query";
const char *MAX_FACET_VALUES = "max_facet_values";
const char *GROUP_BY = "group_by";
const char *GROUP_LIMIT = "group_limit";
const char *PER_PAGE = "per_page";
const char *PAGE = "page";
const char *CALLBACK = "callback";
@ -249,11 +252,6 @@ bool get_search(http_req & req, http_res & res) {
return false;
}
if(req.params.count(QUERY_BY) == 0) {
res.set_400(std::string("Parameter `") + QUERY_BY + "` is required.");
return false;
}
if(req.params.count(MAX_FACET_VALUES) == 0) {
req.params[MAX_FACET_VALUES] = "10";
}
@ -291,41 +289,58 @@ bool get_search(http_req & req, http_res & res) {
req.params[EXCLUDE_FIELDS] = "";
}
if(!StringUtils::is_uint64_t(req.params[DROP_TOKENS_THRESHOLD])) {
if(req.params.count(GROUP_BY) == 0) {
req.params[GROUP_BY] = "";
}
if(req.params.count(GROUP_LIMIT) == 0) {
if(req.params[GROUP_BY] != "") {
req.params[GROUP_LIMIT] = "3";
} else {
req.params[GROUP_LIMIT] = "0";
}
}
if(!StringUtils::is_uint32_t(req.params[DROP_TOKENS_THRESHOLD])) {
res.set_400("Parameter `" + std::string(DROP_TOKENS_THRESHOLD) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[TYPO_TOKENS_THRESHOLD])) {
if(!StringUtils::is_uint32_t(req.params[TYPO_TOKENS_THRESHOLD])) {
res.set_400("Parameter `" + std::string(TYPO_TOKENS_THRESHOLD) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[NUM_TYPOS])) {
if(!StringUtils::is_uint32_t(req.params[NUM_TYPOS])) {
res.set_400("Parameter `" + std::string(NUM_TYPOS) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[PER_PAGE])) {
if(!StringUtils::is_uint32_t(req.params[PER_PAGE])) {
res.set_400("Parameter `" + std::string(PER_PAGE) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[PAGE])) {
if(!StringUtils::is_uint32_t(req.params[PAGE])) {
res.set_400("Parameter `" + std::string(PAGE) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[MAX_FACET_VALUES])) {
if(!StringUtils::is_uint32_t(req.params[MAX_FACET_VALUES])) {
res.set_400("Parameter `" + std::string(MAX_FACET_VALUES) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint64_t(req.params[SNIPPET_THRESHOLD])) {
if(!StringUtils::is_uint32_t(req.params[SNIPPET_THRESHOLD])) {
res.set_400("Parameter `" + std::string(SNIPPET_THRESHOLD) + "` must be an unsigned integer.");
return false;
}
if(!StringUtils::is_uint32_t(req.params[GROUP_LIMIT])) {
res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer.");
return false;
}
std::string filter_str = req.params.count(FILTER) != 0 ? req.params[FILTER] : "";
std::vector<std::string> search_fields;
@ -343,6 +358,9 @@ bool get_search(http_req & req, http_res & res) {
spp::sparse_hash_set<std::string> include_fields(include_fields_vec.begin(), include_fields_vec.end());
spp::sparse_hash_set<std::string> exclude_fields(exclude_fields_vec.begin(), exclude_fields_vec.end());
std::vector<std::string> group_by_fields;
StringUtils::split(req.params[GROUP_BY], group_by_fields, ",");
std::vector<sort_by> sort_fields;
if(req.params.count(SORT_BY) != 0) {
std::vector<std::string> sort_field_strs;
@ -367,7 +385,8 @@ bool get_search(http_req & req, http_res & res) {
}
}
std::map<std::string, size_t> pinned_hits;
std::map<size_t, std::vector<std::string>> pinned_hits;
if(req.params.count(PINNED_HITS) != 0) {
std::vector<std::string> pinned_hits_strs;
StringUtils::split(req.params[PINNED_HITS], pinned_hits_strs, ",");
@ -392,7 +411,7 @@ bool get_search(http_req & req, http_res & res) {
return false;
}
pinned_hits.emplace(expression_parts[0], position);
pinned_hits[position].emplace_back(expression_parts[0]);
}
}
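`pinned_hits` now keys by position so that several IDs can share a slot (needed for grouped pinning). A hypothetical helper sketching how an `id:position` list could map into this structure — the helper name and separator characters are assumptions for illustration, not the server's actual parser:

```
#include <cstddef>
#include <map>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical: parses "13:1,4:2" (id:position pairs) into position -> [ids].
std::map<size_t, std::vector<std::string>> parse_pinned(const std::string& raw) {
    std::map<size_t, std::vector<std::string>> pinned;
    std::stringstream ss(raw);
    std::string pair;
    while (std::getline(ss, pair, ',')) {
        const size_t colon = pair.find(':');
        if (colon == std::string::npos) {
            continue; // skip malformed entries in this sketch
        }
        const size_t position = std::stoul(pair.substr(colon + 1));
        pinned[position].emplace_back(pair.substr(0, colon));
    }
    return pinned;
}
```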
@ -422,17 +441,19 @@ bool get_search(http_req & req, http_res & res) {
Option<nlohmann::json> result_op = collection->search(req.params[QUERY], search_fields, filter_str, facet_fields,
sort_fields, std::stoi(req.params[NUM_TYPOS]),
static_cast<size_t>(std::stoi(req.params[PER_PAGE])),
static_cast<size_t>(std::stoi(req.params[PAGE])),
static_cast<size_t>(std::stol(req.params[PER_PAGE])),
static_cast<size_t>(std::stol(req.params[PAGE])),
token_order, prefix, drop_tokens_threshold,
include_fields, exclude_fields,
static_cast<size_t>(std::stoi(req.params[MAX_FACET_VALUES])),
static_cast<size_t>(std::stol(req.params[MAX_FACET_VALUES])),
req.params[FACET_QUERY],
static_cast<size_t>(std::stoi(req.params[SNIPPET_THRESHOLD])),
static_cast<size_t>(std::stol(req.params[SNIPPET_THRESHOLD])),
req.params[HIGHLIGHT_FULL_FIELDS],
typo_tokens_threshold,
pinned_hits,
hidden_hits
hidden_hits,
group_by_fields,
static_cast<size_t>(std::stol(req.params[GROUP_LIMIT]))
);
uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
@ -873,7 +894,7 @@ bool post_create_key(http_req &req, http_res &res) {
return false;
}
const std::string &rand_key = StringUtils::randstring(AuthManager::KEY_LEN, req.seed);
const std::string &rand_key = req.metadata;
api_key_t api_key(
rand_key,

View File

@ -6,6 +6,7 @@
#include <signal.h>
#include <h2o.h>
#include <iostream>
#include <auth_manager.h>
#include "raft_server.h"
#include "logger.h"
@ -186,20 +187,6 @@ std::string HttpServer::get_version() {
void HttpServer::clear_timeouts(const std::vector<h2o_timer_t*> & timers, bool trigger_callback) {
for(h2o_timer_t* timer: timers) {
h2o_timer_unlink(timer);
/*while (!h2o_linklist_is_empty(&timer->_link)) {
h2o_timer_t *entry = H2O_STRUCT_FROM_MEMBER(h2o_timer_t, _link, timer->_link.next);
if(entry == nullptr) {
continue;
}
if(trigger_callback) {
entry->cb(entry);
}
//entry->expire_at = 0;
h2o_linklist_unlink(&entry->_link);
h2o_timer_unlink(timer);
}*/
}
}
@ -385,6 +372,12 @@ int HttpServer::catch_all_handler(h2o_handler_t *_self, h2o_req_t *req) {
}
// routes match and is an authenticated request
// do any additional pre-request middleware operations here
if(rpath->action == "keys:create") {
// we enrich the incoming request with a random API key here so that the leader and replicas will use the same key
request->metadata = StringUtils::randstring(AuthManager::KEY_LEN);
}
// for writes, we defer to replication_state
if(http_method != "GET") {
self->http_server->get_replication_state()->write(request, response);
@ -476,26 +469,13 @@ void HttpServer::on(const std::string & message, bool (*handler)(void*)) {
HttpServer::~HttpServer() {
delete message_dispatcher;
// remove all timeouts defined in: https://github.com/h2o/h2o/blob/v2.2.2/lib/core/context.c#L142
/*std::vector<h2o_timeout_t*> timeouts = {
&ctx.zero_timeout,
&ctx.one_sec_timeout,
&ctx.hundred_ms_timeout,
&ctx.handshake_timeout,
&ctx.http1.req_timeout,
&ctx.http2.idle_timeout,
&ctx.http2.graceful_shutdown_timeout,
&ctx.proxy.io_timeout
};
clear_timeouts(timeouts);
*/
if(ssl_refresh_timer.timer.expire_at != 0) {
// avoid callback since it recreates timeout
clear_timeouts({&ssl_refresh_timer.timer}, false);
}
h2o_timerwheel_run(ctx.loop->_timeouts, 9999999999999);
h2o_context_dispose(&ctx);
free(ctx.globalconf->server_name.base);

View File

@ -633,7 +633,7 @@ void Index::compute_facet_stats(facet &a_facet, int64_t raw_value, const std::st
void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const uint32_t* result_ids, size_t results_size) {
std::map<std::string, size_t> facet_to_index;
std::unordered_map<std::string, size_t> facet_to_index;
size_t i_facet = 0;
for(const auto & facet: facet_schema) {
@ -661,6 +661,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
std::vector<std::string> query_tokens;
StringUtils::split(facet_query.query, query_tokens, " ");
// for non-string fields, `faceted_name` returns their aliased stringified field name
art_tree *t = search_index.at(facet_field.faceted_name());
for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
@ -703,7 +704,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_id];
int array_pos = 0;
int fvalue_found = 0;
bool fvalue_found = false;
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
spp::sparse_hash_map<uint32_t, token_pos_cost_t> query_token_positions;
size_t field_token_index = -1;
@ -717,9 +718,9 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
// ftoken_hash is the raw value for numeric fields
compute_facet_stats(a_facet, ftoken_hash, facet_field.type);
// not using facet query or this particular facet value is found in facet filter
if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) {
// not using facet query or this particular facet value is found in facet filter
fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value
fvalue_found = true;
if(use_facet_query) {
// map token index to query index (used for highlighting later on)
@ -736,41 +737,35 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
}
}
//std::cout << "j: " << j << std::endl;
// 0 indicates a separator, while the second condition checks for a non-array string
if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && j == fhashes.size() - 1)) {
if(!use_facet_query || fvalue_found != 0) {
if(!use_facet_query || fvalue_found) {
const std::string & fvalue_str = fvaluestream.str();
uint64_t fhash = 0;
if(facet_field.is_string()) {
fhash = facet_token_hash(facet_field, fvalue_str);
} else {
fhash = std::atoi(fvalue_str.c_str());
}
uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
if(a_facet.result_map.count(fhash) == 0) {
a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0,
a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_set<uint64_t>(),
doc_seq_id, 0,
spp::sparse_hash_map<uint32_t, token_pos_cost_t>()};
}
a_facet.result_map[fhash].count += 1;
a_facet.result_map[fhash].doc_id = doc_seq_id;
a_facet.result_map[fhash].array_pos = array_pos;
if(search_params->group_limit) {
uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id);
a_facet.result_map[fhash].groups.emplace(distinct_id);
} else {
a_facet.result_map[fhash].count += 1;
}
if(use_facet_query) {
a_facet.result_map[fhash].query_token_pos = query_token_positions;
/*if(j == 11) {
for(auto xx: query_token_positions) {
std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl;
}
}*/
}
}
array_pos++;
fvalue_found = 0;
fvalue_found = false;
std::stringstream().swap(fvaluestream);
spp::sparse_hash_map<uint32_t, token_pos_cost_t>().swap(query_token_positions);
field_token_index = -1;
@ -781,56 +776,12 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
}
}
void Index::drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids) {
std::map<std::string, size_t> facet_to_index;
size_t i_facet = 0;
for(const auto & facet: facet_schema) {
facet_to_index[facet.first] = i_facet;
i_facet++;
}
for(auto & a_facet: facets) {
const field & facet_field = facet_schema.at(a_facet.field_name);
// assumed that facet fields have already been validated upstream
for(const uint32_t doc_seq_id: ids) {
if(facet_index_v2.count(doc_seq_id) != 0) {
// FORMAT OF VALUES
// String: h1 h2 h3
// String array: h1 h2 h3 0 h1 0 h1 h2 0
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
for(size_t j = 0; j < fhashes.size(); j++) {
if(fhashes[j] != FACET_ARRAY_DELIMETER) {
int64_t ftoken_hash = fhashes[j];
fvaluestream << ftoken_hash;
}
if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER &&
j == fhashes.size() - 1)) {
const std::string & fvalue_str = fvaluestream.str();
uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
if(a_facet.result_map.count(fhash) != 0) {
a_facet.result_map[fhash].count -= 1;
if(a_facet.result_map[fhash].count == 0) {
a_facet.result_map.erase(fhash);
}
}
std::stringstream().swap(fvaluestream);
}
}
}
}
}
}
void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
const std::vector<sort_by> & sort_fields,
std::vector<token_candidates> & token_candidates_vec, const token_ordering token_order,
std::vector<std::vector<art_leaf*>> & searched_queries, Topster & topster,
std::vector<token_candidates> & token_candidates_vec,
std::vector<std::vector<art_leaf*>> & searched_queries, Topster* topster,
spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const size_t typo_tokens_threshold) {
const long long combination_limit = 10;
@ -874,6 +825,15 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
continue;
}
if(!curated_ids.empty()) {
uint32_t *excluded_result_ids = nullptr;
result_size = ArrayUtils::exclude_scalar(result_ids, result_size, &curated_ids[0],
curated_ids.size(), &excluded_result_ids);
delete [] result_ids;
result_ids = excluded_result_ids;
}
if(filter_ids != nullptr) {
// intersect once again with filter ids
uint32_t* filtered_result_ids = nullptr;
@ -888,7 +848,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
// go through each matching document id and calculate match score
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
filtered_result_ids, filtered_results_size);
groups_processed, filtered_result_ids, filtered_results_size);
delete[] filtered_result_ids;
delete[] result_ids;
@ -900,7 +860,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
*all_result_ids = new_all_result_ids;
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
result_ids, result_size);
groups_processed, result_ids, result_size);
delete[] result_ids;
}
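`ArrayUtils::exclude_scalar` (used above to subtract curated IDs from the candidate list) is not defined in this diff; a minimal sketch of the contract it appears to follow — the name, allocation behaviour, and sortedness assumption here are inferred, not taken from the source:

```
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Illustrative stand-in: writes src minus excluded into a newly allocated
// *out and returns its length. Both inputs are assumed sorted ascending.
size_t exclude_scalar_sketch(const uint32_t* src, size_t src_len,
                             const uint32_t* excluded, size_t excluded_len,
                             uint32_t** out) {
    *out = new uint32_t[src_len];
    uint32_t* end = std::set_difference(src, src + src_len,
                                        excluded, excluded + excluded_len, *out);
    return static_cast<size_t>(end - *out);
}
```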
@ -1056,13 +1016,17 @@ void Index::run_search() {
}
// after the wait, we own the lock.
search(search_params.outcome, search_params.query, search_params.search_fields,
search_params.filters, search_params.facets, search_params.facet_query, search_params.included_ids,
search_params.excluded_ids, search_params.sort_fields_std, search_params.num_typos,
search_params.max_hits, search_params.per_page, search_params.page, search_params.token_order,
search_params.prefix, search_params.drop_tokens_threshold, search_params.raw_result_kvs,
search_params.all_result_ids_len, search_params.searched_queries, search_params.override_result_kvs,
search_params.typo_tokens_threshold);
search(search_params->outcome, search_params->query, search_params->search_fields,
search_params->filters, search_params->facets, search_params->facet_query,
search_params->included_ids, search_params->excluded_ids,
search_params->sort_fields_std, search_params->num_typos,
search_params->topster, search_params->curated_topster,
search_params->per_page, search_params->page, search_params->token_order,
search_params->prefix, search_params->drop_tokens_threshold,
search_params->all_result_ids_len, search_params->groups_processed,
search_params->searched_queries,
search_params->raw_result_kvs, search_params->override_result_kvs,
search_params->typo_tokens_threshold);
// hand control back to main thread
processed = true;
@ -1074,12 +1038,12 @@ void Index::run_search() {
}
}
void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::vector<uint32_t> & included_ids,
Topster & curated_topster,
std::vector<std::vector<art_leaf*>> & searched_queries) {
void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
Topster* curated_topster,
std::vector<std::vector<art_leaf*>> & searched_queries) {
if(included_ids.size() == 0) {
if(included_ids_map.empty()) {
return;
}
@ -1098,48 +1062,34 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
0, 0, 1, token_ordering::MAX_SCORE, false, leaves);
if(leaves.size() > 0) {
if(!leaves.empty()) {
override_query.push_back(leaves[0]);
}
}
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
for(const auto& pos_ids: included_ids_map) {
const size_t outer_pos = pos_ids.first;
for (art_leaf *token_leaf : override_query) {
uint32_t *indices = new uint32_t[included_ids.size()];
token_leaf->values->ids.indexOf(&included_ids[0], included_ids.size(), indices);
leaf_to_indices.emplace(token_leaf, indices);
}
for(const auto& index_seq_id: pos_ids.second) {
uint32_t inner_pos = index_seq_id.first;
uint32_t seq_id = index_seq_id.second;
for(size_t j=0; j<included_ids.size(); j++) {
const uint32_t seq_id = included_ids[j];
uint64_t distinct_id = outer_pos; // outer pos is the group distinct key
uint64_t match_score = (64000 - inner_pos); // inner pos within a group is the match score
std::vector<std::vector<std::vector<uint16_t>>> array_token_positions;
populate_token_positions(override_query, leaf_to_indices, j, array_token_positions);
// LOG(INFO) << "seq_id: " << seq_id << " - " << match_score;
uint64_t match_score = 0;
int64_t scores[3];
scores[0] = match_score;
scores[1] = int64_t(1);
scores[2] = int64_t(1);
for(const std::vector<std::vector<uint16_t>> & token_positions: array_token_positions) {
if(token_positions.empty()) {
continue;
}
const Match & match = Match::match(seq_id, token_positions);
uint64_t this_match_score = match.get_match_score(0, field_id);
if(this_match_score > match_score) {
match_score = this_match_score;
}
KV kv(field_id, searched_queries.size(), seq_id, distinct_id, match_score, scores);
curated_topster->add(&kv);
}
int64_t scores[3];
scores[0] = int64_t(match_score);
scores[1] = int64_t(1);
scores[2] = int64_t(1);
curated_topster.add(seq_id, field_id, searched_queries.size(), match_score, scores);
searched_queries.push_back(override_query);
}
searched_queries.push_back(override_query);
}
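To make the bookkeeping concrete, here is a hand-built `included_ids_map` with invented seq_ids, printing the group key and the `64000 - inner_pos` score each pinned document would receive under the logic above:

```
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>

int main() {
    // outer position -> (inner position within the group -> seq_id);
    // the seq_id values are invented for illustration.
    std::map<size_t, std::map<size_t, uint32_t>> included_ids_map = {
        {0, {{0, 42}, {1, 97}}}, // two documents pinned into the first slot
        {1, {{0, 7}}}
    };

    for (const auto& pos_ids : included_ids_map) {
        const uint64_t distinct_id = pos_ids.first;  // group key = outer pos
        for (const auto& index_seq_id : pos_ids.second) {
            const uint64_t match_score = 64000 - index_seq_id.first;
            std::cout << "seq_id=" << index_seq_id.second
                      << " group=" << distinct_id
                      << " score=" << match_score << "\n";
        }
    }
}
```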
void Index::search(Option<uint32_t> & outcome,
@ -1147,19 +1097,20 @@ void Index::search(Option<uint32_t> & outcome,
const std::vector<std::string> & search_fields,
const std::vector<filter> & filters,
std::vector<facet> & facets, facet_query_t & facet_query,
const std::vector<uint32_t> & included_ids,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
const std::vector<uint32_t> & excluded_ids,
const std::vector<sort_by> & sort_fields_std, const int num_typos, const size_t max_hits,
const std::vector<sort_by> & sort_fields_std, const int num_typos,
Topster* topster,
Topster* curated_topster,
const size_t per_page, const size_t page, const token_ordering token_order,
const bool prefix, const size_t drop_tokens_threshold,
std::vector<KV> & raw_result_kvs,
size_t & all_result_ids_len,
std::vector<std::vector<art_leaf*>> & searched_queries,
std::vector<KV> & override_result_kvs,
spp::sparse_hash_set<uint64_t>& groups_processed,
std::vector<std::vector<art_leaf*>>& searched_queries,
std::vector<std::vector<KV*>> & raw_result_kvs,
std::vector<std::vector<KV*>> & override_result_kvs,
const size_t typo_tokens_threshold) {
const size_t num_results = (page * per_page);
// process the filters
uint32_t* filter_ids = nullptr;
@ -1169,25 +1120,48 @@ void Index::search(Option<uint32_t> & outcome,
return ;
}
const uint32_t filter_ids_length = op_filter_ids_length.get();
uint32_t filter_ids_length = op_filter_ids_length.get();
// we will be removing all curated IDs from the organic result IDs before running topster
std::set<uint32_t> curated_ids;
std::vector<uint32_t> included_ids;
for(const auto& outer_pos_ids: included_ids_map) {
for(const auto& inner_pos_seq_id: outer_pos_ids.second) {
curated_ids.insert(inner_pos_seq_id.second);
included_ids.push_back(inner_pos_seq_id.second);
}
}
curated_ids.insert(excluded_ids.begin(), excluded_ids.end());
std::vector<uint32_t> curated_ids_sorted(curated_ids.begin(), curated_ids.end());
std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end());
// Order of `fields` is used to sort results
//auto begin = std::chrono::high_resolution_clock::now();
uint32_t* all_result_ids = nullptr;
const size_t topster_size = std::max((size_t)1, max_hits); // needs to be at least 1 since scoring is mandatory
Topster topster(topster_size);
Topster curated_topster(topster_size);
if(query == "*") {
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
const std::string & field = search_fields[0];
if(!curated_ids.empty()) {
uint32_t *excluded_result_ids = nullptr;
filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0],
curated_ids.size(), &excluded_result_ids);
delete [] filter_ids;
filter_ids = excluded_result_ids;
}
score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
filter_ids, filter_ids_length);
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
do_facets(facets, facet_query, filter_ids, filter_ids_length);
groups_processed, filter_ids, filter_ids_length);
collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries);
all_result_ids_len = filter_ids_length;
all_result_ids = filter_ids;
filter_ids = nullptr;
} else {
const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM);
for(size_t i = 0; i < num_search_fields; i++) {
@ -1196,47 +1170,22 @@ void Index::search(Option<uint32_t> & outcome,
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i); // Order of `fields` is used to sort results
const std::string & field = search_fields[i];
search_field(field_id, query, field, filter_ids, filter_ids_length, facets, sort_fields_std,
num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len,
search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len,
token_order, prefix, drop_tokens_threshold, typo_tokens_threshold);
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries);
}
}
do_facets(facets, facet_query, all_result_ids, all_result_ids_len);
}
do_facets(facets, facet_query, all_result_ids, all_result_ids_len);
do_facets(facets, facet_query, &included_ids[0], included_ids.size());
// must be sorted before iterated upon to remove "empty" array entries
topster.sort();
curated_topster.sort();
topster->sort();
curated_topster->sort();
std::set<uint32_t> ids_to_remove(included_ids.begin(), included_ids.end());
ids_to_remove.insert(excluded_ids.begin(), excluded_ids.end());
std::vector<uint32_t> dropped_ids;
// loop through topster and remove elements from included and excluded id lists
for(uint32_t t = 0; t < topster.size && t < num_results; t++) {
KV* kv = topster.getKV(t);
if(ids_to_remove.count(kv->key) != 0) {
dropped_ids.push_back((uint32_t)kv->key);
} else {
raw_result_kvs.push_back(*kv);
}
}
for(uint32_t t = 0; t < curated_topster.size && t < num_results; t++) {
KV* kv = curated_topster.getKV(t);
override_result_kvs.push_back(*kv);
}
// for the ids that are dropped, remove their corresponding facet components from facet results
drop_facets(facets, dropped_ids);
all_result_ids_len -= dropped_ids.size();
all_result_ids_len += curated_topster.size;
all_result_ids_len += curated_topster->size;
delete [] filter_ids;
delete [] all_result_ids;
@ -1258,9 +1207,11 @@ void Index::search(Option<uint32_t> & outcome,
*/
void Index::search_field(const uint8_t & field_id, const std::string & query, const std::string & field,
uint32_t *filter_ids, size_t filter_ids_length,
const std::vector<uint32_t>& curated_ids,
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
std::vector<std::vector<art_leaf*>> & searched_queries,
Topster & topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
uint32_t** all_result_ids, size_t & all_result_ids_len,
const token_ordering token_order, const bool prefix,
const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) {
std::vector<std::string> tokens;
@ -1373,8 +1324,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) {
// If all tokens were found, go ahead and search for candidates with what we have so far
search_candidates(field_id, filter_ids, filter_ids_length, sort_fields, token_candidates_vec,
token_order, searched_queries, topster, all_result_ids, all_result_ids_len,
search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec,
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
typo_tokens_threshold);
}
@ -1407,8 +1358,9 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
truncated_query += " " + token_count_pairs.at(i).first;
}
return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,
searched_queries, topster, all_result_ids, all_result_ids_len,
return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids,
facets, sort_fields, num_typos,
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
token_order, prefix);
}
}
@ -1434,8 +1386,9 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect
}
void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
const uint8_t & field_id, const uint32_t total_cost, Topster & topster,
const uint8_t & field_id, const uint32_t total_cost, Topster* topster,
const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_set<uint64_t>& groups_processed,
const uint32_t *result_ids, const size_t result_size) const {
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
@ -1467,6 +1420,16 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
Match single_token_match = Match(1, 0, 0, empty_offset_diffs);
const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost, field_id);
std::unordered_map<std::string, size_t> facet_to_id;
if(search_params->group_limit > 0) {
size_t i_facet = 0;
for(const auto & facet: facet_schema) {
facet_to_id[facet.first] = i_facet;
i_facet++;
}
}
for(size_t i=0; i<result_size; i++) {
const uint32_t seq_id = result_ids[i];
@ -1500,7 +1463,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
}
const int64_t default_score = 0;
int64_t scores[3];
int64_t scores[3] = {0};
// avoiding loop
if(sort_fields.size() > 0) {
@ -1541,7 +1504,15 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
}
}
topster.add(seq_id, field_id, query_index, match_score, scores);
uint64_t distinct_id = seq_id;
if(search_params->group_limit != 0) {
distinct_id = get_distinct_id(facet_to_id, seq_id);
groups_processed.emplace(distinct_id);
}
KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores);
topster->add(&kv);
}
//long long int timeNanos = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
@ -1553,6 +1524,26 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
}
}
uint64_t Index::get_distinct_id(const std::unordered_map<std::string, size_t> &facet_to_id,
const uint32_t seq_id) const {
uint64_t distinct_id = 1; // some constant initial value
// calculate hash from group_by_fields
for(const auto& field: search_params->group_by_fields) {
if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) {
continue;
}
size_t facet_id = facet_to_id.at(field);
const std::vector<uint64_t>& fhashes = facet_index_v2.at(seq_id)[facet_id];
for(const auto& hash: fhashes) {
distinct_id = hash_combine(distinct_id, hash);
}
}
return distinct_id;
}
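`hash_combine` itself is not part of this hunk; a boost-style combiner is a reasonable assumption for how each group's facet hashes fold into the distinct ID:

```
#include <cstdint>

// A boost-style combiner; the actual definition in the codebase may differ.
inline uint64_t hash_combine(uint64_t seed, uint64_t value) {
    seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    return seed;
}
```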
void Index::populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
size_t result_index,

View File

@ -46,7 +46,7 @@ void benchmark_hn_titles(char* file_path) {
Store *store = new Store("/tmp/typesense-data");
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store, 4, "abcd", "1234");
collectionManager.init(store, 4, "abcd");
collectionManager.load();
Collection *collection = collectionManager.get_collection("hnstories_direct");
@ -116,7 +116,7 @@ void benchmark_reactjs_pages(char* file_path) {
Store *store = new Store("/tmp/typesense-data");
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store, 4, "abcd", "1234");
collectionManager.init(store, 4, "abcd");
collectionManager.load();
Collection *collection = collectionManager.get_collection("reactjs_pages");

View File

@ -21,7 +21,7 @@ int main(int argc, char* argv[]) {
Store *store = new Store(state_dir_path);
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store, 4, "abcd", "123");
collectionManager.init(store, 4, "abcd");
collectionManager.load();
std::vector<field> fields_to_index = {

View File

@ -1,6 +1,6 @@
#include "typesense_server_utils.h"
#include "core_api.h"
#include "config.h"
#include "typesense_server_utils.h"
void master_server_routes() {
// collection management
@ -55,13 +55,7 @@ void replica_server_routes() {
server->get("/health", get_health);
}
namespace logging {
DECLARE_bool(log_year);
}
int main(int argc, char **argv) {
logging::FLAGS_log_year = true;
Config config;
cmdline::parser options;

View File

@ -34,6 +34,7 @@ int ReplicationState::start(const butil::EndPoint & peering_endpoint, const int
node_options.fsm = this;
node_options.node_owns_fsm = false;
node_options.snapshot_interval_s = snapshot_interval_s;
node_options.filter_before_copy_remote = false;
std::string prefix = "local://" + raft_dir;
node_options.log_uri = prefix + "/" + log_dir_name;
node_options.raft_meta_uri = prefix + "/" + meta_dir_name;
@ -297,7 +298,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) {
return -1;
}
LOG(TRACE) << "rm " << store->get_state_dir_path() << " success";
LOG(INFO) << "rm " << store->get_state_dir_path() << " success";
std::string snapshot_path = reader->get_path();
snapshot_path.append(std::string("/") + db_snapshot_name);
@ -308,7 +309,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) {
return -1;
}
LOG(TRACE) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success";
LOG(INFO) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success";
return init_db();
}

View File

@ -6,7 +6,7 @@ void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_lengt
max = array_length > 1 ? sorted_array[array_length-1] : min;
uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR);
uint8_t *out = new uint8_t[size_required];
uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out);
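// allocated with malloc (not new[]) since this buffer is later released with free(), as done for `in` below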
uint32_t actual_size = for_compress_sorted(sorted_array, out, array_length);
free(in);

View File

@ -1,8 +1,8 @@
#include "string_utils.h"
#include <iostream>
#include <openssl/evp.h>
#include <iomanip>
#include <openssl/hmac.h>
#include <random>
std::string lower_and_no_special_chars(const std::string & str) {
std::stringstream ss;
@ -19,6 +19,10 @@ std::string lower_and_no_special_chars(const std::string & str) {
}
void StringUtils::unicode_normalize(std::string & str) const {
if(str.empty()) {
return ;
}
std::stringstream out;
for (char *s = &str[0]; *s;) {
@ -49,16 +53,16 @@ void StringUtils::unicode_normalize(std::string & str) const {
}
}
str.assign(lower_and_no_special_chars(out.str()));
str = lower_and_no_special_chars(out.str());
}
std::string StringUtils::randstring(size_t length, uint64_t seed) {
std::string StringUtils::randstring(size_t length) {
static auto& chrs = "0123456789"
"abcdefghijklmnopqrstuvwxyz"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
thread_local static std::mt19937 rg(seed);
thread_local static std::uniform_int_distribution<std::string::size_type> pick(0, sizeof(chrs) - 2);
thread_local std::mt19937 rg(std::random_device{}());
thread_local std::uniform_int_distribution<uint32_t> pick(0, sizeof(chrs) - 2);
std::string s;
s.reserve(length);

View File

@ -27,9 +27,9 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result
rusage r_usage;
getrusage(RUSAGE_SELF, &r_usage);
result["memory_used_process_bytes"] = r_usage.ru_maxrss * 1000;
result["typesense_memory_used_bytes"] = r_usage.ru_maxrss * 1000;
uint64_t memory_free_bytes = 0;
uint64_t memory_available_bytes = 0;
uint64_t memory_total_bytes = 0;
#ifdef __APPLE__
@ -42,7 +42,7 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result
if (KERN_SUCCESS == host_page_size(mach_port, &mach_page_size) &&
KERN_SUCCESS == host_statistics64(mach_port, HOST_VM_INFO,
(host_info64_t)&vm_stats, &count)) {
memory_free_bytes = (int64_t)(vm_stats.free_count) * (int64_t)mach_page_size;
memory_available_bytes = (int64_t)(vm_stats.free_count) * (int64_t)mach_page_size;
}
uint64_t pages = sysconf(_SC_PHYS_PAGES);
@ -51,11 +51,11 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result
#elif __linux__
struct sysinfo sys_info;
sysinfo(&sys_info);
memory_free_bytes = sys_info.freeram;
memory_available_bytes = linux_get_mem_available_bytes();
memory_total_bytes = sys_info.totalram;
#endif
result["memory_free_bytes"] = memory_free_bytes;
result["memory_available_bytes"] = memory_available_bytes;
result["memory_total_bytes"] = memory_total_bytes;
// CPU METRICS

View File

@ -133,35 +133,45 @@ int init_logger(Config & config, const std::string & server_version) {
signal(SIGILL, catch_crash);
signal(SIGSEGV, catch_crash);
logging::LoggingSettings log_settings;
// we can install new signal handlers only after overriding the ones above
signal(SIGINT, catch_interrupt);
signal(SIGTERM, catch_interrupt);
google::InitGoogleLogging("typesense");
std::string log_dir = config.get_log_dir();
std::string log_path;
if(log_dir.empty()) {
// use console logger if log dir is not specified
log_settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;
FLAGS_logtostderr = true;
} else {
if(!directory_exists(log_dir)) {
std::cerr << "Typesense failed to start. " << "Log directory " << log_dir << " does not exist.";
return 1;
}
log_settings.logging_dest = logging::LOG_TO_FILE;
log_path = log_dir + "/" + "typesense.log";
log_settings.log_file = log_path.c_str();
// flush log levels above -1 immediately (INFO=0)
FLAGS_logbuflevel = -1;
LOG(INFO) << "Starting Typesense " << server_version << ". Log directory is configured as: "
<< log_dir << std::endl;
// available only on glog master (ensures that log file name is constant)
FLAGS_timestamp_in_logfile_name = false;
std::string log_path = log_dir + "/" + "typesense.log";
// will log levels INFO **and above** to the given log file
google::SetLogDestination(google::INFO, log_path.c_str());
// don't create symlink for INFO log
google::SetLogSymlink(google::INFO, "");
// don't create separate log files for each level
google::SetLogDestination(google::WARNING, "");
google::SetLogDestination(google::ERROR, "");
google::SetLogDestination(google::FATAL, "");
std::cout << "Log directory is configured as: " << log_dir << std::endl;
}
logging::InitLogging(log_settings);
logging::SetMinLogLevel(0);
return 0;
}
@ -275,7 +285,6 @@ int start_raft_server(ReplicationState& replication_state, const std::string& st
}
int run_server(const Config & config, const std::string & version, void (*master_server_routes)()) {
LOG(INFO) << "Starting Typesense " << version << std::flush;
quit_raft_service = false;

View File

@ -15,7 +15,7 @@ protected:
std::vector<sort_by> sort_fields;
void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/collection_sorting";
std::string state_dir_path = "/tmp/typesense_test/collection_faceting";
LOG(INFO) << "Truncating and creating: " << state_dir_path;
system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str());
@ -29,6 +29,7 @@ protected:
}
virtual void TearDown() {
collectionManager.dispose();
delete store;
}
};

View File

@ -0,0 +1,357 @@
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
#include <collection_manager.h>
#include "collection.h"
class CollectionGroupingTest : public ::testing::Test {
protected:
Store *store;
CollectionManager & collectionManager = CollectionManager::get_instance();
Collection *coll_group;
void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/collection_grouping";
LOG(INFO) << "Truncating and creating: " << state_dir_path;
system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str());
store = new Store(state_dir_path);
collectionManager.init(store, 4, "auth_key");
collectionManager.load();
std::vector<field> fields = {
field("title", field_types::STRING, false),
field("brand", field_types::STRING, true, true),
field("size", field_types::INT32, true, false),
field("colors", field_types::STRING_ARRAY, true, false),
field("rating", field_types::FLOAT, true, false)
};
coll_group = collectionManager.get_collection("coll_group");
if(coll_group == nullptr) {
coll_group = collectionManager.create_collection("coll_group", fields, "rating").get();
}
std::ifstream infile(std::string(ROOT_DIR)+"test/group_documents.jsonl");
std::string json_line;
while (std::getline(infile, json_line)) {
auto add_op = coll_group->add(json_line);
if(!add_op.ok()) {
std::cout << add_op.error() << std::endl;
}
ASSERT_TRUE(add_op.ok());
}
infile.close();
}
virtual void SetUp() {
setupCollection();
}
virtual void TearDown() {
collectionManager.dispose();
delete store;
}
};
TEST_F(CollectionGroupingTest, GroupingBasics) {
// group by size (int32)
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"size"}, 2).get();
ASSERT_EQ(3, res["found"].get<size_t>());
ASSERT_EQ(3, res["grouped_hits"].size());
ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get<size_t>());
ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
ASSERT_EQ(11, res["grouped_hits"][0]["hits"][0]["document"]["size"].get<size_t>());
ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("1", res["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("2", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
// group by rating (float) and sort by size
std::vector<sort_by> sort_size = {sort_by("size", "DESC")};
res = coll_group->search("*", {}, "", {"brand"}, sort_size, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
"", 10,
{}, {}, {"rating"}, 2).get();
// 7 unique ratings
ASSERT_EQ(7, res["found"].get<size_t>());
ASSERT_EQ(7, res["grouped_hits"].size());
ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["group_key"][0].get<float>());
ASSERT_EQ(12, res["grouped_hits"][0]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("8", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
ASSERT_EQ(12, res["grouped_hits"][1]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("6", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get<float>());
ASSERT_EQ(11, res["grouped_hits"][1]["hits"][1]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("1", res["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get<float>());
ASSERT_EQ(10, res["grouped_hits"][5]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("9", res["grouped_hits"][5]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.1, res["grouped_hits"][5]["hits"][0]["document"]["rating"].get<float>());
ASSERT_EQ(10, res["grouped_hits"][6]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("0", res["grouped_hits"][6]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][6]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Omeg</mark>a", res["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
}
TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
// group by size+brand (int32, string)
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
ASSERT_EQ(10, res["found"].get<size_t>());
ASSERT_EQ(10, res["grouped_hits"].size());
ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get<size_t>());
ASSERT_STREQ("Beta", res["grouped_hits"][0]["group_key"][1].get<std::string>().c_str());
// optional field should have no value in the group key component
ASSERT_EQ(1, res["grouped_hits"][5]["group_key"].size());
ASSERT_STREQ("10", res["grouped_hits"][5]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("11", res["grouped_hits"][5]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size());
ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(1, res["grouped_hits"][1]["hits"].size());
ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(2, res["grouped_hits"][2]["hits"].size());
ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("3", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
// pagination with page=2, per_page=2
res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"size", "brand"}, 2).get();
// 3rd result from previous assertion will be in the first position
ASSERT_EQ(2, res["grouped_hits"][0]["hits"].size());
ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("3", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("0", res["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
// total count and facet counts should be the same
ASSERT_EQ(10, res["found"].get<size_t>());
ASSERT_EQ(2, res["grouped_hits"].size());
ASSERT_EQ(10, res["grouped_hits"][0]["group_key"][0].get<size_t>());
ASSERT_STREQ("Omega", res["grouped_hits"][0]["group_key"][1].get<std::string>().c_str());
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
// respect min and max grouping limit (greater than 0 and less than 99)
auto res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
"", 10,
{}, {}, {"rating"}, 100);
ASSERT_FALSE(res_op.ok());
ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str());
res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
"", 10,
{}, {}, {"rating"}, 0);
ASSERT_FALSE(res_op.ok());
ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str());
}
TEST_F(CollectionGroupingTest, GroupingWithGroupLimitOfOne) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"brand"}, 1).get();
ASSERT_EQ(5, res["found"].get<size_t>());
ASSERT_EQ(5, res["grouped_hits"].size());
// each hits array must be of size 1
for(auto i=0; i<5; i++) {
ASSERT_EQ(1, res["grouped_hits"][i]["hits"].size());
}
ASSERT_STREQ("4", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("2", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("10", res["grouped_hits"][3]["hits"][0]["document"]["id"].get<std::string>().c_str()); // unbranded
ASSERT_STREQ("9", res["grouped_hits"][4]["hits"][0]["document"]["id"].get<std::string>().c_str());
// facet counts should each be 1, including unbranded
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
for(size_t i=0; i < 4; i++) {
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][i]["count"]);
}
}
TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) {
nlohmann::json override_json_include = {
{"id", "include-rule"},
{
"rule", {
{"query", "shirt"},
{"match", override_t::MATCH_EXACT}
}
}
};
override_json_include["includes"] = nlohmann::json::array();
override_json_include["includes"][0] = nlohmann::json::object();
override_json_include["includes"][0]["id"] = "11";
override_json_include["includes"][0]["position"] = 1;
override_json_include["includes"][1] = nlohmann::json::object();
override_json_include["includes"][1]["id"] = "10";
override_json_include["includes"][1]["position"] = 1;
nlohmann::json override_json_exclude = {
{"id", "exclude-rule"},
{
"rule", {
{"query", "shirt"},
{"match", override_t::MATCH_EXACT}
}
}
};
override_json_exclude["excludes"] = nlohmann::json::array();
override_json_exclude["excludes"][0] = nlohmann::json::object();
override_json_exclude["excludes"][0]["id"] = "2";
override_t override1(override_json_include);
override_t override2(override_json_exclude);
Option<uint32_t> ov1_op = coll_group->add_override(override1);
Option<uint32_t> ov2_op = coll_group->add_override(override2);
ASSERT_TRUE(ov1_op.ok());
ASSERT_TRUE(ov2_op.ok());
auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"colors"}, 2).get();
ASSERT_EQ(4, res["found"].get<size_t>());
ASSERT_EQ(4, res["grouped_hits"].size());
ASSERT_EQ(1, res["grouped_hits"][0]["group_key"][0].size());
ASSERT_STREQ("white", res["grouped_hits"][0]["group_key"][0][0].get<std::string>().c_str());
ASSERT_STREQ("11", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("10", res["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("5", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("4", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_EQ(1, res["grouped_hits"][3]["hits"].size());
ASSERT_STREQ("8", res["grouped_hits"][3]["hits"][0]["document"]["id"].get<std::string>().c_str());
// assert facet counts
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
}

View File

@ -40,6 +40,7 @@ protected:
virtual void TearDown() {
collectionManager.drop_collection("collection1");
collectionManager.dispose();
delete store;
}
};

View File

@ -18,7 +18,7 @@ protected:
system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str());
store = new Store(state_dir_path);
collectionManager.init(store, 1, "auth_key");
collectionManager.init(store, 4, "auth_key");
collectionManager.load();
std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
@ -49,6 +49,7 @@ protected:
virtual void TearDown() {
collectionManager.drop_collection("coll_mul_fields");
collectionManager.dispose();
delete store;
}
};
@ -121,6 +122,11 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeExactQueryMatch) {
ASSERT_STREQ("3", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get<std::string>().c_str());
// curated results should be marked as such
ASSERT_EQ(true, results["hits"][0]["curated"].get<bool>());
ASSERT_EQ(true, results["hits"][1]["curated"].get<bool>());
ASSERT_EQ(0, results["hits"][2].count("curated"));
coll_mul_fields->remove_override("exclude-rule");
coll_mul_fields->remove_override("include-rule");
@ -219,6 +225,8 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: scott").get();
ASSERT_EQ(9, results["found"].get<size_t>());
// "count" would be `2` without exclusion
ASSERT_EQ("<mark>Scott</mark> Glenn", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>());
ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
@ -233,7 +241,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: scott").get();
ASSERT_EQ(10, results["found"].get<size_t>());
ASSERT_EQ(9, results["found"].get<size_t>());
ASSERT_EQ(0, results["hits"].size());
coll_mul_fields->remove_override("exclude-rule");
@ -246,7 +254,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "").get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
@ -254,10 +262,9 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
}
TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
std::map<std::string, size_t> pinned_hits;
std::vector<std::string> hidden_hits;
pinned_hits["13"] = 1;
pinned_hits["4"] = 2;
std::map<size_t, std::vector<std::string>> pinned_hits;
pinned_hits[1] = {"13"};
pinned_hits[2] = {"4"};
// basic pinning
@ -277,6 +284,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
// both pinning and hiding
std::vector<std::string> hidden_hits;
hidden_hits = {"11", "16"};
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
@ -289,6 +297,21 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
ASSERT_STREQ("4", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("6", results["hits"][2]["document"]["id"].get<std::string>().c_str());
// paginating such that pinned hits appear on second page
pinned_hits.clear();
pinned_hits[4] = {"13"};
pinned_hits[5] = {"4"};
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
"", 10,
pinned_hits, hidden_hits).get();
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("13", results["hits"][1]["document"]["id"].get<std::string>().c_str());
// pinned and hidden hits should take precedence over override rules
nlohmann::json override_json_include = {
@ -325,4 +348,60 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
ASSERT_EQ(8, results["found"].get<size_t>());
ASSERT_STREQ("8", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("6", results["hits"][1]["document"]["id"].get<std::string>().c_str());
}
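
A note on the pagination check above: pinned positions are absolute 1-based ranks, so which page a pinned document surfaces on is plain ceiling division. A standalone sketch (the helper name is ours, not the server's):

```
#include <cassert>
#include <cstddef>

// A pinned position is an absolute 1-based rank; the page it lands on is
// ceil(position / per_page). With per_page = 2, position 4 -> page 2.
size_t page_for_position(size_t position, size_t per_page) {
    return (position + per_page - 1) / per_page;
}

int main() {
    assert(page_for_position(4, 2) == 2);  // "13" pinned at 4 shows on page 2
    assert(page_for_position(5, 2) == 3);  // "4" pinned at 5 falls on page 3
    return 0;
}
```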
TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
std::map<size_t, std::vector<std::string>> pinned_hits;
pinned_hits[1] = {"6", "8"};
pinned_hits[2] = {"1"};
pinned_hits[3] = {"13", "4"};
// without any grouping parameter, only the first ID in a position should be picked
// and other IDs should appear in their original positions
auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
"", 10,
pinned_hits, {}).get();
ASSERT_EQ(10, results["found"].get<size_t>());
ASSERT_STREQ("6", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("11", results["hits"][3]["document"]["id"].get<std::string>().c_str());
// pinned hits should be marked as curated
ASSERT_EQ(true, results["hits"][0]["curated"].get<bool>());
ASSERT_EQ(true, results["hits"][1]["curated"].get<bool>());
ASSERT_EQ(true, results["hits"][2]["curated"].get<bool>());
ASSERT_EQ(0, results["hits"][3].count("curated"));
// with grouping
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
"", 10,
pinned_hits, {}, {"cast"}, 2).get();
ASSERT_EQ(8, results["found"].get<size_t>());
ASSERT_EQ(1, results["grouped_hits"][0]["group_key"].size());
ASSERT_EQ(2, results["grouped_hits"][0]["group_key"][0].size());
ASSERT_STREQ("Chris Evans", results["grouped_hits"][0]["group_key"][0][0].get<std::string>().c_str());
ASSERT_STREQ("Scarlett Johansson", results["grouped_hits"][0]["group_key"][0][1].get<std::string>().c_str());
ASSERT_STREQ("6", results["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("8", results["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("13", results["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("4", results["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("11", results["grouped_hits"][3]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("16", results["grouped_hits"][4]["hits"][0]["document"]["id"].get<std::string>().c_str());
}
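
The ungrouped half of this test pins two IDs to one slot but expects only the first to claim it. A sketch of that selection rule as the assertions describe it (the helper name is ours; the server's internals may differ):

```
#include <map>
#include <string>
#include <vector>

// Sketch of the ungrouped pinning rule observed above: when several IDs are
// pinned to one position and no group_by is given, only the first ID claims
// the slot; the rest keep their organic ranks.
std::map<size_t, std::string> first_id_per_position(
        const std::map<size_t, std::vector<std::string>>& pinned_hits) {
    std::map<size_t, std::string> slots;
    for(const auto& kv : pinned_hits) {
        if(!kv.second.empty()) {
            slots[kv.first] = kv.second.front();
        }
    }
    return slots;
}
```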

View File

@ -29,6 +29,7 @@ protected:
}
virtual void TearDown() {
collectionManager.dispose();
delete store;
}
};
@ -248,6 +249,7 @@ TEST_F(CollectionSortingTest, ThreeSortFieldsLimit) {
sort_by("min", "DESC"),
};
query_fields = {"title"};
auto res_op = coll1->search("the", query_fields, "", {}, sort_fields_desc, 0, 10, 1, FREQUENCY, false);
ASSERT_FALSE(res_op.ok());
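
The failure branch above only asserts that the Option is not ok; when debugging, the error can be surfaced like this (the exact message wording is an assumption, not quoted from the source):

```
if(!res_op.ok()) {
    // Message wording assumed; the test only checks ok() == false.
    LOG(ERROR) << res_op.error();
}
```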

View File

@ -56,6 +56,7 @@ protected:
virtual void TearDown() {
collectionManager.drop_collection("collection");
collectionManager.dispose();
delete store;
}
};
@ -455,6 +456,18 @@ TEST_F(CollectionTest, WildcardQuery) {
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
// wildcard query should not require a search field
results_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);
ASSERT_TRUE(results_op.ok());
results = results_op.get();
ASSERT_EQ(3, results["hits"].size());
ASSERT_EQ(25, results["found"].get<uint32_t>());
// non-wildcard query should require a search field
results_op = collection->search("the", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);
ASSERT_FALSE(results_op.ok());
ASSERT_STREQ("No search fields specified for the query.", results_op.error().c_str());
}
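
Taken together, the two checks above pin down the contract: `*` may be issued with an empty field list, while a term query must name at least one field. A usage sketch mirroring the same calls:

```
// Wildcard: list documents without term matching; no search field needed.
auto all_docs_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);

// Term query without fields: rejected with an error Option.
auto bad_op = collection->search("the", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);
if(!bad_op.ok()) {
    LOG(ERROR) << bad_op.error();  // "No search fields specified for the query."
}
```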
TEST_F(CollectionTest, PrefixSearching) {
@ -2259,7 +2272,6 @@ TEST_F(CollectionTest, OptionalFields) {
// try fetching the schema (should contain optional field)
nlohmann::json coll_summary = coll1->get_summary_json();
LOG(INFO) << coll_summary;
ASSERT_STREQ("title", coll_summary["fields"][0]["name"].get<std::string>().c_str());
ASSERT_STREQ("string", coll_summary["fields"][0]["type"].get<std::string>().c_str());
ASSERT_FALSE(coll_summary["fields"][0]["facet"].get<bool>());

View File

@ -0,0 +1,12 @@
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 10, "colors": ["white", "blue"], "rating": 4.5}
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 11, "colors": ["white", "blue"], "rating": 4.3}
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 12, "colors": ["white", "blue"], "rating": 4.6}
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 10, "colors": ["blue"], "rating": 4.6}
{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 10, "colors": ["white", "blue"], "rating": 4.8}
{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 11, "colors": ["blue"], "rating": 4.8}
{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 12, "colors": ["white", "blue"], "rating": 4.3}
{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 10, "colors": ["white", "blue"], "rating": 4.3}
{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 12, "colors": ["white", "red"], "rating": 4.4}
{"title": "Zeta Casual Shirt", "brand": "Zeta", "size": 10, "colors": ["white", "blue"], "rating": 4.1}
{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 4.3}
{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 3.3}

View File

@ -54,3 +54,8 @@ TEST(StringUtilsTest, HMAC) {
std::string digest1 = StringUtils::hmac("KeyVal", "{\"filter_by\": \"user_id:1080\"}");
ASSERT_STREQ("IvjqWNZ5M5ElcvbMoXj45BxkQrZG4ZKEaNQoRioCx2s=", digest1.c_str());
}
TEST(StringUtilsTest, UInt32Validation) {
std::string big_num = "99999999999999999999999999999999";
ASSERT_FALSE(StringUtils::is_uint32_t(big_num));
}
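
A minimal sketch of the kind of validation this asserts, assuming the helper parses with `strtoull` and range-checks against `UINT32_MAX` (the real implementation may differ):

```
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <string>

// Sketch only: accept a base-10 string that fits in 32 unsigned bits.
// The 32-digit input above overflows and must be rejected.
bool is_uint32_t_sketch(const std::string& s) {
    if(s.empty() || s[0] == '-') {
        return false;
    }
    errno = 0;
    char* end = nullptr;
    unsigned long long val = std::strtoull(s.c_str(), &end, 10);
    return errno == 0 && end == s.c_str() + s.size() && val <= UINT32_MAX;
}
```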

View File

@ -36,7 +36,8 @@ TEST(TopsterTest, MaxIntValues) {
scores[1] = data[i].primary_attr;
scores[2] = data[i].secondary_attr;
topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores);
topster.add(&kv);
}
topster.sort();
@ -87,7 +88,8 @@ TEST(TopsterTest, MaxFloatValues) {
scores[1] = Index::float_to_in64_t(data[i].primary_attr);
scores[2] = data[i].secondary_attr;
topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores);
topster.add(&kv);
}
topster.sort();
@ -97,4 +99,64 @@ TEST(TopsterTest, MaxFloatValues) {
for(uint32_t i = 0; i < topster.size; i++) {
EXPECT_EQ(ids[i], topster.getKeyAt(i));
}
}
TEST(TopsterTest, DistinctIntValues) {
Topster dist_topster(5, 2);
struct {
uint8_t field_id;
uint16_t query_index;
uint64_t distinct_key;
uint64_t match_score;
int64_t primary_attr;
int64_t secondary_attr;
} data[14] = {
{1, 0, 1, 11, 20, 30},
{1, 0, 1, 12, 20, 32},
{1, 0, 2, 4, 20, 30},
{1, 2, 3, 7, 20, 30},
{1, 0, 4, 14, 20, 30},
{1, 1, 5, 9, 20, 30},
{1, 1, 5, 10, 20, 32},
{1, 1, 5, 9, 20, 30},
{1, 0, 6, 6, 20, 30},
{1, 2, 7, 6, 22, 30},
{1, 2, 7, 6, 22, 30},
{1, 1, 8, 9, 20, 30},
{1, 0, 9, 8, 20, 30},
{1, 3, 10, 5, 20, 30},
};
for(int i = 0; i < 14; i++) {
int64_t scores[3];
scores[0] = int64_t(data[i].match_score);
scores[1] = data[i].primary_attr;
scores[2] = data[i].secondary_attr;
KV kv(data[i].field_id, data[i].query_index, i+100, data[i].distinct_key, data[i].match_score, scores);
dist_topster.add(&kv);
}
dist_topster.sort();
std::vector<uint64_t> distinct_ids = {4, 1, 5, 8, 9};
for(uint32_t i = 0; i < dist_topster.size; i++) {
EXPECT_EQ(distinct_ids[i], dist_topster.getDistinctKeyAt(i));
if(distinct_ids[i] == 1) {
EXPECT_EQ(12, (int) dist_topster.getKV(i)->match_score);
EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size);
EXPECT_EQ(12, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score);
EXPECT_EQ(11, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score);
}
if(distinct_ids[i] == 5) {
EXPECT_EQ(10, (int) dist_topster.getKV(i)->match_score);
EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size);
EXPECT_EQ(10, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score);
EXPECT_EQ(9, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score);
}
}
}
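
Read together, the assertions describe a two-level layout: the top-level topster ranks one representative KV per distinct key, while `group_kv_map` holds a small per-group topster (capped at the second constructor argument, 2 here) with members ordered best-first. A traversal sketch using only the accessors exercised above, assuming `group_kv_map` stores `Topster` pointers as the `->` accesses suggest:

```
dist_topster.sort();
for(uint32_t i = 0; i < dist_topster.size; i++) {
    uint64_t group_key = dist_topster.getDistinctKeyAt(i);
    Topster* group = dist_topster.group_kv_map[group_key];  // per-group topster
    for(uint32_t j = 0; j < group->size; j++) {
        LOG(INFO) << group_key << " rank " << j
                  << " score=" << group->getKV(j)->match_score;
    }
}
```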