Mirror of https://github.com/typesense/typesense.git (synced 2025-05-17 04:02:36 +08:00)

Commit 3e82089423: Merge branch 'distinct-grouping'
@@ -2,7 +2,7 @@ version: 2
jobs:
build:
docker:
- image: typesense/typesense-development:22-MAR-2020-5
- image: typesense/typesense-development:23-JUNE-2020-1
environment:
- PROJECT_DIR: /typesense
- TYPESENSE_VERSION: $CIRCLE_BRANCH-$CIRCLE_SHA1
@@ -16,7 +16,7 @@ jobs:
keys:
- external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }}
- external-Linux-cache-{{ .Branch }}
- external-Linux-cache
- external-Linux-cache
- run:
name: build
command: $PROJECT_DIR/build.sh
@@ -43,6 +43,7 @@ FIND_PACKAGE(ICU REQUIRED)
FIND_PACKAGE(Protobuf REQUIRED)
FIND_PACKAGE(LevelDB REQUIRED)
FIND_PACKAGE(gflags REQUIRED)
FIND_PACKAGE(glog REQUIRED)

message("OpenSSL library: ${OPENSSL_LIBRARIES}")

@@ -53,11 +54,6 @@ include(cmake/GoogleTest.cmake)
include(cmake/TestResources.cmake)
include(cmake/Iconv.cmake)

if (APPLE)
include(cmake/brpc.cmake)
include(cmake/braft.cmake)
endif()

FILE(GLOB SRC_FILES src/*.cpp)
FILE(GLOB TEST_FILES test/*.cpp)

@@ -87,8 +83,10 @@ link_directories(${DEP_ROOT_DIR}/${FOR_NAME})
link_directories(${DEP_ROOT_DIR}/${H2O_NAME}/build)
link_directories(${DEP_ROOT_DIR}/${ROCKSDB_NAME})
link_directories(${DEP_ROOT_DIR}/${ICONV_NAME}/lib/.libs)
if (APPLE)
link_directories(${DEP_ROOT_DIR}/${BRPC_NAME}/lib)
link_directories(${DEP_ROOT_DIR}/${BRAFT_NAME}/lib)
endif()

# Write dependency libraries to a file
file(WRITE ${DEP_ROOT_DIR}/libs.txt "")
@@ -152,7 +150,7 @@ set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIE
set(CORE_LIBS h2o-evloop iconv ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES}
${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} pthread dl ${STD_LIB})

set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS})
set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} glog ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS})

target_link_libraries(typesense-core ${CORE_LIBS})
target_link_libraries(typesense-server ${CORE_LIBS})
@@ -37,7 +37,7 @@ Here's a quick example showcasing how you can create a collection, index a docum
Let's begin by starting the Typesense server via Docker:

```
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.13.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.14.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
```

Install the Python client for Typesense (we have [clients](https://typesense.org/api/#api-clients) for other languages too):
@@ -146,7 +146,7 @@ works without turning many knobs.

**Speed is great, but what about the memory footprint?**

A fresh Typesense server will take less than 5 MB of memory. As you start indexing documents, the memory use will
A fresh Typesense server will consume about 30 MB of memory. As you start indexing documents, the memory use will
increase correspondingly. How much it increases depends on the number and type of fields you index.

We've strived to keep the in-memory data structures lean. To give you a rough idea: when 1 million
TODO.md (4)
@@ -97,6 +97,10 @@
- ~~Have a LOG(ERROR) level~~
- ~~Handle SIGTERM which is sent when process is killed~~
- ~~Use snappy compression for storage~~
- ~~Fix exclude_scalar early returns~~
- ~~Fix result ids length during grouped overrides~~
- ~~Fix override grouping (collate_included_ids)~~
- ~~Test for overriding result on second page~~
- at least 1 token match for proceeding with drop tokens
- support wildcard query with filters
- API for optimizing on disk storage
@@ -1,4 +1,4 @@
set(BRAFT_VERSION 0a9ec3f)
set(BRAFT_VERSION fb27e63)
set(BRAFT_NAME braft-${BRAFT_VERSION})
set(BRAFT_TAR_PATH ${DEP_ROOT_DIR}/${BRAFT_NAME}.tar.gz)

@@ -1,4 +1,4 @@
set(BRPC_VERSION 23c66e3)
set(BRPC_VERSION 2f8fc37d)
set(BRPC_NAME brpc-${BRPC_VERSION})
set(BRPC_TAR_PATH ${DEP_ROOT_DIR}/${BRPC_NAME}.tar.gz)

@@ -22,13 +22,13 @@ if [[ "$@" == *"--depclean"* ]]; then
fi

#echo "Creating development image..."
#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:latest $PROJECT_DIR/docker
#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:23-JUNE-2020-1 $PROJECT_DIR/docker

echo "Building Typesense $TYPESENSE_VERSION..."
docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
-DCMAKE_BUILD_TYPE=Release -H/typesense -B/typesense/$BUILD_DIR

docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development make typesense-server -C/typesense/$BUILD_DIR
docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 make typesense-server -C/typesense/$BUILD_DIR

if [[ "$@" == *"--build-deploy-image"* ]]; then
echo "Creating deployment image for Typesense $TYPESENSE_VERSION server ..."
@@ -60,7 +60,7 @@ if [[ "$@" == *"--package-libs"* ]]; then
fi
#
#if [[ "$@" == *"--create-deb-upload"* ]]; then
# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \
# -DCMAKE_BUILD_TYPE=Debug -H/typesense -B/typesense/$BUILD_DIR
#fi

@ -55,25 +55,34 @@ ADD https://github.com/protocolbuffers/protobuf/releases/download/v3.11.4/protob
|
||||
RUN tar -C /opt -xf /opt/protobuf-cpp-3.11.4.tar.gz && chown -R root:root /opt/protobuf-3.11.4
|
||||
RUN cd /opt/protobuf-3.11.4 && ./configure --disable-shared && make -j8 && make check && make install && rm -rf /usr/local/lib/*.so*
|
||||
|
||||
ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz.tar.gz
|
||||
RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz.tar.gz
|
||||
ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz
|
||||
RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz
|
||||
RUN mkdir -p /opt/leveldb-1.22/build && cd /opt/leveldb-1.22/build && cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
cmake --build . && make install && rm -rf /usr/local/lib/*.so*
|
||||
|
||||
ADD https://github.com/google/glog/archive/0a2e593.tar.gz /opt/glog-0a2e593.tar.gz
|
||||
RUN tar -C /opt -xf /opt/glog-0a2e593.tar.gz
|
||||
RUN mkdir -p /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \
|
||||
cd /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \
|
||||
cmake -DBUILD_TESTING=0 -DWITH_GFLAGS=ON -DWITH_UNWIND=OFF .. && \
|
||||
cmake --build . && make install && rm -rf /usr/local/lib/*.so*
|
||||
|
||||
ADD https://github.com/apache/incubator-brpc/archive/0.9.7-rc03.tar.gz /opt/brpc-0.9.7-rc03.tar.gz
|
||||
RUN tar -C /opt -xf /opt/brpc-0.9.7-rc03.tar.gz
|
||||
COPY patches/brpc_cmakelists.txt /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt
|
||||
RUN chown root:root /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt
|
||||
RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/build && cd /opt/incubator-brpc-0.9.7-rc03/build && \
|
||||
cmake -DWITH_DEBUG_SYMBOLS=OFF .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* && \
|
||||
rm -rf /opt/incubator-brpc-0.9.7-rc03/build/output/bin
|
||||
RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/bld && cd /opt/incubator-brpc-0.9.7-rc03/bld && \
|
||||
cmake -DWITH_DEBUG_SYMBOLS=OFF -DWITH_GLOG=ON .. && \
|
||||
make -j8 && make install && rm -rf /usr/local/lib/*.so* && \
|
||||
rm -rf /opt/incubator-brpc-0.9.7-rc03/bld/output/bin
|
||||
|
||||
ADD https://github.com/baidu/braft/archive/v1.1.0.tar.gz /opt/braft-v1.1.0.tar.gz
|
||||
RUN tar -C /opt -xf /opt/braft-v1.1.0.tar.gz
|
||||
COPY patches/braft_cmakelists.txt /opt/braft-1.1.0/src/CMakeLists.txt
|
||||
RUN chown root:root /opt/braft-1.1.0/src/CMakeLists.txt
|
||||
RUN mkdir -p /opt/braft-1.1.0/build && cd /opt/braft-1.1.0/build && \
|
||||
cmake -DWITH_DEBUG_SYMBOLS=ON .. && make -j8 && make install && rm -rf /usr/local/lib/*.so*
|
||||
ADD https://github.com/baidu/braft/archive/v1.1.1.tar.gz /opt/braft-v1.1.1.tar.gz
|
||||
RUN tar -C /opt -xf /opt/braft-v1.1.1.tar.gz
|
||||
COPY patches/braft_cmakelists.txt /opt/braft-1.1.1/src/CMakeLists.txt
|
||||
RUN chown root:root /opt/braft-1.1.1/src/CMakeLists.txt
|
||||
RUN mkdir -p /opt/braft-1.1.1/bld && cd /opt/braft-1.1.1/bld && \
|
||||
cmake -DWITH_DEBUG_SYMBOLS=ON -DBRPC_WITH_GLOG=ON .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* && \
|
||||
rm -rf /opt/braft-1.1.1/bld/output/bin
|
||||
|
||||
ENV CC /usr/local/gcc-6.4.0/bin/gcc
|
||||
ENV CXX /usr/local/gcc-6.4.0/bin/g++
|
||||
|
@ -35,8 +35,8 @@ add_library(brpc-static STATIC $<TARGET_OBJECTS:BUTIL_LIB>
|
||||
$<TARGET_OBJECTS:SOURCES_LIB>
|
||||
$<TARGET_OBJECTS:PROTO_LIB>)
|
||||
|
||||
if(BRPC_WITH_THRIFT)
|
||||
target_link_libraries(brpc-static thrift)
|
||||
if(BRPC_WITH_GLOG)
|
||||
target_link_libraries(brpc-static ${GLOG_LIB})
|
||||
endif()
|
||||
|
||||
SET_TARGET_PROPERTIES(brpc-static PROPERTIES OUTPUT_NAME brpc CLEAN_DIRECT_OUTPUT 1)
|
||||
|
@@ -147,7 +147,7 @@ private:
std::string get_seq_id_key(uint32_t seq_id);

void highlight_result(const field &search_field, const std::vector<std::vector<art_leaf *>> &searched_queries,
const KV &field_order_kv, const nlohmann::json &document,
const KV* field_order_kv, const nlohmann::json &document,
StringUtils & string_utils, size_t snippet_threshold,
bool highlighted_fully,
highlight_t &highlight);
@@ -155,10 +155,10 @@ private:
void remove_document(nlohmann::json & document, const uint32_t seq_id, bool remove_from_store);

void populate_overrides(std::string query,
const std::map<std::string, size_t>& pinned_hits,
const std::map<size_t, std::vector<std::string>>& pinned_hits,
const std::vector<std::string>& hidden_hits,
std::map<uint32_t, size_t> & id_pos_map,
std::vector<uint32_t> & included_ids, std::vector<uint32_t> & excluded_ids);
std::map<size_t, std::vector<uint32_t>>& include_ids,
std::vector<uint32_t> & excluded_ids);

static bool facet_count_compare(const std::pair<uint64_t, facet_count_t>& a,
const std::pair<uint64_t, facet_count_t>& b) {
@@ -236,8 +236,10 @@ public:
const size_t snippet_threshold = 30,
const std::string & highlight_full_fields = "",
size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD,
const std::map<std::string, size_t>& pinned_hits={},
const std::vector<std::string>& hidden_hits={});
const std::map<size_t, std::vector<std::string>>& pinned_hits={},
const std::vector<std::string>& hidden_hits={},
const std::vector<std::string>& group_by_fields={},
const size_t group_limit = 0);

Option<nlohmann::json> get(const std::string & id);

@@ -271,6 +273,8 @@ public:

const size_t PER_PAGE_MAX = 250;

const size_t GROUP_LIMIT_MAX = 99;

// Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store
static constexpr const char* COLLECTION_META_PREFIX = "$CM";
static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
@@ -280,5 +284,9 @@ public:

void facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document,
std::string &value);

void aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const;

void populate_result_kvs(Topster *topster, std::vector<std::vector<KV *>> &result_kvs) const;
};

@@ -146,6 +146,7 @@ struct token_pos_cost_t {

struct facet_count_t {
uint32_t count;
spp::sparse_hash_set<uint64_t> groups; // used for faceting grouped results

// used to fetch the actual document and value for representation
uint32_t doc_id;

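The new `groups` set exists so that, for grouped searches, a facet value can be counted once per group rather than once per matching document; the collection layer later collapses each set into a plain count. A simplified sketch of that collapse, using std containers instead of sparsepp (illustrative only, names are stand-ins):

```cpp
#include <cstdint>
#include <string>
#include <unordered_map>
#include <unordered_set>

// Illustrative stand-in for facet_count_t (the real struct uses spp::sparse_hash_set).
struct facet_count_stub {
    uint32_t count = 0;
    std::unordered_set<uint64_t> groups; // distinct group keys that contained this facet value
};

// With group_limit set, a facet value's final count becomes the number of distinct
// groups it appeared in, not the number of matching documents.
void collapse_grouped_facet_counts(std::unordered_map<std::string, facet_count_stub>& result_map) {
    for(auto& kv: result_map) {
        kv.second.count = kv.second.groups.size();
    }
}
```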
@ -117,16 +117,16 @@ struct http_req {
|
||||
uint64_t route_hash;
|
||||
std::map<std::string, std::string> params;
|
||||
std::string body;
|
||||
uint64_t seed;
|
||||
std::string metadata;
|
||||
|
||||
http_req(): route_hash(1), seed(random_uint64_t()) {
|
||||
http_req(): route_hash(1) {
|
||||
|
||||
}
|
||||
|
||||
http_req(h2o_req_t* _req, const std::string & http_method, uint64_t route_hash,
|
||||
const std::map<std::string, std::string> & params, std::string body):
|
||||
_req(_req), http_method(http_method), route_hash(route_hash), params(params),
|
||||
body(body), seed(random_uint64_t()) {
|
||||
body(body) {
|
||||
|
||||
}
|
||||
|
||||
@ -136,7 +136,7 @@ struct http_req {
|
||||
nlohmann::json content = nlohmann::json::parse(serialized_content);
|
||||
route_hash = content["route_hash"];
|
||||
body = content["body"];
|
||||
seed = content["seed"];
|
||||
metadata = content.count("metadata") != 0 ? content["metadata"] : "";
|
||||
|
||||
for (nlohmann::json::iterator it = content["params"].begin(); it != content["params"].end(); ++it) {
|
||||
params.emplace(it.key(), it.value());
|
||||
@ -150,16 +150,10 @@ struct http_req {
|
||||
content["route_hash"] = route_hash;
|
||||
content["params"] = params;
|
||||
content["body"] = body;
|
||||
content["seed"] = seed;
|
||||
content["metadata"] = metadata;
|
||||
|
||||
return content.dump();
|
||||
}
|
||||
|
||||
uint64_t random_uint64_t() {
|
||||
thread_local std::mt19937 rg(std::random_device{}());
|
||||
thread_local std::uniform_int_distribution<uint64_t> pick(0, std::numeric_limits<uint64_t>::max());
|
||||
return pick(rg);
|
||||
}
|
||||
};
|
||||
|
||||
struct request_response {
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include <json.hpp>
|
||||
#include <field.h>
|
||||
#include <option.h>
|
||||
#include <set>
|
||||
#include "string_utils.h"
|
||||
|
||||
struct token_candidates {
|
||||
@ -26,23 +27,27 @@ struct search_args {
|
||||
std::vector<std::string> search_fields;
|
||||
std::vector<filter> filters;
|
||||
std::vector<facet> facets;
|
||||
std::vector<uint32_t> included_ids;
|
||||
std::map<size_t, std::map<size_t, uint32_t>> included_ids;
|
||||
std::vector<uint32_t> excluded_ids;
|
||||
std::vector<sort_by> sort_fields_std;
|
||||
facet_query_t facet_query;
|
||||
int num_typos;
|
||||
size_t max_facet_values;
|
||||
size_t max_hits;
|
||||
size_t per_page;
|
||||
size_t page;
|
||||
token_ordering token_order;
|
||||
bool prefix;
|
||||
size_t drop_tokens_threshold;
|
||||
size_t typo_tokens_threshold;
|
||||
std::vector<KV> raw_result_kvs;
|
||||
std::vector<std::string> group_by_fields;
|
||||
size_t group_limit;
|
||||
size_t all_result_ids_len;
|
||||
spp::sparse_hash_set<uint64_t> groups_processed;
|
||||
std::vector<std::vector<art_leaf*>> searched_queries;
|
||||
std::vector<KV> override_result_kvs;
|
||||
Topster* topster;
|
||||
Topster* curated_topster;
|
||||
std::vector<std::vector<KV*>> raw_result_kvs;
|
||||
std::vector<std::vector<KV*>> override_result_kvs;
|
||||
Option<uint32_t> outcome;
|
||||
|
||||
search_args(): outcome(0) {
|
||||
@ -50,18 +55,28 @@ struct search_args {
|
||||
}
|
||||
|
||||
search_args(std::string query, std::vector<std::string> search_fields, std::vector<filter> filters,
|
||||
std::vector<facet> facets, std::vector<uint32_t> included_ids, std::vector<uint32_t> excluded_ids,
|
||||
std::vector<facet> facets, std::map<size_t, std::map<size_t, uint32_t>> included_ids, std::vector<uint32_t> excluded_ids,
|
||||
std::vector<sort_by> sort_fields_std, facet_query_t facet_query, int num_typos, size_t max_facet_values,
|
||||
size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix,
|
||||
size_t drop_tokens_threshold, size_t typo_tokens_threshold):
|
||||
size_t drop_tokens_threshold, size_t typo_tokens_threshold,
|
||||
const std::vector<std::string>& group_by_fields, size_t group_limit):
|
||||
query(query), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids),
|
||||
excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), facet_query(facet_query), num_typos(num_typos),
|
||||
max_facet_values(max_facet_values), max_hits(max_hits), per_page(per_page),
|
||||
max_facet_values(max_facet_values), per_page(per_page),
|
||||
page(page), token_order(token_order), prefix(prefix),
|
||||
drop_tokens_threshold(drop_tokens_threshold), typo_tokens_threshold(typo_tokens_threshold),
|
||||
group_by_fields(group_by_fields), group_limit(group_limit),
|
||||
all_result_ids_len(0), outcome(0) {
|
||||
|
||||
const size_t topster_size = std::max((size_t)1, max_hits); // needs to be at least 1 since scoring is mandatory
|
||||
topster = new Topster(topster_size, group_limit);
|
||||
curated_topster = new Topster(topster_size, group_limit);
|
||||
}
|
||||
|
||||
~search_args() {
|
||||
delete topster;
|
||||
delete curated_topster;
|
||||
};
|
||||
};
|
||||
|
||||
struct index_record {
|
||||
@ -149,22 +164,23 @@ private:
|
||||
void do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
const uint32_t* result_ids, size_t results_size);
|
||||
|
||||
void drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids);
|
||||
|
||||
void search_field(const uint8_t & field_id, const std::string & query,
|
||||
const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
|
||||
const std::vector<uint32_t>& curated_ids,
|
||||
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields,
|
||||
const int num_typos, std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
Topster & topster, uint32_t** all_result_ids,
|
||||
size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY,
|
||||
const bool prefix = false,
|
||||
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
uint32_t** all_result_ids, size_t & all_result_ids_len,
|
||||
const token_ordering token_order = FREQUENCY, const bool prefix = false,
|
||||
const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD,
|
||||
const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD);
|
||||
|
||||
void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length,
|
||||
const std::vector<uint32_t>& curated_ids,
|
||||
const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
|
||||
const token_ordering token_order, std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
Topster & topster, uint32_t** all_result_ids,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
uint32_t** all_result_ids,
|
||||
size_t & all_result_ids_len,
|
||||
const size_t typo_tokens_threshold);
|
||||
|
||||
@ -196,14 +212,19 @@ private:
|
||||
void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted,
|
||||
const uint32_t indices_length);
|
||||
|
||||
void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
|
||||
const std::vector<uint32_t> & included_ids,
|
||||
Topster & curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
|
||||
void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
|
||||
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
|
||||
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries);
|
||||
|
||||
uint64_t facet_token_hash(const field & a_field, const std::string &token);
|
||||
|
||||
void compute_facet_stats(facet &a_facet, int64_t raw_value, const std::string & field_type);
|
||||
|
||||
// reference: https://stackoverflow.com/a/27952689/131050
uint64_t hash_combine(uint64_t lhs, uint64_t rhs) const {
lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2);
return lhs;
}
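This mixer is what the grouping support can lean on when it needs a single 64-bit key per group. As a rough sketch only (the real get_distinct_id() declared further below works from indexed facet values and a seq_id, and its body is not part of this diff; the helper below is hypothetical), a group key could be derived by folding the hashes of a document's group-by field values:

```cpp
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Same mixing step as the hash_combine() above, as a free function for a standalone example.
uint64_t hash_combine(uint64_t lhs, uint64_t rhs) {
    lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2);
    return lhs;
}

// Hypothetical helper: fold one document's group-by field values into a single
// 64-bit group key. The real get_distinct_id() works from indexed facet hashes instead.
uint64_t group_key_for(const std::vector<std::string>& group_field_values) {
    uint64_t key = 1;
    for(const std::string& value : group_field_values) {
        key = hash_combine(key, std::hash<std::string>{}(value));
    }
    return key;
}
```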
|
||||
|
||||
public:
|
||||
Index() = delete;
|
||||
@ -218,12 +239,18 @@ public:
|
||||
void search(Option<uint32_t> & outcome, const std::string & query, const std::vector<std::string> & search_fields,
|
||||
const std::vector<filter> & filters, std::vector<facet> & facets,
|
||||
facet_query_t & facet_query,
|
||||
const std::vector<uint32_t> & included_ids, const std::vector<uint32_t> & excluded_ids,
|
||||
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
|
||||
const std::vector<uint32_t> & excluded_ids,
|
||||
const std::vector<sort_by> & sort_fields_std, const int num_typos,
|
||||
const size_t max_hits, const size_t per_page, const size_t page, const token_ordering token_order,
|
||||
const bool prefix, const size_t drop_tokens_threshold, std::vector<KV> & raw_result_kvs,
|
||||
size_t & all_result_ids_len, std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
std::vector<KV> & override_result_kvs, const size_t typo_tokens_threshold);
|
||||
Topster* topster, Topster* curated_topster,
|
||||
const size_t per_page, const size_t page, const token_ordering token_order,
|
||||
const bool prefix, const size_t drop_tokens_threshold,
|
||||
size_t & all_result_ids_len,
|
||||
spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
std::vector<std::vector<KV*>> & raw_result_kvs,
|
||||
std::vector<std::vector<KV*>> & override_result_kvs,
|
||||
const size_t typo_tokens_threshold);
|
||||
|
||||
Option<uint32_t> remove(const uint32_t seq_id, nlohmann::json & document);
|
||||
|
||||
@ -235,7 +262,8 @@ public:
|
||||
std::vector<std::vector<std::vector<uint16_t>>> &array_token_positions);
|
||||
|
||||
void score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index, const uint8_t & field_id,
|
||||
const uint32_t total_cost, Topster &topster, const std::vector<art_leaf *> & query_suggestion,
|
||||
const uint32_t total_cost, Topster* topster, const std::vector<art_leaf *> & query_suggestion,
|
||||
spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
const uint32_t *result_ids, const size_t result_size) const;
|
||||
|
||||
static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field);
|
||||
@ -278,7 +306,7 @@ public:
|
||||
bool processed; // prevents spurious wake up of the main thread
|
||||
bool terminate; // used for interrupting the thread during tear down
|
||||
|
||||
search_args search_params;
|
||||
search_args* search_params;
|
||||
|
||||
static void populate_array_token_positions(std::vector<std::vector<std::vector<uint16_t>>> & array_token_positions,
|
||||
const art_leaf *token_leaf, uint32_t doc_index);
|
||||
@ -286,5 +314,7 @@ public:
|
||||
int get_bounded_typo_cost(const size_t max_cost, const size_t token_len) const;
|
||||
|
||||
static int64_t float_to_in64_t(float n);
|
||||
|
||||
uint64_t get_distinct_id(const std::unordered_map<std::string, size_t> &facet_to_id, const uint32_t seq_id) const;
|
||||
};
|
||||
|
||||
|
@@ -1,5 +1,3 @@
#pragma once

#include <string>
#include <iostream>
#include <butil/logging.h>
#include <glog/logging.h>
@@ -151,8 +151,18 @@ struct StringUtils {
}

char * p ;
strtoull(s.c_str(), &p, 10);
return (*p == 0);
unsigned long long ull = strtoull(s.c_str(), &p, 10);
return (*p == 0) && ull <= std::numeric_limits<uint64_t>::max();
}

static bool is_uint32_t(const std::string &s) {
if(s.empty()) {
return false;
}

char * p ;
unsigned long ul = strtoul(s.c_str(), &p, 10);
return (*p == 0) && ul <= std::numeric_limits<uint32_t>::max();
}

static void toupper(std::string& str) {
@@ -234,7 +244,7 @@ struct StringUtils {
return hash != std::numeric_limits<uint64_t>::max() ? hash : (std::numeric_limits<uint64_t>::max()-1);
}

static std::string randstring(size_t length, uint64_t seed);
static std::string randstring(size_t length);

static std::string hmac(const std::string& key, const std::string& msg);
};
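The change above tightens is_uint64_t() and is_uint32_t() so that values which parse but overflow the target type are rejected. A minimal standalone sketch of the same validate-by-parsing idea (not part of the diff; the errno/ERANGE guard is an extra check the original does not perform):

```cpp
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <string>

// Returns true only if `s` is a base-10 unsigned integer that fits in uint32_t.
// std::strtoul() sets errno to ERANGE and returns ULONG_MAX on overflow, so the extra
// errno check also covers platforms where unsigned long is itself 32 bits wide.
bool fits_in_uint32(const std::string& s) {
    if(s.empty()) {
        return false;
    }

    errno = 0;
    char* end = nullptr;
    unsigned long value = std::strtoul(s.c_str(), &end, 10);
    return *end == '\0' && errno != ERANGE &&
           value <= std::numeric_limits<uint32_t>::max();
}
```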
@@ -118,6 +118,23 @@ private:
}
}

static unsigned long linux_get_mem_available_bytes() {
std::string token;
std::ifstream file("/proc/meminfo");
while(file >> token) {
if(token == "MemAvailable:") {
unsigned long mem_kb;
if(file >> mem_kb) {
return mem_kb * 1000;
} else {
return 0;
}
}
}

return 0; // nothing found
}

public:
void get(const std::string & data_dir_path, nlohmann::json& result);

@ -5,16 +5,26 @@
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
#include <sparsepp.h>
|
||||
#include <match_score.h>
|
||||
#include <number.h>
|
||||
|
||||
struct KV {
|
||||
uint8_t field_id;
|
||||
uint16_t query_index;
|
||||
uint16_t array_index;
|
||||
uint64_t key;
|
||||
uint64_t match_score;
|
||||
int64_t scores[3]; // match score + 2 custom attributes
|
||||
uint8_t field_id{};
|
||||
uint16_t query_index{};
|
||||
uint16_t array_index{};
|
||||
uint64_t key{};
|
||||
uint64_t distinct_key{};
|
||||
uint64_t match_score{};
|
||||
int64_t scores[3]{}; // match score + 2 custom attributes
|
||||
|
||||
KV(uint8_t fieldId, uint16_t queryIndex, uint64_t key, uint64_t distinct_key,
|
||||
uint64_t match_score, const int64_t *scores):
|
||||
field_id(fieldId), query_index(queryIndex), array_index(0), key(key),
|
||||
distinct_key(distinct_key), match_score(match_score) {
|
||||
this->scores[0] = scores[0];
|
||||
this->scores[1] = scores[1];
|
||||
this->scores[2] = scores[2];
|
||||
}
|
||||
|
||||
KV() = default;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -25,12 +35,19 @@ struct Topster {
|
||||
uint32_t size;
|
||||
|
||||
KV *data;
|
||||
KV* *kvs;
|
||||
KV** kvs;
|
||||
|
||||
spp::sparse_hash_map<uint64_t, KV*> keys;
|
||||
// For distinct, stores the min heap kv of each group_kv_map topster value
|
||||
spp::sparse_hash_map<uint64_t, KV*> kv_map;
|
||||
|
||||
explicit Topster(size_t capacity): MAX_SIZE(capacity), size(0) {
|
||||
// we allocate data first to get contiguous memory block whose indices are then assigned to `kvs`
|
||||
spp::sparse_hash_map<uint64_t, Topster*> group_kv_map;
|
||||
size_t distinct;
|
||||
|
||||
explicit Topster(size_t capacity): Topster(capacity, 0) {
|
||||
}
|
||||
|
||||
explicit Topster(size_t capacity, size_t distinct): MAX_SIZE(capacity), size(0), distinct(distinct) {
|
||||
// we allocate data first to get a memory block whose indices are then assigned to `kvs`
|
||||
// we use separate **kvs for easier pointer swaps
|
||||
data = new KV[capacity];
|
||||
kvs = new KV*[capacity];
|
||||
@ -38,15 +55,25 @@ struct Topster {
|
||||
for(size_t i=0; i<capacity; i++) {
|
||||
data[i].field_id = 0;
|
||||
data[i].query_index = 0;
|
||||
data[i].array_index = i;
|
||||
data[i].key = 0;
|
||||
data[i].distinct_key = 0;
|
||||
data[i].match_score = 0;
|
||||
kvs[i] = &data[i];
|
||||
}
|
||||
}
|
||||
|
||||
~Topster() {
|
||||
delete [] data;
|
||||
delete [] kvs;
|
||||
delete[] data;
|
||||
delete[] kvs;
|
||||
for(auto& kv: group_kv_map) {
|
||||
delete kv.second;
|
||||
}
|
||||
|
||||
data = nullptr;
|
||||
kvs = nullptr;
|
||||
|
||||
group_kv_map.clear();
|
||||
}
|
||||
|
||||
static inline void swapMe(KV** a, KV** b) {
|
||||
@ -59,133 +86,193 @@ struct Topster {
|
||||
(*b)->array_index = a_index;
|
||||
}
|
||||
|
||||
static inline void replace_key_values(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index,
|
||||
const uint64_t &match_score, const int64_t *scores, uint32_t start,
|
||||
KV* *kvs, spp::sparse_hash_map<uint64_t, KV*>& keys) {
|
||||
kvs[start]->key = key;
|
||||
kvs[start]->field_id = field_id;
|
||||
kvs[start]->query_index = query_index;
|
||||
kvs[start]->array_index = start;
|
||||
kvs[start]->match_score = match_score;
|
||||
kvs[start]->scores[0] = scores[0];
|
||||
kvs[start]->scores[1] = scores[1];
|
||||
kvs[start]->scores[2] = scores[2];
|
||||
bool add(KV* kv) {
|
||||
//LOG(INFO) << "kv_map size: " << kv_map.size() << " -- kvs[0]: " << kvs[0]->match_score;
|
||||
/*for(auto kv: kv_map) {
|
||||
LOG(INFO) << "kv key: " << kv.first << " => " << kv.second->match_score;
|
||||
}*/
|
||||
|
||||
keys.erase(kvs[start]->key);
|
||||
keys[key] = kvs[start];
|
||||
}
|
||||
bool less_than_min_heap = (size >= MAX_SIZE) && is_smaller_equal(kv, kvs[0]);
|
||||
size_t heap_op_index = 0;
|
||||
|
||||
void add(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index, const uint64_t &match_score,
|
||||
const int64_t scores[3]) {
|
||||
if (size >= MAX_SIZE) {
|
||||
if(!is_greater(kvs[0], scores)) {
|
||||
// when incoming value is less than the smallest in the heap, ignore
|
||||
return;
|
||||
if(!distinct && less_than_min_heap) {
|
||||
// for non-distinct, if incoming value is smaller than min-heap ignore
|
||||
return false;
|
||||
}
|
||||
|
||||
bool SIFT_DOWN = true;
|
||||
|
||||
if(distinct) {
|
||||
const auto& found_it = group_kv_map.find(kv->distinct_key);
|
||||
bool is_duplicate_key = (found_it != group_kv_map.end());
|
||||
|
||||
if(!is_duplicate_key && less_than_min_heap) {
|
||||
// for distinct, if a non duplicate kv is < than min heap we ignore
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t start = 0;
|
||||
if(is_duplicate_key) {
|
||||
// if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift
|
||||
Topster* group_topster = found_it->second;
|
||||
KV old_min_heap_kv = *kv_map[kv->distinct_key];
|
||||
bool added = group_topster->add(kv);
|
||||
|
||||
// When the key already exists and has a greater score, ignore. Otherwise, we have to replace.
|
||||
// NOTE: we don't consider primary and secondary attrs here because they will be the same for a given key.
|
||||
if(keys.count(key) != 0) {
|
||||
const KV* existing = keys.at(key);
|
||||
if(match_score <= existing->match_score) {
|
||||
return ;
|
||||
if(!added) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// replace and sift down
|
||||
start = existing->array_index;
|
||||
}
|
||||
// if new kv score is greater than previous min heap score we sift down, otherwise sift up
|
||||
SIFT_DOWN = is_greater(kv, &old_min_heap_kv);
|
||||
|
||||
replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys);
|
||||
// new kv is different from old_min_heap_kv so we have to sift heap
|
||||
heap_op_index = old_min_heap_kv.array_index;
|
||||
|
||||
// sift down to maintain heap property
|
||||
while ((2*start+1) < MAX_SIZE) {
|
||||
uint32_t next = (2 * start + 1);
|
||||
if (next+1 < MAX_SIZE && is_greater_kv(kvs[next], kvs[next+1])) {
|
||||
next++;
|
||||
}
|
||||
// erase current min heap key from kv_map
|
||||
kv_map.erase(old_min_heap_kv.distinct_key);
|
||||
|
||||
if (is_greater_kv(kvs[start], kvs[next])) {
|
||||
swapMe(&kvs[start], &kvs[next]);
|
||||
// kv will be copied into the pointer at heap_op_index
|
||||
kv_map.emplace(kv->distinct_key, kvs[heap_op_index]);
|
||||
} else {
|
||||
// kv is guaranteed to be > current min heap: kvs[0]
|
||||
// create fresh topster for this distinct group key since it does not exist
|
||||
Topster* group_topster = new Topster(distinct, 0);
|
||||
group_topster->add(kv);
|
||||
|
||||
// add new group key to map
|
||||
group_kv_map.emplace(kv->distinct_key, group_topster);
|
||||
|
||||
// find heap operation index for updating kvs
|
||||
|
||||
if(size < MAX_SIZE) {
|
||||
// there is enough space in heap we just copy to end
|
||||
SIFT_DOWN = false;
|
||||
heap_op_index = size;
|
||||
size++;
|
||||
} else {
|
||||
break;
|
||||
SIFT_DOWN = true;
|
||||
|
||||
// max size is reached so we are forced to replace current min heap element (kvs[0])
|
||||
heap_op_index = 0;
|
||||
|
||||
// remove current min heap group key from maps
|
||||
delete group_kv_map[kvs[heap_op_index]->distinct_key];
|
||||
group_kv_map.erase(kvs[heap_op_index]->distinct_key);
|
||||
kv_map.erase(kvs[heap_op_index]->distinct_key);
|
||||
}
|
||||
|
||||
start = next;
|
||||
// kv will be copied into the pointer at heap_op_index
|
||||
kv_map.emplace(kv->distinct_key, kvs[heap_op_index]);
|
||||
}
|
||||
} else {
|
||||
uint32_t start = size;
|
||||
bool key_found = false;
|
||||
|
||||
// When the key already exists and has a greater score, ignore. Otherwise, we have to replace
|
||||
if(keys.count(key) != 0) {
|
||||
const KV* existing = keys.at(key);
|
||||
if(match_score <= existing->match_score) {
|
||||
return ;
|
||||
} else { // not distinct
|
||||
//LOG(INFO) << "Searching for key: " << kv->key;
|
||||
|
||||
const auto& found_it = kv_map.find(kv->key);
|
||||
bool is_duplicate_key = (found_it != kv_map.end());
|
||||
|
||||
/*
|
||||
is_duplicate_key: SIFT_DOWN regardless of `size`.
|
||||
Else:
|
||||
Do SIFT_UP if size < max_size
|
||||
Else SIFT_DOWN
|
||||
*/
|
||||
|
||||
if(is_duplicate_key) {
|
||||
// Need to check if kv is greater than existing duplicate kv.
|
||||
KV* existing_kv = found_it->second;
|
||||
//LOG(INFO) << "existing_kv: " << existing_kv->key << " -> " << existing_kv->match_score;
|
||||
|
||||
if(is_smaller_equal(kv, existing_kv)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// replace and sift down
|
||||
start = existing->array_index;
|
||||
key_found = true;
|
||||
}
|
||||
SIFT_DOWN = true;
|
||||
|
||||
replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys);
|
||||
// replace existing kv and sift down
|
||||
heap_op_index = existing_kv->array_index;
|
||||
kv_map.erase(kvs[heap_op_index]->key);
|
||||
|
||||
if(key_found) {
|
||||
// need to sift down if it's a replace
|
||||
while ((2*start+1) < size) {
|
||||
uint32_t next = (2 * start + 1);
|
||||
if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) {
|
||||
next++;
|
||||
}
|
||||
// kv will be copied into the pointer at heap_op_index
|
||||
kv_map.emplace(kv->key, kvs[heap_op_index]);
|
||||
} else { // not duplicate
|
||||
|
||||
if (is_greater_kv(kvs[start], kvs[next])) {
|
||||
swapMe(&kvs[start], &kvs[next]);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
start = next;
|
||||
}
|
||||
|
||||
return ;
|
||||
}
|
||||
|
||||
while(start > 0) {
|
||||
uint32_t parent = (start-1)/2;
|
||||
if (is_greater_kv(kvs[parent], kvs[start])) {
|
||||
swapMe(&kvs[start], &kvs[parent]);
|
||||
start = parent;
|
||||
if(size < MAX_SIZE) {
|
||||
// we just copy to end of array
|
||||
SIFT_DOWN = false;
|
||||
heap_op_index = size;
|
||||
size++;
|
||||
} else {
|
||||
break;
|
||||
// kv is guaranteed to be > min heap.
|
||||
// we have to replace min heap element since array is full
|
||||
SIFT_DOWN = true;
|
||||
heap_op_index = 0;
|
||||
kv_map.erase(kvs[heap_op_index]->key);
|
||||
}
|
||||
}
|
||||
|
||||
if(keys.count(key) != 0) {
|
||||
size++;
|
||||
// kv will be copied into the pointer at heap_op_index
|
||||
kv_map.emplace(kv->key, kvs[heap_op_index]);
|
||||
}
|
||||
}
|
||||
|
||||
// we have to replace the existing element in the heap and sift down
|
||||
kv->array_index = heap_op_index;
|
||||
*kvs[heap_op_index] = *kv;
|
||||
|
||||
// sift up/down to maintain heap property
|
||||
|
||||
if(SIFT_DOWN) {
|
||||
while ((2 * heap_op_index + 1) < size) {
|
||||
uint32_t next = (2 * heap_op_index + 1); // left child
|
||||
if (next+1 < size && is_greater(kvs[next], kvs[next + 1])) {
|
||||
// for min heap we compare with the minimum of children
|
||||
next++; // right child (2n + 2)
|
||||
}
|
||||
|
||||
if (is_greater(kvs[heap_op_index], kvs[next])) {
|
||||
swapMe(&kvs[heap_op_index], &kvs[next]);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
heap_op_index = next;
|
||||
}
|
||||
} else {
|
||||
// SIFT UP
|
||||
while(heap_op_index > 0) {
|
||||
uint32_t parent = (heap_op_index - 1) / 2;
|
||||
if (is_greater(kvs[parent], kvs[heap_op_index])) {
|
||||
swapMe(&kvs[heap_op_index], &kvs[parent]);
|
||||
heap_op_index = parent;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool is_greater(const struct KV* i, const int64_t scores[3]) {
|
||||
return std::tie(scores[0], scores[1], scores[2]) >
|
||||
std::tie(i->scores[0], i->scores[1], i->scores[2]);
|
||||
}
|
||||
|
||||
static bool is_greater_kv(const struct KV* i, const struct KV* j) {
|
||||
static bool is_greater(const struct KV* i, const struct KV* j) {
|
||||
return std::tie(i->scores[0], i->scores[1], i->scores[2], i->key) >
|
||||
std::tie(j->scores[0], j->scores[1], j->scores[2], j->key);
|
||||
}
|
||||
|
||||
static bool is_greater_kv_value(const struct KV & i, const struct KV & j) {
|
||||
return std::tie(i.scores[0], i.scores[1], i.scores[2], i.key) >
|
||||
std::tie(j.scores[0], j.scores[1], j.scores[2], j.key);
|
||||
static bool is_smaller_equal(const struct KV* i, const struct KV* j) {
|
||||
return std::tie(i->scores[0], i->scores[1], i->scores[2]) <=
|
||||
std::tie(j->scores[0], j->scores[1], j->scores[2]);
|
||||
}
|
||||
|
||||
static bool is_greater_kv_group(const std::vector<KV*>& i, const std::vector<KV*>& j) {
|
||||
return std::tie(i[0]->scores[0], i[0]->scores[1], i[0]->scores[2], i[0]->key) >
|
||||
std::tie(j[0]->scores[0], j[0]->scores[1], j[0]->scores[2], j[0]->key);
|
||||
}
|
||||
|
||||
// topster must be sorted before iterated upon to remove dead array entries
|
||||
void sort() {
|
||||
std::stable_sort(kvs, kvs+size, is_greater_kv);
|
||||
std::stable_sort(kvs, kvs + size, is_greater);
|
||||
for(auto &group_topster: group_kv_map) {
|
||||
group_topster.second->sort();
|
||||
}
|
||||
}
|
||||
|
||||
void clear(){
|
||||
@ -196,6 +283,10 @@ struct Topster {
|
||||
return kvs[index]->key;
|
||||
}
|
||||
|
||||
uint64_t getDistinctKeyAt(uint32_t index) {
|
||||
return kvs[index]->distinct_key;
|
||||
}
|
||||
|
||||
KV* getKV(uint32_t index) {
|
||||
return kvs[index];
|
||||
}
|
||||
|
@@ -1,10 +1,10 @@
#pragma once

#include "logger.h"
#include <string>
#include <iostream>
#include <cmdline.h>
#include "config.h"
#include "logger.h"
#include "store.h"
#include "collection_manager.h"
#include <csignal>

@@ -61,12 +61,12 @@ void array::remove_index(uint32_t start_index, uint32_t end_index) {
}

uint32_t size_required = (uint32_t) (unsorted_append_size_required(max, new_index) * FOR_GROWTH_FACTOR);
uint8_t *out = new uint8_t[size_required];
uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out);
uint32_t actual_size = for_compress_unsorted(new_array, out, new_index);

delete[] curr_array;
delete[] new_array;
delete[] in;
free(in);

in = out;
length = new_index;

@@ -107,14 +107,16 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA,
size_t indexA = 0, indexB = 0, res_index = 0;

if(A == nullptr && B == nullptr) {
return 0;
*out = nullptr;
return 0;
}

if(A == nullptr) {
*out = nullptr;
return 0;
}

if(B == nullptr) {
if(lenB == 0 || B == nullptr) {
*out = new uint32_t[lenA];
memcpy(*out, A, lenA * sizeof(uint32_t));
return lenA;

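exclude_scalar() produces the ids present in A but not in B; the fix above makes the null and empty cases also set *out before returning. For reference, an equivalent, simplified formulation with std::set_difference, shown only to illustrate the intended semantics (assumes both inputs are sorted, as the raw-pointer version does):

```cpp
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

// "ids in A but not in B" over sorted id lists, expressed with std::set_difference
// purely for illustration; the real code works on raw uint32_t arrays.
std::vector<uint32_t> exclude_ids(const std::vector<uint32_t>& a, const std::vector<uint32_t>& b) {
    std::vector<uint32_t> result;
    std::set_difference(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(result));
    return result;
}
```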
@@ -1384,8 +1384,6 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len,
art_fuzzy_recurse(0, 0, t->root, -1, term, term_len, irow, jrow, min_cost, max_cost, prefix, nodes);
}

PROCESS_NODES:

if(token_order == FREQUENCY) {
std::sort(nodes.begin(), nodes.end(), compare_art_node_frequency);
} else {
@ -305,54 +305,68 @@ void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash
|
||||
}
|
||||
|
||||
void Collection::populate_overrides(std::string query,
|
||||
const std::map<std::string, size_t>& pinned_hits,
|
||||
const std::map<size_t, std::vector<std::string>>& pinned_hits,
|
||||
const std::vector<std::string>& hidden_hits,
|
||||
std::map<uint32_t, size_t> & id_pos_map,
|
||||
std::vector<uint32_t> & included_ids,
|
||||
std::map<size_t, std::vector<uint32_t>>& include_ids,
|
||||
std::vector<uint32_t> & excluded_ids) {
|
||||
StringUtils::tolowercase(query);
|
||||
std::set<uint32_t> excluded_set;
|
||||
|
||||
// If pinned or hidden hits are provided, they take precedence over overrides
|
||||
|
||||
// have to ensure that hidden hits take precedence over included hits
|
||||
if(!hidden_hits.empty()) {
|
||||
for(const auto & hit: hidden_hits) {
|
||||
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit);
|
||||
if(seq_id_op.ok()) {
|
||||
excluded_ids.push_back(seq_id_op.get());
|
||||
excluded_set.insert(seq_id_op.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(const auto & override_kv: overrides) {
|
||||
const auto & override = override_kv.second;
|
||||
|
||||
if( (override.rule.match == override_t::MATCH_EXACT && override.rule.query == query) ||
|
||||
(override.rule.match == override_t::MATCH_CONTAINS && query.find(override.rule.query) != std::string::npos) ) {
|
||||
for(const auto & hit: override.add_hits) {
|
||||
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.doc_id);
|
||||
if(seq_id_op.ok()) {
|
||||
included_ids.push_back(seq_id_op.get());
|
||||
id_pos_map[seq_id_op.get()] = hit.position;
|
||||
}
|
||||
}
|
||||
|
||||
// have to ensure that dropped hits take precedence over added hits
|
||||
for(const auto & hit: override.drop_hits) {
|
||||
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.doc_id);
|
||||
if(seq_id_op.ok()) {
|
||||
excluded_ids.push_back(seq_id_op.get());
|
||||
excluded_set.insert(seq_id_op.get());
|
||||
}
|
||||
}
|
||||
|
||||
for(const auto & hit: override.add_hits) {
|
||||
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.doc_id);
|
||||
if(!seq_id_op.ok()) {
|
||||
continue;
|
||||
}
|
||||
uint32_t seq_id = seq_id_op.get();
|
||||
bool excluded = (excluded_set.count(seq_id) != 0);
|
||||
if(!excluded) {
|
||||
include_ids[hit.position].push_back(seq_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If pinned or hidden hits are provided, they take precedence over overrides
|
||||
|
||||
if(!pinned_hits.empty()) {
|
||||
for(const auto & hit: pinned_hits) {
|
||||
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit.first);
|
||||
if(seq_id_op.ok()) {
|
||||
included_ids.push_back(seq_id_op.get());
|
||||
id_pos_map[seq_id_op.get()] = hit.second;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(!hidden_hits.empty()) {
|
||||
for(const auto & hit: hidden_hits) {
|
||||
Option<uint32_t> seq_id_op = doc_id_to_seq_id(hit);
|
||||
if(seq_id_op.ok()) {
|
||||
included_ids.erase(std::remove(included_ids.begin(), included_ids.end(), seq_id_op.get()), included_ids.end());
|
||||
id_pos_map.erase(seq_id_op.get());
|
||||
excluded_ids.push_back(seq_id_op.get());
|
||||
for(const auto& pos_ids: pinned_hits) {
|
||||
size_t pos = pos_ids.first;
|
||||
for(const std::string& id: pos_ids.second) {
|
||||
Option<uint32_t> seq_id_op = doc_id_to_seq_id(id);
|
||||
if(!seq_id_op.ok()) {
|
||||
continue;
|
||||
}
|
||||
uint32_t seq_id = seq_id_op.get();
|
||||
bool excluded = (excluded_set.count(seq_id) != 0);
|
||||
if(!excluded) {
|
||||
include_ids[pos].push_back(seq_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -371,20 +385,60 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
const size_t snippet_threshold,
|
||||
const std::string & highlight_full_fields,
|
||||
size_t typo_tokens_threshold,
|
||||
const std::map<std::string, size_t>& pinned_hits,
|
||||
const std::vector<std::string>& hidden_hits) {
|
||||
const std::map<size_t, std::vector<std::string>>& pinned_hits,
|
||||
const std::vector<std::string>& hidden_hits,
|
||||
const std::vector<std::string>& group_by_fields,
|
||||
const size_t group_limit) {
|
||||
|
||||
if(query != "*" && search_fields.empty()) {
|
||||
return Option<nlohmann::json>(400, "No search fields specified for the query.");
|
||||
}
|
||||
|
||||
if(!group_by_fields.empty() && (group_limit == 0 || group_limit > GROUP_LIMIT_MAX)) {
|
||||
return Option<nlohmann::json>(400, "Value of `group_limit` must be between 1 and " +
|
||||
std::to_string(GROUP_LIMIT_MAX) + ".");
|
||||
}
|
||||
|
||||
std::vector<uint32_t> included_ids;
|
||||
std::vector<uint32_t> excluded_ids;
|
||||
std::map<uint32_t, size_t> id_pos_map;
|
||||
populate_overrides(query, pinned_hits, hidden_hits, id_pos_map, included_ids, excluded_ids);
|
||||
std::map<size_t, std::vector<uint32_t>> include_ids; // position => list of IDs
|
||||
populate_overrides(query, pinned_hits, hidden_hits, include_ids, excluded_ids);
|
||||
|
||||
std::map<uint32_t, std::vector<uint32_t>> index_to_included_ids;
|
||||
/*for(auto kv: include_ids) {
|
||||
LOG(INFO) << "key: " << kv.first;
|
||||
for(auto val: kv.second) {
|
||||
LOG(INFO) << val;
|
||||
}
|
||||
}
|
||||
|
||||
LOG(INFO) << "Excludes:";
|
||||
|
||||
for(auto id: excluded_ids) {
|
||||
LOG(INFO) << id;
|
||||
}
|
||||
|
||||
LOG(INFO) << "include_ids size: " << include_ids.size();
|
||||
for(auto& group: include_ids) {
|
||||
for(uint32_t& seq_id: group.second) {
|
||||
LOG(INFO) << "seq_id: " << seq_id;
|
||||
}
|
||||
|
||||
LOG(INFO) << "----";
|
||||
}
|
||||
*/
|
||||
|
||||
std::map<uint32_t, std::map<size_t, std::map<size_t, uint32_t>>> index_to_included_ids;
|
||||
std::map<uint32_t, std::vector<uint32_t>> index_to_excluded_ids;
|
||||
|
||||
for(auto seq_id: included_ids) {
|
||||
auto index_id = (seq_id % num_indices);
|
||||
index_to_included_ids[index_id].push_back(seq_id);
|
||||
for(const auto& pos_ids: include_ids) {
|
||||
size_t outer_pos = pos_ids.first;
|
||||
size_t ids_per_pos = std::max(size_t(1), group_limit);
|
||||
|
||||
for(size_t inner_pos = 0; inner_pos < std::min(ids_per_pos, pos_ids.second.size()); inner_pos++) {
|
||||
auto seq_id = pos_ids.second[inner_pos];
|
||||
auto index_id = (seq_id % num_indices);
|
||||
index_to_included_ids[index_id][outer_pos][inner_pos] = seq_id;
|
||||
//LOG(INFO) << "Adding seq_id " << seq_id << " to index_id " << index_id;
|
||||
}
|
||||
}
|
||||
|
||||
for(auto seq_id: excluded_ids) {
|
||||
@ -408,6 +462,22 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
}
|
||||
}
|
||||
|
||||
// validate group by fields
|
||||
for(const std::string & field_name: group_by_fields) {
|
||||
if(search_schema.count(field_name) == 0) {
|
||||
std::string error = "Could not find a field named `" + field_name + "` in the schema.";
|
||||
return Option<nlohmann::json>(404, error);
|
||||
}
|
||||
|
||||
field search_field = search_schema.at(field_name);
|
||||
|
||||
// must be a facet field
|
||||
if(!search_field.is_facet()) {
|
||||
std::string error = "Group by field `" + field_name + "` should be a facet field.";
|
||||
return Option<nlohmann::json>(400, error);
|
||||
}
|
||||
}
|
||||
|
||||
// validate filter fields
|
||||
std::vector<std::string> filter_blocks;
|
||||
StringUtils::split(simple_filter_query, filter_blocks, "&&");
|
||||
@ -515,7 +585,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
}
|
||||
|
||||
// for a wildcard query, if filter is not specified, use default_sorting_field as a catch-all
|
||||
if(query == "*" && filters.size() == 0) {
|
||||
if(query == "*" && filters.empty()) {
|
||||
field f = search_schema.at(default_sorting_field);
|
||||
std::string max_value = f.is_float() ? std::to_string(std::numeric_limits<float>::max()) :
|
||||
std::to_string(std::numeric_limits<int32_t>::max());
|
||||
@ -631,19 +701,21 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
const size_t max_hits = std::min((page * per_page), get_num_documents());
|
||||
|
||||
std::vector<std::vector<art_leaf*>> searched_queries; // search queries used for generating the results
|
||||
std::vector<KV> raw_result_kvs;
|
||||
std::vector<KV> override_result_kvs;
|
||||
std::vector<std::vector<KV*>> raw_result_kvs;
|
||||
std::vector<std::vector<KV*>> override_result_kvs;
|
||||
|
||||
size_t total_found = 0;
|
||||
spp::sparse_hash_set<uint64_t> groups_processed; // used to calculate total_found for grouped query
|
||||
|
||||
// send data to individual index threads
|
||||
size_t index_id = 0;
|
||||
for(Index* index: indices) {
|
||||
index->search_params = search_args(query, search_fields, filters, facets,
|
||||
index_to_included_ids[index_id], index_to_excluded_ids[index_id],
|
||||
sort_fields_std, facet_query, num_typos, max_facet_values, max_hits,
|
||||
per_page, page, token_order, prefix,
|
||||
drop_tokens_threshold, typo_tokens_threshold);
|
||||
index->search_params = new search_args(query, search_fields, filters, facets,
|
||||
index_to_included_ids[index_id], index_to_excluded_ids[index_id],
|
||||
sort_fields_std, facet_query, num_typos, max_facet_values, max_hits,
|
||||
per_page, page, token_order, prefix,
|
||||
drop_tokens_threshold, typo_tokens_threshold,
|
||||
group_by_fields, group_limit);
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(index->m);
|
||||
index->ready = true;
|
||||
@ -655,6 +727,12 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
|
||||
Option<nlohmann::json> index_search_op({}); // stores the last error across all index threads
|
||||
|
||||
// for grouping we have to re-aggregate
|
||||
|
||||
const size_t topster_size = std::max((size_t)1, max_hits);
|
||||
Topster topster(topster_size, group_limit);
|
||||
Topster curated_topster(topster_size, group_limit);
|
||||
|
||||
for(Index* index: indices) {
|
||||
// wait for the worker
|
||||
{
|
||||
@ -662,9 +740,9 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
index->cv.wait(lk, [index]{return index->processed;});
|
||||
}
|
||||
|
||||
if(!index->search_params.outcome.ok()) {
|
||||
index_search_op = Option<nlohmann::json>(index->search_params.outcome.code(),
|
||||
index->search_params.outcome.error());
|
||||
if(!index->search_params->outcome.ok()) {
|
||||
index_search_op = Option<nlohmann::json>(index->search_params->outcome.code(),
|
||||
index->search_params->outcome.error());
|
||||
}
|
||||
|
||||
if(!index_search_op.ok()) {
|
||||
@ -672,34 +750,33 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
continue;
|
||||
}
|
||||
|
||||
for(auto & field_order_kv: index->search_params.raw_result_kvs) {
|
||||
field_order_kv.query_index += searched_queries.size();
|
||||
raw_result_kvs.push_back(field_order_kv);
|
||||
}
|
||||
aggregate_topster(searched_queries.size(), topster, index->search_params->topster);
|
||||
aggregate_topster(searched_queries.size(), curated_topster, index->search_params->curated_topster);
|
||||
|
||||
for(auto & field_order_kv: index->search_params.override_result_kvs) {
|
||||
field_order_kv.query_index += searched_queries.size();
|
||||
override_result_kvs.push_back(field_order_kv);
|
||||
}
|
||||
searched_queries.insert(searched_queries.end(), index->search_params->searched_queries.begin(),
|
||||
index->search_params->searched_queries.end());
|
||||
|
||||
searched_queries.insert(searched_queries.end(), index->search_params.searched_queries.begin(),
|
||||
index->search_params.searched_queries.end());
|
||||
|
||||
for(size_t fi = 0; fi < index->search_params.facets.size(); fi++) {
|
||||
auto & this_facet = index->search_params.facets[fi];
|
||||
for(size_t fi = 0; fi < index->search_params->facets.size(); fi++) {
|
||||
auto & this_facet = index->search_params->facets[fi];
|
||||
auto & acc_facet = facets[fi];
|
||||
|
||||
for(auto & facet_kv: this_facet.result_map) {
|
||||
size_t count = 0;
|
||||
|
||||
if(acc_facet.result_map.count(facet_kv.first) == 0) {
|
||||
// not found, so set it
|
||||
count = facet_kv.second.count;
|
||||
if(index->search_params->group_limit) {
|
||||
// we have to add all group sets
|
||||
acc_facet.result_map[facet_kv.first].groups.insert(
|
||||
facet_kv.second.groups.begin(), facet_kv.second.groups.end()
|
||||
);
|
||||
} else {
|
||||
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
|
||||
size_t count = 0;
|
||||
if(acc_facet.result_map.count(facet_kv.first) == 0) {
|
||||
// not found, so set it
|
||||
count = facet_kv.second.count;
|
||||
} else {
|
||||
count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count;
|
||||
}
|
||||
acc_facet.result_map[facet_kv.first].count = count;
|
||||
}
|
||||
|
||||
acc_facet.result_map[facet_kv.first].count = count;
|
||||
acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id;
|
||||
acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos;
|
||||
acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos;
|
||||
@ -713,138 +790,195 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
}
|
||||
}
|
||||
|
||||
total_found += index->search_params.all_result_ids_len;
|
||||
if(group_limit) {
|
||||
groups_processed.insert(
|
||||
index->search_params->groups_processed.begin(),
|
||||
index->search_params->groups_processed.end()
|
||||
);
|
||||
} else {
|
||||
total_found += index->search_params->all_result_ids_len;
|
||||
}
|
||||
}
|
||||
|
||||
if(!index_search_op.ok()) {
|
||||
return index_search_op;
|
||||
}
|
||||
|
||||
// All fields are sorted descending
|
||||
std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_value);
|
||||
topster.sort();
|
||||
curated_topster.sort();
|
||||
|
||||
// Sort based on position in overriden list
|
||||
populate_result_kvs(&topster, raw_result_kvs);
|
||||
populate_result_kvs(&curated_topster, override_result_kvs);
|
||||
|
||||
// for grouping we have to aggregate group set sizes to a count value
|
||||
if(group_limit) {
|
||||
for(auto& acc_facet: facets) {
|
||||
for(auto& facet_kv: acc_facet.result_map) {
|
||||
facet_kv.second.count = facet_kv.second.groups.size();
|
||||
}
|
||||
}
|
||||
|
||||
total_found = groups_processed.size() + override_result_kvs.size();
|
||||
}
|
||||
|
||||
// All fields are sorted descending
|
||||
std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_group);
|
||||
|
||||
// Sort based on position in overridden list
|
||||
std::sort(
|
||||
override_result_kvs.begin(), override_result_kvs.end(),
|
||||
[&id_pos_map](const KV & a, const KV & b) -> bool {
|
||||
return id_pos_map[a.key] < id_pos_map[b.key];
|
||||
[](const std::vector<KV*>& a, std::vector<KV*>& b) -> bool {
|
||||
return a[0]->distinct_key < b[0]->distinct_key;
|
||||
}
|
||||
);
|
||||
|
nlohmann::json result = nlohmann::json::object();

result["hits"] = nlohmann::json::array();
result["found"] = total_found;

std::vector<KV> result_kvs;
std::vector<std::vector<KV*>> result_group_kvs;
size_t override_kv_index = 0;
size_t raw_results_index = 0;

// merge raw results and override results
while(override_kv_index < override_result_kvs.size() && raw_results_index < raw_result_kvs.size()) {
if(override_kv_index < override_result_kvs.size() &&
id_pos_map.count(override_result_kvs[override_kv_index].key) != 0 &&
result_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index].key]) {
result_kvs.push_back(override_result_kvs[override_kv_index]);
override_kv_index++;
size_t result_position = result_group_kvs.size() + 1;
uint64_t override_position = override_result_kvs[override_kv_index][0]->distinct_key;

if(result_position == override_position) {
override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result
result_group_kvs.push_back(override_result_kvs[override_kv_index]);
override_kv_index++;
} else {
result_kvs.push_back(raw_result_kvs[raw_results_index]);
result_group_kvs.push_back(raw_result_kvs[raw_results_index]);
raw_results_index++;
}
}

while(override_kv_index < override_result_kvs.size()) {
result_kvs.push_back(override_result_kvs[override_kv_index]);
override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result
result_group_kvs.push_back({override_result_kvs[override_kv_index]});
override_kv_index++;
}

while(raw_results_index < raw_result_kvs.size()) {
result_kvs.push_back(raw_result_kvs[raw_results_index]);
result_group_kvs.push_back(raw_result_kvs[raw_results_index]);
raw_results_index++;
}

||||
const long start_result_index = (page - 1) * per_page;
|
||||
const long end_result_index = std::min(max_hits, result_kvs.size()) - 1; // could be -1 when max_hits is 0
|
||||
const long end_result_index = std::min(max_hits, result_group_kvs.size()) - 1; // could be -1 when max_hits is 0
|
||||
|
||||
nlohmann::json result = nlohmann::json::object();
|
||||
|
||||
result["found"] = total_found;
|
||||
|
||||
std::string hits_key = group_limit ? "grouped_hits" : "hits";
|
||||
result[hits_key] = nlohmann::json::array();
|
||||
|
||||
// construct results array
|
||||
for(long result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) {
|
||||
const auto & field_order_kv = result_kvs[result_kvs_index];
|
||||
const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv.key);
|
||||
const std::vector<KV*> & kv_group = result_group_kvs[result_kvs_index];
|
||||
|
||||
nlohmann::json document;
|
||||
const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
|
||||
|
||||
if(!document_op.ok()) {
|
||||
LOG(ERROR) << "Document fetch error. " << document_op.error();
|
||||
continue;
|
||||
nlohmann::json group_hits;
|
||||
if(group_limit) {
|
||||
group_hits["hits"] = nlohmann::json::array();
|
||||
}
|
||||
|
||||
nlohmann::json wrapper_doc;
|
||||
wrapper_doc["highlights"] = nlohmann::json::array();
|
||||
std::vector<highlight_t> highlights;
|
||||
StringUtils string_utils;
|
||||
nlohmann::json& hits_array = group_limit ? group_hits["hits"] : result["hits"];
|
||||
|
||||
// find out if fields have to be highlighted fully
|
||||
std::vector<std::string> fields_highlighted_fully_vec;
|
||||
spp::sparse_hash_set<std::string> fields_highlighted_fully;
|
||||
StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ",");
|
||||
for(const KV* field_order_kv: kv_group) {
|
||||
const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key);
|
||||
|
||||
for(std::string & highlight_full_field: fields_highlighted_fully_vec) {
|
||||
StringUtils::trim(highlight_full_field);
|
||||
fields_highlighted_fully.emplace(highlight_full_field);
|
||||
}
|
||||
nlohmann::json document;
|
||||
const Option<bool> & document_op = get_document_from_store(seq_id_key, document);
|
||||
|
||||
for(const std::string & field_name: search_fields) {
|
||||
// should not pick excluded field for highlighting
|
||||
if(exclude_fields.count(field_name) > 0) {
|
||||
if(!document_op.ok()) {
|
||||
LOG(ERROR) << "Document fetch error. " << document_op.error();
|
||||
continue;
|
||||
}
|
||||
|
||||
field search_field = search_schema.at(field_name);
|
||||
if(query != "*" && (search_field.type == field_types::STRING ||
|
||||
search_field.type == field_types::STRING_ARRAY)) {
|
||||
nlohmann::json wrapper_doc;
|
||||
wrapper_doc["highlights"] = nlohmann::json::array();
|
||||
std::vector<highlight_t> highlights;
|
||||
StringUtils string_utils;
|
||||
|
||||
bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
|
||||
highlight_t highlight;
|
||||
highlight_result(search_field, searched_queries, field_order_kv, document,
|
||||
string_utils, snippet_threshold, highlighted_fully, highlight);
|
||||
// find out if fields have to be highlighted fully
|
||||
std::vector<std::string> fields_highlighted_fully_vec;
|
||||
spp::sparse_hash_set<std::string> fields_highlighted_fully;
|
||||
StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ",");
|
||||
|
||||
if(!highlight.snippets.empty()) {
|
||||
highlights.push_back(highlight);
|
||||
}
|
||||
for(std::string & highlight_full_field: fields_highlighted_fully_vec) {
|
||||
StringUtils::trim(highlight_full_field);
|
||||
fields_highlighted_fully.emplace(highlight_full_field);
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(highlights.begin(), highlights.end());
|
||||
|
||||
for(const auto & highlight: highlights) {
|
||||
nlohmann::json h_json = nlohmann::json::object();
|
||||
h_json["field"] = highlight.field;
|
||||
bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end());
|
||||
|
||||
if(!highlight.indices.empty()) {
|
||||
h_json["indices"] = highlight.indices;
|
||||
h_json["snippets"] = highlight.snippets;
|
||||
if(highlight_fully) {
|
||||
h_json["values"] = highlight.values;
|
||||
for(const std::string & field_name: search_fields) {
|
||||
// should not pick excluded field for highlighting
|
||||
if(exclude_fields.count(field_name) > 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
} else {
|
||||
h_json["snippet"] = highlight.snippets[0];
|
||||
if(highlight_fully) {
|
||||
h_json["value"] = highlight.values[0];
|
||||
field search_field = search_schema.at(field_name);
|
||||
if(query != "*" && (search_field.type == field_types::STRING ||
|
||||
search_field.type == field_types::STRING_ARRAY)) {
|
||||
|
||||
bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end());
|
||||
highlight_t highlight;
|
||||
highlight_result(search_field, searched_queries, field_order_kv, document,
|
||||
string_utils, snippet_threshold, highlighted_fully, highlight);
|
||||
|
||||
if(!highlight.snippets.empty()) {
|
||||
highlights.push_back(highlight);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
wrapper_doc["highlights"].push_back(h_json);
|
||||
std::sort(highlights.begin(), highlights.end());
|
||||
|
||||
for(const auto & highlight: highlights) {
|
||||
nlohmann::json h_json = nlohmann::json::object();
|
||||
h_json["field"] = highlight.field;
|
||||
bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end());
|
||||
|
||||
if(!highlight.indices.empty()) {
|
||||
h_json["indices"] = highlight.indices;
|
||||
h_json["snippets"] = highlight.snippets;
|
||||
if(highlight_fully) {
|
||||
h_json["values"] = highlight.values;
|
||||
}
|
||||
|
||||
} else {
|
||||
h_json["snippet"] = highlight.snippets[0];
|
||||
if(highlight_fully) {
|
||||
h_json["value"] = highlight.values[0];
|
||||
}
|
||||
}
|
||||
|
||||
wrapper_doc["highlights"].push_back(h_json);
|
||||
}
|
||||
|
||||
//wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key;
|
||||
|
||||
prune_document(document, include_fields, exclude_fields);
|
||||
wrapper_doc["document"] = document;
|
||||
wrapper_doc["text_match"] = field_order_kv->match_score;
|
||||
|
||||
if(field_order_kv->match_score == 0) {
|
||||
wrapper_doc["curated"] = true;
|
||||
}
|
||||
|
||||
hits_array.push_back(wrapper_doc);
|
||||
}
|
||||
|
||||
prune_document(document, include_fields, exclude_fields);
|
||||
wrapper_doc["document"] = document;
|
||||
wrapper_doc["text_match"] = field_order_kv.match_score;
|
||||
//wrapper_doc["seq_id"] = (uint32_t) field_order_kv.key;
|
||||
if(group_limit) {
|
||||
const auto& document = group_hits["hits"][0]["document"];
|
||||
|
||||
result["hits"].push_back(wrapper_doc);
|
||||
group_hits["group_key"] = nlohmann::json::array();
|
||||
for(const auto& field_name: group_by_fields) {
|
||||
if(document.count(field_name) != 0) {
|
||||
group_hits["group_key"].push_back(document[field_name]);
|
||||
}
|
||||
}
|
||||
|
||||
result["grouped_hits"].push_back(group_hits);
|
||||
}
|
||||
}
|
||||
|
||||
result["facet_counts"] = nlohmann::json::array();
|
||||
@ -941,6 +1075,11 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
result["facet_counts"].push_back(facet_result);
|
||||
}
|
||||
|
||||
// free search params
|
||||
for(Index* index: indices) {
|
||||
delete index->search_params;
|
||||
}
|
||||
|
result["request_params"] = nlohmann::json::object();
||||
result["request_params"]["per_page"] = per_page;
|
||||
result["request_params"]["q"] = query;
|
||||
@ -951,6 +1090,43 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
|
||||
return result;
|
||||
}
|
||||
|
||||
|
void Collection::populate_result_kvs(Topster *topster, std::vector<std::vector<KV *>> &result_kvs) const {
if(topster->distinct) {
for(auto &group_topster_entry: topster->group_kv_map) {
Topster* group_topster = group_topster_entry.second;
const std::vector<KV*> group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size);
result_kvs.emplace_back(group_kvs);
}

} else {
for(uint32_t t = 0; t < topster->size; t++) {
KV* kv = topster->getKV(t);
result_kvs.push_back({kv});
}
}
}

void Collection::aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const {
if(index_topster->distinct) {
for(auto &group_topster_entry: index_topster->group_kv_map) {
Topster* group_topster = group_topster_entry.second;
const std::vector<KV*> group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size);
for(KV* kv: group_kvs) {
kv->query_index += query_index;
topster.add(kv);
}
}

} else {
for(uint32_t t = 0; t < index_topster->size; t++) {
KV* kv = index_topster->getKV(t);
kv->query_index += query_index;
topster.add(kv);
}
}
}
||||
|
||||
void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count,
|
||||
const nlohmann::json &document, std::string &value) {
|
||||
|
||||
@ -989,7 +1165,7 @@ void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t
|
||||
|
||||
void Collection::highlight_result(const field &search_field,
|
||||
const std::vector<std::vector<art_leaf *>> &searched_queries,
|
||||
const KV & field_order_kv, const nlohmann::json & document,
|
||||
const KV* field_order_kv, const nlohmann::json & document,
|
||||
StringUtils & string_utils, size_t snippet_threshold,
|
||||
bool highlighted_fully,
|
||||
highlight_t & highlight) {
|
||||
@ -997,15 +1173,15 @@ void Collection::highlight_result(const field &search_field,
|
||||
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
|
||||
std::vector<art_leaf *> query_suggestion;
|
||||
|
||||
for (const art_leaf *token_leaf : searched_queries[field_order_kv.query_index]) {
|
||||
for (const art_leaf *token_leaf : searched_queries[field_order_kv->query_index]) {
|
||||
// Must search for the token string fresh on that field for the given document since `token_leaf`
|
||||
// is from the best matched field and need not be present in other fields of a document.
|
||||
Index* index = indices[field_order_kv.key % num_indices];
|
||||
Index* index = indices[field_order_kv->key % num_indices];
|
||||
art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len);
|
||||
if(actual_leaf != nullptr) {
|
||||
query_suggestion.push_back(actual_leaf);
|
||||
std::vector<uint16_t> positions;
|
||||
uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv.key);
|
||||
uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key);
|
||||
auto doc_indices = new uint32_t[1];
|
||||
doc_indices[0] = doc_index;
|
||||
leaf_to_indices.emplace(actual_leaf, doc_indices);
|
||||
@ -1031,8 +1207,8 @@ void Collection::highlight_result(const field &search_field,
|
||||
continue;
|
||||
}
|
||||
|
||||
const Match & this_match = Match::match(field_order_kv.key, token_positions);
|
||||
uint64_t this_match_score = this_match.get_match_score(1, field_order_kv.field_id);
|
||||
const Match & this_match = Match::match(field_order_kv->key, token_positions);
|
||||
uint64_t this_match_score = this_match.get_match_score(1, field_order_kv->field_id);
|
||||
match_indices.emplace_back(this_match, this_match_score, array_index);
|
||||
}
|
||||
|
||||
@ -1231,7 +1407,7 @@ Option<uint32_t> Collection::add_override(const override_t & override) {
|
||||
return Option<uint32_t>(500, "Error while storing the override on disk.");
|
||||
}
|
||||
|
||||
overrides[override.id] = override;
|
||||
overrides.emplace(override.id, override);
|
||||
return Option<uint32_t>(200);
|
||||
}
|
||||
|

@ -212,6 +212,9 @@ bool get_search(http_req & req, http_res & res) {
const char *FACET_QUERY = "facet_query";
const char *MAX_FACET_VALUES = "max_facet_values";

const char *GROUP_BY = "group_by";
const char *GROUP_LIMIT = "group_limit";

const char *PER_PAGE = "per_page";
const char *PAGE = "page";
const char *CALLBACK = "callback";
@ -249,11 +252,6 @@ bool get_search(http_req & req, http_res & res) {
||||
return false;
|
||||
}
|
||||
|
||||
if(req.params.count(QUERY_BY) == 0) {
|
||||
res.set_400(std::string("Parameter `") + QUERY_BY + "` is required.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(req.params.count(MAX_FACET_VALUES) == 0) {
|
||||
req.params[MAX_FACET_VALUES] = "10";
|
||||
}
|
||||
@ -291,41 +289,58 @@ bool get_search(http_req & req, http_res & res) {
|
||||
req.params[EXCLUDE_FIELDS] = "";
|
||||
}
|
||||
|
if(!StringUtils::is_uint64_t(req.params[DROP_TOKENS_THRESHOLD])) {
if(req.params.count(GROUP_BY) == 0) {
req.params[GROUP_BY] = "";
}

if(req.params.count(GROUP_LIMIT) == 0) {
if(req.params[GROUP_BY] != "") {
req.params[GROUP_LIMIT] = "3";
} else {
req.params[GROUP_LIMIT] = "0";
}
}

if(!StringUtils::is_uint32_t(req.params[DROP_TOKENS_THRESHOLD])) {
||||
res.set_400("Parameter `" + std::string(DROP_TOKENS_THRESHOLD) + "` must be an unsigned integer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!StringUtils::is_uint64_t(req.params[TYPO_TOKENS_THRESHOLD])) {
|
||||
if(!StringUtils::is_uint32_t(req.params[TYPO_TOKENS_THRESHOLD])) {
|
||||
res.set_400("Parameter `" + std::string(TYPO_TOKENS_THRESHOLD) + "` must be an unsigned integer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!StringUtils::is_uint64_t(req.params[NUM_TYPOS])) {
|
||||
if(!StringUtils::is_uint32_t(req.params[NUM_TYPOS])) {
|
||||
res.set_400("Parameter `" + std::string(NUM_TYPOS) + "` must be an unsigned integer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!StringUtils::is_uint64_t(req.params[PER_PAGE])) {
|
||||
if(!StringUtils::is_uint32_t(req.params[PER_PAGE])) {
|
||||
res.set_400("Parameter `" + std::string(PER_PAGE) + "` must be an unsigned integer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!StringUtils::is_uint64_t(req.params[PAGE])) {
|
||||
if(!StringUtils::is_uint32_t(req.params[PAGE])) {
|
||||
res.set_400("Parameter `" + std::string(PAGE) + "` must be an unsigned integer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!StringUtils::is_uint64_t(req.params[MAX_FACET_VALUES])) {
|
||||
if(!StringUtils::is_uint32_t(req.params[MAX_FACET_VALUES])) {
|
||||
res.set_400("Parameter `" + std::string(MAX_FACET_VALUES) + "` must be an unsigned integer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!StringUtils::is_uint64_t(req.params[SNIPPET_THRESHOLD])) {
|
||||
if(!StringUtils::is_uint32_t(req.params[SNIPPET_THRESHOLD])) {
|
||||
res.set_400("Parameter `" + std::string(SNIPPET_THRESHOLD) + "` must be an unsigned integer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!StringUtils::is_uint32_t(req.params[GROUP_LIMIT])) {
|
||||
res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string filter_str = req.params.count(FILTER) != 0 ? req.params[FILTER] : "";
|
||||
|
||||
std::vector<std::string> search_fields;
|
||||
@ -343,6 +358,9 @@ bool get_search(http_req & req, http_res & res) {
|
||||
spp::sparse_hash_set<std::string> include_fields(include_fields_vec.begin(), include_fields_vec.end());
|
||||
spp::sparse_hash_set<std::string> exclude_fields(exclude_fields_vec.begin(), exclude_fields_vec.end());
|
||||
|
||||
std::vector<std::string> group_by_fields;
|
||||
StringUtils::split(req.params[GROUP_BY], group_by_fields, ",");
|
||||
|
||||
std::vector<sort_by> sort_fields;
|
||||
if(req.params.count(SORT_BY) != 0) {
|
||||
std::vector<std::string> sort_field_strs;
|
||||
@ -367,7 +385,8 @@ bool get_search(http_req & req, http_res & res) {
|
||||
}
|
||||
}
|
||||
|
||||
std::map<std::string, size_t> pinned_hits;
|
||||
std::map<size_t, std::vector<std::string>> pinned_hits;
|
||||
|
||||
if(req.params.count(PINNED_HITS) != 0) {
|
||||
std::vector<std::string> pinned_hits_strs;
|
||||
StringUtils::split(req.params[PINNED_HITS], pinned_hits_strs, ",");
|
||||
@ -392,7 +411,7 @@ bool get_search(http_req & req, http_res & res) {
|
||||
return false;
|
||||
}
|
||||
|
||||
pinned_hits.emplace(expression_parts[0], position);
|
||||
pinned_hits[position].emplace_back(expression_parts[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -422,17 +441,19 @@ bool get_search(http_req & req, http_res & res) {
|
||||
|
||||
Option<nlohmann::json> result_op = collection->search(req.params[QUERY], search_fields, filter_str, facet_fields,
|
||||
sort_fields, std::stoi(req.params[NUM_TYPOS]),
|
||||
static_cast<size_t>(std::stoi(req.params[PER_PAGE])),
|
||||
static_cast<size_t>(std::stoi(req.params[PAGE])),
|
||||
static_cast<size_t>(std::stol(req.params[PER_PAGE])),
|
||||
static_cast<size_t>(std::stol(req.params[PAGE])),
|
||||
token_order, prefix, drop_tokens_threshold,
|
||||
include_fields, exclude_fields,
|
||||
static_cast<size_t>(std::stoi(req.params[MAX_FACET_VALUES])),
|
||||
static_cast<size_t>(std::stol(req.params[MAX_FACET_VALUES])),
|
||||
req.params[FACET_QUERY],
|
||||
static_cast<size_t>(std::stoi(req.params[SNIPPET_THRESHOLD])),
|
||||
static_cast<size_t>(std::stol(req.params[SNIPPET_THRESHOLD])),
|
||||
req.params[HIGHLIGHT_FULL_FIELDS],
|
||||
typo_tokens_threshold,
|
||||
pinned_hits,
|
||||
hidden_hits
|
||||
hidden_hits,
|
||||
group_by_fields,
|
||||
static_cast<size_t>(std::stol(req.params[GROUP_LIMIT]))
|
||||
);
|
||||
|
||||
uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
@ -873,7 +894,7 @@ bool post_create_key(http_req &req, http_res &res) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const std::string &rand_key = StringUtils::randstring(AuthManager::KEY_LEN, req.seed);
|
||||
const std::string &rand_key = req.metadata;
|
||||
|
||||
api_key_t api_key(
|
||||
rand_key,
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include <signal.h>
|
||||
#include <h2o.h>
|
||||
#include <iostream>
|
||||
#include <auth_manager.h>
|
||||
#include "raft_server.h"
|
||||
#include "logger.h"
|
||||
|
||||
@ -186,20 +187,6 @@ std::string HttpServer::get_version() {
|
||||
void HttpServer::clear_timeouts(const std::vector<h2o_timer_t*> & timers, bool trigger_callback) {
|
||||
for(h2o_timer_t* timer: timers) {
|
||||
h2o_timer_unlink(timer);
|
||||
/*while (!h2o_linklist_is_empty(&timer->_link)) {
|
||||
h2o_timer_t *entry = H2O_STRUCT_FROM_MEMBER(h2o_timer_t, _link, timer->_link.next);
|
||||
if(entry == nullptr) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(trigger_callback) {
|
||||
entry->cb(entry);
|
||||
}
|
||||
|
||||
//entry->expire_at = 0;
|
||||
h2o_linklist_unlink(&entry->_link);
|
||||
h2o_timer_unlink(timer);
|
||||
}*/
|
||||
}
|
||||
}
|
||||
|
||||
@ -385,6 +372,12 @@ int HttpServer::catch_all_handler(h2o_handler_t *_self, h2o_req_t *req) {
|
||||
}
|
||||
|
||||
// routes match and is an authenticated request
|
||||
// do any additional pre-request middleware operations here
|
||||
if(rpath->action == "keys:create") {
|
||||
// we enrich incoming request with a random API key here so that leader and replicas will use the same key
|
||||
request->metadata = StringUtils::randstring(AuthManager::KEY_LEN);
|
||||
}
|
||||
|
||||
// for writes, we defer to replication_state
|
||||
if(http_method != "GET") {
|
||||
self->http_server->get_replication_state()->write(request, response);
|
||||
@ -476,26 +469,13 @@ void HttpServer::on(const std::string & message, bool (*handler)(void*)) {
|
||||
HttpServer::~HttpServer() {
|
||||
delete message_dispatcher;
|
||||
|
||||
// remove all timeouts defined in: https://github.com/h2o/h2o/blob/v2.2.2/lib/core/context.c#L142
|
||||
/*std::vector<h2o_timeout_t*> timeouts = {
|
||||
&ctx.zero_timeout,
|
||||
&ctx.one_sec_timeout,
|
||||
&ctx.hundred_ms_timeout,
|
||||
&ctx.handshake_timeout,
|
||||
&ctx.http1.req_timeout,
|
||||
&ctx.http2.idle_timeout,
|
||||
&ctx.http2.graceful_shutdown_timeout,
|
||||
&ctx.proxy.io_timeout
|
||||
};
|
||||
|
||||
clear_timeouts(timeouts);
|
||||
*/
|
||||
|
||||
if(ssl_refresh_timer.timer.expire_at != 0) {
|
||||
// avoid callback since it recreates timeout
|
||||
clear_timeouts({&ssl_refresh_timer.timer}, false);
|
||||
}
|
||||
|
||||
h2o_timerwheel_run(ctx.loop->_timeouts, 9999999999999);
|
||||
|
||||
h2o_context_dispose(&ctx);
|
||||
|
||||
free(ctx.globalconf->server_name.base);
|
||||
|
331 src/index.cpp
@ -633,7 +633,7 @@ void Index::compute_facet_stats(facet &a_facet, int64_t raw_value, const std::st
|
||||
void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
const uint32_t* result_ids, size_t results_size) {
|
||||
|
||||
std::map<std::string, size_t> facet_to_index;
|
||||
std::unordered_map<std::string, size_t> facet_to_index;
|
||||
|
||||
size_t i_facet = 0;
|
||||
for(const auto & facet: facet_schema) {
|
||||
@ -661,6 +661,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
std::vector<std::string> query_tokens;
|
||||
StringUtils::split(facet_query.query, query_tokens, " ");
|
||||
|
||||
// for non-string fields, `faceted_name` returns their aliased stringified field name
|
||||
art_tree *t = search_index.at(facet_field.faceted_name());
|
||||
|
||||
for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
|
||||
@ -703,7 +704,7 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_id];
|
||||
|
||||
int array_pos = 0;
|
||||
int fvalue_found = 0;
|
||||
bool fvalue_found = false;
|
||||
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
|
||||
spp::sparse_hash_map<uint32_t, token_pos_cost_t> query_token_positions;
|
||||
size_t field_token_index = -1;
|
||||
@ -717,9 +718,9 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
// ftoken_hash is the raw value for numeric fields
|
||||
compute_facet_stats(a_facet, ftoken_hash, facet_field.type);
|
||||
|
||||
// not using facet query or this particular facet value is found in facet filter
|
||||
if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) {
|
||||
// not using facet query or this particular facet value is found in facet filter
|
||||
fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value
|
||||
fvalue_found = true;
|
||||
|
||||
if(use_facet_query) {
|
||||
// map token index to query index (used for highlighting later on)
|
||||
@ -736,41 +737,35 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
}
|
||||
}
|
||||
|
||||
//std::cout << "j: " << j << std::endl;
|
||||
|
||||
// 0 indicates separator, while the second condition checks for non-array string
|
||||
if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && j == fhashes.size() - 1)) {
|
||||
if(!use_facet_query || fvalue_found != 0) {
|
||||
if(!use_facet_query || fvalue_found) {
|
||||
const std::string & fvalue_str = fvaluestream.str();
|
||||
uint64_t fhash = 0;
|
||||
if(facet_field.is_string()) {
|
||||
fhash = facet_token_hash(facet_field, fvalue_str);
|
||||
} else {
|
||||
fhash = std::atoi(fvalue_str.c_str());
|
||||
}
|
||||
uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
|
||||
|
||||
if(a_facet.result_map.count(fhash) == 0) {
|
||||
a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0,
|
||||
a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_set<uint64_t>(),
|
||||
doc_seq_id, 0,
|
||||
spp::sparse_hash_map<uint32_t, token_pos_cost_t>()};
|
||||
}
|
||||
|
||||
a_facet.result_map[fhash].count += 1;
|
||||
a_facet.result_map[fhash].doc_id = doc_seq_id;
|
||||
a_facet.result_map[fhash].array_pos = array_pos;
|
||||
|
||||
if(search_params->group_limit) {
|
||||
uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id);
|
||||
a_facet.result_map[fhash].groups.emplace(distinct_id);
|
||||
} else {
|
||||
a_facet.result_map[fhash].count += 1;
|
||||
}
|
||||
|
||||
if(use_facet_query) {
|
||||
a_facet.result_map[fhash].query_token_pos = query_token_positions;
|
||||
|
||||
/*if(j == 11) {
|
||||
for(auto xx: query_token_positions) {
|
||||
std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl;
|
||||
}
|
||||
}*/
|
||||
}
|
||||
}
|
||||
|
||||
array_pos++;
|
||||
fvalue_found = 0;
|
||||
fvalue_found = false;
|
||||
std::stringstream().swap(fvaluestream);
|
||||
spp::sparse_hash_map<uint32_t, token_pos_cost_t>().swap(query_token_positions);
|
||||
field_token_index = -1;
|
||||
@ -781,56 +776,12 @@ void Index::do_facets(std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
}
|
||||
}
|
||||
|
||||
void Index::drop_facets(std::vector<facet> & facets, const std::vector<uint32_t> & ids) {
|
||||
std::map<std::string, size_t> facet_to_index;
|
||||
|
||||
size_t i_facet = 0;
|
||||
for(const auto & facet: facet_schema) {
|
||||
facet_to_index[facet.first] = i_facet;
|
||||
i_facet++;
|
||||
}
|
||||
|
||||
for(auto & a_facet: facets) {
|
||||
const field & facet_field = facet_schema.at(a_facet.field_name);
|
||||
|
||||
// assumed that facet fields have already been validated upstream
|
||||
for(const uint32_t doc_seq_id: ids) {
|
||||
if(facet_index_v2.count(doc_seq_id) != 0) {
|
||||
// FORMAT OF VALUES
|
||||
// String: h1 h2 h3
|
||||
// String array: h1 h2 h3 0 h1 0 h1 h2 0
|
||||
const std::vector<uint64_t> & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]];
|
||||
std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens)
|
||||
|
||||
for(size_t j = 0; j < fhashes.size(); j++) {
|
||||
if(fhashes[j] != FACET_ARRAY_DELIMETER) {
|
||||
int64_t ftoken_hash = fhashes[j];
|
||||
fvaluestream << ftoken_hash;
|
||||
}
|
||||
|
||||
if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER &&
|
||||
j == fhashes.size() - 1)) {
|
||||
const std::string & fvalue_str = fvaluestream.str();
|
||||
uint64_t fhash = facet_token_hash(facet_field, fvalue_str);
|
||||
|
||||
if(a_facet.result_map.count(fhash) != 0) {
|
||||
a_facet.result_map[fhash].count -= 1;
|
||||
if(a_facet.result_map[fhash].count == 0) {
|
||||
a_facet.result_map.erase(fhash);
|
||||
}
|
||||
}
|
||||
std::stringstream().swap(fvaluestream);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length,
|
||||
const std::vector<uint32_t>& curated_ids,
|
||||
const std::vector<sort_by> & sort_fields,
|
||||
std::vector<token_candidates> & token_candidates_vec, const token_ordering token_order,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries, Topster & topster,
|
||||
std::vector<token_candidates> & token_candidates_vec,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries, Topster* topster,
|
||||
spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
uint32_t** all_result_ids, size_t & all_result_ids_len,
|
||||
const size_t typo_tokens_threshold) {
|
||||
const long long combination_limit = 10;
|
||||
@ -874,6 +825,15 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
|
||||
continue;
|
||||
}
|
||||
|
||||
if(!curated_ids.empty()) {
|
||||
uint32_t *excluded_result_ids = nullptr;
|
||||
result_size = ArrayUtils::exclude_scalar(result_ids, result_size, &curated_ids[0],
|
||||
curated_ids.size(), &excluded_result_ids);
|
||||
|
||||
delete [] result_ids;
|
||||
result_ids = excluded_result_ids;
|
||||
}
|
||||
|
||||
if(filter_ids != nullptr) {
|
||||
// intersect once again with filter ids
|
||||
uint32_t* filtered_result_ids = nullptr;
|
||||
@ -888,7 +848,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
|
||||
|
||||
// go through each matching document id and calculate match score
|
||||
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
|
||||
filtered_result_ids, filtered_results_size);
|
||||
groups_processed, filtered_result_ids, filtered_results_size);
|
||||
|
||||
delete[] filtered_result_ids;
|
||||
delete[] result_ids;
|
||||
@ -900,7 +860,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si
|
||||
*all_result_ids = new_all_result_ids;
|
||||
|
||||
score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion,
|
||||
result_ids, result_size);
|
||||
groups_processed, result_ids, result_size);
|
||||
delete[] result_ids;
|
||||
}
|
||||
|
||||
@ -1056,13 +1016,17 @@ void Index::run_search() {
|
||||
}
|
||||
|
||||
// after the wait, we own the lock.
|
||||
search(search_params.outcome, search_params.query, search_params.search_fields,
|
||||
search_params.filters, search_params.facets, search_params.facet_query, search_params.included_ids,
|
||||
search_params.excluded_ids, search_params.sort_fields_std, search_params.num_typos,
|
||||
search_params.max_hits, search_params.per_page, search_params.page, search_params.token_order,
|
||||
search_params.prefix, search_params.drop_tokens_threshold, search_params.raw_result_kvs,
|
||||
search_params.all_result_ids_len, search_params.searched_queries, search_params.override_result_kvs,
|
||||
search_params.typo_tokens_threshold);
|
||||
search(search_params->outcome, search_params->query, search_params->search_fields,
|
||||
search_params->filters, search_params->facets, search_params->facet_query,
|
||||
search_params->included_ids, search_params->excluded_ids,
|
||||
search_params->sort_fields_std, search_params->num_typos,
|
||||
search_params->topster, search_params->curated_topster,
|
||||
search_params->per_page, search_params->page, search_params->token_order,
|
||||
search_params->prefix, search_params->drop_tokens_threshold,
|
||||
search_params->all_result_ids_len, search_params->groups_processed,
|
||||
search_params->searched_queries,
|
||||
search_params->raw_result_kvs, search_params->override_result_kvs,
|
||||
search_params->typo_tokens_threshold);
|
||||
|
||||
// hand control back to main thread
|
||||
processed = true;
|
||||
@ -1074,12 +1038,12 @@ void Index::run_search() {
|
||||
}
|
||||
}
|
||||
|
||||
void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id,
|
||||
const std::vector<uint32_t> & included_ids,
|
||||
Topster & curated_topster,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries) {
|
||||
void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id,
|
||||
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
|
||||
Topster* curated_topster,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries) {
|
||||
|
||||
if(included_ids.size() == 0) {
|
||||
if(included_ids_map.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1098,48 +1062,34 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f
|
||||
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
|
||||
0, 0, 1, token_ordering::MAX_SCORE, false, leaves);
|
||||
|
||||
if(leaves.size() > 0) {
|
||||
if(!leaves.empty()) {
|
||||
override_query.push_back(leaves[0]);
|
||||
}
|
||||
}
|
||||
|
||||
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
|
||||
for(const auto& pos_ids: included_ids_map) {
|
||||
const size_t outer_pos = pos_ids.first;
|
||||
|
||||
for (art_leaf *token_leaf : override_query) {
|
||||
uint32_t *indices = new uint32_t[included_ids.size()];
|
||||
token_leaf->values->ids.indexOf(&included_ids[0], included_ids.size(), indices);
|
||||
leaf_to_indices.emplace(token_leaf, indices);
|
||||
}
|
||||
for(const auto& index_seq_id: pos_ids.second) {
|
||||
uint32_t inner_pos = index_seq_id.first;
|
||||
uint32_t seq_id = index_seq_id.second;
|
||||
|
||||
for(size_t j=0; j<included_ids.size(); j++) {
|
||||
const uint32_t seq_id = included_ids[j];
|
||||
uint64_t distinct_id = outer_pos; // outer pos is the group distinct key
|
||||
uint64_t match_score = (64000 - inner_pos); // inner pos within a group is the match score
|
||||
|
||||
std::vector<std::vector<std::vector<uint16_t>>> array_token_positions;
|
||||
populate_token_positions(override_query, leaf_to_indices, j, array_token_positions);
|
||||
// LOG(INFO) << "seq_id: " << seq_id << " - " << match_score;
|
||||
|
||||
uint64_t match_score = 0;
|
||||
int64_t scores[3];
|
||||
scores[0] = match_score;
|
||||
scores[1] = int64_t(1);
|
||||
scores[2] = int64_t(1);
|
||||
|
||||
for(const std::vector<std::vector<uint16_t>> & token_positions: array_token_positions) {
|
||||
if(token_positions.empty()) {
|
||||
continue;
|
||||
}
|
||||
const Match & match = Match::match(seq_id, token_positions);
|
||||
uint64_t this_match_score = match.get_match_score(0, field_id);
|
||||
|
||||
if(this_match_score > match_score) {
|
||||
match_score = this_match_score;
|
||||
}
|
||||
KV kv(field_id, searched_queries.size(), seq_id, distinct_id, match_score, scores);
|
||||
curated_topster->add(&kv);
|
||||
}
|
||||
|
||||
int64_t scores[3];
|
||||
scores[0] = int64_t(match_score);
|
||||
scores[1] = int64_t(1);
|
||||
scores[2] = int64_t(1);
|
||||
|
||||
curated_topster.add(seq_id, field_id, searched_queries.size(), match_score, scores);
|
||||
|
||||
searched_queries.push_back(override_query);
|
||||
}
|
||||
|
||||
searched_queries.push_back(override_query);
|
||||
}
|
||||
|
||||
void Index::search(Option<uint32_t> & outcome,
|
||||
@ -1147,19 +1097,20 @@ void Index::search(Option<uint32_t> & outcome,
|
||||
const std::vector<std::string> & search_fields,
|
||||
const std::vector<filter> & filters,
|
||||
std::vector<facet> & facets, facet_query_t & facet_query,
|
||||
const std::vector<uint32_t> & included_ids,
|
||||
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
|
||||
const std::vector<uint32_t> & excluded_ids,
|
||||
const std::vector<sort_by> & sort_fields_std, const int num_typos, const size_t max_hits,
|
||||
const std::vector<sort_by> & sort_fields_std, const int num_typos,
|
||||
Topster* topster,
|
||||
Topster* curated_topster,
|
||||
const size_t per_page, const size_t page, const token_ordering token_order,
|
||||
const bool prefix, const size_t drop_tokens_threshold,
|
||||
std::vector<KV> & raw_result_kvs,
|
||||
size_t & all_result_ids_len,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
std::vector<KV> & override_result_kvs,
|
||||
spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
std::vector<std::vector<art_leaf*>>& searched_queries,
|
||||
std::vector<std::vector<KV*>> & raw_result_kvs,
|
||||
std::vector<std::vector<KV*>> & override_result_kvs,
|
||||
const size_t typo_tokens_threshold) {
|
||||
|
||||
const size_t num_results = (page * per_page);
|
||||
|
||||
// process the filters
|
||||
|
||||
uint32_t* filter_ids = nullptr;
|
||||
@ -1169,25 +1120,48 @@ void Index::search(Option<uint32_t> & outcome,
|
||||
return ;
|
||||
}
|
||||
|
||||
const uint32_t filter_ids_length = op_filter_ids_length.get();
|
||||
uint32_t filter_ids_length = op_filter_ids_length.get();
|
||||
|
||||
// we will be removing all curated IDs from organic result ids before running topster
|
||||
std::set<uint32_t> curated_ids;
|
||||
std::vector<uint32_t> included_ids;
|
||||
|
||||
for(const auto& outer_pos_ids: included_ids_map) {
|
||||
for(const auto& inner_pos_seq_id: outer_pos_ids.second) {
|
||||
curated_ids.insert(inner_pos_seq_id.second);
|
||||
included_ids.push_back(inner_pos_seq_id.second);
|
||||
}
|
||||
}
|
||||
|
||||
curated_ids.insert(excluded_ids.begin(), excluded_ids.end());
|
||||
|
||||
std::vector<uint32_t> curated_ids_sorted(curated_ids.begin(), curated_ids.end());
|
||||
std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end());
|
||||
|
||||
// Order of `fields` are used to sort results
|
||||
//auto begin = std::chrono::high_resolution_clock::now();
|
||||
uint32_t* all_result_ids = nullptr;
|
||||
|
||||
const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory
|
||||
Topster topster(topster_size);
|
||||
Topster curated_topster(topster_size);
|
||||
|
||||
if(query == "*") {
|
||||
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0);
|
||||
const std::string & field = search_fields[0];
|
||||
|
||||
if(!curated_ids.empty()) {
|
||||
uint32_t *excluded_result_ids = nullptr;
|
||||
filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0],
|
||||
curated_ids.size(), &excluded_result_ids);
|
||||
|
||||
delete [] filter_ids;
|
||||
filter_ids = excluded_result_ids;
|
||||
}
|
||||
|
||||
score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {},
|
||||
filter_ids, filter_ids_length);
|
||||
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
|
||||
do_facets(facets, facet_query, filter_ids, filter_ids_length);
|
||||
groups_processed, filter_ids, filter_ids_length);
|
||||
collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries);
|
||||
|
||||
all_result_ids_len = filter_ids_length;
|
||||
all_result_ids = filter_ids;
|
||||
filter_ids = nullptr;
|
||||
} else {
|
||||
const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM);
|
||||
for(size_t i = 0; i < num_search_fields; i++) {
|
||||
@ -1196,47 +1170,22 @@ void Index::search(Option<uint32_t> & outcome,
|
||||
const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i); // Order of `fields` are used to sort results
|
||||
const std::string & field = search_fields[i];
|
||||
|
||||
search_field(field_id, query, field, filter_ids, filter_ids_length, facets, sort_fields_std,
|
||||
num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len,
|
||||
search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std,
|
||||
num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len,
|
||||
token_order, prefix, drop_tokens_threshold, typo_tokens_threshold);
|
||||
collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries);
|
||||
collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries);
|
||||
}
|
||||
}
|
||||
do_facets(facets, facet_query, all_result_ids, all_result_ids_len);
|
||||
}
|
||||
|
||||
do_facets(facets, facet_query, all_result_ids, all_result_ids_len);
|
||||
do_facets(facets, facet_query, &included_ids[0], included_ids.size());
|
||||
|
||||
// must be sorted before iterated upon to remove "empty" array entries
|
||||
topster.sort();
|
||||
curated_topster.sort();
|
||||
topster->sort();
|
||||
curated_topster->sort();
|
||||
|
||||
std::set<uint32_t> ids_to_remove(included_ids.begin(), included_ids.end());
|
||||
ids_to_remove.insert(excluded_ids.begin(), excluded_ids.end());
|
||||
|
||||
std::vector<uint32_t> dropped_ids;
|
||||
|
||||
// loop through topster and remove elements from included and excluded id lists
|
||||
for(uint32_t t = 0; t < topster.size && t < num_results; t++) {
|
||||
KV* kv = topster.getKV(t);
|
||||
|
||||
if(ids_to_remove.count(kv->key) != 0) {
|
||||
dropped_ids.push_back((uint32_t)kv->key);
|
||||
} else {
|
||||
raw_result_kvs.push_back(*kv);
|
||||
}
|
||||
}
|
||||
|
||||
for(uint32_t t = 0; t < curated_topster.size && t < num_results; t++) {
|
||||
KV* kv = curated_topster.getKV(t);
|
||||
override_result_kvs.push_back(*kv);
|
||||
}
|
||||
|
||||
// for the ids that are dropped, remove their corresponding facet components from facet results
|
||||
drop_facets(facets, dropped_ids);
|
||||
|
||||
all_result_ids_len -= dropped_ids.size();
|
||||
all_result_ids_len += curated_topster.size;
|
||||
all_result_ids_len += curated_topster->size;
|
||||
|
||||
delete [] filter_ids;
|
||||
delete [] all_result_ids;
|
||||
@ -1258,9 +1207,11 @@ void Index::search(Option<uint32_t> & outcome,
|
||||
*/
|
||||
void Index::search_field(const uint8_t & field_id, const std::string & query, const std::string & field,
|
||||
uint32_t *filter_ids, size_t filter_ids_length,
|
||||
const std::vector<uint32_t>& curated_ids,
|
||||
std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
|
||||
std::vector<std::vector<art_leaf*>> & searched_queries,
|
||||
Topster & topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
|
||||
Topster* topster, spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
uint32_t** all_result_ids, size_t & all_result_ids_len,
|
||||
const token_ordering token_order, const bool prefix,
|
||||
const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) {
|
||||
std::vector<std::string> tokens;
|
||||
@ -1373,8 +1324,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
|
||||
|
||||
if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) {
|
||||
// If all tokens were found, go ahead and search for candidates with what we have so far
|
||||
search_candidates(field_id, filter_ids, filter_ids_length, sort_fields, token_candidates_vec,
|
||||
token_order, searched_queries, topster, all_result_ids, all_result_ids_len,
|
||||
search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec,
|
||||
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
|
||||
typo_tokens_threshold);
|
||||
}
|
||||
|
||||
@ -1407,8 +1358,9 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co
|
||||
truncated_query += " " + token_count_pairs.at(i).first;
|
||||
}
|
||||
|
||||
return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,
|
||||
searched_queries, topster, all_result_ids, all_result_ids_len,
|
||||
return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids,
|
||||
facets, sort_fields, num_typos,
|
||||
searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len,
|
||||
token_order, prefix);
|
||||
}
|
||||
}
|
||||
@ -1434,8 +1386,9 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect
|
||||
}
|
||||
|
||||
void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16_t & query_index,
|
||||
const uint8_t & field_id, const uint32_t total_cost, Topster & topster,
|
||||
const uint8_t & field_id, const uint32_t total_cost, Topster* topster,
|
||||
const std::vector<art_leaf *> &query_suggestion,
|
||||
spp::sparse_hash_set<uint64_t>& groups_processed,
|
||||
const uint32_t *result_ids, const size_t result_size) const {
|
||||
|
||||
spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices;
|
||||
@ -1467,6 +1420,16 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
|
||||
Match single_token_match = Match(1, 0, 0, empty_offset_diffs);
|
||||
const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost, field_id);
|
||||
|
||||
std::unordered_map<std::string, size_t> facet_to_id;
|
||||
|
||||
if(search_params->group_limit > 0) {
|
||||
size_t i_facet = 0;
|
||||
for(const auto & facet: facet_schema) {
|
||||
facet_to_id[facet.first] = i_facet;
|
||||
i_facet++;
|
||||
}
|
||||
}
|
||||
|
||||
for(size_t i=0; i<result_size; i++) {
|
||||
const uint32_t seq_id = result_ids[i];
|
||||
|
||||
@ -1500,7 +1463,7 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
|
||||
}
|
||||
|
||||
const int64_t default_score = 0;
|
||||
int64_t scores[3];
|
||||
int64_t scores[3] = {0};
|
||||
|
||||
// avoiding loop
|
||||
if(sort_fields.size() > 0) {
|
||||
@ -1541,7 +1504,15 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
|
||||
}
|
||||
}
|
||||
|
||||
topster.add(seq_id, field_id, query_index, match_score, scores);
|
||||
uint64_t distinct_id = seq_id;
|
||||
|
||||
if(search_params->group_limit != 0) {
|
||||
distinct_id = get_distinct_id(facet_to_id, seq_id);
|
||||
groups_processed.emplace(distinct_id);
|
||||
}
|
||||
|
||||
KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores);
|
||||
topster->add(&kv);
|
||||
}
|
||||
|
||||
//long long int timeNanos = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
|
||||
@ -1553,6 +1524,26 @@ void Index::score_results(const std::vector<sort_by> & sort_fields, const uint16
|
||||
}
|
||||
}
|
||||
|
uint64_t Index::get_distinct_id(const std::unordered_map<std::string, size_t> &facet_to_id,
const uint32_t seq_id) const {
uint64_t distinct_id = 1; // some constant initial value

// calculate hash from group_by_fields
for(const auto& field: search_params->group_by_fields) {
if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) {
continue;
}

size_t facet_id = facet_to_id.at(field);
const std::vector<uint64_t>& fhashes = facet_index_v2.at(seq_id)[facet_id];

for(const auto& hash: fhashes) {
distinct_id = hash_combine(distinct_id, hash);
}
}
return distinct_id;
}
||||
|
||||
void Index::populate_token_positions(const std::vector<art_leaf *> &query_suggestion,
|
||||
spp::sparse_hash_map<const art_leaf *, uint32_t *> &leaf_to_indices,
|
||||
size_t result_index,
|
||||
|
@ -46,7 +46,7 @@ void benchmark_hn_titles(char* file_path) {
|
||||
|
||||
Store *store = new Store("/tmp/typesense-data");
|
||||
CollectionManager & collectionManager = CollectionManager::get_instance();
|
||||
collectionManager.init(store, 4, "abcd", "1234");
|
||||
collectionManager.init(store, 4, "abcd");
|
||||
collectionManager.load();
|
||||
|
||||
Collection *collection = collectionManager.get_collection("hnstories_direct");
|
||||
@ -116,7 +116,7 @@ void benchmark_reactjs_pages(char* file_path) {
|
||||
|
||||
Store *store = new Store("/tmp/typesense-data");
|
||||
CollectionManager & collectionManager = CollectionManager::get_instance();
|
||||
collectionManager.init(store, 4, "abcd", "1234");
|
||||
collectionManager.init(store, 4, "abcd");
|
||||
collectionManager.load();
|
||||
|
||||
Collection *collection = collectionManager.get_collection("reactjs_pages");
|
||||
|
@ -21,7 +21,7 @@ int main(int argc, char* argv[]) {
|
||||
Store *store = new Store(state_dir_path);
|
||||
|
||||
CollectionManager & collectionManager = CollectionManager::get_instance();
|
||||
collectionManager.init(store, 4, "abcd", "123");
|
||||
collectionManager.init(store, 4, "abcd");
|
||||
collectionManager.load();
|
||||
|
||||
std::vector<field> fields_to_index = {
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include "typesense_server_utils.h"
|
||||
#include "core_api.h"
|
||||
#include "config.h"
|
||||
#include "typesense_server_utils.h"
|
||||
|
||||
void master_server_routes() {
|
||||
// collection management
|
||||
@ -55,13 +55,7 @@ void replica_server_routes() {
|
||||
server->get("/health", get_health);
|
||||
}
|
||||
|
||||
namespace logging {
|
||||
DECLARE_bool(log_year);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
logging::FLAGS_log_year = true;
|
||||
|
||||
Config config;
|
||||
|
||||
cmdline::parser options;
|
||||
|
@ -34,6 +34,7 @@ int ReplicationState::start(const butil::EndPoint & peering_endpoint, const int
|
||||
node_options.fsm = this;
|
||||
node_options.node_owns_fsm = false;
|
||||
node_options.snapshot_interval_s = snapshot_interval_s;
|
||||
node_options.filter_before_copy_remote = false;
|
||||
std::string prefix = "local://" + raft_dir;
|
||||
node_options.log_uri = prefix + "/" + log_dir_name;
|
||||
node_options.raft_meta_uri = prefix + "/" + meta_dir_name;
|
||||
@ -297,7 +298,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
LOG(TRACE) << "rm " << store->get_state_dir_path() << " success";
|
||||
LOG(INFO) << "rm " << store->get_state_dir_path() << " success";
|
||||
|
||||
std::string snapshot_path = reader->get_path();
|
||||
snapshot_path.append(std::string("/") + db_snapshot_name);
|
||||
@ -308,7 +309,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
LOG(TRACE) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success";
|
||||
LOG(INFO) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success";
|
||||
|
||||
return init_db();
|
||||
}
|
||||
|
@ -6,7 +6,7 @@ void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_lengt
|
||||
max = array_length > 1 ? sorted_array[array_length-1] : min;
|
||||
|
||||
uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR);
|
||||
uint8_t *out = new uint8_t[size_required];
|
||||
uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out);
|
||||
uint32_t actual_size = for_compress_sorted(sorted_array, out, array_length);
|
||||
|
||||
free(in);
|
||||
|
@ -1,8 +1,8 @@
|
||||
#include "string_utils.h"
|
||||
#include <iostream>
|
||||
#include <openssl/evp.h>
|
||||
#include <iomanip>
|
||||
#include <openssl/hmac.h>
|
||||
#include <random>
|
||||
|
||||
std::string lower_and_no_special_chars(const std::string & str) {
|
||||
std::stringstream ss;
|
||||
@ -19,6 +19,10 @@ std::string lower_and_no_special_chars(const std::string & str) {
|
||||
}
|
||||
|
||||
void StringUtils::unicode_normalize(std::string & str) const {
|
||||
if(str.empty()) {
|
||||
return ;
|
||||
}
|
||||
|
||||
std::stringstream out;
|
||||
|
||||
for (char *s = &str[0]; *s;) {
|
||||
@ -49,16 +53,16 @@ void StringUtils::unicode_normalize(std::string & str) const {
|
||||
}
|
||||
}
|
||||
|
||||
str.assign(lower_and_no_special_chars(out.str()));
|
||||
str = lower_and_no_special_chars(out.str());
|
||||
}
|
||||
|
||||
std::string StringUtils::randstring(size_t length, uint64_t seed) {
|
||||
std::string StringUtils::randstring(size_t length) {
|
||||
static auto& chrs = "0123456789"
|
||||
"abcdefghijklmnopqrstuvwxyz"
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
||||
|
||||
thread_local static std::mt19937 rg(seed);
|
||||
thread_local static std::uniform_int_distribution<std::string::size_type> pick(0, sizeof(chrs) - 2);
|
||||
thread_local std::mt19937 rg(std::random_device{}());
|
||||
thread_local std::uniform_int_distribution<uint32_t> pick(0, sizeof(chrs) - 2);
|
||||
|
||||
std::string s;
|
||||
s.reserve(length);
|
||||
|
@ -27,9 +27,9 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result
|
||||
|
||||
rusage r_usage;
|
||||
getrusage(RUSAGE_SELF, &r_usage);
|
||||
result["memory_used_process_bytes"] = r_usage.ru_maxrss * 1000;
|
||||
result["typesense_memory_used_bytes"] = r_usage.ru_maxrss * 1000;
|
||||
|
||||
uint64_t memory_free_bytes = 0;
|
||||
uint64_t memory_available_bytes = 0;
|
||||
uint64_t memory_total_bytes = 0;
|
||||
|
||||
#ifdef __APPLE__
|
||||
@ -42,7 +42,7 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result
|
||||
if (KERN_SUCCESS == host_page_size(mach_port, &mach_page_size) &&
|
||||
KERN_SUCCESS == host_statistics64(mach_port, HOST_VM_INFO,
|
||||
(host_info64_t)&vm_stats, &count)) {
|
||||
memory_free_bytes = (int64_t)(vm_stats.free_count) * (int64_t)mach_page_size;
|
||||
memory_available_bytes = (int64_t)(vm_stats.free_count) * (int64_t)mach_page_size;
|
||||
}
|
||||
|
||||
uint64_t pages = sysconf(_SC_PHYS_PAGES);
|
||||
@ -51,11 +51,11 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result
|
||||
#elif __linux__
|
||||
struct sysinfo sys_info;
|
||||
sysinfo(&sys_info);
|
||||
memory_free_bytes = sys_info.freeram;
|
||||
memory_available_bytes = linux_get_mem_available_bytes();
|
||||
memory_total_bytes = sys_info.totalram;
|
||||
#endif
|
||||
|
||||
result["memory_free_bytes"] = memory_free_bytes;
|
||||
result["memory_available_bytes"] = memory_available_bytes;
|
||||
result["memory_total_bytes"] = memory_total_bytes;
|
||||
|
||||
// CPU METRICS
|
||||
|
@ -133,35 +133,45 @@ int init_logger(Config & config, const std::string & server_version) {
|
||||
signal(SIGILL, catch_crash);
|
||||
signal(SIGSEGV, catch_crash);
|
||||
|
||||
logging::LoggingSettings log_settings;
|
||||
|
||||
// we can install new signal handlers only after overriding above
|
||||
signal(SIGINT, catch_interrupt);
|
||||
signal(SIGTERM, catch_interrupt);
|
||||
|
||||
google::InitGoogleLogging("typesense");
|
||||
|
||||
std::string log_dir = config.get_log_dir();
|
||||
std::string log_path;
|
||||
|
||||
if(log_dir.empty()) {
|
||||
// use console logger if log dir is not specified
|
||||
log_settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;
|
||||
FLAGS_logtostderr = true;
|
||||
} else {
|
||||
if(!directory_exists(log_dir)) {
|
||||
std::cerr << "Typesense failed to start. " << "Log directory " << log_dir << " does not exist.";
|
||||
return 1;
|
||||
}
|
||||
|
||||
log_settings.logging_dest = logging::LOG_TO_FILE;
|
||||
log_path = log_dir + "/" + "typesense.log";
|
||||
log_settings.log_file = log_path.c_str();
|
||||
// flush log levels above -1 immediately (INFO=0)
|
||||
FLAGS_logbuflevel = -1;
|
||||
|
||||
LOG(INFO) << "Starting Typesense " << server_version << ". Log directory is configured as: "
|
||||
<< log_dir << std::endl;
|
||||
// available only on glog master (ensures that log file name is constant)
|
||||
FLAGS_timestamp_in_logfile_name = false;
|
||||
|
||||
std::string log_path = log_dir + "/" + "typesense.log";
|
||||
|
||||
// will log levels INFO **and above** to the given log file
|
||||
google::SetLogDestination(google::INFO, log_path.c_str());
|
||||
|
||||
// don't create symlink for INFO log
|
||||
google::SetLogSymlink(google::INFO, "");
|
||||
|
||||
// don't create separate log files for each level
|
||||
google::SetLogDestination(google::WARNING, "");
|
||||
google::SetLogDestination(google::ERROR, "");
|
||||
google::SetLogDestination(google::FATAL, "");
|
||||
|
||||
std::cout << "Log directory is configured as: " << log_dir << std::endl;
|
||||
}
|
||||
|
||||
logging::InitLogging(log_settings);
|
||||
logging::SetMinLogLevel(0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -275,7 +285,6 @@ int start_raft_server(ReplicationState& replication_state, const std::string& st
|
||||
}
|
||||
|
||||
int run_server(const Config & config, const std::string & version, void (*master_server_routes)()) {
|
||||
|
||||
LOG(INFO) << "Starting Typesense " << version << std::flush;
|
||||
quit_raft_service = false;
|
||||
|
||||
|
@ -15,7 +15,7 @@ protected:
std::vector<sort_by> sort_fields;

void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/collection_sorting";
std::string state_dir_path = "/tmp/typesense_test/collection_faceting";
LOG(INFO) << "Truncating and creating: " << state_dir_path;
system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str());

@ -29,6 +29,7 @@ protected:
}

virtual void TearDown() {
collectionManager.dispose();
delete store;
}
};
test/collection_grouping_test.cpp (new file, 357 lines)
@ -0,0 +1,357 @@
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
#include <collection_manager.h>
#include "collection.h"

class CollectionGroupingTest : public ::testing::Test {
protected:
Store *store;
CollectionManager & collectionManager = CollectionManager::get_instance();
Collection *coll_group;

void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/collection_grouping";
LOG(INFO) << "Truncating and creating: " << state_dir_path;
system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str());

store = new Store(state_dir_path);
collectionManager.init(store, 4, "auth_key");
collectionManager.load();

std::vector<field> fields = {
field("title", field_types::STRING, false),
field("brand", field_types::STRING, true, true),
field("size", field_types::INT32, true, false),
field("colors", field_types::STRING_ARRAY, true, false),
field("rating", field_types::FLOAT, true, false)
};

coll_group = collectionManager.get_collection("coll_group");
if(coll_group == nullptr) {
coll_group = collectionManager.create_collection("coll_group", fields, "rating").get();
}

std::ifstream infile(std::string(ROOT_DIR)+"test/group_documents.jsonl");

std::string json_line;

while (std::getline(infile, json_line)) {
auto add_op = coll_group->add(json_line);
if(!add_op.ok()) {
std::cout << add_op.error() << std::endl;
}
ASSERT_TRUE(add_op.ok());
}

infile.close();
}

virtual void SetUp() {
setupCollection();
}

virtual void TearDown() {
collectionManager.dispose();
delete store;
}
};

TEST_F(CollectionGroupingTest, GroupingBasics) {
// group by size (int32)
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"size"}, 2).get();

ASSERT_EQ(3, res["found"].get<size_t>());
ASSERT_EQ(3, res["grouped_hits"].size());
ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get<size_t>());

ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
ASSERT_EQ(11, res["grouped_hits"][0]["hits"][0]["document"]["size"].get<size_t>());
ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("1", res["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());

ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());

ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("2", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());

ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());

ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());

ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());

ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());

// group by rating (float) and sort by size
std::vector<sort_by> sort_size = {sort_by("size", "DESC")};
res = coll_group->search("*", {}, "", {"brand"}, sort_size, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
"", 10,
{}, {}, {"rating"}, 2).get();

// 7 unique ratings
ASSERT_EQ(7, res["found"].get<size_t>());
ASSERT_EQ(7, res["grouped_hits"].size());
ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["group_key"][0].get<float>());

ASSERT_EQ(12, res["grouped_hits"][0]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("8", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());

ASSERT_EQ(12, res["grouped_hits"][1]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("6", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get<float>());

ASSERT_EQ(11, res["grouped_hits"][1]["hits"][1]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("1", res["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get<float>());

ASSERT_EQ(10, res["grouped_hits"][5]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("9", res["grouped_hits"][5]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.1, res["grouped_hits"][5]["hits"][0]["document"]["rating"].get<float>());

ASSERT_EQ(10, res["grouped_hits"][6]["hits"][0]["document"]["size"].get<uint32_t>());
ASSERT_STREQ("0", res["grouped_hits"][6]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][6]["hits"][0]["document"]["rating"].get<float>());

ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Omeg</mark>a", res["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>().c_str());
}
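The positional `search()` arguments used throughout these tests are hard to read; the last two are the group-by fields and the group limit. A short sketch of the same call with named locals, assuming the same `Collection::search` overload used in the tests above:

```
// group_by_fields: facetable fields whose values form the group key
// group_limit: max hits returned per group (the tests below show it is
//              validated to lie between 1 and 99)
std::vector<std::string> group_by_fields = {"size"};
size_t group_limit = 2;

auto result = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
                                 false, Index::DROP_TOKENS_THRESHOLD,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30,
                                 "", 10,
                                 {}, {}, group_by_fields, group_limit).get();
// grouped results are returned under result["grouped_hits"], one entry per group
```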
TEST_F(CollectionGroupingTest, GroupingCompoundKey) {
// group by size+brand (int32, string)
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"size", "brand"}, 2).get();

ASSERT_EQ(10, res["found"].get<size_t>());
ASSERT_EQ(10, res["grouped_hits"].size());
ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get<size_t>());

ASSERT_STREQ("Beta", res["grouped_hits"][0]["group_key"][1].get<std::string>().c_str());

// optional field should have no value in the group key component
ASSERT_EQ(1, res["grouped_hits"][5]["group_key"].size());
ASSERT_STREQ("10", res["grouped_hits"][5]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("11", res["grouped_hits"][5]["hits"][1]["document"]["id"].get<std::string>().c_str());

ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size());
ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());

ASSERT_EQ(1, res["grouped_hits"][1]["hits"].size());
ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());

ASSERT_EQ(2, res["grouped_hits"][2]["hits"].size());
ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("3", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());

ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());

ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());

ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());

ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());

// pagination with page=2, per_page=2
res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"size", "brand"}, 2).get();

// 3rd result from previous assertion will be in the first position
ASSERT_EQ(2, res["grouped_hits"][0]["hits"].size());
ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
ASSERT_STREQ("3", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get<float>());
ASSERT_STREQ("0", res["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());

// total count and facet counts should be the same
ASSERT_EQ(10, res["found"].get<size_t>());
ASSERT_EQ(2, res["grouped_hits"].size());
ASSERT_EQ(10, res["grouped_hits"][0]["group_key"][0].get<size_t>());
ASSERT_STREQ("Omega", res["grouped_hits"][0]["group_key"][1].get<std::string>().c_str());

ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());

ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());

ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());

ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());

// respect min and max grouping limit (greater than 0 and less than 99)
auto res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
"", 10,
{}, {}, {"rating"}, 100);

ASSERT_FALSE(res_op.ok());
ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str());

res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
"", 10,
{}, {}, {"rating"}, 0);

ASSERT_FALSE(res_op.ok());
ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str());
}
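The two failing calls above pin down the accepted range for the group limit. The validation itself is not part of this diff; a plausible guard that would produce exactly the error asserted above might look like the following fragment (the use of `Option` with an HTTP-style code is an assumption for illustration):

```
// Sketch of a range check consistent with the test assertions above.
if(group_limit < 1 || group_limit > 99) {
    return Option<nlohmann::json>(400, "Value of `group_limit` must be between 1 and 99.");
}
```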
TEST_F(CollectionGroupingTest, GroupingWithGropLimitOfOne) {
auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"brand"}, 1).get();

ASSERT_EQ(5, res["found"].get<size_t>());
ASSERT_EQ(5, res["grouped_hits"].size());

// all hits array must be of size 1
for(auto i=0; i<5; i++) {
ASSERT_EQ(1, res["grouped_hits"][i]["hits"].size());
}

ASSERT_STREQ("4", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("2", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("10", res["grouped_hits"][3]["hits"][0]["document"]["id"].get<std::string>().c_str()); // unbranded
ASSERT_STREQ("9", res["grouped_hits"][4]["hits"][0]["document"]["id"].get<std::string>().c_str());

// facet counts should each be 1, including unbranded
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());

for(size_t i=0; i < 4; i++) {
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][i]["count"]);
}
}

TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) {
nlohmann::json override_json_include = {
{"id", "include-rule"},
{
"rule", {
{"query", "shirt"},
{"match", override_t::MATCH_EXACT}
}
}
};

override_json_include["includes"] = nlohmann::json::array();
override_json_include["includes"][0] = nlohmann::json::object();
override_json_include["includes"][0]["id"] = "11";
override_json_include["includes"][0]["position"] = 1;

override_json_include["includes"][1] = nlohmann::json::object();
override_json_include["includes"][1]["id"] = "10";
override_json_include["includes"][1]["position"] = 1;

nlohmann::json override_json_exclude = {
{"id", "exclude-rule"},
{
"rule", {
{"query", "shirt"},
{"match", override_t::MATCH_EXACT}
}
}
};
override_json_exclude["excludes"] = nlohmann::json::array();
override_json_exclude["excludes"][0] = nlohmann::json::object();
override_json_exclude["excludes"][0]["id"] = "2";

override_t override1(override_json_include);
override_t override2(override_json_exclude);
Option<uint32_t> ov1_op = coll_group->add_override(override1);
Option<uint32_t> ov2_op = coll_group->add_override(override2);

ASSERT_TRUE(ov1_op.ok());
ASSERT_TRUE(ov2_op.ok());

auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30,
"", 10,
{}, {}, {"colors"}, 2).get();

ASSERT_EQ(4, res["found"].get<size_t>());
ASSERT_EQ(4, res["grouped_hits"].size());

ASSERT_EQ(1, res["grouped_hits"][0]["group_key"][0].size());
ASSERT_STREQ("white", res["grouped_hits"][0]["group_key"][0][0].get<std::string>().c_str());

ASSERT_STREQ("11", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("10", res["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());

ASSERT_STREQ("5", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());

ASSERT_STREQ("4", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());

ASSERT_EQ(1, res["grouped_hits"][3]["hits"].size());
ASSERT_STREQ("8", res["grouped_hits"][3]["hits"][0]["document"]["id"].get<std::string>().c_str());

// assert facet counts
ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][0]["count"]);
ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());

ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][1]["count"]);
ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());

ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());

ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
}
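One detail worth calling out from the test above: when the group-by field is an array (`colors` here), the corresponding `group_key` component is itself an array of values rather than a scalar. A tiny fragment reading such a key, with the same JSON shape the assertions above rely on:

```
// group_key has one entry per group-by field; for an array field that
// entry is itself an array of values, e.g. ["white"].
auto key_component = res["grouped_hits"][0]["group_key"][0];
ASSERT_TRUE(key_component.is_array());
ASSERT_STREQ("white", key_component[0].get<std::string>().c_str());
```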
@ -40,6 +40,7 @@ protected:

virtual void TearDown() {
collectionManager.drop_collection("collection1");
collectionManager.dispose();
delete store;
}
};

@ -18,7 +18,7 @@ protected:
system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str());

store = new Store(state_dir_path);
collectionManager.init(store, 1, "auth_key");
collectionManager.init(store, 4, "auth_key");
collectionManager.load();

std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
@ -49,6 +49,7 @@ protected:

virtual void TearDown() {
collectionManager.drop_collection("coll_mul_fields");
collectionManager.dispose();
delete store;
}
};
@ -121,6 +122,11 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeExactQueryMatch) {
ASSERT_STREQ("3", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get<std::string>().c_str());

// curated results should be marked as such
ASSERT_EQ(true, results["hits"][0]["curated"].get<bool>());
ASSERT_EQ(true, results["hits"][1]["curated"].get<bool>());
ASSERT_EQ(0, results["hits"][2].count("curated"));

coll_mul_fields->remove_override("exclude-rule");
coll_mul_fields->remove_override("include-rule");

@ -219,6 +225,8 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: scott").get();

ASSERT_EQ(9, results["found"].get<size_t>());

// "count" would be `2` without exclusion
ASSERT_EQ("<mark>Scott</mark> Glenn", results["facet_counts"][0]["counts"][0]["highlighted"].get<std::string>());
ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get<size_t>());
@ -233,7 +241,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: scott").get();

ASSERT_EQ(10, results["found"].get<size_t>());
ASSERT_EQ(9, results["found"].get<size_t>());
ASSERT_EQ(0, results["hits"].size());

coll_mul_fields->remove_override("exclude-rule");
@ -246,7 +254,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "").get();

ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

@ -254,10 +262,9 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) {
}

TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
std::map<std::string, size_t> pinned_hits;
std::vector<std::string> hidden_hits;
pinned_hits["13"] = 1;
pinned_hits["4"] = 2;
std::map<size_t, std::vector<std::string>> pinned_hits;
pinned_hits[1] = {"13"};
pinned_hits[2] = {"4"};

// basic pinning
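The hunk above changes the pinned-hits container from a document-id-to-position map to a position-to-document-ids map, so several IDs can now share the same pinned slot (which the grouping test later in this diff relies on). A small sketch of building the new structure, following the values used in these tests:

```
#include <map>
#include <string>
#include <vector>

// position (1-based) -> document IDs pinned at that position
std::map<size_t, std::vector<std::string>> pinned_hits;
pinned_hits[1] = {"6", "8"};   // two documents can occupy the same pinned slot
pinned_hits[2] = {"1"};

// IDs to hide from the results entirely
std::vector<std::string> hidden_hits = {"11", "16"};
```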
@ -277,6 +284,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {

// both pinning and hiding

std::vector<std::string> hidden_hits;
hidden_hits = {"11", "16"};
results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
@ -289,6 +297,21 @@
ASSERT_STREQ("4", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("6", results["hits"][2]["document"]["id"].get<std::string>().c_str());

// paginating such that pinned hits appear on second page
pinned_hits.clear();
pinned_hits[4] = {"13"};
pinned_hits[5] = {"4"};

results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
"", 10,
pinned_hits, hidden_hits).get();

ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("13", results["hits"][1]["document"]["id"].get<std::string>().c_str());

// take precedence over override rules

nlohmann::json override_json_include = {
@ -325,4 +348,60 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) {
ASSERT_EQ(8, results["found"].get<size_t>());
ASSERT_STREQ("8", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("6", results["hits"][1]["document"]["id"].get<std::string>().c_str());
}

TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
std::map<size_t, std::vector<std::string>> pinned_hits;
pinned_hits[1] = {"6", "8"};
pinned_hits[2] = {"1"};
pinned_hits[3] = {"13", "4"};

// without any grouping parameter, only the first ID in a position should be picked
// and other IDs should appear in their original positions

auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
"", 10,
pinned_hits, {}).get();

ASSERT_EQ(10, results["found"].get<size_t>());
ASSERT_STREQ("6", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("11", results["hits"][3]["document"]["id"].get<std::string>().c_str());

// pinned hits should be marked as curated
ASSERT_EQ(true, results["hits"][0]["curated"].get<bool>());
ASSERT_EQ(true, results["hits"][1]["curated"].get<bool>());
ASSERT_EQ(true, results["hits"][2]["curated"].get<bool>());
ASSERT_EQ(0, results["hits"][3].count("curated"));

// with grouping

results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
false, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
"", 10,
pinned_hits, {}, {"cast"}, 2).get();

ASSERT_EQ(8, results["found"].get<size_t>());

ASSERT_EQ(1, results["grouped_hits"][0]["group_key"].size());
ASSERT_EQ(2, results["grouped_hits"][0]["group_key"][0].size());
ASSERT_STREQ("Chris Evans", results["grouped_hits"][0]["group_key"][0][0].get<std::string>().c_str());
ASSERT_STREQ("Scarlett Johansson", results["grouped_hits"][0]["group_key"][0][1].get<std::string>().c_str());

ASSERT_STREQ("6", results["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("8", results["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());

ASSERT_STREQ("1", results["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());

ASSERT_STREQ("13", results["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("4", results["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());

ASSERT_STREQ("11", results["grouped_hits"][3]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("16", results["grouped_hits"][4]["hits"][0]["document"]["id"].get<std::string>().c_str());
}
@ -29,6 +29,7 @@ protected:
}

virtual void TearDown() {
collectionManager.dispose();
delete store;
}
};
@ -248,6 +249,7 @@ TEST_F(CollectionSortingTest, ThreeSortFieldsLimit) {
sort_by("min", "DESC"),
};

query_fields = {"title"};
auto res_op = coll1->search("the", query_fields, "", {}, sort_fields_desc, 0, 10, 1, FREQUENCY, false);

ASSERT_FALSE(res_op.ok());

@ -56,6 +56,7 @@ protected:

virtual void TearDown() {
collectionManager.drop_collection("collection");
collectionManager.dispose();
delete store;
}
};
@ -455,6 +456,18 @@ TEST_F(CollectionTest, WildcardQuery) {
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}

// wildcard query should not require a search field
results_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);
ASSERT_TRUE(results_op.ok());
results = results_op.get();
ASSERT_EQ(3, results["hits"].size());
ASSERT_EQ(25, results["found"].get<uint32_t>());

// non-wildcard query should require a search field
results_op = collection->search("the", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);
ASSERT_FALSE(results_op.ok());
ASSERT_STREQ("No search fields specified for the query.", results_op.error().c_str());
}

TEST_F(CollectionTest, PrefixSearching) {
@ -2259,7 +2272,6 @@ TEST_F(CollectionTest, OptionalFields) {

// try fetching the schema (should contain optional field)
nlohmann::json coll_summary = coll1->get_summary_json();
LOG(INFO) << coll_summary;
ASSERT_STREQ("title", coll_summary["fields"][0]["name"].get<std::string>().c_str());
ASSERT_STREQ("string", coll_summary["fields"][0]["type"].get<std::string>().c_str());
ASSERT_FALSE(coll_summary["fields"][0]["facet"].get<bool>());
test/group_documents.jsonl (new file, 12 lines)
@ -0,0 +1,12 @@
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 10, "colors": ["white", "blue"], "rating": 4.5}
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 11, "colors": ["white", "blue"], "rating": 4.3}
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 12, "colors": ["white", "blue"], "rating": 4.6}
{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 10, "colors": ["blue"], "rating": 4.6}
{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 10, "colors": ["white", "blue"], "rating": 4.8}
{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 11, "colors": ["blue"], "rating": 4.8}
{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 12, "colors": ["white", "blue"], "rating": 4.3}
{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 10, "colors": ["white", "blue"], "rating": 4.3}
{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 12, "colors": ["white", "red"], "rating": 4.4}
{"title": "Zeta Casual Shirt", "brand": "Zeta", "size": 10, "colors": ["white", "blue"], "rating": 4.1}
{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 4.3}
{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 3.3}
@ -54,3 +54,8 @@ TEST(StringUtilsTest, HMAC) {
std::string digest1 = StringUtils::hmac("KeyVal", "{\"filter_by\": \"user_id:1080\"}");
ASSERT_STREQ("IvjqWNZ5M5ElcvbMoXj45BxkQrZG4ZKEaNQoRioCx2s=", digest1.c_str());
}

TEST(StringUtilsTest, UInt32Validation) {
std::string big_num = "99999999999999999999999999999999";
ASSERT_FALSE(StringUtils::is_uint32_t(big_num));
}
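The new test only pins down the overflow case; the body of `StringUtils::is_uint32_t` is not shown in this diff. A minimal sketch of a check with the same observable behaviour (all-digit input that also fits into 32 bits), written as an illustration rather than the actual implementation:

```
#include <cctype>
#include <cstdint>
#include <string>

// Illustrative only: reject empty or non-digit input and anything above UINT32_MAX.
static bool is_uint32_sketch(const std::string & s) {
    if(s.empty() || s.size() > 10) {   // UINT32_MAX has 10 decimal digits
        return false;
    }
    uint64_t value = 0;
    for(char c : s) {
        if(!std::isdigit(static_cast<unsigned char>(c))) {
            return false;
        }
        value = value * 10 + (c - '0');
    }
    return value <= UINT32_MAX;
}
```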
@ -36,7 +36,8 @@ TEST(TopsterTest, MaxIntValues) {
scores[1] = data[i].primary_attr;
scores[2] = data[i].secondary_attr;

topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores);
topster.add(&kv);
}

topster.sort();
@ -87,7 +88,8 @@ TEST(TopsterTest, MaxFloatValues) {
scores[1] = Index::float_to_in64_t(data[i].primary_attr);
scores[2] = data[i].secondary_attr;

topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores);
topster.add(&kv);
}

topster.sort();
@ -97,4 +99,64 @@ TEST(TopsterTest, MaxFloatValues) {
for(uint32_t i = 0; i < topster.size; i++) {
EXPECT_EQ(ids[i], topster.getKeyAt(i));
}
}

TEST(TopsterTest, DistinctIntValues) {
Topster dist_topster(5, 2);

struct {
uint8_t field_id;
uint16_t query_index;
uint64_t distinct_key;
uint64_t match_score;
int64_t primary_attr;
int64_t secondary_attr;
} data[14] = {
{1, 0, 1, 11, 20, 30},
{1, 0, 1, 12, 20, 32},
{1, 0, 2, 4, 20, 30},
{1, 2, 3, 7, 20, 30},
{1, 0, 4, 14, 20, 30},
{1, 1, 5, 9, 20, 30},
{1, 1, 5, 10, 20, 32},
{1, 1, 5, 9, 20, 30},
{1, 0, 6, 6, 20, 30},
{1, 2, 7, 6, 22, 30},
{1, 2, 7, 6, 22, 30},
{1, 1, 8, 9, 20, 30},
{1, 0, 9, 8, 20, 30},
{1, 3, 10, 5, 20, 30},
};

for(int i = 0; i < 14; i++) {
int64_t scores[3];
scores[0] = int64_t(data[i].match_score);
scores[1] = data[i].primary_attr;
scores[2] = data[i].secondary_attr;

KV kv(data[i].field_id, data[i].query_index, i+100, data[i].distinct_key, data[i].match_score, scores);
dist_topster.add(&kv);
}

dist_topster.sort();

std::vector<uint64_t> distinct_ids = {4, 1, 5, 8, 9};

for(uint32_t i = 0; i < dist_topster.size; i++) {
EXPECT_EQ(distinct_ids[i], dist_topster.getDistinctKeyAt(i));

if(distinct_ids[i] == 1) {
EXPECT_EQ(12, (int) dist_topster.getKV(i)->match_score);
EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size);
EXPECT_EQ(12, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score);
EXPECT_EQ(11, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score);
}

if(distinct_ids[i] == 5) {
EXPECT_EQ(10, (int) dist_topster.getKV(i)->match_score);
EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size);
EXPECT_EQ(10, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score);
EXPECT_EQ(9, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score);
}
}
}
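The DistinctIntValues test above suggests how to read the new distinct-aware Topster: the top-level structure ranks one representative entry per `distinct_key`, while `group_kv_map` keeps a per-key child Topster holding up to the requested number of best entries for that key. The fragment below is only a reading of the members exercised in that test (it assumes the project's `Topster`/`KV` types from `topster.h` and is not the implementation itself):

```
// Interpretation of the assertions above, not the actual class internals.
// dist_topster was constructed as Topster(5, 2): at most 5 groups overall,
// with the best 2 entries retained per group.
dist_topster.sort();

for (uint32_t i = 0; i < dist_topster.size; i++) {
    uint64_t group_key = dist_topster.getDistinctKeyAt(i);   // one entry per group
    Topster* group = dist_topster.group_kv_map[group_key];   // child topster for that group

    for (uint32_t j = 0; j < group->size; j++) {
        const KV* kv = group->getKV(j);   // entries within the group, best match_score first
        (void) kv;
    }
}
```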