diff --git a/.circleci/config.yml b/.circleci/config.yml index 63d2d996..a55fe2d4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 jobs: build: docker: - - image: typesense/typesense-development:22-MAR-2020-5 + - image: typesense/typesense-development:23-JUNE-2020-1 environment: - PROJECT_DIR: /typesense - TYPESENSE_VERSION: $CIRCLE_BRANCH-$CIRCLE_SHA1 @@ -16,7 +16,7 @@ jobs: keys: - external-Linux-cache-{{ .Branch }}-{{ checksum "last-changed-git-sha-for-dependency-listing" }} - external-Linux-cache-{{ .Branch }} - - external-Linux-cache + - external-Linux-cache - run: name: build command: $PROJECT_DIR/build.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index b8c4e9f8..b5afc665 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ FIND_PACKAGE(ICU REQUIRED) FIND_PACKAGE(Protobuf REQUIRED) FIND_PACKAGE(LevelDB REQUIRED) FIND_PACKAGE(gflags REQUIRED) +FIND_PACKAGE(glog REQUIRED) message("OpenSSL library: ${OPENSSL_LIBRARIES}") @@ -53,11 +54,6 @@ include(cmake/GoogleTest.cmake) include(cmake/TestResources.cmake) include(cmake/Iconv.cmake) -if (APPLE) - include(cmake/brpc.cmake) - include(cmake/braft.cmake) -endif() - FILE(GLOB SRC_FILES src/*.cpp) FILE(GLOB TEST_FILES test/*.cpp) @@ -87,8 +83,10 @@ link_directories(${DEP_ROOT_DIR}/${FOR_NAME}) link_directories(${DEP_ROOT_DIR}/${H2O_NAME}/build) link_directories(${DEP_ROOT_DIR}/${ROCKSDB_NAME}) link_directories(${DEP_ROOT_DIR}/${ICONV_NAME}/lib/.libs) +if (APPLE) link_directories(${DEP_ROOT_DIR}/${BRPC_NAME}/lib) link_directories(${DEP_ROOT_DIR}/${BRAFT_NAME}/lib) +endif() # Write dependency libraries to a file file(WRITE ${DEP_ROOT_DIR}/libs.txt "") @@ -152,7 +150,7 @@ set(ICU_ALL_LIBRARIES ${ICU_I18N_LIBRARIES} ${ICU_LIBRARIES} ${ICU_DATA_LIBRARIE set(CORE_LIBS h2o-evloop iconv ${CURL_LIBRARIES} for ${ICU_ALL_LIBRARIES} ${G3LOGGER_LIBRARIES} ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} pthread dl ${STD_LIB}) -set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) +set(CORE_LIBS braft brpc ${LevelDB_LIBRARIES} glog ${CORE_LIBS} ${GFLAGS_LIBRARIES} ${PROTOBUF_LIBRARIES} ${SYSTEM_LIBS}) target_link_libraries(typesense-core ${CORE_LIBS}) target_link_libraries(typesense-server ${CORE_LIBS}) diff --git a/README.md b/README.md index d4f7cbfe..c0463bbd 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Here's a quick example showcasing how you can create a collection, index a docum Let's begin by starting the Typesense server via Docker: ``` -docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.13.0 --data-dir /data --api-key=Hu52dwsas2AdxdE +docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.14.0 --data-dir /data --api-key=Hu52dwsas2AdxdE ``` Install the Python client for Typesense (we have [clients](https://typesense.org/api/#api-clients) for other languages too): @@ -146,7 +146,7 @@ works without turning many knobs. **Speed is great, but what about the memory footprint?** -A fresh Typesense server will take less than 5 MB of memory. As you start indexing documents, the memory use will +A fresh Typesense server will consume about 30 MB of memory. As you start indexing documents, the memory use will increase correspondingly. How much it increases depends on the number and type of fields you index. We've strived to keep the in-memory data structures lean. 
To give you a rough idea: when 1 million diff --git a/TODO.md index 69298247..8c3bed2d 100644 --- a/TODO.md +++ b/TODO.md @@ -97,6 +97,10 @@ - ~~Have a LOG(ERROR) level~~ - ~~Handle SIGTERM which is sent when process is killed~~ - ~~Use snappy compression for storage~~ +- ~~Fix exclude_scalar early returns~~ +- ~~Fix result ids length during grouped overrides~~ +- ~~Fix override grouping (collate_included_ids)~~ +- ~~Test for overriding result on second page~~ - at least 1 token match for proceeding with drop tokens - support wildcard query with filters - API for optimizing on disk storage diff --git a/cmake/braft.cmake b/cmake/braft.cmake index 082f95c9..b72a1e25 100644 --- a/cmake/braft.cmake +++ b/cmake/braft.cmake @@ -1,4 +1,4 @@ -set(BRAFT_VERSION 0a9ec3f) +set(BRAFT_VERSION fb27e63) set(BRAFT_NAME braft-${BRAFT_VERSION}) set(BRAFT_TAR_PATH ${DEP_ROOT_DIR}/${BRAFT_NAME}.tar.gz) diff --git a/cmake/brpc.cmake b/cmake/brpc.cmake index 4c97adf1..9ea1927a 100644 --- a/cmake/brpc.cmake +++ b/cmake/brpc.cmake @@ -1,4 +1,4 @@ -set(BRPC_VERSION 23c66e3) +set(BRPC_VERSION 2f8fc37d) set(BRPC_NAME brpc-${BRPC_VERSION}) set(BRPC_TAR_PATH ${DEP_ROOT_DIR}/${BRPC_NAME}.tar.gz) diff --git a/docker-build.sh b/docker-build.sh index 16bd3f26..e2ab7d71 100755 --- a/docker-build.sh +++ b/docker-build.sh @@ -22,13 +22,13 @@ if [[ "$@" == *"--depclean"* ]]; then fi #echo "Creating development image..." -#docker build --file $PROJECT_DIR/docker/development.Dockerfile --tag typesense/typesense-development:23-JUNE-2020-1 $PROJECT_DIR/docker echo "Building Typesense $TYPESENSE_VERSION..." -docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ +docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ -DCMAKE_BUILD_TYPE=Release -H/typesense -B/typesense/$BUILD_DIR -docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development make typesense-server -C/typesense/$BUILD_DIR +docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 make typesense-server -C/typesense/$BUILD_DIR if [[ "$@" == *"--build-deploy-image"* ]]; then echo "Creating deployment image for Typesense $TYPESENSE_VERSION server ..."
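The newly struck-off TODO item "Fix exclude_scalar early returns" corresponds to the src/array_utils.cpp hunk further down in this patch, where the early returns now assign *out before returning. A self-contained sketch of the corrected contract, assuming both inputs are ascending-sorted ID lists; the implementation below is illustrative, not the library's:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>

// Illustrative version of the fixed exclude_scalar contract: *out is assigned
// on every path, so a caller can always delete[] it safely.
size_t exclude_scalar(const uint32_t* A, size_t lenA,
                      const uint32_t* B, size_t lenB, uint32_t** out) {
    if(A == nullptr) {
        *out = nullptr;   // previously left unassigned on this early return
        return 0;
    }

    if(B == nullptr || lenB == 0) {
        *out = new uint32_t[lenA];
        std::memcpy(*out, A, lenA * sizeof(uint32_t));
        return lenA;
    }

    *out = new uint32_t[lenA];
    size_t res_index = 0, j = 0;
    for(size_t i = 0; i < lenA; i++) {
        while(j < lenB && B[j] < A[i]) j++;                    // advance exclusion list
        if(j == lenB || B[j] != A[i]) (*out)[res_index++] = A[i];
    }
    return res_index;
}

int main() {
    uint32_t a[] = {1, 3, 5, 7};
    uint32_t b[] = {3, 7};
    uint32_t* out = nullptr;
    size_t len = exclude_scalar(a, 4, b, 2, &out);
    for(size_t i = 0; i < len; i++) std::cout << out[i] << " "; // prints: 1 5
    std::cout << "\n";
    delete[] out;
}
```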
@@ -60,7 +60,7 @@ if [[ "$@" == *"--package-libs"* ]]; then fi # #if [[ "$@" == *"--create-deb-upload"* ]]; then -# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ +# docker run -it -v $PROJECT_DIR:/typesense typesense/typesense-development:23-JUNE-2020-1 cmake -DTYPESENSE_VERSION=$TYPESENSE_VERSION \ # -DCMAKE_BUILD_TYPE=Debug -H/typesense -B/typesense/$BUILD_DIR #fi diff --git a/docker/development.Dockerfile b/docker/development.Dockerfile index 790a3246..ad2acb33 100644 --- a/docker/development.Dockerfile +++ b/docker/development.Dockerfile @@ -55,25 +55,34 @@ ADD https://github.com/protocolbuffers/protobuf/releases/download/v3.11.4/protob RUN tar -C /opt -xf /opt/protobuf-cpp-3.11.4.tar.gz && chown -R root:root /opt/protobuf-3.11.4 RUN cd /opt/protobuf-3.11.4 && ./configure --disable-shared && make -j8 && make check && make install && rm -rf /usr/local/lib/*.so* -ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz.tar.gz -RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz.tar.gz +ADD https://github.com/google/leveldb/archive/1.22.tar.gz /opt/leveldb-1.22.tar.gz +RUN tar -C /opt -xf /opt/leveldb-1.22.tar.gz RUN mkdir -p /opt/leveldb-1.22/build && cd /opt/leveldb-1.22/build && cmake -DCMAKE_BUILD_TYPE=Release .. && \ cmake --build . && make install && rm -rf /usr/local/lib/*.so* +ADD https://github.com/google/glog/archive/0a2e593.tar.gz /opt/glog-0a2e593.tar.gz +RUN tar -C /opt -xf /opt/glog-0a2e593.tar.gz +RUN mkdir -p /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \ + cd /opt/glog-0a2e5931bd5ff22fd3bf8999eb8ce776f159cda6/bld && \ + cmake -DBUILD_TESTING=0 -DWITH_GFLAGS=ON -DWITH_UNWIND=OFF .. && \ + cmake --build . && make install && rm -rf /usr/local/lib/*.so* + ADD https://github.com/apache/incubator-brpc/archive/0.9.7-rc03.tar.gz /opt/brpc-0.9.7-rc03.tar.gz RUN tar -C /opt -xf /opt/brpc-0.9.7-rc03.tar.gz COPY patches/brpc_cmakelists.txt /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt RUN chown root:root /opt/incubator-brpc-0.9.7-rc03/src/CMakeLists.txt -RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/build && cd /opt/incubator-brpc-0.9.7-rc03/build && \ - cmake -DWITH_DEBUG_SYMBOLS=OFF .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ - rm -rf /opt/incubator-brpc-0.9.7-rc03/build/output/bin +RUN mkdir -p /opt/incubator-brpc-0.9.7-rc03/bld && cd /opt/incubator-brpc-0.9.7-rc03/bld && \ + cmake -DWITH_DEBUG_SYMBOLS=OFF -DWITH_GLOG=ON .. && \ + make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ + rm -rf /opt/incubator-brpc-0.9.7-rc03/bld/output/bin -ADD https://github.com/baidu/braft/archive/v1.1.0.tar.gz /opt/braft-v1.1.0.tar.gz -RUN tar -C /opt -xf /opt/braft-v1.1.0.tar.gz -COPY patches/braft_cmakelists.txt /opt/braft-1.1.0/src/CMakeLists.txt -RUN chown root:root /opt/braft-1.1.0/src/CMakeLists.txt -RUN mkdir -p /opt/braft-1.1.0/build && cd /opt/braft-1.1.0/build && \ - cmake -DWITH_DEBUG_SYMBOLS=ON .. && make -j8 && make install && rm -rf /usr/local/lib/*.so* +ADD https://github.com/baidu/braft/archive/v1.1.1.tar.gz /opt/braft-v1.1.1.tar.gz +RUN tar -C /opt -xf /opt/braft-v1.1.1.tar.gz +COPY patches/braft_cmakelists.txt /opt/braft-1.1.1/src/CMakeLists.txt +RUN chown root:root /opt/braft-1.1.1/src/CMakeLists.txt +RUN mkdir -p /opt/braft-1.1.1/bld && cd /opt/braft-1.1.1/bld && \ + cmake -DWITH_DEBUG_SYMBOLS=ON -DBRPC_WITH_GLOG=ON .. 
&& make -j8 && make install && rm -rf /usr/local/lib/*.so* && \ + rm -rf /opt/braft-1.1.1/bld/output/bin ENV CC /usr/local/gcc-6.4.0/bin/gcc ENV CXX /usr/local/gcc-6.4.0/bin/g++ diff --git a/docker/patches/brpc_cmakelists.txt b/docker/patches/brpc_cmakelists.txt index 03c05a11..687b3b7d 100644 --- a/docker/patches/brpc_cmakelists.txt +++ b/docker/patches/brpc_cmakelists.txt @@ -35,8 +35,8 @@ add_library(brpc-static STATIC $ $ $) -if(BRPC_WITH_THRIFT) - target_link_libraries(brpc-static thrift) +if(BRPC_WITH_GLOG) + target_link_libraries(brpc-static ${GLOG_LIB}) endif() SET_TARGET_PROPERTIES(brpc-static PROPERTIES OUTPUT_NAME brpc CLEAN_DIRECT_OUTPUT 1) diff --git a/include/collection.h b/include/collection.h index c821af1f..69f15832 100644 --- a/include/collection.h +++ b/include/collection.h @@ -147,7 +147,7 @@ private: std::string get_seq_id_key(uint32_t seq_id); void highlight_result(const field &search_field, const std::vector> &searched_queries, - const KV &field_order_kv, const nlohmann::json &document, + const KV* field_order_kv, const nlohmann::json &document, StringUtils & string_utils, size_t snippet_threshold, bool highlighted_fully, highlight_t &highlight); @@ -155,10 +155,10 @@ private: void remove_document(nlohmann::json & document, const uint32_t seq_id, bool remove_from_store); void populate_overrides(std::string query, - const std::map& pinned_hits, + const std::map>& pinned_hits, const std::vector& hidden_hits, - std::map & id_pos_map, - std::vector & included_ids, std::vector & excluded_ids); + std::map>& include_ids, + std::vector & excluded_ids); static bool facet_count_compare(const std::pair& a, const std::pair& b) { @@ -236,8 +236,10 @@ public: const size_t snippet_threshold = 30, const std::string & highlight_full_fields = "", size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD, - const std::map& pinned_hits={}, - const std::vector& hidden_hits={}); + const std::map>& pinned_hits={}, + const std::vector& hidden_hits={}, + const std::vector& group_by_fields={}, + const size_t group_limit = 0); Option get(const std::string & id); @@ -271,6 +273,8 @@ public: const size_t PER_PAGE_MAX = 250; + const size_t GROUP_LIMIT_MAX = 99; + // Using a $ prefix so that these meta keys stay above record entries in a lexicographically ordered KV store static constexpr const char* COLLECTION_META_PREFIX = "$CM"; static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS"; @@ -280,5 +284,9 @@ public: void facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document, std::string &value); + + void aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const; + + void populate_result_kvs(Topster *topster, std::vector> &result_kvs) const; }; diff --git a/include/field.h b/include/field.h index 95c1fcbe..44761e23 100644 --- a/include/field.h +++ b/include/field.h @@ -146,6 +146,7 @@ struct token_pos_cost_t { struct facet_count_t { uint32_t count; + spp::sparse_hash_set groups; // used for faceting grouped results // used to fetch the actual document and value for representation uint32_t doc_id; diff --git a/include/http_data.h b/include/http_data.h index 3a917956..cd306204 100644 --- a/include/http_data.h +++ b/include/http_data.h @@ -117,16 +117,16 @@ struct http_req { uint64_t route_hash; std::map params; std::string body; - uint64_t seed; + std::string metadata; - http_req(): route_hash(1), seed(random_uint64_t()) { + http_req(): route_hash(1) { } http_req(h2o_req_t* _req, const std::string & 
http_method, uint64_t route_hash, const std::map<std::string, std::string> & params, std::string body): _req(_req), http_method(http_method), route_hash(route_hash), params(params), - body(body), seed(random_uint64_t()) { + body(body) { } @@ -136,7 +136,7 @@ struct http_req { nlohmann::json content = nlohmann::json::parse(serialized_content); route_hash = content["route_hash"]; body = content["body"]; - seed = content["seed"]; + metadata = content.count("metadata") != 0 ? content["metadata"] : ""; for (nlohmann::json::iterator it = content["params"].begin(); it != content["params"].end(); ++it) { params.emplace(it.key(), it.value()); @@ -150,16 +150,10 @@ struct http_req { content["route_hash"] = route_hash; content["params"] = params; content["body"] = body; - content["seed"] = seed; + content["metadata"] = metadata; return content.dump(); } - - uint64_t random_uint64_t() { - thread_local std::mt19937 rg(std::random_device{}()); - thread_local std::uniform_int_distribution<uint64_t> pick(0, std::numeric_limits<uint64_t>::max()); - return pick(rg); - } }; struct request_response { diff --git a/include/index.h b/include/index.h index 9f2d2edd..36cae022 100644 --- a/include/index.h +++ b/include/index.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "string_utils.h" struct token_candidates { @@ -26,23 +27,27 @@ struct search_args { std::vector<std::string> search_fields; std::vector<filter> filters; std::vector<facet> facets; - std::vector<uint32_t> included_ids; + std::map<size_t, std::map<size_t, uint32_t>> included_ids; std::vector<uint32_t> excluded_ids; std::vector<sort_by> sort_fields_std; facet_query_t facet_query; int num_typos; size_t max_facet_values; - size_t max_hits; size_t per_page; size_t page; token_ordering token_order; bool prefix; size_t drop_tokens_threshold; size_t typo_tokens_threshold; - std::vector<KV> raw_result_kvs; + std::vector<std::string> group_by_fields; + size_t group_limit; size_t all_result_ids_len; + spp::sparse_hash_set<uint64_t> groups_processed; std::vector<std::vector<art_leaf*>> searched_queries; - std::vector<KV> override_result_kvs; + Topster* topster; + Topster* curated_topster; + std::vector<std::vector<KV*>> raw_result_kvs; + std::vector<std::vector<KV*>> override_result_kvs; Option<uint32_t> outcome; search_args(): outcome(0) { } search_args(std::string query, std::vector<std::string> search_fields, std::vector<filter> filters, - std::vector<facet> facets, std::vector<uint32_t> included_ids, std::vector<uint32_t> excluded_ids, + std::vector<facet> facets, std::map<size_t, std::map<size_t, uint32_t>> included_ids, std::vector<uint32_t> excluded_ids, std::vector<sort_by> sort_fields_std, facet_query_t facet_query, int num_typos, size_t max_facet_values, size_t max_hits, size_t per_page, size_t page, token_ordering token_order, bool prefix, - size_t drop_tokens_threshold, size_t typo_tokens_threshold): + size_t drop_tokens_threshold, size_t typo_tokens_threshold, + const std::vector<std::string>& group_by_fields, size_t group_limit): query(query), search_fields(search_fields), filters(filters), facets(facets), included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std), facet_query(facet_query), num_typos(num_typos), - max_facet_values(max_facet_values), max_hits(max_hits), per_page(per_page), + max_facet_values(max_facet_values), per_page(per_page), page(page), token_order(token_order), prefix(prefix), drop_tokens_threshold(drop_tokens_threshold), typo_tokens_threshold(typo_tokens_threshold), + group_by_fields(group_by_fields), group_limit(group_limit), all_result_ids_len(0), outcome(0) { + const size_t topster_size = std::max((size_t)1, max_hits); // needs to be at least 1 since scoring is mandatory + topster = new Topster(topster_size, group_limit); + curated_topster = new Topster(topster_size, group_limit); } +
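The constructor above now heap-allocates the two Topsters at max(1, max_hits) capacity, and the destructor that follows releases them; Collection::search() later does `delete index->search_params` once results are collected. A hypothetical RAII restatement of that ownership using unique_ptr and a stubbed Topster, shown only to make the lifetime explicit:

```cpp
#include <algorithm>
#include <cstddef>
#include <memory>

// Stub standing in for the real Topster from include/topster.h.
struct Topster {
    size_t max_size, distinct;
    Topster(size_t capacity, size_t distinct): max_size(capacity), distinct(distinct) {}
};

// Hypothetical unique_ptr variant of search_args' Topster ownership; the PR
// itself uses raw new/delete plus an explicit "delete index->search_params".
struct search_args_sketch {
    std::unique_ptr<Topster> topster;
    std::unique_ptr<Topster> curated_topster;

    search_args_sketch(size_t max_hits, size_t group_limit) {
        // at least 1 slot, since every search scores at least one candidate
        const size_t topster_size = std::max<size_t>(1, max_hits);
        topster = std::make_unique<Topster>(topster_size, group_limit);
        curated_topster = std::make_unique<Topster>(topster_size, group_limit);
    } // no hand-written destructor needed
};

int main() {
    search_args_sketch args(250, 3);
    return args.topster->max_size == 250 ? 0 : 1;
}
```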
+ ~search_args() { + delete topster; + delete curated_topster; + }; }; struct index_record { @@ -149,22 +164,23 @@ private: void do_facets(std::vector & facets, facet_query_t & facet_query, const uint32_t* result_ids, size_t results_size); - void drop_facets(std::vector & facets, const std::vector & ids); - void search_field(const uint8_t & field_id, const std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, - Topster & topster, uint32_t** all_result_ids, - size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY, - const bool prefix = false, + Topster* topster, spp::sparse_hash_set& groups_processed, + uint32_t** all_result_ids, size_t & all_result_ids_len, + const token_ordering token_order = FREQUENCY, const bool prefix = false, const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD, const size_t typo_tokens_threshold = Index::TYPO_TOKENS_THRESHOLD); void search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, const std::vector & sort_fields, std::vector & token_to_candidates, - const token_ordering token_order, std::vector> & searched_queries, - Topster & topster, uint32_t** all_result_ids, + std::vector> & searched_queries, + Topster* topster, spp::sparse_hash_set& groups_processed, + uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold); @@ -196,14 +212,19 @@ private: void remove_and_shift_offset_index(sorted_array &offset_index, const uint32_t *indices_sorted, const uint32_t indices_length); - void collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::vector & included_ids, - Topster & curated_topster, std::vector> & searched_queries); + void collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, + const std::map> & included_ids_map, + Topster* curated_topster, std::vector> & searched_queries); uint64_t facet_token_hash(const field & a_field, const std::string &token); void compute_facet_stats(facet &a_facet, int64_t raw_value, const std::string & field_type); + // reference: https://stackoverflow.com/a/27952689/131050 + uint64_t hash_combine(uint64_t lhs, uint64_t rhs) const { + lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2); + return lhs; + } public: Index() = delete; @@ -218,12 +239,18 @@ public: void search(Option & outcome, const std::string & query, const std::vector & search_fields, const std::vector & filters, std::vector & facets, facet_query_t & facet_query, - const std::vector & included_ids, const std::vector & excluded_ids, + const std::map> & included_ids_map, + const std::vector & excluded_ids, const std::vector & sort_fields_std, const int num_typos, - const size_t max_hits, const size_t per_page, const size_t page, const token_ordering token_order, - const bool prefix, const size_t drop_tokens_threshold, std::vector & raw_result_kvs, - size_t & all_result_ids_len, std::vector> & searched_queries, - std::vector & override_result_kvs, const size_t typo_tokens_threshold); + Topster* topster, Topster* curated_topster, + const size_t per_page, const size_t page, const token_ordering token_order, + const bool prefix, const size_t drop_tokens_threshold, + size_t & all_result_ids_len, + spp::sparse_hash_set& groups_processed, + std::vector> & 
searched_queries, + std::vector> & raw_result_kvs, + std::vector> & override_result_kvs, + const size_t typo_tokens_threshold); Option remove(const uint32_t seq_id, nlohmann::json & document); @@ -235,7 +262,8 @@ public: std::vector>> &array_token_positions); void score_results(const std::vector & sort_fields, const uint16_t & query_index, const uint8_t & field_id, - const uint32_t total_cost, Topster &topster, const std::vector & query_suggestion, + const uint32_t total_cost, Topster* topster, const std::vector & query_suggestion, + spp::sparse_hash_set& groups_processed, const uint32_t *result_ids, const size_t result_size) const; static int32_t get_points_from_doc(const nlohmann::json &document, const std::string & default_sorting_field); @@ -278,7 +306,7 @@ public: bool processed; // prevents spurious wake up of the main thread bool terminate; // used for interrupting the thread during tear down - search_args search_params; + search_args* search_params; static void populate_array_token_positions(std::vector>> & array_token_positions, const art_leaf *token_leaf, uint32_t doc_index); @@ -286,5 +314,7 @@ public: int get_bounded_typo_cost(const size_t max_cost, const size_t token_len) const; static int64_t float_to_in64_t(float n); + + uint64_t get_distinct_id(const std::unordered_map &facet_to_id, const uint32_t seq_id) const; }; diff --git a/include/logger.h b/include/logger.h index c8a9e62e..5eee4cdf 100644 --- a/include/logger.h +++ b/include/logger.h @@ -1,5 +1,3 @@ #pragma once -#include -#include -#include +#include \ No newline at end of file diff --git a/include/string_utils.h b/include/string_utils.h index 5a19b3ca..1b12da8b 100644 --- a/include/string_utils.h +++ b/include/string_utils.h @@ -151,8 +151,18 @@ struct StringUtils { } char * p ; - strtoull(s.c_str(), &p, 10); - return (*p == 0); + unsigned long long ull = strtoull(s.c_str(), &p, 10); + return (*p == 0) && ull <= std::numeric_limits::max(); + } + + static bool is_uint32_t(const std::string &s) { + if(s.empty()) { + return false; + } + + char * p ; + unsigned long ul = strtoul(s.c_str(), &p, 10); + return (*p == 0) && ul <= std::numeric_limits::max(); } static void toupper(std::string& str) { @@ -234,7 +244,7 @@ struct StringUtils { return hash != std::numeric_limits::max() ? 
hash : (std::numeric_limits::max()-1); } - static std::string randstring(size_t length, uint64_t seed); + static std::string randstring(size_t length); static std::string hmac(const std::string& key, const std::string& msg); }; \ No newline at end of file diff --git a/include/system_metrics.h b/include/system_metrics.h index fda58fdd..6ea58dee 100644 --- a/include/system_metrics.h +++ b/include/system_metrics.h @@ -118,6 +118,23 @@ private: } } + static unsigned long linux_get_mem_available_bytes() { + std::string token; + std::ifstream file("/proc/meminfo"); + while(file >> token) { + if(token == "MemAvailable:") { + unsigned long mem_kb; + if(file >> mem_kb) { + return mem_kb * 1000; + } else { + return 0; + } + } + } + + return 0; // nothing found + } + public: void get(const std::string & data_dir_path, nlohmann::json& result); diff --git a/include/topster.h b/include/topster.h index 6ebadece..2e585a6b 100644 --- a/include/topster.h +++ b/include/topster.h @@ -5,16 +5,26 @@ #include #include #include -#include -#include struct KV { - uint8_t field_id; - uint16_t query_index; - uint16_t array_index; - uint64_t key; - uint64_t match_score; - int64_t scores[3]; // match score + 2 custom attributes + uint8_t field_id{}; + uint16_t query_index{}; + uint16_t array_index{}; + uint64_t key{}; + uint64_t distinct_key{}; + uint64_t match_score{}; + int64_t scores[3]{}; // match score + 2 custom attributes + + KV(uint8_t fieldId, uint16_t queryIndex, uint64_t key, uint64_t distinct_key, + uint64_t match_score, const int64_t *scores): + field_id(fieldId), query_index(queryIndex), array_index(0), key(key), + distinct_key(distinct_key), match_score(match_score) { + this->scores[0] = scores[0]; + this->scores[1] = scores[1]; + this->scores[2] = scores[2]; + } + + KV() = default; }; /* @@ -25,12 +35,19 @@ struct Topster { uint32_t size; KV *data; - KV* *kvs; + KV** kvs; - spp::sparse_hash_map keys; + // For distinct, stores the min heap kv of each group_kv_map topster value + spp::sparse_hash_map kv_map; - explicit Topster(size_t capacity): MAX_SIZE(capacity), size(0) { - // we allocate data first to get contiguous memory block whose indices are then assigned to `kvs` + spp::sparse_hash_map group_kv_map; + size_t distinct; + + explicit Topster(size_t capacity): Topster(capacity, 0) { + } + + explicit Topster(size_t capacity, size_t distinct): MAX_SIZE(capacity), size(0), distinct(distinct) { + // we allocate data first to get a memory block whose indices are then assigned to `kvs` // we use separate **kvs for easier pointer swaps data = new KV[capacity]; kvs = new KV*[capacity]; @@ -38,15 +55,25 @@ struct Topster { for(size_t i=0; iarray_index = a_index; } - static inline void replace_key_values(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index, - const uint64_t &match_score, const int64_t *scores, uint32_t start, - KV* *kvs, spp::sparse_hash_map& keys) { - kvs[start]->key = key; - kvs[start]->field_id = field_id; - kvs[start]->query_index = query_index; - kvs[start]->array_index = start; - kvs[start]->match_score = match_score; - kvs[start]->scores[0] = scores[0]; - kvs[start]->scores[1] = scores[1]; - kvs[start]->scores[2] = scores[2]; + bool add(KV* kv) { + //LOG(INFO) << "kv_map size: " << kv_map.size() << " -- kvs[0]: " << kvs[0]->match_score; + /*for(auto kv: kv_map) { + LOG(INFO) << "kv key: " << kv.first << " => " << kv.second->match_score; + }*/ - keys.erase(kvs[start]->key); - keys[key] = kvs[start]; - } + bool less_than_min_heap = (size >= MAX_SIZE) && 
is_smaller_equal(kv, kvs[0]); size_t heap_op_index = 0; - void add(const uint64_t &key, const uint8_t &field_id, const uint16_t &query_index, const uint64_t &match_score, - const int64_t scores[3]) { - if (size >= MAX_SIZE) { - if(!is_greater(kvs[0], scores)) { - // when incoming value is less than the smallest in the heap, ignore - return; + if(!distinct && less_than_min_heap) { + // for non-distinct, if incoming value is smaller than min-heap ignore + return false; + } + + bool SIFT_DOWN = true; + + if(distinct) { + const auto& found_it = group_kv_map.find(kv->distinct_key); + bool is_duplicate_key = (found_it != group_kv_map.end()); + + if(!is_duplicate_key && less_than_min_heap) { + // for distinct, if a non duplicate kv is < than min heap we ignore + return false; } - uint32_t start = 0; + if(is_duplicate_key) { + // if min heap (group_topster.kvs[0]) changes, we have to update kvs and sift + Topster* group_topster = found_it->second; + KV old_min_heap_kv = *kv_map[kv->distinct_key]; + bool added = group_topster->add(kv); - // When the key already exists and has a greater score, ignore. Otherwise, we have to replace. - // NOTE: we don't consider primary and secondary attrs here because they will be the same for a given key. - if(keys.count(key) != 0) { - const KV* existing = keys.at(key); - if(match_score <= existing->match_score) { - return ; + if(!added) { + return false; } - // replace and sift down - start = existing->array_index; - } + // if new kv score is greater than previous min heap score we sift down, otherwise sift up + SIFT_DOWN = is_greater(kv, &old_min_heap_kv); - replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys); + // new kv is different from old_min_heap_kv so we have to sift heap + heap_op_index = old_min_heap_kv.array_index; - // sift down to maintain heap property - while ((2*start+1) < MAX_SIZE) { - uint32_t next = (2 * start + 1); - if (next+1 < MAX_SIZE && is_greater_kv(kvs[next], kvs[next+1])) { - next++; - } + // erase current min heap key from kv_map + kv_map.erase(old_min_heap_kv.distinct_key); - if (is_greater_kv(kvs[start], kvs[next])) { - swapMe(&kvs[start], &kvs[next]); + // kv will be copied into the pointer at heap_op_index + kv_map.emplace(kv->distinct_key, kvs[heap_op_index]); + } else { + // kv is guaranteed to be > current min heap: kvs[0] + // create fresh topster for this distinct group key since it does not exist + Topster* group_topster = new Topster(distinct, 0); + group_topster->add(kv); + + // add new group key to map + group_kv_map.emplace(kv->distinct_key, group_topster); + + // find heap operation index for updating kvs + + if(size < MAX_SIZE) { + // there is enough space in heap we just copy to end + SIFT_DOWN = false; + heap_op_index = size; + size++; } else { - break; + SIFT_DOWN = true; + + // max size is reached so we are forced to replace current min heap element (kvs[0]) + heap_op_index = 0; + + // remove current min heap group key from maps + delete group_kv_map[kvs[heap_op_index]->distinct_key]; + group_kv_map.erase(kvs[heap_op_index]->distinct_key); + kv_map.erase(kvs[heap_op_index]->distinct_key); } - start = next; + // kv will be copied into the pointer at heap_op_index + kv_map.emplace(kv->distinct_key, kvs[heap_op_index]); } - } else { - uint32_t start = size; - bool key_found = false; - // When the key already exists and has a greater score, ignore.
Otherwise, we have to replace - if(keys.count(key) != 0) { - const KV* existing = keys.at(key); - if(match_score <= existing->match_score) { - return ; + } else { // not distinct + //LOG(INFO) << "Searching for key: " << kv->key; + + const auto& found_it = kv_map.find(kv->key); + bool is_duplicate_key = (found_it != kv_map.end()); + + /* + is_duplicate_key: SIFT_DOWN regardless of `size`. + Else: + Do SIFT_UP if size < max_size + Else SIFT_DOWN + */ + + if(is_duplicate_key) { + // Need to check if kv is greater than existing duplicate kv. + KV* existing_kv = found_it->second; + //LOG(INFO) << "existing_kv: " << existing_kv->key << " -> " << existing_kv->match_score; + + if(is_smaller_equal(kv, existing_kv)) { + return false; } - // replace and sift down - start = existing->array_index; - key_found = true; - } + SIFT_DOWN = true; - replace_key_values(key, field_id, query_index, match_score, scores, start, kvs, keys); + // replace existing kv and sift down + heap_op_index = existing_kv->array_index; + kv_map.erase(kvs[heap_op_index]->key); - if(key_found) { - // need to sift down if it's a replace - while ((2*start+1) < size) { - uint32_t next = (2 * start + 1); - if (next+1 < size && is_greater_kv(kvs[next], kvs[next+1])) { - next++; - } + // kv will be copied into the pointer at heap_op_index + kv_map.emplace(kv->key, kvs[heap_op_index]); + } else { // not duplicate - if (is_greater_kv(kvs[start], kvs[next])) { - swapMe(&kvs[start], &kvs[next]); - } else { - break; - } - - start = next; - } - - return ; - } - - while(start > 0) { - uint32_t parent = (start-1)/2; - if (is_greater_kv(kvs[parent], kvs[start])) { - swapMe(&kvs[start], &kvs[parent]); - start = parent; + if(size < MAX_SIZE) { + // we just copy to end of array + SIFT_DOWN = false; + heap_op_index = size; + size++; } else { - break; + // kv is guaranteed to be > min heap. 
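Because the rewritten add() is spread across many hunk lines, a condensed, self-contained sketch of its non-distinct path may help: a fixed-capacity min-heap keyed on score, with a key-to-slot map so a duplicate key replaces its existing entry in place. MiniTopster and the single int64_t score (in place of the real three-score-plus-key tie-break) are simplifications for illustration, not the PR's code:

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <utility>
#include <vector>

// Condensed sketch of the non-distinct Topster::add() path.
struct MiniTopster {
    struct KV { uint64_t key; int64_t score; };
    size_t max_size, size = 0;
    std::vector<KV> kvs;                            // min-heap over score
    std::unordered_map<uint64_t, size_t> kv_map;    // key -> heap slot

    explicit MiniTopster(size_t capacity): max_size(capacity), kvs(capacity) {}

    void move_to(size_t i, size_t j) {
        std::swap(kvs[i], kvs[j]);
        kv_map[kvs[i].key] = i;                     // keep slot map in sync
        kv_map[kvs[j].key] = j;
    }

    bool add(KV kv) {
        auto it = kv_map.find(kv.key);
        size_t i; bool sift_down;
        if(it != kv_map.end()) {                    // duplicate: replace in place
            if(kv.score <= kvs[it->second].score) return false;
            i = it->second; sift_down = true;       // score grew, so sift down
        } else if(size >= max_size && kv.score <= kvs[0].score) {
            return false;                           // below the current minimum
        } else if(size < max_size) {
            i = size++; sift_down = false;          // append, then sift up
        } else {
            kv_map.erase(kvs[0].key);               // evict current minimum
            i = 0; sift_down = true;
        }
        kvs[i] = kv; kv_map[kv.key] = i;
        if(sift_down) {
            while(2 * i + 1 < size) {
                size_t next = 2 * i + 1;            // pick the smaller child
                if(next + 1 < size && kvs[next].score > kvs[next + 1].score) next++;
                if(kvs[i].score <= kvs[next].score) break;
                move_to(i, next); i = next;
            }
        } else {
            while(i > 0 && kvs[(i - 1) / 2].score > kvs[i].score) {
                move_to(i, (i - 1) / 2); i = (i - 1) / 2;
            }
        }
        return true;
    }
};

int main() {
    MiniTopster t(2);
    t.add({1, 10}); t.add({2, 30}); t.add({3, 20}); // key 1 is evicted
    std::cout << "min kept score: " << t.kvs[0].score << "\n"; // 20
}
```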
+ // we have to replace min heap element since array is full + SIFT_DOWN = true; + heap_op_index = 0; + kv_map.erase(kvs[heap_op_index]->key); } - } - if(keys.count(key) != 0) { - size++; + // kv will be copied into the pointer at heap_op_index + kv_map.emplace(kv->key, kvs[heap_op_index]); } } + + // we have to replace the existing element in the heap and sift down + kv->array_index = heap_op_index; + *kvs[heap_op_index] = *kv; + + // sift up/down to maintain heap property + + if(SIFT_DOWN) { + while ((2 * heap_op_index + 1) < size) { + uint32_t next = (2 * heap_op_index + 1); // left child + if (next+1 < size && is_greater(kvs[next], kvs[next + 1])) { + // for min heap we compare with the minimum of children + next++; // right child (2n + 2) + } + + if (is_greater(kvs[heap_op_index], kvs[next])) { + swapMe(&kvs[heap_op_index], &kvs[next]); + } else { + break; + } + + heap_op_index = next; + } + } else { + // SIFT UP + while(heap_op_index > 0) { + uint32_t parent = (heap_op_index - 1) / 2; + if (is_greater(kvs[parent], kvs[heap_op_index])) { + swapMe(&kvs[heap_op_index], &kvs[parent]); + heap_op_index = parent; + } else { + break; + } + } + } + + return true; } - static bool is_greater(const struct KV* i, const int64_t scores[3]) { - return std::tie(scores[0], scores[1], scores[2]) > - std::tie(i->scores[0], i->scores[1], i->scores[2]); - } - - static bool is_greater_kv(const struct KV* i, const struct KV* j) { + static bool is_greater(const struct KV* i, const struct KV* j) { return std::tie(i->scores[0], i->scores[1], i->scores[2], i->key) > std::tie(j->scores[0], j->scores[1], j->scores[2], j->key); } - static bool is_greater_kv_value(const struct KV & i, const struct KV & j) { - return std::tie(i.scores[0], i.scores[1], i.scores[2], i.key) > - std::tie(j.scores[0], j.scores[1], j.scores[2], j.key); + static bool is_smaller_equal(const struct KV* i, const struct KV* j) { + return std::tie(i->scores[0], i->scores[1], i->scores[2]) <= + std::tie(j->scores[0], j->scores[1], j->scores[2]); + } + + static bool is_greater_kv_group(const std::vector& i, const std::vector& j) { + return std::tie(i[0]->scores[0], i[0]->scores[1], i[0]->scores[2], i[0]->key) > + std::tie(j[0]->scores[0], j[0]->scores[1], j[0]->scores[2], j[0]->key); } // topster must be sorted before iterated upon to remove dead array entries void sort() { - std::stable_sort(kvs, kvs+size, is_greater_kv); + std::stable_sort(kvs, kvs + size, is_greater); + for(auto &group_topster: group_kv_map) { + group_topster.second->sort(); + } } void clear(){ @@ -196,6 +283,10 @@ struct Topster { return kvs[index]->key; } + uint64_t getDistinctKeyAt(uint32_t index) { + return kvs[index]->distinct_key; + } + KV* getKV(uint32_t index) { return kvs[index]; } diff --git a/include/typesense_server_utils.h b/include/typesense_server_utils.h index 4b0af691..34f0cff4 100644 --- a/include/typesense_server_utils.h +++ b/include/typesense_server_utils.h @@ -1,10 +1,10 @@ #pragma once +#include "logger.h" #include #include #include #include "config.h" -#include "logger.h" #include "store.h" #include "collection_manager.h" #include diff --git a/src/array.cpp b/src/array.cpp index bc901ff7..7a009d2c 100644 --- a/src/array.cpp +++ b/src/array.cpp @@ -61,12 +61,12 @@ void array::remove_index(uint32_t start_index, uint32_t end_index) { } uint32_t size_required = (uint32_t) (unsorted_append_size_required(max, new_index) * FOR_GROWTH_FACTOR); - uint8_t *out = new uint8_t[size_required]; + uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out); uint32_t 
actual_size = for_compress_unsorted(new_array, out, new_index); delete[] curr_array; delete[] new_array; - delete[] in; + free(in); in = out; length = new_index; diff --git a/src/array_utils.cpp b/src/array_utils.cpp index 9d93fa46..5a88dffd 100644 --- a/src/array_utils.cpp +++ b/src/array_utils.cpp @@ -107,14 +107,16 @@ size_t ArrayUtils::exclude_scalar(const uint32_t *A, const size_t lenA, size_t indexA = 0, indexB = 0, res_index = 0; if(A == nullptr && B == nullptr) { - return 0; + *out = nullptr; + return 0; } if(A == nullptr) { + *out = nullptr; return 0; } - if(B == nullptr) { + if(lenB == 0 || B == nullptr) { *out = new uint32_t[lenA]; memcpy(*out, A, lenA * sizeof(uint32_t)); return lenA; diff --git a/src/art.cpp b/src/art.cpp index 7ac945c0..1da84cf1 100644 --- a/src/art.cpp +++ b/src/art.cpp @@ -1384,8 +1384,6 @@ int art_fuzzy_search(art_tree *t, const unsigned char *term, const int term_len, art_fuzzy_recurse(0, 0, t->root, -1, term, term_len, irow, jrow, min_cost, max_cost, prefix, nodes); } - PROCESS_NODES: - if(token_order == FREQUENCY) { std::sort(nodes.begin(), nodes.end(), compare_art_node_frequency); } else { diff --git a/src/collection.cpp b/src/collection.cpp index c3342fec..e3724a11 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -305,54 +305,68 @@ void Collection::prune_document(nlohmann::json &document, const spp::sparse_hash } void Collection::populate_overrides(std::string query, - const std::map& pinned_hits, + const std::map>& pinned_hits, const std::vector& hidden_hits, - std::map & id_pos_map, - std::vector & included_ids, + std::map>& include_ids, std::vector & excluded_ids) { StringUtils::tolowercase(query); + std::set excluded_set; + + // If pinned or hidden hits are provided, they take precedence over overrides + + // have to ensure that hidden hits take precedence over included hits + if(!hidden_hits.empty()) { + for(const auto & hit: hidden_hits) { + Option seq_id_op = doc_id_to_seq_id(hit); + if(seq_id_op.ok()) { + excluded_ids.push_back(seq_id_op.get()); + excluded_set.insert(seq_id_op.get()); + } + } + } for(const auto & override_kv: overrides) { const auto & override = override_kv.second; if( (override.rule.match == override_t::MATCH_EXACT && override.rule.query == query) || (override.rule.match == override_t::MATCH_CONTAINS && query.find(override.rule.query) != std::string::npos) ) { - for(const auto & hit: override.add_hits) { - Option seq_id_op = doc_id_to_seq_id(hit.doc_id); - if(seq_id_op.ok()) { - included_ids.push_back(seq_id_op.get()); - id_pos_map[seq_id_op.get()] = hit.position; - } - } + // have to ensure that dropped hits take precedence over added hits for(const auto & hit: override.drop_hits) { Option seq_id_op = doc_id_to_seq_id(hit.doc_id); if(seq_id_op.ok()) { excluded_ids.push_back(seq_id_op.get()); + excluded_set.insert(seq_id_op.get()); + } + } + + for(const auto & hit: override.add_hits) { + Option seq_id_op = doc_id_to_seq_id(hit.doc_id); + if(!seq_id_op.ok()) { + continue; + } + uint32_t seq_id = seq_id_op.get(); + bool excluded = (excluded_set.count(seq_id) != 0); + if(!excluded) { + include_ids[hit.position].push_back(seq_id); } } } } - // If pinned or hidden hits are provided, they take precedence over overrides - if(!pinned_hits.empty()) { - for(const auto & hit: pinned_hits) { - Option seq_id_op = doc_id_to_seq_id(hit.first); - if(seq_id_op.ok()) { - included_ids.push_back(seq_id_op.get()); - id_pos_map[seq_id_op.get()] = hit.second; - } - } - } - - if(!hidden_hits.empty()) { - for(const auto & hit: hidden_hits) 
{ - Option seq_id_op = doc_id_to_seq_id(hit); - if(seq_id_op.ok()) { - included_ids.erase(std::remove(included_ids.begin(), included_ids.end(), seq_id_op.get()), included_ids.end()); - id_pos_map.erase(seq_id_op.get()); - excluded_ids.push_back(seq_id_op.get()); + for(const auto& pos_ids: pinned_hits) { + size_t pos = pos_ids.first; + for(const std::string& id: pos_ids.second) { + Option seq_id_op = doc_id_to_seq_id(id); + if(!seq_id_op.ok()) { + continue; + } + uint32_t seq_id = seq_id_op.get(); + bool excluded = (excluded_set.count(seq_id) != 0); + if(!excluded) { + include_ids[pos].push_back(seq_id); + } } } } @@ -371,20 +385,60 @@ Option Collection::search(const std::string & query, const std:: const size_t snippet_threshold, const std::string & highlight_full_fields, size_t typo_tokens_threshold, - const std::map& pinned_hits, - const std::vector& hidden_hits) { + const std::map>& pinned_hits, + const std::vector& hidden_hits, + const std::vector& group_by_fields, + const size_t group_limit) { + + if(query != "*" && search_fields.empty()) { + return Option(400, "No search fields specified for the query."); + } + + if(!group_by_fields.empty() && (group_limit == 0 || group_limit > GROUP_LIMIT_MAX)) { + return Option(400, "Value of `group_limit` must be between 1 and " + + std::to_string(GROUP_LIMIT_MAX) + "."); + } - std::vector included_ids; std::vector excluded_ids; - std::map id_pos_map; - populate_overrides(query, pinned_hits, hidden_hits, id_pos_map, included_ids, excluded_ids); + std::map> include_ids; // position => list of IDs + populate_overrides(query, pinned_hits, hidden_hits, include_ids, excluded_ids); - std::map> index_to_included_ids; + /*for(auto kv: include_ids) { + LOG(INFO) << "key: " << kv.first; + for(auto val: kv.second) { + LOG(INFO) << val; + } + } + + LOG(INFO) << "Excludes:"; + + for(auto id: excluded_ids) { + LOG(INFO) << id; + } + + LOG(INFO) << "include_ids size: " << include_ids.size(); + for(auto& group: include_ids) { + for(uint32_t& seq_id: group.second) { + LOG(INFO) << "seq_id: " << seq_id; + } + + LOG(INFO) << "----"; + } + */ + + std::map>> index_to_included_ids; std::map> index_to_excluded_ids; - for(auto seq_id: included_ids) { - auto index_id = (seq_id % num_indices); - index_to_included_ids[index_id].push_back(seq_id); + for(const auto& pos_ids: include_ids) { + size_t outer_pos = pos_ids.first; + size_t ids_per_pos = std::max(size_t(1), group_limit); + + for(size_t inner_pos = 0; inner_pos < std::min(ids_per_pos, pos_ids.second.size()); inner_pos++) { + auto seq_id = pos_ids.second[inner_pos]; + auto index_id = (seq_id % num_indices); + index_to_included_ids[index_id][outer_pos][inner_pos] = seq_id; + //LOG(INFO) << "Adding seq_id " << seq_id << " to index_id " << index_id; + } } for(auto seq_id: excluded_ids) { @@ -408,6 +462,22 @@ Option Collection::search(const std::string & query, const std:: } } + // validate group by fields + for(const std::string & field_name: group_by_fields) { + if(search_schema.count(field_name) == 0) { + std::string error = "Could not find a field named `" + field_name + "` in the schema."; + return Option(404, error); + } + + field search_field = search_schema.at(field_name); + + // must be a facet field + if(!search_field.is_facet()) { + std::string error = "Group by field `" + field_name + "` should be a facet field."; + return Option(400, error); + } + } + // validate filter fields std::vector filter_blocks; StringUtils::split(simple_filter_query, filter_blocks, "&&"); @@ -515,7 +585,7 @@ Option 
Collection::search(const std::string & query, const std:: } // for a wildcard query, if filter is not specified, use default_sorting_field as a catch-all - if(query == "*" && filters.size() == 0) { + if(query == "*" && filters.empty()) { field f = search_schema.at(default_sorting_field); std::string max_value = f.is_float() ? std::to_string(std::numeric_limits<float>::max()) : std::to_string(std::numeric_limits<int32_t>::max()); @@ -631,19 +701,21 @@ Option Collection::search(const std::string & query, const std:: const size_t max_hits = std::min((page * per_page), get_num_documents()); std::vector<std::vector<art_leaf*>> searched_queries; // search queries used for generating the results - std::vector<KV> raw_result_kvs; - std::vector<KV> override_result_kvs; + std::vector<std::vector<KV*>> raw_result_kvs; + std::vector<std::vector<KV*>> override_result_kvs; size_t total_found = 0; + spp::sparse_hash_set<uint64_t> groups_processed; // used to calculate total_found for grouped query // send data to individual index threads size_t index_id = 0; for(Index* index: indices) { - index->search_params = search_args(query, search_fields, filters, facets, - index_to_included_ids[index_id], index_to_excluded_ids[index_id], - sort_fields_std, facet_query, num_typos, max_facet_values, max_hits, - per_page, page, token_order, prefix, - drop_tokens_threshold, typo_tokens_threshold); + index->search_params = new search_args(query, search_fields, filters, facets, + index_to_included_ids[index_id], index_to_excluded_ids[index_id], + sort_fields_std, facet_query, num_typos, max_facet_values, max_hits, + per_page, page, token_order, prefix, + drop_tokens_threshold, typo_tokens_threshold, + group_by_fields, group_limit); { std::lock_guard<std::mutex> lk(index->m); index->ready = true; @@ -655,6 +727,12 @@ Option Collection::search(const std::string & query, const std:: Option<nlohmann::json> index_search_op({}); // stores the last error across all index threads + // for grouping we have to re-aggregate + + const size_t topster_size = std::max((size_t)1, max_hits); + Topster topster(topster_size, group_limit); + Topster curated_topster(topster_size, group_limit); + for(Index* index: indices) { // wait for the worker { @@ -662,9 +740,9 @@ Option Collection::search(const std::string & query, const std:: index->cv.wait(lk, [index]{return index->processed;}); } - if(!index->search_params.outcome.ok()) { - index_search_op = Option<nlohmann::json>(index->search_params.outcome.code(), - index->search_params.outcome.error()); + if(!index->search_params->outcome.ok()) { + index_search_op = Option<nlohmann::json>(index->search_params->outcome.code(), + index->search_params->outcome.error()); } if(!index_search_op.ok()) { @@ -672,34 +750,33 @@ Option Collection::search(const std::string & query, const std:: continue; } - for(auto & field_order_kv: index->search_params.raw_result_kvs) { - field_order_kv.query_index += searched_queries.size(); - raw_result_kvs.push_back(field_order_kv); - } + aggregate_topster(searched_queries.size(), topster, index->search_params->topster); + aggregate_topster(searched_queries.size(), curated_topster, index->search_params->curated_topster); - for(auto & field_order_kv: index->search_params.override_result_kvs) { - field_order_kv.query_index += searched_queries.size(); - override_result_kvs.push_back(field_order_kv); - } + searched_queries.insert(searched_queries.end(), index->search_params->searched_queries.begin(), + index->search_params->searched_queries.end()); - searched_queries.insert(searched_queries.end(), index->search_params.searched_queries.begin(), - index->search_params.searched_queries.end()); - - for(size_t fi = 0; fi <
index->search_params.facets.size(); fi++) { - auto & this_facet = index->search_params.facets[fi]; + for(size_t fi = 0; fi < index->search_params->facets.size(); fi++) { + auto & this_facet = index->search_params->facets[fi]; auto & acc_facet = facets[fi]; for(auto & facet_kv: this_facet.result_map) { - size_t count = 0; - - if(acc_facet.result_map.count(facet_kv.first) == 0) { - // not found, so set it - count = facet_kv.second.count; + if(index->search_params->group_limit) { + // we have to add all group sets + acc_facet.result_map[facet_kv.first].groups.insert( + facet_kv.second.groups.begin(), facet_kv.second.groups.end() + ); } else { - count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count; + size_t count = 0; + if(acc_facet.result_map.count(facet_kv.first) == 0) { + // not found, so set it + count = facet_kv.second.count; + } else { + count = acc_facet.result_map[facet_kv.first].count + facet_kv.second.count; + } + acc_facet.result_map[facet_kv.first].count = count; } - acc_facet.result_map[facet_kv.first].count = count; acc_facet.result_map[facet_kv.first].doc_id = facet_kv.second.doc_id; acc_facet.result_map[facet_kv.first].array_pos = facet_kv.second.array_pos; acc_facet.result_map[facet_kv.first].query_token_pos = facet_kv.second.query_token_pos; @@ -713,138 +790,195 @@ Option Collection::search(const std::string & query, const std:: } } - total_found += index->search_params.all_result_ids_len; + if(group_limit) { + groups_processed.insert( + index->search_params->groups_processed.begin(), + index->search_params->groups_processed.end() + ); + } else { + total_found += index->search_params->all_result_ids_len; + } } if(!index_search_op.ok()) { return index_search_op; } - // All fields are sorted descending - std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_value); + topster.sort(); + curated_topster.sort(); - // Sort based on position in overriden list + populate_result_kvs(&topster, raw_result_kvs); + populate_result_kvs(&curated_topster, override_result_kvs); + + // for grouping we have to aggregate group set sizes to a count value + if(group_limit) { + for(auto& acc_facet: facets) { + for(auto& facet_kv: acc_facet.result_map) { + facet_kv.second.count = facet_kv.second.groups.size(); + } + } + + total_found = groups_processed.size() + override_result_kvs.size(); + } + + // All fields are sorted descending + std::sort(raw_result_kvs.begin(), raw_result_kvs.end(), Topster::is_greater_kv_group); + + // Sort based on position in overridden list std::sort( override_result_kvs.begin(), override_result_kvs.end(), - [&id_pos_map](const KV & a, const KV & b) -> bool { - return id_pos_map[a.key] < id_pos_map[b.key]; + [](const std::vector& a, std::vector& b) -> bool { + return a[0]->distinct_key < b[0]->distinct_key; } ); - nlohmann::json result = nlohmann::json::object(); - - result["hits"] = nlohmann::json::array(); - result["found"] = total_found; - - std::vector result_kvs; + std::vector> result_group_kvs; size_t override_kv_index = 0; size_t raw_results_index = 0; // merge raw results and override results while(override_kv_index < override_result_kvs.size() && raw_results_index < raw_result_kvs.size()) { - if(override_kv_index < override_result_kvs.size() && - id_pos_map.count(override_result_kvs[override_kv_index].key) != 0 && - result_kvs.size() + 1 == id_pos_map[override_result_kvs[override_kv_index].key]) { - result_kvs.push_back(override_result_kvs[override_kv_index]); - override_kv_index++; + size_t result_position = 
result_group_kvs.size() + 1; + uint64_t override_position = override_result_kvs[override_kv_index][0]->distinct_key; + + if(result_position == override_position) { + override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result + result_group_kvs.push_back(override_result_kvs[override_kv_index]); + override_kv_index++; } else { - result_kvs.push_back(raw_result_kvs[raw_results_index]); + result_group_kvs.push_back(raw_result_kvs[raw_results_index]); raw_results_index++; } } while(override_kv_index < override_result_kvs.size()) { - result_kvs.push_back(override_result_kvs[override_kv_index]); + override_result_kvs[override_kv_index][0]->match_score = 0; // to identify curated result + result_group_kvs.push_back({override_result_kvs[override_kv_index]}); override_kv_index++; } while(raw_results_index < raw_result_kvs.size()) { - result_kvs.push_back(raw_result_kvs[raw_results_index]); + result_group_kvs.push_back(raw_result_kvs[raw_results_index]); raw_results_index++; } const long start_result_index = (page - 1) * per_page; - const long end_result_index = std::min(max_hits, result_kvs.size()) - 1; // could be -1 when max_hits is 0 + const long end_result_index = std::min(max_hits, result_group_kvs.size()) - 1; // could be -1 when max_hits is 0 + + nlohmann::json result = nlohmann::json::object(); + + result["found"] = total_found; + + std::string hits_key = group_limit ? "grouped_hits" : "hits"; + result[hits_key] = nlohmann::json::array(); // construct results array for(long result_kvs_index = start_result_index; result_kvs_index <= end_result_index; result_kvs_index++) { - const auto & field_order_kv = result_kvs[result_kvs_index]; - const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv.key); + const std::vector & kv_group = result_group_kvs[result_kvs_index]; - nlohmann::json document; - const Option & document_op = get_document_from_store(seq_id_key, document); - - if(!document_op.ok()) { - LOG(ERROR) << "Document fetch error. " << document_op.error(); - continue; + nlohmann::json group_hits; + if(group_limit) { + group_hits["hits"] = nlohmann::json::array(); } - nlohmann::json wrapper_doc; - wrapper_doc["highlights"] = nlohmann::json::array(); - std::vector highlights; - StringUtils string_utils; + nlohmann::json& hits_array = group_limit ? group_hits["hits"] : result["hits"]; - // find out if fields have to be highlighted fully - std::vector fields_highlighted_fully_vec; - spp::sparse_hash_set fields_highlighted_fully; - StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ","); + for(const KV* field_order_kv: kv_group) { + const std::string& seq_id_key = get_seq_id_key((uint32_t) field_order_kv->key); - for(std::string & highlight_full_field: fields_highlighted_fully_vec) { - StringUtils::trim(highlight_full_field); - fields_highlighted_fully.emplace(highlight_full_field); - } + nlohmann::json document; + const Option & document_op = get_document_from_store(seq_id_key, document); - for(const std::string & field_name: search_fields) { - // should not pick excluded field for highlighting - if(exclude_fields.count(field_name) > 0) { + if(!document_op.ok()) { + LOG(ERROR) << "Document fetch error. 
" << document_op.error(); continue; } - field search_field = search_schema.at(field_name); - if(query != "*" && (search_field.type == field_types::STRING || - search_field.type == field_types::STRING_ARRAY)) { + nlohmann::json wrapper_doc; + wrapper_doc["highlights"] = nlohmann::json::array(); + std::vector highlights; + StringUtils string_utils; - bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end()); - highlight_t highlight; - highlight_result(search_field, searched_queries, field_order_kv, document, - string_utils, snippet_threshold, highlighted_fully, highlight); + // find out if fields have to be highlighted fully + std::vector fields_highlighted_fully_vec; + spp::sparse_hash_set fields_highlighted_fully; + StringUtils::split(highlight_full_fields, fields_highlighted_fully_vec, ","); - if(!highlight.snippets.empty()) { - highlights.push_back(highlight); - } + for(std::string & highlight_full_field: fields_highlighted_fully_vec) { + StringUtils::trim(highlight_full_field); + fields_highlighted_fully.emplace(highlight_full_field); } - } - std::sort(highlights.begin(), highlights.end()); - - for(const auto & highlight: highlights) { - nlohmann::json h_json = nlohmann::json::object(); - h_json["field"] = highlight.field; - bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end()); - - if(!highlight.indices.empty()) { - h_json["indices"] = highlight.indices; - h_json["snippets"] = highlight.snippets; - if(highlight_fully) { - h_json["values"] = highlight.values; + for(const std::string & field_name: search_fields) { + // should not pick excluded field for highlighting + if(exclude_fields.count(field_name) > 0) { + continue; } - } else { - h_json["snippet"] = highlight.snippets[0]; - if(highlight_fully) { - h_json["value"] = highlight.values[0]; + field search_field = search_schema.at(field_name); + if(query != "*" && (search_field.type == field_types::STRING || + search_field.type == field_types::STRING_ARRAY)) { + + bool highlighted_fully = (fields_highlighted_fully.find(field_name) != fields_highlighted_fully.end()); + highlight_t highlight; + highlight_result(search_field, searched_queries, field_order_kv, document, + string_utils, snippet_threshold, highlighted_fully, highlight); + + if(!highlight.snippets.empty()) { + highlights.push_back(highlight); + } } } - wrapper_doc["highlights"].push_back(h_json); + std::sort(highlights.begin(), highlights.end()); + + for(const auto & highlight: highlights) { + nlohmann::json h_json = nlohmann::json::object(); + h_json["field"] = highlight.field; + bool highlight_fully = (fields_highlighted_fully.find(highlight.field) != fields_highlighted_fully.end()); + + if(!highlight.indices.empty()) { + h_json["indices"] = highlight.indices; + h_json["snippets"] = highlight.snippets; + if(highlight_fully) { + h_json["values"] = highlight.values; + } + + } else { + h_json["snippet"] = highlight.snippets[0]; + if(highlight_fully) { + h_json["value"] = highlight.values[0]; + } + } + + wrapper_doc["highlights"].push_back(h_json); + } + + //wrapper_doc["seq_id"] = (uint32_t) field_order_kv->key; + + prune_document(document, include_fields, exclude_fields); + wrapper_doc["document"] = document; + wrapper_doc["text_match"] = field_order_kv->match_score; + + if(field_order_kv->match_score == 0) { + wrapper_doc["curated"] = true; + } + + hits_array.push_back(wrapper_doc); } - prune_document(document, include_fields, exclude_fields); - wrapper_doc["document"] = document; 
- wrapper_doc["text_match"] = field_order_kv.match_score; //wrapper_doc["seq_id"] = (uint32_t) field_order_kv.key; + if(group_limit) { + const auto& document = group_hits["hits"][0]["document"]; - result["hits"].push_back(wrapper_doc); + group_hits["group_key"] = nlohmann::json::array(); + for(const auto& field_name: group_by_fields) { + if(document.count(field_name) != 0) { + group_hits["group_key"].push_back(document[field_name]); + } + } + + result["grouped_hits"].push_back(group_hits); + } } result["facet_counts"] = nlohmann::json::array(); @@ -941,6 +1075,11 @@ Option Collection::search(const std::string & query, const std:: result["facet_counts"].push_back(facet_result); } + // free search params + for(Index* index: indices) { + delete index->search_params; + } + result["request_params"] = nlohmann::json::object(); result["request_params"]["per_page"] = per_page; result["request_params"]["q"] = query; @@ -951,6 +1090,43 @@ Option Collection::search(const std::string & query, const std:: return result; } + +void Collection::populate_result_kvs(Topster *topster, std::vector<std::vector<KV*>> &result_kvs) const { + if(topster->distinct) { + for(auto &group_topster_entry: topster->group_kv_map) { + Topster* group_topster = group_topster_entry.second; + const std::vector<KV*> group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + result_kvs.emplace_back(group_kvs); + } + + } else { + for(uint32_t t = 0; t < topster->size; t++) { + KV* kv = topster->getKV(t); + result_kvs.push_back({kv}); + } + } +} + +void Collection::aggregate_topster(size_t query_index, Topster &topster, Topster *index_topster) const { + if(index_topster->distinct) { + for(auto &group_topster_entry: index_topster->group_kv_map) { + Topster* group_topster = group_topster_entry.second; + const std::vector<KV*> group_kvs(group_topster->kvs, group_topster->kvs+group_topster->size); + for(KV* kv: group_kvs) { + kv->query_index += query_index; + topster.add(kv); + } + } + + } else { + for(uint32_t t = 0; t < index_topster->size; t++) { + KV* kv = index_topster->getKV(t); + kv->query_index += query_index; + topster.add(kv); + } + } +} + void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t &facet_count, const nlohmann::json &document, std::string &value) { @@ -989,7 +1165,7 @@ void Collection::facet_value_to_string(const facet &a_facet, const facet_count_t void Collection::highlight_result(const field &search_field, const std::vector<std::vector<art_leaf *>> &searched_queries, - const KV & field_order_kv, const nlohmann::json & document, + const KV* field_order_kv, const nlohmann::json & document, StringUtils & string_utils, size_t snippet_threshold, bool highlighted_fully, highlight_t & highlight) { spp::sparse_hash_map<const art_leaf*, uint32_t*> leaf_to_indices; std::vector<art_leaf*> query_suggestion; - for (const art_leaf *token_leaf : searched_queries[field_order_kv.query_index]) { + for (const art_leaf *token_leaf : searched_queries[field_order_kv->query_index]) { // Must search for the token string fresh on that field for the given document since `token_leaf` // is from the best matched field and need not be present in other fields of a document.
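populate_result_kvs() and aggregate_topster() above key groups on KV::distinct_key, which Index::get_distinct_id (declared earlier in include/index.h) derives by folding the hashes of a document's group_by field values through the hash_combine helper shown in that header. A sketch of that derivation; the seed value and the use of std::hash on raw strings are assumptions, since the real index combines its own facet token hashes:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// hash_combine is copied from the new include/index.h; everything else here
// is an illustrative stand-in for Index::get_distinct_id.
uint64_t hash_combine(uint64_t lhs, uint64_t rhs) {
    lhs ^= rhs + 0x517cc1b727220a95 + (lhs << 6) + (lhs >> 2);
    return lhs;
}

int main() {
    // group_by=brand,capacity for one document (values assumed)
    std::vector<std::string> group_values = {"apple", "256GB"};

    uint64_t distinct_key = 1; // seed is an assumption, not the index's value
    for(const std::string& v : group_values) {
        distinct_key = hash_combine(distinct_key, std::hash<std::string>{}(v));
    }

    // every document with the same brand+capacity lands in the same group
    std::cout << "distinct_key: " << distinct_key << "\n";
}
```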
- Index* index = indices[field_order_kv.key % num_indices]; + Index* index = indices[field_order_kv->key % num_indices]; art_leaf *actual_leaf = index->get_token_leaf(search_field.name, &token_leaf->key[0], token_leaf->key_len); if(actual_leaf != nullptr) { query_suggestion.push_back(actual_leaf); std::vector positions; - uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv.key); + uint32_t doc_index = actual_leaf->values->ids.indexOf(field_order_kv->key); auto doc_indices = new uint32_t[1]; doc_indices[0] = doc_index; leaf_to_indices.emplace(actual_leaf, doc_indices); @@ -1031,8 +1207,8 @@ void Collection::highlight_result(const field &search_field, continue; } - const Match & this_match = Match::match(field_order_kv.key, token_positions); - uint64_t this_match_score = this_match.get_match_score(1, field_order_kv.field_id); + const Match & this_match = Match::match(field_order_kv->key, token_positions); + uint64_t this_match_score = this_match.get_match_score(1, field_order_kv->field_id); match_indices.emplace_back(this_match, this_match_score, array_index); } @@ -1231,7 +1407,7 @@ Option Collection::add_override(const override_t & override) { return Option(500, "Error while storing the override on disk."); } - overrides[override.id] = override; + overrides.emplace(override.id, override); return Option(200); } diff --git a/src/core_api.cpp b/src/core_api.cpp index 4e47aeda..06952a80 100644 --- a/src/core_api.cpp +++ b/src/core_api.cpp @@ -212,6 +212,9 @@ bool get_search(http_req & req, http_res & res) { const char *FACET_QUERY = "facet_query"; const char *MAX_FACET_VALUES = "max_facet_values"; + const char *GROUP_BY = "group_by"; + const char *GROUP_LIMIT = "group_limit"; + const char *PER_PAGE = "per_page"; const char *PAGE = "page"; const char *CALLBACK = "callback"; @@ -249,11 +252,6 @@ bool get_search(http_req & req, http_res & res) { return false; } - if(req.params.count(QUERY_BY) == 0) { - res.set_400(std::string("Parameter `") + QUERY_BY + "` is required."); - return false; - } - if(req.params.count(MAX_FACET_VALUES) == 0) { req.params[MAX_FACET_VALUES] = "10"; } @@ -291,41 +289,58 @@ bool get_search(http_req & req, http_res & res) { req.params[EXCLUDE_FIELDS] = ""; } - if(!StringUtils::is_uint64_t(req.params[DROP_TOKENS_THRESHOLD])) { + if(req.params.count(GROUP_BY) == 0) { + req.params[GROUP_BY] = ""; + } + + if(req.params.count(GROUP_LIMIT) == 0) { + if(req.params[GROUP_BY] != "") { + req.params[GROUP_LIMIT] = "3"; + } else { + req.params[GROUP_LIMIT] = "0"; + } + } + + if(!StringUtils::is_uint32_t(req.params[DROP_TOKENS_THRESHOLD])) { res.set_400("Parameter `" + std::string(DROP_TOKENS_THRESHOLD) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[TYPO_TOKENS_THRESHOLD])) { + if(!StringUtils::is_uint32_t(req.params[TYPO_TOKENS_THRESHOLD])) { res.set_400("Parameter `" + std::string(TYPO_TOKENS_THRESHOLD) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[NUM_TYPOS])) { + if(!StringUtils::is_uint32_t(req.params[NUM_TYPOS])) { res.set_400("Parameter `" + std::string(NUM_TYPOS) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[PER_PAGE])) { + if(!StringUtils::is_uint32_t(req.params[PER_PAGE])) { res.set_400("Parameter `" + std::string(PER_PAGE) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[PAGE])) { + if(!StringUtils::is_uint32_t(req.params[PAGE])) { res.set_400("Parameter `" + 
std::string(PAGE) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[MAX_FACET_VALUES])) { + if(!StringUtils::is_uint32_t(req.params[MAX_FACET_VALUES])) { res.set_400("Parameter `" + std::string(MAX_FACET_VALUES) + "` must be an unsigned integer."); return false; } - if(!StringUtils::is_uint64_t(req.params[SNIPPET_THRESHOLD])) { + if(!StringUtils::is_uint32_t(req.params[SNIPPET_THRESHOLD])) { res.set_400("Parameter `" + std::string(SNIPPET_THRESHOLD) + "` must be an unsigned integer."); return false; } + if(!StringUtils::is_uint32_t(req.params[GROUP_LIMIT])) { + res.set_400("Parameter `" + std::string(GROUP_LIMIT) + "` must be an unsigned integer."); + return false; + } + std::string filter_str = req.params.count(FILTER) != 0 ? req.params[FILTER] : ""; std::vector search_fields; @@ -343,6 +358,9 @@ bool get_search(http_req & req, http_res & res) { spp::sparse_hash_set include_fields(include_fields_vec.begin(), include_fields_vec.end()); spp::sparse_hash_set exclude_fields(exclude_fields_vec.begin(), exclude_fields_vec.end()); + std::vector group_by_fields; + StringUtils::split(req.params[GROUP_BY], group_by_fields, ","); + std::vector sort_fields; if(req.params.count(SORT_BY) != 0) { std::vector sort_field_strs; @@ -367,7 +385,8 @@ bool get_search(http_req & req, http_res & res) { } } - std::map pinned_hits; + std::map> pinned_hits; + if(req.params.count(PINNED_HITS) != 0) { std::vector pinned_hits_strs; StringUtils::split(req.params[PINNED_HITS], pinned_hits_strs, ","); @@ -392,7 +411,7 @@ bool get_search(http_req & req, http_res & res) { return false; } - pinned_hits.emplace(expression_parts[0], position); + pinned_hits[position].emplace_back(expression_parts[0]); } } @@ -422,17 +441,19 @@ bool get_search(http_req & req, http_res & res) { Option result_op = collection->search(req.params[QUERY], search_fields, filter_str, facet_fields, sort_fields, std::stoi(req.params[NUM_TYPOS]), - static_cast(std::stoi(req.params[PER_PAGE])), - static_cast(std::stoi(req.params[PAGE])), + static_cast(std::stol(req.params[PER_PAGE])), + static_cast(std::stol(req.params[PAGE])), token_order, prefix, drop_tokens_threshold, include_fields, exclude_fields, - static_cast(std::stoi(req.params[MAX_FACET_VALUES])), + static_cast(std::stol(req.params[MAX_FACET_VALUES])), req.params[FACET_QUERY], - static_cast(std::stoi(req.params[SNIPPET_THRESHOLD])), + static_cast(std::stol(req.params[SNIPPET_THRESHOLD])), req.params[HIGHLIGHT_FULL_FIELDS], typo_tokens_threshold, pinned_hits, - hidden_hits + hidden_hits, + group_by_fields, + static_cast(std::stol(req.params[GROUP_LIMIT])) ); uint64_t timeMillis = std::chrono::duration_cast( @@ -873,7 +894,7 @@ bool post_create_key(http_req &req, http_res &res) { return false; } - const std::string &rand_key = StringUtils::randstring(AuthManager::KEY_LEN, req.seed); + const std::string &rand_key = req.metadata; api_key_t api_key( rand_key, diff --git a/src/http_server.cpp b/src/http_server.cpp index 37b6aae0..2b8b6dc1 100644 --- a/src/http_server.cpp +++ b/src/http_server.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "raft_server.h" #include "logger.h" @@ -186,20 +187,6 @@ std::string HttpServer::get_version() { void HttpServer::clear_timeouts(const std::vector & timers, bool trigger_callback) { for(h2o_timer_t* timer: timers) { h2o_timer_unlink(timer); - /*while (!h2o_linklist_is_empty(&timer->_link)) { - h2o_timer_t *entry = H2O_STRUCT_FROM_MEMBER(h2o_timer_t, _link, timer->_link.next); - if(entry == nullptr) { - 
continue; - } - - if(trigger_callback) { - entry->cb(entry); - } - - //entry->expire_at = 0; - h2o_linklist_unlink(&entry->_link); - h2o_timer_unlink(timer); - }*/ } } @@ -385,6 +372,12 @@ int HttpServer::catch_all_handler(h2o_handler_t *_self, h2o_req_t *req) { } // routes match and is an authenticated request + // do any additional pre-request middleware operations here + if(rpath->action == "keys:create") { + // we enrich incoming request with a random API key here so that leader and replicas will use the same key + request->metadata = StringUtils::randstring(AuthManager::KEY_LEN); + } + // for writes, we defer to replication_state if(http_method != "GET") { self->http_server->get_replication_state()->write(request, response); @@ -476,26 +469,13 @@ void HttpServer::on(const std::string & message, bool (*handler)(void*)) { HttpServer::~HttpServer() { delete message_dispatcher; - // remove all timeouts defined in: https://github.com/h2o/h2o/blob/v2.2.2/lib/core/context.c#L142 - /*std::vector timeouts = { - &ctx.zero_timeout, - &ctx.one_sec_timeout, - &ctx.hundred_ms_timeout, - &ctx.handshake_timeout, - &ctx.http1.req_timeout, - &ctx.http2.idle_timeout, - &ctx.http2.graceful_shutdown_timeout, - &ctx.proxy.io_timeout - }; - - clear_timeouts(timeouts); - */ - if(ssl_refresh_timer.timer.expire_at != 0) { // avoid callback since it recreates timeout clear_timeouts({&ssl_refresh_timer.timer}, false); } + h2o_timerwheel_run(ctx.loop->_timeouts, 9999999999999); + h2o_context_dispose(&ctx); free(ctx.globalconf->server_name.base); diff --git a/src/index.cpp b/src/index.cpp index 109452f2..66b0a299 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -633,7 +633,7 @@ void Index::compute_facet_stats(facet &a_facet, int64_t raw_value, const std::st void Index::do_facets(std::vector & facets, facet_query_t & facet_query, const uint32_t* result_ids, size_t results_size) { - std::map facet_to_index; + std::unordered_map facet_to_index; size_t i_facet = 0; for(const auto & facet: facet_schema) { @@ -661,6 +661,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, std::vector query_tokens; StringUtils::split(facet_query.query, query_tokens, " "); + // for non-string fields, `faceted_name` returns their aliased stringified field name art_tree *t = search_index.at(facet_field.faceted_name()); for(size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) { @@ -703,7 +704,7 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, const std::vector & fhashes = facet_index_v2[doc_seq_id][facet_id]; int array_pos = 0; - int fvalue_found = 0; + bool fvalue_found = false; std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens) spp::sparse_hash_map query_token_positions; size_t field_token_index = -1; @@ -717,9 +718,9 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, // ftoken_hash is the raw value for numeric fields compute_facet_stats(a_facet, ftoken_hash, facet_field.type); + // not using facet query or this particular facet value is found in facet filter if(!use_facet_query || fhash_qtoken_pos.find(ftoken_hash) != fhash_qtoken_pos.end()) { - // not using facet query or this particular facet value is found in facet filter - fvalue_found |= 1; // bitwise to ensure only one count for a multi-token facet value + fvalue_found = true; if(use_facet_query) { // map token index to query index (used for highlighting later on) @@ -736,41 +737,35 @@ void Index::do_facets(std::vector & facets, facet_query_t & 
facet_query, } } - //std::cout << "j: " << j << std::endl; - // 0 indicates separator, while the second condition checks for non-array string if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && j == fhashes.size() - 1)) { - if(!use_facet_query || fvalue_found != 0) { + if(!use_facet_query || fvalue_found) { const std::string & fvalue_str = fvaluestream.str(); - uint64_t fhash = 0; - if(facet_field.is_string()) { - fhash = facet_token_hash(facet_field, fvalue_str); - } else { - fhash = std::atoi(fvalue_str.c_str()); - } + uint64_t fhash = facet_token_hash(facet_field, fvalue_str); if(a_facet.result_map.count(fhash) == 0) { - a_facet.result_map[fhash] = facet_count_t{0, doc_seq_id, 0, + a_facet.result_map[fhash] = facet_count_t{0, spp::sparse_hash_set(), + doc_seq_id, 0, spp::sparse_hash_map()}; } - a_facet.result_map[fhash].count += 1; a_facet.result_map[fhash].doc_id = doc_seq_id; a_facet.result_map[fhash].array_pos = array_pos; + if(search_params->group_limit) { + uint64_t distinct_id = get_distinct_id(facet_to_index, doc_seq_id); + a_facet.result_map[fhash].groups.emplace(distinct_id); + } else { + a_facet.result_map[fhash].count += 1; + } + if(use_facet_query) { a_facet.result_map[fhash].query_token_pos = query_token_positions; - - /*if(j == 11) { - for(auto xx: query_token_positions) { - std::cout << xx.first << " -> " << xx.second.pos << " , " << xx.second.cost << std::endl; - } - }*/ } } array_pos++; - fvalue_found = 0; + fvalue_found = false; std::stringstream().swap(fvaluestream); spp::sparse_hash_map().swap(query_token_positions); field_token_index = -1; @@ -781,56 +776,12 @@ void Index::do_facets(std::vector & facets, facet_query_t & facet_query, } } -void Index::drop_facets(std::vector & facets, const std::vector & ids) { - std::map facet_to_index; - - size_t i_facet = 0; - for(const auto & facet: facet_schema) { - facet_to_index[facet.first] = i_facet; - i_facet++; - } - - for(auto & a_facet: facets) { - const field & facet_field = facet_schema.at(a_facet.field_name); - - // assumed that facet fields have already been validated upstream - for(const uint32_t doc_seq_id: ids) { - if(facet_index_v2.count(doc_seq_id) != 0) { - // FORMAT OF VALUES - // String: h1 h2 h3 - // String array: h1 h2 h3 0 h1 0 h1 h2 0 - const std::vector & fhashes = facet_index_v2[doc_seq_id][facet_to_index[a_facet.field_name]]; - std::stringstream fvaluestream; // for hashing the entire facet value (multiple tokens) - - for(size_t j = 0; j < fhashes.size(); j++) { - if(fhashes[j] != FACET_ARRAY_DELIMETER) { - int64_t ftoken_hash = fhashes[j]; - fvaluestream << ftoken_hash; - } - - if(fhashes[j] == FACET_ARRAY_DELIMETER || (fhashes.back() != FACET_ARRAY_DELIMETER && - j == fhashes.size() - 1)) { - const std::string & fvalue_str = fvaluestream.str(); - uint64_t fhash = facet_token_hash(facet_field, fvalue_str); - - if(a_facet.result_map.count(fhash) != 0) { - a_facet.result_map[fhash].count -= 1; - if(a_facet.result_map[fhash].count == 0) { - a_facet.result_map.erase(fhash); - } - } - std::stringstream().swap(fvaluestream); - } - } - } - } - } -} - void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, const std::vector & sort_fields, - std::vector & token_candidates_vec, const token_ordering token_order, - std::vector> & searched_queries, Topster & topster, + std::vector & token_candidates_vec, + std::vector> & searched_queries, Topster* topster, + spp::sparse_hash_set& groups_processed, 
uint32_t** all_result_ids, size_t & all_result_ids_len, const size_t typo_tokens_threshold) { const long long combination_limit = 10; @@ -874,6 +825,15 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si continue; } + if(!curated_ids.empty()) { + uint32_t *excluded_result_ids = nullptr; + result_size = ArrayUtils::exclude_scalar(result_ids, result_size, &curated_ids[0], + curated_ids.size(), &excluded_result_ids); + + delete [] result_ids; + result_ids = excluded_result_ids; + } + if(filter_ids != nullptr) { // intersect once again with filter ids uint32_t* filtered_result_ids = nullptr; @@ -888,7 +848,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si // go through each matching document id and calculate match score score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion, - filtered_result_ids, filtered_results_size); + groups_processed, filtered_result_ids, filtered_results_size); delete[] filtered_result_ids; delete[] result_ids; @@ -900,7 +860,7 @@ void Index::search_candidates(const uint8_t & field_id, uint32_t* filter_ids, si *all_result_ids = new_all_result_ids; score_results(sort_fields, (uint16_t) searched_queries.size(), field_id, total_cost, topster, query_suggestion, - result_ids, result_size); + groups_processed, result_ids, result_size); delete[] result_ids; } @@ -1056,13 +1016,17 @@ void Index::run_search() { } // after the wait, we own the lock. - search(search_params.outcome, search_params.query, search_params.search_fields, - search_params.filters, search_params.facets, search_params.facet_query, search_params.included_ids, - search_params.excluded_ids, search_params.sort_fields_std, search_params.num_typos, - search_params.max_hits, search_params.per_page, search_params.page, search_params.token_order, - search_params.prefix, search_params.drop_tokens_threshold, search_params.raw_result_kvs, - search_params.all_result_ids_len, search_params.searched_queries, search_params.override_result_kvs, - search_params.typo_tokens_threshold); + search(search_params->outcome, search_params->query, search_params->search_fields, + search_params->filters, search_params->facets, search_params->facet_query, + search_params->included_ids, search_params->excluded_ids, + search_params->sort_fields_std, search_params->num_typos, + search_params->topster, search_params->curated_topster, + search_params->per_page, search_params->page, search_params->token_order, + search_params->prefix, search_params->drop_tokens_threshold, + search_params->all_result_ids_len, search_params->groups_processed, + search_params->searched_queries, + search_params->raw_result_kvs, search_params->override_result_kvs, + search_params->typo_tokens_threshold); // hand control back to main thread processed = true; @@ -1074,12 +1038,12 @@ void Index::run_search() { } } -void Index::collate_curated_ids(const std::string & query, const std::string & field, const uint8_t field_id, - const std::vector & included_ids, - Topster & curated_topster, - std::vector> & searched_queries) { +void Index::collate_included_ids(const std::string & query, const std::string & field, const uint8_t field_id, + const std::map> & included_ids_map, + Topster* curated_topster, + std::vector> & searched_queries) { - if(included_ids.size() == 0) { + if(included_ids_map.empty()) { return; } @@ -1098,48 +1062,34 @@ void Index::collate_curated_ids(const std::string & query, const std::string & f 
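`ArrayUtils::exclude_scalar`'s body is not part of this diff, but the call sites above imply a set difference over two sorted id arrays: curated (pinned or hidden) ids are removed from the organic candidate ids before scoring, so a curated document cannot also surface organically. A sketch of those assumed semantics, with vectors standing in for the raw arrays:

```
#include <cstddef>
#include <cstdint>
#include <vector>

// Set difference over two sorted id arrays: everything in `src` that is
// not in `excluded`. The real ArrayUtils::exclude_scalar operates on raw
// C arrays and returns the new length; this only mirrors the idea.
std::vector<uint32_t> exclude_sorted(const std::vector<uint32_t>& src,
                                     const std::vector<uint32_t>& excluded) {
    std::vector<uint32_t> out;
    size_t j = 0;
    for (size_t i = 0; i < src.size(); i++) {
        while (j < excluded.size() && excluded[j] < src[i]) {
            j++;
        }
        if (j == excluded.size() || excluded[j] != src[i]) {
            out.push_back(src[i]); // keep ids that are not curated
        }
    }
    return out;
}
```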
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len, 0, 0, 1, token_ordering::MAX_SCORE, false, leaves); - if(leaves.size() > 0) { + if(!leaves.empty()) { override_query.push_back(leaves[0]); } } - spp::sparse_hash_map leaf_to_indices; + for(const auto& pos_ids: included_ids_map) { + const size_t outer_pos = pos_ids.first; - for (art_leaf *token_leaf : override_query) { - uint32_t *indices = new uint32_t[included_ids.size()]; - token_leaf->values->ids.indexOf(&included_ids[0], included_ids.size(), indices); - leaf_to_indices.emplace(token_leaf, indices); - } + for(const auto& index_seq_id: pos_ids.second) { + uint32_t inner_pos = index_seq_id.first; + uint32_t seq_id = index_seq_id.second; - for(size_t j=0; j>> array_token_positions; - populate_token_positions(override_query, leaf_to_indices, j, array_token_positions); + // LOG(INFO) << "seq_id: " << seq_id << " - " << match_score; - uint64_t match_score = 0; + int64_t scores[3]; + scores[0] = match_score; + scores[1] = int64_t(1); + scores[2] = int64_t(1); - for(const std::vector> & token_positions: array_token_positions) { - if(token_positions.empty()) { - continue; - } - const Match & match = Match::match(seq_id, token_positions); - uint64_t this_match_score = match.get_match_score(0, field_id); - - if(this_match_score > match_score) { - match_score = this_match_score; - } + KV kv(field_id, searched_queries.size(), seq_id, distinct_id, match_score, scores); + curated_topster->add(&kv); } - - int64_t scores[3]; - scores[0] = int64_t(match_score); - scores[1] = int64_t(1); - scores[2] = int64_t(1); - - curated_topster.add(seq_id, field_id, searched_queries.size(), match_score, scores); - - searched_queries.push_back(override_query); } + + searched_queries.push_back(override_query); } void Index::search(Option & outcome, @@ -1147,19 +1097,20 @@ void Index::search(Option & outcome, const std::vector & search_fields, const std::vector & filters, std::vector & facets, facet_query_t & facet_query, - const std::vector & included_ids, + const std::map> & included_ids_map, const std::vector & excluded_ids, - const std::vector & sort_fields_std, const int num_typos, const size_t max_hits, + const std::vector & sort_fields_std, const int num_typos, + Topster* topster, + Topster* curated_topster, const size_t per_page, const size_t page, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, - std::vector & raw_result_kvs, size_t & all_result_ids_len, - std::vector> & searched_queries, - std::vector & override_result_kvs, + spp::sparse_hash_set& groups_processed, + std::vector>& searched_queries, + std::vector> & raw_result_kvs, + std::vector> & override_result_kvs, const size_t typo_tokens_threshold) { - const size_t num_results = (page * per_page); - // process the filters uint32_t* filter_ids = nullptr; @@ -1169,25 +1120,48 @@ void Index::search(Option & outcome, return ; } - const uint32_t filter_ids_length = op_filter_ids_length.get(); + uint32_t filter_ids_length = op_filter_ids_length.get(); + + // we will be removing all curated IDs from organic result ids before running topster + std::set curated_ids; + std::vector included_ids; + + for(const auto& outer_pos_ids: included_ids_map) { + for(const auto& inner_pos_seq_id: outer_pos_ids.second) { + curated_ids.insert(inner_pos_seq_id.second); + included_ids.push_back(inner_pos_seq_id.second); + } + } + + curated_ids.insert(excluded_ids.begin(), excluded_ids.end()); + + std::vector curated_ids_sorted(curated_ids.begin(), 
curated_ids.end()); + std::sort(curated_ids_sorted.begin(), curated_ids_sorted.end()); // Order of `fields` are used to sort results //auto begin = std::chrono::high_resolution_clock::now(); uint32_t* all_result_ids = nullptr; - const size_t topster_size = std::max((size_t)1, max_hits); // needs to be atleast 1 since scoring is mandatory - Topster topster(topster_size); - Topster curated_topster(topster_size); - if(query == "*") { const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - 0); const std::string & field = search_fields[0]; + if(!curated_ids.empty()) { + uint32_t *excluded_result_ids = nullptr; + filter_ids_length = ArrayUtils::exclude_scalar(filter_ids, filter_ids_length, &curated_ids_sorted[0], + curated_ids.size(), &excluded_result_ids); + + delete [] filter_ids; + filter_ids = excluded_result_ids; + } + score_results(sort_fields_std, (uint16_t) searched_queries.size(), field_id, 0, topster, {}, - filter_ids, filter_ids_length); - collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); - do_facets(facets, facet_query, filter_ids, filter_ids_length); + groups_processed, filter_ids, filter_ids_length); + collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries); + all_result_ids_len = filter_ids_length; + all_result_ids = filter_ids; + filter_ids = nullptr; } else { const size_t num_search_fields = std::min(search_fields.size(), (size_t) FIELD_LIMIT_NUM); for(size_t i = 0; i < num_search_fields; i++) { @@ -1196,47 +1170,22 @@ void Index::search(Option & outcome, const uint8_t field_id = (uint8_t)(FIELD_LIMIT_NUM - i); // Order of `fields` are used to sort results const std::string & field = search_fields[i]; - search_field(field_id, query, field, filter_ids, filter_ids_length, facets, sort_fields_std, - num_typos, searched_queries, topster, &all_result_ids, all_result_ids_len, + search_field(field_id, query, field, filter_ids, filter_ids_length, curated_ids_sorted, facets, sort_fields_std, + num_typos, searched_queries, topster, groups_processed, &all_result_ids, all_result_ids_len, token_order, prefix, drop_tokens_threshold, typo_tokens_threshold); - collate_curated_ids(query, field, field_id, included_ids, curated_topster, searched_queries); + collate_included_ids(query, field, field_id, included_ids_map, curated_topster, searched_queries); } } - do_facets(facets, facet_query, all_result_ids, all_result_ids_len); } + do_facets(facets, facet_query, all_result_ids, all_result_ids_len); do_facets(facets, facet_query, &included_ids[0], included_ids.size()); // must be sorted before iterated upon to remove "empty" array entries - topster.sort(); - curated_topster.sort(); + topster->sort(); + curated_topster->sort(); - std::set ids_to_remove(included_ids.begin(), included_ids.end()); - ids_to_remove.insert(excluded_ids.begin(), excluded_ids.end()); - - std::vector dropped_ids; - - // loop through topster and remove elements from included and excluded id lists - for(uint32_t t = 0; t < topster.size && t < num_results; t++) { - KV* kv = topster.getKV(t); - - if(ids_to_remove.count(kv->key) != 0) { - dropped_ids.push_back((uint32_t)kv->key); - } else { - raw_result_kvs.push_back(*kv); - } - } - - for(uint32_t t = 0; t < curated_topster.size && t < num_results; t++) { - KV* kv = curated_topster.getKV(t); - override_result_kvs.push_back(*kv); - } - - // for the ids that are dropped, remove their corresponding facet components from facet results - drop_facets(facets, dropped_ids); - - all_result_ids_len -= 
dropped_ids.size(); - all_result_ids_len += curated_topster.size; + all_result_ids_len += curated_topster->size; delete [] filter_ids; delete [] all_result_ids; @@ -1258,9 +1207,11 @@ void Index::search(Option & outcome, */ void Index::search_field(const uint8_t & field_id, const std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length, + const std::vector& curated_ids, std::vector & facets, const std::vector & sort_fields, const int num_typos, std::vector> & searched_queries, - Topster & topster, uint32_t** all_result_ids, size_t & all_result_ids_len, + Topster* topster, spp::sparse_hash_set& groups_processed, + uint32_t** all_result_ids, size_t & all_result_ids_len, const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold, const size_t typo_tokens_threshold) { std::vector tokens; @@ -1373,8 +1324,8 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co if(!token_candidates_vec.empty() && token_candidates_vec.size() == tokens.size()) { // If all tokens were found, go ahead and search for candidates with what we have so far - search_candidates(field_id, filter_ids, filter_ids_length, sort_fields, token_candidates_vec, - token_order, searched_queries, topster, all_result_ids, all_result_ids_len, + search_candidates(field_id, filter_ids, filter_ids_length, curated_ids, sort_fields, token_candidates_vec, + searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, typo_tokens_threshold); } @@ -1407,8 +1358,9 @@ void Index::search_field(const uint8_t & field_id, const std::string & query, co truncated_query += " " + token_count_pairs.at(i).first; } - return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos, - searched_queries, topster, all_result_ids, all_result_ids_len, + return search_field(field_id, truncated_query, field, filter_ids, filter_ids_length, curated_ids, + facets, sort_fields, num_typos, + searched_queries, topster, groups_processed, all_result_ids, all_result_ids_len, token_order, prefix); } } @@ -1434,8 +1386,9 @@ void Index::log_leaves(const int cost, const std::string &token, const std::vect } void Index::score_results(const std::vector & sort_fields, const uint16_t & query_index, - const uint8_t & field_id, const uint32_t total_cost, Topster & topster, + const uint8_t & field_id, const uint32_t total_cost, Topster* topster, const std::vector &query_suggestion, + spp::sparse_hash_set& groups_processed, const uint32_t *result_ids, const size_t result_size) const { spp::sparse_hash_map leaf_to_indices; @@ -1467,6 +1420,16 @@ void Index::score_results(const std::vector & sort_fields, const uint16 Match single_token_match = Match(1, 0, 0, empty_offset_diffs); const uint64_t single_token_match_score = single_token_match.get_match_score(total_cost, field_id); + std::unordered_map facet_to_id; + + if(search_params->group_limit > 0) { + size_t i_facet = 0; + for(const auto & facet: facet_schema) { + facet_to_id[facet.first] = i_facet; + i_facet++; + } + } + for(size_t i=0; i & sort_fields, const uint16 } const int64_t default_score = 0; - int64_t scores[3]; + int64_t scores[3] = {0}; // avoiding loop if(sort_fields.size() > 0) { @@ -1541,7 +1504,15 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } } - topster.add(seq_id, field_id, query_index, match_score, scores); + uint64_t distinct_id = seq_id; + + if(search_params->group_limit != 0) { + distinct_id = 
get_distinct_id(facet_to_id, seq_id); + groups_processed.emplace(distinct_id); + } + + KV kv(field_id, query_index, seq_id, distinct_id, match_score, scores); + topster->add(&kv); } //long long int timeNanos = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); @@ -1553,6 +1524,26 @@ void Index::score_results(const std::vector & sort_fields, const uint16 } } +uint64_t Index::get_distinct_id(const std::unordered_map &facet_to_id, + const uint32_t seq_id) const { + uint64_t distinct_id = 1; // some constant initial value + + // calculate hash from group_by_fields + for(const auto& field: search_params->group_by_fields) { + if(facet_to_id.count(field) == 0 || facet_index_v2.count(seq_id) == 0) { + continue; + } + + size_t facet_id = facet_to_id.at(field); + const std::vector& fhashes = facet_index_v2.at(seq_id)[facet_id]; + + for(const auto& hash: fhashes) { + distinct_id = hash_combine(distinct_id, hash); + } + } + return distinct_id; +} + void Index::populate_token_positions(const std::vector &query_suggestion, spp::sparse_hash_map &leaf_to_indices, size_t result_index, diff --git a/src/main/benchmark.cpp b/src/main/benchmark.cpp index 793eb00a..3000c26e 100644 --- a/src/main/benchmark.cpp +++ b/src/main/benchmark.cpp @@ -46,7 +46,7 @@ void benchmark_hn_titles(char* file_path) { Store *store = new Store("/tmp/typesense-data"); CollectionManager & collectionManager = CollectionManager::get_instance(); - collectionManager.init(store, 4, "abcd", "1234"); + collectionManager.init(store, 4, "abcd"); collectionManager.load(); Collection *collection = collectionManager.get_collection("hnstories_direct"); @@ -116,7 +116,7 @@ void benchmark_reactjs_pages(char* file_path) { Store *store = new Store("/tmp/typesense-data"); CollectionManager & collectionManager = CollectionManager::get_instance(); - collectionManager.init(store, 4, "abcd", "1234"); + collectionManager.init(store, 4, "abcd"); collectionManager.load(); Collection *collection = collectionManager.get_collection("reactjs_pages"); diff --git a/src/main/main.cpp b/src/main/main.cpp index 80606fe1..fc5ad311 100644 --- a/src/main/main.cpp +++ b/src/main/main.cpp @@ -21,7 +21,7 @@ int main(int argc, char* argv[]) { Store *store = new Store(state_dir_path); CollectionManager & collectionManager = CollectionManager::get_instance(); - collectionManager.init(store, 4, "abcd", "123"); + collectionManager.init(store, 4, "abcd"); collectionManager.load(); std::vector fields_to_index = { diff --git a/src/main/typesense_server.cpp b/src/main/typesense_server.cpp index b76942e3..fd9ed722 100644 --- a/src/main/typesense_server.cpp +++ b/src/main/typesense_server.cpp @@ -1,6 +1,6 @@ +#include "typesense_server_utils.h" #include "core_api.h" #include "config.h" -#include "typesense_server_utils.h" void master_server_routes() { // collection management @@ -55,13 +55,7 @@ void replica_server_routes() { server->get("/health", get_health); } -namespace logging { - DECLARE_bool(log_year); -} - int main(int argc, char **argv) { - logging::FLAGS_log_year = true; - Config config; cmdline::parser options; diff --git a/src/raft_server.cpp b/src/raft_server.cpp index 70a0b472..73bd1a63 100644 --- a/src/raft_server.cpp +++ b/src/raft_server.cpp @@ -34,6 +34,7 @@ int ReplicationState::start(const butil::EndPoint & peering_endpoint, const int node_options.fsm = this; node_options.node_owns_fsm = false; node_options.snapshot_interval_s = snapshot_interval_s; + node_options.filter_before_copy_remote = false; std::string prefix = "local://" 
+ raft_dir; node_options.log_uri = prefix + "/" + log_dir_name; node_options.raft_meta_uri = prefix + "/" + meta_dir_name; @@ -297,7 +298,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) { return -1; } - LOG(TRACE) << "rm " << store->get_state_dir_path() << " success"; + LOG(INFO) << "rm " << store->get_state_dir_path() << " success"; std::string snapshot_path = reader->get_path(); snapshot_path.append(std::string("/") + db_snapshot_name); @@ -308,7 +309,7 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) { return -1; } - LOG(TRACE) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success"; + LOG(INFO) << "copy snapshot " << snapshot_path << " to " << store->get_state_dir_path() << " success"; return init_db(); } diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp index 228f6ba8..60f0a962 100644 --- a/src/sorted_array.cpp +++ b/src/sorted_array.cpp @@ -6,7 +6,7 @@ void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_lengt max = array_length > 1 ? sorted_array[array_length-1] : min; uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR); - uint8_t *out = new uint8_t[size_required]; + uint8_t *out = (uint8_t *) malloc(size_required * sizeof *out); uint32_t actual_size = for_compress_sorted(sorted_array, out, array_length); free(in); diff --git a/src/string_utils.cpp b/src/string_utils.cpp index 2782c2b6..041f7f07 100644 --- a/src/string_utils.cpp +++ b/src/string_utils.cpp @@ -1,8 +1,8 @@ #include "string_utils.h" #include #include -#include #include +#include std::string lower_and_no_special_chars(const std::string & str) { std::stringstream ss; @@ -19,6 +19,10 @@ std::string lower_and_no_special_chars(const std::string & str) { } void StringUtils::unicode_normalize(std::string & str) const { + if(str.empty()) { + return ; + } + std::stringstream out; for (char *s = &str[0]; *s;) { @@ -49,16 +53,16 @@ void StringUtils::unicode_normalize(std::string & str) const { } } - str.assign(lower_and_no_special_chars(out.str())); + str = lower_and_no_special_chars(out.str()); } -std::string StringUtils::randstring(size_t length, uint64_t seed) { +std::string StringUtils::randstring(size_t length) { static auto& chrs = "0123456789" "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - thread_local static std::mt19937 rg(seed); - thread_local static std::uniform_int_distribution pick(0, sizeof(chrs) - 2); + thread_local std::mt19937 rg(std::random_device{}()); + thread_local std::uniform_int_distribution pick(0, sizeof(chrs) - 2); std::string s; s.reserve(length); diff --git a/src/system_metrics.cpp b/src/system_metrics.cpp index a0c6d01e..eb79c42b 100644 --- a/src/system_metrics.cpp +++ b/src/system_metrics.cpp @@ -27,9 +27,9 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result rusage r_usage; getrusage(RUSAGE_SELF, &r_usage); - result["memory_used_process_bytes"] = r_usage.ru_maxrss * 1000; + result["typesense_memory_used_bytes"] = r_usage.ru_maxrss * 1000; - uint64_t memory_free_bytes = 0; + uint64_t memory_available_bytes = 0; uint64_t memory_total_bytes = 0; #ifdef __APPLE__ @@ -42,7 +42,7 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result if (KERN_SUCCESS == host_page_size(mach_port, &mach_page_size) && KERN_SUCCESS == host_statistics64(mach_port, HOST_VM_INFO, (host_info64_t)&vm_stats, &count)) { - memory_free_bytes = (int64_t)(vm_stats.free_count) * 
(int64_t)mach_page_size; + memory_available_bytes = (int64_t)(vm_stats.free_count) * (int64_t)mach_page_size; } uint64_t pages = sysconf(_SC_PHYS_PAGES); @@ -51,11 +51,11 @@ void SystemMetrics::get(const std::string &data_dir_path, nlohmann::json &result #elif __linux__ struct sysinfo sys_info; sysinfo(&sys_info); - memory_free_bytes = sys_info.freeram; + memory_available_bytes = linux_get_mem_available_bytes(); memory_total_bytes = sys_info.totalram; #endif - result["memory_free_bytes"] = memory_free_bytes; + result["memory_available_bytes"] = memory_available_bytes; result["memory_total_bytes"] = memory_total_bytes; // CPU METRICS diff --git a/src/typesense_server_utils.cpp b/src/typesense_server_utils.cpp index 1a940518..9a56416f 100644 --- a/src/typesense_server_utils.cpp +++ b/src/typesense_server_utils.cpp @@ -133,35 +133,45 @@ int init_logger(Config & config, const std::string & server_version) { signal(SIGILL, catch_crash); signal(SIGSEGV, catch_crash); - logging::LoggingSettings log_settings; - // we can install new signal handlers only after overriding above signal(SIGINT, catch_interrupt); signal(SIGTERM, catch_interrupt); + google::InitGoogleLogging("typesense"); + std::string log_dir = config.get_log_dir(); - std::string log_path; if(log_dir.empty()) { // use console logger if log dir is not specified - log_settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG; + FLAGS_logtostderr = true; } else { if(!directory_exists(log_dir)) { std::cerr << "Typesense failed to start. " << "Log directory " << log_dir << " does not exist."; return 1; } - log_settings.logging_dest = logging::LOG_TO_FILE; - log_path = log_dir + "/" + "typesense.log"; - log_settings.log_file = log_path.c_str(); + // flush log levels above -1 immediately (INFO=0) + FLAGS_logbuflevel = -1; - LOG(INFO) << "Starting Typesense " << server_version << ". 
Log directory is configured as: " - << log_dir << std::endl; + // available only on glog master (ensures that log file name is constant) + FLAGS_timestamp_in_logfile_name = false; + + std::string log_path = log_dir + "/" + "typesense.log"; + + // will log levels INFO **and above** to the given log file + google::SetLogDestination(google::INFO, log_path.c_str()); + + // don't create symlink for INFO log + google::SetLogSymlink(google::INFO, ""); + + // don't create separate log files for each level + google::SetLogDestination(google::WARNING, ""); + google::SetLogDestination(google::ERROR, ""); + google::SetLogDestination(google::FATAL, ""); + + std::cout << "Log directory is configured as: " << log_dir << std::endl; } - logging::InitLogging(log_settings); - logging::SetMinLogLevel(0); - return 0; } @@ -275,7 +285,6 @@ int start_raft_server(ReplicationState& replication_state, const std::string& st } int run_server(const Config & config, const std::string & version, void (*master_server_routes)()) { - LOG(INFO) << "Starting Typesense " << version << std::flush; quit_raft_service = false; diff --git a/test/collection_faceting_test.cpp b/test/collection_faceting_test.cpp index 5db5bac6..2df584c4 100644 --- a/test/collection_faceting_test.cpp +++ b/test/collection_faceting_test.cpp @@ -15,7 +15,7 @@ protected: std::vector sort_fields; void setupCollection() { - std::string state_dir_path = "/tmp/typesense_test/collection_sorting"; + std::string state_dir_path = "/tmp/typesense_test/collection_faceting"; LOG(INFO) << "Truncating and creating: " << state_dir_path; system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); @@ -29,6 +29,7 @@ protected: } virtual void TearDown() { + collectionManager.dispose(); delete store; } }; diff --git a/test/collection_grouping_test.cpp b/test/collection_grouping_test.cpp new file mode 100644 index 00000000..9d02e0a8 --- /dev/null +++ b/test/collection_grouping_test.cpp @@ -0,0 +1,357 @@ +#include +#include +#include +#include +#include +#include +#include "collection.h" + +class CollectionGroupingTest : public ::testing::Test { +protected: + Store *store; + CollectionManager & collectionManager = CollectionManager::get_instance(); + Collection *coll_group; + + void setupCollection() { + std::string state_dir_path = "/tmp/typesense_test/collection_grouping"; + LOG(INFO) << "Truncating and creating: " << state_dir_path; + system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); + + store = new Store(state_dir_path); + collectionManager.init(store, 4, "auth_key"); + collectionManager.load(); + + std::vector fields = { + field("title", field_types::STRING, false), + field("brand", field_types::STRING, true, true), + field("size", field_types::INT32, true, false), + field("colors", field_types::STRING_ARRAY, true, false), + field("rating", field_types::FLOAT, true, false) + }; + + coll_group = collectionManager.get_collection("coll_group"); + if(coll_group == nullptr) { + coll_group = collectionManager.create_collection("coll_group", fields, "rating").get(); + } + + std::ifstream infile(std::string(ROOT_DIR)+"test/group_documents.jsonl"); + + std::string json_line; + + while (std::getline(infile, json_line)) { + auto add_op = coll_group->add(json_line); + if(!add_op.ok()) { + std::cout << add_op.error() << std::endl; + } + ASSERT_TRUE(add_op.ok()); + } + + infile.close(); + } + + virtual void SetUp() { + setupCollection(); + } + + virtual void TearDown() { + collectionManager.dispose(); + delete store; + } +}; + 
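The grouping tests that follow rely on `Index::get_distinct_id` above, which folds the facet hashes of each `group_by` field into one 64-bit group key via `hash_combine`. That helper's definition is not shown in this diff; a conventional boost-style combiner consistent with its usage would look like this (an assumption, not necessarily the project's actual implementation):

```
#include <cstdint>
#include <iostream>
#include <vector>

// Boost-style 64-bit combiner; an assumption, since hash_combine's body
// is not part of this diff.
uint64_t hash_combine(uint64_t seed, uint64_t value) {
    return seed ^ (value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2));
}

int main() {
    uint64_t distinct_id = 1; // same constant initial value as get_distinct_id
    std::vector<uint64_t> fhashes = {42, 7}; // hypothetical facet hashes of the group_by fields
    for (uint64_t fhash : fhashes) {
        distinct_id = hash_combine(distinct_id, fhash);
    }
    // documents producing the same sequence of hashes land in the same group
    std::cout << distinct_id << std::endl;
}
```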
+TEST_F(CollectionGroupingTest, GroupingBasics) { + // group by size (int32) + auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"size"}, 2).get(); + + ASSERT_EQ(3, res["found"].get()); + ASSERT_EQ(3, res["grouped_hits"].size()); + ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get()); + + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + ASSERT_EQ(11, res["grouped_hits"][0]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("1", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("2", res["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get().c_str()); + + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); + ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); + + // group by rating (float) and sort by size + std::vector sort_size = {sort_by("size", "DESC")}; + res = coll_group->search("*", {}, "", {"brand"}, sort_size, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "brand: omeg", 30, + "", 10, + {}, {}, {"rating"}, 2).get(); + + // 7 unique ratings + ASSERT_EQ(7, res["found"].get()); + ASSERT_EQ(7, res["grouped_hits"].size()); + ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["group_key"][0].get()); + + ASSERT_EQ(12, res["grouped_hits"][0]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("8", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.4, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + + ASSERT_EQ(12, res["grouped_hits"][1]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("6", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get()); + + ASSERT_EQ(11, res["grouped_hits"][1]["hits"][1]["document"]["size"].get()); + ASSERT_STREQ("1", res["grouped_hits"][1]["hits"][1]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.3, res["grouped_hits"][1]["hits"][1]["document"]["rating"].get()); + 
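As the assertions above and below illustrate, when `group_limit` is set, `found` counts groups rather than documents, and each `grouped_hits[i]["hits"]` array holds at most `group_limit` of that group's top-ranked documents. A toy sketch of that bucketing, independent of the real per-group topsters:

```
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct Hit { uint64_t group_key; float score; };

// Toy version of grouped collection: bucket hits by group key and keep
// only the `group_limit` best-scoring hits per bucket.
std::map<uint64_t, std::vector<Hit>> group_hits(std::vector<Hit> hits, size_t group_limit) {
    std::sort(hits.begin(), hits.end(),
              [](const Hit& a, const Hit& b) { return a.score > b.score; });

    std::map<uint64_t, std::vector<Hit>> grouped;
    for (const Hit& hit : hits) {
        auto& bucket = grouped[hit.group_key];
        if (bucket.size() < group_limit) {
            bucket.push_back(hit);
        }
    }
    return grouped; // found == grouped.size(), i.e. the number of groups
}
```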
+ ASSERT_EQ(10, res["grouped_hits"][5]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("9", res["grouped_hits"][5]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.1, res["grouped_hits"][5]["hits"][0]["document"]["rating"].get()); + + ASSERT_EQ(10, res["grouped_hits"][6]["hits"][0]["document"]["size"].get()); + ASSERT_STREQ("0", res["grouped_hits"][6]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][6]["hits"][0]["document"]["rating"].get()); + + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["highlighted"].get().c_str()); +} + +TEST_F(CollectionGroupingTest, GroupingCompoundKey) { + // group by size+brand (int32, string) + auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"size", "brand"}, 2).get(); + + ASSERT_EQ(10, res["found"].get()); + ASSERT_EQ(10, res["grouped_hits"].size()); + ASSERT_EQ(11, res["grouped_hits"][0]["group_key"][0].get()); + + ASSERT_STREQ("Beta", res["grouped_hits"][0]["group_key"][1].get().c_str()); + + // optional field should have no value in the group key component + ASSERT_EQ(1, res["grouped_hits"][5]["group_key"].size()); + ASSERT_STREQ("10", res["grouped_hits"][5]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("11", res["grouped_hits"][5]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_EQ(1, res["grouped_hits"][0]["hits"].size()); + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("5", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + + ASSERT_EQ(1, res["grouped_hits"][1]["hits"].size()); + ASSERT_FLOAT_EQ(4.8, res["grouped_hits"][1]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("4", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + + ASSERT_EQ(2, res["grouped_hits"][2]["hits"].size()); + ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][2]["hits"][0]["document"]["rating"].get()); + ASSERT_STREQ("3", res["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][2]["hits"][1]["document"]["rating"].get()); + ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get().c_str()); + + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); + ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); + + // pagination with page=2, per_page=2 + res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 2, 2, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"size", "brand"}, 2).get(); + + + // 3rd result 
from previous assertion will be in the first position
+    ASSERT_EQ(2, res["grouped_hits"][0]["hits"].size());
+    ASSERT_FLOAT_EQ(4.6, res["grouped_hits"][0]["hits"][0]["document"]["rating"].get<float>());
+    ASSERT_STREQ("3", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_FLOAT_EQ(4.5, res["grouped_hits"][0]["hits"][1]["document"]["rating"].get<float>());
+    ASSERT_STREQ("0", res["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
+
+    // total count and facet counts should be the same
+    ASSERT_EQ(10, res["found"].get<size_t>());
+    ASSERT_EQ(2, res["grouped_hits"].size());
+    ASSERT_EQ(10, res["grouped_hits"][0]["group_key"][0].get<size_t>());
+    ASSERT_STREQ("Omega", res["grouped_hits"][0]["group_key"][1].get<std::string>().c_str());
+
+    ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
+    ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][0]["count"]);
+    ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
+
+    ASSERT_EQ(3, (int) res["facet_counts"][0]["counts"][1]["count"]);
+    ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
+
+    ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]);
+    ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][2]["value"].get<std::string>().c_str());
+
+    ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]);
+    ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get<std::string>().c_str());
+
+    // respect min and max grouping limit (must be between 1 and 99)
+    auto res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
+                                     false, Index::DROP_TOKENS_THRESHOLD,
+                                     spp::sparse_hash_set<std::string>(),
+                                     spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
+                                     "", 10,
+                                     {}, {}, {"rating"}, 100);
+
+    ASSERT_FALSE(res_op.ok());
+    ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str());
+
+    res_op = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
+                                false, Index::DROP_TOKENS_THRESHOLD,
+                                spp::sparse_hash_set<std::string>(),
+                                spp::sparse_hash_set<std::string>(), 10, "brand: omeg", 30,
+                                "", 10,
+                                {}, {}, {"rating"}, 0);
+
+    ASSERT_FALSE(res_op.ok());
+    ASSERT_STREQ("Value of `group_limit` must be between 1 and 99.", res_op.error().c_str());
+}
+
+TEST_F(CollectionGroupingTest, GroupingWithGroupLimitOfOne) {
+    auto res = coll_group->search("*", {}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY,
+                                  false, Index::DROP_TOKENS_THRESHOLD,
+                                  spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30,
+                                  "", 10,
+                                  {}, {}, {"brand"}, 1).get();
+
+    ASSERT_EQ(5, res["found"].get<size_t>());
+    ASSERT_EQ(5, res["grouped_hits"].size());
+
+    // all hits arrays must be of size 1
+    for(auto i=0; i<5; i++) {
+        ASSERT_EQ(1, res["grouped_hits"][i]["hits"].size());
+    }
+
+    ASSERT_STREQ("4", res["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("2", res["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("8", res["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("10", res["grouped_hits"][3]["hits"][0]["document"]["id"].get<std::string>().c_str()); // unbranded
+    ASSERT_STREQ("9", res["grouped_hits"][4]["hits"][0]["document"]["id"].get<std::string>().c_str());
+
+    // facet counts should each be 1, including unbranded
+    ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
+
+    for(size_t i=0; i < 4; i++) {
+        ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][i]["count"]);
+    }
+}
+
+TEST_F(CollectionGroupingTest, GroupingWithArrayFieldAndOverride) {
+    nlohmann::json override_json_include = {
+        {"id", 
"include-rule"}, + { + "rule", { + {"query", "shirt"}, + {"match", override_t::MATCH_EXACT} + } + } + }; + + override_json_include["includes"] = nlohmann::json::array(); + override_json_include["includes"][0] = nlohmann::json::object(); + override_json_include["includes"][0]["id"] = "11"; + override_json_include["includes"][0]["position"] = 1; + + override_json_include["includes"][1] = nlohmann::json::object(); + override_json_include["includes"][1]["id"] = "10"; + override_json_include["includes"][1]["position"] = 1; + + nlohmann::json override_json_exclude = { + {"id", "exclude-rule"}, + { + "rule", { + {"query", "shirt"}, + {"match", override_t::MATCH_EXACT} + } + } + }; + override_json_exclude["excludes"] = nlohmann::json::array(); + override_json_exclude["excludes"][0] = nlohmann::json::object(); + override_json_exclude["excludes"][0]["id"] = "2"; + + override_t override1(override_json_include); + override_t override2(override_json_exclude); + Option ov1_op = coll_group->add_override(override1); + Option ov2_op = coll_group->add_override(override2); + + ASSERT_TRUE(ov1_op.ok()); + ASSERT_TRUE(ov2_op.ok()); + + auto res = coll_group->search("shirt", {"title"}, "", {"brand"}, {}, 0, 50, 1, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, + "", 10, + {}, {}, {"colors"}, 2).get(); + + ASSERT_EQ(4, res["found"].get()); + ASSERT_EQ(4, res["grouped_hits"].size()); + + ASSERT_EQ(1, res["grouped_hits"][0]["group_key"][0].size()); + ASSERT_STREQ("white", res["grouped_hits"][0]["group_key"][0][0].get().c_str()); + + ASSERT_STREQ("11", res["grouped_hits"][0]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("10", res["grouped_hits"][0]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("5", res["grouped_hits"][1]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("3", res["grouped_hits"][1]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_STREQ("4", res["grouped_hits"][2]["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("0", res["grouped_hits"][2]["hits"][1]["document"]["id"].get().c_str()); + + ASSERT_EQ(1, res["grouped_hits"][3]["hits"].size()); + ASSERT_STREQ("8", res["grouped_hits"][3]["hits"][0]["document"]["id"].get().c_str()); + + // assert facet counts + ASSERT_STREQ("brand", res["facet_counts"][0]["field_name"].get().c_str()); + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][0]["count"]); + ASSERT_STREQ("Omega", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][1]["count"]); + ASSERT_STREQ("Xorp", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); + + ASSERT_EQ(2, (int) res["facet_counts"][0]["counts"][2]["count"]); + ASSERT_STREQ("Beta", res["facet_counts"][0]["counts"][2]["value"].get().c_str()); + + ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][3]["count"]); + ASSERT_STREQ("Zeta", res["facet_counts"][0]["counts"][3]["value"].get().c_str()); +} \ No newline at end of file diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index f2b1f03a..4c8ecfbb 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -40,6 +40,7 @@ protected: virtual void TearDown() { collectionManager.drop_collection("collection1"); + collectionManager.dispose(); delete store; } }; diff --git a/test/collection_override_test.cpp b/test/collection_override_test.cpp index a94bfbc0..e08f32d2 100644 --- a/test/collection_override_test.cpp +++ 
b/test/collection_override_test.cpp @@ -18,7 +18,7 @@ protected: system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); store = new Store(state_dir_path); - collectionManager.init(store, 1, "auth_key"); + collectionManager.init(store, 4, "auth_key"); collectionManager.load(); std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl"); @@ -49,6 +49,7 @@ protected: virtual void TearDown() { collectionManager.drop_collection("coll_mul_fields"); + collectionManager.dispose(); delete store; } }; @@ -121,6 +122,11 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeExactQueryMatch) { ASSERT_STREQ("3", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get().c_str()); + // curated results should be marked as such + ASSERT_EQ(true, results["hits"][0]["curated"].get()); + ASSERT_EQ(true, results["hits"][1]["curated"].get()); + ASSERT_EQ(0, results["hits"][2].count("curated")); + coll_mul_fields->remove_override("exclude-rule"); coll_mul_fields->remove_override("include-rule"); @@ -219,6 +225,8 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "starring: scott").get(); + ASSERT_EQ(9, results["found"].get()); + // "count" would be `2` without exclusion ASSERT_EQ("Scott Glenn", results["facet_counts"][0]["counts"][0]["highlighted"].get()); ASSERT_EQ(1, results["facet_counts"][0]["counts"][0]["count"].get()); @@ -233,7 +241,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "starring: scott").get(); - ASSERT_EQ(10, results["found"].get()); + ASSERT_EQ(9, results["found"].get()); ASSERT_EQ(0, results["hits"].size()); coll_mul_fields->remove_override("exclude-rule"); @@ -246,7 +254,7 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "").get(); - ASSERT_EQ(1, results["found"].get()); + ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); @@ -254,10 +262,9 @@ TEST_F(CollectionOverrideTest, ExcludeIncludeFacetFilterQuery) { } TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { - std::map pinned_hits; - std::vector hidden_hits; - pinned_hits["13"] = 1; - pinned_hits["4"] = 2; + std::map> pinned_hits; + pinned_hits[1] = {"13"}; + pinned_hits[2] = {"4"}; // basic pinning @@ -277,6 +284,7 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { // both pinning and hiding + std::vector hidden_hits; hidden_hits = {"11", "16"}; results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY, false, Index::DROP_TOKENS_THRESHOLD, @@ -289,6 +297,21 @@ TEST_F(CollectionOverrideTest, IncludeExcludeHitsQuery) { ASSERT_STREQ("4", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("6", results["hits"][2]["document"]["id"].get().c_str()); + // paginating such that pinned hits appear on second page + pinned_hits.clear(); + pinned_hits[4] = {"13"}; + pinned_hits[5] = {"4"}; + + results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY, + false, Index::DROP_TOKENS_THRESHOLD, + spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "starring: will", 30, + "", 10, + pinned_hits, hidden_hits).get(); + + ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); + ASSERT_STREQ("13", results["hits"][1]["document"]["id"].get().c_str()); + // take 
@@ -277,6 +284,7 @@
 
     // both pinning and hiding
 
+    std::vector<std::string> hidden_hits;
     hidden_hits = {"11", "16"};
     results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
                                       false, Index::DROP_TOKENS_THRESHOLD,
@@ -289,6 +297,21 @@
     ASSERT_STREQ("4", results["hits"][1]["document"]["id"].get<std::string>().c_str());
     ASSERT_STREQ("6", results["hits"][2]["document"]["id"].get<std::string>().c_str());
 
+    // paginating such that pinned hits appear on second page
+    pinned_hits.clear();
+    pinned_hits[4] = {"13"};
+    pinned_hits[5] = {"4"};
+
+    results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 2, 2, FREQUENCY,
+                                      false, Index::DROP_TOKENS_THRESHOLD,
+                                      spp::sparse_hash_set<std::string>(),
+                                      spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
+                                      "", 10,
+                                      pinned_hits, hidden_hits).get();
+
+    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("13", results["hits"][1]["document"]["id"].get<std::string>().c_str());
+
     // take precedence over override rules
 
     nlohmann::json override_json_include = {
@@ -325,4 +348,60 @@
     ASSERT_EQ(8, results["found"].get<size_t>());
     ASSERT_STREQ("8", results["hits"][0]["document"]["id"].get<std::string>().c_str());
     ASSERT_STREQ("6", results["hits"][1]["document"]["id"].get<std::string>().c_str());
+}
+
+TEST_F(CollectionOverrideTest, PinnedHitsGrouping) {
+    std::map<size_t, std::vector<std::string>> pinned_hits;
+    pinned_hits[1] = {"6", "8"};
+    pinned_hits[2] = {"1"};
+    pinned_hits[3] = {"13", "4"};
+
+    // without any grouping parameter, only the first ID in a position should be picked
+    // and other IDs should appear in their original positions
+
+    auto results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
+                                           false, Index::DROP_TOKENS_THRESHOLD,
+                                           spp::sparse_hash_set<std::string>(),
+                                           spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
+                                           "", 10,
+                                           pinned_hits, {}).get();
+
+    ASSERT_EQ(10, results["found"].get<size_t>());
+    ASSERT_STREQ("6", results["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("13", results["hits"][2]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("11", results["hits"][3]["document"]["id"].get<std::string>().c_str());
+
+    // pinned hits should be marked as curated
+    ASSERT_EQ(true, results["hits"][0]["curated"].get<bool>());
+    ASSERT_EQ(true, results["hits"][1]["curated"].get<bool>());
+    ASSERT_EQ(true, results["hits"][2]["curated"].get<bool>());
+    ASSERT_EQ(0, results["hits"][3].count("curated"));
+
+    // with grouping
+
+    results = coll_mul_fields->search("the", {"title"}, "", {"starring"}, {}, 0, 50, 1, FREQUENCY,
+                                      false, Index::DROP_TOKENS_THRESHOLD,
+                                      spp::sparse_hash_set<std::string>(),
+                                      spp::sparse_hash_set<std::string>(), 10, "starring: will", 30,
+                                      "", 10,
+                                      pinned_hits, {}, {"cast"}, 2).get();
+
+    ASSERT_EQ(8, results["found"].get<size_t>());
+
+    ASSERT_EQ(1, results["grouped_hits"][0]["group_key"].size());
+    ASSERT_EQ(2, results["grouped_hits"][0]["group_key"][0].size());
+    ASSERT_STREQ("Chris Evans", results["grouped_hits"][0]["group_key"][0][0].get<std::string>().c_str());
+    ASSERT_STREQ("Scarlett Johansson", results["grouped_hits"][0]["group_key"][0][1].get<std::string>().c_str());
+
+    ASSERT_STREQ("6", results["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("8", results["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
+
+    ASSERT_STREQ("1", results["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
+
+    ASSERT_STREQ("13", results["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("4", results["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
+
+    ASSERT_STREQ("11", results["grouped_hits"][3]["hits"][0]["document"]["id"].get<std::string>().c_str());
+    ASSERT_STREQ("16", results["grouped_hits"][4]["hits"][0]["document"]["id"].get<std::string>().c_str());
 }
\ No newline at end of file
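One response-shape detail these tests pin down: curated (pinned or override-included) hits carry a "curated": true key, while organic hits omit the key entirely, which is why the assertions check count("curated") rather than reading a value. A self-contained sketch of that contract, assuming only nlohmann/json (the hit contents here are made up):

```cpp
#include <cassert>
#include <nlohmann/json.hpp>

int main() {
    // Hand-built stand-in for a search response: hit 0 is curated,
    // hit 1 is organic and therefore has no "curated" key at all.
    nlohmann::json results;
    results["hits"][0] = { {"curated", true}, {"document", { {"id", "6"} }} };
    results["hits"][1] = { {"document", { {"id", "11"} }} };

    assert(results["hits"][0]["curated"].get<bool>());
    assert(results["hits"][1].count("curated") == 0);  // absent, not false
    return 0;
}
```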
diff --git a/test/collection_sorting_test.cpp b/test/collection_sorting_test.cpp
index 93caff5b..e552f307 100644
--- a/test/collection_sorting_test.cpp
+++ b/test/collection_sorting_test.cpp
@@ -29,6 +29,7 @@ protected:
     }
 
     virtual void TearDown() {
+        collectionManager.dispose();
         delete store;
     }
 };
@@ -248,6 +249,7 @@ TEST_F(CollectionSortingTest, ThreeSortFieldsLimit) {
         sort_by("min", "DESC"),
     };
 
+    query_fields = {"title"};
     auto res_op = coll1->search("the", query_fields, "", {}, sort_fields_desc, 0, 10, 1, FREQUENCY, false);
 
     ASSERT_FALSE(res_op.ok());
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index f5c9bf09..12748830 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -56,6 +56,7 @@ protected:
 
     virtual void TearDown() {
         collectionManager.drop_collection("collection");
+        collectionManager.dispose();
         delete store;
     }
 };
@@ -455,6 +456,18 @@ TEST_F(CollectionTest, WildcardQuery) {
         std::string id = ids.at(i);
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
+
+    // wildcard query should not require a search field
+    results_op = collection->search("*", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);
+    ASSERT_TRUE(results_op.ok());
+    results = results_op.get();
+    ASSERT_EQ(3, results["hits"].size());
+    ASSERT_EQ(25, results["found"].get<size_t>());
+
+    // non-wildcard query should require a search field
+    results_op = collection->search("the", {}, "", {}, sort_fields, 0, 3, 1, FREQUENCY, false);
+    ASSERT_FALSE(results_op.ok());
+    ASSERT_STREQ("No search fields specified for the query.", results_op.error().c_str());
 }
 
 TEST_F(CollectionTest, PrefixSearching) {
@@ -2259,7 +2272,6 @@ TEST_F(CollectionTest, OptionalFields) {
 
     // try fetching the schema (should contain optional field)
     nlohmann::json coll_summary = coll1->get_summary_json();
-    LOG(INFO) << coll_summary;
     ASSERT_STREQ("title", coll_summary["fields"][0]["name"].get<std::string>().c_str());
     ASSERT_STREQ("string", coll_summary["fields"][0]["type"].get<std::string>().c_str());
     ASSERT_FALSE(coll_summary["fields"][0]["facet"].get<bool>());
diff --git a/test/group_documents.jsonl b/test/group_documents.jsonl
new file mode 100644
index 00000000..5d673ff0
--- /dev/null
+++ b/test/group_documents.jsonl
@@ -0,0 +1,12 @@
+{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 10, "colors": ["white", "blue"], "rating": 4.5}
+{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 11, "colors": ["white", "blue"], "rating": 4.3}
+{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 12, "colors": ["white", "blue"], "rating": 4.6}
+{"title": "Omega Casual Poplin Shirt", "brand": "Omega", "size": 10, "colors": ["blue"], "rating": 4.6}
+{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 10, "colors": ["white", "blue"], "rating": 4.8}
+{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 11, "colors": ["blue"], "rating": 4.8}
+{"title": "Beta Casual Poplin Shirt", "brand": "Beta", "size": 12, "colors": ["white", "blue"], "rating": 4.3}
+{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 10, "colors": ["white", "blue"], "rating": 4.3}
+{"title": "Xorp Casual Shirt", "brand": "Xorp", "size": 12, "colors": ["white", "red"], "rating": 4.4}
+{"title": "Zeta Casual Shirt", "brand": "Zeta", "size": 10, "colors": ["white", "blue"], "rating": 4.1}
+{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 4.3}
+{"title": "White Casual Shirt", "size": 10, "colors": ["white"], "rating": 3.3}
\ No newline at end of file
diff --git a/test/string_utils_test.cpp b/test/string_utils_test.cpp
index 4f8c2400..a2d64147 100644
--- a/test/string_utils_test.cpp
+++ b/test/string_utils_test.cpp
@@ -54,3 +54,8 @@ TEST(StringUtilsTest, HMAC) {
     std::string digest1 = StringUtils::hmac("KeyVal", "{\"filter_by\": \"user_id:1080\"}");
     ASSERT_STREQ("IvjqWNZ5M5ElcvbMoXj45BxkQrZG4ZKEaNQoRioCx2s=", digest1.c_str());
 }
+
+TEST(StringUtilsTest, UInt32Validation) {
+    std::string big_num = "99999999999999999999999999999999";
+    ASSERT_FALSE(StringUtils::is_uint32_t(big_num));
+}
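The new UInt32Validation test feeds a 32-digit string, which overflows plain atoi/stoi-style parsing. A sketch of the kind of check such a validator needs (this mirrors the intent of StringUtils::is_uint32_t, not its actual code):

```cpp
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <string>

// Digits only, at most 10 of them (UINT32_MAX is 4294967295), and the
// parsed value must still fit in 32 bits.
bool is_uint32(const std::string& s) {
    if(s.empty() || s.size() > 10) {
        return false;
    }

    for(char c: s) {
        if(!std::isdigit(static_cast<unsigned char>(c))) {
            return false;
        }
    }

    return std::strtoull(s.c_str(), nullptr, 10) <= UINT32_MAX;
}

int main() {
    assert(is_uint32("4294967295"));                         // UINT32_MAX itself
    assert(!is_uint32("4294967296"));                        // one past the max
    assert(!is_uint32("99999999999999999999999999999999"));  // the test's input
    return 0;
}
```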
diff --git a/test/topster_test.cpp b/test/topster_test.cpp
index 1e9438a8..104e941e 100644
--- a/test/topster_test.cpp
+++ b/test/topster_test.cpp
@@ -36,7 +36,8 @@ TEST(TopsterTest, MaxIntValues) {
         scores[1] = data[i].primary_attr;
         scores[2] = data[i].secondary_attr;
 
-        topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
+        KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores);
+        topster.add(&kv);
     }
 
     topster.sort();
@@ -87,7 +88,8 @@ TEST(TopsterTest, MaxFloatValues) {
         scores[1] = Index::float_to_in64_t(data[i].primary_attr);
         scores[2] = data[i].secondary_attr;
 
-        topster.add(data[i].key, data[i].field_id, data[i].query_index, data[i].match_score, scores);
+        KV kv(data[i].field_id, data[i].query_index, data[i].key, data[i].key, data[i].match_score, scores);
+        topster.add(&kv);
     }
 
     topster.sort();
@@ -97,4 +99,64 @@
     for(uint32_t i = 0; i < topster.size; i++) {
         EXPECT_EQ(ids[i], topster.getKeyAt(i));
     }
+}
+
+TEST(TopsterTest, DistinctIntValues) {
+    Topster dist_topster(5, 2);
+
+    struct {
+        uint8_t field_id;
+        uint16_t query_index;
+        uint64_t distinct_key;
+        uint64_t match_score;
+        int64_t primary_attr;
+        int64_t secondary_attr;
+    } data[14] = {
+        {1, 0, 1, 11, 20, 30},
+        {1, 0, 1, 12, 20, 32},
+        {1, 0, 2, 4, 20, 30},
+        {1, 2, 3, 7, 20, 30},
+        {1, 0, 4, 14, 20, 30},
+        {1, 1, 5, 9, 20, 30},
+        {1, 1, 5, 10, 20, 32},
+        {1, 1, 5, 9, 20, 30},
+        {1, 0, 6, 6, 20, 30},
+        {1, 2, 7, 6, 22, 30},
+        {1, 2, 7, 6, 22, 30},
+        {1, 1, 8, 9, 20, 30},
+        {1, 0, 9, 8, 20, 30},
+        {1, 3, 10, 5, 20, 30},
+    };
+
+    for(int i = 0; i < 14; i++) {
+        int64_t scores[3];
+        scores[0] = int64_t(data[i].match_score);
+        scores[1] = data[i].primary_attr;
+        scores[2] = data[i].secondary_attr;
+
+        KV kv(data[i].field_id, data[i].query_index, i+100, data[i].distinct_key, data[i].match_score, scores);
+        dist_topster.add(&kv);
+    }
+
+    dist_topster.sort();
+
+    std::vector<uint64_t> distinct_ids = {4, 1, 5, 8, 9};
+
+    for(uint32_t i = 0; i < dist_topster.size; i++) {
+        EXPECT_EQ(distinct_ids[i], dist_topster.getDistinctKeyAt(i));
+
+        if(distinct_ids[i] == 1) {
+            EXPECT_EQ(12, (int) dist_topster.getKV(i)->match_score);
+            EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size);
+            EXPECT_EQ(12, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score);
+            EXPECT_EQ(11, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score);
+        }
+
+        if(distinct_ids[i] == 5) {
+            EXPECT_EQ(10, (int) dist_topster.getKV(i)->match_score);
+            EXPECT_EQ(2, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->size);
+            EXPECT_EQ(10, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(0)->match_score);
+            EXPECT_EQ(9, dist_topster.group_kv_map[dist_topster.getDistinctKeyAt(i)]->getKV(1)->match_score);
+        }
+    }
 }
\ No newline at end of file
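For readers new to Topster: this diff moves it from positional add(...) arguments to explicit KV records carrying both a key and a distinct (group) key, and DistinctIntValues asserts that each distinct key is represented by its best-scoring KV while group_kv_map retains the runners-up. A toy model of that bookkeeping (plain STL, not the real Topster):

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

// Minimal stand-in for a KV record: a unique key, a distinct (group) key
// and a match score.
struct SimpleKV {
    uint64_t key;
    uint64_t distinct_key;
    uint64_t match_score;
};

int main() {
    // Two candidates share distinct_key 1, echoing the test data above.
    std::vector<SimpleKV> candidates = {
        {100, 1, 11}, {101, 1, 12}, {102, 2, 4}, {104, 4, 14}
    };

    std::map<uint64_t, std::vector<SimpleKV>> group_map;  // distinct_key -> members
    std::map<uint64_t, uint64_t> best_score;              // distinct_key -> best score

    for(const auto& kv: candidates) {
        group_map[kv.distinct_key].push_back(kv);
        auto it = best_score.find(kv.distinct_key);
        if(it == best_score.end() || kv.match_score > it->second) {
            best_score[kv.distinct_key] = kv.match_score;
        }
    }

    // Distinct key 1 is represented by score 12 (its best) and keeps both
    // members, mirroring the group_kv_map assertions in DistinctIntValues.
    std::cout << "distinct key 1: best=" << best_score[1]
              << ", members=" << group_map[1].size() << std::endl;
    return 0;
}
```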