diff --git a/CMakeLists.txt b/CMakeLists.txt index 20fef29a..1eca6791 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,7 +65,7 @@ link_directories(${DEP_ROOT_DIR}/${FOR_NAME}) link_directories(${DEP_ROOT_DIR}/${H2O_NAME}/build) link_directories(${DEP_ROOT_DIR}/${ROCKSDB_NAME}) -add_executable(typesense-server ${SRC_FILES} src/main/server.cpp) +add_executable(typesense-server ${SRC_FILES} src/main/typesense_server.cpp) add_executable(search ${SRC_FILES} src/main/main.cpp) add_executable(benchmark ${SRC_FILES} src/main/benchmark.cpp) add_executable(typesense_test ${SRC_FILES} test/array_test.cpp test/sorted_array_test.cpp test/art_test.cpp @@ -83,7 +83,7 @@ if(NOT APPLE) list(APPEND ROCKSDB_LIBS rt) endif() -target_link_libraries(typesense-server for pthread h2o-evloop ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} dl ${STD_LIB}) +target_link_libraries(typesense-server h2o-evloop for pthread ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} dl ${STD_LIB}) target_link_libraries(search for pthread h2o-evloop ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} dl ${STD_LIB}) -target_link_libraries(benchmark for pthread ${ROCKSDB_LIBS} ${STD_LIB}) -target_link_libraries(typesense_test pthread for ${ROCKSDB_LIBS} gtest gtest_main ${STD_LIB}) +target_link_libraries(benchmark for pthread h2o-evloop ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} dl ${STD_LIB}) +target_link_libraries(typesense_test h2o-evloop ${OPENSSL_LIBRARIES} pthread for ${ROCKSDB_LIBS} gtest gtest_main dl ${STD_LIB}) diff --git a/README.md b/README.md index c82fcdd8..be50f5b6 100644 --- a/README.md +++ b/README.md @@ -3,28 +3,24 @@ Typesense is an open source search engine for building a delightful search experience. 
- **Typo tolerance:** Handles typographical errors out-of-the-box -- **Tunable ranking + relevancy:** Tailor your search results to perfection +- **Tunable ranking:** Tailor your search results to perfection - **Blazing fast:** Meticulously designed and optimized for speed - **Simple and delightful:** Simple API, delightful out-of-the-box experience ## Development -### Build from source +### Building from source -Please ensure that you have docker installed on your system. +Building on your machine: ``` $ ./build.sh [--clean] -. -. -. -$ ./dockcross build/typesense_test -. -. -. -$ ./dockcross build/typesense-server ``` -We use [dockcross](https://github.com/dockcross/dockcross) to build our development environment consistently. +Building in a Docker container: + +``` +$ ./docker-build.sh +``` © 2016-2017 Wreally Studios Inc. \ No newline at end of file diff --git a/TODO.md b/TODO.md index ccd3c991..9458d489 100644 --- a/TODO.md +++ b/TODO.md @@ -30,6 +30,24 @@ - ~~Schema validation during insertion (missing fields + type errors)~~ - ~~Proper score field for ranking tokens~~ - ~~Throw errors when schema is broken~~ +- ~~Desc/Asc ordering with tests~~ +- ~~Found count is wrong~~ +- ~~Filter query in the API~~ +- ~~Facet limit (hardcode to top 10)~~ +- ~~Deprecate old split function~~ +- When prefix=true, use token_ranking_field for token ordering +- Search snippet +- ID should not have "/" +- Group results by field +- Use rocksdb batch put for atomic insertion +- Test for sorted_array::indexOf when length is 0 +- Handle store-get() not finding a key +- Fix API response codes +- Test for search without any sort_by given +- Test for asc/desc upper/lower casing +- Test for collection creation validation +- Test for delete document +- Proper pagination - Prevent string copy during indexing - clean special chars before indexing - Minimum results should be a variable instead of blindly going with max_results diff --git a/cmake/RocksDB.cmake b/cmake/RocksDB.cmake index 
// Route handlers. Each one receives the parsed request and fills in the
// response object passed alongside it.

// Creates a collection from the JSON document in `req.body`. The visible part
// of the implementation answers with send_400 on malformed JSON or when
// `name`, `search_fields` or `sort_fields` is missing or malformed, and with
// send_409 when a collection with that name already exists.
void post_create_collection(http_req & req, http_res & res);

// NOTE(review): presumably runs a search driven by `req.params`; the
// implementation is not visible here - confirm in api.cpp.
void get_search(http_req & req, http_res & res);

// NOTE(review): presumably indexes the JSON document carried in `req.body`;
// confirm in api.cpp.
void post_add_document(http_req & req, http_res & res);

// NOTE(review): presumably deletes the document identified by a request
// parameter; confirm in api.cpp.
void del_remove_document(http_req & req, http_res & res);
// Static helpers that combine two arrays of uint32_t values.
class ArrayUtils {
public:
    // Fast scalar scheme designed by N. Kurz. Returns the size of out (intersected set)
    // NOTE(review): presumably A and B must be sorted ascending and `out` must
    // already have room for min(lenA, lenB) elements - confirm against the .cpp.
    static size_t and_scalar(const uint32_t *A, const size_t lenA, const uint32_t *B, const size_t lenB, uint32_t *out);

    // OR counterpart of and_scalar.
    // NOTE(review): `out` is a pointer-to-pointer, so the result buffer is
    // presumably allocated by the callee and owned by the caller afterwards;
    // the return value is presumably the number of elements written - confirm.
    static size_t or_scalar(const uint32_t *A, const size_t lenA, const uint32_t *B, const size_t lenB, uint32_t **out);
};
// Converts a compiler-mangled type name (e.g. the result of typeid(T).name())
// into its human-readable form.
//
// abi::__cxa_demangle returns a null pointer and a non-zero status when `name`
// is not a valid mangled name or when it cannot allocate. In that case the
// input is returned unchanged instead of building a std::string from a null
// pointer, which is undefined behaviour.
static inline std::string demangle(const std::string &name)
{
  int status = 0;
  char *demangled = abi::__cxa_demangle(name.c_str(), NULL, NULL, &status);
  if (demangled == NULL || status != 0) {
    free(demangled); // free(NULL) is a no-op
    return name;
  }

  std::string readable(demangled);
  free(demangled); // the buffer was malloc()-allocated by __cxa_demangle
  return readable;
}
// Exception type thrown by the command line parser and its value readers.
// It owns a copy of the message and exposes it through what().
class cmdline_error : public std::exception {
private:
  std::string text_; // copy of the message handed to the constructor

public:
  cmdline_error(const std::string &msg)
    : text_(msg)
  {
  }

  virtual ~cmdline_error() throw()
  {
  }

  // Returns the stored message; the pointer is valid while this object lives.
  virtual const char *what() const throw()
  {
    return text_.c_str();
  }
};
+ template + oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7) + { + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + ret.add(a4); + ret.add(a5); + ret.add(a6); + ret.add(a7); + return ret; + } + + template + oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8) + { + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + ret.add(a4); + ret.add(a5); + ret.add(a6); + ret.add(a7); + ret.add(a8); + return ret; + } + + template + oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9) + { + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + ret.add(a4); + ret.add(a5); + ret.add(a6); + ret.add(a7); + ret.add(a8); + ret.add(a9); + return ret; + } + + template + oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9, T a10) + { + oneof_reader ret; + ret.add(a1); + ret.add(a2); + ret.add(a3); + ret.add(a4); + ret.add(a5); + ret.add(a6); + ret.add(a7); + ret.add(a8); + ret.add(a9); + ret.add(a10); + return ret; + } + +//----- + + class parser{ + public: + parser(){ + } + ~parser(){ + for (std::map::iterator p=options.begin(); + p!=options.end(); p++) + delete p->second; + } + + void add(const std::string &name, + char short_name=0, + const std::string &desc=""){ + if (options.count(name)) throw cmdline_error("multiple definition: "+name); + options[name]=new option_without_value(name, short_name, desc); + ordered.push_back(options[name]); + } + + template + void add(const std::string &name, + char short_name=0, + const std::string &desc="", + bool need=true, + const T def=T()){ + add(name, short_name, desc, need, def, default_reader()); + } + + template + void add(const std::string &name, + char short_name=0, + const std::string &desc="", + bool need=true, + const T def=T(), + F reader=F()){ + if (options.count(name)) throw cmdline_error("multiple definition: "+name); + options[name]=new option_with_value_with_reader(name, short_name, need, def, desc, reader); + 
ordered.push_back(options[name]); + } + + void footer(const std::string &f){ + ftr=f; + } + + void set_program_name(const std::string &name){ + prog_name=name; + } + + bool exist(const std::string &name) const { + if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name); + return options.find(name)->second->has_set(); + } + + template + const T &get(const std::string &name) const { + if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name); + const option_with_value *p=dynamic_cast*>(options.find(name)->second); + if (p==NULL) throw cmdline_error("type mismatch flag '"+name+"'"); + return p->get(); + } + + const std::vector &rest() const { + return others; + } + + bool parse(const std::string &arg){ + std::vector args; + + std::string buf; + bool in_quote=false; + for (std::string::size_type i=0; i=arg.length()){ + errors.push_back("unexpected occurrence of '\\' at end of string"); + return false; + } + } + + buf+=arg[i]; + } + + if (in_quote){ + errors.push_back("quote is not closed"); + return false; + } + + if (buf.length()>0) + args.push_back(buf); + + for (size_t i=0; i &args){ + int argc=static_cast(args.size()); + std::vector argv(argc); + + for (int i=0; i lookup; + for (std::map::iterator p=options.begin(); + p!=options.end(); p++){ + if (p->first.length()==0) continue; + char initial=p->second->short_name(); + if (initial){ + if (lookup.count(initial)>0){ + lookup[initial]=""; + errors.push_back(std::string("short option '")+initial+"' is ambiguous"); + return false; + } + else lookup[initial]=p->first; + } + } + + for (int i=1; i &args){ + if (!options.count("help")) + add("help", '?', "print this message"); + check(args.size(), parse(args)); + } + + void parse_check(int argc, char *argv[]){ + if (!options.count("help")) + add("help", '?', "print this message"); + check(argc, parse(argc, argv)); + } + + std::string error() const{ + return errors.size()>0?errors[0]:""; + } + + std::string error_full() const{ + 
std::ostringstream oss; + for (size_t i=0; imust()) + oss<short_description()<<" "; + } + + oss<<"[options] ... "<name().length()); + } + for (size_t i=0; ishort_name()){ + oss<<" -"<short_name()<<", "; + } + else{ + oss<<" "; + } + + oss<<"--"<name(); + for (size_t j=ordered[i]->name().length(); jdescription()<set()){ + errors.push_back("option needs value: --"+name); + return; + } + } + + void set_option(const std::string &name, const std::string &value){ + if (options.count(name)==0){ + errors.push_back("undefined option: --"+name); + return; + } + if (!options[name]->set(value)){ + errors.push_back("option value is invalid: --"+name+"="+value); + return; + } + } + + class option_base{ + public: + virtual ~option_base(){} + + virtual bool has_value() const=0; + virtual bool set()=0; + virtual bool set(const std::string &value)=0; + virtual bool has_set() const=0; + virtual bool valid() const=0; + virtual bool must() const=0; + + virtual const std::string &name() const=0; + virtual char short_name() const=0; + virtual const std::string &description() const=0; + virtual std::string short_description() const=0; + }; + + class option_without_value : public option_base { + public: + option_without_value(const std::string &name, + char short_name, + const std::string &desc) + :nam(name), snam(short_name), desc(desc), has(false){ + } + ~option_without_value(){} + + bool has_value() const { return false; } + + bool set(){ + has=true; + return true; + } + + bool set(const std::string &){ + return false; + } + + bool has_set() const { + return has; + } + + bool valid() const{ + return true; + } + + bool must() const{ + return false; + } + + const std::string &name() const{ + return nam; + } + + char short_name() const{ + return snam; + } + + const std::string &description() const { + return desc; + } + + std::string short_description() const{ + return "--"+nam; + } + + private: + std::string nam; + char snam; + std::string desc; + bool has; + }; + + template + class 
option_with_value : public option_base { + public: + option_with_value(const std::string &name, + char short_name, + bool need, + const T &def, + const std::string &desc) + : nam(name), snam(short_name), need(need), has(false) + , def(def), actual(def) { + this->desc=full_description(desc); + } + ~option_with_value(){} + + const T &get() const { + return actual; + } + + bool has_value() const { return true; } + + bool set(){ + return false; + } + + bool set(const std::string &value){ + try{ + actual=read(value); + has=true; + } + catch(const std::exception &e){ + return false; + } + return true; + } + + bool has_set() const{ + return has; + } + + bool valid() const{ + if (need && !has) return false; + return true; + } + + bool must() const{ + return need; + } + + const std::string &name() const{ + return nam; + } + + char short_name() const{ + return snam; + } + + const std::string &description() const { + return desc; + } + + std::string short_description() const{ + return "--"+nam+"="+detail::readable_typename(); + } + + protected: + std::string full_description(const std::string &desc){ + return + desc+" ("+detail::readable_typename()+ + (need?"":" [="+detail::default_value(def)+"]") + +")"; + } + + virtual T read(const std::string &s)=0; + + std::string nam; + char snam; + bool need; + std::string desc; + + bool has; + T def; + T actual; + }; + + template + class option_with_value_with_reader : public option_with_value { + public: + option_with_value_with_reader(const std::string &name, + char short_name, + bool need, + const T def, + const std::string &desc, + F reader) + : option_with_value(name, short_name, need, def, desc), reader(reader){ + } + + private: + T read(const std::string &s){ + return reader(s); + } + + F reader; + }; + + std::map options; + std::vector ordered; + std::string ftr; + + std::string prog_name; + std::vector others; + + std::vector errors; + }; + +} // cmdline \ No newline at end of file diff --git a/include/collection.h 
b/include/collection.h index d28b2d7a..d3ef4a9c 100644 --- a/include/collection.h +++ b/include/collection.h @@ -10,6 +10,33 @@ #include #include +struct facet_value { + // use string to int mapping for saving memory + spp::sparse_hash_map value_index; + spp::sparse_hash_map index_value; + + spp::sparse_hash_map> doc_values; + + uint32_t get_value_index(const std::string & value) { + if(value_index.count(value) != 0) { + return value_index[value]; + } + + uint32_t new_index = value_index.size(); + value_index.emplace(value, new_index); + index_value.emplace(new_index, value); + return new_index; + } + + void index_values(uint32_t doc_seq_id, const std::vector & values) { + std::vector value_vec(values.size()); + for(auto i = 0; i < values.size(); i++) { + value_vec[i] = get_value_index(values[i]); + } + doc_values.emplace(doc_seq_id, value_vec); + } +}; + class Collection { private: std::string name; @@ -23,19 +50,19 @@ private: spp::sparse_hash_map facet_schema; - std::vector rank_fields; + std::vector sort_fields; Store* store; spp::sparse_hash_map search_index; - spp::sparse_hash_map facet_index; + spp::sparse_hash_map facet_index; - spp::sparse_hash_map*> rank_index; + spp::sparse_hash_map*> sort_index; - std::string token_ordering_field; + std::string token_ranking_field; - std::string get_doc_id_key(std::string doc_id); + std::string get_doc_id_key(const std::string & doc_id); std::string get_seq_id_key(uint32_t seq_id); @@ -51,14 +78,15 @@ private: void do_facets(std::vector & facets, uint32_t* result_ids, size_t results_size); void search_field(std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length, - std::vector & facets, const std::vector & rank_fields, const int num_typos, - const size_t num_results, Topster<100> &topster, size_t & num_found, - const token_ordering token_order = FREQUENCY, const bool prefix = false); + std::vector & facets, const std::vector & sort_fields, const int num_typos, + const size_t 
num_results, Topster<100> &topster, uint32_t** all_result_ids, + size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY, const bool prefix = false); void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector & facets, - const std::vector & rank_fields, int & token_rank, + const std::vector & sort_fields, int & token_rank, std::vector> & token_leaves, Topster<100> & topster, - size_t & total_results, size_t & num_found, const size_t & max_results); + size_t & total_results, uint32_t** all_result_ids, size_t & all_result_ids_len, + const size_t & max_results); void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id, const bool verbatim) const; @@ -82,13 +110,13 @@ public: Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store, const std::vector & search_fields, const std::vector & facet_fields, - const std::vector & rank_fields, const std::string token_ordering_field); + const std::vector & sort_fields, const std::string token_ranking_field); ~Collection(); - static std::string get_next_seq_id_key(std::string collection_name); + static std::string get_next_seq_id_key(const std::string & collection_name); - static std::string get_meta_key(std::string collection_name); + static std::string get_meta_key(const std::string & collection_name); std::string get_seq_id_collection_prefix(); @@ -100,26 +128,26 @@ public: std::vector get_facet_fields(); - std::vector get_rank_fields(); + std::vector get_sort_fields(); spp::sparse_hash_map get_schema(); - std::string get_token_ordering_field(); + std::string get_token_ranking_field(); - Option add(std::string json_str); + Option add(const std::string & json_str); nlohmann::json search(std::string query, const std::vector search_fields, const std::string & simple_filter_query, const std::vector & facet_fields, - const std::vector & rank_fields, const int num_typos, + const std::vector & 
sort_fields, const int num_typos, const size_t num_results, const token_ordering token_order = FREQUENCY, const bool prefix = false); - void remove(std::string id); + Option remove(const std::string & id); - void score_results(const std::vector & rank_fields, const int & token_rank, Topster<100> &topster, + void score_results(const std::vector & sort_fields, const int & token_rank, Topster<100> &topster, const std::vector & query_suggestion, const uint32_t *result_ids, const size_t result_size) const; - Option index_in_memory(const nlohmann::json &document, uint32_t seq_id); + Option index_in_memory(const nlohmann::json & document, uint32_t seq_id); enum {MAX_SEARCH_TOKENS = 20}; enum {MAX_RESULTS = 100}; diff --git a/include/collection_manager.h b/include/collection_manager.h index a71dc611..ddde53c8 100644 --- a/include/collection_manager.h +++ b/include/collection_manager.h @@ -22,19 +22,19 @@ private: static constexpr const char* COLLECTION_ID_KEY = "id"; static constexpr const char* COLLECTION_SEARCH_FIELDS_KEY = "search_fields"; static constexpr const char* COLLECTION_FACET_FIELDS_KEY = "facet_fields"; - static constexpr const char* COLLECTION_RANK_FIELDS_KEY = "rank_fields"; - static constexpr const char* COLLECTION_TOKEN_ORDERING_FIELD_KEY = "token_ordering_field"; + static constexpr const char* COLLECTION_SORT_FIELDS_KEY = "sort_fields"; + static constexpr const char* COLLECTION_TOKEN_ORDERING_FIELD_KEY = "token_ranking_field"; CollectionManager(); + ~CollectionManager() = default; + public: - static CollectionManager& get_instance() { + static CollectionManager & get_instance() { static CollectionManager instance; return instance; } - ~CollectionManager(); - CollectionManager(CollectionManager const&) = delete; void operator=(CollectionManager const&) = delete; @@ -42,8 +42,8 @@ public: Collection* create_collection(std::string name, const std::vector & search_fields, const std::vector & facet_fields, - const std::vector & rank_fields, - const std::string 
// Response object handed to every route handler next to the http_req.
// Handlers set `status_code` and `body` through the send_* helpers below.
//
// Helpers that take a `message` wrap it as {"message": "<message>"}; helpers
// that take a `res_body` store the caller's payload verbatim.
struct http_res {
    uint32_t status_code;
    std::string body;

    // 200 with the caller-supplied payload stored as-is.
    void send_200(const std::string & res_body) {
        status_code = 200;
        body = res_body;
    }

    // 201 with the caller-supplied payload stored as-is.
    void send_201(const std::string & res_body) {
        status_code = 201;
        body = res_body;
    }

    // 400 with `message` wrapped in the JSON message envelope.
    void send_400(const std::string & message) {
        send(400, message);
    }

    // 404 with the fixed message "Not Found".
    void send_404() {
        send(404, "Not Found");
    }

    // 409 with `message` wrapped in the JSON message envelope.
    // (This used to set 400, so conflicts looked like plain bad requests.)
    void send_409(const std::string & message) {
        send(409, message);
    }

    // 500 with the caller-supplied payload stored as-is.
    void send_500(const std::string & res_body) {
        status_code = 500;
        body = res_body;
    }

    // Arbitrary status code with `message` wrapped in the JSON message envelope.
    void send(uint32_t code, const std::string & message) {
        status_code = code;
        body = message_envelope(message);
    }

private:
    // Builds {"message": "<message>"} with `message` escaped as a JSON string.
    // Callers pass messages containing double quotes, which would otherwise
    // produce a body that is not valid JSON.
    static std::string message_envelope(const std::string & message) {
        static const char hex_digits[] = "0123456789abcdef";

        std::string json = "{\"message\": \"";
        for(const char raw : message) {
            const unsigned char ch = static_cast<unsigned char>(raw);
            switch(ch) {
                case '"':  json += "\\\""; break;
                case '\\': json += "\\\\"; break;
                case '\n': json += "\\n";  break;
                case '\r': json += "\\r";  break;
                case '\t': json += "\\t";  break;
                default:
                    if(ch < 0x20) {
                        // remaining control characters must be \u-escaped in JSON
                        json += "\\u00";
                        json += hex_digits[ch >> 4];
                        json += hex_digits[ch & 0x0F];
                    } else {
                        json += raw;
                    }
            }
        }
        json += "\"}";
        return json;
    }
};
- m); - return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew); + return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew); } uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base, diff --git a/include/store.h b/include/store.h index fc58e821..da801488 100644 --- a/include/store.h +++ b/include/store.h @@ -24,6 +24,13 @@ public: } }; +enum StoreStatus { + FOUND, + OK, + NOT_FOUND, + ERROR +}; + /* * Abstraction for underlying KV store (RocksDB) */ @@ -72,9 +79,18 @@ public: return status.ok() && !status.IsNotFound(); } - bool get(const std::string& key, std::string& value) { + StoreStatus get(const std::string& key, std::string& value) { rocksdb::Status status = db->Get(rocksdb::ReadOptions(), key, &value); - return status.ok(); + + if(status.IsNotFound()) { + return StoreStatus::NOT_FOUND; + } + + if(!status.ok()) { + return StoreStatus::ERROR; + } + + return StoreStatus::FOUND; } bool remove(const std::string& key) { diff --git a/include/string_utils.h b/include/string_utils.h index a93c6bff..88998d66 100644 --- a/include/string_utils.h +++ b/include/string_utils.h @@ -5,46 +5,6 @@ #include struct StringUtils { - - template - static void tokenize(const std::string &str, ContainerT &tokens, - const std::string &delimiters = " ", bool trimEmpty = true, unsigned long maxTokenLength = 100) { - const std::string truncated_str = str.substr(0, maxTokenLength); - std::string::size_type pos, lastPos = 0; - - using value_type = typename ContainerT::value_type; - using size_type = typename ContainerT::size_type; - - while (true) { - pos = truncated_str.find_first_of(delimiters, lastPos); - if (pos == std::string::npos) { - pos = truncated_str.length(); - - if (pos != lastPos || !trimEmpty) - tokens.push_back(value_type(truncated_str.data() + lastPos, - (size_type) pos - lastPos)); - - break; - } - else { - if (pos != lastPos || !trimEmpty) - tokens.push_back(value_type(truncated_str.data() + lastPos, 
/*
 * Upper-cases `str` in place, byte by byte, using the C library's `toupper`.
 *
 * Each byte is converted to `unsigned char` before the call: the <cctype>
 * functions have undefined behaviour for negative arguments other than EOF,
 * which is what a plain `char` holding a byte >= 0x80 (e.g. part of a UTF-8
 * sequence) becomes on platforms where `char` is signed.
 */
static void toupper(std::string& str) {
    std::transform(str.begin(), str.end(), str.begin(), [](const char c) {
        return static_cast<char>(::toupper(static_cast<unsigned char>(c)));
    });
}
{ + return res.send_400("Bad JSON."); + } + + CollectionManager & collectionManager = CollectionManager::get_instance(); + + // validate presence of mandatory fields + + if(req_json.count("name") == 0) { + return res.send_400("Parameter `name` is required."); + } + + if(req_json.count("search_fields") == 0) { + return res.send_400("Parameter `search_fields` is required."); + } + + if(req_json.count("sort_fields") == 0) { + return res.send_400("Parameter `sort_fields` is required."); + } + + if(collectionManager.get_collection(req_json["name"]) != nullptr) { + return res.send_409("Collection with name `" + req_json["name"].get() + "` already exists."); + } + + // field specific validation + + std::vector search_fields; + + if(!req_json["search_fields"].is_array() || req_json["search_fields"].size() == 0) { + return res.send_400("Wrong format for `search_fields`. It should be an array like: " + "[{\"name\": \"\", \"type\": \"\"}]"); + } + + for(const nlohmann::json & search_field_json: req_json["search_fields"]) { + if(!search_field_json.is_object() || + search_field_json.count(fields::name) == 0 || search_field_json.count(fields::type) == 0 || + !search_field_json.at(fields::name).is_string() || !search_field_json.at(fields::type).is_string()) { + + return res.send_400("Wrong format for `search_fields`. It should be an array like: " + "[{\"name\": \"\", \"type\": \"\"}]"); + } + + search_fields.push_back(field(search_field_json["name"], search_field_json["type"])); + } + + std::vector facet_fields; + + if(req_json.count("facet_fields") != 0) { + if(!req_json["facet_fields"].is_array()) { + return res.send_400("Wrong format for `facet_fields`. 
It should be an array like: " + "[{\"name\": \"\", \"type\": \"\"}]"); + } + + for(const nlohmann::json & facet_field_json: req_json["facet_fields"]) { + if(!facet_field_json.is_object() || + facet_field_json.count(fields::name) == 0 || facet_field_json.count(fields::type) == 0 || + !facet_field_json.at(fields::name).is_string() || !facet_field_json.at(fields::type).is_string()) { + + return res.send_400("Wrong format for `facet_fields`. It should be an array like: " + "[{\"name\": \"\", \"type\": \"\"}]"); + } + + facet_fields.push_back(field(facet_field_json["name"], facet_field_json["type"])); + } + } + + std::vector sort_fields; + + if(!req_json["sort_fields"].is_array() || req_json["sort_fields"].size() == 0) { + return res.send_400("Wrong format for `sort_fields`. It should be an array like: " + "[{\"name\": \"\", \"type\": \"\"}]"); + } + + for(const nlohmann::json & sort_field_json: req_json["sort_fields"]) { + if(!sort_field_json.is_object() || + sort_field_json.count(fields::name) == 0 || sort_field_json.count(fields::type) == 0 || + !sort_field_json.at(fields::name).is_string() || + !sort_field_json.at(fields::type).is_string()) { + + return res.send_400("Wrong format for `sort_fields`. It should be an array like: " + "[{\"name\": \"\", \"type\": \"\"}]"); + } + + if(sort_field_json["type"] != "INT32" && sort_field_json["type"] != "INT64") { + return res.send_400("Sort field `" + sort_field_json["name"].get() + "` must be a number."); + } + + sort_fields.push_back(field(sort_field_json["name"], sort_field_json["type"])); + } + + std::string token_ranking_field = ""; + + if(req_json.count("token_ranking_field") != 0) { + if(!req_json["token_ranking_field"].is_string()) { + return res.send_400("Wrong format for `token_ranking_field`. 
It should be a string (name of a field)."); + } + + token_ranking_field = req_json["token_ranking_field"].get(); + } + + collectionManager.create_collection(req_json["name"], search_fields, facet_fields, sort_fields, token_ranking_field); + res.send_201(req.body); +} + +void get_search(http_req & req, http_res & res) { + const char *NUM_TYPOS = "num_typos"; + const char *PREFIX = "prefix"; + const char *FILTER = "filter_by"; + const char *SEARCH_BY = "search_by"; + const char *SORT_BY = "sort_by"; + const char *FACET_BY = "facet_by"; + + if(req.params.count(NUM_TYPOS) == 0) { + req.params[NUM_TYPOS] = "2"; + } + + if(req.params.count(PREFIX) == 0) { + req.params[PREFIX] = "false"; + } + + if(req.params.count(SEARCH_BY) == 0) { + return res.send_400(std::string("Parameter `") + SEARCH_BY + "` is required."); + } + + std::string filter_str = req.params.count(FILTER) != 0 ? req.params[FILTER] : ""; + + std::vector search_fields; + StringUtils::split(req.params[SEARCH_BY], search_fields, ","); + + std::vector facet_fields; + StringUtils::split(req.params[FACET_BY], facet_fields, "&&"); + + std::vector sort_fields; + if(req.params.count(SORT_BY) != 0) { + std::vector sort_field_strs; + StringUtils::split(req.params[SORT_BY], sort_field_strs, ","); + + if(sort_field_strs.size() > 2) { + return res.send_400("Only upto 2 sort fields are allowed."); + } + + for(const std::string & sort_field_str: sort_field_strs) { + std::vector expression_parts; + StringUtils::split(sort_field_str, expression_parts, ":"); + + if(expression_parts.size() != 2) { + return res.send_400(std::string("Parameter `") + SORT_BY + "` is malformed."); + } + + StringUtils::toupper(expression_parts[1]); + sort_fields.push_back(sort_field(expression_parts[0], expression_parts[1])); + } + } + + auto begin = std::chrono::high_resolution_clock::now(); + + CollectionManager & collectionManager = CollectionManager::get_instance(); + Collection* collection = 
collectionManager.get_collection(req.params["collection"]); + + if(collection == nullptr) { + return res.send_404(); + } + + bool prefix = (req.params[PREFIX] == "true"); + + token_ordering token_order = FREQUENCY; + if(prefix && !collection->get_token_ranking_field().empty()) { + token_order = MAX_SCORE; + } + + nlohmann::json result = collection->search(req.params["q"], search_fields, filter_str, facet_fields, + sort_fields, std::stoi(req.params[NUM_TYPOS]), 100, + token_order, prefix); + const std::string & json_str = result.dump(); + //std::cout << "JSON:" << json_str << std::endl; + struct rusage r_usage; + getrusage(RUSAGE_SELF,&r_usage); + + //std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl; + res.send_200(json_str); + + long long int timeMicros = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); + std::cout << "Time taken: " << timeMicros << "us" << std::endl; +} + +void post_add_document(http_req & req, http_res & res) { + CollectionManager & collectionManager = CollectionManager::get_instance(); + Collection* collection = collectionManager.get_collection(req.params["collection"]); + + if(collection == nullptr) { + return res.send_404(); + } + + Option inserted_id_op = collection->add(req.body); + + if(!inserted_id_op.ok()) { + res.send(inserted_id_op.code(), inserted_id_op.error()); + } else { + nlohmann::json json_response; + json_response["id"] = inserted_id_op.get(); + res.send_201(json_response.dump()); + } +} + +void del_remove_document(http_req & req, http_res & res) { + std::string doc_id = req.params["id"]; + + CollectionManager & collectionManager = CollectionManager::get_instance(); + Collection* collection = collectionManager.get_collection(req.params["collection"]); + if(collection == nullptr) { + return res.send_404(); + } + + Option deleted_id_op = collection->remove(doc_id); + + if(!deleted_id_op.ok()) { + res.send(deleted_id_op.code(), deleted_id_op.error()); + } else { + nlohmann::json 
json_response; + json_response["id"] = deleted_id_op.get(); + res.send_200(json_response.dump()); + } +} \ No newline at end of file diff --git a/src/array_utils.cpp b/src/array_utils.cpp new file mode 100644 index 00000000..610aa96c --- /dev/null +++ b/src/array_utils.cpp @@ -0,0 +1,87 @@ +#include "array_utils.h" +#include + +size_t ArrayUtils::and_scalar(const uint32_t *A, const size_t lenA, + const uint32_t *B, const size_t lenB, uint32_t *out) { + const uint32_t *const initout(out); + if (lenA == 0 || lenB == 0) + return 0; + + const uint32_t *endA = A + lenA; + const uint32_t *endB = B + lenB; + + while (1) { + while (*A < *B) { + SKIP_FIRST_COMPARE: + if (++A == endA) + return (out - initout); + } + while (*A > *B) { + if (++B == endB) + return (out - initout); + } + if (*A == *B) { + *out++ = *A; + if (++A == endA || ++B == endB) + return (out - initout); + } else { + goto SKIP_FIRST_COMPARE; + } + } + + return (out - initout); // NOTREACHED +} + +size_t ArrayUtils::or_scalar(const uint32_t *A, const size_t lenA, + const uint32_t *B, const size_t lenB, uint32_t **out) { + size_t indexA = 0, indexB = 0, res_index = 0; + + if(A == nullptr) { + *out = new uint32_t[lenB]; + memcpy(*out, B, lenB * sizeof(uint32_t)); + return lenB; + } + + uint32_t* results = new uint32_t[lenA+lenB]; + + while (indexA < lenA && indexB < lenB) { + if (A[indexA] < B[indexB]) { + if(res_index == 0 || results[res_index-1] != A[indexA]) { + results[res_index] = A[indexA]; + res_index++; + } + indexA++; + } else { + if(res_index == 0 || results[res_index-1] != B[indexB]) { + results[res_index] = B[indexB]; + res_index++; + } + indexB++; + } + } + + while (indexA < lenA) { + if(results[res_index-1] != A[indexA]) { + results[res_index] = A[indexA]; + res_index++; + } + + indexA++; + } + + while (indexB < lenB) { + if(results[res_index-1] != B[indexB]) { + results[res_index] = B[indexB]; + res_index++; + } + + indexB++; + } + + // shrink fit + *out = new uint32_t[res_index]; + 
memcpy(*out, results, res_index * sizeof(uint32_t)); + delete[] results; + + return res_index; +} \ No newline at end of file diff --git a/src/collection.cpp b/src/collection.cpp index 80e63976..c06fca87 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -2,16 +2,16 @@ #include #include -#include +#include #include #include #include Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store, const std::vector &search_fields, const std::vector & facet_fields, - const std::vector & rank_fields, const std::string token_ordering_field): + const std::vector & sort_fields, const std::string token_ranking_field): name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store), - rank_fields(rank_fields), token_ordering_field(token_ordering_field) { + sort_fields(sort_fields), token_ranking_field(token_ranking_field) { for(const field& field: search_fields) { art_tree *t = new art_tree; @@ -21,33 +21,27 @@ Collection::Collection(const std::string name, const uint32_t collection_id, con } for(const field& field: facet_fields) { - art_tree *t = new art_tree; - art_tree_init(t); - facet_index.emplace(field.name, t); + facet_value fvalue; + facet_index.emplace(field.name, fvalue); facet_schema.emplace(field.name, field); } - for(const std::string & rank_field: rank_fields) { + for(const field & sort_field: sort_fields) { spp::sparse_hash_map * doc_to_score = new spp::sparse_hash_map(); - rank_index.emplace(rank_field, doc_to_score); + sort_index.emplace(sort_field.name, doc_to_score); } } Collection::~Collection() { - for(std::pair name_field: search_schema) { + for(auto & name_field: search_schema) { art_tree *t = search_index.at(name_field.first); art_tree_destroy(t); t = nullptr; } - for(std::pair name_field: facet_schema) { - art_tree *t = facet_index.at(name_field.first); - art_tree_destroy(t); - t = nullptr; - } - - for(std::pair*> name_map: rank_index) { + for(auto & name_map: 
sort_index) { delete name_map.second; + name_map.second = nullptr; } } @@ -56,7 +50,7 @@ uint32_t Collection::get_next_seq_id() { return next_seq_id++; } -Option Collection::add(std::string json_str) { +Option Collection::add(const std::string & json_str) { nlohmann::json document = nlohmann::json::parse(json_str); uint32_t seq_id = get_next_seq_id(); @@ -80,22 +74,22 @@ Option Collection::add(std::string json_str) { } Option Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) { - if(!token_ordering_field.empty() && document.count(token_ordering_field) == 0) { - return Option<>(400, "Field `" + token_ordering_field + "` has been declared as a token ordering field, " + if(!token_ranking_field.empty() && document.count(token_ranking_field) == 0) { + return Option<>(400, "Field `" + token_ranking_field + "` has been declared as a token ranking field, " "but is not found in the document."); } - if(!token_ordering_field.empty() && !document[token_ordering_field].is_number()) { - return Option<>(400, "Token ordering field `" + token_ordering_field + "` must be an INT32."); + if(!token_ranking_field.empty() && !document[token_ranking_field].is_number()) { + return Option<>(400, "Token ranking field `" + token_ranking_field + "` must be an INT32."); } - if(!token_ordering_field.empty() && document[token_ordering_field].get() > INT32_MAX) { - return Option<>(400, "Token ordering field `" + token_ordering_field + "` exceeds maximum value of INT32."); + if(!token_ranking_field.empty() && document[token_ranking_field].get() > INT32_MAX) { + return Option<>(400, "Token ranking field `" + token_ranking_field + "` exceeds maximum value of INT32."); } uint32_t points = 0; - if(!token_ordering_field.empty()) { - points = document[token_ordering_field]; + if(!token_ranking_field.empty()) { + points = document[token_ranking_field]; } for(const std::pair & field_pair: search_schema) { @@ -176,13 +170,13 @@ Option Collection::index_in_memory(const 
nlohmann::json &document, uin "but is not found in the document."); } - art_tree *t = facet_index.at(field_name); + facet_value & fvalue = facet_index.at(field_name); if(field_pair.second.type == field_types::STRING) { if(!document[field_name].is_string()) { return Option<>(400, "Facet field `" + field_name + "` must be a STRING."); } - const std::string & text = document[field_name]; - index_string_field(text, points, t, seq_id, true); + const std::string & value = document[field_name]; + fvalue.index_values(seq_id, { value }); } else if(field_pair.second.type == field_types::STRING_ARRAY) { if(!document[field_name].is_array()) { return Option<>(400, "Facet field `" + field_name + "` must be a STRING_ARRAY."); @@ -192,23 +186,23 @@ Option Collection::index_in_memory(const nlohmann::json &document, uin return Option<>(400, "Facet field `" + field_name + "` must be a STRING_ARRAY."); } - std::vector strings = document[field_name]; - index_string_array_field(strings, points, t, seq_id, true); + const std::vector & values = document[field_name]; + fvalue.index_values(seq_id, values); } } - for(const std::string & rank_field: rank_fields) { - if(document.count(rank_field) == 0) { - return Option<>(400, "Field `" + rank_field + "` has been declared as a rank field in the schema, " + for(const field & sort_field: sort_fields) { + if(document.count(sort_field.name) == 0) { + return Option<>(400, "Field `" + sort_field.name + "` has been declared as a sort field in the schema, " "but is not found in the document."); } - if(!document[rank_field].is_number()) { - return Option<>(400, "Rank field `" + rank_field + "` must be an integer."); + if(!document[sort_field.name].is_number()) { + return Option<>(400, "Sort field `" + sort_field.name + "` must be a number."); } - spp::sparse_hash_map *doc_to_score = rank_index.at(rank_field); - doc_to_score->emplace(seq_id, document[rank_fields[0]].get()); + spp::sparse_hash_map *doc_to_score = sort_index.at(sort_field.name); + 
doc_to_score->emplace(seq_id, document[sort_field.name].get()); } return Option<>(200); @@ -269,9 +263,9 @@ void Collection::index_string_field(const std::string & text, const uint32_t sco tokens.push_back(text); token_to_offsets[text].push_back(0); } else { - StringUtils::tokenize(text, tokens, " ", true); + StringUtils::split(text, tokens, " "); for(uint32_t i=0; i & facets, uint32_t* result_ids, si for(auto & a_facet: facets) { // assumed that facet fields have already been validated upstream const field & facet_field = facet_schema.at(a_facet.field_name); + const facet_value & fvalue = facet_index.at(facet_field.name); - // loop through the field, get all keys and intersect those ids with result ids - if(facet_index.count(facet_field.name) != 0) { - art_tree *t = facet_index.at(facet_field.name); - std::vector leaves; - - art_topk_iter(t->root, MAX_SCORE, 10, leaves); - - for(const art_leaf* leaf: leaves) { - const uint32_t* facet_ids = leaf->values->ids.uncompress(); - size_t facet_ids_size = leaf->values->ids.getLength(); - - uint32_t* facet_results = new uint32_t[std::min(facet_ids_size, results_size)]; - const size_t facet_results_size = Intersection::scalar(result_ids, results_size, - facet_ids, facet_ids_size, facet_results); - - const std::string facet_value((const char *)leaf->key, leaf->key_len-1); // drop trailing null - a_facet.result_map.insert(std::pair(facet_value, facet_results_size)); - - delete [] facet_ids; - delete [] facet_results; + for(auto i = 0; i < results_size; i++) { + uint32_t doc_seq_id = result_ids[i]; + if(fvalue.doc_values.count(doc_seq_id) != 0) { + // for every result document, get the values associated and increment counter + const std::vector & value_indices = fvalue.doc_values.at(doc_seq_id); + for(auto j = 0; j < value_indices.size(); j++) { + const std::string & facet_value = fvalue.index_value.at(value_indices.at(j)); + a_facet.result_map[facet_value] += 1; + } } } } } void Collection::search_candidates(uint32_t* 
filter_ids, size_t filter_ids_length, std::vector & facets, - const std::vector & rank_fields, int & token_rank, + const std::vector & sort_fields, int & candidate_rank, std::vector> & token_leaves, Topster<100> & topster, - size_t & total_results, size_t & num_found, const size_t & max_results) { + size_t & total_results, uint32_t** all_result_ids, size_t & all_result_ids_len, + const size_t & max_results) { const size_t combination_limit = 10; auto product = []( long long a, std::vector& b ) { return a*b.size(); }; long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product); @@ -367,7 +353,11 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt for(long long n=0; n query_suggestion = next_suggestion(token_leaves, n); - token_rank++; + candidate_rank++; + + /*for(auto i=0; i < query_suggestion.size(); i++) { + std::cout << "i: " << i << " - " << query_suggestion[i]->key << std::endl; + }*/ // initialize results with the starting element (for further intersection) uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress(); @@ -386,22 +376,32 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt if(filter_ids != nullptr) { // intersect once again with filter ids uint32_t* filtered_result_ids = new uint32_t[std::min(filter_ids_length, result_size)]; - size_t filtered_results_size = - Intersection::scalar(filter_ids, filter_ids_length, result_ids, result_size, filtered_result_ids); + size_t filtered_results_size = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids, + result_size, filtered_result_ids); + + uint32_t* new_all_result_ids; + all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, filtered_result_ids, + filtered_results_size, &new_all_result_ids); + delete [] *all_result_ids; + *all_result_ids = new_all_result_ids; do_facets(facets, filtered_result_ids, filtered_results_size); // go through each matching document id and 
calculate match score - score_results(rank_fields, token_rank, topster, query_suggestion, filtered_result_ids, filtered_results_size); - num_found += filtered_results_size; + score_results(sort_fields, candidate_rank, topster, query_suggestion, filtered_result_ids, filtered_results_size); delete[] filtered_result_ids; delete[] result_ids; } else { do_facets(facets, result_ids, result_size); - score_results(rank_fields, token_rank, topster, query_suggestion, result_ids, result_size); - num_found += result_size; + uint32_t* new_all_result_ids; + all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, result_ids, + result_size, &new_all_result_ids); + delete [] *all_result_ids; + *all_result_ids = new_all_result_ids; + + score_results(sort_fields, candidate_rank, topster, query_suggestion, result_ids, result_size); delete[] result_ids; } @@ -544,7 +544,8 @@ Option Collection::do_filtering(uint32_t** filter_ids_out, const std:: filter_ids_length = result_ids_length; } else { uint32_t* filtered_results = new uint32_t[std::min((size_t)filter_ids_length, result_ids_length)]; - filter_ids_length = Intersection::scalar(filter_ids, filter_ids_length, result_ids, result_ids_length, filtered_results); + filter_ids_length = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids, + result_ids_length, filtered_results); delete [] filter_ids; delete [] result_ids; filter_ids = filtered_results; @@ -558,9 +559,8 @@ Option Collection::do_filtering(uint32_t** filter_ids_out, const std:: nlohmann::json Collection::search(std::string query, const std::vector search_fields, const std::string & simple_filter_query, const std::vector & facet_fields, - const std::vector & rank_fields, const int num_typos, + const std::vector & sort_fields, const int num_typos, const size_t num_results, const token_ordering token_order, const bool prefix) { - size_t num_found = 0; nlohmann::json result = nlohmann::json::object(); std::vector facets; @@ -587,10 +587,15 @@ 
nlohmann::json Collection::search(std::string query, const std::vector::KV>> field_order_kvs; + uint32_t* all_result_ids = nullptr; + size_t all_result_ids_len = 0; for(int i = 0; i < search_fields.size(); i++) { Topster<100> topster; const std::string & field = search_fields[i]; // proceed to query search only when no filters are provided or when filtering produces results if(simple_filter_query.size() == 0 || filter_ids_length > 0) { - search_field(query, field, filter_ids, filter_ids_length, facets, rank_fields, num_typos, num_results, - topster, num_found, token_order, prefix); + search_field(query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos, num_results, + topster, &all_result_ids, all_result_ids_len, token_order, prefix); topster.sort(); } + // order of fields specified matter: matching docs from earlier fields are more important for(auto t = 0; t < topster.size && t < num_results; t++) { field_order_kvs.push_back(std::make_pair(search_fields.size() - i, topster.getKV(t))); } @@ -626,18 +634,19 @@ nlohmann::json Collection::search(std::string query, const std::vector::KV> & a, const std::pair::KV> & b) { if(a.second.match_score != b.second.match_score) return a.second.match_score > b.second.match_score; if(a.second.primary_attr != b.second.primary_attr) return a.second.primary_attr > b.second.primary_attr; if(a.second.secondary_attr != b.second.secondary_attr) return a.second.secondary_attr > b.second.secondary_attr; - if(a.first != b.first) return a.first > b.first; + if(a.first != b.first) return a.first > b.first; // field position return a.second.key > b.second.key; }); result["hits"] = nlohmann::json::array(); - for(auto field_order_kv: field_order_kvs) { + for(auto & field_order_kv: field_order_kvs) { std::string value; const std::string &seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key); store->get(seq_id_key, value); @@ -645,7 +654,7 @@ nlohmann::json Collection::search(std::string query, const std::vector> 
value_to_count; + for (auto itr = a_facet.result_map.begin(); itr != a_facet.result_map.end(); ++itr) { + value_to_count.push_back(*itr); + } + + std::sort(value_to_count.begin(), value_to_count.end(), + [=](std::pair& a, std::pair& b) { + return a.second > b.second; + }); + + for(auto i = 0; i < std::min((size_t)10, value_to_count.size()); i++) { + auto & kv = value_to_count[i]; nlohmann::json facet_value_count = nlohmann::json::object(); facet_value_count["value"] = kv.first; facet_value_count["count"] = kv.second; @@ -681,11 +702,11 @@ nlohmann::json Collection::search(std::string query, const std::vector & facets, const std::vector & rank_fields, const int num_typos, - const size_t num_results, Topster<100> &topster, size_t & num_found, - const token_ordering token_order, const bool prefix) { + std::vector & facets, const std::vector & sort_fields, const int num_typos, + const size_t num_results, Topster<100> &topster, uint32_t** all_result_ids, + size_t & all_result_ids_len, const token_ordering token_order, const bool prefix) { std::vector tokens; - StringUtils::tokenize(query, tokens, " ", true); + StringUtils::split(query, tokens, " "); const int max_cost = (num_typos < 0 || num_typos > 2) ? 
2 : num_typos; const size_t max_results = std::min(num_results, (size_t) Collection::MAX_RESULTS); @@ -714,7 +735,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui const size_t combination_limit = 10; auto product = []( long long a, std::vector& b ) { return a*b.size(); }; - int token_rank = 0; + int candidate_rank = 0; long long n = 0; long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product); @@ -737,8 +758,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui const std::string token_cost_hash = token + std::to_string(costs[token_index]); std::vector leaves; - /*std::cout << "\nSearching for: " << token << " - cost: " << costs[token_index] << ", token_rank: " - << token_rank << std::endl;*/ + /*std::cout << "\nSearching for: " << token << " - cost: " << costs[token_index] << ", candidate_rank: " + << candidate_rank << std::endl;*/ if(token_cost_cache.count(token_cost_hash) != 0) { leaves = token_cost_cache[token_cost_hash]; @@ -758,9 +779,9 @@ void Collection::search_field(std::string & query, const std::string & field, ui if(!leaves.empty()) { //!log_leaves(costs[token_index], token, leaves); token_leaves.push_back(leaves); - token_to_count[token] = leaves.at(0)->values->ids.getLength(); + token_to_count[token] = std::max(token_to_count[token], leaves.at(0)->values->ids.getLength()); } else { - // No result at `cost = costs[token_index]` => remove cost for token and re-do combinations + // No result at `cost = costs[token_index]`. 
Remove costs until `cost` for token and re-do combinations auto it = std::find(token_to_costs[token_index].begin(), token_to_costs[token_index].end(), costs[token_index]); if(it != token_to_costs[token_index].end()) { token_to_costs[token_index].erase(it); @@ -773,9 +794,9 @@ void Collection::search_field(std::string & query, const std::string & field, ui } } + // To continue outerloop on new cost combination n = -1; N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product); - break; } @@ -784,8 +805,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) { // If all tokens were found, go ahead and search for candidates with what we have so far - search_candidates(filter_ids, filter_ids_length, facets, rank_fields, token_rank, token_leaves, topster, - total_results, num_found, max_results); + search_candidates(filter_ids, filter_ids_length, facets, sort_fields, candidate_rank, token_leaves, topster, + total_results, all_result_ids, all_result_ids_len, max_results); if (total_results >= max_results) { // If we don't find enough results, we continue outerloop (looking at tokens with greater cost) @@ -818,8 +839,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui } } - return search_field(truncated_query, field, filter_ids, filter_ids_length, facets, rank_fields, num_typos, - num_results, topster, num_found, token_order, prefix); + return search_field(truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos, + num_results, topster, all_result_ids, all_result_ids_len, token_order, prefix); } } @@ -834,11 +855,11 @@ void Collection::log_leaves(const int cost, const std::string &token, const std: } } -void Collection::score_results(const std::vector & rank_fields, const int & token_rank, +void Collection::score_results(const std::vector & sort_fields, const int & candidate_rank, 
Topster<100> & topster, const std::vector &query_suggestion, const uint32_t *result_ids, const size_t result_size) const { - const int max_token_rank = 250; + const int max_candidate_rank = 250; spp::sparse_hash_map leaf_to_indices; if(query_suggestion.size() != 1) { @@ -853,13 +874,23 @@ void Collection::score_results(const std::vector & rank_fields, con spp::sparse_hash_map * primary_rank_scores = nullptr; spp::sparse_hash_map * secondary_rank_scores = nullptr; - if(rank_fields.size() > 0) { + // Used for asc/desc ordering. NOTE: Topster keeps biggest keys (i.e. it's desc in nature) + int64_t primary_rank_factor = 1; + int64_t secondary_rank_factor = 1; + + if(sort_fields.size() > 0) { // assumed that rank field exists in the index - checked earlier in the chain - primary_rank_scores = rank_index.at(rank_fields[0]); + primary_rank_scores = sort_index.at(sort_fields[0].name); + if(sort_fields[0].order == sort_field_const::asc) { + primary_rank_factor = -1; + } } - if(rank_fields.size() > 1) { - secondary_rank_scores = rank_index.at(rank_fields[1]); + if(sort_fields.size() > 1) { + secondary_rank_scores = sort_index.at(sort_fields[1].name); + if(sort_fields[1].order == sort_field_const::asc) { + secondary_rank_factor = -1; + } } for(auto i=0; i & rank_fields, con mscore = MatchScore::match_score(seq_id, token_positions); } - int token_rank_score = max_token_rank - token_rank; + int candidate_rank_score = max_candidate_rank - candidate_rank; // Construct a single match_score from individual components (for multi-field sort) - const uint64_t match_score = (token_rank_score << 16) + - ((uint64_t)(mscore.words_present) << 8) + + const uint64_t match_score = ((uint64_t)(mscore.words_present) << 16) + + (candidate_rank_score << 8) + (MAX_SEARCH_TOKENS - mscore.distance); - int64_t primary_rank_score = primary_rank_scores->count(seq_id) > 0 ? 
primary_rank_scores->at(seq_id) : 0; + int64_t primary_rank_score = (primary_rank_scores && primary_rank_scores->count(seq_id) > 0) ? + primary_rank_scores->at(seq_id) : 0; int64_t secondary_rank_score = (secondary_rank_scores && secondary_rank_scores->count(seq_id) > 0) ? secondary_rank_scores->at(seq_id) : 0; - topster.add(seq_id, match_score, primary_rank_score, secondary_rank_score); - /*std::cout << "token_rank_score: " << token_rank_score << ", match_score: " + topster.add(seq_id, match_score, + primary_rank_factor * primary_rank_score, + secondary_rank_factor * secondary_rank_score); + + /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", match_score: " << match_score << ", primary_rank_score: " << primary_rank_score << ", seq_id: " << seq_id << std::endl;*/ } @@ -968,9 +1003,13 @@ void Collection::remove_and_shift_offset_index(sorted_array &offset_index, const delete[] new_array; } -void Collection::remove(std::string id) { +Option Collection::remove(const std::string & id) { std::string seq_id_str; - store->get(get_doc_id_key(id), seq_id_str); + StoreStatus status = store->get(get_doc_id_key(id), seq_id_str); + + if(status == StoreStatus::NOT_FOUND) { + return Option(404, "Could not find a document with id: " + id); + } uint32_t seq_id = (uint32_t) std::stol(seq_id_str); @@ -979,48 +1018,108 @@ void Collection::remove(std::string id) { nlohmann::json document = nlohmann::json::parse(parsed_document); - std::vector tokens; - StringUtils::tokenize(document["title"], tokens, " ", true); - - for(auto token: tokens) { - std::transform(token.begin(), token.end(), token.begin(), ::tolower); - - const unsigned char *key = (const unsigned char *) token.c_str(); - int key_len = (int) (token.length() + 1); - - art_leaf* leaf = (art_leaf *) art_search(search_index.at("title"), key, key_len); - if(leaf != NULL) { - uint32_t seq_id_values[1] = {seq_id}; - - uint32_t doc_index = leaf->values->ids.indexOf(seq_id); - uint32_t start_offset = 
leaf->values->offset_index.at(doc_index); - uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ? - leaf->values->offsets.getLength() : - leaf->values->offset_index.at(doc_index+1); - - uint32_t doc_indices[1] = {doc_index}; - remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1); - - leaf->values->offsets.remove_index(start_offset, end_offset); - leaf->values->ids.remove_values(seq_id_values, 1); - - /*len = leaf->values->offset_index.getLength(); - for(auto i=0; ivalues->offset_index.at(i) << std::endl; + for(auto & name_field: search_schema) { + std::vector tokens; + if(name_field.second.type == field_types::STRING) { + StringUtils::split(document[name_field.first], tokens, " "); + } else if(name_field.second.type == field_types::STRING_ARRAY) { + tokens = document[name_field.first].get>(); + } else if(name_field.second.type == field_types::INT32) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + int32_t value = document[name_field.first].get(); + encode_int32(value, key); + tokens.push_back(std::string((char*)key, KEY_LEN)); + } else if(name_field.second.type == field_types::INT32_ARRAY) { + std::vector values = document[name_field.first].get>(); + for(const int32_t value: values) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + encode_int32(value, key); + tokens.push_back(std::string((char*)key, KEY_LEN)); } - std::cout << "----" << std::endl;*/ + } else if(name_field.second.type == field_types::INT64) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + int64_t value = document[name_field.first].get(); + encode_int64(value, key); + tokens.push_back(std::string((char*)key, KEY_LEN)); + } else if(name_field.second.type == field_types::INT64_ARRAY) { + std::vector values = document[name_field.first].get>(); + for(const int64_t value: values) { + const int KEY_LEN = 8; + unsigned char key[KEY_LEN]; + encode_int64(value, key); + tokens.push_back(std::string((char*)key, KEY_LEN)); + } + } - 
if(leaf->values->ids.getLength() == 0) { - art_delete(search_index.at("title"), key, key_len); + for(auto & token: tokens) { + const unsigned char *key; + int key_len; + + if(name_field.second.type == field_types::STRING_ARRAY || name_field.second.type == field_types::STRING) { + std::transform(token.begin(), token.end(), token.begin(), ::tolower); + key = (const unsigned char *) token.c_str(); + key_len = (int) (token.length() + 1); + } else { + key = (const unsigned char *) token.c_str(); + key_len = (int) (token.length()); + } + + art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len); + if(leaf != NULL) { + uint32_t seq_id_values[1] = {seq_id}; + uint32_t doc_index = leaf->values->ids.indexOf(seq_id); + + if(doc_index == leaf->values->ids.getLength()) { + // not found - happens when 2 tokens repeat in a field, e.g "is it or is is not?" + continue; + } + + uint32_t start_offset = leaf->values->offset_index.at(doc_index); + uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ? 
+ leaf->values->offsets.getLength() : + leaf->values->offset_index.at(doc_index+1); + + uint32_t doc_indices[1] = {doc_index}; + remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1); + + leaf->values->offsets.remove_index(start_offset, end_offset); + leaf->values->ids.remove_values(seq_id_values, 1); + + /*len = leaf->values->offset_index.getLength(); + for(auto i=0; ivalues->offset_index.at(i) << std::endl; + } + std::cout << "----" << std::endl;*/ + + if(leaf->values->ids.getLength() == 0) { + art_values* values = (art_values*) art_delete(search_index.at(name_field.first), key, key_len); + delete values; + values = nullptr; + } } } } + // remove facets if any + for(auto & field_facet_value: facet_index) { + field_facet_value.second.doc_values.erase(seq_id); + } + + // remove sort index if any + for(auto & field_doc_value_map: sort_index) { + field_doc_value_map.second->erase(seq_id); + } + store->remove(get_doc_id_key(id)); store->remove(get_seq_id_key(seq_id)); + + return Option(id); } -std::string Collection::get_next_seq_id_key(std::string collection_name) { +std::string Collection::get_next_seq_id_key(const std::string & collection_name) { return std::string(COLLECTION_NEXT_SEQ_PREFIX) + "_" + collection_name; } @@ -1035,7 +1134,7 @@ std::string Collection::get_seq_id_key(uint32_t seq_id) { return get_seq_id_collection_prefix() + "_" + std::string(bytes, bytes+4); } -std::string Collection::get_doc_id_key(std::string doc_id) { +std::string Collection::get_doc_id_key(const std::string & doc_id) { return std::to_string(collection_id) + "_" + DOC_ID_PREFIX + doc_id; } @@ -1059,15 +1158,15 @@ std::vector Collection::get_facet_fields() { return facet_fields_copy; } -std::vector Collection::get_rank_fields() { - return rank_fields; +std::vector Collection::get_sort_fields() { + return sort_fields; } spp::sparse_hash_map Collection::get_schema() { return search_schema; }; -std::string Collection::get_meta_key(std::string collection_name) { 
+std::string Collection::get_meta_key(const std::string & collection_name) { return COLLECTION_META_PREFIX + collection_name; } @@ -1075,6 +1174,6 @@ std::string Collection::get_seq_id_collection_prefix() { return std::to_string(collection_id) + "_" + std::string(SEQ_ID_PREFIX); } -std::string Collection::get_token_ordering_field() { - return token_ordering_field; +std::string Collection::get_token_ranking_field() { + return token_ranking_field; } \ No newline at end of file diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index ca6d86c5..4e2e1ddc 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -23,7 +23,7 @@ void CollectionManager::init(Store *store) { std::vector collection_meta_jsons; store->scan_fill(Collection::COLLECTION_META_PREFIX, collection_meta_jsons); - for(auto collection_meta_json: collection_meta_jsons) { + for(auto & collection_meta_json: collection_meta_jsons) { nlohmann::json collection_meta = nlohmann::json::parse(collection_meta_json); std::string this_collection_name = collection_meta[COLLECTION_NAME_KEY].get(); @@ -45,10 +45,15 @@ void CollectionManager::init(Store *store) { store->get(Collection::get_next_seq_id_key(this_collection_name), collection_next_seq_id_str); uint32_t collection_next_seq_id = (const uint32_t) std::stoi(collection_next_seq_id_str); - std::vector collection_rank_fields = - collection_meta[COLLECTION_RANK_FIELDS_KEY].get>(); - std::string token_ordering_field = collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY].get(); + std::vector collection_sort_fields; + nlohmann::json sort_fields_map = collection_meta[COLLECTION_SORT_FIELDS_KEY]; + + for (nlohmann::json::iterator it = sort_fields_map.begin(); it != sort_fields_map.end(); ++it) { + collection_sort_fields.push_back({it.value()[fields::name], it.value()[fields::type]}); + } + + std::string token_ranking_field = collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY].get(); Collection* collection = new 
Collection(this_collection_name, collection_meta[COLLECTION_ID_KEY].get(), @@ -56,8 +61,8 @@ void CollectionManager::init(Store *store) { store, search_fields, facet_fields, - collection_rank_fields, - token_ordering_field); + collection_sort_fields, + token_ranking_field); // Fetch records from the store and re-create memory index std::vector documents; @@ -82,8 +87,8 @@ void CollectionManager::init(Store *store) { Collection* CollectionManager::create_collection(std::string name, const std::vector & search_fields, const std::vector & facet_fields, - const std::vector & rank_fields, - const std::string & token_ordering_field) { + const std::vector & sort_fields, + const std::string & token_ranking_field) { if(store->contains(Collection::get_meta_key(name))) { return nullptr; } @@ -91,7 +96,7 @@ Collection* CollectionManager::create_collection(std::string name, const std::ve nlohmann::json collection_meta; nlohmann::json search_fields_json = nlohmann::json::array();; - for(const field& search_field: search_fields) { + for(const field & search_field: search_fields) { nlohmann::json field_val; field_val[fields::name] = search_field.name; field_val[fields::type] = search_field.type; @@ -99,22 +104,30 @@ Collection* CollectionManager::create_collection(std::string name, const std::ve } nlohmann::json facet_fields_json = nlohmann::json::array();; - for(const field& facet_field: facet_fields) { + for(const field & facet_field: facet_fields) { nlohmann::json field_val; field_val[fields::name] = facet_field.name; field_val[fields::type] = facet_field.type; facet_fields_json.push_back(field_val); } + nlohmann::json sort_fields_json = nlohmann::json::array();; + for(const field & sort_field: sort_fields) { + nlohmann::json sort_field_val; + sort_field_val[fields::name] = sort_field.name; + sort_field_val[fields::type] = sort_field.type; + sort_fields_json.push_back(sort_field_val); + } + collection_meta[COLLECTION_NAME_KEY] = name; collection_meta[COLLECTION_ID_KEY] = 
next_collection_id; collection_meta[COLLECTION_SEARCH_FIELDS_KEY] = search_fields_json; collection_meta[COLLECTION_FACET_FIELDS_KEY] = facet_fields_json; - collection_meta[COLLECTION_RANK_FIELDS_KEY] = rank_fields; - collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY] = token_ordering_field; + collection_meta[COLLECTION_SORT_FIELDS_KEY] = sort_fields_json; + collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY] = token_ranking_field; Collection* new_collection = new Collection(name, next_collection_id, 0, store, search_fields, facet_fields, - rank_fields, token_ordering_field); + sort_fields, token_ranking_field); store->insert(Collection::get_meta_key(name), collection_meta.dump()); store->insert(Collection::get_next_seq_id_key(name), std::to_string(0)); @@ -135,12 +148,6 @@ Collection* CollectionManager::get_collection(std::string collection_name) { return nullptr; } -CollectionManager::~CollectionManager() { - for(auto kv: collections) { - drop_collection(kv.first); - } -} - bool CollectionManager::drop_collection(std::string collection_name) { Collection* collection = get_collection(collection_name); if(collection == nullptr) { diff --git a/src/http_server.cpp b/src/http_server.cpp new file mode 100644 index 00000000..cd02e2d5 --- /dev/null +++ b/src/http_server.cpp @@ -0,0 +1,219 @@ +#include "http_server.h" +#include "string_utils.h" +#include +#include + +h2o_globalconf_t HttpServer::config; +h2o_context_t HttpServer::ctx; +h2o_accept_ctx_t HttpServer::accept_ctx; +std::vector HttpServer::routes; + +HttpServer::HttpServer(std::string listen_address, uint32_t listen_port): + listen_address(listen_address), listen_port(listen_port) { + h2o_config_init(&config); + hostconf = h2o_config_register_host(&config, h2o_iovec_init(H2O_STRLIT("default")), 65535); + register_handler(hostconf, "/", catch_all_handler); +} + +void HttpServer::on_accept(h2o_socket_t *listener, const char *err) { + h2o_socket_t *sock; + + if (err != NULL) { + return; + } + + if ((sock = 
h2o_evloop_socket_accept(listener)) == NULL) { + return; + } + + h2o_accept(&accept_ctx, sock); +} + +int HttpServer::create_listener(void) { + struct sockaddr_in addr; + int fd, reuseaddr_flag = 1; + h2o_socket_t *sock; + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_port = htons(listen_port); + inet_pton(AF_INET, listen_address.c_str(), &(addr.sin_addr)); + + if ((fd = socket(AF_INET, SOCK_STREAM, 0)) == -1 || + setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr_flag, sizeof(reuseaddr_flag)) != 0 || + bind(fd, (struct sockaddr *)&addr, sizeof(addr)) != 0 || + listen(fd, SOMAXCONN) != 0) { + return -1; + } + + ctx.globalconf->server_name = h2o_strdup(NULL, "", SIZE_MAX); + sock = h2o_evloop_socket_create(ctx.loop, fd, H2O_SOCKET_FLAG_DONT_READ); + h2o_socket_read_start(sock, on_accept); + + return 0; +} + +int HttpServer::run() { + + signal(SIGPIPE, SIG_IGN); + h2o_context_init(&ctx, h2o_evloop_create(), &config); + + accept_ctx.ctx = &ctx; + accept_ctx.hosts = config.hosts; + + if (create_listener() != 0) { + std::cerr << "Failed to listen on " << listen_address << ":" << listen_port << std::endl + << "Error: " << strerror(errno) << std::endl; + return 1; + } + + while (h2o_evloop_run(ctx.loop) == 0); + + return 0; +} + +h2o_pathconf_t* HttpServer::register_handler(h2o_hostconf_t *hostconf, const char *path, + int (*on_req)(h2o_handler_t *, h2o_req_t *)) { + h2o_pathconf_t *pathconf = h2o_config_register_path(hostconf, path, 0); + h2o_handler_t *handler = h2o_create_handler(pathconf, sizeof(*handler)); + handler->on_req = on_req; + return pathconf; +} + +const char* HttpServer::get_status_reason(uint32_t status_code) { + switch(status_code) { + case 200: return "OK"; + case 201: return "Created"; + case 400: return "Bad Request"; + case 404: return "Not Found"; + case 409: return "Conflict"; + case 500: return "Internal Server Error"; + default: return ""; + } +} + + +std::map HttpServer::parse_query(const std::string& query) { + 
std::map query_map; + std::regex pattern("([\\w+%]+)=([^&]*)"); + + auto words_begin = std::sregex_iterator(query.begin(), query.end(), pattern); + auto words_end = std::sregex_iterator(); + + for (std::sregex_iterator i = words_begin; i != words_end; i++) { + std::string key = (*i)[1].str(); + std::string raw_value = (*i)[2].str(); + std::string value = StringUtils::url_decode(raw_value); + if(query_map.count(value) == 0) { + query_map[key] = value; + } else { + query_map[key] = query_map[key] + "&&" + value; + } + } + + return query_map; +} + +int HttpServer::catch_all_handler(h2o_handler_t *self, h2o_req_t *req) { + const std::string & http_method = std::string(req->method.base, req->method.len); + const std::string & path = std::string(req->path.base, req->path.len); + h2o_generator_t generator = {NULL, NULL}; + + std::vector path_with_query_parts; + StringUtils::split(path, path_with_query_parts, "?"); + const std::string & path_without_query = path_with_query_parts[0]; + + std::vector path_parts; + StringUtils::split(path_without_query, path_parts, "/"); + + h2o_iovec_t query = req->query_at != SIZE_MAX ? 
+ h2o_iovec_init(req->path.base + req->query_at, req->path.len - req->query_at) : + h2o_iovec_init(H2O_STRLIT("")); + + std::string query_str(query.base, query.len); + std::map query_map = parse_query(query_str); + const std::string & req_body = std::string(req->entity.base, req->entity.len); + + for(const route_path & rpath: routes) { + if(rpath.path_parts.size() != path_parts.size() || rpath.http_method != http_method) { + continue; + } + + bool found = true; + + for(size_t i = 0; i < rpath.path_parts.size(); i++) { + const std::string & rpart = rpath.path_parts[i]; + const std::string & given_part = path_parts[i]; + if(rpart != given_part && rpart[0] != ':') { + found = false; + goto check_next_route; + } + } + + check_next_route: + + if(found) { + // routes match - iterate and extract path params + for(size_t i = 0; i < rpath.path_parts.size(); i++) { + const std::string & path_part = rpath.path_parts[i]; + if(path_part[0] == ':') { + query_map.emplace(path_part.substr(1), path_parts[i]); + } + } + + http_req request = {query_map, req_body}; + http_res response; + (rpath.handler)(request, response); + + h2o_iovec_t body = h2o_strdup(&req->pool, response.body.c_str(), SIZE_MAX); + req->res.status = response.status_code; + req->res.reason = get_status_reason(response.status_code); + h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8")); + h2o_start_response(req, &generator); + h2o_send(req, &body, 1, 1); + + return 0; + } + } + + h2o_iovec_t res_body = h2o_strdup(&req->pool, "{ \"message\": \"Not Found\"}", SIZE_MAX); + req->res.status = 404; + req->res.reason = get_status_reason(404); + h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8")); + h2o_start_response(req, &generator); + h2o_send(req, &res_body, 1, 1); + + return 0; +} + +void HttpServer::get(const std::string & path, void (*handler)(http_req &, http_res &)) { + std::vector 
path_parts; + StringUtils::split(path, path_parts, "/"); + route_path rpath = {"GET", path_parts, handler}; + routes.push_back(rpath); +} + +void HttpServer::post(const std::string & path, void (*handler)(http_req &, http_res &)) { + std::vector path_parts; + StringUtils::split(path, path_parts, "/"); + route_path rpath = {"POST", path_parts, handler}; + routes.push_back(rpath); +} + +void HttpServer::put(const std::string & path, void (*handler)(http_req &, http_res &)) { + std::vector path_parts; + StringUtils::split(path, path_parts, "/"); + route_path rpath = {"PUT", path_parts, handler}; + routes.push_back(rpath); +} + +void HttpServer::del(const std::string & path, void (*handler)(http_req &, http_res &)) { + std::vector path_parts; + StringUtils::split(path, path_parts, "/"); + route_path rpath = {"DELETE", path_parts, handler}; + routes.push_back(rpath); +} + +HttpServer::~HttpServer() { + +} \ No newline at end of file diff --git a/src/intersection.cpp b/src/intersection.cpp deleted file mode 100644 index 3dd9e95a..00000000 --- a/src/intersection.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include "intersection.h" - -size_t Intersection::scalar(const uint32_t *A, const size_t lenA, - const uint32_t *B, const size_t lenB, uint32_t *out) { - const uint32_t *const initout(out); - if (lenA == 0 || lenB == 0) - return 0; - - const uint32_t *endA = A + lenA; - const uint32_t *endB = B + lenB; - - while (1) { - while (*A < *B) { - SKIP_FIRST_COMPARE: - if (++A == endA) - return (out - initout); - } - while (*A > *B) { - if (++B == endB) - return (out - initout); - } - if (*A == *B) { - *out++ = *A; - if (++A == endA || ++B == endB) - return (out - initout); - } else { - goto SKIP_FIRST_COMPARE; - } - } - - return (out - initout); // NOTREACHED -} diff --git a/src/main/benchmark.cpp b/src/main/benchmark.cpp index 22919a37..b9bfb38a 100644 --- a/src/main/benchmark.cpp +++ b/src/main/benchmark.cpp @@ -17,14 +17,14 @@ int main(int argc, char* argv[]) { system("rm -rf 
/tmp/typesense-data && mkdir -p /tmp/typesense-data"); std::vector fields_to_index = {field("title", field_types::STRING)}; - std::vector rank_fields = {"points"}; + std::vector sort_fields = { field("points", "INT32")}; Store *store = new Store("/tmp/typesense-data"); CollectionManager & collectionManager = CollectionManager::get_instance(); collectionManager.init(store); Collection *collection = collectionManager.get_collection("collection"); if(collection == nullptr) { - collection = collectionManager.create_collection("collection", fields_to_index, {}, rank_fields); + collection = collectionManager.create_collection("collection", fields_to_index, {}, sort_fields); } std::ifstream infile("/Users/kishore/Downloads/hnstories_small.jsonl"); @@ -48,7 +48,7 @@ int main(int argc, char* argv[]) { while(counter < 3000) { auto i = counter % 5; - auto results = collection->search(queries[i], search_fields, "", { }, {"points"}, 1, 100, MAX_SCORE, 0); + auto results = collection->search(queries[i], search_fields, "", { }, {sort_field("points", "DESC")}, 1, 100, MAX_SCORE, 0); results_total += results.size(); counter++; } diff --git a/src/main/main.cpp b/src/main/main.cpp index 52b28058..2b6ee5cc 100644 --- a/src/main/main.cpp +++ b/src/main/main.cpp @@ -8,72 +8,95 @@ #include #include #include "string_utils.h" +#include #include "collection.h" #include "collection_manager.h" using namespace std; -void find_indices(const uint32_t *result_ids, int low, int high, std::vector & results) { - if(high >= low) { - size_t pivot = (low + high) / 2; - //std::cout << pivot << std::endl; - results.at(pivot) = result_ids[pivot]; - find_indices(result_ids, low, pivot-1, results); - find_indices(result_ids, pivot+1, high, results); - } -} - int main(int argc, char* argv[]) { - std::vector results(3); - uint32_t *result_ids = new uint32_t[3]; - /*for(auto i = 0; i < 100; i++) { - result_ids[i] = i; - }*/ - result_ids[0] = 6; - result_ids[1] = 19; - result_ids[2] = 21; - - 
find_indices(result_ids, 0, 2, results); - //std::sort(results.begin(), results.end()); - for(auto i : results) { - std::cout << i << std::endl; - } - - return 0; - - const std::string state_dir_path = "/tmp/typesense-data"; - - std::vector fields_to_index = {field("title", field_types::STRING)}; - std::vector rank_fields = {"points"}; Store *store = new Store("/tmp/typesense-data"); CollectionManager & collectionManager = CollectionManager::get_instance(); collectionManager.init(store); - Collection *collection = collectionManager.get_collection("collection"); - if(collection == nullptr) { - collection = collectionManager.create_collection("collection", fields_to_index, {}, rank_fields); - std::ifstream infile(std::string(ROOT_DIR)+"test/documents.jsonl"); - //std::ifstream infile(argv[1]); + std::vector fields_to_index = { + field("lang", field_types::STRING), + field("description", field_types::STRING), + field("topics", field_types::STRING_ARRAY), + field("stars", field_types::INT32), + field("repo_name", field_types::STRING), + field("org", field_types::STRING) + }; + std::vector facet_fields_index = { + field("lang", field_types::STRING), + field("org", field_types::STRING), + field("topics", field_types::STRING_ARRAY) + }; + + std::vector sort_fields = { + field("stars", "INT32") + }; + + Collection *collection = collectionManager.get_collection("github_top1k"); + + if(collection == nullptr) { + collection = collectionManager.create_collection("github_top1k", fields_to_index, facet_fields_index, sort_fields); + } + + int j = 0; + while(j < 1000) { + j++; + + std::ifstream infile(argv[1]); std::string json_line; + cout << "BEGINNING Iteration: " << j << endl << flush; + auto begin = std::chrono::high_resolution_clock::now(); + int doc_id = 0; + while (std::getline(infile, json_line)) { - collection->add(json_line); + nlohmann::json document = nlohmann::json::parse(json_line); + //document["id"] = std::to_string(doc_id); + document["id"] = 
document["org"].get() + ":" + document["repo_name"].get(); + collection->add(document.dump()); + doc_id++; } infile.close(); - cout << "FINISHED INDEXING!" << endl << flush; + + long long int timeMillis = + std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); + + std::cout << "Time taken for insertion: " << timeMillis << "ms" << std::endl; + begin = std::chrono::high_resolution_clock::now(); + + std::ifstream infile2(argv[1]); + + doc_id = 0; + + while (std::getline(infile2, json_line)) { + nlohmann::json document = nlohmann::json::parse(json_line); + //document["id"] = std::to_string(doc_id); + document["id"] = document["org"].get() + ":" + document["repo_name"].get(); + collection->remove(document["id"]); + doc_id++; + } + + infile2.close(); + + timeMillis = + std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); + + struct rusage r_usage; + getrusage(RUSAGE_SELF,&r_usage); + std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl; + std::cout << "Time taken for deletion: " << timeMillis << "ms" << std::endl; } - //collection->remove("foo"); - - auto begin = std::chrono::high_resolution_clock::now(); - std::vector search_fields = {"title"}; - collection->search("the", search_fields, "", {}, {"points"}, 1, 100, MAX_SCORE, 0); - long long int timeMillis = - std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); - cout << "Time taken: " << timeMillis << "us" << endl; + delete collection; + delete store; return 0; } \ No newline at end of file diff --git a/src/main/server.cpp b/src/main/server.cpp deleted file mode 100644 index 2a94beff..00000000 --- a/src/main/server.cpp +++ /dev/null @@ -1,264 +0,0 @@ -#define H2O_USE_LIBUV 0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "string_utils.h" -#include "collection.h" -#include "collection_manager.h" 
-#include "option.h" -#include - -#include "h2o.h" -#include "h2o/http1.h" -#include "h2o/http2.h" -#include "h2o/memcached.h" - -static h2o_globalconf_t config; -static h2o_context_t ctx; -static h2o_accept_ctx_t accept_ctx; -std::vector search_fields = {field("title", field_types::STRING), field("points", field_types::INT32)}; -std::vector rank_fields = {"points"}; -Collection *collection; - -static h2o_pathconf_t *register_handler(h2o_hostconf_t *hostconf, const char *path, - int (*on_req)(h2o_handler_t *, h2o_req_t *)) { - h2o_pathconf_t *pathconf = h2o_config_register_path(hostconf, path, 0); - h2o_handler_t *handler = h2o_create_handler(pathconf, sizeof(*handler)); - handler->on_req = on_req; - return pathconf; -} - -std::map parse_query(const std::string& query) { - std::map query_map; - std::regex pattern("([\\w+%]+)=([^&]*)"); - - auto words_begin = std::sregex_iterator(query.begin(), query.end(), pattern); - auto words_end = std::sregex_iterator(); - - for (std::sregex_iterator i = words_begin; i != words_end; i++) { - std::string key = (*i)[1].str(); - std::string raw_value = (*i)[2].str(); - std::string value = StringUtils::url_decode(raw_value); - if(query_map.count(value) == 0) { - query_map[key] = value; - } else { - query_map[key] = query_map[key] + "&&" + value; - } - } - - return query_map; -} - -static int get_search(h2o_handler_t *self, h2o_req_t *req) { - static h2o_generator_t generator = {NULL, NULL}; - h2o_iovec_t query = req->query_at != SIZE_MAX ? 
- h2o_iovec_init(req->path.base + req->query_at, req->path.len - req->query_at) : - h2o_iovec_init(H2O_STRLIT("")); - - std::string query_str(query.base, query.len); - std::map query_map = parse_query(query_str); - const char *NUM_TYPOS = "num_typos"; - const char *PREFIX = "prefix"; - const char *TOKEN_ORDERING = "token_ordering"; - const char *FILTERS = "filters"; - - if(query_map.count(NUM_TYPOS) == 0) { - query_map[NUM_TYPOS] = "2"; - } - - if(query_map.count(PREFIX) == 0) { - query_map[PREFIX] = "false"; - } - - if(query_map.count(TOKEN_ORDERING) == 0) { - query_map[TOKEN_ORDERING] = "FREQUENCY"; - } - - std::string filter_str = query_map.count(FILTERS) != 0 ? query_map[FILTERS] : ""; - //std::cout << "filter_str: " << filter_str << std::endl; - - token_ordering token_order = (query_map[TOKEN_ORDERING] == "MAX_SCORE") ? MAX_SCORE : FREQUENCY; - - //printf("Query: %s\n", query_map["q"].c_str()); - auto begin = std::chrono::high_resolution_clock::now(); - - std::vector search_fields = {"title"}; - - nlohmann::json result = collection->search(query_map["q"], search_fields, filter_str, { }, - {"points"}, std::stoi(query_map[NUM_TYPOS]), 100, token_order, false); - std::string json_str = result.dump(); - //std::cout << "JSON:" << json_str << std::endl; - struct rusage r_usage; - getrusage(RUSAGE_SELF,&r_usage); - - //std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl; - - h2o_iovec_t body = h2o_strdup(&req->pool, json_str.c_str(), SIZE_MAX); - req->res.status = 200; - req->res.reason = "OK"; - h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8")); - h2o_start_response(req, &generator); - h2o_send(req, &body, 1, 1); - - long long int timeMillis = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - begin).count(); - std::cout << "Time taken: " << timeMillis << "us" << std::endl; - return 0; -} - -static int post_add_document(h2o_handler_t *self, h2o_req_t *req) { - 
std::string document(req->entity.base, req->entity.len); - Option inserted_id_op = collection->add(document); - - nlohmann::json json_response; - static h2o_generator_t generator = {NULL, NULL}; - - if(!inserted_id_op.ok()) { - req->res.status = 400; - req->res.reason = "BAD REQUEST"; - json_response["message"] = inserted_id_op.error(); - } else { - req->res.status = 201; - req->res.reason = "CREATED"; - json_response["id"] = inserted_id_op.get(); - } - - h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8")); - h2o_start_response(req, &generator); - - h2o_iovec_t body = h2o_strdup(&req->pool, json_response.dump().c_str(), SIZE_MAX); - h2o_send(req, &body, 1, 1); - return 0; -} - -static int delete_remove_document(h2o_handler_t *self, h2o_req_t *req) { - h2o_iovec_t query = req->query_at != SIZE_MAX ? - h2o_iovec_init(req->path.base + req->query_at, req->path.len - req->query_at) : - h2o_iovec_init(H2O_STRLIT("")); - - std::string query_str(query.base, query.len); - std::map query_map = parse_query(query_str); - - std::string doc_id = query_map["id"]; - - auto begin = std::chrono::high_resolution_clock::now(); - collection->remove(doc_id); - long long int time_micro = std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - begin).count(); - std::cout << "Time taken: " << time_micro << "us" << std::endl; - - nlohmann::json json_response; - json_response["id"] = doc_id; - json_response["status"] = "SUCCESS"; - - static h2o_generator_t generator = {NULL, NULL}; - req->res.status = 200; - req->res.reason = "OK"; - h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8")); - h2o_start_response(req, &generator); - h2o_iovec_t body = h2o_strdup(&req->pool, json_response.dump().c_str(), SIZE_MAX); - h2o_send(req, &body, 1, 1); - return 0; -} - -static void on_accept(h2o_socket_t *listener, const char *err) { - h2o_socket_t *sock; - - if 
(err != NULL) { - return; - } - - if ((sock = h2o_evloop_socket_accept(listener)) == NULL) - return; - h2o_accept(&accept_ctx, sock); -} - -static int create_listener(void) { - struct sockaddr_in addr; - int fd, reuseaddr_flag = 1; - h2o_socket_t *sock; - - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(0x7f000001); - addr.sin_port = htons(1088); - - if ((fd = socket(AF_INET, SOCK_STREAM, 0)) == -1 || - setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr_flag, sizeof(reuseaddr_flag)) != 0 || - bind(fd, (struct sockaddr *)&addr, sizeof(addr)) != 0 || listen(fd, SOMAXCONN) != 0) { - return -1; - } - - sock = h2o_evloop_socket_create(ctx.loop, fd, H2O_SOCKET_FLAG_DONT_READ); - h2o_socket_read_start(sock, on_accept); - - return 0; -} - -void index_documents(std::string path_to_docs) { - std::ifstream infile(path_to_docs); -// std::ifstream infile(path_to_docs); - - std::string json_line; - - while (std::getline(infile, json_line)) { - collection->add(json_line); - } - - infile.close(); - std::cout << "FINISHED INDEXING!" 
<< std::endl << std::flush; - struct rusage r_usage; - getrusage(RUSAGE_SELF,&r_usage); - - std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl; -} - -int main(int argc, char **argv) { - signal(SIGPIPE, SIG_IGN); - - Store *store = new Store("/tmp/typesense-data"); - - CollectionManager & collectionManager = CollectionManager::get_instance(); - collectionManager.init(store); - - collection = collectionManager.get_collection("collection"); - if(collection == nullptr) { - collection = collectionManager.create_collection("collection", search_fields, {}, rank_fields); - //index_documents(std::string(ROOT_DIR)+"test/documents.jsonl"); - if(argc > 1) { - index_documents(argv[1]); - } - } - - h2o_config_init(&config); - h2o_hostconf_t *hostconf = h2o_config_register_host(&config, h2o_iovec_init(H2O_STRLIT("default")), 65535); - register_handler(hostconf, "/add", post_add_document); - register_handler(hostconf, "/delete", delete_remove_document); - register_handler(hostconf, "/search", get_search); - - h2o_context_init(&ctx, h2o_evloop_create(), &config); - - accept_ctx.ctx = &ctx; - accept_ctx.hosts = config.hosts; - - if (create_listener() != 0) { - fprintf(stderr, "failed to listen to 127.0.0.1:1088:%s\n", strerror(errno)); - return 1; - } - - while (h2o_evloop_run(ctx.loop) == 0); - - return 0; -} \ No newline at end of file diff --git a/src/main/typesense_server.cpp b/src/main/typesense_server.cpp new file mode 100644 index 00000000..c6dba806 --- /dev/null +++ b/src/main/typesense_server.cpp @@ -0,0 +1,25 @@ +#include +#include "http_server.h" +#include "api.h" + +int main(int argc, char **argv) { + cmdline::parser options; + options.add("data-dir", 'd', "Directory where data will be stored.", true); + options.add("listen-address", 'a', "Address to which Typesense server binds.", false, "0.0.0.0"); + options.add("listen-port", 'p', "Port on which Typesense server listens.", false, 8080); + options.parse_check(argc, argv); + + Store 
store(options.get("data-dir")); + CollectionManager & collectionManager = CollectionManager::get_instance(); + collectionManager.init(&store); + + HttpServer server(options.get("listen-address"), options.get("listen-port")); + + server.post("/collection", post_create_collection); + server.post("/collection/:collection", post_add_document); + server.get("/collection/:collection/search", get_search); + server.del("/collection/:collection/:id", del_remove_document); + + server.run(); + return 0; +} \ No newline at end of file diff --git a/src/sorted_array.cpp b/src/sorted_array.cpp index b02ccd1c..8c8310fa 100644 --- a/src/sorted_array.cpp +++ b/src/sorted_array.cpp @@ -1,8 +1,8 @@ #include "sorted_array.h" -#include "intersection.h" +#include "array_utils.h" void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_length) { - min = sorted_array[0]; + min = array_length != 0 ? sorted_array[0] : 0; max = array_length > 1 ? sorted_array[array_length-1] : min; uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR); @@ -55,6 +55,10 @@ bool sorted_array::contains(uint32_t value) { } uint32_t sorted_array::indexOf(uint32_t value) { + if(length == 0) { + return length; + } + uint32_t actual; uint32_t index = for_lower_bound_search(in, length, value, &actual); if(actual == value) return index; @@ -173,7 +177,7 @@ size_t sorted_array::intersect(uint32_t* arr, const size_t arr_length, uint32_t* uint32_t* curr = uncompress(); uint32_t* results = new uint32_t[std::min(arr_length, (size_t) length)]; - size_t results_length = Intersection::scalar(arr, arr_length, curr, length, results); + size_t results_length = ArrayUtils::and_scalar(arr, arr_length, curr, length, results); delete[] curr; *results_out = results; diff --git a/test/collection_manager_test.cpp b/test/collection_manager_test.cpp index 6d301322..79510942 100644 --- a/test/collection_manager_test.cpp +++ b/test/collection_manager_test.cpp @@ -12,7 +12,9 
@@ protected: Collection *collection1; std::vector search_fields; std::vector facet_fields; - std::vector rank_fields; + std::vector sort_fields_index; + + std::vector sort_fields; void setupCollection() { std::string state_dir_path = "/tmp/typesense_test/coll_manager_test_db"; @@ -24,10 +26,11 @@ protected: search_fields = {field("title", field_types::STRING), field("starring", field_types::STRING)}; facet_fields = {field("starring", field_types::STRING)}; - rank_fields = {"points"}; + sort_fields = { sort_field("points", "DESC") }; + sort_fields_index = { field("points", "INT32") }; collection1 = collectionManager.create_collection("collection1", search_fields, facet_fields, - rank_fields, "points"); + sort_fields_index, "points"); } virtual void SetUp() { @@ -53,7 +56,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) { std::vector search_fields = {"starring", "title"}; std::vector facets; - nlohmann::json results = collection1->search("thomas", search_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false); + nlohmann::json results = collection1->search("thomas", search_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); spp::sparse_hash_map schema = collection1->get_schema(); @@ -70,11 +73,12 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) { ASSERT_EQ(0, collection1->get_collection_id()); ASSERT_EQ(18, collection1->get_next_seq_id()); ASSERT_EQ(facet_fields_expected, collection1->get_facet_fields()); - ASSERT_EQ(rank_fields, collection1->get_rank_fields()); + ASSERT_EQ(1, collection1->get_sort_fields().size()); + ASSERT_EQ(sort_fields[0].name, collection1->get_sort_fields()[0].name); ASSERT_EQ(schema.size(), collection1->get_schema().size()); - ASSERT_EQ("points", collection1->get_token_ordering_field()); + ASSERT_EQ("points", collection1->get_token_ranking_field()); - results = collection1->search("thomas", search_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false); + results = 
collection1->search("thomas", search_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); } diff --git a/test/collection_test.cpp b/test/collection_test.cpp index 659fd9c3..74e107d0 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "collection.h" @@ -11,8 +12,9 @@ protected: std::vector query_fields; Store *store; CollectionManager & collectionManager = CollectionManager::get_instance(); - std::vector rank_fields; std::vector facet_fields; + std::vector sort_fields_index; + std::vector sort_fields; void setupCollection() { std::string state_dir_path = "/tmp/typesense_test/collection"; @@ -27,12 +29,13 @@ protected: query_fields = {"title"}; facet_fields = { }; - rank_fields = {"points"}; + sort_fields = { sort_field("points", "DESC") }; + sort_fields_index = { field("points", "INT32") }; collection = collectionManager.get_collection("collection"); if(collection == nullptr) { collection = collectionManager.create_collection("collection", search_fields, facet_fields, - rank_fields, "points"); + sort_fields_index, "points"); } std::string json_line; @@ -60,7 +63,7 @@ protected: TEST_F(CollectionTest, ExactSearchShouldBeStable) { std::vector facets; - nlohmann::json results = collection->search("the", query_fields, "", facets, rank_fields, 0, 10); + nlohmann::json results = collection->search("the", query_fields, "", facets, sort_fields, 0, 10); ASSERT_EQ(7, results["hits"].size()); ASSERT_EQ(7, results["found"].get()); @@ -73,12 +76,29 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) { std::string result_id = result["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } + + // check ASC sorting + std::vector sort_fields_asc = { sort_field("points", "ASC") }; + + results = collection->search("the", query_fields, "", facets, sort_fields_asc, 0, 10); + ASSERT_EQ(7, results["hits"].size()); + ASSERT_EQ(7, results["found"].get()); + + ids 
= {"16", "13", "10", "8", "6", "foo", "1"}; + + for(size_t i = 0; i < results["hits"].size(); i++) { + nlohmann::json result = results["hits"].at(i); + std::string id = ids.at(i); + std::string result_id = result["id"]; + ASSERT_STREQ(id.c_str(), result_id.c_str()); + } } TEST_F(CollectionTest, ExactPhraseSearch) { std::vector facets; - nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, rank_fields, 0, 10); + nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, sort_fields, 0, 10); ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(5, results["found"].get()); /* Sort by (match, diff, score) @@ -98,9 +118,28 @@ TEST_F(CollectionTest, ExactPhraseSearch) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } + // Check ASC sort order + std::vector sort_fields_asc = { sort_field("points", "ASC") }; + results = collection->search("rocket launch", query_fields, "", facets, sort_fields_asc, 0, 10); + ASSERT_EQ(5, results["hits"].size()); + ASSERT_EQ(5, results["found"].get()); + + ids = {"8", "17", "1", "16", "13"}; + + for(size_t i = 0; i < results["hits"].size(); i++) { + nlohmann::json result = results["hits"].at(i); + std::string id = ids.at(i); + std::string result_id = result["id"]; + ASSERT_STREQ(id.c_str(), result_id.c_str()); + } + // Check pagination - results = collection->search("rocket launch", query_fields, "", facets, rank_fields, 0, 3); + results = collection->search("rocket launch", query_fields, "", facets, sort_fields, 0, 3); ASSERT_EQ(3, results["hits"].size()); + ASSERT_EQ(4, results["found"].get()); + + ids = {"8", "1", "17", "16", "13"}; + for(size_t i = 0; i < 3; i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); @@ -112,7 +151,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) { TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) { // Tokens that are not found in the index should be skipped std::vector facets; - nlohmann::json results = 
collection->search("DoesNotExist from", query_fields, "", facets, rank_fields, 0, 10); + nlohmann::json results = collection->search("DoesNotExist from", query_fields, "", facets, sort_fields, 0, 10); ASSERT_EQ(2, results["hits"].size()); std::vector ids = {"2", "17"}; @@ -125,7 +164,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) { } // with non-zero cost - results = collection->search("DoesNotExist from", query_fields, "", facets, rank_fields, 1, 10); + results = collection->search("DoesNotExist from", query_fields, "", facets, sort_fields, 1, 10); ASSERT_EQ(2, results["hits"].size()); for(size_t i = 0; i < results["hits"].size(); i++) { @@ -136,7 +175,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) { } // with 2 indexed words - results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, rank_fields, 1, 10); + results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, sort_fields, 1, 10); ASSERT_EQ(2, results["hits"].size()); ids = {"2", "17"}; @@ -148,17 +187,17 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) { } results.clear(); - results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, rank_fields, 0, 10); + results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, sort_fields, 0, 10); ASSERT_EQ(0, results["hits"].size()); results.clear(); - results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, rank_fields, 2, 10); + results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, sort_fields, 2, 10); ASSERT_EQ(0, results["hits"].size()); } TEST_F(CollectionTest, PartialPhraseSearch) { std::vector facets; - nlohmann::json results = collection->search("rocket research", query_fields, "", facets, rank_fields, 0, 10); + nlohmann::json results = collection->search("rocket research", query_fields, "", facets, sort_fields, 0, 10); ASSERT_EQ(4, 
results["hits"].size()); std::vector ids = {"1", "8", "16", "17"}; @@ -173,7 +212,7 @@ TEST_F(CollectionTest, PartialPhraseSearch) { TEST_F(CollectionTest, QueryWithTypo) { std::vector facets; - nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, rank_fields, 2, 3); + nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, sort_fields, 2, 3); ASSERT_EQ(3, results["hits"].size()); std::vector ids = {"19", "20", "21"}; @@ -186,7 +225,7 @@ TEST_F(CollectionTest, QueryWithTypo) { } results.clear(); - results = collection->search("fer thx", query_fields, "", facets, rank_fields, 1, 3); + results = collection->search("fer thx", query_fields, "", facets, sort_fields, 1, 3); ids = {"1", "10", "13"}; ASSERT_EQ(3, results["hits"].size()); @@ -201,7 +240,7 @@ TEST_F(CollectionTest, QueryWithTypo) { TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { std::vector facets; - nlohmann::json results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 2, MAX_SCORE, false); + nlohmann::json results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 2, MAX_SCORE, false); ASSERT_EQ(2, results["hits"].size()); std::vector ids = {"22", "23"}; @@ -212,7 +251,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 3, FREQUENCY, false); + results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 3, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"3", "12", "24"}; @@ -224,19 +263,19 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { } // Check pagination - results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 1, FREQUENCY, false); + results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 1, FREQUENCY, false); ASSERT_EQ(3, results["found"].get()); 
ASSERT_EQ(1, results["hits"].size()); std::string solo_id = results["hits"].at(0)["id"]; ASSERT_STREQ("3", solo_id.c_str()); - results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 2, FREQUENCY, false); + results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 2, FREQUENCY, false); ASSERT_EQ(3, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); // Check total ordering - results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 10, FREQUENCY, false); + results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 10, FREQUENCY, false); ASSERT_EQ(5, results["hits"].size()); ids = {"3", "12", "24", "22", "23"}; @@ -247,7 +286,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 10, MAX_SCORE, false); + results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 10, MAX_SCORE, false); ASSERT_EQ(5, results["hits"].size()); ids = {"22", "23", "3", "12", "24"}; @@ -262,8 +301,9 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { TEST_F(CollectionTest, TextContainingAnActualTypo) { // A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens std::vector facets; - nlohmann::json results = collection->search("ISX what", query_fields, "", facets, rank_fields, 1, 4, FREQUENCY, false); + nlohmann::json results = collection->search("ISX what", query_fields, "", facets, sort_fields, 1, 4, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); + ASSERT_EQ(4, results["found"].get()); std::vector ids = {"19", "6", "21", "8"}; @@ -275,8 +315,9 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) { } // Record containing exact token match should appear first - results = collection->search("ISX", query_fields, "", facets, rank_fields, 1, 10, FREQUENCY, false); + results = 
collection->search("ISX", query_fields, "", facets, sort_fields, 1, 10, FREQUENCY, false); ASSERT_EQ(8, results["hits"].size()); + ASSERT_EQ(8, results["found"].get()); ids = {"20", "19", "6", "3", "21", "4", "10", "8"}; @@ -290,7 +331,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) { TEST_F(CollectionTest, PrefixSearching) { std::vector facets; - nlohmann::json results = collection->search("ex", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, true); + nlohmann::json results = collection->search("ex", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, true); ASSERT_EQ(2, results["hits"].size()); std::vector ids = {"12", "6"}; @@ -301,7 +342,7 @@ TEST_F(CollectionTest, PrefixSearching) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - results = collection->search("ex", query_fields, "", facets, rank_fields, 0, 10, MAX_SCORE, true); + results = collection->search("ex", query_fields, "", facets, sort_fields, 0, 10, MAX_SCORE, true); ASSERT_EQ(2, results["hits"].size()); ids = {"6", "12"}; @@ -311,6 +352,19 @@ TEST_F(CollectionTest, PrefixSearching) { std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } + + std::cout << "WHAT EX..." 
<< std::endl; + + results = collection->search("what ex", query_fields, "", facets, sort_fields, 0, 10, MAX_SCORE, true); + ASSERT_EQ(9, results["hits"].size()); + ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"}; + + for(size_t i = 0; i < results["hits"].size(); i++) { + nlohmann::json result = results["hits"].at(i); + std::string result_id = result["id"]; + std::string id = ids.at(i); + ASSERT_STREQ(id.c_str(), result_id.c_str()); + } } TEST_F(CollectionTest, MultipleFields) { @@ -319,11 +373,10 @@ TEST_F(CollectionTest, MultipleFields) { std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl"); std::vector fields = {field("title", field_types::STRING), field("starring", field_types::STRING), field("cast", field_types::STRING_ARRAY)}; - std::vector rank_fields = {"points"}; coll_mul_fields = collectionManager.get_collection("coll_mul_fields"); if(coll_mul_fields == nullptr) { - coll_mul_fields = collectionManager.create_collection("coll_mul_fields", fields, facet_fields, rank_fields); + coll_mul_fields = collectionManager.create_collection("coll_mul_fields", fields, facet_fields, sort_fields_index); } std::string json_line; @@ -336,7 +389,7 @@ TEST_F(CollectionTest, MultipleFields) { query_fields = {"title", "starring"}; std::vector facets; - nlohmann::json results = coll_mul_fields->search("Will", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false); + nlohmann::json results = coll_mul_fields->search("Will", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); std::vector ids = {"3", "2", "1", "0"}; @@ -351,7 +404,7 @@ TEST_F(CollectionTest, MultipleFields) { // when "starring" takes higher priority than "title" query_fields = {"starring", "title"}; - results = coll_mul_fields->search("thomas", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_mul_fields->search("thomas", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false); 
ASSERT_EQ(4, results["hits"].size()); ids = {"15", "14", "12", "13"}; @@ -364,11 +417,11 @@ TEST_F(CollectionTest, MultipleFields) { } query_fields = {"starring", "title", "cast"}; - results = coll_mul_fields->search("ben affleck", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_mul_fields->search("ben affleck", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(1, results["hits"].size()); query_fields = {"cast"}; - results = coll_mul_fields->search("chris", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_mul_fields->search("chris", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"6", "1", "7"}; @@ -380,7 +433,7 @@ TEST_F(CollectionTest, MultipleFields) { } query_fields = {"cast"}; - results = coll_mul_fields->search("chris pine", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_mul_fields->search("chris pine", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"7", "6", "1"}; @@ -399,11 +452,12 @@ TEST_F(CollectionTest, FilterOnNumericFields) { std::vector fields = {field("name", field_types::STRING), field("age", field_types::INT32), field("years", field_types::INT32_ARRAY), field("timestamps", field_types::INT64_ARRAY)}; - std::vector rank_fields = {"age"}; + std::vector sort_fields = { sort_field("age", "DESC") }; + std::vector sort_fields_index = { field("age", "INT32") }; coll_array_fields = collectionManager.get_collection("coll_array_fields"); if(coll_array_fields == nullptr) { - coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, rank_fields); + coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, sort_fields_index); } std::string json_line; @@ -417,7 +471,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { // Plain search 
with no filters - results should be sorted by rank fields query_fields = {"name"}; std::vector facets; - nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false); + nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(5, results["hits"].size()); std::vector ids = {"3", "1", "4", "0", "2"}; @@ -430,7 +484,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // Searching on an int32 field - results = coll_array_fields->search("Jeremy", query_fields, "age:>24", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "age:>24", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"3", "1", "4"}; @@ -442,14 +496,14 @@ TEST_F(CollectionTest, FilterOnNumericFields) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - results = coll_array_fields->search("Jeremy", query_fields, "age:>=24", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "age:>=24", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); - results = coll_array_fields->search("Jeremy", query_fields, "age:24", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "age:24", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(1, results["hits"].size()); // Searching a number against an int32 array field - results = coll_array_fields->search("Jeremy", query_fields, "years:>2002", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "years:>2002", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"1", "0", "2"}; @@ -460,7 +514,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { ASSERT_STREQ(id.c_str(), result_id.c_str()); } - 
results = coll_array_fields->search("Jeremy", query_fields, "years:<1989", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "years:<1989", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(1, results["hits"].size()); ids = {"3"}; @@ -472,7 +526,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // multiple filters - results = coll_array_fields->search("Jeremy", query_fields, "years:<2005 && years:>1987", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "years:<2005 && years:>1987", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(1, results["hits"].size()); ids = {"4"}; @@ -484,7 +538,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // multiple search values (works like SQL's IN operator) against a single int field - results = coll_array_fields->search("Jeremy", query_fields, "age:[21, 24, 63]", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "age:[21, 24, 63]", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ids = {"3", "0", "2"}; @@ -496,7 +550,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // multiple search values against an int32 array field - also use extra padding between symbols - results = coll_array_fields->search("Jeremy", query_fields, "years : [ 2015, 1985 , 1999]", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "years : [ 2015, 1985 , 1999]", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); ids = {"3", "1", "4", "0"}; @@ -508,7 +562,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // searching on an int64 array field - also ensure that padded space causes no issues - results = coll_array_fields->search("Jeremy", query_fields, "timestamps : > 475205222", facets, rank_fields, 0, 10, FREQUENCY, false); + 
results = coll_array_fields->search("Jeremy", query_fields, "timestamps : > 475205222", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); ids = {"1", "4", "0", "2"}; @@ -521,7 +575,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) { } // when filters don't match any record, no results should be returned - results = coll_array_fields->search("Jeremy", query_fields, "timestamps:<1", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "timestamps:<1", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); collectionManager.drop_collection("coll_array_fields"); @@ -534,11 +588,13 @@ TEST_F(CollectionTest, FilterOnTextFields) { std::vector fields = {field("name", field_types::STRING), field("age", field_types::INT32), field("years", field_types::INT32_ARRAY), field("tags", field_types::STRING_ARRAY)}; - std::vector rank_fields = {"age"}; + + std::vector sort_fields_index = { field("age", "INT32") }; + std::vector sort_fields = { sort_field("age", "DESC") }; coll_array_fields = collectionManager.get_collection("coll_array_fields"); if(coll_array_fields == nullptr) { - coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, rank_fields); + coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, sort_fields_index); } std::string json_line; @@ -551,7 +607,7 @@ TEST_F(CollectionTest, FilterOnTextFields) { query_fields = {"name"}; std::vector facets; - nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "tags: gold", facets, rank_fields, 0, 10, FREQUENCY, false); + nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "tags: gold", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); std::vector ids = {"1", "4", "0", "2"}; @@ -563,7 +619,7 @@ TEST_F(CollectionTest, FilterOnTextFields) { 
ASSERT_STREQ(id.c_str(), result_id.c_str()); } - results = coll_array_fields->search("Jeremy", query_fields, "tags : bronze", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "tags : bronze", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(2, results["hits"].size()); ids = {"4", "2"}; @@ -576,7 +632,7 @@ TEST_F(CollectionTest, FilterOnTextFields) { } // search with a list of tags, also testing extra padding of space - results = coll_array_fields->search("Jeremy", query_fields, "tags: [bronze, silver]", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "tags: [bronze, silver]", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(4, results["hits"].size()); ids = {"3", "4", "0", "2"}; @@ -589,7 +645,7 @@ TEST_F(CollectionTest, FilterOnTextFields) { } // should be exact matches (no normalization or fuzzy searching should happen) - results = coll_array_fields->search("Jeremy", query_fields, "tags: BRONZE", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "tags: BRONZE", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); collectionManager.drop_collection("coll_array_fields"); @@ -604,11 +660,13 @@ TEST_F(CollectionTest, HandleBadlyFormedFilterQuery) { field("years", field_types::INT32_ARRAY), field("timestamps", field_types::INT64_ARRAY), field("tags", field_types::STRING_ARRAY)}; - std::vector rank_fields = {"age"}; + + std::vector sort_fields_index = { field("age", "INT32") }; + std::vector sort_fields = { sort_field("age", "DESC") }; coll_array_fields = collectionManager.get_collection("coll_array_fields"); if(coll_array_fields == nullptr) { - coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, rank_fields); + coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, 
facet_fields, sort_fields_index); } std::string json_line; @@ -623,27 +681,27 @@ TEST_F(CollectionTest, HandleBadlyFormedFilterQuery) { std::vector facets; // when filter field does not exist in the schema - nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "tagzz: gold", facets, rank_fields, 0, 10, FREQUENCY, false); + nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "tagzz: gold", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); // searching using a string for a numeric field - results = coll_array_fields->search("Jeremy", query_fields, "age: abcdef", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "age: abcdef", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); // searching using a string for a numeric array field - results = coll_array_fields->search("Jeremy", query_fields, "timestamps: abcdef", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "timestamps: abcdef", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); // malformed k:v syntax - results = coll_array_fields->search("Jeremy", query_fields, "timestamps abcdef", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "timestamps abcdef", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); // just empty spaces - results = coll_array_fields->search("Jeremy", query_fields, " ", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, " ", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); // wrapping number with quotes - results = coll_array_fields->search("Jeremy", query_fields, "age: '21'", facets, rank_fields, 0, 10, FREQUENCY, false); + results = 
coll_array_fields->search("Jeremy", query_fields, "age: '21'", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); collectionManager.drop_collection("coll_array_fields"); @@ -658,11 +716,13 @@ TEST_F(CollectionTest, FacetCounts) { field("timestamps", field_types::INT64_ARRAY), field("tags", field_types::STRING_ARRAY)}; facet_fields = {field("tags", field_types::STRING_ARRAY), field("name", field_types::STRING)}; - std::vector rank_fields = {"age"}; + + std::vector sort_fields_index = { field("age", "DESC") }; + std::vector sort_fields = { sort_field("age", "DESC") }; coll_array_fields = collectionManager.get_collection("coll_array_fields"); if(coll_array_fields == nullptr) { - coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, rank_fields); + coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, sort_fields_index); } std::string json_line; @@ -677,27 +737,27 @@ TEST_F(CollectionTest, FacetCounts) { std::vector facets = {"tags"}; // single facet with no filters - nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false); + nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(1, results["facet_counts"].size()); ASSERT_EQ(2, results["facet_counts"][0].size()); ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]); - ASSERT_EQ("gold", results["facet_counts"][0]["counts"][1]["value"]); - ASSERT_EQ(4, (int) results["facet_counts"][0]["counts"][1]["count"]); + ASSERT_EQ("gold", results["facet_counts"][0]["counts"][0]["value"]); + ASSERT_EQ(4, (int) results["facet_counts"][0]["counts"][0]["count"]); - ASSERT_EQ("silver", results["facet_counts"][0]["counts"][2]["value"]); - ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][2]["count"]); + 
ASSERT_EQ("silver", results["facet_counts"][0]["counts"][1]["value"]); + ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]); - ASSERT_EQ("bronze", results["facet_counts"][0]["counts"][0]["value"]); - ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ("bronze", results["facet_counts"][0]["counts"][2]["value"]); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][2]["count"]); // 2 facets, 1 text filter with no filters facets.clear(); facets.push_back("tags"); facets.push_back("name"); - results = coll_array_fields->search("Jeremy", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(2, results["facet_counts"].size()); @@ -712,19 +772,19 @@ TEST_F(CollectionTest, FacetCounts) { // facet with filters facets.clear(); facets.push_back("tags"); - results = coll_array_fields->search("Jeremy", query_fields, "age: >24", facets, rank_fields, 0, 10, FREQUENCY, false); + results = coll_array_fields->search("Jeremy", query_fields, "age: >24", facets, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(1, results["facet_counts"].size()); ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]); - ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]); + ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]); ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][1]["count"]); - ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][2]["count"]); + ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][2]["count"]); - ASSERT_EQ("bronze", results["facet_counts"][0]["counts"][0]["value"]); - ASSERT_EQ("gold", results["facet_counts"][0]["counts"][1]["value"]); - ASSERT_EQ("silver", results["facet_counts"][0]["counts"][2]["value"]); + ASSERT_EQ("gold", 
results["facet_counts"][0]["counts"][0]["value"]); + ASSERT_EQ("silver", results["facet_counts"][0]["counts"][1]["value"]); + ASSERT_EQ("bronze", results["facet_counts"][0]["counts"][2]["value"]); collectionManager.drop_collection("coll_array_fields"); } @@ -739,11 +799,12 @@ TEST_F(CollectionTest, SearchingWithMissingFields) { field("timestamps", field_types::INT64_ARRAY), field("tags", field_types::STRING_ARRAY)}; facet_fields = {field("tags", field_types::STRING_ARRAY), field("name", field_types::STRING)}; - std::vector rank_fields = {"age"}; + std::vector sort_fields_index = { field("age", "DESC") }; + std::vector sort_fields = { sort_field("age", "DESC") }; coll_array_fields = collectionManager.get_collection("coll_array_fields"); if(coll_array_fields == nullptr) { - coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, rank_fields); + coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, sort_fields_index); } std::string json_line; @@ -758,28 +819,28 @@ TEST_F(CollectionTest, SearchingWithMissingFields) { std::vector facets; std::vector query_fields_not_found = {"titlez"}; - nlohmann::json res = coll_array_fields->search("the", query_fields_not_found, "", facets, rank_fields, 0, 10); + nlohmann::json res = coll_array_fields->search("the", query_fields_not_found, "", facets, sort_fields, 0, 10); ASSERT_EQ(0, res["hits"].size()); ASSERT_STREQ("Could not find a search field named `titlez` in the schema.",res["error"].get().c_str()); // when a query field is an integer field - res = coll_array_fields->search("the", {"age"}, "", facets, rank_fields, 0, 10); + res = coll_array_fields->search("the", {"age"}, "", facets, sort_fields, 0, 10); ASSERT_EQ(0, res["hits"].size()); ASSERT_STREQ("Search field `age` should be a string or a string array.", res["error"].get().c_str()); // when a facet field is not defined in the schema - res = coll_array_fields->search("the", {"name"}, "", 
{"timestamps"}, rank_fields, 0, 10); + res = coll_array_fields->search("the", {"name"}, "", {"timestamps"}, sort_fields, 0, 10); ASSERT_EQ(0, res["hits"].size()); ASSERT_STREQ("Could not find a facet field named `timestamps` in the schema.", res["error"].get().c_str()); // when a rank field is not defined in the schema - res = coll_array_fields->search("the", {"name"}, "", {}, {"timestamps"}, 0, 10); + res = coll_array_fields->search("the", {"name"}, "", {}, { sort_field("timestamps", "ASC") }, 0, 10); ASSERT_EQ(0, res["hits"].size()); - ASSERT_STREQ("Could not find a rank field named `timestamps` in the schema.", res["error"].get().c_str()); + ASSERT_STREQ("Could not find a sort field named `timestamps` in the schema.", res["error"].get().c_str()); - res = coll_array_fields->search("the", {"name"}, "", {}, {"_rank"}, 0, 10); + res = coll_array_fields->search("the", {"name"}, "", {}, { sort_field("_rank", "ASC") }, 0, 10); ASSERT_EQ(0, res["hits"].size()); - ASSERT_STREQ("Could not find a rank field named `_rank` in the schema.", res["error"].get().c_str()); + ASSERT_STREQ("Could not find a sort field named `_rank` in the schema.", res["error"].get().c_str()); collectionManager.drop_collection("coll_array_fields"); } @@ -790,12 +851,14 @@ TEST_F(CollectionTest, IndexingWithBadData) { std::vector fields = {field("name", field_types::STRING)}; facet_fields = {field("tags", field_types::STRING_ARRAY)}; - std::vector rank_fields = {"age", "average"}; + + std::vector sort_fields_index = { field("age", "INT32"), field("average", "INT32") }; + std::vector sort_fields = { sort_field("age", "DESC"), sort_field("average", "DESC") }; sample_collection = collectionManager.get_collection("sample_collection"); if(sample_collection == nullptr) { sample_collection = collectionManager.create_collection("sample_collection", fields, facet_fields, - rank_fields, "age"); + sort_fields_index, "age"); } const Option & search_fields_missing_op1 = sample_collection->add("{\"namezz\": 
\"foo\", \"age\": 29}"); @@ -814,10 +877,10 @@ TEST_F(CollectionTest, IndexingWithBadData) { facet_fields_missing_op1.error().c_str()); const char *doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [\"red\", \"blue\"]}"; - const Option & rank_fields_missing_op1 = sample_collection->add(doc_str); - ASSERT_FALSE(rank_fields_missing_op1.ok()); - ASSERT_STREQ("Field `average` has been declared as a rank field in the schema, but is not found in the document.", - rank_fields_missing_op1.error().c_str()); + const Option & sort_fields_missing_op1 = sample_collection->add(doc_str); + ASSERT_FALSE(sort_fields_missing_op1.ok()); + ASSERT_STREQ("Field `average` has been declared as a sort field in the schema, but is not found in the document.", + sort_fields_missing_op1.error().c_str()); // Handle type errors @@ -832,19 +895,25 @@ TEST_F(CollectionTest, IndexingWithBadData) { ASSERT_TRUE(empty_facet_field_op.ok()); doc_str = "{\"name\": \"foo\", \"age\": \"34\", \"tags\": [], \"average\": 34 }"; - const Option & bad_token_ordering_field_op1 = sample_collection->add(doc_str); - ASSERT_FALSE(bad_token_ordering_field_op1.ok()); - ASSERT_STREQ("Token ordering field `age` must be an INT32.", bad_token_ordering_field_op1.error().c_str()); + const Option & bad_token_ranking_field_op1 = sample_collection->add(doc_str); + ASSERT_FALSE(bad_token_ranking_field_op1.ok()); + ASSERT_STREQ("Token ranking field `age` must be an INT32.", bad_token_ranking_field_op1.error().c_str()); doc_str = "{\"name\": \"foo\", \"age\": 343234324234233234, \"tags\": [], \"average\": 34 }"; - const Option & bad_token_ordering_field_op2 = sample_collection->add(doc_str); - ASSERT_FALSE(bad_token_ordering_field_op2.ok()); - ASSERT_STREQ("Token ordering field `age` exceeds maximum value of INT32.", bad_token_ordering_field_op2.error().c_str()); + const Option & bad_token_ranking_field_op2 = sample_collection->add(doc_str); + ASSERT_FALSE(bad_token_ranking_field_op2.ok()); + ASSERT_STREQ("Token ranking field 
`age` exceeds maximum value of INT32.", bad_token_ranking_field_op2.error().c_str()); + + doc_str = "{\"name\": \"foo\", \"tags\": [], \"average\": 34 }"; + const Option & bad_token_ranking_field_op3 = sample_collection->add(doc_str); + ASSERT_FALSE(bad_token_ranking_field_op3.ok()); + ASSERT_STREQ("Field `age` has been declared as a token ranking field, but is not found in the document.", + bad_token_ranking_field_op3.error().c_str()); doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [], \"average\": \"34\"}"; const Option & bad_rank_field_op = sample_collection->add(doc_str); ASSERT_FALSE(bad_rank_field_op.ok()); - ASSERT_STREQ("Rank field `average` must be an integer.", bad_rank_field_op.error().c_str()); + ASSERT_STREQ("Sort field `average` must be a number.", bad_rank_field_op.error().c_str()); collectionManager.drop_collection("sample_collection"); } @@ -854,13 +923,15 @@ TEST_F(CollectionTest, EmptyIndexShouldNotCrash) { std::vector fields = {field("name", field_types::STRING)}; facet_fields = {field("tags", field_types::STRING_ARRAY)}; - std::vector rank_fields = {"age", "average"}; + + std::vector sort_fields_index = { field("age", "INT32"), field("average", "INT32") }; + std::vector sort_fields = { sort_field("age", "DESC"), sort_field("average", "DESC") }; empty_coll = collectionManager.get_collection("empty_coll"); if(empty_coll == nullptr) { - empty_coll = collectionManager.create_collection("empty_coll", fields, facet_fields, rank_fields, "age"); + empty_coll = collectionManager.create_collection("empty_coll", fields, facet_fields, sort_fields_index, "age"); } - nlohmann::json results = empty_coll->search("a", {"name"}, "", {}, rank_fields, 0, 10, FREQUENCY, false); + nlohmann::json results = empty_coll->search("a", {"name"}, "", {}, sort_fields, 0, 10, FREQUENCY, false); ASSERT_EQ(0, results["hits"].size()); } \ No newline at end of file