Merge branch 'typesense_server_api'

This commit is contained in:
Kishore Nallan 2017-05-29 21:00:36 +05:30
commit 12f425c6e0
30 changed files with 2173 additions and 727 deletions

View File

@ -65,7 +65,7 @@ link_directories(${DEP_ROOT_DIR}/${FOR_NAME})
link_directories(${DEP_ROOT_DIR}/${H2O_NAME}/build)
link_directories(${DEP_ROOT_DIR}/${ROCKSDB_NAME})
add_executable(typesense-server ${SRC_FILES} src/main/server.cpp)
add_executable(typesense-server ${SRC_FILES} src/main/typesense_server.cpp)
add_executable(search ${SRC_FILES} src/main/main.cpp)
add_executable(benchmark ${SRC_FILES} src/main/benchmark.cpp)
add_executable(typesense_test ${SRC_FILES} test/array_test.cpp test/sorted_array_test.cpp test/art_test.cpp
@ -83,7 +83,7 @@ if(NOT APPLE)
list(APPEND ROCKSDB_LIBS rt)
endif()
target_link_libraries(typesense-server for pthread h2o-evloop ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} dl ${STD_LIB})
target_link_libraries(typesense-server h2o-evloop for pthread ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} dl ${STD_LIB})
target_link_libraries(search for pthread h2o-evloop ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} dl ${STD_LIB})
target_link_libraries(benchmark for pthread ${ROCKSDB_LIBS} ${STD_LIB})
target_link_libraries(typesense_test pthread for ${ROCKSDB_LIBS} gtest gtest_main ${STD_LIB})
target_link_libraries(benchmark for pthread h2o-evloop ${ROCKSDB_LIBS} ${OPENSSL_LIBRARIES} dl ${STD_LIB})
target_link_libraries(typesense_test h2o-evloop ${OPENSSL_LIBRARIES} pthread for ${ROCKSDB_LIBS} gtest gtest_main dl ${STD_LIB})

View File

@ -3,28 +3,24 @@
Typesense is an open source search engine for building a delightful search experience.
- **Typo tolerance:** Handles typographical errors out-of-the-box
- **Tunable ranking + relevancy:** Tailor your search results to perfection
- **Tunable ranking:** Tailor your search results to perfection
- **Blazing fast:** Meticulously designed and optimized for speed
- **Simple and delightful:** Simple API, delightful out-of-the-box experience
## Development
### Build from source
### Building from source
Please ensure that you have docker installed on your system.
Building on your machine:
```
$ ./build.sh [--clean]
.
.
.
$ ./dockcross build/typesense_test
.
.
.
$ ./dockcross build/typesense-server
```
We use [dockcross](https://github.com/dockcross/dockcross) to build our development environment consistently.
Building on a Docker container:
```
$ ./docker-build.sh
```
© 2016-2017 Wreally Studios Inc.

18
TODO.md
View File

@ -30,6 +30,24 @@
- ~~Schema validation during insertion (missing fields + type errors)~~
- ~~Proper score field for ranking tokens~~
- ~~Throw errors when schema is broken~~
- ~~Desc/Asc ordering with tests~~
- ~~Found count is wrong~~
- ~~Filter query in the API~~
- ~~Facet limit (hardcode to top 10)~~
- ~~Deprecate old split function~~
- When prefix=true, use token_ranking_field for token ordering
- Search snippet
- ID should not have "/"
- Group results by field
- Use rocksdb batch put for atomic insertion
- Test for sorted_array::indexOf when length is 0
- Handle `Store::get()` not finding a key
- Fix API response codes
- Test for search without any sort_by given
- Test for asc/desc upper/lower casing
- Test for collection creation validation
- Test for delete document
- Proper pagination
- Prevent string copy during indexing
- Clean special characters before indexing
- Minimum results should be a variable instead of blindly going with max_results

View File

@ -20,6 +20,8 @@ file(COPY ${CMAKE_SOURCE_DIR}/cmake/patches/build_detect_platform DESTINATION
if(NOT EXISTS ${DEP_ROOT_DIR}/${ROCKSDB_NAME}/librocksdb.a)
message("Building ${ROCKSDB_NAME} locally...")
set(ENV{PORTABLE} 1)
execute_process(COMMAND make "clean" WORKING_DIRECTORY ${DEP_ROOT_DIR}/${ROCKSDB_NAME}/)
execute_process(COMMAND make "static_lib" WORKING_DIRECTORY ${DEP_ROOT_DIR}/${ROCKSDB_NAME}/
RESULT_VARIABLE ROCKSDB_BUILD)
if(NOT ROCKSDB_BUILD EQUAL 0)

11
include/api.h Normal file
View File

@ -0,0 +1,11 @@
#pragma once
#include "http_server.h"
// Request handlers. Each receives the incoming request and writes its outcome
// into the response object passed by reference.
// NOTE(review): the post_/get_/del_ prefixes presumably name the HTTP verb each
// handler is registered under — confirm against the route registration code.
void post_create_collection(http_req & req, http_res & res);
void get_search(http_req & req, http_res & res);
void post_add_document(http_req & req, http_res & res);
void del_remove_document(http_req & req, http_res & res);

View File

@ -14,7 +14,7 @@ private:
uint32_t m = std::min(min, value);
uint32_t M = std::max(max, value);
uint32_t bnew = required_bits(M - m);
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
}
public:

View File

@ -24,20 +24,6 @@ protected:
return (uint32_t) (v == 0 ? 0 : 32 - __builtin_clz(v));
}
uint32_t inline sorted_append_size_required(uint32_t value, uint32_t new_length) {
uint32_t m = std::min(min, value);
uint32_t M = std::max(max, value);
uint32_t bnew = required_bits(M - m);
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
}
uint32_t inline unsorted_append_size_required(uint32_t value, uint32_t new_length) {
uint32_t m = std::min(min, value);
uint32_t M = std::max(max, value);
uint32_t bnew = required_bits(M - m);
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
}
public:
array_base(const uint32_t n=2) {
size_bytes = METADATA_OVERHEAD + (n * FOR_ELE_SIZE);

View File

@ -2,12 +2,15 @@
#include <cstddef>
#include <stdint.h>
#include <array>
/* Different intersection routines adapted from:
* https://github.com/lemire/SIMDCompressionAndIntersection/blob/master/src/intersection.cpp
*/
class Intersection {
class ArrayUtils {
public:
// Fast scalar scheme designed by N. Kurz. Returns the size of out (intersected set)
static size_t scalar(const uint32_t *A, const size_t lenA, const uint32_t *B, const size_t lenB, uint32_t *out);
static size_t and_scalar(const uint32_t *A, const size_t lenA, const uint32_t *B, const size_t lenB, uint32_t *out);
static size_t or_scalar(const uint32_t *A, const size_t lenA, const uint32_t *B, const size_t lenB, uint32_t **out);
};

View File

@ -110,7 +110,7 @@ typedef struct {
*/
typedef struct {
art_values* values;
uint16_t max_score;
uint32_t max_score;
uint32_t key_len;
unsigned char key[];
} art_leaf;

809
include/cmdline.h Normal file
View File

@ -0,0 +1,809 @@
/*
Copyright (c) 2009, Hideyuki Tanaka
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the <organization> nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY <copyright holder> ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL <copyright holder> BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <iostream>
#include <sstream>
#include <vector>
#include <map>
#include <string>
#include <stdexcept>
#include <typeinfo>
#include <cstring>
#include <algorithm>
#include <cxxabi.h>
#include <cstdlib>
namespace cmdline{
namespace detail{
// Generic case: write the source into a stringstream and read the target back.
// The conversion is rejected (std::bad_cast) unless the whole text was consumed.
template <typename Target, typename Source, bool Same>
class lexical_cast_t {
public:
    static Target cast(const Source &arg) {
        std::stringstream buffer;
        Target converted;
        const bool ok = (buffer << arg) && (buffer >> converted) && buffer.eof();
        if (!ok) {
            throw std::bad_cast();
        }
        return converted;
    }
};

// Same == true: no conversion work, the argument is returned as-is.
template <typename Target, typename Source>
class lexical_cast_t<Target, Source, true> {
public:
    static Target cast(const Source &arg) {
        return arg;
    }
};

// Anything -> std::string: format through an output stream.
template <typename Source>
class lexical_cast_t<std::string, Source, false> {
public:
    static std::string cast(const Source &arg) {
        std::ostringstream out;
        out << arg;
        return out.str();
    }
};

// std::string -> anything: parse from an input stream; trailing characters
// that were not consumed make the cast fail with std::bad_cast.
template <typename Target>
class lexical_cast_t<Target, std::string, false> {
public:
    static Target cast(const std::string &arg) {
        std::istringstream in(arg);
        Target parsed;
        if (!(in >> parsed) || !in.eof()) {
            throw std::bad_cast();
        }
        return parsed;
    }
};
// Minimal type-equality trait: `value` is false for two arbitrary types...
template <typename T1, typename T2>
struct is_same {
static const bool value = false;
};
// ...and true when the partial specialization for identical types is selected.
template <typename T>
struct is_same<T, T>{
static const bool value = true;
};
// Converts `arg` to Target by dispatching to the lexical_cast_t specialization
// selected by the (Target, Source) pair; the third template argument is true
// exactly when is_same reports the two types as identical.
template<typename Target, typename Source>
Target lexical_cast(const Source &arg)
{
return lexical_cast_t<Target, Source, detail::is_same<Target, Source>::value>::cast(arg);
}
// Returns the human-readable form of a mangled C++ type name.
//
// abi::__cxa_demangle returns NULL (and a non-zero status) when the input is
// not a valid mangled name or allocation fails. Constructing a std::string
// from NULL is undefined behaviour, so in that case the input is returned
// unchanged instead.
static inline std::string demangle(const std::string &name)
{
    int status = 0;
    char *p = abi::__cxa_demangle(name.c_str(), 0, 0, &status);
    if (status != 0 || p == 0) {
        free(p); // free(NULL) is a no-op; covers an unexpected non-NULL result
        return name;
    }
    std::string ret(p);
    free(p); // the buffer returned by __cxa_demangle is malloc()-allocated
    return ret;
}
template <class T>
std::string readable_typename()
{
return demangle(typeid(T).name());
}
template <class T>
std::string default_value(T def)
{
return detail::lexical_cast<std::string>(def);
}
template <>
inline std::string readable_typename<std::string>()
{
return "string";
}
} // detail
//-----
// Exception type raised by the command line parser; carries a plain message
// that is handed back verbatim through what().
class cmdline_error : public std::exception {
public:
    cmdline_error(const std::string &msg)
        : text_(msg) {
    }

    ~cmdline_error() throw() {
    }

    const char *what() const throw() {
        return text_.c_str();
    }

private:
    std::string text_;
};
template <class T>
struct default_reader{
T operator()(const std::string &str){
return detail::lexical_cast<T>(str);
}
};
// Reader that parses a value with default_reader<T> and accepts it only when it
// lies in the closed interval [low, high].
template <class T>
struct range_reader{
range_reader(const T &low, const T &high): low(low), high(high) {}
// Returns the parsed value; throws cmdline_error("range_error") when the value
// is outside [low, high]. Parse failures propagate from default_reader.
T operator()(const std::string &s) const {
T ret=default_reader<T>()(s);
// Written as a negated conjunction, so a value for which either comparison is
// false (including unordered values) is rejected.
if (!(ret>=low && ret<=high)) throw cmdline::cmdline_error("range_error");
return ret;
}
private:
T low, high;
};
template <class T>
range_reader<T> range(const T &low, const T &high)
{
return range_reader<T>(low, high);
}
// Reader that parses a value with default_reader<T> and accepts it only when it
// equals one of the alternatives registered through add().
template <class T>
struct oneof_reader{
// Returns the parsed value, or throws cmdline_error when it is not among the
// registered alternatives.
T operator()(const std::string &s){
T ret=default_reader<T>()(s);
if (std::find(alt.begin(), alt.end(), ret)==alt.end())
// NOTE(review): the exception carries an empty message, so what() gives no
// hint about the rejected value.
throw cmdline_error("");
return ret;
}
// Registers one more accepted value (appended; duplicates are not filtered).
void add(const T &v){ alt.push_back(v); }
private:
std::vector<T> alt;
};
// oneof(a1, ..., aN) builds a oneof_reader<T> accepting exactly the listed
// values, for N = 1..10. Every overload beyond the first takes the reader for
// its first N-1 arguments and appends the last one, so the alternatives are
// stored in argument order.
template <class T>
oneof_reader<T> oneof(T a1)
{
    oneof_reader<T> reader;
    reader.add(a1);
    return reader;
}

template <class T>
oneof_reader<T> oneof(T a1, T a2)
{
    oneof_reader<T> reader = oneof<T>(a1);
    reader.add(a2);
    return reader;
}

template <class T>
oneof_reader<T> oneof(T a1, T a2, T a3)
{
    oneof_reader<T> reader = oneof<T>(a1, a2);
    reader.add(a3);
    return reader;
}

template <class T>
oneof_reader<T> oneof(T a1, T a2, T a3, T a4)
{
    oneof_reader<T> reader = oneof<T>(a1, a2, a3);
    reader.add(a4);
    return reader;
}

template <class T>
oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5)
{
    oneof_reader<T> reader = oneof<T>(a1, a2, a3, a4);
    reader.add(a5);
    return reader;
}

template <class T>
oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6)
{
    oneof_reader<T> reader = oneof<T>(a1, a2, a3, a4, a5);
    reader.add(a6);
    return reader;
}

template <class T>
oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7)
{
    oneof_reader<T> reader = oneof<T>(a1, a2, a3, a4, a5, a6);
    reader.add(a7);
    return reader;
}

template <class T>
oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8)
{
    oneof_reader<T> reader = oneof<T>(a1, a2, a3, a4, a5, a6, a7);
    reader.add(a8);
    return reader;
}

template <class T>
oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9)
{
    oneof_reader<T> reader = oneof<T>(a1, a2, a3, a4, a5, a6, a7, a8);
    reader.add(a9);
    return reader;
}

template <class T>
oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9, T a10)
{
    oneof_reader<T> reader = oneof<T>(a1, a2, a3, a4, a5, a6, a7, a8, a9);
    reader.add(a10);
    return reader;
}
//-----
class parser{
public:
parser(){
}
~parser(){
for (std::map<std::string, option_base*>::iterator p=options.begin();
p!=options.end(); p++)
delete p->second;
}
void add(const std::string &name,
char short_name=0,
const std::string &desc=""){
if (options.count(name)) throw cmdline_error("multiple definition: "+name);
options[name]=new option_without_value(name, short_name, desc);
ordered.push_back(options[name]);
}
template <class T>
void add(const std::string &name,
char short_name=0,
const std::string &desc="",
bool need=true,
const T def=T()){
add(name, short_name, desc, need, def, default_reader<T>());
}
template <class T, class F>
void add(const std::string &name,
char short_name=0,
const std::string &desc="",
bool need=true,
const T def=T(),
F reader=F()){
if (options.count(name)) throw cmdline_error("multiple definition: "+name);
options[name]=new option_with_value_with_reader<T, F>(name, short_name, need, def, desc, reader);
ordered.push_back(options[name]);
}
void footer(const std::string &f){
ftr=f;
}
void set_program_name(const std::string &name){
prog_name=name;
}
bool exist(const std::string &name) const {
if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name);
return options.find(name)->second->has_set();
}
template <class T>
const T &get(const std::string &name) const {
if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name);
const option_with_value<T> *p=dynamic_cast<const option_with_value<T>*>(options.find(name)->second);
if (p==NULL) throw cmdline_error("type mismatch flag '"+name+"'");
return p->get();
}
const std::vector<std::string> &rest() const {
return others;
}
// Splits a single command line string into arguments and forwards them to
// parse(const std::vector<std::string>&).
//
// Tokenizing rules, as implemented below:
//  - a double quote toggles quoting and is not copied into the token;
//  - a space outside quotes ends the current token (the token is pushed even
//    when empty, so consecutive spaces produce empty arguments);
//  - a backslash makes the following character literal; a trailing backslash
//    is an error.
// Returns false, with a message appended to `errors`, on a dangling backslash
// or an unclosed quote; otherwise returns the result of the forwarded parse.
bool parse(const std::string &arg){
std::vector<std::string> args;
std::string buf;
bool in_quote=false;
for (std::string::size_type i=0; i<arg.length(); i++){
if (arg[i]=='\"'){
in_quote=!in_quote;
continue;
}
if (arg[i]==' ' && !in_quote){
args.push_back(buf);
buf="";
continue;
}
if (arg[i]=='\\'){
// skip the backslash itself; the character after it is appended below
i++;
if (i>=arg.length()){
errors.push_back("unexpected occurrence of '\\' at end of string");
return false;
}
}
buf+=arg[i];
}
if (in_quote){
errors.push_back("quote is not closed");
return false;
}
// the last token has no trailing space to flush it
if (buf.length()>0)
args.push_back(buf);
// NOTE(review): every token is echoed, quoted, to stdout before parsing —
// this looks like leftover debug output; confirm whether callers rely on it.
for (size_t i=0; i<args.size(); i++)
std::cout<<"\""<<args[i]<<"\""<<std::endl;
return parse(args);
}
// Parses an argument list given as strings by adapting it to the
// (argc, argv) overload. The pointers in the temporary argv array refer to
// the strings owned by `args`, which outlive the call.
//
// An empty `args` is forwarded as argc == 0 with a null argv: taking
// &argv[0] of an empty vector is undefined behaviour, and the (argc, argv)
// overload rejects argc < 1 before it reads argv.
bool parse(const std::vector<std::string> &args){
    int argc=static_cast<int>(args.size());
    std::vector<const char*> argv(argc);
    for (int i=0; i<argc; i++)
        argv[i]=args[i].c_str();
    return parse(argc, argc>0 ? &argv[0] : 0);
}
bool parse(int argc, const char * const argv[]){
errors.clear();
others.clear();
if (argc<1){
errors.push_back("argument number must be longer than 0");
return false;
}
if (prog_name=="")
prog_name=argv[0];
std::map<char, std::string> lookup;
for (std::map<std::string, option_base*>::iterator p=options.begin();
p!=options.end(); p++){
if (p->first.length()==0) continue;
char initial=p->second->short_name();
if (initial){
if (lookup.count(initial)>0){
lookup[initial]="";
errors.push_back(std::string("short option '")+initial+"' is ambiguous");
return false;
}
else lookup[initial]=p->first;
}
}
for (int i=1; i<argc; i++){
if (strncmp(argv[i], "--", 2)==0){
const char *p=strchr(argv[i]+2, '=');
if (p){
std::string name(argv[i]+2, p);
std::string val(p+1);
set_option(name, val);
}
else{
std::string name(argv[i]+2);
if (options.count(name)==0){
errors.push_back("undefined option: --"+name);
continue;
}
if (options[name]->has_value()){
if (i+1>=argc){
errors.push_back("option needs value: --"+name);
continue;
}
else{
i++;
set_option(name, argv[i]);
}
}
else{
set_option(name);
}
}
}
else if (strncmp(argv[i], "-", 1)==0){
if (!argv[i][1]) continue;
char last=argv[i][1];
for (int j=2; argv[i][j]; j++){
last=argv[i][j];
if (lookup.count(argv[i][j-1])==0){
errors.push_back(std::string("undefined short option: -")+argv[i][j-1]);
continue;
}
if (lookup[argv[i][j-1]]==""){
errors.push_back(std::string("ambiguous short option: -")+argv[i][j-1]);
continue;
}
set_option(lookup[argv[i][j-1]]);
}
if (lookup.count(last)==0){
errors.push_back(std::string("undefined short option: -")+last);
continue;
}
if (lookup[last]==""){
errors.push_back(std::string("ambiguous short option: -")+last);
continue;
}
if (i+1<argc && options[lookup[last]]->has_value()){
set_option(lookup[last], argv[i+1]);
i++;
}
else{
set_option(lookup[last]);
}
}
else{
others.push_back(argv[i]);
}
}
for (std::map<std::string, option_base*>::iterator p=options.begin();
p!=options.end(); p++)
if (!p->second->valid())
errors.push_back("need option: --"+std::string(p->first));
return errors.size()==0;
}
void parse_check(const std::string &arg){
if (!options.count("help"))
add("help", '?', "print this message");
check(0, parse(arg));
}
void parse_check(const std::vector<std::string> &args){
if (!options.count("help"))
add("help", '?', "print this message");
check(args.size(), parse(args));
}
void parse_check(int argc, char *argv[]){
if (!options.count("help"))
add("help", '?', "print this message");
check(argc, parse(argc, argv));
}
std::string error() const{
return errors.size()>0?errors[0]:"";
}
std::string error_full() const{
std::ostringstream oss;
for (size_t i=0; i<errors.size(); i++)
oss<<errors[i]<<std::endl;
return oss.str();
}
std::string usage() const {
std::ostringstream oss;
oss<<"usage: "<<prog_name<<" ";
for (size_t i=0; i<ordered.size(); i++){
if (ordered[i]->must())
oss<<ordered[i]->short_description()<<" ";
}
oss<<"[options] ... "<<ftr<<std::endl;
oss<<"options:"<<std::endl;
size_t max_width=0;
for (size_t i=0; i<ordered.size(); i++){
max_width=std::max(max_width, ordered[i]->name().length());
}
for (size_t i=0; i<ordered.size(); i++){
if (ordered[i]->short_name()){
oss<<" -"<<ordered[i]->short_name()<<", ";
}
else{
oss<<" ";
}
oss<<"--"<<ordered[i]->name();
for (size_t j=ordered[i]->name().length(); j<max_width+4; j++)
oss<<' ';
oss<<ordered[i]->description()<<std::endl;
}
return oss.str();
}
private:
// Terminates the process when parsing failed or help was requested; returns
// normally only when `ok` is true and the "help" flag was not set.
//  - Parsing failed with argc == 1, or "help" was set: print the usage
//    text to stderr and exit with status 0.
//  - Any other parse failure: print the first error followed by the usage text
//    to stderr and exit with status 1.
// exist("help") throws cmdline_error if no "help" option has been registered.
void check(int argc, bool ok){
if ((argc==1 && !ok) || exist("help")){
std::cerr<<usage();
exit(0);
}
if (!ok){
std::cerr<<error()<<std::endl<<usage();
exit(1);
}
}
void set_option(const std::string &name){
if (options.count(name)==0){
errors.push_back("undefined option: --"+name);
return;
}
if (!options[name]->set()){
errors.push_back("option needs value: --"+name);
return;
}
}
void set_option(const std::string &name, const std::string &value){
if (options.count(name)==0){
errors.push_back("undefined option: --"+name);
return;
}
if (!options[name]->set(value)){
errors.push_back("option value is invalid: --"+name+"="+value);
return;
}
}
class option_base{
public:
virtual ~option_base(){}
virtual bool has_value() const=0;
virtual bool set()=0;
virtual bool set(const std::string &value)=0;
virtual bool has_set() const=0;
virtual bool valid() const=0;
virtual bool must() const=0;
virtual const std::string &name() const=0;
virtual char short_name() const=0;
virtual const std::string &description() const=0;
virtual std::string short_description() const=0;
};
class option_without_value : public option_base {
public:
option_without_value(const std::string &name,
char short_name,
const std::string &desc)
:nam(name), snam(short_name), desc(desc), has(false){
}
~option_without_value(){}
bool has_value() const { return false; }
bool set(){
has=true;
return true;
}
bool set(const std::string &){
return false;
}
bool has_set() const {
return has;
}
bool valid() const{
return true;
}
bool must() const{
return false;
}
const std::string &name() const{
return nam;
}
char short_name() const{
return snam;
}
const std::string &description() const {
return desc;
}
std::string short_description() const{
return "--"+nam;
}
private:
std::string nam;
char snam;
std::string desc;
bool has;
};
template <class T>
class option_with_value : public option_base {
public:
option_with_value(const std::string &name,
char short_name,
bool need,
const T &def,
const std::string &desc)
: nam(name), snam(short_name), need(need), has(false)
, def(def), actual(def) {
this->desc=full_description(desc);
}
~option_with_value(){}
const T &get() const {
return actual;
}
bool has_value() const { return true; }
bool set(){
return false;
}
bool set(const std::string &value){
try{
actual=read(value);
has=true;
}
catch(const std::exception &e){
return false;
}
return true;
}
bool has_set() const{
return has;
}
bool valid() const{
if (need && !has) return false;
return true;
}
bool must() const{
return need;
}
const std::string &name() const{
return nam;
}
char short_name() const{
return snam;
}
const std::string &description() const {
return desc;
}
std::string short_description() const{
return "--"+nam+"="+detail::readable_typename<T>();
}
protected:
std::string full_description(const std::string &desc){
return
desc+" ("+detail::readable_typename<T>()+
(need?"":" [="+detail::default_value<T>(def)+"]")
+")";
}
virtual T read(const std::string &s)=0;
std::string nam;
char snam;
bool need;
std::string desc;
bool has;
T def;
T actual;
};
template <class T, class F>
class option_with_value_with_reader : public option_with_value<T> {
public:
option_with_value_with_reader(const std::string &name,
char short_name,
bool need,
const T def,
const std::string &desc,
F reader)
: option_with_value<T>(name, short_name, need, def, desc), reader(reader){
}
private:
T read(const std::string &s){
return reader(s);
}
F reader;
};
std::map<std::string, option_base*> options;
std::vector<option_base*> ordered;
std::string ftr;
std::string prog_name;
std::vector<std::string> others;
std::vector<std::string> errors;
};
} // cmdline

View File

@ -10,6 +10,33 @@
#include <field.h>
#include <option.h>
// Per-field facet storage. Facet strings are interned to dense integer ids so
// that each document only stores a vector of ids.
struct facet_value {
    // use string to int mapping for saving memory
    spp::sparse_hash_map<std::string, uint32_t> value_index;   // facet string -> id
    spp::sparse_hash_map<uint32_t, std::string> index_value;   // id -> facet string
    spp::sparse_hash_map<uint32_t, std::vector<uint32_t>> doc_values;  // doc seq id -> ids

    // Returns the id of `value`, assigning the next free id (the current number
    // of known values) when the string has not been seen before.
    uint32_t get_value_index(const std::string & value) {
        // single lookup instead of count() followed by operator[]
        auto existing = value_index.find(value);
        if(existing != value_index.end()) {
            return existing->second;
        }

        const uint32_t new_index = static_cast<uint32_t>(value_index.size());
        value_index.emplace(value, new_index);
        index_value.emplace(new_index, value);
        return new_index;
    }

    // Records the facet values of one document as a vector of ids, in the same
    // order as `values`.
    // NOTE(review): emplace leaves an existing entry for doc_seq_id untouched,
    // assuming spp follows std::unordered_map semantics — confirm that
    // re-indexing a document is not expected to overwrite.
    void index_values(uint32_t doc_seq_id, const std::vector<std::string> & values) {
        std::vector<uint32_t> value_vec(values.size());
        // size_t index: avoids comparing a signed int against size()
        for(size_t i = 0; i < values.size(); i++) {
            value_vec[i] = get_value_index(values[i]);
        }

        doc_values.emplace(doc_seq_id, value_vec);
    }
};
class Collection {
private:
std::string name;
@ -23,19 +50,19 @@ private:
spp::sparse_hash_map<std::string, field> facet_schema;
std::vector<std::string> rank_fields;
std::vector<field> sort_fields;
Store* store;
spp::sparse_hash_map<std::string, art_tree*> search_index;
spp::sparse_hash_map<std::string, art_tree*> facet_index;
spp::sparse_hash_map<std::string, facet_value> facet_index;
spp::sparse_hash_map<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> rank_index;
spp::sparse_hash_map<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> sort_index;
std::string token_ordering_field;
std::string token_ranking_field;
std::string get_doc_id_key(std::string doc_id);
std::string get_doc_id_key(const std::string & doc_id);
std::string get_seq_id_key(uint32_t seq_id);
@ -51,14 +78,15 @@ private:
void do_facets(std::vector<facet> & facets, uint32_t* result_ids, size_t results_size);
void search_field(std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
std::vector<facet> & facets, const std::vector<std::string> & rank_fields, const int num_typos,
const size_t num_results, Topster<100> &topster, size_t & num_found,
const token_ordering token_order = FREQUENCY, const bool prefix = false);
std::vector<facet> & facets, const std::vector<sort_field> & sort_fields, const int num_typos,
const size_t num_results, Topster<100> &topster, uint32_t** all_result_ids,
size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY, const bool prefix = false);
void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
const std::vector<std::string> & rank_fields, int & token_rank,
const std::vector<sort_field> & sort_fields, int & token_rank,
std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
size_t & total_results, size_t & num_found, const size_t & max_results);
size_t & total_results, uint32_t** all_result_ids, size_t & all_result_ids_len,
const size_t & max_results);
void index_string_field(const std::string & text, const uint32_t score, art_tree *t, uint32_t seq_id,
const bool verbatim) const;
@ -82,13 +110,13 @@ public:
Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
const std::vector<field> & search_fields, const std::vector<field> & facet_fields,
const std::vector<std::string> & rank_fields, const std::string token_ordering_field);
const std::vector<field> & sort_fields, const std::string token_ranking_field);
~Collection();
static std::string get_next_seq_id_key(std::string collection_name);
static std::string get_next_seq_id_key(const std::string & collection_name);
static std::string get_meta_key(std::string collection_name);
static std::string get_meta_key(const std::string & collection_name);
std::string get_seq_id_collection_prefix();
@ -100,26 +128,26 @@ public:
std::vector<std::string> get_facet_fields();
std::vector<std::string> get_rank_fields();
std::vector<field> get_sort_fields();
spp::sparse_hash_map<std::string, field> get_schema();
std::string get_token_ordering_field();
std::string get_token_ranking_field();
Option<std::string> add(std::string json_str);
Option<std::string> add(const std::string & json_str);
nlohmann::json search(std::string query, const std::vector<std::string> search_fields,
const std::string & simple_filter_query, const std::vector<std::string> & facet_fields,
const std::vector<std::string> & rank_fields, const int num_typos,
const std::vector<sort_field> & sort_fields, const int num_typos,
const size_t num_results, const token_ordering token_order = FREQUENCY, const bool prefix = false);
void remove(std::string id);
Option<std::string> remove(const std::string & id);
void score_results(const std::vector<std::string> & rank_fields, const int & token_rank, Topster<100> &topster,
void score_results(const std::vector<sort_field> & sort_fields, const int & token_rank, Topster<100> &topster,
const std::vector<art_leaf *> & query_suggestion, const uint32_t *result_ids,
const size_t result_size) const;
Option<uint32_t> index_in_memory(const nlohmann::json &document, uint32_t seq_id);
Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id);
enum {MAX_SEARCH_TOKENS = 20};
enum {MAX_RESULTS = 100};

View File

@ -22,19 +22,19 @@ private:
static constexpr const char* COLLECTION_ID_KEY = "id";
static constexpr const char* COLLECTION_SEARCH_FIELDS_KEY = "search_fields";
static constexpr const char* COLLECTION_FACET_FIELDS_KEY = "facet_fields";
static constexpr const char* COLLECTION_RANK_FIELDS_KEY = "rank_fields";
static constexpr const char* COLLECTION_TOKEN_ORDERING_FIELD_KEY = "token_ordering_field";
static constexpr const char* COLLECTION_SORT_FIELDS_KEY = "sort_fields";
static constexpr const char* COLLECTION_TOKEN_ORDERING_FIELD_KEY = "token_ranking_field";
CollectionManager();
~CollectionManager() = default;
public:
static CollectionManager& get_instance() {
static CollectionManager & get_instance() {
static CollectionManager instance;
return instance;
}
~CollectionManager();
CollectionManager(CollectionManager const&) = delete;
void operator=(CollectionManager const&) = delete;
@ -42,8 +42,8 @@ public:
Collection* create_collection(std::string name, const std::vector<field> & search_fields,
const std::vector<field> & facet_fields,
const std::vector<std::string> & rank_fields,
const std::string & token_ordering_field = "");
const std::vector<field> & sort_fields,
const std::string & token_ranking_field = "");
Collection* get_collection(std::string collection_name);

View File

@ -23,7 +23,7 @@ struct field {
std::string name;
std::string type;
field(std::string name, std::string type): name(name), type(type) {
field(const std::string & name, const std::string & type): name(name), type(type) {
}
@ -64,6 +64,28 @@ struct filter {
}
};
// String constants related to sort fields.
// NOTE(review): `name` / `order` look like the keys of a sort-field
// specification and `asc` / `desc` the accepted order values — confirm against
// the code that parses sort fields.
// Being `static` in a header, each translation unit gets its own copy.
namespace sort_field_const {
static const std::string name = "name";
static const std::string order = "order";
static const std::string asc = "ASC";
static const std::string desc = "DESC";
}
// A field name paired with the order in which results should be sorted on it.
struct sort_field {
    std::string name;
    std::string order;

    sort_field(const std::string & name, const std::string & order): name(name), order(order) {

    }

    // `other` is already a private copy of the right-hand side (taken by
    // value), so its strings are exchanged into this object.
    sort_field& operator=(sort_field other) {
        name.swap(other.name);
        order.swap(other.order);
        return *this;
    }
};
struct facet {
const std::string field_name;
std::map<std::string, size_t> result_map;

112
include/http_server.h Normal file
View File

@ -0,0 +1,112 @@
#pragma once
#define H2O_USE_LIBUV 0
extern "C" {
#include "h2o.h"
#include "h2o/http1.h"
#include "h2o/http2.h"
}
#include <map>
#include <string>
#include <stdio.h>
#include "collection.h"
#include "collection_manager.h"
// Response produced by a route handler: an HTTP status code plus the body to
// send back. The send_* helpers only fill in these two fields.
//
// The helpers that take a `message` wrap it as {"message": "<message>"}.
// The message is inserted verbatim (no JSON escaping), so callers must not
// pass text containing unescaped quotes or backslashes.
struct http_res {
    uint32_t status_code;
    std::string body;

    // 200 OK with a caller-supplied body (sent as-is).
    void send_200(const std::string & res_body) {
        status_code = 200;
        body = res_body;
    }

    // 201 Created with a caller-supplied body (sent as-is).
    void send_201(const std::string & res_body) {
        status_code = 201;
        body = res_body;
    }

    // 400 Bad Request with a JSON message body.
    void send_400(const std::string & message) {
        send(400, message);
    }

    // 404 Not Found with a fixed JSON message body.
    void send_404() {
        send(404, "Not Found");
    }

    // 409 Conflict with a JSON message body.
    // (Previously reported status 400, contradicting the method name.)
    void send_409(const std::string & message) {
        send(409, message);
    }

    // 500 Internal Server Error with a caller-supplied body (sent as-is).
    void send_500(const std::string & res_body) {
        status_code = 500;
        body = res_body;
    }

    // Arbitrary status code with a JSON message body.
    void send(uint32_t code, const std::string & message) {
        status_code = code;
        body = "{\"message\": \"" + message + "\"}";
    }
};
// Request data handed to a route handler.
struct http_req {
// Named string parameters of the request.
// NOTE(review): presumably filled from the query string and/or path
// placeholders — confirm against the dispatching code.
std::map<std::string, std::string> params;
// Request body as a string.
std::string body;
};
// One registered route: the HTTP method, the path split into parts, and the
// function to invoke for a matching request.
struct route_path {
    std::string http_method;
    std::vector<std::string> path_parts;
    void (*handler)(http_req & req, http_res &);

    // Orders routes by HTTP method, then lexicographically by path parts.
    //
    // The previous implementation returned true unconditionally, which is not
    // a strict weak ordering (it claims a < a) and makes std::sort and the
    // ordered containers behave in an undefined way. The handler pointer is
    // deliberately left out of the comparison: two routes with the same method
    // and path are considered equivalent.
    inline bool operator< (const route_path& rhs) const {
        if (http_method != rhs.http_method) {
            return http_method < rhs.http_method;
        }
        return path_parts < rhs.path_parts;
    }
};
// HTTP server wrapper around h2o. Only declarations live here; the behaviour
// described below is limited to what the declarations themselves show.
class HttpServer {
private:
// h2o configuration/context objects and the route table are static, i.e.
// shared by every HttpServer instance in the process.
static h2o_globalconf_t config;
static h2o_context_t ctx;
static h2o_accept_ctx_t accept_ctx;
static std::vector<route_path> routes;
// Address and port given to the constructor.
std::string listen_address;
uint32_t listen_port;
h2o_hostconf_t *hostconf;
// Static callbacks/helpers: they have no access to per-instance state.
static void on_accept(h2o_socket_t *listener, const char *err);
int create_listener();
h2o_pathconf_t *register_handler(h2o_hostconf_t *hostconf, const char *path,
int (*on_req)(h2o_handler_t *, h2o_req_t *));
static const char* get_status_reason(uint32_t status_code);
// Returns a key -> value map built from `query`; the result has the same type
// as http_req::params.
static std::map<std::string, std::string> parse_query(const std::string& query);
static int catch_all_handler(h2o_handler_t *self, h2o_req_t *req);
public:
HttpServer(std::string listen_address, uint32_t listen_port);
~HttpServer();
// One registration method per HTTP verb; each takes a path and a handler
// with the (http_req &, http_res &) signature.
// NOTE(review): presumably these append to `routes` — confirm in the .cpp.
void get(const std::string & path, void (*handler)(http_req & req, http_res &));
void post(const std::string & path, void (*handler)(http_req &, http_res &));
void put(const std::string & path, void (*handler)(http_req &, http_res &));
void del(const std::string & path, void (*handler)(http_req &, http_res &));
int run();
};

View File

@ -16,7 +16,7 @@ private:
uint32_t m = std::min(min, value);
uint32_t M = std::max(max, value);
uint32_t bnew = required_bits(M - m);
return METADATA_OVERHEAD + for_compressed_size_bits(new_length, bnew);
return METADATA_OVERHEAD + 4 + for_compressed_size_bits(new_length, bnew);
}
uint32_t lower_bound_search_bits(const uint8_t *in, uint32_t imin, uint32_t imax, uint32_t base,

View File

@ -24,6 +24,13 @@ public:
}
};
// Outcome of a store operation.
enum StoreStatus {
// The requested key exists; returned by get() when the lookup succeeds.
FOUND,
// NOTE(review): not returned by the code visible here - presumably meant for writes; confirm.
OK,
// The underlying store reported that the key does not exist.
NOT_FOUND,
// Any other non-ok status from the underlying store.
ERROR
};
/*
* Abstraction for underlying KV store (RocksDB)
*/
@ -72,9 +79,18 @@ public:
return status.ok() && !status.IsNotFound();
}
bool get(const std::string& key, std::string& value) {
StoreStatus get(const std::string& key, std::string& value) {
rocksdb::Status status = db->Get(rocksdb::ReadOptions(), key, &value);
return status.ok();
if(status.IsNotFound()) {
return StoreStatus::NOT_FOUND;
}
if(!status.ok()) {
return StoreStatus::ERROR;
}
return StoreStatus::FOUND;
}
bool remove(const std::string& key) {

View File

@ -5,46 +5,6 @@
#include <sstream>
struct StringUtils {
// Splits `str` on any character of `delimiters` and appends the pieces to `tokens`.
// Empty pieces are skipped when `trimEmpty` is true.
// Note: despite its name, `maxTokenLength` caps the length of the whole input - the string is
// truncated to that many characters before any splitting happens.
template<class ContainerT>
static void tokenize(const std::string &str, ContainerT &tokens,
                     const std::string &delimiters = " ", bool trimEmpty = true, unsigned long maxTokenLength = 100) {
    using value_type = typename ContainerT::value_type;
    using size_type = typename ContainerT::size_type;

    const std::string input = str.substr(0, maxTokenLength);
    std::string::size_type start = 0;

    for (;;) {
        const std::string::size_type hit = input.find_first_of(delimiters, start);
        const bool at_end = (hit == std::string::npos);
        // The final piece runs up to the end of the (truncated) input
        const std::string::size_type stop = at_end ? input.length() : hit;

        if (stop != start || !trimEmpty) {
            tokens.push_back(value_type(input.data() + start, (size_type) stop - start));
        }

        if (at_end) {
            return;
        }

        start = stop + 1;
    }
}
// Returns a copy of `str` in which every non-overlapping occurrence of `from` is replaced by `to`.
// An empty `from` matches at every position, which used to make the loop run forever;
// the input is now returned unchanged in that case.
static std::string replace_all(std::string str, const std::string &from, const std::string &to) {
    if (from.empty()) {
        return str;
    }

    size_t start_pos = 0;
    while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
        str.replace(start_pos, from.length(), to);
        // Resume after the inserted text so that a `to` containing `from` is not matched again
        start_pos += to.length();
    }
    return str;
}
// Adapted from: http://stackoverflow.com/a/236180/131050
static void split(const std::string& s, std::vector<std::string> & result, const std::string& delim, const bool keep_empty = false) {
if (delim.empty()) {
@ -122,4 +82,8 @@ struct StringUtils {
strtol(s.c_str(), &p, 10);
return (*p == 0);
}
// Upper-cases `str` in place using the C locale-aware ::toupper.
// Each char is converted to unsigned char first: passing a negative value (any non-ASCII
// byte where plain char is signed) to ::toupper is undefined behaviour.
static void toupper(std::string& str) {
    std::transform(str.begin(), str.end(), str.begin(),
                   [](char c) { return static_cast<char>(::toupper(static_cast<unsigned char>(c))); });
}
};

236
src/api.cpp Normal file
View File

@ -0,0 +1,236 @@
#include <regex>
#include <chrono>
#include <stdexcept>
#include <sys/resource.h>
#include "api.h"
#include "string_utils.h"
#include "collection.h"
#include "collection_manager.h"
// Creates a collection from the JSON schema in the request body.
//
// Required keys: `name` (string), `search_fields` and `sort_fields` (non-empty arrays of
// {"name": <string>, "type": <string>} objects; sort fields must be INT32 or INT64).
// Optional keys: `facet_fields` (array of the same objects) and `token_ranking_field` (string).
//
// Responds with 400 on malformed input, 409 when a collection with that name already exists,
// and 201 (echoing the request body) on success.
void post_create_collection(http_req & req, http_res & res) {
    nlohmann::json req_json;

    try {
        req_json = nlohmann::json::parse(req.body);
    } catch(...) {
        return res.send_400("Bad JSON.");
    }

    CollectionManager & collectionManager = CollectionManager::get_instance();

    // validate presence of mandatory fields
    if(req_json.count("name") == 0) {
        return res.send_400("Parameter `name` is required.");
    }

    if(req_json.count("search_fields") == 0) {
        return res.send_400("Parameter `search_fields` is required.");
    }

    if(req_json.count("sort_fields") == 0) {
        return res.send_400("Parameter `sort_fields` is required.");
    }

    // A non-string `name` would otherwise throw while being converted to std::string below
    if(!req_json["name"].is_string()) {
        return res.send_400("Parameter `name` must be a string.");
    }

    const std::string collection_name = req_json["name"].get<std::string>();

    if(collectionManager.get_collection(collection_name) != nullptr) {
        return res.send_409("Collection with name `" + collection_name + "` already exists.");
    }

    // Error text shared by the three field lists
    const auto wrong_format = [](const std::string & param_name) {
        return "Wrong format for `" + param_name + "`. It should be an array like: "
               "[{\"name\": \"<field_name>\", \"type\": \"<field_type>\"}]";
    };

    // A field spec must be an object whose name and type are both strings
    const auto is_field_spec = [](const nlohmann::json & spec) {
        return spec.is_object() &&
               spec.count(fields::name) != 0 && spec.count(fields::type) != 0 &&
               spec.at(fields::name).is_string() && spec.at(fields::type).is_string();
    };

    // field specific validation

    std::vector<field> search_fields;

    if(!req_json["search_fields"].is_array() || req_json["search_fields"].size() == 0) {
        return res.send_400(wrong_format("search_fields"));
    }

    for(const nlohmann::json & search_field_json: req_json["search_fields"]) {
        if(!is_field_spec(search_field_json)) {
            return res.send_400(wrong_format("search_fields"));
        }

        search_fields.push_back(field(search_field_json["name"], search_field_json["type"]));
    }

    std::vector<field> facet_fields;

    if(req_json.count("facet_fields") != 0) {
        if(!req_json["facet_fields"].is_array()) {
            return res.send_400(wrong_format("facet_fields"));
        }

        for(const nlohmann::json & facet_field_json: req_json["facet_fields"]) {
            if(!is_field_spec(facet_field_json)) {
                return res.send_400(wrong_format("facet_fields"));
            }

            facet_fields.push_back(field(facet_field_json["name"], facet_field_json["type"]));
        }
    }

    std::vector<field> sort_fields;

    if(!req_json["sort_fields"].is_array() || req_json["sort_fields"].size() == 0) {
        return res.send_400(wrong_format("sort_fields"));
    }

    for(const nlohmann::json & sort_field_json: req_json["sort_fields"]) {
        if(!is_field_spec(sort_field_json)) {
            return res.send_400(wrong_format("sort_fields"));
        }

        // Only integer fields can be sorted on
        if(sort_field_json["type"] != "INT32" && sort_field_json["type"] != "INT64") {
            return res.send_400("Sort field `" + sort_field_json["name"].get<std::string>() + "` must be a number.");
        }

        sort_fields.push_back(field(sort_field_json["name"], sort_field_json["type"]));
    }

    std::string token_ranking_field = "";

    if(req_json.count("token_ranking_field") != 0) {
        if(!req_json["token_ranking_field"].is_string()) {
            return res.send_400("Wrong format for `token_ranking_field`. It should be a string (name of a field).");
        }

        token_ranking_field = req_json["token_ranking_field"].get<std::string>();
    }

    collectionManager.create_collection(collection_name, search_fields, facet_fields, sort_fields, token_ranking_field);
    res.send_201(req.body);
}
// Runs a search against the collection named by the `collection` request parameter.
//
// Parameters read from `req.params`:
//   q          - the query text
//   search_by  - comma separated list of fields to search (required)
//   filter_by  - filter expression (optional)
//   facet_by   - list of facet fields separated by "&&" (optional)
//   sort_by    - up to 2 comma separated `<field>:<order>` expressions (optional)
//   num_typos  - integer, defaults to "2"
//   prefix     - "true" to enable prefix search, defaults to "false"
//
// Responds with 400 on invalid parameters, 404 when the collection does not exist and
// 200 with the JSON produced by Collection::search otherwise.
void get_search(http_req & req, http_res & res) {
    const char *NUM_TYPOS = "num_typos";
    const char *PREFIX = "prefix";
    const char *FILTER = "filter_by";
    const char *SEARCH_BY = "search_by";
    const char *SORT_BY = "sort_by";
    const char *FACET_BY = "facet_by";

    if(req.params.count(NUM_TYPOS) == 0) {
        req.params[NUM_TYPOS] = "2";
    }

    if(req.params.count(PREFIX) == 0) {
        req.params[PREFIX] = "false";
    }

    if(req.params.count(SEARCH_BY) == 0) {
        return res.send_400(std::string("Parameter `") + SEARCH_BY + "` is required.");
    }

    std::string filter_str = req.params.count(FILTER) != 0 ? req.params[FILTER] : "";

    std::vector<std::string> search_fields;
    StringUtils::split(req.params[SEARCH_BY], search_fields, ",");

    std::vector<std::string> facet_fields;
    StringUtils::split(req.params[FACET_BY], facet_fields, "&&");

    std::vector<sort_field> sort_fields;
    if(req.params.count(SORT_BY) != 0) {
        std::vector<std::string> sort_field_strs;
        StringUtils::split(req.params[SORT_BY], sort_field_strs, ",");

        if(sort_field_strs.size() > 2) {
            return res.send_400("Only upto 2 sort fields are allowed.");
        }

        for(const std::string & sort_field_str: sort_field_strs) {
            std::vector<std::string> expression_parts;
            StringUtils::split(sort_field_str, expression_parts, ":");

            if(expression_parts.size() != 2) {
                return res.send_400(std::string("Parameter `") + SORT_BY + "` is malformed.");
            }

            // The order is matched case-insensitively (asc/ASC, desc/DESC)
            StringUtils::toupper(expression_parts[1]);
            sort_fields.push_back(sort_field(expression_parts[0], expression_parts[1]));
        }
    }

    auto begin = std::chrono::high_resolution_clock::now();

    CollectionManager & collectionManager = CollectionManager::get_instance();
    Collection* collection = collectionManager.get_collection(req.params["collection"]);

    if(collection == nullptr) {
        return res.send_404();
    }

    // std::stoi throws on non-numeric or out-of-range input; report that as a client error
    // instead of letting the exception escape the handler.
    int num_typos = 0;
    try {
        num_typos = std::stoi(req.params[NUM_TYPOS]);
    } catch(const std::logic_error &) {
        return res.send_400(std::string("Parameter `") + NUM_TYPOS + "` must be an integer.");
    }

    bool prefix = (req.params[PREFIX] == "true");

    // Prefix searches order tokens by score when the collection has a token ranking field
    token_ordering token_order = FREQUENCY;
    if(prefix && !collection->get_token_ranking_field().empty()) {
        token_order = MAX_SCORE;
    }

    nlohmann::json result = collection->search(req.params["q"], search_fields, filter_str, facet_fields,
                                               sort_fields, num_typos, 100,
                                               token_order, prefix);
    const std::string & json_str = result.dump();
    res.send_200(json_str);

    long long int timeMicros = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
    std::cout << "Time taken: " << timeMicros << "us" << std::endl;
}
// Adds the JSON document in the request body to the collection named by the `collection`
// request parameter.
//
// Responds with 404 when the collection does not exist, 400 when the body cannot be processed
// as JSON, the code and message reported by Collection::add when indexing is rejected, and
// 201 with {"id": <document id>} on success.
void post_add_document(http_req & req, http_res & res) {
    CollectionManager & collectionManager = CollectionManager::get_instance();
    Collection* collection = collectionManager.get_collection(req.params["collection"]);

    if(collection == nullptr) {
        return res.send_404();
    }

    try {
        Option<std::string> inserted_id_op = collection->add(req.body);

        if(!inserted_id_op.ok()) {
            return res.send(inserted_id_op.code(), inserted_id_op.error());
        }

        nlohmann::json json_response;
        json_response["id"] = inserted_id_op.get();
        res.send_201(json_response.dump());
    } catch(const std::exception &) {
        // Collection::add parses the raw body with nlohmann::json::parse, which throws on
        // malformed input. Answer with the same message the collection endpoint uses.
        res.send_400("Bad JSON.");
    }
}
// Removes the document identified by the `id` request parameter from the collection named by
// the `collection` request parameter.
//
// Responds with 404 when the collection does not exist, the code and message reported by
// Collection::remove on failure, and 200 with {"id": <document id>} on success.
void del_remove_document(http_req & req, http_res & res) {
    std::string target_id = req.params["id"];

    Collection* target = CollectionManager::get_instance().get_collection(req.params["collection"]);
    if(target == nullptr) {
        res.send_404();
        return;
    }

    Option<std::string> removal = target->remove(target_id);
    if(removal.ok()) {
        nlohmann::json payload;
        payload["id"] = removal.get();
        res.send_200(payload.dump());
        return;
    }

    res.send(removal.code(), removal.error());
}

87
src/array_utils.cpp Normal file
View File

@ -0,0 +1,87 @@
#include "array_utils.h"
#include <memory.h>
// Intersects two ascending arrays.
//
// Writes every value present in both `A` (length `lenA`) and `B` (length `lenB`) to `out`, in
// ascending order, and returns the number of values written. `out` must have room for
// min(lenA, lenB) elements.
size_t ArrayUtils::and_scalar(const uint32_t *A, const size_t lenA,
                              const uint32_t *B, const size_t lenB, uint32_t *out) {
    uint32_t *cursor = out;
    const uint32_t *const lastA = A + lenA;
    const uint32_t *const lastB = B + lenB;

    // Classic two-cursor merge: advance whichever side holds the smaller value and emit on a tie
    while (A != lastA && B != lastB) {
        if (*A < *B) {
            ++A;
        } else if (*A > *B) {
            ++B;
        } else {
            *cursor++ = *A;
            ++A;
            ++B;
        }
    }

    return static_cast<size_t>(cursor - out);
}
// Computes the union of two ascending arrays.
//
// Allocates the result with new[] and stores it in `*out`; the caller owns it and must release
// it with delete[]. Returns the number of elements in the result. Values that occur in both
// inputs (or repeat consecutively) appear once.
//
// When `A` is nullptr the result is a plain copy of `B` (no de-duplication is applied).
size_t ArrayUtils::or_scalar(const uint32_t *A, const size_t lenA,
                             const uint32_t *B, const size_t lenB, uint32_t **out) {
    if(A == nullptr) {
        *out = new uint32_t[lenB];
        if(lenB != 0) {
            // memcpy must not be handed a null source, even for a zero-length copy
            memcpy(*out, B, lenB * sizeof(uint32_t));
        }
        return lenB;
    }

    uint32_t* results = new uint32_t[lenA+lenB];
    size_t res_index = 0;

    // Appends `value` unless it equals the last value written. The `res_index == 0` guard
    // matters in the tail loops too: when one input is empty nothing has been written yet, and
    // the old code read results[res_index-1] out of bounds.
    const auto append_unique = [&](const uint32_t value) {
        if(res_index == 0 || results[res_index-1] != value) {
            results[res_index] = value;
            res_index++;
        }
    };

    size_t indexA = 0, indexB = 0;

    while (indexA < lenA && indexB < lenB) {
        if (A[indexA] < B[indexB]) {
            append_unique(A[indexA]);
            indexA++;
        } else {
            append_unique(B[indexB]);
            indexB++;
        }
    }

    while (indexA < lenA) {
        append_unique(A[indexA]);
        indexA++;
    }

    while (indexB < lenB) {
        append_unique(B[indexB]);
        indexB++;
    }

    // shrink fit
    *out = new uint32_t[res_index];
    if(res_index != 0) {
        memcpy(*out, results, res_index * sizeof(uint32_t));
    }
    delete[] results;

    return res_index;
}

View File

@ -2,16 +2,16 @@
#include <numeric>
#include <chrono>
#include <intersection.h>
#include <array_utils.h>
#include <match_score.h>
#include <string_utils.h>
#include <art.h>
Collection::Collection(const std::string name, const uint32_t collection_id, const uint32_t next_seq_id, Store *store,
const std::vector<field> &search_fields, const std::vector<field> & facet_fields,
const std::vector<std::string> & rank_fields, const std::string token_ordering_field):
const std::vector<field> & sort_fields, const std::string token_ranking_field):
name(name), collection_id(collection_id), next_seq_id(next_seq_id), store(store),
rank_fields(rank_fields), token_ordering_field(token_ordering_field) {
sort_fields(sort_fields), token_ranking_field(token_ranking_field) {
for(const field& field: search_fields) {
art_tree *t = new art_tree;
@ -21,33 +21,27 @@ Collection::Collection(const std::string name, const uint32_t collection_id, con
}
for(const field& field: facet_fields) {
art_tree *t = new art_tree;
art_tree_init(t);
facet_index.emplace(field.name, t);
facet_value fvalue;
facet_index.emplace(field.name, fvalue);
facet_schema.emplace(field.name, field);
}
for(const std::string & rank_field: rank_fields) {
for(const field & sort_field: sort_fields) {
spp::sparse_hash_map<uint32_t, int64_t> * doc_to_score = new spp::sparse_hash_map<uint32_t, int64_t>();
rank_index.emplace(rank_field, doc_to_score);
sort_index.emplace(sort_field.name, doc_to_score);
}
}
Collection::~Collection() {
for(std::pair<std::string, field> name_field: search_schema) {
for(auto & name_field: search_schema) {
art_tree *t = search_index.at(name_field.first);
art_tree_destroy(t);
t = nullptr;
}
for(std::pair<std::string, field> name_field: facet_schema) {
art_tree *t = facet_index.at(name_field.first);
art_tree_destroy(t);
t = nullptr;
}
for(std::pair<std::string, spp::sparse_hash_map<uint32_t, int64_t>*> name_map: rank_index) {
for(auto & name_map: sort_index) {
delete name_map.second;
name_map.second = nullptr;
}
}
@ -56,7 +50,7 @@ uint32_t Collection::get_next_seq_id() {
return next_seq_id++;
}
Option<std::string> Collection::add(std::string json_str) {
Option<std::string> Collection::add(const std::string & json_str) {
nlohmann::json document = nlohmann::json::parse(json_str);
uint32_t seq_id = get_next_seq_id();
@ -80,22 +74,22 @@ Option<std::string> Collection::add(std::string json_str) {
}
Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uint32_t seq_id) {
if(!token_ordering_field.empty() && document.count(token_ordering_field) == 0) {
return Option<>(400, "Field `" + token_ordering_field + "` has been declared as a token ordering field, "
if(!token_ranking_field.empty() && document.count(token_ranking_field) == 0) {
return Option<>(400, "Field `" + token_ranking_field + "` has been declared as a token ranking field, "
"but is not found in the document.");
}
if(!token_ordering_field.empty() && !document[token_ordering_field].is_number()) {
return Option<>(400, "Token ordering field `" + token_ordering_field + "` must be an INT32.");
if(!token_ranking_field.empty() && !document[token_ranking_field].is_number()) {
return Option<>(400, "Token ranking field `" + token_ranking_field + "` must be an INT32.");
}
if(!token_ordering_field.empty() && document[token_ordering_field].get<int64_t>() > INT32_MAX) {
return Option<>(400, "Token ordering field `" + token_ordering_field + "` exceeds maximum value of INT32.");
if(!token_ranking_field.empty() && document[token_ranking_field].get<int64_t>() > INT32_MAX) {
return Option<>(400, "Token ranking field `" + token_ranking_field + "` exceeds maximum value of INT32.");
}
uint32_t points = 0;
if(!token_ordering_field.empty()) {
points = document[token_ordering_field];
if(!token_ranking_field.empty()) {
points = document[token_ranking_field];
}
for(const std::pair<std::string, field> & field_pair: search_schema) {
@ -176,13 +170,13 @@ Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uin
"but is not found in the document.");
}
art_tree *t = facet_index.at(field_name);
facet_value & fvalue = facet_index.at(field_name);
if(field_pair.second.type == field_types::STRING) {
if(!document[field_name].is_string()) {
return Option<>(400, "Facet field `" + field_name + "` must be a STRING.");
}
const std::string & text = document[field_name];
index_string_field(text, points, t, seq_id, true);
const std::string & value = document[field_name];
fvalue.index_values(seq_id, { value });
} else if(field_pair.second.type == field_types::STRING_ARRAY) {
if(!document[field_name].is_array()) {
return Option<>(400, "Facet field `" + field_name + "` must be a STRING_ARRAY.");
@ -192,23 +186,23 @@ Option<uint32_t> Collection::index_in_memory(const nlohmann::json &document, uin
return Option<>(400, "Facet field `" + field_name + "` must be a STRING_ARRAY.");
}
std::vector<std::string> strings = document[field_name];
index_string_array_field(strings, points, t, seq_id, true);
const std::vector<std::string> & values = document[field_name];
fvalue.index_values(seq_id, values);
}
}
for(const std::string & rank_field: rank_fields) {
if(document.count(rank_field) == 0) {
return Option<>(400, "Field `" + rank_field + "` has been declared as a rank field in the schema, "
for(const field & sort_field: sort_fields) {
if(document.count(sort_field.name) == 0) {
return Option<>(400, "Field `" + sort_field.name + "` has been declared as a sort field in the schema, "
"but is not found in the document.");
}
if(!document[rank_field].is_number()) {
return Option<>(400, "Rank field `" + rank_field + "` must be an integer.");
if(!document[sort_field.name].is_number()) {
return Option<>(400, "Sort field `" + sort_field.name + "` must be a number.");
}
spp::sparse_hash_map<uint32_t, int64_t> *doc_to_score = rank_index.at(rank_field);
doc_to_score->emplace(seq_id, document[rank_fields[0]].get<int64_t>());
spp::sparse_hash_map<uint32_t, int64_t> *doc_to_score = sort_index.at(sort_field.name);
doc_to_score->emplace(seq_id, document[sort_field.name].get<int64_t>());
}
return Option<>(200);
@ -269,9 +263,9 @@ void Collection::index_string_field(const std::string & text, const uint32_t sco
tokens.push_back(text);
token_to_offsets[text].push_back(0);
} else {
StringUtils::tokenize(text, tokens, " ", true);
StringUtils::split(text, tokens, " ");
for(uint32_t i=0; i<tokens.size(); i++) {
auto token = tokens[i];
auto & token = tokens[i];
transform(token.begin(), token.end(), token.begin(), tolower);
token_to_offsets[token].push_back(i);
}
@ -301,7 +295,8 @@ void Collection::index_string_field(const std::string & text, const uint32_t sco
}
art_insert(t, key, key_len, &art_doc, num_hits);
delete art_doc.offsets;
delete [] art_doc.offsets;
art_doc.offsets = nullptr;
}
}
@ -330,36 +325,27 @@ void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, si
for(auto & a_facet: facets) {
// assumed that facet fields have already been validated upstream
const field & facet_field = facet_schema.at(a_facet.field_name);
const facet_value & fvalue = facet_index.at(facet_field.name);
// loop through the field, get all keys and intersect those ids with result ids
if(facet_index.count(facet_field.name) != 0) {
art_tree *t = facet_index.at(facet_field.name);
std::vector<art_leaf *> leaves;
art_topk_iter(t->root, MAX_SCORE, 10, leaves);
for(const art_leaf* leaf: leaves) {
const uint32_t* facet_ids = leaf->values->ids.uncompress();
size_t facet_ids_size = leaf->values->ids.getLength();
uint32_t* facet_results = new uint32_t[std::min(facet_ids_size, results_size)];
const size_t facet_results_size = Intersection::scalar(result_ids, results_size,
facet_ids, facet_ids_size, facet_results);
const std::string facet_value((const char *)leaf->key, leaf->key_len-1); // drop trailing null
a_facet.result_map.insert(std::pair<std::string, size_t>(facet_value, facet_results_size));
delete [] facet_ids;
delete [] facet_results;
for(auto i = 0; i < results_size; i++) {
uint32_t doc_seq_id = result_ids[i];
if(fvalue.doc_values.count(doc_seq_id) != 0) {
// for every result document, get the values associated and increment counter
const std::vector<uint32_t> & value_indices = fvalue.doc_values.at(doc_seq_id);
for(auto j = 0; j < value_indices.size(); j++) {
const std::string & facet_value = fvalue.index_value.at(value_indices.at(j));
a_facet.result_map[facet_value] += 1;
}
}
}
}
}
void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
const std::vector<std::string> & rank_fields, int & token_rank,
const std::vector<sort_field> & sort_fields, int & candidate_rank,
std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
size_t & total_results, size_t & num_found, const size_t & max_results) {
size_t & total_results, uint32_t** all_result_ids, size_t & all_result_ids_len,
const size_t & max_results) {
const size_t combination_limit = 10;
auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product);
@ -367,7 +353,11 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
for(long long n=0; n<N && n<combination_limit; ++n) {
// every element in `query_suggestion` contains a token and its associated hits
std::vector<art_leaf *> query_suggestion = next_suggestion(token_leaves, n);
token_rank++;
candidate_rank++;
/*for(auto i=0; i < query_suggestion.size(); i++) {
std::cout << "i: " << i << " - " << query_suggestion[i]->key << std::endl;
}*/
// initialize results with the starting element (for further intersection)
uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
@ -386,22 +376,32 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
if(filter_ids != nullptr) {
// intersect once again with filter ids
uint32_t* filtered_result_ids = new uint32_t[std::min(filter_ids_length, result_size)];
size_t filtered_results_size =
Intersection::scalar(filter_ids, filter_ids_length, result_ids, result_size, filtered_result_ids);
size_t filtered_results_size = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids,
result_size, filtered_result_ids);
uint32_t* new_all_result_ids;
all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, filtered_result_ids,
filtered_results_size, &new_all_result_ids);
delete [] *all_result_ids;
*all_result_ids = new_all_result_ids;
do_facets(facets, filtered_result_ids, filtered_results_size);
// go through each matching document id and calculate match score
score_results(rank_fields, token_rank, topster, query_suggestion, filtered_result_ids, filtered_results_size);
num_found += filtered_results_size;
score_results(sort_fields, candidate_rank, topster, query_suggestion, filtered_result_ids, filtered_results_size);
delete[] filtered_result_ids;
delete[] result_ids;
} else {
do_facets(facets, result_ids, result_size);
score_results(rank_fields, token_rank, topster, query_suggestion, result_ids, result_size);
num_found += result_size;
uint32_t* new_all_result_ids;
all_result_ids_len = ArrayUtils::or_scalar(*all_result_ids, all_result_ids_len, result_ids,
result_size, &new_all_result_ids);
delete [] *all_result_ids;
*all_result_ids = new_all_result_ids;
score_results(sort_fields, candidate_rank, topster, query_suggestion, result_ids, result_size);
delete[] result_ids;
}
@ -544,7 +544,8 @@ Option<uint32_t> Collection::do_filtering(uint32_t** filter_ids_out, const std::
filter_ids_length = result_ids_length;
} else {
uint32_t* filtered_results = new uint32_t[std::min((size_t)filter_ids_length, result_ids_length)];
filter_ids_length = Intersection::scalar(filter_ids, filter_ids_length, result_ids, result_ids_length, filtered_results);
filter_ids_length = ArrayUtils::and_scalar(filter_ids, filter_ids_length, result_ids,
result_ids_length, filtered_results);
delete [] filter_ids;
delete [] result_ids;
filter_ids = filtered_results;
@ -558,9 +559,8 @@ Option<uint32_t> Collection::do_filtering(uint32_t** filter_ids_out, const std::
nlohmann::json Collection::search(std::string query, const std::vector<std::string> search_fields,
const std::string & simple_filter_query, const std::vector<std::string> & facet_fields,
const std::vector<std::string> & rank_fields, const int num_typos,
const std::vector<sort_field> & sort_fields, const int num_typos,
const size_t num_results, const token_ordering token_order, const bool prefix) {
size_t num_found = 0;
nlohmann::json result = nlohmann::json::object();
std::vector<facet> facets;
@ -587,10 +587,15 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
facets.push_back(facet(field_name));
}
// validate rank fields
for(const std::string & field_name: rank_fields) {
if(rank_index.count(field_name) == 0) {
result["error"] = "Could not find a rank field named `" + field_name + "` in the schema.";
// validate sort fields
for(const sort_field & _sort_field: sort_fields) {
if(sort_index.count(_sort_field.name) == 0) {
result["error"] = "Could not find a sort field named `" + _sort_field.name + "` in the schema.";
return result;
}
if(_sort_field.order != sort_field_const::asc && _sort_field.order != sort_field_const::desc) {
result["error"] = "Order for sort field` " + _sort_field.name + "` should be either ASC or DESC.";
return result;
}
}
@ -605,20 +610,23 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
const uint32_t filter_ids_length = op_filter_ids_length.get();
// Order of `fields` are used to rank results
// Order of `fields` are used to sort results
auto begin = std::chrono::high_resolution_clock::now();
std::vector<std::pair<int, Topster<100>::KV>> field_order_kvs;
uint32_t* all_result_ids = nullptr;
size_t all_result_ids_len = 0;
for(int i = 0; i < search_fields.size(); i++) {
Topster<100> topster;
const std::string & field = search_fields[i];
// proceed to query search only when no filters are provided or when filtering produces results
if(simple_filter_query.size() == 0 || filter_ids_length > 0) {
search_field(query, field, filter_ids, filter_ids_length, facets, rank_fields, num_typos, num_results,
topster, num_found, token_order, prefix);
search_field(query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos, num_results,
topster, &all_result_ids, all_result_ids_len, token_order, prefix);
topster.sort();
}
// order of fields specified matter: matching docs from earlier fields are more important
for(auto t = 0; t < topster.size && t < num_results; t++) {
field_order_kvs.push_back(std::make_pair(search_fields.size() - i, topster.getKV(t)));
}
@ -626,18 +634,19 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
delete [] filter_ids;
// All fields are sorted descending
std::sort(field_order_kvs.begin(), field_order_kvs.end(),
[](const std::pair<int, Topster<100>::KV> & a, const std::pair<int, Topster<100>::KV> & b) {
if(a.second.match_score != b.second.match_score) return a.second.match_score > b.second.match_score;
if(a.second.primary_attr != b.second.primary_attr) return a.second.primary_attr > b.second.primary_attr;
if(a.second.secondary_attr != b.second.secondary_attr) return a.second.secondary_attr > b.second.secondary_attr;
if(a.first != b.first) return a.first > b.first;
if(a.first != b.first) return a.first > b.first; // field position
return a.second.key > b.second.key;
});
result["hits"] = nlohmann::json::array();
for(auto field_order_kv: field_order_kvs) {
for(auto & field_order_kv: field_order_kvs) {
std::string value;
const std::string &seq_id_key = get_seq_id_key((uint32_t) field_order_kv.second.key);
store->get(seq_id_key, value);
@ -645,7 +654,7 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
result["hits"].push_back(document);
}
result["found"] = num_found;
result["found"] = all_result_ids_len;
result["facet_counts"] = nlohmann::json::array();
@ -655,7 +664,19 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
facet_result["field_name"] = a_facet.field_name;
facet_result["counts"] = nlohmann::json::array();
for(auto kv: a_facet.result_map) {
// keep only top 10 facets
std::vector<std::pair<std::string, size_t>> value_to_count;
for (auto itr = a_facet.result_map.begin(); itr != a_facet.result_map.end(); ++itr) {
value_to_count.push_back(*itr);
}
std::sort(value_to_count.begin(), value_to_count.end(),
[=](std::pair<std::string, size_t>& a, std::pair<std::string, size_t>& b) {
return a.second > b.second;
});
for(auto i = 0; i < std::min((size_t)10, value_to_count.size()); i++) {
auto & kv = value_to_count[i];
nlohmann::json facet_value_count = nlohmann::json::object();
facet_value_count["value"] = kv.first;
facet_value_count["count"] = kv.second;
@ -681,11 +702,11 @@ nlohmann::json Collection::search(std::string query, const std::vector<std::stri
5. Sort the docs based on some ranking criteria
*/
void Collection::search_field(std::string & query, const std::string & field, uint32_t *filter_ids, size_t filter_ids_length,
std::vector<facet> & facets, const std::vector<std::string> & rank_fields, const int num_typos,
const size_t num_results, Topster<100> &topster, size_t & num_found,
const token_ordering token_order, const bool prefix) {
std::vector<facet> & facets, const std::vector<sort_field> & sort_fields, const int num_typos,
const size_t num_results, Topster<100> &topster, uint32_t** all_result_ids,
size_t & all_result_ids_len, const token_ordering token_order, const bool prefix) {
std::vector<std::string> tokens;
StringUtils::tokenize(query, tokens, " ", true);
StringUtils::split(query, tokens, " ");
const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
const size_t max_results = std::min(num_results, (size_t) Collection::MAX_RESULTS);
@ -714,7 +735,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui
const size_t combination_limit = 10;
auto product = []( long long a, std::vector<int>& b ) { return a*b.size(); };
int token_rank = 0;
int candidate_rank = 0;
long long n = 0;
long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
@ -737,8 +758,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui
const std::string token_cost_hash = token + std::to_string(costs[token_index]);
std::vector<art_leaf*> leaves;
/*std::cout << "\nSearching for: " << token << " - cost: " << costs[token_index] << ", token_rank: "
<< token_rank << std::endl;*/
/*std::cout << "\nSearching for: " << token << " - cost: " << costs[token_index] << ", candidate_rank: "
<< candidate_rank << std::endl;*/
if(token_cost_cache.count(token_cost_hash) != 0) {
leaves = token_cost_cache[token_cost_hash];
@ -758,9 +779,9 @@ void Collection::search_field(std::string & query, const std::string & field, ui
if(!leaves.empty()) {
//!log_leaves(costs[token_index], token, leaves);
token_leaves.push_back(leaves);
token_to_count[token] = leaves.at(0)->values->ids.getLength();
token_to_count[token] = std::max(token_to_count[token], leaves.at(0)->values->ids.getLength());
} else {
// No result at `cost = costs[token_index]` => remove cost for token and re-do combinations
// No result at `cost = costs[token_index]`. Remove costs until `cost` for token and re-do combinations
auto it = std::find(token_to_costs[token_index].begin(), token_to_costs[token_index].end(), costs[token_index]);
if(it != token_to_costs[token_index].end()) {
token_to_costs[token_index].erase(it);
@ -773,9 +794,9 @@ void Collection::search_field(std::string & query, const std::string & field, ui
}
}
// To continue outerloop on new cost combination
n = -1;
N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
break;
}
@ -784,8 +805,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui
if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) {
// If all tokens were found, go ahead and search for candidates with what we have so far
search_candidates(filter_ids, filter_ids_length, facets, rank_fields, token_rank, token_leaves, topster,
total_results, num_found, max_results);
search_candidates(filter_ids, filter_ids_length, facets, sort_fields, candidate_rank, token_leaves, topster,
total_results, all_result_ids, all_result_ids_len, max_results);
if (total_results >= max_results) {
// If we don't find enough results, we continue outerloop (looking at tokens with greater cost)
@ -818,8 +839,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui
}
}
return search_field(truncated_query, field, filter_ids, filter_ids_length, facets, rank_fields, num_typos,
num_results, topster, num_found, token_order, prefix);
return search_field(truncated_query, field, filter_ids, filter_ids_length, facets, sort_fields, num_typos,
num_results, topster, all_result_ids, all_result_ids_len, token_order, prefix);
}
}
@ -834,11 +855,11 @@ void Collection::log_leaves(const int cost, const std::string &token, const std:
}
}
void Collection::score_results(const std::vector<std::string> & rank_fields, const int & token_rank,
void Collection::score_results(const std::vector<sort_field> & sort_fields, const int & candidate_rank,
Topster<100> & topster, const std::vector<art_leaf *> &query_suggestion,
const uint32_t *result_ids, const size_t result_size) const {
const int max_token_rank = 250;
const int max_candidate_rank = 250;
spp::sparse_hash_map<art_leaf*, uint32_t*> leaf_to_indices;
if(query_suggestion.size() != 1) {
@ -853,13 +874,23 @@ void Collection::score_results(const std::vector<std::string> & rank_fields, con
spp::sparse_hash_map<uint32_t, int64_t> * primary_rank_scores = nullptr;
spp::sparse_hash_map<uint32_t, int64_t> * secondary_rank_scores = nullptr;
if(rank_fields.size() > 0) {
// Used for asc/desc ordering. NOTE: Topster keeps biggest keys (i.e. it's desc in nature)
int64_t primary_rank_factor = 1;
int64_t secondary_rank_factor = 1;
if(sort_fields.size() > 0) {
// assumed that rank field exists in the index - checked earlier in the chain
primary_rank_scores = rank_index.at(rank_fields[0]);
primary_rank_scores = sort_index.at(sort_fields[0].name);
if(sort_fields[0].order == sort_field_const::asc) {
primary_rank_factor = -1;
}
}
if(rank_fields.size() > 1) {
secondary_rank_scores = rank_index.at(rank_fields[1]);
if(sort_fields.size() > 1) {
secondary_rank_scores = sort_index.at(sort_fields[1].name);
if(sort_fields[1].order == sort_field_const::asc) {
secondary_rank_factor = -1;
}
}
for(auto i=0; i<result_size; i++) {
@ -895,18 +926,22 @@ void Collection::score_results(const std::vector<std::string> & rank_fields, con
mscore = MatchScore::match_score(seq_id, token_positions);
}
int token_rank_score = max_token_rank - token_rank;
int candidate_rank_score = max_candidate_rank - candidate_rank;
// Construct a single match_score from individual components (for multi-field sort)
const uint64_t match_score = (token_rank_score << 16) +
((uint64_t)(mscore.words_present) << 8) +
const uint64_t match_score = ((uint64_t)(mscore.words_present) << 16) +
(candidate_rank_score << 8) +
(MAX_SEARCH_TOKENS - mscore.distance);
int64_t primary_rank_score = primary_rank_scores->count(seq_id) > 0 ? primary_rank_scores->at(seq_id) : 0;
int64_t primary_rank_score = (primary_rank_scores && primary_rank_scores->count(seq_id) > 0) ?
primary_rank_scores->at(seq_id) : 0;
int64_t secondary_rank_score = (secondary_rank_scores && secondary_rank_scores->count(seq_id) > 0) ?
secondary_rank_scores->at(seq_id) : 0;
topster.add(seq_id, match_score, primary_rank_score, secondary_rank_score);
/*std::cout << "token_rank_score: " << token_rank_score << ", match_score: "
topster.add(seq_id, match_score,
primary_rank_factor * primary_rank_score,
secondary_rank_factor * secondary_rank_score);
/*std::cout << "candidate_rank_score: " << candidate_rank_score << ", match_score: "
<< match_score << ", primary_rank_score: " << primary_rank_score << ", seq_id: " << seq_id << std::endl;*/
}
@ -968,9 +1003,13 @@ void Collection::remove_and_shift_offset_index(sorted_array &offset_index, const
delete[] new_array;
}
void Collection::remove(std::string id) {
Option<std::string> Collection::remove(const std::string & id) {
std::string seq_id_str;
store->get(get_doc_id_key(id), seq_id_str);
StoreStatus status = store->get(get_doc_id_key(id), seq_id_str);
if(status == StoreStatus::NOT_FOUND) {
return Option<std::string>(404, "Could not find a document with id: " + id);
}
uint32_t seq_id = (uint32_t) std::stol(seq_id_str);
@ -979,48 +1018,108 @@ void Collection::remove(std::string id) {
nlohmann::json document = nlohmann::json::parse(parsed_document);
std::vector<std::string> tokens;
StringUtils::tokenize(document["title"], tokens, " ", true);
for(auto token: tokens) {
std::transform(token.begin(), token.end(), token.begin(), ::tolower);
const unsigned char *key = (const unsigned char *) token.c_str();
int key_len = (int) (token.length() + 1);
art_leaf* leaf = (art_leaf *) art_search(search_index.at("title"), key, key_len);
if(leaf != NULL) {
uint32_t seq_id_values[1] = {seq_id};
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
uint32_t start_offset = leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
leaf->values->offsets.getLength() :
leaf->values->offset_index.at(doc_index+1);
uint32_t doc_indices[1] = {doc_index};
remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);
leaf->values->offsets.remove_index(start_offset, end_offset);
leaf->values->ids.remove_values(seq_id_values, 1);
/*len = leaf->values->offset_index.getLength();
for(auto i=0; i<len; i++) {
std::cout << "i: " << i << ", val: " << leaf->values->offset_index.at(i) << std::endl;
for(auto & name_field: search_schema) {
std::vector<std::string> tokens;
if(name_field.second.type == field_types::STRING) {
StringUtils::split(document[name_field.first], tokens, " ");
} else if(name_field.second.type == field_types::STRING_ARRAY) {
tokens = document[name_field.first].get<std::vector<std::string>>();
} else if(name_field.second.type == field_types::INT32) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
int32_t value = document[name_field.first].get<int32_t>();
encode_int32(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
} else if(name_field.second.type == field_types::INT32_ARRAY) {
std::vector<int32_t> values = document[name_field.first].get<std::vector<int32_t>>();
for(const int32_t value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_int32(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
}
std::cout << "----" << std::endl;*/
} else if(name_field.second.type == field_types::INT64) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
int64_t value = document[name_field.first].get<int64_t>();
encode_int64(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
} else if(name_field.second.type == field_types::INT64_ARRAY) {
std::vector<int64_t> values = document[name_field.first].get<std::vector<int64_t>>();
for(const int64_t value: values) {
const int KEY_LEN = 8;
unsigned char key[KEY_LEN];
encode_int64(value, key);
tokens.push_back(std::string((char*)key, KEY_LEN));
}
}
if(leaf->values->ids.getLength() == 0) {
art_delete(search_index.at("title"), key, key_len);
for(auto & token: tokens) {
const unsigned char *key;
int key_len;
if(name_field.second.type == field_types::STRING_ARRAY || name_field.second.type == field_types::STRING) {
std::transform(token.begin(), token.end(), token.begin(), ::tolower);
key = (const unsigned char *) token.c_str();
key_len = (int) (token.length() + 1);
} else {
key = (const unsigned char *) token.c_str();
key_len = (int) (token.length());
}
art_leaf* leaf = (art_leaf *) art_search(search_index.at(name_field.first), key, key_len);
if(leaf != NULL) {
uint32_t seq_id_values[1] = {seq_id};
uint32_t doc_index = leaf->values->ids.indexOf(seq_id);
if(doc_index == leaf->values->ids.getLength()) {
// not found - happens when 2 tokens repeat in a field, e.g "is it or is is not?"
continue;
}
uint32_t start_offset = leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == leaf->values->ids.getLength() - 1) ?
leaf->values->offsets.getLength() :
leaf->values->offset_index.at(doc_index+1);
uint32_t doc_indices[1] = {doc_index};
remove_and_shift_offset_index(leaf->values->offset_index, doc_indices, 1);
leaf->values->offsets.remove_index(start_offset, end_offset);
leaf->values->ids.remove_values(seq_id_values, 1);
/*len = leaf->values->offset_index.getLength();
for(auto i=0; i<len; i++) {
std::cout << "i: " << i << ", val: " << leaf->values->offset_index.at(i) << std::endl;
}
std::cout << "----" << std::endl;*/
if(leaf->values->ids.getLength() == 0) {
art_values* values = (art_values*) art_delete(search_index.at(name_field.first), key, key_len);
delete values;
values = nullptr;
}
}
}
}
// remove facets if any
for(auto & field_facet_value: facet_index) {
field_facet_value.second.doc_values.erase(seq_id);
}
// remove sort index if any
for(auto & field_doc_value_map: sort_index) {
field_doc_value_map.second->erase(seq_id);
}
store->remove(get_doc_id_key(id));
store->remove(get_seq_id_key(seq_id));
return Option<std::string>(id);
}
std::string Collection::get_next_seq_id_key(std::string collection_name) {
std::string Collection::get_next_seq_id_key(const std::string & collection_name) {
return std::string(COLLECTION_NEXT_SEQ_PREFIX) + "_" + collection_name;
}
@ -1035,7 +1134,7 @@ std::string Collection::get_seq_id_key(uint32_t seq_id) {
return get_seq_id_collection_prefix() + "_" + std::string(bytes, bytes+4);
}
std::string Collection::get_doc_id_key(std::string doc_id) {
std::string Collection::get_doc_id_key(const std::string & doc_id) {
return std::to_string(collection_id) + "_" + DOC_ID_PREFIX + doc_id;
}
@ -1059,15 +1158,15 @@ std::vector<std::string> Collection::get_facet_fields() {
return facet_fields_copy;
}
std::vector<std::string> Collection::get_rank_fields() {
return rank_fields;
std::vector<field> Collection::get_sort_fields() {
return sort_fields;
}
spp::sparse_hash_map<std::string, field> Collection::get_schema() {
return search_schema;
};
std::string Collection::get_meta_key(std::string collection_name) {
std::string Collection::get_meta_key(const std::string & collection_name) {
return COLLECTION_META_PREFIX + collection_name;
}
@ -1075,6 +1174,6 @@ std::string Collection::get_seq_id_collection_prefix() {
return std::to_string(collection_id) + "_" + std::string(SEQ_ID_PREFIX);
}
std::string Collection::get_token_ordering_field() {
return token_ordering_field;
std::string Collection::get_token_ranking_field() {
return token_ranking_field;
}

View File

@ -23,7 +23,7 @@ void CollectionManager::init(Store *store) {
std::vector<std::string> collection_meta_jsons;
store->scan_fill(Collection::COLLECTION_META_PREFIX, collection_meta_jsons);
for(auto collection_meta_json: collection_meta_jsons) {
for(auto & collection_meta_json: collection_meta_jsons) {
nlohmann::json collection_meta = nlohmann::json::parse(collection_meta_json);
std::string this_collection_name = collection_meta[COLLECTION_NAME_KEY].get<std::string>();
@ -45,10 +45,15 @@ void CollectionManager::init(Store *store) {
store->get(Collection::get_next_seq_id_key(this_collection_name), collection_next_seq_id_str);
uint32_t collection_next_seq_id = (const uint32_t) std::stoi(collection_next_seq_id_str);
std::vector<std::string> collection_rank_fields =
collection_meta[COLLECTION_RANK_FIELDS_KEY].get<std::vector<std::string>>();
std::string token_ordering_field = collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY].get<std::string>();
std::vector<field> collection_sort_fields;
nlohmann::json sort_fields_map = collection_meta[COLLECTION_SORT_FIELDS_KEY];
for (nlohmann::json::iterator it = sort_fields_map.begin(); it != sort_fields_map.end(); ++it) {
collection_sort_fields.push_back({it.value()[fields::name], it.value()[fields::type]});
}
std::string token_ranking_field = collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY].get<std::string>();
Collection* collection = new Collection(this_collection_name,
collection_meta[COLLECTION_ID_KEY].get<uint32_t>(),
@ -56,8 +61,8 @@ void CollectionManager::init(Store *store) {
store,
search_fields,
facet_fields,
collection_rank_fields,
token_ordering_field);
collection_sort_fields,
token_ranking_field);
// Fetch records from the store and re-create memory index
std::vector<std::string> documents;
@ -82,8 +87,8 @@ void CollectionManager::init(Store *store) {
Collection* CollectionManager::create_collection(std::string name, const std::vector<field> & search_fields,
const std::vector<field> & facet_fields,
const std::vector<std::string> & rank_fields,
const std::string & token_ordering_field) {
const std::vector<field> & sort_fields,
const std::string & token_ranking_field) {
if(store->contains(Collection::get_meta_key(name))) {
return nullptr;
}
@ -91,7 +96,7 @@ Collection* CollectionManager::create_collection(std::string name, const std::ve
nlohmann::json collection_meta;
nlohmann::json search_fields_json = nlohmann::json::array();;
for(const field& search_field: search_fields) {
for(const field & search_field: search_fields) {
nlohmann::json field_val;
field_val[fields::name] = search_field.name;
field_val[fields::type] = search_field.type;
@ -99,22 +104,30 @@ Collection* CollectionManager::create_collection(std::string name, const std::ve
}
nlohmann::json facet_fields_json = nlohmann::json::array();;
for(const field& facet_field: facet_fields) {
for(const field & facet_field: facet_fields) {
nlohmann::json field_val;
field_val[fields::name] = facet_field.name;
field_val[fields::type] = facet_field.type;
facet_fields_json.push_back(field_val);
}
nlohmann::json sort_fields_json = nlohmann::json::array();;
for(const field & sort_field: sort_fields) {
nlohmann::json sort_field_val;
sort_field_val[fields::name] = sort_field.name;
sort_field_val[fields::type] = sort_field.type;
sort_fields_json.push_back(sort_field_val);
}
collection_meta[COLLECTION_NAME_KEY] = name;
collection_meta[COLLECTION_ID_KEY] = next_collection_id;
collection_meta[COLLECTION_SEARCH_FIELDS_KEY] = search_fields_json;
collection_meta[COLLECTION_FACET_FIELDS_KEY] = facet_fields_json;
collection_meta[COLLECTION_RANK_FIELDS_KEY] = rank_fields;
collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY] = token_ordering_field;
collection_meta[COLLECTION_SORT_FIELDS_KEY] = sort_fields_json;
collection_meta[COLLECTION_TOKEN_ORDERING_FIELD_KEY] = token_ranking_field;
Collection* new_collection = new Collection(name, next_collection_id, 0, store, search_fields, facet_fields,
rank_fields, token_ordering_field);
sort_fields, token_ranking_field);
store->insert(Collection::get_meta_key(name), collection_meta.dump());
store->insert(Collection::get_next_seq_id_key(name), std::to_string(0));
@ -135,12 +148,6 @@ Collection* CollectionManager::get_collection(std::string collection_name) {
return nullptr;
}
CollectionManager::~CollectionManager() {
for(auto kv: collections) {
drop_collection(kv.first);
}
}
bool CollectionManager::drop_collection(std::string collection_name) {
Collection* collection = get_collection(collection_name);
if(collection == nullptr) {

219
src/http_server.cpp Normal file
View File

@ -0,0 +1,219 @@
#include "http_server.h"
#include "string_utils.h"
#include <cerrno>
#include <regex>
#include <signal.h>
#include <unistd.h>
// Definitions of HttpServer's static data members:
//  - config:     h2o global configuration, initialised in the constructor
//  - ctx:        h2o context whose event loop is driven by run()
//  - accept_ctx: accept context passed to h2o_accept() for every new connection
//  - routes:     route table filled by get()/post()/put()/del() and scanned by catch_all_handler()
h2o_globalconf_t HttpServer::config;
h2o_context_t HttpServer::ctx;
h2o_accept_ctx_t HttpServer::accept_ctx;
std::vector<route_path> HttpServer::routes;
// Remembers where to listen, initialises the shared h2o configuration and registers a
// "default" host whose root path ("/") is served by catch_all_handler.
HttpServer::HttpServer(std::string listen_address, uint32_t listen_port):
        listen_address(listen_address), listen_port(listen_port) {
    h2o_config_init(&config);

    const h2o_iovec_t default_host_name = h2o_iovec_init(H2O_STRLIT("default"));
    hostconf = h2o_config_register_host(&config, default_host_name, 65535);

    // every request is funnelled through a single handler which does its own routing
    register_handler(hostconf, "/", catch_all_handler);
}
// Read callback of the listening socket: accepts the pending connection and hands it to h2o.
// Does nothing when the event loop reports an error or when no connection could be accepted.
void HttpServer::on_accept(h2o_socket_t *listener, const char *err) {
    if (err == NULL) {
        h2o_socket_t *accepted = h2o_evloop_socket_accept(listener);
        if (accepted != NULL) {
            h2o_accept(&accept_ctx, accepted);
        }
    }
}
// Creates a TCP socket bound to listen_address:listen_port, puts it in listening mode and
// registers it with the h2o event loop so that on_accept() runs for every incoming connection.
//
// Returns 0 on success and -1 on failure. On failure `errno` describes the problem (run() prints
// it through strerror) and the socket descriptor, if one was created, has been closed.
int HttpServer::create_listener(void) {
    struct sockaddr_in addr;
    int reuseaddr_flag = 1;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(listen_port);

    // inet_pton returns 1 only for a well-formed IPv4 address. Ignoring the result would leave
    // sin_addr zeroed for a bad address and the server would silently bind to 0.0.0.0.
    const int parsed = inet_pton(AF_INET, listen_address.c_str(), &(addr.sin_addr));
    if (parsed != 1) {
        if (parsed == 0) {
            // a malformed address does not set errno by itself
            errno = EINVAL;
        }
        return -1;
    }

    const int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd == -1) {
        return -1;
    }

    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr_flag, sizeof(reuseaddr_flag)) != 0 ||
        bind(fd, (struct sockaddr *)&addr, sizeof(addr)) != 0 ||
        listen(fd, SOMAXCONN) != 0) {
        // release the descriptor, but keep the errno of the call that actually failed
        const int saved_errno = errno;
        close(fd);
        errno = saved_errno;
        return -1;
    }

    ctx.globalconf->server_name = h2o_strdup(NULL, "", SIZE_MAX);

    h2o_socket_t *sock = h2o_evloop_socket_create(ctx.loop, fd, H2O_SOCKET_FLAG_DONT_READ);
    h2o_socket_read_start(sock, on_accept);

    return 0;
}
// Starts serving: ignores SIGPIPE, initialises the h2o context and accept context, opens the
// listening socket and then drives the event loop for as long as it keeps returning 0.
// Returns 1 when the listener could not be created, 0 once the event loop stops.
int HttpServer::run() {
    signal(SIGPIPE, SIG_IGN);

    h2o_context_init(&ctx, h2o_evloop_create(), &config);
    accept_ctx.ctx = &ctx;
    accept_ctx.hosts = config.hosts;

    const bool listening = (create_listener() == 0);
    if (!listening) {
        std::cerr << "Failed to listen on " << listen_address << ":" << listen_port << std::endl
                  << "Error: " << strerror(errno) << std::endl;
        return 1;
    }

    for (;;) {
        if (h2o_evloop_run(ctx.loop) != 0) {
            break;
        }
    }

    return 0;
}
// Registers `path` on the given host and attaches a freshly created handler whose request
// callback is `on_req`. Returns the path configuration created for `path`.
h2o_pathconf_t* HttpServer::register_handler(h2o_hostconf_t *hostconf, const char *path,
                                             int (*on_req)(h2o_handler_t *, h2o_req_t *)) {
    h2o_pathconf_t *path_config = h2o_config_register_path(hostconf, path, 0);

    h2o_handler_t *request_handler = h2o_create_handler(path_config, sizeof(h2o_handler_t));
    request_handler->on_req = on_req;

    return path_config;
}
// Maps an HTTP status code to its reason phrase.
// Only the codes listed below are known; any other code yields an empty string.
const char* HttpServer::get_status_reason(uint32_t status_code) {
    if (status_code == 200) {
        return "OK";
    }
    if (status_code == 201) {
        return "Created";
    }
    if (status_code == 400) {
        return "Bad Request";
    }
    if (status_code == 404) {
        return "Not Found";
    }
    if (status_code == 409) {
        return "Conflict";
    }
    if (status_code == 500) {
        return "Internal Server Error";
    }
    return "";
}
// Parses a URL query string into a name => value map.
//
// Every `key=value` pair matched by the pattern is stored with its value URL-decoded. A leading
// '?' is tolerated because it can never be part of a key. When a key occurs more than once, its
// values are joined with "&&" in order of appearance (e.g. "a=1&a=2" gives {"a": "1&&2"}).
std::map<std::string, std::string> HttpServer::parse_query(const std::string& query) {
    // compiled once - constructing a std::regex for every request is needlessly expensive
    static const std::regex pattern("([\\w+%]+)=([^&]*)");

    std::map<std::string, std::string> query_map;

    const auto pairs_end = std::sregex_iterator();
    for (auto it = std::sregex_iterator(query.begin(), query.end(), pattern); it != pairs_end; ++it) {
        std::string key = (*it)[1].str();
        std::string raw_value = (*it)[2].str();
        std::string value = StringUtils::url_decode(raw_value);

        // The duplicate check must be on the KEY. Checking the value (as was done before) made
        // repeated keys overwrite each other and produced a stray "&&" prefix whenever a value
        // happened to equal some earlier key.
        auto existing = query_map.find(key);
        if (existing == query_map.end()) {
            query_map.emplace(key, value);
        } else {
            existing->second += "&&" + value;
        }
    }

    return query_map;
}
// Single h2o request handler that performs application level routing.
//
// The request path (without its query string) is split on "/" and compared part by part with
// every registered route of the same HTTP method. A route part starting with ':' is a
// placeholder that matches anything; its value is exposed to the route handler as a request
// parameter named after the placeholder. The first matching route handles the request and its
// http_res is sent back as JSON. When nothing matches, a JSON 404 is returned.
//
// Always returns 0, i.e. the request is always answered by this handler.
int HttpServer::catch_all_handler(h2o_handler_t *self, h2o_req_t *req) {
    const std::string http_method(req->method.base, req->method.len);
    const std::string path(req->path.base, req->path.len);

    // Writes status line, content type and body, then finishes the response.
    auto send_json = [req](uint32_t status_code, const std::string & json_body) {
        h2o_generator_t generator = {NULL, NULL};
        h2o_iovec_t body = h2o_strdup(&req->pool, json_body.c_str(), SIZE_MAX);
        req->res.status = status_code;
        req->res.reason = get_status_reason(status_code);
        h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8"));
        h2o_start_response(req, &generator);
        h2o_send(req, &body, 1, 1);
    };

    std::vector<std::string> path_with_query_parts;
    StringUtils::split(path, path_with_query_parts, "?");

    // NOTE(review): split() is defined elsewhere; guard in case it yields nothing for odd input
    const std::string path_without_query = path_with_query_parts.empty() ?
                                           std::string() : path_with_query_parts[0];

    std::vector<std::string> path_parts;
    StringUtils::split(path_without_query, path_parts, "/");

    h2o_iovec_t query = req->query_at != SIZE_MAX ?
                        h2o_iovec_init(req->path.base + req->query_at, req->path.len - req->query_at) :
                        h2o_iovec_init(H2O_STRLIT(""));

    std::string query_str(query.base, query.len);
    std::map<std::string, std::string> query_map = parse_query(query_str);
    const std::string req_body(req->entity.base, req->entity.len);

    for(const route_path & rpath: routes) {
        if(rpath.path_parts.size() != path_parts.size() || rpath.http_method != http_method) {
            continue;
        }

        bool matched = true;
        for(size_t i = 0; i < rpath.path_parts.size(); i++) {
            const std::string & rpart = rpath.path_parts[i];
            const bool is_placeholder = !rpart.empty() && rpart[0] == ':';
            if(!is_placeholder && rpart != path_parts[i]) {
                matched = false;
                break;
            }
        }

        if(!matched) {
            continue;
        }

        // routes match - iterate and extract path params
        for(size_t i = 0; i < rpath.path_parts.size(); i++) {
            const std::string & rpart = rpath.path_parts[i];
            if(!rpart.empty() && rpart[0] == ':') {
                // Assign rather than emplace: a path parameter must win over a query-string
                // parameter of the same name, otherwise "?name=other" could redirect the handler
                // to a different resource than the one addressed by the URL path.
                query_map[rpart.substr(1)] = path_parts[i];
            }
        }

        http_req request = {query_map, req_body};
        http_res response;
        (rpath.handler)(request, response);

        send_json(response.status_code, response.body);
        return 0;
    }

    send_json(404, "{ \"message\": \"Not Found\"}");
    return 0;
}
// Registers `handler` for GET requests whose path matches `path`.
// The path is split on "/" once, at registration time; parts starting with ':' act as placeholders.
void HttpServer::get(const std::string & path, void (*handler)(http_req &, http_res &)) {
    std::vector<std::string> segments;
    StringUtils::split(path, segments, "/");
    routes.push_back(route_path{"GET", segments, handler});
}
// Registers `handler` for POST requests whose path matches `path`.
// The path is split on "/" once, at registration time; parts starting with ':' act as placeholders.
void HttpServer::post(const std::string & path, void (*handler)(http_req &, http_res &)) {
    std::vector<std::string> segments;
    StringUtils::split(path, segments, "/");
    routes.push_back(route_path{"POST", segments, handler});
}
// Registers `handler` for PUT requests whose path matches `path`.
// The path is split on "/" once, at registration time; parts starting with ':' act as placeholders.
void HttpServer::put(const std::string & path, void (*handler)(http_req &, http_res &)) {
    std::vector<std::string> segments;
    StringUtils::split(path, segments, "/");
    routes.push_back(route_path{"PUT", segments, handler});
}
// Registers `handler` for DELETE requests whose path matches `path`.
// The path is split on "/" once, at registration time; parts starting with ':' act as placeholders.
void HttpServer::del(const std::string & path, void (*handler)(http_req &, http_res &)) {
    std::vector<std::string> segments;
    StringUtils::split(path, segments, "/");
    routes.push_back(route_path{"DELETE", segments, handler});
}
// The destructor body is intentionally empty: it performs no h2o teardown and does not close
// the listening socket.
// NOTE(review): confirm that leaving the static h2o state alive for the process lifetime is intended.
HttpServer::~HttpServer() {
}

View File

@ -1,32 +0,0 @@
#include "intersection.h"
// Writes the values common to A and B into `out` and returns how many were written.
//
// Both inputs are walked front to back in a single merge-style pass, so they are expected to be
// sorted in ascending order. `out` must have room for min(lenA, lenB) values.
size_t Intersection::scalar(const uint32_t *A, const size_t lenA,
                            const uint32_t *B, const size_t lenB, uint32_t *out) {
    if (lenA == 0 || lenB == 0) {
        return 0;
    }

    const uint32_t *const out_begin = out;
    const uint32_t *const endA = A + lenA;
    const uint32_t *const endB = B + lenB;

    while (A != endA && B != endB) {
        if (*A < *B) {
            ++A;                // A is behind - catch up
        } else if (*B < *A) {
            ++B;                // B is behind - catch up
        } else {
            *out++ = *A;        // common value
            ++A;
            ++B;
        }
    }

    return static_cast<size_t>(out - out_begin);
}

View File

@ -17,14 +17,14 @@ int main(int argc, char* argv[]) {
system("rm -rf /tmp/typesense-data && mkdir -p /tmp/typesense-data");
std::vector<field> fields_to_index = {field("title", field_types::STRING)};
std::vector<std::string> rank_fields = {"points"};
std::vector<field> sort_fields = { field("points", "INT32")};
Store *store = new Store("/tmp/typesense-data");
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store);
Collection *collection = collectionManager.get_collection("collection");
if(collection == nullptr) {
collection = collectionManager.create_collection("collection", fields_to_index, {}, rank_fields);
collection = collectionManager.create_collection("collection", fields_to_index, {}, sort_fields);
}
std::ifstream infile("/Users/kishore/Downloads/hnstories_small.jsonl");
@ -48,7 +48,7 @@ int main(int argc, char* argv[]) {
while(counter < 3000) {
auto i = counter % 5;
auto results = collection->search(queries[i], search_fields, "", { }, {"points"}, 1, 100, MAX_SCORE, 0);
auto results = collection->search(queries[i], search_fields, "", { }, {sort_field("points", "DESC")}, 1, 100, MAX_SCORE, 0);
results_total += results.size();
counter++;
}

View File

@ -8,72 +8,95 @@
#include <unordered_map>
#include <queue>
#include "string_utils.h"
#include <sys/resource.h>
#include "collection.h"
#include "collection_manager.h"
using namespace std;
void find_indices(const uint32_t *result_ids, int low, int high, std::vector<uint32_t> & results) {
if(high >= low) {
size_t pivot = (low + high) / 2;
//std::cout << pivot << std::endl;
results.at(pivot) = result_ids[pivot];
find_indices(result_ids, low, pivot-1, results);
find_indices(result_ids, pivot+1, high, results);
}
}
int main(int argc, char* argv[]) {
std::vector<uint32_t> results(3);
uint32_t *result_ids = new uint32_t[3];
/*for(auto i = 0; i < 100; i++) {
result_ids[i] = i;
}*/
result_ids[0] = 6;
result_ids[1] = 19;
result_ids[2] = 21;
find_indices(result_ids, 0, 2, results);
//std::sort(results.begin(), results.end());
for(auto i : results) {
std::cout << i << std::endl;
}
return 0;
const std::string state_dir_path = "/tmp/typesense-data";
std::vector<field> fields_to_index = {field("title", field_types::STRING)};
std::vector<std::string> rank_fields = {"points"};
Store *store = new Store("/tmp/typesense-data");
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store);
Collection *collection = collectionManager.get_collection("collection");
if(collection == nullptr) {
collection = collectionManager.create_collection("collection", fields_to_index, {}, rank_fields);
std::ifstream infile(std::string(ROOT_DIR)+"test/documents.jsonl");
//std::ifstream infile(argv[1]);
std::vector<field> fields_to_index = {
field("lang", field_types::STRING),
field("description", field_types::STRING),
field("topics", field_types::STRING_ARRAY),
field("stars", field_types::INT32),
field("repo_name", field_types::STRING),
field("org", field_types::STRING)
};
std::vector<field> facet_fields_index = {
field("lang", field_types::STRING),
field("org", field_types::STRING),
field("topics", field_types::STRING_ARRAY)
};
std::vector<field> sort_fields = {
field("stars", "INT32")
};
Collection *collection = collectionManager.get_collection("github_top1k");
if(collection == nullptr) {
collection = collectionManager.create_collection("github_top1k", fields_to_index, facet_fields_index, sort_fields);
}
int j = 0;
while(j < 1000) {
j++;
std::ifstream infile(argv[1]);
std::string json_line;
cout << "BEGINNING Iteration: " << j << endl << flush;
auto begin = std::chrono::high_resolution_clock::now();
int doc_id = 0;
while (std::getline(infile, json_line)) {
collection->add(json_line);
nlohmann::json document = nlohmann::json::parse(json_line);
//document["id"] = std::to_string(doc_id);
document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
collection->add(document.dump());
doc_id++;
}
infile.close();
cout << "FINISHED INDEXING!" << endl << flush;
long long int timeMillis =
std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
std::cout << "Time taken for insertion: " << timeMillis << "ms" << std::endl;
begin = std::chrono::high_resolution_clock::now();
std::ifstream infile2(argv[1]);
doc_id = 0;
while (std::getline(infile2, json_line)) {
nlohmann::json document = nlohmann::json::parse(json_line);
//document["id"] = std::to_string(doc_id);
document["id"] = document["org"].get<std::string>() + ":" + document["repo_name"].get<std::string>();
collection->remove(document["id"]);
doc_id++;
}
infile2.close();
timeMillis =
std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
struct rusage r_usage;
getrusage(RUSAGE_SELF,&r_usage);
std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl;
std::cout << "Time taken for deletion: " << timeMillis << "ms" << std::endl;
}
//collection->remove("foo");
auto begin = std::chrono::high_resolution_clock::now();
std::vector<std::string> search_fields = {"title"};
collection->search("the", search_fields, "", {}, {"points"}, 1, 100, MAX_SCORE, 0);
long long int timeMillis =
std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
cout << "Time taken: " << timeMillis << "us" << endl;
delete collection;
delete store;
return 0;
}

View File

@ -1,264 +0,0 @@
#define H2O_USE_LIBUV 0
#include <errno.h>
#include <limits.h>
#include <netinet/in.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <fstream>
#include <iostream>
#include <chrono>
#include <vector>
#include <string>
#include <map>
#include <regex>
#include "string_utils.h"
#include "collection.h"
#include "collection_manager.h"
#include "option.h"
#include <sys/resource.h>
#include "h2o.h"
#include "h2o/http1.h"
#include "h2o/http2.h"
#include "h2o/memcached.h"
// h2o state used by this file: global configuration, event-loop context and the accept
// context that on_accept() passes to h2o_accept().
static h2o_globalconf_t config;
static h2o_context_t ctx;
static h2o_accept_ctx_t accept_ctx;
// Schema of the single collection served by this binary: fields to index and the field to rank by.
std::vector<field> search_fields = {field("title", field_types::STRING), field("points", field_types::INT32)};
std::vector<std::string> rank_fields = {"points"};
// Collection that the request handlers below search, add to and remove from.
// It is not assigned in the visible part of this file - presumably set up during start-up.
Collection *collection;
// Registers `path` on the given host and attaches a freshly created handler whose request
// callback is `on_req`. Returns the path configuration created for `path`.
static h2o_pathconf_t *register_handler(h2o_hostconf_t *hostconf, const char *path,
                                        int (*on_req)(h2o_handler_t *, h2o_req_t *)) {
    h2o_pathconf_t *path_config = h2o_config_register_path(hostconf, path, 0);

    h2o_handler_t *request_handler = h2o_create_handler(path_config, sizeof(h2o_handler_t));
    request_handler->on_req = on_req;

    return path_config;
}
// Parses a URL query string into a name => value map.
//
// Every `key=value` pair matched by the pattern is stored with its value URL-decoded. A leading
// '?' is tolerated because it can never be part of a key. When a key occurs more than once, its
// values are joined with "&&" in order of appearance (e.g. "a=1&a=2" gives {"a": "1&&2"}).
std::map<std::string, std::string> parse_query(const std::string& query) {
    std::map<std::string, std::string> query_map;
    std::regex pattern("([\\w+%]+)=([^&]*)");

    const auto pairs_end = std::sregex_iterator();
    for (auto it = std::sregex_iterator(query.begin(), query.end(), pattern); it != pairs_end; ++it) {
        std::string key = (*it)[1].str();
        std::string raw_value = (*it)[2].str();
        std::string value = StringUtils::url_decode(raw_value);

        // The duplicate check must be on the KEY. Checking the value (as was done before) made
        // repeated keys overwrite each other and produced a stray "&&" prefix whenever a value
        // happened to equal some earlier key.
        auto existing = query_map.find(key);
        if (existing == query_map.end()) {
            query_map.emplace(key, value);
        } else {
            existing->second += "&&" + value;
        }
    }

    return query_map;
}
// h2o handler for search requests.
//
// Recognised query parameters: `q` (the query), `filters`, `num_typos` (default "2"),
// `token_ordering` ("MAX_SCORE" or anything else for FREQUENCY) and `prefix` (defaulted to
// "false" but not passed on - the search call below always uses `false`).
// Searches the "title" field of the global collection and responds with the result as JSON.
// A `num_typos` value that is not an integer is answered with HTTP 400 instead of letting
// std::stoi throw through the C callback. Always returns 0.
static int get_search(h2o_handler_t *self, h2o_req_t *req) {
    static h2o_generator_t generator = {NULL, NULL};

    h2o_iovec_t query = req->query_at != SIZE_MAX ?
                        h2o_iovec_init(req->path.base + req->query_at, req->path.len - req->query_at) :
                        h2o_iovec_init(H2O_STRLIT(""));

    std::string query_str(query.base, query.len);
    std::map<std::string, std::string> query_map = parse_query(query_str);

    const char *NUM_TYPOS = "num_typos";
    const char *PREFIX = "prefix";
    const char *TOKEN_ORDERING = "token_ordering";
    const char *FILTERS = "filters";

    if(query_map.count(NUM_TYPOS) == 0) {
        query_map[NUM_TYPOS] = "2";
    }

    if(query_map.count(PREFIX) == 0) {
        query_map[PREFIX] = "false";
    }

    if(query_map.count(TOKEN_ORDERING) == 0) {
        query_map[TOKEN_ORDERING] = "FREQUENCY";
    }

    std::string filter_str = query_map.count(FILTERS) != 0 ? query_map[FILTERS] : "";
    token_ordering token_order = (query_map[TOKEN_ORDERING] == "MAX_SCORE") ? MAX_SCORE : FREQUENCY;

    // `num_typos` comes straight from the client: std::stoi throws on non-numeric or
    // out-of-range input, and an exception must never escape into the C event loop.
    int num_typos = 0;
    bool num_typos_valid = true;
    try {
        num_typos = std::stoi(query_map[NUM_TYPOS]);
    } catch(const std::exception &) {
        num_typos_valid = false;
    }

    if(!num_typos_valid) {
        nlohmann::json error_response;
        error_response["message"] = "Parameter `num_typos` must be an integer.";
        const std::string error_str = error_response.dump();

        h2o_iovec_t error_body = h2o_strdup(&req->pool, error_str.c_str(), SIZE_MAX);
        req->res.status = 400;
        req->res.reason = "Bad Request";
        h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8"));
        h2o_start_response(req, &generator);
        h2o_send(req, &error_body, 1, 1);
        return 0;
    }

    auto begin = std::chrono::high_resolution_clock::now();

    std::vector<std::string> search_fields = {"title"};
    nlohmann::json result = collection->search(query_map["q"], search_fields, filter_str, { },
                                               {"points"}, num_typos, 100, token_order, false);
    std::string json_str = result.dump();

    h2o_iovec_t body = h2o_strdup(&req->pool, json_str.c_str(), SIZE_MAX);
    req->res.status = 200;
    req->res.reason = "OK";
    h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8"));
    h2o_start_response(req, &generator);
    h2o_send(req, &body, 1, 1);

    // named timeMillis historically, but the value is in microseconds
    long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
    std::cout << "Time taken: " << timeMillis << "us" << std::endl;

    return 0;
}
// h2o handler that adds the request body as a document to the global collection.
// Responds 201 with {"id": ...} when the add succeeds and 400 with {"message": ...} otherwise.
static int post_add_document(h2o_handler_t *self, h2o_req_t *req) {
    static h2o_generator_t generator = {NULL, NULL};

    std::string document(req->entity.base, req->entity.len);
    Option<std::string> add_result = collection->add(document);

    nlohmann::json payload;
    if(add_result.ok()) {
        req->res.status = 201;
        req->res.reason = "CREATED";
        payload["id"] = add_result.get();
    } else {
        req->res.status = 400;
        req->res.reason = "BAD REQUEST";
        payload["message"] = add_result.error();
    }

    h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE, H2O_STRLIT("application/json; charset=utf-8"));
    h2o_start_response(req, &generator);

    // h2o_strdup copies the text into the request pool, so the local string may go away afterwards
    const std::string serialized = payload.dump();
    h2o_iovec_t body = h2o_strdup(&req->pool, serialized.c_str(), SIZE_MAX);
    h2o_send(req, &body, 1, 1);

    return 0;
}
// HTTP handler that removes a document from the global `collection`.
//
// The document id is read from the "id" key of the parsed query string. The time spent in
// `collection->remove()` is printed to stdout in microseconds. The reply is always
// status 200 with a JSON body echoing the id and a "SUCCESS" status.
// Always returns 0 to h2o.
static int delete_remove_document(h2o_handler_t *self, h2o_req_t *req) {
    static h2o_generator_t generator = {NULL, NULL};

    // Slice the query portion out of the path; fall back to an empty string when
    // the request has no query (query_at == SIZE_MAX).
    h2o_iovec_t raw_query;
    if(req->query_at == SIZE_MAX) {
        raw_query = h2o_iovec_init(H2O_STRLIT(""));
    } else {
        raw_query = h2o_iovec_init(req->path.base + req->query_at, req->path.len - req->query_at);
    }

    std::string query_text(raw_query.base, raw_query.len);
    std::map<std::string, std::string> params = parse_query(query_text);
    std::string doc_id = params["id"];

    const auto started_at = std::chrono::high_resolution_clock::now();
    collection->remove(doc_id);
    const auto elapsed = std::chrono::high_resolution_clock::now() - started_at;

    long long int elapsed_micros = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
    std::cout << "Time taken: " << elapsed_micros << "us" << std::endl;

    nlohmann::json reply;
    reply["id"] = doc_id;
    reply["status"] = "SUCCESS";

    req->res.status = 200;
    req->res.reason = "OK";
    h2o_add_header(&req->pool, &req->res.headers, H2O_TOKEN_CONTENT_TYPE,
                   H2O_STRLIT("application/json; charset=utf-8"));
    h2o_start_response(req, &generator);

    const std::string serialized = reply.dump();
    h2o_iovec_t body = h2o_strdup(&req->pool, serialized.c_str(), SIZE_MAX);
    h2o_send(req, &body, 1, 1);

    return 0;
}
// Read callback installed on the listening socket.
// Does nothing when an error is reported or when no connection could be accepted;
// otherwise passes the accepted socket to h2o together with the global accept context.
static void on_accept(h2o_socket_t *listener, const char *err) {
    if (err != NULL) {
        return;
    }

    h2o_socket_t *accepted = h2o_evloop_socket_accept(listener);
    if (accepted != NULL) {
        h2o_accept(&accept_ctx, accepted);
    }
}
// Creates the listening TCP socket on 127.0.0.1:1088 and registers it with the
// global h2o event loop (`ctx.loop`), using `on_accept` as the read callback.
//
// Returns 0 on success and -1 when the socket could not be created, configured,
// bound or put into listening mode. On failure `errno` holds the error of the
// call that failed, so the caller can report it with strerror(errno).
static int create_listener(void) {
    struct sockaddr_in addr;
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(0x7f000001);  // 127.0.0.1
    addr.sin_port = htons(1088);

    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd == -1) {
        return -1;
    }

    int reuseaddr_flag = 1;
    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr_flag, sizeof(reuseaddr_flag)) != 0 ||
        bind(fd, (struct sockaddr *)&addr, sizeof(addr)) != 0 ||
        listen(fd, SOMAXCONN) != 0) {
        // Release the descriptor instead of leaking it, but keep the errno of the
        // failing call intact since close() may overwrite it.
        int saved_errno = errno;
        close(fd);
        errno = saved_errno;
        return -1;
    }

    h2o_socket_t *sock = h2o_evloop_socket_create(ctx.loop, fd, H2O_SOCKET_FLAG_DONT_READ);
    h2o_socket_read_start(sock, on_accept);

    return 0;
}
// Bulk-loads documents into the global `collection` from a file, passing each
// line of the file to `collection->add()`.
//
// path_to_docs: path of the file to read, one document per line.
//
// If the file cannot be opened, an error is written to stderr and nothing is
// indexed. Otherwise a completion message and the process' `ru_maxrss` value
// (as reported by getrusage) are written to stdout once all lines are consumed.
void index_documents(std::string path_to_docs) {
    std::ifstream infile(path_to_docs);

    if(!infile.is_open()) {
        // Previously an unreadable path was indistinguishable from an empty file.
        std::cerr << "Could not open documents file: " << path_to_docs << std::endl;
        return;
    }

    std::string json_line;
    while (std::getline(infile, json_line)) {
        collection->add(json_line);
    }

    // The stream is closed by its destructor when the function returns.
    std::cout << "FINISHED INDEXING!" << std::endl << std::flush;

    struct rusage r_usage;
    getrusage(RUSAGE_SELF,&r_usage);
    std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl;
}
int main(int argc, char **argv) {
signal(SIGPIPE, SIG_IGN);
Store *store = new Store("/tmp/typesense-data");
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(store);
collection = collectionManager.get_collection("collection");
if(collection == nullptr) {
collection = collectionManager.create_collection("collection", search_fields, {}, rank_fields);
//index_documents(std::string(ROOT_DIR)+"test/documents.jsonl");
if(argc > 1) {
index_documents(argv[1]);
}
}
h2o_config_init(&config);
h2o_hostconf_t *hostconf = h2o_config_register_host(&config, h2o_iovec_init(H2O_STRLIT("default")), 65535);
register_handler(hostconf, "/add", post_add_document);
register_handler(hostconf, "/delete", delete_remove_document);
register_handler(hostconf, "/search", get_search);
h2o_context_init(&ctx, h2o_evloop_create(), &config);
accept_ctx.ctx = &ctx;
accept_ctx.hosts = config.hosts;
if (create_listener() != 0) {
fprintf(stderr, "failed to listen to 127.0.0.1:1088:%s\n", strerror(errno));
return 1;
}
while (h2o_evloop_run(ctx.loop) == 0);
return 0;
}

View File

@ -0,0 +1,25 @@
#include <cmdline.h>
#include "http_server.h"
#include "api.h"
int main(int argc, char **argv) {
cmdline::parser options;
options.add<std::string>("data-dir", 'd', "Directory where data will be stored.", true);
options.add<std::string>("listen-address", 'a', "Address to which Typesense server binds.", false, "0.0.0.0");
options.add<uint32_t>("listen-port", 'p', "Port on which Typesense server listens.", false, 8080);
options.parse_check(argc, argv);
Store store(options.get<std::string>("data-dir"));
CollectionManager & collectionManager = CollectionManager::get_instance();
collectionManager.init(&store);
HttpServer server(options.get<std::string>("listen-address"), options.get<uint32_t>("listen-port"));
server.post("/collection", post_create_collection);
server.post("/collection/:collection", post_add_document);
server.get("/collection/:collection/search", get_search);
server.del("/collection/:collection/:id", del_remove_document);
server.run();
return 0;
}

View File

@ -1,8 +1,8 @@
#include "sorted_array.h"
#include "intersection.h"
#include "array_utils.h"
void sorted_array::load(const uint32_t *sorted_array, const uint32_t array_length) {
min = sorted_array[0];
min = array_length != 0 ? sorted_array[0] : 0;
max = array_length > 1 ? sorted_array[array_length-1] : min;
uint32_t size_required = (uint32_t) (sorted_append_size_required(max, array_length) * FOR_GROWTH_FACTOR);
@ -55,6 +55,10 @@ bool sorted_array::contains(uint32_t value) {
}
uint32_t sorted_array::indexOf(uint32_t value) {
if(length == 0) {
return length;
}
uint32_t actual;
uint32_t index = for_lower_bound_search(in, length, value, &actual);
if(actual == value) return index;
@ -173,7 +177,7 @@ size_t sorted_array::intersect(uint32_t* arr, const size_t arr_length, uint32_t*
uint32_t* curr = uncompress();
uint32_t* results = new uint32_t[std::min(arr_length, (size_t) length)];
size_t results_length = Intersection::scalar(arr, arr_length, curr, length, results);
size_t results_length = ArrayUtils::and_scalar(arr, arr_length, curr, length, results);
delete[] curr;
*results_out = results;

View File

@ -12,7 +12,9 @@ protected:
Collection *collection1;
std::vector<field> search_fields;
std::vector<field> facet_fields;
std::vector<std::string> rank_fields;
std::vector<field> sort_fields_index;
std::vector<sort_field> sort_fields;
void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/coll_manager_test_db";
@ -24,10 +26,11 @@ protected:
search_fields = {field("title", field_types::STRING), field("starring", field_types::STRING)};
facet_fields = {field("starring", field_types::STRING)};
rank_fields = {"points"};
sort_fields = { sort_field("points", "DESC") };
sort_fields_index = { field("points", "INT32") };
collection1 = collectionManager.create_collection("collection1", search_fields, facet_fields,
rank_fields, "points");
sort_fields_index, "points");
}
virtual void SetUp() {
@ -53,7 +56,7 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
std::vector<std::string> search_fields = {"starring", "title"};
std::vector<std::string> facets;
nlohmann::json results = collection1->search("thomas", search_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
nlohmann::json results = collection1->search("thomas", search_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
spp::sparse_hash_map<std::string, field> schema = collection1->get_schema();
@ -70,11 +73,12 @@ TEST_F(CollectionManagerTest, RestoreRecordsOnRestart) {
ASSERT_EQ(0, collection1->get_collection_id());
ASSERT_EQ(18, collection1->get_next_seq_id());
ASSERT_EQ(facet_fields_expected, collection1->get_facet_fields());
ASSERT_EQ(rank_fields, collection1->get_rank_fields());
ASSERT_EQ(1, collection1->get_sort_fields().size());
ASSERT_EQ(sort_fields[0].name, collection1->get_sort_fields()[0].name);
ASSERT_EQ(schema.size(), collection1->get_schema().size());
ASSERT_EQ("points", collection1->get_token_ordering_field());
ASSERT_EQ("points", collection1->get_token_ranking_field());
results = collection1->search("thomas", search_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
results = collection1->search("thomas", search_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
}

View File

@ -2,6 +2,7 @@
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
#include <collection_manager.h>
#include "collection.h"
@ -11,8 +12,9 @@ protected:
std::vector<std::string> query_fields;
Store *store;
CollectionManager & collectionManager = CollectionManager::get_instance();
std::vector<std::string> rank_fields;
std::vector<field> facet_fields;
std::vector<field> sort_fields_index;
std::vector<sort_field> sort_fields;
void setupCollection() {
std::string state_dir_path = "/tmp/typesense_test/collection";
@ -27,12 +29,13 @@ protected:
query_fields = {"title"};
facet_fields = { };
rank_fields = {"points"};
sort_fields = { sort_field("points", "DESC") };
sort_fields_index = { field("points", "INT32") };
collection = collectionManager.get_collection("collection");
if(collection == nullptr) {
collection = collectionManager.create_collection("collection", search_fields, facet_fields,
rank_fields, "points");
sort_fields_index, "points");
}
std::string json_line;
@ -60,7 +63,7 @@ protected:
TEST_F(CollectionTest, ExactSearchShouldBeStable) {
std::vector<std::string> facets;
nlohmann::json results = collection->search("the", query_fields, "", facets, rank_fields, 0, 10);
nlohmann::json results = collection->search("the", query_fields, "", facets, sort_fields, 0, 10);
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(7, results["found"].get<int>());
@ -73,12 +76,29 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
std::string result_id = result["id"];
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
// check ASC sorting
std::vector<sort_field> sort_fields_asc = { sort_field("points", "ASC") };
results = collection->search("the", query_fields, "", facets, sort_fields_asc, 0, 10);
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(7, results["found"].get<int>());
ids = {"16", "13", "10", "8", "6", "foo", "1"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
std::string id = ids.at(i);
std::string result_id = result["id"];
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
}
TEST_F(CollectionTest, ExactPhraseSearch) {
std::vector<std::string> facets;
nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, rank_fields, 0, 10);
nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, sort_fields, 0, 10);
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(5, results["found"].get<uint32_t>());
/*
Sort by (match, diff, score)
@ -98,9 +118,28 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
// Check ASC sort order
std::vector<sort_field> sort_fields_asc = { sort_field("points", "ASC") };
results = collection->search("rocket launch", query_fields, "", facets, sort_fields_asc, 0, 10);
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(5, results["found"].get<uint32_t>());
ids = {"8", "17", "1", "16", "13"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
std::string id = ids.at(i);
std::string result_id = result["id"];
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
// Check pagination
results = collection->search("rocket launch", query_fields, "", facets, rank_fields, 0, 3);
results = collection->search("rocket launch", query_fields, "", facets, sort_fields, 0, 3);
ASSERT_EQ(3, results["hits"].size());
ASSERT_EQ(4, results["found"].get<uint32_t>());
ids = {"8", "1", "17", "16", "13"};
for(size_t i = 0; i < 3; i++) {
nlohmann::json result = results["hits"].at(i);
std::string id = ids.at(i);
@ -112,7 +151,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
// Tokens that are not found in the index should be skipped
std::vector<std::string> facets;
nlohmann::json results = collection->search("DoesNotExist from", query_fields, "", facets, rank_fields, 0, 10);
nlohmann::json results = collection->search("DoesNotExist from", query_fields, "", facets, sort_fields, 0, 10);
ASSERT_EQ(2, results["hits"].size());
std::vector<std::string> ids = {"2", "17"};
@ -125,7 +164,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
}
// with non-zero cost
results = collection->search("DoesNotExist from", query_fields, "", facets, rank_fields, 1, 10);
results = collection->search("DoesNotExist from", query_fields, "", facets, sort_fields, 1, 10);
ASSERT_EQ(2, results["hits"].size());
for(size_t i = 0; i < results["hits"].size(); i++) {
@ -136,7 +175,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
}
// with 2 indexed words
results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, rank_fields, 1, 10);
results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, sort_fields, 1, 10);
ASSERT_EQ(2, results["hits"].size());
ids = {"2", "17"};
@ -148,17 +187,17 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
}
results.clear();
results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, rank_fields, 0, 10);
results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, sort_fields, 0, 10);
ASSERT_EQ(0, results["hits"].size());
results.clear();
results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, rank_fields, 2, 10);
results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, sort_fields, 2, 10);
ASSERT_EQ(0, results["hits"].size());
}
TEST_F(CollectionTest, PartialPhraseSearch) {
std::vector<std::string> facets;
nlohmann::json results = collection->search("rocket research", query_fields, "", facets, rank_fields, 0, 10);
nlohmann::json results = collection->search("rocket research", query_fields, "", facets, sort_fields, 0, 10);
ASSERT_EQ(4, results["hits"].size());
std::vector<std::string> ids = {"1", "8", "16", "17"};
@ -173,7 +212,7 @@ TEST_F(CollectionTest, PartialPhraseSearch) {
TEST_F(CollectionTest, QueryWithTypo) {
std::vector<std::string> facets;
nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, rank_fields, 2, 3);
nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, sort_fields, 2, 3);
ASSERT_EQ(3, results["hits"].size());
std::vector<std::string> ids = {"19", "20", "21"};
@ -186,7 +225,7 @@ TEST_F(CollectionTest, QueryWithTypo) {
}
results.clear();
results = collection->search("fer thx", query_fields, "", facets, rank_fields, 1, 3);
results = collection->search("fer thx", query_fields, "", facets, sort_fields, 1, 3);
ids = {"1", "10", "13"};
ASSERT_EQ(3, results["hits"].size());
@ -201,7 +240,7 @@ TEST_F(CollectionTest, QueryWithTypo) {
TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
std::vector<std::string> facets;
nlohmann::json results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 2, MAX_SCORE, false);
nlohmann::json results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 2, MAX_SCORE, false);
ASSERT_EQ(2, results["hits"].size());
std::vector<std::string> ids = {"22", "23"};
@ -212,7 +251,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 3, FREQUENCY, false);
results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 3, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"3", "12", "24"};
@ -224,19 +263,19 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
}
// Check pagination
results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 1, FREQUENCY, false);
results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 1, FREQUENCY, false);
ASSERT_EQ(3, results["found"].get<int>());
ASSERT_EQ(1, results["hits"].size());
std::string solo_id = results["hits"].at(0)["id"];
ASSERT_STREQ("3", solo_id.c_str());
results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 2, FREQUENCY, false);
results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 2, FREQUENCY, false);
ASSERT_EQ(3, results["found"].get<int>());
ASSERT_EQ(2, results["hits"].size());
// Check total ordering
results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 10, FREQUENCY, false);
results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
ids = {"3", "12", "24", "22", "23"};
@ -247,7 +286,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = collection->search("loox", query_fields, "", facets, rank_fields, 1, 10, MAX_SCORE, false);
results = collection->search("loox", query_fields, "", facets, sort_fields, 1, 10, MAX_SCORE, false);
ASSERT_EQ(5, results["hits"].size());
ids = {"22", "23", "3", "12", "24"};
@ -262,8 +301,9 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
TEST_F(CollectionTest, TextContainingAnActualTypo) {
// A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
std::vector<std::string> facets;
nlohmann::json results = collection->search("ISX what", query_fields, "", facets, rank_fields, 1, 4, FREQUENCY, false);
nlohmann::json results = collection->search("ISX what", query_fields, "", facets, sort_fields, 1, 4, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ASSERT_EQ(4, results["found"].get<uint32_t>());
std::vector<std::string> ids = {"19", "6", "21", "8"};
@ -275,8 +315,9 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
}
// Record containing exact token match should appear first
results = collection->search("ISX", query_fields, "", facets, rank_fields, 1, 10, FREQUENCY, false);
results = collection->search("ISX", query_fields, "", facets, sort_fields, 1, 10, FREQUENCY, false);
ASSERT_EQ(8, results["hits"].size());
ASSERT_EQ(8, results["found"].get<uint32_t>());
ids = {"20", "19", "6", "3", "21", "4", "10", "8"};
@ -290,7 +331,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
TEST_F(CollectionTest, PrefixSearching) {
std::vector<std::string> facets;
nlohmann::json results = collection->search("ex", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, true);
nlohmann::json results = collection->search("ex", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, true);
ASSERT_EQ(2, results["hits"].size());
std::vector<std::string> ids = {"12", "6"};
@ -301,7 +342,7 @@ TEST_F(CollectionTest, PrefixSearching) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = collection->search("ex", query_fields, "", facets, rank_fields, 0, 10, MAX_SCORE, true);
results = collection->search("ex", query_fields, "", facets, sort_fields, 0, 10, MAX_SCORE, true);
ASSERT_EQ(2, results["hits"].size());
ids = {"6", "12"};
@ -311,6 +352,19 @@ TEST_F(CollectionTest, PrefixSearching) {
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
std::cout << "WHAT EX..." << std::endl;
results = collection->search("what ex", query_fields, "", facets, sort_fields, 0, 10, MAX_SCORE, true);
ASSERT_EQ(9, results["hits"].size());
ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
std::string result_id = result["id"];
std::string id = ids.at(i);
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
}
TEST_F(CollectionTest, MultipleFields) {
@ -319,11 +373,10 @@ TEST_F(CollectionTest, MultipleFields) {
std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
std::vector<field> fields = {field("title", field_types::STRING), field("starring", field_types::STRING),
field("cast", field_types::STRING_ARRAY)};
std::vector<std::string> rank_fields = {"points"};
coll_mul_fields = collectionManager.get_collection("coll_mul_fields");
if(coll_mul_fields == nullptr) {
coll_mul_fields = collectionManager.create_collection("coll_mul_fields", fields, facet_fields, rank_fields);
coll_mul_fields = collectionManager.create_collection("coll_mul_fields", fields, facet_fields, sort_fields_index);
}
std::string json_line;
@ -336,7 +389,7 @@ TEST_F(CollectionTest, MultipleFields) {
query_fields = {"title", "starring"};
std::vector<std::string> facets;
nlohmann::json results = coll_mul_fields->search("Will", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
nlohmann::json results = coll_mul_fields->search("Will", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
std::vector<std::string> ids = {"3", "2", "1", "0"};
@ -351,7 +404,7 @@ TEST_F(CollectionTest, MultipleFields) {
// when "starring" takes higher priority than "title"
query_fields = {"starring", "title"};
results = coll_mul_fields->search("thomas", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("thomas", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"15", "14", "12", "13"};
@ -364,11 +417,11 @@ TEST_F(CollectionTest, MultipleFields) {
}
query_fields = {"starring", "title", "cast"};
results = coll_mul_fields->search("ben affleck", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("ben affleck", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
query_fields = {"cast"};
results = coll_mul_fields->search("chris", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("chris", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"6", "1", "7"};
@ -380,7 +433,7 @@ TEST_F(CollectionTest, MultipleFields) {
}
query_fields = {"cast"};
results = coll_mul_fields->search("chris pine", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_mul_fields->search("chris pine", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"7", "6", "1"};
@ -399,11 +452,12 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
std::vector<field> fields = {field("name", field_types::STRING), field("age", field_types::INT32),
field("years", field_types::INT32_ARRAY),
field("timestamps", field_types::INT64_ARRAY)};
std::vector<std::string> rank_fields = {"age"};
std::vector<sort_field> sort_fields = { sort_field("age", "DESC") };
std::vector<field> sort_fields_index = { field("age", "INT32") };
coll_array_fields = collectionManager.get_collection("coll_array_fields");
if(coll_array_fields == nullptr) {
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, rank_fields);
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, sort_fields_index);
}
std::string json_line;
@ -417,7 +471,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
// Plain search with no filters - results should be sorted by rank fields
query_fields = {"name"};
std::vector<std::string> facets;
nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
std::vector<std::string> ids = {"3", "1", "4", "0", "2"};
@ -430,7 +484,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// Searching on an int32 field
results = coll_array_fields->search("Jeremy", query_fields, "age:>24", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "age:>24", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"3", "1", "4"};
@ -442,14 +496,14 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = coll_array_fields->search("Jeremy", query_fields, "age:>=24", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "age:>=24", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
results = coll_array_fields->search("Jeremy", query_fields, "age:24", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "age:24", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
// Searching a number against an int32 array field
results = coll_array_fields->search("Jeremy", query_fields, "years:>2002", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "years:>2002", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"1", "0", "2"};
@ -460,7 +514,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = coll_array_fields->search("Jeremy", query_fields, "years:<1989", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "years:<1989", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
ids = {"3"};
@ -472,7 +526,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// multiple filters
results = coll_array_fields->search("Jeremy", query_fields, "years:<2005 && years:>1987", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "years:<2005 && years:>1987", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(1, results["hits"].size());
ids = {"4"};
@ -484,7 +538,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// multiple search values (works like SQL's IN operator) against a single int field
results = coll_array_fields->search("Jeremy", query_fields, "age:[21, 24, 63]", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "age:[21, 24, 63]", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ids = {"3", "0", "2"};
@ -496,7 +550,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// multiple search values against an int32 array field - also use extra padding between symbols
results = coll_array_fields->search("Jeremy", query_fields, "years : [ 2015, 1985 , 1999]", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "years : [ 2015, 1985 , 1999]", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"3", "1", "4", "0"};
@ -508,7 +562,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// searching on an int64 array field - also ensure that padded space causes no issues
results = coll_array_fields->search("Jeremy", query_fields, "timestamps : > 475205222", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "timestamps : > 475205222", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"1", "4", "0", "2"};
@ -521,7 +575,7 @@ TEST_F(CollectionTest, FilterOnNumericFields) {
}
// when filters don't match any record, no results should be returned
results = coll_array_fields->search("Jeremy", query_fields, "timestamps:<1", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "timestamps:<1", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
collectionManager.drop_collection("coll_array_fields");
@ -534,11 +588,13 @@ TEST_F(CollectionTest, FilterOnTextFields) {
std::vector<field> fields = {field("name", field_types::STRING), field("age", field_types::INT32),
field("years", field_types::INT32_ARRAY),
field("tags", field_types::STRING_ARRAY)};
std::vector<std::string> rank_fields = {"age"};
std::vector<field> sort_fields_index = { field("age", "INT32") };
std::vector<sort_field> sort_fields = { sort_field("age", "DESC") };
coll_array_fields = collectionManager.get_collection("coll_array_fields");
if(coll_array_fields == nullptr) {
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, rank_fields);
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, sort_fields_index);
}
std::string json_line;
@ -551,7 +607,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
query_fields = {"name"};
std::vector<std::string> facets;
nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "tags: gold", facets, rank_fields, 0, 10, FREQUENCY, false);
nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "tags: gold", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
std::vector<std::string> ids = {"1", "4", "0", "2"};
@ -563,7 +619,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
results = coll_array_fields->search("Jeremy", query_fields, "tags : bronze", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "tags : bronze", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(2, results["hits"].size());
ids = {"4", "2"};
@ -576,7 +632,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
}
// search with a list of tags, also testing extra padding of space
results = coll_array_fields->search("Jeremy", query_fields, "tags: [bronze, silver]", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "tags: [bronze, silver]", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(4, results["hits"].size());
ids = {"3", "4", "0", "2"};
@ -589,7 +645,7 @@ TEST_F(CollectionTest, FilterOnTextFields) {
}
// should be exact matches (no normalization or fuzzy searching should happen)
results = coll_array_fields->search("Jeremy", query_fields, "tags: BRONZE", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "tags: BRONZE", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
collectionManager.drop_collection("coll_array_fields");
@ -604,11 +660,13 @@ TEST_F(CollectionTest, HandleBadlyFormedFilterQuery) {
field("years", field_types::INT32_ARRAY),
field("timestamps", field_types::INT64_ARRAY),
field("tags", field_types::STRING_ARRAY)};
std::vector<std::string> rank_fields = {"age"};
std::vector<field> sort_fields_index = { field("age", "INT32") };
std::vector<sort_field> sort_fields = { sort_field("age", "DESC") };
coll_array_fields = collectionManager.get_collection("coll_array_fields");
if(coll_array_fields == nullptr) {
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, rank_fields);
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, sort_fields_index);
}
std::string json_line;
@ -623,27 +681,27 @@ TEST_F(CollectionTest, HandleBadlyFormedFilterQuery) {
// Every filter expression below is malformed or cannot be resolved; each search is
// expected to return zero hits. In this diff view each search call is listed twice:
// first with the old `rank_fields` argument, then with the new `sort_fields` argument.
std::vector<std::string> facets;
// when filter field does not exist in the schema
nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "tagzz: gold", facets, rank_fields, 0, 10, FREQUENCY, false);
nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "tagzz: gold", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// searching using a string for a numeric field
results = coll_array_fields->search("Jeremy", query_fields, "age: abcdef", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "age: abcdef", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// searching using a string for a numeric array field
results = coll_array_fields->search("Jeremy", query_fields, "timestamps: abcdef", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "timestamps: abcdef", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// malformed k:v syntax (no `:` separator between field name and value)
results = coll_array_fields->search("Jeremy", query_fields, "timestamps abcdef", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "timestamps abcdef", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// just empty spaces
results = coll_array_fields->search("Jeremy", query_fields, " ", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, " ", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// wrapping number with quotes
results = coll_array_fields->search("Jeremy", query_fields, "age: '21'", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "age: '21'", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
// Drop the collection so the next test starts without it.
collectionManager.drop_collection("coll_array_fields");
@ -658,11 +716,13 @@ TEST_F(CollectionTest, FacetCounts) {
// Remaining schema fields, followed by the facet and sort declarations used by this test.
field("timestamps", field_types::INT64_ARRAY),
field("tags", field_types::STRING_ARRAY)};
// Facet on the `tags` string array and on the `name` string field.
facet_fields = {field("tags", field_types::STRING_ARRAY), field("name", field_types::STRING)};
// Diff view: the `rank_fields` lines appear to be the pre-change version of this test.
std::vector<std::string> rank_fields = {"age"};
// field()'s second argument is the field type, not a sort direction: declare `age`
// as "INT32", matching the other tests that build `sort_fields_index`.
std::vector<field> sort_fields_index = { field("age", "INT32") };
// The sort direction belongs to the query-time sort clause handed to search().
std::vector<sort_field> sort_fields = { sort_field("age", "DESC") };
// Reuse the collection when the manager already has it; create it otherwise.
coll_array_fields = collectionManager.get_collection("coll_array_fields");
if(coll_array_fields == nullptr) {
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, rank_fields);
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, sort_fields_index);
}
std::string json_line;
@ -677,27 +737,27 @@ TEST_F(CollectionTest, FacetCounts) {
// Facet on `tags` only for the first search.
std::vector<std::string> facets = {"tags"};
// single facet with no filters
nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
nlohmann::json results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(2, results["facet_counts"][0].size());
ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]);
// Diff view: each facet value is asserted twice below. The first pair of lines appears
// to be the old expected position, the second pair the new one. With the new indices
// the values are listed by decreasing count: gold (4), silver (3), bronze (2).
ASSERT_EQ("gold", results["facet_counts"][0]["counts"][1]["value"]);
ASSERT_EQ(4, (int) results["facet_counts"][0]["counts"][1]["count"]);
ASSERT_EQ("gold", results["facet_counts"][0]["counts"][0]["value"]);
ASSERT_EQ(4, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_EQ("silver", results["facet_counts"][0]["counts"][2]["value"]);
ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][2]["count"]);
ASSERT_EQ("silver", results["facet_counts"][0]["counts"][1]["value"]);
ASSERT_EQ(3, (int) results["facet_counts"][0]["counts"][1]["count"]);
ASSERT_EQ("bronze", results["facet_counts"][0]["counts"][0]["value"]);
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_EQ("bronze", results["facet_counts"][0]["counts"][2]["value"]);
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][2]["count"]);
// 2 facets, 1 text filter with no filters
facets.clear();
facets.push_back("tags");
facets.push_back("name");
results = coll_array_fields->search("Jeremy", query_fields, "", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(5, results["hits"].size());
// One facet_counts entry is expected per requested facet field.
ASSERT_EQ(2, results["facet_counts"].size());
@ -712,19 +772,19 @@ TEST_F(CollectionTest, FacetCounts) {
// facet with filters: facet on `tags` while restricting hits with `age: >24`
facets.clear();
facets.push_back("tags");
results = coll_array_fields->search("Jeremy", query_fields, "age: >24", facets, rank_fields, 0, 10, FREQUENCY, false);
results = coll_array_fields->search("Jeremy", query_fields, "age: >24", facets, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(3, results["hits"].size());
ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ("tags", results["facet_counts"][0]["field_name"]);
// Diff view: old and new expectations are interleaved. The old set appears to be
// counts 1, 2, 2 for bronze, gold, silver; the new set is counts 2, 2, 1 for
// gold, silver, bronze (indices 0, 1, 2).
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][1]["count"]);
ASSERT_EQ(2, (int) results["facet_counts"][0]["counts"][2]["count"]);
ASSERT_EQ(1, (int) results["facet_counts"][0]["counts"][2]["count"]);
ASSERT_EQ("bronze", results["facet_counts"][0]["counts"][0]["value"]);
ASSERT_EQ("gold", results["facet_counts"][0]["counts"][1]["value"]);
ASSERT_EQ("silver", results["facet_counts"][0]["counts"][2]["value"]);
ASSERT_EQ("gold", results["facet_counts"][0]["counts"][0]["value"]);
ASSERT_EQ("silver", results["facet_counts"][0]["counts"][1]["value"]);
ASSERT_EQ("bronze", results["facet_counts"][0]["counts"][2]["value"]);
// Drop the collection so the next test starts without it.
collectionManager.drop_collection("coll_array_fields");
}
@ -739,11 +799,12 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {
// Remaining schema fields, followed by the facet and sort declarations used by this test.
field("timestamps", field_types::INT64_ARRAY),
field("tags", field_types::STRING_ARRAY)};
// Only `tags` and `name` are facet fields; `timestamps` is not, which a later
// search in this test relies on.
facet_fields = {field("tags", field_types::STRING_ARRAY), field("name", field_types::STRING)};
// Diff view: the `rank_fields` lines appear to be the pre-change version of this test.
std::vector<std::string> rank_fields = {"age"};
// field()'s second argument is the field type, not a sort direction: declare `age`
// as "INT32", matching the other tests that build `sort_fields_index`.
std::vector<field> sort_fields_index = { field("age", "INT32") };
// The sort direction belongs to the query-time sort clause handed to search().
std::vector<sort_field> sort_fields = { sort_field("age", "DESC") };
// Reuse the collection when the manager already has it; create it otherwise.
coll_array_fields = collectionManager.get_collection("coll_array_fields");
if(coll_array_fields == nullptr) {
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, rank_fields);
coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, facet_fields, sort_fields_index);
}
std::string json_line;
@ -758,28 +819,28 @@ TEST_F(CollectionTest, SearchingWithMissingFields) {
// Each search below references a field the schema cannot satisfy; the result is
// expected to carry zero hits plus an "error" string describing the problem.
// Diff view: calls and messages that changed are listed twice, old version first.
std::vector<std::string> facets;
// when a query field is not defined in the schema
std::vector<std::string> query_fields_not_found = {"titlez"};
nlohmann::json res = coll_array_fields->search("the", query_fields_not_found, "", facets, rank_fields, 0, 10);
nlohmann::json res = coll_array_fields->search("the", query_fields_not_found, "", facets, sort_fields, 0, 10);
ASSERT_EQ(0, res["hits"].size());
ASSERT_STREQ("Could not find a search field named `titlez` in the schema.",res["error"].get<std::string>().c_str());
// when a query field is an integer field
res = coll_array_fields->search("the", {"age"}, "", facets, rank_fields, 0, 10);
res = coll_array_fields->search("the", {"age"}, "", facets, sort_fields, 0, 10);
ASSERT_EQ(0, res["hits"].size());
ASSERT_STREQ("Search field `age` should be a string or a string array.", res["error"].get<std::string>().c_str());
// when a facet field is not defined in the schema
res = coll_array_fields->search("the", {"name"}, "", {"timestamps"}, rank_fields, 0, 10);
res = coll_array_fields->search("the", {"name"}, "", {"timestamps"}, sort_fields, 0, 10);
ASSERT_EQ(0, res["hits"].size());
ASSERT_STREQ("Could not find a facet field named `timestamps` in the schema.", res["error"].get<std::string>().c_str());
// when a sort field (previously called a rank field) is not defined in the schema
res = coll_array_fields->search("the", {"name"}, "", {}, {"timestamps"}, 0, 10);
res = coll_array_fields->search("the", {"name"}, "", {}, { sort_field("timestamps", "ASC") }, 0, 10);
ASSERT_EQ(0, res["hits"].size());
ASSERT_STREQ("Could not find a rank field named `timestamps` in the schema.", res["error"].get<std::string>().c_str());
ASSERT_STREQ("Could not find a sort field named `timestamps` in the schema.", res["error"].get<std::string>().c_str());
// `_rank` is likewise rejected as an unknown sort field
res = coll_array_fields->search("the", {"name"}, "", {}, {"_rank"}, 0, 10);
res = coll_array_fields->search("the", {"name"}, "", {}, { sort_field("_rank", "ASC") }, 0, 10);
ASSERT_EQ(0, res["hits"].size());
ASSERT_STREQ("Could not find a rank field named `_rank` in the schema.", res["error"].get<std::string>().c_str());
ASSERT_STREQ("Could not find a sort field named `_rank` in the schema.", res["error"].get<std::string>().c_str());
// Drop the collection so the next test starts without it.
collectionManager.drop_collection("coll_array_fields");
}
@ -790,12 +851,14 @@ TEST_F(CollectionTest, IndexingWithBadData) {
// Schema for this test: one string search field, one string-array facet field and
// two sortable fields (`age`, `average`).
std::vector<field> fields = {field("name", field_types::STRING)};
facet_fields = {field("tags", field_types::STRING_ARRAY)};
// Diff view: the `rank_fields` lines appear to be the pre-change version of this test.
std::vector<std::string> rank_fields = {"age", "average"};
// Passed to create_collection(): both sortable fields are typed "INT32".
std::vector<field> sort_fields_index = { field("age", "INT32"), field("average", "INT32") };
// Query-time sort clauses for the same two fields, both with direction "DESC".
std::vector<sort_field> sort_fields = { sort_field("age", "DESC"), sort_field("average", "DESC") };
// Reuse the collection when the manager already has it; create it otherwise.
sample_collection = collectionManager.get_collection("sample_collection");
if(sample_collection == nullptr) {
// The trailing "age" argument presumably names the token ranking field (the error
// messages asserted later in this test refer to `age` that way) - TODO confirm.
sample_collection = collectionManager.create_collection("sample_collection", fields, facet_fields,
rank_fields, "age");
sort_fields_index, "age");
}
// Document without the `name` search field (the key is misspelled as `namezz`).
const Option<std::string> & search_fields_missing_op1 = sample_collection->add("{\"namezz\": \"foo\", \"age\": 29}");
@ -814,10 +877,10 @@ TEST_F(CollectionTest, IndexingWithBadData) {
facet_fields_missing_op1.error().c_str());
// Document that has `name`, `age` and `tags` but lacks the declared `average` field:
// add() must fail and report the missing field.
// Diff view: the `rank_fields_missing_op1` lines appear to be the old version, the
// `sort_fields_missing_op1` lines the new one.
const char *doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [\"red\", \"blue\"]}";
const Option<std::string> & rank_fields_missing_op1 = sample_collection->add(doc_str);
ASSERT_FALSE(rank_fields_missing_op1.ok());
ASSERT_STREQ("Field `average` has been declared as a rank field in the schema, but is not found in the document.",
rank_fields_missing_op1.error().c_str());
const Option<std::string> & sort_fields_missing_op1 = sample_collection->add(doc_str);
ASSERT_FALSE(sort_fields_missing_op1.ok());
ASSERT_STREQ("Field `average` has been declared as a sort field in the schema, but is not found in the document.",
sort_fields_missing_op1.error().c_str());
// Handle type errors
@ -832,19 +895,25 @@ TEST_F(CollectionTest, IndexingWithBadData) {
ASSERT_TRUE(empty_facet_field_op.ok());
// `age` supplied as a JSON string instead of a number: add() must be rejected.
// Diff view: `bad_token_ordering_*` lines appear to be the old version, the
// `bad_token_ranking_*` lines the new one.
doc_str = "{\"name\": \"foo\", \"age\": \"34\", \"tags\": [], \"average\": 34 }";
const Option<std::string> & bad_token_ordering_field_op1 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ordering_field_op1.ok());
ASSERT_STREQ("Token ordering field `age` must be an INT32.", bad_token_ordering_field_op1.error().c_str());
const Option<std::string> & bad_token_ranking_field_op1 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ranking_field_op1.ok());
ASSERT_STREQ("Token ranking field `age` must be an INT32.", bad_token_ranking_field_op1.error().c_str());
// `age` holds a value too large for INT32.
doc_str = "{\"name\": \"foo\", \"age\": 343234324234233234, \"tags\": [], \"average\": 34 }";
const Option<std::string> & bad_token_ordering_field_op2 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ordering_field_op2.ok());
ASSERT_STREQ("Token ordering field `age` exceeds maximum value of INT32.", bad_token_ordering_field_op2.error().c_str());
const Option<std::string> & bad_token_ranking_field_op2 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ranking_field_op2.ok());
ASSERT_STREQ("Token ranking field `age` exceeds maximum value of INT32.", bad_token_ranking_field_op2.error().c_str());
// `age` missing from the document altogether (case present only in the new version).
doc_str = "{\"name\": \"foo\", \"tags\": [], \"average\": 34 }";
const Option<std::string> & bad_token_ranking_field_op3 = sample_collection->add(doc_str);
ASSERT_FALSE(bad_token_ranking_field_op3.ok());
ASSERT_STREQ("Field `age` has been declared as a token ranking field, but is not found in the document.",
bad_token_ranking_field_op3.error().c_str());
// `average` supplied as a JSON string instead of a number; the expected message
// changed from "Rank field ... integer" (old) to "Sort field ... number" (new).
doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [], \"average\": \"34\"}";
const Option<std::string> & bad_rank_field_op = sample_collection->add(doc_str);
ASSERT_FALSE(bad_rank_field_op.ok());
ASSERT_STREQ("Rank field `average` must be an integer.", bad_rank_field_op.error().c_str());
ASSERT_STREQ("Sort field `average` must be a number.", bad_rank_field_op.error().c_str());
// Drop the collection so the next test starts without it.
collectionManager.drop_collection("sample_collection");
}
@ -854,13 +923,15 @@ TEST_F(CollectionTest, EmptyIndexShouldNotCrash) {
// Searches a collection to which this excerpt adds no documents and expects an
// empty hit list.
std::vector<field> fields = {field("name", field_types::STRING)};
facet_fields = {field("tags", field_types::STRING_ARRAY)};
// Diff view: the `rank_fields` lines appear to be the pre-change version of this test.
std::vector<std::string> rank_fields = {"age", "average"};
// Passed to create_collection(): both sortable fields are typed "INT32".
std::vector<field> sort_fields_index = { field("age", "INT32"), field("average", "INT32") };
// Passed to search(): sort clauses on both fields with direction "DESC".
std::vector<sort_field> sort_fields = { sort_field("age", "DESC"), sort_field("average", "DESC") };
// Reuse the collection when the manager already has it; create it otherwise.
empty_coll = collectionManager.get_collection("empty_coll");
if(empty_coll == nullptr) {
empty_coll = collectionManager.create_collection("empty_coll", fields, facet_fields, rank_fields, "age");
empty_coll = collectionManager.create_collection("empty_coll", fields, facet_fields, sort_fields_index, "age");
}
// No add() call precedes this search within the excerpt.
nlohmann::json results = empty_coll->search("a", {"name"}, "", {}, rank_fields, 0, 10, FREQUENCY, false);
nlohmann::json results = empty_coll->search("a", {"name"}, "", {}, sort_fields, 0, 10, FREQUENCY, false);
ASSERT_EQ(0, results["hits"].size());
}