Skeleton for filter support.

Kishore Nallan 2017-02-02 08:58:47 +05:30
parent 431fd70fce
commit cab0b36699
8 changed files with 143 additions and 37 deletions


@@ -89,9 +89,10 @@ public:
     std::string add(std::string json_str);
-    nlohmann::json search(std::string query, const std::vector<std::string> fields, const int num_typos,
-                          const size_t num_results, const token_ordering token_order = FREQUENCY,
-                          const bool prefix = false);
+    nlohmann::json search(std::string query, const std::vector<std::string> fields, const std::vector<filter> filters,
+                          const int num_typos, const size_t num_results, const token_ordering token_order = FREQUENCY,
+                          const bool prefix = false);
     void remove(std::string id);
     void score_results(Topster<100> &topster, const int & token_rank, const std::vector<art_leaf *> &query_suggestion,


@@ -23,4 +23,10 @@ struct field {
     field(std::string name, std::string type): name(name), type(type) {
     }
 };
+
+struct filter {
+    std::string field_name;
+    std::string value_json;
+    std::string compare_operator;
+};

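A hypothetical usage sketch, not part of this commit: at the call sites updated below, the bare {} list-initializes an empty std::vector<filter>, so existing searches behave exactly as before. The stand-in search() function only mimics the shape of the new Collection::search overload; the "points" field, its value, and the ">" operator string are illustrative assumptions.

// Hypothetical sketch -- not from this commit.
#include <iostream>
#include <string>
#include <vector>

struct filter {                    // mirrors the struct added above
    std::string field_name;
    std::string value_json;        // value kept as raw JSON text, parsed at search time
    std::string compare_operator;  // assumed to be a plain string such as ">" or "="
};

// Simplified stand-in with the same parameter order as the new search overload.
static size_t search(const std::string & query, const std::vector<std::string> fields,
                     const std::vector<filter> filters, const int num_typos, const size_t num_results) {
    std::cout << query << ": " << filters.size() << " filter(s), "
              << num_typos << " typo(s), up to " << num_results << " results" << std::endl;
    return 0;
}

int main() {
    std::vector<std::string> search_fields = {"title"};
    search("the", search_fields, {}, 1, 100);                        // `{}` -> empty filter list
    search("the", search_fields, {{"points", "100", ">"}}, 1, 100);  // one hypothetical filter
    return 0;
}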

@@ -55,7 +55,7 @@ public:
     }
 
     ~Store() {
-        delete db;
+        close();
     }
 
     bool insert(const std::string& key, const std::string& value) {
@@ -103,6 +103,11 @@ public:
         db->Merge(rocksdb::WriteOptions(), key, std::to_string(value));
     }
 
+    void close() {
+        delete db;
+        db = nullptr;
+    }
+
     void print_memory_usage() {
         std::string index_usage;
         db->GetProperty("rocksdb.estimate-table-readers-mem", &index_usage);

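A minimal standalone sketch, not from the commit, of the close-then-destroy pattern the Store change relies on: close() nulls the handle after deleting it, and deleting a null pointer is a no-op in C++, so an explicit close() followed by the destructor's close() stays safe. FakeDB and StoreLike are stand-in types, not project code.

// Hypothetical sketch -- not from this commit.
#include <cstdio>

struct FakeDB {                      // stand-in for the real rocksdb::DB handle
    ~FakeDB() { std::puts("db released"); }
};

struct StoreLike {
    FakeDB* db = new FakeDB();
    void close() { delete db; db = nullptr; }   // a second call deletes nullptr: no-op
    ~StoreLike() { close(); }
};

int main() {
    StoreLike s;
    s.close();       // explicit close releases the handle ("db released" prints once)
}                    // destructor calls close() again, harmlessly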

@@ -231,11 +231,26 @@ void Collection::search_candidates(int & token_rank, std::vector<std::vector<art
     }
 }
 
-nlohmann::json Collection::search(std::string query, const std::vector<std::string> fields,
-                                  const int num_typos, const size_t num_results,
-                                  const token_ordering token_order, const bool prefix) {
+nlohmann::json Collection::search(std::string query, const std::vector<std::string> fields, const std::vector<filter> filters,
+                                  const int num_typos, const size_t num_results,
+                                  const token_ordering token_order, const bool prefix) {
     size_t num_found = 0;
 
+    // process the filters first
+    /*for(const filter & a_filter: filters) {
+        if(index_map.count(a_filter.field_name) != 0) {
+            art_tree* t = index_map.at(a_filter.field_name);
+            nlohmann::json json_value = nlohmann::json::parse(a_filter.value_json);
+            if(json_value.is_number()) {
+                // do integer art search
+            } else if(json_value.is_string()) {
+
+            } else if(json_value.is_array()) {
+
+            }
+        }
+    }*/
+
     // Order of `fields` are used to rank results
     auto begin = std::chrono::high_resolution_clock::now();
     std::vector<std::pair<int, Topster<100>::KV>> field_order_kvs;

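A standalone sketch, not part of the commit, of the type dispatch hinted at by the commented-out loop above: each filter's raw JSON value is parsed and a branch is chosen from its type. The include path for the bundled nlohmann header, the field names, and the classification labels are assumptions for illustration only.

// Hypothetical sketch -- not from this commit.
#include <iostream>
#include <string>
#include <vector>
#include "json.hpp"   // nlohmann::json single header (assumed to be on the include path)

struct filter {
    std::string field_name;
    std::string value_json;
    std::string compare_operator;
};

// Mirrors the commented-out skeleton: parse the raw JSON value and decide
// which kind of index lookup the filter would need.
static std::string classify(const filter & a_filter) {
    nlohmann::json json_value = nlohmann::json::parse(a_filter.value_json);
    if(json_value.is_number()) {
        return "numeric comparison";   // the "do integer art search" branch
    } else if(json_value.is_string()) {
        return "exact string match";
    } else if(json_value.is_array()) {
        return "multi-value match";
    }
    return "unsupported";
}

int main() {
    std::vector<filter> filters = {
        {"points", "100", ">"},          // hypothetical field/value/operator
        {"title", "\"rocket\"", "="},
        {"years", "[1975, 1985]", "="}
    };
    for(const filter & f : filters) {
        std::cout << f.field_name << ": " << classify(f) << std::endl;
    }
    return 0;
}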

@@ -48,7 +48,7 @@ int main(int argc, char* argv[]) {
     while(counter < 3000) {
         auto i = counter % 5;
-        auto results = collection->search(queries[i], search_fields, 1, 100);
+        auto results = collection->search(queries[i], search_fields, {}, 1, 100);
         results_total += results.size();
         counter++;
     }


@@ -43,7 +43,7 @@ int main(int argc, char* argv[]) {
     auto begin = std::chrono::high_resolution_clock::now();
     std::vector<std::string> search_fields = {"title"};
-    collection->search("the", search_fields, 1, 100);
+    collection->search("the", search_fields, {}, 1, 100);
     long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
     cout << "Time taken: " << timeMillis << "us" << endl;
     return 0;


@@ -91,8 +91,8 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) {
     std::vector<std::string> search_fields = {"title"};
-    nlohmann::json result = collection->search(query_map["q"], search_fields,
-                                               std::stoi(query_map[NUM_TYPOS]), 100, token_order, false);
+    nlohmann::json result = collection->search(query_map["q"], search_fields, {}, std::stoi(query_map[NUM_TYPOS]),
+                                               100, token_order, false);
     std::string json_str = result.dump();
     //std::cout << "JSON:" << json_str << std::endl;
     struct rusage r_usage;


@@ -54,7 +54,7 @@ protected:
 };
 TEST_F(CollectionTest, ExactSearchShouldBeStable) {
-    nlohmann::json results = collection->search("the", search_fields, 0, 10);
+    nlohmann::json results = collection->search("the", search_fields, {}, 0, 10);
     ASSERT_EQ(7, results["hits"].size());
     ASSERT_EQ(7, results["found"].get<int>());
@@ -70,7 +70,7 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
 }
 TEST_F(CollectionTest, ExactPhraseSearch) {
-    nlohmann::json results = collection->search("rocket launch", search_fields, 0, 10);
+    nlohmann::json results = collection->search("rocket launch", search_fields, {}, 0, 10);
     ASSERT_EQ(5, results["hits"].size());
     /*
@@ -92,7 +92,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
     }
     // Check pagination
-    results = collection->search("rocket launch", search_fields, 0, 3);
+    results = collection->search("rocket launch", search_fields, {}, 0, 3);
     ASSERT_EQ(3, results["hits"].size());
     for(size_t i = 0; i < 3; i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -104,7 +104,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
 TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     // Tokens that are not found in the index should be skipped
-    nlohmann::json results = collection->search("DoesNotExist from", search_fields, 0, 10);
+    nlohmann::json results = collection->search("DoesNotExist from", search_fields, {}, 0, 10);
     ASSERT_EQ(2, results["hits"].size());
     std::vector<std::string> ids = {"2", "17"};
@@ -117,7 +117,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }
     // with non-zero cost
-    results = collection->search("DoesNotExist from", search_fields, 1, 10);
+    results = collection->search("DoesNotExist from", search_fields, {}, 1, 10);
     ASSERT_EQ(2, results["hits"].size());
     for(size_t i = 0; i < results["hits"].size(); i++) {
@@ -128,7 +128,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }
     // with 2 indexed words
-    results = collection->search("from DoesNotExist insTruments", search_fields, 1, 10);
+    results = collection->search("from DoesNotExist insTruments", search_fields, {}, 1, 10);
     ASSERT_EQ(2, results["hits"].size());
     ids = {"2", "17"};
@@ -140,16 +140,16 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }
     results.clear();
-    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, 0, 10);
+    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 0, 10);
     ASSERT_EQ(0, results["hits"].size());
     results.clear();
-    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, 2, 10);
+    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 2, 10);
     ASSERT_EQ(0, results["hits"].size());
 }
 TEST_F(CollectionTest, PartialPhraseSearch) {
-    nlohmann::json results = collection->search("rocket research", search_fields, 0, 10);
+    nlohmann::json results = collection->search("rocket research", search_fields, {}, 0, 10);
     ASSERT_EQ(4, results["hits"].size());
     std::vector<std::string> ids = {"1", "8", "16", "17"};
@@ -163,7 +163,7 @@ TEST_F(CollectionTest, PartialPhraseSearch) {
 }
 TEST_F(CollectionTest, QueryWithTypo) {
-    nlohmann::json results = collection->search("kind biologcal", search_fields, 2, 3);
+    nlohmann::json results = collection->search("kind biologcal", search_fields, {}, 2, 3);
     ASSERT_EQ(3, results["hits"].size());
     std::vector<std::string> ids = {"19", "20", "21"};
@@ -176,7 +176,7 @@ TEST_F(CollectionTest, QueryWithTypo) {
     }
     results.clear();
-    results = collection->search("fer thx", search_fields, 1, 3);
+    results = collection->search("fer thx", search_fields, {}, 1, 3);
     ids = {"1", "10", "13"};
     ASSERT_EQ(3, results["hits"].size());
@@ -190,7 +190,7 @@ TEST_F(CollectionTest, QueryWithTypo) {
 }
 TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
-    nlohmann::json results = collection->search("loox", search_fields, 1, 2, MAX_SCORE, false);
+    nlohmann::json results = collection->search("loox", search_fields, {}, 1, 2, MAX_SCORE, false);
     ASSERT_EQ(2, results["hits"].size());
     std::vector<std::string> ids = {"22", "23"};
@@ -201,7 +201,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
-    results = collection->search("loox", search_fields, 1, 3, FREQUENCY, false);
+    results = collection->search("loox", search_fields, {}, 1, 3, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());
     ids = {"3", "12", "24"};
@@ -213,19 +213,19 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
     }
     // Check pagination
-    results = collection->search("loox", search_fields, 1, 1, FREQUENCY, false);
+    results = collection->search("loox", search_fields, {}, 1, 1, FREQUENCY, false);
     ASSERT_EQ(3, results["found"].get<int>());
     ASSERT_EQ(1, results["hits"].size());
     std::string solo_id = results["hits"].at(0)["id"];
     ASSERT_STREQ("3", solo_id.c_str());
-    results = collection->search("loox", search_fields, 1, 2, FREQUENCY, false);
+    results = collection->search("loox", search_fields, {}, 1, 2, FREQUENCY, false);
     ASSERT_EQ(3, results["found"].get<int>());
     ASSERT_EQ(2, results["hits"].size());
     // Check total ordering
-    results = collection->search("loox", search_fields, 1, 10, FREQUENCY, false);
+    results = collection->search("loox", search_fields, {}, 1, 10, FREQUENCY, false);
     ASSERT_EQ(5, results["hits"].size());
     ids = {"3", "12", "24", "22", "23"};
@@ -236,7 +236,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
-    results = collection->search("loox", search_fields, 1, 10, MAX_SCORE, false);
+    results = collection->search("loox", search_fields, {}, 1, 10, MAX_SCORE, false);
     ASSERT_EQ(5, results["hits"].size());
     ids = {"22", "23", "3", "12", "24"};
@@ -250,7 +250,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
 TEST_F(CollectionTest, TextContainingAnActualTypo) {
     // A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
-    nlohmann::json results = collection->search("ISX what", search_fields, 1, 4, FREQUENCY, false);
+    nlohmann::json results = collection->search("ISX what", search_fields, {}, 1, 4, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());
     std::vector<std::string> ids = {"19", "6", "21", "8"};
@@ -263,7 +263,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
     }
     // Record containing exact token match should appear first
-    results = collection->search("ISX", search_fields, 1, 10, FREQUENCY, false);
+    results = collection->search("ISX", search_fields, {}, 1, 10, FREQUENCY, false);
     ASSERT_EQ(8, results["hits"].size());
     ids = {"20", "19", "6", "3", "21", "4", "10", "8"};
@@ -277,7 +277,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
 }
 TEST_F(CollectionTest, PrefixSearching) {
-    nlohmann::json results = collection->search("ex", search_fields, 0, 10, FREQUENCY, true);
+    nlohmann::json results = collection->search("ex", search_fields, {}, 0, 10, FREQUENCY, true);
     ASSERT_EQ(2, results["hits"].size());
     std::vector<std::string> ids = {"12", "6"};
@@ -288,7 +288,7 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
-    results = collection->search("ex", search_fields, 0, 10, MAX_SCORE, true);
+    results = collection->search("ex", search_fields, {}, 0, 10, MAX_SCORE, true);
     ASSERT_EQ(2, results["hits"].size());
     ids = {"6", "12"};
@@ -322,7 +322,7 @@ TEST_F(CollectionTest, MultipleFields) {
     infile.close();
     search_fields = {"title", "starring"};
-    nlohmann::json results = coll_mul_fields->search("Will", search_fields, 0, 10, FREQUENCY, false);
+    nlohmann::json results = coll_mul_fields->search("Will", search_fields, {}, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());
     std::vector<std::string> ids = {"3", "2", "1", "0"};
@@ -337,7 +337,7 @@ TEST_F(CollectionTest, MultipleFields) {
     // when "starring" takes higher priority than "title"
     search_fields = {"starring", "title"};
-    results = coll_mul_fields->search("thomas", search_fields, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());
     ids = {"15", "14", "12", "13"};
@@ -350,11 +350,11 @@ TEST_F(CollectionTest, MultipleFields) {
     }
     search_fields = {"starring", "title", "cast"};
-    results = coll_mul_fields->search("ben affleck", search_fields, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("ben affleck", search_fields, {}, 0, 10, FREQUENCY, false);
     ASSERT_EQ(1, results["hits"].size());
     search_fields = {"cast"};
-    results = coll_mul_fields->search("chris", search_fields, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("chris", search_fields, {}, 0, 10, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());
     ids = {"6", "1", "7"};
@@ -366,7 +366,7 @@ TEST_F(CollectionTest, MultipleFields) {
     }
     search_fields = {"cast"};
-    results = coll_mul_fields->search("chris pine", search_fields, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("chris pine", search_fields, {}, 0, 10, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());
     ids = {"7", "6", "1"};
@@ -377,3 +377,82 @@ TEST_F(CollectionTest, MultipleFields) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 }
+/*
+TEST_F(CollectionTest, SearchNumericFields) {
+    Collection *coll_array_fields;
+    std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl");
+    std::vector<field> fields = {field("name", field_types::STRING), field("years", field_types::INT32_ARRAY),
+                                 field("timestamps", field_types::INT64_ARRAY)};
+    std::vector<std::string> rank_fields = {"age"};
+    coll_array_fields = collectionManager.get_collection("coll_array_fields");
+    if(coll_array_fields == nullptr) {
+        coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, rank_fields);
+    }
+    std::string json_line;
+    while (std::getline(infile, json_line)) {
+        coll_array_fields->add(json_line);
+    }
+    infile.close();
+    search_fields = {"years"};
+    nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, {}, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(4, results["hits"].size());
+    std::vector<std::string> ids = {"3", "2", "1", "0"};
+    for(size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+    search_fields = {"starring", "title"};
+    results = coll_array_fields->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(4, results["hits"].size());
+    ids = {"15", "14", "12", "13"};
+    for(size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+    search_fields = {"starring", "title", "cast"};
+    results = coll_array_fields->search("ben affleck", search_fields, {}, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(1, results["hits"].size());
+    search_fields = {"cast"};
+    results = coll_array_fields->search("chris", search_fields, {}, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(3, results["hits"].size());
+    ids = {"6", "1", "7"};
+    for(size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+    search_fields = {"cast"};
+    results = coll_array_fields->search("chris pine", search_fields, {}, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(3, results["hits"].size());
+    ids = {"7", "6", "1"};
+    for(size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+}
+*/