diff --git a/include/collection.h b/include/collection.h
index 00bfc38e..e84be1f7 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -89,9 +89,10 @@ public:
 
     std::string add(std::string json_str);
 
-    nlohmann::json search(std::string query, const std::vector<std::string> fields, const int num_typos,
-                          const size_t num_results, const token_ordering token_order = FREQUENCY,
-                          const bool prefix = false);
+    nlohmann::json search(std::string query, const std::vector<std::string> fields, const std::vector<filter> filters,
+                          const int num_typos, const size_t num_results, const token_ordering token_order = FREQUENCY,
+                          const bool prefix = false);
+
     void remove(std::string id);
 
     void score_results(Topster<100> &topster, const int & token_rank, const std::vector<art_leaf *> &query_suggestion,
diff --git a/include/field.h b/include/field.h
index 515bce34..2603cf00 100644
--- a/include/field.h
+++ b/include/field.h
@@ -23,4 +23,10 @@ struct field {
     field(std::string name, std::string type): name(name), type(type) {
 
     }
+};
+
+struct filter {
+    std::string field_name;
+    std::string value_json;
+    std::string compare_operator;
 };
\ No newline at end of file
diff --git a/include/store.h b/include/store.h
index 45efa254..05cb4489 100644
--- a/include/store.h
+++ b/include/store.h
@@ -55,7 +55,7 @@ public:
     }
 
     ~Store() {
-        delete db;
+        close();
     }
 
     bool insert(const std::string& key, const std::string& value) {
@@ -103,6 +103,11 @@ public:
         db->Merge(rocksdb::WriteOptions(), key, std::to_string(value));
     }
 
+    void close() {
+        delete db;
+        db = nullptr;
+    }
+
     void print_memory_usage() {
         std::string index_usage;
         db->GetProperty("rocksdb.estimate-table-readers-mem", &index_usage);
diff --git a/src/collection.cpp b/src/collection.cpp
index 06ae6465..8b31de71 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -231,11 +231,26 @@ void Collection::search_candidates(int & token_rank, std::vector<std::vector<ar
     }
 }
 
-nlohmann::json Collection::search(std::string query, const std::vector<std::string> fields,
-                                  const int num_typos, const size_t num_results,
-                                  const token_ordering token_order, const bool prefix) {
+nlohmann::json Collection::search(std::string query, const std::vector<std::string> fields, const std::vector<filter> filters,
+                                  const int num_typos, const size_t num_results,
+                                  const token_ordering token_order, const bool prefix) {
     size_t num_found = 0;
 
+    // process the filters first
+    /*for(const filter & a_filter: filters) {
+        if(index_map.count(a_filter.field_name) != 0) {
+            art_tree* t = index_map.at(a_filter.field_name);
+            nlohmann::json json_value = nlohmann::json::parse(a_filter.value_json);
+            if(json_value.is_number()) {
+                // do integer art search
+            } else if(json_value.is_string()) {
+
+            } else if(json_value.is_array()) {
+
+            }
+        }
+    }*/
+
     // Order of `fields` are used to rank results
     auto begin = std::chrono::high_resolution_clock::now();
     std::vector<std::pair<int, Topster<100>::KV>> field_order_kvs;
diff --git a/src/main/benchmark.cpp b/src/main/benchmark.cpp
index b329c418..447b551f 100644
--- a/src/main/benchmark.cpp
+++ b/src/main/benchmark.cpp
@@ -48,7 +48,7 @@ int main(int argc, char* argv[]) {
 
     while(counter < 3000) {
         auto i = counter % 5;
-        auto results = collection->search(queries[i], search_fields, 1, 100);
+        auto results = collection->search(queries[i], search_fields, {}, 1, 100);
         results_total += results.size();
         counter++;
     }
diff --git a/src/main/main.cpp b/src/main/main.cpp
index ed758d7a..c7c47004 100644
--- a/src/main/main.cpp
+++ b/src/main/main.cpp
@@ -43,7 +43,7 @@ int main(int argc, char* argv[]) {
 
     auto begin = std::chrono::high_resolution_clock::now();
     std::vector<std::string> search_fields = {"title"};
-    collection->search("the", search_fields, 1, 100);
+    collection->search("the", search_fields, {}, 1, 100);
     long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
     cout << "Time taken: " << timeMillis << "us" << endl;
     return 0;
diff --git a/src/main/server.cpp b/src/main/server.cpp
index bacae185..587bb86a 100644
--- a/src/main/server.cpp
+++ b/src/main/server.cpp
@@ -91,8 +91,8 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) {
 
     std::vector<std::string> search_fields = {"title"};
 
-    nlohmann::json result = collection->search(query_map["q"], search_fields,
-                                               std::stoi(query_map[NUM_TYPOS]), 100, token_order, false);
+    nlohmann::json result = collection->search(query_map["q"], search_fields, {}, std::stoi(query_map[NUM_TYPOS]),
+                                               100, token_order, false);
     std::string json_str = result.dump();
     //std::cout << "JSON:" << json_str << std::endl;
     struct rusage r_usage;
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index c7fade15..169acd3a 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -54,7 +54,7 @@ protected:
 };
 
 TEST_F(CollectionTest, ExactSearchShouldBeStable) {
-    nlohmann::json results = collection->search("the", search_fields, 0, 10);
+    nlohmann::json results = collection->search("the", search_fields, {}, 0, 10);
     ASSERT_EQ(7, results["hits"].size());
     ASSERT_EQ(7, results["found"].get<int>());
 
@@ -70,7 +70,7 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
 }
 
 TEST_F(CollectionTest, ExactPhraseSearch) {
-    nlohmann::json results = collection->search("rocket launch", search_fields, 0, 10);
+    nlohmann::json results = collection->search("rocket launch", search_fields, {}, 0, 10);
     ASSERT_EQ(5, results["hits"].size());
 
     /*
@@ -92,7 +92,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
     }
 
     // Check pagination
-    results = collection->search("rocket launch", search_fields, 0, 3);
+    results = collection->search("rocket launch", search_fields, {}, 0, 3);
     ASSERT_EQ(3, results["hits"].size());
     for(size_t i = 0; i < 3; i++) {
         nlohmann::json result = results["hits"].at(i);
@@ -104,7 +104,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
 
 TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     // Tokens that are not found in the index should be skipped
-    nlohmann::json results = collection->search("DoesNotExist from", search_fields, 0, 10);
+    nlohmann::json results = collection->search("DoesNotExist from", search_fields, {}, 0, 10);
     ASSERT_EQ(2, results["hits"].size());
 
     std::vector<std::string> ids = {"2", "17"};
@@ -117,7 +117,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }
 
     // with non-zero cost
-    results = collection->search("DoesNotExist from", search_fields, 1, 10);
+    results = collection->search("DoesNotExist from", search_fields, {}, 1, 10);
     ASSERT_EQ(2, results["hits"].size());
 
     for(size_t i = 0; i < results["hits"].size(); i++) {
@@ -128,7 +128,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }
 
     // with 2 indexed words
-    results = collection->search("from DoesNotExist insTruments", search_fields, 1, 10);
+    results = collection->search("from DoesNotExist insTruments", search_fields, {}, 1, 10);
     ASSERT_EQ(2, results["hits"].size());
 
     ids = {"2", "17"};
@@ -140,16 +140,16 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }
 
     results.clear();
-    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, 0, 10);
+    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 0, 10);
     ASSERT_EQ(0, results["hits"].size());
 
     results.clear();
-    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, 2, 10);
+    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, {}, 2, 10);
     ASSERT_EQ(0, results["hits"].size());
 }
 
 TEST_F(CollectionTest, PartialPhraseSearch) {
-    nlohmann::json results = collection->search("rocket research", search_fields, 0, 10);
+    nlohmann::json results = collection->search("rocket research", search_fields, {}, 0, 10);
     ASSERT_EQ(4, results["hits"].size());
 
     std::vector<std::string> ids = {"1", "8", "16", "17"};
@@ -163,7 +163,7 @@ TEST_F(CollectionTest, PartialPhraseSearch) {
 }
 
 TEST_F(CollectionTest, QueryWithTypo) {
-    nlohmann::json results = collection->search("kind biologcal", search_fields, 2, 3);
+    nlohmann::json results = collection->search("kind biologcal", search_fields, {}, 2, 3);
     ASSERT_EQ(3, results["hits"].size());
 
     std::vector<std::string> ids = {"19", "20", "21"};
@@ -176,7 +176,7 @@ TEST_F(CollectionTest, QueryWithTypo) {
     }
 
     results.clear();
-    results = collection->search("fer thx", search_fields, 1, 3);
+    results = collection->search("fer thx", search_fields, {}, 1, 3);
     ids = {"1", "10", "13"};
 
     ASSERT_EQ(3, results["hits"].size());
@@ -190,7 +190,7 @@ TEST_F(CollectionTest, QueryWithTypo) {
 }
 
 TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
-    nlohmann::json results = collection->search("loox", search_fields, 1, 2, MAX_SCORE, false);
+    nlohmann::json results = collection->search("loox", search_fields, {}, 1, 2, MAX_SCORE, false);
     ASSERT_EQ(2, results["hits"].size());
 
     std::vector<std::string> ids = {"22", "23"};
@@ -201,7 +201,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 
-    results = collection->search("loox", search_fields, 1, 3, FREQUENCY, false);
+    results = collection->search("loox", search_fields, {}, 1, 3, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());
 
     ids = {"3", "12", "24"};
@@ -213,19 +213,19 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
     }
 
     // Check pagination
-    results = collection->search("loox", search_fields, 1, 1, FREQUENCY, false);
+    results = collection->search("loox", search_fields, {}, 1, 1, FREQUENCY, false);
     ASSERT_EQ(3, results["found"].get<int>());
     ASSERT_EQ(1, results["hits"].size());
 
     std::string solo_id = results["hits"].at(0)["id"];
     ASSERT_STREQ("3", solo_id.c_str());
 
-    results = collection->search("loox", search_fields, 1, 2, FREQUENCY, false);
+    results = collection->search("loox", search_fields, {}, 1, 2, FREQUENCY, false);
     ASSERT_EQ(3, results["found"].get<int>());
     ASSERT_EQ(2, results["hits"].size());
 
     // Check total ordering
-    results = collection->search("loox", search_fields, 1, 10, FREQUENCY, false);
+    results = collection->search("loox", search_fields, {}, 1, 10, FREQUENCY, false);
     ASSERT_EQ(5, results["hits"].size());
 
     ids = {"3", "12", "24", "22", "23"};
@@ -236,7 +236,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 
-    results = collection->search("loox", search_fields, 1, 10, MAX_SCORE, false);
+    results = collection->search("loox", search_fields, {}, 1, 10, MAX_SCORE, false);
     ASSERT_EQ(5, results["hits"].size());
 
     ids = {"22", "23", "3", "12", "24"};
@@ -250,7 +250,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
 
 TEST_F(CollectionTest, TextContainingAnActualTypo) {
     // A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
-    nlohmann::json results = collection->search("ISX what", search_fields, 1, 4, FREQUENCY, false);
+    nlohmann::json results = collection->search("ISX what", search_fields, {}, 1, 4, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());
 
     std::vector<std::string> ids = {"19", "6", "21", "8"};
@@ -263,7 +263,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
     }
 
     // Record containing exact token match should appear first
-    results = collection->search("ISX", search_fields, 1, 10, FREQUENCY, false);
+    results = collection->search("ISX", search_fields, {}, 1, 10, FREQUENCY, false);
     ASSERT_EQ(8, results["hits"].size());
 
     ids = {"20", "19", "6", "3", "21", "4", "10", "8"};
@@ -277,7 +277,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
 }
 
 TEST_F(CollectionTest, PrefixSearching) {
-    nlohmann::json results = collection->search("ex", search_fields, 0, 10, FREQUENCY, true);
+    nlohmann::json results = collection->search("ex", search_fields, {}, 0, 10, FREQUENCY, true);
     ASSERT_EQ(2, results["hits"].size());
 
     std::vector<std::string> ids = {"12", "6"};
@@ -288,7 +288,7 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
    }
 
-    results = collection->search("ex", search_fields, 0, 10, MAX_SCORE, true);
+    results = collection->search("ex", search_fields, {}, 0, 10, MAX_SCORE, true);
     ASSERT_EQ(2, results["hits"].size());
 
     ids = {"6", "12"};
@@ -322,7 +322,7 @@ TEST_F(CollectionTest, MultipleFields) {
     infile.close();
 
     search_fields = {"title", "starring"};
-    nlohmann::json results = coll_mul_fields->search("Will", search_fields, 0, 10, FREQUENCY, false);
+    nlohmann::json results = coll_mul_fields->search("Will", search_fields, {}, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());
 
     std::vector<std::string> ids = {"3", "2", "1", "0"};
@@ -337,7 +337,7 @@ TEST_F(CollectionTest, MultipleFields) {
 
     // when "starring" takes higher priority than "title"
     search_fields = {"starring", "title"};
-    results = coll_mul_fields->search("thomas", search_fields, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
     ASSERT_EQ(4, results["hits"].size());
 
     ids = {"15", "14", "12", "13"};
@@ -350,11 +350,11 @@ TEST_F(CollectionTest, MultipleFields) {
     }
 
     search_fields = {"starring", "title", "cast"};
-    results = coll_mul_fields->search("ben affleck", search_fields, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("ben affleck", search_fields, {}, 0, 10, FREQUENCY, false);
     ASSERT_EQ(1, results["hits"].size());
 
     search_fields = {"cast"};
-    results = coll_mul_fields->search("chris", search_fields, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("chris", search_fields, {}, 0, 10, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());
 
     ids = {"6", "1", "7"};
@@ -366,7 +366,7 @@ TEST_F(CollectionTest, MultipleFields) {
     }
 
     search_fields = {"cast"};
-    results = coll_mul_fields->search("chris pine", search_fields, 0, 10, FREQUENCY, false);
+    results = coll_mul_fields->search("chris pine", search_fields, {}, 0, 10, FREQUENCY, false);
     ASSERT_EQ(3, results["hits"].size());
 
     ids = {"7", "6", "1"};
@@ -377,3 +377,82 @@ TEST_F(CollectionTest, MultipleFields) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 }
+
+/*
+TEST_F(CollectionTest, SearchNumericFields) {
+    Collection *coll_array_fields;
+
+    std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl");
+    std::vector<field> fields = {field("name", field_types::STRING), field("years", field_types::INT32_ARRAY),
+                                 field("timestamps", field_types::INT64_ARRAY)};
+    std::vector<std::string> rank_fields = {"age"};
+
+    coll_array_fields = collectionManager.get_collection("coll_array_fields");
+    if(coll_array_fields == nullptr) {
+        coll_array_fields = collectionManager.create_collection("coll_array_fields", fields, rank_fields);
+    }
+
+    std::string json_line;
+
+    while (std::getline(infile, json_line)) {
+        coll_array_fields->add(json_line);
+    }
+
+    infile.close();
+
+    search_fields = {"years"};
+    nlohmann::json results = coll_array_fields->search("Jeremy", search_fields, {}, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(4, results["hits"].size());
+
+    std::vector<std::string> ids = {"3", "2", "1", "0"};
+
+    for(size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+
+
+    search_fields = {"starring", "title"};
+    results = coll_array_fields->search("thomas", search_fields, {}, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(4, results["hits"].size());
+
+    ids = {"15", "14", "12", "13"};
+
+    for(size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+
+    search_fields = {"starring", "title", "cast"};
+    results = coll_array_fields->search("ben affleck", search_fields, {}, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(1, results["hits"].size());
+
+    search_fields = {"cast"};
+    results = coll_array_fields->search("chris", search_fields, {}, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(3, results["hits"].size());
+
+    ids = {"6", "1", "7"};
+    for(size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+
+    search_fields = {"cast"};
+    results = coll_array_fields->search("chris pine", search_fields, {}, 0, 10, FREQUENCY, false);
+    ASSERT_EQ(3, results["hits"].size());
+
+    ids = {"7", "6", "1"};
+    for(size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+}
+*/
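Every existing call site in this change passes an empty `{}` for the new `filters` argument, and the filter processing inside `Collection::search()` is still commented out. The following is only a minimal sketch of how a caller might eventually construct and pass filters; the field name `points`, the value `100`, and the `>` operator are illustrative assumptions and do not come from this diff.

```cpp
// Hypothetical call site for the extended search() signature added above.
// "points", "100" and ">" are assumed example values; the patch itself does
// not yet act on filters inside Collection::search().
#include <string>
#include <vector>
#include "collection.h"   // declares Collection, filter (via field.h) and nlohmann::json

nlohmann::json search_with_filter_sketch(Collection* collection) {
    std::vector<std::string> search_fields = {"title"};

    // filter is a plain aggregate: field_name, value_json, compare_operator
    std::vector<filter> filters = {
        filter{"points", "100", ">"}
    };

    // query, fields, filters, num_typos, num_results
    // (token_order and prefix keep their FREQUENCY / false defaults)
    return collection->search("rocket", search_fields, filters, 1, 10);
}
```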