#include #include #include #include #include #include #include #include #include #include "collection.h" #include "embedder_manager.h" #include "http_client.h" class CollectionTest : public ::testing::Test { protected: Collection *collection; std::vector query_fields; Store *store; CollectionManager & collectionManager = CollectionManager::get_instance(); std::atomic quit = false; std::vector sort_fields; // used for generating random text std::vector words; void setupCollection() { std::string state_dir_path = "/tmp/typesense_test/collection"; LOG(INFO) << "Truncating and creating: " << state_dir_path; system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str()); system("mkdir -p /tmp/typesense_test/models"); store = new Store(state_dir_path); collectionManager.init(store, 1.0, "auth_key", quit); collectionManager.load(8, 1000); std::ifstream infile(std::string(ROOT_DIR)+"test/documents.jsonl"); std::vector search_fields = { field("title", field_types::STRING, false), field("points", field_types::INT32, false) }; query_fields = {"title"}; sort_fields = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "DESC") }; collection = collectionManager.get_collection("collection").get(); if(collection == nullptr) { collection = collectionManager.create_collection("collection", 4, search_fields, "points").get(); } std::string json_line; // dummy record for record id 0: to make the test record IDs to match with line numbers json_line = "{\"points\":10,\"title\":\"z\"}"; collection->add(json_line); while (std::getline(infile, json_line)) { collection->add(json_line); } infile.close(); std::ifstream words_file(std::string(ROOT_DIR)+"test/resources/common100_english.txt"); std::stringstream strstream; strstream << words_file.rdbuf(); words_file.close(); StringUtils::split(strstream.str(), words, "\n"); } virtual void SetUp() { setupCollection(); } virtual void TearDown() { collectionManager.drop_collection("collection"); collectionManager.dispose(); delete store; } std::string get_text(size_t num_words) { time_t t; srand((unsigned) time(&t)); std::vector strs; for(size_t i = 0 ; i < num_words ; i++ ) { int word_index = rand() % words.size(); strs.push_back(words[word_index]); } return StringUtils::join(strs, " "); } }; TEST_F(CollectionTest, VerifyCountOfDocuments) { // we have 1 dummy record to match the line numbers on the fixtures file with sequence numbers ASSERT_EQ(24+1, collection->get_num_documents()); // check default no specific dirty values option is sent for a collection that has explicit schema std::string empty_dirty_values; ASSERT_EQ(DIRTY_VALUES::REJECT, collection->parse_dirty_values_option(empty_dirty_values)); } TEST_F(CollectionTest, RetrieveADocumentById) { Option doc_option = collection->get("1"); ASSERT_TRUE(doc_option.ok()); nlohmann::json doc = doc_option.get(); std::string id = doc["id"]; doc_option = collection->get("foo"); ASSERT_TRUE(doc_option.ok()); doc = doc_option.get(); id = doc["id"]; ASSERT_STREQ("foo", id.c_str()); doc_option = collection->get("baz"); ASSERT_FALSE(doc_option.ok()); } TEST_F(CollectionTest, ExactSearchShouldBeStable) { std::vector facets; nlohmann::json results = collection->search("the", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(7, results["hits"].size()); ASSERT_EQ(7, results["found"].get()); ASSERT_STREQ("collection", results["request_params"]["collection_name"].get().c_str()); ASSERT_STREQ("the", results["request_params"]["q"].get().c_str()); ASSERT_EQ(10, results["request_params"]["per_page"].get()); // For two documents of the same score, the larger doc_id appears first std::vector ids = {"1", "6", "foo", "13", "10", "8", "16"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); std::string result_id = result["document"]["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } // check ASC sorting std::vector sort_fields_asc = { sort_by("points", "ASC") }; results = collection->search("the", query_fields, "", facets, sort_fields_asc, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(7, results["hits"].size()); ASSERT_EQ(7, results["found"].get()); ids = {"16", "13", "10", "8", "6", "foo", "1"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); std::string result_id = result["document"]["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } // when a query does not return results, hits and found fields should still exist in response results = collection->search("zxsadqewsad", query_fields, "", facets, sort_fields_asc, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(0, results["hits"].size()); ASSERT_EQ(0, results["found"].get()); } TEST_F(CollectionTest, MultiTokenSearch) { std::vector facets; nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(5, results["found"].get()); /* Sort by (match, diff, score) 8: score: 12, diff: 0 1: score: 15, diff: 4 17: score: 8, diff: 4 16: score: 10, diff: 5 13: score: 12, (single word match) */ std::vector ids = {"8", "1", "17", "16", "13"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); std::string result_id = result["document"]["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } ASSERT_EQ(results["hits"][0]["highlights"].size(), (unsigned long) 1); ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get().c_str(), "title"); ASSERT_STREQ(results["hits"][0]["highlights"][0]["snippet"].get().c_str(), "What is the power, requirement of a rocket launch these days?"); // Check ASC sort order std::vector sort_fields_asc = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "ASC") }; results = collection->search("rocket launch", query_fields, "", facets, sort_fields_asc, {0}, 10, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(5, results["found"].get()); ids = {"8", "17", "1", "16", "13"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); std::string result_id = result["document"]["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } // Check pagination results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 3, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(5, results["found"].get()); ASSERT_EQ(3, results["request_params"]["per_page"].get()); ids = {"8", "1", "17"}; for(size_t i = 0; i < 3; i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); std::string result_id = result["document"]["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } } TEST_F(CollectionTest, SearchWithExcludedTokens) { std::vector facets; nlohmann::json results = collection->search("how -propellants -are", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ASSERT_EQ(2, results["hits"].size()); ASSERT_EQ(2, results["found"].get()); std::vector ids = {"9", "17"}; for (size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); std::string result_id = result["document"]["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } results = collection->search("-rocket", query_fields, "", facets, sort_fields, {0}, 50).get(); ASSERT_EQ(21, results["found"].get()); ASSERT_EQ(21, results["hits"].size()); results = collection->search("-rocket -cryovolcanism", query_fields, "", facets, sort_fields, {0}, 50).get(); ASSERT_EQ(20, results["found"].get()); } TEST_F(CollectionTest, SkipUnindexedTokensDuringMultiTokenSearch) { // Tokens that are not found in the index should be skipped std::vector facets; nlohmann::json results = collection->search("DoesNotExist from", query_fields, "", facets, sort_fields, {0}, 10).get(); ASSERT_EQ(2, results["hits"].size()); std::vector ids = {"2", "17"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); std::string result_id = result["document"]["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } // with non-zero cost results = collection->search("DoesNotExist from", query_fields, "", facets, sort_fields, {1}, 10).get(); ASSERT_EQ(2, results["hits"].size()); for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); std::string result_id = result["document"]["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } // with 2 indexed words results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, sort_fields, {1}, 10).get(); ASSERT_EQ(2, results["hits"].size()); ids = {"2", "17"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); std::string result_id = result["document"]["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } // exhaustive search should give same results results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 1, {}, {}, {}, 0, "", "", {}, 1000, true, false, true, "", true).get(); ASSERT_EQ(2, results["hits"].size()); ids = {"2", "17"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); std::string result_id = result["document"]["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } // should not try to drop tokens to expand query results.clear(); results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ASSERT_EQ(9, results["hits"].size()); results.clear(); results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get(); ASSERT_EQ(3, results["hits"].size()); ids = {"8", "16", "10"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string id = ids.at(i); std::string result_id = result["document"]["id"]; ASSERT_STREQ(id.c_str(), result_id.c_str()); } results.clear(); results = collection->search("the a insurance", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get(); ASSERT_EQ(0, results["hits"].size()); // with no indexed word results.clear(); results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, sort_fields, {0}, 10).get(); ASSERT_EQ(0, results["hits"].size()); results.clear(); results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, sort_fields, {2}, 10).get(); ASSERT_EQ(0, results["hits"].size()); } TEST_F(CollectionTest, PartialMultiTokenSearch) { std::vector facets; nlohmann::json results = collection->search("rocket research", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10).get(); ASSERT_EQ(6, results["hits"].size()); std::vector ids = {"19", "1", "10", "8", "16", "17"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } } TEST_F(CollectionTest, QueryWithTypo) { std::vector facets; nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, sort_fields, {2}, 3, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ASSERT_EQ(3, results["hits"].size()); std::vector ids = {"19", "3", "20"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } results.clear(); results = collection->search("lauxnch rcket", query_fields, "", facets, sort_fields, {1}, 3, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ids = {"8", "1", "17"}; ASSERT_EQ(3, results["hits"].size()); for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } } TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) { std::vector facets; nlohmann::json results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 2, 1, MAX_SCORE, {false}).get(); ASSERT_EQ(2, results["hits"].size()); std::vector ids = {"22", "3"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 3, 1, FREQUENCY, {false}).get(); ASSERT_EQ(3, results["hits"].size()); ids = {"22", "3", "12"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } // Check pagination results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 1, 1, FREQUENCY, {false}).get(); ASSERT_EQ(5, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); std::string solo_id = results["hits"].at(0)["document"]["id"]; ASSERT_STREQ("22", solo_id.c_str()); results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 2, 1, FREQUENCY, {false}).get(); ASSERT_EQ(5, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); // Check total ordering results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(5, results["hits"].size()); ids = {"22", "3", "12", "23", "24"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 10, 1, MAX_SCORE, {false}).get(); ASSERT_EQ(5, results["hits"].size()); ids = {"22", "3", "12", "23", "24"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } } TEST_F(CollectionTest, TextContainingAnActualTypo) { // A line contains "ISSX" but not "what" - need to ensure that correction to "ISSS what" happens std::vector facets; nlohmann::json results = collection->search("ISSX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false}, 20, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 20).get(); ASSERT_EQ(4, results["hits"].size()); ASSERT_EQ(11, results["found"].get()); std::vector ids = {"19", "6", "21", "22"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } // Record containing exact token match should appear first results = collection->search("ISSX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(5, results["found"].get()); ids = {"20", "19", "6", "3", "21"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } } TEST_F(CollectionTest, Pagination) { nlohmann::json results = collection->search("the", query_fields, "", {}, sort_fields, {0}, 3, 1, FREQUENCY, {false}).get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(7, results["found"].get()); std::vector ids = {"1", "6", "foo"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } results = collection->search("the", query_fields, "", {}, sort_fields, {0}, 3, 2, FREQUENCY, {false}).get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(7, results["found"].get()); ids = {"13", "10", "8"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } results = collection->search("the", query_fields, "", {}, sort_fields, {0}, 3, 3, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); ASSERT_EQ(7, results["found"].get()); ids = {"16"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } } TEST_F(CollectionTest, WildcardQuery) { nlohmann::json results = collection->search("*", query_fields, "points:>0", {}, sort_fields, {0}, 3, 1, FREQUENCY, {false}).get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); // when no filter is specified, fall back on default sorting field based catch-all filter Option results_op = collection->search("*", query_fields, "", {}, sort_fields, {0}, 3, 1, FREQUENCY, {false}); ASSERT_TRUE(results_op.ok()); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); // wildcard query with no filters and ASC sort std::vector sort_fields = { sort_by("points", "ASC") }; results = collection->search("*", query_fields, "", {}, sort_fields, {0}, 3, 1, FREQUENCY, {false}).get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); std::vector ids = {"21", "24", "17"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } // wildcard query should not require a search field results_op = collection->search("*", {}, "", {}, sort_fields, {0}, 3, 1, FREQUENCY, {false}); ASSERT_TRUE(results_op.ok()); results = results_op.get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); // non-wildcard query should require a search field results_op = collection->search("the", {}, "", {}, sort_fields, {0}, 3, 1, FREQUENCY, {false}); ASSERT_FALSE(results_op.ok()); ASSERT_STREQ("No search fields specified for the query.", results_op.error().c_str()); Collection* empty_coll; std::vector fields = {field("title", field_types::STRING, false)}; empty_coll = collectionManager.get_collection("empty_coll").get(); if(empty_coll == nullptr) { empty_coll = collectionManager.create_collection("empty_coll", 1, fields).get(); } results = empty_coll->search("*", {}, "title:!= foo", {}, {}, {0}, 3, 1).get(); ASSERT_EQ(0, results["hits"].size()); ASSERT_EQ(0, results["found"]); } TEST_F(CollectionTest, PrefixSearching) { std::vector facets; nlohmann::json results = collection->search("ex", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get(); ASSERT_EQ(2, results["hits"].size()); std::vector ids = {"6", "12"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } results = collection->search("ex", query_fields, "", facets, sort_fields, {0}, 10, 1, MAX_SCORE, {true}).get(); ASSERT_EQ(2, results["hits"].size()); ids = {"6", "12"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } results = collection->search("what ex", query_fields, "", facets, sort_fields, {0}, 10, 1, MAX_SCORE, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ASSERT_EQ(9, results["hits"].size()); ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } // restrict to only 2 results and differentiate between MAX_SCORE and FREQUENCY results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, MAX_SCORE, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ASSERT_EQ(2, results["hits"].size()); ids = {"19", "22"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 5, "", 10).get(); ASSERT_EQ(2, results["hits"].size()); ids = {"1", "2"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } // only the last token in the query should be used for prefix search - so, "math" should not match "mathematics" results = collection->search("math fx", query_fields, "", facets, sort_fields, {0}, 1, 1, FREQUENCY, {true}, 0).get(); ASSERT_EQ(0, results["hits"].size()); // single and double char prefixes should set a ceiling on the num_typos possible results = collection->search("x", query_fields, "", facets, sort_fields, {2}, 2, 1, FREQUENCY, {true}).get(); ASSERT_EQ(0, results["hits"].size()); // prefix with a typo results = collection->search("late propx", query_fields, "", facets, sort_fields, {2}, 1, 1, FREQUENCY, {true}).get(); ASSERT_EQ(1, results["hits"].size()); ASSERT_EQ("16", results["hits"].at(0)["document"]["id"]); } TEST_F(CollectionTest, TypoTokensThreshold) { // Typo correction should happen only based on the `typo_tokens_threshold` value auto results = collection->search("redundant", {"title"}, "", {}, sort_fields, {2}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "", 0).get(); ASSERT_EQ(1, results["hits"].size()); ASSERT_EQ(1, results["found"].get()); results = collection->search("redundant", {"title"}, "", {}, sort_fields, {2}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "", 10).get(); ASSERT_EQ(2, results["hits"].size()); ASSERT_EQ(2, results["found"].get()); } TEST_F(CollectionTest, MultiOccurrenceString) { Collection *coll_multi_string; std::vector fields = { field("title", field_types::STRING, false), field("points", field_types::INT32, false) }; coll_multi_string = collectionManager.get_collection("coll_multi_string").get(); if (coll_multi_string == nullptr) { coll_multi_string = collectionManager.create_collection("coll_multi_string", 4, fields, "points").get(); } nlohmann::json document; document["title"] = "The brown fox was the tallest of the lot and the quickest of the trot."; document["points"] = 100; coll_multi_string->add(document.dump()).get(); query_fields = {"title"}; nlohmann::json results = coll_multi_string->search("the", query_fields, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get(); ASSERT_EQ(1, results["hits"].size()); collectionManager.drop_collection("coll_multi_string"); } TEST_F(CollectionTest, ArrayStringFieldHighlight) { Collection *coll_array_text; std::ifstream infile(std::string(ROOT_DIR)+"test/array_text_documents.jsonl"); std::vector fields = { field("title", field_types::STRING, false), field("tags", field_types::STRING_ARRAY, false), field("points", field_types::INT32, false) }; coll_array_text = collectionManager.get_collection("coll_array_text").get(); if (coll_array_text == nullptr) { coll_array_text = collectionManager.create_collection("coll_array_text", 4, fields, "points").get(); } std::string json_line; while (std::getline(infile, json_line)) { coll_array_text->add(json_line); } infile.close(); query_fields = {"tags"}; std::vector facets; nlohmann::json results = coll_array_text->search("truth about", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get(); ASSERT_EQ(1, results["hits"].size()); std::vector ids = {"0"}; for (size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } ASSERT_EQ(results["hits"][0]["highlights"].size(), 1); ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get().c_str(), "tags"); // an array's snippets must be sorted on match score, if match score is same, priority to be given to lower indices ASSERT_EQ(3, results["hits"][0]["highlights"][0]["snippets"].size()); ASSERT_STREQ("truth about", results["hits"][0]["highlights"][0]["snippets"][0].get().c_str()); ASSERT_STREQ("the truth", results["hits"][0]["highlights"][0]["snippets"][1].get().c_str()); ASSERT_STREQ("about forever", results["hits"][0]["highlights"][0]["snippets"][2].get().c_str()); ASSERT_EQ(3, results["hits"][0]["highlights"][0]["indices"].size()); ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"][0]); ASSERT_EQ(0, results["hits"][0]["highlights"][0]["indices"][1]); ASSERT_EQ(1, results["hits"][0]["highlights"][0]["indices"][2]); results = coll_array_text->search("forever truth", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get(); ASSERT_EQ(1, results["hits"].size()); ids = {"0"}; for (size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get().c_str(), "tags"); ASSERT_EQ(3, results["hits"][0]["highlights"][0]["snippets"].size()); ASSERT_STREQ("the truth", results["hits"][0]["highlights"][0]["snippets"][0].get().c_str()); ASSERT_STREQ("about forever", results["hits"][0]["highlights"][0]["snippets"][1].get().c_str()); ASSERT_STREQ("truth about", results["hits"][0]["highlights"][0]["snippets"][2].get().c_str()); ASSERT_EQ(3, results["hits"][0]["highlights"][0]["indices"].size()); ASSERT_EQ(0, results["hits"][0]["highlights"][0]["indices"][0]); ASSERT_EQ(1, results["hits"][0]["highlights"][0]["indices"][1]); ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"][2]); results = coll_array_text->search("truth", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get(); ASSERT_EQ(2, results["hits"].size()); ids = {"1", "0"}; for (size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } results = coll_array_text->search("asdadasd", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get(); ASSERT_EQ(0, results["hits"].size()); query_fields = {"title", "tags"}; results = coll_array_text->search("truth", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get(); ASSERT_EQ(2, results["hits"].size()); ASSERT_EQ(2, results["hits"][0]["highlights"].size()); ids = {"1", "0"}; for (size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } ASSERT_EQ(4, results["hits"][0]["highlights"][0].size()); ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get().c_str(), "tags"); ASSERT_EQ(2, results["hits"][0]["highlights"][0]["snippets"].size()); ASSERT_STREQ("truth", results["hits"][0]["highlights"][0]["snippets"][0].get().c_str()); ASSERT_STREQ("plain truth", results["hits"][0]["highlights"][0]["snippets"][1].get().c_str()); ASSERT_EQ(2, results["hits"][0]["highlights"][0]["matched_tokens"].size()); ASSERT_STREQ("truth", results["hits"][0]["highlights"][0]["matched_tokens"][0][0].get().c_str()); ASSERT_STREQ("truth", results["hits"][0]["highlights"][0]["matched_tokens"][1][0].get().c_str()); ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"].size()); ASSERT_EQ(1, results["hits"][0]["highlights"][0]["indices"][0]); ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"][1]); ASSERT_EQ(3, results["hits"][0]["highlights"][1].size()); ASSERT_STREQ("title", results["hits"][0]["highlights"][1]["field"].get().c_str()); ASSERT_STREQ("Plain Truth", results["hits"][0]["highlights"][1]["snippet"].get().c_str()); ASSERT_EQ(1, results["hits"][0]["highlights"][1]["matched_tokens"].size()); ASSERT_STREQ("Truth", results["hits"][0]["highlights"][1]["matched_tokens"][0].get().c_str()); ASSERT_EQ(3, results["hits"][1]["highlights"][0].size()); ASSERT_STREQ("title", results["hits"][1]["highlights"][0]["field"].get().c_str()); ASSERT_STREQ("The Truth About Forever", results["hits"][1]["highlights"][0]["snippet"].get().c_str()); ASSERT_EQ(1, results["hits"][1]["highlights"][0]["matched_tokens"].size()); ASSERT_STREQ("Truth", results["hits"][1]["highlights"][0]["matched_tokens"][0].get().c_str()); ASSERT_EQ(4, results["hits"][1]["highlights"][1].size()); ASSERT_STREQ(results["hits"][1]["highlights"][1]["field"].get().c_str(), "tags"); ASSERT_EQ(2, results["hits"][1]["highlights"][1]["snippets"].size()); ASSERT_STREQ("the truth", results["hits"][1]["highlights"][1]["snippets"][0].get().c_str()); ASSERT_STREQ("truth about", results["hits"][1]["highlights"][1]["snippets"][1].get().c_str()); ASSERT_EQ(2, results["hits"][1]["highlights"][1]["matched_tokens"].size()); ASSERT_STREQ("truth", results["hits"][1]["highlights"][1]["matched_tokens"][0][0].get().c_str()); ASSERT_STREQ("truth", results["hits"][1]["highlights"][1]["matched_tokens"][1][0].get().c_str()); ASSERT_EQ(2, results["hits"][1]["highlights"][1]["indices"].size()); ASSERT_EQ(0, results["hits"][1]["highlights"][1]["indices"][0]); ASSERT_EQ(2, results["hits"][1]["highlights"][1]["indices"][1]); // highlight fields must be ordered based on match score results = coll_array_text->search("amazing movie", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get(); ASSERT_EQ(1, results["hits"].size()); ASSERT_EQ(2, results["hits"][0]["highlights"].size()); ASSERT_EQ(4, results["hits"][0]["highlights"][0].size()); ASSERT_STREQ("tags", results["hits"][0]["highlights"][0]["field"].get().c_str()); ASSERT_STREQ("amazing movie", results["hits"][0]["highlights"][0]["snippets"][0].get().c_str()); ASSERT_EQ(1, results["hits"][0]["highlights"][0]["indices"].size()); ASSERT_EQ(0, results["hits"][0]["highlights"][0]["indices"][0]); ASSERT_EQ(1, results["hits"][0]["highlights"][0]["matched_tokens"].size()); ASSERT_STREQ("amazing", results["hits"][0]["highlights"][0]["matched_tokens"][0][0].get().c_str()); ASSERT_EQ(3, results["hits"][0]["highlights"][1].size()); ASSERT_STREQ(results["hits"][0]["highlights"][1]["field"].get().c_str(), "title"); ASSERT_STREQ(results["hits"][0]["highlights"][1]["snippet"].get().c_str(), "Amazing Spiderman is amazing"); // should highlight duplicating tokens ASSERT_EQ(2, results["hits"][0]["highlights"][1]["matched_tokens"].size()); ASSERT_STREQ("Amazing", results["hits"][0]["highlights"][1]["matched_tokens"][0].get().c_str()); ASSERT_STREQ("amazing", results["hits"][0]["highlights"][1]["matched_tokens"][1].get().c_str()); // when query tokens are not found in an array field they should be ignored results = coll_array_text->search("winds", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get(); ASSERT_EQ(1, results["hits"].size()); ASSERT_EQ(1, results["hits"][0]["highlights"].size()); collectionManager.drop_collection("coll_array_text"); } TEST_F(CollectionTest, MultipleFields) { Collection *coll_mul_fields; std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl"); std::vector fields = { field("title", field_types::STRING, false), field("starring", field_types::STRING, false), field("starring_facet", field_types::STRING, true), field("cast", field_types::STRING_ARRAY, false), field("points", field_types::INT32, false) }; coll_mul_fields = collectionManager.get_collection("coll_mul_fields").get(); if(coll_mul_fields == nullptr) { coll_mul_fields = collectionManager.create_collection("coll_mul_fields", 4, fields, "points").get(); } std::string json_line; while (std::getline(infile, json_line)) { coll_mul_fields->add(json_line); } infile.close(); query_fields = {"title", "starring"}; std::vector facets; nlohmann::json results = coll_mul_fields->search("Will", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(4, results["hits"].size()); std::vector ids = {"3", "2", "1", "0"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } // when "starring" takes higher priority than "title" query_fields = {"starring", "title"}; results = coll_mul_fields->search("thomas", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {2, 1}).get(); ASSERT_EQ(4, results["hits"].size()); ids = {"15", "12", "13", "14"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } query_fields = {"starring", "title", "cast"}; results = coll_mul_fields->search("ben affleck", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); query_fields = {"cast"}; results = coll_mul_fields->search("chris", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(3, results["hits"].size()); ids = {"6", "1", "7"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } query_fields = {"cast"}; results = coll_mul_fields->search("chris pine", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); ids = {"7"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } // filtering on unfaceted multi-valued string field query_fields = {"title"}; results = coll_mul_fields->search("captain", query_fields, "cast: chris", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); ids = {"6"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } // when a token exists in multiple fields of the same document, document and facet should be returned only once query_fields = {"starring", "title", "cast"}; facets = {"starring_facet"}; results = coll_mul_fields->search("myers", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); ids = {"17"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } ASSERT_EQ(1, results["facet_counts"].size()); ASSERT_STREQ("starring_facet", results["facet_counts"][0]["field_name"].get().c_str()); size_t facet_count = results["facet_counts"][0]["counts"][0]["count"]; ASSERT_EQ(1, facet_count); collectionManager.drop_collection("coll_mul_fields"); } TEST_F(CollectionTest, KeywordQueryReturnsResultsBasedOnPerPageParam) { Collection *coll_mul_fields; std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl"); std::vector fields = { field("title", field_types::STRING, false), field("starring", field_types::STRING, false), field("starring_facet", field_types::STRING, true), field("cast", field_types::STRING_ARRAY, false), field("points", field_types::INT32, false) }; coll_mul_fields = collectionManager.get_collection("coll_mul_fields").get(); if(coll_mul_fields == nullptr) { coll_mul_fields = collectionManager.create_collection("coll_mul_fields", 4, fields, "points").get(); } std::string json_line; while (std::getline(infile, json_line)) { coll_mul_fields->add(json_line); } infile.close(); query_fields = {"title", "starring"}; std::vector facets; spp::sparse_hash_set empty; nlohmann::json results = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 3, 1, FREQUENCY, {true}, 1000, empty, empty, 10).get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(6, results["found"].get()); // cannot fetch more than in-built limit of 250 auto res_op = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 251, 1, FREQUENCY, {true}, 1000, empty, empty, 10); ASSERT_FALSE(res_op.ok()); ASSERT_EQ(422, res_op.code()); ASSERT_STREQ("Only upto 250 hits can be fetched per page.", res_op.error().c_str()); // when page number is zero, use the first page results = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 3, 0, FREQUENCY, {true}, 1000, empty, empty, 10).get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(6, results["found"].get()); // do pagination results = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 3, 1, FREQUENCY, {true}, 1000, empty, empty, 10).get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(6, results["found"].get()); results = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 3, 2, FREQUENCY, {true}, 1000, empty, empty, 10).get(); ASSERT_EQ(3, results["hits"].size()); ASSERT_EQ(6, results["found"].get()); collectionManager.drop_collection("coll_mul_fields"); } std::vector import_res_to_json(const std::vector& imported_results) { std::vector out; for(const auto& imported_result: imported_results) { out.emplace_back(nlohmann::json::parse(imported_result)); } return out; } TEST_F(CollectionTest, ImportDocumentsUpsert) { Collection *coll_mul_fields; std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl"); std::stringstream strstream; strstream << infile.rdbuf(); infile.close(); std::vector import_records; StringUtils::split(strstream.str(), import_records, "\n"); std::vector fields = { field("title", field_types::STRING, false), field("starring", field_types::STRING, true), field("cast", field_types::STRING_ARRAY, false), field("points", field_types::INT32, false) }; coll_mul_fields = collectionManager.get_collection("coll_mul_fields").get(); if(coll_mul_fields == nullptr) { coll_mul_fields = collectionManager.create_collection("coll_mul_fields", 1, fields, "points").get(); } // try importing records nlohmann::json document; nlohmann::json import_response = coll_mul_fields->add_many(import_records, document); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(18, import_response["num_imported"].get()); // try searching with filter auto results = coll_mul_fields->search("*", query_fields, "starring:= [Will Ferrell]", {"starring"}, sort_fields, {0}, 30, 1, FREQUENCY, {false}).get(); ASSERT_EQ(2, results["hits"].size()); // update existing record verbatim std::vector existing_records = {R"({"id": "0", "title": "Wake Up, Ron Burgundy: The Lost Movie"})"}; import_response = coll_mul_fields->add_many(existing_records, document, UPDATE); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(1, import_response["num_imported"].get()); // update + upsert records std::vector more_records = {R"({"id": "0", "title": "The Fifth Harry", "starring": "Will Ferrell", "points":62, "cast":["Adam McKay","Steve Carell","Paul Rudd"]})", R"({"id": "2", "cast": ["Chris Fisher", "Rand Alan"], "points":81, "starring":"Daniel Day-Lewis","title":"There Will Be Blood"})", R"({"id": "18", "title": "Back Again Forest", "points": 45, "starring": "Ronald Wells", "cast": ["Dant Saren"]})", R"({"id": "6", "points": 77, "cast":["Chris Evans","Scarlett Johansson"], "starring":"Samuel L. Jackson","title":"Captain America: The Winter Soldier"})"}; import_response = coll_mul_fields->add_many(more_records, document, UPSERT); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(4, import_response["num_imported"].get()); std::vector import_results = import_res_to_json(more_records); ASSERT_EQ(4, import_results.size()); for(size_t i=0; i<4; i++) { ASSERT_TRUE(import_results[i]["success"].get()); ASSERT_EQ(1, import_results[i].size()); } // try with filters again results = coll_mul_fields->search("*", query_fields, "starring:= [Will Ferrell]", {"starring"}, sort_fields, {0}, 30, 1, FREQUENCY, {false}).get(); ASSERT_EQ(2, results["hits"].size()); results = coll_mul_fields->search("*", query_fields, "", {"starring"}, sort_fields, {0}, 30, 1, FREQUENCY, {false}).get(); ASSERT_EQ(19, results["hits"].size()); ASSERT_EQ(19, coll_mul_fields->get_num_documents()); results = coll_mul_fields->search("back again forest", query_fields, "", {"starring"}, sort_fields, {0}, 30, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("Back Again Forest", coll_mul_fields->get("18").get()["title"].get().c_str()); results = coll_mul_fields->search("fifth", query_fields, "", {"starring"}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(2, results["hits"].size()); ASSERT_STREQ("The Fifth Harry", results["hits"][0]["highlights"][0]["snippet"].get().c_str()); ASSERT_STREQ("The Woman in the Fifth from Kristin", results["hits"][1]["highlights"][0]["snippet"].get().c_str()); results = coll_mul_fields->search("burgundy", query_fields, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(0, results["hits"].size()); results = coll_mul_fields->search("harry", query_fields, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); results = coll_mul_fields->search("captain america", query_fields, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); ASSERT_EQ(77, results["hits"][0]["document"]["points"].get()); // upserting with some bad docs more_records = {R"({"id": "1", "title": "Wake up, Harry", "cast":["Josh Lawson","Chris Parnell"],"points":63,"starring":"Will Ferrell"})", R"({"id": "90", "cast": ["Kim Werrel", "Random Wake"]})", // missing fields R"({"id": "5", "points": 60, "cast":["Logan Lerman","Alexandra Daddario"],"starring":"Ron Perlman","starring_facet":"Ron Perlman","title":"Percy Jackson: Sea of Monsters"})", R"({"id": "24", "starring": "John", "cast": ["John Kim"], "points": 11})"}; // missing fields bool return_id = true; import_response = coll_mul_fields->add_many(more_records, document, UPSERT, "", DIRTY_VALUES::COERCE_OR_REJECT, false, return_id); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(2, import_response["num_imported"].get()); import_results = import_res_to_json(more_records); ASSERT_FALSE(import_results[1]["success"].get()); ASSERT_FALSE(import_results[3]["success"].get()); ASSERT_STREQ("Field `points` has been declared as a default sorting field, but is not found in the document.", import_results[1]["error"].get().c_str()); ASSERT_STREQ("Field `title` has been declared in the schema, but is not found in the document.", import_results[3]["error"].get().c_str()); ASSERT_EQ("1", import_results[0]["id"].get()); ASSERT_EQ("90", import_results[1]["id"].get()); ASSERT_EQ("5", import_results[2]["id"].get()); ASSERT_EQ("24", import_results[3]["id"].get()); // try to duplicate records without upsert option more_records = {R"({"id": "1", "title": "Wake up, Harry"})", R"({"id": "5", "points": 60})"}; import_response = coll_mul_fields->add_many(more_records, document, CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT, false); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(0, import_response["num_imported"].get()); import_results = import_res_to_json(more_records); ASSERT_FALSE(import_results[0]["success"].get()); ASSERT_FALSE(import_results[1]["success"].get()); ASSERT_STREQ("A document with id 1 already exists.", import_results[0]["error"].get().c_str()); ASSERT_STREQ("A document with id 5 already exists.", import_results[1]["error"].get().c_str()); // doc should not be returned, since return_doc = false ASSERT_FALSE(import_results[0].contains("document")); // update document with verbatim fields, except for points more_records = {R"({"id": "3", "cast":["Matt Damon","Ben Affleck","Minnie Driver"], "points":70,"starring":"Robin Williams","starring_facet":"Robin Williams", "title":"Good Will Hunting"})"}; import_response = coll_mul_fields->add_many(more_records, document, UPDATE); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(1, import_response["num_imported"].get()); results = coll_mul_fields->search("Good Will Hunting", query_fields, "", {"starring"}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(70, results["hits"][0]["document"]["points"].get()); // updating a document that does not exist should fail, others should succeed more_records = {R"({"id": "20", "points": 51})", R"({"id": "1", "points": 64})"}; import_response = coll_mul_fields->add_many(more_records, document, UPDATE); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(1, import_response["num_imported"].get()); import_results = import_res_to_json(more_records); ASSERT_FALSE(import_results[0]["success"].get()); ASSERT_TRUE(import_results[1]["success"].get()); ASSERT_STREQ("Could not find a document with id: 20", import_results[0]["error"].get().c_str()); ASSERT_EQ(404, import_results[0]["code"].get()); results = coll_mul_fields->search("wake up harry", query_fields, "", {"starring"}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(64, results["hits"][0]["document"]["points"].get()); // trying to create documents with existing IDs should fail more_records = {R"({"id": "2", "points": 51})", R"({"id": "1", "points": 64})"}; import_response = coll_mul_fields->add_many(more_records, document, CREATE); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(0, import_response["num_imported"].get()); import_results = import_res_to_json(more_records); ASSERT_FALSE(import_results[0]["success"].get()); ASSERT_FALSE(import_results[1]["success"].get()); ASSERT_STREQ("A document with id 2 already exists.", import_results[0]["error"].get().c_str()); ASSERT_STREQ("A document with id 1 already exists.", import_results[1]["error"].get().c_str()); ASSERT_EQ(409, import_results[0]["code"].get()); ASSERT_EQ(409, import_results[1]["code"].get()); } TEST_F(CollectionTest, ImportDocumentsEmplace) { Collection* coll1; std::vector fields = { field("title", field_types::STRING, false, false), field("points", field_types::INT32, false, false) }; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields).get(); } nlohmann::json document; std::vector records = {R"({"id": "0", "title": "The Matrix", "points":0})", R"({"id": "1", "title": "Inception", "points":1})"}; std::vector docs = import_res_to_json(records); // use `emplace` mode for creating documents auto import_response = coll1->add_many(records, document, EMPLACE, "", DIRTY_VALUES::COERCE_OR_REJECT, true, true); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(2, import_response["num_imported"].get()); std::vector import_results = import_res_to_json(records); ASSERT_EQ(2, import_results.size()); for (size_t i = 0; i < 2; i++) { ASSERT_TRUE(import_results[i]["success"].get()); ASSERT_EQ(3, import_results[i].size()); ASSERT_EQ(docs[i], import_results[i]["document"]); ASSERT_EQ(docs[i]["id"], import_results[i]["id"]); } auto res = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10).get(); ASSERT_EQ(2, res["found"].get()); // emplace both update + create records = {R"({"id": "1", "title": "The Inception"})", R"({"id": "2", "title": "Spiderman", "points":2})"}; import_response = coll1->add_many(records, document, EMPLACE); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(2, import_response["num_imported"].get()); import_results = import_res_to_json(records); ASSERT_EQ(2, import_results.size()); for (size_t i = 0; i < 2; i++) { ASSERT_TRUE(import_results[i]["success"].get()); ASSERT_EQ(1, import_results[i].size()); } res = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10).get(); ASSERT_EQ(3, res["found"].get()); ASSERT_EQ("2", res["hits"][0]["document"]["id"].get()); ASSERT_EQ(2, res["hits"][0]["document"]["points"].get()); ASSERT_EQ("1", res["hits"][1]["document"]["id"].get()); ASSERT_EQ(1, res["hits"][1]["document"]["points"].get()); ASSERT_EQ("The Inception", res["hits"][1]["document"]["title"].get()); ASSERT_EQ("0", res["hits"][2]["document"]["id"].get()); ASSERT_EQ(0, res["hits"][2]["document"]["points"].get()); // emplace with an error due to bad data records = {R"({"id": "2", "points": "abcd"})", R"({"id": "3", "title": "Superman", "points":3})"}; import_response = coll1->add_many(records, document, EMPLACE); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(1, import_response["num_imported"].get()); import_results = import_res_to_json(records); ASSERT_EQ(2, import_results.size()); ASSERT_FALSE(import_results[0]["success"].get()); ASSERT_TRUE(import_results[1]["success"].get()); ASSERT_EQ(1, import_results[1].size()); ASSERT_EQ(1, import_results[1].size()); // can update individual document via "emplace" with only partial field (missing points) std::string doc_3_update = R"({"id": "3", "title": "The Superman"})"; auto add_op = coll1->add(doc_3_update, EMPLACE); ASSERT_TRUE(add_op.ok()); res = coll1->search("superman", {"title"}, "", {}, {}, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10).get(); ASSERT_EQ(1, res["found"].get()); ASSERT_EQ("3", res["hits"][0]["document"]["id"].get()); ASSERT_EQ(3, res["hits"][0]["document"]["points"].get()); ASSERT_EQ("The Superman", res["hits"][0]["document"]["title"].get()); // can create individual document via "emplace" std::string doc_4_create = R"({"id": "4", "title": "The Avengers", "points": 4})"; add_op = coll1->add(doc_4_create, EMPLACE); ASSERT_TRUE(add_op.ok()); res = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10).get(); ASSERT_EQ(5, res["found"].get()); } TEST_F(CollectionTest, DISABLED_CrashTroubleshooting) { Collection *coll1; std::vector fields = { field("title", field_types::STRING_ARRAY, false, true), field("points", field_types::INT32, false) }; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } std::ifstream create_file("/tmp/create.jsonl"); std::string json_line; std::vector create_records; while (std::getline(create_file, json_line)) { create_records.push_back(json_line); } create_file.close(); nlohmann::json document; auto import_response = coll1->add_many(create_records, document, CREATE); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(1000, import_response["num_imported"].get()); // now try to upsert std::ifstream upsert_file("/tmp/upsert.jsonl"); std::vector upsert_records; while (std::getline(upsert_file, json_line)) { upsert_records.push_back(json_line); } upsert_file.close(); import_response = coll1->add_many(upsert_records, document, UPSERT); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(1000, import_response["num_imported"].get()); } TEST_F(CollectionTest, ImportDocumentsUpsertOptional) { Collection *coll1; std::vector fields = { field("title", field_types::STRING_ARRAY, false, true), field("points", field_types::INT32, false) }; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } std::vector records; size_t NUM_RECORDS = 1000; for(size_t i=0; iadd_many(records, document, CREATE); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(1000, import_response["num_imported"].get()); // upsert documents with title records.clear(); for(size_t i=0; iadd_many(records, document, UPSERT); auto time_micros = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin).count(); //LOG(INFO) << "Time taken for first upsert: " << time_micros; ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(1000, import_response["num_imported"].get()); // run upsert again with title override records.clear(); for(size_t i=0; iadd_many(records, document, UPSERT); time_micros = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin).count(); //LOG(INFO) << "Time taken for second upsert: " << time_micros; ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(1000, import_response["num_imported"].get()); // update records (can contain partial fields) records.clear(); for(size_t i=0; iadd_many(records, document, UPDATE); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(1000, import_response["num_imported"].get()); } TEST_F(CollectionTest, ImportDocuments) { Collection *coll_mul_fields; std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl"); std::stringstream strstream; strstream << infile.rdbuf(); infile.close(); std::vector import_records; StringUtils::split(strstream.str(), import_records, "\n"); std::vector fields = { field("title", field_types::STRING, false), field("starring", field_types::STRING, false), field("cast", field_types::STRING_ARRAY, false), field("points", field_types::INT32, false) }; coll_mul_fields = collectionManager.get_collection("coll_mul_fields").get(); if(coll_mul_fields == nullptr) { coll_mul_fields = collectionManager.create_collection("coll_mul_fields", 4, fields, "points").get(); } // try importing records nlohmann::json document; nlohmann::json import_response = coll_mul_fields->add_many(import_records, document); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(18, import_response["num_imported"].get()); // now try searching for records query_fields = {"title", "starring"}; std::vector facets; auto x = coll_mul_fields->search("Will", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}); nlohmann::json results = coll_mul_fields->search("Will", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(4, results["hits"].size()); std::vector ids = {"3", "2", "1", "0"}; for(size_t i = 0; i < results["hits"].size(); i++) { nlohmann::json result = results["hits"].at(i); std::string result_id = result["document"]["id"]; std::string id = ids.at(i); ASSERT_STREQ(id.c_str(), result_id.c_str()); } // verify that empty import is handled gracefully std::vector empty_records; import_response = coll_mul_fields->add_many(empty_records, document); ASSERT_TRUE(import_response["success"].get()); ASSERT_EQ(0, import_response["num_imported"].get()); // verify that only bad records are rejected, rest must be imported (records 2 and 4 are bad) std::vector more_records = {"{\"id\": \"id1\", \"title\": \"Test1\", \"starring\": \"Rand Fish\", \"points\": 12, " "\"cast\": [\"Tom Skerritt\"] }", "{\"title\": 123, \"starring\": \"Jazz Gosh\", \"points\": 23, " "\"cast\": [\"Tom Skerritt\"] }", "{\"title\": \"Test3\", \"starring\": \"Brad Fin\", \"points\": 11, " "\"cast\": [\"Tom Skerritt\"] }", "{\"title\": \"Test4\", \"points\": 55, " "\"cast\": [\"Tom Skerritt\"] }"}; import_response = coll_mul_fields->add_many(more_records, document, CREATE, "", DIRTY_VALUES::REJECT, true); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(2, import_response["num_imported"].get()); std::vector import_results = import_res_to_json(more_records); ASSERT_EQ(4, import_results.size()); ASSERT_TRUE(import_results[0]["success"].get()); ASSERT_FALSE(import_results[1]["success"].get()); ASSERT_TRUE(import_results[2]["success"].get()); ASSERT_FALSE(import_results[3]["success"].get()); ASSERT_STREQ("Field `title` must be a string.", import_results[1]["error"].get().c_str()); ASSERT_STREQ("Field `starring` has been declared in the schema, but is not found in the document.", import_results[3]["error"].get().c_str()); ASSERT_STREQ("{\"title\": 123, \"starring\": \"Jazz Gosh\", \"points\": 23, \"cast\": [\"Tom Skerritt\"] }", import_results[1]["document"].get().c_str()); // record with duplicate IDs more_records = {"{\"id\": \"id2\", \"title\": \"Test1\", \"starring\": \"Rand Fish\", \"points\": 12, " "\"cast\": [\"Tom Skerritt\"] }", "{\"id\": \"id1\", \"title\": \"Test1\", \"starring\": \"Rand Fish\", \"points\": 12, " "\"cast\": [\"Tom Skerritt\"] }"}; import_response = coll_mul_fields->add_many(more_records, document, CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT, true); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(1, import_response["num_imported"].get()); import_results = import_res_to_json(more_records); ASSERT_EQ(2, import_results.size()); ASSERT_TRUE(import_results[0]["success"].get()); ASSERT_FALSE(import_results[1]["success"].get()); ASSERT_STREQ("A document with id id1 already exists.", import_results[1]["error"].get().c_str()); ASSERT_STREQ("{\"id\": \"id1\", \"title\": \"Test1\", \"starring\": \"Rand Fish\", \"points\": 12, " "\"cast\": [\"Tom Skerritt\"] }",import_results[1]["document"].get().c_str()); // handle bad import json // valid JSON but not a document more_records = {"[]"}; import_response = coll_mul_fields->add_many(more_records, document, CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT, true); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(0, import_response["num_imported"].get()); import_results = import_res_to_json(more_records); ASSERT_EQ(1, import_results.size()); ASSERT_EQ(false, import_results[0]["success"].get()); ASSERT_STREQ("Bad JSON: not a properly formed document.", import_results[0]["error"].get().c_str()); ASSERT_STREQ("[]", import_results[0]["document"].get().c_str()); // invalid JSON more_records = {"{"}; import_response = coll_mul_fields->add_many(more_records, document, CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT, true); ASSERT_FALSE(import_response["success"].get()); ASSERT_EQ(0, import_response["num_imported"].get()); import_results = import_res_to_json(more_records); ASSERT_EQ(1, import_results.size()); ASSERT_EQ(false, import_results[0]["success"].get()); ASSERT_STREQ("Bad JSON: [json.exception.parse_error.101] parse error at line 1, column 2: syntax error " "while parsing object key - unexpected end of input; expected string literal", import_results[0]["error"].get().c_str()); ASSERT_STREQ("{", import_results[0]["document"].get().c_str()); collectionManager.drop_collection("coll_mul_fields"); } TEST_F(CollectionTest, SearchingWithMissingFields) { // return error without crashing when searching for fields that do not conform to the schema Collection *coll_array_fields; std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl"); std::vector fields = {field("name", field_types::STRING, false), field("age", field_types::INT32, false), field("years", field_types::INT32_ARRAY, false), field("timestamps", field_types::INT64_ARRAY, false), field("tags", field_types::STRING_ARRAY, true)}; std::vector sort_fields = { sort_by("age", "DESC") }; coll_array_fields = collectionManager.get_collection("coll_array_fields").get(); if(coll_array_fields == nullptr) { coll_array_fields = collectionManager.create_collection("coll_array_fields", 4, fields, "age").get(); } std::string json_line; while (std::getline(infile, json_line)) { coll_array_fields->add(json_line); } infile.close(); // when a query field mentioned in schema does not exist std::vector facets; std::vector query_fields_not_found = {"titlez"}; Option res_op = coll_array_fields->search("the", query_fields_not_found, "", facets, sort_fields, {0}, 10); ASSERT_FALSE(res_op.ok()); ASSERT_EQ(404, res_op.code()); ASSERT_STREQ("Could not find a field named `titlez` in the schema.", res_op.error().c_str()); // when a query field is an integer field res_op = coll_array_fields->search("the", {"age"}, "", facets, sort_fields, {0}, 10); ASSERT_EQ(400, res_op.code()); ASSERT_STREQ("Field `age` should be a string or a string array.", res_op.error().c_str()); // when a facet field is not defined in the schema res_op = coll_array_fields->search("the", {"name"}, "", {"timestamps"}, sort_fields, {0}, 10); ASSERT_EQ(404, res_op.code()); ASSERT_STREQ("Could not find a facet field named `timestamps` in the schema.", res_op.error().c_str()); // when a rank field is not defined in the schema res_op = coll_array_fields->search("the", {"name"}, "", {}, { sort_by("timestamps", "ASC") }, {0}, 10); ASSERT_EQ(404, res_op.code()); ASSERT_STREQ("Could not find a field named `timestamps` in the schema for sorting.", res_op.error().c_str()); res_op = coll_array_fields->search("the", {"name"}, "", {}, { sort_by("_rank", "ASC") }, {0}, 10); ASSERT_EQ(404, res_op.code()); ASSERT_STREQ("Could not find a field named `_rank` in the schema for sorting.", res_op.error().c_str()); collectionManager.drop_collection("coll_array_fields"); } TEST_F(CollectionTest, IndexingWithBadData) { // should not crash when document to-be-indexed doesn't match schema Collection *sample_collection; std::vector fields = {field("name", field_types::STRING, false), field("tags", field_types::STRING_ARRAY, true), field("age", field_types::INT32, false), field("average", field_types::INT32, false) }; std::vector sort_fields = { sort_by("age", "DESC"), sort_by("average", "DESC") }; sample_collection = collectionManager.get_collection("sample_collection").get(); if(sample_collection == nullptr) { sample_collection = collectionManager.create_collection("sample_collection", 4, fields, "age").get(); } const Option & search_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 29, \"average\": 78}"); ASSERT_FALSE(search_fields_missing_op1.ok()); ASSERT_STREQ("Field `tags` has been declared in the schema, but is not found in the document.", search_fields_missing_op1.error().c_str()); const Option & search_fields_missing_op2 = sample_collection->add("{\"namez\": \"foo\", \"tags\": [], \"age\": 34, \"average\": 78}"); ASSERT_FALSE(search_fields_missing_op2.ok()); ASSERT_STREQ("Field `name` has been declared in the schema, but is not found in the document.", search_fields_missing_op2.error().c_str()); const Option & facet_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 34, \"average\": 78}"); ASSERT_FALSE(facet_fields_missing_op1.ok()); ASSERT_STREQ("Field `tags` has been declared in the schema, but is not found in the document.", facet_fields_missing_op1.error().c_str()); const char *doc_str = "{\"name\": \"foo\", \"age\": 34, \"avg\": 78, \"tags\": [\"red\", \"blue\"]}"; const Option & sort_fields_missing_op1 = sample_collection->add(doc_str); ASSERT_FALSE(sort_fields_missing_op1.ok()); ASSERT_STREQ("Field `average` has been declared in the schema, but is not found in the document.", sort_fields_missing_op1.error().c_str()); // Handle type errors doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": 22, \"average\": 78}"; const Option & bad_facet_field_op = sample_collection->add(doc_str); ASSERT_FALSE(bad_facet_field_op.ok()); ASSERT_STREQ("Field `tags` must be an array.", bad_facet_field_op.error().c_str()); doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [\"red\", 22], \"average\": 78}"; const Option & bad_array_field_op = sample_collection->add(doc_str, CREATE, "", DIRTY_VALUES::REJECT); ASSERT_FALSE(bad_array_field_op.ok()); ASSERT_STREQ("Field `tags` must be an array of string.", bad_array_field_op.error().c_str()); // with coercion should work doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [\"red\", 22], \"average\": 78}"; const Option &bad_array_field_coercion_op = sample_collection->add(doc_str, CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT); ASSERT_TRUE(bad_array_field_coercion_op.ok()); doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [], \"average\": 34}"; const Option & empty_facet_field_op = sample_collection->add(doc_str); ASSERT_TRUE(empty_facet_field_op.ok()); doc_str = "{\"name\": \"foo\", \"age\": [\"34\"], \"tags\": [], \"average\": 34 }"; const Option & bad_default_sorting_field_op1 = sample_collection->add(doc_str); ASSERT_FALSE(bad_default_sorting_field_op1.ok()); ASSERT_STREQ("Field `age` must be an int32.", bad_default_sorting_field_op1.error().c_str()); doc_str = "{\"name\": \"foo\", \"tags\": [], \"average\": 34 }"; const Option & bad_default_sorting_field_op3 = sample_collection->add(doc_str); ASSERT_FALSE(bad_default_sorting_field_op3.ok()); ASSERT_STREQ("Field `age` has been declared as a default sorting field, but is not found in the document.", bad_default_sorting_field_op3.error().c_str()); doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [], \"average\": \"34\"}"; const Option & bad_rank_field_op = sample_collection->add(doc_str, CREATE, "", DIRTY_VALUES::REJECT); ASSERT_FALSE(bad_rank_field_op.ok()); ASSERT_STREQ("Field `average` must be an int32.", bad_rank_field_op.error().c_str()); doc_str = "{\"name\": \"foo\", \"age\": asdadasd, \"tags\": [], \"average\": 34 }"; const Option & bad_default_sorting_field_op4 = sample_collection->add(doc_str); ASSERT_FALSE(bad_default_sorting_field_op4.ok()); ASSERT_STREQ("Bad JSON: [json.exception.parse_error.101] parse error at line 1, column 24: syntax error " "while parsing value - invalid literal; last read: '\"age\": a'", bad_default_sorting_field_op4.error().c_str()); // should return an error when a document with pre-existing id is being added std::string doc = "{\"id\": \"100\", \"name\": \"foo\", \"age\": 29, \"tags\": [], \"average\": 78}"; Option add_op = sample_collection->add(doc); ASSERT_TRUE(add_op.ok()); add_op = sample_collection->add(doc); ASSERT_FALSE(add_op.ok()); ASSERT_EQ(409, add_op.code()); ASSERT_STREQ("A document with id 100 already exists.", add_op.error().c_str()); collectionManager.drop_collection("sample_collection"); } TEST_F(CollectionTest, EmptyIndexShouldNotCrash) { Collection *empty_coll; std::vector fields = {field("name", field_types::STRING, false), field("tags", field_types::STRING_ARRAY, false), field("age", field_types::INT32, false), field("average", field_types::INT32, false)}; std::vector sort_fields = { sort_by("age", "DESC"), sort_by("average", "DESC") }; empty_coll = collectionManager.get_collection("empty_coll").get(); if(empty_coll == nullptr) { empty_coll = collectionManager.create_collection("empty_coll", 4, fields, "age").get(); } nlohmann::json results = empty_coll->search("a", {"name"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(0, results["hits"].size()); collectionManager.drop_collection("empty_coll"); } TEST_F(CollectionTest, IdFieldShouldBeAString) { Collection *coll1; std::vector fields = {field("name", field_types::STRING, false), field("tags", field_types::STRING_ARRAY, false), field("age", field_types::INT32, false), field("average", field_types::INT32, false)}; std::vector sort_fields = { sort_by("age", "DESC"), sort_by("average", "DESC") }; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "age").get(); } nlohmann::json doc; doc["id"] = 101010; doc["name"] = "Jane"; doc["age"] = 25; doc["average"] = 98; doc["tags"] = nlohmann::json::array(); doc["tags"].push_back("tag1"); Option inserted_id_op = coll1->add(doc.dump()); ASSERT_FALSE(inserted_id_op.ok()); ASSERT_STREQ("Document's `id` field should be a string.", inserted_id_op.error().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, AnIntegerCanBePassedToAFloatField) { Collection *coll1; std::vector fields = {field("name", field_types::STRING, false), field("average", field_types::FLOAT, false)}; std::vector sort_fields = { sort_by("average", "DESC") }; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "average").get(); } nlohmann::json doc; doc["id"] = "101010"; doc["name"] = "Jane"; doc["average"] = 98; Option inserted_id_op = coll1->add(doc.dump()); EXPECT_TRUE(inserted_id_op.ok()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, DeletionOfADocument) { collectionManager.drop_collection("collection"); std::ifstream infile(std::string(ROOT_DIR)+"test/documents.jsonl"); std::vector search_fields = {field("title", field_types::STRING, false), field("points", field_types::INT32, false)}; std::vector query_fields = {"title"}; std::vector sort_fields = { sort_by("points", "DESC") }; Collection *collection_for_del; collection_for_del = collectionManager.get_collection("collection_for_del").get(); if(collection_for_del == nullptr) { collection_for_del = collectionManager.create_collection("collection_for_del", 4, search_fields, "points").get(); } std::string json_line; rocksdb::Iterator* it; size_t num_keys = 0; // dummy record for record id 0: to make the test record IDs to match with line numbers json_line = "{\"points\":10,\"title\":\"z\"}"; collection_for_del->add(json_line); while (std::getline(infile, json_line)) { collection_for_del->add(json_line); } ASSERT_EQ(25, collection_for_del->get_num_documents()); infile.close(); nlohmann::json results; // asserts before removing any record results = collection_for_del->search("cryogenic", query_fields, "", {}, sort_fields, {0}, 5, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); it = store->get_iterator(); num_keys = 0; for (it->SeekToFirst(); it->Valid(); it->Next()) { num_keys += 1; } ASSERT_EQ(25+25+3, num_keys); // 25 records, 25 id mapping, 3 meta keys delete it; // actually remove a record now collection_for_del->remove("1"); results = collection_for_del->search("cryogenic", query_fields, "", {}, sort_fields, {0}, 5, 1, FREQUENCY, {false}).get(); ASSERT_EQ(0, results["hits"].size()); ASSERT_EQ(0, results["found"]); results = collection_for_del->search("archives", query_fields, "", {}, sort_fields, {0}, 5, 1, FREQUENCY, {false}).get(); ASSERT_EQ(1, results["hits"].size()); ASSERT_EQ(1, results["found"]); collection_for_del->remove("foo"); // custom id record results = collection_for_del->search("martian", query_fields, "", {}, sort_fields, {0}, 5, 1, FREQUENCY, {false}).get(); ASSERT_EQ(0, results["hits"].size()); ASSERT_EQ(0, results["found"]); // delete all records for(int id = 0; id <= 25; id++) { collection_for_del->remove(std::to_string(id)); } ASSERT_EQ(0, collection_for_del->get_num_documents()); it = store->get_iterator(); num_keys = 0; for (it->SeekToFirst(); it->Valid(); it->Next()) { num_keys += 1; } delete it; ASSERT_EQ(3, num_keys); collectionManager.drop_collection("collection_for_del"); } TEST_F(CollectionTest, DeletionOfDocumentSingularFields) { Collection *coll1; std::vector fields = {field("str", field_types::STRING, false), field("int32", field_types::INT32, false), field("int64", field_types::INT64, false), field("float", field_types::FLOAT, false), field("bool", field_types::BOOL, false)}; std::vector sort_fields = { sort_by("int32", "DESC") }; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "int32").get(); } nlohmann::json doc; doc["id"] = "100"; doc["str"] = "[NEW] Cell Phone Cases, Holders & Clips!"; doc["int32"] = 100032; doc["int64"] = 1582369739000; doc["float"] = -293.24; doc["bool"] = true; Option add_op = coll1->add(doc.dump()); ASSERT_TRUE(add_op.ok()); nlohmann::json res = coll1->search("phone", {"str"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10).get(); ASSERT_EQ(1, res["found"]); Option rem_op = coll1->remove("100"); ASSERT_TRUE(rem_op.ok()); res = coll1->search("phone", {"str"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10).get(); ASSERT_EQ(0, res["found"].get()); // also assert against the actual index const Index *index = coll1->_get_index(); // seq id will always be zero for first document auto search_index = index->_get_search_index(); auto numerical_index = index->_get_numerical_index(); auto str_tree = search_index["str"]; auto int32_tree = numerical_index["int32"]; auto int64_tree = numerical_index["int64"]; auto float_tree = numerical_index["float"]; auto bool_tree = numerical_index["bool"]; ASSERT_EQ(0, art_size(str_tree)); ASSERT_EQ(0, int32_tree->size()); ASSERT_EQ(0, int64_tree->size()); ASSERT_EQ(0, float_tree->size()); ASSERT_EQ(0, bool_tree->size()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, DeletionOfDocumentArrayFields) { Collection *coll1; std::vector fields = {field("strarray", field_types::STRING_ARRAY, false), field("int32array", field_types::INT32_ARRAY, false), field("int64array", field_types::INT64_ARRAY, false), field("floatarray", field_types::FLOAT_ARRAY, false), field("boolarray", field_types::BOOL_ARRAY, false), field("points", field_types::INT32, false)}; std::vector sort_fields = { sort_by("points", "DESC") }; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } nlohmann::json doc; doc["id"] = "100"; doc["strarray"] = {"Cell Phones", "Cell Phone Accessories", "Cell Phone Cases & Clips"}; doc["int32array"] = {100, 200, 300}; doc["int64array"] = {1582369739000, 1582369739000, 1582369739000}; doc["floatarray"] = {19.99, 400.999}; doc["boolarray"] = {true, false, true}; doc["points"] = 25; Option add_op = coll1->add(doc.dump()); ASSERT_TRUE(add_op.ok()); nlohmann::json res = coll1->search("phone", {"strarray"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10).get(); ASSERT_EQ(1, res["found"].get()); Option rem_op = coll1->remove("100"); ASSERT_TRUE(rem_op.ok()); res = coll1->search("phone", {"strarray"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10).get(); ASSERT_EQ(0, res["found"].get()); // also assert against the actual index const Index *index = coll1->_get_index(); // seq id will always be zero for first document auto search_index = index->_get_search_index(); auto numerical_index = index->_get_numerical_index(); auto strarray_tree = search_index["strarray"]; auto int32array_tree = numerical_index["int32array"]; auto int64array_tree = numerical_index["int64array"]; auto floatarray_tree = numerical_index["floatarray"]; auto boolarray_tree = numerical_index["boolarray"]; ASSERT_EQ(0, art_size(strarray_tree)); ASSERT_EQ(0, int32array_tree->size()); ASSERT_EQ(0, int64array_tree->size()); ASSERT_EQ(0, floatarray_tree->size()); ASSERT_EQ(0, boolarray_tree->size()); collectionManager.drop_collection("coll1"); } nlohmann::json get_prune_doc() { nlohmann::json document; document["one"] = 1; document["two"] = 2; document["three"] = 3; document["four"] = 4; return document; } TEST_F(CollectionTest, SearchLargeTextField) { Collection *coll_large_text; std::vector fields = {field("text", field_types::STRING, false), field("age", field_types::INT32, false), }; std::vector sort_fields = { sort_by(sort_field_const::text_match, "DESC"), sort_by("age", "DESC") }; coll_large_text = collectionManager.get_collection("coll_large_text").get(); if(coll_large_text == nullptr) { coll_large_text = collectionManager.create_collection("coll_large_text", 4, fields, "age").get(); } std::string json_line; std::ifstream infile(std::string(ROOT_DIR)+"test/large_text_field.jsonl"); while (std::getline(infile, json_line)) { coll_large_text->add(json_line); } infile.close(); Option res_op = coll_large_text->search("eguilazer", {"text"}, "", {}, sort_fields, {0}, 10); ASSERT_TRUE(res_op.ok()); nlohmann::json results = res_op.get(); ASSERT_EQ(1, results["hits"].size()); res_op = coll_large_text->search("tristique", {"text"}, "", {}, sort_fields, {0}, 10); ASSERT_TRUE(res_op.ok()); results = res_op.get(); ASSERT_EQ(2, results["hits"].size()); // query whose length exceeds maximum highlight window (match score's WINDOW_SIZE) res_op = coll_large_text->search( "Phasellus non tristique elit Praesent non arcu id lectus accumsan venenatis at", {"text"}, "", {}, sort_fields, {0}, 10 ); ASSERT_TRUE(res_op.ok()); results = res_op.get(); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); // only single matched token in match window res_op = coll_large_text->search("molestie maecenas accumsan", {"text"}, "", {}, sort_fields, {0}, 10); ASSERT_TRUE(res_op.ok()); results = res_op.get(); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("non arcu id lectus accumsan venenatis at at justo.", results["hits"][0]["highlights"][0]["snippet"].get().c_str()); collectionManager.drop_collection("coll_large_text"); } TEST_F(CollectionTest, PruneFieldsFromDocument) { nlohmann::json document = get_prune_doc(); Collection::prune_doc(document, {"one", "two"}, tsl::htrie_set()); ASSERT_EQ(2, document.size()); ASSERT_EQ(1, document["one"]); ASSERT_EQ(2, document["two"]); // exclude takes precedence document = get_prune_doc(); Collection::prune_doc(document, {"one"}, {"one"}); ASSERT_EQ(0, document.size()); // when no inclusion is specified, should return all fields not mentioned by exclusion list document = get_prune_doc(); Collection::prune_doc(document, tsl::htrie_set(), tsl::htrie_set({"three"}), ""); ASSERT_EQ(3, document.size()); ASSERT_EQ(1, document["one"]); ASSERT_EQ(2, document["two"]); ASSERT_EQ(4, document["four"]); document = get_prune_doc(); Collection::prune_doc(document, tsl::htrie_set(), tsl::htrie_set(), ""); ASSERT_EQ(4, document.size()); // when included field does not exist document = get_prune_doc(); Collection::prune_doc(document, {"notfound"}, tsl::htrie_set(), ""); ASSERT_EQ(0, document.size()); // when excluded field does not exist document = get_prune_doc(); Collection::prune_doc(document, tsl::htrie_set(), {"notfound"}, ""); ASSERT_EQ(4, document.size()); // included set is prefix of allowed fields document = get_prune_doc(); Collection::prune_doc(document, {"ones"}, tsl::htrie_set(), ""); ASSERT_EQ(0, document.size()); } TEST_F(CollectionTest, StringArrayFieldShouldNotAllowPlainString) { Collection *coll1; std::vector fields = {field("categories", field_types::STRING_ARRAY, true), field("points", field_types::INT32, false)}; std::vector sort_fields = {sort_by("points", "DESC")}; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } nlohmann::json doc; doc["id"] = "100"; doc["categories"] = "Should not be allowed!"; doc["points"] = 25; auto add_op = coll1->add(doc.dump()); ASSERT_FALSE(add_op.ok()); ASSERT_STREQ("Field `categories` must be an array.", add_op.error().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, SearchHighlightShouldFollowThreshold) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, true), field("points", field_types::INT32, false)}; std::vector sort_fields = {sort_by("points", "DESC")}; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } nlohmann::json doc; doc["id"] = "100"; doc["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep."; doc["points"] = 25; auto add_op = coll1->add(doc.dump()); ASSERT_TRUE(add_op.ok()); // first with a large threshold auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "").get(); ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.", res["hits"][0]["highlights"][0]["snippet"].get().c_str()); // now with with a small threshold (will show only 4 words either side of the matched token) res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5).get(); ASSERT_STREQ("fox jumped over the lazy dog and ran straight", res["hits"][0]["highlights"][0]["snippet"].get().c_str()); // specify the number of surrounding tokens to return size_t highlight_affix_num_tokens = 2; res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, highlight_affix_num_tokens).get(); ASSERT_STREQ("over the lazy dog and", res["hits"][0]["highlights"][0]["snippet"].get().c_str()); highlight_affix_num_tokens = 0; res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, highlight_affix_num_tokens).get(); ASSERT_STREQ("lazy", res["hits"][0]["highlights"][0]["snippet"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, SearchHighlightShouldUseHighlightTags) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, true), field("points", field_types::INT32, false)}; std::vector sort_fields = {sort_by("points", "DESC")}; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } nlohmann::json doc; doc["id"] = "100"; doc["title"] = "The quick brown fox jumped over the lazy fox. "; // adding some extra spaces doc["points"] = 25; auto add_op = coll1->add(doc.dump()); ASSERT_TRUE(add_op.ok()); // use non-default highlighting tags auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "").get(); ASSERT_STREQ("The quick brown fox jumped over the lazy fox. ", res["hits"][0]["highlights"][0]["snippet"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, SearchHighlightWithNewLine) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, true), field("points", field_types::INT32, false)}; std::vector sort_fields = {sort_by("points", "DESC")}; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } nlohmann::json doc; doc["id"] = "100"; doc["title"] = "Blah, blah\nStark Industries"; doc["points"] = 25; auto add_op = coll1->add(doc.dump()); ASSERT_TRUE(add_op.ok()); auto res = coll1->search("stark", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0).get(); ASSERT_STREQ("Blah, blah\nStark Industries", res["hits"][0]["highlights"][0]["snippet"].get().c_str()); ASSERT_STREQ("Stark", res["hits"][0]["highlights"][0]["matched_tokens"][0].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, UpdateDocument) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, true), field("tags", field_types::STRING_ARRAY, true, true), field("points", field_types::INT32, false)}; std::vector sort_fields = {sort_by("points", "DESC")}; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } nlohmann::json doc; doc["id"] = "100"; doc["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep."; doc["tags"] = {"NEWS", "LAZY"}; doc["points"] = 25; auto add_op = coll1->add(doc.dump()); ASSERT_TRUE(add_op.ok()); auto res = coll1->search("lazy", {"title"}, "", {"tags"}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"].size()); ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.", res["hits"][0]["document"]["title"].get().c_str()); // reindex the document entirely again verbatim and try querying add_op = coll1->add(doc.dump(), UPSERT); ASSERT_TRUE(add_op.ok()); ASSERT_EQ(1, coll1->get_num_documents()); res = coll1->search("lazy", {"title"}, "", {"tags"}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"].size()); ASSERT_EQ(1, res["facet_counts"].size()); ASSERT_STREQ("tags", res["facet_counts"][0]["field_name"].get().c_str()); ASSERT_EQ(2, res["facet_counts"][0]["counts"].size()); ASSERT_STREQ("NEWS", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][0]["count"]); ASSERT_STREQ("LAZY", res["facet_counts"][0]["counts"][1]["value"].get().c_str()); ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][1]["count"]); // upsert only part of the document -- document should be REPLACED nlohmann::json partial_doc = doc; partial_doc.erase("tags"); add_op = coll1->add(partial_doc.dump(), UPSERT); ASSERT_TRUE(add_op.ok()); res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"].size()); ASSERT_FALSE(res["hits"][0].contains("tags")); // upserting without a mandatory field should be an error partial_doc = doc; partial_doc.erase("title"); add_op = coll1->add(partial_doc.dump(), UPSERT); ASSERT_FALSE(add_op.ok()); ASSERT_EQ("Field `title` has been declared in the schema, but is not found in the document.", add_op.error()); // try changing the title and searching for an older token doc["title"] = "The quick brown fox."; add_op = coll1->add(doc.dump(), UPSERT); ASSERT_TRUE(add_op.ok()); ASSERT_EQ(1, coll1->get_num_documents()); res = coll1->search("lazy", {"title"}, "", {"tags"}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(0, res["hits"].size()); res = coll1->search("quick", {"title"}, "", {"title"}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"].size()); ASSERT_STREQ("The quick brown fox.", res["hits"][0]["document"]["title"].get().c_str()); // try to update document tags without `id` nlohmann::json doc2; doc2["tags"] = {"SENTENCE"}; add_op = coll1->add(doc2.dump(), UPDATE); ASSERT_FALSE(add_op.ok()); ASSERT_STREQ("For update, the `id` key must be provided.", add_op.error().c_str()); // now change tags with id doc2["id"] = "100"; add_op = coll1->add(doc2.dump(), UPDATE); ASSERT_TRUE(add_op.ok()); // check for old tag res = coll1->search("NEWS", {"tags"}, "", {"tags"}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(0, res["hits"].size()); // now check for new tag and also try faceting on that field res = coll1->search("SENTENCE", {"tags"}, "", {"tags"}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"].size()); ASSERT_STREQ("SENTENCE", res["facet_counts"][0]["counts"][0]["value"].get().c_str()); // try changing points nlohmann::json doc3; doc3["points"] = 99; doc3["id"] = "100"; add_op = coll1->add(doc3.dump(), UPDATE); ASSERT_TRUE(add_op.ok()); res = coll1->search("*", {"tags"}, "points: > 90", {"tags"}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"].size()); ASSERT_EQ(99, res["hits"][0]["document"]["points"].get()); // id can be passed by param nlohmann::json doc4; doc4["points"] = 105; add_op = coll1->add(doc4.dump(), UPDATE, "100"); ASSERT_TRUE(add_op.ok()); res = coll1->search("*", {"tags"}, "points: > 101", {"tags"}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"].size()); ASSERT_EQ(105, res["hits"][0]["document"]["points"].get()); // try to change a field with bad value and verify that old document is put back doc4["points"] = "abc"; add_op = coll1->add(doc4.dump(), UPDATE, "100"); ASSERT_FALSE(add_op.ok()); ASSERT_EQ("Field `points` must be an int32.", add_op.error()); res = coll1->search("*", {"tags"}, "points: > 101", {"tags"}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"].size()); ASSERT_EQ(105, res["hits"][0]["document"]["points"].get()); // when explicit path id does not match doc id, error should be returned nlohmann::json doc5; doc5["id"] = "800"; doc5["title"] = "The Secret Seven"; doc5["points"] = 250; doc5["tags"] = {"BOOK", "ENID BLYTON"}; add_op = coll1->add(doc5.dump(), UPSERT, "799"); ASSERT_FALSE(add_op.ok()); ASSERT_EQ(400, add_op.code()); ASSERT_STREQ("The `id` of the resource does not match the `id` in the JSON body.", add_op.error().c_str()); // passing an empty id should not succeed nlohmann::json doc6; doc6["id"] = ""; doc6["title"] = "The Secret Seven"; doc6["points"] = 250; doc6["tags"] = {"BOOK", "ENID BLYTON"}; add_op = coll1->add(doc6.dump(), UPDATE); ASSERT_FALSE(add_op.ok()); ASSERT_EQ(400, add_op.code()); ASSERT_STREQ("The `id` should not be empty.", add_op.error().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, UpdateDocuments) { nlohmann::json schema = R"({ "name": "update_docs_collection", "enable_nested_fields": true, "fields": [ {"name": "user_name", "type": "string", "facet": true}, {"name": "likes", "type": "int32"}, {"name": "content", "type": "object"} ], "default_sorting_field": "likes" })"_json; Collection *update_docs_collection = collectionManager.get_collection("update_docs_collection").get(); if (update_docs_collection == nullptr) { auto op = CollectionManager::create_collection(schema); ASSERT_TRUE(op.ok()); update_docs_collection = op.get(); } std::vector json_lines = { R"({"user_name": "fat_cat","likes": 5215,"content": {"title": "cat data 1", "body": "cd1"}})", R"({"user_name": "fast_dog","likes": 273,"content": {"title": "dog data 1", "body": "dd1"}})", R"({"user_name": "fat_cat","likes": 2133,"content": {"title": "cat data 2", "body": "cd2"}})", R"({"user_name": "fast_dog","likes": 9754,"content": {"title": "dog data 2", "body": "dd2"}})", R"({"user_name": "fast_dog","likes": 576,"content": {"title": "dog data 3", "body": "dd3"}})" }; for (auto const& json: json_lines){ auto add_op = update_docs_collection->add(json); if (!add_op.ok()) { std::cout << add_op.error() << std::endl; } ASSERT_TRUE(add_op.ok()); } std::vector sort_fields = { sort_by("likes", "DESC") }; auto res = update_docs_collection->search("cat data", {"content"}, "", {}, sort_fields, {0}, 10).get(); ASSERT_EQ(2, res["hits"].size()); for (size_t i = 0; i < res["hits"].size(); i++) { ASSERT_EQ("fat_cat", res["hits"][i]["document"]["user_name"].get()); } nlohmann::json document; document["user_name"] = "slim_cat"; std::string dirty_values; bool validate_field_names = false; auto update_op = update_docs_collection->update_matching_filter("foo:=fat_cat", document.dump(), dirty_values, validate_field_names); ASSERT_TRUE(update_op.ok()); ASSERT_EQ(0, update_op.get()["num_updated"]); update_op = update_docs_collection->update_matching_filter("user_name:=fat_cat", document.dump(), dirty_values); ASSERT_TRUE(update_op.ok()); ASSERT_EQ(2, update_op.get()["num_updated"]); res = update_docs_collection->search("cat data", {"content"}, "", {}, sort_fields, {0}, 10).get(); ASSERT_EQ(2, res["hits"].size()); for (size_t i = 0; i < res["hits"].size(); i++) { ASSERT_EQ("slim_cat", res["hits"][i]["document"]["user_name"].get()); } validate_field_names = true; // Test batching res = update_docs_collection->search("dog data", {"content"}, "", {}, sort_fields, {0}, 10).get(); ASSERT_EQ(3, res["hits"].size()); for (size_t i = 0; i < res["hits"].size(); i++) { ASSERT_EQ("fast_dog", res["hits"][i]["document"]["user_name"].get()); } document["user_name"] = "lazy_dog"; update_op = update_docs_collection->update_matching_filter("user_name:=fast_dog", document.dump(), dirty_values, validate_field_names, 2); ASSERT_TRUE(update_op.ok()); ASSERT_EQ(3, update_op.get()["num_updated"]); res = update_docs_collection->search("dog data", {"content"}, "", {}, sort_fields, {0}, 10).get(); ASSERT_EQ(3, res["hits"].size()); for (size_t i = 0; i < res["hits"].size(); i++) { ASSERT_EQ("lazy_dog", res["hits"][i]["document"]["user_name"].get()); } // Test nested fields updation res = update_docs_collection->search("*", {}, "user_name:=slim_cat", {}, sort_fields, {0}, 10).get(); ASSERT_EQ(2, res["hits"].size()); for (size_t i = 0; i < res["hits"].size(); i++) { ASSERT_EQ("cat data " + std::to_string(i + 1), res["hits"][i]["document"]["content"]["title"].get()); } document.clear(); document["content"]["title"] = "fancy cat title"; update_op = update_docs_collection->update_matching_filter("user_name:=slim_cat", document.dump(), dirty_values, validate_field_names, 2); ASSERT_TRUE(update_op.ok()); ASSERT_EQ(2, update_op.get()["num_updated"]); res = update_docs_collection->search("*", {}, "user_name:=slim_cat", {}, sort_fields, {0}, 10).get(); ASSERT_EQ(2, res["hits"].size()); for (size_t i = 0; i < res["hits"].size(); i++) { ASSERT_EQ("fancy cat title", res["hits"][i]["document"]["content"]["title"].get()); } // Test all document updation res = update_docs_collection->search("*", {}, "", {}, sort_fields, {0}, 10).get(); ASSERT_EQ(5, res["hits"].size()); for (size_t i = 0; i < res["hits"].size(); i++) { ASSERT_NE(0, res["hits"][i]["document"]["likes"].get()); } document.clear(); document["likes"] = 0; update_op = update_docs_collection->update_matching_filter("*", document.dump(), dirty_values, validate_field_names, 2); ASSERT_TRUE(update_op.ok()); ASSERT_EQ(5, update_op.get()["num_updated"]); res = update_docs_collection->search("*", {}, "", {}, sort_fields, {0}, 10).get(); ASSERT_EQ(5, res["hits"].size()); for (size_t i = 0; i < res["hits"].size(); i++) { ASSERT_EQ(0, res["hits"][i]["document"]["likes"].get()); } collectionManager.drop_collection("update_docs_collection"); } TEST_F(CollectionTest, UpdateDocumentSorting) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, true), field("tags", field_types::STRING_ARRAY, true), field("points", field_types::INT32, false)}; std::vector sort_fields = {sort_by("points", "DESC")}; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } nlohmann::json doc1; doc1["id"] = "100"; doc1["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep."; doc1["tags"] = {"NEWS", "LAZY"}; doc1["points"] = 100; nlohmann::json doc2; doc2["id"] = "101"; doc2["title"] = "The random sentence."; doc2["tags"] = {"RANDOM"}; doc2["points"] = 101; auto add_op = coll1->add(doc1.dump()); coll1->add(doc2.dump()); auto res = coll1->search("*", {"tags"}, "", {"tags"}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(2, res["hits"].size()); ASSERT_EQ(101, res["hits"][0]["document"]["points"].get()); ASSERT_STREQ("101", res["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(100, res["hits"][1]["document"]["points"].get()); ASSERT_STREQ("100", res["hits"][1]["document"]["id"].get().c_str()); // now update doc1 points from 100 -> 1000 and it should bubble up doc1["points"] = 1000; coll1->add(doc1.dump(), UPDATE); res = coll1->search("*", {"tags"}, "", {"tags"}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(2, res["hits"].size()); ASSERT_EQ(1000, res["hits"][0]["document"]["points"].get()); ASSERT_STREQ("100", res["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(101, res["hits"][1]["document"]["points"].get()); ASSERT_STREQ("101", res["hits"][1]["document"]["id"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, UpdateDocumentUnIndexedField) { Collection* coll1; std::vector fields = {field("title", field_types::STRING, true), field("points", field_types::INT32, false)}; std::vector sort_fields = {sort_by("points", "DESC")}; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } nlohmann::json doc; doc["id"] = "100"; doc["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep."; doc["foo"] = "foo1"; doc["points"] = 25; auto add_op = coll1->add(doc.dump()); ASSERT_TRUE(add_op.ok()); auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"].size()); ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.", res["hits"][0]["document"]["title"].get().c_str()); // reindex the document again by changing only the unindexed field doc["foo"] = "foo2"; add_op = coll1->add(doc.dump(), UPSERT); ASSERT_TRUE(add_op.ok()); res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"].size()); ASSERT_STREQ("foo2", res["hits"][0]["document"]["foo"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, SearchHighlightFieldFully) { Collection *coll1; std::vector fields = { field("title", field_types::STRING, true), field("tags", field_types::STRING_ARRAY, true), field("points", field_types::INT32, false)}; std::vector sort_fields = {sort_by("points", "DESC")}; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } nlohmann::json doc; doc["id"] = "100"; doc["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep."; doc["tags"] = {"NEWS", "LAZY"}; doc["points"] = 25; auto add_op = coll1->add(doc.dump()); ASSERT_TRUE(add_op.ok()); // look for fully highlighted value in response auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title").get(); ASSERT_EQ(1, res["hits"][0]["highlights"].size()); ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.", res["hits"][0]["highlights"][0]["value"].get().c_str()); // should not return value key when highlight_full_fields is not specified res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "").get(); ASSERT_EQ(3, res["hits"][0]["highlights"][0].size()); // query multiple fields res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 5, 5, "title, tags").get(); ASSERT_EQ(2, res["hits"][0]["highlights"].size()); ASSERT_EQ("tags", res["hits"][0]["highlights"][0]["field"]); ASSERT_EQ(1, res["hits"][0]["highlights"][0]["values"].size()); ASSERT_EQ("LAZY", res["hits"][0]["highlights"][0]["values"][0].get()); ASSERT_EQ(1, res["hits"][0]["highlights"][0]["snippets"].size()); ASSERT_EQ("LAZY", res["hits"][0]["highlights"][0]["snippets"][0].get()); ASSERT_EQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.", res["hits"][0]["highlights"][1]["value"].get()); ASSERT_EQ("title", res["hits"][0]["highlights"][1]["field"]); ASSERT_EQ(1, res["hits"][0]["highlights"][1]["matched_tokens"].size()); ASSERT_STREQ("lazy", res["hits"][0]["highlights"][1]["matched_tokens"][0].get().c_str()); // excluded fields should not be returned in highlights section spp::sparse_hash_set excluded_fields = {"tags"}; res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), excluded_fields, 10, "", 5, 5, "title, tags").get(); ASSERT_EQ(1, res["hits"][0]["highlights"].size()); ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.", res["hits"][0]["highlights"][0]["value"].get().c_str()); // when all fields are excluded excluded_fields = {"tags", "title"}; res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), excluded_fields, 10, "", 5, 5, "title, tags").get(); ASSERT_EQ(0, res["hits"][0]["highlights"].size()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, OptionalFields) { Collection *coll1; std::vector fields = { field("title", field_types::STRING, false), field("description", field_types::STRING, true, true), field("max", field_types::INT32, false), field("scores", field_types::INT64_ARRAY, false, true), field("average", field_types::FLOAT, false, true), field("is_valid", field_types::BOOL, false, true), }; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "max").get(); } std::ifstream infile(std::string(ROOT_DIR)+"test/optional_fields.jsonl"); std::string json_line; while (std::getline(infile, json_line)) { auto add_op = coll1->add(json_line); if(!add_op.ok()) { std::cout << add_op.error() << std::endl; } ASSERT_TRUE(add_op.ok()); } infile.close(); // first must be able to fetch all records (i.e. all must have been indexed) auto res = coll1->search("*", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(6, res["found"].get()); // search on optional `description` field res = coll1->search("book", {"description"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(5, res["found"].get()); // filter on optional `average` field res = coll1->search("the", {"title"}, "average: >0", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(5, res["found"].get()); // facet on optional `description` field res = coll1->search("the", {"title"}, "", {"description"}, {}, {0}, 10, 1, FREQUENCY, {false}).get(); ASSERT_EQ(6, res["found"].get()); ASSERT_EQ(5, res["facet_counts"][0]["counts"][0]["count"].get()); ASSERT_STREQ("description", res["facet_counts"][0]["field_name"].get().c_str()); // sort_by optional `average` field should be allowed (default used for missing values) std::vector sort_fields = { sort_by("average", "DESC") }; auto res_op = coll1->search("*", {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false}); ASSERT_TRUE(res_op.ok()); res = res_op.get(); ASSERT_EQ(6, res["found"].get()); ASSERT_EQ(0, res["hits"][5]["document"].count("average")); // record with missing average is last // try deleting a record having optional field Option remove_op = coll1->remove("1"); ASSERT_TRUE(remove_op.ok()); // try fetching the schema (should contain optional field) nlohmann::json coll_summary = coll1->get_summary_json(); ASSERT_STREQ("title", coll_summary["fields"][0]["name"].get().c_str()); ASSERT_STREQ("string", coll_summary["fields"][0]["type"].get().c_str()); ASSERT_FALSE(coll_summary["fields"][0]["facet"].get()); ASSERT_FALSE(coll_summary["fields"][0]["optional"].get()); ASSERT_STREQ("description", coll_summary["fields"][1]["name"].get().c_str()); ASSERT_STREQ("string", coll_summary["fields"][1]["type"].get().c_str()); ASSERT_TRUE(coll_summary["fields"][1]["facet"].get()); ASSERT_TRUE(coll_summary["fields"][1]["optional"].get()); // default sorting field should not be declared optional fields = { field("title", field_types::STRING, false), field("score", field_types::INT32, false, true), }; auto create_op = collectionManager.create_collection("coll2", 4, fields, "score"); ASSERT_FALSE(create_op.ok()); ASSERT_STREQ("Default sorting field `score` cannot be an optional field.", create_op.error().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, OptionalFieldCanBeNull) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false, true), field("genres", field_types::STRING_ARRAY, false, true), field("launch_year", field_types::INT32, false, true), field("updated_at", field_types::INT64, false, true), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } nlohmann::json doc; doc["id"] = "0"; doc["title"] = "Beat it"; doc["artist"] = nullptr; doc["genres"] = nullptr; doc["launch_year"] = nullptr; doc["updated_at"] = nullptr; doc["points"] = 100; ASSERT_TRUE(coll1->add(doc.dump()).ok()); ASSERT_EQ(2, coll1->_get_index()->_get_search_index().at("title")->size); ASSERT_EQ(0, coll1->_get_index()->_get_search_index().at("artist")->size); ASSERT_EQ(0, coll1->_get_index()->_get_search_index().at("genres")->size); auto results = coll1->search("beat", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, EmptyStringNotIndexed) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false, true), field("genres", field_types::STRING_ARRAY, false, true), field("launch_year", field_types::STRING, false, true), field("labels", field_types::STRING_ARRAY, false, true), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } nlohmann::json doc; doc["id"] = "0"; doc["title"] = "Beat it"; doc["artist"] = ""; doc["launch_year"] = " "; doc["genres"] = {""}; doc["labels"] = {"song", " ", ""}; doc["points"] = 100; ASSERT_TRUE(coll1->add(doc.dump()).ok()); auto results = coll1->search("beat", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_EQ(2, coll1->_get_index()->_get_search_index().at("title")->size); ASSERT_EQ(0, coll1->_get_index()->_get_search_index().at("artist")->size); ASSERT_EQ(0, coll1->_get_index()->_get_search_index().at("launch_year")->size); ASSERT_EQ(0, coll1->_get_index()->_get_search_index().at("genres")->size); ASSERT_EQ(1, coll1->_get_index()->_get_search_index().at("labels")->size); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, WildcardQueryReturnsResultsBasedOnPerPageParam) { std::vector facets; spp::sparse_hash_set empty; nlohmann::json results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 12, 1, FREQUENCY, {false}, 1000, empty, empty, 10).get(); ASSERT_EQ(12, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); // should match collection size results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 100, 1, FREQUENCY, {false}, 1000, empty, empty, 10).get(); ASSERT_EQ(25, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); // cannot fetch more than in-built limit of 250 auto res_op = collection->search("*", query_fields, "", facets, sort_fields, {0}, 251, 1, FREQUENCY, {false}, 1000, empty, empty, 10); ASSERT_FALSE(res_op.ok()); ASSERT_EQ(422, res_op.code()); ASSERT_STREQ("Only upto 250 hits can be fetched per page.", res_op.error().c_str()); // when page number is 0, just fetch first page results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 0, FREQUENCY, {false}, 1000, empty, empty, 10).get(); ASSERT_EQ(10, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); // do pagination results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 1000, empty, empty, 10).get(); ASSERT_EQ(10, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 2, FREQUENCY, {false}, 1000, empty, empty, 10).get(); ASSERT_EQ(10, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 3, FREQUENCY, {false}, 1000, empty, empty, 10).get(); ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); // enforce limit_hits auto limit_hits = 20; results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 3, FREQUENCY, {false}, 1000, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1}, limit_hits).get(); ASSERT_EQ(0, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 15, 2, FREQUENCY, {false}, 1000, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1}, limit_hits).get(); ASSERT_EQ(5, results["hits"].size()); ASSERT_EQ(25, results["found"].get()); } TEST_F(CollectionTest, RemoveIfFound) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, true), field("points", field_types::INT32, false)}; std::vector sort_fields = {sort_by("points", "DESC")}; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } for(size_t i=0; i<10; i++) { nlohmann::json doc; doc["id"] = std::to_string(i); doc["title"] = "Title " + std::to_string(i); doc["points"] = i; coll1->add(doc.dump()); } auto res = coll1->search("*", {"title"}, "", {}, sort_fields, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0).get(); ASSERT_EQ(10, res["found"].get()); // removing found doc Option found_op = coll1->remove_if_found(0); ASSERT_TRUE(found_op.ok()); ASSERT_TRUE(found_op.get()); auto get_op = coll1->get("0"); ASSERT_FALSE(get_op.ok()); ASSERT_EQ(404, get_op.code()); // removing doc not found found_op = coll1->remove_if_found(100); ASSERT_TRUE(found_op.ok()); ASSERT_FALSE(found_op.get()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, CreateCollectionInvalidFieldType) { std::vector fields = {field("title", "blah", true), field("points", "int", false)}; std::vector sort_fields = {sort_by("points", "DESC")}; collectionManager.drop_collection("coll1"); auto create_op = collectionManager.create_collection("coll1", 4, fields, "points"); ASSERT_FALSE(create_op.ok()); ASSERT_STREQ("Field `title` has an invalid data type `blah`, see docs for supported data types.", create_op.error().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, MultiFieldRelevance) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } std::vector> records = { {"Down There by the Train", "Dustin Kensrue"}, {"Down There by the Train", "Gord Downie"}, {"State Trooper", "Dustin Kensrue"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("Dustin Kensrue Down There by the Train", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10).get(); ASSERT_EQ(3, results["found"].get()); ASSERT_EQ(3, results["hits"].size()); std::vector expected_ids = {0, 1, 2}; for(size_t i=0; i())); } ASSERT_STREQ("Down There by the Train", results["hits"][0]["highlights"][0]["snippet"].get().c_str()); ASSERT_STREQ("Down There by the Train", results["hits"][1]["highlights"][0]["snippet"].get().c_str()); ASSERT_STREQ("Dustin Kensrue", results["hits"][2]["highlights"][0]["snippet"].get().c_str()); // remove documents, reindex in another order and search again for(size_t i=0; iremove_if_found(i, true); } records = { {"State Trooper", "Dustin Kensrue"}, {"Down There by the Train", "Gord Downie"}, {"Down There by the Train", "Dustin Kensrue"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } results = coll1->search("Dustin Kensrue Down There by the Train", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10).get(); ASSERT_EQ(3, results["found"].get()); ASSERT_EQ(3, results["hits"].size()); expected_ids = {2, 1, 0}; for(size_t i=0; i())); } // with exclude token syntax results = coll1->search("-downie dustin kensrue down there by the train", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); expected_ids = {2, 0}; for(size_t i=0; i())); } collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, MultiFieldRelevance2) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"A Daikon Freestyle", "Ghosts on a Trampoline"}, {"Leaving on a Jetplane", "Coby Grant"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("on a jetplane", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(0, results["hits"][0]["text_match_info"]["num_tokens_dropped"]); ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_EQ(1, results["hits"][1]["text_match_info"]["num_tokens_dropped"]); // changing weights to favor artist still favors title because it contains all tokens of the query results = coll1->search("on a jetplane", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 4}).get(); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(0, results["hits"][0]["text_match_info"]["num_tokens_dropped"]); ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_EQ(1, results["hits"][1]["text_match_info"]["num_tokens_dropped"]); // use same weights results = coll1->search("on a jetplane", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(0, results["hits"][0]["text_match_info"]["num_tokens_dropped"]); ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_EQ(1, results["hits"][1]["text_match_info"]["num_tokens_dropped"]); // add weights to favor artist without all tokens in a query being found in a field results = coll1->search("on a helicopter", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 4}).get(); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(1, results["hits"][0]["text_match_info"]["num_tokens_dropped"]); ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_EQ(1, results["hits"][1]["text_match_info"]["num_tokens_dropped"]); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, FieldWeightsNotProper) { // when weights are not given properly Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } auto results_op = coll1->search("on a jetplane", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1}); ASSERT_FALSE(results_op.ok()); ASSERT_STREQ("Number of weights in `query_by_weights` does not match number " "of `query_by` fields.", results_op.error().c_str()); results_op = coll1->search("on a jetplane", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {2, 1}); ASSERT_FALSE(results_op.ok()); ASSERT_STREQ("Number of weights in `query_by_weights` does not match number " "of `query_by` fields.", results_op.error().c_str()); // empty weights are fine (will be defaulted to) results_op = coll1->search("on a jetplane", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {}); ASSERT_TRUE(results_op.ok()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, MultiFieldRelevance3) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"Taylor Swift Karaoke: reputation", "Taylor Swift"}, {"Style", "Taylor Swift"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("style taylor swift", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(0, results["hits"][0]["text_match_info"]["num_tokens_dropped"]); ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_EQ(1, results["hits"][1]["text_match_info"]["num_tokens_dropped"]); results = coll1->search("swift", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(0, results["hits"][0]["text_match_info"]["num_tokens_dropped"]); ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_EQ(0, results["hits"][1]["text_match_info"]["num_tokens_dropped"]); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, MultiFieldRelevance4) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"Maddras Dreams", "Chennai King"}, {"Maddurai Express", "Maddura Maddy"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("maddras", {"title", "artist"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, MultiFieldRelevance5) { Collection *coll1; std::vector fields = {field("company_name", field_types::STRING, false), field("country", field_types::STRING, false), field("field_a", field_types::STRING, false), field("num_employees", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "num_employees").get(); } std::vector> records = { {"Stark Industries ™", "Canada", "Canadia", "5215"}, {"Canaida Corp", "United States", "Canadoo", "200"}, {"Acme Corp", "Mexico", "Canadoo", "300"} }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("Canada", {"company_name","country","field_a"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1, 1}).get(); ASSERT_EQ(3, results["found"].get()); ASSERT_EQ(3, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get().c_str()); results = coll1->search("Canada", {"company_name","field_a","country"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1, 1}).get(); ASSERT_EQ(3, results["found"].get()); ASSERT_EQ(3, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get().c_str()); ASSERT_EQ(2, results["hits"][0]["highlights"].size()); ASSERT_EQ("field_a", results["hits"][0]["highlights"][0]["field"].get()); ASSERT_EQ("Canadia", results["hits"][0]["highlights"][0]["snippet"].get()); ASSERT_EQ("country", results["hits"][0]["highlights"][1]["field"].get()); ASSERT_EQ("Canada", results["hits"][0]["highlights"][1]["snippet"].get()); ASSERT_EQ(1, results["hits"][1]["highlights"].size()); ASSERT_EQ("field_a", results["hits"][1]["highlights"][0]["field"].get()); ASSERT_EQ("Canadoo", results["hits"][1]["highlights"][0]["snippet"].get()); ASSERT_EQ(2, results["hits"][2]["highlights"].size()); ASSERT_EQ("field_a", results["hits"][2]["highlights"][0]["field"].get()); ASSERT_EQ("Canadoo", results["hits"][2]["highlights"][0]["snippet"].get()); ASSERT_EQ("company_name", results["hits"][2]["highlights"][1]["field"].get()); ASSERT_EQ("Canaida Corp", results["hits"][2]["highlights"][1]["snippet"].get()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, MultiFieldRelevance6) { // with exact match, the number of fields with exact match will not be considered as a ranking signal Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"Taylor Swift", "Taylor Swift"}, {"Taylor Swift Song", "Taylor Swift"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("taylor swift", {"title", "artist"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); // when exact matches are disabled results = coll1->search("taylor swift", {"title", "artist"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}, 100, false).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, ExactMatch) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"Alpha", "DJ"}, {"Alpha Beta", "DJ"}, {"Alpha Beta Gamma", "DJ"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("alpha beta", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10).get(); ASSERT_EQ(3, results["found"].get()); ASSERT_EQ(3, results["hits"].size()); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("0", results["hits"][2]["document"]["id"].get().c_str()); results = coll1->search("alpha", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10).get(); ASSERT_EQ(3, results["found"].get()); ASSERT_EQ(3, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, MultiFieldHighlighting) { Collection *coll1; std::vector fields = {field("name", field_types::STRING, false), field("description", field_types::STRING, false), field("categories", field_types::STRING_ARRAY, false), field("points", field_types::INT32, false)}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"Best Wireless Vehicle Charger", "Easily replenish your cell phone with this wireless charger.", "Cell Phones > Cell Phone Accessories > Car Chargers"}, {"Annie's Song", "John Denver", "Album > Compilation"}, }; for(size_t i=0; i categories; StringUtils::split(records[i][2], categories, ">"); doc["id"] = std::to_string(i); doc["name"] = records[i][0]; doc["description"] = records[i][1]; doc["categories"] = categories; doc["points"] = i; ASSERT_TRUE(coll1->add(doc.dump()).ok()); } auto results = coll1->search("charger", {"name","description","categories"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1, 1}).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(3, results["hits"][0]["highlights"].size()); ASSERT_EQ("name", results["hits"][0]["highlights"][0]["field"].get()); ASSERT_EQ("Best Wireless Vehicle Charger", results["hits"][0]["highlights"][0]["snippet"].get()); ASSERT_EQ("description", results["hits"][0]["highlights"][1]["field"].get()); ASSERT_EQ("Easily replenish your cell phone with this wireless charger.", results["hits"][0]["highlights"][1]["snippet"].get()); ASSERT_EQ("categories", results["hits"][0]["highlights"][2]["field"].get()); ASSERT_EQ("Car Chargers", results["hits"][0]["highlights"][2]["snippets"][0].get()); results = coll1->search("John With Denver", {"description"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1}).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(1, results["hits"][0]["highlights"].size()); ASSERT_EQ("description", results["hits"][0]["highlights"][0]["field"].get()); ASSERT_EQ("John Denver", results["hits"][0]["highlights"][0]["snippet"].get()); results = coll1->search("Annies song John Denver", {"name","description"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 1, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(2, results["hits"][0]["highlights"].size()); ASSERT_EQ("name", results["hits"][0]["highlights"][0]["field"].get()); ASSERT_EQ("Annie's Song", results["hits"][0]["highlights"][0]["snippet"].get()); ASSERT_EQ("description", results["hits"][0]["highlights"][1]["field"].get()); ASSERT_EQ("John Denver", results["hits"][0]["highlights"][1]["snippet"].get()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, MultiFieldMatchRanking) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"Style", "Taylor Swift"}, {"Blank Space", "Taylor Swift"}, {"Balance Overkill", "Taylor Swift"}, {"Cardigan", "Taylor Swift"}, {"Invisible String", "Taylor Swift"}, {"The Last Great American Dynasty", "Taylor Swift"}, {"Mirrorball", "Taylor Swift"}, {"Peace", "Taylor Swift"}, {"Betty", "Taylor Swift"}, {"Mad Woman", "Taylor Swift"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("taylor swift style", {"artist", "title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5).get(); ASSERT_EQ(10, results["found"].get()); ASSERT_EQ(3, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("9", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("8", results["hits"][2]["document"]["id"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, MultiFieldMatchRankingOnArray) { Collection *coll1; std::vector fields = {field("name", field_types::STRING, false), field("strong_skills", field_types::STRING_ARRAY, false), field("skills", field_types::STRING_ARRAY, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector>> records = { {{"John Snow"}, {"Golang", "Vue", "React"}, {"Docker", "Goa", "Elixir"}}, {{"Jack Dan"}, {"Golang", "Phoenix", "React"}, {"Docker", "Vue", "Kubernetes"}}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("golang vue", {"strong_skills", "skills"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 1).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, MultiFieldMatchRankingOnFieldOrder) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"Toxic", "Britney Spears"}, {"Bad", "Michael Jackson"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("michael jackson toxic", {"title", "artist"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 10, {}, {}, {}, 0, "", "", {1, 6}).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, PrefixRankedAfterExactMatch) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"Rotini Puttanesca"}, {"Poulet Roti Tout Simple"}, {"Chapatis (Roti)"}, {"School Days Rotini Pasta Salad"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("roti", {"title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5).get(); ASSERT_EQ(4, results["found"].get()); ASSERT_EQ(3, results["hits"].size()); ASSERT_STREQ("2", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("3", results["hits"][2]["document"]["id"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, HighlightWithAccentedCharacters) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get(); } std::vector> records = { {"Mise T.J. à jour Timy depuis PC"}, {"Down There by the T.r.a.i.n"}, {"State Trooper"}, {"The Google Nexus Q Is Baffling"}, }; for (size_t i = 0; i < records.size(); i++) { nlohmann::json doc; doc["id"] = std::to_string(i); doc["title"] = records[i][0]; doc["points"] = i; ASSERT_TRUE(coll1->add(doc.dump()).ok()); } auto results = coll1->search("à jour", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("Mise T.J. à jour Timy depuis PC", results["hits"][0]["highlights"][0]["snippet"].get().c_str()); ASSERT_EQ(2, results["hits"][0]["highlights"][0]["matched_tokens"].size()); ASSERT_STREQ("à", results["hits"][0]["highlights"][0]["matched_tokens"][0].get().c_str()); ASSERT_STREQ("jour", results["hits"][0]["highlights"][0]["matched_tokens"][1].get().c_str()); results = coll1->search("by train", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "title").get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("Down There by the T.r.a.i.n", results["hits"][0]["highlights"][0]["snippet"].get().c_str()); ASSERT_STREQ("Down There by the T.r.a.i.n", results["hits"][0]["highlights"][0]["value"].get().c_str()); results = coll1->search("state trooper", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("State Trooper", results["hits"][0]["highlights"][0]["snippet"].get().c_str()); // test single character highlight results = coll1->search("q", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_STREQ("The Google Nexus Q Is Baffling", results["hits"][0]["highlights"][0]["snippet"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, DISABLED_SearchingForRecordsWithSpecialChars) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("url", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"Amazon Home", "https://amazon.com/"}, {"Google Home", "https://google.com///"}, {"Github Issue", "https://github.com/typesense/typesense/issues/241"}, {"Amazon Search", "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("google", {"title", "url"}, "", {}, {}, {2}, 10, 1, FREQUENCY).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(2, results["hits"][0]["highlights"].size()); ASSERT_EQ("Google Home", results["hits"][0]["highlights"][0]["snippet"].get()); ASSERT_EQ("https://google.com///", results["hits"][0]["highlights"][1]["snippet"].get()); results = coll1->search("amazon.com", {"title", "url"}, "", {}, {}, {2}, 10, 1, FREQUENCY).get(); ASSERT_EQ(3, results["found"].get()); ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get().c_str()); ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get().c_str()); results = coll1->search("typesense", {"title", "url"}, "", {}, {}, {2}, 10, 1, FREQUENCY).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_STREQ("2", results["hits"][0]["document"]["id"].get().c_str()); results = coll1->search("nb_sb_noss_2", {"title", "url"}, "", {}, {}, {2}, 10, 1, FREQUENCY).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_EQ(1, results["hits"][0]["highlights"].size()); ASSERT_EQ("https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2", results["hits"][0]["highlights"][0]["snippet"].get()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, FieldSpecificNumTypos) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"Taylor Swift Karaoke: reputation", "Taylor Swift"}, {"Taylor & Friends", "Adam Smith"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("tayylor", {"title", "artist"}, "", {}, {}, {1, 1}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); results = coll1->search("tayylor", {"title", "artist"}, "", {}, {}, {0, 1}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); // must return error when num_typos does not match length of search fields queried auto res_op = coll1->search("tayylor", {"title"}, "", {}, {}, {0, 1}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}); ASSERT_FALSE(res_op.ok()); ASSERT_EQ("Number of weights in `query_by_weights` does not match number of `query_by` fields.", res_op.error()); // can use a single typo param for multiple fields results = coll1->search("tayylor", {"title", "artist"}, "", {}, {}, {1}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); // wildcard search with typos results = coll1->search("*", {}, "", {}, {}, {1}, 10, 1, FREQUENCY, {true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, BadHighlightingOnText) { Collection *coll1; std::vector fields = {field("text", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } nlohmann::json doc; doc["id"] = "0"; doc["text"] = "include destruction of natural marine and estuarine\\nhabitats, loss of productive agricultural " "land,\\nand soil erosion. 90 When interviewed, multiple\\nexperts stated that inappropriate land use " "and\\nmanagement is a central factor contributing to\\nenvironmental degradation in the " "Castries-Gros\\nIslet Corridor. 91 The construction is placing greater\\nstress on natural resources " "and biodiversity, and\\nthe capacity to produce food and retain freshwater\\nhas been diminished. " "92 Moreover, increased\\nwater consumption by the tourism sector, when\\ncompounded by climate " "change, is increasing food\\nand water insecurity throughout Saint Lucia, as well\\nas suppressing " "long-term growth prospects. 93"; doc["points"] = 0; ASSERT_TRUE(coll1->add(doc.dump()).ok()); auto results = coll1->search("natural saint lucia", {"text"}, "", {}, {}, {1}, 10, 1, FREQUENCY, {true}, 10).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("food\\nand water insecurity throughout Saint Lucia, as well\\nas suppressing long-term", results["hits"][0]["highlights"][0]["snippet"].get().c_str()); ASSERT_EQ(2, results["hits"][0]["highlights"][0]["matched_tokens"].size()); ASSERT_STREQ("Saint", results["hits"][0]["highlights"][0]["matched_tokens"][0].get().c_str()); ASSERT_STREQ("Lucia", results["hits"][0]["highlights"][0]["matched_tokens"][1].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, FieldLevelPrefixConfiguration) { Collection *coll1; std::vector fields = {field("title", field_types::STRING, false), field("artist", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if(coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector> records = { {"Taylor Swift Karaoke: reputation", "Taylor Swift"}, {"Style", "Taylor Swift"}, }; for(size_t i=0; iadd(doc.dump()).ok()); } auto results = coll1->search("taylo", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true, false}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_EQ(1, results["found"].get()); ASSERT_EQ(1, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); results = coll1->search("taylo", {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true, true}, 10, spp::sparse_hash_set(), spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, "", "", {1, 1}).get(); ASSERT_EQ(2, results["found"].get()); ASSERT_EQ(2, results["hits"].size()); ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get().c_str()); ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get().c_str()); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, QueryParsingForPhraseSearch) { Collection* coll1; std::vector fields = {field("title", field_types::STRING, false), field("points", field_types::INT32, false),}; coll1 = collectionManager.get_collection("coll1").get(); if (coll1 == nullptr) { coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); } std::vector q_include_tokens, q_unstemmed_tokens; std::vector> q_exclude_tokens; std::vector> q_phrases; std::string q = R"(the "phrase search" query)"; /*coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false); ASSERT_EQ(2, q_include_tokens.size()); ASSERT_EQ("the", q_include_tokens[0]); ASSERT_EQ("query", q_include_tokens[1]); ASSERT_EQ(1, q_phrases.size()); ASSERT_EQ(2, q_phrases[0].size()); ASSERT_EQ("phrase", q_phrases[0][0]); ASSERT_EQ("search", q_phrases[0][1]); */ // quoted string has trailing padded space q = R"("space padded " query)"; q_include_tokens.clear(); q_exclude_tokens.clear(); q_unstemmed_tokens.clear(); q_phrases.clear(); coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false); ASSERT_EQ(1, q_include_tokens.size()); ASSERT_EQ("query", q_include_tokens[0]); ASSERT_EQ(1, q_phrases.size()); ASSERT_EQ(2, q_phrases[0].size()); ASSERT_EQ("space", q_phrases[0][0]); ASSERT_EQ("padded", q_phrases[0][1]); // multiple quoted strings q = R"("first phrase" "second phrase")"; q_include_tokens.clear(); q_exclude_tokens.clear(); q_phrases.clear(); coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false); ASSERT_EQ(1, q_include_tokens.size()); ASSERT_EQ("*", q_include_tokens[0]); ASSERT_EQ(2, q_phrases.size()); ASSERT_EQ(2, q_phrases[0].size()); ASSERT_EQ("first", q_phrases[0][0]); ASSERT_EQ("phrase", q_phrases[0][1]); ASSERT_EQ("second", q_phrases[1][0]); ASSERT_EQ("phrase", q_phrases[1][1]); // single quoted string q = R"("hello")"; q_include_tokens.clear(); q_exclude_tokens.clear(); q_phrases.clear(); coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false); ASSERT_EQ(1, q_include_tokens.size()); ASSERT_EQ("*", q_include_tokens[0]); ASSERT_EQ(1, q_phrases.size()); ASSERT_EQ(1, q_phrases[0].size()); ASSERT_EQ("hello", q_phrases[0][0]); // stray trailing quote q = R"(hello")"; q_include_tokens.clear(); q_exclude_tokens.clear(); q_phrases.clear(); coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false); ASSERT_EQ(1, q_include_tokens.size()); ASSERT_EQ("hello", q_include_tokens[0]); ASSERT_EQ(0, q_phrases.size()); // padded space one either side of quote q = R"("some query " here)"; q_include_tokens.clear(); q_exclude_tokens.clear(); q_phrases.clear(); coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false); ASSERT_EQ(1, q_include_tokens.size()); ASSERT_EQ("here", q_include_tokens[0]); ASSERT_EQ(1, q_phrases.size()); ASSERT_EQ(2, q_phrases[0].size()); ASSERT_EQ("some", q_phrases[0][0]); ASSERT_EQ("query", q_phrases[0][1]); // with exclude operator q = R"(-"some phrase" here)"; q_include_tokens.clear(); q_exclude_tokens.clear(); q_phrases.clear(); coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false); ASSERT_EQ(1, q_include_tokens.size()); ASSERT_EQ("here", q_include_tokens[0]); ASSERT_EQ(0, q_phrases.size()); ASSERT_EQ(1, q_exclude_tokens.size()); ASSERT_EQ(2, q_exclude_tokens[0].size()); ASSERT_EQ("some", q_exclude_tokens[0][0]); ASSERT_EQ("phrase", q_exclude_tokens[0][1]); // with multiple exclude operators q = R"(-"some phrase" here -token)"; q_include_tokens.clear(); q_exclude_tokens.clear(); q_phrases.clear(); coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false); ASSERT_EQ(1, q_include_tokens.size()); ASSERT_EQ("here", q_include_tokens[0]); ASSERT_EQ(0, q_phrases.size()); ASSERT_EQ(2, q_exclude_tokens.size()); ASSERT_EQ(2, q_exclude_tokens[0].size()); ASSERT_EQ("some", q_exclude_tokens[0][0]); ASSERT_EQ("phrase", q_exclude_tokens[0][1]); ASSERT_EQ(1, q_exclude_tokens[1].size()); ASSERT_EQ("token", q_exclude_tokens[1][0]); collectionManager.drop_collection("coll1"); } TEST_F(CollectionTest, WildcardQueryBy) { nlohmann::json schema = R"({ "name": "posts", "enable_nested_fields": true, "fields": [ {"name": "username", "type": "string", "facet": true}, {"name": "user.rank", "type": "int32", "facet": true}, {"name": "user.bio", "type": "string"}, {"name": "likes", "type": "int32"}, {"name": "content", "type": "object"} ], "default_sorting_field": "likes" })"_json; auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); std::vector json_lines = { R"({"id": "124","username": "user_a","user": {"rank": 100,"bio": "Hi! I'm user_a"},"likes": 5215,"content": {"title": "title 1","body": "body 1 user_a"}})", R"({"id": "125","username": "user_b","user": {"rank": 50,"bio": "user_b here, nice to meet you!"},"likes": 5215,"content": {"title": "title 2","body": "body 2 user_b"}})" }; for (auto const& json: json_lines){ auto add_op = coll->add(json); if (!add_op.ok()) { LOG(INFO) << add_op.error(); } ASSERT_TRUE(add_op.ok()); } // * matches username, user.bio, content.title, content.body auto result = coll->search("user_a", {"*"}, "", {}, {}, {0}).get(); ASSERT_EQ(1, result["found"].get()); ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ("Hi! I'm user_a", result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get()); ASSERT_EQ("user_a", result["hits"][0]["highlight"]["username"]["snippet"].get()); // ASSERT_EQ("body 1 user_a", // result["hits"][0]["highlight"]["content"]["body"]["snippet"].get()); // user* matches username and user.bio result = coll->search("user_a", {"user*"}, "", {}, {}, {0}).get(); ASSERT_EQ(1, result["found"].get()); ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ("Hi! I'm user_a", result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get()); ASSERT_EQ("user_a", result["hits"][0]["highlight"]["username"]["snippet"].get()); // user.* matches user.bio result = coll->search("user_a", {"user.*"}, "", {}, {}, {0}).get(); ASSERT_EQ(1, result["found"].get()); ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ("Hi! I'm user_a", result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get()); // user.rank cannot be queried result = coll->search("100", {"user*"}, "", {}, {}, {0}).get(); ASSERT_EQ(0, result["found"].get()); ASSERT_EQ(0, result["hits"].size()); // No matching field for query_by auto error = coll->search("user_a", {"foo*"}, "", {}, {}, {0}).error(); ASSERT_EQ("No string or string array field found matching the pattern `foo*` in the schema.", error); } TEST_F(CollectionTest, WildcardHighlightFields) { nlohmann::json schema = R"({ "name": "posts", "enable_nested_fields": true, "fields": [ {"name": "user_name", "type": "string", "facet": true}, {"name": "user", "type": "object"} ] })"_json; auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); auto add_op = coll->add(R"({"id": "124","user_name": "user_a","user": {"rank": 100,"phone": "+91 123123123"}})"); if (!add_op.ok()) { LOG(INFO) << add_op.error(); } ASSERT_TRUE(add_op.ok()); spp::sparse_hash_set dummy_include_exclude; std::string highlight_fields = "user*"; // user* matches user_name, user.rank and user.phone auto result = coll->search("123", {"user"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "", "", {}, UINT32_MAX, true, false, true, highlight_fields).get(); ASSERT_EQ(1, result["found"].get()); ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ(1, result["hits"][0]["highlight"].size()); ASSERT_EQ("+91 123123123", result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get()); highlight_fields = "user.*"; // user.* matches user.rank and user.phone result = coll->search("+91", {"user"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "", "", {}, UINT32_MAX, true, false, true, highlight_fields).get(); ASSERT_EQ(1, result["found"].get()); ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ(1, result["hits"][0]["highlight"].size()); ASSERT_EQ("+91 123123123", result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get()); highlight_fields = "user*"; // user* matches user_name, user.rank and user.phone result = coll->search("user_a", {"user_name"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "", "", {}, UINT32_MAX, true, false, true, highlight_fields).get(); ASSERT_EQ(1, result["found"].get()); ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ(1, result["hits"][0]["highlight"].size()); ASSERT_EQ("user_a", result["hits"][0]["highlight"]["user_name"]["snippet"].get()); highlight_fields = "user.*"; // user.* matches user.rank and user.phone result = coll->search("user_a", {"user_name"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "", "", {}, UINT32_MAX, true, false, true, highlight_fields).get(); ASSERT_EQ(1, result["found"].get()); ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ(0, result["hits"][0]["highlight"].size()); highlight_fields = "foo*"; // No matching field for highlight_fields result = coll->search("user_a", {"user_name"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "", "", {}, UINT32_MAX, true, false, true, highlight_fields).get(); ASSERT_EQ(1, result["found"].get()); ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ(0, result["hits"][0]["highlight"].size()); } TEST_F(CollectionTest, WildcardHighlightFullFields) { nlohmann::json schema = R"({ "name": "posts", "enable_nested_fields": true, "fields": [ {"name": "user_name", "type": "string", "facet": true}, {"name": "user.rank", "type": "int32", "facet": true}, {"name": "user.phone", "type": "string"}, {"name": "user.bio", "type": "string"} ] })"_json; auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); auto json = R"({ "id": "124", "user_name": "user_a", "user": { "rank": 100, "phone": "+91 123123123" } })"_json; std::string bio = "Once there was a middle-aged boy named User_a who was an avid swimmer." "He had been swimming competitively for most of his life, and had even competed in several national competitions." "However, despite his passion and talent for the sport, he had never quite managed to win that elusive gold medal." "Determined to change that, User_a began training harder than ever before." "He woke up early every morning to swim laps before work and spent his evenings at the pool as well." "Despite the grueling schedule, he never once complained." "Instead, he reminded himself of his goal: to become a national champion."; json["user"]["bio"] = bio; auto add_op = coll->add(json.dump()); if (!add_op.ok()) { LOG(INFO) << add_op.error(); } ASSERT_TRUE(add_op.ok()); spp::sparse_hash_set dummy_include_exclude; std::string highlight_full_fields = "user*"; // user* matches user_name, user.bio auto result = coll->search("user_a", {"*"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, highlight_full_fields).get(); ASSERT_EQ(1, result["found"].get()); ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ("a middle-aged boy named User_a who was an avid", result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get()); std::string highlighted_value = "Once there was a middle-aged boy named User_a who was an avid swimmer." "He had been swimming competitively for most of his life, and had even competed in several national competitions." "However, despite his passion and talent for the sport, he had never quite managed to win that elusive gold medal." "Determined to change that, User_a began training harder than ever before." "He woke up early every morning to swim laps before work and spent his evenings at the pool as well." "Despite the grueling schedule, he never once complained." "Instead, he reminded himself of his goal: to become a national champion."; ASSERT_EQ( highlighted_value, result["hits"][0]["highlight"]["user"]["bio"]["value"].get()); ASSERT_EQ("user_a", result["hits"][0]["highlight"]["user_name"]["value"].get()); highlight_full_fields = "user.*"; // user.* matches user.bio result = coll->search("user_a", {"*"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, highlight_full_fields).get(); ASSERT_EQ(1, result["found"].get()); ASSERT_EQ(1, result["hits"].size()); ASSERT_EQ(highlighted_value, result["hits"][0]["highlight"]["user"]["bio"]["value"].get()); ASSERT_EQ(0, result["hits"][0]["highlight"]["user_name"].count("value")); highlight_full_fields = "foo*"; // No matching field for highlight_fields result = coll->search("user_a", {"*"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, highlight_full_fields).get(); ASSERT_EQ(0, result["hits"][0]["highlight"]["user"]["bio"].count("value")); ASSERT_EQ(0, result["hits"][0]["highlight"]["user_name"].count("value")); } TEST_F(CollectionTest, SemanticSearchTest) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "name", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); nlohmann::json object; object["name"] = "apple"; auto add_op = coll->add(object.dump()); ASSERT_TRUE(add_op.ok()); ASSERT_EQ("apple", add_op.get()["name"]); ASSERT_EQ(384, add_op.get()["embedding"].size()); spp::sparse_hash_set dummy_include_exclude; auto search_res_op = coll->search("apple", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, ""); ASSERT_TRUE(search_res_op.ok()); auto search_res = search_res_op.get(); ASSERT_EQ(1, search_res["found"].get()); ASSERT_EQ(1, search_res["hits"].size()); ASSERT_EQ("apple", search_res["hits"][0]["document"]["name"].get()); ASSERT_EQ(384, search_res["hits"][0]["document"]["embedding"].size()); } TEST_F(CollectionTest, InvalidSemanticSearch) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "name", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); LOG(INFO) << "op.error(): " << op.error(); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); nlohmann::json object; object["name"] = "apple"; auto add_op = coll->add(object.dump()); ASSERT_TRUE(add_op.ok()); ASSERT_EQ("apple", add_op.get()["name"]); ASSERT_EQ(384, add_op.get()["embedding"].size()); spp::sparse_hash_set dummy_include_exclude; auto search_res_op = coll->search("apple", {"embedding", "embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, ""); ASSERT_FALSE(search_res_op.ok()); } TEST_F(CollectionTest, HybridSearch) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "name", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); nlohmann::json object; object["name"] = "apple"; auto add_op = coll->add(object.dump()); LOG(INFO) << "add_op.error(): " << add_op.error(); ASSERT_TRUE(add_op.ok()); ASSERT_EQ("apple", add_op.get()["name"]); ASSERT_EQ(384, add_op.get()["embedding"].size()); spp::sparse_hash_set dummy_include_exclude; auto search_res_op = coll->search("apple", {"name","embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, ""); ASSERT_TRUE(search_res_op.ok()); auto search_res = search_res_op.get(); ASSERT_EQ(1, search_res["found"].get()); ASSERT_EQ(1, search_res["hits"].size()); ASSERT_EQ("apple", search_res["hits"][0]["document"]["name"].get()); ASSERT_EQ(384, search_res["hits"][0]["document"]["embedding"].size()); } // TEST_F(CollectionTest, EmbedFielsTest) { // nlohmann::json schema = R"({ // "name": "objects", // "fields": [ // {"name": "name", "type": "string"}, // {"name": "embedding", "type":"float[]", "embed":{"from": ["name"]} // ] // })"_json; // EmbedderManager::set_model_dir("/tmp/typesense_test/models"); // // auto op = collectionManager.create_collection(schema); // ASSERT_TRUE(op.ok()); // Collection* coll = op.get(); // nlohmann::json object = R"({ // "name": "apple" // })"_json; // auto embed_op = coll->embed_fields(object); // ASSERT_TRUE(embed_op.ok()); // ASSERT_EQ("apple", object["name"]); // ASSERT_EQ(384, object["embedding"].get>().size()); // } TEST_F(CollectionTest, HybridSearchRankFusionTest) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "name", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); nlohmann::json object; object["name"] = "butter"; auto add_op = coll->add(object.dump()); ASSERT_TRUE(add_op.ok()); object["name"] = "butterball"; add_op = coll->add(object.dump()); ASSERT_TRUE(add_op.ok()); object["name"] = "butterfly"; add_op = coll->add(object.dump()); ASSERT_TRUE(add_op.ok()); spp::sparse_hash_set dummy_include_exclude; auto search_res_op = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, ""); ASSERT_TRUE(search_res_op.ok()); auto search_res = search_res_op.get(); ASSERT_EQ(3, search_res["found"].get()); ASSERT_EQ(3, search_res["hits"].size()); // Vector search order: // 1. butter // 2. butterball // 3. butterfly ASSERT_EQ("butter", search_res["hits"][0]["document"]["name"].get()); ASSERT_EQ("butterball", search_res["hits"][1]["document"]["name"].get()); ASSERT_EQ("butterfly", search_res["hits"][2]["document"]["name"].get()); search_res_op = coll->search("butter", {"name"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, ""); ASSERT_TRUE(search_res_op.ok()); search_res = search_res_op.get(); ASSERT_EQ(3, search_res["found"].get()); ASSERT_EQ(3, search_res["hits"].size()); // Keyword search order: // 1. butter // 2. butterfly // 3. butterball ASSERT_EQ("butter", search_res["hits"][0]["document"]["name"].get()); ASSERT_EQ("butterfly", search_res["hits"][1]["document"]["name"].get()); ASSERT_EQ("butterball", search_res["hits"][2]["document"]["name"].get()); search_res_op = coll->search("butter", {"name","embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, ""); ASSERT_TRUE(search_res_op.ok()); search_res = search_res_op.get(); ASSERT_EQ(3, search_res["found"].get()); ASSERT_EQ(3, search_res["hits"].size()); // Hybrid search with rank fusion order: // 1. butter (1/1 * 0.7) + (1/1 * 0.3) = 1 // 2. butterfly (1/2 * 0.7) + (1/3 * 0.3) = 0.45 // 3. butterball (1/3 * 0.7) + (1/2 * 0.3) = 0.383 ASSERT_EQ("butter", search_res["hits"][0]["document"]["name"].get()); ASSERT_EQ("butterfly", search_res["hits"][1]["document"]["name"].get()); ASSERT_EQ("butterball", search_res["hits"][2]["document"]["name"].get()); ASSERT_FLOAT_EQ((1.0/1.0 * 0.7) + (1.0/1.0 * 0.3), search_res["hits"][0]["hybrid_search_info"]["rank_fusion_score"].get()); ASSERT_FLOAT_EQ((1.0/2.0 * 0.7) + (1.0/3.0 * 0.3), search_res["hits"][1]["hybrid_search_info"]["rank_fusion_score"].get()); ASSERT_FLOAT_EQ((1.0/3.0 * 0.7) + (1.0/2.0 * 0.3), search_res["hits"][2]["hybrid_search_info"]["rank_fusion_score"].get()); } TEST_F(CollectionTest, WildcardSearchWithEmbeddingField) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "name", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); spp::sparse_hash_set dummy_include_exclude; auto search_res_op = coll->search("*", {"name","embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, ""); ASSERT_TRUE(search_res_op.ok()); } TEST_F(CollectionTest, CreateModelDirIfNotExists) { system("mkdir -p /tmp/typesense_test/new_models_dir"); system("rm -rf /tmp/typesense_test/new_models_dir"); EmbedderManager::set_model_dir("/tmp/typesense_test/new_models_dir"); // check if model dir is created ASSERT_TRUE(std::filesystem::exists("/tmp/typesense_test/new_models_dir")); } TEST_F(CollectionTest, EmbedStringArrayField) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "names", "type": "string[]"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["names"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); nlohmann::json doc; doc["names"].push_back("butter"); doc["names"].push_back("butterfly"); doc["names"].push_back("butterball"); auto add_op = coll->add(doc.dump()); ASSERT_TRUE(add_op.ok()); } TEST_F(CollectionTest, MissingFieldForEmbedding) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "names", "type": "string[]"}, {"name": "category", "type": "string", "optional": true}, {"name": "embedding", "type":"float[]", "embed":{"from": ["names", "category"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); nlohmann::json doc; doc["names"].push_back("butter"); doc["names"].push_back("butterfly"); doc["names"].push_back("butterball"); auto add_op = coll->add(doc.dump()); ASSERT_TRUE(add_op.ok()); } TEST_F(CollectionTest, WrongTypeInEmbedFrom) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "category", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": [1122], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_FALSE(op.ok()); ASSERT_EQ("Property `embed.from` must contain only field names as strings.", op.error()); } TEST_F(CollectionTest, WrongTypeForEmbedding) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "category", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["category"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); nlohmann::json doc; doc["category"] = 1; auto add_op = validator_t::validate_embed_fields(doc, coll->get_embedding_fields(), coll->get_schema(), true); ASSERT_FALSE(add_op.ok()); ASSERT_EQ("Field `category` has malformed data.", add_op.error()); } TEST_F(CollectionTest, WrongTypeOfElementForEmbeddingInStringArray) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "category", "type": "string[]"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["category"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); nlohmann::json doc; doc["category"].push_back(33); auto add_op = validator_t::validate_embed_fields(doc, coll->get_embedding_fields(), coll->get_schema(), true); ASSERT_FALSE(add_op.ok()); ASSERT_EQ("Field `category` has malformed data.", add_op.error()); } TEST_F(CollectionTest, UpdateEmbeddingsForUpdatedDocument) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "name", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); Collection* coll = op.get(); nlohmann::json doc; doc["name"] = "butter"; auto add_op = coll->add(doc.dump()); ASSERT_TRUE(add_op.ok()); // get embedding field // get id of the document auto id = add_op.get()["id"]; // get embedding field from the document auto embedding_field = add_op.get()["embedding"].get>(); ASSERT_EQ(384, embedding_field.size()); // update the document nlohmann::json update_doc; update_doc["name"] = "butterball"; std::string dirty_values; auto update_op = coll->update_matching_filter("id:=" + id.get(), update_doc.dump(), dirty_values); ASSERT_TRUE(update_op.ok()); ASSERT_EQ(1, update_op.get()["num_updated"]); // get the document again auto get_op = coll->get(id); ASSERT_TRUE(get_op.ok()); auto updated_embedding_field = get_op.get()["embedding"].get>(); // check if the embedding field is updated ASSERT_NE(embedding_field, updated_embedding_field); } TEST_F(CollectionTest, CreateCollectionWithOpenAI) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "name", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "openai/text-embedding-ada-002"}}} ] })"_json; if (std::getenv("api_key") == nullptr) { LOG(INFO) << "Skipping test as api_key is not set."; return; } auto api_key = std::string(std::getenv("api_key")); schema["fields"][1]["embed"]["model_config"]["api_key"] = api_key; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); // create one more collection schema = R"({ "name": "objects2", "fields": [ {"name": "name", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "openai/text-embedding-ada-002"}}} ] })"_json; schema["fields"][1]["embed"]["model_config"]["api_key"] = api_key; op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); } TEST_F(CollectionTest, CreateOpenAIEmbeddingField) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "name", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "openai/text-embedding-ada-002"}}} ] })"_json; if (std::getenv("api_key") == nullptr) { LOG(INFO) << "Skipping test as api_key is not set."; return; } auto api_key = std::string(std::getenv("api_key")); schema["fields"][1]["embed"]["model_config"]["api_key"] = api_key; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); auto summary = op.get()->get_summary_json(); ASSERT_EQ("openai/text-embedding-ada-002", summary["fields"][1]["embed"]["model_config"]["model_name"]); ASSERT_EQ(1536, summary["fields"][1]["num_dim"]); nlohmann::json doc; doc["name"] = "butter"; auto add_op = op.get()->add(doc.dump()); ASSERT_TRUE(add_op.ok()); ASSERT_EQ(1536, add_op.get()["embedding"].size()); } TEST_F(CollectionTest, HideOpenAIApiKey) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "name", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "openai/text-embedding-ada-002"}}} ] })"_json; if (std::getenv("api_key") == nullptr) { LOG(INFO) << "Skipping test as api_key is not set."; return; } auto api_key = std::string(std::getenv("api_key")); schema["fields"][1]["embed"]["model_config"]["api_key"] = api_key; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); auto summary = op.get()->get_summary_json(); // hide api key with * after first 3 characters ASSERT_EQ(summary["fields"][1]["embed"]["model_config"]["api_key"].get(), api_key.replace(5, api_key.size() - 5, api_key.size() - 5, '*')); } TEST_F(CollectionTest, PrefixSearchDisabledForOpenAI) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "name", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "openai/text-embedding-ada-002"}}} ] })"_json; if (std::getenv("api_key") == nullptr) { LOG(INFO) << "Skipping test as api_key is not set."; return; } auto api_key = std::string(std::getenv("api_key")); schema["fields"][1]["embed"]["model_config"]["api_key"] = api_key; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); nlohmann::json doc; doc["name"] = "butter"; auto add_op = op.get()->add(doc.dump()); ASSERT_TRUE(add_op.ok()); spp::sparse_hash_set dummy_include_exclude; auto search_res_op = op.get()->search("dummy", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, ""); ASSERT_FALSE(search_res_op.ok()); ASSERT_EQ("Prefix search is not supported for remote embedders. Please set `prefix=false` as an additional search parameter to disable prefix searching.", search_res_op.error()); search_res_op = op.get()->search("dummy", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, ""); ASSERT_TRUE(search_res_op.ok()); } TEST_F(CollectionTest, MoreThanOneEmbeddingField) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "name", "type": "string"}, {"name": "name2", "type": "string"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}, {"name": "embedding2", "type":"float[]", "embed":{"from": ["name2"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); auto coll = op.get(); nlohmann::json doc; doc["name"] = "butter"; doc["name2"] = "butterball"; auto add_op = validator_t::validate_embed_fields(doc, op.get()->get_embedding_fields(), op.get()->get_schema(), true); ASSERT_TRUE(add_op.ok()); spp::sparse_hash_set dummy_include_exclude; auto search_res_op = coll->search("butter", {"name", "embedding", "embedding2"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, ""); ASSERT_FALSE(search_res_op.ok()); ASSERT_EQ("Only one embedding field is allowed in the query.", search_res_op.error()); } TEST_F(CollectionTest, EmbeddingFieldEmptyArrayInDocument) { nlohmann::json schema = R"({ "name": "objects", "fields": [ {"name": "names", "type": "string[]"}, {"name": "embedding", "type":"float[]", "embed":{"from": ["names"], "model_config": {"model_name": "ts/e5-small"}}} ] })"_json; EmbedderManager::set_model_dir("/tmp/typesense_test/models"); auto op = collectionManager.create_collection(schema); ASSERT_TRUE(op.ok()); auto coll = op.get(); nlohmann::json doc; doc["names"] = nlohmann::json::array(); // try adding auto add_op = coll->add(doc.dump()); ASSERT_TRUE(add_op.ok()); ASSERT_TRUE(add_op.get()["embedding"].is_null()); // try updating auto id = add_op.get()["id"]; doc["names"].push_back("butter"); std::string dirty_values; auto update_op = coll->update_matching_filter("id:=" + id.get(), doc.dump(), dirty_values); ASSERT_TRUE(update_op.ok()); ASSERT_EQ(1, update_op.get()["num_updated"]); auto get_op = coll->get(id); ASSERT_TRUE(get_op.ok()); ASSERT_FALSE(get_op.get()["embedding"].is_null()); ASSERT_EQ(384, get_op.get()["embedding"].size()); } TEST_F(CollectionTest, CatchPartialResponseFromRemoteEmbedding) { std::string partial_json = R"({ "results": [ { "embedding": [ 0.0, 0.0, 0.0 ], "text": "butter" }, { "embedding": [ 0.0, 0.0, 0.0 ], "text": "butterball" }, { "embedding": [ 0.0, 0.0)"; nlohmann::json req_body = R"({ "inputs": [ "butter", "butterball", "butterfly" ] })"_json; OpenAIEmbedder embedder("", "", 0, false, ""); auto res = embedder.get_error_json(req_body, 200, partial_json); ASSERT_EQ(res["response"]["error"], "Malformed response from OpenAI API."); ASSERT_EQ(res["request"]["body"], req_body); }