#include <gtest/gtest.h>
#include <string>
#include <vector>
#include <fstream>
#include <sstream>   // std::stringstream (used in setupCollection / ImportDocumentsUpsert)
#include <algorithm>
#include <filesystem>
#include <cstdlib>
#include <ctime>     // time_t, time(), used by get_text()
#include <collection_manager.h>
#include <validator.h>
#include "collection.h"
#include "embedder_manager.h"
#include "http_client.h"

class CollectionTest : public ::testing::Test {
protected:
    Collection *collection;
    std::vector<std::string> query_fields;
    Store *store;
    CollectionManager & collectionManager = CollectionManager::get_instance();
    std::atomic<bool> quit = false;
    std::vector<sort_by> sort_fields;

    // used for generating random text
    std::vector<std::string> words;

    void setupCollection() {
        std::string state_dir_path = "/tmp/typesense_test/collection";
        LOG(INFO) << "Truncating and creating: " << state_dir_path;
        system(("rm -rf "+state_dir_path+" && mkdir -p "+state_dir_path).c_str());
        system("mkdir -p /tmp/typesense_test/models");

        store = new Store(state_dir_path);
        collectionManager.init(store, 1.0, "auth_key", quit);
        collectionManager.load(8, 1000);

        std::ifstream infile(std::string(ROOT_DIR)+"test/documents.jsonl");
        std::vector<field> search_fields = {
            field("title", field_types::STRING, false),
            field("points", field_types::INT32, false)
        };

        query_fields = {"title"};
        sort_fields = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "DESC") };

        collection = collectionManager.get_collection("collection").get();
        if(collection == nullptr) {
            collection = collectionManager.create_collection("collection", 4, search_fields, "points").get();
        }

        std::string json_line;

        // dummy record with id 0, so that test record IDs match the line numbers of the fixtures file
        json_line = "{\"points\":10,\"title\":\"z\"}";
        collection->add(json_line);

        while (std::getline(infile, json_line)) {
            collection->add(json_line);
        }

        infile.close();

        std::ifstream words_file(std::string(ROOT_DIR)+"test/resources/common100_english.txt");
        std::stringstream strstream;
        strstream << words_file.rdbuf();
        words_file.close();
        StringUtils::split(strstream.str(), words, "\n");
    }

    virtual void SetUp() {
        setupCollection();
    }

    virtual void TearDown() {
        collectionManager.drop_collection("collection");
        collectionManager.dispose();
        delete store;
    }

    std::string get_text(size_t num_words) {
        time_t t;
        srand((unsigned) time(&t));
        std::vector<std::string> strs;

        for(size_t i = 0 ; i < num_words ; i++ ) {
            int word_index = rand() % words.size();
            strs.push_back(words[word_index]);
        }
        return StringUtils::join(strs, " ");
    }
};
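
// A minimal helper sketch capturing the per-hit ID verification loop that recurs throughout
// this file; illustrative only, the tests below keep their original inline loops.
void assert_result_ids_match(const nlohmann::json& results, const std::vector<std::string>& ids) {
    ASSERT_EQ(ids.size(), results["hits"].size());
    for(size_t i = 0; i < results["hits"].size(); i++) {
        std::string result_id = results["hits"].at(i)["document"]["id"];
        ASSERT_STREQ(ids.at(i).c_str(), result_id.c_str());
    }
}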

TEST_F(CollectionTest, VerifyCountOfDocuments) {
    // we have 1 dummy record so that fixture line numbers match record sequence numbers
    ASSERT_EQ(24+1, collection->get_num_documents());

    // when no specific dirty values option is sent, a collection with an explicit schema should default to REJECT
    std::string empty_dirty_values;
    ASSERT_EQ(DIRTY_VALUES::REJECT, collection->parse_dirty_values_option(empty_dirty_values));
}
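
// Of the dirty-values modes, this file exercises only DIRTY_VALUES::REJECT (the default asserted
// above for a collection with an explicit schema) and DIRTY_VALUES::COERCE_OR_REJECT (passed
// explicitly in the import tests further below).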

TEST_F(CollectionTest, RetrieveADocumentById) {
    Option<nlohmann::json> doc_option = collection->get("1");
    ASSERT_TRUE(doc_option.ok());
    nlohmann::json doc = doc_option.get();
    std::string id = doc["id"];

    doc_option = collection->get("foo");
    ASSERT_TRUE(doc_option.ok());
    doc = doc_option.get();
    id = doc["id"];
    ASSERT_STREQ("foo", id.c_str());

    doc_option = collection->get("baz");
    ASSERT_FALSE(doc_option.ok());
}

TEST_F(CollectionTest, ExactSearchShouldBeStable) {
    std::vector<std::string> facets;
    nlohmann::json results = collection->search("the", query_fields, "", facets, sort_fields, {0}, 10,
                                                1, FREQUENCY, {false}).get();
    ASSERT_EQ(7, results["hits"].size());
    ASSERT_EQ(7, results["found"].get<int>());

    ASSERT_STREQ("collection", results["request_params"]["collection_name"].get<std::string>().c_str());
    ASSERT_STREQ("the", results["request_params"]["q"].get<std::string>().c_str());
    ASSERT_EQ(10, results["request_params"]["per_page"].get<size_t>());

    // For two documents of the same score, the larger doc_id appears first
    std::vector<std::string> ids = {"1", "6", "foo", "13", "10", "8", "16"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string id = ids.at(i);
        std::string result_id = result["document"]["id"];
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // check ASC sorting
    std::vector<sort_by> sort_fields_asc = { sort_by("points", "ASC") };

    results = collection->search("the", query_fields, "", facets, sort_fields_asc, {0}, 10,
                                 1, FREQUENCY, {false}).get();
    ASSERT_EQ(7, results["hits"].size());
    ASSERT_EQ(7, results["found"].get<int>());

    ids = {"16", "13", "10", "8", "6", "foo", "1"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string id = ids.at(i);
        std::string result_id = result["document"]["id"];
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // when a query does not return results, hits and found fields should still exist in response
    results = collection->search("zxsadqewsad", query_fields, "", facets, sort_fields_asc, {0}, 10,
                                 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(0, results["found"].get<int>());
}

TEST_F(CollectionTest, MultiTokenSearch) {
    std::vector<std::string> facets;
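    // Note on the long positional tails in the search() calls below (inferred from how these
    // tests vary them, not a verbatim signature): after the sort spec come num_typos, per_page,
    // page, the ranking order (FREQUENCY / MAX_SCORE) and the per-field prefix flags, followed
    // by the drop-tokens threshold, include/exclude field sets and facet/highlight knobs; the
    // trailing numeric value exercised in TypoTokensThreshold is the typo_tokens_threshold.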
    nlohmann::json results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 10,
                                                1, FREQUENCY,
                                                {false}, 10,
                                                spp::sparse_hash_set<std::string>(),
                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                                "", 10).get();
    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(5, results["found"].get<uint32_t>());

    /*
       Sort by (match, diff, score)
       8:  score: 12, diff: 0
       1:  score: 15, diff: 4
       17: score: 8,  diff: 4
       16: score: 10, diff: 5
       13: score: 12, (single word match)
    */

    std::vector<std::string> ids = {"8", "1", "17", "16", "13"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string id = ids.at(i);
        std::string result_id = result["document"]["id"];
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    ASSERT_EQ(results["hits"][0]["highlights"].size(), (unsigned long) 1);
    ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str(), "title");
    ASSERT_STREQ(results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str(),
                 "What is the power, requirement of a <mark>rocket</mark> <mark>launch</mark> these days?");

    // Check ASC sort order
    std::vector<sort_by> sort_fields_asc = { sort_by(sort_field_const::text_match, "DESC"), sort_by("points", "ASC") };
    results = collection->search("rocket launch", query_fields, "", facets, sort_fields_asc, {0}, 10,
                                 1, FREQUENCY,
                                 {false}, 10,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 10).get();
    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(5, results["found"].get<uint32_t>());

    ids = {"8", "17", "1", "16", "13"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string id = ids.at(i);
        std::string result_id = result["document"]["id"];
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // Check pagination
    results = collection->search("rocket launch", query_fields, "", facets, sort_fields, {0}, 3,
                                 1, FREQUENCY,
                                 {false}, 10,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 10).get();
    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(5, results["found"].get<uint32_t>());

    ASSERT_EQ(3, results["request_params"]["per_page"].get<size_t>());

    ids = {"8", "1", "17"};

    for(size_t i = 0; i < 3; i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string id = ids.at(i);
        std::string result_id = result["document"]["id"];
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }
}

TEST_F(CollectionTest, SearchWithExcludedTokens) {
    std::vector<std::string> facets;
    nlohmann::json results = collection->search("how -propellants -are", query_fields, "", facets, sort_fields, {0}, 10,
                                                1, FREQUENCY,
                                                {false}, 10,
                                                spp::sparse_hash_set<std::string>(),
                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                                "", 10).get();

    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ(2, results["found"].get<uint32_t>());

    std::vector<std::string> ids = {"9", "17"};

    for (size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string id = ids.at(i);
        std::string result_id = result["document"]["id"];
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    results = collection->search("-rocket", query_fields, "", facets, sort_fields, {0}, 50).get();

    ASSERT_EQ(21, results["found"].get<uint32_t>());
    ASSERT_EQ(21, results["hits"].size());

    results = collection->search("-rocket -cryovolcanism", query_fields, "", facets, sort_fields, {0}, 50).get();

    ASSERT_EQ(20, results["found"].get<uint32_t>());
}

TEST_F(CollectionTest, SkipUnindexedTokensDuringMultiTokenSearch) {
    // Tokens that are not found in the index should be skipped
    std::vector<std::string> facets;
    nlohmann::json results = collection->search("DoesNotExist from", query_fields, "", facets, sort_fields, {0}, 10).get();
    ASSERT_EQ(2, results["hits"].size());

    std::vector<std::string> ids = {"2", "17"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string id = ids.at(i);
        std::string result_id = result["document"]["id"];
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // with non-zero cost
    results = collection->search("DoesNotExist from", query_fields, "", facets, sort_fields, {1}, 10).get();
    ASSERT_EQ(2, results["hits"].size());

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string id = ids.at(i);
        std::string result_id = result["document"]["id"];
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // with 2 indexed words
    results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, sort_fields, {1}, 10).get();
    ASSERT_EQ(2, results["hits"].size());
    ids = {"2", "17"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string id = ids.at(i);
        std::string result_id = result["document"]["id"];
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // exhaustive search should give same results
    results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, sort_fields, {1}, 10,
                                 1, FREQUENCY, {true},
                                 1, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 1, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {}, 1000,
                                 true, false, true, "", true).get();
    ASSERT_EQ(2, results["hits"].size());
    ids = {"2", "17"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string id = ids.at(i);
        std::string result_id = result["document"]["id"];
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // should not try to drop tokens to expand query
    results.clear();
    results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 10).get();
    ASSERT_EQ(9, results["hits"].size());

    results.clear();
    results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(3, results["hits"].size());
    ids = {"8", "16", "10"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string id = ids.at(i);
        std::string result_id = result["document"]["id"];
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    results.clear();
    results = collection->search("the a insurance", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(0, results["hits"].size());
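
    // Note: the positional value right after the prefix flags is the drop-tokens threshold.
    // With a threshold of 0 (above), query tokens are never dropped: "the a" matches only the
    // 3 docs containing both tokens, and adding the unindexed token "insurance" matches nothing,
    // whereas the earlier call with a threshold of 10 expanded the same query to 9 hits.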

    // with no indexed word
    results.clear();
    results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, sort_fields, {0}, 10).get();
    ASSERT_EQ(0, results["hits"].size());

    results.clear();
    results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, sort_fields, {2}, 10).get();
    ASSERT_EQ(0, results["hits"].size());
}

TEST_F(CollectionTest, PartialMultiTokenSearch) {
    std::vector<std::string> facets;
    nlohmann::json results = collection->search("rocket research", query_fields, "", facets,
                                                sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10).get();
    ASSERT_EQ(6, results["hits"].size());

    std::vector<std::string> ids = {"19", "1", "10", "8", "16", "17"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }
}

TEST_F(CollectionTest, QueryWithTypo) {
    std::vector<std::string> facets;
    nlohmann::json results = collection->search("kind biologcal", query_fields, "", facets, sort_fields, {2}, 3,
                                                1, FREQUENCY,
                                                {false}, 10,
                                                spp::sparse_hash_set<std::string>(),
                                                spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                                "", 10).get();
    ASSERT_EQ(3, results["hits"].size());

    std::vector<std::string> ids = {"19", "3", "20"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    results.clear();
    results = collection->search("lauxnch rcket", query_fields, "", facets, sort_fields, {1}, 3,
                                 1, FREQUENCY,
                                 {false}, 10,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 10).get();

    ids = {"8", "1", "17"};

    ASSERT_EQ(3, results["hits"].size());

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }
}

TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
    std::vector<std::string> facets;
    nlohmann::json results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 2, 1, MAX_SCORE, {false}).get();
    ASSERT_EQ(2, results["hits"].size());
    std::vector<std::string> ids = {"22", "3"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 3, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(3, results["hits"].size());
    ids = {"22", "3", "12"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // Check pagination
    results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 1, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(5, results["found"].get<int>());
    ASSERT_EQ(1, results["hits"].size());
    std::string solo_id = results["hits"].at(0)["document"]["id"];
    ASSERT_STREQ("22", solo_id.c_str());

    results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 2, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(5, results["found"].get<int>());
    ASSERT_EQ(2, results["hits"].size());

    // Check total ordering

    results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(5, results["hits"].size());
    ids = {"22", "3", "12", "23", "24"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    results = collection->search("loox", query_fields, "", facets, sort_fields, {1}, 10, 1, MAX_SCORE, {false}).get();
    ASSERT_EQ(5, results["hits"].size());
    ids = {"22", "3", "12", "23", "24"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }
}

TEST_F(CollectionTest, TextContainingAnActualTypo) {
    // A line contains "ISSX" but not "what" - need to ensure that correction to "ISS what" happens
    std::vector<std::string> facets;
    nlohmann::json results = collection->search("ISSX what", query_fields, "", facets, sort_fields, {1}, 4, 1, FREQUENCY, {false},
                                                20, spp::sparse_hash_set<std::string>(), spp::sparse_hash_set<std::string>(),
                                                10, "", 30, 5, "", 20).get();
    ASSERT_EQ(4, results["hits"].size());
    ASSERT_EQ(11, results["found"].get<uint32_t>());

    std::vector<std::string> ids = {"19", "6", "21", "22"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // Record containing exact token match should appear first
    results = collection->search("ISSX", query_fields, "", facets, sort_fields, {1}, 10, 1, FREQUENCY, {false}, 10,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 10).get();

    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(5, results["found"].get<uint32_t>());

    ids = {"20", "19", "6", "3", "21"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }
}

TEST_F(CollectionTest, Pagination) {
    nlohmann::json results = collection->search("the", query_fields, "", {}, sort_fields, {0}, 3, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(7, results["found"].get<uint32_t>());

    std::vector<std::string> ids = {"1", "6", "foo"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    results = collection->search("the", query_fields, "", {}, sort_fields, {0}, 3, 2, FREQUENCY, {false}).get();
    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(7, results["found"].get<uint32_t>());

    ids = {"13", "10", "8"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    results = collection->search("the", query_fields, "", {}, sort_fields, {0}, 3, 3, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(7, results["found"].get<uint32_t>());

    ids = {"16"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }
}

TEST_F(CollectionTest, WildcardQuery) {
    nlohmann::json results = collection->search("*", query_fields, "points:>0", {}, sort_fields, {0}, 3, 1, FREQUENCY,
                                                {false}).get();

    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<uint32_t>());

    // when no filter is specified, fall back on a catch-all filter based on the default sorting field
    Option<nlohmann::json> results_op = collection->search("*", query_fields, "", {}, sort_fields, {0}, 3, 1, FREQUENCY,
                                                           {false});

    ASSERT_TRUE(results_op.ok());
    results = results_op.get();
    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<uint32_t>());

    // wildcard query with no filters and ASC sort
    std::vector<sort_by> sort_fields = { sort_by("points", "ASC") };
    results = collection->search("*", query_fields, "", {}, sort_fields, {0}, 3, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<uint32_t>());

    std::vector<std::string> ids = {"21", "24", "17"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // wildcard query should not require a search field
    results_op = collection->search("*", {}, "", {}, sort_fields, {0}, 3, 1, FREQUENCY, {false});
    ASSERT_TRUE(results_op.ok());
    results = results_op.get();
    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<uint32_t>());

    // non-wildcard query should require a search field
    results_op = collection->search("the", {}, "", {}, sort_fields, {0}, 3, 1, FREQUENCY, {false});
    ASSERT_FALSE(results_op.ok());
    ASSERT_STREQ("No search fields specified for the query.", results_op.error().c_str());

    Collection* empty_coll;
    std::vector<field> fields = {field("title", field_types::STRING, false)};

    empty_coll = collectionManager.get_collection("empty_coll").get();
    if(empty_coll == nullptr) {
        empty_coll = collectionManager.create_collection("empty_coll", 1, fields).get();
    }
    results = empty_coll->search("*", {}, "title:!= foo", {}, {}, {0}, 3, 1).get();
    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(0, results["found"]);
}
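
// Filter syntax exercised across these tests: "points:>0" (numeric comparison), "cast: chris"
// (token match on an unfaceted string-array field, see MultipleFields below), "starring:= [...]"
// (exact match, see the import tests) and "title:!= foo" (negated exact match, above).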

TEST_F(CollectionTest, PrefixSearching) {
    std::vector<std::string> facets;
    nlohmann::json results = collection->search("ex", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {true}).get();
    ASSERT_EQ(2, results["hits"].size());
    std::vector<std::string> ids = {"6", "12"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    results = collection->search("ex", query_fields, "", facets, sort_fields, {0}, 10, 1, MAX_SCORE, {true}).get();
    ASSERT_EQ(2, results["hits"].size());
    ids = {"6", "12"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    results = collection->search("what ex", query_fields, "", facets, sort_fields, {0}, 10, 1, MAX_SCORE, {true}, 10,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 10).get();
    ASSERT_EQ(9, results["hits"].size());
    ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // restrict to only 2 results and differentiate between MAX_SCORE and FREQUENCY
    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, MAX_SCORE, {true}, 10,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 10).get();
    ASSERT_EQ(2, results["hits"].size());
    ids = {"19", "22"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    results = collection->search("t", query_fields, "", facets, sort_fields, {0}, 2, 1, FREQUENCY, {true}, 10,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 10).get();
    ASSERT_EQ(2, results["hits"].size());
    ids = {"1", "2"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // only the last token in the query should be used for prefix search - so, "math" should not match "mathematics"
    results = collection->search("math fx", query_fields, "", facets, sort_fields, {0}, 1, 1, FREQUENCY, {true}, 0).get();
    ASSERT_EQ(0, results["hits"].size());

    // single and double char prefixes should set a ceiling on the num_typos possible
    results = collection->search("x", query_fields, "", facets, sort_fields, {2}, 2, 1, FREQUENCY, {true}).get();
    ASSERT_EQ(0, results["hits"].size());

    // prefix with a typo
    results = collection->search("late propx", query_fields, "", facets, sort_fields, {2}, 1, 1, FREQUENCY, {true}).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("16", results["hits"].at(0)["document"]["id"]);
}

TEST_F(CollectionTest, TypoTokensThreshold) {
    // Typo correction should happen only based on the `typo_tokens_threshold` value
    auto results = collection->search("redundant", {"title"}, "", {}, sort_fields, {2}, 10, 1,
                                      token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                                      spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "", 0).get();

    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(1, results["found"].get<size_t>());

    results = collection->search("redundant", {"title"}, "", {}, sort_fields, {2}, 10, 1,
                                 token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "", 10).get();

    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ(2, results["found"].get<size_t>());
}

TEST_F(CollectionTest, MultiOccurrenceString) {
    Collection *coll_multi_string;

    std::vector<field> fields = {
        field("title", field_types::STRING, false),
        field("points", field_types::INT32, false)
    };

    coll_multi_string = collectionManager.get_collection("coll_multi_string").get();
    if (coll_multi_string == nullptr) {
        coll_multi_string = collectionManager.create_collection("coll_multi_string", 4, fields, "points").get();
    }

    nlohmann::json document;
    document["title"] = "The brown fox was the tallest of the lot and the quickest of the trot.";
    document["points"] = 100;

    coll_multi_string->add(document.dump()).get();

    query_fields = {"title"};
    nlohmann::json results = coll_multi_string->search("the", query_fields, "", {}, sort_fields, {0}, 10, 1,
                                                       FREQUENCY, {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());
    collectionManager.drop_collection("coll_multi_string");
}

TEST_F(CollectionTest, ArrayStringFieldHighlight) {
    Collection *coll_array_text;

    std::ifstream infile(std::string(ROOT_DIR)+"test/array_text_documents.jsonl");
    std::vector<field> fields = {
        field("title", field_types::STRING, false),
        field("tags", field_types::STRING_ARRAY, false),
        field("points", field_types::INT32, false)
    };

    coll_array_text = collectionManager.get_collection("coll_array_text").get();
    if (coll_array_text == nullptr) {
        coll_array_text = collectionManager.create_collection("coll_array_text", 4, fields, "points").get();
    }

    std::string json_line;

    while (std::getline(infile, json_line)) {
        coll_array_text->add(json_line);
    }

    infile.close();

    query_fields = {"tags"};
    std::vector<std::string> facets;

    nlohmann::json results = coll_array_text->search("truth about", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY,
                                                     {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());

    std::vector<std::string> ids = {"0"};

    for (size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    ASSERT_EQ(results["hits"][0]["highlights"].size(), 1);
    ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str(), "tags");

    // an array's snippets must be sorted on match score; when scores are equal, lower indices get priority
    ASSERT_EQ(3, results["hits"][0]["highlights"][0]["snippets"].size());
    ASSERT_STREQ("<mark>truth</mark> <mark>about</mark>", results["hits"][0]["highlights"][0]["snippets"][0].get<std::string>().c_str());
    ASSERT_STREQ("the <mark>truth</mark>", results["hits"][0]["highlights"][0]["snippets"][1].get<std::string>().c_str());
    ASSERT_STREQ("<mark>about</mark> forever", results["hits"][0]["highlights"][0]["snippets"][2].get<std::string>().c_str());

    ASSERT_EQ(3, results["hits"][0]["highlights"][0]["indices"].size());
    ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"][0]);
    ASSERT_EQ(0, results["hits"][0]["highlights"][0]["indices"][1]);
    ASSERT_EQ(1, results["hits"][0]["highlights"][0]["indices"][2]);

    results = coll_array_text->search("forever truth", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY,
                                      {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());

    ids = {"0"};

    for (size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str(), "tags");
    ASSERT_EQ(3, results["hits"][0]["highlights"][0]["snippets"].size());
    ASSERT_STREQ("the <mark>truth</mark>", results["hits"][0]["highlights"][0]["snippets"][0].get<std::string>().c_str());
    ASSERT_STREQ("about <mark>forever</mark>", results["hits"][0]["highlights"][0]["snippets"][1].get<std::string>().c_str());
    ASSERT_STREQ("<mark>truth</mark> about", results["hits"][0]["highlights"][0]["snippets"][2].get<std::string>().c_str());
    ASSERT_EQ(3, results["hits"][0]["highlights"][0]["indices"].size());
    ASSERT_EQ(0, results["hits"][0]["highlights"][0]["indices"][0]);
    ASSERT_EQ(1, results["hits"][0]["highlights"][0]["indices"][1]);
    ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"][2]);

    results = coll_array_text->search("truth", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY,
                                      {false}, 0).get();
    ASSERT_EQ(2, results["hits"].size());

    ids = {"1", "0"};

    for (size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    results = coll_array_text->search("asdadasd", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY,
                                      {false}, 0).get();
    ASSERT_EQ(0, results["hits"].size());

    query_fields = {"title", "tags"};
    results = coll_array_text->search("truth", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY,
                                      {false}, 0).get();
    ASSERT_EQ(2, results["hits"].size());
    ASSERT_EQ(2, results["hits"][0]["highlights"].size());

    ids = {"1", "0"};

    for (size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    ASSERT_EQ(4, results["hits"][0]["highlights"][0].size());
    ASSERT_STREQ(results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str(), "tags");
    ASSERT_EQ(2, results["hits"][0]["highlights"][0]["snippets"].size());
    ASSERT_STREQ("<mark>truth</mark>", results["hits"][0]["highlights"][0]["snippets"][0].get<std::string>().c_str());
    ASSERT_STREQ("plain <mark>truth</mark>", results["hits"][0]["highlights"][0]["snippets"][1].get<std::string>().c_str());
    ASSERT_EQ(2, results["hits"][0]["highlights"][0]["matched_tokens"].size());
    ASSERT_STREQ("truth", results["hits"][0]["highlights"][0]["matched_tokens"][0][0].get<std::string>().c_str());
    ASSERT_STREQ("truth", results["hits"][0]["highlights"][0]["matched_tokens"][1][0].get<std::string>().c_str());
    ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"].size());
    ASSERT_EQ(1, results["hits"][0]["highlights"][0]["indices"][0]);
    ASSERT_EQ(2, results["hits"][0]["highlights"][0]["indices"][1]);

    ASSERT_EQ(3, results["hits"][0]["highlights"][1].size());
    ASSERT_STREQ("title", results["hits"][0]["highlights"][1]["field"].get<std::string>().c_str());
    ASSERT_STREQ("Plain <mark>Truth</mark>", results["hits"][0]["highlights"][1]["snippet"].get<std::string>().c_str());
    ASSERT_EQ(1, results["hits"][0]["highlights"][1]["matched_tokens"].size());
    ASSERT_STREQ("Truth", results["hits"][0]["highlights"][1]["matched_tokens"][0].get<std::string>().c_str());

    ASSERT_EQ(3, results["hits"][1]["highlights"][0].size());
    ASSERT_STREQ("title", results["hits"][1]["highlights"][0]["field"].get<std::string>().c_str());
    ASSERT_STREQ("The <mark>Truth</mark> About Forever", results["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
    ASSERT_EQ(1, results["hits"][1]["highlights"][0]["matched_tokens"].size());
    ASSERT_STREQ("Truth", results["hits"][1]["highlights"][0]["matched_tokens"][0].get<std::string>().c_str());

    ASSERT_EQ(4, results["hits"][1]["highlights"][1].size());
    ASSERT_STREQ(results["hits"][1]["highlights"][1]["field"].get<std::string>().c_str(), "tags");
    ASSERT_EQ(2, results["hits"][1]["highlights"][1]["snippets"].size());
    ASSERT_STREQ("the <mark>truth</mark>", results["hits"][1]["highlights"][1]["snippets"][0].get<std::string>().c_str());
    ASSERT_STREQ("<mark>truth</mark> about", results["hits"][1]["highlights"][1]["snippets"][1].get<std::string>().c_str());

    ASSERT_EQ(2, results["hits"][1]["highlights"][1]["matched_tokens"].size());
    ASSERT_STREQ("truth", results["hits"][1]["highlights"][1]["matched_tokens"][0][0].get<std::string>().c_str());
    ASSERT_STREQ("truth", results["hits"][1]["highlights"][1]["matched_tokens"][1][0].get<std::string>().c_str());

    ASSERT_EQ(2, results["hits"][1]["highlights"][1]["indices"].size());
    ASSERT_EQ(0, results["hits"][1]["highlights"][1]["indices"][0]);
    ASSERT_EQ(2, results["hits"][1]["highlights"][1]["indices"][1]);

    // highlight fields must be ordered based on match score
    results = coll_array_text->search("amazing movie", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY,
                                      {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(2, results["hits"][0]["highlights"].size());

    ASSERT_EQ(4, results["hits"][0]["highlights"][0].size());
    ASSERT_STREQ("tags", results["hits"][0]["highlights"][0]["field"].get<std::string>().c_str());
    ASSERT_STREQ("<mark>amazing</mark> <mark>movie</mark>", results["hits"][0]["highlights"][0]["snippets"][0].get<std::string>().c_str());
    ASSERT_EQ(1, results["hits"][0]["highlights"][0]["indices"].size());
    ASSERT_EQ(0, results["hits"][0]["highlights"][0]["indices"][0]);
    ASSERT_EQ(1, results["hits"][0]["highlights"][0]["matched_tokens"].size());
    ASSERT_STREQ("amazing", results["hits"][0]["highlights"][0]["matched_tokens"][0][0].get<std::string>().c_str());

    ASSERT_EQ(3, results["hits"][0]["highlights"][1].size());
    ASSERT_STREQ(results["hits"][0]["highlights"][1]["field"].get<std::string>().c_str(), "title");
    ASSERT_STREQ(results["hits"][0]["highlights"][1]["snippet"].get<std::string>().c_str(),
                 "<mark>Amazing</mark> Spiderman is <mark>amazing</mark>"); // should highlight duplicating tokens

    ASSERT_EQ(2, results["hits"][0]["highlights"][1]["matched_tokens"].size());
    ASSERT_STREQ("Amazing", results["hits"][0]["highlights"][1]["matched_tokens"][0].get<std::string>().c_str());
    ASSERT_STREQ("amazing", results["hits"][0]["highlights"][1]["matched_tokens"][1].get<std::string>().c_str());

    // when query tokens are not found in an array field they should be ignored
    results = coll_array_text->search("winds", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY,
                                      {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(1, results["hits"][0]["highlights"].size());

    collectionManager.drop_collection("coll_array_text");
}
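
// Highlight payload shape, as asserted above: a scalar string field carries a single "snippet"
// plus flat "matched_tokens", while an array field carries parallel "snippets", "indices"
// (the matching array positions) and per-element "matched_tokens" lists.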

TEST_F(CollectionTest, MultipleFields) {
    Collection *coll_mul_fields;

    std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
    std::vector<field> fields = {
        field("title", field_types::STRING, false),
        field("starring", field_types::STRING, false),
        field("starring_facet", field_types::STRING, true),
        field("cast", field_types::STRING_ARRAY, false),
        field("points", field_types::INT32, false)
    };

    coll_mul_fields = collectionManager.get_collection("coll_mul_fields").get();
    if(coll_mul_fields == nullptr) {
        coll_mul_fields = collectionManager.create_collection("coll_mul_fields", 4, fields, "points").get();
    }

    std::string json_line;

    while (std::getline(infile, json_line)) {
        coll_mul_fields->add(json_line);
    }

    infile.close();

    query_fields = {"title", "starring"};
    std::vector<std::string> facets;

    nlohmann::json results = coll_mul_fields->search("Will", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(4, results["hits"].size());

    std::vector<std::string> ids = {"3", "2", "1", "0"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // when "starring" takes higher priority than "title"

    query_fields = {"starring", "title"};
    results = coll_mul_fields->search("thomas", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false},
                                      10, spp::sparse_hash_set<std::string>(),
                                      spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                      "<mark>", "</mark>", {2, 1}).get();
    ASSERT_EQ(4, results["hits"].size());

    ids = {"15", "12", "13", "14"};

    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    query_fields = {"starring", "title", "cast"};
    results = coll_mul_fields->search("ben affleck", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());

    query_fields = {"cast"};
    results = coll_mul_fields->search("chris", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(3, results["hits"].size());

    ids = {"6", "1", "7"};
    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    query_fields = {"cast"};
    results = coll_mul_fields->search("chris pine", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());

    ids = {"7"};
    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // filtering on unfaceted multi-valued string field
    query_fields = {"title"};
    results = coll_mul_fields->search("captain", query_fields, "cast: chris", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ids = {"6"};
    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    // when a token exists in multiple fields of the same document, document and facet should be returned only once
    query_fields = {"starring", "title", "cast"};
    facets = {"starring_facet"};

    results = coll_mul_fields->search("myers", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ids = {"17"};
    for(size_t i = 0; i < results["hits"].size(); i++) {
        nlohmann::json result = results["hits"].at(i);
        std::string result_id = result["document"]["id"];
        std::string id = ids.at(i);
        ASSERT_STREQ(id.c_str(), result_id.c_str());
    }

    ASSERT_EQ(1, results["facet_counts"].size());
    ASSERT_STREQ("starring_facet", results["facet_counts"][0]["field_name"].get<std::string>().c_str());
    size_t facet_count = results["facet_counts"][0]["counts"][0]["count"];
    ASSERT_EQ(1, facet_count);

    collectionManager.drop_collection("coll_mul_fields");
}

TEST_F(CollectionTest, KeywordQueryReturnsResultsBasedOnPerPageParam) {
    Collection *coll_mul_fields;

    std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
    std::vector<field> fields = {
        field("title", field_types::STRING, false),
        field("starring", field_types::STRING, false),
        field("starring_facet", field_types::STRING, true),
        field("cast", field_types::STRING_ARRAY, false),
        field("points", field_types::INT32, false)
    };

    coll_mul_fields = collectionManager.get_collection("coll_mul_fields").get();
    if(coll_mul_fields == nullptr) {
        coll_mul_fields = collectionManager.create_collection("coll_mul_fields", 4, fields, "points").get();
    }

    std::string json_line;

    while (std::getline(infile, json_line)) {
        coll_mul_fields->add(json_line);
    }

    infile.close();

    query_fields = {"title", "starring"};
    std::vector<std::string> facets;

    spp::sparse_hash_set<std::string> empty;
    nlohmann::json results = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 3, 1,
                                                     FREQUENCY, {true}, 1000, empty, empty, 10).get();

    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(6, results["found"].get<int>());

    // cannot fetch more than the built-in limit of 250 hits per page
    auto res_op = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 251, 1,
                                          FREQUENCY, {true}, 1000, empty, empty, 10);
    ASSERT_FALSE(res_op.ok());
    ASSERT_EQ(422, res_op.code());
    ASSERT_STREQ("Only upto 250 hits can be fetched per page.", res_op.error().c_str());

    // when page number is zero, use the first page
    results = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 3, 0,
                                      FREQUENCY, {true}, 1000, empty, empty, 10).get();
    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(6, results["found"].get<int>());

    // do pagination

    results = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 3, 1,
                                      FREQUENCY, {true}, 1000, empty, empty, 10).get();

    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(6, results["found"].get<int>());

    results = coll_mul_fields->search("w", query_fields, "", facets, sort_fields, {0}, 3, 2,
                                      FREQUENCY, {true}, 1000, empty, empty, 10).get();

    ASSERT_EQ(3, results["hits"].size());
    ASSERT_EQ(6, results["found"].get<int>());

    collectionManager.drop_collection("coll_mul_fields");
}

std::vector<nlohmann::json> import_res_to_json(const std::vector<std::string>& imported_results) {
    std::vector<nlohmann::json> out;

    for(const auto& imported_result: imported_results) {
        out.emplace_back(nlohmann::json::parse(imported_result));
    }

    return out;
}
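
// Note on the import tests below: add_many() writes a per-record JSON result back into the
// input vector (e.g. {"success": true} or {"success": false, "error": "...", "code": ...}),
// which is why each test re-parses the same vector via import_res_to_json() after the call.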

TEST_F(CollectionTest, ImportDocumentsUpsert) {
    Collection *coll_mul_fields;

    std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
    std::stringstream strstream;
    strstream << infile.rdbuf();
    infile.close();

    std::vector<std::string> import_records;
    StringUtils::split(strstream.str(), import_records, "\n");

    std::vector<field> fields = {
        field("title", field_types::STRING, false),
        field("starring", field_types::STRING, true),
        field("cast", field_types::STRING_ARRAY, false),
        field("points", field_types::INT32, false)
    };

    coll_mul_fields = collectionManager.get_collection("coll_mul_fields").get();
    if(coll_mul_fields == nullptr) {
        coll_mul_fields = collectionManager.create_collection("coll_mul_fields", 1, fields, "points").get();
    }

    // try importing records
    nlohmann::json document;
    nlohmann::json import_response = coll_mul_fields->add_many(import_records, document);
    ASSERT_TRUE(import_response["success"].get<bool>());
    ASSERT_EQ(18, import_response["num_imported"].get<int>());

    // try searching with filter
    auto results = coll_mul_fields->search("*", query_fields, "starring:= [Will Ferrell]", {"starring"}, sort_fields, {0}, 30, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(2, results["hits"].size());

    // update existing record verbatim
    std::vector<std::string> existing_records = {R"({"id": "0", "title": "Wake Up, Ron Burgundy: The Lost Movie"})"};
    import_response = coll_mul_fields->add_many(existing_records, document, UPDATE);
    ASSERT_TRUE(import_response["success"].get<bool>());
    ASSERT_EQ(1, import_response["num_imported"].get<int>());

    // update + upsert records
    std::vector<std::string> more_records = {R"({"id": "0", "title": "The Fifth Harry", "starring": "Will Ferrell", "points":62, "cast":["Adam McKay","Steve Carell","Paul Rudd"]})",
                                             R"({"id": "2", "cast": ["Chris Fisher", "Rand Alan"], "points":81, "starring":"Daniel Day-Lewis","title":"There Will Be Blood"})",
                                             R"({"id": "18", "title": "Back Again Forest", "points": 45, "starring": "Ronald Wells", "cast": ["Dant Saren"]})",
                                             R"({"id": "6", "points": 77, "cast":["Chris Evans","Scarlett Johansson"], "starring":"Samuel L. Jackson","title":"Captain America: The Winter Soldier"})"};

    import_response = coll_mul_fields->add_many(more_records, document, UPSERT);

    ASSERT_TRUE(import_response["success"].get<bool>());
    ASSERT_EQ(4, import_response["num_imported"].get<int>());

    std::vector<nlohmann::json> import_results = import_res_to_json(more_records);
    ASSERT_EQ(4, import_results.size());

    for(size_t i=0; i<4; i++) {
        ASSERT_TRUE(import_results[i]["success"].get<bool>());
        ASSERT_EQ(1, import_results[i].size());
    }

    // try with filters again
    results = coll_mul_fields->search("*", query_fields, "starring:= [Will Ferrell]", {"starring"}, sort_fields, {0}, 30, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(2, results["hits"].size());

    results = coll_mul_fields->search("*", query_fields, "", {"starring"}, sort_fields, {0}, 30, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(19, results["hits"].size());
    ASSERT_EQ(19, coll_mul_fields->get_num_documents());

    results = coll_mul_fields->search("back again forest", query_fields, "", {"starring"}, sort_fields, {0}, 30, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());

    ASSERT_STREQ("Back Again Forest", coll_mul_fields->get("18").get()["title"].get<std::string>().c_str());

    results = coll_mul_fields->search("fifth", query_fields, "", {"starring"}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(2, results["hits"].size());

    ASSERT_STREQ("The <mark>Fifth</mark> Harry", results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
    ASSERT_STREQ("The Woman in the <mark>Fifth</mark> from Kristin", results["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());

    results = coll_mul_fields->search("burgundy", query_fields, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, results["hits"].size());

    results = coll_mul_fields->search("harry", query_fields, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());

    results = coll_mul_fields->search("captain america", query_fields, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ(77, results["hits"][0]["document"]["points"].get<size_t>());

    // upserting with some bad docs
    more_records = {R"({"id": "1", "title": "Wake up, Harry", "cast":["Josh Lawson","Chris Parnell"],"points":63,"starring":"Will Ferrell"})",
                    R"({"id": "90", "cast": ["Kim Werrel", "Random Wake"]})", // missing fields
                    R"({"id": "5", "points": 60, "cast":["Logan Lerman","Alexandra Daddario"],"starring":"Ron Perlman","starring_facet":"Ron Perlman","title":"Percy Jackson: Sea of Monsters"})",
                    R"({"id": "24", "starring": "John", "cast": ["John Kim"], "points": 11})"}; // missing fields

    bool return_id = true;
    import_response = coll_mul_fields->add_many(more_records, document, UPSERT, "",
                                                DIRTY_VALUES::COERCE_OR_REJECT, false, return_id);

    ASSERT_FALSE(import_response["success"].get<bool>());
    ASSERT_EQ(2, import_response["num_imported"].get<int>());

    import_results = import_res_to_json(more_records);
    ASSERT_FALSE(import_results[1]["success"].get<bool>());
    ASSERT_FALSE(import_results[3]["success"].get<bool>());
    ASSERT_STREQ("Field `points` has been declared as a default sorting field, but is not found in the document.", import_results[1]["error"].get<std::string>().c_str());
    ASSERT_STREQ("Field `title` has been declared in the schema, but is not found in the document.", import_results[3]["error"].get<std::string>().c_str());

    ASSERT_EQ("1", import_results[0]["id"].get<std::string>());
    ASSERT_EQ("90", import_results[1]["id"].get<std::string>());
    ASSERT_EQ("5", import_results[2]["id"].get<std::string>());
    ASSERT_EQ("24", import_results[3]["id"].get<std::string>());

    // try to duplicate records without upsert option

    more_records = {R"({"id": "1", "title": "Wake up, Harry"})",
                    R"({"id": "5", "points": 60})"};

    import_response = coll_mul_fields->add_many(more_records, document, CREATE, "",
                                                DIRTY_VALUES::COERCE_OR_REJECT, false);
    ASSERT_FALSE(import_response["success"].get<bool>());
    ASSERT_EQ(0, import_response["num_imported"].get<int>());

    import_results = import_res_to_json(more_records);
    ASSERT_FALSE(import_results[0]["success"].get<bool>());
    ASSERT_FALSE(import_results[1]["success"].get<bool>());
    ASSERT_STREQ("A document with id 1 already exists.", import_results[0]["error"].get<std::string>().c_str());
    ASSERT_STREQ("A document with id 5 already exists.", import_results[1]["error"].get<std::string>().c_str());

    // doc should not be returned, since return_doc = false
    ASSERT_FALSE(import_results[0].contains("document"));

    // update document with verbatim fields, except for points
    more_records = {R"({"id": "3", "cast":["Matt Damon","Ben Affleck","Minnie Driver"],
                        "points":70,"starring":"Robin Williams","starring_facet":"Robin Williams",
                        "title":"Good Will Hunting"})"};

    import_response = coll_mul_fields->add_many(more_records, document, UPDATE);
    ASSERT_TRUE(import_response["success"].get<bool>());
    ASSERT_EQ(1, import_response["num_imported"].get<int>());

    results = coll_mul_fields->search("Good Will Hunting", query_fields, "", {"starring"}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(70, results["hits"][0]["document"]["points"].get<uint32_t>());

    // updating a document that does not exist should fail, others should succeed
    more_records = {R"({"id": "20", "points": 51})",
                    R"({"id": "1", "points": 64})"};

    import_response = coll_mul_fields->add_many(more_records, document, UPDATE);
    ASSERT_FALSE(import_response["success"].get<bool>());
    ASSERT_EQ(1, import_response["num_imported"].get<int>());

    import_results = import_res_to_json(more_records);
    ASSERT_FALSE(import_results[0]["success"].get<bool>());
    ASSERT_TRUE(import_results[1]["success"].get<bool>());
    ASSERT_STREQ("Could not find a document with id: 20", import_results[0]["error"].get<std::string>().c_str());
    ASSERT_EQ(404, import_results[0]["code"].get<size_t>());

    results = coll_mul_fields->search("wake up harry", query_fields, "", {"starring"}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(64, results["hits"][0]["document"]["points"].get<uint32_t>());

    // trying to create documents with existing IDs should fail
    more_records = {R"({"id": "2", "points": 51})",
                    R"({"id": "1", "points": 64})"};

    import_response = coll_mul_fields->add_many(more_records, document, CREATE);
    ASSERT_FALSE(import_response["success"].get<bool>());
    ASSERT_EQ(0, import_response["num_imported"].get<int>());

    import_results = import_res_to_json(more_records);
    ASSERT_FALSE(import_results[0]["success"].get<bool>());
    ASSERT_FALSE(import_results[1]["success"].get<bool>());
    ASSERT_STREQ("A document with id 2 already exists.", import_results[0]["error"].get<std::string>().c_str());
    ASSERT_STREQ("A document with id 1 already exists.", import_results[1]["error"].get<std::string>().c_str());

    ASSERT_EQ(409, import_results[0]["code"].get<size_t>());
    ASSERT_EQ(409, import_results[1]["code"].get<size_t>());
}
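
// Index-action semantics exercised here and in the next test: CREATE rejects existing ids (409),
// UPDATE requires an existing id (404 otherwise), UPSERT creates or replaces whole documents,
// and EMPLACE creates new documents or partially updates existing ones (missing fields such as
// "points" are retained).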
|
|
|
|
TEST_F(CollectionTest, ImportDocumentsEmplace) {
|
|
Collection* coll1;
|
|
std::vector<field> fields = {
|
|
field("title", field_types::STRING, false, false),
|
|
field("points", field_types::INT32, false, false)
|
|
};
|
|
|
|
coll1 = collectionManager.get_collection("coll1").get();
|
|
if (coll1 == nullptr) {
|
|
coll1 = collectionManager.create_collection("coll1", 4, fields).get();
|
|
}
|
|
|
|
nlohmann::json document;
|
|
std::vector<std::string> records = {R"({"id": "0", "title": "The Matrix", "points":0})",
|
|
R"({"id": "1", "title": "Inception", "points":1})"};
|
|
std::vector<nlohmann::json> docs = import_res_to_json(records);
|
|
|
|
// use `emplace` mode for creating documents
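    // (EMPLACE behaves as create-or-update: new ids are created, existing ids are
    // updated in place; the two trailing booleans passed here are presumably the
    // return_doc/return_id options, which is why each result below carries both a
    // `document` and an `id` key)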
|
|
auto import_response = coll1->add_many(records, document, EMPLACE, "", DIRTY_VALUES::COERCE_OR_REJECT, true, true);
|
|
|
|
ASSERT_TRUE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(2, import_response["num_imported"].get<int>());
|
|
|
|
std::vector<nlohmann::json> import_results = import_res_to_json(records);
|
|
ASSERT_EQ(2, import_results.size());
|
|
|
|
for (size_t i = 0; i < 2; i++) {
|
|
ASSERT_TRUE(import_results[i]["success"].get<bool>());
|
|
ASSERT_EQ(3, import_results[i].size());
|
|
ASSERT_EQ(docs[i], import_results[i]["document"]);
|
|
ASSERT_EQ(docs[i]["id"], import_results[i]["id"]);
|
|
}
|
|
|
|
auto res = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10).get();
|
|
ASSERT_EQ(2, res["found"].get<size_t>());
|
|
|
|
// emplace both update + create
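    // id "1" already exists (updated in place) while id "2" is new (created); since
    // neither return_doc nor return_id is requested here, each import result should
    // contain only the `success` key (hence the size() == 1 assertions below)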
|
|
records = {R"({"id": "1", "title": "The Inception"})",
|
|
R"({"id": "2", "title": "Spiderman", "points":2})"};
|
|
|
|
import_response = coll1->add_many(records, document, EMPLACE);
|
|
|
|
ASSERT_TRUE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(2, import_response["num_imported"].get<int>());
|
|
|
|
import_results = import_res_to_json(records);
|
|
ASSERT_EQ(2, import_results.size());
|
|
|
|
for (size_t i = 0; i < 2; i++) {
|
|
ASSERT_TRUE(import_results[i]["success"].get<bool>());
|
|
ASSERT_EQ(1, import_results[i].size());
|
|
}
|
|
|
|
res = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10).get();
|
|
ASSERT_EQ(3, res["found"].get<size_t>());
|
|
|
|
ASSERT_EQ("2", res["hits"][0]["document"]["id"].get<std::string>());
|
|
ASSERT_EQ(2, res["hits"][0]["document"]["points"].get<size_t>());
|
|
|
|
ASSERT_EQ("1", res["hits"][1]["document"]["id"].get<std::string>());
|
|
ASSERT_EQ(1, res["hits"][1]["document"]["points"].get<size_t>());
|
|
ASSERT_EQ("The Inception", res["hits"][1]["document"]["title"].get<std::string>());
|
|
|
|
ASSERT_EQ("0", res["hits"][2]["document"]["id"].get<std::string>());
|
|
ASSERT_EQ(0, res["hits"][2]["document"]["points"].get<size_t>());
|
|
|
|
// emplace with an error due to bad data
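    // "abcd" cannot be coerced into the int32 `points` field, so the first record
    // fails while the second is still imported independently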
|
|
records = {R"({"id": "2", "points": "abcd"})",
|
|
R"({"id": "3", "title": "Superman", "points":3})"};
|
|
|
|
import_response = coll1->add_many(records, document, EMPLACE);
|
|
|
|
ASSERT_FALSE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(1, import_response["num_imported"].get<int>());
|
|
|
|
import_results = import_res_to_json(records);
|
|
|
|
ASSERT_EQ(2, import_results.size());
|
|
|
|
ASSERT_FALSE(import_results[0]["success"].get<bool>());
|
|
|
|
    ASSERT_TRUE(import_results[1]["success"].get<bool>());
    ASSERT_EQ(1, import_results[1].size());
|
|
|
|
    // can update an individual document via "emplace" with a partial document (missing points)
|
|
std::string doc_3_update = R"({"id": "3", "title": "The Superman"})";
|
|
auto add_op = coll1->add(doc_3_update, EMPLACE);
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
res = coll1->search("superman", {"title"}, "", {}, {}, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10).get();
|
|
ASSERT_EQ(1, res["found"].get<size_t>());
|
|
|
|
ASSERT_EQ("3", res["hits"][0]["document"]["id"].get<std::string>());
|
|
ASSERT_EQ(3, res["hits"][0]["document"]["points"].get<size_t>());
|
|
ASSERT_EQ("The Superman", res["hits"][0]["document"]["title"].get<std::string>());
|
|
|
|
// can create individual document via "emplace"
|
|
std::string doc_4_create = R"({"id": "4", "title": "The Avengers", "points": 4})";
|
|
add_op = coll1->add(doc_4_create, EMPLACE);
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
res = coll1->search("*", {}, "", {}, {}, {0}, 10, 1, token_ordering::FREQUENCY, {true}, 10).get();
|
|
ASSERT_EQ(5, res["found"].get<size_t>());
|
|
}
|
|
|
|
TEST_F(CollectionTest, DISABLED_CrashTroubleshooting) {
|
|
Collection *coll1;
|
|
std::vector<field> fields = {
|
|
field("title", field_types::STRING_ARRAY, false, true),
|
|
field("points", field_types::INT32, false)
|
|
};
|
|
|
|
coll1 = collectionManager.get_collection("coll1").get();
|
|
if(coll1 == nullptr) {
|
|
coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
|
|
}
|
|
|
|
std::ifstream create_file("/tmp/create.jsonl");
|
|
std::string json_line;
|
|
std::vector<std::string> create_records;
|
|
|
|
while (std::getline(create_file, json_line)) {
|
|
create_records.push_back(json_line);
|
|
}
|
|
|
|
create_file.close();
|
|
|
|
nlohmann::json document;
|
|
auto import_response = coll1->add_many(create_records, document, CREATE);
|
|
|
|
ASSERT_TRUE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
|
|
|
|
// now try to upsert
|
|
|
|
std::ifstream upsert_file("/tmp/upsert.jsonl");
|
|
std::vector<std::string> upsert_records;
|
|
|
|
while (std::getline(upsert_file, json_line)) {
|
|
upsert_records.push_back(json_line);
|
|
}
|
|
|
|
upsert_file.close();
|
|
|
|
import_response = coll1->add_many(upsert_records, document, UPSERT);
|
|
|
|
ASSERT_TRUE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
|
|
}
|
|
|
|
TEST_F(CollectionTest, ImportDocumentsUpsertOptional) {
|
|
Collection *coll1;
|
|
std::vector<field> fields = {
|
|
field("title", field_types::STRING_ARRAY, false, true),
|
|
field("points", field_types::INT32, false)
|
|
};
|
|
|
|
coll1 = collectionManager.get_collection("coll1").get();
|
|
if(coll1 == nullptr) {
|
|
coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
|
|
}
|
|
|
|
std::vector<std::string> records;
|
|
|
|
size_t NUM_RECORDS = 1000;
|
|
|
|
for(size_t i=0; i<NUM_RECORDS; i++) {
|
|
nlohmann::json doc;
|
|
doc["id"] = std::to_string(i);
|
|
doc["points"] = i;
|
|
records.push_back(doc.dump());
|
|
}
|
|
|
|
// import records without title
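    // (`title` is declared optional in the schema above, so records lacking it are
    // still accepted)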
|
|
|
|
nlohmann::json document;
|
|
nlohmann::json import_response = coll1->add_many(records, document, CREATE);
|
|
ASSERT_TRUE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
|
|
|
|
// upsert documents with title
|
|
|
|
records.clear();
|
|
|
|
for(size_t i=0; i<NUM_RECORDS; i++) {
|
|
nlohmann::json updoc;
|
|
updoc["id"] = std::to_string(i);
|
|
updoc["points"] = i;
|
|
updoc["title"] = {
|
|
get_text(10),
|
|
get_text(10),
|
|
get_text(10),
|
|
get_text(10),
|
|
};
|
|
records.push_back(updoc.dump());
|
|
}
|
|
|
|
auto begin = std::chrono::high_resolution_clock::now();
|
|
import_response = coll1->add_many(records, document, UPSERT);
|
|
auto time_micros = std::chrono::duration_cast<std::chrono::microseconds>(
|
|
std::chrono::high_resolution_clock::now() - begin).count();
|
|
|
|
//LOG(INFO) << "Time taken for first upsert: " << time_micros;
|
|
|
|
ASSERT_TRUE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
|
|
|
|
// run upsert again with title override
|
|
|
|
records.clear();
|
|
|
|
for(size_t i=0; i<NUM_RECORDS; i++) {
|
|
nlohmann::json updoc;
|
|
updoc["id"] = std::to_string(i);
|
|
updoc["points"] = i;
|
|
updoc["title"] = {
|
|
get_text(10),
|
|
get_text(10),
|
|
get_text(10),
|
|
get_text(10),
|
|
};
|
|
records.push_back(updoc.dump());
|
|
}
|
|
|
|
begin = std::chrono::high_resolution_clock::now();
|
|
import_response = coll1->add_many(records, document, UPSERT);
|
|
time_micros = std::chrono::duration_cast<std::chrono::microseconds>(
|
|
std::chrono::high_resolution_clock::now() - begin).count();
|
|
|
|
//LOG(INFO) << "Time taken for second upsert: " << time_micros;
|
|
|
|
ASSERT_TRUE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
|
|
|
|
// update records (can contain partial fields)
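    // unlike UPSERT, an UPDATE merges the partial document into the stored one, so
    // the omitted `points` values should remain intact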
|
|
|
|
records.clear();
|
|
|
|
for(size_t i=0; i<NUM_RECORDS; i++) {
|
|
nlohmann::json updoc;
|
|
updoc["id"] = std::to_string(i);
|
|
// no points field
|
|
updoc["title"] = {
|
|
get_text(10),
|
|
get_text(10),
|
|
get_text(10),
|
|
get_text(10),
|
|
};
|
|
records.push_back(updoc.dump());
|
|
}
|
|
|
|
import_response = coll1->add_many(records, document, UPDATE);
|
|
ASSERT_TRUE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(1000, import_response["num_imported"].get<int>());
|
|
}
|
|
|
|
TEST_F(CollectionTest, ImportDocuments) {
|
|
Collection *coll_mul_fields;
|
|
|
|
std::ifstream infile(std::string(ROOT_DIR)+"test/multi_field_documents.jsonl");
|
|
std::stringstream strstream;
|
|
strstream << infile.rdbuf();
|
|
infile.close();
|
|
|
|
std::vector<std::string> import_records;
|
|
StringUtils::split(strstream.str(), import_records, "\n");
|
|
|
|
std::vector<field> fields = {
|
|
field("title", field_types::STRING, false),
|
|
field("starring", field_types::STRING, false),
|
|
field("cast", field_types::STRING_ARRAY, false),
|
|
field("points", field_types::INT32, false)
|
|
};
|
|
|
|
coll_mul_fields = collectionManager.get_collection("coll_mul_fields").get();
|
|
if(coll_mul_fields == nullptr) {
|
|
coll_mul_fields = collectionManager.create_collection("coll_mul_fields", 4, fields, "points").get();
|
|
}
|
|
|
|
// try importing records
|
|
nlohmann::json document;
|
|
nlohmann::json import_response = coll_mul_fields->add_many(import_records, document);
|
|
ASSERT_TRUE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(18, import_response["num_imported"].get<int>());
|
|
|
|
// now try searching for records
|
|
|
|
query_fields = {"title", "starring"};
|
|
std::vector<std::string> facets;
|
|
|
|
auto x = coll_mul_fields->search("Will", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false});
|
|
|
|
nlohmann::json results = coll_mul_fields->search("Will", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
|
|
ASSERT_EQ(4, results["hits"].size());
|
|
|
|
std::vector<std::string> ids = {"3", "2", "1", "0"};
|
|
|
|
for(size_t i = 0; i < results["hits"].size(); i++) {
|
|
nlohmann::json result = results["hits"].at(i);
|
|
std::string result_id = result["document"]["id"];
|
|
std::string id = ids.at(i);
|
|
ASSERT_STREQ(id.c_str(), result_id.c_str());
|
|
}
|
|
|
|
// verify that empty import is handled gracefully
|
|
std::vector<std::string> empty_records;
|
|
import_response = coll_mul_fields->add_many(empty_records, document);
|
|
ASSERT_TRUE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(0, import_response["num_imported"].get<int>());
|
|
|
|
    // verify that only the bad records are rejected and the rest are imported (records 2 and 4 are bad)
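    // record 2 has a non-string `title` (not coerced under DIRTY_VALUES::REJECT)
    // and record 4 is missing the required `starring` field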
|
|
std::vector<std::string> more_records = {"{\"id\": \"id1\", \"title\": \"Test1\", \"starring\": \"Rand Fish\", \"points\": 12, "
|
|
"\"cast\": [\"Tom Skerritt\"] }",
|
|
"{\"title\": 123, \"starring\": \"Jazz Gosh\", \"points\": 23, "
|
|
"\"cast\": [\"Tom Skerritt\"] }",
|
|
"{\"title\": \"Test3\", \"starring\": \"Brad Fin\", \"points\": 11, "
|
|
"\"cast\": [\"Tom Skerritt\"] }",
|
|
"{\"title\": \"Test4\", \"points\": 55, "
|
|
"\"cast\": [\"Tom Skerritt\"] }"};
|
|
|
|
import_response = coll_mul_fields->add_many(more_records, document, CREATE, "", DIRTY_VALUES::REJECT, true);
|
|
ASSERT_FALSE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(2, import_response["num_imported"].get<int>());
|
|
|
|
std::vector<nlohmann::json> import_results = import_res_to_json(more_records);
|
|
|
|
ASSERT_EQ(4, import_results.size());
|
|
ASSERT_TRUE(import_results[0]["success"].get<bool>());
|
|
ASSERT_FALSE(import_results[1]["success"].get<bool>());
|
|
ASSERT_TRUE(import_results[2]["success"].get<bool>());
|
|
ASSERT_FALSE(import_results[3]["success"].get<bool>());
|
|
|
|
ASSERT_STREQ("Field `title` must be a string.", import_results[1]["error"].get<std::string>().c_str());
|
|
ASSERT_STREQ("Field `starring` has been declared in the schema, but is not found in the document.",
|
|
import_results[3]["error"].get<std::string>().c_str());
|
|
ASSERT_STREQ("{\"title\": 123, \"starring\": \"Jazz Gosh\", \"points\": 23, \"cast\": [\"Tom Skerritt\"] }",
|
|
import_results[1]["document"].get<std::string>().c_str());
|
|
|
|
    // records with a duplicate id (id1 was already imported above)
|
|
|
|
more_records = {"{\"id\": \"id2\", \"title\": \"Test1\", \"starring\": \"Rand Fish\", \"points\": 12, "
|
|
"\"cast\": [\"Tom Skerritt\"] }",
|
|
"{\"id\": \"id1\", \"title\": \"Test1\", \"starring\": \"Rand Fish\", \"points\": 12, "
|
|
"\"cast\": [\"Tom Skerritt\"] }"};
|
|
|
|
import_response = coll_mul_fields->add_many(more_records, document, CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT, true);
|
|
|
|
ASSERT_FALSE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(1, import_response["num_imported"].get<int>());
|
|
|
|
import_results = import_res_to_json(more_records);
|
|
ASSERT_EQ(2, import_results.size());
|
|
ASSERT_TRUE(import_results[0]["success"].get<bool>());
|
|
ASSERT_FALSE(import_results[1]["success"].get<bool>());
|
|
|
|
ASSERT_STREQ("A document with id id1 already exists.", import_results[1]["error"].get<std::string>().c_str());
|
|
ASSERT_STREQ("{\"id\": \"id1\", \"title\": \"Test1\", \"starring\": \"Rand Fish\", \"points\": 12, "
|
|
"\"cast\": [\"Tom Skerritt\"] }",import_results[1]["document"].get<std::string>().c_str());
|
|
|
|
// handle bad import json
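    // failures are reported per record: each result carries its own error message
    // and echoes the offending raw line back under the `document` key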
|
|
|
|
// valid JSON but not a document
|
|
more_records = {"[]"};
|
|
import_response = coll_mul_fields->add_many(more_records, document, CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT, true);
|
|
|
|
ASSERT_FALSE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(0, import_response["num_imported"].get<int>());
|
|
|
|
import_results = import_res_to_json(more_records);
|
|
ASSERT_EQ(1, import_results.size());
|
|
|
|
ASSERT_EQ(false, import_results[0]["success"].get<bool>());
|
|
ASSERT_STREQ("Bad JSON: not a properly formed document.", import_results[0]["error"].get<std::string>().c_str());
|
|
ASSERT_STREQ("[]", import_results[0]["document"].get<std::string>().c_str());
|
|
|
|
// invalid JSON
|
|
more_records = {"{"};
|
|
import_response = coll_mul_fields->add_many(more_records, document, CREATE, "", DIRTY_VALUES::COERCE_OR_REJECT, true);
|
|
|
|
ASSERT_FALSE(import_response["success"].get<bool>());
|
|
ASSERT_EQ(0, import_response["num_imported"].get<int>());
|
|
|
|
import_results = import_res_to_json(more_records);
|
|
ASSERT_EQ(1, import_results.size());
|
|
|
|
ASSERT_EQ(false, import_results[0]["success"].get<bool>());
|
|
ASSERT_STREQ("Bad JSON: [json.exception.parse_error.101] parse error at line 1, column 2: syntax error "
|
|
"while parsing object key - unexpected end of input; expected string literal",
|
|
import_results[0]["error"].get<std::string>().c_str());
|
|
ASSERT_STREQ("{", import_results[0]["document"].get<std::string>().c_str());
|
|
|
|
collectionManager.drop_collection("coll_mul_fields");
|
|
}
|
|
|
|
TEST_F(CollectionTest, SearchingWithMissingFields) {
|
|
// return error without crashing when searching for fields that do not conform to the schema
|
|
Collection *coll_array_fields;
|
|
|
|
std::ifstream infile(std::string(ROOT_DIR)+"test/numeric_array_documents.jsonl");
|
|
std::vector<field> fields = {field("name", field_types::STRING, false),
|
|
field("age", field_types::INT32, false),
|
|
field("years", field_types::INT32_ARRAY, false),
|
|
field("timestamps", field_types::INT64_ARRAY, false),
|
|
field("tags", field_types::STRING_ARRAY, true)};
|
|
|
|
std::vector<sort_by> sort_fields = { sort_by("age", "DESC") };
|
|
|
|
coll_array_fields = collectionManager.get_collection("coll_array_fields").get();
|
|
if(coll_array_fields == nullptr) {
|
|
coll_array_fields = collectionManager.create_collection("coll_array_fields", 4, fields, "age").get();
|
|
}
|
|
|
|
std::string json_line;
|
|
|
|
while (std::getline(infile, json_line)) {
|
|
coll_array_fields->add(json_line);
|
|
}
|
|
|
|
infile.close();
|
|
|
|
    // when a queried field does not exist in the schema
|
|
std::vector<std::string> facets;
|
|
std::vector<std::string> query_fields_not_found = {"titlez"};
|
|
|
|
Option<nlohmann::json> res_op = coll_array_fields->search("the", query_fields_not_found, "", facets, sort_fields, {0}, 10);
|
|
ASSERT_FALSE(res_op.ok());
|
|
ASSERT_EQ(404, res_op.code());
|
|
ASSERT_STREQ("Could not find a field named `titlez` in the schema.", res_op.error().c_str());
|
|
|
|
// when a query field is an integer field
|
|
res_op = coll_array_fields->search("the", {"age"}, "", facets, sort_fields, {0}, 10);
|
|
ASSERT_EQ(400, res_op.code());
|
|
ASSERT_STREQ("Field `age` should be a string or a string array.", res_op.error().c_str());
|
|
|
|
    // when faceting on a field that is not facet-enabled in the schema
|
|
res_op = coll_array_fields->search("the", {"name"}, "", {"timestamps"}, sort_fields, {0}, 10);
|
|
ASSERT_EQ(404, res_op.code());
|
|
ASSERT_STREQ("Could not find a facet field named `timestamps` in the schema.", res_op.error().c_str());
|
|
|
|
    // when a field used for sorting is not defined for sorting in the schema
|
|
res_op = coll_array_fields->search("the", {"name"}, "", {}, { sort_by("timestamps", "ASC") }, {0}, 10);
|
|
ASSERT_EQ(404, res_op.code());
|
|
ASSERT_STREQ("Could not find a field named `timestamps` in the schema for sorting.", res_op.error().c_str());
|
|
|
|
res_op = coll_array_fields->search("the", {"name"}, "", {}, { sort_by("_rank", "ASC") }, {0}, 10);
|
|
ASSERT_EQ(404, res_op.code());
|
|
ASSERT_STREQ("Could not find a field named `_rank` in the schema for sorting.", res_op.error().c_str());
|
|
|
|
collectionManager.drop_collection("coll_array_fields");
|
|
}
|
|
|
|
TEST_F(CollectionTest, IndexingWithBadData) {
|
|
// should not crash when document to-be-indexed doesn't match schema
|
|
Collection *sample_collection;
|
|
|
|
std::vector<field> fields = {field("name", field_types::STRING, false),
|
|
field("tags", field_types::STRING_ARRAY, true),
|
|
field("age", field_types::INT32, false),
|
|
field("average", field_types::INT32, false) };
|
|
|
|
std::vector<sort_by> sort_fields = { sort_by("age", "DESC"), sort_by("average", "DESC") };
|
|
|
|
sample_collection = collectionManager.get_collection("sample_collection").get();
|
|
if(sample_collection == nullptr) {
|
|
sample_collection = collectionManager.create_collection("sample_collection", 4, fields, "age").get();
|
|
}
|
|
|
|
const Option<nlohmann::json> & search_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 29, \"average\": 78}");
|
|
ASSERT_FALSE(search_fields_missing_op1.ok());
|
|
ASSERT_STREQ("Field `tags` has been declared in the schema, but is not found in the document.",
|
|
search_fields_missing_op1.error().c_str());
|
|
|
|
const Option<nlohmann::json> & search_fields_missing_op2 = sample_collection->add("{\"namez\": \"foo\", \"tags\": [], \"age\": 34, \"average\": 78}");
|
|
ASSERT_FALSE(search_fields_missing_op2.ok());
|
|
ASSERT_STREQ("Field `name` has been declared in the schema, but is not found in the document.",
|
|
search_fields_missing_op2.error().c_str());
|
|
|
|
const Option<nlohmann::json> & facet_fields_missing_op1 = sample_collection->add("{\"name\": \"foo\", \"age\": 34, \"average\": 78}");
|
|
ASSERT_FALSE(facet_fields_missing_op1.ok());
|
|
ASSERT_STREQ("Field `tags` has been declared in the schema, but is not found in the document.",
|
|
facet_fields_missing_op1.error().c_str());
|
|
|
|
const char *doc_str = "{\"name\": \"foo\", \"age\": 34, \"avg\": 78, \"tags\": [\"red\", \"blue\"]}";
|
|
const Option<nlohmann::json> & sort_fields_missing_op1 = sample_collection->add(doc_str);
|
|
ASSERT_FALSE(sort_fields_missing_op1.ok());
|
|
ASSERT_STREQ("Field `average` has been declared in the schema, but is not found in the document.",
|
|
sort_fields_missing_op1.error().c_str());
|
|
|
|
// Handle type errors
|
|
|
|
doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": 22, \"average\": 78}";
|
|
const Option<nlohmann::json> & bad_facet_field_op = sample_collection->add(doc_str);
|
|
ASSERT_FALSE(bad_facet_field_op.ok());
|
|
ASSERT_STREQ("Field `tags` must be an array.", bad_facet_field_op.error().c_str());
|
|
|
|
doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [\"red\", 22], \"average\": 78}";
|
|
const Option<nlohmann::json> & bad_array_field_op = sample_collection->add(doc_str, CREATE, "",
|
|
DIRTY_VALUES::REJECT);
|
|
ASSERT_FALSE(bad_array_field_op.ok());
|
|
ASSERT_STREQ("Field `tags` must be an array of string.", bad_array_field_op.error().c_str());
|
|
|
|
// with coercion should work
|
|
doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [\"red\", 22], \"average\": 78}";
|
|
const Option<nlohmann::json> &bad_array_field_coercion_op = sample_collection->add(doc_str, CREATE, "",
|
|
DIRTY_VALUES::COERCE_OR_REJECT);
|
|
ASSERT_TRUE(bad_array_field_coercion_op.ok());
|
|
|
|
doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [], \"average\": 34}";
|
|
const Option<nlohmann::json> & empty_facet_field_op = sample_collection->add(doc_str);
|
|
ASSERT_TRUE(empty_facet_field_op.ok());
|
|
|
|
doc_str = "{\"name\": \"foo\", \"age\": [\"34\"], \"tags\": [], \"average\": 34 }";
|
|
const Option<nlohmann::json> & bad_default_sorting_field_op1 = sample_collection->add(doc_str);
|
|
ASSERT_FALSE(bad_default_sorting_field_op1.ok());
|
|
ASSERT_STREQ("Field `age` must be an int32.", bad_default_sorting_field_op1.error().c_str());
|
|
|
|
doc_str = "{\"name\": \"foo\", \"tags\": [], \"average\": 34 }";
|
|
const Option<nlohmann::json> & bad_default_sorting_field_op3 = sample_collection->add(doc_str);
|
|
ASSERT_FALSE(bad_default_sorting_field_op3.ok());
|
|
ASSERT_STREQ("Field `age` has been declared as a default sorting field, but is not found in the document.",
|
|
bad_default_sorting_field_op3.error().c_str());
|
|
|
|
doc_str = "{\"name\": \"foo\", \"age\": 34, \"tags\": [], \"average\": \"34\"}";
|
|
const Option<nlohmann::json> & bad_rank_field_op = sample_collection->add(doc_str, CREATE, "", DIRTY_VALUES::REJECT);
|
|
ASSERT_FALSE(bad_rank_field_op.ok());
|
|
ASSERT_STREQ("Field `average` must be an int32.", bad_rank_field_op.error().c_str());
|
|
|
|
doc_str = "{\"name\": \"foo\", \"age\": asdadasd, \"tags\": [], \"average\": 34 }";
|
|
const Option<nlohmann::json> & bad_default_sorting_field_op4 = sample_collection->add(doc_str);
|
|
ASSERT_FALSE(bad_default_sorting_field_op4.ok());
|
|
ASSERT_STREQ("Bad JSON: [json.exception.parse_error.101] parse error at line 1, column 24: syntax error "
|
|
"while parsing value - invalid literal; last read: '\"age\": a'",
|
|
bad_default_sorting_field_op4.error().c_str());
|
|
|
|
// should return an error when a document with pre-existing id is being added
|
|
std::string doc = "{\"id\": \"100\", \"name\": \"foo\", \"age\": 29, \"tags\": [], \"average\": 78}";
|
|
Option<nlohmann::json> add_op = sample_collection->add(doc);
|
|
ASSERT_TRUE(add_op.ok());
|
|
add_op = sample_collection->add(doc);
|
|
ASSERT_FALSE(add_op.ok());
|
|
ASSERT_EQ(409, add_op.code());
|
|
ASSERT_STREQ("A document with id 100 already exists.", add_op.error().c_str());
|
|
|
|
collectionManager.drop_collection("sample_collection");
|
|
}
|
|
|
|
TEST_F(CollectionTest, EmptyIndexShouldNotCrash) {
    Collection *empty_coll;

    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("tags", field_types::STRING_ARRAY, false),
                                 field("age", field_types::INT32, false),
                                 field("average", field_types::INT32, false)};

    std::vector<sort_by> sort_fields = { sort_by("age", "DESC"), sort_by("average", "DESC") };

    empty_coll = collectionManager.get_collection("empty_coll").get();
    if(empty_coll == nullptr) {
        empty_coll = collectionManager.create_collection("empty_coll", 4, fields, "age").get();
    }

    nlohmann::json results = empty_coll->search("a", {"name"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(0, results["hits"].size());
    collectionManager.drop_collection("empty_coll");
}
|
|
|
|
TEST_F(CollectionTest, IdFieldShouldBeAString) {
    Collection *coll1;

    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("tags", field_types::STRING_ARRAY, false),
                                 field("age", field_types::INT32, false),
                                 field("average", field_types::INT32, false)};

    std::vector<sort_by> sort_fields = { sort_by("age", "DESC"), sort_by("average", "DESC") };

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "age").get();
    }

    nlohmann::json doc;
    doc["id"] = 101010;
    doc["name"] = "Jane";
    doc["age"] = 25;
    doc["average"] = 98;
    doc["tags"] = nlohmann::json::array();
    doc["tags"].push_back("tag1");

    Option<nlohmann::json> inserted_id_op = coll1->add(doc.dump());
    ASSERT_FALSE(inserted_id_op.ok());
    ASSERT_STREQ("Document's `id` field should be a string.", inserted_id_op.error().c_str());

    collectionManager.drop_collection("coll1");
}
|
|
|
|
TEST_F(CollectionTest, AnIntegerCanBePassedToAFloatField) {
    Collection *coll1;

    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("average", field_types::FLOAT, false)};

    std::vector<sort_by> sort_fields = { sort_by("average", "DESC") };

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "average").get();
    }

    nlohmann::json doc;
    doc["id"] = "101010";
    doc["name"] = "Jane";
    doc["average"] = 98;

    Option<nlohmann::json> inserted_id_op = coll1->add(doc.dump());
    EXPECT_TRUE(inserted_id_op.ok());
    collectionManager.drop_collection("coll1");
}
|
|
|
|
TEST_F(CollectionTest, DeletionOfADocument) {
|
|
collectionManager.drop_collection("collection");
|
|
|
|
std::ifstream infile(std::string(ROOT_DIR)+"test/documents.jsonl");
|
|
|
|
std::vector<field> search_fields = {field("title", field_types::STRING, false),
|
|
field("points", field_types::INT32, false)};
|
|
|
|
|
|
std::vector<std::string> query_fields = {"title"};
|
|
std::vector<sort_by> sort_fields = { sort_by("points", "DESC") };
|
|
|
|
Collection *collection_for_del;
|
|
collection_for_del = collectionManager.get_collection("collection_for_del").get();
|
|
if(collection_for_del == nullptr) {
|
|
collection_for_del = collectionManager.create_collection("collection_for_del", 4, search_fields, "points").get();
|
|
}
|
|
|
|
std::string json_line;
|
|
rocksdb::Iterator* it;
|
|
size_t num_keys = 0;
|
|
|
|
    // dummy record for record id 0: makes the test record IDs match the fixture's line numbers
|
|
json_line = "{\"points\":10,\"title\":\"z\"}";
|
|
collection_for_del->add(json_line);
|
|
|
|
while (std::getline(infile, json_line)) {
|
|
collection_for_del->add(json_line);
|
|
}
|
|
|
|
ASSERT_EQ(25, collection_for_del->get_num_documents());
|
|
|
|
infile.close();
|
|
|
|
nlohmann::json results;
|
|
|
|
// asserts before removing any record
|
|
results = collection_for_del->search("cryogenic", query_fields, "", {}, sort_fields, {0}, 5, 1, FREQUENCY, {false}).get();
|
|
ASSERT_EQ(1, results["hits"].size());
|
|
|
|
it = store->get_iterator();
|
|
num_keys = 0;
|
|
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
|
num_keys += 1;
|
|
}
|
|
ASSERT_EQ(25+25+3, num_keys); // 25 records, 25 id mapping, 3 meta keys
|
|
delete it;
|
|
|
|
// actually remove a record now
|
|
collection_for_del->remove("1");
|
|
|
|
results = collection_for_del->search("cryogenic", query_fields, "", {}, sort_fields, {0}, 5, 1, FREQUENCY, {false}).get();
|
|
ASSERT_EQ(0, results["hits"].size());
|
|
ASSERT_EQ(0, results["found"]);
|
|
|
|
results = collection_for_del->search("archives", query_fields, "", {}, sort_fields, {0}, 5, 1, FREQUENCY, {false}).get();
|
|
ASSERT_EQ(1, results["hits"].size());
|
|
ASSERT_EQ(1, results["found"]);
|
|
|
|
collection_for_del->remove("foo"); // custom id record
|
|
results = collection_for_del->search("martian", query_fields, "", {}, sort_fields, {0}, 5, 1, FREQUENCY, {false}).get();
|
|
ASSERT_EQ(0, results["hits"].size());
|
|
ASSERT_EQ(0, results["found"]);
|
|
|
|
// delete all records
|
|
for(int id = 0; id <= 25; id++) {
|
|
collection_for_del->remove(std::to_string(id));
|
|
}
|
|
|
|
ASSERT_EQ(0, collection_for_del->get_num_documents());
|
|
|
|
it = store->get_iterator();
|
|
num_keys = 0;
|
|
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
|
num_keys += 1;
|
|
}
|
|
delete it;
|
|
ASSERT_EQ(3, num_keys);
|
|
|
|
collectionManager.drop_collection("collection_for_del");
|
|
}
|
|
|
|
TEST_F(CollectionTest, DeletionOfDocumentSingularFields) {
|
|
Collection *coll1;
|
|
|
|
std::vector<field> fields = {field("str", field_types::STRING, false),
|
|
field("int32", field_types::INT32, false),
|
|
field("int64", field_types::INT64, false),
|
|
field("float", field_types::FLOAT, false),
|
|
field("bool", field_types::BOOL, false)};
|
|
|
|
std::vector<sort_by> sort_fields = { sort_by("int32", "DESC") };
|
|
|
|
coll1 = collectionManager.get_collection("coll1").get();
|
|
if(coll1 == nullptr) {
|
|
coll1 = collectionManager.create_collection("coll1", 4, fields, "int32").get();
|
|
}
|
|
|
|
nlohmann::json doc;
|
|
doc["id"] = "100";
|
|
doc["str"] = "[NEW] Cell Phone Cases, Holders & Clips!";
|
|
doc["int32"] = 100032;
|
|
doc["int64"] = 1582369739000;
|
|
doc["float"] = -293.24;
|
|
doc["bool"] = true;
|
|
|
|
Option<nlohmann::json> add_op = coll1->add(doc.dump());
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
nlohmann::json res = coll1->search("phone", {"str"}, "", {}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10).get();
|
|
|
|
ASSERT_EQ(1, res["found"]);
|
|
|
|
Option<std::string> rem_op = coll1->remove("100");
|
|
|
|
ASSERT_TRUE(rem_op.ok());
|
|
|
|
res = coll1->search("phone", {"str"}, "", {}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10).get();
|
|
|
|
ASSERT_EQ(0, res["found"].get<int32_t>());
|
|
|
|
// also assert against the actual index
|
|
const Index *index = coll1->_get_index(); // seq id will always be zero for first document
|
|
auto search_index = index->_get_search_index();
|
|
auto numerical_index = index->_get_numerical_index();
|
|
|
|
auto str_tree = search_index["str"];
|
|
auto int32_tree = numerical_index["int32"];
|
|
auto int64_tree = numerical_index["int64"];
|
|
auto float_tree = numerical_index["float"];
|
|
auto bool_tree = numerical_index["bool"];
|
|
|
|
ASSERT_EQ(0, art_size(str_tree));
|
|
|
|
ASSERT_EQ(0, int32_tree->size());
|
|
ASSERT_EQ(0, int64_tree->size());
|
|
ASSERT_EQ(0, float_tree->size());
|
|
ASSERT_EQ(0, bool_tree->size());
|
|
|
|
collectionManager.drop_collection("coll1");
|
|
}
|
|
|
|
TEST_F(CollectionTest, DeletionOfDocumentArrayFields) {
|
|
Collection *coll1;
|
|
|
|
std::vector<field> fields = {field("strarray", field_types::STRING_ARRAY, false),
|
|
field("int32array", field_types::INT32_ARRAY, false),
|
|
field("int64array", field_types::INT64_ARRAY, false),
|
|
field("floatarray", field_types::FLOAT_ARRAY, false),
|
|
field("boolarray", field_types::BOOL_ARRAY, false),
|
|
field("points", field_types::INT32, false)};
|
|
|
|
std::vector<sort_by> sort_fields = { sort_by("points", "DESC") };
|
|
|
|
coll1 = collectionManager.get_collection("coll1").get();
|
|
if(coll1 == nullptr) {
|
|
coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
|
|
}
|
|
|
|
nlohmann::json doc;
|
|
doc["id"] = "100";
|
|
doc["strarray"] = {"Cell Phones", "Cell Phone Accessories", "Cell Phone Cases & Clips"};
|
|
doc["int32array"] = {100, 200, 300};
|
|
doc["int64array"] = {1582369739000, 1582369739000, 1582369739000};
|
|
doc["floatarray"] = {19.99, 400.999};
|
|
doc["boolarray"] = {true, false, true};
|
|
doc["points"] = 25;
|
|
|
|
Option<nlohmann::json> add_op = coll1->add(doc.dump());
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
nlohmann::json res = coll1->search("phone", {"strarray"}, "", {}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10).get();
|
|
|
|
ASSERT_EQ(1, res["found"].get<size_t>());
|
|
|
|
Option<std::string> rem_op = coll1->remove("100");
|
|
|
|
ASSERT_TRUE(rem_op.ok());
|
|
|
|
res = coll1->search("phone", {"strarray"}, "", {}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10).get();
|
|
|
|
ASSERT_EQ(0, res["found"].get<int32_t>());
|
|
|
|
// also assert against the actual index
|
|
const Index *index = coll1->_get_index(); // seq id will always be zero for first document
|
|
auto search_index = index->_get_search_index();
|
|
auto numerical_index = index->_get_numerical_index();
|
|
|
|
auto strarray_tree = search_index["strarray"];
|
|
auto int32array_tree = numerical_index["int32array"];
|
|
auto int64array_tree = numerical_index["int64array"];
|
|
auto floatarray_tree = numerical_index["floatarray"];
|
|
auto boolarray_tree = numerical_index["boolarray"];
|
|
|
|
ASSERT_EQ(0, art_size(strarray_tree));
|
|
|
|
ASSERT_EQ(0, int32array_tree->size());
|
|
ASSERT_EQ(0, int64array_tree->size());
|
|
ASSERT_EQ(0, floatarray_tree->size());
|
|
ASSERT_EQ(0, boolarray_tree->size());
|
|
|
|
collectionManager.drop_collection("coll1");
|
|
}
|
|
|
|
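// helper for the prune tests below: returns a flat document with four fields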
nlohmann::json get_prune_doc() {
    nlohmann::json document;
    document["one"] = 1;
    document["two"] = 2;
    document["three"] = 3;
    document["four"] = 4;

    return document;
}
|
|
|
|
TEST_F(CollectionTest, SearchLargeTextField) {
|
|
Collection *coll_large_text;
|
|
|
|
std::vector<field> fields = {field("text", field_types::STRING, false),
|
|
field("age", field_types::INT32, false),
|
|
};
|
|
|
|
std::vector<sort_by> sort_fields = { sort_by(sort_field_const::text_match, "DESC"), sort_by("age", "DESC") };
|
|
|
|
coll_large_text = collectionManager.get_collection("coll_large_text").get();
|
|
if(coll_large_text == nullptr) {
|
|
coll_large_text = collectionManager.create_collection("coll_large_text", 4, fields, "age").get();
|
|
}
|
|
|
|
std::string json_line;
|
|
std::ifstream infile(std::string(ROOT_DIR)+"test/large_text_field.jsonl");
|
|
|
|
while (std::getline(infile, json_line)) {
|
|
coll_large_text->add(json_line);
|
|
}
|
|
|
|
infile.close();
|
|
|
|
Option<nlohmann::json> res_op = coll_large_text->search("eguilazer", {"text"}, "", {}, sort_fields, {0}, 10);
|
|
ASSERT_TRUE(res_op.ok());
|
|
nlohmann::json results = res_op.get();
|
|
ASSERT_EQ(1, results["hits"].size());
|
|
|
|
res_op = coll_large_text->search("tristique", {"text"}, "", {}, sort_fields, {0}, 10);
|
|
ASSERT_TRUE(res_op.ok());
|
|
results = res_op.get();
|
|
ASSERT_EQ(2, results["hits"].size());
|
|
|
|
// query whose length exceeds maximum highlight window (match score's WINDOW_SIZE)
|
|
res_op = coll_large_text->search(
|
|
"Phasellus non tristique elit Praesent non arcu id lectus accumsan venenatis at",
|
|
{"text"}, "", {}, sort_fields, {0}, 10
|
|
);
|
|
|
|
ASSERT_TRUE(res_op.ok());
|
|
results = res_op.get();
|
|
|
|
ASSERT_EQ(1, results["hits"].size());
|
|
|
|
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
|
|
|
|
    // only a single matched token falls within the match window
|
|
|
|
res_op = coll_large_text->search("molestie maecenas accumsan", {"text"}, "", {}, sort_fields, {0}, 10);
|
|
ASSERT_TRUE(res_op.ok());
|
|
results = res_op.get();
|
|
|
|
ASSERT_EQ(1, results["hits"].size());
|
|
|
|
ASSERT_STREQ("non arcu id lectus <mark>accumsan</mark> venenatis at at justo.",
|
|
results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
|
|
|
|
collectionManager.drop_collection("coll_large_text");
|
|
}
|
|
|
|
TEST_F(CollectionTest, PruneFieldsFromDocument) {
|
|
nlohmann::json document = get_prune_doc();
|
|
Collection::prune_doc(document, {"one", "two"}, tsl::htrie_set<char>());
|
|
ASSERT_EQ(2, document.size());
|
|
ASSERT_EQ(1, document["one"]);
|
|
ASSERT_EQ(2, document["two"]);
|
|
|
|
// exclude takes precedence
|
|
document = get_prune_doc();
|
|
Collection::prune_doc(document, {"one"}, {"one"});
|
|
ASSERT_EQ(0, document.size());
|
|
|
|
    // when no inclusion is specified, all fields not mentioned in the exclusion list should be returned
|
|
document = get_prune_doc();
|
|
Collection::prune_doc(document, tsl::htrie_set<char>(), tsl::htrie_set<char>({"three"}), "");
|
|
ASSERT_EQ(3, document.size());
|
|
ASSERT_EQ(1, document["one"]);
|
|
ASSERT_EQ(2, document["two"]);
|
|
ASSERT_EQ(4, document["four"]);
|
|
|
|
document = get_prune_doc();
|
|
Collection::prune_doc(document, tsl::htrie_set<char>(), tsl::htrie_set<char>(), "");
|
|
ASSERT_EQ(4, document.size());
|
|
|
|
// when included field does not exist
|
|
document = get_prune_doc();
|
|
Collection::prune_doc(document, {"notfound"}, tsl::htrie_set<char>(), "");
|
|
ASSERT_EQ(0, document.size());
|
|
|
|
// when excluded field does not exist
|
|
document = get_prune_doc();
|
|
Collection::prune_doc(document, tsl::htrie_set<char>(), {"notfound"}, "");
|
|
ASSERT_EQ(4, document.size());
|
|
|
|
    // an inclusion entry that is merely a prefix extension of an actual field ("ones" vs "one") should not match
|
|
document = get_prune_doc();
|
|
Collection::prune_doc(document, {"ones"}, tsl::htrie_set<char>(), "");
|
|
ASSERT_EQ(0, document.size());
|
|
}
|
|
|
|
TEST_F(CollectionTest, StringArrayFieldShouldNotAllowPlainString) {
    Collection *coll1;

    std::vector<field> fields = {field("categories", field_types::STRING_ARRAY, true),
                                 field("points", field_types::INT32, false)};

    std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};

    coll1 = collectionManager.get_collection("coll1").get();
    if (coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    nlohmann::json doc;
    doc["id"] = "100";
    doc["categories"] = "Should not be allowed!";
    doc["points"] = 25;

    auto add_op = coll1->add(doc.dump());
    ASSERT_FALSE(add_op.ok());
    ASSERT_STREQ("Field `categories` must be an array.", add_op.error().c_str());

    collectionManager.drop_collection("coll1");
}
|
|
|
|
TEST_F(CollectionTest, SearchHighlightShouldFollowThreshold) {
|
|
Collection *coll1;
|
|
|
|
std::vector<field> fields = {field("title", field_types::STRING, true),
|
|
field("points", field_types::INT32, false)};
|
|
|
|
std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};
|
|
|
|
coll1 = collectionManager.get_collection("coll1").get();
|
|
if (coll1 == nullptr) {
|
|
coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
|
|
}
|
|
|
|
nlohmann::json doc;
|
|
doc["id"] = "100";
|
|
doc["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.";
|
|
doc["points"] = 25;
|
|
|
|
auto add_op = coll1->add(doc.dump());
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
// first with a large threshold
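    // when the field value fits within the highlight threshold, the entire field
    // is returned as the snippet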
|
|
|
|
auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "").get();
|
|
|
|
ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
|
|
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
|
|
|
|
    // now with a small threshold (will show only 4 words on either side of the matched token)
|
|
|
|
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5).get();
|
|
|
|
ASSERT_STREQ("fox jumped over the <mark>lazy</mark> dog and ran straight",
|
|
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
|
|
|
|
// specify the number of surrounding tokens to return
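    // highlight_affix_num_tokens controls how many tokens are kept on either side
    // of the matched token; 0 leaves just the match itself (asserted below)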
|
|
size_t highlight_affix_num_tokens = 2;
|
|
|
|
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, highlight_affix_num_tokens).get();
|
|
ASSERT_STREQ("over the <mark>lazy</mark> dog and",
|
|
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
|
|
|
|
highlight_affix_num_tokens = 0;
|
|
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, highlight_affix_num_tokens).get();
|
|
ASSERT_STREQ("<mark>lazy</mark>",
|
|
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
|
|
|
|
collectionManager.drop_collection("coll1");
|
|
}
|
|
|
|
TEST_F(CollectionTest, SearchHighlightShouldUseHighlightTags) {
|
|
Collection *coll1;
|
|
|
|
std::vector<field> fields = {field("title", field_types::STRING, true),
|
|
field("points", field_types::INT32, false)};
|
|
|
|
std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};
|
|
|
|
coll1 = collectionManager.get_collection("coll1").get();
|
|
if (coll1 == nullptr) {
|
|
coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
|
|
}
|
|
|
|
nlohmann::json doc;
|
|
doc["id"] = "100";
|
|
doc["title"] = "The quick brown fox jumped over the lazy fox. "; // adding some extra spaces
|
|
doc["points"] = 25;
|
|
|
|
auto add_op = coll1->add(doc.dump());
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
// use non-default highlighting tags
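    // the two trailing string arguments override the default <mark>...</mark>
    // pair with custom start and end tags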
|
|
|
|
auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
|
|
"<em class=\"h\">", "</em>").get();
|
|
|
|
ASSERT_STREQ("The quick brown fox jumped over the <em class=\"h\">lazy</em> fox. ",
|
|
res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
|
|
|
|
collectionManager.drop_collection("coll1");
|
|
}
|
|
|
|
TEST_F(CollectionTest, SearchHighlightWithNewLine) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, true),
                                 field("points", field_types::INT32, false)};

    std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};

    coll1 = collectionManager.get_collection("coll1").get();
    if (coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    nlohmann::json doc;
    doc["id"] = "100";
    doc["title"] = "Blah, blah\nStark Industries";
    doc["points"] = 25;

    auto add_op = coll1->add(doc.dump());
    ASSERT_TRUE(add_op.ok());

    auto res = coll1->search("stark", {"title"}, "", {}, sort_fields, {0}, 10, 1,
                             token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0).get();

    ASSERT_STREQ("Blah, blah\n<mark>Stark</mark> Industries",
                 res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());

    ASSERT_STREQ("Stark", res["hits"][0]["highlights"][0]["matched_tokens"][0].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}
|
|
|
|
TEST_F(CollectionTest, UpdateDocument) {
|
|
Collection *coll1;
|
|
|
|
std::vector<field> fields = {field("title", field_types::STRING, true),
|
|
field("tags", field_types::STRING_ARRAY, true, true),
|
|
field("points", field_types::INT32, false)};
|
|
|
|
std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};
|
|
|
|
coll1 = collectionManager.get_collection("coll1").get();
|
|
if (coll1 == nullptr) {
|
|
coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
|
|
}
|
|
|
|
nlohmann::json doc;
|
|
doc["id"] = "100";
|
|
doc["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.";
|
|
doc["tags"] = {"NEWS", "LAZY"};
|
|
doc["points"] = 25;
|
|
|
|
auto add_op = coll1->add(doc.dump());
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
auto res = coll1->search("lazy", {"title"}, "", {"tags"}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
|
|
|
|
ASSERT_EQ(1, res["hits"].size());
|
|
ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.",
|
|
res["hits"][0]["document"]["title"].get<std::string>().c_str());
|
|
|
|
    // reindex the entire document verbatim again and try querying
|
|
add_op = coll1->add(doc.dump(), UPSERT);
|
|
ASSERT_TRUE(add_op.ok());
|
|
ASSERT_EQ(1, coll1->get_num_documents());
|
|
|
|
res = coll1->search("lazy", {"title"}, "", {"tags"}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
|
|
|
|
ASSERT_EQ(1, res["hits"].size());
|
|
ASSERT_EQ(1, res["facet_counts"].size());
|
|
ASSERT_STREQ("tags", res["facet_counts"][0]["field_name"].get<std::string>().c_str());
|
|
ASSERT_EQ(2, res["facet_counts"][0]["counts"].size());
|
|
|
|
ASSERT_STREQ("NEWS", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
|
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][0]["count"]);
|
|
|
|
ASSERT_STREQ("LAZY", res["facet_counts"][0]["counts"][1]["value"].get<std::string>().c_str());
|
|
ASSERT_EQ(1, (int) res["facet_counts"][0]["counts"][1]["count"]);
|
|
|
|
// upsert only part of the document -- document should be REPLACED
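    // an UPSERT replaces the stored document wholesale, so the omitted `tags`
    // field should disappear from the document (asserted below)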
|
|
nlohmann::json partial_doc = doc;
|
|
partial_doc.erase("tags");
|
|
add_op = coll1->add(partial_doc.dump(), UPSERT);
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
|
|
|
|
ASSERT_EQ(1, res["hits"].size());
|
|
ASSERT_FALSE(res["hits"][0].contains("tags"));
|
|
|
|
// upserting without a mandatory field should be an error
|
|
partial_doc = doc;
|
|
partial_doc.erase("title");
|
|
add_op = coll1->add(partial_doc.dump(), UPSERT);
|
|
ASSERT_FALSE(add_op.ok());
|
|
ASSERT_EQ("Field `title` has been declared in the schema, but is not found in the document.", add_op.error());
|
|
|
|
// try changing the title and searching for an older token
|
|
doc["title"] = "The quick brown fox.";
|
|
add_op = coll1->add(doc.dump(), UPSERT);
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
ASSERT_EQ(1, coll1->get_num_documents());
|
|
|
|
res = coll1->search("lazy", {"title"}, "", {"tags"}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
|
|
|
|
ASSERT_EQ(0, res["hits"].size());
|
|
|
|
res = coll1->search("quick", {"title"}, "", {"title"}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
|
|
|
|
ASSERT_EQ(1, res["hits"].size());
|
|
ASSERT_STREQ("The quick brown fox.", res["hits"][0]["document"]["title"].get<std::string>().c_str());
|
|
|
|
// try to update document tags without `id`
|
|
nlohmann::json doc2;
|
|
doc2["tags"] = {"SENTENCE"};
|
|
add_op = coll1->add(doc2.dump(), UPDATE);
|
|
ASSERT_FALSE(add_op.ok());
|
|
ASSERT_STREQ("For update, the `id` key must be provided.", add_op.error().c_str());
|
|
|
|
// now change tags with id
|
|
doc2["id"] = "100";
|
|
add_op = coll1->add(doc2.dump(), UPDATE);
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
// check for old tag
|
|
res = coll1->search("NEWS", {"tags"}, "", {"tags"}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
|
|
|
|
ASSERT_EQ(0, res["hits"].size());
|
|
|
|
// now check for new tag and also try faceting on that field
|
|
res = coll1->search("SENTENCE", {"tags"}, "", {"tags"}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
|
|
|
|
ASSERT_EQ(1, res["hits"].size());
|
|
ASSERT_STREQ("SENTENCE", res["facet_counts"][0]["counts"][0]["value"].get<std::string>().c_str());
|
|
|
|
// try changing points
|
|
nlohmann::json doc3;
|
|
doc3["points"] = 99;
|
|
doc3["id"] = "100";
|
|
|
|
add_op = coll1->add(doc3.dump(), UPDATE);
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
res = coll1->search("*", {"tags"}, "points: > 90", {"tags"}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
|
|
|
|
ASSERT_EQ(1, res["hits"].size());
|
|
ASSERT_EQ(99, res["hits"][0]["document"]["points"].get<size_t>());
|
|
|
|
    // the id can also be passed as a parameter
|
|
nlohmann::json doc4;
|
|
doc4["points"] = 105;
|
|
|
|
add_op = coll1->add(doc4.dump(), UPDATE, "100");
|
|
ASSERT_TRUE(add_op.ok());
|
|
|
|
res = coll1->search("*", {"tags"}, "points: > 101", {"tags"}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
|
|
|
|
ASSERT_EQ(1, res["hits"].size());
|
|
ASSERT_EQ(105, res["hits"][0]["document"]["points"].get<size_t>());
|
|
|
|
    // try changing a field to a bad value and verify that the old document is restored
|
|
doc4["points"] = "abc";
|
|
add_op = coll1->add(doc4.dump(), UPDATE, "100");
|
|
ASSERT_FALSE(add_op.ok());
|
|
ASSERT_EQ("Field `points` must be an int32.", add_op.error());
|
|
|
|
res = coll1->search("*", {"tags"}, "points: > 101", {"tags"}, sort_fields, {0}, 10, 1,
|
|
token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();
|
|
|
|
ASSERT_EQ(1, res["hits"].size());
|
|
ASSERT_EQ(105, res["hits"][0]["document"]["points"].get<size_t>());
|
|
|
|
    // when the explicit id parameter does not match the document's id, an error should be returned
|
|
nlohmann::json doc5;
|
|
doc5["id"] = "800";
|
|
doc5["title"] = "The Secret Seven";
|
|
doc5["points"] = 250;
|
|
doc5["tags"] = {"BOOK", "ENID BLYTON"};
|
|
|
|
add_op = coll1->add(doc5.dump(), UPSERT, "799");
|
|
ASSERT_FALSE(add_op.ok());
|
|
ASSERT_EQ(400, add_op.code());
|
|
ASSERT_STREQ("The `id` of the resource does not match the `id` in the JSON body.", add_op.error().c_str());
|
|
|
|
// passing an empty id should not succeed
|
|
nlohmann::json doc6;
|
|
doc6["id"] = "";
|
|
doc6["title"] = "The Secret Seven";
|
|
doc6["points"] = 250;
|
|
doc6["tags"] = {"BOOK", "ENID BLYTON"};
|
|
|
|
add_op = coll1->add(doc6.dump(), UPDATE);
|
|
ASSERT_FALSE(add_op.ok());
|
|
ASSERT_EQ(400, add_op.code());
|
|
ASSERT_STREQ("The `id` should not be empty.", add_op.error().c_str());
|
|
|
|
collectionManager.drop_collection("coll1");
|
|
}
|
|
|
|
TEST_F(CollectionTest, UpdateDocuments) {
|
|
nlohmann::json schema = R"({
|
|
"name": "update_docs_collection",
|
|
"enable_nested_fields": true,
|
|
"fields": [
|
|
{"name": "user_name", "type": "string", "facet": true},
|
|
{"name": "likes", "type": "int32"},
|
|
{"name": "content", "type": "object"}
|
|
],
|
|
"default_sorting_field": "likes"
|
|
})"_json;
|
|
|
|
Collection *update_docs_collection = collectionManager.get_collection("update_docs_collection").get();
|
|
if (update_docs_collection == nullptr) {
|
|
auto op = CollectionManager::create_collection(schema);
|
|
ASSERT_TRUE(op.ok());
|
|
update_docs_collection = op.get();
|
|
}
|
|
|
|
std::vector<std::string> json_lines = {
|
|
R"({"user_name": "fat_cat","likes": 5215,"content": {"title": "cat data 1", "body": "cd1"}})",
|
|
R"({"user_name": "fast_dog","likes": 273,"content": {"title": "dog data 1", "body": "dd1"}})",
|
|
R"({"user_name": "fat_cat","likes": 2133,"content": {"title": "cat data 2", "body": "cd2"}})",
|
|
R"({"user_name": "fast_dog","likes": 9754,"content": {"title": "dog data 2", "body": "dd2"}})",
|
|
R"({"user_name": "fast_dog","likes": 576,"content": {"title": "dog data 3", "body": "dd3"}})"
|
|
};
|
|
|
|
for (auto const& json: json_lines){
|
|
auto add_op = update_docs_collection->add(json);
|
|
if (!add_op.ok()) {
|
|
std::cout << add_op.error() << std::endl;
|
|
}
|
|
ASSERT_TRUE(add_op.ok());
|
|
}
|
|
|
|
std::vector<sort_by> sort_fields = { sort_by("likes", "DESC") };
|
|
|
|
auto res = update_docs_collection->search("cat data", {"content"}, "", {}, sort_fields, {0}, 10).get();
|
|
ASSERT_EQ(2, res["hits"].size());
|
|
for (size_t i = 0; i < res["hits"].size(); i++) {
|
|
ASSERT_EQ("fat_cat", res["hits"][i]["document"]["user_name"].get<std::string>());
|
|
}
|
|
|
|
nlohmann::json document;
|
|
document["user_name"] = "slim_cat";
|
|
std::string dirty_values;
|
|
bool validate_field_names = false;
|
|
|
|
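    // with validate_field_names = false, a filter on an unknown field ("foo") is
    // ignored and matches zero documents instead of returning an error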
auto update_op = update_docs_collection->update_matching_filter("foo:=fat_cat", document.dump(), dirty_values,
|
|
validate_field_names);
|
|
ASSERT_TRUE(update_op.ok());
|
|
ASSERT_EQ(0, update_op.get()["num_updated"]);
|
|
|
|
update_op = update_docs_collection->update_matching_filter("user_name:=fat_cat", document.dump(), dirty_values);
|
|
ASSERT_TRUE(update_op.ok());
|
|
ASSERT_EQ(2, update_op.get()["num_updated"]);
|
|
|
|
res = update_docs_collection->search("cat data", {"content"}, "", {}, sort_fields, {0}, 10).get();
|
|
ASSERT_EQ(2, res["hits"].size());
|
|
for (size_t i = 0; i < res["hits"].size(); i++) {
|
|
ASSERT_EQ("slim_cat", res["hits"][i]["document"]["user_name"].get<std::string>());
|
|
}
|
|
|
|
validate_field_names = true;
|
|
|
|
// Test batching
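    // the trailing argument (presumably a batch size of 2) splits the filter-based
    // update into batches; the outcome should match an unbatched update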
|
|
res = update_docs_collection->search("dog data", {"content"}, "", {}, sort_fields, {0}, 10).get();
|
|
ASSERT_EQ(3, res["hits"].size());
|
|
for (size_t i = 0; i < res["hits"].size(); i++) {
|
|
ASSERT_EQ("fast_dog", res["hits"][i]["document"]["user_name"].get<std::string>());
|
|
}
|
|
|
|
document["user_name"] = "lazy_dog";
|
|
update_op = update_docs_collection->update_matching_filter("user_name:=fast_dog", document.dump(), dirty_values,
|
|
validate_field_names, 2);
|
|
ASSERT_TRUE(update_op.ok());
|
|
ASSERT_EQ(3, update_op.get()["num_updated"]);
|
|
|
|
res = update_docs_collection->search("dog data", {"content"}, "", {}, sort_fields, {0}, 10).get();
|
|
ASSERT_EQ(3, res["hits"].size());
|
|
for (size_t i = 0; i < res["hits"].size(); i++) {
|
|
ASSERT_EQ("lazy_dog", res["hits"][i]["document"]["user_name"].get<std::string>());
|
|
}
|
|
|
|
    // Test nested field updates
|
|
res = update_docs_collection->search("*", {}, "user_name:=slim_cat", {}, sort_fields, {0}, 10).get();
|
|
ASSERT_EQ(2, res["hits"].size());
|
|
for (size_t i = 0; i < res["hits"].size(); i++) {
|
|
ASSERT_EQ("cat data " + std::to_string(i + 1), res["hits"][i]["document"]["content"]["title"].get<std::string>());
|
|
}
|
|
|
|
document.clear();
|
|
document["content"]["title"] = "fancy cat title";
|
|
|
|
update_op = update_docs_collection->update_matching_filter("user_name:=slim_cat", document.dump(), dirty_values,
|
|
validate_field_names, 2);
|
|
ASSERT_TRUE(update_op.ok());
|
|
ASSERT_EQ(2, update_op.get()["num_updated"]);
|
|
|
|
res = update_docs_collection->search("*", {}, "user_name:=slim_cat", {}, sort_fields, {0}, 10).get();
|
|
ASSERT_EQ(2, res["hits"].size());
|
|
for (size_t i = 0; i < res["hits"].size(); i++) {
|
|
ASSERT_EQ("fancy cat title", res["hits"][i]["document"]["content"]["title"].get<std::string>());
|
|
}
|
|
|
|
    // Test updating all documents
|
|
res = update_docs_collection->search("*", {}, "", {}, sort_fields, {0}, 10).get();
|
|
ASSERT_EQ(5, res["hits"].size());
|
|
for (size_t i = 0; i < res["hits"].size(); i++) {
|
|
ASSERT_NE(0, res["hits"][i]["document"]["likes"].get<int>());
|
|
}
|
|
|
|
document.clear();
|
|
document["likes"] = 0;
|
|
|
|
update_op = update_docs_collection->update_matching_filter("*", document.dump(), dirty_values, validate_field_names,
|
|
2);
|
|
ASSERT_TRUE(update_op.ok());
|
|
ASSERT_EQ(5, update_op.get()["num_updated"]);
|
|
|
|
res = update_docs_collection->search("*", {}, "", {}, sort_fields, {0}, 10).get();
|
|
ASSERT_EQ(5, res["hits"].size());
|
|
for (size_t i = 0; i < res["hits"].size(); i++) {
|
|
ASSERT_EQ(0, res["hits"][i]["document"]["likes"].get<int>());
|
|
}
|
|
|
|
collectionManager.drop_collection("update_docs_collection");
|
|
}
|
|
|
|
TEST_F(CollectionTest, UpdateDocumentSorting) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, true),
                                 field("tags", field_types::STRING_ARRAY, true),
                                 field("points", field_types::INT32, false)};

    std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};

    coll1 = collectionManager.get_collection("coll1").get();
    if (coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    nlohmann::json doc1;
    doc1["id"] = "100";
    doc1["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.";
    doc1["tags"] = {"NEWS", "LAZY"};
    doc1["points"] = 100;

    nlohmann::json doc2;
    doc2["id"] = "101";
    doc2["title"] = "The random sentence.";
    doc2["tags"] = {"RANDOM"};
    doc2["points"] = 101;

    auto add_op = coll1->add(doc1.dump());
    coll1->add(doc2.dump());

    auto res = coll1->search("*", {"tags"}, "", {"tags"}, sort_fields, {0}, 10, 1,
                             token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                             spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();

    ASSERT_EQ(2, res["hits"].size());
    ASSERT_EQ(101, res["hits"][0]["document"]["points"].get<size_t>());
    ASSERT_STREQ("101", res["hits"][0]["document"]["id"].get<std::string>().c_str());

    ASSERT_EQ(100, res["hits"][1]["document"]["points"].get<size_t>());
    ASSERT_STREQ("100", res["hits"][1]["document"]["id"].get<std::string>().c_str());

    // now update doc1 points from 100 -> 1000 and it should bubble up
    doc1["points"] = 1000;
    coll1->add(doc1.dump(), UPDATE);

    res = coll1->search("*", {"tags"}, "", {"tags"}, sort_fields, {0}, 10, 1,
                        token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                        spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();

    ASSERT_EQ(2, res["hits"].size());
    ASSERT_EQ(1000, res["hits"][0]["document"]["points"].get<size_t>());
    ASSERT_STREQ("100", res["hits"][0]["document"]["id"].get<std::string>().c_str());

    ASSERT_EQ(101, res["hits"][1]["document"]["points"].get<size_t>());
    ASSERT_STREQ("101", res["hits"][1]["document"]["id"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

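// Upserting a document that changes only a field absent from the schema should still succeed and persist.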
TEST_F(CollectionTest, UpdateDocumentUnIndexedField) {
    Collection* coll1;

    std::vector<field> fields = {field("title", field_types::STRING, true),
                                 field("points", field_types::INT32, false)};

    std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};

    coll1 = collectionManager.get_collection("coll1").get();
    if (coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    nlohmann::json doc;
    doc["id"] = "100";
    doc["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.";
    doc["foo"] = "foo1";
    doc["points"] = 25;

    auto add_op = coll1->add(doc.dump());
    ASSERT_TRUE(add_op.ok());

    auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1,
                             token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                             spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();

    ASSERT_EQ(1, res["hits"].size());
    ASSERT_STREQ("The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.",
                 res["hits"][0]["document"]["title"].get<std::string>().c_str());

    // reindex the document again by changing only the unindexed field
    doc["foo"] = "foo2";
    add_op = coll1->add(doc.dump(), UPSERT);
    ASSERT_TRUE(add_op.ok());

    res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1,
                        token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                        spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();

    ASSERT_EQ(1, res["hits"].size());
    ASSERT_STREQ("foo2", res["hits"][0]["document"]["foo"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

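// highlight_full_fields should return the fully highlighted field value alongside the snippet.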
TEST_F(CollectionTest, SearchHighlightFieldFully) {
    Collection *coll1;

    std::vector<field> fields = { field("title", field_types::STRING, true),
                                  field("tags", field_types::STRING_ARRAY, true),
                                  field("points", field_types::INT32, false)};

    std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};

    coll1 = collectionManager.get_collection("coll1").get();
    if (coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    nlohmann::json doc;
    doc["id"] = "100";
    doc["title"] = "The quick brown fox jumped over the lazy dog and ran straight to the forest to sleep.";
    doc["tags"] = {"NEWS", "LAZY"};
    doc["points"] = 25;

    auto add_op = coll1->add(doc.dump());
    ASSERT_TRUE(add_op.ok());

    // look for fully highlighted value in response

    auto res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1,
                             token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                             spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title").get();

    ASSERT_EQ(1, res["hits"][0]["highlights"].size());
    ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
                 res["hits"][0]["highlights"][0]["value"].get<std::string>().c_str());

    // should not return value key when highlight_full_fields is not specified
    res = coll1->search("lazy", {"title"}, "", {}, sort_fields, {0}, 10, 1,
                        token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                        spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "").get();

    ASSERT_EQ(3, res["hits"][0]["highlights"][0].size());

    // query multiple fields
    res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, {0}, 10, 1,
                        token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                        spp::sparse_hash_set<std::string>(), 10, "", 5, 5, "title, tags").get();

    ASSERT_EQ(2, res["hits"][0]["highlights"].size());

    ASSERT_EQ("tags", res["hits"][0]["highlights"][0]["field"]);
    ASSERT_EQ(1, res["hits"][0]["highlights"][0]["values"].size());
    ASSERT_EQ("<mark>LAZY</mark>", res["hits"][0]["highlights"][0]["values"][0].get<std::string>());
    ASSERT_EQ(1, res["hits"][0]["highlights"][0]["snippets"].size());
    ASSERT_EQ("<mark>LAZY</mark>", res["hits"][0]["highlights"][0]["snippets"][0].get<std::string>());

    ASSERT_EQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
              res["hits"][0]["highlights"][1]["value"].get<std::string>());
    ASSERT_EQ("title", res["hits"][0]["highlights"][1]["field"]);
    ASSERT_EQ(1, res["hits"][0]["highlights"][1]["matched_tokens"].size());
    ASSERT_STREQ("lazy", res["hits"][0]["highlights"][1]["matched_tokens"][0].get<std::string>().c_str());

    // excluded fields should not be returned in highlights section
    spp::sparse_hash_set<std::string> excluded_fields = {"tags"};
    res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, {0}, 10, 1,
                        token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                        excluded_fields, 10, "", 5, 5, "title, tags").get();

    ASSERT_EQ(1, res["hits"][0]["highlights"].size());
    ASSERT_STREQ("The quick brown fox jumped over the <mark>lazy</mark> dog and ran straight to the forest to sleep.",
                 res["hits"][0]["highlights"][0]["value"].get<std::string>().c_str());

    // when all fields are excluded
    excluded_fields = {"tags", "title"};
    res = coll1->search("lazy", {"title", "tags"}, "", {}, sort_fields, {0}, 10, 1,
                        token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                        excluded_fields, 10, "", 5, 5, "title, tags").get();
    ASSERT_EQ(0, res["hits"][0]["highlights"].size());

    collectionManager.drop_collection("coll1");
}

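// Optional fields should support search, filter, facet, and sort, and be flagged as optional in the schema summary.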
TEST_F(CollectionTest, OptionalFields) {
    Collection *coll1;

    std::vector<field> fields = {
        field("title", field_types::STRING, false),
        field("description", field_types::STRING, true, true),
        field("max", field_types::INT32, false),
        field("scores", field_types::INT64_ARRAY, false, true),
        field("average", field_types::FLOAT, false, true),
        field("is_valid", field_types::BOOL, false, true),
    };

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "max").get();
    }

    std::ifstream infile(std::string(ROOT_DIR)+"test/optional_fields.jsonl");

    std::string json_line;

    while (std::getline(infile, json_line)) {
        auto add_op = coll1->add(json_line);
        if(!add_op.ok()) {
            std::cout << add_op.error() << std::endl;
        }
        ASSERT_TRUE(add_op.ok());
    }

    infile.close();

    // first must be able to fetch all records (i.e. all must have been indexed)

    auto res = coll1->search("*", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(6, res["found"].get<size_t>());

    // search on optional `description` field
    res = coll1->search("book", {"description"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(5, res["found"].get<size_t>());

    // filter on optional `average` field
    res = coll1->search("the", {"title"}, "average: >0", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(5, res["found"].get<size_t>());

    // facet on optional `description` field
    res = coll1->search("the", {"title"}, "", {"description"}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(6, res["found"].get<size_t>());
    ASSERT_EQ(5, res["facet_counts"][0]["counts"][0]["count"].get<size_t>());
    ASSERT_STREQ("description", res["facet_counts"][0]["field_name"].get<std::string>().c_str());

    // sort_by optional `average` field should be allowed (default used for missing values)
    std::vector<sort_by> sort_fields = { sort_by("average", "DESC") };
    auto res_op = coll1->search("*", {"title"}, "", {}, sort_fields, {0}, 10, 1, FREQUENCY, {false});
    ASSERT_TRUE(res_op.ok());
    res = res_op.get();

    ASSERT_EQ(6, res["found"].get<size_t>());
    ASSERT_EQ(0, res["hits"][5]["document"].count("average")); // record with missing average is last

    // try deleting a record having optional field
    Option<std::string> remove_op = coll1->remove("1");
    ASSERT_TRUE(remove_op.ok());

    // try fetching the schema (should contain optional field)
    nlohmann::json coll_summary = coll1->get_summary_json();
    ASSERT_STREQ("title", coll_summary["fields"][0]["name"].get<std::string>().c_str());
    ASSERT_STREQ("string", coll_summary["fields"][0]["type"].get<std::string>().c_str());
    ASSERT_FALSE(coll_summary["fields"][0]["facet"].get<bool>());
    ASSERT_FALSE(coll_summary["fields"][0]["optional"].get<bool>());

    ASSERT_STREQ("description", coll_summary["fields"][1]["name"].get<std::string>().c_str());
    ASSERT_STREQ("string", coll_summary["fields"][1]["type"].get<std::string>().c_str());
    ASSERT_TRUE(coll_summary["fields"][1]["facet"].get<bool>());
    ASSERT_TRUE(coll_summary["fields"][1]["optional"].get<bool>());

    // default sorting field should not be declared optional
    fields = {
        field("title", field_types::STRING, false),
        field("score", field_types::INT32, false, true),
    };

    auto create_op = collectionManager.create_collection("coll2", 4, fields, "score");

    ASSERT_FALSE(create_op.ok());
    ASSERT_STREQ("Default sorting field `score` cannot be an optional field.", create_op.error().c_str());

    collectionManager.drop_collection("coll1");
}

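// Null values in optional fields should be accepted and simply left out of the search index.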
TEST_F(CollectionTest, OptionalFieldCanBeNull) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false, true),
                                 field("genres", field_types::STRING_ARRAY, false, true),
                                 field("launch_year", field_types::INT32, false, true),
                                 field("updated_at", field_types::INT64, false, true),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    nlohmann::json doc;

    doc["id"] = "0";
    doc["title"] = "Beat it";
    doc["artist"] = nullptr;
    doc["genres"] = nullptr;
    doc["launch_year"] = nullptr;
    doc["updated_at"] = nullptr;
    doc["points"] = 100;

    ASSERT_TRUE(coll1->add(doc.dump()).ok());

    ASSERT_EQ(2, coll1->_get_index()->_get_search_index().at("title")->size);
    ASSERT_EQ(0, coll1->_get_index()->_get_search_index().at("artist")->size);
    ASSERT_EQ(0, coll1->_get_index()->_get_search_index().at("genres")->size);

    auto results = coll1->search("beat",
                                 {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());

    collectionManager.drop_collection("coll1");
}

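// Empty and whitespace-only string values should be skipped during indexing.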
TEST_F(CollectionTest, EmptyStringNotIndexed) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false, true),
                                 field("genres", field_types::STRING_ARRAY, false, true),
                                 field("launch_year", field_types::STRING, false, true),
                                 field("labels", field_types::STRING_ARRAY, false, true),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    nlohmann::json doc;

    doc["id"] = "0";
    doc["title"] = "Beat it";
    doc["artist"] = "";
    doc["launch_year"] = " ";
    doc["genres"] = {""};
    doc["labels"] = {"song", " ", ""};
    doc["points"] = 100;

    ASSERT_TRUE(coll1->add(doc.dump()).ok());

    auto results = coll1->search("beat",
                                 {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());

    ASSERT_EQ(2, coll1->_get_index()->_get_search_index().at("title")->size);
    ASSERT_EQ(0, coll1->_get_index()->_get_search_index().at("artist")->size);
    ASSERT_EQ(0, coll1->_get_index()->_get_search_index().at("launch_year")->size);
    ASSERT_EQ(0, coll1->_get_index()->_get_search_index().at("genres")->size);
    ASSERT_EQ(1, coll1->_get_index()->_get_search_index().at("labels")->size);

    collectionManager.drop_collection("coll1");
}

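// Wildcard pagination: per_page limits, page 0 behavior, the 250-hit cap, and limit_hits enforcement.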
TEST_F(CollectionTest, WildcardQueryReturnsResultsBasedOnPerPageParam) {
    std::vector<std::string> facets;
    spp::sparse_hash_set<std::string> empty;
    nlohmann::json results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 12, 1,
                                                FREQUENCY, {false}, 1000, empty, empty, 10).get();

    ASSERT_EQ(12, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<int>());

    // should match collection size
    results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 100, 1,
                                 FREQUENCY, {false}, 1000, empty, empty, 10).get();

    ASSERT_EQ(25, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<int>());

    // cannot fetch more than in-built limit of 250
    auto res_op = collection->search("*", query_fields, "", facets, sort_fields, {0}, 251, 1,
                                     FREQUENCY, {false}, 1000, empty, empty, 10);
    ASSERT_FALSE(res_op.ok());
    ASSERT_EQ(422, res_op.code());
    ASSERT_STREQ("Only upto 250 hits can be fetched per page.", res_op.error().c_str());

    // when page number is 0, just fetch first page
    results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 0,
                                 FREQUENCY, {false}, 1000, empty, empty, 10).get();
    ASSERT_EQ(10, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<int>());

    // do pagination

    results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 1,
                                 FREQUENCY, {false}, 1000, empty, empty, 10).get();

    ASSERT_EQ(10, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<int>());

    results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 2,
                                 FREQUENCY, {false}, 1000, empty, empty, 10).get();

    ASSERT_EQ(10, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<int>());

    results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 3,
                                 FREQUENCY, {false}, 1000, empty, empty, 10).get();

    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<int>());

    // enforce limit_hits
    auto limit_hits = 20;
    results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 10, 3,
                                 FREQUENCY, {false}, 1000,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1}, limit_hits).get();

    ASSERT_EQ(0, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<int>());

    results = collection->search("*", query_fields, "", facets, sort_fields, {0}, 15, 2,
                                 FREQUENCY, {false}, 1000,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1}, limit_hits).get();

    ASSERT_EQ(5, results["hits"].size());
    ASSERT_EQ(25, results["found"].get<int>());
}

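// remove_if_found() should report whether the document actually existed.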
TEST_F(CollectionTest, RemoveIfFound) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, true),
                                 field("points", field_types::INT32, false)};

    std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};

    coll1 = collectionManager.get_collection("coll1").get();
    if (coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    for(size_t i=0; i<10; i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = "Title " + std::to_string(i);
        doc["points"] = i;

        coll1->add(doc.dump());
    }

    auto res = coll1->search("*", {"title"}, "", {}, sort_fields, {0}, 10, 1,
                             token_ordering::FREQUENCY, {true}, 10, spp::sparse_hash_set<std::string>(),
                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0).get();

    ASSERT_EQ(10, res["found"].get<int>());

    // removing found doc
    Option<bool> found_op = coll1->remove_if_found(0);
    ASSERT_TRUE(found_op.ok());
    ASSERT_TRUE(found_op.get());

    auto get_op = coll1->get("0");
    ASSERT_FALSE(get_op.ok());
    ASSERT_EQ(404, get_op.code());

    // removing doc not found
    found_op = coll1->remove_if_found(100);
    ASSERT_TRUE(found_op.ok());
    ASSERT_FALSE(found_op.get());

    collectionManager.drop_collection("coll1");
}

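// Collection creation should be rejected when a field declares an unknown data type.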
TEST_F(CollectionTest, CreateCollectionInvalidFieldType) {
    std::vector<field> fields = {field("title", "blah", true),
                                 field("points", "int", false)};

    std::vector<sort_by> sort_fields = {sort_by("points", "DESC")};

    collectionManager.drop_collection("coll1");

    auto create_op = collectionManager.create_collection("coll1", 4, fields, "points");

    ASSERT_FALSE(create_op.ok());
    ASSERT_STREQ("Field `title` has an invalid data type `blah`, see docs for supported data types.",
                 create_op.error().c_str());

    collectionManager.drop_collection("coll1");
}

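// Ranking across multiple query fields should be stable regardless of the order in which docs were indexed.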
TEST_F(CollectionTest, MultiFieldRelevance) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Down There by the Train", "Dustin Kensrue"},
        {"Down There by the Train", "Gord Downie"},
        {"State Trooper", "Dustin Kensrue"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("Dustin Kensrue Down There by the Train",
                                 {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                                 {true}, 10).get();

    ASSERT_EQ(3, results["found"].get<size_t>());
    ASSERT_EQ(3, results["hits"].size());

    std::vector<size_t> expected_ids = {0, 1, 2};

    for(size_t i=0; i<expected_ids.size(); i++) {
        ASSERT_EQ(expected_ids[i], std::stoi(results["hits"][i]["document"]["id"].get<std::string>()));
    }

    ASSERT_STREQ("<mark>Down</mark> <mark>There</mark> <mark>by</mark> <mark>the</mark> <mark>Train</mark>",
                 results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());

    ASSERT_STREQ("<mark>Down</mark> <mark>There</mark> <mark>by</mark> <mark>the</mark> <mark>Train</mark>",
                 results["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());

    ASSERT_STREQ("<mark>Dustin</mark> <mark>Kensrue</mark>",
                 results["hits"][2]["highlights"][0]["snippet"].get<std::string>().c_str());

    // remove documents, reindex in another order and search again
    for(size_t i=0; i<expected_ids.size(); i++) {
        coll1->remove_if_found(i, true);
    }

    records = {
        {"State Trooper", "Dustin Kensrue"},
        {"Down There by the Train", "Gord Downie"},
        {"Down There by the Train", "Dustin Kensrue"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    results = coll1->search("Dustin Kensrue Down There by the Train",
                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                            {true}, 10).get();

    ASSERT_EQ(3, results["found"].get<size_t>());
    ASSERT_EQ(3, results["hits"].size());

    expected_ids = {2, 1, 0};

    for(size_t i=0; i<expected_ids.size(); i++) {
        ASSERT_EQ(expected_ids[i], std::stoi(results["hits"][i]["document"]["id"].get<std::string>()));
    }

    // with exclude token syntax
    results = coll1->search("-downie dustin kensrue down there by the train",
                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                            {true}, 10).get();

    ASSERT_EQ(2, results["found"].get<size_t>());
    ASSERT_EQ(2, results["hits"].size());

    expected_ids = {2, 0};

    for(size_t i=0; i<expected_ids.size(); i++) {
        ASSERT_EQ(expected_ids[i], std::stoi(results["hits"][i]["document"]["id"].get<std::string>()));
    }

    collectionManager.drop_collection("coll1");
}

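// Documents matching all query tokens should outrank those that needed dropped tokens, regardless of field weights.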
TEST_F(CollectionTest, MultiFieldRelevance2) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"A Daikon Freestyle", "Ghosts on a Trampoline"},
        {"Leaving on a Jetplane", "Coby Grant"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("on a jetplane",
                                 {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10).get();

    ASSERT_EQ(2, results["found"].get<size_t>());
    ASSERT_EQ(2, results["hits"].size());
    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(0, results["hits"][0]["text_match_info"]["num_tokens_dropped"]);
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(1, results["hits"][1]["text_match_info"]["num_tokens_dropped"]);

    // changing weights to favor artist still favors title because it contains all tokens of the query

    results = coll1->search("on a jetplane",
                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                            {true}, 10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {1, 4}).get();

    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(0, results["hits"][0]["text_match_info"]["num_tokens_dropped"]);
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(1, results["hits"][1]["text_match_info"]["num_tokens_dropped"]);

    // use same weights

    results = coll1->search("on a jetplane",
                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                            {true}, 10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {1, 1}).get();

    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(0, results["hits"][0]["text_match_info"]["num_tokens_dropped"]);
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(1, results["hits"][1]["text_match_info"]["num_tokens_dropped"]);

    // add weights to favor artist without all tokens in a query being found in a field

    results = coll1->search("on a helicopter",
                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                            {true}, 10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {1, 4}).get();

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(1, results["hits"][0]["text_match_info"]["num_tokens_dropped"]);
    ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(1, results["hits"][1]["text_match_info"]["num_tokens_dropped"]);

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionTest, FieldWeightsNotProper) {
    // when weights are not given properly
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    auto results_op = coll1->search("on a jetplane",
                                    {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                                    {true}, 10, spp::sparse_hash_set<std::string>(),
                                    spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                    "<mark>", "</mark>", {1});

    ASSERT_FALSE(results_op.ok());
    ASSERT_STREQ("Number of weights in `query_by_weights` does not match number "
                 "of `query_by` fields.", results_op.error().c_str());

    results_op = coll1->search("on a jetplane",
                               {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                               {true}, 10, spp::sparse_hash_set<std::string>(),
                               spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                               "<mark>", "</mark>", {2, 1});

    ASSERT_FALSE(results_op.ok());
    ASSERT_STREQ("Number of weights in `query_by_weights` does not match number "
                 "of `query_by` fields.", results_op.error().c_str());

    // empty weights are fine (will be defaulted to)

    results_op = coll1->search("on a jetplane",
                               {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                               {true}, 10, spp::sparse_hash_set<std::string>(),
                               spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                               "<mark>", "</mark>", {});

    ASSERT_TRUE(results_op.ok());

    collectionManager.drop_collection("coll1");
}

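// Verifies ranking by the number of dropped query tokens when tokens match across multiple fields.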
TEST_F(CollectionTest, MultiFieldRelevance3) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Taylor Swift Karaoke: reputation", "Taylor Swift"},
        {"Style", "Taylor Swift"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("style taylor swift",
                                 {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                                 {true}, 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 1}).get();

    ASSERT_EQ(2, results["found"].get<size_t>());
    ASSERT_EQ(2, results["hits"].size());

    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(0, results["hits"][0]["text_match_info"]["num_tokens_dropped"]);
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(1, results["hits"][1]["text_match_info"]["num_tokens_dropped"]);

    results = coll1->search("swift",
                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                            {true}, 10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {1, 1}).get();

    ASSERT_EQ(2, results["found"].get<size_t>());
    ASSERT_EQ(2, results["hits"].size());

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(0, results["hits"][0]["text_match_info"]["num_tokens_dropped"]);
    ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_EQ(0, results["hits"][1]["text_match_info"]["num_tokens_dropped"]);

    collectionManager.drop_collection("coll1");
}

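// An exact token match should rank above typo-corrected matches.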
TEST_F(CollectionTest, MultiFieldRelevance4) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Maddras Dreams", "Chennai King"},
        {"Maddurai Express", "Maddura Maddy"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("maddras",
                                 {"title", "artist"}, "", {}, {}, {2}, 10, 1, FREQUENCY,
                                 {true}, 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 1}).get();

    ASSERT_EQ(2, results["found"].get<size_t>());
    ASSERT_EQ(2, results["hits"].size());

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

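// Ranking and highlighting when the same query token matches different fields across documents.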
TEST_F(CollectionTest, MultiFieldRelevance5) {
    Collection *coll1;

    std::vector<field> fields = {field("company_name", field_types::STRING, false),
                                 field("country", field_types::STRING, false),
                                 field("field_a", field_types::STRING, false),
                                 field("num_employees", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "num_employees").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Stark Industries ™", "Canada", "Canadia", "5215"},
        {"Canaida Corp", "United States", "Canadoo", "200"},
        {"Acme Corp", "Mexico", "Canadoo", "300"}
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["company_name"] = records[i][0];
        doc["country"] = records[i][1];
        doc["field_a"] = records[i][2];
        doc["num_employees"] = std::stoi(records[i][3]);

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("Canada",
                                 {"company_name","country","field_a"}, "", {}, {}, {2}, 10, 1, FREQUENCY,
                                 {true}, 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 1, 1}).get();

    ASSERT_EQ(3, results["found"].get<size_t>());
    ASSERT_EQ(3, results["hits"].size());

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());

    results = coll1->search("Canada",
                            {"company_name","field_a","country"}, "", {}, {}, {2}, 10, 1, FREQUENCY,
                            {true}, 10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {1, 1, 1}).get();

    ASSERT_EQ(3, results["found"].get<size_t>());
    ASSERT_EQ(3, results["hits"].size());

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());

    ASSERT_EQ(2, results["hits"][0]["highlights"].size());
    ASSERT_EQ("field_a", results["hits"][0]["highlights"][0]["field"].get<std::string>());
    ASSERT_EQ("<mark>Canadia</mark>", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
    ASSERT_EQ("country", results["hits"][0]["highlights"][1]["field"].get<std::string>());
    ASSERT_EQ("<mark>Canada</mark>", results["hits"][0]["highlights"][1]["snippet"].get<std::string>());

    ASSERT_EQ(1, results["hits"][1]["highlights"].size());
    ASSERT_EQ("field_a", results["hits"][1]["highlights"][0]["field"].get<std::string>());
    ASSERT_EQ("<mark>Canadoo</mark>", results["hits"][1]["highlights"][0]["snippet"].get<std::string>());

    ASSERT_EQ(2, results["hits"][2]["highlights"].size());
    ASSERT_EQ("field_a", results["hits"][2]["highlights"][0]["field"].get<std::string>());
    ASSERT_EQ("<mark>Canadoo</mark>", results["hits"][2]["highlights"][0]["snippet"].get<std::string>());
    ASSERT_EQ("company_name", results["hits"][2]["highlights"][1]["field"].get<std::string>());
    ASSERT_EQ("<mark>Canaida</mark> Corp", results["hits"][2]["highlights"][1]["snippet"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionTest, MultiFieldRelevance6) {
    // with exact match, the number of fields with exact match will not be considered as a ranking signal
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Taylor Swift", "Taylor Swift"},
        {"Taylor Swift Song", "Taylor Swift"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("taylor swift",
                                 {"title", "artist"}, "", {}, {}, {2}, 10, 1, FREQUENCY,
                                 {true}, 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 1}).get();

    ASSERT_EQ(2, results["found"].get<size_t>());
    ASSERT_EQ(2, results["hits"].size());

    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());

    // when exact matches are disabled
    results = coll1->search("taylor swift",
                            {"title", "artist"}, "", {}, {}, {2}, 10, 1, FREQUENCY,
                            {true}, 10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {1, 1}, 100, false).get();

    ASSERT_EQ(2, results["found"].get<size_t>());
    ASSERT_EQ(2, results["hits"].size());

    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

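// Exact title matches should rank above prefix and partial matches.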
TEST_F(CollectionTest, ExactMatch) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Alpha", "DJ"},
        {"Alpha Beta", "DJ"},
        {"Alpha Beta Gamma", "DJ"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("alpha beta",
                                 {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY,
                                 {true}, 10).get();

    ASSERT_EQ(3, results["found"].get<size_t>());
    ASSERT_EQ(3, results["hits"].size());

    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("0", results["hits"][2]["document"]["id"].get<std::string>().c_str());

    results = coll1->search("alpha", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10).get();

    ASSERT_EQ(3, results["found"].get<size_t>());
    ASSERT_EQ(3, results["hits"].size());

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

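// Highlights should be generated for every queried field that matches, including array fields.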
TEST_F(CollectionTest, MultiFieldHighlighting) {
    Collection *coll1;

    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("description", field_types::STRING, false),
                                 field("categories", field_types::STRING_ARRAY, false),
                                 field("points", field_types::INT32, false)};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Best Wireless Vehicle Charger",
         "Easily replenish your cell phone with this wireless charger.",
         "Cell Phones > Cell Phone Accessories > Car Chargers"},

        {"Annie's Song",
         "John Denver",
         "Album > Compilation"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;
        std::vector<std::string> categories;
        StringUtils::split(records[i][2], categories, ">");

        doc["id"] = std::to_string(i);
        doc["name"] = records[i][0];
        doc["description"] = records[i][1];
        doc["categories"] = categories;
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("charger",
                                 {"name","description","categories"}, "", {}, {}, {2}, 10, 1, FREQUENCY,
                                 {true}, 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 1, 1}).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());

    ASSERT_EQ(3, results["hits"][0]["highlights"].size());
    ASSERT_EQ("name", results["hits"][0]["highlights"][0]["field"].get<std::string>());
    ASSERT_EQ("Best Wireless Vehicle <mark>Charger</mark>",
              results["hits"][0]["highlights"][0]["snippet"].get<std::string>());

    ASSERT_EQ("description", results["hits"][0]["highlights"][1]["field"].get<std::string>());
    ASSERT_EQ("Easily replenish your cell phone with this wireless <mark>charger</mark>.",
              results["hits"][0]["highlights"][1]["snippet"].get<std::string>());

    ASSERT_EQ("categories", results["hits"][0]["highlights"][2]["field"].get<std::string>());
    ASSERT_EQ("Car <mark>Charger</mark>s", results["hits"][0]["highlights"][2]["snippets"][0].get<std::string>());

    results = coll1->search("John With Denver",
                            {"description"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                            {true}, 1, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {1}).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());
    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());

    ASSERT_EQ(1, results["hits"][0]["highlights"].size());
    ASSERT_EQ("description", results["hits"][0]["highlights"][0]["field"].get<std::string>());
    ASSERT_EQ("<mark>John</mark> <mark>Denver</mark>",
              results["hits"][0]["highlights"][0]["snippet"].get<std::string>());

    results = coll1->search("Annies song John Denver",
                            {"name","description"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                            {true}, 1, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {1, 1}).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());

    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());

    ASSERT_EQ(2, results["hits"][0]["highlights"].size());
    ASSERT_EQ("name", results["hits"][0]["highlights"][0]["field"].get<std::string>());
    ASSERT_EQ("<mark>Annie's</mark> <mark>Song</mark>",
              results["hits"][0]["highlights"][0]["snippet"].get<std::string>());

    ASSERT_EQ("description", results["hits"][0]["highlights"][1]["field"].get<std::string>());
    ASSERT_EQ("<mark>John</mark> <mark>Denver</mark>",
              results["hits"][0]["highlights"][1]["snippet"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

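// Documents covering more query tokens should rank first, regardless of which field matches.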
TEST_F(CollectionTest, MultiFieldMatchRanking) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Style", "Taylor Swift"},
        {"Blank Space", "Taylor Swift"},
        {"Balance Overkill", "Taylor Swift"},
        {"Cardigan", "Taylor Swift"},
        {"Invisible String", "Taylor Swift"},
        {"The Last Great American Dynasty", "Taylor Swift"},
        {"Mirrorball", "Taylor Swift"},
        {"Peace", "Taylor Swift"},
        {"Betty", "Taylor Swift"},
        {"Mad Woman", "Taylor Swift"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("taylor swift style",
                                 {"artist", "title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5).get();

    ASSERT_EQ(10, results["found"].get<size_t>());
    ASSERT_EQ(3, results["hits"].size());

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("9", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("8", results["hits"][2]["document"]["id"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

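// Same multi-field ranking behavior when the matched fields are string arrays.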
TEST_F(CollectionTest, MultiFieldMatchRankingOnArray) {
    Collection *coll1;

    std::vector<field> fields = {field("name", field_types::STRING, false),
                                 field("strong_skills", field_types::STRING_ARRAY, false),
                                 field("skills", field_types::STRING_ARRAY, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::vector<std::string>>> records = {
        {{"John Snow"}, {"Golang", "Vue", "React"}, {"Docker", "Goa", "Elixir"}},
        {{"Jack Dan"}, {"Golang", "Phoenix", "React"}, {"Docker", "Vue", "Kubernetes"}},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["name"] = records[i][0][0];
        doc["strong_skills"] = records[i][1];
        doc["skills"] = records[i][2];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("golang vue",
                                 {"strong_skills", "skills"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 1).get();

    ASSERT_EQ(2, results["found"].get<size_t>());
    ASSERT_EQ(2, results["hits"].size());

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

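// With heavily skewed query_by_weights, the favored field's match should determine the top hit.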
TEST_F(CollectionTest, MultiFieldMatchRankingOnFieldOrder) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Toxic", "Britney Spears"},
        {"Bad", "Michael Jackson"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("michael jackson toxic",
                                 {"title", "artist"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 6}).get();

    ASSERT_EQ(2, results["found"].get<size_t>());
    ASSERT_EQ(2, results["hits"].size());

    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

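// An exact token ("Roti") should rank above records where it only matches as a prefix ("Rotini").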
TEST_F(CollectionTest, PrefixRankedAfterExactMatch) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Rotini Puttanesca"},
        {"Poulet Roti Tout Simple"},
        {"Chapatis (Roti)"},
        {"School Days Rotini Pasta Salad"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("roti", {"title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {true}, 5).get();

    ASSERT_EQ(4, results["found"].get<size_t>());
    ASSERT_EQ(3, results["hits"].size());

    ASSERT_STREQ("2", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("3", results["hits"][2]["document"]["id"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

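// Highlighting should handle accented and punctuated tokens correctly.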
TEST_F(CollectionTest, HighlightWithAccentedCharacters) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if (coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 4, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Mise T.J. à jour Timy depuis PC"},
        {"Down There by the T.r.a.i.n"},
        {"State Trooper"},
        {"The Google Nexus Q Is Baffling"},
    };

    for (size_t i = 0; i < records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("à jour", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());

    ASSERT_STREQ("Mise T.J. <mark>à</mark> <mark>jour</mark> Timy depuis PC",
                 results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());

    ASSERT_EQ(2, results["hits"][0]["highlights"][0]["matched_tokens"].size());
    ASSERT_STREQ("à", results["hits"][0]["highlights"][0]["matched_tokens"][0].get<std::string>().c_str());
    ASSERT_STREQ("jour", results["hits"][0]["highlights"][0]["matched_tokens"][1].get<std::string>().c_str());

    results = coll1->search("by train", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                            {true}, 10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title").get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());

    ASSERT_STREQ("Down There <mark>by</mark> the <mark>T.r.a.i.n</mark>",
                 results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
    ASSERT_STREQ("Down There <mark>by</mark> the <mark>T.r.a.i.n</mark>",
                 results["hits"][0]["highlights"][0]["value"].get<std::string>().c_str());

    results = coll1->search("state trooper", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());

    ASSERT_STREQ("<mark>State</mark> <mark>Trooper</mark>",
                 results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());

    // test single character highlight

    results = coll1->search("q", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_STREQ("The Google Nexus <mark>Q</mark> Is Baffling",
                 results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}

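// Disabled: special character handling in URLs during search and highlighting.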
TEST_F(CollectionTest, DISABLED_SearchingForRecordsWithSpecialChars) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("url", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Amazon Home", "https://amazon.com/"},
        {"Google Home", "https://google.com///"},
        {"Github Issue", "https://github.com/typesense/typesense/issues/241"},
        {"Amazon Search", "https://www.amazon.com/s?k=phone&ref=nb_sb_noss_2"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["url"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("google",
                                 {"title", "url"}, "", {}, {}, {2}, 10, 1, FREQUENCY).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());

    ASSERT_EQ(2, results["hits"][0]["highlights"].size());
    ASSERT_EQ("<mark>Google</mark> Home", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
    ASSERT_EQ("https://<mark>google</mark>.com///", results["hits"][0]["highlights"][1]["snippet"].get<std::string>());

    results = coll1->search("amazon.com",
                            {"title", "url"}, "", {}, {}, {2}, 10, 1, FREQUENCY).get();

    ASSERT_EQ(3, results["found"].get<size_t>());
    ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("1", results["hits"][2]["document"]["id"].get<std::string>().c_str());

    results = coll1->search("typesense",
                            {"title", "url"}, "", {}, {}, {2}, 10, 1, FREQUENCY).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_STREQ("2", results["hits"][0]["document"]["id"].get<std::string>().c_str());

    results = coll1->search("nb_sb_noss_2",
                            {"title", "url"}, "", {}, {}, {2}, 10, 1, FREQUENCY).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_STREQ("3", results["hits"][0]["document"]["id"].get<std::string>().c_str());

    ASSERT_EQ(1, results["hits"][0]["highlights"].size());
    ASSERT_EQ("https://www.amazon.com/s?k=phone&ref=<mark>nb</mark>_<mark>sb</mark>_<mark>noss</mark>_<mark>2</mark>",
              results["hits"][0]["highlights"][0]["snippet"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

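// Per-field num_typos: each query_by field can carry its own typo tolerance.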
TEST_F(CollectionTest, FieldSpecificNumTypos) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Taylor Swift Karaoke: reputation", "Taylor Swift"},
        {"Taylor & Friends", "Adam Smith"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

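    // num_typos is specified per query_by field: {1, 1} permits one typo on
    // `title` and one on `artist`, so the misspelled "tayylor" matches both records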
auto results = coll1->search("tayylor",
|
|
{"title", "artist"}, "", {}, {}, {1, 1}, 10, 1, FREQUENCY,
|
|
{true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
|
|
"<mark>", "</mark>", {1, 1}).get();
|
|
|
|
ASSERT_EQ(2, results["found"].get<size_t>());
|
|
ASSERT_EQ(2, results["hits"].size());
|
|
|
|
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
|
|
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
|
|
|
|
results = coll1->search("tayylor",
|
|
{"title", "artist"}, "", {}, {}, {0, 1}, 10, 1, FREQUENCY,
|
|
{true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
|
|
"<mark>", "</mark>", {1, 1}).get();
|
|
|
|
ASSERT_EQ(1, results["found"].get<size_t>());
|
|
ASSERT_EQ(1, results["hits"].size());
|
|
|
|
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
|
|
|
|
// must return error when num_typos does not match length of search fields queried
|
|
auto res_op = coll1->search("tayylor",
|
|
{"title"}, "", {}, {}, {0, 1}, 10, 1, FREQUENCY,
|
|
{true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
|
|
"<mark>", "</mark>", {1, 1});
|
|
|
|
ASSERT_FALSE(res_op.ok());
|
|
ASSERT_EQ("Number of weights in `query_by_weights` does not match number of `query_by` fields.", res_op.error());
|
|
|
|
// can use a single typo param for multiple fields
|
|
results = coll1->search("tayylor",
|
|
{"title", "artist"}, "", {}, {}, {1}, 10, 1, FREQUENCY,
|
|
{true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
|
|
"<mark>", "</mark>", {1, 1}).get();
|
|
|
|
ASSERT_EQ(2, results["found"].get<size_t>());
|
|
ASSERT_EQ(2, results["hits"].size());
|
|
|
|
// wildcard search with typos
|
|
results = coll1->search("*",
|
|
{}, "", {}, {}, {1}, 10, 1, FREQUENCY,
|
|
{true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
|
|
"<mark>", "</mark>", {1, 1}).get();
|
|
|
|
ASSERT_EQ(2, results["found"].get<size_t>());
|
|
ASSERT_EQ(2, results["hits"].size());
|
|
|
|
collectionManager.drop_collection("coll1");
|
|
}
|
|
|
|
TEST_F(CollectionTest, BadHighlightingOnText) {
    Collection *coll1;

    std::vector<field> fields = {field("text", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    nlohmann::json doc;

    doc["id"] = "0";
    doc["text"] = "include destruction of natural marine and estuarine\\nhabitats, loss of productive agricultural "
                  "land,\\nand soil erosion. 90 When interviewed, multiple\\nexperts stated that inappropriate land use "
                  "and\\nmanagement is a central factor contributing to\\nenvironmental degradation in the "
                  "Castries-Gros\\nIslet Corridor. 91 The construction is placing greater\\nstress on natural resources "
                  "and biodiversity, and\\nthe capacity to produce food and retain freshwater\\nhas been diminished. "
                  "92 Moreover, increased\\nwater consumption by the tourism sector, when\\ncompounded by climate "
                  "change, is increasing food\\nand water insecurity throughout Saint Lucia, as well\\nas suppressing "
                  "long-term growth prospects. 93";

    doc["points"] = 0;

    ASSERT_TRUE(coll1->add(doc.dump()).ok());

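    // the text contains literal "\n" escape sequences; highlighting should
    // produce a clean snippet around the matched tokens without mangling them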
auto results = coll1->search("natural saint lucia", {"text"}, "", {}, {}, {1}, 10, 1, FREQUENCY,
|
|
{true}, 10).get();
|
|
|
|
ASSERT_EQ(1, results["found"].get<size_t>());
|
|
ASSERT_EQ(1, results["hits"].size());
|
|
|
|
ASSERT_STREQ("food\\nand water insecurity throughout <mark>Saint</mark> <mark>Lucia</mark>, as well\\nas suppressing long-term",
|
|
results["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
|
|
|
|
ASSERT_EQ(2, results["hits"][0]["highlights"][0]["matched_tokens"].size());
|
|
ASSERT_STREQ("Saint", results["hits"][0]["highlights"][0]["matched_tokens"][0].get<std::string>().c_str());
|
|
ASSERT_STREQ("Lucia", results["hits"][0]["highlights"][0]["matched_tokens"][1].get<std::string>().c_str());
|
|
|
|
collectionManager.drop_collection("coll1");
|
|
}
|
|
|
|
TEST_F(CollectionTest, FieldLevelPrefixConfiguration) {
    Collection *coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::vector<std::string>> records = {
        {"Taylor Swift Karaoke: reputation", "Taylor Swift"},
        {"Style", "Taylor Swift"},
    };

    for(size_t i=0; i<records.size(); i++) {
        nlohmann::json doc;

        doc["id"] = std::to_string(i);
        doc["title"] = records[i][0];
        doc["artist"] = records[i][1];
        doc["points"] = i;

        ASSERT_TRUE(coll1->add(doc.dump()).ok());
    }

    auto results = coll1->search("taylo",
                                 {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                                 {true, false}, 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 1}).get();

    ASSERT_EQ(1, results["found"].get<size_t>());
    ASSERT_EQ(1, results["hits"].size());

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());

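    // with prefix matching enabled on both fields ({true, true}), "taylo" also
    // prefix-matches "Taylor Swift" in `artist`, so the second record is returned too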
results = coll1->search("taylo",
|
|
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
|
|
{true, true}, 10, spp::sparse_hash_set<std::string>(),
|
|
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
|
|
"<mark>", "</mark>", {1, 1}).get();
|
|
|
|
ASSERT_EQ(2, results["found"].get<size_t>());
|
|
ASSERT_EQ(2, results["hits"].size());
|
|
|
|
ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
|
|
ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());
|
|
|
|
collectionManager.drop_collection("coll1");
|
|
}
|
|
|
|
TEST_F(CollectionTest, QueryParsingForPhraseSearch) {
    Collection* coll1;

    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    coll1 = collectionManager.get_collection("coll1").get();
    if (coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    std::vector<std::string> q_include_tokens, q_unstemmed_tokens;
    std::vector<std::vector<std::string>> q_exclude_tokens;
    std::vector<std::vector<std::string>> q_phrases;

    std::string q = R"(the "phrase search" query)";
    /*coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false);

    ASSERT_EQ(2, q_include_tokens.size());
    ASSERT_EQ("the", q_include_tokens[0]);
    ASSERT_EQ("query", q_include_tokens[1]);
    ASSERT_EQ(1, q_phrases.size());
    ASSERT_EQ(2, q_phrases[0].size());
    ASSERT_EQ("phrase", q_phrases[0][0]);
    ASSERT_EQ("search", q_phrases[0][1]);
    */
    // quoted string has trailing padded space

    q = R"("space padded " query)";
    q_include_tokens.clear();
    q_exclude_tokens.clear();
    q_unstemmed_tokens.clear();
    q_phrases.clear();

    coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false);
    ASSERT_EQ(1, q_include_tokens.size());
    ASSERT_EQ("query", q_include_tokens[0]);
    ASSERT_EQ(1, q_phrases.size());
    ASSERT_EQ(2, q_phrases[0].size());
    ASSERT_EQ("space", q_phrases[0][0]);
    ASSERT_EQ("padded", q_phrases[0][1]);

    // multiple quoted strings

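    // when the query consists only of phrases, the include tokens collapse
    // to the wildcard token "*"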
q = R"("first phrase" "second phrase")";
|
|
q_include_tokens.clear();
|
|
q_exclude_tokens.clear();
|
|
q_phrases.clear();
|
|
|
|
coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false);
|
|
ASSERT_EQ(1, q_include_tokens.size());
|
|
ASSERT_EQ("*", q_include_tokens[0]);
|
|
ASSERT_EQ(2, q_phrases.size());
|
|
ASSERT_EQ(2, q_phrases[0].size());
|
|
ASSERT_EQ("first", q_phrases[0][0]);
|
|
ASSERT_EQ("phrase", q_phrases[0][1]);
|
|
ASSERT_EQ("second", q_phrases[1][0]);
|
|
ASSERT_EQ("phrase", q_phrases[1][1]);
|
|
|
|
// single quoted string
|
|
|
|
q = R"("hello")";
|
|
q_include_tokens.clear();
|
|
q_exclude_tokens.clear();
|
|
q_phrases.clear();
|
|
|
|
coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false);
|
|
ASSERT_EQ(1, q_include_tokens.size());
|
|
ASSERT_EQ("*", q_include_tokens[0]);
|
|
ASSERT_EQ(1, q_phrases.size());
|
|
ASSERT_EQ(1, q_phrases[0].size());
|
|
ASSERT_EQ("hello", q_phrases[0][0]);
|
|
|
|
// stray trailing quote
|
|
|
|
q = R"(hello")";
|
|
q_include_tokens.clear();
|
|
q_exclude_tokens.clear();
|
|
q_phrases.clear();
|
|
|
|
coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false);
|
|
ASSERT_EQ(1, q_include_tokens.size());
|
|
ASSERT_EQ("hello", q_include_tokens[0]);
|
|
ASSERT_EQ(0, q_phrases.size());
|
|
|
|
// padded space one either side of quote
|
|
q = R"("some query " here)";
|
|
q_include_tokens.clear();
|
|
q_exclude_tokens.clear();
|
|
q_phrases.clear();
|
|
|
|
coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false);
|
|
ASSERT_EQ(1, q_include_tokens.size());
|
|
ASSERT_EQ("here", q_include_tokens[0]);
|
|
ASSERT_EQ(1, q_phrases.size());
|
|
ASSERT_EQ(2, q_phrases[0].size());
|
|
ASSERT_EQ("some", q_phrases[0][0]);
|
|
ASSERT_EQ("query", q_phrases[0][1]);
|
|
|
|
// with exclude operator
|
|
q = R"(-"some phrase" here)";
|
|
q_include_tokens.clear();
|
|
q_exclude_tokens.clear();
|
|
q_phrases.clear();
|
|
coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false);
|
|
ASSERT_EQ(1, q_include_tokens.size());
|
|
ASSERT_EQ("here", q_include_tokens[0]);
|
|
ASSERT_EQ(0, q_phrases.size());
|
|
ASSERT_EQ(1, q_exclude_tokens.size());
|
|
ASSERT_EQ(2, q_exclude_tokens[0].size());
|
|
ASSERT_EQ("some", q_exclude_tokens[0][0]);
|
|
ASSERT_EQ("phrase", q_exclude_tokens[0][1]);
|
|
|
|
// with multiple exclude operators
|
|
q = R"(-"some phrase" here -token)";
|
|
q_include_tokens.clear();
|
|
q_exclude_tokens.clear();
|
|
q_phrases.clear();
|
|
coll1->parse_search_query(q, q_include_tokens, q_unstemmed_tokens, q_exclude_tokens, q_phrases, "en", false);
|
|
ASSERT_EQ(1, q_include_tokens.size());
|
|
ASSERT_EQ("here", q_include_tokens[0]);
|
|
ASSERT_EQ(0, q_phrases.size());
|
|
ASSERT_EQ(2, q_exclude_tokens.size());
|
|
ASSERT_EQ(2, q_exclude_tokens[0].size());
|
|
ASSERT_EQ("some", q_exclude_tokens[0][0]);
|
|
ASSERT_EQ("phrase", q_exclude_tokens[0][1]);
|
|
ASSERT_EQ(1, q_exclude_tokens[1].size());
|
|
ASSERT_EQ("token", q_exclude_tokens[1][0]);
|
|
|
|
collectionManager.drop_collection("coll1");
|
|
}
|
|
|
|
TEST_F(CollectionTest, WildcardQueryBy) {
    nlohmann::json schema = R"({
        "name": "posts",
        "enable_nested_fields": true,
        "fields": [
            {"name": "username", "type": "string", "facet": true},
            {"name": "user.rank", "type": "int32", "facet": true},
            {"name": "user.bio", "type": "string"},
            {"name": "likes", "type": "int32"},
            {"name": "content", "type": "object"}
        ],
        "default_sorting_field": "likes"
    })"_json;

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();

    std::vector<std::string> json_lines = {
        R"({"id": "124","username": "user_a","user": {"rank": 100,"bio": "Hi! I'm user_a"},"likes": 5215,"content": {"title": "title 1","body": "body 1 user_a"}})",
        R"({"id": "125","username": "user_b","user": {"rank": 50,"bio": "user_b here, nice to meet you!"},"likes": 5215,"content": {"title": "title 2","body": "body 2 user_b"}})"
    };

    for (auto const& json: json_lines){
        auto add_op = coll->add(json);
        if (!add_op.ok()) {
            LOG(INFO) << add_op.error();
        }
        ASSERT_TRUE(add_op.ok());
    }

    // * matches username, user.bio, content.title, content.body
    auto result = coll->search("user_a", {"*"}, "", {}, {}, {0}).get();

    ASSERT_EQ(1, result["found"].get<size_t>());
    ASSERT_EQ(1, result["hits"].size());

    ASSERT_EQ("Hi! I'm <mark>user_a</mark>",
              result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());
    ASSERT_EQ("<mark>user_a</mark>",
              result["hits"][0]["highlight"]["username"]["snippet"].get<std::string>());
    // ASSERT_EQ("body 1 <mark>user_a</mark>",
    //           result["hits"][0]["highlight"]["content"]["body"]["snippet"].get<std::string>());

    // user* matches username and user.bio
    result = coll->search("user_a", {"user*"}, "", {}, {}, {0}).get();

    ASSERT_EQ(1, result["found"].get<size_t>());
    ASSERT_EQ(1, result["hits"].size());

    ASSERT_EQ("Hi! I'm <mark>user_a</mark>",
              result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());
    ASSERT_EQ("<mark>user_a</mark>",
              result["hits"][0]["highlight"]["username"]["snippet"].get<std::string>());

    // user.* matches user.bio
    result = coll->search("user_a", {"user.*"}, "", {}, {}, {0}).get();

    ASSERT_EQ(1, result["found"].get<size_t>());
    ASSERT_EQ(1, result["hits"].size());

    ASSERT_EQ("Hi! I'm <mark>user_a</mark>",
              result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());

    // user.rank cannot be queried
    result = coll->search("100", {"user*"}, "", {}, {}, {0}).get();
    ASSERT_EQ(0, result["found"].get<size_t>());
    ASSERT_EQ(0, result["hits"].size());

    // No matching field for query_by
    auto error = coll->search("user_a", {"foo*"}, "", {}, {}, {0}).error();
    ASSERT_EQ("No string or string array field found matching the pattern `foo*` in the schema.", error);
}

TEST_F(CollectionTest, WildcardHighlightFields) {
    nlohmann::json schema = R"({
        "name": "posts",
        "enable_nested_fields": true,
        "fields": [
            {"name": "user_name", "type": "string", "facet": true},
            {"name": "user", "type": "object"}
        ]
    })"_json;

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();

    auto add_op = coll->add(R"({"id": "124","user_name": "user_a","user": {"rank": 100,"phone": "+91 123123123"}})");
    if (!add_op.ok()) {
        LOG(INFO) << add_op.error();
    }
    ASSERT_TRUE(add_op.ok());

    spp::sparse_hash_set<std::string> dummy_include_exclude;
    std::string highlight_fields = "user*";
    // user* matches user_name, user.rank and user.phone
    auto result = coll->search("123", {"user"}, "", {}, {}, {0},
                               10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
                               30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
                               true, false, true, highlight_fields).get();

    ASSERT_EQ(1, result["found"].get<size_t>());
    ASSERT_EQ(1, result["hits"].size());
    ASSERT_EQ(1, result["hits"][0]["highlight"].size());
    ASSERT_EQ("+91 <mark>123</mark>123123", result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get<std::string>());

    highlight_fields = "user.*";
    // user.* matches user.rank and user.phone
    result = coll->search("+91", {"user"}, "", {}, {}, {0},
                          10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
                          30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
                          true, false, true, highlight_fields).get();

    ASSERT_EQ(1, result["found"].get<size_t>());
    ASSERT_EQ(1, result["hits"].size());
    ASSERT_EQ(1, result["hits"][0]["highlight"].size());
    ASSERT_EQ("+<mark>91</mark> 123123123",
              result["hits"][0]["highlight"]["user"]["phone"]["snippet"].get<std::string>());

    highlight_fields = "user*";
    // user* matches user_name, user.rank and user.phone
    result = coll->search("user_a", {"user_name"}, "", {}, {}, {0},
                          10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
                          30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
                          true, false, true, highlight_fields).get();

    ASSERT_EQ(1, result["found"].get<size_t>());
    ASSERT_EQ(1, result["hits"].size());
    ASSERT_EQ(1, result["hits"][0]["highlight"].size());
    ASSERT_EQ("<mark>user_a</mark>",
              result["hits"][0]["highlight"]["user_name"]["snippet"].get<std::string>());

    highlight_fields = "user.*";
    // user.* matches user.rank and user.phone
    result = coll->search("user_a", {"user_name"}, "", {}, {}, {0},
                          10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
                          30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
                          true, false, true, highlight_fields).get();

    ASSERT_EQ(1, result["found"].get<size_t>());
    ASSERT_EQ(1, result["hits"].size());
    ASSERT_EQ(0, result["hits"][0]["highlight"].size());

    highlight_fields = "foo*";
    // No matching field for highlight_fields
    result = coll->search("user_a", {"user_name"}, "", {}, {}, {0},
                          10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
                          30, 4, "", Index::TYPO_TOKENS_THRESHOLD, "", "", {}, 3, "<mark>", "</mark>", {}, UINT32_MAX,
                          true, false, true, highlight_fields).get();

    ASSERT_EQ(1, result["found"].get<size_t>());
    ASSERT_EQ(1, result["hits"].size());
    ASSERT_EQ(0, result["hits"][0]["highlight"].size());
}

TEST_F(CollectionTest, WildcardHighlightFullFields) {
    nlohmann::json schema = R"({
        "name": "posts",
        "enable_nested_fields": true,
        "fields": [
            {"name": "user_name", "type": "string", "facet": true},
            {"name": "user.rank", "type": "int32", "facet": true},
            {"name": "user.phone", "type": "string"},
            {"name": "user.bio", "type": "string"}
        ]
    })"_json;

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();

    auto json = R"({
        "id": "124",
        "user_name": "user_a",
        "user": {
            "rank": 100,
            "phone": "+91 123123123"
        }
    })"_json;
    std::string bio = "Once there was a middle-aged boy named User_a who was an avid swimmer."
                      "He had been swimming competitively for most of his life, and had even competed in several national competitions."
                      "However, despite his passion and talent for the sport, he had never quite managed to win that elusive gold medal."
                      "Determined to change that, User_a began training harder than ever before."
                      "He woke up early every morning to swim laps before work and spent his evenings at the pool as well."
                      "Despite the grueling schedule, he never once complained."
                      "Instead, he reminded himself of his goal: to become a national champion.";
    json["user"]["bio"] = bio;

    auto add_op = coll->add(json.dump());
    if (!add_op.ok()) {
        LOG(INFO) << add_op.error();
    }
    ASSERT_TRUE(add_op.ok());

    spp::sparse_hash_set<std::string> dummy_include_exclude;
    std::string highlight_full_fields = "user*";
    // user* matches user_name, user.bio
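    // unlike the snippet, highlight_full_fields returns the entire field value
    // with every match marked, under the highlight's "value" key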
auto result = coll->search("user_a", {"*"}, "", {}, {}, {0},
|
|
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
|
|
30, 4, highlight_full_fields).get();
|
|
|
|
ASSERT_EQ(1, result["found"].get<size_t>());
|
|
ASSERT_EQ(1, result["hits"].size());
|
|
|
|
ASSERT_EQ("a middle-aged boy named <mark>User_a</mark> who was an avid",
|
|
result["hits"][0]["highlight"]["user"]["bio"]["snippet"].get<std::string>());
|
|
|
|
std::string highlighted_value = "Once there was a middle-aged boy named <mark>User_a</mark> who was an avid swimmer."
|
|
"He had been swimming competitively for most of his life, and had even competed in several national competitions."
|
|
"However, despite his passion and talent for the sport, he had never quite managed to win that elusive gold medal."
|
|
"Determined to change that, <mark>User_a</mark> began training harder than ever before."
|
|
"He woke up early every morning to swim laps before work and spent his evenings at the pool as well."
|
|
"Despite the grueling schedule, he never once complained."
|
|
"Instead, he reminded himself of his goal: to become a national champion.";
|
|
ASSERT_EQ( highlighted_value, result["hits"][0]["highlight"]["user"]["bio"]["value"].get<std::string>());
|
|
ASSERT_EQ("<mark>user_a</mark>",
|
|
result["hits"][0]["highlight"]["user_name"]["value"].get<std::string>());
|
|
|
|
highlight_full_fields = "user.*";
|
|
// user.* matches user.bio
|
|
result = coll->search("user_a", {"*"}, "", {}, {}, {0},
|
|
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
|
|
30, 4, highlight_full_fields).get();
|
|
|
|
ASSERT_EQ(1, result["found"].get<size_t>());
|
|
ASSERT_EQ(1, result["hits"].size());
|
|
|
|
ASSERT_EQ(highlighted_value, result["hits"][0]["highlight"]["user"]["bio"]["value"].get<std::string>());
|
|
ASSERT_EQ(0, result["hits"][0]["highlight"]["user_name"].count("value"));
|
|
|
|
highlight_full_fields = "foo*";
|
|
// No matching field for highlight_fields
|
|
result = coll->search("user_a", {"*"}, "", {}, {}, {0},
|
|
10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "",
|
|
30, 4, highlight_full_fields).get();
|
|
|
|
ASSERT_EQ(0, result["hits"][0]["highlight"]["user"]["bio"].count("value"));
|
|
ASSERT_EQ(0, result["hits"][0]["highlight"]["user_name"].count("value"));
|
|
}
|
|
|
|
|
|
TEST_F(CollectionTest, SemanticSearchTest) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();
    nlohmann::json object;
    object["name"] = "apple";
    auto add_op = coll->add(object.dump());
    ASSERT_TRUE(add_op.ok());

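    // the ts/e5-small model yields 384-dimensional embeddings, auto-generated
    // from the `name` field and stored on the document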
ASSERT_EQ("apple", add_op.get()["name"]);
|
|
ASSERT_EQ(384, add_op.get()["embedding"].size());
|
|
|
|
spp::sparse_hash_set<std::string> dummy_include_exclude;
|
|
|
|
auto search_res_op = coll->search("apple", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "");
|
|
|
|
ASSERT_TRUE(search_res_op.ok());
|
|
auto search_res = search_res_op.get();
|
|
ASSERT_EQ(1, search_res["found"].get<size_t>());
|
|
ASSERT_EQ(1, search_res["hits"].size());
|
|
ASSERT_EQ("apple", search_res["hits"][0]["document"]["name"].get<std::string>());
|
|
ASSERT_EQ(384, search_res["hits"][0]["document"]["embedding"].size());
|
|
}
|
|
|
|
TEST_F(CollectionTest, InvalidSemanticSearch) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    LOG(INFO) << "op.error(): " << op.error();
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();
    nlohmann::json object;
    object["name"] = "apple";
    auto add_op = coll->add(object.dump());
    ASSERT_TRUE(add_op.ok());
    ASSERT_EQ("apple", add_op.get()["name"]);
    ASSERT_EQ(384, add_op.get()["embedding"].size());

    spp::sparse_hash_set<std::string> dummy_include_exclude;

    // querying the same embedding field twice is invalid
    auto search_res_op = coll->search("apple", {"embedding", "embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "");

    ASSERT_FALSE(search_res_op.ok());
}

TEST_F(CollectionTest, HybridSearch) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();
    nlohmann::json object;
    object["name"] = "apple";
    auto add_op = coll->add(object.dump());
    LOG(INFO) << "add_op.error(): " << add_op.error();
    ASSERT_TRUE(add_op.ok());

    ASSERT_EQ("apple", add_op.get()["name"]);
    ASSERT_EQ(384, add_op.get()["embedding"].size());

    spp::sparse_hash_set<std::string> dummy_include_exclude;
    auto search_res_op = coll->search("apple", {"name","embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "");
    ASSERT_TRUE(search_res_op.ok());
    auto search_res = search_res_op.get();
    ASSERT_EQ(1, search_res["found"].get<size_t>());
    ASSERT_EQ(1, search_res["hits"].size());
    ASSERT_EQ("apple", search_res["hits"][0]["document"]["name"].get<std::string>());
    ASSERT_EQ(384, search_res["hits"][0]["document"]["embedding"].size());
}

// TEST_F(CollectionTest, EmbedFielsTest) {
//     nlohmann::json schema = R"({
//         "name": "objects",
//         "fields": [
//             {"name": "name", "type": "string"},
//             {"name": "embedding", "type":"float[]", "embed":{"from": ["name"]}
//         ]
//     })"_json;

//     EmbedderManager::set_model_dir("/tmp/typesense_test/models");
//
//     auto op = collectionManager.create_collection(schema);
//     ASSERT_TRUE(op.ok());
//     Collection* coll = op.get();

//     nlohmann::json object = R"({
//         "name": "apple"
//     })"_json;

//     auto embed_op = coll->embed_fields(object);

//     ASSERT_TRUE(embed_op.ok());

//     ASSERT_EQ("apple", object["name"]);
//     ASSERT_EQ(384, object["embedding"].get<std::vector<float>>().size());
// }

TEST_F(CollectionTest, HybridSearchRankFusionTest) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();
    nlohmann::json object;
    object["name"] = "butter";
    auto add_op = coll->add(object.dump());
    ASSERT_TRUE(add_op.ok());

    object["name"] = "butterball";
    add_op = coll->add(object.dump());
    ASSERT_TRUE(add_op.ok());

    object["name"] = "butterfly";
    add_op = coll->add(object.dump());
    ASSERT_TRUE(add_op.ok());

    spp::sparse_hash_set<std::string> dummy_include_exclude;
    auto search_res_op = coll->search("butter", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "");
    ASSERT_TRUE(search_res_op.ok());
    auto search_res = search_res_op.get();
    ASSERT_EQ(3, search_res["found"].get<size_t>());
    ASSERT_EQ(3, search_res["hits"].size());
    // Vector search order:
    // 1. butter
    // 2. butterball
    // 3. butterfly
    ASSERT_EQ("butter", search_res["hits"][0]["document"]["name"].get<std::string>());
    ASSERT_EQ("butterball", search_res["hits"][1]["document"]["name"].get<std::string>());
    ASSERT_EQ("butterfly", search_res["hits"][2]["document"]["name"].get<std::string>());

    search_res_op = coll->search("butter", {"name"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "");
    ASSERT_TRUE(search_res_op.ok());
    search_res = search_res_op.get();
    ASSERT_EQ(3, search_res["found"].get<size_t>());
    ASSERT_EQ(3, search_res["hits"].size());
    // Keyword search order:
    // 1. butter
    // 2. butterfly
    // 3. butterball
    ASSERT_EQ("butter", search_res["hits"][0]["document"]["name"].get<std::string>());
    ASSERT_EQ("butterfly", search_res["hits"][1]["document"]["name"].get<std::string>());
    ASSERT_EQ("butterball", search_res["hits"][2]["document"]["name"].get<std::string>());

    search_res_op = coll->search("butter", {"name","embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "");
    ASSERT_TRUE(search_res_op.ok());
    search_res = search_res_op.get();
    ASSERT_EQ(3, search_res["found"].get<size_t>());
    ASSERT_EQ(3, search_res["hits"].size());
    // Hybrid search with rank fusion order:
    // 1. butter     (1/1 * 0.7) + (1/1 * 0.3) = 1
    // 2. butterfly  (1/2 * 0.7) + (1/3 * 0.3) = 0.45
    // 3. butterball (1/3 * 0.7) + (1/2 * 0.3) = 0.383
    ASSERT_EQ("butter", search_res["hits"][0]["document"]["name"].get<std::string>());
    ASSERT_EQ("butterfly", search_res["hits"][1]["document"]["name"].get<std::string>());
    ASSERT_EQ("butterball", search_res["hits"][2]["document"]["name"].get<std::string>());

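    // reciprocal-rank fusion: score = 0.7 * (1 / keyword_rank) + 0.3 * (1 / vector_rank)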
    ASSERT_FLOAT_EQ((1.0/1.0 * 0.7) + (1.0/1.0 * 0.3), search_res["hits"][0]["hybrid_search_info"]["rank_fusion_score"].get<float>());
    ASSERT_FLOAT_EQ((1.0/2.0 * 0.7) + (1.0/3.0 * 0.3), search_res["hits"][1]["hybrid_search_info"]["rank_fusion_score"].get<float>());
    ASSERT_FLOAT_EQ((1.0/3.0 * 0.7) + (1.0/2.0 * 0.3), search_res["hits"][2]["hybrid_search_info"]["rank_fusion_score"].get<float>());
}

TEST_F(CollectionTest, WildcardSearchWithEmbeddingField) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();

    spp::sparse_hash_set<std::string> dummy_include_exclude;
    auto search_res_op = coll->search("*", {"name","embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "");

    ASSERT_TRUE(search_res_op.ok());
}

TEST_F(CollectionTest, CreateModelDirIfNotExists) {
    system("mkdir -p /tmp/typesense_test/new_models_dir");
    system("rm -rf /tmp/typesense_test/new_models_dir");
    EmbedderManager::set_model_dir("/tmp/typesense_test/new_models_dir");

    // check if model dir is created
    ASSERT_TRUE(std::filesystem::exists("/tmp/typesense_test/new_models_dir"));
}

TEST_F(CollectionTest, EmbedStringArrayField) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "names", "type": "string[]"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["names"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();

    nlohmann::json doc;
    doc["names"].push_back("butter");
    doc["names"].push_back("butterfly");
    doc["names"].push_back("butterball");

    auto add_op = coll->add(doc.dump());
    ASSERT_TRUE(add_op.ok());
}

TEST_F(CollectionTest, MissingFieldForEmbedding) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "names", "type": "string[]"},
            {"name": "category", "type": "string", "optional": true},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["names", "category"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();

    nlohmann::json doc;
    doc["names"].push_back("butter");
    doc["names"].push_back("butterfly");
    doc["names"].push_back("butterball");

    // `category` is optional, so a document missing it should still be embedded
    auto add_op = coll->add(doc.dump());
    ASSERT_TRUE(add_op.ok());
}

TEST_F(CollectionTest, WrongTypeInEmbedFrom) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "category", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": [1122], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_FALSE(op.ok());
    ASSERT_EQ("Property `embed.from` must contain only field names as strings.", op.error());
}

TEST_F(CollectionTest, WrongTypeForEmbedding) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "category", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["category"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();

    nlohmann::json doc;
    doc["category"] = 1;

    auto add_op = validator_t::validate_embed_fields(doc, coll->get_embedding_fields(), coll->get_schema(), true);
    ASSERT_FALSE(add_op.ok());
    ASSERT_EQ("Field `category` has malformed data.", add_op.error());
}

TEST_F(CollectionTest, WrongTypeOfElementForEmbeddingInStringArray) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "category", "type": "string[]"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["category"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();

    nlohmann::json doc;
    doc["category"].push_back(33);

    auto add_op = validator_t::validate_embed_fields(doc, coll->get_embedding_fields(), coll->get_schema(), true);
    ASSERT_FALSE(add_op.ok());
    ASSERT_EQ("Field `category` has malformed data.", add_op.error());
}

TEST_F(CollectionTest, UpdateEmbeddingsForUpdatedDocument) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    Collection* coll = op.get();

    nlohmann::json doc;
    doc["name"] = "butter";

    auto add_op = coll->add(doc.dump());
    ASSERT_TRUE(add_op.ok());

    // get id of the document
    auto id = add_op.get()["id"];
    // get embedding field from the document
    auto embedding_field = add_op.get()["embedding"].get<std::vector<float>>();
    ASSERT_EQ(384, embedding_field.size());

    // update the document
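    // changing `name` (a source field listed in embed.from) should trigger
    // re-embedding, producing a different vector for the same document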
    nlohmann::json update_doc;
    update_doc["name"] = "butterball";
    std::string dirty_values;

    auto update_op = coll->update_matching_filter("id:=" + id.get<std::string>(), update_doc.dump(), dirty_values);
    ASSERT_TRUE(update_op.ok());
    ASSERT_EQ(1, update_op.get()["num_updated"]);

    // get the document again
    auto get_op = coll->get(id);
    ASSERT_TRUE(get_op.ok());
    auto updated_embedding_field = get_op.get()["embedding"].get<std::vector<float>>();

    // check if the embedding field is updated
    ASSERT_NE(embedding_field, updated_embedding_field);
}

TEST_F(CollectionTest, CreateCollectionWithOpenAI) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "openai/text-embedding-ada-002"}}}
        ]
    })"_json;

    if (std::getenv("api_key") == nullptr) {
        LOG(INFO) << "Skipping test as api_key is not set.";
        return;
    }

    auto api_key = std::string(std::getenv("api_key"));
    schema["fields"][1]["embed"]["model_config"]["api_key"] = api_key;
    EmbedderManager::set_model_dir("/tmp/typesense_test/models");
    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());

    // create one more collection
    schema = R"({
        "name": "objects2",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "openai/text-embedding-ada-002"}}}
        ]
    })"_json;
    schema["fields"][1]["embed"]["model_config"]["api_key"] = api_key;
    op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
}

TEST_F(CollectionTest, CreateOpenAIEmbeddingField) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "openai/text-embedding-ada-002"}}}
        ]
    })"_json;

    if (std::getenv("api_key") == nullptr) {
        LOG(INFO) << "Skipping test as api_key is not set.";
        return;
    }

    auto api_key = std::string(std::getenv("api_key"));
    schema["fields"][1]["embed"]["model_config"]["api_key"] = api_key;
    EmbedderManager::set_model_dir("/tmp/typesense_test/models");
    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    auto summary = op.get()->get_summary_json();
    ASSERT_EQ("openai/text-embedding-ada-002", summary["fields"][1]["embed"]["model_config"]["model_name"]);
    ASSERT_EQ(1536, summary["fields"][1]["num_dim"]);

    nlohmann::json doc;
    doc["name"] = "butter";

    auto add_op = op.get()->add(doc.dump());
    ASSERT_TRUE(add_op.ok());
    ASSERT_EQ(1536, add_op.get()["embedding"].size());
}

TEST_F(CollectionTest, HideOpenAIApiKey) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "openai/text-embedding-ada-002"}}}
        ]
    })"_json;

    if (std::getenv("api_key") == nullptr) {
        LOG(INFO) << "Skipping test as api_key is not set.";
        return;
    }

    auto api_key = std::string(std::getenv("api_key"));
    schema["fields"][1]["embed"]["model_config"]["api_key"] = api_key;
    EmbedderManager::set_model_dir("/tmp/typesense_test/models");
    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());
    auto summary = op.get()->get_summary_json();
    // everything after the first 5 characters of the api key is masked with '*'
    ASSERT_EQ(summary["fields"][1]["embed"]["model_config"]["api_key"].get<std::string>(), api_key.replace(5, api_key.size() - 5, api_key.size() - 5, '*'));
}

TEST_F(CollectionTest, PrefixSearchDisabledForOpenAI) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "openai/text-embedding-ada-002"}}}
        ]
    })"_json;

    if (std::getenv("api_key") == nullptr) {
        LOG(INFO) << "Skipping test as api_key is not set.";
        return;
    }

    auto api_key = std::string(std::getenv("api_key"));
    schema["fields"][1]["embed"]["model_config"]["api_key"] = api_key;
    EmbedderManager::set_model_dir("/tmp/typesense_test/models");
    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());

    nlohmann::json doc;
    doc["name"] = "butter";

    auto add_op = op.get()->add(doc.dump());
    ASSERT_TRUE(add_op.ok());

    spp::sparse_hash_set<std::string> dummy_include_exclude;
    auto search_res_op = op.get()->search("dummy", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "");

    ASSERT_FALSE(search_res_op.ok());
    ASSERT_EQ("Prefix search is not supported for remote embedders. Please set `prefix=false` as an additional search parameter to disable prefix searching.", search_res_op.error());

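    // retrying the same query with prefix disabled ({false}) should succeed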
search_res_op = op.get()->search("dummy", {"embedding"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "");
|
|
ASSERT_TRUE(search_res_op.ok());
|
|
}
|
|
|
|
|
|
TEST_F(CollectionTest, MoreThanOneEmbeddingField) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "name2", "type": "string"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["name"], "model_config": {"model_name": "ts/e5-small"}}},
            {"name": "embedding2", "type":"float[]", "embed":{"from": ["name2"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());

    auto coll = op.get();

    nlohmann::json doc;
    doc["name"] = "butter";
    doc["name2"] = "butterball";

    auto add_op = validator_t::validate_embed_fields(doc, op.get()->get_embedding_fields(), op.get()->get_schema(), true);
    ASSERT_TRUE(add_op.ok());

    spp::sparse_hash_set<std::string> dummy_include_exclude;

    auto search_res_op = coll->search("butter", {"name", "embedding", "embedding2"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD, dummy_include_exclude, dummy_include_exclude, 10, "", 30, 4, "");

    ASSERT_FALSE(search_res_op.ok());

    ASSERT_EQ("Only one embedding field is allowed in the query.", search_res_op.error());
}

TEST_F(CollectionTest, EmbeddingFieldEmptyArrayInDocument) {
    nlohmann::json schema = R"({
        "name": "objects",
        "fields": [
            {"name": "names", "type": "string[]"},
            {"name": "embedding", "type":"float[]", "embed":{"from": ["names"], "model_config": {"model_name": "ts/e5-small"}}}
        ]
    })"_json;

    EmbedderManager::set_model_dir("/tmp/typesense_test/models");

    auto op = collectionManager.create_collection(schema);
    ASSERT_TRUE(op.ok());

    auto coll = op.get();

    nlohmann::json doc;
    doc["names"] = nlohmann::json::array();

    // try adding
    auto add_op = coll->add(doc.dump());

    ASSERT_TRUE(add_op.ok());

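    // an empty source array yields no text to embed, so the embedding is stored as null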
ASSERT_TRUE(add_op.get()["embedding"].is_null());
|
|
|
|
// try updating
|
|
auto id = add_op.get()["id"];
|
|
doc["names"].push_back("butter");
|
|
std::string dirty_values;
|
|
|
|
|
|
auto update_op = coll->update_matching_filter("id:=" + id.get<std::string>(), doc.dump(), dirty_values);
|
|
ASSERT_TRUE(update_op.ok());
|
|
ASSERT_EQ(1, update_op.get()["num_updated"]);
|
|
|
|
|
|
auto get_op = coll->get(id);
|
|
ASSERT_TRUE(get_op.ok());
|
|
|
|
ASSERT_FALSE(get_op.get()["embedding"].is_null());
|
|
|
|
ASSERT_EQ(384, get_op.get()["embedding"].size());
|
|
}
|
|
|
|
|
|
TEST_F(CollectionTest, CatchPartialResponseFromRemoteEmbedding) {
    std::string partial_json = R"({
        "results": [
            {
                "embedding": [
                    0.0,
                    0.0,
                    0.0
                ],
                "text": "butter"
            },
            {
                "embedding": [
                    0.0,
                    0.0,
                    0.0
                ],
                "text": "butterball"
            },
            {
                "embedding": [
                    0.0,
                    0.0)";

    nlohmann::json req_body = R"({
        "inputs": [
            "butter",
            "butterball",
            "butterfly"
        ]
    })"_json;

    OpenAIEmbedder embedder("", "", 0, false, "");

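    // a response truncated mid-body should be reported as a malformed response
    // (with the original request echoed back) rather than crashing the parser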
    auto res = embedder.get_error_json(req_body, 200, partial_json);

    ASSERT_EQ(res["response"]["error"], "Malformed response from OpenAI API.");
    ASSERT_EQ(res["request"]["body"], req_body);
}