Merge branch 'v0.25-join' into v0.26-facets

# Conflicts:
#	include/collection.h
#	include/index.h
#	src/collection.cpp
#	src/collection_manager.cpp
#	src/index.cpp
#	test/collection_specific_more_test.cpp
#	test/collection_vector_search_test.cpp
Kishore Nallan 2023-10-13 19:00:17 +05:30
commit 91209a17dc
12 changed files with 303 additions and 32 deletions


@@ -209,6 +209,8 @@ private:
static Option<bool> parse_pinned_hits(const std::string& pinned_hits_str,
std::map<size_t, std::vector<std::string>>& pinned_hits);
static Option<drop_tokens_param_t> parse_drop_tokens_mode(const std::string& drop_tokens_mode);
Index* init_index();
static std::vector<char> to_char_array(const std::vector<std::string>& strs);
@@ -502,7 +504,7 @@ public:
const std::string& stopwords_set="",
const std::vector<std::string>& facet_return_parent = {},
const std::vector<ref_include_fields>& ref_include_fields_vec = {},
const drop_tokens_mode_t drop_tokens_mode = right_to_left,
const std::string& drop_tokens_mode = "right_to_left",
const bool prioritize_num_matching_fields = true,
const bool group_missing_values = true) const;


@@ -101,6 +101,18 @@ enum text_match_type_t {
enum drop_tokens_mode_t {
left_to_right,
right_to_left,
both_sides,
};
struct drop_tokens_param_t {
drop_tokens_mode_t mode = right_to_left;
size_t token_limit = 1000;
drop_tokens_param_t() {
}
drop_tokens_param_t(drop_tokens_mode_t mode, size_t token_limit) : mode(mode), token_limit(token_limit) {}
};
struct search_args {
@@ -153,7 +165,7 @@ struct search_args {
vector_query_t& vector_query;
size_t facet_sample_percent;
size_t facet_sample_threshold;
drop_tokens_mode_t drop_tokens_mode;
drop_tokens_param_t drop_tokens_mode;
search_args(std::vector<query_tokens_t> field_query_tokens, std::vector<search_field_t> search_fields,
const text_match_type_t match_type,
@@ -170,7 +182,7 @@ struct search_args {
size_t min_len_1typo, size_t min_len_2typo, size_t max_candidates, const std::vector<enable_t>& infixes,
const size_t max_extra_prefix, const size_t max_extra_suffix, const size_t facet_query_num_typos,
const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query,
size_t facet_sample_percent, size_t facet_sample_threshold, drop_tokens_mode_t drop_tokens_mode) :
size_t facet_sample_percent, size_t facet_sample_threshold, drop_tokens_param_t drop_tokens_mode) :
field_query_tokens(field_query_tokens),
search_fields(search_fields), match_type(match_type), filter_tree_root(filter_tree_root), facets(facets),
included_ids(included_ids), excluded_ids(excluded_ids), sort_fields_std(sort_fields_std),
@@ -672,8 +684,10 @@ public:
const size_t max_extra_suffix, const size_t facet_query_num_typos,
const bool filter_curated_hits, enable_t split_join_tokens,
const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold,
const std::string& collection_name, facet_index_type_t facet_index_type = DETECT,
const drop_tokens_mode_t drop_tokens_mode = right_to_left) const;
const std::string& collection_name,
const drop_tokens_param_t drop_tokens_mode,
facet_index_type_t facet_index_type = DETECT
) const;
void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name,
const bool is_update);


@@ -15,6 +15,7 @@ struct vector_query_t {
uint32_t seq_id = 0;
bool query_doc_given = false;
float alpha = 0.3;
void _reset() {
// used for testing only


@@ -1419,7 +1419,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
const std::string& stopwords_set,
const std::vector<std::string>& facet_return_parent,
const std::vector<ref_include_fields>& ref_include_fields_vec,
const drop_tokens_mode_t drop_tokens_mode,
const std::string& drop_tokens_mode,
const bool prioritize_num_matching_fields,
const bool group_missing_values) const {
@@ -1779,6 +1779,13 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
}
}
Option<drop_tokens_param_t> drop_tokens_param_op = parse_drop_tokens_mode(drop_tokens_mode);
if(!drop_tokens_param_op.ok()) {
return Option<nlohmann::json>(drop_tokens_param_op.code(), drop_tokens_param_op.error());
}
auto drop_tokens_param = drop_tokens_param_op.get();
std::vector<std::vector<KV*>> raw_result_kvs;
std::vector<std::vector<KV*>> override_result_kvs;
@@ -1936,7 +1943,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
min_len_1typo, min_len_2typo, max_candidates, infixes,
max_extra_prefix, max_extra_suffix, facet_query_num_typos,
filter_curated_hits, split_join_tokens, vector_query,
facet_sample_percent, facet_sample_threshold, drop_tokens_mode);
facet_sample_percent, facet_sample_threshold, drop_tokens_param);
std::unique_ptr<search_args> search_params_guard(search_params);
@@ -4071,6 +4078,35 @@ Option<bool> Collection::parse_pinned_hits(const std::string& pinned_hits_str,
return Option<bool>(true);
}
Option<drop_tokens_param_t> Collection::parse_drop_tokens_mode(const std::string& drop_tokens_mode) {
drop_tokens_mode_t drop_tokens_mode_val = left_to_right;
size_t drop_tokens_token_limit = 1000;
auto drop_tokens_mode_op = magic_enum::enum_cast<drop_tokens_mode_t>(drop_tokens_mode);
if(drop_tokens_mode_op.has_value()) {
drop_tokens_mode_val = drop_tokens_mode_op.value();
} else {
std::vector<std::string> drop_token_parts;
StringUtils::split(drop_tokens_mode, drop_token_parts, ":");
if(drop_token_parts.size() == 2) {
if(!StringUtils::is_uint32_t(drop_token_parts[1])) {
return Option<drop_tokens_param_t>(400, "Invalid format for drop tokens mode.");
}
drop_tokens_mode_op = magic_enum::enum_cast<drop_tokens_mode_t>(drop_token_parts[0]);
if(drop_tokens_mode_op.has_value()) {
drop_tokens_mode_val = drop_tokens_mode_op.value();
}
drop_tokens_token_limit = std::stoul(drop_token_parts[1]);
} else {
return Option<drop_tokens_param_t>(400, "Invalid format for drop tokens mode.");
}
}
return Option<drop_tokens_param_t>(drop_tokens_param_t(drop_tokens_mode_val, drop_tokens_token_limit));
}
Option<bool> Collection::add_synonym(const nlohmann::json& syn_json, bool write_to_store) {
std::shared_lock lock(mutex);
synonym_t synonym;
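In short, the drop_tokens_mode search parameter now arrives as a plain string and is parsed into a drop_tokens_param_t: it accepts either a bare mode name ("left_to_right", "right_to_left", "both_sides") or a "mode:limit" pair such as "both_sides:3"; the token limit defaults to 1000 when omitted, and anything else is rejected with a 400 "Invalid format for drop tokens mode." error. The snippet below is only an illustrative, self-contained restatement of that logic using the standard library; the real code relies on magic_enum and StringUtils, and is slightly more lenient in that an unrecognised mode name in the "mode:limit" form silently keeps the default mode.

    #include <cstddef>
    #include <optional>
    #include <string>

    enum drop_tokens_mode_t { left_to_right, right_to_left, both_sides };

    struct drop_tokens_param_t {
        drop_tokens_mode_t mode = right_to_left;
        size_t token_limit = 1000;
    };

    // Illustrative stand-in for Collection::parse_drop_tokens_mode().
    std::optional<drop_tokens_param_t> parse_drop_tokens_mode_sketch(const std::string& value) {
        auto to_mode = [](const std::string& s) -> std::optional<drop_tokens_mode_t> {
            if(s == "left_to_right") return left_to_right;
            if(s == "right_to_left") return right_to_left;
            if(s == "both_sides")    return both_sides;
            return std::nullopt;
        };

        drop_tokens_param_t param;
        const auto pos = value.find(':');

        if(pos == std::string::npos) {
            const auto mode = to_mode(value);
            if(!mode) return std::nullopt;     // e.g. "all_sides" -> "Invalid format for drop tokens mode."
            param.mode = *mode;
            return param;                      // token_limit stays at the default of 1000
        }

        const auto mode = to_mode(value.substr(0, pos));
        const std::string limit = value.substr(pos + 1);
        if(!mode || limit.empty() || limit.find_first_not_of("0123456789") != std::string::npos) {
            return std::nullopt;               // e.g. "both_sides:x" or an unknown mode name
        }

        param.mode = *mode;
        param.token_limit = std::stoul(limit); // e.g. "both_sides:3" -> drop from both sides for queries of up to 3 tokens
        return param;
    }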


@@ -1415,12 +1415,6 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
Index::NUM_CANDIDATES_DEFAULT_MIN);
}
auto drop_tokens_mode_op = magic_enum::enum_cast<drop_tokens_mode_t>(drop_tokens_mode_str);
drop_tokens_mode_t drop_tokens_mode;
if(drop_tokens_mode_op.has_value()) {
drop_tokens_mode = drop_tokens_mode_op.value();
}
Option<nlohmann::json> result_op = collection->search(raw_query, search_fields, filter_query, facet_fields,
sort_fields, num_typos,
per_page,
@@ -1470,7 +1464,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
stopwords_set,
facet_return_parent,
ref_include_fields_vec,
drop_tokens_mode,
drop_tokens_mode_str,
prioritize_num_matching_fields,
group_missing_values);


@@ -1858,8 +1858,9 @@ Option<bool> Index::run_search(search_args* search_params, const std::string& co
search_params->facet_sample_percent,
search_params->facet_sample_threshold,
collection_name,
facet_index_type,
search_params->drop_tokens_mode);
search_params->drop_tokens_mode,
facet_index_type
);
}
void Index::collate_included_ids(const std::vector<token_t>& q_included_tokens,
@@ -2310,8 +2311,9 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
const vector_query_t& vector_query,
size_t facet_sample_percent, size_t facet_sample_threshold,
const std::string& collection_name,
facet_index_type_t facet_index_type,
const drop_tokens_mode_t drop_tokens_mode) const {
const drop_tokens_param_t drop_tokens_mode,
facet_index_type_t facet_index_type
) const {
std::shared_lock lock(mutex);
auto filter_result_iterator = new filter_result_iterator_t(collection_name, this, filter_tree_root);
@@ -2743,10 +2745,22 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
for (size_t qi = 0; qi < all_queries.size(); qi++) {
auto& orig_tokens = all_queries[qi];
size_t num_tokens_dropped = 0;
auto curr_direction = drop_tokens_mode;
size_t total_dirs_done = 0;
while(exhaustive_search || all_result_ids_len < drop_tokens_threshold) {
// NOTE: when dropping both sides we will ignore exhaustive search
auto curr_direction = drop_tokens_mode.mode;
bool drop_both_sides = false;
if(drop_tokens_mode.mode == both_sides) {
if(orig_tokens.size() <= drop_tokens_mode.token_limit) {
drop_both_sides = true;
} else {
curr_direction = right_to_left;
}
}
while(exhaustive_search || all_result_ids_len < drop_tokens_threshold || drop_both_sides) {
// When at least two tokens from the query are available, we can drop one
std::vector<token_t> truncated_tokens;
std::vector<token_t> dropped_tokens;
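Taken together with the parsing change in collection.cpp, the practical effect of the block above (exercised by the DropTokensLeftToRightFirst test further below) is roughly this: with mode both_sides, token dropping is attempted from both ends and the drop_tokens_threshold is ignored, but only while the query has no more tokens than the configured token_limit; longer queries quietly fall back to right_to_left. A hypothetical trace for the two-token query "alpha gamma" used in that test:

    // drop_tokens_mode = "both_sides:3": 2 tokens <= limit 3
    //   -> drop_both_sides = true, both single-token variants ("alpha" and "gamma") are tried
    //   -> the test expects 2 hits
    // drop_tokens_mode = "both_sides:1": 2 tokens > limit 1
    //   -> curr_direction falls back to right_to_left, so only one truncated variant is tried
    //   -> the test expects a single hit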
@@ -2843,8 +2857,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
if(has_text_match) {
// For hybrid search, we need to give weight to text match and vector search
constexpr float TEXT_MATCH_WEIGHT = 0.7;
constexpr float VECTOR_SEARCH_WEIGHT = 1.0 - TEXT_MATCH_WEIGHT;
const float VECTOR_SEARCH_WEIGHT = vector_query.alpha;
const float TEXT_MATCH_WEIGHT = 1.0 - VECTOR_SEARCH_WEIGHT;
VectorFilterFunctor filterFunctor(filter_result_iterator);
auto& field_vector_index = vector_index.at(vector_query.field_name);


@@ -87,6 +87,13 @@ void Tokenizer::init(const std::string& input) {
}
unicode_text = icu::UnicodeString::fromUTF8(text);
if(locale == "fa") {
icu::UnicodeString target_str;
target_str.setTo(0x200C); // U+200C (ZERO WIDTH NON-JOINER)
unicode_text.findAndReplace(target_str, " ");
}
bi->setText(unicode_text);
start_pos = bi->first();
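For context, U+200C (the zero width non-joiner) joins the parts of Persian compound words without a visible space; replacing it with a regular space before the text reaches the ICU break iterator makes each part a separate token, which is what the tokenizer test further below expects (2 tokens for "روان‌شناسی"). A minimal standalone sketch of the same substitution, assuming ICU is available:

    #include <iostream>
    #include <string>
    #include <unicode/unistr.h>

    int main() {
        // "روان‌شناسی" ("psychology") contains U+200C between its two parts.
        icu::UnicodeString text = icu::UnicodeString::fromUTF8("روان\u200Cشناسی");

        icu::UnicodeString zwnj;
        zwnj.setTo((UChar32) 0x200C);      // U+200C ZERO WIDTH NON-JOINER
        text.findAndReplace(zwnj, " ");    // same substitution the tokenizer now applies for locale == "fa"

        std::string out;
        text.toUTF8String(out);
        std::cout << out << "\n";          // the word break iterator now sees two separate words
        return 0;
    }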


@@ -156,6 +156,15 @@ Option<bool> VectorQueryOps::parse_vector_query_str(const std::string& vector_qu
vector_query.distance_threshold = std::stof(param_kv[1]);
}
if(param_kv[0] == "alpha") {
if(!StringUtils::is_float(param_kv[1]) || std::stof(param_kv[1]) < 0.0 || std::stof(param_kv[1]) > 1.0) {
return Option<bool>(400, "Malformed vector query string: "
"`alpha` parameter must be a float between 0.0-1.0.");
}
vector_query.alpha = std::stof(param_kv[1]);
}
}
return Option<bool>(true);
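The alpha parameter is supplied inside the vector_query string (the tests further below use forms like "embedding:([], alpha:0.5)") and must be a plain, unquoted float in the range [0.0, 1.0]. The sketch below is an equivalent bounds check for illustration only; the real code uses StringUtils::is_float rather than std::stof for the format check.

    #include <cstddef>
    #include <iostream>
    #include <stdexcept>
    #include <string>

    // Illustrative stand-in for the alpha validation added above:
    // accept only plain floats in the closed range [0.0, 1.0].
    bool valid_alpha(const std::string& value) {
        try {
            std::size_t consumed = 0;
            const float alpha = std::stof(value, &consumed);
            return consumed == value.size() && alpha >= 0.0f && alpha <= 1.0f;
        } catch(const std::exception&) {
            return false;                              // not parseable as a float at all
        }
    }

    int main() {
        std::cout << std::boolalpha
                  << valid_alpha("0.5")     << "\n"    // true  -> e.g. "embedding:([], alpha:0.5)"
                  << valid_alpha("1.5")     << "\n"    // false -> rejected with the 400 error above
                  << valid_alpha("-0.5")    << "\n"    // false
                  << valid_alpha("\"0.5\"") << "\n";   // false -> quoted values are malformed
        return 0;
    }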


@@ -645,7 +645,7 @@ TEST_F(CollectionGroupingTest, ControlMissingValues) {
{}, {}, {"brand"}, 2,
"<mark>", "</mark>", {3,3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 0, 0, 0, 2, false, "", true, 0, max_score,
100, 0, 0, HASH, 30000, 2, "", {}, {}, right_to_left, true, false).get();
100, 0, 0, HASH, 30000, 2, "", {}, {}, "right_to_left", true, false).get();
ASSERT_EQ(3, res["grouped_hits"].size());
ASSERT_EQ("Omega", res["grouped_hits"][0]["group_key"][0].get<std::string>());
@@ -668,7 +668,7 @@ TEST_F(CollectionGroupingTest, ControlMissingValues) {
{}, {}, {"brand"}, 2,
"<mark>", "</mark>", {3,3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 0, 0, 0, 2, false, "", true, 0, max_score,
100, 0, 0, HASH, 30000, 2, "", {}, {}, right_to_left, true, true).get();
100, 0, 0, HASH, 30000, 2, "", {}, {}, "right_to_left", true, true).get();
ASSERT_EQ(2, res["grouped_hits"].size());
@@ -911,7 +911,7 @@ TEST_F(CollectionGroupingTest, SkipToReverseGroupBy) {
{}, {}, {"brand"}, 2,
"<mark>", "</mark>", {3,3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 0, 0, 0, 2, false, "", true, 0, max_score,
100, 0, 0, HASH, 30000, 2, "", {}, {}, right_to_left, true, false).get();
100, 0, 0, HASH, 30000, 2, "", {}, {}, "right_to_left", true, false).get();
ASSERT_EQ(1, res["grouped_hits"].size());
@@ -944,7 +944,7 @@ TEST_F(CollectionGroupingTest, SkipToReverseGroupBy) {
{}, {}, {"brand"}, 2,
"<mark>", "</mark>", {3,3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 0, 0, 0, 2, false, "", true, 0, max_score,
100, 0, 0, HASH, 30000, 2, "", {}, {}, right_to_left, true, false).get();
100, 0, 0, HASH, 30000, 2, "", {}, {}, "right_to_left", true, false).get();
ASSERT_EQ(5, res["grouped_hits"].size());
@@ -973,7 +973,7 @@ TEST_F(CollectionGroupingTest, SkipToReverseGroupBy) {
{}, {}, {"brand"}, 2,
"<mark>", "</mark>", {3,3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 0, 0, 0, 2, false, "", true, 0, max_score,
100, 0, 0, HASH, 30000, 2, "", {}, {}, right_to_left, true, true).get();
100, 0, 0, HASH, 30000, 2, "", {}, {}, "right_to_left", true, true).get();
ASSERT_EQ(4, res["grouped_hits"].size());


@@ -1883,7 +1883,7 @@ TEST_F(CollectionSpecificMoreTest, DisableFieldCountForScoring) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3,3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 0, 0, 0, 2, false, "", true, 0, max_score,
100, 0, 0, HASH, 30000, 2, "", {}, {}, right_to_left, true);
100, 0, 0, HASH, 30000, 2, "", {}, {}, "right_to_left", true);
auto res = coll1->search("beta", {"name", "brand"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 5,
@@ -1891,7 +1891,7 @@ TEST_F(CollectionSpecificMoreTest, DisableFieldCountForScoring) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3,3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 0, 0, 0, 2, false, "", true, 0, max_score,
100, 0, 0, HASH, 30000, 2, "", {}, {}, right_to_left, false).get();
100, 0, 0, HASH, 30000, 2, "", {}, {}, "right_to_left", false).get();
size_t score1 = std::stoul(res["hits"][0]["text_match_info"]["score"].get<std::string>());
size_t score2 = std::stoul(res["hits"][1]["text_match_info"]["score"].get<std::string>());
@@ -1902,7 +1902,7 @@ TEST_F(CollectionSpecificMoreTest, DisableFieldCountForScoring) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3,3}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7, fallback,
4, {off}, 0, 0, 0, 2, false, "", true, 0, max_score,
100, 0, 0, HASH, 30000, 2, "", {}, {}, right_to_left, true).get();
100, 0, 0, HASH, 30000, 2, "", {}, {}, "right_to_left", true).get();
ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", res["hits"][1]["document"]["id"].get<std::string>());
@@ -2413,7 +2413,7 @@ TEST_F(CollectionSpecificMoreTest, DropTokensLeftToRightFirst) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
0, HASH, 30000, 2, "", {}, {}, left_to_right).get();
0, HASH, 30000, 2, "", {}, {}, "left_to_right").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ("1", res["hits"][0]["document"]["id"].get<std::string>());
@@ -2423,10 +2423,48 @@ TEST_F(CollectionSpecificMoreTest, DropTokensLeftToRightFirst) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
0, HASH, 30000, 2, "", {}, {}, right_to_left).get();
0, HASH, 30000, 2, "", {}, {}, "right_to_left").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
// search on both sides
res = coll1->search("alpha gamma", {"title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {false}, drop_tokens_threshold,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
0, HASH, 30000, 2, "", {}, {}, "both_sides:3").get();
ASSERT_EQ(2, res["hits"].size());
// but must follow token limit
res = coll1->search("alpha gamma", {"title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {false}, drop_tokens_threshold,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
0, HASH, 30000, 2, "", {}, {}, "both_sides:1").get();
ASSERT_EQ(1, res["hits"].size());
ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
// validation checks
auto res_op = coll1->search("alpha gamma", {"title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {false}, drop_tokens_threshold,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
0, HASH, 30000, 2, "", {}, {}, "all_sides");
ASSERT_FALSE(res_op.ok());
ASSERT_EQ("Invalid format for drop tokens mode.", res_op.error());
res_op = coll1->search("alpha gamma", {"title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {false}, drop_tokens_threshold,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
0, HASH, 30000, 2, "", {}, {}, "both_sides:x");
ASSERT_FALSE(res_op.ok());
ASSERT_EQ("Invalid format for drop tokens mode.", res_op.error());
}
TEST_F(CollectionSpecificMoreTest, DoNotHighlightFieldsForSpecialCharacterQuery) {


@@ -2515,3 +2515,154 @@ TEST_F(CollectionVectorTest, TestUnloadModelsCollectionHaveTwoEmbeddingField) {
text_embedders = TextEmbedderManager::get_instance()._get_text_embedders();
ASSERT_EQ(0, text_embedders.size());
}
TEST_F(CollectionVectorTest, TestHybridSearchAlphaParam) {
nlohmann::json schema = R"({
"name": "test",
"fields": [
{
"name": "name",
"type": "string"
},
{
"name": "embedding",
"type": "float[]",
"embed": {
"from": [
"name"
],
"model_config": {
"model_name": "ts/e5-small"
}
}
}
]
})"_json;
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto collection_create_op = collectionManager.create_collection(schema);
ASSERT_TRUE(collection_create_op.ok());
auto coll = collection_create_op.get();
auto add_op = coll->add(R"({
"name": "soccer"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll->add(R"({
"name": "basketball"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
add_op = coll->add(R"({
"name": "volleyball"
})"_json.dump());
ASSERT_TRUE(add_op.ok());
// do hybrid search
auto hybrid_results = coll->search("sports", {"name", "embedding"},
"", {}, {}, {2}, 10,
1, FREQUENCY, {true},
0, spp::sparse_hash_set<std::string>()).get();
ASSERT_EQ(3, hybrid_results["hits"].size());
// check scores
ASSERT_FLOAT_EQ(0.3, hybrid_results["hits"][0]["hybrid_search_info"]["rank_fusion_score"].get<float>());
ASSERT_FLOAT_EQ(0.15, hybrid_results["hits"][1]["hybrid_search_info"]["rank_fusion_score"].get<float>());
ASSERT_FLOAT_EQ(0.10, hybrid_results["hits"][2]["hybrid_search_info"]["rank_fusion_score"].get<float>());
// do hybrid search with alpha = 0.5
hybrid_results = coll->search("sports", {"name", "embedding"}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "embedding:([], alpha:0.5)").get();
ASSERT_EQ(3, hybrid_results["hits"].size());
// check scores
ASSERT_FLOAT_EQ(0.5, hybrid_results["hits"][0]["hybrid_search_info"]["rank_fusion_score"].get<float>());
ASSERT_FLOAT_EQ(0.25, hybrid_results["hits"][1]["hybrid_search_info"]["rank_fusion_score"].get<float>());
ASSERT_FLOAT_EQ(0.16666667, hybrid_results["hits"][2]["hybrid_search_info"]["rank_fusion_score"].get<float>());
}
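The asserted scores here look consistent with a weighted reciprocal-rank fusion in which, because the keyword query "sports" matches none of the document text, only the vector ranking contributes: rank_fusion_score is roughly alpha * 1 / rank with ranks starting at 1. That reading is inferred from the assertions rather than stated anywhere in the diff; a quick arithmetic check:

    #include <cstdio>

    int main() {
        // Hypothetical check of the asserted scores: vector-only contribution of alpha * 1/rank.
        const float alphas[] = {0.3f, 0.5f};   // default alpha vs. the explicit alpha:0.5 query
        for(float alpha : alphas) {
            for(int rank = 1; rank <= 3; rank++) {
                std::printf("alpha=%.1f rank=%d score=%.8f\n", alpha, rank, alpha / rank);
            }
        }
        // Prints roughly 0.3, 0.15, 0.1 for alpha = 0.3 and 0.5, 0.25, 0.16666667 for alpha = 0.5,
        // matching the ASSERT_FLOAT_EQ values above.
        return 0;
    }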
TEST_F(CollectionVectorTest, TestHybridSearchInvalidAlpha) {
nlohmann::json schema = R"({
"name": "test",
"fields": [
{
"name": "name",
"type": "string"
},
{
"name": "embedding",
"type": "float[]",
"embed": {
"from": [
"name"
],
"model_config": {
"model_name": "ts/e5-small"
}
}
}
]
})"_json;
TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
auto collection_create_op = collectionManager.create_collection(schema);
ASSERT_TRUE(collection_create_op.ok());
auto coll = collection_create_op.get();
// do hybrid search with alpha = 1.5
auto hybrid_results = coll->search("sports", {"name", "embedding"}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "embedding:([], alpha:1.5)");
ASSERT_FALSE(hybrid_results.ok());
ASSERT_EQ("Malformed vector query string: "
"`alpha` parameter must be a float between 0.0-1.0.", hybrid_results.error());
// do hybrid search with alpha = -0.5
hybrid_results = coll->search("sports", {"name", "embedding"}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "embedding:([], alpha:-0.5)");
ASSERT_FALSE(hybrid_results.ok());
ASSERT_EQ("Malformed vector query string: "
"`alpha` parameter must be a float between 0.0-1.0.", hybrid_results.error());
// do hybrid search with alpha as string
hybrid_results = coll->search("sports", {"name", "embedding"}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "embedding:([], alpha:\"0.5\")");
ASSERT_FALSE(hybrid_results.ok());
ASSERT_EQ("Malformed vector query string: "
"`alpha` parameter must be a float between 0.0-1.0.", hybrid_results.error());
}


@@ -323,6 +323,11 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
tokens.clear(); // 配管
Tokenizer("配管", true, false, "ja").tokenize(tokens);
// Persian text containing a ZWNJ (U+200C)
tokens.clear();
Tokenizer("روان\u200Cشناسی", false, false, "fa").tokenize(tokens);
ASSERT_EQ(2, tokens.size());
}
TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {