From 70738115466d9b49b929e6be12d4f62faa4afa82 Mon Sep 17 00:00:00 2001
From: ozanarmagan
Date: Mon, 9 Oct 2023 23:20:35 +0300
Subject: [PATCH 1/4] Add vector query param to set hybrid search alpha

---
 include/vector_query_ops.h             |  1 +
 src/index.cpp                          |  4 +-
 src/vector_query_ops.cpp               |  9 +++
 test/collection_vector_search_test.cpp | 78 +++++++++++++++++++++++++-
 4 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/include/vector_query_ops.h b/include/vector_query_ops.h
index 29bb36f6..b161bd3e 100644
--- a/include/vector_query_ops.h
+++ b/include/vector_query_ops.h
@@ -15,6 +15,7 @@ struct vector_query_t {
     uint32_t seq_id = 0;
     bool query_doc_given = false;
+    float alpha = 0.3;
 
     void _reset() {
         // used for testing only
diff --git a/src/index.cpp b/src/index.cpp
index 5059fd8d..8dc5ebb8 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -3178,8 +3178,8 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
 
         if(has_text_match) {
             // For hybrid search, we need to give weight to text match and vector search
-            constexpr float TEXT_MATCH_WEIGHT = 0.7;
-            constexpr float VECTOR_SEARCH_WEIGHT = 1.0 - TEXT_MATCH_WEIGHT;
+            const float VECTOR_SEARCH_WEIGHT = vector_query.alpha;
+            const float TEXT_MATCH_WEIGHT = 1.0 - VECTOR_SEARCH_WEIGHT;
 
             VectorFilterFunctor filterFunctor(filter_result.docs, filter_result.count);
             auto& field_vector_index = vector_index.at(vector_query.field_name);
diff --git a/src/vector_query_ops.cpp b/src/vector_query_ops.cpp
index 67443f2b..dba9d27d 100644
--- a/src/vector_query_ops.cpp
+++ b/src/vector_query_ops.cpp
@@ -156,6 +156,15 @@ Option<bool> VectorQueryOps::parse_vector_query_str(const std::string& vector_qu
 
                 vector_query.distance_threshold = std::stof(param_kv[1]);
             }
+
+            if(param_kv[0] == "alpha") {
+                if(!StringUtils::is_float(param_kv[1]) || std::stof(param_kv[1]) < 0.0 || std::stof(param_kv[1]) > 1.0) {
+                    return Option<bool>(400, "Malformed vector query string: "
+                                             "`alpha` parameter must be a float between 0.0-1.0.");
+                }
+
+                vector_query.alpha = std::stof(param_kv[1]);
+            }
         }
 
         return Option<bool>(true);
diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp
index 55049641..cdb682f8 100644
--- a/test/collection_vector_search_test.cpp
+++ b/test/collection_vector_search_test.cpp
@@ -2497,4 +2497,80 @@ TEST_F(CollectionVectorTest, TestUnloadModelsCollectionHaveTwoEmbeddingField) {
 
     text_embedders = TextEmbedderManager::get_instance()._get_text_embedders();
     ASSERT_EQ(0, text_embedders.size());
-}
\ No newline at end of file
+}
+
+TEST_F(CollectionVectorTest, TestHybridSearchAlphaParam) {
+    nlohmann::json schema = R"({
+        "name": "test",
+        "fields": [
+            {
+                "name": "name",
+                "type": "string"
+            },
+            {
+                "name": "embedding",
+                "type": "float[]",
+                "embed": {
+                    "from": [
+                        "name"
+                    ],
+                    "model_config": {
+                        "model_name": "ts/e5-small"
+                    }
+                }
+            }
+        ]
+    })"_json;
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    auto collection_create_op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(collection_create_op.ok());
+
+    auto coll = collection_create_op.get();
+
+    auto add_op = coll->add(R"({
+        "name": "soccer"
+    })"_json.dump());
+    ASSERT_TRUE(add_op.ok());
+
+    add_op = coll->add(R"({
+        "name": "basketball"
+    })"_json.dump());
+    ASSERT_TRUE(add_op.ok());
+
+    add_op = coll->add(R"({
+        "name": "volleyball"
+    })"_json.dump());
+    ASSERT_TRUE(add_op.ok());
+
+
+    // do hybrid search
+    auto hybrid_results = coll->search("sports", {"name", "embedding"},
+                                       "", {}, {}, {2}, 10,
+                                       1, FREQUENCY, {true},
+                                       0,
+                                       spp::sparse_hash_set<std::string>()).get();
+
+    ASSERT_EQ(3, hybrid_results["hits"].size());
+
+    // check scores
+    ASSERT_FLOAT_EQ(0.3, hybrid_results["hits"][0]["hybrid_search_info"]["rank_fusion_score"].get<float>());
+    ASSERT_FLOAT_EQ(0.15, hybrid_results["hits"][1]["hybrid_search_info"]["rank_fusion_score"].get<float>());
+    ASSERT_FLOAT_EQ(0.10, hybrid_results["hits"][2]["hybrid_search_info"]["rank_fusion_score"].get<float>());
+
+    // do hybrid search with alpha = 0.5
+    hybrid_results = coll->search("sports", {"name", "embedding"}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                                  spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                  "", 10, {}, {}, {}, 0,
+                                  "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
+                                  fallback,
+                                  4, {off}, 32767, 32767, 2,
+                                  false, true, "embedding:([], alpha:0.5)").get();
+    ASSERT_EQ(3, hybrid_results["hits"].size());
+
+    // check scores
+    ASSERT_FLOAT_EQ(0.5, hybrid_results["hits"][0]["hybrid_search_info"]["rank_fusion_score"].get<float>());
+    ASSERT_FLOAT_EQ(0.25, hybrid_results["hits"][1]["hybrid_search_info"]["rank_fusion_score"].get<float>());
+    ASSERT_FLOAT_EQ(0.16666667, hybrid_results["hits"][2]["hybrid_search_info"]["rank_fusion_score"].get<float>());
+}
\ No newline at end of file
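Note — illustration, not part of the patch: the effect of `alpha` is easiest to see in isolation. The sketch below assumes the reciprocal-rank style fusion implied by the scores asserted in the test above (each result list contributes weight / 1-based rank); the function name rank_fusion_score is hypothetical.

    #include <cstddef>

    // Illustrative only: how alpha splits the fusion weight between the
    // keyword rank and the vector rank (both 1-based).
    float rank_fusion_score(size_t text_rank, size_t vector_rank, float alpha) {
        const float text_match_weight = 1.0f - alpha;   // previously hard-coded to 0.7
        const float vector_search_weight = alpha;       // previously hard-coded to 0.3
        return text_match_weight / text_rank + vector_search_weight / vector_rank;
    }

With the default alpha = 0.3 and no keyword match for "sports", the three vector hits score 0.3/1, 0.3/2 and 0.3/3 — exactly the 0.3, 0.15 and 0.10 asserted above; with alpha:0.5 they become 0.5, 0.25 and 0.16666667.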
From 998b071956a412887035f76b75a0700149615b88 Mon Sep 17 00:00:00 2001
From: ozanarmagan
Date: Tue, 10 Oct 2023 11:20:15 +0300
Subject: [PATCH 2/4] Add test for invalid alpha params

---
 test/collection_vector_search_test.cpp | 77 +++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/test/collection_vector_search_test.cpp b/test/collection_vector_search_test.cpp
index cdb682f8..b22686d1 100644
--- a/test/collection_vector_search_test.cpp
+++ b/test/collection_vector_search_test.cpp
@@ -2573,4 +2573,79 @@ TEST_F(CollectionVectorTest, TestHybridSearchAlphaParam) {
     ASSERT_FLOAT_EQ(0.5, hybrid_results["hits"][0]["hybrid_search_info"]["rank_fusion_score"].get<float>());
     ASSERT_FLOAT_EQ(0.25, hybrid_results["hits"][1]["hybrid_search_info"]["rank_fusion_score"].get<float>());
     ASSERT_FLOAT_EQ(0.16666667, hybrid_results["hits"][2]["hybrid_search_info"]["rank_fusion_score"].get<float>());
-}
\ No newline at end of file
+}
+
+TEST_F(CollectionVectorTest, TestHybridSearchInvalidAlpha) {
+    nlohmann::json schema = R"({
+        "name": "test",
+        "fields": [
+            {
+                "name": "name",
+                "type": "string"
+            },
+            {
+                "name": "embedding",
+                "type": "float[]",
+                "embed": {
+                    "from": [
+                        "name"
+                    ],
+                    "model_config": {
+                        "model_name": "ts/e5-small"
+                    }
+                }
+            }
+        ]
+    })"_json;
+
+    TextEmbedderManager::set_model_dir("/tmp/typesense_test/models");
+
+    auto collection_create_op = collectionManager.create_collection(schema);
+    ASSERT_TRUE(collection_create_op.ok());
+
+    auto coll = collection_create_op.get();
+
+
+    // do hybrid search with alpha = 1.5
+    auto hybrid_results = coll->search("sports", {"name", "embedding"}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                                       spp::sparse_hash_set<std::string>(),
+                                       spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                       "", 10, {}, {}, {}, 0,
+                                       "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
+                                       fallback,
+                                       4, {off}, 32767, 32767, 2,
+                                       false, true, "embedding:([], alpha:1.5)");
+
+    ASSERT_FALSE(hybrid_results.ok());
+    ASSERT_EQ("Malformed vector query string: "
+              "`alpha` parameter must be a float between 0.0-1.0.", hybrid_results.error());
+
+    // do hybrid search with alpha = -0.5
+    hybrid_results = coll->search("sports", {"name", "embedding"}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                                  spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                  "", 10, {}, {}, {}, 0,
+                                  "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
+                                  fallback,
+                                  4, {off}, 32767, 32767, 2,
+                                  false, true, "embedding:([], alpha:-0.5)");
+
+    ASSERT_FALSE(hybrid_results.ok());
+    ASSERT_EQ("Malformed vector query string: "
+              "`alpha` parameter must be a float between 0.0-1.0.", hybrid_results.error());
+
+    // do hybrid search with alpha as string
+    hybrid_results = coll->search("sports", {"name", "embedding"}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
+                                  spp::sparse_hash_set<std::string>(),
+                                  spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
+                                  "", 10, {}, {}, {}, 0,
+                                  "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
+                                  fallback,
+                                  4, {off}, 32767, 32767, 2,
+                                  false, true, "embedding:([], alpha:\"0.5\")");
+
+    ASSERT_FALSE(hybrid_results.ok());
+    ASSERT_EQ("Malformed vector query string: "
+              "`alpha` parameter must be a float between 0.0-1.0.", hybrid_results.error());
+
+}
\ No newline at end of file
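Note — illustration, not part of the patch: the three failures above all hit the same guard added to parse_vector_query_str() in patch 1. A standalone restatement of that rule (a sketch; the real guard uses the internal StringUtils::is_float helper plus std::stof, while this version uses std::strtof, and the function name is_valid_alpha is hypothetical):

    #include <cstdlib>
    #include <string>

    // alpha must parse fully as a float (so the quoted alpha:"0.5" fails)
    // and lie within [0.0, 1.0] (so 1.5 and -0.5 fail).
    bool is_valid_alpha(const std::string& value) {
        char* end = nullptr;
        const float alpha = std::strtof(value.c_str(), &end);
        if(end == value.c_str() || *end != '\0') {
            return false;   // not a bare float literal
        }
        return alpha >= 0.0f && alpha <= 1.0f;
    }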
From 40a684619b29a6170f696e923812248f5078661b Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Tue, 10 Oct 2023 19:18:33 +0530
Subject: [PATCH 3/4] Add both sides drop tokens mode.

---
 include/collection.h                   |  4 ++-
 include/index.h                        | 18 +++++++++--
 src/collection.cpp                     | 40 ++++++++++++++++++++++--
 src/collection_manager.cpp             |  8 +----
 src/index.cpp                          | 18 +++++++++--
 test/collection_specific_more_test.cpp | 42 ++++++++++++++++++++++++--
 6 files changed, 112 insertions(+), 18 deletions(-)

diff --git a/include/collection.h b/include/collection.h
index 35cefee1..1a4780f2 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -201,6 +201,8 @@ private:
     static Option<bool> parse_pinned_hits(const std::string& pinned_hits_str,
                                           std::map<size_t, std::vector<std::string>>& pinned_hits);
 
+    static Option<drop_tokens_param_t> parse_drop_tokens_mode(const std::string& drop_tokens_mode);
+
     Index* init_index();
 
     static std::vector<char*> to_char_array(const std::vector<std::string>& strs);
@@ -474,7 +476,7 @@ public:
                                   const size_t remote_embedding_num_tries = 2,
                                   const bool prioritize_num_matching_fields = true,
                                   const bool group_missing_values = true,
-                                  const drop_tokens_mode_t drop_tokens_mode = right_to_left) const;
+                                  const std::string& drop_tokens_mode = "right_to_left") const;
 
     Option<bool> get_filter_ids(const std::string & filter_query, filter_result_t& filter_result) const;
 
diff --git a/include/index.h b/include/index.h
index b3487b39..ad38e00e 100644
--- a/include/index.h
+++ b/include/index.h
@@ -99,6 +99,18 @@ enum text_match_type_t {
 enum drop_tokens_mode_t {
     left_to_right,
     right_to_left,
+    both_sides,
+};
+
+struct drop_tokens_param_t {
+    drop_tokens_mode_t mode = right_to_left;
+    size_t token_limit = 1000;
+
+    drop_tokens_param_t() {
+
+    }
+
+    drop_tokens_param_t(drop_tokens_mode_t mode, size_t token_limit) : mode(mode), token_limit(token_limit) {}
 };
 
 struct search_args {
@@ -151,7 +163,7 @@ struct search_args {
     vector_query_t& vector_query;
     size_t facet_sample_percent;
     size_t facet_sample_threshold;
-    drop_tokens_mode_t drop_tokens_mode;
+    drop_tokens_param_t drop_tokens_mode;
 
     search_args(std::vector<query_tokens_t> field_query_tokens, std::vector<search_field_t> search_fields, const text_match_type_t match_type,
@@ -168,7 +180,7 @@ struct search_args {
                 size_t min_len_1typo, size_t min_len_2typo, size_t max_candidates, const std::vector<enable_t>& infixes,
                 const size_t max_extra_prefix, const size_t max_extra_suffix, const size_t facet_query_num_typos,
                 const bool filter_curated_hits, const enable_t split_join_tokens, vector_query_t& vector_query,
-                size_t facet_sample_percent, size_t facet_sample_threshold, drop_tokens_mode_t drop_tokens_mode) :
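Note — illustration, not part of the patch: "dropping tokens" retries a query with fewer tokens when too few results are found, and the new mode controls which end is trimmed. A minimal sketch of a single drop step (the function name drop_one_token is hypothetical):

    #include <string>
    #include <vector>

    // One drop step: remove a single token from the chosen end. For the query
    // {"alpha", "beta", "gamma"}: dropping from the right yields {"alpha", "beta"},
    // from the left {"beta", "gamma"}; both_sides tries each end in turn.
    std::vector<std::string> drop_one_token(const std::vector<std::string>& tokens,
                                            bool from_right) {
        if(tokens.size() < 2) {
            return tokens;  // nothing sensible left to drop
        }
        if(from_right) {
            return {tokens.begin(), tokens.end() - 1};
        }
        return {tokens.begin() + 1, tokens.end()};
    }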
+                size_t facet_sample_percent, size_t facet_sample_threshold, drop_tokens_param_t drop_tokens_mode) :
                 field_query_tokens(field_query_tokens), search_fields(search_fields), match_type(match_type),
                 filter_tree_root(filter_tree_root), facets(facets), included_ids(included_ids),
                 excluded_ids(excluded_ids), sort_fields_std(sort_fields_std),
@@ -687,7 +699,7 @@ public:
                          const bool filter_curated_hits, enable_t split_join_tokens,
                          const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold,
                          const std::string& collection_name,
-                         const drop_tokens_mode_t drop_tokens_mode = right_to_left) const;
+                         const drop_tokens_param_t drop_tokens_mode) const;
 
     void remove_field(uint32_t seq_id, const nlohmann::json& document, const std::string& field_name,
                       const bool is_update);
diff --git a/src/collection.cpp b/src/collection.cpp
index eb9dee36..27275a4b 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -1117,7 +1117,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
                                           const size_t remote_embedding_num_tries,
                                           const bool prioritize_num_matching_fields,
                                           const bool group_missing_values,
-                                          const drop_tokens_mode_t drop_tokens_mode) const {
+                                          const std::string& drop_tokens_mode) const {
 
     std::shared_lock lock(mutex);
 
@@ -1475,6 +1475,13 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
         }
     }
 
+    Option<drop_tokens_param_t> drop_tokens_param_op = parse_drop_tokens_mode(drop_tokens_mode);
+    if(!drop_tokens_param_op.ok()) {
+        return Option<nlohmann::json>(drop_tokens_param_op.code(), drop_tokens_param_op.error());
+    }
+
+    auto drop_tokens_param = drop_tokens_param_op.get();
+
     std::vector<std::vector<KV*>> raw_result_kvs;
     std::vector<std::vector<KV*>> override_result_kvs;
 
@@ -1632,7 +1639,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
                                              min_len_1typo, min_len_2typo, max_candidates, infixes,
                                              max_extra_prefix, max_extra_suffix, facet_query_num_typos,
                                              filter_curated_hits, split_join_tokens, vector_query,
-                                             facet_sample_percent, facet_sample_threshold, drop_tokens_mode);
+                                             facet_sample_percent, facet_sample_threshold, drop_tokens_param);
 
     std::unique_ptr<search_args> search_params_guard(search_params);
 
@@ -3687,6 +3694,35 @@ Option<bool> Collection::parse_pinned_hits(const std::string& pinned_hits_str,
     return Option<bool>(true);
 }
 
+Option<drop_tokens_param_t> Collection::parse_drop_tokens_mode(const std::string& drop_tokens_mode) {
+    drop_tokens_mode_t drop_tokens_mode_val = left_to_right;
+    size_t drop_tokens_token_limit = 1000;
+    auto drop_tokens_mode_op = magic_enum::enum_cast<drop_tokens_mode_t>(drop_tokens_mode);
+    if(drop_tokens_mode_op.has_value()) {
+        drop_tokens_mode_val = drop_tokens_mode_op.value();
+    } else {
+        std::vector<std::string> drop_token_parts;
+        StringUtils::split(drop_tokens_mode, drop_token_parts, ":");
+        if(drop_token_parts.size() == 2) {
+            if(!StringUtils::is_uint32_t(drop_token_parts[1])) {
+                return Option<drop_tokens_param_t>(400, "Invalid format for drop tokens mode.");
+            }
+
+            drop_tokens_mode_op = magic_enum::enum_cast<drop_tokens_mode_t>(drop_token_parts[0]);
+            if(drop_tokens_mode_op.has_value()) {
+                drop_tokens_mode_val = drop_tokens_mode_op.value();
+            }
+
+            drop_tokens_token_limit = std::stoul(drop_token_parts[1]);
+
+        } else {
+            return Option<drop_tokens_param_t>(400, "Invalid format for drop tokens mode.");
+        }
+    }
+
+    return Option<drop_tokens_param_t>(drop_tokens_param_t(drop_tokens_mode_val, drop_tokens_token_limit));
+}
+
 Option<bool> Collection::add_synonym(const nlohmann::json& syn_json, bool write_to_store) {
     std::shared_lock lock(mutex);
     synonym_t synonym;
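Note — illustration, not part of the patch: worked examples of the string grammar parse_drop_tokens_mode() accepts, traced from the code above. One quirk worth flagging in review: an unrecognized mode paired with a valid numeric limit is not rejected; the mode silently stays at the initial left_to_right.

    // Input string      -> resulting drop_tokens_param_t{mode, token_limit}
    // "left_to_right"   -> { left_to_right, 1000 }   // bare enum name, default limit
    // "both_sides"      -> { both_sides,    1000 }
    // "both_sides:3"    -> { both_sides,    3 }      // "<mode>:<token_limit>" form
    // "all_sides"       -> 400 "Invalid format for drop tokens mode."
    // "both_sides:x"    -> 400 "Invalid format for drop tokens mode."
    // "foo:5"           -> { left_to_right, 5 }      // quirk: bad mode falls back silently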
diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp
index 5ad7d814..9019eabb 100644
--- a/src/collection_manager.cpp
+++ b/src/collection_manager.cpp
@@ -1072,12 +1072,6 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
                                                            Index::NUM_CANDIDATES_DEFAULT_MIN);
     }
 
-    auto drop_tokens_mode_op = magic_enum::enum_cast<drop_tokens_mode_t>(drop_tokens_mode_str);
-    drop_tokens_mode_t drop_tokens_mode;
-    if(drop_tokens_mode_op.has_value()) {
-        drop_tokens_mode = drop_tokens_mode_op.value();
-    }
-
     Option<nlohmann::json> result_op = collection->search(raw_query, search_fields, simple_filter_query, facet_fields,
                                                           sort_fields, num_typos, per_page,
@@ -1125,7 +1119,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
                                                           remote_embedding_num_tries,
                                                           prioritize_num_matching_fields,
                                                           group_missing_values,
-                                                          drop_tokens_mode);
+                                                          drop_tokens_mode_str);
 
     uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
             std::chrono::high_resolution_clock::now() - begin).count();
diff --git a/src/index.cpp b/src/index.cpp
index f22f4874..71f3e829 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -2759,7 +2759,7 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
                            const vector_query_t& vector_query, size_t facet_sample_percent, size_t facet_sample_threshold,
                            const std::string& collection_name,
-                           const drop_tokens_mode_t drop_tokens_mode) const {
+                           const drop_tokens_param_t drop_tokens_mode) const {
     std::shared_lock lock(mutex);
 
     uint32_t filter_ids_length = 0;
@@ -3102,10 +3102,22 @@ Option<bool> Index::search(std::vector<query_tokens_t>& field_query_tokens, cons
     for (size_t qi = 0; qi < all_queries.size(); qi++) {
         auto& orig_tokens = all_queries[qi];
         size_t num_tokens_dropped = 0;
-        auto curr_direction = drop_tokens_mode;
         size_t total_dirs_done = 0;
 
-        while(exhaustive_search || all_result_ids_len < drop_tokens_threshold) {
+        // NOTE: when dropping both sides we will ignore exhaustive search
+
+        auto curr_direction = drop_tokens_mode.mode;
+        bool drop_both_sides = false;
+
+        if(drop_tokens_mode.mode == both_sides) {
+            if(orig_tokens.size() <= drop_tokens_mode.token_limit) {
+                drop_both_sides = true;
+            } else {
+                curr_direction = right_to_left;
+            }
+        }
+
+        while(exhaustive_search || all_result_ids_len < drop_tokens_threshold || drop_both_sides) {
             // When at least two tokens from the query are available we can drop one
 
             std::vector<token_t> truncated_tokens;
             std::vector<token_t> dropped_tokens;
diff --git a/test/collection_specific_more_test.cpp b/test/collection_specific_more_test.cpp
index fc5468eb..86d0de41 100644
--- a/test/collection_specific_more_test.cpp
+++ b/test/collection_specific_more_test.cpp
@@ -2403,7 +2403,7 @@ TEST_F(CollectionSpecificMoreTest, DropTokensLeftToRightFirst) {
                           spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
                           "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
                           4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
-                          0, 30000, 2, true, true, left_to_right).get();
+                          0, 30000, 2, true, true, "left_to_right").get();
 
     ASSERT_EQ(1, res["hits"].size());
     ASSERT_EQ("1", res["hits"][0]["document"]["id"].get<std::string>());
@@ -2413,10 +2413,48 @@ TEST_F(CollectionSpecificMoreTest, DropTokensLeftToRightFirst) {
                          spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
                          "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
                          4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
-                         0, 30000, 2, true, true, right_to_left).get();
+                         0, 30000, 2, true, true, "right_to_left").get();
 
     ASSERT_EQ(1, res["hits"].size());
     ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
+
+    // search on both sides
+    res = coll1->search("alpha gamma", {"title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {false}, drop_tokens_threshold,
+                        spp::sparse_hash_set<std::string>(),
+                        spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                        "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
+                        4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
+                        0, 30000, 2, true, true, "both_sides:3").get();
+    ASSERT_EQ(2, res["hits"].size());
+
+    // but must follow token limit
+    res = coll1->search("alpha gamma", {"title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {false}, drop_tokens_threshold,
+                        spp::sparse_hash_set<std::string>(),
+                        spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                        "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
+                        4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
+                        0, 30000, 2, true, true, "both_sides:1").get();
+    ASSERT_EQ(1, res["hits"].size());
+    ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
+
+    // validation checks
+    auto res_op = coll1->search("alpha gamma", {"title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {false}, drop_tokens_threshold,
+                                spp::sparse_hash_set<std::string>(),
+                                spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                                "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
+                                4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
+                                0, 30000, 2, true, true, "all_sides");
+    ASSERT_FALSE(res_op.ok());
+    ASSERT_EQ("Invalid format for drop tokens mode.", res_op.error());
+
+    res_op = coll1->search("alpha gamma", {"title"}, "", {}, {}, {0}, 3, 1, FREQUENCY, {false}, drop_tokens_threshold,
+                           spp::sparse_hash_set<std::string>(),
+                           spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
+                           "<mark>", "</mark>", {}, 1000, true, false, true, "", false, 10000,
+                           4, 7, fallback, 4, {off}, 100, 100, 2, 2, false, "", true, 0, max_score, 100, 0,
+                           0, 30000, 2, true, true, "both_sides:x");
+    ASSERT_FALSE(res_op.ok());
+    ASSERT_EQ("Invalid format for drop tokens mode.", res_op.error());
 }
 
 TEST_F(CollectionSpecificMoreTest, DoNotHighlightFieldsForSpecialCharacterQuery) {
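Note — illustration, not part of the patch: the direction selection added to Index::search() above distills to the following helper (a sketch using the drop_tokens_param_t type introduced in index.h; the function name pick_drop_direction is hypothetical):

    // Short queries (<= token_limit) get true two-sided dropping; longer ones
    // quietly fall back to right_to_left so the drop combinations stay bounded.
    drop_tokens_mode_t pick_drop_direction(const drop_tokens_param_t& param,
                                           size_t num_query_tokens,
                                           bool& drop_both_sides) {
        drop_both_sides = false;
        if(param.mode == both_sides) {
            if(num_query_tokens <= param.token_limit) {
                drop_both_sides = true;   // e.g. "both_sides:3" with a 2-token query
                return both_sides;
            }
            return right_to_left;         // query longer than the configured limit
        }
        return param.mode;                // left_to_right / right_to_left unchanged
    }

This matches the two test expectations: "both_sides:3" on the 2-token query "alpha gamma" matches documents reachable by dropping from either side (2 hits), while "both_sides:1" exceeds the limit and degrades to right_to_left (1 hit).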
From ff0d2596cc9340296e76dfcb340fa8e0fbd66a24 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Wed, 11 Oct 2023 15:02:22 +0530
Subject: [PATCH 4/4] Handle zero width non-joiner character for Persian.

The ZWNJ must split the tokens into individual words.
---
 src/tokenizer.cpp       | 7 +++++++
 test/tokenizer_test.cpp | 5 +++++
 2 files changed, 12 insertions(+)

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 5688e27d..eb80b09f 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -87,6 +87,13 @@ void Tokenizer::init(const std::string& input) {
     }
 
     unicode_text = icu::UnicodeString::fromUTF8(text);
+
+    if(locale == "fa") {
+        icu::UnicodeString target_str;
+        target_str.setTo(0x200C);  // U+200C (ZERO WIDTH NON-JOINER)
+        unicode_text.findAndReplace(target_str, " ");
+    }
+
     bi->setText(unicode_text);
 
     start_pos = bi->first();
diff --git a/test/tokenizer_test.cpp b/test/tokenizer_test.cpp
index b141e97c..054df18b 100644
--- a/test/tokenizer_test.cpp
+++ b/test/tokenizer_test.cpp
@@ -323,6 +323,11 @@ TEST(TokenizerTest, ShouldTokenizeLocaleText) {
     tokens.clear();
     // 配管
     Tokenizer("配管", true, false, "ja").tokenize(tokens);
+
+    // Persian text containing a ZWNJ
+    tokens.clear();
+    Tokenizer("روان\u200Cشناسی", false, false, "fa").tokenize(tokens);
+    ASSERT_EQ(2, tokens.size());
 }
 
 TEST(TokenizerTest, ShouldTokenizeLocaleTextWithEnglishText) {
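Note — illustration, not part of the patch: a standalone sketch of the normalization above, using the same ICU calls as the patch (fromUTF8 / setTo / findAndReplace). U+200C becomes an ordinary space before the break iterator runs, so a ZWNJ-joined Persian compound tokenizes as two words; the function name replace_zwnj_with_space is hypothetical.

    #include <unicode/unistr.h>
    #include <string>

    // Replace every ZERO WIDTH NON-JOINER with a space so downstream word
    // segmentation treats the joined halves as separate tokens.
    std::string replace_zwnj_with_space(const std::string& utf8_text) {
        icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8(utf8_text);
        icu::UnicodeString zwnj;
        zwnj.setTo((UChar32) 0x200C);  // U+200C ZERO WIDTH NON-JOINER
        unicode_text.findAndReplace(zwnj, " ");
        std::string out;
        unicode_text.toUTF8String(out);
        return out;
    }

    // replace_zwnj_with_space("روان\u200Cشناسی") yields "روان شناسی" — two words,
    // matching the ASSERT_EQ(2, tokens.size()) in the test above.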