diff --git a/include/collection.h b/include/collection.h index effe0374..9bb556a4 100644 --- a/include/collection.h +++ b/include/collection.h @@ -493,7 +493,7 @@ public: void parse_search_query(const std::string &query, std::vector& q_include_tokens, std::vector& q_exclude_tokens, - const std::string& locale) const; + const std::string& locale, const bool already_segmented) const; // PUBLIC OPERATIONS @@ -531,7 +531,8 @@ public: const std::string& highlight_end_tag="", std::vector query_by_weights={}, size_t limit_hits=UINT32_MAX, - bool prioritize_exact_match=true) const; + bool prioritize_exact_match=true, + bool pre_segmented_query=false) const; Option get_filter_ids(const std::string & simple_filter_query, std::vector>& index_ids); diff --git a/src/collection.cpp b/src/collection.cpp index a9760d92..5172730f 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -504,7 +504,8 @@ Option Collection::search(const std::string & query, const std:: const std::string& highlight_end_tag, std::vector query_by_weights, size_t limit_hits, - bool prioritize_exact_match) const { + bool prioritize_exact_match, + bool pre_segmented_query) const { std::shared_lock lock(mutex); @@ -870,7 +871,8 @@ Option Collection::search(const std::string & query, const std:: if(search_fields.size() == 0) { // has to be a wildcard query field_query_tokens.emplace_back(query_tokens_t{}); - parse_search_query(query, field_query_tokens[0].q_include_tokens, field_query_tokens[0].q_exclude_tokens,""); + parse_search_query(query, field_query_tokens[0].q_include_tokens, field_query_tokens[0].q_exclude_tokens, "", + false); } else { for(size_t i = 0; i < search_fields.size(); i++) { const auto& search_field = search_fields[i]; @@ -878,7 +880,7 @@ Option Collection::search(const std::string & query, const std:: const std::string & field_locale = search_schema.at(search_field).locale; parse_search_query(query, field_query_tokens[i].q_include_tokens, field_query_tokens[i].q_exclude_tokens, - field_locale); + field_locale, pre_segmented_query); // get synonyms std::vector> q_synonyms; @@ -1278,13 +1280,18 @@ Option Collection::search(const std::string & query, const std:: void Collection::parse_search_query(const std::string &query, std::vector& q_include_tokens, std::vector& q_exclude_tokens, - const std::string& locale) const { + const std::string& locale, const bool already_segmented) const { if(query == "*") { q_exclude_tokens = {}; q_include_tokens = {query}; } else { std::vector tokens; - Tokenizer(query, true, false, locale, {'-'}).tokenize(tokens); + + if(already_segmented) { + StringUtils::split(query, tokens, " "); + } else { + Tokenizer(query, true, false, locale, {'-'}).tokenize(tokens); + } bool exclude_operator_prior = false; diff --git a/src/collection_manager.cpp b/src/collection_manager.cpp index 85722d47..05eb42b8 100644 --- a/src/collection_manager.cpp +++ b/src/collection_manager.cpp @@ -499,6 +499,7 @@ Option CollectionManager::do_search(std::map& re const char *HIGHLIGHT_END_TAG = "highlight_end_tag"; const char *PRIORITIZE_EXACT_MATCH = "prioritize_exact_match"; + const char *PRE_SEGMENTED_QUERY = "pre_segmented_query"; if(req_params.count(NUM_TYPOS) == 0) { req_params[NUM_TYPOS] = "2"; @@ -589,6 +590,10 @@ Option CollectionManager::do_search(std::map& re req_params[PRIORITIZE_EXACT_MATCH] = "true"; } + if(req_params.count(PRE_SEGMENTED_QUERY) == 0) { + req_params[PRE_SEGMENTED_QUERY] = "false"; + } + std::vector query_by_weights_str; std::vector query_by_weights; @@ -659,6 +664,7 @@ Option CollectionManager::do_search(std::map& re } bool prioritize_exact_match = (req_params[PRIORITIZE_EXACT_MATCH] == "true"); + bool pre_segmented_query = (req_params[PRE_SEGMENTED_QUERY] == "true"); std::string filter_str = req_params.count(FILTER) != 0 ? req_params[FILTER] : ""; @@ -741,8 +747,9 @@ Option CollectionManager::do_search(std::map& re req_params[HIGHLIGHT_END_TAG], query_by_weights, static_cast(std::stol(req_params[LIMIT_HITS])), - prioritize_exact_match - ); + prioritize_exact_match, + pre_segmented_query + ); uint64_t timeMillis = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - begin).count(); diff --git a/test/collection_locale_test.cpp b/test/collection_locale_test.cpp index 8e96bad4..47ff4dce 100644 --- a/test/collection_locale_test.cpp +++ b/test/collection_locale_test.cpp @@ -149,6 +149,45 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiText) { ASSERT_EQ("พกไฟ\nเสมอ", results["hits"][0]["highlights"][0]["snippet"].get()); } +TEST_F(CollectionLocaleTest, SearchThaiTextPreSegmentedQuery) { + Collection *coll1; + + std::vector fields = {field("title", field_types::STRING, false, false, true, DEFAULT_GEO_RESOLUTION, "th"), + field("artist", field_types::STRING, false), + field("points", field_types::INT32, false),}; + + coll1 = collectionManager.get_collection("coll1").get(); + if(coll1 == nullptr) { + coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get(); + } + + std::vector> records = { + {"ความเหลื่อมล้ำ", "Compound Word"}, // ความ, เหลื่อม, ล้ำ + {"การกระจายรายได้", "Doc A"}, + {"จารีย์", "Doc B"}, + }; + + for(size_t i=0; iadd(doc.dump()).ok()); + } + + auto results = coll1->search("เหลื่", + {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true, + 10, spp::sparse_hash_set(), + spp::sparse_hash_set(), 10, "", 30, 4, "", 40, {}, {}, {}, 0, + "", "", {1}, 1000, true, true).get(); + + ASSERT_EQ(1, results["found"].get()); + ASSERT_EQ("0", results["hits"][0]["document"]["id"].get()); +} + TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) { Collection* coll1;