Mirror of https://github.com/typesense/typesense.git (synced 2025-05-19 13:12:22 +08:00)
Allow search query to be pre-segmented.
This commit is contained in:
parent 066dae9a16
commit 8a43fa8b88
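This change adds a pre_segmented_query search parameter (default false). When it is set, Collection::search forwards it to parse_search_query, which then splits the query on spaces instead of running it through the locale-aware Tokenizer. Clients that segment text themselves, useful for languages such as Thai where word boundaries are not written with spaces, can therefore supply token boundaries directly.

A minimal illustration of the two query forms, using the Thai compound from the new test below (its segmentation is the one documented in the test's comment):

    #include <string>

    // Raw query: the server's locale-aware Tokenizer (locale "th") infers boundaries.
    const std::string raw_query = "ความเหลื่อมล้ำ";

    // Pre-segmented query: the client has already split the compound into
    // ความ / เหลื่อม / ล้ำ, and sends pre_segmented_query=true alongside it.
    const std::string segmented_query = "ความ เหลื่อม ล้ำ";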
@@ -493,7 +493,7 @@ public:
     void parse_search_query(const std::string &query, std::vector<std::string>& q_include_tokens,
                             std::vector<std::string>& q_exclude_tokens,
-                            const std::string& locale) const;
+                            const std::string& locale, const bool already_segmented) const;

     // PUBLIC OPERATIONS

@@ -531,7 +531,8 @@ public:
                                   const std::string& highlight_end_tag="</mark>",
                                   std::vector<size_t> query_by_weights={},
                                   size_t limit_hits=UINT32_MAX,
-                                  bool prioritize_exact_match=true) const;
+                                  bool prioritize_exact_match=true,
+                                  bool pre_segmented_query=false) const;

     Option<bool> get_filter_ids(const std::string & simple_filter_query,
                                 std::vector<std::pair<size_t, uint32_t*>>& index_ids);
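Both declarations gain the flag in a backward-compatible way: pre_segmented_query defaults to false in search, while parse_search_query's already_segmented has no default and is passed explicitly at each call site below.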
@@ -504,7 +504,8 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
                                           const std::string& highlight_end_tag,
                                           std::vector<size_t> query_by_weights,
                                           size_t limit_hits,
-                                          bool prioritize_exact_match) const {
+                                          bool prioritize_exact_match,
+                                          bool pre_segmented_query) const {

     std::shared_lock lock(mutex);

@@ -870,7 +871,8 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
     if(search_fields.size() == 0) {
         // has to be a wildcard query
         field_query_tokens.emplace_back(query_tokens_t{});
-        parse_search_query(query, field_query_tokens[0].q_include_tokens, field_query_tokens[0].q_exclude_tokens, "");
+        parse_search_query(query, field_query_tokens[0].q_include_tokens, field_query_tokens[0].q_exclude_tokens, "",
+                           false);
     } else {
         for(size_t i = 0; i < search_fields.size(); i++) {
             const auto& search_field = search_fields[i];

@@ -878,7 +880,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::

             const std::string & field_locale = search_schema.at(search_field).locale;
             parse_search_query(query, field_query_tokens[i].q_include_tokens, field_query_tokens[i].q_exclude_tokens,
-                               field_locale);
+                               field_locale, pre_segmented_query);

             // get synonyms
             std::vector<std::vector<std::string>> q_synonyms;

@@ -1278,13 +1280,18 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::

 void Collection::parse_search_query(const std::string &query, std::vector<std::string>& q_include_tokens,
                                     std::vector<std::string>& q_exclude_tokens,
-                                    const std::string& locale) const {
+                                    const std::string& locale, const bool already_segmented) const {
     if(query == "*") {
         q_exclude_tokens = {};
         q_include_tokens = {query};
     } else {
         std::vector<std::string> tokens;
-        Tokenizer(query, true, false, locale, {'-'}).tokenize(tokens);
+
+        if(already_segmented) {
+            StringUtils::split(query, tokens, " ");
+        } else {
+            Tokenizer(query, true, false, locale, {'-'}).tokenize(tokens);
+        }

         bool exclude_operator_prior = false;
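The heart of the change is the branch at the end of the hunk above: when the caller marks the query as already segmented, token boundaries are taken verbatim from the spaces in the query, and only otherwise does the locale-aware Tokenizer infer them. A self-contained sketch of that decision follows; split_on_space and locale_tokenize are illustrative stand-ins for StringUtils::split and Tokenizer, not Typesense APIs:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Stand-in for StringUtils::split(query, tokens, " "): plain whitespace split.
    static std::vector<std::string> split_on_space(const std::string& query) {
        std::istringstream iss(query);
        std::vector<std::string> tokens;
        for(std::string tok; iss >> tok; ) {
            tokens.push_back(tok);
        }
        return tokens;
    }

    // Stand-in for Tokenizer(query, true, false, locale, {'-'}).tokenize(tokens);
    // the real tokenizer infers word boundaries in unsegmented text for the locale.
    static std::vector<std::string> locale_tokenize(const std::string& query,
                                                    const std::string& locale) {
        (void) locale;                 // drives segmentation in the real code
        return split_on_space(query);  // placeholder behaviour for this sketch
    }

    // Mirrors the branch added to Collection::parse_search_query above.
    static std::vector<std::string> tokenize_query(const std::string& query,
                                                   const std::string& locale,
                                                   bool already_segmented) {
        return already_segmented ? split_on_space(query)
                                 : locale_tokenize(query, locale);
    }

    int main() {
        for(const auto& tok : tokenize_query("ความ เหลื่อม ล้ำ", "th", true)) {
            std::cout << tok << '\n';  // prints the three client-supplied tokens
        }
        return 0;
    }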
@@ -499,6 +499,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
     const char *HIGHLIGHT_END_TAG = "highlight_end_tag";

     const char *PRIORITIZE_EXACT_MATCH = "prioritize_exact_match";
+    const char *PRE_SEGMENTED_QUERY = "pre_segmented_query";

     if(req_params.count(NUM_TYPOS) == 0) {
         req_params[NUM_TYPOS] = "2";

@@ -589,6 +590,10 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
         req_params[PRIORITIZE_EXACT_MATCH] = "true";
     }

+    if(req_params.count(PRE_SEGMENTED_QUERY) == 0) {
+        req_params[PRE_SEGMENTED_QUERY] = "false";
+    }
+
     std::vector<std::string> query_by_weights_str;
     std::vector<size_t> query_by_weights;

@@ -659,6 +664,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
     }

     bool prioritize_exact_match = (req_params[PRIORITIZE_EXACT_MATCH] == "true");
+    bool pre_segmented_query = (req_params[PRE_SEGMENTED_QUERY] == "true");

     std::string filter_str = req_params.count(FILTER) != 0 ? req_params[FILTER] : "";

@@ -741,8 +747,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
                                           req_params[HIGHLIGHT_END_TAG],
                                           query_by_weights,
                                           static_cast<size_t>(std::stol(req_params[LIMIT_HITS])),
-                                          prioritize_exact_match
-                                          );
+                                          prioritize_exact_match,
+                                          pre_segmented_query
+                                          );

     uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
             std::chrono::high_resolution_clock::now() - begin).count();
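The plumbing here follows the same pattern as the other boolean request parameters: default the raw value to "false" when absent, then convert it with a string comparison before passing it into Collection::search. A condensed sketch of just that path (read_pre_segmented_flag is a hypothetical helper, not part of the codebase):

    #include <map>
    #include <string>

    // Hypothetical helper: defaults and parses the flag exactly as the
    // do_search changes above do.
    static bool read_pre_segmented_flag(std::map<std::string, std::string>& req_params) {
        const char* PRE_SEGMENTED_QUERY = "pre_segmented_query";
        if(req_params.count(PRE_SEGMENTED_QUERY) == 0) {
            req_params[PRE_SEGMENTED_QUERY] = "false";  // absent -> off
        }
        return req_params[PRE_SEGMENTED_QUERY] == "true";
    }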
@@ -149,6 +149,45 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiText) {
     ASSERT_EQ("<mark>พกไฟ</mark>\nเสมอ", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
 }

+TEST_F(CollectionLocaleTest, SearchThaiTextPreSegmentedQuery) {
+    Collection *coll1;
+
+    std::vector<field> fields = {field("title", field_types::STRING, false, false, true, DEFAULT_GEO_RESOLUTION, "th"),
+                                 field("artist", field_types::STRING, false),
+                                 field("points", field_types::INT32, false),};
+
+    coll1 = collectionManager.get_collection("coll1").get();
+    if(coll1 == nullptr) {
+        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
+    }
+
+    std::vector<std::vector<std::string>> records = {
+        {"ความเหลื่อมล้ำ", "Compound Word"},  // ความ, เหลื่อม, ล้ำ
+        {"การกระจายรายได้", "Doc A"},
+        {"จารีย์", "Doc B"},
+    };
+
+    for(size_t i=0; i<records.size(); i++) {
+        nlohmann::json doc;
+
+        doc["id"] = std::to_string(i);
+        doc["title"] = records[i][0];
+        doc["artist"] = records[i][1];
+        doc["points"] = i;
+
+        ASSERT_TRUE(coll1->add(doc.dump()).ok());
+    }
+
+    auto results = coll1->search("เหลื่",
+                                 {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true,
+                                 10, spp::sparse_hash_set<std::string>(),
+                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
+                                 "<mark>", "</mark>", {1}, 1000, true, true).get();
+
+    ASSERT_EQ(1, results["found"].get<size_t>());
+    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
+}
+
 TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
     Collection* coll1;
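In the new test, the trailing pair of boolean arguments to coll1->search(...) are prioritize_exact_match and the new pre_segmented_query, both true, so the query เหลื่ is taken as a client-supplied segment rather than re-tokenized; it matches document 0 (ความเหลื่อมล้ำ), presumably by prefix matching, since เหลื่ is a prefix of the indexed segment เหลื่อม.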