Allow search query to be pre-segmented.

This commit is contained in:
Kishore Nallan 2021-06-07 21:43:28 +05:30
parent 066dae9a16
commit 8a43fa8b88
4 changed files with 63 additions and 9 deletions

View File

@@ -493,7 +493,7 @@ public:
void parse_search_query(const std::string &query, std::vector<std::string>& q_include_tokens,
std::vector<std::string>& q_exclude_tokens,
const std::string& locale) const;
const std::string& locale, const bool already_segmented) const;
// PUBLIC OPERATIONS
@@ -531,7 +531,8 @@ public:
const std::string& highlight_end_tag="</mark>",
std::vector<size_t> query_by_weights={},
size_t limit_hits=UINT32_MAX,
bool prioritize_exact_match=true) const;
bool prioritize_exact_match=true,
bool pre_segmented_query=false) const;
Option<bool> get_filter_ids(const std::string & simple_filter_query,
std::vector<std::pair<size_t, uint32_t*>>& index_ids);

View File

@@ -504,7 +504,8 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
const std::string& highlight_end_tag,
std::vector<size_t> query_by_weights,
size_t limit_hits,
bool prioritize_exact_match) const {
bool prioritize_exact_match,
bool pre_segmented_query) const {
std::shared_lock lock(mutex);
@@ -870,7 +871,8 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
if(search_fields.size() == 0) {
// has to be a wildcard query
field_query_tokens.emplace_back(query_tokens_t{});
parse_search_query(query, field_query_tokens[0].q_include_tokens, field_query_tokens[0].q_exclude_tokens,"");
parse_search_query(query, field_query_tokens[0].q_include_tokens, field_query_tokens[0].q_exclude_tokens, "",
false);
} else {
for(size_t i = 0; i < search_fields.size(); i++) {
const auto& search_field = search_fields[i];
@@ -878,7 +880,7 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
const std::string & field_locale = search_schema.at(search_field).locale;
parse_search_query(query, field_query_tokens[i].q_include_tokens, field_query_tokens[i].q_exclude_tokens,
field_locale);
field_locale, pre_segmented_query);
// get synonyms
std::vector<std::vector<std::string>> q_synonyms;
@@ -1278,13 +1280,18 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::
void Collection::parse_search_query(const std::string &query, std::vector<std::string>& q_include_tokens,
std::vector<std::string>& q_exclude_tokens,
const std::string& locale) const {
const std::string& locale, const bool already_segmented) const {
if(query == "*") {
q_exclude_tokens = {};
q_include_tokens = {query};
} else {
std::vector<std::string> tokens;
Tokenizer(query, true, false, locale, {'-'}).tokenize(tokens);
if(already_segmented) {
StringUtils::split(query, tokens, " ");
} else {
Tokenizer(query, true, false, locale, {'-'}).tokenize(tokens);
}
bool exclude_operator_prior = false;

View File

@@ -499,6 +499,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
const char *HIGHLIGHT_END_TAG = "highlight_end_tag";
const char *PRIORITIZE_EXACT_MATCH = "prioritize_exact_match";
const char *PRE_SEGMENTED_QUERY = "pre_segmented_query";
if(req_params.count(NUM_TYPOS) == 0) {
req_params[NUM_TYPOS] = "2";
@@ -589,6 +590,10 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
req_params[PRIORITIZE_EXACT_MATCH] = "true";
}
if(req_params.count(PRE_SEGMENTED_QUERY) == 0) {
req_params[PRE_SEGMENTED_QUERY] = "false";
}
std::vector<std::string> query_by_weights_str;
std::vector<size_t> query_by_weights;
@@ -659,6 +664,7 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
}
bool prioritize_exact_match = (req_params[PRIORITIZE_EXACT_MATCH] == "true");
bool pre_segmented_query = (req_params[PRE_SEGMENTED_QUERY] == "true");
std::string filter_str = req_params.count(FILTER) != 0 ? req_params[FILTER] : "";
@@ -741,8 +747,9 @@ Option<bool> CollectionManager::do_search(std::map<std::string, std::string>& re
req_params[HIGHLIGHT_END_TAG],
query_by_weights,
static_cast<size_t>(std::stol(req_params[LIMIT_HITS])),
prioritize_exact_match
);
prioritize_exact_match,
pre_segmented_query
);
uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - begin).count();

View File

@@ -149,6 +149,45 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiText) {
ASSERT_EQ("<mark>พกไฟ</mark>\nเสมอ", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
}
// Verifies that a Thai-locale ("th") collection can be queried with
// pre_segmented_query=true: the prefix query "เหลื่" should match the
// compound-word document and nothing else.
TEST_F(CollectionLocaleTest, SearchThaiTextPreSegmentedQuery) {
    std::vector<field> fields = {field("title", field_types::STRING, false, false, true, DEFAULT_GEO_RESOLUTION, "th"),
                                 field("artist", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    // Reuse the collection if an earlier run left it behind; otherwise create it.
    Collection* coll1 = collectionManager.get_collection("coll1").get();
    if(coll1 == nullptr) {
        coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
    }

    // {title, artist} pairs; doc 0's title segments as ความ, เหลื่อม, ล้ำ.
    std::vector<std::vector<std::string>> records = {
        {"ความเหลื่อมล้ำ", "Compound Word"}, // ความ, เหลื่อม, ล้ำ
        {"การกระจายรายได้", "Doc A"},
        {"จารีย์", "Doc B"},
    };

    // Index each record; the sequence number doubles as id and points.
    size_t seq = 0;
    for(const auto& record : records) {
        nlohmann::json doc;
        doc["id"] = std::to_string(seq);
        doc["title"] = record[0];
        doc["artist"] = record[1];
        doc["points"] = seq;
        ASSERT_TRUE(coll1->add(doc.dump()).ok());
        seq++;
    }

    // Last two bool args: prioritize_exact_match=true, pre_segmented_query=true.
    auto res = coll1->search("เหลื่",
                             {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, true,
                             10, spp::sparse_hash_set<std::string>(),
                             spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                             "<mark>", "</mark>", {1}, 1000, true, true).get();

    // Only the compound-word document (id "0") should match.
    ASSERT_EQ(1, res["found"].get<size_t>());
    ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
}
TEST_F(CollectionLocaleTest, SearchAgainstThaiTextExactMatch) {
Collection* coll1;