Mirror of https://github.com/typesense/typesense.git (synced 2025-05-19 21:22:25 +08:00)
Basics of treating space as typo.
This commit is contained in: parent 237b67816d · commit f0b09e6c07
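In short, this change teaches the search path to treat a stray or missing space as a typo: the new Index::resolve_space_as_typos first checks whether the query tokens match the index verbatim; if they do not, it tries joining all tokens into one word, then joining adjacent token pairs in a sliding window, and finally splitting a single token into two indexed words. Only the first five query tokens are considered, and a candidate is kept only when all of its tokens share at least one document. Below is a minimal sketch of the join and split candidate generation; the std::unordered_set dictionary and the helpers word_exists, join_adjacent and split_token are hypothetical stand-ins for the ART lookups and posting-list checks used in the actual code.

#include <string>
#include <unordered_set>
#include <vector>

// Stand-in for the ART index lookup; the real code additionally intersects
// posting lists (common_results_exist) before accepting a candidate.
static bool word_exists(const std::unordered_set<std::string>& dict, const std::string& w) {
    return dict.count(w) != 0;
}

// Join two adjacent tokens in a sliding window and return the first candidate
// whose tokens all exist in the dictionary (mirrors the "join 2 adjacent tokens" step).
static std::vector<std::string> join_adjacent(const std::vector<std::string>& qtokens,
                                              const std::unordered_set<std::string>& dict) {
    for(size_t i = 0; i + 1 < qtokens.size(); i++) {
        std::vector<std::string> candidate(qtokens.begin(), qtokens.begin() + i);
        candidate.push_back(qtokens[i] + qtokens[i + 1]);
        candidate.insert(candidate.end(), qtokens.begin() + i + 2, qtokens.end());

        bool all_found = true;
        for(const auto& tok: candidate) {
            if(!word_exists(dict, tok)) { all_found = false; break; }
        }
        if(all_found) {
            return candidate;              // e.g. {"non", "stick"} -> {"nonstick"}
        }
    }
    return {};
}

// Split one token into two dictionary words, moving the split point in from the
// right (mirrors the splitting loop of the commit).
static std::vector<std::string> split_token(const std::string& token,
                                            const std::unordered_set<std::string>& dict) {
    for(size_t ci = 1; ci < token.size(); ci++) {
        std::string first = token.substr(0, token.size() - ci);
        std::string second = token.substr(token.size() - ci, ci);
        if(word_exists(dict, first) && word_exists(dict, second)) {
            return {first, second};        // e.g. "pressurecooker" -> {"pressure", "cooker"}
        }
    }
    return {};
}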
@@ -575,6 +575,8 @@ private:
                                 const std::string &field_name,
                                 nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);

    bool common_results_exist(std::vector<art_leaf*>& leaves);

public:
    // for limiting number of results on multiple candidates / query rewrites
    enum {TYPO_TOKENS_THRESHOLD = 1};

@@ -754,6 +756,9 @@ public:
                         const std::vector<std::string>& group_by_fields,
                         std::vector<facet_info_t>& facet_infos) const;

    void resolve_space_as_typos(std::vector<std::string>& qtokens, const std::string& field_name,
                                std::vector<std::vector<std::string>>& resolved_queries);

    size_t num_seq_ids() const;
};
@@ -916,8 +916,17 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query, const s
                                      field_locale, pre_segmented_query);

            // get synonyms
            std::vector<std::vector<std::string>> q_synonyms;
            synonym_reduction(field_query_tokens[i].q_include_tokens, field_query_tokens[i].q_synonyms);

            std::vector<std::vector<std::string>> space_resolved_queries;
            index->resolve_space_as_typos(field_query_tokens[i].q_include_tokens, search_field,
                                          space_resolved_queries);

            // only one query is resolved for now, so just use that
            if(!space_resolved_queries.empty()) {
                field_query_tokens[i].q_include_tokens = space_resolved_queries[0];
                synonym_reduction(space_resolved_queries[0], field_query_tokens[i].q_synonyms);
            }
        }
    }
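In Collection::search, the resolved tokens replace q_include_tokens, and synonym_reduction is then invoked again on the corrected tokens. A compact sketch of that order of operations follows; the _stub functions are hypothetical placeholders for the real Index and synonym machinery, only to illustrate the control flow.

#include <string>
#include <vector>

// Hypothetical stub: pretend the index resolved "non stick" into one word.
static std::vector<std::vector<std::string>> resolve_space_as_typos_stub(const std::vector<std::string>& qtokens) {
    if(qtokens.size() == 2 && qtokens[0] == "non" && qtokens[1] == "stick") {
        return {{"nonstick"}};
    }
    return {};
}

// Hypothetical stub: the real code expands configured synonyms into `synonyms`.
static void synonym_reduction_stub(const std::vector<std::string>& tokens,
                                   std::vector<std::vector<std::string>>& synonyms) {
    (void) tokens;
    (void) synonyms;
}

static void correct_then_expand(std::vector<std::string>& q_include_tokens,
                                std::vector<std::vector<std::string>>& q_synonyms) {
    synonym_reduction_stub(q_include_tokens, q_synonyms);      // synonyms of the raw query

    auto space_resolved_queries = resolve_space_as_typos_stub(q_include_tokens);

    if(!space_resolved_queries.empty()) {                      // only one resolved query is used for now
        q_include_tokens = space_resolved_queries[0];
        synonym_reduction_stub(space_resolved_queries[0], q_synonyms);  // synonyms of the corrected query
    }
}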
src/index.cpp (150 changes)
@@ -3920,6 +3920,156 @@ size_t Index::num_seq_ids() const {
    return seq_ids.getLength();
}

void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const string& field_name,
                                   std::vector<std::vector<std::string>>& resolved_queries) {

    std::shared_lock lock(mutex);

    auto tree_it = search_index.find(field_name);

    if(tree_it == search_index.end()) {
        return ;
    }

    // we will try to find a verbatim match first

    art_tree* t = tree_it->second;
    std::vector<art_leaf*> leaves;

    for(const std::string& token: qtokens) {
        art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) token.c_str(),
                                                 token.length()+1);
        if(leaf == nullptr) {
            break;
        }

        leaves.push_back(leaf);
    }

    if(leaves.size() == qtokens.size() && common_results_exist(leaves)) {
        return ;
    }

    // When we cannot find verbatim match, we can try concatting and splitting query tokens for alternatives.

    // Concatenation:

    size_t qtokens_size = std::min<size_t>(5, qtokens.size()); // only first 5 tokens will be considered

    if(qtokens.size() > 1) {
        // a) join all tokens to form a single string
        const string& all_tokens_query = StringUtils::join(qtokens, "");
        if(art_search(t, (const unsigned char*) all_tokens_query.c_str(), all_tokens_query.length()+1) != nullptr) {
            resolved_queries.push_back({all_tokens_query});
            return;
        }

        // b) join 2 adjacent tokens in a sliding window (provided they are atleast 2 tokens in size)

        for(size_t i = 0; i < qtokens_size-1 && qtokens_size > 2; i++) {
            std::vector<std::string> candidate_tokens;

            for(size_t j = 0; j < i; j++) {
                candidate_tokens.push_back(qtokens[j]);
            }

            std::string joined_tokens = qtokens[i] + qtokens[i+1];
            candidate_tokens.push_back(joined_tokens);

            for(size_t j = i+2; j < qtokens.size(); j++) {
                candidate_tokens.push_back(qtokens[j]);
            }

            leaves.clear();

            for(auto& token: candidate_tokens) {
                art_leaf* leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) token.c_str(),
                                                                   token.length() + 1));
                if(leaf == nullptr) {
                    break;
                }

                leaves.push_back(leaf);
            }

            if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves)) {
                resolved_queries.push_back(candidate_tokens);
                return;
            }
        }
    }

    // concats did not work, we will try splitting individual tokens
    for(size_t i = 0; i < qtokens_size; i++) {
        std::vector<std::string> candidate_tokens;

        for(size_t j = 0; j < i; j++) {
            candidate_tokens.push_back(qtokens[j]);
        }

        const std::string& token = qtokens[i];
        bool found_split = false;

        for(size_t ci = 1; ci < token.size(); ci++) {
            std::string first_part = token.substr(0, token.size()-ci);
            art_leaf* first_leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) first_part.c_str(),
                                                                     first_part.length() + 1));

            if(first_leaf != nullptr) {
                // check if rest of the string is also a valid token
                std::string second_part = token.substr(token.size()-ci, ci);
                art_leaf* second_leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) second_part.c_str(),
                                                                          second_part.length() + 1));

                std::vector<art_leaf*> part_leaves = {first_leaf, second_leaf};
                if(second_leaf != nullptr && common_results_exist(part_leaves)) {
                    candidate_tokens.push_back(first_part);
                    candidate_tokens.push_back(second_part);
                    found_split = true;
                    break;
                }
            }
        }

        if(!found_split) {
            continue;
        }

        for(size_t j = i+1; j < qtokens.size(); j++) {
            candidate_tokens.push_back(qtokens[j]);
        }

        leaves.clear();

        for(auto& token: candidate_tokens) {
            art_leaf* leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) token.c_str(),
                                                               token.length() + 1));
            if(leaf == nullptr) {
                break;
            }

            leaves.push_back(leaf);
        }

        if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves)) {
            resolved_queries.push_back(candidate_tokens);
            return;
        }
    }
}

bool Index::common_results_exist(std::vector<art_leaf*>& leaves) {
    std::vector<uint32_t> result_ids;
    std::vector<void*> leaf_vals;

    for(auto leaf: leaves) {
        leaf_vals.push_back(leaf->values);
    }

    posting_t::intersect(leaf_vals, result_ids);
    return !result_ids.empty();
}

/*
// https://stackoverflow.com/questions/924171/geo-fencing-point-inside-outside-polygon
// NOTE: polygon and point should have been transformed with `transform_for_180th_meridian`
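The new common_results_exist helper accepts a joined or split candidate only when every token's posting list shares at least one document id, via posting_t::intersect. A small sketch of that acceptance test follows; sorted vectors of doc ids and the common_results_exist_sketch name are assumptions standing in for the real ART leaves and posting-list types.

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

// Stand-in for posting_t::intersect over ART leaves: each posting list is a sorted
// vector of doc ids; the candidate is accepted only if all lists share a document,
// i.e. the joined/split words actually co-occur and are not a coincidental vocabulary hit.
static bool common_results_exist_sketch(const std::vector<std::vector<uint32_t>>& posting_lists) {
    if(posting_lists.empty()) {
        return false;
    }

    std::vector<uint32_t> result_ids = posting_lists[0];

    for(size_t i = 1; i < posting_lists.size(); i++) {
        std::vector<uint32_t> intersected;
        std::set_intersection(result_ids.begin(), result_ids.end(),
                              posting_lists[i].begin(), posting_lists[i].end(),
                              std::back_inserter(intersected));
        result_ids = std::move(intersected);

        if(result_ids.empty()) {
            return false;                   // some token never co-occurs with the others
        }
    }

    return true;
}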
@@ -1711,8 +1711,6 @@ TEST_F(CollectionSpecificTest, RepeatingStringArrayTokens) {
}

TEST_F(CollectionSpecificTest, HighlightOnPrefixRegression) {
    std::vector<std::string> tags;

    // when the first document containing a token already cannot fit compact posting list

    std::vector<field> fields = {field("title", field_types::STRING, false),};
@@ -1727,8 +1725,44 @@ TEST_F(CollectionSpecificTest, HighlightOnPrefixRegression) {
    auto results = coll1->search("and", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());

    LOG(INFO) << results;

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, SearchShouldJoinToken) {
    // when the first document containing a token already cannot fit compact posting list
    std::vector<field> fields = {field("title", field_types::STRING, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();

    nlohmann::json doc;
    doc["title"] = "The nonstick pressure cooker is a great invention.";

    ASSERT_TRUE(coll1->add(doc.dump()).ok());

    auto results = coll1->search("non stick", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());

    results = coll1->search("pressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());

    results = coll1->search("t h e", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());

    results = coll1->search("c o o k e r", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());

    // three word split won't work

    results = coll1->search("nonstickpressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(0, results["hits"].size());

    // only first 5 words of the query are used for concat/split

    results = coll1->search("nonstick pressure cooker is a greatinvention", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(0, results["hits"].size());

    results = coll1->search("nonstick pressure cooker is a gr eat", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(0, results["hits"].size());

    collectionManager.drop_collection("coll1");
}
@@ -561,13 +561,12 @@ TEST_F(CollectionSynonymsTest, SynonymSingleTokenExactMatch) {
    synonym_t synonym1{"syn-1", {"lulu", "lemon"}, {{"lululemon"}}};
    coll1->add_synonym(synonym1);

    auto res = coll1->search("lulu lemon", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 1).get();
    auto res = coll1->search("lulu lemon", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 0).get();

    ASSERT_EQ(2, res["hits"].size());
    ASSERT_EQ(2, res["found"].get<uint32_t>());
    ASSERT_EQ(1, res["hits"].size());
    ASSERT_EQ(1, res["found"].get<uint32_t>());

    ASSERT_STREQ("2", res["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("1", res["hits"][1]["document"]["id"].get<std::string>().c_str());

    collectionManager.drop_collection("coll1");
}