diff --git a/include/index.h b/include/index.h
index 9c8d0fc4..a64ca29f 100644
--- a/include/index.h
+++ b/include/index.h
@@ -575,6 +575,8 @@ private:
                                   const std::string &field_name, nlohmann::json::iterator& array_iter,
                                   bool is_array, bool& array_ele_erased);
 
+    bool common_results_exist(std::vector<art_leaf*>& leaves);
+
 public:
     // for limiting number of results on multiple candidates / query rewrites
     enum {TYPO_TOKENS_THRESHOLD = 1};
@@ -754,6 +756,9 @@ public:
                              const std::vector<std::string>& group_by_fields,
                              std::vector<facet_info_t>& facet_infos) const;
 
+    void resolve_space_as_typos(std::vector<std::string>& qtokens, const std::string& field_name,
+                                std::vector<std::vector<std::string>>& resolved_queries);
+
     size_t num_seq_ids() const;
 };
 
diff --git a/src/collection.cpp b/src/collection.cpp
index f875bb68..65a8440b 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -916,8 +916,17 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query, const s
                                               field_locale, pre_segmented_query);
 
             // get synonyms
-            std::vector<std::vector<std::string>> q_synonyms;
             synonym_reduction(field_query_tokens[i].q_include_tokens, field_query_tokens[i].q_synonyms);
+
+            std::vector<std::vector<std::string>> space_resolved_queries;
+            index->resolve_space_as_typos(field_query_tokens[i].q_include_tokens, search_field,
+                                          space_resolved_queries);
+
+            // only one query is resolved for now, so just use that
+            if(!space_resolved_queries.empty()) {
+                field_query_tokens[i].q_include_tokens = space_resolved_queries[0];
+                synonym_reduction(space_resolved_queries[0], field_query_tokens[i].q_synonyms);
+            }
         }
     }
 
diff --git a/src/index.cpp b/src/index.cpp
index 99b72e8a..f84a222e 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -3920,6 +3920,156 @@ size_t Index::num_seq_ids() const {
     return seq_ids.getLength();
 }
 
+void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const string& field_name,
+                                   std::vector<std::vector<std::string>>& resolved_queries) {
+
+    std::shared_lock lock(mutex);
+
+    auto tree_it = search_index.find(field_name);
+
+    if(tree_it == search_index.end()) {
+        return;
+    }
+
+    // we will try to find a verbatim match first
+
+    art_tree* t = tree_it->second;
+    std::vector<art_leaf*> leaves;
+
+    for(const std::string& token: qtokens) {
+        art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) token.c_str(),
+                                                 token.length()+1);
+        if(leaf == nullptr) {
+            break;
+        }
+
+        leaves.push_back(leaf);
+    }
+
+    if(leaves.size() == qtokens.size() && common_results_exist(leaves)) {
+        return;
+    }
+
+    // when we cannot find a verbatim match, we can try concatenating and splitting query tokens for alternatives
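+    // e.g. "non stick" can be joined to match the indexed token "nonstick", and
+    // "pressurecooker" can be split to match "pressure cooker" (see the new tests below)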
+
+    // Concatenation:
+
+    size_t qtokens_size = std::min<size_t>(5, qtokens.size()); // only the first 5 tokens will be considered
+
+    if(qtokens.size() > 1) {
+        // a) join all tokens to form a single string
+        const string& all_tokens_query = StringUtils::join(qtokens, "");
+        if(art_search(t, (const unsigned char*) all_tokens_query.c_str(), all_tokens_query.length()+1) != nullptr) {
+            resolved_queries.push_back({all_tokens_query});
+            return;
+        }
+
+        // b) join 2 adjacent tokens in a sliding window (only when there are more than
+        //    2 tokens, since the 2-token case is already covered by joining all tokens above)
+
+        for(size_t i = 0; i < qtokens_size-1 && qtokens_size > 2; i++) {
+            std::vector<std::string> candidate_tokens;
+
+            for(size_t j = 0; j < i; j++) {
+                candidate_tokens.push_back(qtokens[j]);
+            }
+
+            std::string joined_tokens = qtokens[i] + qtokens[i+1];
+            candidate_tokens.push_back(joined_tokens);
+
+            for(size_t j = i+2; j < qtokens.size(); j++) {
+                candidate_tokens.push_back(qtokens[j]);
+            }
+
+            leaves.clear();
+
+            for(auto& token: candidate_tokens) {
+                art_leaf* leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) token.c_str(),
+                                                                   token.length() + 1));
+                if(leaf == nullptr) {
+                    break;
+                }
+
+                leaves.push_back(leaf);
+            }
+
+            if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves)) {
+                resolved_queries.push_back(candidate_tokens);
+                return;
+            }
+        }
+    }
+
+    // concats did not work, so we will try splitting individual tokens
+    for(size_t i = 0; i < qtokens_size; i++) {
+        std::vector<std::string> candidate_tokens;
+
+        for(size_t j = 0; j < i; j++) {
+            candidate_tokens.push_back(qtokens[j]);
+        }
+
+        const std::string& token = qtokens[i];
+        bool found_split = false;
+
+        for(size_t ci = 1; ci < token.size(); ci++) {
+            std::string first_part = token.substr(0, token.size()-ci);
+            art_leaf* first_leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) first_part.c_str(),
+                                                                     first_part.length() + 1));
+
+            if(first_leaf != nullptr) {
+                // check if the rest of the string is also a valid token
+                std::string second_part = token.substr(token.size()-ci, ci);
+                art_leaf* second_leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) second_part.c_str(),
+                                                                          second_part.length() + 1));
+
+                std::vector<art_leaf*> part_leaves = {first_leaf, second_leaf};
+                if(second_leaf != nullptr && common_results_exist(part_leaves)) {
+                    candidate_tokens.push_back(first_part);
+                    candidate_tokens.push_back(second_part);
+                    found_split = true;
+                    break;
+                }
+            }
+        }
+
+        if(!found_split) {
+            continue;
+        }
+
+        for(size_t j = i+1; j < qtokens.size(); j++) {
+            candidate_tokens.push_back(qtokens[j]);
+        }
+
+        leaves.clear();
+
+        for(auto& token: candidate_tokens) {
+            art_leaf* leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) token.c_str(),
+                                                               token.length() + 1));
+            if(leaf == nullptr) {
+                break;
+            }
+
+            leaves.push_back(leaf);
+        }
+
+        if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves)) {
+            resolved_queries.push_back(candidate_tokens);
+            return;
+        }
+    }
+}
+
+bool Index::common_results_exist(std::vector<art_leaf*>& leaves) {
+    std::vector<uint32_t> result_ids;
+    std::vector<void*> leaf_vals;
+
+    for(auto leaf: leaves) {
+        leaf_vals.push_back(leaf->values);
+    }
+
+    posting_t::intersect(leaf_vals, result_ids);
+    return !result_ids.empty();
+}
+
 /*
 // https://stackoverflow.com/questions/924171/geo-fencing-point-inside-outside-polygon
 // NOTE: polygon and point should have been transformed with `transform_for_180th_meridian`
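Taken together, resolve_space_as_typos tries the cheapest rewrite first: the whole query joined into one token, then each adjacent pair joined in a sliding window, then each token split in two, accepting the first candidate whose tokens all exist in the index and co-occur in at least one document. Below is a minimal standalone sketch of the join/split candidate generation, with a std::set<std::string> standing in for the art_tree and plain membership standing in for the posting-list check (both stand-ins are illustrative assumptions, not the actual index structures):

#include <iostream>
#include <set>
#include <string>
#include <vector>

// Stand-in for the ART tree lookup (assumption: the real code calls art_search).
static const std::set<std::string> indexed_tokens = {
    "the", "nonstick", "pressure", "cooker", "is", "a", "great", "invention"
};

static bool indexed(const std::string& t) { return indexed_tokens.count(t) != 0; }

// Mirrors the split loop: peel characters off the end of the token until
// both halves are indexed, favoring the longest possible first part.
static bool try_split(const std::string& token, std::string& first, std::string& second) {
    for(size_t ci = 1; ci < token.size(); ci++) {
        first = token.substr(0, token.size() - ci);
        second = token.substr(token.size() - ci, ci);
        if(indexed(first) && indexed(second)) {
            return true;
        }
    }
    return false;
}

int main() {
    // concatenation: "non stick" -> "nonstick"
    std::vector<std::string> q1 = {"non", "stick"};
    std::string joined = q1[0] + q1[1];
    if(indexed(joined)) {
        std::cout << "joined: " << joined << "\n";                 // joined: nonstick
    }

    // splitting: "pressurecooker" -> "pressure cooker"
    std::string first, second;
    if(try_split("pressurecooker", first, second)) {
        std::cout << "split: " << first << " " << second << "\n";  // split: pressure cooker
    }
    return 0;
}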
diff --git a/test/collection_specific_test.cpp b/test/collection_specific_test.cpp
index 8ae33b22..e33866f5 100644
--- a/test/collection_specific_test.cpp
+++ b/test/collection_specific_test.cpp
@@ -1711,8 +1711,6 @@ TEST_F(CollectionSpecificTest, RepeatingStringArrayTokens) {
 }
 
 TEST_F(CollectionSpecificTest, HighlightOnPrefixRegression) {
-    std::vector<std::string> tags;
-
     // when the first document containing a token already cannot fit compact posting list
     std::vector<field> fields = {field("title", field_types::STRING, false),};
 
@@ -1727,8 +1725,44 @@ TEST_F(CollectionSpecificTest, HighlightOnPrefixRegression) {
     auto results = coll1->search("and", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
 
     ASSERT_EQ(1, results["hits"].size());
 
-    LOG(INFO) << results;
-
     collectionManager.drop_collection("coll1");
 }
 
+TEST_F(CollectionSpecificTest, SearchShouldJoinToken) {
+    std::vector<field> fields = {field("title", field_types::STRING, false),};
+
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();
+
+    nlohmann::json doc;
+    doc["title"] = "The nonstick pressure cooker is a great invention.";
+
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    auto results = coll1->search("non stick", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    ASSERT_EQ(1, results["hits"].size());
+
+    results = coll1->search("pressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    ASSERT_EQ(1, results["hits"].size());
+
+    results = coll1->search("t h e", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    ASSERT_EQ(1, results["hits"].size());
+
+    results = coll1->search("c o o k e r", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    ASSERT_EQ(1, results["hits"].size());
+
+    // splitting a token into three words is not supported
+    results = coll1->search("nonstickpressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    ASSERT_EQ(0, results["hits"].size());
+
+    // only the first 5 words of the query are used for concat/split
+    results = coll1->search("nonstick pressure cooker is a greatinvention", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    ASSERT_EQ(0, results["hits"].size());
+
+    results = coll1->search("nonstick pressure cooker is a gr eat", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
+    ASSERT_EQ(0, results["hits"].size());
+
+    collectionManager.drop_collection("coll1");
+}
diff --git a/test/collection_synonyms_test.cpp b/test/collection_synonyms_test.cpp
index 5f9807b3..3d4cdb15 100644
--- a/test/collection_synonyms_test.cpp
+++ b/test/collection_synonyms_test.cpp
@@ -561,13 +561,12 @@ TEST_F(CollectionSynonymsTest, SynonymSingleTokenExactMatch) {
     synonym_t synonym1{"syn-1", {"lulu", "lemon"}, {{"lululemon"}}};
     coll1->add_synonym(synonym1);
 
-    auto res = coll1->search("lulu lemon", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 1).get();
+    auto res = coll1->search("lulu lemon", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 0).get();
 
-    ASSERT_EQ(2, res["hits"].size());
-    ASSERT_EQ(2, res["found"].get<size_t>());
+    ASSERT_EQ(1, res["hits"].size());
+    ASSERT_EQ(1, res["found"].get<size_t>());
 
     ASSERT_STREQ("2", res["hits"][0]["document"]["id"].get<std::string>().c_str());
-    ASSERT_STREQ("1", res["hits"][1]["document"]["id"].get<std::string>().c_str());
 
     collectionManager.drop_collection("coll1");
 }
\ No newline at end of file
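One note on the guard everything above relies on: common_results_exist only accepts a candidate when the tokens' posting lists intersect, i.e. at least one document contains every candidate token. This is why "lulu lemon" in the synonyms test now resolves to the single document containing "lululemon" rather than also returning the looser second hit. A rough standalone analog of that intersection check, using sorted document-ID vectors in place of posting_t lists (an illustrative assumption; the real code calls posting_t::intersect):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <vector>

// Each inner vector is a sorted list of document IDs containing one token
// (a stand-in for a posting list).
static bool common_results_exist(const std::vector<std::vector<uint32_t>>& postings) {
    if(postings.empty()) {
        return false;
    }

    std::vector<uint32_t> result_ids = postings[0];

    // fold the remaining lists into the running intersection
    for(size_t i = 1; i < postings.size(); i++) {
        std::vector<uint32_t> out;
        std::set_intersection(result_ids.begin(), result_ids.end(),
                              postings[i].begin(), postings[i].end(),
                              std::back_inserter(out));
        result_ids = std::move(out);
    }

    return !result_ids.empty();
}

int main() {
    // doc 0 contains both tokens; doc 1 contains only the first
    std::cout << common_results_exist({{0, 1}, {0}}) << "\n";  // 1 (doc 0 matches both)
    std::cout << common_results_exist({{1}, {0}}) << "\n";     // 0 (no common doc)
    return 0;
}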