Basics of treating space as a typo.

Kishore Nallan 2021-11-21 07:53:27 +05:30
parent 237b67816d
commit f0b09e6c07
5 changed files with 206 additions and 9 deletions
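
In outline, the change treats a missing or stray space as a typo: when the query tokens do not all match the index verbatim, adjacent tokens are joined ("non stick" becomes "nonstick") and single tokens are split ("pressurecooker" becomes "pressure cooker"), and the first rewrite whose tokens all exist and share results is used. Below is a minimal standalone sketch of the candidate-generation idea; a std::set stands in for the real ART-backed index and document-overlap check, so the names here are illustrative, not the actual API.

#include <iostream>
#include <set>
#include <string>
#include <vector>

// Hypothetical stand-in for the index lookup: the real code searches an ART tree
// and additionally verifies that the matched tokens co-occur in some document.
static bool in_dictionary(const std::set<std::string>& dict, const std::string& w) {
    return dict.count(w) > 0;
}

// Generate alternative queries by joining two adjacent tokens or splitting one token.
static std::vector<std::vector<std::string>> space_typo_candidates(
        const std::vector<std::string>& tokens, const std::set<std::string>& dict) {
    std::vector<std::vector<std::string>> candidates;

    // join adjacent pairs: ["non", "stick"] -> ["nonstick"]
    for(size_t i = 0; i + 1 < tokens.size(); i++) {
        std::string joined = tokens[i] + tokens[i + 1];
        if(in_dictionary(dict, joined)) {
            std::vector<std::string> cand(tokens.begin(), tokens.begin() + i);
            cand.push_back(joined);
            cand.insert(cand.end(), tokens.begin() + i + 2, tokens.end());
            candidates.push_back(cand);
        }
    }

    // split one token into two dictionary words: ["pressurecooker"] -> ["pressure", "cooker"]
    for(size_t i = 0; i < tokens.size(); i++) {
        for(size_t cut = 1; cut < tokens[i].size(); cut++) {
            std::string a = tokens[i].substr(0, cut);
            std::string b = tokens[i].substr(cut);
            if(in_dictionary(dict, a) && in_dictionary(dict, b)) {
                std::vector<std::string> cand(tokens.begin(), tokens.begin() + i);
                cand.push_back(a);
                cand.push_back(b);
                cand.insert(cand.end(), tokens.begin() + i + 1, tokens.end());
                candidates.push_back(cand);
            }
        }
    }

    return candidates;
}

int main() {
    std::set<std::string> dict = {"nonstick", "pressure", "cooker"};

    for(const auto& cand: space_typo_candidates({"non", "stick"}, dict)) {
        for(const auto& t: cand) {
            std::cout << t << " ";
        }
        std::cout << "\n";  // prints: nonstick
    }
}

The actual implementation in the diff below additionally caps rewrites to the first five query tokens and verifies each candidate against real posting lists.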

@@ -575,6 +575,8 @@ private:
                          const std::string &field_name,
                          nlohmann::json::iterator& array_iter, bool is_array, bool& array_ele_erased);

    bool common_results_exist(std::vector<art_leaf*>& leaves);

public:
    // for limiting number of results on multiple candidates / query rewrites
    enum {TYPO_TOKENS_THRESHOLD = 1};
@@ -754,6 +756,9 @@ public:
                          const std::vector<std::string>& group_by_fields,
                          std::vector<facet_info_t>& facet_infos) const;

    void resolve_space_as_typos(std::vector<std::string>& qtokens, const std::string& field_name,
                                std::vector<std::vector<std::string>>& resolved_queries);

    size_t num_seq_ids() const;
};

@@ -916,8 +916,17 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query, const s
                                              field_locale, pre_segmented_query);

                // get synonyms
                std::vector<std::vector<std::string>> q_synonyms;
                synonym_reduction(field_query_tokens[i].q_include_tokens, field_query_tokens[i].q_synonyms);

                std::vector<std::vector<std::string>> space_resolved_queries;
                index->resolve_space_as_typos(field_query_tokens[i].q_include_tokens, search_field,
                                              space_resolved_queries);

                // only one query is resolved for now, so just use that
                if(!space_resolved_queries.empty()) {
                    field_query_tokens[i].q_include_tokens = space_resolved_queries[0];
                    synonym_reduction(space_resolved_queries[0], field_query_tokens[i].q_synonyms);
                }
            }
        }

@@ -3920,6 +3920,156 @@ size_t Index::num_seq_ids() const {
    return seq_ids.getLength();
}
void Index::resolve_space_as_typos(std::vector<std::string>& qtokens, const string& field_name,
                                   std::vector<std::vector<std::string>>& resolved_queries) {

    std::shared_lock lock(mutex);
    auto tree_it = search_index.find(field_name);

    if(tree_it == search_index.end()) {
        return;
    }

    // we will try to find a verbatim match first

    art_tree* t = tree_it->second;
    std::vector<art_leaf*> leaves;

    for(const std::string& token: qtokens) {
        art_leaf* leaf = (art_leaf *) art_search(t, (const unsigned char*) token.c_str(),
                                                 token.length()+1);
        if(leaf == nullptr) {
            break;
        }

        leaves.push_back(leaf);
    }

    if(leaves.size() == qtokens.size() && common_results_exist(leaves)) {
        return;
    }
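
    // e.g. for the query "non stick": even if both "non" and "stick" exist as indexed
    // tokens, common_results_exist() must also find a document containing both before
    // the query is accepted verbatim; otherwise we fall through to the rewrites below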
    // When we cannot find a verbatim match, we can try concatenating and splitting query tokens for alternatives.

    // Concatenation:

    size_t qtokens_size = std::min<size_t>(5, qtokens.size());  // only first 5 tokens will be considered

    if(qtokens.size() > 1) {
        // a) join all tokens to form a single string
        const string& all_tokens_query = StringUtils::join(qtokens, "");
        if(art_search(t, (const unsigned char*) all_tokens_query.c_str(), all_tokens_query.length()+1) != nullptr) {
            resolved_queries.push_back({all_tokens_query});
            return;
        }

        // b) join 2 adjacent tokens in a sliding window (needs more than 2 tokens,
        //    since a full join of a 2-token query was already tried above)
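        // e.g. for ["non", "stick", "cooker"], window i=0 yields the candidate
        // ["nonstick", "cooker"] and i=1 yields ["non", "stickcooker"]; the first
        // candidate whose tokens all exist and co-occur in some document wins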
        for(size_t i = 0; i < qtokens_size-1 && qtokens_size > 2; i++) {
            std::vector<std::string> candidate_tokens;

            for(size_t j = 0; j < i; j++) {
                candidate_tokens.push_back(qtokens[j]);
            }

            std::string joined_tokens = qtokens[i] + qtokens[i+1];
            candidate_tokens.push_back(joined_tokens);

            for(size_t j = i+2; j < qtokens.size(); j++) {
                candidate_tokens.push_back(qtokens[j]);
            }

            leaves.clear();

            for(auto& token: candidate_tokens) {
                art_leaf* leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) token.c_str(),
                                                                   token.length() + 1));
                if(leaf == nullptr) {
                    break;
                }

                leaves.push_back(leaf);
            }

            if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves)) {
                resolved_queries.push_back(candidate_tokens);
                return;
            }
        }
    }
    // concatenation did not work, so we will try splitting individual tokens
    for(size_t i = 0; i < qtokens_size; i++) {
        std::vector<std::string> candidate_tokens;

        for(size_t j = 0; j < i; j++) {
            candidate_tokens.push_back(qtokens[j]);
        }

        const std::string& token = qtokens[i];
        bool found_split = false;
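
        // scan split points from the right: for "pressurecooker" this tries
        // ("pressurecooke", "r"), ("pressurecook", "er"), ... and stops at the first
        // pair such as ("pressure", "cooker") where both parts are indexed tokens
        // that co-occur in at least one document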
        for(size_t ci = 1; ci < token.size(); ci++) {
            std::string first_part = token.substr(0, token.size()-ci);
            art_leaf* first_leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) first_part.c_str(),
                                                                     first_part.length() + 1));
            if(first_leaf != nullptr) {
                // check if the rest of the string is also a valid token
                std::string second_part = token.substr(token.size()-ci, ci);
                art_leaf* second_leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) second_part.c_str(),
                                                                          second_part.length() + 1));
                std::vector<art_leaf*> part_leaves = {first_leaf, second_leaf};

                if(second_leaf != nullptr && common_results_exist(part_leaves)) {
                    candidate_tokens.push_back(first_part);
                    candidate_tokens.push_back(second_part);
                    found_split = true;
                    break;
                }
            }
        }

        if(!found_split) {
            continue;
        }

        for(size_t j = i+1; j < qtokens.size(); j++) {
            candidate_tokens.push_back(qtokens[j]);
        }

        leaves.clear();

        for(auto& token: candidate_tokens) {
            art_leaf* leaf = static_cast<art_leaf*>(art_search(t, (const unsigned char*) token.c_str(),
                                                               token.length() + 1));
            if(leaf == nullptr) {
                break;
            }

            leaves.push_back(leaf);
        }

        if(candidate_tokens.size() == leaves.size() && common_results_exist(leaves)) {
            resolved_queries.push_back(candidate_tokens);
            return;
        }
    }
}
bool Index::common_results_exist(std::vector<art_leaf*>& leaves) {
    std::vector<uint32_t> result_ids;
    std::vector<void*> leaf_vals;

    for(auto leaf: leaves) {
        leaf_vals.push_back(leaf->values);
    }

    posting_t::intersect(leaf_vals, result_ids);
    return !result_ids.empty();
}
/*
// https://stackoverflow.com/questions/924171/geo-fencing-point-inside-outside-polygon
// NOTE: polygon and point should have been transformed with `transform_for_180th_meridian`
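
The posting_t::intersect call above is internal to the engine, but conceptually it intersects the sorted document-ID lists behind each token's posting list. A rough standalone analogue follows; intersect_ids and the sample ID lists are illustrative stand-ins, not the real API.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <vector>

// Illustrative analogue of posting-list intersection: each inner vector holds
// the sorted IDs of documents containing one token; the result holds the IDs
// of documents containing every token.
std::vector<uint32_t> intersect_ids(const std::vector<std::vector<uint32_t>>& lists) {
    if(lists.empty()) {
        return {};
    }

    std::vector<uint32_t> result = lists[0];

    for(size_t i = 1; i < lists.size() && !result.empty(); i++) {
        std::vector<uint32_t> tmp;
        std::set_intersection(result.begin(), result.end(),
                              lists[i].begin(), lists[i].end(),
                              std::back_inserter(tmp));
        result = std::move(tmp);
    }

    return result;
}

int main() {
    // hypothetical posting lists for "pressure" and "cooker"
    std::vector<std::vector<uint32_t>> lists = {{1, 4, 7}, {4, 7, 9}};
    for(uint32_t id: intersect_ids(lists)) {
        std::cout << id << " ";  // prints: 4 7
    }
}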

@@ -1711,8 +1711,6 @@ TEST_F(CollectionSpecificTest, RepeatingStringArrayTokens) {
}

TEST_F(CollectionSpecificTest, HighlightOnPrefixRegression) {
    std::vector<std::string> tags;

    // when the first document containing a token already cannot fit in a compact posting list
    std::vector<field> fields = {field("title", field_types::STRING, false),};
@@ -1727,8 +1725,44 @@ TEST_F(CollectionSpecificTest, HighlightOnPrefixRegression) {
    auto results = coll1->search("and", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
    ASSERT_EQ(1, results["hits"].size());

    LOG(INFO) << results;

    collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, SearchShouldJoinToken) {
    // when the first document containing a token already cannot fit in a compact posting list
    std::vector<field> fields = {field("title", field_types::STRING, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();

    nlohmann::json doc;
    doc["title"] = "The nonstick pressure cooker is a great invention.";
    ASSERT_TRUE(coll1->add(doc.dump()).ok());

    auto results = coll1->search("non stick", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());

    results = coll1->search("pressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());

    results = coll1->search("t h e", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());

    results = coll1->search("c o o k e r", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(1, results["hits"].size());

    // a three-word split won't work
    results = coll1->search("nonstickpressurecooker", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(0, results["hits"].size());

    // only the first 5 words of the query are used for concat/split
    results = coll1->search("nonstick pressure cooker is a greatinvention", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(0, results["hits"].size());

    results = coll1->search("nonstick pressure cooker is a gr eat", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(0, results["hits"].size());

    collectionManager.drop_collection("coll1");
}

@@ -561,13 +561,12 @@ TEST_F(CollectionSynonymsTest, SynonymSingleTokenExactMatch) {
     synonym_t synonym1{"syn-1", {"lulu", "lemon"}, {{"lululemon"}}};
     coll1->add_synonym(synonym1);

-    auto res = coll1->search("lulu lemon", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 1).get();
+    auto res = coll1->search("lulu lemon", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 0).get();

-    ASSERT_EQ(2, res["hits"].size());
-    ASSERT_EQ(2, res["found"].get<uint32_t>());
+    ASSERT_EQ(1, res["hits"].size());
+    ASSERT_EQ(1, res["found"].get<uint32_t>());

     ASSERT_STREQ("2", res["hits"][0]["document"]["id"].get<std::string>().c_str());
-    ASSERT_STREQ("1", res["hits"][1]["document"]["id"].get<std::string>().c_str());

     collectionManager.drop_collection("coll1");
}