Do drop tokens at a global level.

Kishore Nallan 2022-03-23 16:40:08 +05:30
parent 6b743cfa48
commit 66cb71039f
16 changed files with 664 additions and 453 deletions


@ -18,108 +18,13 @@
#include <field.h>
#include <option.h>
#include "tokenizer.h"
#include "synonym_index.h"
struct doc_seq_id_t {
uint32_t seq_id;
bool is_new;
};
struct synonym_t {
std::string id;
std::vector<std::string> root;
std::vector<std::vector<std::string>> synonyms;
synonym_t() = default;
synonym_t(const std::string& id, const std::vector<std::string>& root,
const std::vector<std::vector<std::string>>& synonyms):
id(id), root(root), synonyms(synonyms) {
}
explicit synonym_t(const nlohmann::json& synonym) {
id = synonym["id"].get<std::string>();
if(synonym.count("root") != 0) {
root = synonym["root"].get<std::vector<std::string>>();
}
synonyms = synonym["synonyms"].get<std::vector<std::vector<std::string>>>();
}
nlohmann::json to_json() const {
nlohmann::json obj;
obj["id"] = id;
obj["root"] = root;
obj["synonyms"] = synonyms;
return obj;
}
nlohmann::json to_view_json() const {
nlohmann::json obj;
obj["id"] = id;
obj["root"] = StringUtils::join(root, " ");
obj["synonyms"] = nlohmann::json::array();
for(const auto& synonym: synonyms) {
obj["synonyms"].push_back(StringUtils::join(synonym, " "));
}
return obj;
}
static Option<bool> parse(const nlohmann::json& synonym_json, synonym_t& syn) {
if(synonym_json.count("id") == 0) {
return Option<bool>(400, "Missing `id` field.");
}
if(synonym_json.count("synonyms") == 0) {
return Option<bool>(400, "Could not find an array of `synonyms`");
}
if(synonym_json.count("root") != 0 && !synonym_json["root"].is_string()) {
return Option<bool>(400, "Key `root` should be a string.");
}
if (!synonym_json["synonyms"].is_array() || synonym_json["synonyms"].empty()) {
return Option<bool>(400, "Could not find an array of `synonyms`");
}
for(const auto& synonym: synonym_json["synonyms"]) {
if(!synonym.is_string() || synonym == "") {
return Option<bool>(400, "Could not find a valid string array of `synonyms`");
}
std::vector<std::string> tokens;
Tokenizer(synonym, true).tokenize(tokens);
syn.synonyms.push_back(tokens);
}
if(synonym_json.count("root") != 0) {
std::vector<std::string> tokens;
Tokenizer(synonym_json["root"], true).tokenize(tokens);
syn.root = tokens;
}
syn.id = synonym_json["id"];
return Option<bool>(true);
}
static uint64_t get_hash(const std::vector<std::string>& tokens) {
uint64_t hash = 1;
for(size_t i=0; i < tokens.size(); i++) {
auto& token = tokens[i];
uint64_t token_hash = StringUtils::hash_wy(token.c_str(), token.size());
if(i == 0) {
hash = token_hash;
} else {
hash = Index::hash_combine(hash, token_hash);
}
}
return hash;
}
};
struct highlight_field_t {
std::string name;
bool fully_highlighted;
@ -176,9 +81,6 @@ private:
std::map<std::string, override_t> overrides;
spp::sparse_hash_map<std::string, synonym_t> synonym_definitions;
spp::sparse_hash_map<uint64_t, std::vector<std::string>> synonym_index;
const std::string default_sorting_field;
const float max_memory_ratio;
@ -193,6 +95,8 @@ private:
Index* index;
SynonymIndex* synonym_index;
// methods
std::string get_doc_id_key(const std::string & doc_id) const;
@ -242,12 +146,6 @@ private:
static Option<bool> parse_pinned_hits(const std::string& pinned_hits_str,
std::map<size_t, std::vector<std::string>>& pinned_hits);
void synonym_reduction_internal(const std::vector<std::string>& tokens,
size_t start_window_size,
size_t start_index_pos,
std::set<uint64_t>& processed_syn_hashes,
std::vector<std::vector<std::string>>& results) const;
Index* init_index();
static std::vector<char> to_char_array(const std::vector<std::string>& strs);
@ -267,7 +165,6 @@ public:
static constexpr const char* COLLECTION_META_PREFIX = "$CM";
static constexpr const char* COLLECTION_NEXT_SEQ_PREFIX = "$CS";
static constexpr const char* COLLECTION_OVERRIDE_PREFIX = "$CO";
static constexpr const char* COLLECTION_SYNONYM_PREFIX = "$CY";
static constexpr const char* SEQ_ID_PREFIX = "$SI";
static constexpr const char* DOC_ID_PREFIX = "$DI";
@ -300,8 +197,6 @@ public:
static std::string get_override_key(const std::string & collection_name, const std::string & override_id);
static std::string get_synonym_key(const std::string & collection_name, const std::string & synonym_id);
std::string get_seq_id_collection_prefix() const;
std::string get_name() const;
@ -444,9 +339,6 @@ public:
// synonym operations
void synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const;
spp::sparse_hash_map<std::string, synonym_t> get_synonyms();
bool get_synonym(const std::string& id, synonym_t& synonym);
@ -455,6 +347,11 @@ public:
Option<bool> remove_synonym(const std::string & id);
void synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const;
// highlight ops
static void highlight_text(const string& highlight_start_tag, const string& highlight_end_tag, const string& last_raw_q_token,
const string& text, const std::map<size_t, size_t>& token_offsets,
const std::map<size_t, std::string>& prefix_start_offsets, size_t snippet_end_offset,


@ -24,6 +24,7 @@
#include "adi_tree.h"
#include "tsl/htrie_set.h"
#include "id_list.h"
#include "synonym_index.h"
static constexpr size_t ARRAY_FACET_DIM = 4;
using facet_map_t = spp::sparse_hash_map<uint32_t, facet_hash_values_t>;
@ -56,7 +57,7 @@ struct search_field_t {
};
struct query_tokens_t {
std::vector<std::string> q_include_tokens;
std::vector<token_t> q_include_tokens;
std::vector<std::vector<std::string>> q_exclude_tokens;
std::vector<std::vector<std::string>> q_phrases;
std::vector<std::vector<std::string>> q_synonyms;
@ -431,6 +432,8 @@ private:
const Store* store;
const SynonymIndex* synonym_index;
ThreadPool* thread_pool;
size_t num_documents;
@ -480,6 +483,7 @@ private:
long long int n,
std::vector<art_leaf *>& actual_query_suggestion,
std::vector<art_leaf *>& query_suggestion,
int syn_orig_num_tokens,
uint32_t& token_bits,
uint64& qhash);
@ -506,8 +510,7 @@ private:
static void aggregate_topster(Topster* agg_topster, Topster* index_topster);
void search_field(const uint8_t & field_id,
std::vector<token_t>& query_tokens,
std::vector<token_t>& search_tokens,
const std::vector<token_t>& query_tokens,
const uint32_t* exclude_token_ids,
size_t exclude_token_ids_size,
size_t& num_tokens_dropped,
@ -575,7 +578,7 @@ private:
std::unordered_map<std::string, std::vector<uint32_t>>& token_to_offsets,
std::vector<uint64_t>& facet_hashes);
void collate_included_ids(const std::vector<std::string>& q_included_tokens,
void collate_included_ids(const std::vector<token_t>& q_included_tokens,
const std::string & field, const uint8_t field_id,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
Topster* curated_topster, std::vector<std::vector<art_leaf*>> & searched_queries) const;
@ -635,18 +638,14 @@ public:
Index(const std::string& name,
const uint32_t collection_id,
const Store* store,
SynonymIndex* synonym_index,
ThreadPool* thread_pool,
const std::unordered_map<std::string, field>& search_schema,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators);
const std::vector<char>& symbols_to_index,
const std::vector<char>& token_separators);
~Index();
// reference: https://stackoverflow.com/a/27952689/131050
static uint64_t hash_combine(uint64_t combined, uint64_t hash) {
combined ^= hash + 0x517cc1b727220a95 + (combined << 6) + (combined >> 2);
return combined;
}
static void concat_topster_ids(Topster* topster, spp::sparse_hash_map<uint64_t, std::vector<KV*>>& topster_ids);
void score_results(const std::vector<sort_by> &sort_fields, const uint16_t &query_index, const uint8_t &field_id,
@ -819,7 +818,7 @@ public:
int field_num_typos,
bool field_prefix, const uint8_t field_id, const string& field_name,
const std::unordered_map<string, field>::const_iterator& field_it,
std::vector<token_t>& query_tokens, std::vector<token_t>& search_tokens,
std::vector<token_t>& query_tokens,
size_t num_tokens_dropped, Topster* actual_topster, size_t field_num_results,
std::vector<query_tokens_t>& field_query_tokens, size_t& all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed,


@ -295,6 +295,12 @@ struct StringUtils {
return hash != std::numeric_limits<uint64_t>::max() ? hash : (std::numeric_limits<uint64_t>::max()-1);
}
// reference: https://stackoverflow.com/a/27952689/131050
static uint64_t hash_combine(uint64_t combined, uint64_t hash) {
combined ^= hash + 0x517cc1b727220a95 + (combined << 6) + (combined >> 2);
return combined;
}
std::string unicode_nfkd(const std::string& text);
static std::string randstring(size_t length);
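
For context, a minimal standalone sketch (not part of this commit) of how the relocated StringUtils::hash_combine is chained over a token sequence, mirroring synonym_t::get_hash; std::hash<std::string> stands in for StringUtils::hash_wy here, and the helper name hash_tokens is made up.

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Same mixing step as StringUtils::hash_combine above.
static uint64_t hash_combine(uint64_t combined, uint64_t hash) {
    combined ^= hash + 0x517cc1b727220a95 + (combined << 6) + (combined >> 2);
    return combined;
}

// Folds per-token hashes into a single value, the way synonym_t::get_hash does.
// std::hash is only a stand-in for StringUtils::hash_wy.
static uint64_t hash_tokens(const std::vector<std::string>& tokens) {
    uint64_t hash = 1;
    for(std::size_t i = 0; i < tokens.size(); i++) {
        uint64_t token_hash = std::hash<std::string>{}(tokens[i]);
        hash = (i == 0) ? token_hash : hash_combine(hash, token_hash);
    }
    return hash;
}

// hash_tokens({"running", "shoes"}) != hash_tokens({"shoes", "running"}),
// so token order matters when a window of query tokens is looked up.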

include/synonym_index.h (new file, 139 lines)

@ -0,0 +1,139 @@
#pragma once
#include <set>
#include "sparsepp.h"
#include "json.hpp"
#include "string_utils.h"
#include "option.h"
#include "tokenizer.h"
#include "store.h"
struct synonym_t {
std::string id;
std::vector<std::string> root;
std::vector<std::vector<std::string>> synonyms;
synonym_t() = default;
synonym_t(const std::string& id, const std::vector<std::string>& root,
const std::vector<std::vector<std::string>>& synonyms):
id(id), root(root), synonyms(synonyms) {
}
explicit synonym_t(const nlohmann::json& synonym) {
id = synonym["id"].get<std::string>();
if(synonym.count("root") != 0) {
root = synonym["root"].get<std::vector<std::string>>();
}
synonyms = synonym["synonyms"].get<std::vector<std::vector<std::string>>>();
}
nlohmann::json to_json() const {
nlohmann::json obj;
obj["id"] = id;
obj["root"] = root;
obj["synonyms"] = synonyms;
return obj;
}
nlohmann::json to_view_json() const {
nlohmann::json obj;
obj["id"] = id;
obj["root"] = StringUtils::join(root, " ");
obj["synonyms"] = nlohmann::json::array();
for(const auto& synonym: synonyms) {
obj["synonyms"].push_back(StringUtils::join(synonym, " "));
}
return obj;
}
static Option<bool> parse(const nlohmann::json& synonym_json, synonym_t& syn) {
if(synonym_json.count("id") == 0) {
return Option<bool>(400, "Missing `id` field.");
}
if(synonym_json.count("synonyms") == 0) {
return Option<bool>(400, "Could not find an array of `synonyms`");
}
if(synonym_json.count("root") != 0 && !synonym_json["root"].is_string()) {
return Option<bool>(400, "Key `root` should be a string.");
}
if (!synonym_json["synonyms"].is_array() || synonym_json["synonyms"].empty()) {
return Option<bool>(400, "Could not find an array of `synonyms`");
}
for(const auto& synonym: synonym_json["synonyms"]) {
if(!synonym.is_string() || synonym == "") {
return Option<bool>(400, "Could not find a valid string array of `synonyms`");
}
std::vector<std::string> tokens;
Tokenizer(synonym, true).tokenize(tokens);
syn.synonyms.push_back(tokens);
}
if(synonym_json.count("root") != 0) {
std::vector<std::string> tokens;
Tokenizer(synonym_json["root"], true).tokenize(tokens);
syn.root = tokens;
}
syn.id = synonym_json["id"];
return Option<bool>(true);
}
static uint64_t get_hash(const std::vector<std::string>& tokens) {
uint64_t hash = 1;
for(size_t i=0; i < tokens.size(); i++) {
auto& token = tokens[i];
uint64_t token_hash = StringUtils::hash_wy(token.c_str(), token.size());
if(i == 0) {
hash = token_hash;
} else {
hash = StringUtils::hash_combine(hash, token_hash);
}
}
return hash;
}
};
class SynonymIndex {
private:
mutable std::shared_mutex mutex;
Store* store;
spp::sparse_hash_map<std::string, synonym_t> synonym_definitions;
spp::sparse_hash_map<uint64_t, std::vector<std::string>> synonym_index;
void synonym_reduction_internal(const std::vector<std::string>& tokens,
size_t start_window_size,
size_t start_index_pos,
std::set<uint64_t>& processed_syn_hashes,
std::vector<std::vector<std::string>>& results) const;
public:
static constexpr const char* COLLECTION_SYNONYM_PREFIX = "$CY";
SynonymIndex(Store* store): store(store) { }
static std::string get_synonym_key(const std::string & collection_name, const std::string & synonym_id);
void synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const;
spp::sparse_hash_map<std::string, synonym_t> get_synonyms();
bool get_synonym(const std::string& id, synonym_t& synonym);
Option<bool> add_synonym(const std::string & collection_name, const synonym_t& synonym);
Option<bool> remove_synonym(const std::string & collection_name, const std::string & id);
};
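
A hedged usage sketch of the new header (illustrative only): the wrapper function add_example_synonym, the collection name "products", the synonym id and JSON payload are made up, and the Store pointer is assumed to be already initialized.

// Illustrative only: the JSON shape accepted by synonym_t::parse and how a
// parsed definition is registered with the new SynonymIndex.
#include "synonym_index.h"

void add_example_synonym(Store* store) {   // `store`: an already-initialized Store*
    nlohmann::json syn_json = nlohmann::json::parse(R"({
        "id": "syn-sneakers",
        "root": "sneakers",
        "synonyms": ["running shoes", "trainers"]
    })");

    synonym_t syn;
    Option<bool> parse_op = synonym_t::parse(syn_json, syn);
    if(!parse_op.ok()) {
        return; // validation failed (missing `id`, bad `root`, empty `synonyms`, ...)
    }

    SynonymIndex synonym_index(store);
    // the definition is persisted under the "$CY_<collection>_<id>" key
    synonym_index.add_synonym("products", syn);
}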


@ -57,6 +57,7 @@ Collection::Collection(const std::string& name, const uint32_t collection_id, co
Collection::~Collection() {
std::unique_lock lock(mutex);
delete index;
delete synonym_index;
}
uint32_t Collection::get_next_seq_id() {
@ -974,28 +975,34 @@ Option<nlohmann::json> Collection::search(const std::string & raw_query, const s
//LOG(INFO) << "Num indices used for querying: " << indices.size();
std::vector<query_tokens_t> field_query_tokens;
std::vector<std::string> q_tokens; // used for auxiliary highlighting
std::vector<std::string> q_include_tokens;
if(search_fields.size() == 0) {
// has to be a wildcard query
field_query_tokens.emplace_back(query_tokens_t{});
parse_search_query(query, field_query_tokens[0].q_include_tokens,
parse_search_query(query, q_include_tokens,
field_query_tokens[0].q_exclude_tokens, field_query_tokens[0].q_phrases, "",
false);
for(size_t i = 0; i < q_include_tokens.size(); i++) {
auto& q_include_token = q_include_tokens[i];
field_query_tokens[0].q_include_tokens.emplace_back(i, q_include_token, (i == q_include_tokens.size()-1));
}
} else {
field_query_tokens.emplace_back(query_tokens_t{});
const std::string & field_locale = search_schema.at(search_fields[0]).locale;
parse_search_query(query, field_query_tokens[0].q_include_tokens,
parse_search_query(query, q_include_tokens,
field_query_tokens[0].q_exclude_tokens,
field_query_tokens[0].q_phrases,
field_locale, pre_segmented_query);
// process filter overrides first, before synonyms (order is important)
index->process_filter_overrides(filter_overrides, field_query_tokens[0].q_include_tokens, token_order, filters);
index->process_filter_overrides(filter_overrides, q_include_tokens, token_order, filters);
// get synonyms
synonym_reduction(field_query_tokens[0].q_include_tokens, field_query_tokens[0].q_synonyms);
q_tokens = field_query_tokens[0].q_include_tokens;
for(size_t i = 0; i < q_include_tokens.size(); i++) {
auto& q_include_token = q_include_tokens[i];
q_tokens.push_back(q_include_token);
field_query_tokens[0].q_include_tokens.emplace_back(i, q_include_token, (i == q_include_tokens.size()-1));
}
for(auto& phrase: field_query_tokens[0].q_phrases) {
for(auto& token: phrase) {
@ -2344,10 +2351,6 @@ std::string Collection::get_override_key(const std::string & collection_name, co
return std::string(COLLECTION_OVERRIDE_PREFIX) + "_" + collection_name + "_" + override_id;
}
std::string Collection::get_synonym_key(const std::string & collection_name, const std::string & synonym_id) {
return std::string(COLLECTION_SYNONYM_PREFIX) + "_" + collection_name + "_" + synonym_id;
}
std::string Collection::get_seq_id_collection_prefix() const {
return std::to_string(collection_id) + "_" + std::string(SEQ_ID_PREFIX);
}
@ -2432,184 +2435,30 @@ Option<bool> Collection::parse_pinned_hits(const std::string& pinned_hits_str,
return Option<bool>(true);
}
void Collection::synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const {
if(synonym_definitions.empty()) {
return;
}
std::set<uint64_t> processed_syn_hashes;
synonym_reduction_internal(tokens, tokens.size(), 0, processed_syn_hashes, results);
}
Option<bool> Collection::add_synonym(const synonym_t& synonym) {
if(synonym_definitions.count(synonym.id) != 0) {
// first we have to delete existing entries so we can upsert
Option<bool> rem_op = remove_synonym(synonym.id);
if(!rem_op.ok()) {
return rem_op;
}
}
std::unique_lock write_lock(mutex);
synonym_definitions[synonym.id] = synonym;
if(!synonym.root.empty()) {
uint64_t root_hash = synonym_t::get_hash(synonym.root);
synonym_index[root_hash].emplace_back(synonym.id);
} else {
for(const auto & syn_tokens : synonym.synonyms) {
uint64_t syn_hash = synonym_t::get_hash(syn_tokens);
synonym_index[syn_hash].emplace_back(synonym.id);
}
}
write_lock.unlock();
bool inserted = store->insert(Collection::get_synonym_key(name, synonym.id), synonym.to_json().dump());
if(!inserted) {
return Option<bool>(500, "Error while storing the synonym on disk.");
}
return Option<bool>(true);
std::shared_lock lock(mutex);
return synonym_index->add_synonym(name, synonym);
}
bool Collection::get_synonym(const std::string& id, synonym_t& synonym) {
std::shared_lock lock(mutex);
if(synonym_definitions.count(id) != 0) {
synonym = synonym_definitions[id];
return true;
}
return false;
}
void Collection::synonym_reduction_internal(const std::vector<std::string>& tokens,
size_t start_window_size, size_t start_index_pos,
std::set<uint64_t>& processed_syn_hashes,
std::vector<std::vector<std::string>>& results) const {
bool recursed = false;
for(size_t window_len = start_window_size; window_len > 0; window_len--) {
for(size_t start_index = start_index_pos; start_index+window_len-1 < tokens.size(); start_index++) {
std::vector<uint64_t> syn_hashes;
uint64_t syn_hash = 1;
for(size_t i = start_index; i < start_index+window_len; i++) {
uint64_t token_hash = StringUtils::hash_wy(tokens[i].c_str(), tokens[i].size());
if(i == start_index) {
syn_hash = token_hash;
} else {
syn_hash = Index::hash_combine(syn_hash, token_hash);
}
syn_hashes.push_back(token_hash);
}
const auto& syn_itr = synonym_index.find(syn_hash);
if(syn_itr != synonym_index.end() && processed_syn_hashes.count(syn_hash) == 0) {
// tokens in this window match a synonym: reconstruct tokens and rerun synonym mapping against matches
const auto& syn_ids = syn_itr->second;
for(const auto& syn_id: syn_ids) {
const auto &syn_def = synonym_definitions.at(syn_id);
for (const auto &syn_def_tokens: syn_def.synonyms) {
std::vector<std::string> new_tokens;
for (size_t i = 0; i < start_index; i++) {
new_tokens.push_back(tokens[i]);
}
std::vector<uint64_t> syn_def_hashes;
uint64_t syn_def_hash = 1;
for (size_t i = 0; i < syn_def_tokens.size(); i++) {
const auto &syn_def_token = syn_def_tokens[i];
new_tokens.push_back(syn_def_token);
uint64_t token_hash = StringUtils::hash_wy(syn_def_token.c_str(),
syn_def_token.size());
if (i == 0) {
syn_def_hash = token_hash;
} else {
syn_def_hash = Index::hash_combine(syn_def_hash, token_hash);
}
syn_def_hashes.push_back(token_hash);
}
if (syn_def_hash == syn_hash) {
// skip over token matching itself in the group
continue;
}
for (size_t i = start_index + window_len; i < tokens.size(); i++) {
new_tokens.push_back(tokens[i]);
}
processed_syn_hashes.emplace(syn_def_hash);
processed_syn_hashes.emplace(syn_hash);
for (uint64_t h: syn_def_hashes) {
processed_syn_hashes.emplace(h);
}
for (uint64_t h: syn_hashes) {
processed_syn_hashes.emplace(h);
}
recursed = true;
synonym_reduction_internal(new_tokens, window_len, start_index, processed_syn_hashes, results);
}
}
}
}
// reset it because for the next window we have to start from scratch
start_index_pos = 0;
}
if(!recursed && !processed_syn_hashes.empty()) {
results.emplace_back(tokens);
}
return synonym_index->get_synonym(id, synonym);
}
Option<bool> Collection::remove_synonym(const std::string &id) {
std::unique_lock lock(mutex);
const auto& syn_iter = synonym_definitions.find(id);
std::shared_lock lock(mutex);
return synonym_index->remove_synonym(name, id);
}
if(syn_iter != synonym_definitions.end()) {
bool removed = store->remove(Collection::get_synonym_key(name, id));
if(!removed) {
return Option<bool>(500, "Error while deleting the synonym from disk.");
}
const auto& synonym = syn_iter->second;
if(!synonym.root.empty()) {
uint64_t root_hash = synonym_t::get_hash(synonym.root);
synonym_index.erase(root_hash);
} else {
for(const auto & syn_tokens : synonym.synonyms) {
uint64_t syn_hash = synonym_t::get_hash(syn_tokens);
synonym_index.erase(syn_hash);
}
}
synonym_definitions.erase(id);
return Option<bool>(true);
}
return Option<bool>(404, "Could not find that `id`.");
void Collection::synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const {
std::shared_lock lock(mutex);
return synonym_index->synonym_reduction(tokens, results);
}
spp::sparse_hash_map<std::string, synonym_t> Collection::get_synonyms() {
std::shared_lock lock(mutex);
return synonym_definitions;
return synonym_index->get_synonyms();
}
Option<bool> Collection::check_and_update_schema(nlohmann::json& document, const DIRTY_VALUES& dirty_values) {
@ -2773,9 +2622,12 @@ Index* Collection::init_index() {
search_schema.emplace(field.name, field);
}
synonym_index = new SynonymIndex(store);
return new Index(name+std::to_string(0),
collection_id,
store,
synonym_index,
CollectionManager::get_instance().get_thread_pool(),
search_schema,
symbols_to_index, token_separators);


@ -429,7 +429,7 @@ Option<nlohmann::json> CollectionManager::drop_collection(const std::string& col
// delete synonyms
const std::string& del_synonym_prefix =
std::string(Collection::COLLECTION_SYNONYM_PREFIX) + "_" + actual_coll_name + "_";
std::string(SynonymIndex::COLLECTION_SYNONYM_PREFIX) + "_" + actual_coll_name + "_";
iter = store->scan(del_synonym_prefix);
while(iter->Valid() && iter->key().starts_with(del_synonym_prefix)) {
store->remove(iter->key().ToString());
@ -1110,7 +1110,7 @@ Option<bool> CollectionManager::load_collection(const nlohmann::json &collection
// initialize synonyms
std::vector<std::string> collection_synonym_jsons;
cm.store->scan_fill(Collection::get_synonym_key(this_collection_name, ""), collection_synonym_jsons);
cm.store->scan_fill(SynonymIndex::get_synonym_key(this_collection_name, ""), collection_synonym_jsons);
for(const auto & collection_synonym_json: collection_synonym_jsons) {
nlohmann::json collection_synonym = nlohmann::json::parse(collection_synonym_json);


@ -38,10 +38,11 @@ spp::sparse_hash_map<uint32_t, int64_t> Index::seq_id_sentinel_value;
spp::sparse_hash_map<uint32_t, int64_t> Index::geo_sentinel_value;
spp::sparse_hash_map<uint32_t, int64_t> Index::str_sentinel_value;
Index::Index(const std::string& name, const uint32_t collection_id, const Store* store, ThreadPool* thread_pool,
Index::Index(const std::string& name, const uint32_t collection_id, const Store* store,
SynonymIndex* synonym_index, ThreadPool* thread_pool,
const std::unordered_map<std::string, field> & search_schema,
const std::vector<char>& symbols_to_index, const std::vector<char>& token_separators):
name(name), collection_id(collection_id), store(store), thread_pool(thread_pool),
name(name), collection_id(collection_id), store(store), synonym_index(synonym_index), thread_pool(thread_pool),
search_schema(search_schema),
seq_ids(new id_list_t(256)), symbols_to_index(symbols_to_index), token_separators(token_separators) {
@ -1142,7 +1143,7 @@ void Index::search_candidates(const uint8_t & field_id, bool field_is_array,
uint32_t token_bits = 0;
uint32_t total_cost = next_suggestion(token_candidates_vec, n, actual_query_suggestion,
query_suggestion, token_bits, qhash);
query_suggestion, syn_orig_num_tokens, token_bits, qhash);
if(query_hashes.find(qhash) != query_hashes.end()) {
// skip this query since it has already been processed before
@ -1659,7 +1660,7 @@ void Index::run_search(search_args* search_params) {
search_params->split_join_tokens);
}
void Index::collate_included_ids(const std::vector<std::string>& q_included_tokens,
void Index::collate_included_ids(const std::vector<token_t>& q_included_tokens,
const std::string & field, const uint8_t field_id,
const std::map<size_t, std::map<size_t, uint32_t>> & included_ids_map,
Topster* curated_topster,
@ -1673,15 +1674,15 @@ void Index::collate_included_ids(const std::vector<std::string>& q_included_toke
std::vector<art_leaf *> override_query;
for(const std::string& token: q_included_tokens) {
if(token == "*") {
for(const token_t& token: q_included_tokens) {
if(token.value == "*") {
continue;
}
const size_t token_len = token.size() + 1;
const size_t token_len = token.value.size() + 1;
std::vector<art_leaf*> leaves;
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.c_str(), token_len,
art_fuzzy_search(search_index.at(field), (const unsigned char *) token.value.c_str(), token_len,
0, 0, 1, token_ordering::MAX_SCORE, false, nullptr, 0, leaves);
if(!leaves.empty()) {
@ -1931,8 +1932,6 @@ bool Index::check_for_overrides(const token_ordering& token_order, const string&
window_tokens_set.emplace(tokens[i]);
}
std::vector<token_t> search_tokens = window_tokens;
std::vector<facet> facets;
std::vector<std::vector<art_leaf*>> searched_queries;
Topster* topster = nullptr;
@ -1950,7 +1949,7 @@ bool Index::check_for_overrides(const token_ordering& token_order, const string&
continue;
}
search_field(0, window_tokens, search_tokens, nullptr, 0, num_toks_dropped, field_it->second, field_name,
search_field(0, window_tokens, nullptr, 0, num_toks_dropped, field_it->second, field_name,
nullptr, 0, {}, {}, -1, 2, searched_queries, topster, groups_processed,
&result_ids, result_ids_len, field_num_results, 0, group_by_fields,
false, 4, query_hashes, token_order, false, 0, 1, false, -1, 3, 7, 4);
@ -2142,7 +2141,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
std::vector<Topster*> ftopsters;
auto is_wildcard_query = !field_query_tokens.empty() && !field_query_tokens[0].q_include_tokens.empty() &&
field_query_tokens[0].q_include_tokens[0] == "*";
field_query_tokens[0].q_include_tokens[0].value == "*";
// for phrase query, parser will set field_query_tokens to "*", need to handle that
if (is_wildcard_query && field_query_tokens[0].q_phrases.empty()) {
@ -2162,6 +2161,10 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
// In multi-field searches, a record can be matched across different fields, so we use this for aggregation
spp::sparse_hash_map<uint64_t, std::vector<KV*>> topster_ids;
//begin = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < num_search_fields; i++) {
Topster* ftopster = new Topster(topster->MAX_SIZE, topster->distinct);
ftopsters.push_back(ftopster);
}
// We do progressive typo relaxation here so that results with minimal typos are fetched first,
// and further results are fetched only if `typo_tokens_threshold` is not satisfied.
@ -2180,6 +2183,65 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
}
}
// now try to drop tokens
size_t num_tokens_dropped = 0;
std::vector<token_t> orig_tokens = field_query_tokens[0].q_include_tokens;
std::vector<query_tokens_t> truncated_field_query_tokens = field_query_tokens;
while(exhaustive_search || all_result_ids_len < drop_tokens_threshold) {
// When at least two tokens from the query are available, we can drop one
std::vector<token_t> truncated_tokens;
if(orig_tokens.size() > 1 && num_tokens_dropped < 2*(orig_tokens.size()-1)) {
bool prefix_search = false;
if(num_tokens_dropped < orig_tokens.size()-1) {
// drop from right
size_t truncated_len = orig_tokens.size() - num_tokens_dropped - 1;
for(size_t i=0; i < truncated_len; i++) {
truncated_tokens.emplace_back(orig_tokens[i]);
}
} else {
// drop from left
prefix_search = true;
size_t start_index = (num_tokens_dropped + 1) - orig_tokens.size() + 1;
for(size_t i = start_index; i < orig_tokens.size(); i++) {
truncated_tokens.emplace_back(orig_tokens[i]);
}
}
num_tokens_dropped++;
for(size_t i = 0; i < num_search_fields; i++) {
truncated_field_query_tokens[i].q_include_tokens = truncated_tokens;
}
std::vector<bool> drop_token_prefixes;
for(const auto p: prefixes) {
drop_token_prefixes.push_back(p && prefix_search);
}
for(int min_typo = 0; min_typo <= 2; min_typo++) {
search_fields(filters, included_ids_map, sort_fields_std, min_typo, num_typos, topster,
curated_topster, token_order, drop_token_prefixes, drop_tokens_threshold, groups_processed,
searched_queries, typo_tokens_threshold, group_limit, group_by_fields,
prioritize_exact_match, exhaustive_search, concurrency, min_len_1typo, min_len_2typo,
max_candidates, infixes, max_extra_prefix, max_extra_suffix,
filter_ids, filter_ids_length, curated_ids, curated_ids_sorted, num_search_fields,
exclude_token_ids, exclude_token_ids_size, ftopsters, is_wildcard_query, false,
truncated_field_query_tokens, the_fields, all_result_ids_len, all_result_ids, topster_ids);
if (!exhaustive_search && all_result_ids_len >= typo_tokens_threshold) {
break;
}
}
} else {
break;
}
}
//auto begin0 = std::chrono::high_resolution_clock::now();
/*size_t total_q_tokens = field_query_tokens[0].q_include_tokens.size();
for(const auto& phrase: field_query_tokens[0].q_phrases) {
@ -2392,7 +2454,7 @@ void Index::aggregate_and_score_fields(const std::vector<query_tokens_t>& field_
// FIXME: must consider phrase tokens also
for(size_t token_index=0; token_index < field_query_tokens[i].q_include_tokens.size(); token_index++) {
const auto& token = field_query_tokens[i].q_include_tokens[token_index];
const auto& token = field_query_tokens[i].q_include_tokens[token_index].value;
const art_leaf* leaf = (art_leaf *) art_search(search_index.at(field), (const unsigned char*) token.c_str(),
token.length()+1);
@ -2494,11 +2556,7 @@ void Index::search_fields(const std::vector<filter>& filters,
continue;
}
std::vector<token_t> q_include_pos_tokens;
for(size_t j=0; j < field_query_tokens[i].q_include_tokens.size(); j++) {
bool is_prefix = (j == field_query_tokens[i].q_include_tokens.size()-1);
q_include_pos_tokens.emplace_back(j, field_query_tokens[i].q_include_tokens[j], is_prefix);
}
std::vector<token_t> q_include_pos_tokens = field_query_tokens[i].q_include_tokens;
// these are already validated upstream, but still playing safe
bool field_prefix = (i < prefixes.size()) ? prefixes[i] : prefixes[0];
@ -2514,15 +2572,12 @@ void Index::search_fields(const std::vector<filter>& filters,
}
std::vector<token_t> query_tokens = q_include_pos_tokens;
std::vector<token_t> search_tokens = q_include_pos_tokens;
size_t num_tokens_dropped = 0;
//LOG(INFO) << "searching field_name! " << field_name;
Topster* ftopster = new Topster(topster->MAX_SIZE, topster->distinct);
ftopsters.push_back(ftopster);
// Don't waste additional cycles for single field_name searches
Topster* actual_topster = (num_search_fields == 1) ? topster : ftopster;
Topster* actual_topster = (num_search_fields == 1) ? topster : ftopsters[i];
// tracks the number of results found for the current field_name
size_t field_num_results = 0;
@ -2531,7 +2586,7 @@ void Index::search_fields(const std::vector<filter>& filters,
int last_typo = int(min_typo) - 1;
if(!is_wildcard_query) {
search_field(field_id, query_tokens, search_tokens, exclude_token_ids, exclude_token_ids_size,
search_field(field_id, query_tokens, exclude_token_ids, exclude_token_ids_size,
num_tokens_dropped, field_it->second, field_name,
actual_filter_ids, actual_filter_ids_length, curated_ids_sorted, sort_fields_std,
last_typo, min_typo, searched_queries, actual_topster, groups_processed,
@ -2548,7 +2603,13 @@ void Index::search_fields(const std::vector<filter>& filters,
if(field_num_results == 0 && split_join_tokens) {
std::vector<std::vector<std::string>> space_resolved_queries;
resolve_space_as_typos(field_query_tokens[i].q_include_tokens, field_name,
std::vector<std::string> q_include_tokens;
for(auto& q_include_token: field_query_tokens[i].q_include_tokens) {
q_include_tokens.push_back(q_include_token.value);
}
resolve_space_as_typos(q_include_tokens, field_name,
space_resolved_queries);
// only one query is resolved for now, so just use that
@ -2562,9 +2623,8 @@ void Index::search_fields(const std::vector<filter>& filters,
}
query_tokens = q_include_pos_tokens;
search_tokens = q_include_pos_tokens;
search_field(field_id, query_tokens, search_tokens, exclude_token_ids, exclude_token_ids_size,
search_field(field_id, query_tokens, exclude_token_ids, exclude_token_ids_size,
num_tokens_dropped, field_it->second, field_name,
actual_filter_ids, actual_filter_ids_length, curated_ids_sorted, sort_fields_std,
last_typo, min_typo, searched_queries, actual_topster, groups_processed,
@ -2590,6 +2650,14 @@ void Index::search_fields(const std::vector<filter>& filters,
}
// do synonym based searches
// get synonyms
std::vector<std::string> q_include_tokens;
for(size_t j = 0; j < field_query_tokens[i].q_include_tokens.size(); j++) {
q_include_tokens.push_back(field_query_tokens[i].q_include_tokens[j].value);
}
synonym_index->synonym_reduction(q_include_tokens, field_query_tokens[i].q_synonyms);
// since typos are disabled, we will use drop_tokens_threshold for typo_tokens_threshold as well
// otherwise, we can't support dropping of tokens here.
do_synonym_search(filters, included_ids_map, sort_fields_std, curated_topster, token_order,
@ -2599,13 +2667,13 @@ void Index::search_fields(const std::vector<filter>& filters,
max_candidates, curated_ids, curated_ids_sorted, exclude_token_ids,
exclude_token_ids_size, i, actual_filter_ids_length, 0, field_prefix,
field_id, field_name, field_it,
query_tokens, search_tokens, num_tokens_dropped, actual_topster, field_num_results,
query_tokens, num_tokens_dropped, actual_topster, field_num_results,
field_query_tokens, all_result_ids_len, groups_processed, searched_queries,
all_result_ids,
actual_filter_ids, query_hashes);
// concat is done only for multi-field searches as `ftopster` will be empty for single-field search
concat_topster_ids(ftopster, topster_ids);
concat_topster_ids(ftopsters[i], topster_ids);
collate_included_ids(field_query_tokens[i].q_include_tokens, field_name, field_id, included_ids_map, curated_topster, searched_queries);
//LOG(INFO) << "topster_ids.size: " << topster_ids.size();
}
@ -2691,7 +2759,7 @@ void Index::do_synonym_search(const std::vector<filter>& filters,
size_t exclude_token_ids_size, size_t i, uint32_t actual_filter_ids_length,
int field_num_typos, bool field_prefix, const uint8_t field_id, const string& field_name,
const std::unordered_map<string, field>::const_iterator& field_it,
std::vector<token_t>& query_tokens, std::vector<token_t>& search_tokens,
std::vector<token_t>& query_tokens,
size_t num_tokens_dropped, Topster* actual_topster, size_t field_num_results,
std::vector<query_tokens_t>& field_query_tokens, size_t& all_result_ids_len,
spp::sparse_hash_set<uint64_t>& groups_processed,
@ -2713,7 +2781,7 @@ void Index::do_synonym_search(const std::vector<filter>& filters,
for(const auto& syn_tokens: q_pos_synonyms) {
num_tokens_dropped = 0;
field_num_results = 0;
query_tokens = search_tokens = syn_tokens;
query_tokens = syn_tokens;
query_hashes.clear();
if(query_tokens.size() == 1 && query_tokens[0].value == "*") {
@ -2734,7 +2802,7 @@ void Index::do_synonym_search(const std::vector<filter>& filters,
all_result_ids, all_result_ids_len,
actual_filter_ids, actual_filter_ids_length, concurrency);
} else {
search_field(field_id, query_tokens, search_tokens, exclude_token_ids, exclude_token_ids_size, num_tokens_dropped,
search_field(field_id, query_tokens, exclude_token_ids, exclude_token_ids_size, num_tokens_dropped,
field_it->second, field_name, actual_filter_ids, actual_filter_ids_length, curated_ids_sorted, sort_fields_std,
-1, field_num_typos, searched_queries, actual_topster, groups_processed,
&all_result_ids, all_result_ids_len,
@ -2913,11 +2981,10 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
Tokenizer(facet_query.query, true, !facet_field.is_string(),
facet_field.locale, symbols_to_index, token_separators).tokenize(query_tokens);
std::vector<token_t> search_tokens, qtokens;
std::vector<token_t> qtokens;
for (size_t qtoken_index = 0; qtoken_index < query_tokens.size(); qtoken_index++) {
bool is_prefix = (qtoken_index == query_tokens.size()-1);
search_tokens.emplace_back(qtoken_index, query_tokens[qtoken_index], is_prefix);
qtokens.emplace_back(qtoken_index, query_tokens[qtoken_index], is_prefix);
}
@ -2930,7 +2997,7 @@ void Index::compute_facet_infos(const std::vector<facet>& facets, facet_query_t&
std::set<uint64> query_hashes;
size_t num_toks_dropped = 0;
search_field(0, qtokens, search_tokens, nullptr, 0, num_toks_dropped,
search_field(0, qtokens, nullptr, 0, num_toks_dropped,
facet_field, facet_field.faceted_name(),
all_result_ids, all_result_ids_len, {}, {}, -1, facet_query_num_typos, searched_queries, topster,
groups_processed, &field_result_ids, field_result_ids_len, field_num_results, 0, group_by_fields,
@ -3187,8 +3254,7 @@ void Index::populate_sort_mapping(int* sort_order, std::vector<size_t>& geopoint
5. Sort the docs based on some ranking criteria
*/
void Index::search_field(const uint8_t & field_id,
std::vector<token_t>& query_tokens,
std::vector<token_t>& search_tokens,
const std::vector<token_t>& query_tokens,
const uint32_t* exclude_token_ids,
size_t exclude_token_ids_size,
size_t& num_tokens_dropped,
@ -3228,8 +3294,8 @@ void Index::search_field(const uint8_t & field_id,
std::vector<std::vector<int>> token_to_costs;
for(size_t stoken_index=0; stoken_index < search_tokens.size(); stoken_index++) {
const std::string& token = search_tokens[stoken_index].value;
for(size_t stoken_index=0; stoken_index < query_tokens.size(); stoken_index++) {
const std::string& token = query_tokens[stoken_index].value;
std::vector<int> all_costs;
// This ensures that we don't end up doing a cost of 1 for a single char etc.
@ -3279,13 +3345,13 @@ void Index::search_field(const uint8_t & field_id,
token_candidates_vec.clear();
size_t token_index = 0;
while(token_index < search_tokens.size()) {
while(token_index < query_tokens.size()) {
// For each token, look up the generated cost for this iteration and search using that cost
const std::string& token = search_tokens[token_index].value;
const std::string& token = query_tokens[token_index].value;
const std::string token_cost_hash = token + std::to_string(costs[token_index]);
std::vector<art_leaf*> leaves;
const bool prefix_search = prefix && search_tokens[token_index].prefix;
const bool prefix_search = prefix && query_tokens[token_index].prefix;
/*LOG(INFO) << "Searching for field: " << the_field.name << ", token:"
<< token << " - cost: " << costs[token_index] << ", prefix_search: " << prefix_search;*/
@ -3318,42 +3384,16 @@ void Index::search_field(const uint8_t & field_id,
if(!leaves.empty()) {
//log_leaves(costs[token_index], token, leaves);
token_candidates_vec.push_back(
token_candidates{search_tokens[token_index], costs[token_index], prefix_search, leaves});
} else {
// No result at `cost = costs[token_index]`. Remove `cost` for token and re-do combinations
auto it = std::find(token_to_costs[token_index].begin(), token_to_costs[token_index].end(), costs[token_index]);
if(it != token_to_costs[token_index].end()) {
token_to_costs[token_index].erase(it);
// when no more costs are left for this token
if(token_to_costs[token_index].empty()) {
// we can try to drop the token and search with remaining tokens
if(!exhaustive_search && field_num_results >= drop_tokens_threshold) {
// but if drop_tokens_threshold is breached, we are done
return ;
}
token_to_costs.erase(token_to_costs.begin()+token_index);
search_tokens.erase(search_tokens.begin()+token_index);
query_tokens.erase(query_tokens.begin()+token_index);
costs.erase(costs.begin()+token_index);
}
}
// Continue outerloop on new cost combination
n = -1;
N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
goto resume_typo_loop;
token_candidates{query_tokens[token_index], costs[token_index], prefix_search, leaves});
}
token_index++;
}
if(!token_candidates_vec.empty()) {
if(token_candidates_vec.size() == query_tokens.size()) {
std::vector<uint32_t> id_buff;
// If atleast one token is found, go ahead and search for candidates
// If all tokens are found, go ahead and search for candidates
search_candidates(field_id, the_field.is_array(), filter_ids, filter_ids_length,
exclude_token_ids, exclude_token_ids_size,
curated_ids, sort_fields, token_candidates_vec, searched_queries, topster,
@ -3374,8 +3414,6 @@ void Index::search_field(const uint8_t & field_id,
*all_result_ids = new_all_result_ids;
}
resume_typo_loop:
if(!exhaustive_search && field_num_results >= typo_tokens_threshold) {
// if typo threshold is breached, we are done
return ;
@ -3383,41 +3421,6 @@ void Index::search_field(const uint8_t & field_id,
n++;
}
// When atleast two tokens from the query are available so we can drop one
if(query_tokens.size() > 1 && num_tokens_dropped < query_tokens.size()) {
// Drop tokens from right until (len/2 + 1), and then from left until (len/2 + 1)
if(!exhaustive_search && field_num_results >= drop_tokens_threshold) {
// if drop_tokens_threshold is breached, we are done
return ;
}
std::vector<token_t> truncated_tokens;
num_tokens_dropped++;
if(num_tokens_dropped < query_tokens.size()) {
// drop from right
size_t end_index = query_tokens.size() - num_tokens_dropped - 1;
for(size_t i=0; i <= end_index; i++) {
truncated_tokens.emplace_back(query_tokens[i].position, query_tokens[i].value, query_tokens[i].prefix);
}
} else {
// drop from left
size_t start_index = (num_tokens_dropped - query_tokens.size() + 1);
for(size_t i=start_index; i<query_tokens.size(); i++) {
truncated_tokens.emplace_back(query_tokens[i].position, query_tokens[i].value, query_tokens[i].prefix);
}
}
return search_field(field_id, query_tokens, truncated_tokens, exclude_token_ids, exclude_token_ids_size,
num_tokens_dropped, the_field, field_name, filter_ids, filter_ids_length, curated_ids,
sort_fields, last_typo, max_typos, searched_queries, topster, groups_processed, all_result_ids,
all_result_ids_len, field_num_results, group_limit, group_by_fields,
prioritize_exact_match, concurrency, query_hashes,
token_order, prefix, drop_tokens_threshold, typo_tokens_threshold,
exhaustive_search, syn_orig_num_tokens, min_len_1typo, min_len_2typo, max_candidates);
}
}
int Index::get_bounded_typo_cost(const size_t max_cost, const size_t token_len,
@ -3683,7 +3686,7 @@ uint64_t Index::get_distinct_id(const std::vector<std::string>& group_by_fields,
const auto& facet_hashes = facet_hashes_it->second;
for(size_t i = 0; i < facet_hashes.size(); i++) {
distinct_id = hash_combine(distinct_id, facet_hashes.hashes[i]);
distinct_id = StringUtils::hash_combine(distinct_id, facet_hashes.hashes[i]);
}
}
@ -3694,6 +3697,7 @@ inline uint32_t Index::next_suggestion(const std::vector<token_candidates> &toke
long long int n,
std::vector<art_leaf *>& actual_query_suggestion,
std::vector<art_leaf *>& query_suggestion,
const int syn_orig_num_tokens,
uint32_t& token_bits,
uint64& qhash) {
uint32_t total_cost = 0;
@ -3717,13 +3721,20 @@ inline uint32_t Index::next_suggestion(const std::vector<token_candidates> &toke
token_bits |= 1UL << token_candidates_vec[i].token.position; // sets n-th bit
uintptr_t addr_val = (uintptr_t) query_suggestion[i];
qhash = Index::hash_combine(qhash, addr_val);
qhash = StringUtils::hash_combine(qhash, addr_val);
/*LOG(INFO) << "suggestion key: " << actual_query_suggestion[i]->key << ", token: "
<< token_candidates_vec[i].token.value << ", actual_cost: " << actual_cost;
LOG(INFO) << ".";*/
}
if(syn_orig_num_tokens != -1) {
token_bits = 0;
for(size_t i = 0; i < size_t(syn_orig_num_tokens); i++) {
token_bits |= 1UL << i;
}
}
return total_cost;
}
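
To make the new global drop-token order concrete, here is a small standalone sketch (not taken from the codebase) that reproduces just the truncation sequence of the loop added to Index::search above: tokens are dropped from the right first, then from the left. The sample query, the plain-string tokens and the main() wrapper are illustrative only; the real loop also stops once drop_tokens_threshold is satisfied.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> orig_tokens = {"popular", "nike", "shoes"};
    std::size_t num_tokens_dropped = 0;

    while(orig_tokens.size() > 1 && num_tokens_dropped < 2 * (orig_tokens.size() - 1)) {
        std::vector<std::string> truncated;
        if(num_tokens_dropped < orig_tokens.size() - 1) {
            // drop from the right
            std::size_t truncated_len = orig_tokens.size() - num_tokens_dropped - 1;
            truncated.assign(orig_tokens.begin(), orig_tokens.begin() + truncated_len);
        } else {
            // drop from the left (these sub-queries run with prefix search enabled
            // in the actual loop)
            std::size_t start_index = (num_tokens_dropped + 1) - orig_tokens.size() + 1;
            truncated.assign(orig_tokens.begin() + start_index, orig_tokens.end());
        }
        num_tokens_dropped++;
        for(const auto& t : truncated) std::cout << t << " ";
        std::cout << "\n";
    }
    // Prints:
    //   popular nike
    //   popular
    //   nike shoes
    //   shoes
}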

src/synonym_index.cpp (new file, 186 lines)

@ -0,0 +1,186 @@
#include "synonym_index.h"
void SynonymIndex::synonym_reduction_internal(const std::vector<std::string>& tokens,
size_t start_window_size, size_t start_index_pos,
std::set<uint64_t>& processed_syn_hashes,
std::vector<std::vector<std::string>>& results) const {
bool recursed = false;
for(size_t window_len = start_window_size; window_len > 0; window_len--) {
for(size_t start_index = start_index_pos; start_index+window_len-1 < tokens.size(); start_index++) {
std::vector<uint64_t> syn_hashes;
uint64_t syn_hash = 1;
for(size_t i = start_index; i < start_index+window_len; i++) {
uint64_t token_hash = StringUtils::hash_wy(tokens[i].c_str(), tokens[i].size());
if(i == start_index) {
syn_hash = token_hash;
} else {
syn_hash = StringUtils::hash_combine(syn_hash, token_hash);
}
syn_hashes.push_back(token_hash);
}
const auto& syn_itr = synonym_index.find(syn_hash);
if(syn_itr != synonym_index.end() && processed_syn_hashes.count(syn_hash) == 0) {
// tokens in this window match a synonym: reconstruct tokens and rerun synonym mapping against matches
const auto& syn_ids = syn_itr->second;
for(const auto& syn_id: syn_ids) {
const auto &syn_def = synonym_definitions.at(syn_id);
for (const auto &syn_def_tokens: syn_def.synonyms) {
std::vector<std::string> new_tokens;
for (size_t i = 0; i < start_index; i++) {
new_tokens.push_back(tokens[i]);
}
std::vector<uint64_t> syn_def_hashes;
uint64_t syn_def_hash = 1;
for (size_t i = 0; i < syn_def_tokens.size(); i++) {
const auto &syn_def_token = syn_def_tokens[i];
new_tokens.push_back(syn_def_token);
uint64_t token_hash = StringUtils::hash_wy(syn_def_token.c_str(),
syn_def_token.size());
if (i == 0) {
syn_def_hash = token_hash;
} else {
syn_def_hash = StringUtils::hash_combine(syn_def_hash, token_hash);
}
syn_def_hashes.push_back(token_hash);
}
if (syn_def_hash == syn_hash) {
// skip over token matching itself in the group
continue;
}
for (size_t i = start_index + window_len; i < tokens.size(); i++) {
new_tokens.push_back(tokens[i]);
}
processed_syn_hashes.emplace(syn_def_hash);
processed_syn_hashes.emplace(syn_hash);
for (uint64_t h: syn_def_hashes) {
processed_syn_hashes.emplace(h);
}
for (uint64_t h: syn_hashes) {
processed_syn_hashes.emplace(h);
}
recursed = true;
synonym_reduction_internal(new_tokens, window_len, start_index, processed_syn_hashes, results);
}
}
}
}
// reset it because for the next window we have to start from scratch
start_index_pos = 0;
}
if(!recursed && !processed_syn_hashes.empty()) {
results.emplace_back(tokens);
}
}
void SynonymIndex::synonym_reduction(const std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& results) const {
if(synonym_definitions.empty()) {
return;
}
std::set<uint64_t> processed_syn_hashes;
synonym_reduction_internal(tokens, tokens.size(), 0, processed_syn_hashes, results);
}
Option<bool> SynonymIndex::add_synonym(const std::string & collection_name, const synonym_t& synonym) {
if(synonym_definitions.count(synonym.id) != 0) {
// first we have to delete existing entries so we can upsert
Option<bool> rem_op = remove_synonym(collection_name, synonym.id);
if(!rem_op.ok()) {
return rem_op;
}
}
std::unique_lock write_lock(mutex);
synonym_definitions[synonym.id] = synonym;
if(!synonym.root.empty()) {
uint64_t root_hash = synonym_t::get_hash(synonym.root);
synonym_index[root_hash].emplace_back(synonym.id);
} else {
for(const auto & syn_tokens : synonym.synonyms) {
uint64_t syn_hash = synonym_t::get_hash(syn_tokens);
synonym_index[syn_hash].emplace_back(synonym.id);
}
}
write_lock.unlock();
bool inserted = store->insert(get_synonym_key(collection_name, synonym.id), synonym.to_json().dump());
if(!inserted) {
return Option<bool>(500, "Error while storing the synonym on disk.");
}
return Option<bool>(true);
}
bool SynonymIndex::get_synonym(const std::string& id, synonym_t& synonym) {
std::shared_lock lock(mutex);
if(synonym_definitions.count(id) != 0) {
synonym = synonym_definitions[id];
return true;
}
return false;
}
Option<bool> SynonymIndex::remove_synonym(const std::string & collection_name, const std::string &id) {
std::unique_lock lock(mutex);
const auto& syn_iter = synonym_definitions.find(id);
if(syn_iter != synonym_definitions.end()) {
bool removed = store->remove(get_synonym_key(collection_name, id));
if(!removed) {
return Option<bool>(500, "Error while deleting the synonym from disk.");
}
const auto& synonym = syn_iter->second;
if(!synonym.root.empty()) {
uint64_t root_hash = synonym_t::get_hash(synonym.root);
synonym_index.erase(root_hash);
} else {
for(const auto & syn_tokens : synonym.synonyms) {
uint64_t syn_hash = synonym_t::get_hash(syn_tokens);
synonym_index.erase(syn_hash);
}
}
synonym_definitions.erase(id);
return Option<bool>(true);
}
return Option<bool>(404, "Could not find that `id`.");
}
spp::sparse_hash_map<std::string, synonym_t> SynonymIndex::get_synonyms() {
std::shared_lock lock(mutex);
return synonym_definitions;
}
std::string SynonymIndex::get_synonym_key(const std::string & collection_name, const std::string & synonym_id) {
return std::string(COLLECTION_SYNONYM_PREFIX) + "_" + collection_name + "_" + synonym_id;
}
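
A brief illustrative example (not part of the commit) of what the relocated synonym_reduction yields for a one-way (root-based) synonym; the wrapper name reduce_example, the collection name "products" and the data are made up, and the Store pointer is assumed to be valid.

// Illustrative only: window-based replacement performed by synonym_reduction.
#include "synonym_index.h"

void reduce_example(Store* store) {        // `store`: an already-initialized Store*
    SynonymIndex syn_index(store);

    // root "smart phone" maps one way to "smartphone" and "mobile"
    synonym_t one_way("phone-syn", {"smart", "phone"}, {{"smartphone"}, {"mobile"}});
    syn_index.add_synonym("products", one_way);

    std::vector<std::vector<std::string>> results;
    syn_index.synonym_reduction({"buy", "smart", "phone"}, results);

    // The two-token window {"smart", "phone"} matches the stored root hash, so
    // `results` holds {"buy", "smartphone"} and {"buy", "mobile"}. The original
    // token sequence is not appended because a substitution was made.
}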


@ -2254,7 +2254,6 @@ TEST_F(CollectionFilteringTest, ExcludeMultipleTokens) {
{"title"}, "",
{}, {}, {0}, 10, 1, FREQUENCY).get();
LOG(INFO) << results;
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());


@ -300,17 +300,17 @@ TEST_F(CollectionGroupingTest, GroupingWithMultiFieldRelevance) {
ASSERT_STREQ("pop", results["grouped_hits"][0]["group_key"][0].get<std::string>().c_str());
ASSERT_EQ(2, results["grouped_hits"][0]["hits"].size());
ASSERT_STREQ("1", results["grouped_hits"][0]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("4", results["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("2", results["grouped_hits"][0]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("rock", results["grouped_hits"][1]["group_key"][0].get<std::string>().c_str());
ASSERT_EQ(2, results["grouped_hits"][1]["hits"].size());
ASSERT_STREQ("5", results["grouped_hits"][1]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("6", results["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["grouped_hits"][1]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("country", results["grouped_hits"][2]["group_key"][0].get<std::string>().c_str());
ASSERT_EQ(2, results["grouped_hits"][2]["hits"].size());
ASSERT_STREQ("8", results["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("3", results["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("3", results["grouped_hits"][2]["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("8", results["grouped_hits"][2]["hits"][1]["document"]["id"].get<std::string>().c_str());
collectionManager.drop_collection("coll1");
}


@ -170,7 +170,7 @@ TEST_F(CollectionLocaleTest, SearchAgainstThaiText) {
ASSERT_EQ("<mark>ลง</mark>ที่นั่นโดย<mark>รถไฟ</mark>", results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
results = coll1->search("ลงรถไฟ downie",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());


@ -1040,12 +1040,12 @@ TEST_F(CollectionOverrideTest, DynamicFilteringExactMatchBasics) {
// should not apply filter for non-exact case
results = coll1->search("running shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(3, results["hits"].size());
results = coll1->search("adidas shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
@ -1200,7 +1200,7 @@ TEST_F(CollectionOverrideTest, DynamicFilteringMultiplePlaceholders) {
// not an exact match of rule (because of "light") so all results will be fetched, not just Air Jordan brand
auto results = coll1->search("Nike Air Jordan light yellow shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(3, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
@ -1209,7 +1209,7 @@ TEST_F(CollectionOverrideTest, DynamicFilteringMultiplePlaceholders) {
// query with tokens at the start that preceding the placeholders in the rule
results = coll1->search("New Nike Air Jordan yellow shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
@ -1381,16 +1381,16 @@ TEST_F(CollectionOverrideTest, DynamicFilteringWithNumericalFilter) {
// should not match the defined override
results = coll1->search("running adidas shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(4, results["hits"].size());
ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("2", results["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][2]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("2", results["hits"][2]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][3]["document"]["id"].get<std::string>());
results = coll1->search("adidas", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
@ -1475,14 +1475,16 @@ TEST_F(CollectionOverrideTest, DynamicFilteringExactMatch) {
ASSERT_EQ(4, results["hits"].size());
results = coll1->search("popular nike running shoes", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(4, results["hits"].size());
results = coll1->search("popular nike shoes running", {"name", "category", "brand"}, "",
{}, sort_fields, {2, 2, 2}, 10).get();
{}, sort_fields, {2, 2, 2}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(4, results["hits"].size());
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("2", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("3", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}


@ -293,6 +293,7 @@ TEST_F(CollectionSpecificTest, OrderMultiFieldFuzzyMatch) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 1}).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
@ -304,12 +305,50 @@ TEST_F(CollectionSpecificTest, OrderMultiFieldFuzzyMatch) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {2, 1}).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, TypoBeforeDropTokens) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
nlohmann::json doc1;
doc1["id"] = "0";
doc1["title"] = "Josh Wexler";
doc1["points"] = 500;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["title"] = "Josh Lipson";
doc2["points"] = 100;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
auto results = coll1->search("Josh Lixson", {"title"}, "", {}, {}, {2}, 10,
1, FREQUENCY, {true}).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
results = coll1->search("Josh Lixson", {"title"}, "", {}, {}, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 10).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
TEST_F(CollectionSpecificTest, FieldWeighting) {
std::vector<field> fields = {field("title", field_types::STRING, false),
field("description", field_types::STRING, false),
@ -338,6 +377,7 @@ TEST_F(CollectionSpecificTest, FieldWeighting) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
"<mark>", "</mark>", {1, 4}).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
@ -372,10 +412,8 @@ TEST_F(CollectionSpecificTest, MultiFieldArrayRepeatingTokens) {
auto results = coll1->search("rv345 cisco 18", {"title", "description", "attrs"}, "", {}, {}, {1}, 10,
1, FREQUENCY, {true, true, true}).get();
LOG(INFO) << results;
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
@ -629,7 +667,7 @@ TEST_F(CollectionSpecificTest, DeleteOverridesAndSynonymsOnDiskDuringCollDrop) {
ASSERT_TRUE(stored_values.empty());
// synonyms should also have been deleted from the store
store->scan_fill(Collection::COLLECTION_SYNONYM_PREFIX, stored_values);
store->scan_fill(SynonymIndex::COLLECTION_SYNONYM_PREFIX, stored_values);
ASSERT_TRUE(stored_values.empty());
}
@ -877,16 +915,39 @@ TEST_F(CollectionSpecificTest, HighlightWithDropTokensAndPrefixSearch) {
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 1, {}, {}, {}, 0,
"<mark>", "</mark>").get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ(2, results["hits"][0]["highlights"].size());
ASSERT_EQ("<mark>Pandaabear</mark> <mark>Bas</mark>ic",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("<mark>Pandaabear</mark>",
results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
results = coll1->search("pandaabear bas", {"username", "name"},
"", {}, {}, {2, 2}, 10,
1, FREQUENCY, {true, true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 1, {}, {}, {}, 0,
"<mark>", "</mark>").get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ(2, results["hits"][1]["highlights"].size());
ASSERT_EQ(2, results["hits"][0]["highlights"].size());
ASSERT_EQ("<mark>Pandaabear</mark> <mark>Bas</mark>ic",
results["hits"][0]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("<mark>Pandaabear</mark>",
results["hits"][0]["highlights"][1]["snippet"].get<std::string>());
ASSERT_EQ(2, results["hits"][1]["highlights"].size());
ASSERT_EQ("<mark>Pandaabear</mark>",
results["hits"][1]["highlights"][0]["snippet"].get<std::string>());
ASSERT_EQ("Panda's <mark>Basement</mark>",
ASSERT_EQ("Panda's <mark>Bas</mark>ement",
results["hits"][1]["highlights"][1]["snippet"].get<std::string>());
results = coll1->search("pandaabear bas", {"username", "tags"},
@ -2534,6 +2595,15 @@ TEST_F(CollectionSpecificTest, DropTokensTillOneToken) {
auto results = coll1->search("harry malcolm roscow", {"title"},
"", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("2", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
// with drop tokens threshold of 1
results = coll1->search("harry malcolm roscow", {"title"},
"", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 1).get();
ASSERT_EQ(1, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

View File

@ -329,8 +329,7 @@ TEST_F(CollectionSynonymsTest, OneWaySynonym) {
ASSERT_EQ(1, res["found"].get<uint32_t>());
}
TEST_F(CollectionSynonymsTest, VerbatimMatchShouldConsiderTokensMatchedAcrossAllFieldsWithSynonyms) {
TEST_F(CollectionSynonymsTest, SynonymQueryVariantWithDropTokens) {
std::vector<field> fields = {field("category", field_types::STRING_ARRAY, false),
field("location", field_types::STRING, false),
field("points", field_types::INT32, false),};
@ -377,9 +376,56 @@ TEST_F(CollectionSynonymsTest, VerbatimMatchShouldConsiderTokensMatchedAcrossAll
ASSERT_EQ(3, res["hits"].size());
// NOTE: "2" is ranked above "1" because synonym matches uses the root query's number of tokens for counting
// This means that "united states" == "us" so a single token match, same as match on "sneakers" in record 2.
ASSERT_EQ("0", res["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", res["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("2", res["hits"][2]["document"]["id"].get<std::string>());
ASSERT_EQ("2", res["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("1", res["hits"][2]["document"]["id"].get<std::string>());
collectionManager.drop_collection("coll1");
}
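// Worked example of the counting described in the NOTE above (the query is assumed to use the
// single-token side, "us"): expanding "us" into the synonym "united states" still credits a
// matching record with one token, because the root query contributed one token. That puts it
// on par with a record matching the single token "sneakers", so the remaining ranking criteria
// break the tie and "2" lands above "1".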
TEST_F(CollectionSynonymsTest, SynonymsTextMatchSameAsRootQuery) {
std::vector<field> fields = {field("name", field_types::STRING, false),
field("title", field_types::STRING, false),
field("points", field_types::INT32, false),};
Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();
nlohmann::json syn_json = {
{"id", "syn-1"},
{"root", "ceo"},
{"synonyms", {"chief executive officer"} }
};
synonym_t synonym;
auto syn_op = synonym_t::parse(syn_json, synonym);
ASSERT_TRUE(syn_op.ok());
coll1->add_synonym(synonym);
nlohmann::json doc1;
doc1["id"] = "0";
doc1["name"] = "Dan Fisher";
doc1["title"] = "Chief Executive Officer";
doc1["points"] = 10;
nlohmann::json doc2;
doc2["id"] = "1";
doc2["name"] = "Jack Sparrow";
doc2["title"] = "CEO";
doc2["points"] = 20;
ASSERT_TRUE(coll1->add(doc1.dump()).ok());
ASSERT_TRUE(coll1->add(doc2.dump()).ok());
auto res = coll1->search("ceo", {"name", "title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(2, res["hits"].size());
ASSERT_EQ("1", res["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", res["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ(res["hits"][1]["text_match"].get<size_t>(), res["hits"][0]["text_match"].get<size_t>());
collectionManager.drop_collection("coll1");
}
@ -426,8 +472,8 @@ TEST_F(CollectionSynonymsTest, MultiWaySynonym) {
ASSERT_EQ(2, res["hits"].size());
ASSERT_EQ(2, res["found"].get<uint32_t>());
ASSERT_STREQ("<mark>Samuel</mark> L. <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> L. <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][0]["highlights"][0]["snippet"].get<std::string>().c_str());
ASSERT_STREQ("<mark>Samuel</mark> <mark>L.</mark> <mark>Jackson</mark>", res["hits"][1]["highlights"][0]["snippet"].get<std::string>().c_str());
// for now we don't support synonyms on ANY prefix

View File

@ -285,8 +285,8 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringMultiTokenSearch) {
// with 2 indexed words
results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, sort_fields, {1}, 10).get();
ASSERT_EQ(1, results["hits"].size());
ids = {"2"};
ASSERT_EQ(2, results["hits"].size());
ids = {"2", "17"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
@ -295,7 +295,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringMultiTokenSearch) {
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
// exhaustive search should throw more results
// exhaustive search should give the same results
results = collection->search("from DoesNotExist insTruments", query_fields, "", facets, sort_fields, {1}, 10,
1, FREQUENCY, {true},
1, spp::sparse_hash_set<std::string>(),
@ -318,7 +318,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringMultiTokenSearch) {
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10).get();
ASSERT_EQ(7, results["hits"].size());
ASSERT_EQ(9, results["hits"].size());
results.clear();
results = collection->search("the a", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
@ -350,9 +350,9 @@ TEST_F(CollectionTest, PartialMultiTokenSearch) {
std::vector<std::string> facets;
nlohmann::json results = collection->search("rocket research", query_fields, "", facets,
sort_fields, {0}, 10, 1, FREQUENCY, {false}, 10).get();
ASSERT_EQ(4, results["hits"].size());
ASSERT_EQ(6, results["hits"].size());
std::vector<std::string> ids = {"1", "8", "16", "17"};
std::vector<std::string> ids = {"19", "1", "10", "8", "16", "17"};
for(size_t i = 0; i < results["hits"].size(); i++) {
nlohmann::json result = results["hits"].at(i);
@ -3026,7 +3026,8 @@ TEST_F(CollectionTest, MultiFieldRelevance) {
}
auto results = coll1->search("Dustin Kensrue Down There by the Train",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
{true}, 10).get();
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_EQ(3, results["hits"].size());
@ -3069,7 +3070,8 @@ TEST_F(CollectionTest, MultiFieldRelevance) {
}
results = coll1->search("Dustin Kensrue Down There by the Train",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
{true}, 10).get();
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_EQ(3, results["hits"].size());
@ -3082,7 +3084,8 @@ TEST_F(CollectionTest, MultiFieldRelevance) {
// with exclude token syntax
results = coll1->search("-downie dustin kensrue down there by the train",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
{true}, 10).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());
@ -3125,7 +3128,7 @@ TEST_F(CollectionTest, MultiFieldRelevance2) {
}
auto results = coll1->search("on a jetplane",
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY).get();
{"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {true}, 10).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());
@ -3479,11 +3482,12 @@ TEST_F(CollectionTest, ExactMatch) {
{"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY,
{true}, 10).get();
ASSERT_EQ(2, results["found"].get<size_t>());
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ(3, results["found"].get<size_t>());
ASSERT_EQ(3, results["hits"].size());
ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("2", results["hits"][1]["document"]["id"].get<std::string>().c_str());
ASSERT_STREQ("0", results["hits"][2]["document"]["id"].get<std::string>().c_str());
results = coll1->search("alpha", {"title"}, "", {}, {}, {2}, 10, 1, FREQUENCY, {true}, 10).get();

View File

@ -12,7 +12,7 @@ TEST(IndexTest, ScrubReindexDoc) {
ThreadPool pool(4);
Index index("index", 1, nullptr, &pool, search_schema, {}, {});
Index index("index", 1, nullptr, nullptr, &pool, search_schema, {}, {});
nlohmann::json old_doc;
old_doc["id"] = "1";
old_doc["title"] = "One more thing.";