Mirror of https://github.com/typesense/typesense.git
Merge branch '0.22.0-rc' into postings-refactor-integration
# Conflicts:
#	include/index.h
#	include/posting.h
#	include/posting_list.h
#	src/art.cpp
#	src/collection.cpp
#	src/index.cpp
#	src/posting.cpp
#	src/posting_list.cpp
#	test/art_test.cpp
#	test/collection_specific_test.cpp
#	test/collection_test.cpp
#	test/posting_list_test.cpp
Commit 41c16fb7a7

README.md
@@ -5,6 +5,11 @@
Typesense is a fast, typo-tolerant search engine for building delightful search experiences.
</p>

<p align="center">
An Open Source Algolia Alternative & <br>
An Easier-to-Use ElasticSearch Alternative
</p>

<p align="center">
<a href="https://circleci.com/gh/typesense/typesense"><img src="https://circleci.com/gh/typesense/typesense.svg?style=shield&circle-token=1addd775339738a3d90869ddd8201110d561feaa"></a>
<a href="https://hub.docker.com/r/typesense/typesense/tags"><img src="https://img.shields.io/docker/pulls/typesense/typesense"></a>
@@ -25,7 +30,9 @@
- Search a 32M songs dataset from MusicBrainz: [songs-search.typesense.org](https://songs-search.typesense.org/)
- Search a 28M books dataset from OpenLibrary: [books-search.typesense.org](https://books-search.typesense.org/)
- Search a 2M recipe dataset from RecipeNLG: [recipe-search.typesense.org](https://recipe-search.typesense.org/)
- Search 1M Git commit messages from the Linux Kernel: [linux-commits-search.typesense.org](https://linux-commits-search.typesense.org/)
- Spellchecker with type-ahead, with 333K English words: [spellcheck.typesense.org](https://spellcheck.typesense.org/)
- An E-Commerce Store Browsing experience: [ecommerce-store.typesense.org](https://ecommerce-store.typesense.org/)

🗣️ 🎥 If you prefer watching videos, here's one where we introduce Typesense and show a walk-through: https://youtu.be/F4mB0x_B1AE?t=144

@@ -56,11 +63,13 @@
- **Faceting & Filtering:** Drill down and refine results.
- **Grouping & Distinct:** Group similar results together to show more variety.
- **Federated Search:** Search across multiple collections (indices) in a single HTTP request.
- **Geo Search:** Search and sort by results around a geographic location - [in beta](https://github.com/typesense/typesense/issues/78#issuecomment-842308057).
- **Scoped API Keys:** Generate API keys that only allow access to certain records, for multi-tenant applications.
- **Synonyms:** Define words as equivalents of each other, so searching for a word will also return results for the synonyms defined.
- **Curation & Merchandizing:** Boost particular records to a fixed position in the search results, to feature them.
- **Raft-based Clustering:** Setup a distributed cluster that is highly available.
- **Seamless Version Upgrades:** As new versions of Typesense come out, upgrading is as simple as swapping out the binary and restarting Typesense.
- **No Runtime Dependencies:** Typesense is a single binary that you can run locally or in production with a single command.

**Don't see a feature on this list?** Search our issue tracker if someone has already requested it and upvote it, or open a new issue if not. We prioritize our roadmap based on user feedback, so we'd love to hear from you.

@@ -102,7 +111,7 @@ Here's a quick example showcasing how you can create a collection, index a docum
Let's begin by starting the Typesense server via Docker:

```
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.20.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
docker run -p 8108:8108 -v/tmp/data:/data typesense/typesense:0.21.0 --data-dir /data --api-key=Hu52dwsas2AdxdE
```

We have [API Clients](#api-clients) in a couple of languages, but let's use the Python client for this example.
@@ -204,7 +213,7 @@ We welcome community contributions to add more official client libraries and int
You can use our [InstantSearch.js adapter](https://github.com/typesense/typesense-instantsearch-adapter)
to quickly build powerful search experiences, complete with filtering, sorting, pagination and more.

Here's how: [https://typesense.org/docs/0.20.0/guide/#search-ui](https://typesense.org/docs/0.20.0/guide/#search-ui)
Here's how: [https://typesense.org/docs/0.21.0/guide/#search-ui](https://typesense.org/docs/0.21.0/guide/#search-ui)

## FAQ

SECURITY.md (new file)
@@ -0,0 +1 @@
Please report any security vulnerabilities to security@typesense.org.
@@ -7,9 +7,7 @@ Please add to this file and send us a PR:
| Name | Description |
| ----------- | ----------- |
| [Recipe Search](https://recipe-search.typesense.org/) | A site that showcases Typesense in action on a 2 Million recipe database, with the ability to filter by ingredients.|
| [Books Search](https://books-search.typesense.org/) | A site that showcases Typesense in action on a 28 Million books database from [OpenLibrary](https://openlibrary.org/), with the ability to filter by authors and subject. |
| [MusicBrainz Songs Search](https://songs-search.typesense.org/) | A site that showcases Typesense in action on a 32 Million Songs database from [MusicBrainz](https://musicbrainz.org/) |
| [2020 US Presidential Candidates' Speeches Search](https://biden-trump-speeches-search.typesense.org/) | Instant Search speeches of US Presidential Candidates side-by-side. |
| [Linux Commit History Search](https://linux-commits-search.typesense.org/) | A site that indexes 1M linux commit messages in Typesense and lets you browse, search and filter through the commits.|
| [Grafikart](https://www.grafikart.fr/) | Learning resources library |
| [New York University Databrary](https://nyu.databrary.org/) | Video documentation library |
| [ElbiseBul](https://www.elbisebul.com/) | E-commerce |
@@ -20,3 +18,8 @@ Please add to this file and send us a PR:
| [Have A Class](https://haveaclass.com/) | Find the perfect teacher online |
| [Follow up Boss](https://www.followupboss.com/) | Real-estate CRM software, using Typesense for user notes search. |
| [Jobsort](https://www.jobsort.com/) | Job search engine for developers, by developers (quicksort for tech jobs.) |
| [Books Search](https://books-search.typesense.org/) | A site that showcases Typesense in action on a 28 Million books database from [OpenLibrary](https://openlibrary.org/), with the ability to filter by authors and subject. |
| [MusicBrainz Songs Search](https://songs-search.typesense.org/) | A site that showcases Typesense in action on a 32 Million Songs database from [MusicBrainz](https://musicbrainz.org/) |
| [2020 US Presidential Candidates' Speeches Search](https://biden-trump-speeches-search.typesense.org/) | Instant Search speeches of US Presidential Candidates side-by-side. |
| [E-commerce Store Browsing Experience](https://ecommerce-store.typesense.org/) | A site that showcases how to build an e-commerce storefront browsing experience with Typesense. |
| [Read This Twice](https://www.readthistwice.com/) | Book discovery platform uses typesense to power the book/people search |
@@ -36,6 +36,7 @@ struct token_candidates {

struct search_field_t {
    std::string name;
    size_t priority;
    size_t weight;
};

@@ -601,21 +601,22 @@ Option<nlohmann::json> Collection::search(const std::string & query, const std::

    // process weights for search fields
    std::vector<search_field_t> weighted_search_fields;
    size_t max_weight = 100;

    if(query_by_weights.empty()) {
        max_weight = search_fields.size();
        for(size_t i=1; i <= search_fields.size(); i++) {
            query_by_weights.push_back(i);
            query_by_weights.push_back((max_weight - i) + 1);
        }
    } else {
        auto max_weight = *std::max_element(query_by_weights.begin(), query_by_weights.end());
        for(size_t i=0; i < query_by_weights.size(); i++) {
            query_by_weights[i] = (max_weight - query_by_weights[i]) + 1;
        }
        max_weight = *std::max_element(query_by_weights.begin(), query_by_weights.end());
    }

    for(size_t i=0; i < search_fields.size(); i++) {
        const auto& search_field = search_fields[i];
        weighted_search_fields.push_back({search_field, query_by_weights[i]});
        const auto priority = (max_weight - query_by_weights[i]) + 1;
        const auto weight = query_by_weights[i] + 1;
        weighted_search_fields.push_back({search_field, priority, weight});
    }

    std::vector<facet> facets;
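For context, here is a minimal standalone sketch of the weighting step in the hunk above, following its newer lines (the ones that derive a `priority` and a `weight` per `query_by` field and store them in `search_field_t`). The field names and the `{2, 1}` weights are illustrative, and this is not the actual `Collection::search` code:

```
// Sketch (not the library API): how the hunk above derives a per-field
// `priority` and `weight` from query_by_weights. Values are illustrative.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct search_field_t {
    std::string name;
    size_t priority;  // rank-style value derived from the weight
    size_t weight;    // per-field multiplier derived from the weight
};

int main() {
    std::vector<std::string> search_fields = {"title", "description"};
    std::vector<size_t> query_by_weights = {2, 1};

    size_t max_weight = 100;
    if(query_by_weights.empty()) {
        // no weights supplied: derive descending values from the field order
        max_weight = search_fields.size();
        for(size_t i = 1; i <= search_fields.size(); i++) {
            query_by_weights.push_back((max_weight - i) + 1);
        }
    } else {
        // invert the user-supplied weights relative to their maximum
        // (renamed here to avoid shadowing max_weight)
        auto user_max = *std::max_element(query_by_weights.begin(), query_by_weights.end());
        for(auto& w : query_by_weights) {
            w = (user_max - w) + 1;
        }
        max_weight = *std::max_element(query_by_weights.begin(), query_by_weights.end());
    }

    std::vector<search_field_t> weighted_search_fields;
    for(size_t i = 0; i < search_fields.size(); i++) {
        const size_t priority = (max_weight - query_by_weights[i]) + 1;
        const size_t weight = query_by_weights[i] + 1;
        weighted_search_fields.push_back({search_fields[i], priority, weight});
    }

    for(const auto& f : weighted_search_fields) {
        std::cout << f.name << " priority=" << f.priority << " weight=" << f.weight << "\n";
    }
}
```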
@@ -1699,7 +1700,9 @@ void Collection::highlight_result(const field &search_field,
    if(offset_it != token_offsets.end()) {
        if (i == offset_it->first) {
            value_stream << highlight_start_tag;
        } else if (i == offset_it->second) {
        }

        if (i == offset_it->second) {
            value_stream << text[i];
            value_stream << highlight_end_tag;
            offset_it++;
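A minimal sketch of the highlighting behaviour this hunk moves toward: the character at a token's end offset is written before the closing tag, so a single-character token (whose start and end offsets coincide) still gets wrapped. The text, offsets and tags below are illustrative, not the real `highlight_result` inputs:

```
// Sketch: emitting highlight tags around token offsets, writing the character
// at the end offset before the closing tag so single-character tokens
// (start == end) are still wrapped. Offsets and data are made up.
#include <cstddef>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main() {
    const std::string text = "is a probable sign";
    // token offsets: start index -> end index (inclusive); {3, 3} is the token "a"
    std::map<size_t, size_t> token_offsets = {{3, 3}, {5, 12}};
    const std::string highlight_start_tag = "<mark>";
    const std::string highlight_end_tag = "</mark>";

    std::stringstream value_stream;
    auto offset_it = token_offsets.begin();

    for(size_t i = 0; i < text.size(); i++) {
        if(offset_it != token_offsets.end()) {
            if(i == offset_it->first) {
                value_stream << highlight_start_tag;
            }

            if(i == offset_it->second) {
                value_stream << text[i];          // write the final character first
                value_stream << highlight_end_tag;
                offset_it++;
                continue;
            }
        }
        value_stream << text[i];
    }

    // prints: is <mark>a</mark> <mark>probable</mark> sign
    std::cout << value_stream.str() << "\n";
}
```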
@@ -908,6 +908,7 @@ void Index::search_candidates(const uint8_t & field_id,
        uint32_t total_cost = next_suggestion(token_candidates_vec, n, actual_query_suggestion,
                                              query_suggestion, token_bits);

        //LOG(INFO) << "field_num_results: " << field_num_results << ", typo_tokens_threshold: " << typo_tokens_threshold;
        /*LOG(INFO) << "n: " << n;
        for(size_t i=0; i < actual_query_suggestion.size(); i++) {
            LOG(INFO) << "i: " << i << " - " << actual_query_suggestion[i]->key << ", ids: "
@@ -1026,11 +1027,6 @@ void Index::search_candidates(const uint8_t & field_id,
        delete [] excluded_result_ids;

        searched_queries.push_back(actual_query_suggestion);

        //LOG(INFO) << "field_num_results: " << field_num_results << ", typo_tokens_threshold: " << typo_tokens_threshold;
        if(field_num_results >= typo_tokens_threshold) {
            break;
        }
    }
}

@@ -1667,15 +1663,21 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,

    uint64_t verbatim_match_fields = 0;      // query matching field verbatim
    uint64_t exact_match_fields = 0;         // number of fields that contains all of query tokens
    uint64_t max_weighted_tokens_match = 0;  // weighted max number of tokens matched in a field
    uint64_t total_token_matches = 0;        // total matches across fields (including fuzzy ones)

    //LOG(INFO) << "Init pop count: " << __builtin_popcount(token_bits);

    for(size_t i = 0; i < num_search_fields; i++) {
        const auto field_id = (uint8_t)(FIELD_LIMIT_NUM - i);
        size_t weight = search_fields[i].weight;
        const size_t priority = search_fields[i].priority;
        const size_t weight = search_fields[i].weight;

        //LOG(INFO) << "--- field index: " << i << ", weight: " << weight;
        //LOG(INFO) << "--- field index: " << i << ", priority: " << priority;
        // using `5` here because typo + prefix combo score range is: 0 - 5
        // 0    1    2
        // 0,1  2,3  4,5
        int64_t MAX_SUM_TYPOS = 5 * field_query_tokens[i].q_include_tokens.size();

        if(existing_field_kvs.count(field_id) != 0) {
            // for existing field, we will simply sum field-wise weighted scores
@@ -1686,14 +1688,22 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,

            uint64_t tokens_found = ((match_score >> 24) & 0xFF);
            uint64_t field_typos = 255 - ((match_score >> 16) & 0xFF);
            total_typos += (field_typos + 1) * weight;
            total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * weight;
            total_typos += (field_typos + 1) * priority;
            total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * priority;
            verbatim_match_fields += (((match_score & 0xFF)) + 1);

            if(field_typos == 0 && tokens_found == field_query_tokens[i].q_include_tokens.size()) {
            uint64_t unique_tokens_found =
                int64_t(__builtin_popcount(existing_field_kvs[field_id]->token_bits)) - 1;

            if(field_typos == 0 && unique_tokens_found == field_query_tokens[i].q_include_tokens.size()) {
                exact_match_fields++;
            }

            auto weighted_tokens_match = (tokens_found * weight) + (MAX_SUM_TYPOS - field_typos + 1);
            if(weighted_tokens_match > max_weighted_tokens_match) {
                max_weighted_tokens_match = weighted_tokens_match;
            }

            if(field_typos < min_typos) {
                min_typos = field_typos;
            }
@@ -1701,9 +1711,9 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,
            total_token_matches += tokens_found;

            /*LOG(INFO) << "seq_id: " << seq_id << ", total_typos: " << (255 - ((match_score >> 8) & 0xFF))
                      << ", weighted typos: " << std::max<uint64_t>((255 - ((match_score >> 8) & 0xFF)), 1) * weight
                      << ", weighted typos: " << std::max<uint64_t>((255 - ((match_score >> 8) & 0xFF)), 1) * priority
                      << ", total dist: " << (((match_score & 0xFF)))
                      << ", weighted dist: " << std::max<uint64_t>((100 - (match_score & 0xFF)), 1) * weight;*/
                      << ", weighted dist: " << std::max<uint64_t>((100 - (match_score & 0xFF)), 1) * priority;*/
            continue;
        }

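The shifts and masks above imply how the packed per-field `match_score` is read back, and how a field counts as an exact match (zero typos and a popcount of the per-field token bitmask covering every query token). A hedged decode sketch follows; the bit layout is inferred only from the expressions visible in the hunk, and the sample values and bitmask are made up:

```
// Sketch: unpacking the per-field match_score the way the hunk above reads it.
// The layout is inferred purely from the shifts/masks shown there; anything
// beyond those bytes is an assumption, and the sample values are invented.
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical packed score: 3 tokens found, 0 typos, best distance, verbatim byte 0.
    uint64_t match_score = (uint64_t(3)   << 24) |   // tokens found
                           (uint64_t(255) << 16) |   // 255 - typos     (so 0 typos)
                           (uint64_t(100) << 8)  |   // 100 - distance  (so distance term 0)
                           uint64_t(0);              // verbatim-match byte

    uint64_t tokens_found = (match_score >> 24) & 0xFF;
    uint64_t field_typos  = 255 - ((match_score >> 16) & 0xFF);
    uint64_t field_dist   = 100 - ((match_score >> 8) & 0xFF);   // distance term used in total_distances
    uint64_t verbatim     = match_score & 0xFF;

    // Unique query tokens matched in the field: popcount of a per-field token
    // bitmask, minus one in the hunk above. The bitmask here is illustrative.
    uint32_t token_bits = 0b1110;
    int unique_tokens_found = __builtin_popcount(token_bits) - 1;

    std::printf("tokens=%llu typos=%llu dist=%llu verbatim=%llu unique=%d\n",
                (unsigned long long)tokens_found, (unsigned long long)field_typos,
                (unsigned long long)field_dist, (unsigned long long)verbatim,
                unique_tokens_found);
}
```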
@@ -1746,14 +1756,20 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,

            uint64_t tokens_found = ((match_score >> 24) & 0xFF);
            uint64_t field_typos = 255 - ((match_score >> 16) & 0xFF);
            total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * weight;
            total_typos += (field_typos + 1) * weight;
            total_distances += ((100 - ((match_score >> 8) & 0xFF)) + 1) * priority;
            total_typos += (field_typos + 1) * priority;

            if(field_typos == 0 && tokens_found == field_query_tokens[i].q_include_tokens.size()) {
                exact_match_fields++;
                verbatim_match_fields++;  // this is only an approximate
            }

            auto weighted_tokens_match = (tokens_found * weight) + (MAX_SUM_TYPOS - field_typos + 1);

            if(weighted_tokens_match > max_weighted_tokens_match) {
                max_weighted_tokens_match = weighted_tokens_match;
            }

            if(field_typos < min_typos) {
                min_typos = field_typos;
            }
@@ -1768,9 +1784,11 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,

        total_typos = std::min<uint64_t>(255, total_typos);
        total_distances = std::min<uint64_t>(100, total_distances);
        max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);

        uint64_t aggregated_score = (
            (exact_match_fields << 48) |          // number of fields that contain *all tokens* in the query
            //(exact_match_fields << 48) |        // number of fields that contain *all tokens* in the query
            (max_weighted_tokens_match << 48) |   // weighted max number of tokens matched in a field
            (uniq_tokens_found << 40) |           // number of unique tokens found across fields including typos
            ((255 - min_typos) << 32) |           // minimum typo cost across all fields
            (total_token_matches << 24) |         // total matches across fields including typos
@@ -1783,6 +1801,7 @@ void Index::search(const std::vector<query_tokens_t>& field_query_tokens,

        /*LOG(INFO) << "seq id: " << seq_id
                  << ", exact_match_fields: " << exact_match_fields
                  << ", max_weighted_tokens_match: " << max_weighted_tokens_match
                  << ", uniq_tokens_found: " << uniq_tokens_found
                  << ", min typo score: " << (255 - min_typos)
                  << ", total_token_matches: " << total_token_matches
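A sketch of how the clamped components are packed into the per-document `aggregated_score`, using only the bit positions visible in the hunk above; the lower 24 bits are omitted here because the excerpt cuts off after `total_token_matches`, and the sample values are made up:

```
// Sketch: packing the per-document aggregated score from the components shown
// above. Only the bit positions visible in the hunk are reproduced; the lower
// 24 bits are left out because the diff excerpt cuts off after them.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    uint64_t total_typos = 3, total_distances = 7;
    uint64_t max_weighted_tokens_match = 12, uniq_tokens_found = 2;
    uint64_t min_typos = 1, total_token_matches = 4;

    // clamp, as in the hunk above
    total_typos = std::min<uint64_t>(255, total_typos);
    total_distances = std::min<uint64_t>(100, total_distances);
    max_weighted_tokens_match = std::min<uint64_t>(255, max_weighted_tokens_match);

    uint64_t aggregated_score =
        (max_weighted_tokens_match << 48) |  // weighted max tokens matched in a field
        (uniq_tokens_found << 40)         |  // unique tokens found across fields
        ((255 - min_typos) << 32)         |  // minimum typo cost across all fields
        (total_token_matches << 24);         // total matches across fields
        // ... remaining (lower) bits are not shown in this hunk

    std::printf("total_typos=%llu total_distances=%llu aggregated_score=%llu\n",
                (unsigned long long)total_typos, (unsigned long long)total_distances,
                (unsigned long long)aggregated_score);
}
```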
@@ -1930,6 +1949,12 @@ void Index::search_field(const uint8_t & field_id,
        // when no more costs are left for this token
        if(token_to_costs[token_index].empty()) {
            // we can try to drop the token and search with remaining tokens

            if(field_num_results >= drop_tokens_threshold) {
                // but if drop_tokens_threshold is breached, we are done
                return ;
            }

            token_to_costs.erase(token_to_costs.begin()+token_index);
            search_tokens.erase(search_tokens.begin()+token_index);
            query_tokens.erase(query_tokens.begin()+token_index);
@@ -1956,8 +1981,8 @@ void Index::search_field(const uint8_t & field_id,

    resume_typo_loop:

    if(field_num_results >= drop_tokens_threshold || field_num_results >= typo_tokens_threshold) {
        // if either threshold is breached, we are done
    if(field_num_results >= typo_tokens_threshold) {
        // if typo threshold is breached, we are done
        return ;
    }

@@ -1968,6 +1993,11 @@ void Index::search_field(const uint8_t & field_id,
    if(!query_tokens.empty() && num_tokens_dropped < query_tokens.size()) {
        // Drop tokens from right until (len/2 + 1), and then from left until (len/2 + 1)

        if(field_num_results >= drop_tokens_threshold) {
            // if drop_tokens_threshold is breached, we are done
            return ;
        }

        std::vector<token_t> truncated_tokens;
        num_tokens_dropped++;

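Taken together, the two hunks above appear to split the early-exit checks: the typo threshold is tested at `resume_typo_loop`, while the drop-tokens threshold is tested only just before another token is actually dropped. A simplified, illustrative control-flow sketch (not the full `search_field` implementation; names mirror the diff, the surrounding loop and token type are stand-ins):

```
// Sketch of the threshold split suggested by the hunks above (simplified,
// illustrative control flow; not the real search_field() implementation).
#include <cstddef>
#include <string>
#include <vector>

void search_field_sketch(std::vector<std::string>& query_tokens,
                         size_t& field_num_results,
                         size_t typo_tokens_threshold,
                         size_t drop_tokens_threshold,
                         size_t& num_tokens_dropped) {
    // ... candidate generation across typo costs would happen here ...

    // resume_typo_loop:
    if(field_num_results >= typo_tokens_threshold) {
        // if typo threshold is breached, we are done
        return;
    }

    if(!query_tokens.empty() && num_tokens_dropped < query_tokens.size()) {
        // drop tokens from right until (len/2 + 1), and then from left
        if(field_num_results >= drop_tokens_threshold) {
            // drop-tokens threshold is breached: stop instead of dropping more
            return;
        }

        std::vector<std::string> truncated_tokens;  // stands in for token_t
        num_tokens_dropped++;
        // ... continue the search with truncated_tokens ...
    }
}

int main() {
    std::vector<std::string> tokens = {"foo", "bar"};
    size_t found = 0, dropped = 0;
    search_field_sketch(tokens, found, /*typo_tokens_threshold=*/100,
                        /*drop_tokens_threshold=*/10, dropped);
}
```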
@@ -21,7 +21,7 @@ std::string get_query(StringUtils & string_utils, std::string & text) {

    for(uint32_t i=0; i<tokens.size(); i++) {
        auto token = tokens[i];
        string_utils.unicode_normalize(token);
        //string_utils.unicode_normalize(token);
        normalized_tokens.push_back(token);
    }

@@ -87,7 +87,7 @@ void benchmark_hn_titles(char* file_path) {
    auto begin = std::chrono::high_resolution_clock::now();

    for(size_t i = 0; i < queries.size(); i++) {
        auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("points", "DESC")}, 2, 10, 1, MAX_SCORE, true);
        auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("points", "DESC")}, {2}, 10, 1, MAX_SCORE, {true});
        if(results_op.ok() != true) {
            exit(2);
        }
@@ -152,8 +152,8 @@ void benchmark_reactjs_pages(char* file_path) {
    auto begin = std::chrono::high_resolution_clock::now();

    for(size_t i = 0; i < queries.size(); i++) {
        auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("dummy_sorting_field", "DESC")}, 2, 10, 1,
                                             MAX_SCORE, true, 10, spp::sparse_hash_set<std::string>(), {"p"});
        auto results_op = collection->search(queries[i], search_fields, "", { }, {sort_by("dummy_sorting_field", "DESC")}, {2}, 10, 1,
                                             MAX_SCORE, {true}, 10, spp::sparse_hash_set<std::string>(), {"p"});
        if(results_op.ok() != true) {
            exit(2);
        }
@@ -169,8 +169,6 @@ TEST_F(CollectionSpecificTest, ExactSingleFieldMatch) {
    auto results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {true, true}).get();

    LOG(INFO) << results;

    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());

@@ -200,11 +198,34 @@ TEST_F(CollectionSpecificTest, OrderMultiFieldFuzzyMatch) {
    ASSERT_TRUE(coll1->add(doc2.dump()).ok());

    auto results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {true, true}).get();
                                 1, FREQUENCY, {true, true},
                                 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 1}).get();

    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());

    results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
                            1, FREQUENCY, {true, true},
                            10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {2, 1}).get();

    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());

    // use extreme weights to push title matching ahead

    results = coll1->search("charger", {"title", "description"}, "", {}, {}, {2}, 10,
                            1, FREQUENCY, {true, true},
                            10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {10, 1}).get();

    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

@@ -242,6 +263,98 @@ TEST_F(CollectionSpecificTest, FieldWeighting) {
    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, MultiFieldArrayRepeatingTokens) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("description", field_types::STRING, false),
                                 field("attrs", field_types::STRING_ARRAY, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();

    nlohmann::json doc1;
    doc1["id"] = "0";
    doc1["title"] = "E182-72/4";
    doc1["description"] = "Nexsan Technologies 18 SAN Array - 18 x HDD Supported - 18 x HDD Installed";
    doc1["attrs"] = {"Hard Drives Supported > 18", "Hard Drives Installed > 18", "SSD Supported > 18"};
    doc1["points"] = 100;

    nlohmann::json doc2;
    doc2["id"] = "1";
    doc2["title"] = "RV345-K9-NA";
    doc2["description"] = "Cisco RV345P Router - 18 Ports";
    doc2["attrs"] = {"Number of Ports > 18", "Product Type > Router"};
    doc2["points"] = 50;

    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
    ASSERT_TRUE(coll1->add(doc2.dump()).ok());

    auto results = coll1->search("rv345 cisco 18", {"title", "description", "attrs"}, "", {}, {}, {1}, 10,
                                 1, FREQUENCY, {true, true, true}).get();

    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, ExactMatchOnPrefix) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();

    nlohmann::json doc1;
    doc1["id"] = "0";
    doc1["title"] = "Yeshivah Gedolah High School";
    doc1["points"] = 100;

    nlohmann::json doc2;
    doc2["id"] = "1";
    doc2["title"] = "GED";
    doc2["points"] = 50;

    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
    ASSERT_TRUE(coll1->add(doc2.dump()).ok());

    auto results = coll1->search("ged", {"title"}, "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 1).get();

    ASSERT_EQ(2, results["hits"].size());

    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, TypoPrefixSearchWithoutPrefixEnabled) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();

    nlohmann::json doc1;
    doc1["id"] = "0";
    doc1["title"] = "Cisco SG25026HP Gigabit Smart Switch";
    doc1["points"] = 100;

    ASSERT_TRUE(coll1->add(doc1.dump()).ok());

    auto results = coll1->search("SG25026H", {"title"}, "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {false}, 0,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "", 1).get();

    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, PrefixWithTypos) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};
@@ -433,3 +546,70 @@ TEST_F(CollectionSpecificTest, DeleteOverridesAndSynonymsOnDiskDuringCollDrop) {
    store->scan_fill(Collection::COLLECTION_SYNONYM_PREFIX, stored_values);
    ASSERT_TRUE(stored_values.empty());
}

TEST_F(CollectionSpecificTest, SingleCharMatchFullFieldHighlight) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();

    nlohmann::json doc1;
    doc1["id"] = "0";
    doc1["title"] = "Which of the following is a probable sign of infection?";
    doc1["points"] = 100;

    ASSERT_TRUE(coll1->add(doc1.dump()).ok());

    auto results = coll1->search("a 3-month", {"title"}, "", {}, {}, {2}, 10,
                                 1, FREQUENCY, {false}, 1,
                                 spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
                                 "title", 1).get();

    ASSERT_EQ(1, results["hits"].size());
    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());

    ASSERT_EQ("Which of the following is <mark>a</mark> probable sign of infection?",
              results["hits"][0]["highlights"][0]["snippet"].get<std::string>());

    ASSERT_EQ("Which of the following is <mark>a</mark> probable sign of infection?",
              results["hits"][0]["highlights"][0]["value"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, TokensSpreadAcrossFields) {
    std::vector<field> fields = {field("title", field_types::STRING, false),
                                 field("description", field_types::STRING, false),
                                 field("points", field_types::INT32, false),};

    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields, "points").get();

    nlohmann::json doc1;
    doc1["id"] = "0";
    doc1["title"] = "Foo bar baz";
    doc1["description"] = "Share information with this device.";
    doc1["points"] = 100;

    nlohmann::json doc2;
    doc2["id"] = "1";
    doc2["title"] = "Foo Random";
    doc2["description"] = "The Bar Fox";
    doc2["points"] = 250;

    ASSERT_TRUE(coll1->add(doc1.dump()).ok());
    ASSERT_TRUE(coll1->add(doc2.dump()).ok());

    auto results = coll1->search("foo bar", {"title", "description"}, "", {}, {}, {0}, 10,
                                 1, FREQUENCY, {false, false},
                                 10, spp::sparse_hash_set<std::string>(),
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {4, 1}).get();

    LOG(INFO) << results;

    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
    ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());

    collectionManager.drop_collection("coll1");
}

@@ -292,7 +292,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
    }

    results.clear();
    results = collection->search("the a DoesNotExist", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    results = collection->search("the a insurance", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}, 0).get();
    ASSERT_EQ(0, results["hits"].size());

    // with no indexed word
@@ -859,7 +859,10 @@ TEST_F(CollectionTest, MultipleFields) {
    // when "starring" takes higher priority than "title"

    query_fields = {"starring", "title"};
    results = coll_mul_fields->search("thomas", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false}).get();
    results = coll_mul_fields->search("thomas", query_fields, "", facets, sort_fields, {0}, 10, 1, FREQUENCY, {false},
                                      10, spp::sparse_hash_set<std::string>(),
                                      spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                      "<mark>", "</mark>", {2, 1}).get();
    ASSERT_EQ(4, results["hits"].size());

    ids = {"15", "12", "13", "14"};
@@ -2969,6 +2972,17 @@ TEST_F(CollectionTest, MultiFieldRelevance2) {
                                 spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                                 "<mark>", "</mark>", {1, 4}).get();

    ASSERT_STREQ("0", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("1", results["hits"][1]["document"]["id"].get<std::string>().c_str());

    // use same weights

    results = coll1->search("on a jetplane",
                            {"title", "artist"}, "", {}, {}, {0}, 10, 1, FREQUENCY,
                            {true}, 10, spp::sparse_hash_set<std::string>(),
                            spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "", 40, {}, {}, {}, 0,
                            "<mark>", "</mark>", {1, 1}).get();

    ASSERT_STREQ("1", results["hits"][0]["document"]["id"].get<std::string>().c_str());
    ASSERT_STREQ("0", results["hits"][1]["document"]["id"].get<std::string>().c_str());