Mirror of https://github.com/typesense/typesense.git (synced 2025-05-18 12:42:50 +08:00)
Search across multiple fields.
Need to write more tests.
This commit is contained in:
parent 54a60398ab
commit 2b6293650e
TODO.md

@@ -6,7 +6,7 @@
 - ~~Proper JSON as input~~
 - ~~Storing raw JSON input to RocksDB~~
-- ART for every indexed field
+- ~~ART for every indexed field~~
 - UTF-8 support for fuzzy search
 - Facets
 - Filters
@@ -19,10 +19,13 @@
 - only last token should be prefix searched
 - art int search should support signed ints
 - storage key prefix should include collection name
 - use art for indexing score as well
 - ISX what (score based on typo matches)
-- Mininum results should be a variable instead of blindly going with max_results
+- Minimum results should be a variable instead of blindly going with max_results
 - Benchmark with -ffast-math
 - Space sensitivity
 - Use bitmap index instead of forarray for doc list
+- ~~Search across multiple fields~~
+- Multi field search tests
+- Throw errors when schema is broken

 **API**
@@ -43,8 +43,13 @@ private:
                                                long long int n);

    void log_leaves(const int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const;

-    void search_candidates(std::vector<std::vector<art_leaf*>> & token_leaves, std::vector<Topster<100>::KV> & result_kvs,
-                           spp::sparse_hash_set<uint64_t> & dedup_seq_ids, size_t & total_results, const size_t & max_results);
+    std::vector<Topster<100>::KV> search(std::string & query, const std::string & field, const int num_typos, const size_t num_results,
+                                         std::vector<Topster<100>::KV> & result_kvs, spp::sparse_hash_set<uint64_t> & result_set,
+                                         const token_ordering token_order = FREQUENCY, const bool prefix = false);
+
+    void search_candidates(int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves,
+                           std::vector<Topster<100>::KV> & result_kvs, spp::sparse_hash_set<uint64_t> & dedup_seq_ids,
+                           size_t & total_results, const size_t & max_results);

    void index_string_field(const std::string &field_name, art_tree *t, const nlohmann::json &document, uint32_t seq_id) const;

@@ -56,12 +61,12 @@ public:
               const std::vector<std::string> rank_fields);
    ~Collection();
    std::string add(std::string json_str);
-    std::vector<nlohmann::json> search(std::string query, const int num_typos, const size_t num_results,
-                                       const token_ordering token_order = FREQUENCY, const bool prefix = false);
+    std::vector<nlohmann::json> search(std::string query, const std::vector<std::string> fields, const int num_typos,
+                                       const size_t num_results, const token_ordering token_order = FREQUENCY,
+                                       const bool prefix = false);
    void remove(std::string id);
-    void score_results(Topster<100> &topster, const std::vector<art_leaf *> &query_suggestion,
-                       const uint32_t *result_ids,
-                       const size_t result_size) const;
+    void score_results(Topster<100> &topster, const int & token_rank, const std::vector<art_leaf *> &query_suggestion,
+                       const uint32_t *result_ids, const size_t result_size) const;

    enum {MAX_SEARCH_TOKENS = 20};
    enum {MAX_RESULTS = 100};
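The public `search` entry point now takes an ordered list of fields. A minimal usage sketch, not part of this commit, assuming a `collection` built the way the tests below build one (a `title` string field plus a `points` rank field):

```cpp
#include <iostream>
#include <string>
#include <vector>
#include "collection.h"  // assumed project header declaring Collection

// Sketch only: `collection` is assumed to be constructed elsewhere, e.g.
// new Collection("/tmp/typesense_test/collection", "collection", fields, rank_fields)
void run_query(Collection* collection) {
    // Fields listed earlier are given a higher priority when scores tie
    std::vector<std::string> search_fields = {"title"};

    // query, fields, num_typos, num_results (token_order and prefix use defaults)
    std::vector<nlohmann::json> results = collection->search("rocket launch", search_fields, 1, 10);

    for(const nlohmann::json & doc : results) {
        std::cout << doc["id"] << std::endl;
    }
}
```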
@@ -143,7 +143,7 @@ void Collection::index_string_field(const std::string &field_name, art_tree *t,
     }
 }

-void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_leaves,
+void Collection::search_candidates(int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves,
                                    std::vector<Topster<100>::KV> & result_kvs, spp::sparse_hash_set<uint64_t> & dedup_seq_ids,
                                    size_t & total_results, const size_t & max_results) {
     const size_t combination_limit = 10;
@@ -153,6 +153,7 @@ void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_l
     for(long long n=0; n<N && n<combination_limit; ++n) {
         // every element in `query_suggestion` contains a token and its associated hits
         std::vector<art_leaf *> query_suggestion = next_suggestion(token_leaves, n);
+        token_rank++;

         /*std:: cout << "\nSuggestion: ";
         for(auto suggestion_leaf: query_suggestion) {
@@ -178,7 +179,7 @@ void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_l

         // go through each matching document id and calculate match score
         Topster<100> topster;
-        score_results(topster, query_suggestion, result_ids, result_size);
+        score_results(topster, token_rank, query_suggestion, result_ids, result_size);
         delete[] result_ids;
         topster.sort();

@@ -191,10 +192,58 @@ void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_l
             }
         }

-        if(total_results >= max_results) break;
+        if(total_results >= max_results) {
+            break;
+        }
     }
 }

+std::vector<nlohmann::json> Collection::search(std::string query, const std::vector<std::string> fields,
+                                               const int num_typos, const size_t num_results,
+                                               const token_ordering token_order, const bool prefix) {
+    // Order of `fields` are used to rank results
+    auto begin = std::chrono::high_resolution_clock::now();
+    std::vector<std::pair<int, Topster<100>::KV>> field_order_kvs;
+
+    for(int i = 0; i < fields.size(); i++) {
+        const std::string & field = fields[i];
+
+        // Container for holding the results
+        std::vector<Topster<100>::KV> result_kvs;
+
+        // To prevent duplicate results, while preserving order of result vector
+        spp::sparse_hash_set<uint64_t> result_set;
+
+        search(query, field, num_typos, num_results, result_kvs, result_set, token_order, prefix);
+        for(auto result_kv: result_kvs) {
+            field_order_kvs.push_back(std::make_pair(fields.size() - i, result_kv));
+        }
+    }
+
+    std::sort(field_order_kvs.begin(), field_order_kvs.end(),
+              [](const std::pair<int, Topster<100>::KV> & a, const std::pair<int, Topster<100>::KV> & b) {
+                  if(a.second.match_score != b.second.match_score) return a.second.match_score > b.second.match_score;
+                  if(a.second.primary_attr != b.second.primary_attr) return a.second.primary_attr > b.second.primary_attr;
+                  if(a.second.secondary_attr != b.second.secondary_attr) return a.second.secondary_attr > b.second.secondary_attr;
+                  if(a.first != b.first) return a.first > b.first;
+                  return a.second.key > b.second.key;
+              });
+
+    std::vector<nlohmann::json> results;
+
+    for(auto field_order_kv: field_order_kvs) {
+        std::string value;
+        store->get(get_seq_id_key((uint32_t) field_order_kv.second.key), value);
+        nlohmann::json document = nlohmann::json::parse(value);
+        results.push_back(document);
+    }
+
+    long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
+    std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
+    store->print_memory_usage();
+    return results;
+}
+
 /*
    1. Split the query into tokens
    2. Outer loop will generate bounded cartesian product with costs for each token
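The multi-field strategy above runs the single-field search once per field, tags every hit with a field-priority weight of `fields.size() - i` (so earlier fields weigh more), then globally sorts. Field priority only breaks ties after `match_score`, `primary_attr` and `secondary_attr` all agree. A self-contained sketch of that tie-breaking, using a simplified stand-in for `Topster<100>::KV` and illustrative values:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Simplified stand-in for Topster<100>::KV, for illustration only.
struct KV {
    uint64_t key;            // sequence id of the document
    uint64_t match_score;    // packed text-match score
    int64_t  primary_attr;   // first rank field (e.g. "points")
    int64_t  secondary_attr; // second rank field
};

int main() {
    // weight = fields.size() - field_index, so earlier fields weigh more
    std::vector<std::pair<int, KV>> kvs = {
        {1, {42, 900, 10, 0}},  // hit from the 2nd of 2 fields
        {2, {37, 900, 10, 0}},  // identical scores, but from the 1st field
    };

    std::sort(kvs.begin(), kvs.end(), [](const auto & a, const auto & b) {
        if(a.second.match_score != b.second.match_score) return a.second.match_score > b.second.match_score;
        if(a.second.primary_attr != b.second.primary_attr) return a.second.primary_attr > b.second.primary_attr;
        if(a.second.secondary_attr != b.second.secondary_attr) return a.second.secondary_attr > b.second.secondary_attr;
        if(a.first != b.first) return a.first > b.first;  // field priority breaks the tie
        return a.second.key > b.second.key;               // final order: larger doc id first
    });

    std::cout << kvs[0].second.key << std::endl;  // prints 37: the first-field hit wins the tie
}
```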
@@ -204,25 +253,22 @@ void Collection::search_candidates(std::vector<std::vector<art_leaf*>> & token_l
    4. Intersect the lists to find docs that match each phrase
    5. Sort the docs based on some ranking criteria
 */
-std::vector<nlohmann::json> Collection::search(std::string query, const int num_typos, const size_t num_results,
-                                               const token_ordering token_order, const bool prefix) {
-    auto begin = std::chrono::high_resolution_clock::now();
-
+std::vector<Topster<100>::KV> Collection::search(std::string & query, const std::string & field,
+                                                 const int num_typos, const size_t num_results,
+                                                 std::vector<Topster<100>::KV> & result_kvs,
+                                                 spp::sparse_hash_set<uint64_t> & result_set,
+                                                 const token_ordering token_order, const bool prefix) {
     std::vector<std::string> tokens;
     StringUtils::tokenize(query, tokens, " ", true);

     const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
     const size_t max_results = std::min(num_results, (size_t) Collection::MAX_RESULTS);

-    size_t total_results = 0;
-    std::vector<Topster<100>::KV> result_kvs;
+    size_t total_results = result_kvs.size();

     // To prevent us from doing ART search repeatedly as we iterate through possible corrections
     spp::sparse_hash_map<std::string, std::vector<art_leaf*>> token_cost_cache;

-    // To prevent duplicate results, while preserving order of result vector
-    spp::sparse_hash_set<uint64_t> result_set;
-
     // Used to drop the least occurring token(s) for partial searches
     spp::sparse_hash_map<std::string, uint32_t> token_to_count;

@@ -239,8 +285,10 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
     }

     std::vector<std::vector<art_leaf*>> token_leaves;
+
     const size_t combination_limit = 10;
     auto product = []( long long a, std::vector<int>& b ) { return a*b.size(); };
+    int token_rank = 0;
     long long n = 0;
     long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);

@@ -256,7 +304,6 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_

         token_leaves.clear();
         int token_index = 0;
-        bool retry_with_larger_cost = false;

         while(token_index < tokens.size()) {
             // For each token, look up the generated cost for this iteration and search using that cost
@@ -264,13 +311,14 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
             const std::string token_cost_hash = token + std::to_string(costs[token_index]);

             std::vector<art_leaf*> leaves;
-            //std::cout << "\nSearching for: " << token << " - cost: " << costs[token_index] << std::endl;
+            /*std::cout << "\nSearching for: " << token << " - cost: " << costs[token_index] << ", token_rank: "
+                       << token_rank << std::endl;*/

             if(token_cost_cache.count(token_cost_hash) != 0) {
                 leaves = token_cost_cache[token_cost_hash];
             } else {
                 int token_len = prefix ? (int) token.length() : (int) token.length() + 1;
-                art_fuzzy_search(index_map.at("title"), (const unsigned char *) token.c_str(), token_len,
+                art_fuzzy_search(index_map.at(field), (const unsigned char *) token.c_str(), token_len,
                                  costs[token_index], costs[token_index], 3, token_order, prefix, leaves);
                 if(!leaves.empty()) {
                     token_cost_cache.emplace(token_cost_hash, leaves);
@@ -298,22 +346,16 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
                 n = -1;
                 N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);

-                // Don't look at remaining tokens if
-                // a) We've run out of tokens, or b) We're not at at max_cost for this token
-                // since we would see them again in a future iteration when we retry with a larger cost
-                if(token_index == -1 || costs[token_index] != max_cost) {
-                    retry_with_larger_cost = true;
-                    break;
-                }
+                break;
             }

             token_index++;
         }

-        if(token_leaves.size() != 0 && !retry_with_larger_cost) {
+        if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) {
             // If a) all tokens were found, or b) Some were skipped because they don't exist within max_cost,
             // go ahead and search for candidates with what we have so far
-            search_candidates(token_leaves, result_kvs, result_set, total_results, max_results);
+            search_candidates(token_rank, token_leaves, result_kvs, result_set, total_results, max_results);

             if (total_results >= max_results) {
                 // If we don't find enough results, we continue outerloop (looking at tokens with greater cost)
@@ -324,7 +366,8 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
         n++;
     }

-    if(result_kvs.size() == 0 && token_to_count.size() != 0) {
+    // When there are not enough overall results and atleast one token has results
+    if(result_kvs.size() < max_results && token_to_count.size() > 1) {
        // Drop certain token with least hits and try searching again
        std::string truncated_query;

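Reading the change: each candidate query is one combination of per-token corrections, enumerated in increasing typo-cost order, and the new `token_rank` counter simply numbers those combinations as they are tried; `score_results` later converts it to `max_token_rank - token_rank`, so earlier (cheaper) combinations score higher. A standalone sketch of the same bounded cartesian-product walk that `next_suggestion` performs, with hypothetical tokens and corrections:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Sketch: decode a single counter `n` into one pick per token (mixed-radix),
// the way next_suggestion() generates "the next combination" from token_leaves.
int main() {
    // hypothetical corrections per token, cheapest first
    std::vector<std::vector<std::string>> options = {
        {"rocket", "rockets"},           // candidates for token 1
        {"launch", "lunch", "larch"},    // candidates for token 2
    };

    long long N = 1;
    for(const auto & opts : options) N *= opts.size();

    const long long combination_limit = 10;
    int token_rank = 0;

    for(long long n = 0; n < N && n < combination_limit; n++) {
        token_rank++;  // earlier combinations later receive a higher rank score
        long long rem = n;
        std::cout << "rank " << token_rank << ":";
        for(const auto & opts : options) {
            std::cout << " " << opts[rem % opts.size()];
            rem /= opts.size();
        }
        std::cout << std::endl;
    }
}
```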
@@ -340,27 +383,15 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
         );

         for(uint32_t i = 0; i < token_count_pairs.size()-1; i++) {
-            if(token_to_count.count(tokens[i]) != 0) {
+            if(token_to_count.count(token_count_pairs[i].first) != 0) {
                 truncated_query += " " + token_count_pairs.at(i).first;
             }
         }

-        return search(truncated_query, num_typos, num_results);
+        return search(truncated_query, field, num_typos, num_results, result_kvs, result_set, token_order, prefix);
     }

-    std::vector<nlohmann::json> results;
-
-    for(auto result_kv: result_kvs) {
-        std::string value;
-        store->get(get_seq_id_key((uint32_t) result_kv.key), value);
-        nlohmann::json document = nlohmann::json::parse(value);
-        results.push_back(document);
-    }
-
-    long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
-    std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
-    store->print_memory_usage();
-    return results;
+    return result_kvs;
 }

 void Collection::log_leaves(const int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const {
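The fallback branch above drops the rarest token and recurses with the shortened query, now threading through the `field`, the accumulated `result_kvs` and the `result_set`; document fetching moves to the multi-field overload, so the per-field search just returns raw KVs. A standalone sketch of the token-dropping idea, with hypothetical hit counts (the real sort of `token_count_pairs` happens just above this hunk):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Sketch of the partial-search fallback: rank tokens by how many documents
// they hit, drop the rarest one, and re-run the query with the rest.
int main() {
    // hypothetical per-token hit counts
    std::vector<std::pair<std::string, uint32_t>> token_count_pairs = {
        {"rocket", 24}, {"research", 3}, {"the", 112},
    };

    // most frequent tokens first; the rarest token ends up last
    std::sort(token_count_pairs.begin(), token_count_pairs.end(),
              [](const auto & a, const auto & b) { return a.second > b.second; });

    std::string truncated_query;
    for(size_t i = 0; i < token_count_pairs.size() - 1; i++) {  // skip the last (rarest) token
        truncated_query += " " + token_count_pairs[i].first;
    }

    std::cout << "retrying with:" << truncated_query << std::endl;  // " the rocket"
}
```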
@@ -374,8 +405,12 @@ void Collection::log_leaves(const int cost, const std::string &token, const std:
     }
 }

-void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf *> &query_suggestion,
-                               const uint32_t *result_ids, const size_t result_size) const {
+void Collection::score_results(Topster<100> &topster, const int & token_rank,
+                               const std::vector<art_leaf *> &query_suggestion, const uint32_t *result_ids,
+                               const size_t result_size) const {
+
+    const int max_token_rank = 250;
+
     for(auto i=0; i<result_size; i++) {
         uint32_t doc_id = result_ids[i];
         std::vector<std::vector<uint16_t>> token_positions;
@@ -405,18 +440,23 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
             mscore = MatchScore::match_score(doc_id, token_positions);
         }

-        const uint64_t match_score = (uint64_t)(mscore.words_present * 32 + (MAX_SEARCH_TOKENS - mscore.distance));
+        int token_rank_score = max_token_rank - token_rank;
+
+        // Construct a single match_score from individual components (for multi-field sort)
+        const uint64_t match_score = (token_rank_score << 16) +
+                                     ((uint64_t)(mscore.words_present) << 8) +
+                                     (MAX_SEARCH_TOKENS - mscore.distance);

         int64_t primary_rank_score = primary_rank_scores.count(doc_id) > 0 ? primary_rank_scores.at(doc_id) : 0;
         int64_t secondary_rank_score = secondary_rank_scores.count(doc_id) > 0 ? secondary_rank_scores.at(doc_id) : 0;
         topster.add(doc_id, match_score, primary_rank_score, secondary_rank_score);
-        /*std::cout << "mscore.distance: " << (int) mscore.distance << ", match_score: "
+        /*std::cout << "token_rank_score: " << token_rank_score << ", match_score: "
                    << match_score << ", primary_rank_score: " << primary_rank_score << ", doc_id: " << doc_id << std::endl;*/
     }
 }

-inline std::vector<art_leaf *> Collection::next_suggestion(
-        const std::vector<std::vector<art_leaf *>> &token_leaves,
-        long long int n) {
+inline std::vector<art_leaf *> Collection::next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
+                                                           long long int n) {
     std::vector<art_leaf*> query_suggestion(token_leaves.size());

     // generate the next combination from `token_leaves` and store it in `query_suggestion`
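The reworked `match_score` packs three components into a single integer so that one numeric comparison sorts by token rank first, then word coverage, then proximity. A standalone sketch of the layout with illustrative values (`MAX_SEARCH_TOKENS` mirrors the constant in collection.h):

```cpp
#include <cstdint>
#include <iostream>

// Sketch of the composite match_score layout used for multi-field sorting:
// bits 16+ : token rank score (cheaper, earlier suggestion => larger value)
// bits 8-15: number of query words present in the document
// bits 0-7 : proximity component (MAX_SEARCH_TOKENS - distance)
const int MAX_SEARCH_TOKENS = 20;

uint64_t pack_match_score(int token_rank_score, uint8_t words_present, uint8_t distance) {
    return ((uint64_t) token_rank_score << 16) +
           ((uint64_t) words_present << 8) +
           (MAX_SEARCH_TOKENS - distance);
}

int main() {
    // a 2-word match from a cheap (early) suggestion...
    uint64_t a = pack_match_score(250, 2, 1);
    // ...beats the same match found only by a costlier suggestion
    uint64_t b = pack_match_score(248, 2, 1);
    // ...and word coverage dominates proximity
    uint64_t c = pack_match_score(250, 1, 0);

    std::cout << (a > b) << " " << (a > c) << std::endl;  // prints "1 1"
}
```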
@@ -56,7 +56,8 @@ int main() {
     collection->remove("foo");

     auto begin = std::chrono::high_resolution_clock::now();
-    collection->search("the", 1, 100);
+    std::vector<std::string> search_fields = {"title"};
+    collection->search("the", search_fields, 1, 100);
     long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
     cout << "Time taken: " << timeMillis << "us" << endl;
     delete collection;
@@ -84,7 +84,10 @@ static int get_search(h2o_handler_t *self, h2o_req_t *req) {
     printf("Query: %s\n", query_map["q"].c_str());
     auto begin = std::chrono::high_resolution_clock::now();

-    std::vector<nlohmann::json> results = collection->search(query_map["q"], std::stoi(query_map[NUM_TYPOS]), 100, token_order, false);
+    std::vector<std::string> search_fields = {"title"};
+
+    std::vector<nlohmann::json> results = collection->search(query_map["q"], search_fields,
+                                                             std::stoi(query_map[NUM_TYPOS]), 100, token_order, false);
     nlohmann::json json_array = nlohmann::json::array();
     for(nlohmann::json& result: results) {
         json_array.push_back(result);
@@ -7,11 +7,13 @@
 class CollectionTest : public ::testing::Test {
 protected:
     Collection *collection;
+    std::vector<std::string> search_fields;

     virtual void SetUp() {
         std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.jsonl");
         std::vector<field> fields = {field("title", field_type::STRING)};
         std::vector<std::string> rank_fields = {"points"};
+        search_fields = {"title"};
         collection = new Collection("/tmp/typesense_test/collection", "collection", fields, rank_fields);

         std::string json_line;
@@ -29,7 +31,7 @@ protected:
 };

 TEST_F(CollectionTest, ExactSearchShouldBeStable) {
-    std::vector<nlohmann::json> results = collection->search("the", 0, 10);
+    std::vector<nlohmann::json> results = collection->search("the", search_fields, 0, 10);
     ASSERT_EQ(7, results.size());

     // For two documents of the same score, the larger doc_id appears first
@@ -44,8 +46,8 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
 }

 TEST_F(CollectionTest, ExactPhraseSearch) {
-    std::vector<nlohmann::json> results = collection->search("rocket launch", 0, 10);
-    ASSERT_EQ(4, results.size());
+    std::vector<nlohmann::json> results = collection->search("rocket launch", search_fields, 0, 10);
+    ASSERT_EQ(5, results.size());

     /*
        Sort by (match, diff, score)
@@ -53,9 +55,10 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
        1: score: 15, diff: 4
        17: score: 8, diff: 4
        16: score: 10, diff: 5
+       13: score: 12, (single word match)
     */

-    std::vector<std::string> ids = {"8", "1", "17", "16"};
+    std::vector<std::string> ids = {"8", "1", "17", "16", "13"};

     for(size_t i = 0; i < results.size(); i++) {
         nlohmann::json result = results.at(i);
@@ -65,7 +68,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {
     }

     // Check pagination
-    results = collection->search("rocket launch", 0, 3);
+    results = collection->search("rocket launch", search_fields, 0, 3);
     ASSERT_EQ(3, results.size());
     for(size_t i = 0; i < 3; i++) {
         nlohmann::json result = results.at(i);
@@ -77,7 +80,7 @@ TEST_F(CollectionTest, ExactPhraseSearch) {

 TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     // Tokens that are not found in the index should be skipped
-    std::vector<nlohmann::json> results = collection->search("DoesNotExist from", 0, 10);
+    std::vector<nlohmann::json> results = collection->search("DoesNotExist from", search_fields, 0, 10);
     ASSERT_EQ(2, results.size());

     std::vector<std::string> ids = {"2", "17"};
@@ -90,7 +93,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }

     // with non-zero cost
-    results = collection->search("DoesNotExist from", 1, 10);
+    results = collection->search("DoesNotExist from", search_fields, 1, 10);
     ASSERT_EQ(2, results.size());

     for(size_t i = 0; i < results.size(); i++) {
@@ -101,7 +104,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }

     // with 2 indexed words
-    results = collection->search("from DoesNotExist insTruments", 1, 10);
+    results = collection->search("from DoesNotExist insTruments", search_fields, 1, 10);
     ASSERT_EQ(2, results.size());
     ids = {"2", "17"};

@@ -113,16 +116,16 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }

     results.clear();
-    results = collection->search("DoesNotExist1 DoesNotExist2", 0, 10);
+    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, 0, 10);
     ASSERT_EQ(0, results.size());

     results.clear();
-    results = collection->search("DoesNotExist1 DoesNotExist2", 2, 10);
+    results = collection->search("DoesNotExist1 DoesNotExist2", search_fields, 2, 10);
     ASSERT_EQ(0, results.size());
 }

 TEST_F(CollectionTest, PartialPhraseSearch) {
-    std::vector<nlohmann::json> results = collection->search("rocket research", 0, 10);
+    std::vector<nlohmann::json> results = collection->search("rocket research", search_fields, 0, 10);
     ASSERT_EQ(4, results.size());

     std::vector<std::string> ids = {"1", "8", "16", "17"};
@@ -136,15 +139,23 @@ TEST_F(CollectionTest, PartialPhraseSearch) {
 }

 TEST_F(CollectionTest, QueryWithTypo) {
-    std::vector<nlohmann::json> results = collection->search("kind biologcal", 2, 10);
-    ASSERT_EQ(1, results.size());
+    std::vector<nlohmann::json> results = collection->search("kind biologcal", search_fields, 2, 3);
+    ASSERT_EQ(3, results.size());

-    std::string result_id = results.at(0)["id"];
-    ASSERT_STREQ("19", result_id.c_str());
+    std::vector<std::string> ids = {"19", "20", "21"};
+
+    for(size_t i = 0; i < results.size(); i++) {
+        nlohmann::json result = results.at(i);
+        std::string result_id = result["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }

     results.clear();
-    results = collection->search("fer thx", 1, 10);
-    std::vector<std::string> ids = {"1", "10", "13"};
+    results = collection->search("fer thx", search_fields, 1, 3);
+    ids = {"1", "10", "13"};

     ASSERT_EQ(3, results.size());

     for(size_t i = 0; i < results.size(); i++) {
         nlohmann::json result = results.at(i);
@@ -155,7 +166,7 @@ TEST_F(CollectionTest, QueryWithTypo) {
 }

 TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
-    std::vector<nlohmann::json> results = collection->search("loox", 1, 2, MAX_SCORE, false);
+    std::vector<nlohmann::json> results = collection->search("loox", search_fields, 1, 2, MAX_SCORE, false);
     ASSERT_EQ(2, results.size());
     std::vector<std::string> ids = {"22", "23"};

@@ -166,7 +177,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = collection->search("loox", 1, 3, FREQUENCY, false);
+    results = collection->search("loox", search_fields, 1, 3, FREQUENCY, false);
     ASSERT_EQ(3, results.size());
     ids = {"3", "12", "24"};

@@ -178,17 +189,17 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
     }

     // Check pagination
-    results = collection->search("loox", 1, 1, FREQUENCY, false);
+    results = collection->search("loox", search_fields, 1, 1, FREQUENCY, false);
     ASSERT_EQ(1, results.size());
     std::string solo_id = results.at(0)["id"];
     ASSERT_STREQ("3", solo_id.c_str());

-    results = collection->search("loox", 1, 2, FREQUENCY, false);
+    results = collection->search("loox", search_fields, 1, 2, FREQUENCY, false);
     ASSERT_EQ(2, results.size());

     // Check total ordering

-    results = collection->search("loox", 1, 10, FREQUENCY, false);
+    results = collection->search("loox", search_fields, 1, 10, FREQUENCY, false);
     ASSERT_EQ(5, results.size());
     ids = {"3", "12", "24", "22", "23"};

@@ -199,7 +210,7 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = collection->search("loox", 1, 10, MAX_SCORE, false);
+    results = collection->search("loox", search_fields, 1, 10, MAX_SCORE, false);
     ASSERT_EQ(5, results.size());
     ids = {"22", "23", "3", "12", "24"};

@@ -213,10 +224,23 @@ TEST_F(CollectionTest, TypoTokenRankedByScoreAndFrequency) {

 TEST_F(CollectionTest, TextContainingAnActualTypo) {
     // A line contains "ISX" but not "what" - need to ensure that correction to "ISS what" happens
-    std::vector<nlohmann::json> results = collection->search("ISX what", 1, 10, FREQUENCY, false);
-    ASSERT_EQ(5, results.size());
+    std::vector<nlohmann::json> results = collection->search("ISX what", search_fields, 1, 4, FREQUENCY, false);
+    ASSERT_EQ(4, results.size());

-    std::vector<std::string> ids = {"20", "19", "6", "21", "8"};
+    std::vector<std::string> ids = {"19", "6", "21", "8"};

     for(size_t i = 0; i < results.size(); i++) {
         nlohmann::json result = results.at(i);
         std::string result_id = result["id"];
         std::string id = ids.at(i);
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

+    // Record containing exact token match should appear first
+    results = collection->search("ISX", search_fields, 1, 10, FREQUENCY, false);
+    ASSERT_EQ(8, results.size());
+
+    ids = {"20", "19", "6", "3", "21", "4", "10", "8"};
+
+    for(size_t i = 0; i < results.size(); i++) {
+        nlohmann::json result = results.at(i);
@@ -227,7 +251,7 @@ TEST_F(CollectionTest, TextContainingAnActualTypo) {
 }

 TEST_F(CollectionTest, PrefixSearching) {
-    std::vector<nlohmann::json> results = collection->search("ex", 0, 10, FREQUENCY, true);
+    std::vector<nlohmann::json> results = collection->search("ex", search_fields, 0, 10, FREQUENCY, true);
     ASSERT_EQ(2, results.size());
     std::vector<std::string> ids = {"12", "6"};

@@ -238,7 +262,7 @@ TEST_F(CollectionTest, PrefixSearching) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }

-    results = collection->search("ex", 0, 10, MAX_SCORE, true);
+    results = collection->search("ex", search_fields, 0, 10, MAX_SCORE, true);
     ASSERT_EQ(2, results.size());
     ids = {"6", "12"};