From 6ad5e361799f7fbd96ff66b3baf8821d5f20902f Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishore@kishorelive.com>
Date: Wed, 11 Apr 2018 22:25:09 +0530
Subject: [PATCH 1/4] Allow configuring a threshold for dropping tokens.

Fixes https://github.com/typesense/typesense/issues/20
---
 TODO.md                  |  3 ++-
 include/collection.h     |  3 ++-
 include/index.h          | 17 ++++++++++++-----
 src/api.cpp              | 12 +++++++++++-
 src/collection.cpp       |  5 +++--
 src/index.cpp            | 14 ++++++++------
 test/collection_test.cpp | 13 +++++++++++++
 7 files changed, 51 insertions(+), 16 deletions(-)
diff --git a/TODO.md b/TODO.md
index 4d8e36b1..b4479eda 100644
--- a/TODO.md
+++ b/TODO.md
@@ -96,9 +96,10 @@
 - ~~gzip compress responses~~
 - ~~Have a LOG(ERROR) level~~
 - ~~Handle SIGTERM which is sent when process is killed~~
+- Exact search 
+- NOT operator support
 - Log operations
 - Parameterize replica's MAX_UPDATES_TO_SEND
-- NOT operator support
 - > INT32_MAX validation for float field
 - highlight of string arrays?
 - test for token ranking on float field
diff --git a/include/collection.h b/include/collection.h
index 13f89a41..8f5ac786 100644
--- a/include/collection.h
+++ b/include/collection.h
@@ -96,7 +96,8 @@ public:
                           const std::string & simple_filter_query, const std::vector<std::string> & facet_fields,
                           const std::vector<sort_by> & sort_fields, const int num_typos,
                           const size_t per_page = 10, const size_t page = 1,
-                          const token_ordering token_order = FREQUENCY, const bool prefix = false);
+                          const token_ordering token_order = FREQUENCY, const bool prefix = false,
+                          const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD);
 
     Option<nlohmann::json> get(const std::string & id);
 
diff --git a/include/index.h b/include/index.h
index ec51aede..79ab0ad2 100644
--- a/include/index.h
+++ b/include/index.h
@@ -32,6 +32,7 @@ struct search_args {
     size_t page;
     token_ordering token_order;
     bool prefix;
+    size_t drop_tokens_threshold;
     std::vector<std::pair<int, Topster<512>::KV>> field_order_kvs;
     size_t all_result_ids_len;
     std::vector<std::vector<art_leaf*>> searched_queries;
@@ -43,10 +44,11 @@ struct search_args {
 
     search_args(std::string query, std::vector<std::string> search_fields, std::vector<filter> filters,
                 std::vector<facet> facets, std::vector<sort_by> sort_fields_std, int num_typos,
-                size_t per_page, size_t page, token_ordering token_order, bool prefix):
+                size_t per_page, size_t page, token_ordering token_order, bool prefix, size_t drop_tokens_threshold):
             query(query), search_fields(search_fields), filters(filters), facets(facets),
             sort_fields_std(sort_fields_std), num_typos(num_typos), per_page(per_page), page(page),
-            token_order(token_order), prefix(prefix), all_result_ids_len(0), outcome(0) {
+            token_order(token_order), prefix(prefix), drop_tokens_threshold(drop_tokens_threshold),
+            all_result_ids_len(0), outcome(0) {
 
     }
 };
@@ -91,7 +93,8 @@ private:
                       const int num_typos, const size_t num_results,
                       std::vector<std::vector<art_leaf*>> & searched_queries,
                       Topster<512> & topster, uint32_t** all_result_ids,
-                      size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY, const bool prefix = false);
+                      size_t & all_result_ids_len, const token_ordering token_order = FREQUENCY,
+                      const bool prefix = false, const size_t drop_tokens_threshold = Index::DROP_TOKENS_THRESHOLD);
 
     void search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
                            const std::vector<sort_by> & sort_fields, std::vector<token_candidates> & token_to_candidates,
@@ -138,7 +141,7 @@ public:
                           const std::vector<filter> & filters, std::vector<facet> & facets,
                           std::vector<sort_by> sort_fields_std, const int num_typos,
                           const size_t per_page, const size_t page,
-                          const token_ordering token_order, const bool prefix,
+                          const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold,
                           std::vector<std::pair<int, Topster<512>::KV>> & field_order_kv,
                           size_t & all_result_ids_len, std::vector<std::vector<art_leaf*>> & searched_queries);
 
@@ -150,7 +153,11 @@ public:
 
     Option<uint32_t> index_in_memory(const nlohmann::json & document, uint32_t seq_id, int32_t points);
 
-    static const int SEARCH_LIMIT_NUM = 100;  // for limiting number of results on multiple candidates / query rewrites
+    static const int SEARCH_LIMIT_NUM = 100;      // for limiting number of results on multiple candidates / query rewrites
+
+    // If the number of results found is less than this threshold, Typesense will attempt to drop the query
+    // tokens that have the fewest individual hits, one at a time, until enough results are found.
+    static const int DROP_TOKENS_THRESHOLD = 10;
 
     // strings under this length will be fully highlighted, instead of showing a snippet of relevant portion
     enum {SNIPPET_STR_ABOVE_LEN = 30};
diff --git a/src/api.cpp b/src/api.cpp
index b56218a4..2e5d5093 100644
--- a/src/api.cpp
+++ b/src/api.cpp
@@ -161,6 +161,7 @@ void get_search(http_req & req, http_res & res) {
 
     const char *NUM_TYPOS = "num_typos";
     const char *PREFIX = "prefix";
+    const char *DROP_TOKENS_THRESHOLD = "drop_tokens_threshold";
     const char *FILTER = "filter_by";
     const char *QUERY = "q";
     const char *QUERY_BY = "query_by";
@@ -179,6 +180,10 @@ void get_search(http_req & req, http_res & res) {
         req.params[PREFIX] = "true";
     }
 
+    if(req.params.count(DROP_TOKENS_THRESHOLD) == 0) {
+        req.params[DROP_TOKENS_THRESHOLD] = std::to_string(Index::DROP_TOKENS_THRESHOLD);
+    }
+
     if(req.params.count(QUERY) == 0) {
         return res.send_400(std::string("Parameter `") + QUERY + "` is required.");
     }
@@ -195,6 +200,10 @@ void get_search(http_req & req, http_res & res) {
         req.params[PAGE] = "1";
     }
 
+    if(!StringUtils::is_uint64_t(req.params[DROP_TOKENS_THRESHOLD])) {
+        return res.send_400("Parameter `" + std::string(DROP_TOKENS_THRESHOLD) + "` must be an unsigned integer.");
+    }
+
     if(!StringUtils::is_uint64_t(req.params[NUM_TYPOS])) {
         return res.send_400("Parameter `" + std::string(NUM_TYPOS) + "` must be an unsigned integer.");
     }
@@ -245,6 +254,7 @@ void get_search(http_req & req, http_res & res) {
     }
 
     bool prefix = (req.params[PREFIX] == "true");
+    const size_t drop_tokens_threshold = (size_t) std::stoi(req.params[DROP_TOKENS_THRESHOLD]);
 
     if(req.params.count(RANK_TOKENS_BY) == 0) {
         req.params[RANK_TOKENS_BY] = "DEFAULT_SORTING_FIELD";
@@ -256,7 +266,7 @@ void get_search(http_req & req, http_res & res) {
     Option<nlohmann::json> result_op = collection->search(req.params[QUERY], search_fields, filter_str, facet_fields,
                                                sort_fields, std::stoi(req.params[NUM_TYPOS]),
                                                std::stoi(req.params[PER_PAGE]), std::stoi(req.params[PAGE]),
-                                               token_order, prefix);
+                                               token_order, prefix, drop_tokens_threshold);
 
     uint64_t timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(
                                std::chrono::high_resolution_clock::now() - begin).count();
diff --git a/src/collection.cpp b/src/collection.cpp
index 27436382..5d87140d 100644
--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -264,7 +264,8 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
                                   const std::string & simple_filter_query, const std::vector<std::string> & facet_fields,
                                   const std::vector<sort_by> & sort_fields, const int num_typos,
                                   const size_t per_page, const size_t page,
-                                  const token_ordering token_order, const bool prefix) {
+                                  const token_ordering token_order, const bool prefix,
+                                  const size_t drop_tokens_threshold) {
     std::vector<facet> facets;
 
     // validate search fields
@@ -430,7 +431,7 @@ Option<nlohmann::json> Collection::search(std::string query, const std::vector<s
     // send data to individual index threads
     for(Index* index: indices) {
         index->search_params = search_args(query, search_fields, filters, facets, sort_fields_std,
-                                           num_typos, per_page, page, token_order, prefix);
+                                           num_typos, per_page, page, token_order, prefix, drop_tokens_threshold);
         {
             std::lock_guard<std::mutex> lk(index->m);
             index->ready = true;
diff --git a/src/index.cpp b/src/index.cpp
index 6c09ee6c..08bf7b64 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -548,8 +548,8 @@ void Index::run_search() {
         search(search_params.outcome, search_params.query, search_params.search_fields,
                search_params.filters, search_params.facets,
                search_params.sort_fields_std, search_params.num_typos, search_params.per_page, search_params.page,
-               search_params.token_order, search_params.prefix, search_params.field_order_kvs,
-               search_params.all_result_ids_len, search_params.searched_queries);
+               search_params.token_order, search_params.prefix, search_params.drop_tokens_threshold,
+               search_params.field_order_kvs, search_params.all_result_ids_len, search_params.searched_queries);
 
         // hand control back to main thread
         processed = true;
@@ -565,7 +565,8 @@ void Index::search(Option<uint32_t> & outcome, std::string query, const std::vec
                              const std::vector<filter> & filters, std::vector<facet> & facets,
                              std::vector<sort_by> sort_fields_std, const int num_typos,
                              const size_t per_page, const size_t page, const token_ordering token_order,
-                             const bool prefix, std::vector<std::pair<int, Topster<512>::KV>> & field_order_kvs,
+                             const bool prefix, const size_t drop_tokens_threshold,
+                             std::vector<std::pair<int, Topster<512>::KV>> & field_order_kvs,
                              size_t & all_result_ids_len, std::vector<std::vector<art_leaf*>> & searched_queries) {
 
     const size_t num_results = (page * per_page);
@@ -591,7 +592,8 @@ void Index::search(Option<uint32_t> & outcome, std::string query, const std::vec
         // proceed to query search only when no filters are provided or when filtering produces results
         if(filters.size() == 0 || filter_ids_length > 0) {
             search_field(query, field, filter_ids, filter_ids_length, facets, sort_fields_std, num_typos, num_results,
-                         searched_queries, topster, &all_result_ids, all_result_ids_len, token_order, prefix);
+                         searched_queries, topster, &all_result_ids, all_result_ids_len, token_order, prefix,
+                         drop_tokens_threshold);
             topster.sort();
         }
 
@@ -623,7 +625,7 @@ void Index::search_field(std::string & query, const std::string & field, uint32_
                               std::vector<facet> & facets, const std::vector<sort_by> & sort_fields, const int num_typos,
                               const size_t num_results, std::vector<std::vector<art_leaf*>> & searched_queries,
                               Topster<512> &topster, uint32_t** all_result_ids, size_t & all_result_ids_len,
-                              const token_ordering token_order, const bool prefix) {
+                              const token_ordering token_order, const bool prefix, const size_t drop_tokens_threshold) {
     std::vector<std::string> tokens;
     StringUtils::split(query, tokens, " ");
 
@@ -747,7 +749,7 @@ void Index::search_field(std::string & query, const std::string & field, uint32_
     }
 
     // When there are not enough overall results and atleast one token has results
-    if(topster.size < Index::SEARCH_LIMIT_NUM && token_to_count.size() > 1) {
+    if(topster.size < drop_tokens_threshold && token_to_count.size() > 1) {
         // Drop token with least hits and try searching again
         std::string truncated_query;
 
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index 91008b72..6ab46195 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -206,6 +206,19 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 
+    // should not try to drop tokens to expand query
+    results.clear();
+    results = collection->search("the a", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY, false, 0).get();
+    ASSERT_EQ(3, results["hits"].size());
+    ids = {"8", "16", "10"};
+
+    for(size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string id = ids.at(i);
+        std::string result_id = result["document"]["id"];
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+
     results.clear();
     results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, sort_fields, 0, 10).get();
     ASSERT_EQ(0, results["hits"].size());

From 5a3482435bd3166f9bba7b50b7eb57eb06553066 Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishore@kishorelive.com>
Date: Thu, 12 Apr 2018 20:40:03 +0530
Subject: [PATCH 2/4] Refactor type definition of art node priority queue
 comparator.

---
 src/art.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/art.cpp b/src/art.cpp
index 2f099704..fc515ca5 100644
--- a/src/art.cpp
+++ b/src/art.cpp
@@ -905,15 +905,11 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r
                          std::vector<art_leaf *> &results) {
     printf("INSIDE art_topk_iter: root->type: %d\n", root->type);
 
-    std::priority_queue<art_node *, std::vector<const art_node *>,
-            std::function<bool(const art_node*, const art_node*)>> q;
+    std::priority_queue<art_node *, std::vector<const art_node *>, decltype(&compare_art_node_score_pq)> q;
 
     if(token_order == FREQUENCY) {
         q = std::priority_queue<art_node *, std::vector<const art_node *>,
-                std::function<bool(const art_node*, const art_node*)>>(compare_art_node_frequency_pq);
-    } else {
-        q = std::priority_queue<art_node *, std::vector<const art_node *>,
-                std::function<bool(const art_node*, const art_node*)>>(compare_art_node_score_pq);
+                decltype(&compare_art_node_frequency_pq)>(compare_art_node_frequency_pq);
     }
 
     q.push(root);

From 874b5beb898ddf0d47f1f4e7cb96aacba3e6ebc0 Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishore@kishorelive.com>
Date: Thu, 12 Apr 2018 20:43:49 +0530
Subject: [PATCH 3/4] Fix initialization of priority queue.

---
 src/art.cpp              | 3 ++-
 test/collection_test.cpp | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/art.cpp b/src/art.cpp
index fc515ca5..80a0cd28 100644
--- a/src/art.cpp
+++ b/src/art.cpp
@@ -905,7 +905,8 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r
                          std::vector<art_leaf *> &results) {
     printf("INSIDE art_topk_iter: root->type: %d\n", root->type);
 
-    std::priority_queue<art_node *, std::vector<const art_node *>, decltype(&compare_art_node_score_pq)> q;
+    std::priority_queue<art_node *, std::vector<const art_node *>,
+            decltype(&compare_art_node_score_pq)> q(compare_art_node_score_pq);
 
     if(token_order == FREQUENCY) {
         q = std::priority_queue<art_node *, std::vector<const art_node *>,
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index 6ab46195..5e4ccbd1 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -207,6 +207,10 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
     }
 
     // should not try to drop tokens to expand query
+    results.clear();
+    results = collection->search("the a", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY, false, 10).get();
+    ASSERT_EQ(8, results["hits"].size());
+
     results.clear();
     results = collection->search("the a", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY, false, 0).get();
     ASSERT_EQ(3, results["hits"].size());
@@ -219,6 +223,7 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 
+    // with no indexed word
     results.clear();
     results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, sort_fields, 0, 10).get();
     ASSERT_EQ(0, results["hits"].size());

From b186816ca57e13cb8637d47ad147f259ade765a5 Mon Sep 17 00:00:00 2001
From: Kishore Nallan <kishore@kishorelive.com>
Date: Fri, 13 Apr 2018 06:52:16 +0500
Subject: [PATCH 4/4] Respect drop_tokens_threshold even when a query token
 does not exist in the index.

---
 src/art.cpp              | 4 ++--
 src/index.cpp            | 9 +++++++--
 test/collection_test.cpp | 4 ++++
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/art.cpp b/src/art.cpp
index 80a0cd28..daa53a0a 100644
--- a/src/art.cpp
+++ b/src/art.cpp
@@ -905,11 +905,11 @@ int art_topk_iter(const art_node *root, token_ordering token_order, size_t max_r
                          std::vector<art_leaf *> &results) {
     printf("INSIDE art_topk_iter: root->type: %d\n", root->type);
 
-    std::priority_queue<art_node *, std::vector<const art_node *>,
+    std::priority_queue<const art_node *, std::vector<const art_node *>,
             decltype(&compare_art_node_score_pq)> q(compare_art_node_score_pq);
 
     if(token_order == FREQUENCY) {
-        q = std::priority_queue<art_node *, std::vector<const art_node *>,
+        q = std::priority_queue<const art_node *, std::vector<const art_node *>,
                 decltype(&compare_art_node_frequency_pq)>(compare_art_node_frequency_pq);
     }
 
diff --git a/src/index.cpp b/src/index.cpp
index 08bf7b64..2a73a6fe 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -331,7 +331,7 @@ void Index::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, st
         // every element in `query_suggestion` contains a token and its associated hits
         std::vector<art_leaf *> query_suggestion = next_suggestion(token_candidates_vec, n);
 
-        /*for(auto i=0; i < query_suggestion.size(); i++) {
+        /*for(size_t i=0; i < query_suggestion.size(); i++) {
             LOG(INFO) << "i: " << i << " - " << query_suggestion[i]->key;
         }*/
 
@@ -715,7 +715,12 @@ void Index::search_field(std::string & query, const std::string & field, uint32_
                 if(it != token_to_costs[token_index].end()) {
                     token_to_costs[token_index].erase(it);
 
-                    // no more costs left for this token, clean up
+                    // stop when no costs are left for this token and we already have at least `drop_tokens_threshold` results
+                    if(token_to_costs[token_index].empty() && topster.size >= drop_tokens_threshold) {
+                        break;
+                    }
+
+                    // otherwise, we try to drop the token and search with remaining tokens
                     if(token_to_costs[token_index].empty()) {
                         token_to_costs.erase(token_to_costs.begin()+token_index);
                         tokens.erase(tokens.begin()+token_index);
diff --git a/test/collection_test.cpp b/test/collection_test.cpp
index 5e4ccbd1..c503916d 100644
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -223,6 +223,10 @@ TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
 
+    results.clear();
+    results = collection->search("the a DoesNotExist", query_fields, "", facets, sort_fields, 0, 10, 1, FREQUENCY, false, 0).get();
+    ASSERT_EQ(0, results["hits"].size());
+
     // with no indexed word
     results.clear();
     results = collection->search("DoesNotExist1 DoesNotExist2", query_fields, "", facets, sort_fields, 0, 10).get();