Fixed multi-word queries.

Kishore Nallan 2016-09-12 09:54:34 +05:30
parent 2f26b95c5b
commit e7c6c6d3cb
7 changed files with 161 additions and 125 deletions

View File

@@ -5,7 +5,7 @@
**Search index**
- ~~Proper JSON as input~~
- Storing raw JSON input to RocksDB
- ~~Storing raw JSON input to RocksDB~~
- ART for every indexed field
- UTF-8 support for fuzzy search
- Facets
@@ -29,4 +29,8 @@
**Refactoring**
- ~~`token_count` in leaf is redundant: can be accessed from value~~
- ~~storing length in `offsets` is redundant: it can be found by looking up value of the next index in offset_index~~
**Tech debt**
- Use GLOB file pattern for CMake (better IDE refactoring support)

View File

@@ -12,96 +12,110 @@
#define D(x)
#endif
struct MatchScore {
struct TokenPosition {
uint8_t token_id; // token identifier
uint16_t position; // token's position in the text
uint16_t position_index; // index of the position in the vector
#define TokenOffsetHeap std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset>
bool operator() (const TokenPosition& a, const TokenPosition& b) {
return a.position > b.position;
struct MatchScore {
struct TokenOffset {
uint8_t token_id; // token identifier
uint16_t offset; // token's offset in the text
uint16_t offset_index; // index of the offset in the vector
bool operator() (const TokenOffset& a, const TokenOffset& b) {
return a.offset > b.offset;
}
};
#define addTopOfHeapToWindow(heap,q,token_positions,token_pos) {\
TokenPosition top = heap.top();\
heap.pop();\
q.push(top);\
token_pos[top.token_id] = top.position; \
top.position_index++;\
/* Must refill the heap - push the next position of the same token */\
if(top.position_index < token_positions[top.token_id].size()) {\
heap.push(TokenPosition{top.token_id, token_positions[top.token_id][top.position_index], top.position_index});\
}\
static void print_token_offsets(std::vector<std::vector<uint16_t>> &token_offsets) {
for(auto offsets: token_offsets) {
for(auto offset: offsets) {
std::cout << offset << ", ";
}
std::cout << std::endl;
}
}
static inline void addTopOfHeapToWindow(TokenOffsetHeap &heap, std::queue<TokenOffset> &window,
std::vector<std::vector<uint16_t>> &token_offsets, uint16_t *token_offset) {
TokenOffset top = heap.top();
heap.pop();
window.push(top);
token_offset[top.token_id] = top.offset;
top.offset_index++;
// Must refill the heap - push the next offset of the same token
if(top.offset_index < token_offsets[top.token_id].size()) {
heap.push(TokenOffset{top.token_id, token_offsets[top.token_id][top.offset_index], top.offset_index});
}
}
uint16_t words_present;
uint16_t distance;
/*
* Given *sorted positions* of each target token in a *single* document, generates a score that indicates:
* Given *sorted offsets* of each target token in a *single* document, generates a score that indicates:
* a) How many tokens are present in the document
* b) The proximity between the tokens in the document
*
* We use a priority queue to read the position vectors in a sorted manner, slide a window of a given size, and
* We use a priority queue to read the offset vectors in a sorted manner, slide a window of a given size, and
* compute the max_match and min_displacement of target tokens across the windows.
*/
static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_positions) {
static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
const size_t WINDOW_SIZE = 20;
const size_t MAX_TOKENS_IN_A_QUERY = 20;
const uint16_t MAX_UINT_16 = std::numeric_limits<uint16_t>::max();
const uint16_t MAX_DISPLACEMENT = 20;
std::priority_queue<TokenPosition, std::vector<TokenPosition>, TokenPosition> heap;
std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
for(uint8_t token_id=0; token_id < token_positions.size(); token_id++) {
heap.push(TokenPosition{token_id, token_positions[token_id].front(), 0});
for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
heap.push(TokenOffset{token_id, token_offsets[token_id].front(), 0});
}
// heap now contains the first occurring position of each token in the given document
// heap now contains the first occurring offset of each token in the given document
uint16_t max_match = 1;
uint16_t min_displacement = UINT16_MAX;
uint16_t min_displacement = MAX_DISPLACEMENT;
std::queue<TokenPosition> q;
uint16_t token_pos[MAX_TOKENS_IN_A_QUERY] = { };
std::fill_n(token_pos, MAX_TOKENS_IN_A_QUERY, MAX_UINT_16);
std::queue<TokenOffset> window;
uint16_t token_offset[MAX_TOKENS_IN_A_QUERY] = { };
std::fill_n(token_offset, MAX_TOKENS_IN_A_QUERY, MAX_DISPLACEMENT);
do {
if(q.empty()) {
addTopOfHeapToWindow(heap, q, token_positions, token_pos);
if(window.empty()) {
addTopOfHeapToWindow(heap, window, token_offsets, token_offset);
}
D(cout << "Loop till window fills..." << endl;)
D(std::cout << "Loop till window fills... doc_id: " << doc_id << std::endl;)
// Fill the queue with tokens within a given window frame size of the start position
// Fill the queue with tokens within a given window frame size of the start offset
// At the same time, we also record the *last* occurrence of each token within the window
// For e.g. if `cat` appeared at positions 1,3 and 5, we will record `token_pos[cat] = 5`
const uint16_t start_pos = q.front().position;
while(!heap.empty() && heap.top().position < start_pos+WINDOW_SIZE) {
addTopOfHeapToWindow(heap, q, token_positions, token_pos);
// For e.g. if `cat` appeared at offsets 1,3 and 5, we will record `token_offset[cat] = 5`
const uint16_t start_offset = window.front().offset;
while(!heap.empty() && heap.top().offset < start_offset+WINDOW_SIZE) {
addTopOfHeapToWindow(heap, window, token_offsets, token_offset);
}
D(cout << endl << "----" << endl);
D(std::cout << std::endl << "----" << std::endl);
uint16_t prev_pos = MAX_UINT_16;
uint16_t prev_pos = MAX_DISPLACEMENT;
uint16_t num_match = 0;
uint16_t displacement = 0;
for(size_t token_id=0; token_id<token_positions.size(); token_id++) {
// If a token appeared within the window, we would have recorded its position
if(token_pos[token_id] != MAX_UINT_16) {
for(size_t token_id=0; token_id<token_offsets.size(); token_id++) {
// If a token appeared within the window, we would have recorded its offset
if(token_offset[token_id] != MAX_DISPLACEMENT) {
num_match++;
if(prev_pos == MAX_UINT_16) prev_pos = token_pos[token_id];
if(prev_pos == MAX_DISPLACEMENT) prev_pos = token_offset[token_id];
else {
// Calculate the distance between the tokens within the window
// Ideally, this should be (NUM_TOKENS - 1) when all the tokens are adjacent to each other
D(cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_pos[token_id] << endl);
displacement += abs(token_pos[token_id]-prev_pos);
prev_pos = token_pos[token_id];
D(std::cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_offset[token_id] << std::endl);
displacement += abs(token_offset[token_id]-prev_pos);
prev_pos = token_offset[token_id];
}
}
}
D(cout << endl << "!!!displacement: " << displacement << " | num_match: " << num_match << endl);
D(std::cout << std::endl << "!!!displacement: " << displacement << " | num_match: " << num_match << std::endl);
// Track the best `displacement` and `num_match` seen so far across all the windows
if(num_match >= max_match) {
@@ -112,8 +126,8 @@ struct MatchScore {
}
// As we slide the window, drop the first token of the window from the computation
token_pos[q.front().token_id] = 0;
q.pop();
token_offset[window.front().token_id] = 0;
window.pop();
} while(!heap.empty());
return MatchScore{max_match, min_displacement};
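
The comment block above compresses the whole scoring idea into two sentences, so a condensed, self-contained illustration may help. The sketch below is not the code from this commit: it reimplements the same technique (a min-heap that merges each token's sorted offsets, a 20-position window, and the best words_present / smallest displacement kept across windows) in a simplified form that evaluates successive windows instead of sliding one token at a time; all names are made up for the example.

#include <cstdint>
#include <cstdlib>
#include <limits>
#include <queue>
#include <utility>
#include <vector>

struct Hit { size_t token_id; uint16_t offset; size_t next; };
struct ByOffset { bool operator()(const Hit &a, const Hit &b) const { return a.offset > b.offset; } };

// token_offsets[t] holds the sorted offsets of query token t in one document
// (every token is assumed to have at least one offset, as in the real code).
static std::pair<uint16_t, uint16_t> sketch_match_score(const std::vector<std::vector<uint16_t>> &token_offsets) {
    const uint16_t WINDOW = 20, NOT_SEEN = std::numeric_limits<uint16_t>::max();
    std::priority_queue<Hit, std::vector<Hit>, ByOffset> heap;   // min-heap ordered by offset
    for (size_t t = 0; t < token_offsets.size(); t++)
        heap.push({t, token_offsets[t].front(), 1});             // seed with each token's first offset

    uint16_t best_match = 1, best_disp = WINDOW;
    while (!heap.empty()) {
        std::vector<uint16_t> last(token_offsets.size(), NOT_SEEN);
        const uint16_t start = heap.top().offset;
        // Pull every offset that falls inside [start, start + WINDOW),
        // remembering each token's last occurrence within the window.
        while (!heap.empty() && heap.top().offset < start + WINDOW) {
            Hit h = heap.top(); heap.pop();
            last[h.token_id] = h.offset;
            if (h.next < token_offsets[h.token_id].size())       // refill with the token's next offset
                heap.push({h.token_id, token_offsets[h.token_id][h.next], h.next + 1});
        }
        uint16_t matched = 0, disp = 0, prev = NOT_SEEN;
        for (uint16_t off : last) {
            if (off == NOT_SEEN) continue;
            matched++;
            if (prev != NOT_SEEN) disp += (uint16_t) std::abs(off - prev);
            prev = off;
        }
        if (matched > best_match || (matched == best_match && disp < best_disp)) {
            best_match = matched;
            best_disp = disp;
        }
    }
    return {best_match, best_disp};                              // roughly {words_present, distance}
}

For example, sketch_match_score({{1, 8}, {2}}) returns {2, 6}: both tokens land in one window and their recorded (last) occurrences, 8 and 2, are 6 apart.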

View File

@@ -7,6 +7,7 @@
#include <match_score.h>
#include <string_utils.h>
#include "sole.hpp"
#include "art.h"
#include "json.hpp"
Collection::Collection(std::string state_dir_path): seq_id(0) {
@@ -85,80 +86,90 @@ void Collection::add(std::string json_str) {
4. Intersect the lists to find docs that match each phrase
5. Sort the docs based on some ranking criteria
*/
std::vector<nlohmann::json> Collection::search(std::string query, const int num_typos, const size_t max_results) {
std::vector<nlohmann::json> Collection::search(std::string query, const int num_typos, const size_t num_results) {
std::vector<std::string> tokens;
StringUtils::tokenize(query, tokens, " ", true);
const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
const size_t max_results = std::min(num_results, (size_t) 100);
std::cout << "Searching with max_cost=" << max_cost << std::endl;
std::vector<std::vector<art_leaf*>> token_leaves;
for(std::string token: tokens) {
std::vector<art_leaf*> leaves;
art_fuzzy_results(&t, (const unsigned char *) token.c_str(), (int) token.length() + 1, max_cost, 10, leaves);
if(!leaves.empty()) {
for(auto i=0; i<leaves.size(); i++) {
//printf("%s - ", token.c_str());
//printf("%.*s", leaves[i]->key_len, leaves[i]->key);
//printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->values->ids.getLength());
}
token_leaves.push_back(leaves);
}
}
if(token_leaves.size() == 0) {
return std::vector<nlohmann::json>();
}
//std::cout << "token_leaves.size = " << token_leaves.size() << std::endl;
Topster<100> topster;
int cost = 0;
size_t total_results = 0;
const size_t combination_limit = 10;
auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
for(long long n=0; n<N && n<combination_limit; ++n) {
// every element in `query_suggestion` represents a token and its associated hits
std::vector<art_leaf *> query_suggestion = _next_suggestion(token_leaves, n);
// initialize results with the starting element (for further intersection)
uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
size_t result_size = query_suggestion[0]->values->ids.getLength();
if(result_size == 0) continue;
// intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
for(auto i=1; i < query_suggestion.size(); i++) {
uint32_t* out = new uint32_t[result_size];
uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
delete result_ids;
delete curr;
result_ids = out;
}
// go through each matching document id and calculate match score
score_results(topster, query_suggestion, result_ids, result_size);
total_results += result_size;
delete result_ids;
if(total_results >= max_results) break;
}
topster.sort();
std::vector<nlohmann::json> results;
for(uint32_t i=0; i<topster.size; i++) {
uint32_t id = topster.getKeyAt(i);
std::cout << "ID: " << id << std::endl;
while(cost <= max_cost) {
std::cout << "Searching with cost=" << cost << std::endl;
const std::string value = store->get(std::to_string(id));
nlohmann::json document = nlohmann::json::parse(value);
results.push_back(document);
std::vector<std::vector<art_leaf*>> token_leaves;
for(std::string token: tokens) {
std::vector<art_leaf*> leaves;
art_fuzzy_results(&t, (const unsigned char *) token.c_str(), (int) token.length() + 1, cost, 3, leaves);
if(!leaves.empty()) {
for(auto i=0; i<leaves.size(); i++) {
//printf("%s - ", token.c_str());
//printf("%.*s", leaves[i]->key_len, leaves[i]->key);
//printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->values->ids.getLength());
}
token_leaves.push_back(leaves);
}
}
if(token_leaves.size() != tokens.size()) {
//std::cout << "token_leaves.size() != tokens.size(), continuing..." << std::endl << std::endl;
cost++;
continue;
}
Topster<100> topster;
const size_t combination_limit = 10;
auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
for(long long n=0; n<N && n<combination_limit; ++n) {
// every element in `query_suggestion` represents a token and its associated hits
std::vector<art_leaf *> query_suggestion = _next_suggestion(token_leaves, n);
// initialize results with the starting element (for further intersection)
uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
size_t result_size = query_suggestion[0]->values->ids.getLength();
if(result_size == 0) continue;
// intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
for(auto i=1; i < query_suggestion.size(); i++) {
uint32_t* out = new uint32_t[result_size];
uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
delete result_ids;
delete curr;
result_ids = out;
}
// go through each matching document id and calculate match score
score_results(topster, query_suggestion, result_ids, result_size);
total_results += result_size;
delete result_ids;
if(total_results >= max_results) break;
}
topster.sort();
for(uint32_t i=0; i<topster.size; i++) {
uint32_t id = topster.getKeyAt(i);
std::cout << "ID: " << id << std::endl;
const std::string value = store->get(std::to_string(id));
nlohmann::json document = nlohmann::json::parse(value);
results.push_back(document);
}
if(total_results > 0) {
break;
}
cost++;
}
return results;
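
Two details of the loop above are easy to miss. First, `N` is the number of possible query suggestions: the product of each token's candidate-leaf counts, computed with `std::accumulate` (e.g. two tokens with 3 and 4 fuzzy matches give N = 12, of which at most `combination_limit` = 10 are tried). Second, the document lists of a suggestion's tokens are narrowed one by one with `Intersection::scalar`, whose body is not part of this diff; the following is a minimal sketch of what such a scalar (two-pointer) intersection of sorted id lists typically looks like, with an assumed signature shaped after the call site.

#include <cstddef>
#include <cstdint>

// Intersects two sorted uint32_t id lists, writes the common ids into `out`
// (which must hold at least min(a_len, b_len) entries) and returns the count.
static size_t scalar_intersect(const uint32_t *a, size_t a_len,
                               const uint32_t *b, size_t b_len, uint32_t *out) {
    size_t i = 0, j = 0, k = 0;
    while (i < a_len && j < b_len) {
        if (a[i] < b[j]) i++;
        else if (b[j] < a[i]) j++;
        else { out[k++] = a[i]; i++; j++; }   // id present in both lists
    }
    return k;
}
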
@@ -176,10 +187,10 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
uint32_t doc_index = token_leaf->values->ids.indexOf(doc_id);
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
(token_leaf->values->offsets.getLength() - 1) :
token_leaf->values->offsets.getLength() :
token_leaf->values->offset_index.at(doc_index+1);
while(start_offset <= end_offset) {
while(start_offset < end_offset) {
positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
start_offset++;
}
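
The change above makes `end_offset` uniformly exclusive. Previously, for every document except the last one, the bound was the next document's starting index combined with `<=`, so the loop also picked up the first offset of the following document. A small runnable illustration with made-up data:

#include <cstdint>
#include <iostream>

int main() {
    // Hypothetical flattened storage for one token that appears in two documents:
    // doc #0 at offsets 4 and 9, doc #1 at offset 2.
    uint16_t offsets[]      = {4, 9, 2};   // all offsets, concatenated document by document
    uint32_t offset_index[] = {0, 2};      // where each document's slice begins in `offsets`
    const uint32_t num_docs = 2, total_offsets = 3;

    for (uint32_t doc = 0; doc < num_docs; doc++) {
        uint32_t start = offset_index[doc];
        uint32_t end = (doc == num_docs - 1) ? total_offsets : offset_index[doc + 1];  // exclusive, as in the fix
        std::cout << "doc " << doc << ":";
        for (uint32_t i = start; i < end; i++) std::cout << " " << offsets[i];
        std::cout << std::endl;              // doc 0: 4 9   |   doc 1: 2
    }
    // With the old inclusive bound, doc 0 would also have printed doc 1's offset 2.
}
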
@@ -190,10 +201,12 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
MatchScore mscore = MatchScore::match_score(doc_id, token_positions);
const uint32_t cumulativeScore = ((uint32_t)(mscore.words_present * 16 + (20 - mscore.distance)) * 64000) + doc_scores.at(doc_id);
/*std::cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: "
<< (int)mscore.distance << " - mscore.words_present: " << (int)mscore.words_present
<< " - doc_scores[doc_id]: " << (int)doc_scores[doc_id] << " - cumulativeScore: "
<< cumulativeScore << std::endl;*/
/*
std::cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: "
<< (int) mscore.distance << " - mscore.words_present: " << (int) mscore.words_present
<< " - doc_scores[doc_id]: " << (int) doc_scores.at(doc_id) << " - cumulativeScore: "
<< cumulativeScore << std::endl;
*/
topster.add(doc_id, cumulativeScore);
}
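
The ranking formula above packs match quality into the high-order part of the score so that it always dominates a document's static score: `words_present` is weighted 16x, proximity contributes up to 20, and the sum is scaled by 64000 before `doc_scores` is added. A small worked example with made-up inputs:

#include <cstdint>
#include <iostream>

int main() {
    // Assumed inputs: both query tokens present, 1 position apart, static document score 15.
    uint16_t words_present = 2, distance = 1;
    uint32_t doc_score = 15;

    uint32_t cumulative = ((uint32_t)(words_present * 16 + (20 - distance)) * 64000) + doc_score;
    std::cout << cumulative << std::endl;   // (2*16 + 19) * 64000 + 15 = 3,264,015

    // A single-token match can never outrank it, even with a perfect distance of 0:
    // (1*16 + 20) * 64000 = 2,304,000.
}

Because adjacent match-quality buckets differ by at least 64000, a static `doc_scores` value below 64000 can only break ties between documents with the same words_present and distance, never reorder them across buckets.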

View File

@@ -25,7 +25,7 @@ public:
Collection(std::string state_dir_path);
~Collection();
void add(std::string json_str);
std::vector<nlohmann::json> search(std::string query, const int num_typos, const size_t max_results);
std::vector<nlohmann::json> search(std::string query, const int num_typos, const size_t num_results);
static inline std::vector<art_leaf *> _next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
long long int n);

View File

@@ -26,7 +26,7 @@ int main() {
cout << "FINISHED INDEXING!" << endl << flush;
auto begin = std::chrono::high_resolution_clock::now();
collection->search("platn", 1, 100);
collection->search("platn growing", 1, 100);
long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
cout << "Time taken: " << timeMillis << "us" << endl;
delete collection;

View File

@@ -16,6 +16,7 @@
#include <regex>
#include "string_utils.h"
#include "collection.h"
#include <sys/resource.h>
#include "h2o.h"
#include "h2o/http1.h"
@@ -82,6 +83,10 @@ static int chunked_test(h2o_handler_t *self, h2o_req_t *req) {
std::string json_str = json_array.dump();
struct rusage r_usage;
getrusage(RUSAGE_SELF,&r_usage);
std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl;
std::cout << "JSON:" << json_str << std::endl;
h2o_iovec_t body = h2o_strdup(&req->pool, json_str.c_str(), SIZE_MAX);
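
The `getrusage` call added above logs `ru_maxrss` (peak resident set size) on every request. One caveat: the field's unit is platform-dependent, kilobytes on Linux but bytes on macOS, so a sketch that normalizes the value before logging might look like this (illustrative only, not part of the commit):

#include <iostream>
#include <sys/resource.h>

// Returns the peak resident set size in kilobytes, whatever the platform reports.
static long peak_rss_kb() {
    struct rusage r_usage;
    getrusage(RUSAGE_SELF, &r_usage);
#ifdef __APPLE__
    return r_usage.ru_maxrss / 1024;   // macOS reports bytes
#else
    return r_usage.ru_maxrss;          // Linux reports kilobytes
#endif
}

int main() {
    std::cout << "Memory usage (KB): " << peak_rss_kb() << std::endl;
}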

View File

@@ -1,4 +1,4 @@
{"points":15,"title":"How are cryogenic rocket plan propellants delivered to the launch pad?"}
{"points":15,"title":"How are cryogenic rocket plant propellants delivered to the growing launch pad?"}
{"points":14,"title":"Are there any (free) are online data archives for data from instruments on Soviet / Russian missions?"}
{"points":13,"title":"Where should I look in ISS to find mouldy food?"}
{"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"}