From ba33da1d51a6b84384aab9d13300181e59f2c083 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Sun, 7 Aug 2016 14:55:26 -0700
Subject: [PATCH] Lots of code clean up.

* Move stuff out of main to classes
* Standardize naming conventions.
---
 CMakeLists.txt                            |   4 +-
 README.md                                 |   5 +
 include/{IdGenerator.h => id_generator.h} |   0
 include/match_score.h                     | 121 ++++++++++
 include/matchscore.h                      | 121 ----------
 include/string_utils.h                    |  44 ++++
 include/topster.h                         |   5 +-
 include/util.h                            |  36 ---
 src/main.cpp                              | 267 +---------------------
 src/search_index.cpp                      | 159 +++++++++++++
 src/search_index.h                        |  18 ++
 11 files changed, 363 insertions(+), 417 deletions(-)
 rename include/{IdGenerator.h => id_generator.h} (100%)
 create mode 100644 include/match_score.h
 delete mode 100644 include/matchscore.h
 create mode 100644 include/string_utils.h
 delete mode 100644 include/util.h
 create mode 100644 src/search_index.cpp
 create mode 100644 src/search_index.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7e1eb54..d1be8006 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,5 +7,5 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -stdlib=libc++ -std=gnu
 include_directories(include)
 include_directories(external/for)
 
-add_executable(search src/art.cpp src/intersection.cpp src/main.cpp)
-target_link_libraries(search ${CMAKE_SOURCE_DIR}/external/for/libfor.a)
+add_executable(search src/art.cpp src/intersection.cpp src/main.cpp src/search_index.cpp src/search_index.h)
+target_link_libraries(search ${CMAKE_SOURCE_DIR}/external/for/libfor.a boost_system)
diff --git a/README.md b/README.md
index df888421..df83fb43 100644
--- a/README.md
+++ b/README.md
@@ -6,4 +6,9 @@ A typo tolerant, open source search engine that helps you build delightful searc
 
 * [libfor](https://github.com/cruppstahl/for/)
 
+## Building
+
+* Switch to `external/libfor` and build libfor
+* Install `boost`
+
 © 2016 Wreally Studios Inc.
\ No newline at end of file
diff --git a/include/IdGenerator.h b/include/id_generator.h
similarity index 100%
rename from include/IdGenerator.h
rename to include/id_generator.h
diff --git a/include/match_score.h b/include/match_score.h
new file mode 100644
index 00000000..67002bb8
--- /dev/null
+++ b/include/match_score.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+#include <queue>
+#include <cstdlib>
+#include <limits>
+
+#ifdef DEBUG
+#define D(x) x
+#else
+#define D(x)
+#endif
+
+struct MatchScore {
+    struct TokenPosition {
+        uint8_t token_id;         // token identifier
+        uint16_t position;        // token's position in the text
+        uint16_t position_index;  // index of the position in the vector
+
+        bool operator() (const TokenPosition& a, const TokenPosition& b) {
+            return a.position > b.position;
+        }
+    };
+
+    #define addTopOfHeapToWindow(heap,q,token_positions,token_pos) {\
+        TokenPosition top = heap.top();\
+        heap.pop();\
+        q.push(top);\
+        token_pos[top.token_id] = top.position; \
+        top.position_index++;\
+        /* Must refill the heap - push the next position of the same token */\
+        if(top.position_index < token_positions[top.token_id].size()) {\
+            heap.push(TokenPosition{top.token_id, token_positions[top.token_id][top.position_index], top.position_index});\
+        }\
+    }
+
+    uint16_t words_present;
+    uint16_t distance;
+
+    /*
+    *  Given *sorted positions* of each target token in a *single* document, generates a score that indicates:
+    *  a) How many tokens are present in the document
+    *  b) The proximity between the tokens in the document
+    *
+    *  We use a priority queue to read the position vectors in a sorted manner, slide a window of a given size, and
+    *  compute the max_match and min_displacement of target tokens across the windows.
+    */
+    static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_positions) {
+        const size_t WINDOW_SIZE = 20;
+        const size_t MAX_TOKENS_IN_A_QUERY = 20;
+        const uint16_t MAX_UINT_16 = std::numeric_limits<uint16_t>::max();
+
+        std::priority_queue<TokenPosition, std::vector<TokenPosition>, TokenPosition> heap;
+
+        for(uint8_t token_id=0; token_id < token_positions.size(); token_id++) {
+            heap.push(TokenPosition{token_id, token_positions[token_id].front(), 0});
+        }
+
+        // heap now contains the first occurring position of each token in the given document
+
+        uint16_t max_match = 1;
+        uint16_t min_displacement = UINT16_MAX;
+
+        std::queue<TokenPosition> q;
+        uint16_t token_pos[MAX_TOKENS_IN_A_QUERY] = { };
+        std::fill_n(token_pos, MAX_TOKENS_IN_A_QUERY, MAX_UINT_16);
+
+        do {
+            if(q.empty()) {
+                addTopOfHeapToWindow(heap, q, token_positions, token_pos);
+            }
+
+            D(cout << "Loop till window fills..." << endl;)
+
+            // Fill the queue with tokens within a given window frame size of the start position
+            // At the same time, we also record the *last* occurrence of each token within the window
+            // For e.g. if `cat` appeared at positions 1,3 and 5, we will record `token_pos[cat] = 5`
+            const uint16_t start_pos = q.front().position;
+            while(!heap.empty() && heap.top().position < start_pos+WINDOW_SIZE) {
+                addTopOfHeapToWindow(heap, q, token_positions, token_pos);
+            }
+
+            D(cout << endl << "----" << endl);
+
+            uint16_t prev_pos = MAX_UINT_16;
+            uint16_t num_match = 0;
+            uint16_t displacement = 0;
+
+            for(size_t token_id=0; token_id<token_positions.size(); token_id++) {
+                // If the token is present in the window, we would have recorded its last position within it
+                if(token_pos[token_id] != MAX_UINT_16) {
+                    num_match++;
+                    if(prev_pos == MAX_UINT_16) prev_pos = token_pos[token_id];
+                    else {
+                        // add the distance between the current token and the previous token in the window
+                        displacement += abs(token_pos[token_id]-prev_pos);
+                        prev_pos = token_pos[token_id];
+                    }
+                }
+            }
+
+            // Track the best match and displacement seen across all the windows so far
+            if(num_match >= max_match) {
+                max_match = num_match;
+                if(displacement != 0 && displacement < min_displacement) {
+                    min_displacement = displacement;
+                }
+            }
+
+            // As we slide the window, drop the first token of the window from the computation
+            token_pos[q.front().token_id] = 0;
+            q.pop();
+        } while(!heap.empty());
+
+        return MatchScore{max_match, min_displacement};
+    }
+};
diff --git a/include/matchscore.h b/include/matchscore.h
deleted file mode 100644
index 50e68556..00000000
--- a/include/matchscore.h
+++ /dev/null
@@ -1,121 +0,0 @@
-#pragma once
-
-#include <cstdint>
-#include <vector>
-#include <queue>
-#include <cstdlib>
-#include <limits>
-
-#ifdef DEBUG
-#define D(x) x
-#else
-#define D(x)
-#endif
-
-struct TokenPosition {
-    uint8_t token_id;         // token identifier
-    uint16_t position;        // token's position in the text
-    uint16_t position_index;  // index of the position in the vector
-
-    bool operator() (const TokenPosition& a, const TokenPosition& b) {
-        return a.position > b.position;
-    }
-};
-
-struct MatchScore {
-    uint16_t words_present;
-    uint16_t distance;
-};
-
-#define addTopOfHeapToWindow(heap,q,token_positions,token_pos) {\
-    TokenPosition top = heap.top();\
-    heap.pop();\
-    q.push(top);\
-    token_pos[top.token_id] = top.position; \
-    top.position_index++;\
-    /* Must refill the heap - push the next position of the same token */\
-    if(top.position_index < token_positions[top.token_id].size()) {\
-        heap.push(TokenPosition{top.token_id, token_positions[top.token_id][top.position_index], top.position_index});\
-    }\
-}
-/*
- * Given *sorted positions* of each target token in a *single* document, generates a score that indicates:
- * a) How many tokens are present in the document
- * b) The proximity between the tokens in the document
- *
- * We use a priority queue to read the position vectors in a sorted manner, slide a window of a given size, and
- * compute the max_match and min_displacement of target tokens across the windows.
- */
-MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_positions) {
-    const size_t WINDOW_SIZE = 20;
-    const size_t MAX_TOKENS_IN_A_QUERY = 20;
-    const uint16_t MAX_UINT_16 = std::numeric_limits<uint16_t>::max();
-
-    std::priority_queue<TokenPosition, std::vector<TokenPosition>, TokenPosition> heap;
-
-    for(uint8_t token_id=0; token_id < token_positions.size(); token_id++) {
-        heap.push(TokenPosition{token_id, token_positions[token_id].front(), 0});
-    }
-
-    // heap now contains the first occurring position of each token in the given document
-
-    uint16_t max_match = 1;
-    uint16_t min_displacement = UINT16_MAX;
-
-    std::queue<TokenPosition> q;
-    uint16_t token_pos[MAX_TOKENS_IN_A_QUERY] = { };
-    std::fill_n(token_pos, MAX_TOKENS_IN_A_QUERY, MAX_UINT_16);
-
-    do {
-        if(q.empty()) {
-            addTopOfHeapToWindow(heap, q, token_positions, token_pos);
-        }
-
-        D(cout << "Loop till window fills..." << endl;)
-
-        // Fill the queue with tokens within a given window frame size of the start position
-        // At the same time, we also record the *last* occurrence of each token within the window
-        // For e.g. if `cat` appeared at positions 1,3 and 5, we will record `token_pos[cat] = 5`
-        const uint16_t start_pos = q.front().position;
-        while(!heap.empty() && heap.top().position < start_pos+WINDOW_SIZE) {
-            addTopOfHeapToWindow(heap, q, token_positions, token_pos);
-        }
-
-        D(cout << endl << "----" << endl);
-
-        uint16_t prev_pos = MAX_UINT_16;
-        uint16_t num_match = 0;
-        uint16_t displacement = 0;
-
-        for(size_t token_id=0; token_id<token_positions.size(); token_id++) {
-            // If the token is present in the window, we would have recorded its last position within it
-            if(token_pos[token_id] != MAX_UINT_16) {
-                num_match++;
-                if(prev_pos == MAX_UINT_16) prev_pos = token_pos[token_id];
-                else {
-                    // add the distance between the current token and the previous token in the window
-                    displacement += abs(token_pos[token_id]-prev_pos);
-                    prev_pos = token_pos[token_id];
-                }
-            }
-        }
-
-        // Track the best match and displacement seen across all the windows so far
-        if(num_match >= max_match) {
-            max_match = num_match;
-            if(displacement != 0 && displacement < min_displacement) {
-                min_displacement = displacement;
-            }
-        }
-
-        // As we slide the window, drop the first token of the window from the computation
-        token_pos[q.front().token_id] = 0;
-        q.pop();
-    } while(!heap.empty());
-
-    return MatchScore{max_match, min_displacement};
-}
diff --git a/include/string_utils.h b/include/string_utils.h
new file mode 100644
index 00000000..5afc87e1
--- /dev/null
+++ b/include/string_utils.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <string>
+
+struct StringUtils {
+
+    template<class ContainerT>
+    static void tokenize(const std::string &str, ContainerT &tokens,
+                         const std::string &delimiters = " ", bool trimEmpty = false) {
+        std::string::size_type pos, lastPos = 0;
+
+        using value_type = typename ContainerT::value_type;
+        using size_type = typename ContainerT::size_type;
+
+        while (true) {
+            pos = str.find_first_of(delimiters, lastPos);
+            if (pos == std::string::npos) {
+                pos = str.length();
+
+                if (pos != lastPos || !trimEmpty)
+                    tokens.push_back(value_type(str.data() + lastPos,
+                                                (size_type) pos - lastPos));
+
+                break;
+            }
+            else {
+                if (pos != lastPos || !trimEmpty)
+                    tokens.push_back(value_type(str.data() + lastPos,
+                                                (size_type) pos - lastPos));
+            }
+
+            lastPos = pos + 1;
+        }
+    }
+
+    static std::string replace_all(std::string str, const std::string &from, const std::string &to) {
+        size_t start_pos = 0;
+        while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
+            str.replace(start_pos, from.length(), to);
+            start_pos += to.length(); // Handles case where 'to' is a substring of 'from'
+        }
+        return str;
+    }
+};
\ No newline at end of file
diff --git a/include/topster.h b/include/topster.h
index 62a22d03..714aa2ea 100644
--- a/include/topster.h
+++ b/include/topster.h
@@ -5,10 +5,11 @@
 #include <cstdint>
 #include <cstdio>
 
+/*
+* A bounded max heap that remembers the top-K elements seen so far
+*/
 template <size_t MAX_SIZE>
 struct Topster {
-    // A bounded max heap that remembers the top-K elements seen so far
-
     uint64_t data[MAX_SIZE];
     uint32_t smallest_index = 0;
     uint32_t size = 0;
diff --git a/include/util.h b/include/util.h
deleted file mode 100644
index 0ec50cda..00000000
--- a/include/util.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#pragma once
-
-#include <string>
-
-template < class ContainerT >
-void tokenize(const std::string& str, ContainerT& tokens,
-              const std::string& delimiters = " ", bool trimEmpty = false)
-{
-    std::string::size_type pos, lastPos = 0;
-
-    using value_type = typename ContainerT::value_type;
-    using size_type = typename ContainerT::size_type;
-
-    while(true)
-    {
-        pos = str.find_first_of(delimiters, lastPos);
-        if(pos == std::string::npos)
-        {
-            pos = str.length();
-
-            if(pos != lastPos || !trimEmpty)
-                tokens.push_back(value_type(str.data()+lastPos,
-                                            (size_type)pos-lastPos ));
-
-            break;
-        }
-        else
-        {
-            if(pos != lastPos || !trimEmpty)
-                tokens.push_back(value_type(str.data()+lastPos,
-                                            (size_type)pos-lastPos ));
-        }
-
-        lastPos = pos + 1;
-    }
-}
diff --git a/src/main.cpp b/src/main.cpp
index 55c48d0f..6f97d557 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,225 +1,20 @@
 #include <iostream>
 #include <fstream>
 #include <vector>
-#include <unordered_map>
 #include <chrono>
-#include <algorithm>
 #include <cstdlib>
-#include <numeric>
 #include <string>
 #include <art.h>
-#include "topster.h"
-#include "intersection.h"
-#include "matchscore.h"
-#include "util.h"
+#include "string_utils.h"
+#include "crow_all.h"
+#include "search_index.h"
 
 using namespace std;
 
-static int test_prefix_cb(void *data, const unsigned char *k, uint32_t k_len, void *val) {
-    cout << "#>>>>Key: ";
-    printf("%.*s", k_len, k);
-    cout << "LENGTH OF IDS: " << ((art_values*)val)->ids.getLength() << endl;
-
-    for(uint32_t i=0; i<((art_values*)val)->ids.getLength(); i++) {
-        cout << ", ID: " << ((art_values*)val)->ids.at(i) << endl;
-    }
-    return 0;
-}
-
-void benchmark_heap_array() {
-    srand (time(NULL));
-
-    vector<uint64_t> records;
-
-    for(uint32_t i=0; i<10000000; i++) {
-        records.push_back((const unsigned int &) rand());
-    }
-
-    vector<uint64_t> hits;
-
-    for(uint32_t i=0; i<records.size(); i+=10) {
-        hits.push_back(records[i]);
-    }
-
-    auto begin = std::chrono::high_resolution_clock::now();
-    Topster<1000> heapArray;
-
-    for(uint32_t i=0; i<hits.size(); i++) {
-        heapArray.add(hits[i], hits[i]);
-    }
-
-    long long int timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
-
-    for(uint32_t i=0; i<heapArray.size; i++) {
-        cout << heapArray.getKeyAt(i) << endl;
-    }
-
-    cout << "Time taken: " << timeMillis << endl;
-}
-
-void index_document(art_tree & t, uint32_t doc_id, vector<string> tokens, uint16_t score) {
-    unordered_map<string, vector<uint32_t>> token_to_offsets;
-
-    for(uint32_t i=0; i<tokens.size(); i++) {
-        token_to_offsets[tokens[i]].push_back(i);
-    }
-
-    for(auto & kv: token_to_offsets) {
-        art_document document;
-        document.id = doc_id;
-        document.score = score;
-        document.offsets_len = (uint32_t) kv.second.size();
-        document.offsets = new uint32_t[kv.second.size()];
-
-        uint32_t num_hits = 0;
-        const unsigned char *key = (const unsigned char *) kv.first.c_str();
-        int key_len = (int) kv.first.length();
-
-        art_leaf* leaf = (art_leaf *) art_search(&t, key, key_len);
-        if(leaf != NULL) {
-            num_hits = leaf->token_count;
-        }
-
-        for(auto i=0; i<kv.second.size(); i++) {
-            document.offsets[i] = kv.second[i];
-        }
-
-        art_insert(&t, key, key_len, &document, num_hits);
-        delete [] document.offsets;
-    }
-}
-
-void find_documents(art_tree & t, unordered_map<uint32_t, uint16_t>& docscores, string query, size_t max_results) {
-    vector<string> tokens;
-    tokenize(query, tokens, " ", true);
-
-    vector<vector<art_leaf*>> token_leaves;
-    for(string token: tokens) {
-        vector<art_leaf*> leaves;
-        int max_cost = 2;
-        art_iter_fuzzy_prefix(&t, (const unsigned char *) token.c_str(), (int) token.length(), max_cost, 10, leaves);
-        if(!leaves.empty()) {
-            for(auto i=0; i<leaves.size(); i++) {
-                printf("%.*s - ", leaves[i]->key_len, leaves[i]->key);
-                //printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->token_count);
-            }
-            token_leaves.push_back(leaves);
-        }
-    }
-
-    Topster<100> topster;
-    size_t total_results = 0;
-    const size_t combination_limit = 10;
-    auto product = []( long long a, vector<art_leaf*>& b ) { return a*b.size(); };
-    long long int N = accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
-
-    for(long long n=0; n<N && n<combination_limit; ++n) {
-        vector<art_leaf*> query_suggestion(token_leaves.size());
-
-        // generate the next combination from `token_leaves` and store it in `query_suggestion`
-        ldiv_t q { n, 0 };
-        for( long long i=token_leaves.size()-1 ; 0<=i ; --i ) {
-            q = div(q.quot, token_leaves[i].size());
-            query_suggestion[i] = token_leaves[i][q.rem];
-        }
-
-        // sort ascending based on matched documents for each token to perform effective intersection
-        sort(query_suggestion.begin(), query_suggestion.end(), [](const art_leaf* left, const art_leaf* right) {
-            return left->values->ids.getLength() < right->values->ids.getLength();
-        });
-
-        // initialize results with the starting element (for further intersection)
-        uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
-        size_t result_size = query_suggestion[0]->values->ids.getLength();
-
-        if(result_size == 0) continue;
-
-        // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
-        for(auto i=1; i < query_suggestion.size(); i++) {
-            uint32_t* out = new uint32_t[result_size];
-            uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
-            result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
-            delete result_ids;
-            delete curr;
-            result_ids = out;
-        }
-
-        //cout << "2result_size: " << result_size << endl;
-
-        // go through each matching document id and calculate match score
-        for(auto i=0; i<result_size; i++) {
-            uint32_t doc_id = result_ids[i];
-            vector<vector<uint16_t>> token_positions;
-
-            // for each token in the query, find the positions that it appears in this document
-            for (art_leaf *token_leaf : query_suggestion) {
-                vector<uint16_t> positions;
-                uint32_t doc_index = token_leaf->values->ids.indexOf(doc_id);
-                uint32_t offset_index = token_leaf->values->offset_index.at(doc_index);
-                uint32_t num_offsets = token_leaf->values->offsets.at(offset_index);
-                for (auto offset_count = 1; offset_count <= num_offsets; offset_count++) {
-                    positions.push_back((uint16_t) token_leaf->values->offsets.at(offset_index + offset_count));
-                }
-                token_positions.push_back(positions);
-            }
-
-            MatchScore mscore = match_score(doc_id, token_positions);
-            const uint32_t cumulativeScore = ((uint32_t)(mscore.words_present * 16 + (20 - mscore.distance)) * 64000) + docscores[doc_id];
-
-//            cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: " << (int)mscore.distance << " - mscore.words_present: " << (int)mscore.words_present
-//                 << " - docscores[doc_id]: " << (int)docscores[doc_id] << " - cumulativeScore: " << cumulativeScore << endl;
-            topster.add(doc_id, cumulativeScore);
-        }
-
-        total_results += result_size;
-        delete result_ids;
-
-        if(total_results >= max_results) break;
-    }
-
-    topster.sort();
-
-    //cout << "RESULTS: " << endl << endl;
-
-    for(uint32_t i=0; i<topster.size; i++) {
-        cout << "ID: " << topster.getKeyAt(i) << endl;
-    }
-}
-
-std::string ReplaceAll(std::string str, const std::string& from, const std::string& to) {
-    size_t start_pos = 0;
-    while((start_pos = str.find(from, start_pos)) != std::string::npos) {
-        str.replace(start_pos, from.length(), to);
-        start_pos += to.length(); // Handles case where 'to' is a substring of 'from'
-    }
-    return str;
-}
-
 int main() {
-    art_tree t;
-    art_tree_init(&t);
-
-    unordered_map<uint32_t, uint16_t> docscores;
+    SearchIndex *index = new SearchIndex();
 
-// std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt");
+    //std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt");
     std::ifstream infile("/Users/kishore/Downloads/hnstories.tsv");
 
     std::string line;
@@ -227,63 +22,23 @@
     while (std::getline(infile, line)) {
         vector<string> parts;
-        tokenize(line, parts, "\t", true);
-        line = ReplaceAll(line, "\"", "");
+        StringUtils::tokenize(line, parts, "\t", true);
+        line = StringUtils::replace_all(line, "\"", "");
 
         vector<string> tokens;
-        tokenize(parts[0], tokens, " ", true);
+        StringUtils::tokenize(parts[0], tokens, " ", true);
 
         if(parts.size() != 2) continue;
-
-        if(doc_id == 857622 || doc_id == 52838 || doc_id == 56961) {
-            cout << "Doc " << doc_id << ": " << line << endl;
-        }
-
-        //cout << "Doc " << doc_id << ": " << line << endl;
-
-        docscores[doc_id] = (uint16_t) stoi(parts[1]);
-        index_document(t, doc_id, tokens, stoi(parts[1]));
+        index->add(doc_id, tokens, stoi(parts[1]));
         doc_id++;
     }
 
     cout << "FINISHED INDEXING!" << endl << flush;
 
-    /*const unsigned char *prefix = (const unsigned char *) "the";
-    size_t prefix_len = strlen((const char *) prefix);
-    std::vector<art_leaf*> results;
     auto begin = std::chrono::high_resolution_clock::now();
-    art_iter_fuzzy_prefix(&t, prefix, prefix_len, 0, 2, results);
+    index->search("thei rserch", 100);
     long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
-
-    art_iter_prefix(&t, prefix, strlen((const char *) prefix), test_prefix_cb, NULL);
-    art_iter(&t, test_prefix_cb, NULL);
-    cout << "Time taken: " << timeMillis << "us" << endl;
-
-    for(auto leaf: results) {
-        std::cout << ">>>>/Key: " << leaf->key << " - score: " << leaf->score << std::endl;
-        for(uint32_t i=0; i<leaf->values->ids.getLength(); i++) {
-            std::cout << ", ID: " << leaf->values->ids.at(i) << std::endl;
-        }
-        std::cout << ", Value: " << leaf->values->ids.at(0) << std::endl;
-    }*/
-
-    auto begin = std::chrono::high_resolution_clock::now();
-    find_documents(t, docscores, "thei rserch", 10);
-    long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
-
-//    string token = "nternet";
-//    vector<art_leaf*> leaves;
-//
-//    art_iter_fuzzy_prefix(&t, (const unsigned char *) token.c_str(), (int) token.length(), 1, 10, leaves);
-//    for(auto leaf: leaves) {
-//        printf("Word: %.*s", leaf->key_len, leaf->key);
-//        cout << " - score: " << leaf->token_count << endl;
-//    }
-
     cout << "Time taken: " << timeMillis << "us" << endl;
 
-    art_tree_destroy(&t);
+    delete index;
     return 0;
 }
\ No newline at end of file
diff --git a/src/search_index.cpp b/src/search_index.cpp
new file mode 100644
index 00000000..db85df57
--- /dev/null
+++ b/src/search_index.cpp
@@ -0,0 +1,159 @@
+#include "search_index.h"
+
+#include <iostream>
+#include <numeric>
+#include <topster.h>
+#include <intersection.h>
+#include <match_score.h>
+#include <string_utils.h>
+
+SearchIndex::SearchIndex() {
+    art_tree_init(&t);
+}
+
+SearchIndex::~SearchIndex() {
+    art_tree_destroy(&t);
+}
+
+void SearchIndex::add(uint32_t doc_id, std::vector<std::string> tokens, uint16_t score) {
+    std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
+
+    for(uint32_t i=0; i<tokens.size(); i++) {
+        token_to_offsets[tokens[i]].push_back(i);
+    }
+
+    for(auto & kv: token_to_offsets) {
+        art_document document;
+        document.id = doc_id;
+        document.score = score;
+        document.offsets_len = (uint32_t) kv.second.size();
+        document.offsets = new uint32_t[kv.second.size()];
+
+        uint32_t num_hits = 0;
+        const unsigned char *key = (const unsigned char *) kv.first.c_str();
+        int key_len = (int) kv.first.length();
+
+        art_leaf* leaf = (art_leaf *) art_search(&t, key, key_len);
+        if(leaf != NULL) {
+            num_hits = leaf->token_count;
+        }
+
+        for(auto i=0; i<kv.second.size(); i++) {
+            document.offsets[i] = kv.second[i];
+        }
+
+        art_insert(&t, key, key_len, &document, num_hits);
+        delete [] document.offsets;
+    }
+
+    doc_scores[doc_id] = score;
+}
+
+void SearchIndex::search(std::string query, size_t max_results) {
+    std::vector<std::string> tokens;
+    StringUtils::tokenize(query, tokens, " ", true);
+
+    std::vector<std::vector<art_leaf*>> token_leaves;
+    for(std::string token: tokens) {
+        std::vector<art_leaf*> leaves;
+        int max_cost = 2;
+        art_iter_fuzzy_prefix(&t, (const unsigned char *) token.c_str(), (int) token.length(), max_cost, 10, leaves);
+        if(!leaves.empty()) {
+            for(auto i=0; i<leaves.size(); i++) {
+                printf("%.*s - ", leaves[i]->key_len, leaves[i]->key);
+                //printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->token_count);
+            }
+            token_leaves.push_back(leaves);
+        }
+    }
+
+    Topster<100> topster;
+    size_t total_results = 0;
+    const size_t combination_limit = 10;
+    auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
+    long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
+
+    for(long long n=0; n<N && n<combination_limit; ++n) {
+        std::vector<art_leaf*> query_suggestion(token_leaves.size());
+
+        // generate the next combination from `token_leaves` and store it in `query_suggestion`
+        ldiv_t q { n, 0 };
+        for(long long i=token_leaves.size()-1 ; 0<=i ; --i ) {
+            q = ldiv(q.quot, token_leaves[i].size());
+            query_suggestion[i] = token_leaves[i][q.rem];
+        }
+
+        // sort ascending based on matched documents for each token to perform effective intersection
+        sort(query_suggestion.begin(), query_suggestion.end(), [](const art_leaf* left, const art_leaf* right) {
+            return left->values->ids.getLength() < right->values->ids.getLength();
+        });
+
+        // initialize results with the starting element (for further intersection)
+        uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
+        size_t result_size = query_suggestion[0]->values->ids.getLength();
+
+        if(result_size == 0) continue;
+
+        // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
+        for(auto i=1; i < query_suggestion.size(); i++) {
+            uint32_t* out = new uint32_t[result_size];
+            uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
+            result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
+            delete result_ids;
+            delete curr;
+            result_ids = out;
+        }
+
+        //cout << "2result_size: " << result_size << endl;
+
+        // go through each matching document id and calculate match score
+        for(auto i=0; i<result_size; i++) {
+            uint32_t doc_id = result_ids[i];
+            std::vector<std::vector<uint16_t>> token_positions;
+
+            // for each token in the query, find the positions that it appears in this document
+            for (art_leaf *token_leaf : query_suggestion) {
+                std::vector<uint16_t> positions;
+                uint32_t doc_index = token_leaf->values->ids.indexOf(doc_id);
+                uint32_t offset_index = token_leaf->values->offset_index.at(doc_index);
+                uint32_t num_offsets = token_leaf->values->offsets.at(offset_index);
+                for (auto offset_count = 1; offset_count <= num_offsets; offset_count++) {
+                    positions.push_back((uint16_t) token_leaf->values->offsets.at(offset_index + offset_count));
+                }
+                token_positions.push_back(positions);
+            }
+
+            MatchScore mscore = MatchScore::match_score(doc_id, token_positions);
+            const uint32_t cumulativeScore = ((uint32_t)(mscore.words_present * 16 + (20 - mscore.distance)) * 64000) + doc_scores[doc_id];
+
+//            cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: " << (int)mscore.distance << " - mscore.words_present: " << (int)mscore.words_present
+//                 << " - docscores[doc_id]: " << (int)docscores[doc_id] << " - cumulativeScore: " << cumulativeScore << endl;
+            topster.add(doc_id, cumulativeScore);
+        }
+
+        total_results += result_size;
+        delete result_ids;
+
+        if(total_results >= max_results) break;
+    }
+
+    topster.sort();
+
+    //cout << "RESULTS: " << endl << endl;
+
+    for(uint32_t i=0; i<topster.size; i++) {
+        std::cout << "ID: " << topster.getKeyAt(i) << std::endl;
+    }
+}
diff --git a/src/search_index.h b/src/search_index.h
new file mode 100644
--- /dev/null
+++ b/src/search_index.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <cstdint>
+#include <art.h>
+
+class SearchIndex {
+private:
+    art_tree t;
+    std::unordered_map<uint32_t, uint16_t> doc_scores;
+public:
+    SearchIndex();
+    ~SearchIndex();
+    void add(uint32_t doc_id, std::vector<std::string> tokens, uint16_t score);
+    void search(std::string query, size_t max_results);
+};
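
Usage sketch (illustrative only, not part of the commit): how the pieces introduced above fit together. The first half exercises MatchScore's sliding-window scoring on hand-picked token positions; the second half is the indexing/search flow that main.cpp now reduces to. The documents, scores and query are made up, and the include paths are assumed to be set up as in the CMakeLists.txt above.

    #include <iostream>
    #include <vector>
    #include "search_index.h"
    #include "match_score.h"

    int main() {
        // Proximity scoring in isolation: the two query tokens occur at sorted
        // positions {2} and {3} of one document. Both fall inside a single
        // 20-position window, so words_present = 2 and distance = |3 - 2| = 1.
        std::vector<std::vector<uint16_t>> positions = {{2}, {3}};
        MatchScore mscore = MatchScore::match_score(0, positions);
        std::cout << (int) mscore.words_present << ", " << (int) mscore.distance << std::endl;

        // Indexing and searching: token offsets are taken from vector order,
        // and the last argument to add() is the document's static score.
        SearchIndex *index = new SearchIndex();
        index->add(1, {"a", "quick", "brown", "fox"}, 10);
        index->add(2, {"the", "brown", "dog"}, 7);
        index->search("quik brwon", 10);  // typo-tolerant; prints ranked doc IDs
        delete index;
        return 0;
    }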