From ba33da1d51a6b84384aab9d13300181e59f2c083 Mon Sep 17 00:00:00 2001
From: Kishore Nallan
Date: Sun, 7 Aug 2016 14:55:26 -0700
Subject: [PATCH] Lots of code clean up.

* Move stuff out of main to classes
* Standardize naming conventions.
---
 CMakeLists.txt                            |   4 +-
 README.md                                 |   5 +
 include/{IdGenerator.h => id_generator.h} |   0
 include/match_score.h                     | 121 ++++++++++
 include/matchscore.h                      | 121 ----------
 include/string_utils.h                    |  44 ++++
 include/topster.h                         |   5 +-
 include/util.h                            |  36 ---
 src/main.cpp                              | 267 +---------------------
 src/search_index.cpp                      | 159 +++++++++++++
 src/search_index.h                        |  18 ++
 11 files changed, 363 insertions(+), 417 deletions(-)
 rename include/{IdGenerator.h => id_generator.h} (100%)
 create mode 100644 include/match_score.h
 delete mode 100644 include/matchscore.h
 create mode 100644 include/string_utils.h
 delete mode 100644 include/util.h
 create mode 100644 src/search_index.cpp
 create mode 100644 src/search_index.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7e1eb54..d1be8006 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,5 +7,5 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -stdlib=libc++ -std=gnu
 include_directories(include)
 include_directories(external/for)
 
-add_executable(search src/art.cpp src/intersection.cpp src/main.cpp)
-target_link_libraries(search ${CMAKE_SOURCE_DIR}/external/for/libfor.a)
+add_executable(search src/art.cpp src/intersection.cpp src/main.cpp src/search_index.cpp src/search_index.h)
+target_link_libraries(search ${CMAKE_SOURCE_DIR}/external/for/libfor.a boost_system)
diff --git a/README.md b/README.md
index df888421..df83fb43 100644
--- a/README.md
+++ b/README.md
@@ -6,4 +6,9 @@ A typo tolerant, open source search engine that helps you build delightful searc
 
 * [libfor](https://github.com/cruppstahl/for/)
 
+## Building
+
+* Switch to `external/libfor` and build libfor
+* Install `boost`
+
 © 2016 Wreally Studios Inc.
\ No newline at end of file
diff --git a/include/IdGenerator.h b/include/id_generator.h
similarity index 100%
rename from include/IdGenerator.h
rename to include/id_generator.h
diff --git a/include/match_score.h b/include/match_score.h
new file mode 100644
index 00000000..67002bb8
--- /dev/null
+++ b/include/match_score.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+#include <queue>
+#include <cstdlib>
+#include <limits>
+
+#ifdef DEBUG
+#define D(x) x
+#else
+#define D(x)
+#endif
+
+struct MatchScore {
+    struct TokenPosition {
+        uint8_t token_id;         // token identifier
+        uint16_t position;        // token's position in the text
+        uint16_t position_index;  // index of the position in the vector
+
+        bool operator() (const TokenPosition& a, const TokenPosition& b) {
+            return a.position > b.position;
+        }
+    };
+
+    #define addTopOfHeapToWindow(heap,q,token_positions,token_pos) {\
+        TokenPosition top = heap.top();\
+        heap.pop();\
+        q.push(top);\
+        token_pos[top.token_id] = top.position; \
+        top.position_index++;\
+        /* Must refill the heap - push the next position of the same token */\
+        if(top.position_index < token_positions[top.token_id].size()) {\
+            heap.push(TokenPosition{top.token_id, token_positions[top.token_id][top.position_index], top.position_index});\
+        }\
+    }
+
+    uint16_t words_present;
+    uint16_t distance;
+
+    /*
+    *  Given *sorted positions* of each target token in a *single* document, generates a score that indicates:
+    *  a) How many tokens are present in the document
+    *  b) The proximity between the tokens in the document
+    *
+    *  We use a priority queue to read the position vectors in a sorted manner, slide a window of a given size, and
+    *  compute the max_match and min_displacement of target tokens across the windows.
+    */
+    static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_positions) {
+        const size_t WINDOW_SIZE = 20;
+        const size_t MAX_TOKENS_IN_A_QUERY = 20;
+        const uint16_t MAX_UINT_16 = std::numeric_limits<uint16_t>::max();
+
+        std::priority_queue<TokenPosition, std::vector<TokenPosition>, TokenPosition> heap;
+
+        for(uint8_t token_id=0; token_id < token_positions.size(); token_id++) {
+            heap.push(TokenPosition{token_id, token_positions[token_id].front(), 0});
+        }
+
+        // heap now contains the first occurring position of each token in the given document
+
+        uint16_t max_match = 1;
+        uint16_t min_displacement = UINT16_MAX;
+
+        std::queue<TokenPosition> q;
+        uint16_t token_pos[MAX_TOKENS_IN_A_QUERY] = { };
+        std::fill_n(token_pos, MAX_TOKENS_IN_A_QUERY, MAX_UINT_16);
+
+        do {
+            if(q.empty()) {
+                addTopOfHeapToWindow(heap, q, token_positions, token_pos);
+            }
+
+            D(cout << "Loop till window fills..." << endl;)
+
+            // Fill the queue with tokens within a given window frame size of the start position
+            // At the same time, we also record the *last* occurrence of each token within the window
+            // For e.g. if `cat` appeared at positions 1,3 and 5, we will record `token_pos[cat] = 5`
+            const uint16_t start_pos = q.front().position;
+            while(!heap.empty() && heap.top().position < start_pos+WINDOW_SIZE) {
+                addTopOfHeapToWindow(heap, q, token_positions, token_pos);
+            }
+
+            D(cout << endl << "----" << endl);
+
+            uint16_t prev_pos = MAX_UINT_16;
+            uint16_t num_match = 0;
+            uint16_t displacement = 0;
+
+            for(size_t token_id=0; token_id<token_positions.size(); token_id++) {
+                // If the token is present in the window, we would have recorded its last position within it
+                if(token_pos[token_id] != MAX_UINT_16) {
+                    num_match++;
+                    if(prev_pos == MAX_UINT_16) prev_pos = token_pos[token_id];
+                    else {
+                        // add the distance between the current token and the previous token in the window
+                        displacement += abs(token_pos[token_id]-prev_pos);
+                        prev_pos = token_pos[token_id];
+                    }
+                }
+            }
+
+            // Track the best match and displacement seen across all the windows so far
+            if(num_match >= max_match) {
+                max_match = num_match;
+                if(displacement != 0 && displacement < min_displacement) {
+                    min_displacement = displacement;
+                }
+            }
+
+            // As we slide the window, drop the first token of the window from the computation
+            token_pos[q.front().token_id] = 0;
+            q.pop();
+        } while(!heap.empty());
+
+        return MatchScore{max_match, min_displacement};
+    }
+};
diff --git a/include/matchscore.h b/include/matchscore.h
deleted file mode 100644
index 50e68556..00000000
--- a/include/matchscore.h
+++ /dev/null
@@ -1,121 +0,0 @@
-#pragma once
-
-#include <cstdint>
-#include <vector>
-#include <queue>
-#include <cstdlib>
-#include <limits>
-
-#ifdef DEBUG
-#define D(x) x
-#else
-#define D(x)
-#endif
-
-struct TokenPosition {
-    uint8_t token_id;         // token identifier
-    uint16_t position;        // token's position in the text
-    uint16_t position_index;  // index of the position in the vector
-
-    bool operator() (const TokenPosition& a, const TokenPosition& b) {
-        return a.position > b.position;
-    }
-};
-
-struct MatchScore {
-    uint16_t words_present;
-    uint16_t distance;
-};
-
-#define addTopOfHeapToWindow(heap,q,token_positions,token_pos) {\
-    TokenPosition top = heap.top();\
-    heap.pop();\
-    q.push(top);\
-    token_pos[top.token_id] = top.position; \
-    top.position_index++;\
-    /* Must refill the heap - push the next position of the same token */\
-    if(top.position_index < token_positions[top.token_id].size()) {\
-        heap.push(TokenPosition{top.token_id, token_positions[top.token_id][top.position_index], top.position_index});\
-    }\
-}
-/*
- * Given *sorted positions* of each target token in a *single* document, generates a score that indicates:
- * a) How many tokens are present in the document
- * b) The proximity between the tokens in the document
- *
- * We use a priority queue to read the position vectors in a sorted manner, slide a window of a given size, and
- * compute the max_match and min_displacement of target tokens across the windows.
- */
-MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_positions) {
-    const size_t WINDOW_SIZE = 20;
-    const size_t MAX_TOKENS_IN_A_QUERY = 20;
-    const uint16_t MAX_UINT_16 = std::numeric_limits<uint16_t>::max();
-
-    std::priority_queue<TokenPosition, std::vector<TokenPosition>, TokenPosition> heap;
-
-    for(uint8_t token_id=0; token_id < token_positions.size(); token_id++) {
-        heap.push(TokenPosition{token_id, token_positions[token_id].front(), 0});
-    }
-
-    // heap now contains the first occurring position of each token in the given document
-
-    uint16_t max_match = 1;
-    uint16_t min_displacement = UINT16_MAX;
-
-    std::queue<TokenPosition> q;
-    uint16_t token_pos[MAX_TOKENS_IN_A_QUERY] = { };
-    std::fill_n(token_pos, MAX_TOKENS_IN_A_QUERY, MAX_UINT_16);
-
-    do {
-        if(q.empty()) {
-            addTopOfHeapToWindow(heap, q, token_positions, token_pos);
-        }
-
-        D(cout << "Loop till window fills..." << endl;)
-
-        // Fill the queue with tokens within a given window frame size of the start position
-        // At the same time, we also record the *last* occurrence of each token within the window
-        // For e.g. if `cat` appeared at positions 1,3 and 5, we will record `token_pos[cat] = 5`
-        const uint16_t start_pos = q.front().position;
-        while(!heap.empty() && heap.top().position < start_pos+WINDOW_SIZE) {
-            addTopOfHeapToWindow(heap, q, token_positions, token_pos);
-        }
-
-        D(cout << endl << "----" << endl);
-
-        uint16_t prev_pos = MAX_UINT_16;
-        uint16_t num_match = 0;
-        uint16_t displacement = 0;
-
-        for(size_t token_id=0; token_id<token_positions.size(); token_id++) {
-            // If the token is present in the window, we would have recorded its last position within it
-            if(token_pos[token_id] != MAX_UINT_16) {
-                num_match++;
-                if(prev_pos == MAX_UINT_16) prev_pos = token_pos[token_id];
-                else {
-                    // add the distance between the current token and the previous token in the window
-                    displacement += abs(token_pos[token_id]-prev_pos);
-                    prev_pos = token_pos[token_id];
-                }
-            }
-        }
-
-        // Track the best match and displacement seen across all the windows so far
-        if(num_match >= max_match) {
-            max_match = num_match;
-            if(displacement != 0 && displacement < min_displacement) {
-                min_displacement = displacement;
-            }
-        }
-
-        // As we slide the window, drop the first token of the window from the computation
-        token_pos[q.front().token_id] = 0;
-        q.pop();
-    } while(!heap.empty());
-
-    return MatchScore{max_match, min_displacement};
-}
diff --git a/include/string_utils.h b/include/string_utils.h
new file mode 100644
index 00000000..5afc87e1
--- /dev/null
+++ b/include/string_utils.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <string>
+
+struct StringUtils {
+
+    template<class ContainerT>
+    static void tokenize(const std::string &str, ContainerT &tokens,
+                         const std::string &delimiters = " ", bool trimEmpty = false) {
+        std::string::size_type pos, lastPos = 0;
+
+        using value_type = typename ContainerT::value_type;
+        using size_type = typename ContainerT::size_type;
+
+        while (true) {
+            pos = str.find_first_of(delimiters, lastPos);
+            if (pos == std::string::npos) {
+                pos = str.length();
+
+                if (pos != lastPos || !trimEmpty)
+                    tokens.push_back(value_type(str.data() + lastPos,
+                                                (size_type) pos - lastPos));
+
+                break;
+            }
+            else {
+                if (pos != lastPos || !trimEmpty)
+                    tokens.push_back(value_type(str.data() + lastPos,
+                                                (size_type) pos - lastPos));
+            }
+
+            lastPos = pos + 1;
+        }
+    }
+
+    static std::string replace_all(std::string str, const std::string &from, const std::string &to) {
+        size_t start_pos = 0;
+        while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
+            str.replace(start_pos, from.length(), to);
+            start_pos += to.length(); // Handles case where 'to' is a substring of 'from'
+        }
+        return str;
+    }
+};
\ No newline at end of file
diff --git a/include/topster.h b/include/topster.h
index 62a22d03..714aa2ea 100644
--- a/include/topster.h
+++ b/include/topster.h
@@ -5,10 +5,11 @@
 #include <cstdint>
 #include <cstdio>
 
+/*
+* A bounded max heap that remembers the top-K elements seen so far
+*/
 template <size_t MAX_SIZE>
 struct Topster {
-    // A bounded max heap that remembers the top-K elements seen so far
-
     uint64_t data[MAX_SIZE];
     uint32_t smallest_index = 0;
     uint32_t size = 0;
diff --git a/include/util.h b/include/util.h
deleted file mode 100644
index 0ec50cda..00000000
--- a/include/util.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#pragma once
-
-#include <string>
-
-template < class ContainerT >
-void tokenize(const std::string& str, ContainerT& tokens,
-              const std::string& delimiters = " ", bool trimEmpty = false)
-{
-    std::string::size_type pos, lastPos = 0;
-
-    using value_type = typename ContainerT::value_type;
-    using size_type = typename ContainerT::size_type;
-
-    while(true)
-    {
-        pos = str.find_first_of(delimiters, lastPos);
-        if(pos == std::string::npos)
-        {
-            pos = str.length();
-
-            if(pos != lastPos || !trimEmpty)
-                tokens.push_back(value_type(str.data()+lastPos,
-                                            (size_type)pos-lastPos ));
-
-            break;
-        }
-        else
-        {
-            if(pos != lastPos || !trimEmpty)
-                tokens.push_back(value_type(str.data()+lastPos,
-                                            (size_type)pos-lastPos ));
-        }
-
-        lastPos = pos + 1;
-    }
-}
diff --git a/src/main.cpp b/src/main.cpp
index 55c48d0f..6f97d557 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,225 +1,20 @@
 #include <iostream>
 #include <fstream>
 #include <vector>
-#include <unordered_map>
 #include <chrono>
-#include <algorithm>
 #include <cstdlib>
-#include <numeric>
 #include <string>
 #include <art.h>
-#include "topster.h"
-#include "intersection.h"
-#include "matchscore.h"
-#include "util.h"
+#include "string_utils.h"
+#include "crow_all.h"
+#include "search_index.h"
 
 using namespace std;
 
-static int test_prefix_cb(void *data, const unsigned char *k, uint32_t k_len, void *val) {
-    cout << "#>>>>Key: ";
-    printf("%.*s", k_len, k);
-    cout << "LENGTH OF IDS: " << ((art_values*)val)->ids.getLength() << endl;
-
-    for(uint32_t i=0; i<((art_values*)val)->ids.getLength(); i++) {
-        cout << ", ID: " << ((art_values*)val)->ids.at(i) << endl;
-    }
-    return 0;
-}
-
-void benchmark_heap_array() {
-    srand (time(NULL));
-
-    vector<uint64_t> records;
-
-    for(uint32_t i=0; i<10000000; i++) {
-        records.push_back((const unsigned int &) rand());
-    }
-
-    vector<uint64_t> hits;
-
-    for(uint32_t i=0; i<records.size(); i+=10) {
-        hits.push_back(records[i]);
-    }
-
-    auto begin = std::chrono::high_resolution_clock::now();
-    Topster<1000> heapArray;
-
-    for(uint32_t i=0; i<hits.size(); i++) {
-        heapArray.add(hits[i], hits[i]);
-    }
-
-    long long int timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();
-
-    for(uint32_t i=0; i<heapArray.size; i++) {
-        cout << heapArray.getKeyAt(i) << endl;
-    }
-
-    cout << "Time taken: " << timeMillis << endl;
-}
-
-void index_document(art_tree & t, uint32_t doc_id, vector<string> tokens, uint16_t score) {
-    unordered_map<string, vector<uint32_t>> token_to_offsets;
-
-    for(uint32_t i=0; i<tokens.size(); i++) {
-        token_to_offsets[tokens[i]].push_back(i);
-    }
-
-    for(auto & kv: token_to_offsets) {
-        art_document document;
-        document.id = doc_id;
-        document.score = score;
-        document.offsets_len = (uint32_t) kv.second.size();
-        document.offsets = new uint32_t[kv.second.size()];
-
-        uint32_t num_hits = 0;
-        const unsigned char *key = (const unsigned char *) kv.first.c_str();
-        int key_len = (int) kv.first.length();
-
-        art_leaf* leaf = (art_leaf *) art_search(&t, key, key_len);
-        if(leaf != NULL) {
-            num_hits = leaf->token_count;
-        }
-
-        for(auto i=0; i<kv.second.size(); i++) {
-            document.offsets[i] = kv.second[i];
-        }
-
-        art_insert(&t, key, key_len, &document, num_hits);
-        delete [] document.offsets;
-    }
-}
-
-void find_documents(art_tree & t, unordered_map<uint32_t, uint16_t>& docscores, string query, size_t max_results) {
-    vector<string> tokens;
-    tokenize(query, tokens, " ", true);
-
-    vector<vector<art_leaf*>> token_leaves;
-    for(string token: tokens) {
-        vector<art_leaf*> leaves;
-        int max_cost = 2;
-        art_iter_fuzzy_prefix(&t, (const unsigned char *) token.c_str(), (int) token.length(), max_cost, 10, leaves);
-        if(!leaves.empty()) {
-            for(auto i=0; i<leaves.size(); i++) {
-                printf("%.*s - ", leaves[i]->key_len, leaves[i]->key);
-                //printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->token_count);
-            }
-            token_leaves.push_back(leaves);
-        }
-    }
-
-    Topster<100> topster;
-    size_t total_results = 0;
-    const size_t combination_limit = 10;
-    auto product = []( long long a, vector<art_leaf*>& b ) { return a*b.size(); };
-    long long int N = accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
-
-    for(long long n=0; n<N && n<combination_limit; ++n) {
-        vector<art_leaf*> query_suggestion(token_leaves.size());
-
-        // generate the next combination from `token_leaves` and store it in `query_suggestion`
-        ldiv_t q { n, 0 };
-        for( long long i=token_leaves.size()-1 ; 0<=i ; --i ) {
-            q = div(q.quot, token_leaves[i].size());
-            query_suggestion[i] = token_leaves[i][q.rem];
-        }
-
-        // sort ascending based on matched documents for each token to perform effective intersection
-        sort(query_suggestion.begin(), query_suggestion.end(), [](const art_leaf* left, const art_leaf* right) {
-            return left->values->ids.getLength() < right->values->ids.getLength();
-        });
-
-        // initialize results with the starting element (for further intersection)
-        uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
-        size_t result_size = query_suggestion[0]->values->ids.getLength();
-
-        if(result_size == 0) continue;
-
-        // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
-        for(auto i=1; i < query_suggestion.size(); i++) {
-            uint32_t* out = new uint32_t[result_size];
-            uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
-            result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
-            delete result_ids;
-            delete curr;
-            result_ids = out;
-        }
-
-        //cout << "2result_size: " << result_size << endl;
-
-        // go through each matching document id and calculate match score
-        for(auto i=0; i<result_size; i++) {
-            uint32_t doc_id = result_ids[i];
-            vector<vector<uint16_t>> token_positions;
-
-            // for each token in the query, find the positions that it appears in this document
-            for (art_leaf *token_leaf : query_suggestion) {
-                vector<uint16_t> positions;
-                uint32_t doc_index = token_leaf->values->ids.indexOf(doc_id);
-                uint32_t offset_index = token_leaf->values->offset_index.at(doc_index);
-                uint32_t num_offsets = token_leaf->values->offsets.at(offset_index);
-                for (auto offset_count = 1; offset_count <= num_offsets; offset_count++) {
-                    positions.push_back((uint16_t) token_leaf->values->offsets.at(offset_index + offset_count));
-                }
-                token_positions.push_back(positions);
-            }
-
-            MatchScore mscore = match_score(doc_id, token_positions);
-            const uint32_t cumulativeScore = ((uint32_t)(mscore.words_present * 16 + (20 - mscore.distance)) * 64000) + docscores[doc_id];
-
-//            cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: " << (int)mscore.distance << " - mscore.words_present: " << (int)mscore.words_present
-//                 << " - docscores[doc_id]: " << (int)docscores[doc_id] << " - cumulativeScore: " << cumulativeScore << endl;
-            topster.add(doc_id, cumulativeScore);
-        }
-
-        total_results += result_size;
-        delete result_ids;
-
-        if(total_results >= max_results) break;
-    }
-
-    topster.sort();
-
-    //cout << "RESULTS: " << endl << endl;
-
-    for(uint32_t i=0; i<topster.size; i++) {
-        cout << "ID: " << topster.getKeyAt(i) << endl;
-    }
-}
-
-std::string ReplaceAll(std::string str, const std::string& from, const std::string& to) {
-    size_t start_pos = 0;
-    while((start_pos = str.find(from, start_pos)) != std::string::npos) {
-        str.replace(start_pos, from.length(), to);
-        start_pos += to.length(); // Handles case where 'to' is a substring of 'from'
-    }
-    return str;
-}
-
 int main() {
-    art_tree t;
-    art_tree_init(&t);
-
-    unordered_map<uint32_t, uint16_t> docscores;
+    SearchIndex *index = new SearchIndex();
 
-// std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt");
+    //std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt");
     std::ifstream infile("/Users/kishore/Downloads/hnstories.tsv");
 
     std::string line;
@@ -227,63 +22,23 @@
     while (std::getline(infile, line)) {
         vector<string> parts;
-        tokenize(line, parts, "\t", true);
-        line = ReplaceAll(line, "\"", "");
+        StringUtils::tokenize(line, parts, "\t", true);
+        line = StringUtils::replace_all(line, "\"", "");
 
         vector<string> tokens;
-        tokenize(parts[0], tokens, " ", true);
+        StringUtils::tokenize(parts[0], tokens, " ", true);
 
         if(parts.size() != 2) continue;
-
-        if(doc_id == 857622 || doc_id == 52838 || doc_id == 56961) {
-            cout << "Doc " << doc_id << ": " << line << endl;
-        }
-
-        //cout << "Doc " << doc_id << ": " << line << endl;
-
-        docscores[doc_id] = (uint16_t) stoi(parts[1]);
-        index_document(t, doc_id, tokens, stoi(parts[1]));
+        index->add(doc_id, tokens, stoi(parts[1]));
         doc_id++;
     }
 
     cout << "FINISHED INDEXING!" << endl << flush;
 
-    /*const unsigned char *prefix = (const unsigned char *) "the";
-    size_t prefix_len = strlen((const char *) prefix);
-    std::vector<art_leaf*> results;
     auto begin = std::chrono::high_resolution_clock::now();
-    art_iter_fuzzy_prefix(&t, prefix, prefix_len, 0, 2, results);
+    index->search("thei rserch", 100);
     long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
-
-    art_iter_prefix(&t, prefix, strlen((const char *) prefix), test_prefix_cb, NULL);
-    art_iter(&t, test_prefix_cb, NULL);
-    cout << "Time taken: " << timeMillis << "us" << endl;
-
-    for(auto leaf: results) {
-        std::cout << ">>>>/Key: " << leaf->key << " - score: " << leaf->score << std::endl;
-        for(uint32_t i=0; i<leaf->values->ids.getLength(); i++) {
-            std::cout << ", ID: " << leaf->values->ids.at(i) << std::endl;
-        }
-        std::cout << ", Value: " << leaf->values->ids.at(0) << std::endl;
-    }*/
-
-    auto begin = std::chrono::high_resolution_clock::now();
-    find_documents(t, docscores, "thei rserch", 10);
-    long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
-
-//    string token = "nternet";
-//    vector<art_leaf*> leaves;
-//
-//    art_iter_fuzzy_prefix(&t, (const unsigned char *) token.c_str(), (int) token.length(), 1, 10, leaves);
-//    for(auto leaf: leaves) {
-//        printf("Word: %.*s", leaf->key_len, leaf->key);
-//        cout << " - score: " << leaf->token_count << endl;
-//    }
-
     cout << "Time taken: " << timeMillis << "us" << endl;
 
-    art_tree_destroy(&t);
+    delete index;
     return 0;
 }
\ No newline at end of file
diff --git a/src/search_index.cpp b/src/search_index.cpp
new file mode 100644
index 00000000..db85df57
--- /dev/null
+++ b/src/search_index.cpp
@@ -0,0 +1,159 @@
+#include "search_index.h"
+
+#include <iostream>
+#include <numeric>
+#include <topster.h>
+#include <intersection.h>
+#include <match_score.h>
+#include <string_utils.h>
+
+SearchIndex::SearchIndex() {
+    art_tree_init(&t);
+}
+
+SearchIndex::~SearchIndex() {
+    art_tree_destroy(&t);
+}
+
+void SearchIndex::add(uint32_t doc_id, std::vector<std::string> tokens, uint16_t score) {
+    std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;
+
+    for(uint32_t i=0; i<tokens.size(); i++) {
+        token_to_offsets[tokens[i]].push_back(i);
+    }
+
+    for(auto & kv: token_to_offsets) {
+        art_document document;
+        document.id = doc_id;
+        document.score = score;
+        document.offsets_len = (uint32_t) kv.second.size();
+        document.offsets = new uint32_t[kv.second.size()];
+
+        uint32_t num_hits = 0;
+        const unsigned char *key = (const unsigned char *) kv.first.c_str();
+        int key_len = (int) kv.first.length();
+
+        art_leaf* leaf = (art_leaf *) art_search(&t, key, key_len);
+        if(leaf != NULL) {
+            num_hits = leaf->token_count;
+        }
+
+        for(auto i=0; i<kv.second.size(); i++) {
+            document.offsets[i] = kv.second[i];
+        }
+
+        art_insert(&t, key, key_len, &document, num_hits);
+        delete [] document.offsets;
+    }
+
+    doc_scores[doc_id] = score;
+}
+
+void SearchIndex::search(std::string query, size_t max_results) {
+    std::vector<std::string> tokens;
+    StringUtils::tokenize(query, tokens, " ", true);
+
+    std::vector<std::vector<art_leaf*>> token_leaves;
+    for(std::string token: tokens) {
+        std::vector<art_leaf*> leaves;
+        int max_cost = 2;
+        art_iter_fuzzy_prefix(&t, (const unsigned char *) token.c_str(), (int) token.length(), max_cost, 10, leaves);
+        if(!leaves.empty()) {
+            for(auto i=0; i<leaves.size(); i++) {
+                printf("%.*s - ", leaves[i]->key_len, leaves[i]->key);
+                //printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->token_count);
+            }
+            token_leaves.push_back(leaves);
+        }
+    }
+
+    Topster<100> topster;
+    size_t total_results = 0;
+    const size_t combination_limit = 10;
+    auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
+    long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
+
+    for(long long n=0; n<N && n<combination_limit; ++n) {
+        std::vector<art_leaf*> query_suggestion(token_leaves.size());
+
+        // generate the next combination from `token_leaves` and store it in `query_suggestion`
+        ldiv_t q { n, 0 };
+        for(long long i=token_leaves.size()-1 ; 0<=i ; --i ) {
+            q = ldiv(q.quot, token_leaves[i].size());
+            query_suggestion[i] = token_leaves[i][q.rem];
+        }
+
+        // sort ascending based on matched documents for each token to perform effective intersection
+        sort(query_suggestion.begin(), query_suggestion.end(), [](const art_leaf* left, const art_leaf* right) {
+            return left->values->ids.getLength() < right->values->ids.getLength();
+        });
+
+        // initialize results with the starting element (for further intersection)
+        uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
+        size_t result_size = query_suggestion[0]->values->ids.getLength();
+
+        if(result_size == 0) continue;
+
+        // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
+        for(auto i=1; i < query_suggestion.size(); i++) {
+            uint32_t* out = new uint32_t[result_size];
+            uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
+            result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
+            delete result_ids;
+            delete curr;
+            result_ids = out;
+        }
+
+        //cout << "2result_size: " << result_size << endl;
+
+        // go through each matching document id and calculate match score
+        for(auto i=0; i<result_size; i++) {
+            uint32_t doc_id = result_ids[i];
+            std::vector<std::vector<uint16_t>> token_positions;
+
+            // for each token in the query, find the positions that it appears in this document
+            for (art_leaf *token_leaf : query_suggestion) {
+                std::vector<uint16_t> positions;
+                uint32_t doc_index = token_leaf->values->ids.indexOf(doc_id);
+                uint32_t offset_index = token_leaf->values->offset_index.at(doc_index);
+                uint32_t num_offsets = token_leaf->values->offsets.at(offset_index);
+                for (auto offset_count = 1; offset_count <= num_offsets; offset_count++) {
+                    positions.push_back((uint16_t) token_leaf->values->offsets.at(offset_index + offset_count));
+                }
+                token_positions.push_back(positions);
+            }
+
+            MatchScore mscore = MatchScore::match_score(doc_id, token_positions);
+            const uint32_t cumulativeScore = ((uint32_t)(mscore.words_present * 16 + (20 - mscore.distance)) * 64000) + doc_scores[doc_id];
+
+//            cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: " << (int)mscore.distance << " - mscore.words_present: " << (int)mscore.words_present
+//                 << " - docscores[doc_id]: " << (int)docscores[doc_id] << " - cumulativeScore: " << cumulativeScore << endl;
+            topster.add(doc_id, cumulativeScore);
+        }
+
+        total_results += result_size;
+        delete result_ids;
+
+        if(total_results >= max_results) break;
+    }
+
+    topster.sort();
+
+    //cout << "RESULTS: " << endl << endl;
+
+    for(uint32_t i=0; i<topster.size; i++) {
+        std::cout << "ID: " << topster.getKeyAt(i) << std::endl;
+    }
+}
diff --git a/src/search_index.h b/src/search_index.h
new file mode 100644
--- /dev/null
+++ b/src/search_index.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <cstdint>
+#include <art.h>
+
+class SearchIndex {
+private:
+    art_tree t;
+    std::unordered_map<uint32_t, uint16_t> doc_scores;
+public:
+    SearchIndex();
+    ~SearchIndex();
+    void add(uint32_t doc_id, std::vector<std::string> tokens, uint16_t score);
+    void search(std::string query, size_t max_results);
+};
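
Usage sketch (illustrative only, not part of the commit): how the pieces introduced above fit together. The first half exercises MatchScore's sliding-window scoring on hand-picked token positions; the second half is the indexing/search flow that main.cpp now reduces to. The documents, scores and query are made up, and the include paths are assumed to be set up as in the CMakeLists.txt above.

    #include <iostream>
    #include <vector>
    #include "search_index.h"
    #include "match_score.h"

    int main() {
        // Proximity scoring in isolation: the two query tokens occur at sorted
        // positions {2} and {3} of one document. Both fall inside a single
        // 20-position window, so words_present = 2 and distance = |3 - 2| = 1.
        std::vector<std::vector<uint16_t>> positions = {{2}, {3}};
        MatchScore mscore = MatchScore::match_score(0, positions);
        std::cout << (int) mscore.words_present << ", " << (int) mscore.distance << std::endl;

        // Indexing and searching: token offsets are taken from vector order,
        // and the last argument to add() is the document's static score.
        SearchIndex *index = new SearchIndex();
        index->add(1, {"a", "quick", "brown", "fox"}, 10);
        index->add(2, {"the", "brown", "dog"}, 7);
        index->search("quik brwon", 10);  // typo-tolerant; prints ranked doc IDs
        delete index;
        return 0;
    }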