mirror of https://github.com/typesense/typesense.git
synced 2025-05-18 12:42:50 +08:00

Lots of code clean up.

* Move stuff out of main to classes
* Standardize naming conventions.

This commit is contained in:
parent 6c2974aaeb
commit ba33da1d51
CMakeLists.txt
@@ -7,5 +7,5 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_DEBUG} -std=c++11 -stdlib=libc++ -std=gnu
include_directories(include)
include_directories(external/for)

add_executable(search src/art.cpp src/intersection.cpp src/main.cpp)
target_link_libraries(search ${CMAKE_SOURCE_DIR}/external/for/libfor.a)
add_executable(search src/art.cpp src/intersection.cpp src/main.cpp src/search_index.cpp src/search_index.h)
target_link_libraries(search ${CMAKE_SOURCE_DIR}/external/for/libfor.a boost_system)
README.md
@@ -6,4 +6,9 @@ A typo tolerant, open source search engine that helps you build delightful searc

* [libfor](https://github.com/cruppstahl/for/)

## Building

* Switch to `external/libfor` and build libfor
* Install `boost`

© 2016 Wreally Studios Inc.
include/match_score.h (new file, 121 lines)
@@ -0,0 +1,121 @@
#pragma once

#include <stdint.h>
#include <vector>
#include <queue>
#include <stdlib.h>
#include <limits>

#ifdef DEBUG
#define D(x) x
#else
#define D(x)
#endif

struct MatchScore {
    struct TokenPosition {
        uint8_t token_id;         // token identifier
        uint16_t position;        // token's position in the text
        uint16_t position_index;  // index of the position in the vector

        bool operator() (const TokenPosition& a, const TokenPosition& b) {
            return a.position > b.position;
        }
    };

    #define addTopOfHeapToWindow(heap,q,token_positions,token_pos) {\
        TokenPosition top = heap.top();\
        heap.pop();\
        q.push(top);\
        token_pos[top.token_id] = top.position; \
        top.position_index++;\
        /* Must refill the heap - push the next position of the same token */\
        if(top.position_index < token_positions[top.token_id].size()) {\
            heap.push(TokenPosition{top.token_id, token_positions[top.token_id][top.position_index], top.position_index});\
        }\
    }

    uint16_t words_present;
    uint16_t distance;

    /*
    * Given *sorted positions* of each target token in a *single* document, generates a score that indicates:
    * a) How many tokens are present in the document
    * b) The proximity between the tokens in the document
    *
    * We use a priority queue to read the position vectors in a sorted manner, slide a window of a given size, and
    * compute the max_match and min_displacement of target tokens across the windows.
    */
    static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_positions) {
        const size_t WINDOW_SIZE = 20;
        const size_t MAX_TOKENS_IN_A_QUERY = 20;
        const uint16_t MAX_UINT_16 = std::numeric_limits<uint16_t>::max();

        std::priority_queue<TokenPosition, std::vector<TokenPosition>, TokenPosition> heap;

        for(uint8_t token_id=0; token_id < token_positions.size(); token_id++) {
            heap.push(TokenPosition{token_id, token_positions[token_id].front(), 0});
        }

        // heap now contains the first occurring position of each token in the given document

        uint16_t max_match = 1;
        uint16_t min_displacement = UINT16_MAX;

        std::queue<TokenPosition> q;
        uint16_t token_pos[MAX_TOKENS_IN_A_QUERY] = { };
        std::fill_n(token_pos, MAX_TOKENS_IN_A_QUERY, MAX_UINT_16);

        do {
            if(q.empty()) {
                addTopOfHeapToWindow(heap, q, token_positions, token_pos);
            }

            D(cout << "Loop till window fills..." << endl;)

            // Fill the queue with tokens within a given window frame size of the start position
            // At the same time, we also record the *last* occurrence of each token within the window
            // For e.g. if `cat` appeared at positions 1,3 and 5, we will record `token_pos[cat] = 5`
            const uint16_t start_pos = q.front().position;
            while(!heap.empty() && heap.top().position < start_pos+WINDOW_SIZE) {
                addTopOfHeapToWindow(heap, q, token_positions, token_pos);
            }

            D(cout << endl << "----" << endl);

            uint16_t prev_pos = MAX_UINT_16;
            uint16_t num_match = 0;
            uint16_t displacement = 0;

            for(size_t token_id=0; token_id<token_positions.size(); token_id++) {
                // If a token appeared within the window, we would have recorded its position
                if(token_pos[token_id] != MAX_UINT_16) {
                    num_match++;
                    if(prev_pos == MAX_UINT_16) prev_pos = token_pos[token_id];
                    else {
                        // Calculate the distance between the tokens within the window
                        // Ideally, this should be (NUM_TOKENS - 1) when all the tokens are adjacent to each other
                        D(cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_pos[token_id] << endl);
                        displacement += abs(token_pos[token_id]-prev_pos);
                        prev_pos = token_pos[token_id];
                    }
                }
            }

            D(cout << endl << "!!!displacement: " << displacement << " | num_match: " << num_match << endl);

            // Track the best `displacement` and `num_match` seen so far across all the windows
            if(num_match >= max_match) {
                max_match = num_match;
                if(displacement != 0 && displacement < min_displacement) {
                    min_displacement = displacement;
                }
            }

            // As we slide the window, drop the first token of the window from the computation
            token_pos[q.front().token_id] = 0;
            q.pop();
        } while(!heap.empty());

        return MatchScore{max_match, min_displacement};
    }
};
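A minimal, hypothetical usage sketch of the new MatchScore::match_score entry point (not part of this commit; the position data is made up): it builds sorted position vectors for two query tokens in a single document and reads back words_present and distance.

// Hypothetical usage sketch (not part of this commit) for MatchScore::match_score.
#include <iostream>
#include <vector>
#include "match_score.h"

int main() {
    // Sorted positions of each query token within one document:
    // token 0 appears at positions 2 and 18, token 1 at position 4.
    std::vector<std::vector<uint16_t>> token_positions = { {2, 18}, {4} };

    MatchScore score = MatchScore::match_score(0 /* doc_id */, token_positions);

    // words_present: how many query tokens fell inside the best window
    // distance: displacement between those tokens (smaller means closer together)
    std::cout << "words_present: " << score.words_present
              << ", distance: " << score.distance << std::endl;
    return 0;
}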
include/matchscore.h (deleted, 121 lines)
@@ -1,121 +0,0 @@
#pragma once

#include <stdint.h>
#include <vector>
#include <queue>
#include <stdlib.h>
#include <limits>

#ifdef DEBUG
#define D(x) x
#else
#define D(x)
#endif

struct TokenPosition {
    uint8_t token_id;         // token identifier
    uint16_t position;        // token's position in the text
    uint16_t position_index;  // index of the position in the vector

    bool operator() (const TokenPosition& a, const TokenPosition& b) {
        return a.position > b.position;
    }
};

struct MatchScore {
    uint16_t words_present;
    uint16_t distance;
};

#define addTopOfHeapToWindow(heap,q,token_positions,token_pos) {\
    TokenPosition top = heap.top();\
    heap.pop();\
    q.push(top);\
    token_pos[top.token_id] = top.position; \
    top.position_index++;\
    /* Must refill the heap - push the next position of the same token */\
    if(top.position_index < token_positions[top.token_id].size()) {\
        heap.push(TokenPosition{top.token_id, token_positions[top.token_id][top.position_index], top.position_index});\
    }\
}

/*
* Given *sorted positions* of each target token in a *single* document, generates a score that indicates:
* a) How many tokens are present in the document
* b) The proximity between the tokens in the document
*
* We use a priority queue to read the position vectors in a sorted manner, slide a window of a given size, and
* compute the max_match and min_displacement of target tokens across the windows.
*/
MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_positions) {
    const size_t WINDOW_SIZE = 20;
    const size_t MAX_TOKENS_IN_A_QUERY = 20;
    const uint16_t MAX_UINT_16 = std::numeric_limits<uint16_t>::max();

    std::priority_queue<TokenPosition, std::vector<TokenPosition>, TokenPosition> heap;

    for(uint8_t token_id=0; token_id < token_positions.size(); token_id++) {
        heap.push(TokenPosition{token_id, token_positions[token_id].front(), 0});
    }

    // heap now contains the first occurring position of each token in the given document

    uint16_t max_match = 1;
    uint16_t min_displacement = UINT16_MAX;

    std::queue<TokenPosition> q;
    uint16_t token_pos[MAX_TOKENS_IN_A_QUERY] = { };
    std::fill_n(token_pos, MAX_TOKENS_IN_A_QUERY, MAX_UINT_16);

    do {
        if(q.empty()) {
            addTopOfHeapToWindow(heap, q, token_positions, token_pos);
        }

        D(cout << "Loop till window fills..." << endl;)

        // Fill the queue with tokens within a given window frame size of the start position
        // At the same time, we also record the *last* occurrence of each token within the window
        // For e.g. if `cat` appeared at positions 1,3 and 5, we will record `token_pos[cat] = 5`
        const uint16_t start_pos = q.front().position;
        while(!heap.empty() && heap.top().position < start_pos+WINDOW_SIZE) {
            addTopOfHeapToWindow(heap, q, token_positions, token_pos);
        }

        D(cout << endl << "----" << endl);

        uint16_t prev_pos = MAX_UINT_16;
        uint16_t num_match = 0;
        uint16_t displacement = 0;

        for(size_t token_id=0; token_id<token_positions.size(); token_id++) {
            // If a token appeared within the window, we would have recorded its position
            if(token_pos[token_id] != MAX_UINT_16) {
                num_match++;
                if(prev_pos == MAX_UINT_16) prev_pos = token_pos[token_id];
                else {
                    // Calculate the distance between the tokens within the window
                    // Ideally, this should be (NUM_TOKENS - 1) when all the tokens are adjacent to each other
                    D(cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_pos[token_id] << endl);
                    displacement += abs(token_pos[token_id]-prev_pos);
                    prev_pos = token_pos[token_id];
                }
            }
        }

        D(cout << endl << "!!!displacement: " << displacement << " | num_match: " << num_match << endl);

        // Track the best `displacement` and `num_match` seen so far across all the windows
        if(num_match >= max_match) {
            max_match = num_match;
            if(displacement != 0 && displacement < min_displacement) {
                min_displacement = displacement;
            }
        }

        // As we slide the window, drop the first token of the window from the computation
        token_pos[q.front().token_id] = 0;
        q.pop();
    } while(!heap.empty());

    return MatchScore{max_match, min_displacement};
}
include/string_utils.h (new file, 44 lines)
@@ -0,0 +1,44 @@
#pragma once

#include <string>

struct StringUtils {

    template<class ContainerT>
    static void tokenize(const std::string &str, ContainerT &tokens,
                         const std::string &delimiters = " ", bool trimEmpty = false) {
        std::string::size_type pos, lastPos = 0;

        using value_type = typename ContainerT::value_type;
        using size_type = typename ContainerT::size_type;

        while (true) {
            pos = str.find_first_of(delimiters, lastPos);
            if (pos == std::string::npos) {
                pos = str.length();

                if (pos != lastPos || !trimEmpty)
                    tokens.push_back(value_type(str.data() + lastPos,
                                                (size_type) pos - lastPos));

                break;
            }
            else {
                if (pos != lastPos || !trimEmpty)
                    tokens.push_back(value_type(str.data() + lastPos,
                                                (size_type) pos - lastPos));
            }

            lastPos = pos + 1;
        }
    }

    static std::string replace_all(std::string str, const std::string &from, const std::string &to) {
        size_t start_pos = 0;
        while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
            str.replace(start_pos, from.length(), to);
            start_pos += to.length(); // Handles case where 'to' is a substring of 'from'
        }
        return str;
    }
};
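A small, hypothetical usage sketch (not part of this commit) of the two helpers: tokenize splits on a delimiter set into any container of string-like values, and replace_all rewrites every occurrence of a substring, mirroring how main.cpp now cleans each TSV line.

// Hypothetical usage sketch (not part of this commit) for StringUtils.
#include <iostream>
#include <string>
#include <vector>
#include "string_utils.h"

int main() {
    std::vector<std::string> tokens;
    // Split on spaces, dropping empty tokens (trimEmpty = true).
    StringUtils::tokenize("a typo  tolerant search", tokens, " ", true);
    for(const std::string & token: tokens) {
        std::cout << token << std::endl;   // a / typo / tolerant / search
    }

    // Strip double quotes, as main.cpp does for each input line.
    std::string cleaned = StringUtils::replace_all("say \"hello\"", "\"", "");
    std::cout << cleaned << std::endl;     // say hello
    return 0;
}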
include/topster.h
@@ -5,10 +5,11 @@
#include <cstdio>
#include <algorithm>

/*
* A bounded max heap that remembers the top-K elements seen so far
*/
template <size_t MAX_SIZE=100>
struct Topster {
    // A bounded max heap that remembers the top-K elements seen so far

    uint64_t data[MAX_SIZE];
    uint32_t smallest_index = 0;
    uint32_t size = 0;

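For context, a brief hypothetical sketch (not part of this commit) of how Topster is used elsewhere in this commit (see benchmark_heap_array and SearchIndex::search later in the diff): add (key, score) pairs, sort, then read the keys back in ranked order. The literal values below are illustrative.

// Hypothetical usage sketch (not part of this commit) for the Topster bounded max heap.
#include <iostream>
#include "topster.h"

int main() {
    Topster<100> topster;    // keeps at most the top 100 entries seen so far

    topster.add(7, 250);     // add(key, score)
    topster.add(3, 900);
    topster.add(9, 120);

    topster.sort();          // order the retained entries

    for(uint32_t i = 0; i < topster.size; i++) {
        std::cout << topster.getKeyAt(i) << std::endl;   // keys in ranked order
    }
    return 0;
}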
include/util.h (deleted, 36 lines)
@@ -1,36 +0,0 @@
#pragma once

#include <string>

template < class ContainerT >
void tokenize(const std::string& str, ContainerT& tokens,
              const std::string& delimiters = " ", bool trimEmpty = false)
{
    std::string::size_type pos, lastPos = 0;

    using value_type = typename ContainerT::value_type;
    using size_type = typename ContainerT::size_type;

    while(true)
    {
        pos = str.find_first_of(delimiters, lastPos);
        if(pos == std::string::npos)
        {
            pos = str.length();

            if(pos != lastPos || !trimEmpty)
                tokens.push_back(value_type(str.data()+lastPos,
                                            (size_type)pos-lastPos ));

            break;
        }
        else
        {
            if(pos != lastPos || !trimEmpty)
                tokens.push_back(value_type(str.data()+lastPos,
                                            (size_type)pos-lastPos ));
        }

        lastPos = pos + 1;
    }
}
src/main.cpp (267 lines changed)
@@ -1,225 +1,20 @@
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <chrono>
#include <vector>
#include <cstdlib>
#include <numeric>
#include <time.h>
#include <art.h>
#include <unordered_map>
#include "topster.h"
#include "intersection.h"
#include "matchscore.h"
#include "util.h"
#include "string_utils.h"
#include "crow_all.h"
#include "search_index.h"

using namespace std;

static int test_prefix_cb(void *data, const unsigned char *k, uint32_t k_len, void *val) {
    cout << "#>>>>Key: ";
    printf("%.*s", k_len, k);
    cout << "LENGTH OF IDS: " << ((art_values*)val)->ids.getLength() << endl;

    for(uint32_t i=0; i<((art_values*)val)->ids.getLength(); i++) {
        cout << ", ID: " << ((art_values*)val)->ids.at(i) << endl;
    }
    return 0;
}

void benchmark_heap_array() {
    srand (time(NULL));

    vector<uint32_t> records;

    for(uint32_t i=0; i<10000000; i++) {
        records.push_back((const unsigned int &) rand());
    }

    vector<uint32_t> hits;

    for(uint32_t i=0; i<records.size(); i++) {
        if(i%10 == 0) {
            hits.push_back(i);
        }
    }

    auto begin = std::chrono::high_resolution_clock::now();

    Topster<4000> heapArray;

    for(uint32_t i=0; i<hits.size(); i++) {
        heapArray.add(i, records[hits[i]]);
    }

    std::sort(std::begin(heapArray.data), std::end(heapArray.data));

    long long int timeMillis = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - begin).count();

    for(uint32_t i=0; i<heapArray.size; i++) {
        cout << "Res: " << heapArray.data[i] << endl;
    }

    cout << "Time taken: " << timeMillis << endl;
}

void index_document(art_tree& t, uint32_t doc_id, vector<string> tokens, uint16_t score) {
    unordered_map<string, vector<uint32_t>> token_to_offsets;

    for(uint32_t i=0; i<tokens.size(); i++) {
        auto token = tokens[i];
        std::transform(token.begin(), token.end(), token.begin(), ::tolower);
        token_to_offsets[token].push_back(i);
    }

    for(auto & kv: token_to_offsets) {
        art_document document;
        document.id = doc_id;
        document.score = score;
        document.offsets_len = (uint32_t) kv.second.size();
        document.offsets = new uint32_t[kv.second.size()];

        uint32_t num_hits = document.offsets_len;
        art_leaf* leaf = (art_leaf *) art_search(&t, (const unsigned char *) kv.first.c_str(), (int) kv.first.length());
        if(leaf != NULL) {
            num_hits += leaf->token_count;
        }

        for(auto i=0; i<kv.second.size(); i++) {
            document.offsets[i] = kv.second[i];
        }

        art_insert(&t, (const unsigned char *) kv.first.c_str(), (int) kv.first.length(), &document, num_hits);
        delete document.offsets;
    }
}

/*
   1. Split q into tokens
   2. For each token, look up ids using exact lookup
      a. If a token has no result, try again with edit distance of 1, and then 2
   3. Do a limited cartesian product of the word suggestions for each token to form possible corrected search phrases
      (adapted from: http://stackoverflow.com/a/31169617/131050)
   4. Intersect the lists to find docs that match each phrase
   5. Sort the docs based on some ranking criteria
*/
void find_documents(art_tree & t, unordered_map<uint32_t, uint16_t>& docscores, string query, size_t max_results) {
    vector<string> tokens;
    tokenize(query, tokens, " ", true);

    vector<vector<art_leaf*>> token_leaves;
    for(string token: tokens) {
        vector<art_leaf*> leaves;
        int max_cost = 2;
        art_iter_fuzzy_prefix(&t, (const unsigned char *) token.c_str(), (int) token.length(), max_cost, 10, leaves);
        if(!leaves.empty()) {
            for(auto i=0; i<leaves.size(); i++) {
                //printf("%s - ", token.c_str());
                //printf("%.*s", leaves[i]->key_len, leaves[i]->key);
                //printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->token_count);
            }
            token_leaves.push_back(leaves);
        }
    }

    Topster<100> topster;
    size_t total_results = 0;
    const size_t combination_limit = 10;
    auto product = []( long long a, vector<art_leaf*>& b ) { return a*b.size(); };
    long long int N = accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );

    for(long long n=0; n<N && n<combination_limit; ++n) {
        // every element in vector `query_suggestion` represents a token and its associated hits
        vector<art_leaf*> query_suggestion(token_leaves.size());

        // generate the next combination from `token_leaves` and store it in `query_suggestion`
        ldiv_t q { n, 0 };
        for( long long i=token_leaves.size()-1 ; 0<=i ; --i ) {
            q = div(q.quot, token_leaves[i].size());
            query_suggestion[i] = token_leaves[i][q.rem];
        }

        // sort ascending based on matched documents for each token to perform effective intersection
        sort(query_suggestion.begin(), query_suggestion.end(), [](const art_leaf* left, const art_leaf* right) {
            return left->values->ids.getLength() < right->values->ids.getLength();
        });

        // initialize results with the starting element (for further intersection)
        uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
        size_t result_size = query_suggestion[0]->values->ids.getLength();

        if(result_size == 0) continue;

        // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
        for(auto i=1; i < query_suggestion.size(); i++) {
            uint32_t* out = new uint32_t[result_size];
            uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
            result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
            delete result_ids;
            delete curr;
            result_ids = out;
        }

        //cout << "2result_size: " << result_size << endl;

        // go through each matching document id and calculate match score
        for(auto i=0; i<result_size; i++) {
            uint32_t doc_id = result_ids[i];
            std::vector<std::vector<uint16_t>> token_positions;

            // for each token in the query, find the positions that it appears in this document
            for (art_leaf *token_leaf : query_suggestion) {
                vector<uint16_t> positions;
                uint32_t doc_index = token_leaf->values->ids.indexOf(doc_id);
                uint32_t offset_index = token_leaf->values->offset_index.at(doc_index);
                uint32_t num_offsets = token_leaf->values->offsets.at(offset_index);
                for (auto offset_count = 1; offset_count <= num_offsets; offset_count++) {
                    positions.push_back((uint16_t) token_leaf->values->offsets.at(offset_index + offset_count));
                }
                token_positions.push_back(positions);
            }

            MatchScore mscore = match_score(doc_id, token_positions);
            const uint32_t cumulativeScore = ((uint32_t)(mscore.words_present * 16 + (20 - mscore.distance)) * 64000) + docscores[doc_id];

            // cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: " << (int)mscore.distance << " - mscore.words_present: " << (int)mscore.words_present
            //      << " - docscores[doc_id]: " << (int)docscores[doc_id] << " - cumulativeScore: " << cumulativeScore << endl;
            topster.add(doc_id, cumulativeScore);
        }

        total_results += result_size;
        delete result_ids;

        if(total_results >= max_results) break;
    }

    topster.sort();

    //cout << "RESULTS: " << endl << endl;

    for(uint32_t i=0; i<topster.size; i++) {
        uint32_t id = topster.getKeyAt(i);
        cout << "ID: " << id << endl;
    }

    //cin.get();
}

std::string ReplaceAll(std::string str, const std::string& from, const std::string& to) {
    size_t start_pos = 0;
    while((start_pos = str.find(from, start_pos)) != std::string::npos) {
        str.replace(start_pos, from.length(), to);
        start_pos += to.length(); // Handles case where 'to' is a substring of 'from'
    }
    return str;
}

int main() {
    art_tree t;
    art_tree_init(&t);
    SearchIndex *index = new SearchIndex();

    unordered_map<uint32_t, uint16_t> docscores;

    // std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt");
    //std::ifstream infile("/Users/kishore/others/wreally/typesense/test/documents.txt");
    std::ifstream infile("/Users/kishore/Downloads/hnstories.tsv");

    std::string line;
@@ -227,63 +22,23 @@ int main() {

    while (std::getline(infile, line)) {
        vector<string> parts;
        tokenize(line, parts, "\t", true);
        line = ReplaceAll(line, "\"", "");
        StringUtils::tokenize(line, parts, "\t", true);
        line = StringUtils::replace_all(line, "\"", "");

        vector<string> tokens;
        tokenize(parts[0], tokens, " ", true);
        StringUtils::tokenize(parts[0], tokens, " ", true);

        if(parts.size() != 2) continue;

        if(doc_id == 857622 || doc_id == 52838 || doc_id == 56961) {
            cout << "Doc " << doc_id << ": " << line << endl;
        }

        //cout << "Doc " << doc_id << ": " << line << endl;

        docscores[doc_id] = (uint16_t) stoi(parts[1]);
        index_document(t, doc_id, tokens, stoi(parts[1]));
        index->add(doc_id, tokens, stoi(parts[1]));
        doc_id++;
    }

    cout << "FINISHED INDEXING!" << endl << flush;

    /*const unsigned char *prefix = (const unsigned char *) "the";
    size_t prefix_len = strlen((const char *) prefix);
    std::vector<art_leaf*> results;

    auto begin = std::chrono::high_resolution_clock::now();
    art_iter_fuzzy_prefix(&t, prefix, prefix_len, 0, 2, results);
    index->search("thei rserch", 100);
    long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();

    art_iter_prefix(&t, prefix, strlen((const char *) prefix), test_prefix_cb, NULL);
    art_iter(&t, test_prefix_cb, NULL);

    cout << "Time taken: " << timeMillis << "us" << endl;

    for(auto leaf: results) {
        std::cout << ">>>>/Key: " << leaf->key << " - score: " << leaf->score << std::endl;
        for(uint32_t i=0; i<leaf->values->ids.getLength(); i++) {
            std::cout << ", ID: " << leaf->values->ids.at(i) << std::endl;
        }
        std::cout << ", Value: " << leaf->values->ids.at(0) << std::endl;
    }*/

    auto begin = std::chrono::high_resolution_clock::now();
    find_documents(t, docscores, "thei rserch", 10);
    long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();

    // string token = "nternet";
    // vector<art_leaf*> leaves;
    //
    // art_iter_fuzzy_prefix(&t, (const unsigned char *) token.c_str(), (int) token.length(), 1, 10, leaves);
    // for(auto leaf: leaves) {
    //     printf("Word: %.*s", leaf->key_len, leaf->key);
    //     cout << " - score: " << leaf->token_count << endl;
    // }

    cout << "Time taken: " << timeMillis << "us" << endl;

    art_tree_destroy(&t);
    delete index;
    return 0;
}
src/search_index.cpp (new file, 159 lines)
@@ -0,0 +1,159 @@
#include "search_index.h"

#include <iostream>
#include <numeric>
#include <topster.h>
#include <intersection.h>
#include <match_score.h>
#include <string_utils.h>

SearchIndex::SearchIndex() {
    art_tree_init(&t);
}

SearchIndex::~SearchIndex() {
    art_tree_destroy(&t);
}

void SearchIndex::add(uint32_t doc_id, std::vector<std::string> tokens, uint16_t score) {
    std::unordered_map<std::string, std::vector<uint32_t>> token_to_offsets;

    for(uint32_t i=0; i<tokens.size(); i++) {
        auto token = tokens[i];
        std::transform(token.begin(), token.end(), token.begin(), ::tolower);
        token_to_offsets[token].push_back(i);
    }

    for(auto & kv: token_to_offsets) {
        art_document document;
        document.id = doc_id;
        document.score = score;
        document.offsets_len = (uint32_t) kv.second.size();
        document.offsets = new uint32_t[kv.second.size()];

        uint32_t num_hits = document.offsets_len;
        art_leaf* leaf = (art_leaf *) art_search(&t, (const unsigned char *) kv.first.c_str(), (int) kv.first.length());
        if(leaf != NULL) {
            num_hits += leaf->token_count;
        }

        for(auto i=0; i<kv.second.size(); i++) {
            document.offsets[i] = kv.second[i];
        }

        art_insert(&t, (const unsigned char *) kv.first.c_str(), (int) kv.first.length(), &document, num_hits);
        delete document.offsets;
    }

    doc_scores[doc_id] = score;
}

/*
   1. Split q into tokens
   2. For each token, look up ids using exact lookup
      a. If a token has no result, try again with edit distance of 1, and then 2
   3. Do a limited cartesian product of the word suggestions for each token to form possible corrected search phrases
      (adapted from: http://stackoverflow.com/a/31169617/131050)
   4. Intersect the lists to find docs that match each phrase
   5. Sort the docs based on some ranking criteria
*/
void SearchIndex::search(std::string query, size_t max_results) {
    std::vector<std::string> tokens;
    StringUtils::tokenize(query, tokens, " ", true);

    std::vector<std::vector<art_leaf*>> token_leaves;
    for(std::string token: tokens) {
        std::vector<art_leaf*> leaves;
        int max_cost = 2;
        art_iter_fuzzy_prefix(&t, (const unsigned char *) token.c_str(), (int) token.length(), max_cost, 10, leaves);
        if(!leaves.empty()) {
            for(auto i=0; i<leaves.size(); i++) {
                //printf("%s - ", token.c_str());
                //printf("%.*s", leaves[i]->key_len, leaves[i]->key);
                //printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->token_count);
            }
            token_leaves.push_back(leaves);
        }
    }

    Topster<100> topster;
    size_t total_results = 0;
    const size_t combination_limit = 10;
    auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
    long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );

    for(long long n=0; n<N && n<combination_limit; ++n) {
        // every element in vector `query_suggestion` represents a token and its associated hits
        std::vector<art_leaf*> query_suggestion(token_leaves.size());

        // generate the next combination from `token_leaves` and store it in `query_suggestion`
        ldiv_t q { n, 0 };
        for(long long i=token_leaves.size()-1 ; 0<=i ; --i ) {
            q = ldiv(q.quot, token_leaves[i].size());
            query_suggestion[i] = token_leaves[i][q.rem];
        }

        // sort ascending based on matched documents for each token to perform effective intersection
        sort(query_suggestion.begin(), query_suggestion.end(), [](const art_leaf* left, const art_leaf* right) {
            return left->values->ids.getLength() < right->values->ids.getLength();
        });

        // initialize results with the starting element (for further intersection)
        uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
        size_t result_size = query_suggestion[0]->values->ids.getLength();

        if(result_size == 0) continue;

        // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
        for(auto i=1; i < query_suggestion.size(); i++) {
            uint32_t* out = new uint32_t[result_size];
            uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
            result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
            delete result_ids;
            delete curr;
            result_ids = out;
        }

        //cout << "2result_size: " << result_size << endl;

        // go through each matching document id and calculate match score
        for(auto i=0; i<result_size; i++) {
            uint32_t doc_id = result_ids[i];
            std::vector<std::vector<uint16_t>> token_positions;

            // for each token in the query, find the positions that it appears in this document
            for (art_leaf *token_leaf : query_suggestion) {
                std::vector<uint16_t> positions;
                uint32_t doc_index = token_leaf->values->ids.indexOf(doc_id);
                uint32_t offset_index = token_leaf->values->offset_index.at(doc_index);
                uint32_t num_offsets = token_leaf->values->offsets.at(offset_index);
                for (auto offset_count = 1; offset_count <= num_offsets; offset_count++) {
                    positions.push_back((uint16_t) token_leaf->values->offsets.at(offset_index + offset_count));
                }
                token_positions.push_back(positions);
            }

            MatchScore mscore = MatchScore::match_score(doc_id, token_positions);
            const uint32_t cumulativeScore = ((uint32_t)(mscore.words_present * 16 + (20 - mscore.distance)) * 64000) + doc_scores[doc_id];

            // cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: " << (int)mscore.distance << " - mscore.words_present: " << (int)mscore.words_present
            //      << " - docscores[doc_id]: " << (int)docscores[doc_id] << " - cumulativeScore: " << cumulativeScore << endl;
            topster.add(doc_id, cumulativeScore);
        }

        total_results += result_size;
        delete result_ids;

        if(total_results >= max_results) break;
    }

    topster.sort();

    //cout << "RESULTS: " << endl << endl;

    for(uint32_t i=0; i<topster.size; i++) {
        uint32_t id = topster.getKeyAt(i);
        std::cout << "ID: " << id << std::endl;
    }
}
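As a worked example of the cumulativeScore ranking used above (the values are illustrative, not from the commit): with mscore.words_present = 2, mscore.distance = 1 and doc_scores[doc_id] = 500, the score is (2 * 16 + (20 - 1)) * 64000 + 500 = 51 * 64000 + 500 = 3,264,500. Token coverage dominates, proximity breaks ties within the same coverage, and the per-document static score contributes last.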
src/search_index.h (new file, 18 lines)
@@ -0,0 +1,18 @@
#pragma once

#include <string>
#include <vector>
#include <art.h>
#include <unordered_map>

class SearchIndex {
private:
    art_tree t;
    std::unordered_map<uint32_t, uint16_t> doc_scores;
public:
    SearchIndex();
    ~SearchIndex();
    void add(uint32_t doc_id, std::vector<std::string> tokens, uint16_t score);
    void search(std::string query, size_t max_results);
};
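To tie the new class together, a minimal hypothetical driver (not part of this commit; the document data is made up) that mirrors what main.cpp now does: add a few scored documents, then run a typo-laden query. search() prints the matching document IDs in ranked order.

// Hypothetical usage sketch (not part of this commit) for SearchIndex.
#include <string>
#include <vector>
#include "search_index.h"

int main() {
    SearchIndex *index = new SearchIndex();

    // add(doc_id, tokens, score): tokens are lowercased and indexed with their positions,
    // score is a per-document static rank blended into the match score.
    index->add(0, {"the", "quick", "brown", "fox"}, 15);
    index->add(1, {"their", "research", "paper"}, 30);

    // Fuzzy prefix search (edit distance up to 2 per token); prints ranked doc IDs.
    index->search("thei rserch", 10);

    delete index;
    return 0;
}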