Mirror of https://github.com/typesense/typesense.git (synced 2025-05-20 21:52:23 +08:00)

Commit e7c6c6d3cb (parent 2f26b95c5b): Fixed multi word queries.

TODO.md (8 changed lines)
@@ -5,7 +5,7 @@
 **Search index**
 
 - ~~Proper JSON as input~~
-- Storing raw JSON input to RocksDB
+- ~~Storing raw JSON input to RocksDB~~
 - ART for every indexed field
 - UTF-8 support for fuzzy search
 - Facets
@@ -29,4 +29,8 @@
 **Refactoring**
 
 - ~~`token_count` in leaf is redundant: can be accessed from value~~
-- ~~storing length in `offsets` is redundant: it can be found by looking up value of the next index in offset_index~~
+- ~~storing length in `offsets` is redundant: it can be found by looking up value of the next index in offset_index~~
+
+**Tech debt**
+
+- Use GLOB file pattern for CMake (better IDE refactoring support)

@@ -12,96 +12,110 @@
 #define D(x)
 #endif
 
-struct MatchScore {
-  struct TokenPosition {
-    uint8_t token_id;         // token identifier
-    uint16_t position;        // token's position in the text
-    uint16_t position_index;  // index of the position in the vector
+#define TokenOffsetHeap std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset>
 
-    bool operator() (const TokenPosition& a, const TokenPosition& b) {
-      return a.position > b.position;
+struct MatchScore {
+  struct TokenOffset {
+    uint8_t token_id;         // token identifier
+    uint16_t offset;          // token's offset in the text
+    uint16_t offset_index;    // index of the offset in the vector
+
+    bool operator() (const TokenOffset& a, const TokenOffset& b) {
+      return a.offset > b.offset;
     }
   };
 
-  #define addTopOfHeapToWindow(heap,q,token_positions,token_pos) {\
-    TokenPosition top = heap.top();\
-    heap.pop();\
-    q.push(top);\
-    token_pos[top.token_id] = top.position; \
-    top.position_index++;\
-    /* Must refill the heap - push the next position of the same token */\
-    if(top.position_index < token_positions[top.token_id].size()) {\
-      heap.push(TokenPosition{top.token_id, token_positions[top.token_id][top.position_index], top.position_index});\
-    }\
+  static void print_token_offsets(std::vector<std::vector<uint16_t>> &token_offsets) {
+    for(auto offsets: token_offsets) {
+      for(auto offset: offsets) {
+        std::cout << offset << ", ";
+      }
+      std::cout << std::endl;
+    }
+  }
+
+  static inline void addTopOfHeapToWindow(TokenOffsetHeap &heap, std::queue<TokenOffset> &window,
+                                          std::vector<std::vector<uint16_t>> &token_offsets, uint16_t *token_offset) {
+    TokenOffset top = heap.top();
+    heap.pop();
+    window.push(top);
+    token_offset[top.token_id] = top.offset;
+    top.offset_index++;
+
+    // Must refill the heap - push the next offset of the same token
+    if(top.offset_index < token_offsets[top.token_id].size()) {
+      heap.push(TokenOffset{top.token_id, token_offsets[top.token_id][top.offset_index], top.offset_index});
+    }
   }
 
   uint16_t words_present;
   uint16_t distance;
 
   /*
-  * Given *sorted positions* of each target token in a *single* document, generates a score that indicates:
+  * Given *sorted offsets* of each target token in a *single* document, generates a score that indicates:
   * a) How many tokens are present in the document
   * b) The proximity between the tokens in the document
   *
-  * We use a priority queue to read the position vectors in a sorted manner, slide a window of a given size, and
+  * We use a priority queue to read the offset vectors in a sorted manner, slide a window of a given size, and
   * compute the max_match and min_displacement of target tokens across the windows.
   */
-  static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_positions) {
+  static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
     const size_t WINDOW_SIZE = 20;
     const size_t MAX_TOKENS_IN_A_QUERY = 20;
-    const uint16_t MAX_UINT_16 = std::numeric_limits<uint16_t>::max();
+    const uint16_t MAX_DISPLACEMENT = 20;
 
-    std::priority_queue<TokenPosition, std::vector<TokenPosition>, TokenPosition> heap;
+    std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
 
-    for(uint8_t token_id=0; token_id < token_positions.size(); token_id++) {
-      heap.push(TokenPosition{token_id, token_positions[token_id].front(), 0});
+    for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
+      heap.push(TokenOffset{token_id, token_offsets[token_id].front(), 0});
     }
 
-    // heap now contains the first occurring position of each token in the given document
+    // heap now contains the first occurring offset of each token in the given document
 
     uint16_t max_match = 1;
-    uint16_t min_displacement = UINT16_MAX;
+    uint16_t min_displacement = MAX_DISPLACEMENT;
 
-    std::queue<TokenPosition> q;
-    uint16_t token_pos[MAX_TOKENS_IN_A_QUERY] = { };
-    std::fill_n(token_pos, MAX_TOKENS_IN_A_QUERY, MAX_UINT_16);
+    std::queue<TokenOffset> window;
+    uint16_t token_offset[MAX_TOKENS_IN_A_QUERY] = { };
+    std::fill_n(token_offset, MAX_TOKENS_IN_A_QUERY, MAX_DISPLACEMENT);
 
     do {
-      if(q.empty()) {
-        addTopOfHeapToWindow(heap, q, token_positions, token_pos);
+      if(window.empty()) {
+        addTopOfHeapToWindow(heap, window, token_offsets, token_offset);
      }
 
-      D(cout << "Loop till window fills..." << endl;)
+      D(std::cout << "Loop till window fills... doc_id: " << doc_id << std::endl;)
 
-      // Fill the queue with tokens within a given window frame size of the start position
+      // Fill the queue with tokens within a given window frame size of the start offset
       // At the same time, we also record the *last* occurrence of each token within the window
-      // For e.g. if `cat` appeared at positions 1,3 and 5, we will record `token_pos[cat] = 5`
-      const uint16_t start_pos = q.front().position;
-      while(!heap.empty() && heap.top().position < start_pos+WINDOW_SIZE) {
-        addTopOfHeapToWindow(heap, q, token_positions, token_pos);
+      // For e.g. if `cat` appeared at offsets 1,3 and 5, we will record `token_offset[cat] = 5`
+      const uint16_t start_offset = window.front().offset;
+      while(!heap.empty() && heap.top().offset < start_offset+WINDOW_SIZE) {
+        addTopOfHeapToWindow(heap, window, token_offsets, token_offset);
      }
 
-      D(cout << endl << "----" << endl);
+      D(std::cout << std::endl << "----" << std::endl);
 
-      uint16_t prev_pos = MAX_UINT_16;
+      uint16_t prev_pos = MAX_DISPLACEMENT;
       uint16_t num_match = 0;
       uint16_t displacement = 0;
 
-      for(size_t token_id=0; token_id<token_positions.size(); token_id++) {
-        // If a token appeared within the window, we would have recorded its position
-        if(token_pos[token_id] != MAX_UINT_16) {
+      for(size_t token_id=0; token_id<token_offsets.size(); token_id++) {
+        // If a token appeared within the window, we would have recorded its offset
+        if(token_offset[token_id] != MAX_DISPLACEMENT) {
          num_match++;
-          if(prev_pos == MAX_UINT_16) prev_pos = token_pos[token_id];
+          if(prev_pos == MAX_DISPLACEMENT) prev_pos = token_offset[token_id];
          else {
            // Calculate the distance between the tokens within the window
            // Ideally, this should be (NUM_TOKENS - 1) when all the tokens are adjacent to each other
-            D(cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_pos[token_id] << endl);
-            displacement += abs(token_pos[token_id]-prev_pos);
-            prev_pos = token_pos[token_id];
+            D(std::cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_offset[token_id] << std::endl);
+            displacement += abs(token_offset[token_id]-prev_pos);
+            prev_pos = token_offset[token_id];
          }
        }
      }
 
-      D(cout << endl << "!!!displacement: " << displacement << " | num_match: " << num_match << endl);
+      D(std::cout << std::endl << "!!!displacement: " << displacement << " | num_match: " << num_match << std::endl);
 
      // Track the best `displacement` and `num_match` seen so far across all the windows
      if(num_match >= max_match) {
@@ -112,8 +126,8 @@ struct MatchScore {
      }
 
      // As we slide the window, drop the first token of the window from the computation
-      token_pos[q.front().token_id] = 0;
-      q.pop();
+      token_offset[window.front().token_id] = 0;
+      window.pop();
    } while(!heap.empty());
 
    return MatchScore{max_match, min_displacement};
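
The window-scoring change above is easier to follow on a concrete input. Below is a minimal, self-contained sketch of the same sliding-window idea — it is not the typesense implementation; `score_window` and the sample offsets are invented for illustration. For a query whose two tokens occur at offsets {1, 10} and {12, 19}, the best window of size 20 contains both tokens, and the gap between their last occurrences inside that window is 9.

    #include <cstdint>
    #include <cstdlib>
    #include <iostream>
    #include <vector>

    // score_window: hypothetical brute-force illustration of the proximity score
    // that MatchScore::match_score computes with a priority queue above.
    static void score_window(const std::vector<std::vector<uint16_t>> &token_offsets,
                             uint16_t window_size, uint16_t &best_match, uint16_t &best_distance) {
        best_match = 1;
        best_distance = window_size;  // plays the role of MAX_DISPLACEMENT above

        for(const auto &offsets : token_offsets) {
            for(uint16_t start : offsets) {
                // For the window [start, start + window_size), record the last
                // occurrence of every token and sum the gaps between tokens.
                uint16_t num_match = 0, displacement = 0, prev = 0;
                bool first = true;
                for(const auto &other : token_offsets) {
                    bool found = false;
                    uint16_t last = 0;
                    for(uint16_t off : other) {
                        if(off >= start && off < start + window_size) { last = off; found = true; }
                    }
                    if(!found) continue;
                    num_match++;
                    if(!first) displacement += (uint16_t) std::abs((int) last - (int) prev);
                    first = false;
                    prev = last;
                }
                // Keep the window with the most tokens; break ties on proximity.
                if(num_match > best_match || (num_match == best_match && displacement < best_distance)) {
                    best_match = num_match;
                    best_distance = displacement;
                }
            }
        }
    }

    int main() {
        // Token 0 occurs at offsets 1 and 10, token 1 at offsets 12 and 19.
        std::vector<std::vector<uint16_t>> token_offsets = {{1, 10}, {12, 19}};
        uint16_t words_present = 0, distance = 0;
        score_window(token_offsets, 20, words_present, distance);
        std::cout << (int) words_present << " words, distance " << (int) distance << std::endl;  // 2 words, distance 9
        return 0;
    }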

@@ -7,6 +7,7 @@
 #include <match_score.h>
 #include <string_utils.h>
 #include "sole.hpp"
+#include "art.h"
 #include "json.hpp"
 
 Collection::Collection(std::string state_dir_path): seq_id(0) {
@@ -85,80 +86,90 @@ void Collection::add(std::string json_str) {
    4. Intersect the lists to find docs that match each phrase
    5. Sort the docs based on some ranking criteria
 */
-std::vector<nlohmann::json> Collection::search(std::string query, const int num_typos, const size_t max_results) {
+std::vector<nlohmann::json> Collection::search(std::string query, const int num_typos, const size_t num_results) {
    std::vector<std::string> tokens;
    StringUtils::tokenize(query, tokens, " ", true);
 
    const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
+    const size_t max_results = std::min(num_results, (size_t) 100);
 
-    std::cout << "Searching with max_cost=" << max_cost << std::endl;
-
-    std::vector<std::vector<art_leaf*>> token_leaves;
-    for(std::string token: tokens) {
-        std::vector<art_leaf*> leaves;
-        art_fuzzy_results(&t, (const unsigned char *) token.c_str(), (int) token.length() + 1, max_cost, 10, leaves);
-        if(!leaves.empty()) {
-            for(auto i=0; i<leaves.size(); i++) {
-                //printf("%s - ", token.c_str());
-                //printf("%.*s", leaves[i]->key_len, leaves[i]->key);
-                //printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->values->ids.getLength());
-            }
-            token_leaves.push_back(leaves);
-        }
-    }
-
-    if(token_leaves.size() == 0) {
-        return std::vector<nlohmann::json>();
-    }
-
-    //std::cout << "token_leaves.size = " << token_leaves.size() << std::endl;
-
-    Topster<100> topster;
+    int cost = 0;
    size_t total_results = 0;
-    const size_t combination_limit = 10;
-    auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
-    long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
-
-    for(long long n=0; n<N && n<combination_limit; ++n) {
-        // every element in `query_suggestion` represents a token and its associated hits
-        std::vector<art_leaf *> query_suggestion = _next_suggestion(token_leaves, n);
-
-        // initialize results with the starting element (for further intersection)
-        uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
-        size_t result_size = query_suggestion[0]->values->ids.getLength();
-
-        if(result_size == 0) continue;
-
-        // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
-        for(auto i=1; i < query_suggestion.size(); i++) {
-            uint32_t* out = new uint32_t[result_size];
-            uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
-            result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
-            delete result_ids;
-            delete curr;
-            result_ids = out;
-        }
-
-        // go through each matching document id and calculate match score
-        score_results(topster, query_suggestion, result_ids, result_size);
-
-        total_results += result_size;
-        delete result_ids;
-
-        if(total_results >= max_results) break;
-    }
-
-    topster.sort();
 
    std::vector<nlohmann::json> results;
 
-    for(uint32_t i=0; i<topster.size; i++) {
-        uint32_t id = topster.getKeyAt(i);
-        std::cout << "ID: " << id << std::endl;
+    while(cost <= max_cost) {
+        std::cout << "Searching with cost=" << cost << std::endl;
 
-        const std::string value = store->get(std::to_string(id));
-        nlohmann::json document = nlohmann::json::parse(value);
-        results.push_back(document);
+        std::vector<std::vector<art_leaf*>> token_leaves;
+        for(std::string token: tokens) {
+            std::vector<art_leaf*> leaves;
+            art_fuzzy_results(&t, (const unsigned char *) token.c_str(), (int) token.length() + 1, cost, 3, leaves);
+            if(!leaves.empty()) {
+                for(auto i=0; i<leaves.size(); i++) {
+                    //printf("%s - ", token.c_str());
+                    //printf("%.*s", leaves[i]->key_len, leaves[i]->key);
+                    //printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->values->ids.getLength());
+                }
+                token_leaves.push_back(leaves);
+            }
+        }
+
+        if(token_leaves.size() != tokens.size()) {
+            //std::cout << "token_leaves.size() != tokens.size(), continuing..." << std::endl << std::endl;
+            cost++;
+            continue;
+        }
+
+        Topster<100> topster;
+        const size_t combination_limit = 10;
+        auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
+        long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
+
+        for(long long n=0; n<N && n<combination_limit; ++n) {
+            // every element in `query_suggestion` represents a token and its associated hits
+            std::vector<art_leaf *> query_suggestion = _next_suggestion(token_leaves, n);
+
+            // initialize results with the starting element (for further intersection)
+            uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
+            size_t result_size = query_suggestion[0]->values->ids.getLength();
+
+            if(result_size == 0) continue;
+
+            // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
+            for(auto i=1; i < query_suggestion.size(); i++) {
+                uint32_t* out = new uint32_t[result_size];
+                uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
+                result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
+                delete result_ids;
+                delete curr;
+                result_ids = out;
+            }
+
+            // go through each matching document id and calculate match score
+            score_results(topster, query_suggestion, result_ids, result_size);
+
+            total_results += result_size;
+            delete result_ids;
+
+            if(total_results >= max_results) break;
+        }
+
+        topster.sort();
+
+        for(uint32_t i=0; i<topster.size; i++) {
+            uint32_t id = topster.getKeyAt(i);
+            std::cout << "ID: " << id << std::endl;
+
+            const std::string value = store->get(std::to_string(id));
+            nlohmann::json document = nlohmann::json::parse(value);
+            results.push_back(document);
        }
-
+        if(total_results > 0) {
+            break;
+        }
+
+        cost++;
+    }
+
    return results;
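
The reworked search above retries the whole token lookup with a progressively larger edit distance until every query token matches, instead of querying once with the maximum cost. A minimal standalone sketch of that retry pattern follows; `lookup_with_cost` is hypothetical and merely stands in for the ART fuzzy lookup, it is not part of the typesense API.

    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for art_fuzzy_results: returns candidate words within
    // `cost` edits of `token`, or an empty vector when nothing matches.
    static std::vector<std::string> lookup_with_cost(const std::string& token, int cost) {
        if(token == "platn" && cost >= 1) return {"plant"};
        if(token == "growing") return {"growing"};
        return {};
    }

    static std::vector<std::string> search(const std::vector<std::string>& tokens, int max_cost) {
        // Start strict (cost 0) and only allow more typos when some token finds no match.
        for(int cost = 0; cost <= max_cost; cost++) {
            std::vector<std::string> matched;
            for(const std::string& token : tokens) {
                std::vector<std::string> candidates = lookup_with_cost(token, cost);
                if(!candidates.empty()) matched.push_back(candidates.front());
            }
            // Every query token must match before we stop escalating the cost —
            // this mirrors the `token_leaves.size() != tokens.size()` check in the patch.
            if(matched.size() == tokens.size()) return matched;
        }
        return {};
    }

    int main() {
        for(const std::string& word : search({"platn", "growing"}, 2)) {
            std::cout << word << std::endl;  // prints "plant" then "growing"
        }
        return 0;
    }
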

@@ -176,10 +187,10 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
        uint32_t doc_index = token_leaf->values->ids.indexOf(doc_id);
        uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
        uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
-                              (token_leaf->values->offsets.getLength() - 1) :
+                              token_leaf->values->offsets.getLength() :
                              token_leaf->values->offset_index.at(doc_index+1);
 
-        while(start_offset <= end_offset) {
+        while(start_offset < end_offset) {
            positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
            start_offset++;
        }
@@ -190,10 +201,12 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
        MatchScore mscore = MatchScore::match_score(doc_id, token_positions);
        const uint32_t cumulativeScore = ((uint32_t)(mscore.words_present * 16 + (20 - mscore.distance)) * 64000) + doc_scores.at(doc_id);
 
-        /*std::cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: "
-                   << (int)mscore.distance << " - mscore.words_present: " << (int)mscore.words_present
-                   << " - doc_scores[doc_id]: " << (int)doc_scores[doc_id] << " - cumulativeScore: "
-                   << cumulativeScore << std::endl;*/
+        /*
+        std::cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: "
+                  << (int) mscore.distance << " - mscore.words_present: " << (int) mscore.words_present
+                  << " - doc_scores[doc_id]: " << (int) doc_scores.at(doc_id) << " - cumulativeScore: "
+                  << cumulativeScore << std::endl;
+        */
 
        topster.add(doc_id, cumulativeScore);
    }
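
For reference, the cumulative score above weights the text match far above the stored document score, so a document matching more query words (or matching them closer together) outranks a higher-scored document that matches fewer. A small arithmetic sketch with made-up numbers:

    #include <cstdint>
    #include <iostream>

    int main() {
        // Hypothetical values, for illustration only: two query words found
        // close together in a document whose stored score is 10.
        uint32_t words_present = 2, distance = 1, doc_score = 10;
        uint32_t cumulativeScore = ((uint32_t)(words_present * 16 + (20 - distance)) * 64000) + doc_score;
        std::cout << cumulativeScore << std::endl;  // (2*16 + 19) * 64000 + 10 = 3264010
        return 0;
    }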

@@ -25,7 +25,7 @@ public:
    Collection(std::string state_dir_path);
    ~Collection();
    void add(std::string json_str);
-    std::vector<nlohmann::json> search(std::string query, const int num_typos, const size_t max_results);
+    std::vector<nlohmann::json> search(std::string query, const int num_typos, const size_t num_results);
 
    static inline std::vector<art_leaf *> _next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
                                                           long long int n);

@@ -26,7 +26,7 @@ int main() {
    cout << "FINISHED INDEXING!" << endl << flush;
 
    auto begin = std::chrono::high_resolution_clock::now();
-    collection->search("platn", 1, 100);
+    collection->search("platn growing", 1, 100);
    long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
    cout << "Time taken: " << timeMillis << "us" << endl;
    delete collection;

@@ -16,6 +16,7 @@
 #include <regex>
 #include "string_utils.h"
 #include "collection.h"
+#include <sys/resource.h>
 
 #include "h2o.h"
 #include "h2o/http1.h"
@@ -82,6 +83,10 @@ static int chunked_test(h2o_handler_t *self, h2o_req_t *req) {
 
    std::string json_str = json_array.dump();
 
+    struct rusage r_usage;
+    getrusage(RUSAGE_SELF,&r_usage);
+
+    std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl;
    std::cout << "JSON:" << json_str << std::endl;
 
    h2o_iovec_t body = h2o_strdup(&req->pool, json_str.c_str(), SIZE_MAX);
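
The memory logging added above prints ru_maxrss as a raw number. As a small reminder (not part of the patch, just a hedged sketch assuming POSIX getrusage), the unit of ru_maxrss differs by platform — kilobytes on Linux, bytes on macOS:

    #include <iostream>
    #include <sys/resource.h>

    int main() {
        struct rusage r_usage;
        getrusage(RUSAGE_SELF, &r_usage);
    #ifdef __APPLE__
        std::cout << "Peak RSS: " << r_usage.ru_maxrss / 1024 << " KB" << std::endl;  // ru_maxrss is in bytes on macOS
    #else
        std::cout << "Peak RSS: " << r_usage.ru_maxrss << " KB" << std::endl;         // ru_maxrss is in kilobytes on Linux
    #endif
        return 0;
    }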

@@ -1,4 +1,4 @@
-{"points":15,"title":"How are cryogenic rocket plan propellants delivered to the launch pad?"}
+{"points":15,"title":"How are cryogenic rocket plant propellants delivered to the growing launch pad?"}
 {"points":14,"title":"Are there any (free) are online data archives for data from instruments on Soviet / Russian missions?"}
 {"points":13,"title":"Where should I look in ISS to find mouldy food?"}
 {"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"}