Fixed multi-word queries.

Kishore Nallan 2016-09-12 09:54:34 +05:30
parent 2f26b95c5b
commit e7c6c6d3cb
7 changed files with 161 additions and 125 deletions

View File

@@ -5,7 +5,7 @@
**Search index**
- ~~Proper JSON as input~~
- Storing raw JSON input to RocksDB
- ~~Storing raw JSON input to RocksDB~~
- ART for every indexed field
- UTF-8 support for fuzzy search
- Facets
@@ -29,4 +29,8 @@
**Refactoring**
- ~~`token_count` in leaf is redundant: can be accessed from value~~
- ~~storing length in `offsets` is redundant: it can be found by looking up value of the next index in offset_index~~
**Tech debt**
- Use GLOB file pattern for CMake (better IDE refactoring support)

View File

@@ -12,96 +12,110 @@
#define D(x)
#endif
struct MatchScore {
struct TokenPosition {
uint8_t token_id; // token identifier
uint16_t position; // token's position in the text
uint16_t position_index; // index of the position in the vector
#define TokenOffsetHeap std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset>
bool operator() (const TokenPosition& a, const TokenPosition& b) {
return a.position > b.position;
struct MatchScore {
struct TokenOffset {
uint8_t token_id; // token identifier
uint16_t offset; // token's offset in the text
uint16_t offset_index; // index of the offset in the vector
bool operator() (const TokenOffset& a, const TokenOffset& b) {
return a.offset > b.offset;
}
};
#define addTopOfHeapToWindow(heap,q,token_positions,token_pos) {\
TokenPosition top = heap.top();\
heap.pop();\
q.push(top);\
token_pos[top.token_id] = top.position; \
top.position_index++;\
/* Must refill the heap - push the next position of the same token */\
if(top.position_index < token_positions[top.token_id].size()) {\
heap.push(TokenPosition{top.token_id, token_positions[top.token_id][top.position_index], top.position_index});\
}\
static void print_token_offsets(std::vector<std::vector<uint16_t>> &token_offsets) {
for(auto offsets: token_offsets) {
for(auto offset: offsets) {
std::cout << offset << ", ";
}
std::cout << std::endl;
}
}
static inline void addTopOfHeapToWindow(TokenOffsetHeap &heap, std::queue<TokenOffset> &window,
std::vector<std::vector<uint16_t>> &token_offsets, uint16_t *token_offset) {
TokenOffset top = heap.top();
heap.pop();
window.push(top);
token_offset[top.token_id] = top.offset;
top.offset_index++;
// Must refill the heap - push the next offset of the same token
if(top.offset_index < token_offsets[top.token_id].size()) {
heap.push(TokenOffset{top.token_id, token_offsets[top.token_id][top.offset_index], top.offset_index});
}
}
uint16_t words_present;
uint16_t distance;
/*
* Given *sorted positions* of each target token in a *single* document, generates a score that indicates:
* Given *sorted offsets* of each target token in a *single* document, generates a score that indicates:
* a) How many tokens are present in the document
* b) The proximity between the tokens in the document
*
* We use a priority queue to read the position vectors in a sorted manner, slide a window of a given size, and
* We use a priority queue to read the offset vectors in a sorted manner, slide a window of a given size, and
* compute the max_match and min_displacement of target tokens across the windows.
*/
static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_positions) {
static MatchScore match_score(uint32_t doc_id, std::vector<std::vector<uint16_t>> &token_offsets) {
const size_t WINDOW_SIZE = 20;
const size_t MAX_TOKENS_IN_A_QUERY = 20;
const uint16_t MAX_UINT_16 = std::numeric_limits<uint16_t>::max();
const uint16_t MAX_DISPLACEMENT = 20;
std::priority_queue<TokenPosition, std::vector<TokenPosition>, TokenPosition> heap;
std::priority_queue<TokenOffset, std::vector<TokenOffset>, TokenOffset> heap;
for(uint8_t token_id=0; token_id < token_positions.size(); token_id++) {
heap.push(TokenPosition{token_id, token_positions[token_id].front(), 0});
for(uint8_t token_id=0; token_id < token_offsets.size(); token_id++) {
heap.push(TokenOffset{token_id, token_offsets[token_id].front(), 0});
}
// heap now contains the first occurring position of each token in the given document
// heap now contains the first occurring offset of each token in the given document
uint16_t max_match = 1;
uint16_t min_displacement = UINT16_MAX;
uint16_t min_displacement = MAX_DISPLACEMENT;
std::queue<TokenPosition> q;
uint16_t token_pos[MAX_TOKENS_IN_A_QUERY] = { };
std::fill_n(token_pos, MAX_TOKENS_IN_A_QUERY, MAX_UINT_16);
std::queue<TokenOffset> window;
uint16_t token_offset[MAX_TOKENS_IN_A_QUERY] = { };
std::fill_n(token_offset, MAX_TOKENS_IN_A_QUERY, MAX_DISPLACEMENT);
do {
if(q.empty()) {
addTopOfHeapToWindow(heap, q, token_positions, token_pos);
if(window.empty()) {
addTopOfHeapToWindow(heap, window, token_offsets, token_offset);
}
D(cout << "Loop till window fills..." << endl;)
D(std::cout << "Loop till window fills... doc_id: " << doc_id << std::endl;)
// Fill the queue with tokens within a given window frame size of the start position
// Fill the queue with tokens within a given window frame size of the start offset
// At the same time, we also record the *last* occurrence of each token within the window
// For e.g. if `cat` appeared at positions 1,3 and 5, we will record `token_pos[cat] = 5`
const uint16_t start_pos = q.front().position;
while(!heap.empty() && heap.top().position < start_pos+WINDOW_SIZE) {
addTopOfHeapToWindow(heap, q, token_positions, token_pos);
// For e.g. if `cat` appeared at offsets 1,3 and 5, we will record `token_offset[cat] = 5`
const uint16_t start_offset = window.front().offset;
while(!heap.empty() && heap.top().offset < start_offset+WINDOW_SIZE) {
addTopOfHeapToWindow(heap, window, token_offsets, token_offset);
}
D(cout << endl << "----" << endl);
D(std::cout << std::endl << "----" << std::endl);
uint16_t prev_pos = MAX_UINT_16;
uint16_t prev_pos = MAX_DISPLACEMENT;
uint16_t num_match = 0;
uint16_t displacement = 0;
for(size_t token_id=0; token_id<token_positions.size(); token_id++) {
// If a token appeared within the window, we would have recorded its position
if(token_pos[token_id] != MAX_UINT_16) {
for(size_t token_id=0; token_id<token_offsets.size(); token_id++) {
// If a token appeared within the window, we would have recorded its offset
if(token_offset[token_id] != MAX_DISPLACEMENT) {
num_match++;
if(prev_pos == MAX_UINT_16) prev_pos = token_pos[token_id];
if(prev_pos == MAX_DISPLACEMENT) prev_pos = token_offset[token_id];
else {
// Calculate the distance between the tokens within the window
// Ideally, this should be (NUM_TOKENS - 1) when all the tokens are adjacent to each other
D(cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_pos[token_id] << endl);
displacement += abs(token_pos[token_id]-prev_pos);
prev_pos = token_pos[token_id];
D(std::cout << "prev_pos: " << prev_pos << " , curr_pos: " << token_offset[token_id] << std::endl);
displacement += abs(token_offset[token_id]-prev_pos);
prev_pos = token_offset[token_id];
}
}
}
D(cout << endl << "!!!displacement: " << displacement << " | num_match: " << num_match << endl);
D(std::cout << std::endl << "!!!displacement: " << displacement << " | num_match: " << num_match << std::endl);
// Track the best `displacement` and `num_match` seen so far across all the windows
if(num_match >= max_match) {
@@ -112,8 +126,8 @@ struct MatchScore {
}
// As we slide the window, drop the first token of the window from the computation
token_pos[q.front().token_id] = 0;
q.pop();
token_offset[window.front().token_id] = 0;
window.pop();
} while(!heap.empty());
return MatchScore{max_match, min_displacement};
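
The comment block above compresses the whole scoring idea into two sentences, so a condensed, self-contained illustration may help. The sketch below is not the code from this commit: it reimplements the same technique (a min-heap that merges each token's sorted offsets, a 20-position window, and the best words_present / smallest displacement kept across windows) in a simplified form that evaluates successive windows instead of sliding one token at a time; all names are made up for the example.

#include <cstdint>
#include <cstdlib>
#include <limits>
#include <queue>
#include <utility>
#include <vector>

struct Hit { size_t token_id; uint16_t offset; size_t next; };
struct ByOffset { bool operator()(const Hit &a, const Hit &b) const { return a.offset > b.offset; } };

// token_offsets[t] holds the sorted offsets of query token t in one document
// (every token is assumed to have at least one offset, as in the real code).
static std::pair<uint16_t, uint16_t> sketch_match_score(const std::vector<std::vector<uint16_t>> &token_offsets) {
    const uint16_t WINDOW = 20, NOT_SEEN = std::numeric_limits<uint16_t>::max();
    std::priority_queue<Hit, std::vector<Hit>, ByOffset> heap;   // min-heap ordered by offset
    for (size_t t = 0; t < token_offsets.size(); t++)
        heap.push({t, token_offsets[t].front(), 1});             // seed with each token's first offset

    uint16_t best_match = 1, best_disp = WINDOW;
    while (!heap.empty()) {
        std::vector<uint16_t> last(token_offsets.size(), NOT_SEEN);
        const uint16_t start = heap.top().offset;
        // Pull every offset that falls inside [start, start + WINDOW),
        // remembering each token's last occurrence within the window.
        while (!heap.empty() && heap.top().offset < start + WINDOW) {
            Hit h = heap.top(); heap.pop();
            last[h.token_id] = h.offset;
            if (h.next < token_offsets[h.token_id].size())       // refill with the token's next offset
                heap.push({h.token_id, token_offsets[h.token_id][h.next], h.next + 1});
        }
        uint16_t matched = 0, disp = 0, prev = NOT_SEEN;
        for (uint16_t off : last) {
            if (off == NOT_SEEN) continue;
            matched++;
            if (prev != NOT_SEEN) disp += (uint16_t) std::abs(off - prev);
            prev = off;
        }
        if (matched > best_match || (matched == best_match && disp < best_disp)) {
            best_match = matched;
            best_disp = disp;
        }
    }
    return {best_match, best_disp};                              // roughly {words_present, distance}
}

For example, sketch_match_score({{1, 8}, {2}}) returns {2, 6}: both tokens land in one window and their recorded (last) occurrences, 8 and 2, are 6 apart.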

View File

@@ -7,6 +7,7 @@
#include <match_score.h>
#include <string_utils.h>
#include "sole.hpp"
#include "art.h"
#include "json.hpp"
Collection::Collection(std::string state_dir_path): seq_id(0) {
@@ -85,80 +86,90 @@ void Collection::add(std::string json_str) {
4. Intersect the lists to find docs that match each phrase
5. Sort the docs based on some ranking criteria
*/
std::vector<nlohmann::json> Collection::search(std::string query, const int num_typos, const size_t max_results) {
std::vector<nlohmann::json> Collection::search(std::string query, const int num_typos, const size_t num_results) {
std::vector<std::string> tokens;
StringUtils::tokenize(query, tokens, " ", true);
const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
const size_t max_results = std::min(num_results, (size_t) 100);
std::cout << "Searching with max_cost=" << max_cost << std::endl;
std::vector<std::vector<art_leaf*>> token_leaves;
for(std::string token: tokens) {
std::vector<art_leaf*> leaves;
art_fuzzy_results(&t, (const unsigned char *) token.c_str(), (int) token.length() + 1, max_cost, 10, leaves);
if(!leaves.empty()) {
for(auto i=0; i<leaves.size(); i++) {
//printf("%s - ", token.c_str());
//printf("%.*s", leaves[i]->key_len, leaves[i]->key);
//printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->values->ids.getLength());
}
token_leaves.push_back(leaves);
}
}
if(token_leaves.size() == 0) {
return std::vector<nlohmann::json>();
}
//std::cout << "token_leaves.size = " << token_leaves.size() << std::endl;
Topster<100> topster;
int cost = 0;
size_t total_results = 0;
const size_t combination_limit = 10;
auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
for(long long n=0; n<N && n<combination_limit; ++n) {
// every element in `query_suggestion` represents a token and its associated hits
std::vector<art_leaf *> query_suggestion = _next_suggestion(token_leaves, n);
// initialize results with the starting element (for further intersection)
uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
size_t result_size = query_suggestion[0]->values->ids.getLength();
if(result_size == 0) continue;
// intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
for(auto i=1; i < query_suggestion.size(); i++) {
uint32_t* out = new uint32_t[result_size];
uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
delete result_ids;
delete curr;
result_ids = out;
}
// go through each matching document id and calculate match score
score_results(topster, query_suggestion, result_ids, result_size);
total_results += result_size;
delete result_ids;
if(total_results >= max_results) break;
}
topster.sort();
std::vector<nlohmann::json> results;
for(uint32_t i=0; i<topster.size; i++) {
uint32_t id = topster.getKeyAt(i);
std::cout << "ID: " << id << std::endl;
while(cost <= max_cost) {
std::cout << "Searching with cost=" << cost << std::endl;
const std::string value = store->get(std::to_string(id));
nlohmann::json document = nlohmann::json::parse(value);
results.push_back(document);
std::vector<std::vector<art_leaf*>> token_leaves;
for(std::string token: tokens) {
std::vector<art_leaf*> leaves;
art_fuzzy_results(&t, (const unsigned char *) token.c_str(), (int) token.length() + 1, cost, 3, leaves);
if(!leaves.empty()) {
for(auto i=0; i<leaves.size(); i++) {
//printf("%s - ", token.c_str());
//printf("%.*s", leaves[i]->key_len, leaves[i]->key);
//printf(" - max_cost: %d, - score: %d\n", max_cost, leaves[i]->values->ids.getLength());
}
token_leaves.push_back(leaves);
}
}
if(token_leaves.size() != tokens.size()) {
//std::cout << "token_leaves.size() != tokens.size(), continuing..." << std::endl << std::endl;
cost++;
continue;
}
Topster<100> topster;
const size_t combination_limit = 10;
auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
for(long long n=0; n<N && n<combination_limit; ++n) {
// every element in `query_suggestion` represents a token and its associated hits
std::vector<art_leaf *> query_suggestion = _next_suggestion(token_leaves, n);
// initialize results with the starting element (for further intersection)
uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
size_t result_size = query_suggestion[0]->values->ids.getLength();
if(result_size == 0) continue;
// intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
for(auto i=1; i < query_suggestion.size(); i++) {
uint32_t* out = new uint32_t[result_size];
uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
delete result_ids;
delete curr;
result_ids = out;
}
// go through each matching document id and calculate match score
score_results(topster, query_suggestion, result_ids, result_size);
total_results += result_size;
delete result_ids;
if(total_results >= max_results) break;
}
topster.sort();
for(uint32_t i=0; i<topster.size; i++) {
uint32_t id = topster.getKeyAt(i);
std::cout << "ID: " << id << std::endl;
const std::string value = store->get(std::to_string(id));
nlohmann::json document = nlohmann::json::parse(value);
results.push_back(document);
}
if(total_results > 0) {
break;
}
cost++;
}
return results;
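
Two details of the loop above are easy to miss. First, `N` is the number of possible query suggestions: the product of each token's candidate-leaf counts, computed with `std::accumulate` (e.g. two tokens with 3 and 4 fuzzy matches give N = 12, of which at most `combination_limit` = 10 are tried). Second, the document lists of a suggestion's tokens are narrowed one by one with `Intersection::scalar`, whose body is not part of this diff; the following is a minimal sketch of what such a scalar (two-pointer) intersection of sorted id lists typically looks like, with an assumed signature shaped after the call site.

#include <cstddef>
#include <cstdint>

// Intersects two sorted uint32_t id lists, writes the common ids into `out`
// (which must hold at least min(a_len, b_len) entries) and returns the count.
static size_t scalar_intersect(const uint32_t *a, size_t a_len,
                               const uint32_t *b, size_t b_len, uint32_t *out) {
    size_t i = 0, j = 0, k = 0;
    while (i < a_len && j < b_len) {
        if (a[i] < b[j]) i++;
        else if (b[j] < a[i]) j++;
        else { out[k++] = a[i]; i++; j++; }   // id present in both lists
    }
    return k;
}
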
@@ -176,10 +187,10 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
uint32_t doc_index = token_leaf->values->ids.indexOf(doc_id);
uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
(token_leaf->values->offsets.getLength() - 1) :
token_leaf->values->offsets.getLength() :
token_leaf->values->offset_index.at(doc_index+1);
while(start_offset <= end_offset) {
while(start_offset < end_offset) {
positions.push_back((uint16_t) token_leaf->values->offsets.at(start_offset));
start_offset++;
}
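
The change above makes `end_offset` uniformly exclusive. Previously, for every document except the last one, the bound was the next document's starting index combined with `<=`, so the loop also picked up the first offset of the following document. A small runnable illustration with made-up data:

#include <cstdint>
#include <iostream>

int main() {
    // Hypothetical flattened storage for one token that appears in two documents:
    // doc #0 at offsets 4 and 9, doc #1 at offset 2.
    uint16_t offsets[]      = {4, 9, 2};   // all offsets, concatenated document by document
    uint32_t offset_index[] = {0, 2};      // where each document's slice begins in `offsets`
    const uint32_t num_docs = 2, total_offsets = 3;

    for (uint32_t doc = 0; doc < num_docs; doc++) {
        uint32_t start = offset_index[doc];
        uint32_t end = (doc == num_docs - 1) ? total_offsets : offset_index[doc + 1];  // exclusive, as in the fix
        std::cout << "doc " << doc << ":";
        for (uint32_t i = start; i < end; i++) std::cout << " " << offsets[i];
        std::cout << std::endl;              // doc 0: 4 9   |   doc 1: 2
    }
    // With the old inclusive bound, doc 0 would also have printed doc 1's offset 2.
}
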
@@ -190,10 +201,12 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
MatchScore mscore = MatchScore::match_score(doc_id, token_positions);
const uint32_t cumulativeScore = ((uint32_t)(mscore.words_present * 16 + (20 - mscore.distance)) * 64000) + doc_scores.at(doc_id);
/*std::cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: "
<< (int)mscore.distance << " - mscore.words_present: " << (int)mscore.words_present
<< " - doc_scores[doc_id]: " << (int)doc_scores[doc_id] << " - cumulativeScore: "
<< cumulativeScore << std::endl;*/
/*
std::cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: "
<< (int) mscore.distance << " - mscore.words_present: " << (int) mscore.words_present
<< " - doc_scores[doc_id]: " << (int) doc_scores.at(doc_id) << " - cumulativeScore: "
<< cumulativeScore << std::endl;
*/
topster.add(doc_id, cumulativeScore);
}
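
The ranking formula above packs match quality into the high-order part of the score so that it always dominates a document's static score: `words_present` is weighted 16x, proximity contributes up to 20, and the sum is scaled by 64000 before `doc_scores` is added. A small worked example with made-up inputs:

#include <cstdint>
#include <iostream>

int main() {
    // Assumed inputs: both query tokens present, 1 position apart, static document score 15.
    uint16_t words_present = 2, distance = 1;
    uint32_t doc_score = 15;

    uint32_t cumulative = ((uint32_t)(words_present * 16 + (20 - distance)) * 64000) + doc_score;
    std::cout << cumulative << std::endl;   // (2*16 + 19) * 64000 + 15 = 3,264,015

    // A single-token match can never outrank it, even with a perfect distance of 0:
    // (1*16 + 20) * 64000 = 2,304,000.
}

Because adjacent match-quality buckets differ by at least 64000, a static `doc_scores` value below 64000 can only break ties between documents with the same words_present and distance, never reorder them across buckets.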

View File

@@ -25,7 +25,7 @@ public:
Collection(std::string state_dir_path);
~Collection();
void add(std::string json_str);
std::vector<nlohmann::json> search(std::string query, const int num_typos, const size_t max_results);
std::vector<nlohmann::json> search(std::string query, const int num_typos, const size_t num_results);
static inline std::vector<art_leaf *> _next_suggestion(const std::vector<std::vector<art_leaf *>> &token_leaves,
long long int n);

View File

@@ -26,7 +26,7 @@ int main() {
cout << "FINISHED INDEXING!" << endl << flush;
auto begin = std::chrono::high_resolution_clock::now();
collection->search("platn", 1, 100);
collection->search("platn growing", 1, 100);
long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
cout << "Time taken: " << timeMillis << "us" << endl;
delete collection;

View File

@@ -16,6 +16,7 @@
#include <regex>
#include "string_utils.h"
#include "collection.h"
#include <sys/resource.h>
#include "h2o.h"
#include "h2o/http1.h"
@@ -82,6 +83,10 @@ static int chunked_test(h2o_handler_t *self, h2o_req_t *req) {
std::string json_str = json_array.dump();
struct rusage r_usage;
getrusage(RUSAGE_SELF,&r_usage);
std::cout << "Memory usage: " << r_usage.ru_maxrss << std::endl;
std::cout << "JSON:" << json_str << std::endl;
h2o_iovec_t body = h2o_strdup(&req->pool, json_str.c_str(), SIZE_MAX);
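
The `getrusage` call added above logs `ru_maxrss` (peak resident set size) on every request. One caveat: the field's unit is platform-dependent, kilobytes on Linux but bytes on macOS, so a sketch that normalizes the value before logging might look like this (illustrative only, not part of the commit):

#include <iostream>
#include <sys/resource.h>

// Returns the peak resident set size in kilobytes, whatever the platform reports.
static long peak_rss_kb() {
    struct rusage r_usage;
    getrusage(RUSAGE_SELF, &r_usage);
#ifdef __APPLE__
    return r_usage.ru_maxrss / 1024;   // macOS reports bytes
#else
    return r_usage.ru_maxrss;          // Linux reports kilobytes
#endif
}

int main() {
    std::cout << "Memory usage (KB): " << peak_rss_kb() << std::endl;
}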

View File

@@ -1,4 +1,4 @@
{"points":15,"title":"How are cryogenic rocket plan propellants delivered to the launch pad?"}
{"points":15,"title":"How are cryogenic rocket plant propellants delivered to the growing launch pad?"}
{"points":14,"title":"Are there any (free) are online data archives for data from instruments on Soviet / Russian missions?"}
{"points":13,"title":"Where should I look in ISS to find mouldy food?"}
{"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"}