Mirror of https://github.com/typesense/typesense.git, synced 2025-05-21 14:12:27 +08:00
Minor refactoring. Adding more comments.

This commit is contained in:
parent c667ed5d10
commit 10ff747802

src/main.cpp | 51
@@ -100,9 +100,9 @@ void index_document(art_tree& t, uint32_t doc_id, vector<string> & tokens, uint1
  4. Intersect the lists to find docs that match each phrase
  5. Sort the docs based on some ranking criteria
 */
-void find_documents(art_tree & t, string q, size_t max_results) {
+void find_documents(art_tree & t, string query, size_t max_results) {
     vector<string> tokens;
-    tokenize(q, tokens, " ", true);
+    tokenize(query, tokens, " ", true);
 
     vector<vector<art_leaf*>> token_leaves;
     for(auto token: tokens) {
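
The tokenize(query, tokens, " ", true) call above splits the query into tokens before each token is looked up in the ART index. The tokenizer itself is not part of this diff; the sketch below is a minimal stand-in with the same call shape, where the meaning of the final flag (assumed here to lowercase each token) is a guess rather than something taken from the source.

    #include <cctype>
    #include <string>
    #include <vector>

    // Minimal sketch of a delimiter tokenizer with the same call shape as
    // tokenize(query, tokens, " ", true). The real implementation lives in
    // typesense; the lowercasing semantics of the final flag are assumed.
    void tokenize_sketch(const std::string & text, std::vector<std::string> & tokens,
                         const std::string & delim, bool to_lowercase) {
        size_t start = 0;
        while(start <= text.size()) {
            size_t end = text.find(delim, start);
            if(end == std::string::npos) end = text.size();
            std::string token = text.substr(start, end - start);
            if(to_lowercase) {
                for(char & c : token) c = (char) std::tolower((unsigned char) c);
            }
            if(!token.empty()) tokens.push_back(token);  // skip empty pieces
            start = end + delim.size();
        }
    }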
@@ -116,7 +116,7 @@ void find_documents(art_tree & t, string q, size_t max_results) {
         }
     }
 
-    //cout << "token_leaves.size(): " << token_leaves.size() << endl;
+    cout << "token_leaves.size(): " << token_leaves.size() << endl;
 
     std::vector<std::vector<uint16_t>> word_positions;
     Topster<100> topster;
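
Topster<100> collects the top-scoring candidates across combinations. Its internals are not part of this diff either; the sketch below shows the general fixed-capacity top-K idea with a min-heap, where the TopKSketch name and the (doc_id, score) layout are assumptions, not typesense's actual Topster.

    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <queue>
    #include <vector>

    // Sketch of a fixed-capacity top-K collector in the spirit of Topster<100>.
    // A min-heap keeps the K best (score, doc_id) pairs seen so far.
    template <size_t MAX>
    struct TopKSketch {
        // pair<score, doc_id>, ordered so the *lowest* score sits on top
        std::priority_queue<std::pair<uint64_t, uint32_t>,
                            std::vector<std::pair<uint64_t, uint32_t>>,
                            std::greater<>> heap;

        void add(uint32_t doc_id, uint64_t score) {
            if(heap.size() < MAX) {
                heap.push({score, doc_id});
            } else if(score > heap.top().first) {
                heap.pop();                  // evict the current worst
                heap.push({score, doc_id});
            }
        }
    };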
@@ -124,48 +124,51 @@ void find_documents(art_tree & t, string q, size_t max_results) {
     const size_t combination_limit = 10;
     auto product = []( long long a, vector<art_leaf*>& b ) { return a*b.size(); };
     long long int N = accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
-    vector<art_leaf*> u(token_leaves.size());
+    vector<art_leaf*> token_to_hits(token_leaves.size());
 
     for(long long n=0; n<N && n<combination_limit; ++n) {
-        lldiv_t q { n, 0 };
-        for(long long i=token_leaves.size()-1; 0<=i; --i) {
+        ldiv_t q { n, 0 };
+        for(unsigned long i= token_leaves.size() - 1; 0 <= i; --i) {
             q = div(q.quot, token_leaves[i].size());
-            u[i] = token_leaves[i][q.rem];
+            token_to_hits[i] = token_leaves[i][q.rem];
         }
 
-        for(art_leaf* x : u) {
-            cout << x->key << ' ';
+        for(art_leaf* x : token_to_hits) {
+            cout << x->key << ', ';
         }
 
-        // every element in vector `u` represents a token and its associated hits
-        sort(u.begin(), u.end(), [](const art_leaf* left, const art_leaf* right) {
+        // sort ascending based on matched document size to perform effective intersection
+        sort(token_to_hits.begin(), token_to_hits.end(), [](const art_leaf* left, const art_leaf* right) {
             return left->values->ids.getLength() < right->values->ids.getLength();
         });
 
-        uint32_t* result = u[0]->values->ids.uncompress();
-        size_t result_size = u[0]->values->ids.getLength();
+        uint32_t* result_ids = token_to_hits[0]->values->ids.uncompress();
+        size_t result_size = token_to_hits[0]->values->ids.getLength();
 
         if(result_size == 0) continue;
 
-        for(auto i=1; i<u.size(); i++) {
+        // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
+        for(auto i=1; i < token_to_hits.size(); i++) {
             uint32_t* out = new uint32_t[result_size];
-            uint32_t* curr = u[i]->values->ids.uncompress();
-            result_size = Intersection::scalar(result, result_size, curr, u[i]->values->ids.getLength(), out);
-            delete result;
+            uint32_t* curr = token_to_hits[i]->values->ids.uncompress();
+            result_size = Intersection::scalar(result_ids, result_size, curr, token_to_hits[i]->values->ids.getLength(), out);
+            delete result_ids;
             delete curr;
-            result = out;
+            result_ids = out;
         }
 
-        // go through each document and calculate match score
+        // go through each document id and calculate match score
         for(auto i=0; i<result_size; i++) {
-            for (art_leaf *token_leaf : u) {
+            // we look up the doc_id in the token's doc index
+            // and then arrive at the positions where the token occurs in every document
+            for (art_leaf *token_leaf : token_to_hits) {
                 vector<uint16_t> positions;
-                uint32_t doc_index = token_leaf->values->ids.indexOf(result[i]);
+                // by using the document id, locate the positions where the token occurs
+                uint32_t doc_index = token_leaf->values->ids.indexOf(result_ids[i]);
                 uint32_t offset_index = token_leaf->values->offset_index.at(doc_index);
                 uint32_t num_offsets = token_leaf->values->offsets.at(offset_index);
                 for (auto offset_count = 0; offset_count < num_offsets; offset_count++) {
-                    positions.push_back(token_leaf->values->offsets.at(offset_index + offset_count));
+                    positions.push_back((uint16_t) token_leaf->values->offsets.at(offset_index + offset_count));
                 }
                 word_positions.push_back(positions);
             }
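
The div loop at the top of this hunk decodes the combination index n into one candidate leaf per token: n is divided by each token's candidate count in turn, and the remainder picks that token's candidate (mixed-radix decoding of the cartesian product). A standalone sketch, with strings standing in for art_leaf pointers, is shown below; note that it keeps a signed loop index, since with an unsigned i the condition 0 <= i is always true and the index would wrap around below zero.

    #include <cstdlib>
    #include <iostream>
    #include <string>
    #include <vector>

    // Decode combination index n into one candidate per token by repeated
    // division: the remainder at each step selects that token's candidate.
    std::vector<std::string> nth_combination(const std::vector<std::vector<std::string>> & candidates,
                                             long long n) {
        std::vector<std::string> picked(candidates.size());
        lldiv_t q { n, 0 };
        for(long long i = (long long) candidates.size() - 1; i >= 0; --i) {
            q = lldiv(q.quot, (long long) candidates[i].size());
            picked[i] = candidates[i][q.rem];
        }
        return picked;
    }

    int main() {
        // two tokens, each with two fuzzy candidates: 2 * 2 = 4 combinations
        std::vector<std::vector<std::string>> candidates = {{"jam", "ham"}, {"box", "fox"}};
        for(long long n = 0; n < 4; n++) {
            for(const auto & token : nth_combination(candidates, n)) std::cout << token << ' ';
            std::cout << '\n';
        }
    }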
@@ -175,8 +178,8 @@ void find_documents(art_tree & t, string q, size_t max_results) {
         }
 
         total_results += result_size;
-        cout << "RESULT SIZE: " << result_size << endl;
-        delete result;
+        cout << endl << "RESULT SIZE: " << result_size << endl;
+        delete result_ids;
 
         if(total_results >= max_results) break;
     }
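
Intersection::scalar, used above to AND the tokens' document id lists together, is also outside this diff. Its call shape suggests the classic two-pointer intersection of two sorted id lists; the sketch below mirrors that signature, with the body being an illustration rather than the actual typesense routine.

    #include <cstddef>
    #include <cstdint>

    // Sketch of a two-pointer intersection of two sorted, de-duplicated id
    // lists, matching the shape of the Intersection::scalar call above.
    // Writes the common ids to `out` and returns how many were written.
    size_t intersect_scalar(const uint32_t *a, size_t a_len,
                            const uint32_t *b, size_t b_len, uint32_t *out) {
        size_t i = 0, j = 0, k = 0;
        while(i < a_len && j < b_len) {
            if(a[i] < b[j])      i++;
            else if(b[j] < a[i]) j++;
            else {                       // common id: keep it, advance both
                out[k++] = a[i];
                i++; j++;
            }
        }
        return k;
    }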