diff --git a/src/main.cpp b/src/main.cpp
index 04e9d8a9..ee73be77 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -65,17 +65,13 @@ void benchmark_heap_array() {
     cout << "Time taken: " << timeMillis << endl;
 }
 
-void index_document(art_tree& t, uint32_t doc_id, vector<string> & tokens, uint16_t score) {
+void index_document(art_tree& t, uint32_t doc_id, vector<string> tokens, uint16_t score) {
     unordered_map<string, vector<uint32_t>> token_to_offsets;
 
     for(uint32_t i=0; i<tokens.size(); i++) {
         auto token = tokens[i];
-        if(token_to_offsets.count(token) > 0) {
-            token_to_offsets[token].push_back(i);
-        } else {
-            std::transform(token.begin(), token.end(), token.begin(), ::tolower);
-            token_to_offsets[token] = vector<uint32_t>{i};
-        }
+        std::transform(token.begin(), token.end(), token.begin(), ::tolower);
+        token_to_offsets[token].push_back(i);
     }
 
     for(auto & kv: token_to_offsets) {
@@ -97,6 +93,7 @@ void index_document(art_tree& t, uint32_t doc_id, vector<string> & tokens, uint1
    2. For each token, look up ids using exact lookup
       a. If a token has no result, try again with edit distance of 1, and then 2
    3. Do a limited cartesian product of the word suggestions for each token to form possible corrected search phrases
+      (adapted from: http://stackoverflow.com/a/31169617/131050)
    4. Intersect the lists to find docs that match each phrase
    5. Sort the docs based on some ranking criteria
 */
@@ -116,75 +113,81 @@ void find_documents(art_tree & t, string query, size_t max_results) {
         }
     }
 
-    cout << "token_leaves.size(): " << token_leaves.size() << endl;
-
     std::vector<std::vector<uint16_t>> word_positions;
     Topster<100> topster;
     size_t total_results = 0;
 
     const size_t combination_limit = 10;
     auto product = []( long long a, vector<art_leaf*>& b ) { return a*b.size(); };
     long long int N = accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product );
-    vector<art_leaf*> token_to_hits(token_leaves.size());
 
     for(long long n=0; n<N && n<combination_limit; ++n) {
+        vector<art_leaf*> query_suggestion(token_leaves.size());
+
+        // generate the next combination from `token_leaves` and store it in `query_suggestion`
         ldiv_t q { n, 0 };
-        for(unsigned long i= token_leaves.size() - 1; 0 <= i; --i) {
+        for( long long i=token_leaves.size()-1 ; 0<=i ; --i ) {
             q = div(q.quot, token_leaves[i].size());
-            token_to_hits[i] = token_leaves[i][q.rem];
+            query_suggestion[i] = token_leaves[i][q.rem];
         }
 
-        for(art_leaf* x : token_to_hits) {
-            cout << x->key << ', ';
-        }
-
-        // every element in vector `u` represents a token and its associated hits
-        // sort ascending based on matched document size to perform effective intersection
-        sort(token_to_hits.begin(), token_to_hits.end(), [](const art_leaf* left, const art_leaf* right) {
+        // sort ascending based on matched documents for each token to perform effective intersection
+        sort(query_suggestion.begin(), query_suggestion.end(), [](const art_leaf* left, const art_leaf* right) {
             return left->values->ids.getLength() < right->values->ids.getLength();
         });
 
-        uint32_t*result_ids = token_to_hits[0]->values->ids.uncompress();
-        size_t result_size = token_to_hits[0]->values->ids.getLength();
+        // initialize results with the starting element (for further intersection)
+        uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
+        size_t result_size = query_suggestion[0]->values->ids.getLength();
 
         if(result_size == 0) continue;
 
         // intersect the document ids for each token to find docs that contain all the tokens (stored in `result_ids`)
-        for(auto i=1; i < token_to_hits.size(); i++) {
+        for(auto i=1; i < query_suggestion.size(); i++) {
             uint32_t* out = new uint32_t[result_size];
-            uint32_t* curr = token_to_hits[i]->values->ids.uncompress();
-            result_size = Intersection::scalar(result_ids, result_size, curr, token_to_hits[i]->values->ids.getLength(), out);
+            uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
+            result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
             delete result_ids;
             delete curr;
             result_ids = out;
         }
 
-        // go through each document id and calculate match score
+        // go through each matching document id and calculate match score
         for(auto i=0; i<result_size; i++) {
-            for (art_leaf *token_leaf : token_to_hits) {
+            for (art_leaf *token_leaf : query_suggestion) {
                 vector<uint16_t> positions;
                 uint32_t doc_index = token_leaf->values->ids.indexOf(result_ids[i]);
                 uint32_t offset_index = token_leaf->values->offset_index.at(doc_index);
                 uint32_t num_offsets = token_leaf->values->offsets.at(offset_index);
-                for (auto offset_count = 0; offset_count < num_offsets; offset_count++) {
+                for (auto offset_count = 1; offset_count <= num_offsets; offset_count++) {
                     positions.push_back((uint16_t) token_leaf->values->offsets.at(offset_index + offset_count));
                 }
                 word_positions.push_back(positions);
             }
 
             MatchScore score = match_score(word_positions);
-            topster.add(result_ids[i], (const uint32_t &) (score.words_present * 16 + score.distance));
+            const uint16_t cumulativeScore = (const uint16_t) (score.words_present * 16 + score.distance);
+
+            //cout << "result_ids[i]: " << result_ids[i] << " - cumulativeScore: " << cumulativeScore << endl;
+            topster.add(result_ids[i], cumulativeScore);
         }
 
         total_results += result_size;
-        cout << endl << "RESULT SIZE: " << result_size << endl;
 
         delete result_ids;
 
         if(total_results >= max_results) break;
     }
 
-    //std::sort(topster.data);
+    topster.sort();
+
+    cout << "RESULTS: " << endl << endl;
+
+    for(uint32_t i=0; i<topster.size; i++) {
+        cout << topster.getKeyAt(i) << endl;
+    }
@@ ... @@ int main() {
         vector<string> parts;
         tokenize(line, parts, "\t", true);
 
         vector<string> tokens;
         tokenize(parts[0], tokens, " ", true);
 
-        index_document(t, num, tokens, stoi(parts[1]));
-        num++;
+        index_document(t, doc_id, tokens, stoi(parts[1]));
+        doc_id++;
     }
 
-    const unsigned char *prefix = (const unsigned char *) "the";
+    /*const unsigned char *prefix = (const unsigned char *) "the";
     size_t prefix_len = strlen((const char *) prefix);
 
     std::vector<art_leaf *> results;
@@ -213,8 +216,8 @@ int main() {
     art_iter_fuzzy_prefix(&t, prefix, prefix_len, 0, 2, results);
     long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
 
-//    art_iter_prefix(&t, prefix, strlen((const char *) prefix), test_prefix_cb, NULL);
-//    art_iter(&t, test_prefix_cb, NULL);
+    art_iter_prefix(&t, prefix, strlen((const char *) prefix), test_prefix_cb, NULL);
+    art_iter(&t, test_prefix_cb, NULL);
 
     cout << "Time taken: " << timeMillis << "us" << endl;
 
@@ -223,8 +226,8 @@ int main() {
         for(uint32_t i=0; i<leaf->values->ids.getLength(); i++) {
             std::cout << ", ID: " << leaf->values->ids.at(i) << std::endl;
         }
-        //std::cout << ", Value: " << leaf->values->ids.at(0) << std::endl;
-    }
+        std::cout << ", Value: " << leaf->values->ids.at(0) << std::endl;
+    }*/
 
     find_documents(t, "are the", 10);
 
diff --git a/test/documents.txt b/test/documents.txt
index 5f79f481..de85437f 100644
--- a/test/documents.txt
+++ b/test/documents.txt
@@ -1,5 +1,5 @@
 How are cryogenic rocket propellants delivered to the launch pad? 15
-Are there any (free) online data archives for data from instruments on Soviet / Russian missions? 14
+Are there any (free) are online data archives for data from instruments on Soviet / Russian missions? 14
 Where should I look in ISS to find mouldy food? 13
 Is solar system active cryovolcanism a potential viable power source for future colonies? 13
 The heaviest martian spacecraft 13
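Note on step 3 of the algorithm comment in the patch: a single counter `n` is decoded into one candidate per token list by repeated division, the mixed-radix trick from the linked StackOverflow answer. The self-contained sketch below illustrates the same technique with plain strings standing in for the `art_leaf*` hits; the candidate lists are made up for illustration, and it uses `std::lldiv` with explicit member assignment instead of relying on the member order of `ldiv_t`.

// sketch_cartesian.cpp -- standalone illustration, not part of the patch
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

int main() {
    // Candidate words per query token (e.g. what the fuzzy ART lookups might return).
    std::vector<std::vector<std::string>> token_leaves = {
        {"are", "art"},          // candidates for token 1
        {"the", "they", "then"}  // candidates for token 2
    };

    // Total number of combinations = product of the candidate list sizes.
    auto product = [](long long a, std::vector<std::string>& b) { return a * (long long) b.size(); };
    long long N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product);

    const long long combination_limit = 10;  // cap the number of suggestions evaluated

    for (long long n = 0; n < N && n < combination_limit; ++n) {
        // Decode `n` as a mixed-radix number: each "digit" picks one candidate per token.
        std::vector<std::string> query_suggestion(token_leaves.size());
        std::lldiv_t q;
        q.quot = n;
        q.rem = 0;
        for (long long i = (long long) token_leaves.size() - 1; 0 <= i; --i) {
            q = std::lldiv(q.quot, (long long) token_leaves[i].size());
            query_suggestion[i] = token_leaves[i][q.rem];
        }

        for (const auto& word : query_suggestion) {
            std::cout << word << " ";
        }
        std::cout << "\n";  // prints "are the", "are they", ..., "art then"
    }
    return 0;
}

Capping the loop with a combination limit, as the patch does, bounds the work when every token has many fuzzy matches.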
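Note on step 4: the patch intersects the uncompressed document-id lists of the chosen tokens pairwise via `Intersection::scalar`, after sorting the tokens by ascending id count so the running result stays small. `Intersection::scalar` is project code; the sketch below is only a generic two-pointer intersection over sorted id arrays (with made-up ids) to show what such a scalar intersection does.

// sketch_intersection.cpp -- generic stand-in, not the project's implementation
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Returns the ids present in both sorted inputs.
std::vector<uint32_t> intersect_sorted(const std::vector<uint32_t>& a, const std::vector<uint32_t>& b) {
    std::vector<uint32_t> out;
    size_t i = 0, j = 0;
    while (i < a.size() && j < b.size()) {
        if (a[i] < b[j]) {
            i++;
        } else if (b[j] < a[i]) {
            j++;
        } else {
            out.push_back(a[i]);  // id appears in both lists
            i++;
            j++;
        }
    }
    return out;
}

int main() {
    // Posting lists for two tokens; starting from the smaller list keeps the running
    // intermediate result small, which is why the patch sorts tokens by id count.
    std::vector<uint32_t> docs_token_a = {2, 5, 9, 14};
    std::vector<uint32_t> docs_token_b = {1, 2, 3, 5, 8, 13, 14, 21};
    for (uint32_t id : intersect_sorted(docs_token_a, docs_token_b)) {
        std::printf("%u\n", id);  // prints 2, 5, 14
    }
    return 0;
}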