From d831c49817710da1cfa18eaffb6ca307accbabab Mon Sep 17 00:00:00 2001 From: Kishore Nallan Date: Sun, 8 Jan 2017 21:44:36 +0530 Subject: [PATCH] Move duplicate ID detection right inside topster. --- TODO.md | 3 +- include/collection.h | 8 ++---- include/topster.h | 9 ++++++ src/collection.cpp | 60 ++++++++++++++-------------------------- test/collection_test.cpp | 4 +++ 5 files changed, 38 insertions(+), 46 deletions(-) diff --git a/TODO.md b/TODO.md index a40c2df3..9aca740b 100644 --- a/TODO.md +++ b/TODO.md @@ -32,10 +32,11 @@ - Assumption that all tokens match for scoring is no longer true - Primary_rank_scores and secondary_rank_scores hashmaps should be combined - Proper logging -- Have set inside topster itself +- ~~Have set inside topster itself~~ - Restore records as well on restart (like for meta) - Persist next_seq_id - collection_id should be int, not string +- clean special chars before indexing **API** diff --git a/include/collection.h b/include/collection.h index df72d090..bb5f5816 100644 --- a/include/collection.h +++ b/include/collection.h @@ -43,12 +43,10 @@ private: long long int n); void log_leaves(const int cost, const std::string &token, const std::vector &leaves) const; - std::vector::KV> search(std::string & query, const std::string & field, const int num_typos, const size_t num_results, - std::vector::KV> & result_kvs, spp::sparse_hash_set & result_set, - const token_ordering token_order = FREQUENCY, const bool prefix = false); + void search(std::string & query, const std::string & field, const int num_typos, const size_t num_results, + Topster<100> & topster, const token_ordering token_order = FREQUENCY, const bool prefix = false); - void search_candidates(int & token_rank, std::vector> & token_leaves, - std::vector::KV> & result_kvs, spp::sparse_hash_set & dedup_seq_ids, + void search_candidates(int & token_rank, std::vector> & token_leaves, Topster<100> & topster, size_t & total_results, const size_t & max_results); void index_string_field(const std::string &field_name, art_tree *t, const nlohmann::json &document, uint32_t seq_id) const; diff --git a/include/topster.h b/include/topster.h index 34b37a87..b541667a 100644 --- a/include/topster.h +++ b/include/topster.h @@ -4,6 +4,7 @@ #include #include #include +#include /* * Remembers the max-K elements seen so far using a min-heap @@ -19,6 +20,8 @@ struct Topster { uint32_t size; + spp::sparse_hash_set dedup_keys; + Topster(): size(0){ } @@ -30,6 +33,12 @@ struct Topster { } void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr, const int64_t &secondary_attr){ + if(dedup_keys.count(key) != 0) { + return ; + } + + dedup_keys.insert(key); + if (size >= MAX_SIZE) { if(!is_greater(data[0], match_score, primary_attr, secondary_attr)) { // when incoming value is less than the smallest in the heap, ignore diff --git a/src/collection.cpp b/src/collection.cpp index a3d3f406..7c819db5 100644 --- a/src/collection.cpp +++ b/src/collection.cpp @@ -131,8 +131,7 @@ void Collection::index_string_field(const std::string &field_name, art_tree *t, } void Collection::search_candidates(int & token_rank, std::vector> & token_leaves, - std::vector::KV> & result_kvs, spp::sparse_hash_set & dedup_seq_ids, - size_t & total_results, const size_t & max_results) { + Topster<100> & topster, size_t & total_results, const size_t & max_results) { const size_t combination_limit = 10; auto product = []( long long a, std::vector& b ) { return a*b.size(); }; long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product); @@ -165,19 +164,10 @@ void Collection::search_candidates(int & token_rank, std::vector topster; score_results(topster, token_rank, query_suggestion, result_ids, result_size); delete[] result_ids; - topster.sort(); - for (uint32_t i = 0; i < topster.size && total_results < max_results; i++) { - uint64_t seq_id = topster.getKeyAt(i); - if(dedup_seq_ids.count(seq_id) == 0) { - result_kvs.push_back(topster.getKV(i)); - dedup_seq_ids.emplace(seq_id); - total_results++; - } - } + total_results += topster.size; if(total_results >= max_results) { break; @@ -188,8 +178,7 @@ void Collection::search_candidates(int & token_rank, std::vector Collection::search(std::string query, const std::vector fields, const int num_typos, const size_t num_results, const token_ordering token_order, const bool prefix) { - int size = index_map.size(); - std::cout << "search size: " << size << std::endl; + Topster<100> topster; // Order of `fields` are used to rank results auto begin = std::chrono::high_resolution_clock::now(); @@ -198,15 +187,11 @@ std::vector Collection::search(std::string query, const std::vec for(int i = 0; i < fields.size(); i++) { const std::string & field = fields[i]; - // Container for holding the results - std::vector::KV> result_kvs; + search(query, field, num_typos, num_results, topster, token_order, prefix); + topster.sort(); - // To prevent duplicate results, while preserving order of result vector - spp::sparse_hash_set result_set; - - search(query, field, num_typos, num_results, result_kvs, result_set, token_order, prefix); - for(auto result_kv: result_kvs) { - field_order_kvs.push_back(std::make_pair(fields.size() - i, result_kv)); + for(auto t = 0; t < topster.size && t < num_results; t++) { + field_order_kvs.push_back(std::make_pair(fields.size() - i, topster.getKV(t))); } } @@ -243,18 +228,15 @@ std::vector Collection::search(std::string query, const std::vec 4. Intersect the lists to find docs that match each phrase 5. Sort the docs based on some ranking criteria */ -std::vector::KV> Collection::search(std::string & query, const std::string & field, - const int num_typos, const size_t num_results, - std::vector::KV> & result_kvs, - spp::sparse_hash_set & result_set, - const token_ordering token_order, const bool prefix) { +void Collection::search(std::string & query, const std::string & field, const int num_typos, const size_t num_results, + Topster<100> & topster, const token_ordering token_order, const bool prefix) { std::vector tokens; StringUtils::tokenize(query, tokens, " ", true); const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos; const size_t max_results = std::min(num_results, (size_t) Collection::MAX_RESULTS); - size_t total_results = result_kvs.size(); + size_t total_results = topster.size; // To prevent us from doing ART search repeatedly as we iterate through possible corrections spp::sparse_hash_map> token_cost_cache; @@ -345,7 +327,7 @@ std::vector::KV> Collection::search(std::string & query, const std: if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) { // If a) all tokens were found, or b) Some were skipped because they don't exist within max_cost, // go ahead and search for candidates with what we have so far - search_candidates(token_rank, token_leaves, result_kvs, result_set, total_results, max_results); + search_candidates(token_rank, token_leaves, topster, total_results, max_results); if (total_results >= max_results) { // If we don't find enough results, we continue outerloop (looking at tokens with greater cost) @@ -357,7 +339,7 @@ std::vector::KV> Collection::search(std::string & query, const std: } // When there are not enough overall results and atleast one token has results - if(result_kvs.size() < max_results && token_to_count.size() > 1) { + if(topster.size < max_results && token_to_count.size() > 1) { // Drop certain token with least hits and try searching again std::string truncated_query; @@ -378,10 +360,8 @@ std::vector::KV> Collection::search(std::string & query, const std: } } - return search(truncated_query, field, num_typos, num_results, result_kvs, result_set, token_order, prefix); + return search(truncated_query, field, num_typos, num_results, topster, token_order, prefix); } - - return result_kvs; } void Collection::log_leaves(const int cost, const std::string &token, const std::vector &leaves) const { @@ -402,7 +382,7 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank, const int max_token_rank = 250; for(auto i=0; i> token_positions; MatchScore mscore; @@ -413,7 +393,7 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank, // for each token in the query, find the positions that it appears in this document for (art_leaf *token_leaf : query_suggestion) { std::vector positions; - uint32_t doc_index = token_leaf->values->ids.indexOf(doc_id); + uint32_t doc_index = token_leaf->values->ids.indexOf(seq_id); uint32_t start_offset = token_leaf->values->offset_index.at(doc_index); uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ? token_leaf->values->offsets.getLength() : @@ -427,7 +407,7 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank, token_positions.push_back(positions); } - mscore = MatchScore::match_score(doc_id, token_positions); + mscore = MatchScore::match_score(seq_id, token_positions); } int token_rank_score = max_token_rank - token_rank; @@ -437,11 +417,11 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank, ((uint64_t)(mscore.words_present) << 8) + (MAX_SEARCH_TOKENS - mscore.distance); - int64_t primary_rank_score = primary_rank_scores.count(doc_id) > 0 ? primary_rank_scores.at(doc_id) : 0; - int64_t secondary_rank_score = secondary_rank_scores.count(doc_id) > 0 ? secondary_rank_scores.at(doc_id) : 0; - topster.add(doc_id, match_score, primary_rank_score, secondary_rank_score); + int64_t primary_rank_score = primary_rank_scores.count(seq_id) > 0 ? primary_rank_scores.at(seq_id) : 0; + int64_t secondary_rank_score = secondary_rank_scores.count(seq_id) > 0 ? secondary_rank_scores.at(seq_id) : 0; + topster.add(seq_id, match_score, primary_rank_score, secondary_rank_score); /*std::cout << "token_rank_score: " << token_rank_score << ", match_score: " - << match_score << ", primary_rank_score: " << primary_rank_score << ", doc_id: " << doc_id << std::endl;*/ + << match_score << ", primary_rank_score: " << primary_rank_score << ", seq_id: " << seq_id << std::endl;*/ } } diff --git a/test/collection_test.cpp b/test/collection_test.cpp index ce2d1696..0322de6c 100644 --- a/test/collection_test.cpp +++ b/test/collection_test.cpp @@ -65,6 +65,10 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) { TEST_F(CollectionTest, ExactPhraseSearch) { std::vector results = collection->search("rocket launch", search_fields, 0, 10); + for(auto res: results) { + std::cout << res << std::endl; + } + std::cout << std::endl; ASSERT_EQ(5, results.size()); /*