Move duplicate ID detection right inside topster.

Kishore Nallan 2017-01-08 21:44:36 +05:30
parent 2f08eca12e
commit d831c49817
5 changed files with 38 additions and 46 deletions
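In short: Collection::search_candidates() previously created a fresh Topster<100> per candidate combination, sorted it, and copied survivors into a caller-owned result_kvs vector while a separate spp::sparse_hash_set filtered out sequence IDs already emitted. After this commit a single Topster<100> is shared across the whole search, and Topster::add() itself rejects keys it has already accepted. A self-contained toy sketch of that control flow (simplified types and fake scores, purely illustrative; the real heap logic lives in topster.h):

#include <algorithm>
#include <cstdint>
#include <unordered_set>
#include <vector>

struct Hit { uint64_t seq_id; uint64_t score; };

// Toy stand-in for Topster<100>: add() dedups at insertion, sort() ranks.
struct MiniTopster {
    std::vector<Hit> hits;
    std::unordered_set<uint64_t> dedup_keys; // real code uses spp::sparse_hash_set

    void add(uint64_t seq_id, uint64_t score) {
        if(dedup_keys.count(seq_id) != 0) {
            return; // same document reached via another token combination
        }
        dedup_keys.insert(seq_id);
        hits.push_back({seq_id, score});
    }

    void sort() {
        std::sort(hits.begin(), hits.end(),
                  [](const Hit & a, const Hit & b) { return a.score > b.score; });
    }
};

// Stand-in for score_results(): every candidate combination feeds the same topster.
void score_results(MiniTopster & topster, const std::vector<uint64_t> & ids) {
    for(uint64_t id: ids) {
        topster.add(id, id * 10); // fake score
    }
}

int main() {
    MiniTopster topster;                // one accumulator for the entire search
    score_results(topster, {3, 1, 3}); // first token combination; 3 repeats
    score_results(topster, {2, 1});    // second combination re-surfaces 1
    topster.sort();
    return topster.hits.size() == 3 ? 0 : 1; // 3, 1 and 2 are each counted once
}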

View File

@@ -32,10 +32,11 @@
 - Assumption that all tokens match for scoring is no longer true
 - Primary_rank_scores and secondary_rank_scores hashmaps should be combined
 - Proper logging
-- Have set inside topster itself
+- ~~Have set inside topster itself~~
 - Restore records as well on restart (like for meta)
 - Persist next_seq_id
 - collection_id should be int, not string
 - clean special chars before indexing
 
 **API**

View File

@@ -43,12 +43,10 @@ private:
                    long long int n);
 
     void log_leaves(const int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const;
 
-    std::vector<Topster<100>::KV> search(std::string & query, const std::string & field, const int num_typos, const size_t num_results,
-                                         std::vector<Topster<100>::KV> & result_kvs, spp::sparse_hash_set<uint64_t> & result_set,
-                                         const token_ordering token_order = FREQUENCY, const bool prefix = false);
-    void search_candidates(int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves,
-                           std::vector<Topster<100>::KV> & result_kvs, spp::sparse_hash_set<uint64_t> & dedup_seq_ids,
+    void search(std::string & query, const std::string & field, const int num_typos, const size_t num_results,
+                Topster<100> & topster, const token_ordering token_order = FREQUENCY, const bool prefix = false);
+    void search_candidates(int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
                            size_t & total_results, const size_t & max_results);
 
     void index_string_field(const std::string &field_name, art_tree *t, const nlohmann::json &document, uint32_t seq_id) const;
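Seen from a call site, the new declarations remove two pieces of per-call bookkeeping: the result vector and the dedup set collapse into the single topster argument. A rough before/after sketch with stubbed-out types so the two shapes compile side by side (the stub bodies are empty and the real signatures also carry num_typos, num_results, token_order and prefix, trimmed here for brevity):

#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_set>
#include <vector>

// Minimal stub shaped like the real Topster<100>; std::unordered_set
// stands in for spp::sparse_hash_set throughout this sketch.
template <size_t N> struct Topster {
    struct KV { uint64_t key; uint64_t match_score; };
    size_t size = 0;
    void sort() {}
};

// Old shape: caller supplies the results container and the dedup set.
void search_before(std::string & query, const std::string & field,
                   std::vector<Topster<100>::KV> & result_kvs,
                   std::unordered_set<uint64_t> & result_set) { /* ... */ }

// New shape: one shared accumulator; dedup happens inside Topster::add().
void search_after(std::string & query, const std::string & field,
                  Topster<100> & topster) { /* ... */ }

int main() {
    std::string query = "rocket launch";

    std::vector<Topster<100>::KV> result_kvs;   // before: two extra objects
    std::unordered_set<uint64_t> result_set;    // threaded through every call
    search_before(query, "title", result_kvs, result_set);

    Topster<100> topster;                       // after: just the topster
    search_after(query, "title", topster);
    topster.sort();
    return 0;
}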

View File

@@ -4,6 +4,7 @@
 #include <climits>
 #include <cstdio>
 #include <algorithm>
+#include <sparsepp.h>
 
 /*
 * Remembers the max-K elements seen so far using a min-heap
@@ -19,6 +20,8 @@ struct Topster {
     uint32_t size;
 
+    spp::sparse_hash_set<uint64_t> dedup_keys;
+
     Topster(): size(0){
     }
@@ -30,6 +33,12 @@ struct Topster {
     }
 
     void add(const uint64_t &key, const uint64_t &match_score, const int64_t &primary_attr, const int64_t &secondary_attr){
+        if(dedup_keys.count(key) != 0) {
+            return ;
+        }
+
+        dedup_keys.insert(key);
+
         if (size >= MAX_SIZE) {
             if(!is_greater(data[0], match_score, primary_attr, secondary_attr)) {
                 // when incoming value is less than the smallest in the heap, ignore
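One consequence of where the new guard sits: the dedup check runs before the capacity/score comparison, so the first add() for a key fixes its entry, a later call with the same key is dropped even if it carries a better score, and a key evicted from the heap stays in dedup_keys so it cannot re-enter. A tiny runnable illustration of that first-sighting-wins ordering (a toy map instead of the real min-heap; a sketch of the pattern, not the actual Topster):

#include <cassert>
#include <cstdint>
#include <map>
#include <unordered_set>

struct ToyTopster {
    std::map<uint64_t, uint64_t> best;       // key -> stored match_score
    std::unordered_set<uint64_t> dedup_keys; // mirrors the new member above

    void add(uint64_t key, uint64_t match_score) {
        if(dedup_keys.count(key) != 0) {
            return; // duplicate: ignored even when match_score is higher
        }
        dedup_keys.insert(key);
        best[key] = match_score;
    }
};

int main() {
    ToyTopster t;
    t.add(5, 100);
    t.add(5, 999);              // better score for an already-seen key
    assert(t.best[5] == 100);   // first sighting wins; the 999 was dropped
    assert(t.best.size() == 1);
    return 0;
}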

View File

@@ -131,8 +131,7 @@ void Collection::index_string_field(const std::string &field_name, art_tree *t,
 }
 
 void Collection::search_candidates(int & token_rank, std::vector<std::vector<art_leaf*>> & token_leaves,
-                                   std::vector<Topster<100>::KV> & result_kvs, spp::sparse_hash_set<uint64_t> & dedup_seq_ids,
-                                   size_t & total_results, const size_t & max_results) {
+                                   Topster<100> & topster, size_t & total_results, const size_t & max_results) {
     const size_t combination_limit = 10;
     auto product = []( long long a, std::vector<art_leaf*>& b ) { return a*b.size(); };
     long long int N = std::accumulate(token_leaves.begin(), token_leaves.end(), 1LL, product);
@@ -165,19 +164,10 @@ void Collection::search_candidates(int & token_rank, std::vector<std::vector<art
         }
 
         // go through each matching document id and calculate match score
-        Topster<100> topster;
         score_results(topster, token_rank, query_suggestion, result_ids, result_size);
         delete[] result_ids;
 
-        topster.sort();
-        for (uint32_t i = 0; i < topster.size && total_results < max_results; i++) {
-            uint64_t seq_id = topster.getKeyAt(i);
-            if(dedup_seq_ids.count(seq_id) == 0) {
-                result_kvs.push_back(topster.getKV(i));
-                dedup_seq_ids.emplace(seq_id);
-                total_results++;
-            }
-        }
+        total_results += topster.size;
 
         if(total_results >= max_results) {
             break;
@@ -188,8 +178,7 @@ std::vector<nlohmann::json> Collection::search(std::vec
 std::vector<nlohmann::json> Collection::search(std::string query, const std::vector<std::string> fields,
                                                const int num_typos, const size_t num_results,
                                                const token_ordering token_order, const bool prefix) {
-    int size = index_map.size();
-    std::cout << "search size: " << size << std::endl;
+    Topster<100> topster;
 
     // Order of `fields` are used to rank results
     auto begin = std::chrono::high_resolution_clock::now();
@@ -198,15 +187,11 @@ std::vector<nlohmann::json> Collection::search(std::string query, const std::vec
     for(int i = 0; i < fields.size(); i++) {
         const std::string & field = fields[i];
 
-        // Container for holding the results
-        std::vector<Topster<100>::KV> result_kvs;
-
-        // To prevent duplicate results, while preserving order of result vector
-        spp::sparse_hash_set<uint64_t> result_set;
-
-        search(query, field, num_typos, num_results, result_kvs, result_set, token_order, prefix);
-
-        for(auto result_kv: result_kvs) {
-            field_order_kvs.push_back(std::make_pair(fields.size() - i, result_kv));
+        search(query, field, num_typos, num_results, topster, token_order, prefix);
+        topster.sort();
+
+        for(auto t = 0; t < topster.size && t < num_results; t++) {
+            field_order_kvs.push_back(std::make_pair(fields.size() - i, topster.getKV(t)));
         }
     }
@@ -243,18 +228,15 @@ std::vector<nlohmann::json> Collection::search(std::string query, const std::vec
    4. Intersect the lists to find docs that match each phrase
    5. Sort the docs based on some ranking criteria
 */
-std::vector<Topster<100>::KV> Collection::search(std::string & query, const std::string & field,
-                                                 const int num_typos, const size_t num_results,
-                                                 std::vector<Topster<100>::KV> & result_kvs,
-                                                 spp::sparse_hash_set<uint64_t> & result_set,
-                                                 const token_ordering token_order, const bool prefix) {
+void Collection::search(std::string & query, const std::string & field, const int num_typos, const size_t num_results,
+                        Topster<100> & topster, const token_ordering token_order, const bool prefix) {
     std::vector<std::string> tokens;
     StringUtils::tokenize(query, tokens, " ", true);
 
     const int max_cost = (num_typos < 0 || num_typos > 2) ? 2 : num_typos;
     const size_t max_results = std::min(num_results, (size_t) Collection::MAX_RESULTS);
 
-    size_t total_results = result_kvs.size();
+    size_t total_results = topster.size;
 
     // To prevent us from doing ART search repeatedly as we iterate through possible corrections
     spp::sparse_hash_map<std::string, std::vector<art_leaf*>> token_cost_cache;
@@ -345,7 +327,7 @@ std::vector<Topster<100>::KV> Collection::search(std::string & query, const std:
         if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) {
             // If a) all tokens were found, or b) Some were skipped because they don't exist within max_cost,
             // go ahead and search for candidates with what we have so far
-            search_candidates(token_rank, token_leaves, result_kvs, result_set, total_results, max_results);
+            search_candidates(token_rank, token_leaves, topster, total_results, max_results);
 
             if (total_results >= max_results) {
                 // If we don't find enough results, we continue outerloop (looking at tokens with greater cost)
@@ -357,7 +339,7 @@ std::vector<Topster<100>::KV> Collection::search(std::string & query, const std:
     }
 
     // When there are not enough overall results and atleast one token has results
-    if(result_kvs.size() < max_results && token_to_count.size() > 1) {
+    if(topster.size < max_results && token_to_count.size() > 1) {
         // Drop certain token with least hits and try searching again
         std::string truncated_query;
@@ -378,10 +360,8 @@ std::vector<Topster<100>::KV> Collection::search(std::string & query, const std:
             }
         }
 
-        return search(truncated_query, field, num_typos, num_results, result_kvs, result_set, token_order, prefix);
+        return search(truncated_query, field, num_typos, num_results, topster, token_order, prefix);
     }
-
-    return result_kvs;
 }
 
 void Collection::log_leaves(const int cost, const std::string &token, const std::vector<art_leaf *> &leaves) const {
@@ -402,7 +382,7 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank,
     const int max_token_rank = 250;
 
     for(auto i=0; i<result_size; i++) {
-        uint32_t doc_id = result_ids[i];
+        uint32_t seq_id = result_ids[i];
         std::vector<std::vector<uint16_t>> token_positions;
 
         MatchScore mscore;
@@ -413,7 +393,7 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank,
             // for each token in the query, find the positions that it appears in this document
            for (art_leaf *token_leaf : query_suggestion) {
                 std::vector<uint16_t> positions;
-                uint32_t doc_index = token_leaf->values->ids.indexOf(doc_id);
+                uint32_t doc_index = token_leaf->values->ids.indexOf(seq_id);
                 uint32_t start_offset = token_leaf->values->offset_index.at(doc_index);
                 uint32_t end_offset = (doc_index == token_leaf->values->ids.getLength() - 1) ?
                                       token_leaf->values->offsets.getLength() :
@@ -427,7 +407,7 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank,
                 token_positions.push_back(positions);
             }
 
-            mscore = MatchScore::match_score(doc_id, token_positions);
+            mscore = MatchScore::match_score(seq_id, token_positions);
         }
 
         int token_rank_score = max_token_rank - token_rank;
@@ -437,11 +417,11 @@ void Collection::score_results(Topster<100> &topster, const int & token_rank,
                                ((uint64_t)(mscore.words_present) << 8) +
                                (MAX_SEARCH_TOKENS - mscore.distance);
 
-        int64_t primary_rank_score = primary_rank_scores.count(doc_id) > 0 ? primary_rank_scores.at(doc_id) : 0;
-        int64_t secondary_rank_score = secondary_rank_scores.count(doc_id) > 0 ? secondary_rank_scores.at(doc_id) : 0;
-        topster.add(doc_id, match_score, primary_rank_score, secondary_rank_score);
+        int64_t primary_rank_score = primary_rank_scores.count(seq_id) > 0 ? primary_rank_scores.at(seq_id) : 0;
+        int64_t secondary_rank_score = secondary_rank_scores.count(seq_id) > 0 ? secondary_rank_scores.at(seq_id) : 0;
+        topster.add(seq_id, match_score, primary_rank_score, secondary_rank_score);
 
         /*std::cout << "token_rank_score: " << token_rank_score << ", match_score: "
-                  << match_score << ", primary_rank_score: " << primary_rank_score << ", doc_id: " << doc_id << std::endl;*/
+                  << match_score << ", primary_rank_score: " << primary_rank_score << ", seq_id: " << seq_id << std::endl;*/
     }
 }
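The composite match_score packed in score_results() places each ranking signal in its own bit range, so a single integer comparison orders hits by token rank first, then by how many query words are present, then by inverse token distance, provided each signal stays within its slot. The highest addend sits above the visible hunk; a worked example assuming it is token_rank_score << 16 (as the surrounding lines suggest) and taking MAX_SEARCH_TOKENS as an illustrative 20:

#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t MAX_SEARCH_TOKENS = 20; // assumed value, for the arithmetic only
    uint64_t token_rank_score = 250 - 0;   // max_token_rank - token_rank
    uint64_t words_present = 2;            // query words found in this document
    uint64_t distance = 1;                 // token proximity; smaller is better

    uint64_t match_score = (token_rank_score << 16) +
                           (words_present << 8) +
                           (MAX_SEARCH_TOKENS - distance);

    // 250*65536 + 2*256 + (20-1) = 16384000 + 512 + 19
    std::printf("match_score = %llu\n", (unsigned long long) match_score); // 16384531
    return 0;
}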

View File

@@ -65,6 +65,10 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
 TEST_F(CollectionTest, ExactPhraseSearch) {
     std::vector<nlohmann::json> results = collection->search("rocket launch", search_fields, 0, 10);
+    for(auto res: results) {
+        std::cout << res << std::endl;
+    }
+    std::cout << std::endl;
     ASSERT_EQ(5, results.size());
 
     /*