Fixed a search issue: tokens that are not found in the index should be skipped.

2025-05-21 14:12:27 +08:00 · 2016-11-19 16:56:59 +05:30 · 2016-11-19 16:56:59 +05:30 · 44d55cb13d
commit 44d55cb13d
parent 5736888935
4 changed files with 59 additions and 11 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -14,6 +14,7 @@
 - ~~Delete should remove from RocksDB~~
 - ~~Speed up UUID generation~~
 - Prefix-search strings should not be null terminated
+- Make the search score computation customizable

 **API**

--- a/src/collection.cpp
+++ b/src/collection.cpp
@@ -38,8 +38,6 @@ std::string Collection::add(std::string json_str) {
    store->insert(get_seq_id_key(seq_id), document.dump());
    store->insert(get_id_key(document["id"]), seq_id_str);

-    std::cout << "ID: " << document["id"] << ", Title: " << document["title"] << std::endl;
-
    std::vector<std::string> tokens;
    StringUtils::tokenize(document["title"], tokens, " ", true);

@@ -119,12 +117,17 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
                    printf("%s - ", token.c_str());
                    printf("%.*s", leaves[i]->key_len, leaves[i]->key);
                    printf(" - max_cost: %d, - num_ids: %d\n", max_cost, leaves[i]->values->ids.getLength());
+                    /*for(auto j=0; j<leaves[i]->values->ids.getLength(); j++) {
+                        printf("id: %d\n", leaves[i]->values->ids.at(j));
+                    }*/
                }
                token_leaves.push_back(leaves);
            }
        }

-        if(token_leaves.size() != tokens.size()) {
+        if(token_leaves.size() != tokens.size() && cost != max_cost) {
+            // There could have been a typo in one of the tokens, so let's try again with greater cost
+            // Or this could be a token that does not exist at all (rare)
            //std::cout << "token_leaves.size() != tokens.size(), continuing..." << std::endl << std::endl;
            cost++;
            continue;
@@ -149,8 +152,8 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
                uint32_t* out = new uint32_t[result_size];
                uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
                result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
-                delete result_ids;
-                delete curr;
+                delete[] result_ids;
+                delete[] curr;
                result_ids = out;
            }

@@ -158,7 +161,7 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
            score_results(topster, query_suggestion, result_ids, result_size);

            total_results += result_size;
-            delete result_ids;
+            delete[] result_ids;

            if(total_results >= max_results) break;
        }
@@ -182,6 +185,10 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
        cost++;
    }

+    if(results.size() == 0) {
+        // We could drop certain tokens and try
+    }
+
    long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
    std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;

@@ -225,7 +232,7 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
        const uint64_t final_score = ((uint64_t)(mscore.words_present * 32 + (20 - mscore.distance)) * UINT32_MAX) +
                                     doc_scores.at(doc_id);

-        std::cout << "final_score: " << final_score << ", doc_id: " << doc_id << std::endl;
+        //std::cout << "final_score: " << final_score << ", doc_id: " << doc_id << std::endl;

        /*
          std::cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: "
--- a/test/collection_test.cpp
+++ b/test/collection_test.cpp
@@ -33,6 +33,43 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
    // For two documents of the same score, the larger doc_id appears first
    std::vector<std::string> ids = {"1", "6", "foo", "13", "10", "8", "16"};

+    for(size_t i = 0; i < results.size(); i++) {
+        nlohmann::json result = results.at(i);
+        std::string id = ids.at(i);
+        std::string result_id = result["id"];
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+}
+
+TEST_F(CollectionTest, ExactPhraseSearch) {
+    std::vector<nlohmann::json> results = collection->search("rocket launch", 0, 10);
+    ASSERT_EQ(4, results.size());
+
+    /*
+       Sort by (match, diff, score)
+       8:   score: 12, diff: 0
+       1:   score: 15, diff: 4
+       17:  score: 8,  diff: 4
+       16:  score: 10, diff: 5
+    */
+
+    std::vector<std::string> ids = {"8", "1", "17", "16"};
+
+    for(size_t i = 0; i < results.size(); i++) {
+        nlohmann::json result = results.at(i);
+        std::string id = ids.at(i);
+        std::string result_id = result["id"];
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
+}
+
+TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
+    // Tokens that are not found in the index should be skipped
+    std::vector<nlohmann::json> results = collection->search("from DoesNotExist", 0, 10);
+    ASSERT_EQ(2, results.size());
+
+    std::vector<std::string> ids = {"2", "17"};
+
    for(size_t i = 0; i < results.size(); i++) {
        nlohmann::json result = results.at(i);
        std::string id = ids.at(i);
--- a/test/documents.jsonl
+++ b/test/documents.jsonl
@@ -1,11 +1,11 @@
-{"points":15,"title":"How are cryogenic rocket plant propellants delivered to the growing launch pad?"}
-{"points":14,"title":"Are there any (free) are online data archives for data from instruments on Soviet / Russian missions?"}
+{"points":15,"title":"How are cryogenic rocket propellants delivered to the launch pad?"}
+{"points":14,"title":"Are there any (free) online data archives for data from instruments on Soviet / Russian missions?"}
 {"points":13,"title":"Where should I look in ISS to find mouldy food?"}
 {"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"}
 {"id": "foo", "points":13,"title":"The heaviest martian spacecraft"}
 {"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
 {"points":12,"title":"Could future astronauts eat during EVAs?"}
-{"points":12,"title":"What is the power requirement of a spacesuit?"}
+{"points":12,"title":"What is the power requirement of a rocket launch these days?"}
 {"points":12,"title":"How does plant growing medium not scatter around?"}
 {"points":12,"title":"Is there research for the optimal small crew size for a long space voyage?"}
 {"points":12,"title":"Do long term missions receive insurance coverage?"}
@@ -13,4 +13,7 @@
 {"points":12,"title":"What were emergency procedures for failure of launch vehicles with the nuclear upper stages?"}
 {"points":11,"title":"Mathematics used for F9R flyback lunch and landing"}
 {"points":11,"title":"What considerations have been made lunch for waste produced during colonisation?"}
-{"points":10,"title":"Do late do the propellants lunch ionize in chemical rockets?"}
+{"points":10,"title":"How late do the launch propellants ionize in a chemical rocket mission?"}
+{"points":8,"title":"How much does it cost to launch (right from start) a rocket today?"}
+{"points":16,"title":"Difference between Space Dynamics & Astrodynamics in engineering perspective?"}
+{"points":18,"title":"What kind of biological research does ISS do?"}