Fixed a search issue: tokens that are not found in the index should be skipped.

This commit is contained in:
Kishore Nallan 2016-11-19 16:56:59 +05:30
parent 5736888935
commit 44d55cb13d
4 changed files with 59 additions and 11 deletions

View File

@ -14,6 +14,7 @@
- ~~Delete should remove from RocksDB~~
- ~~Speed up UUID generation~~
- Prefix-search strings should not be null-terminated
- Make the search score computation customizable
**API**

View File

@ -38,8 +38,6 @@ std::string Collection::add(std::string json_str) {
store->insert(get_seq_id_key(seq_id), document.dump());
store->insert(get_id_key(document["id"]), seq_id_str);
std::cout << "ID: " << document["id"] << ", Title: " << document["title"] << std::endl;
std::vector<std::string> tokens;
StringUtils::tokenize(document["title"], tokens, " ", true);
@ -119,12 +117,17 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
printf("%s - ", token.c_str());
printf("%.*s", leaves[i]->key_len, leaves[i]->key);
printf(" - max_cost: %d, - num_ids: %d\n", max_cost, leaves[i]->values->ids.getLength());
/*for(auto j=0; j<leaves[i]->values->ids.getLength(); j++) {
printf("id: %d\n", leaves[i]->values->ids.at(j));
}*/
}
token_leaves.push_back(leaves);
}
}
if(token_leaves.size() != tokens.size()) {
if(token_leaves.size() != tokens.size() && cost != max_cost) {
// There could have been a typo in one of the tokens, so let's try again with greater cost
// Or this could be a token that does not exist at all (rare)
//std::cout << "token_leaves.size() != tokens.size(), continuing..." << std::endl << std::endl;
cost++;
continue;
@ -149,8 +152,8 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
uint32_t* out = new uint32_t[result_size];
uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
delete result_ids;
delete curr;
delete[] result_ids;
delete[] curr;
result_ids = out;
}
@ -158,7 +161,7 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
score_results(topster, query_suggestion, result_ids, result_size);
total_results += result_size;
delete result_ids;
delete[] result_ids;
if(total_results >= max_results) break;
}
@ -182,6 +185,10 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
cost++;
}
if(results.size() == 0) {
// We could drop certain tokens and try
}
long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
@ -225,7 +232,7 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
const uint64_t final_score = ((uint64_t)(mscore.words_present * 32 + (20 - mscore.distance)) * UINT32_MAX) +
doc_scores.at(doc_id);
std::cout << "final_score: " << final_score << ", doc_id: " << doc_id << std::endl;
//std::cout << "final_score: " << final_score << ", doc_id: " << doc_id << std::endl;
/*
std::cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: "

View File

@ -33,6 +33,43 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
// For two documents of the same score, the larger doc_id appears first
std::vector<std::string> ids = {"1", "6", "foo", "13", "10", "8", "16"};
for(size_t i = 0; i < results.size(); i++) {
nlohmann::json result = results.at(i);
std::string id = ids.at(i);
std::string result_id = result["id"];
ASSERT_STREQ(id.c_str(), result_id.c_str());
}
}
// Searches for the two-token query "rocket launch" and verifies that exactly
// four documents are returned, in the expected order.
TEST_F(CollectionTest, ExactPhraseSearch) {
    /*
       Expected ranking, sorted by (match, diff, score):
       8:   score: 12, diff: 0
       1:   score: 15, diff: 4
       17:  score: 8,  diff: 4
       16:  score: 10, diff: 5
    */
    std::vector<std::string> expected_ids = {"8", "1", "17", "16"};

    std::vector<nlohmann::json> hits = collection->search("rocket launch", 0, 10);
    ASSERT_EQ(4, hits.size());

    // Compare the returned ids position by position against the expected order.
    size_t pos = 0;
    while(pos < hits.size()) {
        nlohmann::json hit = hits.at(pos);
        std::string expected = expected_ids.at(pos);
        std::string actual = hit["id"];
        ASSERT_STREQ(expected.c_str(), actual.c_str());
        pos++;
    }
}
TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
// Tokens that are not found in the index should be skipped
std::vector<nlohmann::json> results = collection->search("from DoesNotExist", 0, 10);
ASSERT_EQ(2, results.size());
std::vector<std::string> ids = {"2", "17"};
for(size_t i = 0; i < results.size(); i++) {
nlohmann::json result = results.at(i);
std::string id = ids.at(i);

View File

@ -1,11 +1,11 @@
{"points":15,"title":"How are cryogenic rocket plant propellants delivered to the growing launch pad?"}
{"points":14,"title":"Are there any (free) are online data archives for data from instruments on Soviet / Russian missions?"}
{"points":15,"title":"How are cryogenic rocket propellants delivered to the launch pad?"}
{"points":14,"title":"Are there any (free) online data archives for data from instruments on Soviet / Russian missions?"}
{"points":13,"title":"Where should I look in ISS to find mouldy food?"}
{"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"}
{"id": "foo", "points":13,"title":"The heaviest martian spacecraft"}
{"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
{"points":12,"title":"Could future astronauts eat during EVAs?"}
{"points":12,"title":"What is the power requirement of a spacesuit?"}
{"points":12,"title":"What is the power requirement of a rocket launch these days?"}
{"points":12,"title":"How does plant growing medium not scatter around?"}
{"points":12,"title":"Is there research for the optimal small crew size for a long space voyage?"}
{"points":12,"title":"Do long term missions receive insurance coverage?"}
@ -13,4 +13,7 @@
{"points":12,"title":"What were emergency procedures for failure of launch vehicles with the nuclear upper stages?"}
{"points":11,"title":"Mathematics used for F9R flyback lunch and landing"}
{"points":11,"title":"What considerations have been made lunch for waste produced during colonisation?"}
{"points":10,"title":"Do late do the propellants lunch ionize in chemical rockets?"}
{"points":10,"title":"How late do the launch propellants ionize in a chemical rocket mission?"}
{"points":8,"title":"How much does it cost to launch (right from start) a rocket today?"}
{"points":16,"title":"Difference between Space Dynamics & Astrodynamics in engineering perspective?"}
{"points":18,"title":"What kind of biological research does ISS do?"}