mirror of
https://github.com/typesense/typesense.git
synced 2025-05-20 13:42:26 +08:00
Fixed a search issue: tokens that are not found in the index should be skipped.
This commit is contained in:
parent
5736888935
commit
44d55cb13d
1
TODO.md
1
TODO.md
@ -14,6 +14,7 @@
|
||||
- ~~Delete should remove from RocksDB~~
|
||||
- ~~Speed up UUID generation~~
|
||||
- Prefix-search strings should not be null terminated
|
||||
- Make the search score computation customizable
|
||||
|
||||
**API**
|
||||
|
||||
|
@ -38,8 +38,6 @@ std::string Collection::add(std::string json_str) {
|
||||
store->insert(get_seq_id_key(seq_id), document.dump());
|
||||
store->insert(get_id_key(document["id"]), seq_id_str);
|
||||
|
||||
std::cout << "ID: " << document["id"] << ", Title: " << document["title"] << std::endl;
|
||||
|
||||
std::vector<std::string> tokens;
|
||||
StringUtils::tokenize(document["title"], tokens, " ", true);
|
||||
|
||||
@ -119,12 +117,17 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
|
||||
printf("%s - ", token.c_str());
|
||||
printf("%.*s", leaves[i]->key_len, leaves[i]->key);
|
||||
printf(" - max_cost: %d, - num_ids: %d\n", max_cost, leaves[i]->values->ids.getLength());
|
||||
/*for(auto j=0; j<leaves[i]->values->ids.getLength(); j++) {
|
||||
printf("id: %d\n", leaves[i]->values->ids.at(j));
|
||||
}*/
|
||||
}
|
||||
token_leaves.push_back(leaves);
|
||||
}
|
||||
}
|
||||
|
||||
if(token_leaves.size() != tokens.size()) {
|
||||
if(token_leaves.size() != tokens.size() && cost != max_cost) {
|
||||
// There could have been a typo in one of the tokens, so let's try again with greater cost
|
||||
// Or this could be a token that does not exist at all (rare)
|
||||
//std::cout << "token_leaves.size() != tokens.size(), continuing..." << std::endl << std::endl;
|
||||
cost++;
|
||||
continue;
|
||||
@ -149,8 +152,8 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
|
||||
uint32_t* out = new uint32_t[result_size];
|
||||
uint32_t* curr = query_suggestion[i]->values->ids.uncompress();
|
||||
result_size = Intersection::scalar(result_ids, result_size, curr, query_suggestion[i]->values->ids.getLength(), out);
|
||||
delete result_ids;
|
||||
delete curr;
|
||||
delete[] result_ids;
|
||||
delete[] curr;
|
||||
result_ids = out;
|
||||
}
|
||||
|
||||
@ -158,7 +161,7 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
|
||||
score_results(topster, query_suggestion, result_ids, result_size);
|
||||
|
||||
total_results += result_size;
|
||||
delete result_ids;
|
||||
delete[] result_ids;
|
||||
|
||||
if(total_results >= max_results) break;
|
||||
}
|
||||
@ -182,6 +185,10 @@ std::vector<nlohmann::json> Collection::search(std::string query, const int num_
|
||||
cost++;
|
||||
}
|
||||
|
||||
if(results.size() == 0) {
|
||||
// We could drop certain tokens and try
|
||||
}
|
||||
|
||||
long long int timeMillis = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - begin).count();
|
||||
std::cout << "Time taken for result calc: " << timeMillis << "us" << std::endl;
|
||||
|
||||
@ -225,7 +232,7 @@ void Collection::score_results(Topster<100> &topster, const std::vector<art_leaf
|
||||
const uint64_t final_score = ((uint64_t)(mscore.words_present * 32 + (20 - mscore.distance)) * UINT32_MAX) +
|
||||
doc_scores.at(doc_id);
|
||||
|
||||
std::cout << "final_score: " << final_score << ", doc_id: " << doc_id << std::endl;
|
||||
//std::cout << "final_score: " << final_score << ", doc_id: " << doc_id << std::endl;
|
||||
|
||||
/*
|
||||
std::cout << "result_ids[i]: " << result_ids[i] << " - mscore.distance: "
|
||||
|
@ -33,6 +33,43 @@ TEST_F(CollectionTest, ExactSearchShouldBeStable) {
|
||||
// For two documents of the same score, the larger doc_id appears first
|
||||
std::vector<std::string> ids = {"1", "6", "foo", "13", "10", "8", "16"};
|
||||
|
||||
for(size_t i = 0; i < results.size(); i++) {
|
||||
nlohmann::json result = results.at(i);
|
||||
std::string id = ids.at(i);
|
||||
std::string result_id = result["id"];
|
||||
ASSERT_STREQ(id.c_str(), result_id.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(CollectionTest, ExactPhraseSearch) {
|
||||
std::vector<nlohmann::json> results = collection->search("rocket launch", 0, 10);
|
||||
ASSERT_EQ(4, results.size());
|
||||
|
||||
/*
|
||||
Sort by (match, diff, score)
|
||||
8: score: 12, diff: 0
|
||||
1: score: 15, diff: 4
|
||||
17: score: 8, diff: 4
|
||||
16: score: 10, diff: 5
|
||||
*/
|
||||
|
||||
std::vector<std::string> ids = {"8", "1", "17", "16"};
|
||||
|
||||
for(size_t i = 0; i < results.size(); i++) {
|
||||
nlohmann::json result = results.at(i);
|
||||
std::string id = ids.at(i);
|
||||
std::string result_id = result["id"];
|
||||
ASSERT_STREQ(id.c_str(), result_id.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(CollectionTest, SkipUnindexedTokensDuringPhraseSearch) {
|
||||
// Tokens that are not found in the index should be skipped
|
||||
std::vector<nlohmann::json> results = collection->search("from DoesNotExist", 0, 10);
|
||||
ASSERT_EQ(2, results.size());
|
||||
|
||||
std::vector<std::string> ids = {"2", "17"};
|
||||
|
||||
for(size_t i = 0; i < results.size(); i++) {
|
||||
nlohmann::json result = results.at(i);
|
||||
std::string id = ids.at(i);
|
||||
|
@ -1,11 +1,11 @@
|
||||
{"points":15,"title":"How are cryogenic rocket plant propellants delivered to the growing launch pad?"}
|
||||
{"points":14,"title":"Are there any (free) are online data archives for data from instruments on Soviet / Russian missions?"}
|
||||
{"points":15,"title":"How are cryogenic rocket propellants delivered to the launch pad?"}
|
||||
{"points":14,"title":"Are there any (free) online data archives for data from instruments on Soviet / Russian missions?"}
|
||||
{"points":13,"title":"Where should I look in ISS to find mouldy food?"}
|
||||
{"points":13,"title":"Is solar system active cryovolcanism a potential viable power source for future colonies?"}
|
||||
{"id": "foo", "points":13,"title":"The heaviest martian spacecraft"}
|
||||
{"points":13,"title":"To what extent are the US modules of ISS based on the Spacelab design?"}
|
||||
{"points":12,"title":"Could future astronauts eat during EVAs?"}
|
||||
{"points":12,"title":"What is the power requirement of a spacesuit?"}
|
||||
{"points":12,"title":"What is the power requirement of a rocket launch these days?"}
|
||||
{"points":12,"title":"How does plant growing medium not scatter around?"}
|
||||
{"points":12,"title":"Is there research for the optimal small crew size for a long space voyage?"}
|
||||
{"points":12,"title":"Do long term missions receive insurance coverage?"}
|
||||
@ -13,4 +13,7 @@
|
||||
{"points":12,"title":"What were emergency procedures for failure of launch vehicles with the nuclear upper stages?"}
|
||||
{"points":11,"title":"Mathematics used for F9R flyback lunch and landing"}
|
||||
{"points":11,"title":"What considerations have been made lunch for waste produced during colonisation?"}
|
||||
{"points":10,"title":"Do late do the propellants lunch ionize in chemical rockets?"}
|
||||
{"points":10,"title":"How late do the launch propellants ionize in a chemical rocket mission?"}
|
||||
{"points":8,"title":"How much does it cost to launch (right from start) a rocket today?"}
|
||||
{"points":16,"title":"Difference between Space Dynamics & Astrodynamics in engineering perspective?"}
|
||||
{"points":18,"title":"What kind of biological research does ISS do?"}
|
Loading…
x
Reference in New Issue
Block a user