Mirror of https://github.com/typesense/typesense.git
Tweak score calculation - number of words present is more important than candidate rank score.
This commit is contained in:
parent b7bc974b8e
commit 20a3139dd2
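
What changed: the per-candidate counter previously named token_rank is renamed to candidate_rank, and the packed match_score is reordered so that the number of query words present in a document occupies the most significant bits, with the candidate rank score demoted to a tie-breaker. The sketch below is not part of the commit; the helper name pack_match_score is invented for illustration, max_candidate_rank mirrors the 250 used in score_results, and MAX_SEARCH_TOKENS is assumed to be 10 because its value is not shown in the diff.

// Illustrative sketch of the reordered packing used in score_results().
#include <cstdint>

constexpr int MAX_CANDIDATE_RANK = 250;  // mirrors max_candidate_rank in score_results()
constexpr int MAX_SEARCH_TOKENS  = 10;   // assumed value, not taken from the codebase

uint64_t pack_match_score(uint64_t words_present, int candidate_rank, uint64_t distance) {
    const uint64_t candidate_rank_score = MAX_CANDIDATE_RANK - candidate_rank;
    // words_present now sits in the highest bit group, so a document matching more
    // query words always outranks one that merely came from an earlier candidate.
    return (words_present << 16) +
           (candidate_rank_score << 8) +
           (MAX_SEARCH_TOKENS - distance);
}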
@@ -342,7 +342,7 @@ void Collection::do_facets(std::vector<facet> & facets, uint32_t* result_ids, si
 }
 
 void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_length, std::vector<facet> & facets,
-                                   const std::vector<sort_field> & sort_fields, int & token_rank,
+                                   const std::vector<sort_field> & sort_fields, int & candidate_rank,
                                    std::vector<std::vector<art_leaf*>> & token_leaves, Topster<100> & topster,
                                    size_t & total_results, uint32_t** all_result_ids, size_t & all_result_ids_len,
                                    const size_t & max_results) {
@@ -353,7 +353,11 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
     for(long long n=0; n<N && n<combination_limit; ++n) {
         // every element in `query_suggestion` contains a token and its associated hits
         std::vector<art_leaf *> query_suggestion = next_suggestion(token_leaves, n);
-        token_rank++;
+        candidate_rank++;
+
+        /*for(auto i=0; i < query_suggestion.size(); i++) {
+            std::cout << "i: " << i << " - " << query_suggestion[i]->key << std::endl;
+        }*/
 
         // initialize results with the starting element (for further intersection)
         uint32_t* result_ids = query_suggestion[0]->values->ids.uncompress();
@@ -384,7 +388,7 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
             do_facets(facets, filtered_result_ids, filtered_results_size);
 
             // go through each matching document id and calculate match score
-            score_results(sort_fields, token_rank, topster, query_suggestion, filtered_result_ids, filtered_results_size);
+            score_results(sort_fields, candidate_rank, topster, query_suggestion, filtered_result_ids, filtered_results_size);
 
             delete[] filtered_result_ids;
             delete[] result_ids;
@@ -397,7 +401,7 @@ void Collection::search_candidates(uint32_t* filter_ids, size_t filter_ids_lengt
             delete [] *all_result_ids;
             *all_result_ids = new_all_result_ids;
 
-            score_results(sort_fields, token_rank, topster, query_suggestion, result_ids, result_size);
+            score_results(sort_fields, candidate_rank, topster, query_suggestion, result_ids, result_size);
             delete[] result_ids;
         }
 
@@ -731,7 +735,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui
 
     const size_t combination_limit = 10;
     auto product = []( long long a, std::vector<int>& b ) { return a*b.size(); };
-    int token_rank = 0;
+    int candidate_rank = 0;
     long long n = 0;
     long long int N = std::accumulate(token_to_costs.begin(), token_to_costs.end(), 1LL, product);
 
@@ -754,8 +758,8 @@ void Collection::search_field(std::string & query, const std::string & field, ui
         const std::string token_cost_hash = token + std::to_string(costs[token_index]);
 
         std::vector<art_leaf*> leaves;
-        /*std::cout << "\nSearching for: " << token << " - cost: " << costs[token_index] << ", token_rank: "
-                     << token_rank << std::endl;*/
+        /*std::cout << "\nSearching for: " << token << " - cost: " << costs[token_index] << ", candidate_rank: "
+                     << candidate_rank << std::endl;*/
 
         if(token_cost_cache.count(token_cost_hash) != 0) {
             leaves = token_cost_cache[token_cost_hash];
@@ -801,7 +805,7 @@ void Collection::search_field(std::string & query, const std::string & field, ui
 
         if(token_leaves.size() != 0 && token_leaves.size() == tokens.size()) {
             // If all tokens were found, go ahead and search for candidates with what we have so far
-            search_candidates(filter_ids, filter_ids_length, facets, sort_fields, token_rank, token_leaves, topster,
+            search_candidates(filter_ids, filter_ids_length, facets, sort_fields, candidate_rank, token_leaves, topster,
                               total_results, all_result_ids, all_result_ids_len, max_results);
 
             if (total_results >= max_results) {
@@ -851,11 +855,11 @@ void Collection::log_leaves(const int cost, const std::string &token, const std:
     }
 }
 
-void Collection::score_results(const std::vector<sort_field> & sort_fields, const int & token_rank,
+void Collection::score_results(const std::vector<sort_field> & sort_fields, const int & candidate_rank,
                                Topster<100> & topster, const std::vector<art_leaf *> &query_suggestion,
                                const uint32_t *result_ids, const size_t result_size) const {
 
-    const int max_token_rank = 250;
+    const int max_candidate_rank = 250;
     spp::sparse_hash_map<art_leaf*, uint32_t*> leaf_to_indices;
 
     if(query_suggestion.size() != 1) {
@@ -922,11 +926,11 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
             mscore = MatchScore::match_score(seq_id, token_positions);
         }
 
-        int token_rank_score = max_token_rank - token_rank;
+        int candidate_rank_score = max_candidate_rank - candidate_rank;
 
         // Construct a single match_score from individual components (for multi-field sort)
-        const uint64_t match_score = (token_rank_score << 16) +
-                                     ((uint64_t)(mscore.words_present) << 8) +
+        const uint64_t match_score = ((uint64_t)(mscore.words_present) << 16) +
+                                     (candidate_rank_score << 8) +
                                      (MAX_SEARCH_TOKENS - mscore.distance);
 
         int64_t primary_rank_score = (primary_rank_scores && primary_rank_scores->count(seq_id) > 0) ?
@@ -937,7 +941,7 @@ void Collection::score_results(const std::vector<sort_field> & sort_fields, cons
                                    primary_rank_factor * primary_rank_score,
                                    secondary_rank_factor * secondary_rank_score);
 
-        /*std::cout << "token_rank_score: " << token_rank_score << ", match_score: "
+        /*std::cout << "candidate_rank_score: " << candidate_rank_score << ", match_score: "
                   << match_score << ", primary_rank_score: " << primary_rank_score << ", seq_id: " << seq_id << std::endl;*/
     }
 
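To make the effect of swapping words_present and candidate_rank_score concrete, here is a self-contained, hedged comparison. The two documents and all of their numbers are invented, and MAX_SEARCH_TOKENS is again assumed to be 10; only the two packing formulas are taken from the diff above. Under the old packing, a document produced by an earlier candidate could outrank one that matched more of the query's words; under the new packing, the word count wins.

// Standalone worked example: old vs. new packing for two hypothetical documents.
// All numbers are invented for illustration; only the packing formulas come from the diff.
#include <cstdint>
#include <iostream>

int main() {
    const int max_candidate_rank = 250;
    const int MAX_SEARCH_TOKENS  = 10;   // assumed value

    // Document A: matches 3 query words, but was produced by a later candidate (rank 40).
    // Document B: matches only 2 query words, but came from the very first candidate (rank 1).
    struct Doc { uint64_t words_present; int candidate_rank; uint64_t distance; };
    Doc a{3, 40, 2}, b{2, 1, 0};

    auto old_pack = [&](const Doc & d) {
        int rank_score = max_candidate_rank - d.candidate_rank;
        return ((uint64_t)rank_score << 16) +
               (d.words_present << 8) +
               (MAX_SEARCH_TOKENS - d.distance);
    };

    auto new_pack = [&](const Doc & d) {
        int rank_score = max_candidate_rank - d.candidate_rank;
        return (d.words_present << 16) +
               ((uint64_t)rank_score << 8) +
               (MAX_SEARCH_TOKENS - d.distance);
    };

    std::cout << "old: A=" << old_pack(a) << " B=" << old_pack(b) << "\n";  // B outranks A
    std::cout << "new: A=" << new_pack(a) << " B=" << new_pack(b) << "\n";  // A outranks B
    return 0;
}

The test hunk below (from the prefix-searching test) exercises this ordering through the "what ex" query.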
@@ -352,6 +352,19 @@ TEST_F(CollectionTest, PrefixSearching) {
         std::string id = ids.at(i);
         ASSERT_STREQ(id.c_str(), result_id.c_str());
     }
+
+    std::cout << "WHAT EX..." << std::endl;
+
+    results = collection->search("what ex", query_fields, "", facets, sort_fields, 0, 10, MAX_SCORE, true);
+    ASSERT_EQ(9, results["hits"].size());
+    ids = {"6", "12", "19", "22", "13", "8", "15", "24", "21"};
+
+    for(size_t i = 0; i < results["hits"].size(); i++) {
+        nlohmann::json result = results["hits"].at(i);
+        std::string result_id = result["id"];
+        std::string id = ids.at(i);
+        ASSERT_STREQ(id.c_str(), result_id.c_str());
+    }
 }
 
 TEST_F(CollectionTest, MultipleFields) {