Allow a text match bucket count of 1.

This commit is contained in:
Kishore Nallan 2023-05-25 12:10:45 +05:30
parent 0419a40e6f
commit 38c5c0b035
2 changed files with 42 additions and 37 deletions

View File

@ -1549,12 +1549,12 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
return Option<nlohmann::json>(408, "Request Timeout");
}
if(match_score_index >= 0 && sort_fields_std[match_score_index].text_match_buckets > 1) {
if(match_score_index >= 0 && sort_fields_std[match_score_index].text_match_buckets > 0) {
size_t num_buckets = sort_fields_std[match_score_index].text_match_buckets;
const size_t max_kvs_bucketed = std::min<size_t>(DEFAULT_TOPSTER_SIZE, raw_result_kvs.size());
if(max_kvs_bucketed >= num_buckets) {
std::vector<int64_t> result_scores(max_kvs_bucketed);
spp::sparse_hash_map<uint64_t, int64_t> result_scores;
// only first `max_kvs_bucketed` elements are bucketed to prevent pagination issues past 250 records
size_t block_len = (max_kvs_bucketed / num_buckets);
@ -1563,7 +1563,7 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
int64_t anchor_score = raw_result_kvs[i][0]->scores[raw_result_kvs[i][0]->match_score_index];
size_t j = 0;
while(j < block_len && i+j < max_kvs_bucketed) {
result_scores[i+j] = raw_result_kvs[i+j][0]->scores[raw_result_kvs[i+j][0]->match_score_index];
result_scores[raw_result_kvs[i+j][0]->key] = raw_result_kvs[i+j][0]->scores[raw_result_kvs[i+j][0]->match_score_index];
raw_result_kvs[i+j][0]->scores[raw_result_kvs[i+j][0]->match_score_index] = anchor_score;
j++;
}
@ -1577,7 +1577,8 @@ Option<nlohmann::json> Collection::search(std::string raw_query,
// restore original scores
for(i = 0; i < max_kvs_bucketed; i++) {
raw_result_kvs[i][0]->scores[raw_result_kvs[i][0]->match_score_index] = result_scores[i];
raw_result_kvs[i][0]->scores[raw_result_kvs[i][0]->match_score_index] =
result_scores[raw_result_kvs[i][0]->key];
}
}
}

View File

@ -1636,7 +1636,7 @@ TEST_F(CollectionSortingTest, TextMatchBucketRanking) {
nlohmann::json doc1;
doc1["id"] = "0";
doc1["title"] = "Mark Antony";
doc1["description"] = "Marriage Counsellor";
doc1["description"] = "Counsellor";
doc1["points"] = 100;
nlohmann::json doc2;
@ -1653,47 +1653,51 @@ TEST_F(CollectionSortingTest, TextMatchBucketRanking) {
sort_by("points", "DESC"),
};
auto results = coll1->search("mark", {"title", "description"},
"", {}, sort_fields, {2, 2}, 10,
1, FREQUENCY, {true, true},
auto results = coll1->search("mark", {"title"},
"", {}, sort_fields, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3, 1}, 1000, true).get();
"<mark>", "</mark>", {3}, 1000, true).get();
// when there are more buckets than results, no bucketing will happen
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());
// bucketing by 1 produces original text match
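// bucketing by 1 places every hit in a single bucket, so all hits tie on text match while sorting
// and the order falls through to the next sort field (`points`); the reported scores are still the originals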
sort_fields = {
sort_by("_text_match(buckets: 1)", "DESC"),
sort_by("points", "DESC"),
};
results = coll1->search("mark", {"title", "description"},
"", {}, sort_fields, {2, 2}, 10,
1, FREQUENCY, {true, true},
results = coll1->search("mark", {"title"},
"", {}, sort_fields, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3, 1}, 1000, true).get();
"<mark>", "</mark>", {3}, 1000, true).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][1]["document"]["id"].get<std::string>());
ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
// likewise with bucket 0
size_t score1 = std::stoul(results["hits"][0]["text_match_info"]["score"].get<std::string>());
size_t score2 = std::stoul(results["hits"][1]["text_match_info"]["score"].get<std::string>());
ASSERT_TRUE(score1 < score2);
// bucketing by 0 produces original text match
sort_fields = {
sort_by("_text_match(buckets: 0)", "DESC"),
sort_by("points", "DESC"),
};
results = coll1->search("mark", {"title", "description"},
"", {}, sort_fields, {2, 2}, 10,
1, FREQUENCY, {true, true},
results = coll1->search("mark", {"title"},
"", {}, sort_fields, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3, 1}, 1000, true).get();
"<mark>", "</mark>", {3}, 1000, true).get();
ASSERT_EQ(2, results["hits"].size());
ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
@ -1702,46 +1706,46 @@ TEST_F(CollectionSortingTest, TextMatchBucketRanking) {
// don't allow bad parameter name
sort_fields[0] = sort_by("_text_match(foobar: 0)", "DESC");
auto res_op = coll1->search("mark", {"title", "description"},
"", {}, sort_fields, {2, 2}, 10,
1, FREQUENCY, {true, true},
auto res_op = coll1->search("mark", {"title"},
"", {}, sort_fields, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3, 1}, 1000, true);
"<mark>", "</mark>", {3}, 1000, true);
ASSERT_FALSE(res_op.ok());
ASSERT_EQ("Invalid sorting parameter passed for _text_match.", res_op.error());
// handle bad syntax
sort_fields[0] = sort_by("_text_match(foobar:", "DESC");
res_op = coll1->search("mark", {"title", "description"},
"", {}, sort_fields, {2, 2}, 10,
1, FREQUENCY, {true, true},
res_op = coll1->search("mark", {"title"},
"", {}, sort_fields, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3, 1}, 1000, true);
"<mark>", "</mark>", {3}, 1000, true);
ASSERT_FALSE(res_op.ok());
ASSERT_EQ("Could not find a field named `_text_match(foobar:` in the schema for sorting.", res_op.error());
// handle bad value
sort_fields[0] = sort_by("_text_match(buckets: x)", "DESC");
res_op = coll1->search("mark", {"title", "description"},
"", {}, sort_fields, {2, 2}, 10,
1, FREQUENCY, {true, true},
res_op = coll1->search("mark", {"title"},
"", {}, sort_fields, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3, 1}, 1000, true);
"<mark>", "</mark>", {3}, 1000, true);
ASSERT_FALSE(res_op.ok());
ASSERT_EQ("Invalid value passed for _text_match `buckets` configuration.", res_op.error());
// handle negative value
sort_fields[0] = sort_by("_text_match(buckets: -1)", "DESC");
res_op = coll1->search("mark", {"title", "description"},
"", {}, sort_fields, {2, 2}, 10,
1, FREQUENCY, {true, true},
res_op = coll1->search("mark", {"title"},
"", {}, sort_fields, {2}, 10,
1, FREQUENCY, {true},
10, spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
"<mark>", "</mark>", {3, 1}, 1000, true);
"<mark>", "</mark>", {3}, 1000, true);
ASSERT_FALSE(res_op.ok());
ASSERT_EQ("Invalid value passed for _text_match `buckets` configuration.", res_op.error());