Vector flat search: handle missing values.

This commit is contained in:
Kishore Nallan 2022-10-21 09:28:26 +05:30
parent 790fac008b
commit ae49dab8fd
3 changed files with 112 additions and 3 deletions

View File

@ -604,7 +604,7 @@ struct sort_by {
struct vector_query_t {
std::string field_name;
size_t k = 0;
size_t flat_search_cutoff = 1000;
size_t flat_search_cutoff = 0;
std::vector<float> values;
void _reset() {

View File

@ -2515,9 +2515,16 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
if(!no_filters_provided && filter_ids_length < vector_query.flat_search_cutoff) {
for(size_t i = 0; i < filter_ids_length; i++) {
auto seq_id = filter_ids[i];
const std::vector<float>& values = field_vector_index->vecdex->getDataByLabel<float>(seq_id);
float dist;
std::vector<float> values;
try {
values = field_vector_index->vecdex->getDataByLabel<float>(seq_id);
} catch(...) {
// likely not found
continue;
}
float dist;
if(field_vector_index->distance_type == cosine) {
std::vector<float> normalized_q(vector_query.values.size());
hnsw_index_t::normalize_vector(vector_query.values, normalized_q);
@ -2565,6 +2572,7 @@ void Index::search(std::vector<query_tokens_t>& field_query_tokens, const std::v
}
if(!nearest_ids.empty()) {
std::sort(nearest_ids.begin(), nearest_ids.end()); // seq_ids should be in ascending order
all_result_ids = new uint32[nearest_ids.size()];
std::copy(nearest_ids.begin(), nearest_ids.end(), all_result_ids);
all_result_ids_len = nearest_ids.size();

View File

@ -319,6 +319,107 @@ TEST_F(CollectionVectorTest, VecSearchWithFiltering) {
ASSERT_EQ(1, results["hits"].size());
}
// Regression test: vector search must tolerate documents that have no value
// for an optional vector field. Such documents cannot be scored and must be
// silently excluded from vector query results — in both the flat-search and
// the HNSW code paths — instead of crashing or corrupting the result set.
TEST_F(CollectionVectorTest, VecSearchWithFilteringWithMissingVectorValues) {
// "vec" is declared optional, so documents may legitimately omit it.
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "points", "type": "int32"},
{"name": "vec", "type": "float[]", "num_dim": 4, "optional": true}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
// Fixed seed: generated vectors (and hence distances) are deterministic.
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib;
size_t num_docs = 20;
for (size_t i = 0; i < num_docs; i++) {
nlohmann::json doc;
doc["id"] = std::to_string(i);
doc["title"] = std::to_string(i) + " title";
doc["points"] = i;
std::vector<float> values;
for(size_t j = 0; j < 4; j++) {
values.push_back(distrib(rng));
}
// Documents 5 and 15 deliberately get no "vec" value.
if(i != 5 && i != 15) {
doc["vec"] = values;
}
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
// Wildcard vector query, no filter: only the 18 docs that actually carry a
// vector can match; docs 5 and 15 must simply be absent from the results.
auto results = coll1->search("*", {}, "", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488])").get();
ASSERT_EQ(18, results["found"].get<size_t>());
ASSERT_EQ(18, results["hits"].size());
// with points:<10, non-flat-search: flat_search_cutoff of 0 means the
// filtered-id count is never below the cutoff, forcing the HNSW path.
// Docs 0-9 match the filter, minus doc 5 (no vector) = 9 results.
results = coll1->search("*", {}, "points:<10", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 0)").get();
ASSERT_EQ(9, results["found"].get<size_t>());
ASSERT_EQ(9, results["hits"].size());
// with points:<10, flat-search: 9 filtered ids < cutoff of 1000, so the
// brute-force scan runs; it must skip doc 5 (getDataByLabel has no entry)
// and return the same 9 results as the HNSW path above.
results = coll1->search("*", {}, "points:<10", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 1000)").get();
ASSERT_EQ(9, results["found"].get<size_t>());
ASSERT_EQ(9, results["hits"].size());
// single point: doc 1 has a vector, so both paths must return exactly it.
// First the HNSW path (cutoff 0) ...
results = coll1->search("*", {}, "points:1", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 0)").get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
// ... then the flat path (cutoff 1000).
results = coll1->search("*", {}, "points:1", {}, {}, {0}, 20, 1, FREQUENCY, {true}, Index::DROP_TOKENS_THRESHOLD,
spp::sparse_hash_set<std::string>(),
spp::sparse_hash_set<std::string>(), 10, "", 30, 5,
"", 10, {}, {}, {}, 0,
"<mark>", "</mark>", {}, 1000, true, false, true, "", false, 6000 * 1000, 4, 7,
fallback,
4, {off}, 32767, 32767, 2,
false, true, "vec:([0.96826, 0.94, 0.39557, 0.306488], flat_search_cutoff: 1000)").get();
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
}
TEST_F(CollectionVectorTest, VectorSearchTestDeletion) {
nlohmann::json schema = R"({
"name": "coll1",