Reclaim deleted memory on vector insert.

Kishore Nallan 2022-10-11 16:25:13 +05:30
parent f75cbca920
commit 374cb6948b
4 changed files with 109 additions and 4 deletions

View File

@@ -1,6 +1,6 @@
 # Download hnsw (header-only)
-set(HNSW_VERSION cc69672b90ff7150016ceeba7d547d8a9608db41)
+set(HNSW_VERSION 7f971d46d6c14c187e6ad8cf237e8e121aae2257)
 set(HNSW_NAME hnswlib-${HNSW_VERSION})
 set(HNSW_TAR_PATH ${DEP_ROOT_DIR}/${HNSW_NAME}.tar.gz)

View File

@@ -257,7 +257,7 @@ struct hnsw_index_t {
     hnsw_index_t(size_t num_dim, size_t init_size, vector_distance_type_t distance_type):
             space(new hnswlib::InnerProductSpace(num_dim)),
-            vecdex(new hnswlib::HierarchicalNSW<float, VectorFilterFunctor>(space, init_size)),
+            vecdex(new hnswlib::HierarchicalNSW<float, VectorFilterFunctor>(space, init_size, 16, 200, 100, true)),
            num_dim(num_dim), distance_type(distance_type) {
     }
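For context on the new constructor arguments: assuming the pinned fork keeps upstream hnswlib's signature, the positional values map to max_elements=init_size, M=16, ef_construction=200, random_seed=100, and allow_replace_deleted=true. The final flag is what lets an insert reuse a slot freed by markDelete() instead of growing the graph. A minimal sketch against upstream hnswlib (without the Typesense-specific VectorFilterFunctor) illustrating the flag:

#include "hnswlib/hnswlib.h"

int main() {
    hnswlib::InnerProductSpace space(4);   // 4-dimensional vectors
    hnswlib::HierarchicalNSW<float> index(&space,
                                          1024,   // max_elements: initial capacity
                                          16,     // M: graph links per node
                                          200,    // ef_construction: build-time beam width
                                          100,    // random seed for level assignment
                                          true);  // allow_replace_deleted
    float v[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    index.addPoint(v, /*label=*/0, /*replace_deleted=*/true);
    index.markDelete(0);
    // With replace_deleted=true, this insert reclaims the deleted slot
    // rather than consuming a fresh one; the fork's insertPoint() used in
    // the changes below presumably wraps the same behavior.
    index.addPoint(v, /*label=*/1, /*replace_deleted=*/true);
}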
@@ -594,6 +594,8 @@
     const spp::sparse_hash_map<std::string, array_mapped_infix_t>& _get_infix_index() const;
+    const spp::sparse_hash_map<std::string, hnsw_index_t*>& _get_vector_index() const;
     static int get_bounded_typo_cost(const size_t max_cost, const size_t token_len,
                                      size_t min_len_1typo, size_t min_len_2typo);

View File

@@ -957,9 +957,9 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
             if(afield.vec_dist == cosine) {
                 std::vector<float> normalized_vals(afield.num_dim);
                 hnsw_index_t::normalize_vector(float_vals, normalized_vals);
-                vector_index[afield.name]->vecdex->addPoint(normalized_vals.data(), (size_t)seq_id);
+                vector_index[afield.name]->vecdex->insertPoint(normalized_vals.data(), (size_t)seq_id);
             } else {
-                vector_index[afield.name]->vecdex->addPoint(float_vals.data(), (size_t)seq_id);
+                vector_index[afield.name]->vecdex->insertPoint(float_vals.data(), (size_t)seq_id);
             }
         }
});
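The cosine branch above depends on normalize_vector scaling each vector to unit length, so that inner-product distance becomes equivalent to cosine distance. A hypothetical sketch of that normalization (the real hnsw_index_t::normalize_vector may differ in details):

#include <cmath>
#include <vector>

// Scale src to unit L2 norm; for unit vectors, inner product == cosine similarity.
void normalize_vector(const std::vector<float>& src, std::vector<float>& out) {
    float sum_sq = 0.0f;
    for (float v : src) sum_sq += v * v;
    const float inv_norm = sum_sq > 0.0f ? 1.0f / std::sqrt(sum_sq) : 0.0f;
    for (size_t i = 0; i < src.size(); i++) out[i] = src[i] * inv_norm;
}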
@@ -5292,6 +5292,10 @@ const spp::sparse_hash_map<std::string, array_mapped_infix_t>& Index::_get_infix
     return infix_index;
 };
+
+const spp::sparse_hash_map<std::string, hnsw_index_t*>& Index::_get_vector_index() const {
+    return vector_index;
+}
 void Index::refresh_schemas(const std::vector<field>& new_fields, const std::vector<field>& del_fields) {
     std::unique_lock lock(mutex);

View File

@@ -278,3 +278,102 @@ TEST_F(CollectionVectorTest, VecSearchWithFiltering) {
ASSERT_EQ(1, results["found"].get<size_t>());
ASSERT_EQ(1, results["hits"].size());
}
TEST_F(CollectionVectorTest, VectorSearchTestDeletion) {
nlohmann::json schema = R"({
"name": "coll1",
"fields": [
{"name": "title", "type": "string"},
{"name": "points", "type": "int32"},
{"name": "vec", "type": "float[]", "num_dim": 4}
]
})"_json;
Collection* coll1 = collectionManager.create_collection(schema).get();
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib;
size_t num_docs = 20;
for (size_t i = 0; i < num_docs; i++) {
nlohmann::json doc;
doc["id"] = std::to_string(i);
doc["title"] = std::to_string(i) + " title";
doc["points"] = i;
std::vector<float> values;
for(size_t j = 0; j < 4; j++) {
values.push_back(distrib(rng));
}
doc["vec"] = values;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
ASSERT_EQ(1024, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getMaxElements());
ASSERT_EQ(20, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getCurrentElementCount());
ASSERT_EQ(0, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getDeletedCount());
// now delete these docs
for (size_t i = 0; i < num_docs; i++) {
ASSERT_TRUE(coll1->remove(std::to_string(i)).ok());
}
ASSERT_EQ(1024, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getMaxElements());
ASSERT_EQ(20, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getCurrentElementCount());
ASSERT_EQ(20, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getDeletedCount());
for (size_t i = 0; i < num_docs; i++) {
nlohmann::json doc;
doc["id"] = std::to_string(i + num_docs);
doc["title"] = std::to_string(i + num_docs) + " title";
doc["points"] = i;
std::vector<float> values;
for(size_t j = 0; j < 4; j++) {
values.push_back(distrib(rng));
}
doc["vec"] = values;
ASSERT_TRUE(coll1->add(doc.dump()).ok());
}
ASSERT_EQ(1024, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getMaxElements());
ASSERT_EQ(20, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getCurrentElementCount());
ASSERT_EQ(0, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getDeletedCount());
// delete those docs again and ensure that while reindexing till 1024 live docs, max count is not changed
for (size_t i = 0; i < num_docs; i++) {
ASSERT_TRUE(coll1->remove(std::to_string(i + num_docs)).ok());
}
ASSERT_EQ(1024, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getMaxElements());
ASSERT_EQ(20, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getCurrentElementCount());
ASSERT_EQ(20, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getDeletedCount());
for (size_t i = 0; i < 1014; i++) {
nlohmann::json doc;
doc["id"] = std::to_string(10000 + i);
doc["title"] = std::to_string(10000 + i) + " title";
doc["points"] = i;
std::vector<float> values;
for(size_t j = 0; j < 4; j++) {
values.push_back(distrib(rng));
}
doc["vec"] = values;
const Option<nlohmann::json>& add_op = coll1->add(doc.dump());
if(!add_op.ok()) {
LOG(ERROR) << add_op.error();
}
ASSERT_TRUE(add_op.ok());
}
ASSERT_EQ(1024, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getMaxElements());
ASSERT_EQ(1014, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getCurrentElementCount());
ASSERT_EQ(0, coll1->_get_index()->_get_vector_index().at("vec")->vecdex->getDeletedCount());
}
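A note on the final assertions, as a sanity check of the reclaim arithmetic: the 20 slots freed by the second round of deletions are reused by 20 of the 1014 inserts, and the remaining 994 inserts occupy fresh slots, giving 20 + 994 = 1014 live elements. Since 1014 <= 1024, the index never has to grow, so getMaxElements() stays at 1024 and getDeletedCount() returns to 0.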