mirror of
https://github.com/typesense/typesense.git
synced 2025-05-16 03:12:32 +08:00
Reclaim deleted memory on vector insert.
This commit is contained in:
parent
f75cbca920
commit
374cb6948b
@ -1,6 +1,6 @@
|
||||
# Download hnsw (header-only)
|
||||
|
||||
set(HNSW_VERSION cc69672b90ff7150016ceeba7d547d8a9608db41)
|
||||
set(HNSW_VERSION 7f971d46d6c14c187e6ad8cf237e8e121aae2257)
|
||||
set(HNSW_NAME hnswlib-${HNSW_VERSION})
|
||||
set(HNSW_TAR_PATH ${DEP_ROOT_DIR}/${HNSW_NAME}.tar.gz)
|
||||
|
||||
|
@ -257,7 +257,7 @@ struct hnsw_index_t {
|
||||
|
||||
hnsw_index_t(size_t num_dim, size_t init_size, vector_distance_type_t distance_type):
|
||||
space(new hnswlib::InnerProductSpace(num_dim)),
|
||||
vecdex(new hnswlib::HierarchicalNSW<float, VectorFilterFunctor>(space, init_size)),
|
||||
vecdex(new hnswlib::HierarchicalNSW<float, VectorFilterFunctor>(space, init_size, 16, 200, 100, true)),
|
||||
num_dim(num_dim), distance_type(distance_type) {
|
||||
|
||||
}
|
||||
@ -594,6 +594,8 @@ public:
|
||||
|
||||
const spp::sparse_hash_map<std::string, array_mapped_infix_t>& _get_infix_index() const;
|
||||
|
||||
const spp::sparse_hash_map<std::string, hnsw_index_t*>& _get_vector_index() const;
|
||||
|
||||
static int get_bounded_typo_cost(const size_t max_cost, const size_t token_len,
|
||||
size_t min_len_1typo, size_t min_len_2typo);
|
||||
|
||||
|
@ -957,9 +957,9 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
|
||||
if(afield.vec_dist == cosine) {
|
||||
std::vector<float> normalized_vals(afield.num_dim);
|
||||
hnsw_index_t::normalize_vector(float_vals, normalized_vals);
|
||||
vector_index[afield.name]->vecdex->addPoint(normalized_vals.data(), (size_t)seq_id);
|
||||
vector_index[afield.name]->vecdex->insertPoint(normalized_vals.data(), (size_t)seq_id);
|
||||
} else {
|
||||
vector_index[afield.name]->vecdex->addPoint(float_vals.data(), (size_t)seq_id);
|
||||
vector_index[afield.name]->vecdex->insertPoint(float_vals.data(), (size_t)seq_id);
|
||||
}
|
||||
}
|
||||
});
|
||||
@ -5292,6 +5292,10 @@ const spp::sparse_hash_map<std::string, array_mapped_infix_t>& Index::_get_infix
|
||||
return infix_index;
|
||||
};
|
||||
|
||||
// Read-only accessor for the vector index map, which is keyed by field name
// and holds a pointer to that field's hnsw_index_t.
const spp::sparse_hash_map<std::string, hnsw_index_t*>& Index::_get_vector_index() const {
    return this->vector_index;
}
|
||||
|
||||
void Index::refresh_schemas(const std::vector<field>& new_fields, const std::vector<field>& del_fields) {
|
||||
std::unique_lock lock(mutex);
|
||||
|
||||
|
@ -278,3 +278,102 @@ TEST_F(CollectionVectorTest, VecSearchWithFiltering) {
|
||||
ASSERT_EQ(1, results["found"].get<size_t>());
|
||||
ASSERT_EQ(1, results["hits"].size());
|
||||
}
|
||||
|
||||
// Checks that removing documents marks their vectors as deleted and that later
// inserts bring the deleted count back to zero while the index's max element
// count stays at 1024.
TEST_F(CollectionVectorTest, VectorSearchTestDeletion) {
    nlohmann::json schema = R"({
        "name": "coll1",
        "fields": [
            {"name": "title", "type": "string"},
            {"name": "points", "type": "int32"},
            {"name": "vec", "type": "float[]", "num_dim": 4}
        ]
    })"_json;

    Collection* coll1 = collectionManager.create_collection(schema).get();

    std::mt19937 rng;
    rng.seed(47);
    std::uniform_real_distribution<> distrib;

    const size_t num_docs = 20;

    // Builds a document with the given numeric id and points value, plus a
    // 4-dimensional vector drawn from the shared random generator.
    auto make_doc = [&rng, &distrib](size_t doc_id, size_t points) {
        nlohmann::json doc;
        doc["id"] = std::to_string(doc_id);
        doc["title"] = std::to_string(doc_id) + " title";
        doc["points"] = points;

        std::vector<float> components(4);
        for(float& component : components) {
            component = distrib(rng);
        }

        doc["vec"] = components;
        return doc;
    };

    // Looks the index up afresh on every call, exactly like spelling out the
    // whole accessor chain at each assertion.
    auto vecdex = [coll1]() {
        return coll1->_get_index()->_get_vector_index().at("vec")->vecdex;
    };

    // phase 1: index the first batch of docs
    for(size_t i = 0; i < num_docs; i++) {
        ASSERT_TRUE(coll1->add(make_doc(i, i).dump()).ok());
    }

    ASSERT_EQ(1024, vecdex()->getMaxElements());
    ASSERT_EQ(20, vecdex()->getCurrentElementCount());
    ASSERT_EQ(0, vecdex()->getDeletedCount());

    // phase 2: remove every doc; the elements stay counted but are flagged as deleted
    for(size_t i = 0; i < num_docs; i++) {
        ASSERT_TRUE(coll1->remove(std::to_string(i)).ok());
    }

    ASSERT_EQ(1024, vecdex()->getMaxElements());
    ASSERT_EQ(20, vecdex()->getCurrentElementCount());
    ASSERT_EQ(20, vecdex()->getDeletedCount());

    // phase 3: index a second batch under new ids; element count must not grow
    for(size_t i = 0; i < num_docs; i++) {
        ASSERT_TRUE(coll1->add(make_doc(i + num_docs, i).dump()).ok());
    }

    ASSERT_EQ(1024, vecdex()->getMaxElements());
    ASSERT_EQ(20, vecdex()->getCurrentElementCount());
    ASSERT_EQ(0, vecdex()->getDeletedCount());

    // phase 4: remove the second batch as well
    for(size_t i = 0; i < num_docs; i++) {
        ASSERT_TRUE(coll1->remove(std::to_string(i + num_docs)).ok());
    }

    ASSERT_EQ(1024, vecdex()->getMaxElements());
    ASSERT_EQ(20, vecdex()->getCurrentElementCount());
    ASSERT_EQ(20, vecdex()->getDeletedCount());

    // phase 5: index 1014 more docs and ensure the max element count is still unchanged
    for(size_t i = 0; i < 1014; i++) {
        const Option<nlohmann::json>& add_op = coll1->add(make_doc(10000 + i, i).dump());
        if(!add_op.ok()) {
            LOG(ERROR) << add_op.error();
        }

        ASSERT_TRUE(add_op.ok());
    }

    ASSERT_EQ(1024, vecdex()->getMaxElements());
    ASSERT_EQ(1014, vecdex()->getCurrentElementCount());
    ASSERT_EQ(0, vecdex()->getDeletedCount());
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user