mirror of
https://github.com/apple/foundationdb.git
synced 2025-05-15 02:18:39 +08:00
Fix read iterator pool race (#7455)
This commit is contained in:
parent
d9b0ff418f
commit
61d621a4c4
@ -100,6 +100,7 @@ std::string getErrorReason(BackgroundErrorReason reason) {
|
|||||||
return format("%d Unknown", reason);
|
return format("%d Unknown", reason);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Background error handling is tested with Chaos test.
|
// Background error handling is tested with Chaos test.
|
||||||
// TODO: Test background error in simulation. RocksDB doesn't use flow IO in simulation, which limits our ability to
|
// TODO: Test background error in simulation. RocksDB doesn't use flow IO in simulation, which limits our ability to
|
||||||
// inject IO errors. We could implement rocksdb::FileSystem using flow IO to unblock simulation. Also, trace event is
|
// inject IO errors. We could implement rocksdb::FileSystem using flow IO to unblock simulation. Also, trace event is
|
||||||
@ -144,6 +145,11 @@ private:
|
|||||||
std::mutex mutex;
|
std::mutex mutex;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Encapsulation of shared states.
|
||||||
|
struct ShardedRocksDBState {
|
||||||
|
bool closing = false;
|
||||||
|
};
|
||||||
|
|
||||||
std::shared_ptr<rocksdb::Cache> rocksdb_block_cache = nullptr;
|
std::shared_ptr<rocksdb::Cache> rocksdb_block_cache = nullptr;
|
||||||
|
|
||||||
rocksdb::Slice toSlice(StringRef s) {
|
rocksdb::Slice toSlice(StringRef s) {
|
||||||
@ -328,7 +334,7 @@ public:
|
|||||||
ASSERT(cf);
|
ASSERT(cf);
|
||||||
readRangeOptions.background_purge_on_iterator_cleanup = true;
|
readRangeOptions.background_purge_on_iterator_cleanup = true;
|
||||||
readRangeOptions.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0);
|
readRangeOptions.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0);
|
||||||
TraceEvent("ReadIteratorPool")
|
TraceEvent(SevDebug, "ReadIteratorPool")
|
||||||
.detail("Path", path)
|
.detail("Path", path)
|
||||||
.detail("KnobRocksDBReadRangeReuseIterators", SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS)
|
.detail("KnobRocksDBReadRangeReuseIterators", SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS)
|
||||||
.detail("KnobRocksDBPrefixLen", SERVER_KNOBS->ROCKSDB_PREFIX_LEN);
|
.detail("KnobRocksDBPrefixLen", SERVER_KNOBS->ROCKSDB_PREFIX_LEN);
|
||||||
@ -407,25 +413,6 @@ private:
|
|||||||
uint64_t iteratorsReuseCount;
|
uint64_t iteratorsReuseCount;
|
||||||
};
|
};
|
||||||
|
|
||||||
ACTOR Future<Void> refreshReadIteratorPool(
|
|
||||||
std::unordered_map<std::string, std::shared_ptr<PhysicalShard>>* physicalShards) {
|
|
||||||
state Reference<Histogram> histogram = Histogram::getHistogram(
|
|
||||||
ROCKSDBSTORAGE_HISTOGRAM_GROUP, "TimeSpentRefreshIterators"_sr, Histogram::Unit::microseconds);
|
|
||||||
|
|
||||||
if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) {
|
|
||||||
loop {
|
|
||||||
wait(delay(SERVER_KNOBS->ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME));
|
|
||||||
|
|
||||||
double startTime = timer_monotonic();
|
|
||||||
for (auto& [_, shard] : *physicalShards) {
|
|
||||||
shard->readIterPool->refreshIterators();
|
|
||||||
}
|
|
||||||
histogram->sample(timer_monotonic() - startTime);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return Void();
|
|
||||||
}
|
|
||||||
|
|
||||||
ACTOR Future<Void> flowLockLogger(const FlowLock* readLock, const FlowLock* fetchLock) {
|
ACTOR Future<Void> flowLockLogger(const FlowLock* readLock, const FlowLock* fetchLock) {
|
||||||
loop {
|
loop {
|
||||||
wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY));
|
wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY));
|
||||||
@ -450,7 +437,7 @@ struct DataShard {
|
|||||||
// PhysicalShard represent a collection of logical shards. A PhysicalShard could have one or more DataShards. A
|
// PhysicalShard represent a collection of logical shards. A PhysicalShard could have one or more DataShards. A
|
||||||
// PhysicalShard is stored as a column family in rocksdb. Each PhysicalShard has its own iterator pool.
|
// PhysicalShard is stored as a column family in rocksdb. Each PhysicalShard has its own iterator pool.
|
||||||
struct PhysicalShard {
|
struct PhysicalShard {
|
||||||
PhysicalShard(rocksdb::DB* db, std::string id) : db(db), id(id) {}
|
PhysicalShard(rocksdb::DB* db, std::string id) : db(db), id(id), isInitialized(false) {}
|
||||||
|
|
||||||
rocksdb::Status init() {
|
rocksdb::Status init() {
|
||||||
if (cf) {
|
if (cf) {
|
||||||
@ -462,10 +449,16 @@ struct PhysicalShard {
|
|||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
readIterPool = std::make_shared<ReadIteratorPool>(db, cf, id);
|
readIterPool = std::make_shared<ReadIteratorPool>(db, cf, id);
|
||||||
|
this->isInitialized.store(true);
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool initialized() { return cf != nullptr; }
|
bool initialized() { return this->isInitialized.load(); }
|
||||||
|
|
||||||
|
void refreshReadIteratorPool() {
|
||||||
|
ASSERT(this->readIterPool != nullptr);
|
||||||
|
this->readIterPool->refreshIterators();
|
||||||
|
}
|
||||||
|
|
||||||
~PhysicalShard() {
|
~PhysicalShard() {
|
||||||
if (!deletePending)
|
if (!deletePending)
|
||||||
@ -487,6 +480,7 @@ struct PhysicalShard {
|
|||||||
std::unordered_map<std::string, std::unique_ptr<DataShard>> dataShards;
|
std::unordered_map<std::string, std::unique_ptr<DataShard>> dataShards;
|
||||||
std::shared_ptr<ReadIteratorPool> readIterPool;
|
std::shared_ptr<ReadIteratorPool> readIterPool;
|
||||||
bool deletePending = false;
|
bool deletePending = false;
|
||||||
|
std::atomic<bool> isInitialized;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Manages physical shards and maintains logical shard mapping.
|
// Manages physical shards and maintains logical shard mapping.
|
||||||
@ -1310,6 +1304,39 @@ ACTOR Future<Void> rocksDBAggregatedMetricsLogger(std::shared_ptr<RocksDBMetrics
|
|||||||
struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
||||||
using CF = rocksdb::ColumnFamilyHandle*;
|
using CF = rocksdb::ColumnFamilyHandle*;
|
||||||
|
|
||||||
|
ACTOR static Future<Void> refreshReadIteratorPools(
|
||||||
|
std::shared_ptr<ShardedRocksDBState> rState,
|
||||||
|
Future<Void> readyToStart,
|
||||||
|
std::unordered_map<std::string, std::shared_ptr<PhysicalShard>>* physicalShards) {
|
||||||
|
state Reference<Histogram> histogram = Histogram::getHistogram(
|
||||||
|
ROCKSDBSTORAGE_HISTOGRAM_GROUP, "TimeSpentRefreshIterators"_sr, Histogram::Unit::microseconds);
|
||||||
|
|
||||||
|
if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) {
|
||||||
|
try {
|
||||||
|
wait(readyToStart);
|
||||||
|
loop {
|
||||||
|
wait(delay(SERVER_KNOBS->ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME));
|
||||||
|
if (rState->closing) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
double startTime = timer_monotonic();
|
||||||
|
for (auto& [_, shard] : *physicalShards) {
|
||||||
|
if (shard->initialized()) {
|
||||||
|
shard->refreshReadIteratorPool();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
histogram->sample(timer_monotonic() - startTime);
|
||||||
|
}
|
||||||
|
} catch (Error& e) {
|
||||||
|
if (e.code() != error_code_actor_cancelled) {
|
||||||
|
TraceEvent(SevError, "RefreshReadIteratorPoolError").errorUnsuppressed(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Void();
|
||||||
|
}
|
||||||
|
|
||||||
struct Writer : IThreadPoolReceiver {
|
struct Writer : IThreadPoolReceiver {
|
||||||
int threadIndex;
|
int threadIndex;
|
||||||
std::unordered_map<uint32_t, rocksdb::ColumnFamilyHandle*>* columnFamilyMap;
|
std::unordered_map<uint32_t, rocksdb::ColumnFamilyHandle*>* columnFamilyMap;
|
||||||
@ -1360,15 +1387,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (g_network->isSimulated()) {
|
|
||||||
a.metrics = refreshReadIteratorPool(a.shardManager->getAllShards());
|
|
||||||
} else {
|
|
||||||
onMainThread([&] {
|
|
||||||
a.metrics = refreshReadIteratorPool(a.shardManager->getAllShards());
|
|
||||||
return Future<bool>(true);
|
|
||||||
}).blockUntilReady();
|
|
||||||
}
|
|
||||||
|
|
||||||
TraceEvent(SevInfo, "RocksDB").detail("Method", "Open");
|
TraceEvent(SevInfo, "RocksDB").detail("Method", "Open");
|
||||||
a.done.send(Void());
|
a.done.send(Void());
|
||||||
}
|
}
|
||||||
@ -1865,7 +1883,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||||||
|
|
||||||
// Persist shard mappinng key range should not be in shardMap.
|
// Persist shard mappinng key range should not be in shardMap.
|
||||||
explicit ShardedRocksDBKeyValueStore(const std::string& path, UID id)
|
explicit ShardedRocksDBKeyValueStore(const std::string& path, UID id)
|
||||||
: path(path), id(id), readSemaphore(SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
|
: rState(std::make_shared<ShardedRocksDBState>()), path(path), id(id),
|
||||||
|
readSemaphore(SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
|
||||||
fetchSemaphore(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
|
fetchSemaphore(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
|
||||||
numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
|
numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
|
||||||
numFetchWaiters(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
|
numFetchWaiters(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
|
||||||
@ -1898,8 +1917,10 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||||||
Future<Void> getError() const override { return errorFuture; }
|
Future<Void> getError() const override { return errorFuture; }
|
||||||
|
|
||||||
ACTOR static void doClose(ShardedRocksDBKeyValueStore* self, bool deleteOnClose) {
|
ACTOR static void doClose(ShardedRocksDBKeyValueStore* self, bool deleteOnClose) {
|
||||||
|
self->rState->closing = true;
|
||||||
// The metrics future retains a reference to the DB, so stop it before we delete it.
|
// The metrics future retains a reference to the DB, so stop it before we delete it.
|
||||||
self->metrics.reset();
|
self->metrics.reset();
|
||||||
|
self->refreshHolder.cancel();
|
||||||
|
|
||||||
wait(self->readThreads->stop());
|
wait(self->readThreads->stop());
|
||||||
auto a = new Writer::CloseAction(&self->shardManager, deleteOnClose);
|
auto a = new Writer::CloseAction(&self->shardManager, deleteOnClose);
|
||||||
@ -1930,6 +1951,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||||||
auto a = std::make_unique<Writer::OpenAction>(
|
auto a = std::make_unique<Writer::OpenAction>(
|
||||||
&shardManager, metrics, &readSemaphore, &fetchSemaphore, errorListener);
|
&shardManager, metrics, &readSemaphore, &fetchSemaphore, errorListener);
|
||||||
openFuture = a->done.getFuture();
|
openFuture = a->done.getFuture();
|
||||||
|
this->refreshHolder = refreshReadIteratorPools(this->rState, openFuture, shardManager.getAllShards());
|
||||||
writeThread->post(a.release());
|
writeThread->post(a.release());
|
||||||
return openFuture;
|
return openFuture;
|
||||||
}
|
}
|
||||||
@ -2092,6 +2114,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||||||
// Used for debugging shard mapping issue.
|
// Used for debugging shard mapping issue.
|
||||||
std::vector<std::pair<KeyRange, std::string>> getDataMapping() { return shardManager.getDataMapping(); }
|
std::vector<std::pair<KeyRange, std::string>> getDataMapping() { return shardManager.getDataMapping(); }
|
||||||
|
|
||||||
|
std::shared_ptr<ShardedRocksDBState> rState;
|
||||||
|
ShardManager shardManager;
|
||||||
std::shared_ptr<RocksDBMetrics> rocksDBMetrics;
|
std::shared_ptr<RocksDBMetrics> rocksDBMetrics;
|
||||||
std::string path;
|
std::string path;
|
||||||
const std::string dataPath;
|
const std::string dataPath;
|
||||||
@ -2101,7 +2125,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||||||
std::shared_ptr<RocksDBErrorListener> errorListener;
|
std::shared_ptr<RocksDBErrorListener> errorListener;
|
||||||
Future<Void> errorFuture;
|
Future<Void> errorFuture;
|
||||||
Promise<Void> closePromise;
|
Promise<Void> closePromise;
|
||||||
ShardManager shardManager;
|
|
||||||
Future<Void> openFuture;
|
Future<Void> openFuture;
|
||||||
Optional<Future<Void>> metrics;
|
Optional<Future<Void>> metrics;
|
||||||
FlowLock readSemaphore;
|
FlowLock readSemaphore;
|
||||||
@ -2109,6 +2132,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
|
|||||||
FlowLock fetchSemaphore;
|
FlowLock fetchSemaphore;
|
||||||
int numFetchWaiters;
|
int numFetchWaiters;
|
||||||
Counters counters;
|
Counters counters;
|
||||||
|
Future<Void> refreshHolder;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
@ -190,7 +190,7 @@ if(WITH_PYTHON)
|
|||||||
add_fdb_test(TEST_FILES noSim/RandomUnitTests.toml UNIT)
|
add_fdb_test(TEST_FILES noSim/RandomUnitTests.toml UNIT)
|
||||||
if (WITH_ROCKSDB_EXPERIMENTAL)
|
if (WITH_ROCKSDB_EXPERIMENTAL)
|
||||||
add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml)
|
add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml)
|
||||||
add_fdb_test(TEST_FILES noSim/ShardedRocksDBTest.toml IGNORE) # TODO: re-enable once storage engine is stable.
|
add_fdb_test(TEST_FILES noSim/ShardedRocksDBTest.toml UNIT)
|
||||||
add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml)
|
add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml)
|
||||||
else()
|
else()
|
||||||
add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml IGNORE)
|
add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml IGNORE)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user