Fix read iterator pool race (#7455)

Author: He Liu, 2022-06-27 14:51:26 -07:00 (committed by GitHub)
parent d9b0ff418f
commit 61d621a4c4
2 changed files with 58 additions and 34 deletions


@@ -100,6 +100,7 @@ std::string getErrorReason(BackgroundErrorReason reason) {
 		return format("%d Unknown", reason);
 	}
 }
 // Background error handling is tested with Chaos test.
 // TODO: Test background error in simulation. RocksDB doesn't use flow IO in simulation, which limits our ability to
 // inject IO errors. We could implement rocksdb::FileSystem using flow IO to unblock simulation. Also, trace event is
@@ -144,6 +145,11 @@ private:
 	std::mutex mutex;
 };
+// Encapsulation of shared states.
+struct ShardedRocksDBState {
+	bool closing = false;
+};
 std::shared_ptr<rocksdb::Cache> rocksdb_block_cache = nullptr;
 rocksdb::Slice toSlice(StringRef s) {
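Note on the hunk above: the new `ShardedRocksDBState` lets the store and its background actors share shutdown state through a `std::shared_ptr` instead of a raw pointer back into the store. A minimal sketch of the pattern, with illustrative names (`SharedState`, `Owner`, `backgroundTaskTick` are not from the diff); a plain `bool` suffices here because FDB actors all run on the main network thread:

```cpp
#include <memory>

struct SharedState {
	bool closing = false; // flipped once by the owner on shutdown
};

struct Owner {
	std::shared_ptr<SharedState> state = std::make_shared<SharedState>();
	void close() { state->closing = true; }
};

// The task holds its own shared_ptr, so the flag stays valid even if the
// Owner is destroyed while the task is still scheduled.
void backgroundTaskTick(std::shared_ptr<SharedState> state) {
	if (state->closing)
		return; // observe shutdown and stop doing work
	// ... periodic work ...
}
```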
@@ -328,7 +334,7 @@ public:
 		ASSERT(cf);
 		readRangeOptions.background_purge_on_iterator_cleanup = true;
 		readRangeOptions.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0);
-		TraceEvent("ReadIteratorPool")
+		TraceEvent(SevDebug, "ReadIteratorPool")
 		    .detail("Path", path)
 		    .detail("KnobRocksDBReadRangeReuseIterators", SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS)
 		    .detail("KnobRocksDBPrefixLen", SERVER_KNOBS->ROCKSDB_PREFIX_LEN);
@@ -407,25 +413,6 @@ private:
 	uint64_t iteratorsReuseCount;
 };
-ACTOR Future<Void> refreshReadIteratorPool(
-    std::unordered_map<std::string, std::shared_ptr<PhysicalShard>>* physicalShards) {
-	state Reference<Histogram> histogram = Histogram::getHistogram(
-	    ROCKSDBSTORAGE_HISTOGRAM_GROUP, "TimeSpentRefreshIterators"_sr, Histogram::Unit::microseconds);
-	if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) {
-		loop {
-			wait(delay(SERVER_KNOBS->ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME));
-			double startTime = timer_monotonic();
-			for (auto& [_, shard] : *physicalShards) {
-				shard->readIterPool->refreshIterators();
-			}
-			histogram->sample(timer_monotonic() - startTime);
-		}
-	}
-	return Void();
-}
 ACTOR Future<Void> flowLockLogger(const FlowLock* readLock, const FlowLock* fetchLock) {
 	loop {
 		wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY));
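The actor removed above had the two defects this commit addresses: it never observed shutdown, and it refreshed every shard's pool even though a shard can appear in the map before `init()` has assigned `readIterPool`. A distilled, illustrative sketch of that hazard (`Pool` and `Shard` are stand-ins, not FDB types):

```cpp
#include <memory>
#include <string>
#include <unordered_map>

struct Pool {
	void refresh() {}
};

struct Shard {
	std::shared_ptr<Pool> pool; // null until init() runs
};

void refreshAll(std::unordered_map<std::string, std::shared_ptr<Shard>>& shards) {
	for (auto& [_, shard] : shards) {
		// If a shard was inserted but not yet initialized, pool is a null
		// shared_ptr and this dereference is undefined behavior.
		shard->pool->refresh();
	}
}
```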
@@ -450,7 +437,7 @@ struct DataShard {
 // PhysicalShard represents a collection of logical shards. A PhysicalShard could have one or more DataShards. A
 // PhysicalShard is stored as a column family in rocksdb. Each PhysicalShard has its own iterator pool.
 struct PhysicalShard {
-	PhysicalShard(rocksdb::DB* db, std::string id) : db(db), id(id) {}
+	PhysicalShard(rocksdb::DB* db, std::string id) : db(db), id(id), isInitialized(false) {}
 	rocksdb::Status init() {
 		if (cf) {
@@ -462,10 +449,16 @@ struct PhysicalShard {
 			return status;
 		}
 		readIterPool = std::make_shared<ReadIteratorPool>(db, cf, id);
+		this->isInitialized.store(true);
 		return status;
 	}
-	bool initialized() { return cf != nullptr; }
+	bool initialized() { return this->isInitialized.load(); }
+
+	void refreshReadIteratorPool() {
+		ASSERT(this->readIterPool != nullptr);
+		this->readIterPool->refreshIterators();
+	}
 	~PhysicalShard() {
 		if (!deletePending)
@@ -487,6 +480,7 @@ struct PhysicalShard {
 	std::unordered_map<std::string, std::unique_ptr<DataShard>> dataShards;
 	std::shared_ptr<ReadIteratorPool> readIterPool;
 	bool deletePending = false;
+	std::atomic<bool> isInitialized;
 };
 // Manages physical shards and maintains logical shard mapping.
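Two details in the hunks above are worth noting. First, the constructor initializes `isInitialized(false)` explicitly because a default-constructed `std::atomic<bool>` holds an indeterminate value before C++20. Second, the flag is stored only after `readIterPool` is fully assigned, which is the standard publish pattern: a reader that observes `initialized() == true` is guaranteed the pool exists. A minimal self-contained sketch with illustrative names:

```cpp
#include <atomic>
#include <memory>

struct Pool {};

struct Shard {
	std::shared_ptr<Pool> pool;
	std::atomic<bool> ready{ false }; // explicit init; default ctor is indeterminate pre-C++20

	void init() {
		pool = std::make_shared<Pool>();
		// Publish only after construction completes; the default
		// sequentially consistent store orders it after the assignment.
		ready.store(true);
	}

	bool initialized() const { return ready.load(); }
};
```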
@@ -1310,6 +1304,39 @@ ACTOR Future<Void> rocksDBAggregatedMetricsLogger(std::shared_ptr<RocksDBMetrics
 struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 	using CF = rocksdb::ColumnFamilyHandle*;
+	ACTOR static Future<Void> refreshReadIteratorPools(
+	    std::shared_ptr<ShardedRocksDBState> rState,
+	    Future<Void> readyToStart,
+	    std::unordered_map<std::string, std::shared_ptr<PhysicalShard>>* physicalShards) {
+		state Reference<Histogram> histogram = Histogram::getHistogram(
+		    ROCKSDBSTORAGE_HISTOGRAM_GROUP, "TimeSpentRefreshIterators"_sr, Histogram::Unit::microseconds);
+		if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) {
+			try {
+				wait(readyToStart);
+				loop {
+					wait(delay(SERVER_KNOBS->ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME));
+					if (rState->closing) {
+						break;
+					}
+					double startTime = timer_monotonic();
+					for (auto& [_, shard] : *physicalShards) {
+						if (shard->initialized()) {
+							shard->refreshReadIteratorPool();
+						}
+					}
+					histogram->sample(timer_monotonic() - startTime);
+				}
+			} catch (Error& e) {
+				if (e.code() != error_code_actor_cancelled) {
+					TraceEvent(SevError, "RefreshReadIteratorPoolError").errorUnsuppressed(e);
+				}
+			}
+		}
+		return Void();
+	}
 	struct Writer : IThreadPoolReceiver {
 		int threadIndex;
 		std::unordered_map<uint32_t, rocksdb::ColumnFamilyHandle*>* columnFamilyMap;
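The replacement actor above fixes the lifecycle in three ways: it waits on `readyToStart` (the store's `openFuture`) so it cannot touch shards before open completes, it checks `rState->closing` on every tick, and it treats `actor_cancelled` as a normal shutdown path. The general shape of this "holder future" pattern, as a Flow-style sketch (illustrative, not the committed code):

```cpp
ACTOR Future<Void> periodicTask(std::shared_ptr<ShardedRocksDBState> rState, Future<Void> readyToStart) {
	try {
		wait(readyToStart); // do nothing until the store has finished opening
		loop {
			wait(delay(1.0)); // illustrative interval
			if (rState->closing) {
				break; // cooperative shutdown via the shared flag
			}
			// ... periodic work ...
		}
	} catch (Error& e) {
		if (e.code() != error_code_actor_cancelled) {
			throw; // cancellation via the holder future is the expected stop path
		}
	}
	return Void();
}

// Held as a member:    refreshHolder = periodicTask(rState, openFuture);
// Stopped in doClose:  refreshHolder.cancel();
```

Keeping the returned `Future<Void>` as a store member ties the actor's lifetime to the store: dropping or cancelling the holder tears the loop down before the underlying RocksDB state is deleted.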
@@ -1360,15 +1387,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 				return;
 			}
-			if (g_network->isSimulated()) {
-				a.metrics = refreshReadIteratorPool(a.shardManager->getAllShards());
-			} else {
-				onMainThread([&] {
-					a.metrics = refreshReadIteratorPool(a.shardManager->getAllShards());
-					return Future<bool>(true);
-				}).blockUntilReady();
-			}
 			TraceEvent(SevInfo, "RocksDB").detail("Method", "Open");
 			a.done.send(Void());
 		}
} }
@@ -1865,7 +1883,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 	// Persist shard mapping key range should not be in shardMap.
 	explicit ShardedRocksDBKeyValueStore(const std::string& path, UID id)
-	  : path(path), id(id), readSemaphore(SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
+	  : rState(std::make_shared<ShardedRocksDBState>()), path(path), id(id),
+	    readSemaphore(SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
 	    fetchSemaphore(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
 	    numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
 	    numFetchWaiters(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
@@ -1898,8 +1917,10 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 	Future<Void> getError() const override { return errorFuture; }
 	ACTOR static void doClose(ShardedRocksDBKeyValueStore* self, bool deleteOnClose) {
+		self->rState->closing = true;
 		// The metrics future retains a reference to the DB, so stop it before we delete it.
 		self->metrics.reset();
+		self->refreshHolder.cancel();
 		wait(self->readThreads->stop());
 		auto a = new Writer::CloseAction(&self->shardManager, deleteOnClose);
@@ -1930,6 +1951,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 		auto a = std::make_unique<Writer::OpenAction>(
 		    &shardManager, metrics, &readSemaphore, &fetchSemaphore, errorListener);
 		openFuture = a->done.getFuture();
+		this->refreshHolder = refreshReadIteratorPools(this->rState, openFuture, shardManager.getAllShards());
 		writeThread->post(a.release());
 		return openFuture;
 	}
@@ -2092,6 +2114,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 	// Used for debugging shard mapping issue.
 	std::vector<std::pair<KeyRange, std::string>> getDataMapping() { return shardManager.getDataMapping(); }
+	std::shared_ptr<ShardedRocksDBState> rState;
+	ShardManager shardManager;
 	std::shared_ptr<RocksDBMetrics> rocksDBMetrics;
 	std::string path;
 	const std::string dataPath;
@@ -2101,7 +2125,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 	std::shared_ptr<RocksDBErrorListener> errorListener;
 	Future<Void> errorFuture;
 	Promise<Void> closePromise;
-	ShardManager shardManager;
 	Future<Void> openFuture;
 	Optional<Future<Void>> metrics;
 	FlowLock readSemaphore;
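Moving `shardManager` up (it now sits right after `rState`, per the earlier hunk) matters because C++ constructs non-static data members in declaration order and destroys them in reverse, regardless of the constructor's initializer list. Declaring `rState` and `shardManager` first means the futures and promises that may still reference them are torn down before they are. A self-contained illustration (`Tracer` and `Store` are hypothetical):

```cpp
#include <iostream>

struct Tracer {
	const char* name;
	explicit Tracer(const char* n) : name(n) {}
	~Tracer() { std::cout << "destroying " << name << "\n"; }
};

struct Store {
	Tracer state{ "state" };   // declared first: constructed first, destroyed last
	Tracer future{ "future" }; // declared last: constructed last, destroyed first
};

int main() {
	Store s;
} // prints "destroying future" then "destroying state"
```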
@@ -2109,6 +2132,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 	FlowLock fetchSemaphore;
 	int numFetchWaiters;
 	Counters counters;
+	Future<Void> refreshHolder;
 };
 } // namespace


@@ -190,7 +190,7 @@ if(WITH_PYTHON)
   add_fdb_test(TEST_FILES noSim/RandomUnitTests.toml UNIT)
   if (WITH_ROCKSDB_EXPERIMENTAL)
     add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml)
-    add_fdb_test(TEST_FILES noSim/ShardedRocksDBTest.toml IGNORE) # TODO: re-enable once storage engine is stable.
+    add_fdb_test(TEST_FILES noSim/ShardedRocksDBTest.toml UNIT)
     add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml)
   else()
     add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml IGNORE)