diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 994f367292..6d74c2f67a 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -335,7 +335,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // KeyValueStoreRocksDB init( ROCKSDB_BACKGROUND_PARALLELISM, 0 ); init( ROCKSDB_READ_PARALLELISM, 4 ); - init( ROCKSDB_MEMTABLE_BYTES, 512 * 1024 * 1024 ); + // Use a smaller memtable in simulation to avoid OOMs. + int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024; + init( ROCKSDB_MEMTABLE_BYTES, memtableBytes ); init( ROCKSDB_UNSAFE_AUTO_FSYNC, false ); init( ROCKSDB_PERIODIC_COMPACTION_SECONDS, 0 ); init( ROCKSDB_PREFIX_LEN, 0 ); diff --git a/fdbserver/KeyValueStoreRocksDB.actor.cpp b/fdbserver/KeyValueStoreRocksDB.actor.cpp index 99278a7862..08dee91f04 100644 --- a/fdbserver/KeyValueStoreRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp @@ -7,6 +7,7 @@ #include <rocksdb/slice_transform.h> #include <rocksdb/table.h> #include <rocksdb/utilities/table_properties_collectors.h> +#include "fdbserver/CoroFlow.h" #include "flow/flow.h" #include "flow/IThreadPool.h" @@ -283,7 +284,9 @@ struct RocksDBKeyValueStore : IKeyValueStore { std::min(value.size(), size_t(a.maxLength))))); } else { if (!s.IsNotFound()) { - TraceEvent(SevError, "RocksDBError").detail("Error", s.ToString()).detail("Method", "ReadValuePrefix"); + TraceEvent(SevError, "RocksDBError") + .detail("Error", s.ToString()) + .detail("Method", "ReadValuePrefix"); } a.result.send(Optional<Value>()); } @@ -367,8 +370,23 @@ struct RocksDBKeyValueStore : IKeyValueStore { std::unique_ptr<rocksdb::WriteBatch> writeBatch; explicit RocksDBKeyValueStore(const std::string& path, UID id) : path(path), id(id) { - writeThread = createGenericThreadPool(); - readThreads = createGenericThreadPool(); + // In simluation, run the reader/writer threads as Coro threads (i.e. in the network thread. The storage engine + // is still multi-threaded as background compaction threads are still present. Reads/writes to disk will also + // block the network thread in a way that would be unacceptable in production but is a necessary evil here. When + // performing the reads in background threads in simulation, the event loop thinks there is no work to do and + // advances time faster than 1 sec/sec. By the time the blocking read actually finishes, simulation has advanced + // time by more than 5 seconds, so every read fails with a transaction_too_old error. Doing blocking IO on the + // main thread solves this issue. There are almost certainly better fixes, but my goal was to get a less + // invasive change merged first and work on a more realistic version if/when we think that would provide + // substantially more confidence in the correctness. + // TODO: Adapt the simulation framework to not advance time quickly when background reads/writes are occurring. + if (g_network->isSimulated()) { + writeThread = CoroThreadPool::createThreadPool(); + readThreads = CoroThreadPool::createThreadPool(); + } else { + writeThread = createGenericThreadPool(); + readThreads = createGenericThreadPool(); + } writeThread->addThread(new Writer(db, id), "fdb-rocksdb-wr"); for (unsigned i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; ++i) { readThreads->addThread(new Reader(db), "fdb-rocksdb-re"); diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index a8a152f141..5f656f13f1 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -22,6 +22,7 @@ #include <fstream> #include <ostream> #include <sstream> +#include <string_view> #include <toml.hpp> #include "fdbrpc/Locality.h" #include "fdbrpc/simulator.h" @@ -50,6 +51,19 @@ extern const char* getSourceVersion(); using namespace std::literals; +// TODO: Defining these here is just asking for ODR violations. +template <> +std::string describe(bool const& val) { + return val ? "true" : "false"; +} + +template <> +std::string describe(int const& val) { + return format("%d", val); +} + +namespace { + const int MACHINE_REBOOT_TIME = 10; bool destructed = false; @@ -232,6 +246,7 @@ public: // 1 = "memory" // 2 = "memory-radixtree-beta" // 3 = "ssd-redwood-experimental" + // 4 = "ssd-rocksdb-experimental" // Requires a comma-separated list of numbers WITHOUT whitespaces std::vector<int> storageEngineExcludeTypes; // Set the maximum TLog version that can be selected for a test @@ -629,16 +644,6 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<ClusterConnec } } -template <> -std::string describe(bool const& val) { - return val ? "true" : "false"; -} - -template <> -std::string describe(int const& val) { - return format("%d", val); -} - // Since a datacenter kill is considered to be the same as killing a machine, files cannot be swapped across datacenters std::map<Optional<Standalone<StringRef>>, std::vector<std::vector<std::string>>> availableFolders; // process count is no longer needed because it is now the length of the vector of ip's, because it was one ip per @@ -1252,7 +1257,7 @@ void SimulationConfig::setDatacenters(const TestConfig& testConfig) { // Sets storage engine based on testConfig details void SimulationConfig::setStorageEngine(const TestConfig& testConfig) { - int storage_engine_type = deterministicRandom()->randomInt(0, 4); + int storage_engine_type = deterministicRandom()->randomInt(0, 5); if (testConfig.storageEngineType.present()) { storage_engine_type = testConfig.storageEngineType.get(); } else { @@ -1260,7 +1265,7 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) { while (std::find(testConfig.storageEngineExcludeTypes.begin(), testConfig.storageEngineExcludeTypes.end(), storage_engine_type) != testConfig.storageEngineExcludeTypes.end()) { - storage_engine_type = deterministicRandom()->randomInt(0, 4); + storage_engine_type = deterministicRandom()->randomInt(0, 5); } } @@ -1285,6 +1290,16 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) { set_config("ssd-redwood-experimental"); break; } + case 4: { + TEST(true); // Simulated cluster using RocksDB storage engine + set_config("ssd-rocksdb-experimental"); + // Tests using the RocksDB engine are necessarily non-deterministic because of RocksDB + // background threads. + TraceEvent(SevWarn, "RocksDBNonDeterminism") + .detail("Explanation", "The RocksDB storage engine is threaded and non-deterministic"); + noUnseed = true; + break; + } default: ASSERT(false); // Programmer forgot to adjust cases. } @@ -2081,9 +2096,17 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors, using namespace std::literals; +#ifdef SSD_ROCKSDB_EXPERIMENTAL +bool rocksDBEnabled = true; +#else +bool rocksDBEnabled = false; +#endif + // Populates the TestConfig fields according to what is found in the test file. void checkTestConf(const char* testFile, TestConfig* testConfig) {} +} // namespace + ACTOR void setupAndRun(std::string dataFolder, const char* testFile, bool rebooting, @@ -2098,6 +2121,19 @@ ACTOR void setupAndRun(std::string dataFolder, g_simulator.hasDiffProtocolProcess = testConfig.startIncompatibleProcess; g_simulator.setDiffProtocol = false; + // The RocksDB storage engine does not support the restarting tests because you cannot consistently get a clean + // snapshot of the storage engine without a snapshotting file system. + // https://github.com/apple/foundationdb/issues/5155 + if (std::string_view(testFile).find("restarting") != std::string_view::npos) { + testConfig.storageEngineExcludeTypes.push_back(4); + } + + // The RocksDB engine is not always built with the rest of fdbserver. Don't try to use it if it is not included + // in the build. + if (!rocksDBEnabled) { + testConfig.storageEngineExcludeTypes.push_back(4); + } + state ProtocolVersion protocolVersion = currentProtocolVersion; if (testConfig.startIncompatibleProcess) { // isolates right most 1 bit of compatibleProtocolVersionMask to make this protocolVersion incompatible