#ifdef SSD_ROCKSDB_EXPERIMENTAL

#include <rocksdb/db.h>
#include <rocksdb/options.h>
#include <rocksdb/utilities/table_properties_collectors.h>
#include "flow/flow.h"
#include "flow/IThreadPool.h"

#endif // SSD_ROCKSDB_EXPERIMENTAL

#include "fdbserver/IKeyValueStore.h"
#include "flow/actorcompiler.h" // has to be last include

#ifdef SSD_ROCKSDB_EXPERIMENTAL

namespace {

rocksdb::Slice toSlice(StringRef s) {
    return rocksdb::Slice(reinterpret_cast<const char*>(s.begin()), s.size());
}

StringRef toStringRef(rocksdb::Slice s) {
    return StringRef(reinterpret_cast<const uint8_t*>(s.data()), s.size());
}

rocksdb::ColumnFamilyOptions getCFOptions() {
    rocksdb::ColumnFamilyOptions options;
    options.level_compaction_dynamic_level_bytes = true;
    options.OptimizeLevelStyleCompaction(SERVER_KNOBS->ROCKSDB_MEMTABLE_BYTES);
    if (SERVER_KNOBS->ROCKSDB_PERIODIC_COMPACTION_SECONDS > 0) {
        options.periodic_compaction_seconds = SERVER_KNOBS->ROCKSDB_PERIODIC_COMPACTION_SECONDS;
    }
    // Compact sstables when there's too much deleted stuff.
    options.table_properties_collector_factories = { rocksdb::NewCompactOnDeletionCollectorFactory(128, 1) };
    return options;
}

rocksdb::Options getOptions() {
    rocksdb::Options options({}, getCFOptions());
    options.avoid_unnecessary_blocking_io = true;
    options.create_if_missing = true;
    if (SERVER_KNOBS->ROCKSDB_BACKGROUND_PARALLELISM > 0) {
        options.IncreaseParallelism(SERVER_KNOBS->ROCKSDB_BACKGROUND_PARALLELISM);
    }
    return options;
}

struct RocksDBKeyValueStore : IKeyValueStore {
    using DB = rocksdb::DB*;
    using CF = rocksdb::ColumnFamilyHandle*;

    struct Writer : IThreadPoolReceiver {
        DB& db;
        UID id;

        explicit Writer(DB& db, UID id) : db(db), id(id) {}

        ~Writer() override {
            if (db) {
                delete db;
            }
        }

        void init() override {}

        Error statusToError(const rocksdb::Status& s) {
            if (s == rocksdb::Status::IOError()) {
                return io_error();
            } else {
                return unknown_error();
            }
        }

        struct OpenAction : TypedAction<Writer, OpenAction> {
            std::string path;
            ThreadReturnPromise<Void> done;

            double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; }
        };
        void action(OpenAction& a) {
            std::vector<rocksdb::ColumnFamilyDescriptor> defaultCF = { rocksdb::ColumnFamilyDescriptor{
                "default", getCFOptions() } };
            std::vector<rocksdb::ColumnFamilyHandle*> handle;
            auto status = rocksdb::DB::Open(getOptions(), a.path, defaultCF, &handle, &db);
            if (!status.ok()) {
                TraceEvent(SevError, "RocksDBError").detail("Error", status.ToString()).detail("Method", "Open");
                a.done.sendError(statusToError(status));
            } else {
                TraceEvent(SevInfo, "RocksDB").detail("Path", a.path).detail("Method", "Open");
                a.done.send(Void());
            }
        }

        struct DeleteVisitor : public rocksdb::WriteBatch::Handler {
            VectorRef<KeyRangeRef>& deletes;
            Arena& arena;

            DeleteVisitor(VectorRef<KeyRangeRef>& deletes, Arena& arena) : deletes(deletes), arena(arena) {}

            rocksdb::Status DeleteRangeCF(uint32_t /*column_family_id*/,
                                          const rocksdb::Slice& begin,
                                          const rocksdb::Slice& end) override {
                KeyRangeRef kr(toStringRef(begin), toStringRef(end));
                deletes.push_back_deep(arena, kr);
                return rocksdb::Status::OK();
            }
        };

        struct CommitAction : TypedAction<Writer, CommitAction> {
            std::unique_ptr<rocksdb::WriteBatch> batchToCommit;
            ThreadReturnPromise<Void> done;
            double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; }
        };
        void action(CommitAction& a) {
            Standalone<VectorRef<KeyRangeRef>> deletes;
            DeleteVisitor dv(deletes, deletes.arena());
            ASSERT(a.batchToCommit->Iterate(&dv).ok());
            // If there are any range deletes, we should have added them to be deleted.
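            // The DeleteVisitor has walked the batch once, so every DeleteRange entry is now recorded in
            // `deletes`; those ranges are reused after a successful write to suggest compaction.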
            ASSERT(!deletes.empty() || !a.batchToCommit->HasDeleteRange());
            rocksdb::WriteOptions options;
            options.sync = !SERVER_KNOBS->ROCKSDB_UNSAFE_AUTO_FSYNC;
            auto s = db->Write(options, a.batchToCommit.get());
            if (!s.ok()) {
                TraceEvent(SevError, "RocksDBError").detail("Error", s.ToString()).detail("Method", "Commit");
                a.done.sendError(statusToError(s));
            } else {
                a.done.send(Void());
                for (const auto& keyRange : deletes) {
                    auto begin = toSlice(keyRange.begin);
                    auto end = toSlice(keyRange.end);
                    ASSERT(db->SuggestCompactRange(db->DefaultColumnFamily(), &begin, &end).ok());
                }
            }
        }

        struct CloseAction : TypedAction<Writer, CloseAction> {
            ThreadReturnPromise<Void> done;
            std::string path;
            bool deleteOnClose;
            CloseAction(std::string path, bool deleteOnClose) : path(path), deleteOnClose(deleteOnClose) {}
            double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; }
        };
        void action(CloseAction& a) {
            if (db == nullptr) {
                a.done.send(Void());
                return;
            }
            auto s = db->Close();
            if (!s.ok()) {
                TraceEvent(SevError, "RocksDBError").detail("Error", s.ToString()).detail("Method", "Close");
            }
            if (a.deleteOnClose) {
                std::vector<rocksdb::ColumnFamilyDescriptor> defaultCF = { rocksdb::ColumnFamilyDescriptor{
                    "default", getCFOptions() } };
                s = rocksdb::DestroyDB(a.path, getOptions(), defaultCF);
                if (!s.ok()) {
                    TraceEvent(SevError, "RocksDBError").detail("Error", s.ToString()).detail("Method", "Destroy");
                } else {
                    TraceEvent(SevInfo, "RocksDB").detail("Path", a.path).detail("Method", "Destroy");
                }
            }
            TraceEvent(SevInfo, "RocksDB").detail("Path", a.path).detail("Method", "Close");
            a.done.send(Void());
        }
    };

    struct Reader : IThreadPoolReceiver {
        DB& db;

        explicit Reader(DB& db) : db(db) {}

        void init() override {}

        struct ReadValueAction : TypedAction<Reader, ReadValueAction> {
            Key key;
            Optional<UID> debugID;
            ThreadReturnPromise<Optional<Value>> result;
            ReadValueAction(KeyRef key, Optional<UID> debugID) : key(key), debugID(debugID) {}
            double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; }
        };
        void action(ReadValueAction& a) {
            Optional<TraceBatch> traceBatch;
            if (a.debugID.present()) {
                traceBatch = { TraceBatch{} };
                traceBatch.get().addEvent("GetValueDebug", a.debugID.get().first(), "Reader.Before");
            }
            rocksdb::PinnableSlice value;
            auto s = db->Get({}, db->DefaultColumnFamily(), toSlice(a.key), &value);
            if (a.debugID.present()) {
                traceBatch.get().addEvent("GetValueDebug", a.debugID.get().first(), "Reader.After");
                traceBatch.get().dump();
            }
            if (s.ok()) {
                a.result.send(Value(toStringRef(value)));
            } else {
                if (!s.IsNotFound()) {
                    TraceEvent(SevError, "RocksDBError").detail("Error", s.ToString()).detail("Method", "ReadValue");
                }
                a.result.send(Optional<Value>());
            }
        }

        struct ReadValuePrefixAction : TypedAction<Reader, ReadValuePrefixAction> {
            Key key;
            int maxLength;
            Optional<UID> debugID;
            ThreadReturnPromise<Optional<Value>> result;
            ReadValuePrefixAction(Key key, int maxLength, Optional<UID> debugID)
              : key(key), maxLength(maxLength), debugID(debugID) {}
            double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; }
        };
        void action(ReadValuePrefixAction& a) {
            rocksdb::PinnableSlice value;
            Optional<TraceBatch> traceBatch;
            if (a.debugID.present()) {
                traceBatch = { TraceBatch{} };
                traceBatch.get().addEvent("GetValuePrefixDebug", a.debugID.get().first(),
                                          "Reader.Before"); //.detail("TaskID", g_network->getCurrentTask());
            }
            auto s = db->Get({}, db->DefaultColumnFamily(), toSlice(a.key), &value);
            if (a.debugID.present()) {
                traceBatch.get().addEvent("GetValuePrefixDebug", a.debugID.get().first(),
                                          "Reader.After"); //.detail("TaskID", g_network->getCurrentTask());
                traceBatch.get().dump();
            }
            if (s.ok()) {
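                // Send back at most maxLength bytes of the value; a prefix read never needs more.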
                a.result.send(Value(StringRef(reinterpret_cast<const uint8_t*>(value.data()),
                                              std::min(value.size(), size_t(a.maxLength)))));
            } else {
                TraceEvent(SevError, "RocksDBError").detail("Error", s.ToString()).detail("Method", "ReadValuePrefix");
                a.result.send(Optional<Value>());
            }
        }

        struct ReadRangeAction : TypedAction<Reader, ReadRangeAction>, FastAllocated<ReadRangeAction> {
            KeyRange keys;
            int rowLimit, byteLimit;
            ThreadReturnPromise<Standalone<RangeResultRef>> result;
            ReadRangeAction(KeyRange keys, int rowLimit, int byteLimit)
              : keys(keys), rowLimit(rowLimit), byteLimit(byteLimit) {}
            double getTimeEstimate() const override { return SERVER_KNOBS->READ_RANGE_TIME_ESTIMATE; }
        };
        void action(ReadRangeAction& a) {
            Standalone<RangeResultRef> result;
            if (a.rowLimit == 0 || a.byteLimit == 0) {
                // Nothing can be returned; send the empty result exactly once and stop.
                a.result.send(result);
                return;
            }
            int accumulatedBytes = 0;
            rocksdb::Status s;
            if (a.rowLimit >= 0) {
                rocksdb::ReadOptions options;
                auto endSlice = toSlice(a.keys.end);
                options.iterate_upper_bound = &endSlice;
                auto cursor = std::unique_ptr<rocksdb::Iterator>(db->NewIterator(options));
                cursor->Seek(toSlice(a.keys.begin));
                while (cursor->Valid() && toStringRef(cursor->key()) < a.keys.end) {
                    KeyValueRef kv(toStringRef(cursor->key()), toStringRef(cursor->value()));
                    accumulatedBytes += sizeof(KeyValueRef) + kv.expectedSize();
                    result.push_back_deep(result.arena(), kv);
                    // Calling `cursor->Next()` is potentially expensive, so short-circuit here just in case.
                    if (result.size() >= a.rowLimit || accumulatedBytes >= a.byteLimit) {
                        break;
                    }
                    cursor->Next();
                }
                s = cursor->status();
            } else {
                rocksdb::ReadOptions options;
                auto beginSlice = toSlice(a.keys.begin);
                options.iterate_lower_bound = &beginSlice;
                auto cursor = std::unique_ptr<rocksdb::Iterator>(db->NewIterator(options));
                cursor->SeekForPrev(toSlice(a.keys.end));
                if (cursor->Valid() && toStringRef(cursor->key()) == a.keys.end) {
                    cursor->Prev();
                }
                while (cursor->Valid() && toStringRef(cursor->key()) >= a.keys.begin) {
                    KeyValueRef kv(toStringRef(cursor->key()), toStringRef(cursor->value()));
                    accumulatedBytes += sizeof(KeyValueRef) + kv.expectedSize();
                    result.push_back_deep(result.arena(), kv);
                    // Calling `cursor->Prev()` is potentially expensive, so short-circuit here just in case.
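                    // rowLimit is negative for reverse scans, so the row cap is its absolute value.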
                    if (result.size() >= -a.rowLimit || accumulatedBytes >= a.byteLimit) {
                        break;
                    }
                    cursor->Prev();
                }
                s = cursor->status();
            }
            if (!s.ok()) {
                TraceEvent(SevError, "RocksDBError").detail("Error", s.ToString()).detail("Method", "ReadRange");
            }
            result.more =
                (result.size() == a.rowLimit) || (result.size() == -a.rowLimit) || (accumulatedBytes >= a.byteLimit);
            if (result.more) {
                result.readThrough = result[result.size() - 1].key;
            }
            a.result.send(result);
        }
    };

    DB db = nullptr;
    std::string path;
    UID id;
    Reference<IThreadPool> writeThread;
    Reference<IThreadPool> readThreads;
    Promise<Void> errorPromise;
    Promise<Void> closePromise;
    std::unique_ptr<rocksdb::WriteBatch> writeBatch;

    explicit RocksDBKeyValueStore(const std::string& path, UID id) : path(path), id(id) {
        writeThread = createGenericThreadPool();
        readThreads = createGenericThreadPool();
        writeThread->addThread(new Writer(db, id));
        for (unsigned i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; ++i) {
            readThreads->addThread(new Reader(db));
        }
    }

    Future<Void> getError() override { return errorPromise.getFuture(); }

    ACTOR static void doClose(RocksDBKeyValueStore* self, bool deleteOnClose) {
        wait(self->readThreads->stop());
        auto a = new Writer::CloseAction(self->path, deleteOnClose);
        auto f = a->done.getFuture();
        self->writeThread->post(a);
        wait(f);
        wait(self->writeThread->stop());
        if (self->closePromise.canBeSet()) self->closePromise.send(Void());
        if (self->errorPromise.canBeSet()) self->errorPromise.send(Never());
        delete self;
    }

    Future<Void> onClosed() override { return closePromise.getFuture(); }

    void dispose() override { doClose(this, true); }

    void close() override { doClose(this, false); }

    KeyValueStoreType getType() const override { return KeyValueStoreType(KeyValueStoreType::SSD_ROCKSDB_V1); }

    Future<Void> init() override {
        std::unique_ptr<Writer::OpenAction> a(new Writer::OpenAction());
        a->path = path;
        auto res = a->done.getFuture();
        writeThread->post(a.release());
        return res;
    }

    void set(KeyValueRef kv, const Arena*) override {
        if (writeBatch == nullptr) {
            writeBatch.reset(new rocksdb::WriteBatch());
        }
        writeBatch->Put(toSlice(kv.key), toSlice(kv.value));
    }

    void clear(KeyRangeRef keyRange, const Arena*) override {
        if (writeBatch == nullptr) {
            writeBatch.reset(new rocksdb::WriteBatch());
        }
        writeBatch->DeleteRange(toSlice(keyRange.begin), toSlice(keyRange.end));
    }

    Future<Void> commit(bool) override {
        // If there is nothing to write, don't write.
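        // A null writeBatch means no set() or clear() calls have accumulated since the last commit.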
        if (writeBatch == nullptr) {
            return Void();
        }
        auto a = new Writer::CommitAction();
        a->batchToCommit = std::move(writeBatch);
        auto res = a->done.getFuture();
        writeThread->post(a);
        return res;
    }

    Future<Optional<Value>> readValue(KeyRef key, Optional<UID> debugID) override {
        auto a = new Reader::ReadValueAction(key, debugID);
        auto res = a->result.getFuture();
        readThreads->post(a);
        return res;
    }

    Future<Optional<Value>> readValuePrefix(KeyRef key, int maxLength, Optional<UID> debugID) override {
        auto a = new Reader::ReadValuePrefixAction(key, maxLength, debugID);
        auto res = a->result.getFuture();
        readThreads->post(a);
        return res;
    }

    Future<Standalone<RangeResultRef>> readRange(KeyRangeRef keys, int rowLimit, int byteLimit) override {
        auto a = new Reader::ReadRangeAction(keys, rowLimit, byteLimit);
        auto res = a->result.getFuture();
        readThreads->post(a);
        return res;
    }

    StorageBytes getStorageBytes() const override {
        uint64_t live = 0;
        ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kEstimateLiveDataSize, &live));

        int64_t free;
        int64_t total;
        g_network->getDiskBytes(path, free, total);

        return StorageBytes(free, total, live, free);
    }
};

} // namespace

#endif // SSD_ROCKSDB_EXPERIMENTAL

IKeyValueStore* keyValueStoreRocksDB(std::string const& path,
                                     UID logID,
                                     KeyValueStoreType storeType,
                                     bool checkChecksums,
                                     bool checkIntegrity) {
#ifdef SSD_ROCKSDB_EXPERIMENTAL
    return new RocksDBKeyValueStore(path, logID);
#else
    TraceEvent(SevError, "RocksDBEngineInitFailure").detail("Reason", "Built without RocksDB");
    ASSERT(false);
    return nullptr;
#endif // SSD_ROCKSDB_EXPERIMENTAL
}