From d128252e904f9cc0ee83c6218610d13daa10a399 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 22 May 2020 09:25:32 -0700 Subject: [PATCH] Merge release-6.3 into master --- README.md | 88 +- .../tuple/FastByteComparisons.java | 2 +- cmake/ConfigureCompiler.cmake | 10 + .../transaction_profiling_analyzer.py | 52 +- design/special-key-space.md | 21 +- .../source/mr-status-json-schemas.rst.inc | 2 - fdbbackup/FileDecoder.actor.cpp | 11 +- fdbbackup/backup.actor.cpp | 18 +- fdbcli/fdbcli.actor.cpp | 367 ++++++-- fdbclient/ClientLogEvents.h | 48 +- fdbclient/FDBTypes.h | 15 + fdbclient/IncludeVersions.h | 28 - fdbclient/Knobs.cpp | 1 + fdbclient/Knobs.h | 1 + fdbclient/MasterProxyInterface.h | 55 +- fdbclient/NativeAPI.actor.cpp | 41 +- fdbclient/NativeAPI.actor.h | 2 + fdbclient/Schemas.cpp | 2 - fdbclient/SpecialKeySpace.actor.cpp | 36 +- fdbclient/SpecialKeySpace.actor.h | 20 +- fdbclient/StorageServerInterface.h | 31 +- fdbclient/SystemData.cpp | 8 + fdbclient/SystemData.h | 1 + fdbclient/TagThrottle.actor.cpp | 98 +- fdbclient/TagThrottle.h | 29 +- fdbclient/ThreadSafeTransaction.actor.cpp | 2 +- fdbmonitor/fdbmonitor.cpp | 2 +- fdbrpc/ActorFuzz.actor.cpp | 60 +- fdbrpc/ActorFuzz.h | 8 - fdbrpc/AsyncFileCached.actor.cpp | 20 +- fdbrpc/AsyncFileCached.actor.h | 10 +- fdbrpc/FailureMonitor.actor.cpp | 14 +- fdbrpc/FailureMonitor.h | 2 + fdbrpc/FlowTransport.actor.cpp | 11 +- fdbrpc/FlowTransport.h | 26 +- fdbrpc/actorFuzz.py | 2 +- fdbrpc/sim2.actor.cpp | 11 - fdbserver/BackupProgress.actor.cpp | 13 +- fdbserver/BackupWorker.actor.cpp | 81 +- fdbserver/CMakeLists.txt | 1 + fdbserver/DataDistribution.actor.cpp | 18 +- fdbserver/DataDistribution.actor.h | 10 + fdbserver/DataDistributionTracker.actor.cpp | 51 ++ fdbserver/DataDistributorInterface.h | 30 +- fdbserver/FDBExecHelper.actor.cpp | 2 +- fdbserver/Knobs.cpp | 10 +- fdbserver/Knobs.h | 10 +- fdbserver/MasterInterface.h | 14 +- fdbserver/MasterProxyServer.actor.cpp | 20 + fdbserver/Ratekeeper.actor.cpp | 4 +- fdbserver/RestoreLoader.actor.cpp | 3 + fdbserver/SimulatedCluster.actor.cpp | 2 +- fdbserver/SkipList.cpp | 2 +- fdbserver/Status.actor.cpp | 33 +- fdbserver/TLogInterface.h | 26 +- fdbserver/VersionedBTree.actor.cpp | 841 ++++++++++++------ fdbserver/fdbserver.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 1 + ...kupAndParallelRestoreCorrectness.actor.cpp | 4 +- ...entTransactionProfileCorrectness.actor.cpp | 8 +- .../workloads/ConfigureDatabase.actor.cpp | 7 +- .../DataDistributionMetrics.actor.cpp | 108 +++ fdbserver/workloads/TagThrottleApi.actor.cpp | 94 +- fdbservice/FDBService.cpp | 2 +- flow/IThreadPool.cpp | 2 +- flow/TLSConfig.actor.cpp | 2 +- flow/network.h | 20 +- tests/CMakeLists.txt | 1 + tests/DataDistributionMetrics.txt | 21 + 69 files changed, 1762 insertions(+), 836 deletions(-) delete mode 100644 fdbclient/IncludeVersions.h mode change 100644 => 100755 fdbrpc/actorFuzz.py create mode 100644 fdbserver/workloads/DataDistributionMetrics.actor.cpp create mode 100644 tests/DataDistributionMetrics.txt diff --git a/README.md b/README.md index e27dca73fc..e42dfccd63 100755 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Contributing to FoundationDB can be in contributions to the code base, sharing y ### Binary downloads -Developers interested in using the FoundationDB store for an application can get started easily by downloading and installing a binary package. Please see the [downloads page](https://www.foundationdb.org/download/) for a list of available packages. 
+Developers interested in using FoundationDB can get started by downloading and installing a binary package. Please see the [downloads page](https://www.foundationdb.org/download/) for a list of available packages. ### Compiling from source @@ -28,44 +28,24 @@ Developers interested in using the FoundationDB store for an application can get Developers on an OS for which there is no binary package, or who would like to start hacking on the code, can get started by compiling from source. -Currently there are two build systems: a collection of Makefiles and a -CMake-based build system. Both of them should currently work for most users, -and CMake should be the preferred choice as it will eventually become the only -build system available. +The official docker image for building is `foundationdb/foundationdb-build`. It has all dependencies installed. To build outside the official docker image you'll need at least these dependencies: + +1. Install cmake Version 3.13 or higher [CMake](https://cmake.org/) +1. Install [Mono](http://www.mono-project.com/download/stable/) +1. Install [Ninja](https://ninja-build.org/) (optional, but recommended) If compiling for local development, please set `-DUSE_WERROR=ON` in cmake. Our CI compiles with `-Werror` on, so this way you'll find out about compiler warnings that break the build earlier. -## CMake - -To build with CMake, generally the following is required (works on Linux and -Mac OS - for Windows see below): +Once you have your dependencies, you can run cmake and then build: 1. Check out this repository. -1. Install cmake Version 3.13 or higher [CMake](https://cmake.org/) -1. Download version 1.67 of [Boost](https://sourceforge.net/projects/boost/files/boost/1.67.0/). -1. Unpack boost (you don't need to compile it) -1. Install [Mono](http://www.mono-project.com/download/stable/). -1. Install a [JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html). FoundationDB currently builds with Java 8. 1. Create a build directory (you can have the build directory anywhere you - like): `mkdir build` -1. `cd build` -1. `cmake -GNinja -DBOOST_ROOT= ` -1. `ninja` - -CMake will try to find its dependencies. However, for LibreSSL this can be often -problematic (especially if OpenSSL is installed as well). For that we recommend -passing the argument `-DLibreSSL_ROOT` to cmake. So, for example, if you -LibreSSL is installed under `/usr/local/libressl-2.8.3`, you should call cmake like -this: - -``` -cmake -GNinja -DLibreSSL_ROOT=/usr/local/libressl-2.8.3/ ../foundationdb -``` - -FoundationDB will build just fine without LibreSSL, however, the resulting -binaries won't support TLS connections. + like). There is currently a directory in the source tree called build, but you should not use it. See [#3098](https://github.com/apple/foundationdb/issues/3098) +1. `cd ` +1. `cmake -G Ninja ` +1. `ninja # If this crashes it probably ran out of memory. Try ninja -j1` ### Language Bindings @@ -120,8 +100,7 @@ create a XCode-project with the following command: cmake -G Xcode -DOPEN_FOR_IDE=ON ``` -You should create a second build-directory which you will use for building -(probably with make or ninja) and debugging. +You should create a second build-directory which you will use for building and debugging. #### FreeBSD @@ -160,11 +139,8 @@ There are no special requirements for Linux. A docker image can be pulled from `foundationdb/foundationdb-build` that has all of FoundationDB's dependencies pre-installed, and is what the CI uses to build and test PRs. 
-If you want to create a package you have to tell cmake what platform it is for. -And then you can build by simply calling `cpack`. So for debian, call: - ``` -cmake -GNinja +cmake -G Ninja ninja cpack -G DEB ``` @@ -173,20 +149,15 @@ For RPM simply replace `DEB` with `RPM`. ### MacOS -The build under MacOS will work the same way as on Linux. To get LibreSSL, -boost, and ninja you can use [Homebrew](https://brew.sh/). LibreSSL will not be -installed in `/usr/local` instead it will stay in `/usr/local/Cellar`. So the -cmake command will look something like this: +The build under MacOS will work the same way as on Linux. To get boost and ninja you can use [Homebrew](https://brew.sh/). ```sh -cmake -GNinja -DLibreSSL_ROOT=/usr/local/Cellar/libressl/2.8.3 +cmake -G Ninja ``` -To generate a installable package, you have to call CMake with the corresponding -arguments and then use cpack to generate the package: +To generate a installable package, you can use cpack: ```sh -cmake -GNinja ninja cpack -G productbuild ``` @@ -198,15 +169,15 @@ that Visual Studio is used to compile. 1. Install Visual Studio 2017 (Community Edition is tested) 1. Install cmake Version 3.12 or higher [CMake](https://cmake.org/) -1. Download version 1.67 of [Boost](https://sourceforge.net/projects/boost/files/boost/1.67.0/). +1. Download version 1.72 of [Boost](https://dl.bintray.com/boostorg/release/1.72.0/source/boost_1_72_0.tar.bz2) 1. Unpack boost (you don't need to compile it) -1. Install [Mono](http://www.mono-project.com/download/stable/). -1. Install a [JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html). FoundationDB currently builds with Java 8. +1. Install [Mono](http://www.mono-project.com/download/stable/) +1. (Optional) Install a [JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html). FoundationDB currently builds with Java 8 1. Set `JAVA_HOME` to the unpacked location and JAVA_COMPILE to `$JAVA_HOME/bin/javac`. -1. Install [Python](https://www.python.org/downloads/) if it is not already installed by Visual Studio. +1. Install [Python](https://www.python.org/downloads/) if it is not already installed by Visual Studio 1. (Optional) Install [WIX](http://wixtoolset.org/). Without it Visual Studio - won't build the Windows installer. + won't build the Windows installer 1. Create a build directory (you can have the build directory anywhere you like): `mkdir build` 1. `cd build` @@ -218,22 +189,7 @@ that Visual Studio is used to compile. Studio will only know about the generated files. `msbuild` is located at `c:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe` for Visual Studio 15. -If you want TLS support to be enabled under Windows you currently have to build -and install LibreSSL yourself as the newer LibreSSL versions are not provided -for download from the LibreSSL homepage. To build LibreSSL: - -1. Download and unpack libressl (>= 2.8.2) -2. `cd libressl-2.8.2` -3. `mkdir build` -4. `cd build` -5. `cmake -G "Visual Studio 15 2017 Win64" ..` -6. Open the generated `LibreSSL.sln` in Visual Studio as administrator (this is - necessary for the install) -7. Build the `INSTALL` project in `Release` mode - -This will install LibreSSL under `C:\Program Files\LibreSSL`. After that `cmake` -will automatically find it and build with TLS support. - If you installed WIX before running `cmake` you should find the `FDBInstaller.msi` in your build directory under `packaging/msi`. 
+TODO: Re-add instructions for TLS support [#3022](https://github.com/apple/foundationdb/issues/3022) diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/FastByteComparisons.java b/bindings/java/src/main/com/apple/foundationdb/tuple/FastByteComparisons.java index 77add1db7f..83f5f399de 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/FastByteComparisons.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/FastByteComparisons.java @@ -1,5 +1,5 @@ /* - * ByteArrayUtil.java + * FastByteComparisons.java * * This source file is part of the FoundationDB open source project * diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index 1a45498b37..ddb2f38792 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -85,7 +85,17 @@ include(CheckFunctionExists) set(CMAKE_REQUIRED_INCLUDES stdlib.h malloc.h) set(CMAKE_REQUIRED_LIBRARIES c) set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_C_STANDARD 11) +set(CMAKE_C_STANDARD_REQUIRED ON) + +if(NOT WIN32) + include(CheckIncludeFile) + CHECK_INCLUDE_FILE("stdatomic.h" HAS_C11_ATOMICS) + if (NOT HAS_C11_ATOMICS) + message(FATAL_ERROR "C compiler does not support c11 atomics") + endif() +endif() if(WIN32) # see: https://docs.microsoft.com/en-us/windows/desktop/WinProg/using-the-windows-headers diff --git a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py index b57d3b506c..58e2cf2548 100755 --- a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py +++ b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py @@ -173,44 +173,45 @@ class Mutation(object): class BaseInfo(object): - def __init__(self, start_timestamp): - self.start_timestamp = start_timestamp - + def __init__(self, bb, protocol_version): + self.start_timestamp = bb.get_double() + if protocol_version >= PROTOCOL_VERSION_6_3: + self.dc_id = bb.get_bytes_with_length() class GetVersionInfo(BaseInfo): def __init__(self, bb, protocol_version): - super().__init__(bb.get_double()) + super().__init__(bb, protocol_version) self.latency = bb.get_double() if protocol_version >= PROTOCOL_VERSION_6_2: self.transaction_priority_type = bb.get_int() if protocol_version >= PROTOCOL_VERSION_6_3: - self.read_version = bb.get_long() + self.read_version = bb.get_long() class GetInfo(BaseInfo): - def __init__(self, bb): - super().__init__(bb.get_double()) + def __init__(self, bb, protocol_version): + super().__init__(bb, protocol_version) self.latency = bb.get_double() self.value_size = bb.get_int() self.key = bb.get_bytes_with_length() class GetRangeInfo(BaseInfo): - def __init__(self, bb): - super().__init__(bb.get_double()) + def __init__(self, bb, protocol_version): + super().__init__(bb, protocol_version) self.latency = bb.get_double() self.range_size = bb.get_int() self.key_range = bb.get_key_range() class CommitInfo(BaseInfo): - def __init__(self, bb, full_output=True): - super().__init__(bb.get_double()) + def __init__(self, bb, protocol_version, full_output=True): + super().__init__(bb, protocol_version) self.latency = bb.get_double() self.num_mutations = bb.get_int() self.commit_bytes = bb.get_int() - + if protocol_version >= PROTOCOL_VERSION_6_3: - self.commit_version = bb.get_long() + self.commit_version = bb.get_long() read_conflict_range = bb.get_key_range_list() if full_output: self.read_conflict_range = read_conflict_range @@ -225,22 +226,22 @@ class 
CommitInfo(BaseInfo): class ErrorGetInfo(BaseInfo): - def __init__(self, bb): - super().__init__(bb.get_double()) + def __init__(self, bb, protocol_version): + super().__init__(bb, protocol_version) self.error_code = bb.get_int() self.key = bb.get_bytes_with_length() class ErrorGetRangeInfo(BaseInfo): - def __init__(self, bb): - super().__init__(bb.get_double()) + def __init__(self, bb, protocol_version): + super().__init__(bb, protocol_version) self.error_code = bb.get_int() self.key_range = bb.get_key_range() class ErrorCommitInfo(BaseInfo): - def __init__(self, bb, full_output=True): - super().__init__(bb.get_double()) + def __init__(self, bb, protocol_version, full_output=True): + super().__init__(bb, protocol_version) self.error_code = bb.get_int() read_conflict_range = bb.get_key_range_list() @@ -282,33 +283,33 @@ class ClientTransactionInfo: if (not type_filter or "get_version" in type_filter): self.get_version = get_version elif event == 1: - get = GetInfo(bb) + get = GetInfo(bb, protocol_version) if (not type_filter or "get" in type_filter): # because of the crappy json serializtion using __dict__ we have to set the list here otherwise # it doesn't print if not self.gets: self.gets = [] self.gets.append(get) elif event == 2: - get_range = GetRangeInfo(bb) + get_range = GetRangeInfo(bb, protocol_version) if (not type_filter or "get_range" in type_filter): if not self.get_ranges: self.get_ranges = [] self.get_ranges.append(get_range) elif event == 3: - commit = CommitInfo(bb, full_output=full_output) + commit = CommitInfo(bb, protocol_version, full_output=full_output) if (not type_filter or "commit" in type_filter): self.commit = commit elif event == 4: - error_get = ErrorGetInfo(bb) + error_get = ErrorGetInfo(bb, protocol_version) if (not type_filter or "error_gets" in type_filter): if not self.error_gets: self.error_gets = [] self.error_gets.append(error_get) elif event == 5: - error_get_range = ErrorGetRangeInfo(bb) + error_get_range = ErrorGetRangeInfo(bb, protocol_version) if (not type_filter or "error_get_range" in type_filter): if not self.error_get_ranges: self.error_get_ranges = [] self.error_get_ranges.append(error_get_range) elif event == 6: - error_commit = ErrorCommitInfo(bb, full_output=full_output) + error_commit = ErrorCommitInfo(bb, protocol_version, full_output=full_output) if (not type_filter or "error_commit" in type_filter): if not self.error_commits: self.error_commits = [] self.error_commits.append(error_commit) @@ -978,4 +979,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/design/special-key-space.md b/design/special-key-space.md index 15386de508..e6bc0796f6 100644 --- a/design/special-key-space.md +++ b/design/special-key-space.md @@ -7,7 +7,7 @@ Currently, there are several client functions implemented as FDB calls by passin - **cluster_file_path**: `get("\xff\xff/cluster_file_path)` - **connection_string**: `get("\xff\xff/connection_string)` - **worker_interfaces**: `getRange("\xff\xff/worker_interfaces", )` -- **conflicting-keys**: `getRange("\xff\xff/transaction/conflicting_keys/", "\xff\xff/transaction/conflicting_keys/\xff")` +- **conflicting_keys**: `getRange("\xff\xff/transaction/conflicting_keys/", "\xff\xff/transaction/conflicting_keys/\xff")` At present, implementions are hard-coded and the pain points are obvious: - **Maintainability**: As more features added, the hard-coded snippets are hard to maintain @@ -78,4 +78,21 @@ ASSERT( res2[0].value == LiteralStringRef("London") && res2[1].value == LiteralStringRef("Washington, 
D.C.") ); -``` \ No newline at end of file +``` + +## Module +We introduce this `module` concept after a [discussion](https://forums.foundationdb.org/t/versioning-of-special-key-space/2068) about cross-module reads on the special key space. By default, range reads that cover more than one module are not allowed and fail with a `special_keys_cross_module_read` error. In addition, range reads that touch no module fail with a `special_keys_no_module_found` error. The motivation is to avoid unexpected blocking or errors in a wide-scope range read. For example, suppose you write `getRange("A", "Z")` while all registered calls in `[A, Z)` happen locally, so your code has no error handling. If someone later registers a new call in `[A, Z)` that sometimes throws errors like `timed_out()`, your original code breaks. A `module` is like a top-level directory: inside a module, calls are homogeneous. So cross-range reads inside a single module are allowed by default, but cross-module reads are forbidden. Right now, the following modules are available: + +- TRANSACTION : `\xff\xff/transaction/, \xff\xff/transaction0`, all transaction-related information like *read_conflict_range*, *write_conflict_range*, *conflicting_keys* (all served locally). Right now we have: + - `\xff\xff/transaction/conflicting_keys/, \xff\xff/transaction/conflicting_keys0` : conflicting keys that caused conflicts + - `\xff\xff/transaction/read_conflict_range/, \xff\xff/transaction/read_conflict_range0` : read conflict ranges of the transaction + - `\xff\xff/transaction/write_conflict_range/, \xff\xff/transaction/write_conflict_range0` : write conflict ranges of the transaction +- METRICS: `\xff\xff/metrics/, \xff\xff/metrics0`, where all metrics, such as data-distribution metrics or health metrics, are planned to live. These all require an RPC, so `timed_out` errors may happen. Right now we have: + - `\xff\xff/metrics/data_distribution_stats/, \xff\xff/metrics/data_distribution_stats0` : stats about data distribution +- WORKERINTERFACE : `\xff\xff/worker_interfaces/, \xff\xff/worker_interfaces0`, which is kept compatible with the previous implementation and thus should not be used to add new functions. + +In addition, all singleKeyRanges are exposed as their own modules and cannot be reused. In particular, you should call `get`, not `getRange`, on these keys. 
Below are existing ones: + +- STATUSJSON : `\xff\xff/status/json` +- CONNECTIONSTRING : `\xff\xff/connection_string` +- CLUSTERFILEPATH : `\xff\xff/cluster_file_path` \ No newline at end of file diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 3b8df02949..31ccb629fc 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -284,8 +284,6 @@ }, "limiting_queue_bytes_storage_server":0, "worst_queue_bytes_storage_server":0, - "limiting_version_lag_storage_server":0, - "worst_version_lag_storage_server":0, "limiting_data_lag_storage_server":{ "versions":0, "seconds":0.0 diff --git a/fdbbackup/FileDecoder.actor.cpp b/fdbbackup/FileDecoder.actor.cpp index f7de6d70ff..f294f724c9 100644 --- a/fdbbackup/FileDecoder.actor.cpp +++ b/fdbbackup/FileDecoder.actor.cpp @@ -219,8 +219,9 @@ struct VersionedMutations { */ struct DecodeProgress { DecodeProgress() = default; - DecodeProgress(const LogFile& file, std::vector> values) - : file(file), keyValues(values) {} + template + DecodeProgress(const LogFile& file, U &&values) + : file(file), keyValues(std::forward(values)) {} // If there are no more mutations to pull from the file. // However, we could have unfinished version in the buffer when EOF is true, @@ -228,7 +229,7 @@ struct DecodeProgress { // should call getUnfinishedBuffer() to get these left data. bool finished() { return (eof && keyValues.empty()) || (leftover && !keyValues.empty()); } - std::vector>&& getUnfinishedBuffer() { return std::move(keyValues); } + std::vector>&& getUnfinishedBuffer() && { return std::move(keyValues); } // Returns all mutations of the next version in a batch. Future getNextBatch() { return getNextBatchImpl(this); } @@ -448,7 +449,7 @@ ACTOR Future decode_logs(DecodeParams params) { for (; i < logs.size(); i++) { if (logs[i].fileSize == 0) continue; - state DecodeProgress progress(logs[i], left); + state DecodeProgress progress(logs[i], std::move(left)); wait(progress.openFile(container)); while (!progress.finished()) { VersionedMutations vms = wait(progress.getNextBatch()); @@ -456,7 +457,7 @@ ACTOR Future decode_logs(DecodeParams params) { std::cout << vms.version << " " << m.toString() << "\n"; } } - left = progress.getUnfinishedBuffer(); + left = std::move(progress).getUnfinishedBuffer(); if (!left.empty()) { TraceEvent("UnfinishedFile").detail("File", logs[i].fileName).detail("Q", left.size()); } diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 54ef9fbb06..128470674d 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -63,7 +63,7 @@ using std::endl; #endif #endif -#include "fdbclient/IncludeVersions.h" +#include "fdbclient/versions.h" #include "flow/SimpleOpt.h" #include "flow/actorcompiler.h" // This must be the last #include. 
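An aside on the DecodeProgress change in fdbbackup/FileDecoder.actor.cpp above: the patch replaces the copying constructor with a forwarding constructor and makes `getUnfinishedBuffer()` rvalue-ref-qualified, so the leftover key/value buffer is moved between files instead of copied. A minimal standalone sketch of that pattern, using a hypothetical `Progress`/`Buffer` pair rather than the actual FDB classes, looks like this:

```cpp
#include <utility>
#include <vector>

using Buffer = std::vector<std::pair<int, int>>; // stand-in for the decoder's key/value buffer

struct Progress {
	Progress() = default;

	// Forwarding constructor: binds to lvalues (copied) and rvalues (moved),
	// like the new templated DecodeProgress constructor.
	template <class U>
	explicit Progress(U&& values) : keyValues(std::forward<U>(values)) {}

	// Rvalue-ref-qualified accessor: callable only on a temporary or an
	// explicitly moved-from object, which makes "steal the buffer" visible
	// at the call site.
	Buffer&& getUnfinishedBuffer() && { return std::move(keyValues); }

private:
	Buffer keyValues;
};

int main() {
	Buffer left = { { 1, 2 } };
	Progress progress(std::move(left));                // buffer moved in
	left = std::move(progress).getUnfinishedBuffer();  // buffer moved back out
	return left.size() == 1 ? 0 : 1;
}
```

The `&&` qualifier forces call sites to spell out `std::move(progress)`, which is exactly what the updated `decode_logs` loop does.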
@@ -593,9 +593,7 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = { { OPT_RESTORE_TIMESTAMP, "--timestamp", SO_REQ_SEP }, { OPT_KNOB, "--knob_", SO_REQ_SEP }, { OPT_RESTORECONTAINER,"-r", SO_REQ_SEP }, - { OPT_PREFIX_ADD, "-add_prefix", SO_REQ_SEP }, // TODO: Remove in 6.3 { OPT_PREFIX_ADD, "--add_prefix", SO_REQ_SEP }, - { OPT_PREFIX_REMOVE, "-remove_prefix", SO_REQ_SEP }, // TODO: Remove in 6.3 { OPT_PREFIX_REMOVE, "--remove_prefix", SO_REQ_SEP }, { OPT_TAGNAME, "-t", SO_REQ_SEP }, { OPT_TAGNAME, "--tagname", SO_REQ_SEP }, @@ -2709,7 +2707,13 @@ extern uint8_t *g_extra_memory; int main(int argc, char* argv[]) { platformInit(); - int status = FDB_EXIT_SUCCESS; + int status = FDB_EXIT_SUCCESS; + + std::string commandLine; + for(int a=0; a getTransaction(Database db, Reference& lc) { +void compGenerator(const char* text, bool help, std::vector& lc) { std::map::const_iterator iter; int len = strlen(text); @@ -2438,7 +2438,7 @@ void comp_generator(const char* text, bool help, std::vector& lc) { for (auto iter = helpMap.begin(); iter != helpMap.end(); ++iter) { const char* name = (*iter).first.c_str(); if (!strncmp(name, text, len)) { - lc.push_back( new_completion(help ? "help " : "", name) ); + lc.push_back( newCompletion(help ? "help " : "", name) ); } } @@ -2447,31 +2447,31 @@ void comp_generator(const char* text, bool help, std::vector& lc) { const char* name = *he; he++; if (!strncmp(name, text, len)) - lc.push_back( new_completion("help ", name) ); + lc.push_back( newCompletion("help ", name) ); } } } -void cmd_generator(const char* text, std::vector& lc) { - comp_generator(text, false, lc); +void cmdGenerator(const char* text, std::vector& lc) { + compGenerator(text, false, lc); } -void help_generator(const char* text, std::vector& lc) { - comp_generator(text, true, lc); +void helpGenerator(const char* text, std::vector& lc) { + compGenerator(text, true, lc); } -void option_generator(const char* text, const char *line, std::vector& lc) { +void optionGenerator(const char* text, const char *line, std::vector& lc) { int len = strlen(text); for (auto iter = validOptions.begin(); iter != validOptions.end(); ++iter) { const char* name = (*iter).c_str(); if (!strncmp(name, text, len)) { - lc.push_back( new_completion(line, name) ); + lc.push_back( newCompletion(line, name) ); } } } -void array_generator(const char* text, const char *line, const char** options, std::vector& lc) { +void arrayGenerator(const char* text, const char *line, const char** options, std::vector& lc) { const char** iter = options; int len = strlen(text); @@ -2479,32 +2479,57 @@ void array_generator(const char* text, const char *line, const char** options, s const char* name = *iter; iter++; if (!strncmp(name, text, len)) { - lc.push_back( new_completion(line, name) ); + lc.push_back( newCompletion(line, name) ); } } } -void onoff_generator(const char* text, const char *line, std::vector& lc) { - const char* opts[] = {"on", "off", NULL}; - array_generator(text, line, opts, lc); +void onOffGenerator(const char* text, const char *line, std::vector& lc) { + const char* opts[] = {"on", "off", nullptr}; + arrayGenerator(text, line, opts, lc); } -void configure_generator(const char* text, const char *line, std::vector& lc) { - const char* opts[] = {"new", "single", "double", "triple", "three_data_hall", "three_datacenter", "ssd", "ssd-1", "ssd-2", "memory", "memory-1", "memory-2", "memory-radixtree-beta", "proxies=", "logs=", "resolvers=", NULL}; - array_generator(text, line, opts, lc); +void configureGenerator(const char* text, 
const char *line, std::vector& lc) { + const char* opts[] = {"new", "single", "double", "triple", "three_data_hall", "three_datacenter", "ssd", "ssd-1", "ssd-2", "memory", "memory-1", "memory-2", "memory-radixtree-beta", "proxies=", "logs=", "resolvers=", nullptr}; + arrayGenerator(text, line, opts, lc); } -void status_generator(const char* text, const char *line, std::vector& lc) { - const char* opts[] = {"minimal", "details", "json", NULL}; - array_generator(text, line, opts, lc); +void statusGenerator(const char* text, const char *line, std::vector& lc) { + const char* opts[] = {"minimal", "details", "json", nullptr}; + arrayGenerator(text, line, opts, lc); } -void kill_generator(const char* text, const char *line, std::vector& lc) { - const char* opts[] = {"all", "list", NULL}; - array_generator(text, line, opts, lc); +void killGenerator(const char* text, const char *line, std::vector& lc) { + const char* opts[] = {"all", "list", nullptr}; + arrayGenerator(text, line, opts, lc); } -void fdbcli_comp_cmd(std::string const& text, std::vector& lc) { +void throttleGenerator(const char* text, const char *line, std::vector& lc, std::vector const& tokens) { + if(tokens.size() == 1) { + const char* opts[] = { "on tag", "off", "enable auto", "disable auto", "list", nullptr }; + arrayGenerator(text, line, opts, lc); + } + else if(tokens.size() >= 2 && tokencmp(tokens[1], "on")) { + if(tokens.size() == 2) { + const char* opts[] = { "tag", nullptr }; + arrayGenerator(text, line, opts, lc); + } + else if(tokens.size() == 6) { + const char* opts[] = { "default", "immediate", "batch", nullptr }; + arrayGenerator(text, line, opts, lc); + } + } + else if(tokens.size() >= 2 && tokencmp(tokens[1], "off") && !tokencmp(tokens[tokens.size()-1], "tag")) { + const char* opts[] = { "all", "auto", "manual", "tag", "default", "immediate", "batch", nullptr }; + arrayGenerator(text, line, opts, lc); + } + else if(tokens.size() == 2 && tokencmp(tokens[1], "enable") || tokencmp(tokens[1], "disable")) { + const char* opts[] = { "auto", nullptr }; + arrayGenerator(text, line, opts, lc); + } +} + +void fdbcliCompCmd(std::string const& text, std::vector& lc) { bool err, partial; std::string whole_line = text; auto parsed = parseLine(whole_line, err, partial); @@ -2531,37 +2556,102 @@ void fdbcli_comp_cmd(std::string const& text, std::vector& lc) { // printf("final text (%d tokens): `%s' & `%s'\n", count, base_input.c_str(), ntext.c_str()); if (!count) { - cmd_generator(ntext.c_str(), lc); + cmdGenerator(ntext.c_str(), lc); return; } if (tokencmp(tokens[0], "help") && count == 1) { - help_generator(ntext.c_str(), lc); + helpGenerator(ntext.c_str(), lc); return; } if (tokencmp(tokens[0], "option")) { if (count == 1) - onoff_generator(ntext.c_str(), base_input.c_str(), lc); + onOffGenerator(ntext.c_str(), base_input.c_str(), lc); if (count == 2) - option_generator(ntext.c_str(), base_input.c_str(), lc); + optionGenerator(ntext.c_str(), base_input.c_str(), lc); } if (tokencmp(tokens[0], "writemode") && count == 1) { - onoff_generator(ntext.c_str(), base_input.c_str(), lc); + onOffGenerator(ntext.c_str(), base_input.c_str(), lc); } if (tokencmp(tokens[0], "configure")) { - configure_generator(ntext.c_str(), base_input.c_str(), lc); + configureGenerator(ntext.c_str(), base_input.c_str(), lc); } if (tokencmp(tokens[0], "status") && count == 1) { - status_generator(ntext.c_str(), base_input.c_str(), lc); + statusGenerator(ntext.c_str(), base_input.c_str(), lc); } if (tokencmp(tokens[0], "kill") && count == 1) { - 
kill_generator(ntext.c_str(), base_input.c_str(), lc); + killGenerator(ntext.c_str(), base_input.c_str(), lc); } + + if (tokencmp(tokens[0], "throttle")) { + throttleGenerator(ntext.c_str(), base_input.c_str(), lc, tokens); + } +} + +std::vector throttleHintGenerator(std::vector const& tokens, bool inArgument) { + if(tokens.size() == 1) { + return { "", "[ARGS]" }; + } + else if(tokencmp(tokens[1], "on")) { + std::vector opts = { "tag", "", "[RATE]", "[DURATION]", "[default|immediate|batch]" }; + if(tokens.size() == 2) { + return opts; + } + else if(((tokens.size() == 3 && inArgument) || tokencmp(tokens[2], "tag")) && tokens.size() < 7) { + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } + } + else if(tokencmp(tokens[1], "off")) { + if(tokencmp(tokens[tokens.size()-1], "tag")) { + return { "" }; + } + else { + bool hasType = false; + bool hasTag = false; + bool hasPriority = false; + for(int i = 2; i < tokens.size(); ++i) { + if(tokencmp(tokens[i], "all") || tokencmp(tokens[i], "auto") || tokencmp(tokens[i], "manual")) { + hasType = true; + } + else if(tokencmp(tokens[i], "default") || tokencmp(tokens[i], "immediate") || tokencmp(tokens[i], "batch")) { + hasPriority = true; + } + else if(tokencmp(tokens[i], "tag")) { + hasTag = true; + ++i; + } + else { + return {}; + } + } + + std::vector options; + if(!hasType) { + options.push_back("[all|auto|manual]"); + } + if(!hasTag) { + options.push_back("[tag ]"); + } + if(!hasPriority) { + options.push_back("[default|immediate|batch]"); + } + + return options; + } + } + else if((tokencmp(tokens[1], "enable") || tokencmp(tokens[1], "disable")) && tokens.size() == 2) { + return { "auto" }; + } + else if(tokens.size() == 2 && inArgument) { + return { "[ARGS]" }; + } + + return std::vector(); } void LogCommand(std::string line, UID randomID, std::string errMsg) { @@ -3919,7 +4009,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { (int)(itr->tpsRate), std::min((int)(itr->expirationTime-now()), (int)(itr->initialDuration)), transactionPriorityToString(itr->priority, false), - itr->autoThrottled ? "auto" : "manual", + itr->throttleType == TagThrottleType::AUTO ? "auto" : "manual", itr->tag.toString().c_str()); } } @@ -3932,19 +4022,21 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { printf("There are no throttled tags\n"); } } - else if(tokencmp(tokens[1], "on") && tokens.size() <=6) { - if(tokens.size() < 4 || !tokencmp(tokens[2], "tag")) { - printf("Usage: throttle on tag [RATE] [DURATION]\n"); + else if(tokencmp(tokens[1], "on")) { + if(tokens.size() < 4 || !tokencmp(tokens[2], "tag") || tokens.size() > 7) { + printf("Usage: throttle on tag [RATE] [DURATION] [PRIORITY]\n"); printf("\n"); printf("Enables throttling for transactions with the specified tag.\n"); printf("An optional transactions per second rate can be specified (default 0).\n"); printf("An optional duration can be specified, which must include a time suffix (s, m, h, d) (default 1h).\n"); + printf("An optional priority can be specified. 
Choices are `default', `immediate', and `batch' (default `default').\n"); is_error = true; continue; } double tpsRate = 0.0; uint64_t duration = 3600; + TransactionPriority priority = TransactionPriority::DEFAULT; if(tokens.size() >= 5) { char *end; @@ -3968,70 +4060,145 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { continue; } duration = parsedDuration.get(); - } - if(duration == 0) { - printf("ERROR: throttle duration cannot be 0\n"); - is_error = true; - continue; + if(duration == 0) { + printf("ERROR: throttle duration cannot be 0\n"); + is_error = true; + continue; + } + } + if(tokens.size() == 7) { + if(tokens[6] == LiteralStringRef("default")) { + priority = TransactionPriority::DEFAULT; + } + else if(tokens[6] == LiteralStringRef("immediate")) { + priority = TransactionPriority::IMMEDIATE; + } + else if(tokens[6] == LiteralStringRef("batch")) { + priority = TransactionPriority::BATCH; + } + else { + printf("ERROR: unrecognized priority `%s'. Must be one of `default',\n `immediate', or `batch'.\n", tokens[6].toString().c_str()); + is_error = true; + continue; + } } TagSet tags; tags.addTag(tokens[3]); - wait(ThrottleApi::throttleTags(db, tags, tpsRate, duration, false, TransactionPriority::DEFAULT)); + wait(ThrottleApi::throttleTags(db, tags, tpsRate, duration, TagThrottleType::MANUAL, priority)); printf("Tag `%s' has been throttled\n", tokens[3].toString().c_str()); } else if(tokencmp(tokens[1], "off")) { - if(tokencmp(tokens[2], "tag") && tokens.size() == 4) { - TagSet tags; - tags.addTag(tokens[3]); - bool success = wait(ThrottleApi::unthrottleTags(db, tags, false, TransactionPriority::DEFAULT)); // TODO: Allow targeting priority and auto/manual - if(success) { - printf("Unthrottled tag `%s'\n", tokens[3].toString().c_str()); + int nextIndex = 2; + TagSet tags; + bool throttleTypeSpecified = false; + Optional throttleType = TagThrottleType::MANUAL; + Optional priority; + + if(tokens.size() == 2) { + is_error = true; + } + + while(nextIndex < tokens.size() && !is_error) { + if(tokencmp(tokens[nextIndex], "all")) { + if(throttleTypeSpecified) { + is_error = true; + continue; + } + throttleTypeSpecified = true; + throttleType = Optional(); + ++nextIndex; } - else { - printf("Tag `%s' was not throttled\n", tokens[3].toString().c_str()); + else if(tokencmp(tokens[nextIndex], "auto")) { + if(throttleTypeSpecified) { + is_error = true; + continue; + } + throttleTypeSpecified = true; + throttleType = TagThrottleType::AUTO; + ++nextIndex; + } + else if(tokencmp(tokens[nextIndex], "manual")) { + if(throttleTypeSpecified) { + is_error = true; + continue; + } + throttleTypeSpecified = true; + throttleType = TagThrottleType::MANUAL; + ++nextIndex; + } + else if(tokencmp(tokens[nextIndex], "default")) { + if(priority.present()) { + is_error = true; + continue; + } + priority = TransactionPriority::DEFAULT; + ++nextIndex; + } + else if(tokencmp(tokens[nextIndex], "immediate")) { + if(priority.present()) { + is_error = true; + continue; + } + priority = TransactionPriority::IMMEDIATE; + ++nextIndex; + } + else if(tokencmp(tokens[nextIndex], "batch")) { + if(priority.present()) { + is_error = true; + continue; + } + priority = TransactionPriority::BATCH; + ++nextIndex; + } + else if(tokencmp(tokens[nextIndex], "tag")) { + if(tags.size() > 0 || nextIndex == tokens.size()-1) { + is_error = true; + continue; + } + tags.addTag(tokens[nextIndex+1]); + nextIndex += 2; } } - else if(tokencmp(tokens[2], "all") && tokens.size() == 3) { - bool unthrottled = 
wait(ThrottleApi::unthrottleAll(db)); - if(unthrottled) { - printf("Unthrottled all tags\n"); + + if(!is_error) { + state const char *throttleTypeString = !throttleType.present() ? "" : (throttleType.get() == TagThrottleType::AUTO ? "auto-" : "manually "); + state std::string priorityString = priority.present() ? format(" at %s priority", transactionPriorityToString(priority.get(), false)) : ""; + + if(tags.size() > 0) { + bool success = wait(ThrottleApi::unthrottleTags(db, tags, throttleType, priority)); + if(success) { + printf("Unthrottled tag `%s'%s\n", tokens[3].toString().c_str(), priorityString.c_str()); + } + else { + printf("Tag `%s' was not %sthrottled%s\n", tokens[3].toString().c_str(), throttleTypeString, priorityString.c_str()); + } } else { - printf("There were no tags being throttled\n"); - } - } - else if(tokencmp(tokens[2], "auto") && tokens.size() == 3) { - bool unthrottled = wait(ThrottleApi::unthrottleAuto(db)); - if(unthrottled) { - printf("Unthrottled all auto-throttled tags\n"); - } - else { - printf("There were no tags being throttled\n"); - } - } - else if(tokencmp(tokens[2], "manual") && tokens.size() == 3) { - bool unthrottled = wait(ThrottleApi::unthrottleManual(db)); - if(unthrottled) { - printf("Unthrottled all manually throttled tags\n"); - } - else { - printf("There were no tags being throttled\n"); + bool unthrottled = wait(ThrottleApi::unthrottleAll(db, throttleType, priority)); + if(unthrottled) { + printf("Unthrottled all %sthrottled tags%s\n", throttleTypeString, priorityString.c_str()); + } + else { + printf("There were no tags being %sthrottled%s\n", throttleTypeString, priorityString.c_str()); + } } } else { - printf("Usage: throttle off [TAG]\n"); + printf("Usage: throttle off [all|auto|manual] [tag ] [PRIORITY]\n"); printf("\n"); - printf("Disables throttling for the specified tag(s).\n"); - printf("Use `all' to turn off all tag throttles, `auto' to turn off throttles created by\n"); - printf("the cluster, and `manual' to turn off throttles created manually. Use `tag '\n"); - printf("to turn off throttles for a specific tag\n"); - is_error = true; + printf("Disables throttling for throttles matching the specified filters. At least one filter must be used.\n\n"); + printf("An optional qualifier `all', `auto', or `manual' can be used to specify the type of throttle\n"); + printf("affected. `all' targets all throttles, `auto' targets those created by the cluster, and\n"); + printf("`manual' targets those created manually (default `manual').\n\n"); + printf("The `tag' filter can be use to turn off only a specific tag.\n\n"); + printf("The priority filter can be used to turn off only throttles at specific priorities. Choices are\n"); + printf("`default', `immediate', or `batch'. 
By default, all priorities are targeted.\n"); } } - else if((tokencmp(tokens[1], "enable") || tokencmp(tokens[1], "disable")) && tokens.size() == 3 && tokencmp(tokens[2], "auto")) { + else if(tokencmp(tokens[1], "enable") || tokencmp(tokens[1], "disable")) { if(tokens.size() != 3 || !tokencmp(tokens[2], "auto")) { printf("Usage: throttle auto\n"); printf("\n"); @@ -4077,7 +4244,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { ACTOR Future runCli(CLIOptions opt) { state LineNoise linenoise( [](std::string const& line, std::vector& completions) { - fdbcli_comp_cmd(line, completions); + fdbcliCompCmd(line, completions); }, [enabled=opt.cliHints](std::string const& line)->LineNoise::Hint { if (!enabled) { @@ -4098,18 +4265,32 @@ ACTOR Future runCli(CLIOptions opt) { // being entered. if (error && line.back() != '\\') return LineNoise::Hint(std::string(" {malformed escape sequence}"), 90, false); - auto iter = helpMap.find(command.toString()); - if (iter != helpMap.end()) { - std::string helpLine = iter->second.usage; - std::vector> parsedHelp = parseLine(helpLine, error, partial); - std::string hintLine = (*(line.end() - 1) == ' ' ? "" : " "); - for (int i = finishedParameters; i < parsedHelp.back().size(); i++) { - hintLine = hintLine + parsedHelp.back()[i].toString() + " "; + bool inArgument = *(line.end() - 1) != ' '; + std::string hintLine = inArgument ? " " : ""; + if(tokencmp(command, "throttle")) { + std::vector hintItems = throttleHintGenerator(parsed.back(), inArgument); + if(hintItems.empty()) { + return LineNoise::Hint(); + } + for(auto item : hintItems) { + hintLine = hintLine + item + " "; } - return LineNoise::Hint(hintLine, 90, false); - } else { - return LineNoise::Hint(); } + else { + auto iter = helpMap.find(command.toString()); + if(iter != helpMap.end()) { + std::string helpLine = iter->second.usage; + std::vector> parsedHelp = parseLine(helpLine, error, partial); + for (int i = finishedParameters; i < parsedHelp.back().size(); i++) { + hintLine = hintLine + parsedHelp.back()[i].toString() + " "; + } + } + else { + return LineNoise::Hint(); + } + } + + return LineNoise::Hint(hintLine, 90, false); }, 1000, false); diff --git a/fdbclient/ClientLogEvents.h b/fdbclient/ClientLogEvents.h index 614c8cf7c7..67c4ba64d0 100644 --- a/fdbclient/ClientLogEvents.h +++ b/fdbclient/ClientLogEvents.h @@ -44,19 +44,28 @@ namespace FdbClientLogEvents { }; struct Event { - Event(EventType t, double ts) : type(t), startTs(ts) { } + Event(EventType t, double ts, const Optional> &dc) : type(t), startTs(ts){ + if (dc.present()) + dcId = dc.get(); + } Event() { } - template Ar& serialize(Ar &ar) { return serializer(ar, type, startTs); } + template Ar& serialize(Ar &ar) { + if (ar.protocolVersion().version() >= (uint64_t) 0x0FDB00B063010001LL) { + return serializer(ar, type, startTs, dcId); + } else { + return serializer(ar, type, startTs); + } + } EventType type{ EVENTTYPEEND }; double startTs{ 0 }; + Key dcId{}; void logEvent(std::string id, int maxFieldLength) const {} }; struct EventGetVersion : public Event { - EventGetVersion(double ts, double lat) : Event(GET_VERSION_LATENCY, ts), latency(lat) { } EventGetVersion() { } template Ar& serialize(Ar &ar) { @@ -77,22 +86,6 @@ namespace FdbClientLogEvents { // Version V2 of EventGetVersion starting at 6.2 struct EventGetVersion_V2 : public Event { - EventGetVersion_V2(double ts, double lat, TransactionPriority priority) : Event(GET_VERSION_LATENCY, ts), latency(lat) { - switch(priority) { - // Unfortunately, the enum serialized here 
disagrees with the enum used elsewhere for the values used by each priority - case TransactionPriority::IMMEDIATE: - priorityType = PRIORITY_IMMEDIATE; - break; - case TransactionPriority::DEFAULT: - priorityType = PRIORITY_DEFAULT; - break; - case TransactionPriority::BATCH: - priorityType = PRIORITY_BATCH; - break; - default: - ASSERT(false); - } - } EventGetVersion_V2() { } template Ar& serialize(Ar &ar) { @@ -115,7 +108,7 @@ namespace FdbClientLogEvents { // Version V3 of EventGetVersion starting at 6.3 struct EventGetVersion_V3 : public Event { - EventGetVersion_V3(double ts, double lat, TransactionPriority priority, Version version) : Event(GET_VERSION_LATENCY, ts), latency(lat), readVersion(version) { + EventGetVersion_V3(double ts, const Optional> &dcId, double lat, TransactionPriority priority, Version version) : Event(GET_VERSION_LATENCY, ts, dcId), latency(lat), readVersion(version) { switch(priority) { // Unfortunately, the enum serialized here disagrees with the enum used elsewhere for the values used by each priority case TransactionPriority::IMMEDIATE: @@ -154,7 +147,7 @@ namespace FdbClientLogEvents { }; struct EventGet : public Event { - EventGet(double ts, double lat, int size, const KeyRef &in_key) : Event(GET_LATENCY, ts), latency(lat), valueSize(size), key(in_key) { } + EventGet(double ts, const Optional> &dcId, double lat, int size, const KeyRef &in_key) : Event(GET_LATENCY, ts, dcId), latency(lat), valueSize(size), key(in_key) { } EventGet() { } template Ar& serialize(Ar &ar) { @@ -180,7 +173,7 @@ namespace FdbClientLogEvents { }; struct EventGetRange : public Event { - EventGetRange(double ts, double lat, int size, const KeyRef &start_key, const KeyRef & end_key) : Event(GET_RANGE_LATENCY, ts), latency(lat), rangeSize(size), startKey(start_key), endKey(end_key) { } + EventGetRange(double ts, const Optional> &dcId, double lat, int size, const KeyRef &start_key, const KeyRef & end_key) : Event(GET_RANGE_LATENCY, ts, dcId), latency(lat), rangeSize(size), startKey(start_key), endKey(end_key) { } EventGetRange() { } template Ar& serialize(Ar &ar) { @@ -208,7 +201,6 @@ namespace FdbClientLogEvents { }; struct EventCommit : public Event { - EventCommit(double ts, double lat, int mut, int bytes, const CommitTransactionRequest &commit_req) : Event(COMMIT_LATENCY, ts), latency(lat), numMutations(mut), commitBytes(bytes), req(commit_req) { } EventCommit() { } template Ar& serialize(Ar &ar) { @@ -260,8 +252,8 @@ namespace FdbClientLogEvents { // Version V2 of EventGetVersion starting at 6.3 struct EventCommit_V2 : public Event { - EventCommit_V2(double ts, double lat, int mut, int bytes, Version version, const CommitTransactionRequest &commit_req) - : Event(COMMIT_LATENCY, ts), latency(lat), numMutations(mut), commitBytes(bytes), commitVersion(version), req(commit_req) { } + EventCommit_V2(double ts, const Optional> &dcId, double lat, int mut, int bytes, Version version, const CommitTransactionRequest &commit_req) + : Event(COMMIT_LATENCY, ts, dcId), latency(lat), numMutations(mut), commitBytes(bytes), commitVersion(version), req(commit_req) { } EventCommit_V2() { } template Ar& serialize(Ar &ar) { @@ -314,7 +306,7 @@ namespace FdbClientLogEvents { }; struct EventGetError : public Event { - EventGetError(double ts, int err_code, const KeyRef &in_key) : Event(ERROR_GET, ts), errCode(err_code), key(in_key) { } + EventGetError(double ts, const Optional> &dcId, int err_code, const KeyRef &in_key) : Event(ERROR_GET, ts, dcId), errCode(err_code), key(in_key) { } EventGetError() { } 
template Ar& serialize(Ar &ar) { @@ -338,7 +330,7 @@ namespace FdbClientLogEvents { }; struct EventGetRangeError : public Event { - EventGetRangeError(double ts, int err_code, const KeyRef &start_key, const KeyRef & end_key) : Event(ERROR_GET_RANGE, ts), errCode(err_code), startKey(start_key), endKey(end_key) { } + EventGetRangeError(double ts, const Optional> &dcId, int err_code, const KeyRef &start_key, const KeyRef & end_key) : Event(ERROR_GET_RANGE, ts, dcId), errCode(err_code), startKey(start_key), endKey(end_key) { } EventGetRangeError() { } template Ar& serialize(Ar &ar) { @@ -364,7 +356,7 @@ namespace FdbClientLogEvents { }; struct EventCommitError : public Event { - EventCommitError(double ts, int err_code, const CommitTransactionRequest &commit_req) : Event(ERROR_COMMIT, ts), errCode(err_code), req(commit_req) { } + EventCommitError(double ts, const Optional> &dcId, int err_code, const CommitTransactionRequest &commit_req) : Event(ERROR_COMMIT, ts, dcId), errCode(err_code), req(commit_req) { } EventCommitError() { } template Ar& serialize(Ar &ar) { diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 5a3ace0532..7d4e450c30 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -1019,6 +1019,21 @@ struct HealthMetrics { } }; +struct DDMetricsRef { + int64_t shardBytes; + KeyRef beginKey; + + DDMetricsRef() : shardBytes(0) {} + DDMetricsRef(int64_t bytes, KeyRef begin) : shardBytes(bytes), beginKey(begin) {} + DDMetricsRef(Arena& a, const DDMetricsRef& copyFrom) + : shardBytes(copyFrom.shardBytes), beginKey(a, copyFrom.beginKey) {} + + template + void serialize(Ar& ar) { + serializer(ar, shardBytes, beginKey); + } +}; + struct WorkerBackupStatus { LogEpoch epoch; Version version; diff --git a/fdbclient/IncludeVersions.h b/fdbclient/IncludeVersions.h deleted file mode 100644 index 66bdccf43d..0000000000 --- a/fdbclient/IncludeVersions.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * IncludeVersions.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// This is a simple header to isolate the stupidity that results out of two -// build systems and versions.h include directives - -#if defined(CMAKE_BUILD) -# include "fdbclient/versions.h" -#elif !defined(WIN32) -# include "versions.h" -#endif diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index a7d62ad684..c334d3c500 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -92,6 +92,7 @@ void ClientKnobs::initialize(bool randomize) { init( STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, 15.0 ); init( AGGREGATE_HEALTH_METRICS_MAX_STALENESS, 0.5 ); init( DETAILED_HEALTH_METRICS_MAX_STALENESS, 5.0 ); + init( TAG_ENCODE_KEY_SERVERS, false ); if( randomize && BUGGIFY ) TAG_ENCODE_KEY_SERVERS = true; //KeyRangeMap init( KRM_GET_RANGE_LIMIT, 1e5 ); if( randomize && BUGGIFY ) KRM_GET_RANGE_LIMIT = 10; diff --git a/fdbclient/Knobs.h b/fdbclient/Knobs.h index a28c05e19a..31919811f3 100644 --- a/fdbclient/Knobs.h +++ b/fdbclient/Knobs.h @@ -85,6 +85,7 @@ public: double STORAGE_METRICS_TOO_MANY_SHARDS_DELAY; double AGGREGATE_HEALTH_METRICS_MAX_STALENESS; double DETAILED_HEALTH_METRICS_MAX_STALENESS; + bool TAG_ENCODE_KEY_SERVERS; //KeyRangeMap int KRM_GET_RANGE_LIMIT; diff --git a/fdbclient/MasterProxyInterface.h b/fdbclient/MasterProxyInterface.h index abb3069490..7216015535 100644 --- a/fdbclient/MasterProxyInterface.h +++ b/fdbclient/MasterProxyInterface.h @@ -42,7 +42,6 @@ struct MasterProxyInterface { Optional processId; bool provisional; - Endpoint base; RequestStream< struct CommitTransactionRequest > commit; RequestStream< struct GetReadVersionRequest > getConsistentReadVersion; // Returns a version which (1) is committed, and (2) is >= the latest version reported committed (by a commit response) when this request was sent // (at some point between when this request is sent and when its response is received, the latest version reported committed) @@ -56,6 +55,7 @@ struct MasterProxyInterface { RequestStream< struct GetHealthMetricsRequest > getHealthMetrics; RequestStream< struct ProxySnapRequest > proxySnapReq; RequestStream< struct ExclusionSafetyCheckRequest > exclusionSafetyCheckReq; + RequestStream< struct GetDDMetricsRequest > getDDMetrics; UID id() const { return commit.getEndpoint().token; } std::string toString() const { return id().shortString(); } @@ -65,18 +65,18 @@ struct MasterProxyInterface { template void serialize(Archive& ar) { - serializer(ar, processId, provisional, base); + serializer(ar, processId, provisional, commit); if( Archive::isDeserializing ) { - commit = RequestStream< struct CommitTransactionRequest >( base.getAdjustedEndpoint(0) ); - getConsistentReadVersion = RequestStream< struct GetReadVersionRequest >( base.getAdjustedEndpoint(1) ); - getKeyServersLocations = RequestStream< struct GetKeyServerLocationsRequest >( base.getAdjustedEndpoint(2) ); - getStorageServerRejoinInfo = RequestStream< struct GetStorageServerRejoinInfoRequest >( base.getAdjustedEndpoint(3) ); - waitFailure = RequestStream>( base.getAdjustedEndpoint(4) ); - getRawCommittedVersion = RequestStream< struct GetRawCommittedVersionRequest >( base.getAdjustedEndpoint(5) ); - txnState = RequestStream< struct TxnStateRequest >( base.getAdjustedEndpoint(6) ); - getHealthMetrics = RequestStream< struct GetHealthMetricsRequest >( base.getAdjustedEndpoint(7) ); - proxySnapReq = RequestStream< struct ProxySnapRequest >( base.getAdjustedEndpoint(8) ); - exclusionSafetyCheckReq = RequestStream< struct ExclusionSafetyCheckRequest >( base.getAdjustedEndpoint(9) ); + getConsistentReadVersion = 
RequestStream< struct GetReadVersionRequest >( commit.getEndpoint().getAdjustedEndpoint(1) ); + getKeyServersLocations = RequestStream< struct GetKeyServerLocationsRequest >( commit.getEndpoint().getAdjustedEndpoint(2) ); + getStorageServerRejoinInfo = RequestStream< struct GetStorageServerRejoinInfoRequest >( commit.getEndpoint().getAdjustedEndpoint(3) ); + waitFailure = RequestStream>( commit.getEndpoint().getAdjustedEndpoint(4) ); + getRawCommittedVersion = RequestStream< struct GetRawCommittedVersionRequest >( commit.getEndpoint().getAdjustedEndpoint(5) ); + txnState = RequestStream< struct TxnStateRequest >( commit.getEndpoint().getAdjustedEndpoint(6) ); + getHealthMetrics = RequestStream< struct GetHealthMetricsRequest >( commit.getEndpoint().getAdjustedEndpoint(7) ); + proxySnapReq = RequestStream< struct ProxySnapRequest >( commit.getEndpoint().getAdjustedEndpoint(8) ); + exclusionSafetyCheckReq = RequestStream< struct ExclusionSafetyCheckRequest >( commit.getEndpoint().getAdjustedEndpoint(9) ); + getDDMetrics = RequestStream< struct GetDDMetricsRequest >( commit.getEndpoint().getAdjustedEndpoint(10) ); } } @@ -92,7 +92,8 @@ struct MasterProxyInterface { streams.push_back(getHealthMetrics.getReceiver()); streams.push_back(proxySnapReq.getReceiver()); streams.push_back(exclusionSafetyCheckReq.getReceiver()); - base = FlowTransport::transport().addEndpoints(streams); + streams.push_back(getDDMetrics.getReceiver()); + FlowTransport::transport().addEndpoints(streams); } }; @@ -391,6 +392,34 @@ struct GetHealthMetricsRequest } }; +struct GetDDMetricsReply +{ + constexpr static FileIdentifier file_identifier = 7277713; + Standalone> storageMetricsList; + + GetDDMetricsReply() {} + + template + void serialize(Ar& ar) { + serializer(ar, storageMetricsList); + } +}; + +struct GetDDMetricsRequest { + constexpr static FileIdentifier file_identifier = 14536812; + KeyRange keys; + int shardLimit; + ReplyPromise reply; + + GetDDMetricsRequest() {} + explicit GetDDMetricsRequest(KeyRange const& keys, const int shardLimit) : keys(keys), shardLimit(shardLimit) {} + + template + void serialize(Ar& ar) { + serializer(ar, keys, shardLimit, reply); + } +}; + struct ProxySnapRequest { constexpr static FileIdentifier file_identifier = 22204900; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 72ab5d2878..25899fcfda 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -49,7 +49,7 @@ #include "flow/TLSConfig.actor.h" #include "flow/UnitTest.h" -#include "fdbclient/IncludeVersions.h" +#include "fdbclient/versions.h" #ifdef WIN32 #define WIN32_LEAN_AND_MEAN @@ -607,6 +607,8 @@ DatabaseContext::DatabaseContext(Reference(conflictingKeysRange)); registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION, std::make_unique(readConflictRangeKeysRange)); registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION, std::make_unique(writeConflictRangeKeysRange)); + registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::METRICS, + std::make_unique(ddStatsRange)); registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::WORKERINTERFACE, std::make_unique(KeyRangeRef( LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")))); registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::STATUSJSON, std::make_unique( @@ -738,7 +740,7 @@ Reference DatabaseContext::setCachedLocation( const KeyRangeRef& k locationCache.insert( KeyRangeRef(begin, end), Reference() ); } locationCache.insert( keys, loc ); - return 
std::move(loc); + return loc; } void DatabaseContext::invalidateCache( const KeyRef& key, bool isBackward ) { @@ -1518,7 +1520,7 @@ ACTOR Future> getValue( Future version, Key key, Databa cx->readLatencies.addSample(latency); if (trLogInfo) { int valueSize = reply.value.present() ? reply.value.get().size() : 0; - trLogInfo->addLog(FdbClientLogEvents::EventGet(startTimeD, latency, valueSize, key)); + trLogInfo->addLog(FdbClientLogEvents::EventGet(startTimeD, cx->clientLocality.dcId(), latency, valueSize, key)); } cx->getValueCompleted->latency = timer_int() - startTime; cx->getValueCompleted->log(); @@ -1550,7 +1552,7 @@ ACTOR Future> getValue( Future version, Key key, Databa wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, info.taskID)); } else { if (trLogInfo) - trLogInfo->addLog(FdbClientLogEvents::EventGetError(startTimeD, static_cast(e.code()), key)); + trLogInfo->addLog(FdbClientLogEvents::EventGetError(startTimeD, cx->clientLocality.dcId(), static_cast(e.code()), key)); throw e; } } @@ -1955,7 +1957,7 @@ void getRangeFinished(Database cx, Reference trLogInfo, doub cx->transactionKeysRead += result.size(); if( trLogInfo ) { - trLogInfo->addLog(FdbClientLogEvents::EventGetRange(startTime, now()-startTime, bytes, begin.getKey(), end.getKey())); + trLogInfo->addLog(FdbClientLogEvents::EventGetRange(startTime, cx->clientLocality.dcId(), now()-startTime, bytes, begin.getKey(), end.getKey())); } if( !snapshot ) { @@ -2195,7 +2197,7 @@ ACTOR Future> getRange( Database cx, ReferenceWRONG_SHARD_SERVER_DELAY, info.taskID)); } else { if (trLogInfo) - trLogInfo->addLog(FdbClientLogEvents::EventGetRangeError(startTime, static_cast(e.code()), begin.getKey(), end.getKey())); + trLogInfo->addLog(FdbClientLogEvents::EventGetRangeError(startTime, cx->clientLocality.dcId(), static_cast(e.code()), begin.getKey(), end.getKey())); throw e; } @@ -2449,7 +2451,7 @@ ACTOR Future< Key > getKeyAndConflictRange( conflictRange.send( std::make_pair( rep, k.orEqual ? keyAfter( k.getKey() ) : Key(k.getKey(), k.arena()) ) ); else conflictRange.send( std::make_pair( k.orEqual ? 
keyAfter( k.getKey() ) : Key(k.getKey(), k.arena()), keyAfter( rep ) ) ); - return std::move(rep); + return rep; } catch( Error&e ) { conflictRange.send(std::make_pair(Key(), Key())); throw; @@ -2975,7 +2977,7 @@ ACTOR static Future tryCommit( Database cx, Reference cx->commitLatencies.addSample(latency); cx->latencies.addSample(now() - tr->startTime); if (trLogInfo) - trLogInfo->addLog(FdbClientLogEvents::EventCommit_V2(startTime, latency, req.transaction.mutations.size(), req.transaction.mutations.expectedSize(), ci.version, req)); + trLogInfo->addLog(FdbClientLogEvents::EventCommit_V2(startTime, cx->clientLocality.dcId(), latency, req.transaction.mutations.size(), req.transaction.mutations.expectedSize(), ci.version, req)); return Void(); } else { // clear the RYW transaction which contains previous conflicting keys @@ -3038,7 +3040,7 @@ ACTOR static Future tryCommit( Database cx, Reference TraceEvent(SevError, "TryCommitError").error(e); } if (trLogInfo) - trLogInfo->addLog(FdbClientLogEvents::EventCommitError(startTime, static_cast(e.code()), req)); + trLogInfo->addLog(FdbClientLogEvents::EventCommitError(startTime, cx->clientLocality.dcId(), static_cast(e.code()), req)); throw; } } @@ -3449,7 +3451,7 @@ ACTOR Future extractReadVersion(DatabaseContext* cx, TransactionPriorit double latency = now() - startTime; cx->GRVLatencies.addSample(latency); if (trLogInfo) - trLogInfo->addLog(FdbClientLogEvents::EventGetVersion_V3(startTime, latency, priority, rep.version)); + trLogInfo->addLog(FdbClientLogEvents::EventGetVersion_V3(startTime, cx->clientLocality.dcId(), latency, priority, rep.version)); if (rep.version == 1 && rep.locked) { throw proxy_memory_limit_exceeded(); } @@ -3858,6 +3860,25 @@ Future< StorageMetrics > Transaction::getStorageMetrics( KeyRange const& keys, i } } +ACTOR Future>> waitDataDistributionMetricsList(Database cx, KeyRange keys, + int shardLimit) { + state Future clientTimeout = delay(5.0); + loop { + choose { + when(wait(cx->onMasterProxiesChanged())) {} + when(ErrorOr rep = + wait(errorOr(basicLoadBalance(cx->getMasterProxies(false), &MasterProxyInterface::getDDMetrics, + GetDDMetricsRequest(keys, shardLimit))))) { + if (rep.isError()) { + throw rep.getError(); + } + return rep.get().storageMetricsList; + } + when(wait(clientTimeout)) { throw timed_out(); } + } + } +} + Future>> Transaction::getReadHotRanges(KeyRange const& keys) { return ::getReadHotRanges(cx, keys); } diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index b32e980c85..ac252345fc 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -330,6 +330,8 @@ private: }; ACTOR Future waitForCommittedVersion(Database cx, Version version); +ACTOR Future>> waitDataDistributionMetricsList(Database cx, KeyRange keys, + int shardLimit); std::string unprintable( const std::string& ); diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 74b410efb2..6111ed1114 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -312,8 +312,6 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( }, "limiting_queue_bytes_storage_server":0, "worst_queue_bytes_storage_server":0, - "limiting_version_lag_storage_server":0, - "worst_version_lag_storage_server":0, "limiting_data_lag_storage_server":{ "versions":0, "seconds":0.0 diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 86bc2a6c49..00e05118fe 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -29,7 +29,9 
@@ std::unordered_map SpecialKeySpace::moduleToB KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")) }, { SpecialKeySpace::MODULE::STATUSJSON, singleKeyRange(LiteralStringRef("\xff\xff/status/json")) }, { SpecialKeySpace::MODULE::CONNECTIONSTRING, singleKeyRange(LiteralStringRef("\xff\xff/connection_string")) }, - { SpecialKeySpace::MODULE::CLUSTERFILEPATH, singleKeyRange(LiteralStringRef("\xff\xff/cluster_file_path")) } + { SpecialKeySpace::MODULE::CLUSTERFILEPATH, singleKeyRange(LiteralStringRef("\xff\xff/cluster_file_path")) }, + { SpecialKeySpace::MODULE::METRICS, + KeyRangeRef(LiteralStringRef("\xff\xff/metrics/"), LiteralStringRef("\xff\xff/metrics0")) } }; // This function will move the given KeySelector as far as possible to the standard form: @@ -164,7 +166,6 @@ SpecialKeySpace::getRangeAggregationActor(SpecialKeySpace* sks, Reference lastModuleRead; wait(normalizeKeySelectorActor(sks, ryw, &begin, &lastModuleRead, &actualBeginOffset, &result)); - // TODO : check if end the boundary of a module wait(normalizeKeySelectorActor(sks, ryw, &end, &lastModuleRead, &actualEndOffset, &result)); // Handle all corner cases like what RYW does // return if range inverted @@ -314,6 +315,37 @@ Future> ConflictingKeysImpl::getRange(Reference> ddStatsGetRangeActor(Reference ryw, + KeyRangeRef kr) { + try { + auto keys = kr.removePrefix(ddStatsRange.begin); + Standalone> resultWithoutPrefix = + wait(waitDataDistributionMetricsList(ryw->getDatabase(), keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT)); + Standalone result; + for (const auto& ddMetricsRef : resultWithoutPrefix) { + // each begin key is the previous end key, thus we only encode the begin key in the result + KeyRef beginKey = ddMetricsRef.beginKey.withPrefix(ddStatsRange.begin, result.arena()); + // Use json string encoded in utf-8 to encode the values, easy for adding more fields in the future + json_spirit::mObject statsObj; + statsObj["ShardBytes"] = ddMetricsRef.shardBytes; + std::string statsString = + json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8); + ValueRef bytes(result.arena(), statsString); + result.push_back(result.arena(), KeyValueRef(beginKey, bytes)); + } + return result; + } catch (Error& e) { + throw; + } +} + +DDStatsRangeImpl::DDStatsRangeImpl(KeyRangeRef kr) : SpecialKeyRangeBaseImpl(kr) {} + +Future> DDStatsRangeImpl::getRange(Reference ryw, + KeyRangeRef kr) const { + return ddStatsGetRangeActor(ryw, kr); +} + class SpecialKeyRangeTestImpl : public SpecialKeyRangeBaseImpl { public: explicit SpecialKeyRangeTestImpl(KeyRangeRef kr, const std::string& prefix, int size) diff --git a/fdbclient/SpecialKeySpace.actor.h b/fdbclient/SpecialKeySpace.actor.h index 45ed78cc00..a7b03a4ff5 100644 --- a/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/SpecialKeySpace.actor.h @@ -51,13 +51,14 @@ protected: class SpecialKeySpace { public: enum class MODULE { - UNKNOWN, // default value for all unregistered range - TESTONLY, // only used by correctness tests - TRANSACTION, - WORKERINTERFACE, - STATUSJSON, CLUSTERFILEPATH, - CONNECTIONSTRING + CONNECTIONSTRING, + METRICS, // data-distribution metrics + TESTONLY, // only used by correctness tests + TRANSACTION, // transaction related info, conflicting keys, read/write conflict range + STATUSJSON, + UNKNOWN, // default value for all unregistered range + WORKERINTERFACE, }; Future> get(Reference ryw, const Key& key); @@ -152,5 +153,12 @@ public: KeyRangeRef kr) const override; }; 
+class DDStatsRangeImpl : public SpecialKeyRangeBaseImpl { +public: + explicit DDStatsRangeImpl(KeyRangeRef kr); + Future> getRange(Reference ryw, + KeyRangeRef kr) const override; +}; + #include "flow/unactorcompiler.h" #endif diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 575d234159..b8970c86e0 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -54,7 +54,6 @@ struct StorageServerInterface { LocalityData locality; UID uniqueID; - Endpoint base; RequestStream getValue; RequestStream getKey; @@ -87,20 +86,19 @@ struct StorageServerInterface { // versioned carefully! if (ar.protocolVersion().hasSmallEndpoints()) { - serializer(ar, uniqueID, locality, base); + serializer(ar, uniqueID, locality, getValue); if( Ar::isDeserializing ) { - getValue = RequestStream( base.getAdjustedEndpoint(0) ); - getKey = RequestStream( base.getAdjustedEndpoint(1) ); - getKeyValues = RequestStream( base.getAdjustedEndpoint(2) ); - getShardState = RequestStream( base.getAdjustedEndpoint(3) ); - waitMetrics = RequestStream( base.getAdjustedEndpoint(4) ); - splitMetrics = RequestStream( base.getAdjustedEndpoint(5) ); - getStorageMetrics = RequestStream( base.getAdjustedEndpoint(6) ); - waitFailure = RequestStream>( base.getAdjustedEndpoint(7) ); - getQueuingMetrics = RequestStream( base.getAdjustedEndpoint(8) ); - getKeyValueStoreType = RequestStream>( base.getAdjustedEndpoint(9) ); - watchValue = RequestStream( base.getAdjustedEndpoint(10) ); - getReadHotRanges = RequestStream( base.getAdjustedEndpoint(11) ); + getKey = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(1) ); + getKeyValues = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(2) ); + getShardState = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(3) ); + waitMetrics = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(4) ); + splitMetrics = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(5) ); + getStorageMetrics = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(6) ); + waitFailure = RequestStream>( getValue.getEndpoint().getAdjustedEndpoint(7) ); + getQueuingMetrics = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(8) ); + getKeyValueStoreType = RequestStream>( getValue.getEndpoint().getAdjustedEndpoint(9) ); + watchValue = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(10) ); + getReadHotRanges = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(11) ); } } else { ASSERT(Ar::isDeserializing); @@ -110,7 +108,6 @@ struct StorageServerInterface { serializer(ar, uniqueID, locality, getValue, getKey, getKeyValues, getShardState, waitMetrics, splitMetrics, getStorageMetrics, waitFailure, getQueuingMetrics, getKeyValueStoreType); if (ar.protocolVersion().hasWatches()) serializer(ar, watchValue); - base = getValue.getEndpoint(); } } bool operator == (StorageServerInterface const& s) const { return uniqueID == s.uniqueID; } @@ -129,7 +126,7 @@ struct StorageServerInterface { streams.push_back(getKeyValueStoreType.getReceiver()); streams.push_back(watchValue.getReceiver()); streams.push_back(getReadHotRanges.getReceiver()); - base = FlowTransport::transport().addEndpoints(streams); + FlowTransport::transport().addEndpoints(streams); } }; @@ -320,6 +317,8 @@ struct GetShardStateRequest { struct StorageMetrics { constexpr static FileIdentifier file_identifier = 13622226; int64_t bytes = 0; // total storage + // FIXME: currently, neither of bytesPerKSecond or iosPerKSecond are actually used in 
DataDistribution calculations. + // This may change in the future, but this comment is left here to avoid any confusion for the time being. int64_t bytesPerKSecond = 0; // network bandwidth (average over 10s) int64_t iosPerKSecond = 0; int64_t bytesReadPerKSecond = 0; diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index a8cc2851e1..7c27227483 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -46,6 +46,11 @@ const KeyRef keyServersKey( const KeyRef& k, Arena& arena ) { return k.withPrefix( keyServersPrefix, arena ); } const Value keyServersValue( Standalone result, const std::vector& src, const std::vector& dest ) { + if(!CLIENT_KNOBS->TAG_ENCODE_KEY_SERVERS) { + BinaryWriter wr(IncludeVersion()); wr << src << dest; + return wr.toValue(); + } + std::vector srcTag; std::vector destTag; @@ -203,6 +208,9 @@ const KeyRangeRef writeConflictRangeKeysRange = KeyRangeRef(LiteralStringRef("\xff\xff/transaction/write_conflict_range/"), LiteralStringRef("\xff\xff/transaction/write_conflict_range/\xff\xff")); +const KeyRangeRef ddStatsRange = KeyRangeRef(LiteralStringRef("\xff\xff/metrics/data_distribution_stats/"), + LiteralStringRef("\xff\xff/metrics/data_distribution_stats/\xff\xff")); + // "\xff/storageCache/[[begin]]" := "[[vector]]" const KeyRangeRef storageCacheKeys( LiteralStringRef("\xff/storageCache/"), LiteralStringRef("\xff/storageCache0") ); const KeyRef storageCachePrefix = storageCacheKeys.begin; diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index 54e7271456..fd588a6e94 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -81,6 +81,7 @@ extern const KeyRangeRef conflictingKeysRange; extern const ValueRef conflictingKeysTrue, conflictingKeysFalse; extern const KeyRangeRef writeConflictRangeKeysRange; extern const KeyRangeRef readConflictRangeKeysRange; +extern const KeyRangeRef ddStatsRange; extern const KeyRef cacheKeysPrefix; diff --git a/fdbclient/TagThrottle.actor.cpp b/fdbclient/TagThrottle.actor.cpp index 074d39e158..40de79d325 100644 --- a/fdbclient/TagThrottle.actor.cpp +++ b/fdbclient/TagThrottle.actor.cpp @@ -73,7 +73,7 @@ Key TagThrottleKey::toKey() const { memcpy(str, tagThrottleKeysPrefix.begin(), tagThrottleKeysPrefix.size()); str += tagThrottleKeysPrefix.size(); - *(str++) = autoThrottled ? 
1 : 0; + *(str++) = (uint8_t)throttleType; *(str++) = (uint8_t)priority; for(auto tag : tags) { @@ -89,7 +89,7 @@ Key TagThrottleKey::toKey() const { TagThrottleKey TagThrottleKey::fromKey(const KeyRef& key) { const uint8_t *str = key.substr(tagThrottleKeysPrefix.size()).begin(); - bool autoThrottled = *(str++) != 0; + TagThrottleType throttleType = TagThrottleType(*(str++)); TransactionPriority priority = TransactionPriority(*(str++)); TagSet tags; @@ -99,7 +99,7 @@ TagThrottleKey TagThrottleKey::fromKey(const KeyRef& key) { str += size; } - return TagThrottleKey(tags, autoThrottled, priority); + return TagThrottleKey(tags, throttleType, priority); } TagThrottleValue TagThrottleValue::fromValue(const ValueRef& value) { @@ -164,9 +164,9 @@ namespace ThrottleApi { } } - ACTOR Future throttleTags(Database db, TagSet tags, double tpsRate, double initialDuration, bool autoThrottled, TransactionPriority priority, Optional expirationTime) { + ACTOR Future throttleTags(Database db, TagSet tags, double tpsRate, double initialDuration, TagThrottleType throttleType, TransactionPriority priority, Optional expirationTime) { state Transaction tr(db); - state Key key = TagThrottleKey(tags, autoThrottled, priority).toKey(); + state Key key = TagThrottleKey(tags, throttleType, priority).toKey(); ASSERT(initialDuration > 0); @@ -177,7 +177,7 @@ namespace ThrottleApi { loop { try { - if(!autoThrottled) { + if(throttleType == TagThrottleType::MANUAL) { Optional oldThrottle = wait(tr.get(key)); if(!oldThrottle.present()) { wait(updateThrottleCount(&tr, 1)); @@ -186,7 +186,7 @@ namespace ThrottleApi { tr.set(key, value); - if(!autoThrottled) { + if(throttleType == TagThrottleType::MANUAL) { signalThrottleChange(tr); } @@ -199,28 +199,54 @@ namespace ThrottleApi { } } - ACTOR Future unthrottleTags(Database db, TagSet tags, bool autoThrottled, TransactionPriority priority) { + ACTOR Future unthrottleTags(Database db, TagSet tags, Optional throttleType, Optional priority) { state Transaction tr(db); - state Key key = TagThrottleKey(tags, autoThrottled, priority).toKey(); - state bool removed = false; + state std::vector keys; + for(auto p : allTransactionPriorities) { + if(!priority.present() || priority.get() == p) { + if(!throttleType.present() || throttleType.get() == TagThrottleType::AUTO) { + keys.push_back(TagThrottleKey(tags, TagThrottleType::AUTO, p).toKey()); + } + if(!throttleType.present() || throttleType.get() == TagThrottleType::MANUAL) { + keys.push_back(TagThrottleKey(tags, TagThrottleType::MANUAL, p).toKey()); + } + } + } + + state bool removed = false; loop { try { - state Optional value = wait(tr.get(key)); - if(value.present()) { - if(!autoThrottled) { - wait(updateThrottleCount(&tr, -1)); + state std::vector>> values; + for(auto key : keys) { + values.push_back(tr.get(key)); + } + + wait(waitForAll(values)); + + int delta = 0; + for(int i = 0; i < values.size(); ++i) { + if(values[i].get().present()) { + if(TagThrottleKey::fromKey(keys[i]).throttleType == TagThrottleType::MANUAL) { + delta -= 1; + } + + tr.clear(keys[i]); + + // Report that we are removing this tag if we ever see it present. + // This protects us from getting confused if the transaction is maybe committed. + // It's ok if someone else actually ends up removing this tag at the same time + // and we aren't the ones to actually do it. 
+ removed = true; } + } - tr.clear(key); + if(delta != 0) { + wait(updateThrottleCount(&tr, delta)); + } + if(removed) { signalThrottleChange(tr); - - // Report that we are removing this tag if we ever see it present. - // This protects us from getting confused if the transaction is maybe committed. - // It's ok if someone else actually ends up removing this tag at the same time - // and we aren't the ones to actually do it. - removed = true; wait(tr.commit()); } @@ -232,7 +258,7 @@ namespace ThrottleApi { } } - ACTOR Future unthrottleTags(Database db, KeyRef beginKey, KeyRef endKey, bool onlyExpiredThrottles) { + ACTOR Future unthrottleMatchingThrottles(Database db, KeyRef beginKey, KeyRef endKey, Optional priority, bool onlyExpiredThrottles) { state Transaction tr(db); state KeySelector begin = firstGreaterOrEqual(beginKey); @@ -253,8 +279,12 @@ namespace ThrottleApi { } } - bool autoThrottled = TagThrottleKey::fromKey(tag.key).autoThrottled; - if(!autoThrottled) { + TagThrottleKey key = TagThrottleKey::fromKey(tag.key); + if(priority.present() && key.priority != priority.get()) { + continue; + } + + if(key.throttleType == TagThrottleType::MANUAL) { ++manualUnthrottledTags; } @@ -285,20 +315,22 @@ namespace ThrottleApi { } } - Future unthrottleManual(Database db) { - return unthrottleTags(db, tagThrottleKeysPrefix, tagThrottleAutoKeysPrefix, false); - } + Future unthrottleAll(Database db, Optional tagThrottleType, Optional priority) { + KeyRef begin = tagThrottleKeys.begin; + KeyRef end = tagThrottleKeys.end; - Future unthrottleAuto(Database db) { - return unthrottleTags(db, tagThrottleAutoKeysPrefix, tagThrottleKeys.end, false); - } + if(tagThrottleType.present() && tagThrottleType == TagThrottleType::AUTO) { + begin = tagThrottleAutoKeysPrefix; + } + else if(tagThrottleType.present() && tagThrottleType == TagThrottleType::MANUAL) { + end = tagThrottleAutoKeysPrefix; + } - Future unthrottleAll(Database db) { - return unthrottleTags(db, tagThrottleKeys.begin, tagThrottleKeys.end, false); + return unthrottleMatchingThrottles(db, begin, end, priority, false); } Future expire(Database db) { - return unthrottleTags(db, tagThrottleKeys.begin, tagThrottleKeys.end, true); + return unthrottleMatchingThrottles(db, tagThrottleKeys.begin, tagThrottleKeys.end, Optional(), true); } ACTOR Future enableAuto(Database db, bool enabled) { diff --git a/fdbclient/TagThrottle.h b/fdbclient/TagThrottle.h index 944e307152..a79c962fb1 100644 --- a/fdbclient/TagThrottle.h +++ b/fdbclient/TagThrottle.h @@ -107,14 +107,19 @@ struct dynamic_size_traits : std::true_type { } }; +enum class TagThrottleType : uint8_t { + MANUAL, + AUTO +}; + struct TagThrottleKey { TagSet tags; - bool autoThrottled; + TagThrottleType throttleType; TransactionPriority priority; - TagThrottleKey() : autoThrottled(false), priority(TransactionPriority::DEFAULT) {} - TagThrottleKey(TagSet tags, bool autoThrottled, TransactionPriority priority) - : tags(tags), autoThrottled(autoThrottled), priority(priority) {} + TagThrottleKey() : throttleType(TagThrottleType::MANUAL), priority(TransactionPriority::DEFAULT) {} + TagThrottleKey(TagSet tags, TagThrottleType throttleType, TransactionPriority priority) + : tags(tags), throttleType(throttleType), priority(priority) {} Key toKey() const; static TagThrottleKey fromKey(const KeyRef& key); @@ -139,17 +144,17 @@ struct TagThrottleValue { struct TagThrottleInfo { TransactionTag tag; - bool autoThrottled; + TagThrottleType throttleType; TransactionPriority priority; double tpsRate; double 
expirationTime; double initialDuration; - TagThrottleInfo(TransactionTag tag, bool autoThrottled, TransactionPriority priority, double tpsRate, double expirationTime, double initialDuration) - : tag(tag), autoThrottled(autoThrottled), priority(priority), tpsRate(tpsRate), expirationTime(expirationTime), initialDuration(initialDuration) {} + TagThrottleInfo(TransactionTag tag, TagThrottleType throttleType, TransactionPriority priority, double tpsRate, double expirationTime, double initialDuration) + : tag(tag), throttleType(throttleType), priority(priority), tpsRate(tpsRate), expirationTime(expirationTime), initialDuration(initialDuration) {} TagThrottleInfo(TagThrottleKey key, TagThrottleValue value) - : autoThrottled(key.autoThrottled), priority(key.priority), tpsRate(value.tpsRate), expirationTime(value.expirationTime), initialDuration(value.initialDuration) + : throttleType(key.throttleType), priority(key.priority), tpsRate(value.tpsRate), expirationTime(value.expirationTime), initialDuration(value.initialDuration) { ASSERT(key.tags.size() == 1); // Multiple tags per throttle is not currently supported tag = *key.tags.begin(); @@ -160,13 +165,11 @@ namespace ThrottleApi { Future> getThrottledTags(Database const& db, int const& limit); Future throttleTags(Database const& db, TagSet const& tags, double const& tpsRate, double const& initialDuration, - bool const& autoThrottled, TransactionPriority const& priority, Optional const& expirationTime = Optional()); + TagThrottleType const& throttleType, TransactionPriority const& priority, Optional const& expirationTime = Optional()); - Future unthrottleTags(Database const& db, TagSet const& tags, bool const& autoThrottled, TransactionPriority const& priority); + Future unthrottleTags(Database const& db, TagSet const& tags, Optional const& throttleType, Optional const& priority); - Future unthrottleManual(Database db); - Future unthrottleAuto(Database db); - Future unthrottleAll(Database db); + Future unthrottleAll(Database db, Optional throttleType, Optional priority); Future expire(Database db); Future enableAuto(Database const& db, bool const& enabled); diff --git a/fdbclient/ThreadSafeTransaction.actor.cpp b/fdbclient/ThreadSafeTransaction.actor.cpp index 5b69c1656c..d26ff900fd 100644 --- a/fdbclient/ThreadSafeTransaction.actor.cpp +++ b/fdbclient/ThreadSafeTransaction.actor.cpp @@ -21,7 +21,7 @@ #include "fdbclient/ThreadSafeTransaction.h" #include "fdbclient/ReadYourWrites.h" #include "fdbclient/DatabaseContext.h" -#include "fdbclient/IncludeVersions.h" +#include "fdbclient/versions.h" // Users of ThreadSafeTransaction might share Reference between different threads as long as they don't call addRef (e.g. C API follows this). // Therefore, it is unsafe to call (explicitly or implicitly) this->addRef in any of these functions. 
diff --git a/fdbmonitor/fdbmonitor.cpp b/fdbmonitor/fdbmonitor.cpp index 4f5d061d82..feeff186e2 100644 --- a/fdbmonitor/fdbmonitor.cpp +++ b/fdbmonitor/fdbmonitor.cpp @@ -77,7 +77,7 @@ #include "flow/SimpleOpt.h" #include "SimpleIni.h" -#include "fdbclient/IncludeVersions.h" +#include "fdbclient/versions.h" #ifdef __linux__ typedef fd_set* fdb_fd_set; diff --git a/fdbrpc/ActorFuzz.actor.cpp b/fdbrpc/ActorFuzz.actor.cpp index 88a51e9343..f622504a1d 100644 --- a/fdbrpc/ActorFuzz.actor.cpp +++ b/fdbrpc/ActorFuzz.actor.cpp @@ -802,36 +802,36 @@ ACTOR Future actorFuzz29( FutureStream inputStream, PromiseStream std::pair actorFuzzTests() { int testsOK = 0; - testsOK += testFuzzActor( &actorFuzz0, "actorFuzz0", (vector(),390229,596271,574865) ); - testsOK += testFuzzActor( &actorFuzz1, "actorFuzz1", (vector(),477566,815578,477566,815578,477566,815578,477566,815578,477566,815578,917160) ); - testsOK += testFuzzActor( &actorFuzz2, "actorFuzz2", (vector(),476677,930237) ); - testsOK += testFuzzActor( &actorFuzz3, "actorFuzz3", (vector(),1000) ); - testsOK += testFuzzActor( &actorFuzz4, "actorFuzz4", (vector(),180600,177605,177605,177605,954508,810052) ); - testsOK += testFuzzActor( &actorFuzz5, "actorFuzz5", (vector(),1000) ); - testsOK += testFuzzActor( &actorFuzz6, "actorFuzz6", (vector(),320321,266526,762336,463730,320321,266526,762336,463730,320321,266526,762336,463730,320321,266526,762336,463730,320321,266526,762336,463730,945289) ); - testsOK += testFuzzActor( &actorFuzz7, "actorFuzz7", (vector(),406152,478841,609181,634881,253861,592023,240597,253861,593023,240597,253861,594023,240597,415949,169335,478331,634881,253861,596023,240597,253861,597023,240597,253861,598023,240597,415949,173335,478331,634881,253861,600023,240597,253861,601023,240597,253861,602023,240597,415949,177335,478331,634881,253861,604023,240597,253861,605023,240597,253861,606023,240597,415949,181335,478331,634881,253861,608023,240597,253861,609023,240597,253861,610023,240597,415949,185335,478331,331905,946924,663973,797073,971923,295772,923567,559259,559259,559259,325678,679187,295772,923567,559259,559259,559259,325678,679187,295772,923567,559259,559259,559259,325678,679187,295772,923567,559259,559259,559259,325678,679187,295772,923567,559259,559259,559259,325678,679187,534407,814172,949658) ); - testsOK += testFuzzActor( &actorFuzz8, "actorFuzz8", (vector(),285937,696473) ); - testsOK += testFuzzActor( &actorFuzz9, "actorFuzz9", (vector(),141463,397424) ); - testsOK += testFuzzActor( &actorFuzz10, "actorFuzz10", (vector(),543113,1000) ); - testsOK += testFuzzActor( &actorFuzz11, "actorFuzz11", (vector(),1000) ); - testsOK += testFuzzActor( &actorFuzz12, "actorFuzz12", (vector(),970588,981887) ); - testsOK += testFuzzActor( &actorFuzz13, "actorFuzz13", (vector(),861219) ); - testsOK += testFuzzActor( &actorFuzz14, "actorFuzz14", (vector(),527098,527098,527098,628047) ); - testsOK += testFuzzActor( &actorFuzz15, "actorFuzz15", (vector(),582389,240216,732317,582389,240216,732317,582389,240216,732317,582389,240216,732317,582389,240216,732317,884781) ); - testsOK += testFuzzActor( &actorFuzz16, "actorFuzz16", (vector(),943071,492690,908751,198776,537939) ); - testsOK += testFuzzActor( &actorFuzz17, "actorFuzz17", (vector(),249436,416782,249436,416782,249436,416782,299183) ); - testsOK += testFuzzActor( &actorFuzz18, "actorFuzz18", (vector(),337649,395297,807261,517901) ); - testsOK += testFuzzActor( &actorFuzz19, "actorFuzz19", (vector(),492598,139186,742053,492598,140186,742053,492598,141186,742053,592919) ); - testsOK += 
testFuzzActor( &actorFuzz20, "actorFuzz20", (vector(),760082,1000) ); - testsOK += testFuzzActor( &actorFuzz21, "actorFuzz21", (vector(),806394) ); - testsOK += testFuzzActor( &actorFuzz22, "actorFuzz22", (vector(),722878,369302,416748) ); - testsOK += testFuzzActor( &actorFuzz23, "actorFuzz23", (vector(),562792,231437) ); - testsOK += testFuzzActor( &actorFuzz24, "actorFuzz24", (vector(),847672,835175) ); - testsOK += testFuzzActor( &actorFuzz25, "actorFuzz25", (vector(),843261,327560,592398) ); - testsOK += testFuzzActor( &actorFuzz26, "actorFuzz26", (vector(),520263,306397,944232,366272,700651,146918,191890) ); - testsOK += testFuzzActor( &actorFuzz27, "actorFuzz27", (vector(),313322,196907) ); - testsOK += testFuzzActor( &actorFuzz28, "actorFuzz28", (vector(),715827,529509,449273,715827,529509,449273,715827,529509,449273,715827,529509,449273,715827,529509,449273,743922) ); - testsOK += testFuzzActor( &actorFuzz29, "actorFuzz29", (vector(),821092,901028,617942,821092,902028,617942,821092,903028,617942,821092,904028,617942,821092,905028,617942,560881) ); + testsOK += testFuzzActor( &actorFuzz0, "actorFuzz0", {390229,596271,574865}); + testsOK += testFuzzActor( &actorFuzz1, "actorFuzz1", {477566,815578,477566,815578,477566,815578,477566,815578,477566,815578,917160}); + testsOK += testFuzzActor( &actorFuzz2, "actorFuzz2", {476677,930237}); + testsOK += testFuzzActor( &actorFuzz3, "actorFuzz3", {1000}); + testsOK += testFuzzActor( &actorFuzz4, "actorFuzz4", {180600,177605,177605,177605,954508,810052}); + testsOK += testFuzzActor( &actorFuzz5, "actorFuzz5", {1000}); + testsOK += testFuzzActor( &actorFuzz6, "actorFuzz6", {320321,266526,762336,463730,320321,266526,762336,463730,320321,266526,762336,463730,320321,266526,762336,463730,320321,266526,762336,463730,945289}); + testsOK += testFuzzActor( &actorFuzz7, "actorFuzz7", {406152,478841,609181,634881,253861,592023,240597,253861,593023,240597,253861,594023,240597,415949,169335,478331,634881,253861,596023,240597,253861,597023,240597,253861,598023,240597,415949,173335,478331,634881,253861,600023,240597,253861,601023,240597,253861,602023,240597,415949,177335,478331,634881,253861,604023,240597,253861,605023,240597,253861,606023,240597,415949,181335,478331,634881,253861,608023,240597,253861,609023,240597,253861,610023,240597,415949,185335,478331,331905,946924,663973,797073,971923,295772,923567,559259,559259,559259,325678,679187,295772,923567,559259,559259,559259,325678,679187,295772,923567,559259,559259,559259,325678,679187,295772,923567,559259,559259,559259,325678,679187,295772,923567,559259,559259,559259,325678,679187,534407,814172,949658}); + testsOK += testFuzzActor( &actorFuzz8, "actorFuzz8", {285937,696473}); + testsOK += testFuzzActor( &actorFuzz9, "actorFuzz9", {141463,397424}); + testsOK += testFuzzActor( &actorFuzz10, "actorFuzz10", {543113,1000}); + testsOK += testFuzzActor( &actorFuzz11, "actorFuzz11", {1000}); + testsOK += testFuzzActor( &actorFuzz12, "actorFuzz12", {970588,981887}); + testsOK += testFuzzActor( &actorFuzz13, "actorFuzz13", {861219}); + testsOK += testFuzzActor( &actorFuzz14, "actorFuzz14", {527098,527098,527098,628047}); + testsOK += testFuzzActor( &actorFuzz15, "actorFuzz15", {582389,240216,732317,582389,240216,732317,582389,240216,732317,582389,240216,732317,582389,240216,732317,884781}); + testsOK += testFuzzActor( &actorFuzz16, "actorFuzz16", {943071,492690,908751,198776,537939}); + testsOK += testFuzzActor( &actorFuzz17, "actorFuzz17", {249436,416782,249436,416782,249436,416782,299183}); + testsOK += 
testFuzzActor( &actorFuzz18, "actorFuzz18", {337649,395297,807261,517901}); + testsOK += testFuzzActor( &actorFuzz19, "actorFuzz19", {492598,139186,742053,492598,140186,742053,492598,141186,742053,592919}); + testsOK += testFuzzActor( &actorFuzz20, "actorFuzz20", {760082,1000}); + testsOK += testFuzzActor( &actorFuzz21, "actorFuzz21", {806394}); + testsOK += testFuzzActor( &actorFuzz22, "actorFuzz22", {722878,369302,416748}); + testsOK += testFuzzActor( &actorFuzz23, "actorFuzz23", {562792,231437}); + testsOK += testFuzzActor( &actorFuzz24, "actorFuzz24", {847672,835175}); + testsOK += testFuzzActor( &actorFuzz25, "actorFuzz25", {843261,327560,592398}); + testsOK += testFuzzActor( &actorFuzz26, "actorFuzz26", {520263,306397,944232,366272,700651,146918,191890}); + testsOK += testFuzzActor( &actorFuzz27, "actorFuzz27", {313322,196907}); + testsOK += testFuzzActor( &actorFuzz28, "actorFuzz28", {715827,529509,449273,715827,529509,449273,715827,529509,449273,715827,529509,449273,715827,529509,449273,743922}); + testsOK += testFuzzActor( &actorFuzz29, "actorFuzz29", {821092,901028,617942,821092,902028,617942,821092,903028,617942,821092,904028,617942,821092,905028,617942,560881}); return std::make_pair(testsOK, 30); } #endif // WIN32 diff --git a/fdbrpc/ActorFuzz.h b/fdbrpc/ActorFuzz.h index 74289b06e3..e718f344e5 100644 --- a/fdbrpc/ActorFuzz.h +++ b/fdbrpc/ActorFuzz.h @@ -24,14 +24,6 @@ using std::vector; -inline vector& operator , (vector& v, int a) { - v.push_back(a); - return v; -} - -inline vector& operator , (vector const& v, int a) { - return (const_cast&>(v), a); -} inline void throw_operation_failed() { throw operation_failed(); } // This is in dsltest.actor.cpp: diff --git a/fdbrpc/AsyncFileCached.actor.cpp b/fdbrpc/AsyncFileCached.actor.cpp index ec5b6f6e73..86d8141273 100644 --- a/fdbrpc/AsyncFileCached.actor.cpp +++ b/fdbrpc/AsyncFileCached.actor.cpp @@ -80,16 +80,17 @@ Future> AsyncFileCached::open_impl( std::string filename, return open_impl(filename, flags, mode, pageCache); } -Future AsyncFileCached::read_write_impl( AsyncFileCached* self, void* data, int length, int64_t offset, bool writing ) { - if (writing) { +template +Future AsyncFileCached::read_write_impl(AsyncFileCached* self, + typename std::conditional_t data, + int length, int64_t offset) { + if constexpr (writing) { if (offset + length > self->length) self->length = offset + length; } std::vector> actors; - uint8_t* cdata = static_cast(data); - int offsetInPage = offset % self->pageCache->pageSize; int64_t pageOffset = offset - offsetInPage; @@ -108,13 +109,16 @@ Future AsyncFileCached::read_write_impl( AsyncFileCached* self, void* data int bytesInPage = std::min(self->pageCache->pageSize - offsetInPage, remaining); - auto w = writing - ? 
p->second->write( cdata, bytesInPage, offsetInPage ) - : p->second->read( cdata, bytesInPage, offsetInPage ); + Future w; + if constexpr (writing) { + w = p->second->write(data, bytesInPage, offsetInPage); + } else { + w = p->second->read(data, bytesInPage, offsetInPage); + } if (!w.isReady() || w.isError()) actors.push_back( w ); - cdata += bytesInPage; + data += bytesInPage; pageOffset += self->pageCache->pageSize; offsetInPage = 0; diff --git a/fdbrpc/AsyncFileCached.actor.h b/fdbrpc/AsyncFileCached.actor.h index d9b192b662..66599e6fe9 100644 --- a/fdbrpc/AsyncFileCached.actor.h +++ b/fdbrpc/AsyncFileCached.actor.h @@ -28,6 +28,7 @@ #define FLOW_ASYNCFILECACHED_ACTOR_H #include +#include #include "flow/flow.h" #include "fdbrpc/IAsyncFile.h" @@ -166,7 +167,7 @@ public: length = int(this->length - offset); ASSERT(length >= 0); } - auto f = read_write_impl(this, data, length, offset, false); + auto f = read_write_impl(this, static_cast(data), length, offset); if( f.isReady() && !f.isError() ) return length; ++countFileCacheReadsBlocked; ++countCacheReadsBlocked; @@ -180,7 +181,7 @@ public: wait(self->currentTruncate); ++self->countFileCacheWrites; ++self->countCacheWrites; - Future f = read_write_impl(self, const_cast(data), length, offset, true); + Future f = read_write_impl(self, static_cast(data), length, offset); if (!f.isReady()) { ++self->countFileCacheWritesBlocked; ++self->countCacheWritesBlocked; @@ -346,7 +347,10 @@ private: return Void(); } - static Future read_write_impl( AsyncFileCached* self, void* data, int length, int64_t offset, bool writing ); + template + static Future read_write_impl(AsyncFileCached* self, + typename std::conditional_t data, + int length, int64_t offset); void remove_page( AFCPage* page ); }; diff --git a/fdbrpc/FailureMonitor.actor.cpp b/fdbrpc/FailureMonitor.actor.cpp index 799c4fda77..ceb709a6c7 100644 --- a/fdbrpc/FailureMonitor.actor.cpp +++ b/fdbrpc/FailureMonitor.actor.cpp @@ -121,7 +121,8 @@ void SimpleFailureMonitor::endpointNotFound(Endpoint const& endpoint) { .suppressFor(1.0) .detail("Address", endpoint.getPrimaryAddress()) .detail("Token", endpoint.token); - endpointKnownFailed.set(endpoint, true); + failedEndpoints.insert(endpoint); + endpointKnownFailed.trigger(endpoint); } void SimpleFailureMonitor::notifyDisconnect(NetworkAddress const& address) { @@ -132,7 +133,7 @@ void SimpleFailureMonitor::notifyDisconnect(NetworkAddress const& address) { Future SimpleFailureMonitor::onDisconnectOrFailure(Endpoint const& endpoint) { // If the endpoint or address is already failed, return right away auto i = addressStatus.find(endpoint.getPrimaryAddress()); - if (i == addressStatus.end() || i->second.isFailed() || endpointKnownFailed.get(endpoint)) { + if (i == addressStatus.end() || i->second.isFailed() || failedEndpoints.count(endpoint)) { TraceEvent("AlreadyDisconnected").detail("Addr", endpoint.getPrimaryAddress()).detail("Tok", endpoint.token); return Void(); } @@ -149,14 +150,14 @@ Future SimpleFailureMonitor::onStateChanged(Endpoint const& endpoint) { // failure status for that endpoint can never change (and we could be spuriously triggered by setStatus) // Also returns spuriously when notifyDisconnect is called (which doesn't actually change the state), but callers // check the state so it's OK - if (endpointKnownFailed.get(endpoint)) + if (failedEndpoints.count(endpoint)) return Never(); else return endpointKnownFailed.onChange(endpoint); } FailureStatus SimpleFailureMonitor::getState(Endpoint const& endpoint) { - if 
(endpointKnownFailed.get(endpoint)) + if (failedEndpoints.count(endpoint)) return FailureStatus(true); else { auto a = addressStatus.find(endpoint.getPrimaryAddress()); @@ -178,7 +179,7 @@ FailureStatus SimpleFailureMonitor::getState(NetworkAddress const& address) { } bool SimpleFailureMonitor::onlyEndpointFailed(Endpoint const& endpoint) { - if (!endpointKnownFailed.get(endpoint)) return false; + if (!failedEndpoints.count(endpoint)) return false; auto a = addressStatus.find(endpoint.getPrimaryAddress()); if (a == addressStatus.end()) return true; @@ -187,10 +188,11 @@ bool SimpleFailureMonitor::onlyEndpointFailed(Endpoint const& endpoint) { } bool SimpleFailureMonitor::permanentlyFailed(Endpoint const& endpoint) { - return endpointKnownFailed.get(endpoint); + return failedEndpoints.count(endpoint); } void SimpleFailureMonitor::reset() { addressStatus = std::unordered_map(); + failedEndpoints = std::unordered_set(); endpointKnownFailed.resetNoWaiting(); } diff --git a/fdbrpc/FailureMonitor.h b/fdbrpc/FailureMonitor.h index d6d11e6e3e..434f0f9a91 100644 --- a/fdbrpc/FailureMonitor.h +++ b/fdbrpc/FailureMonitor.h @@ -25,6 +25,7 @@ #include "flow/flow.h" #include "fdbrpc/FlowTransport.h" // Endpoint #include +#include using std::vector; @@ -153,6 +154,7 @@ public: private: std::unordered_map addressStatus; YieldedAsyncMap endpointKnownFailed; + std::unordered_set failedEndpoints; friend class OnStateChangedActorActor; }; diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 65ef314e9f..7c9e3ed912 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -122,10 +122,11 @@ const Endpoint& EndpointMap::insert( NetworkAddressList localAddresses, std::vec } UID base = deterministicRandom()->randomUniqueID(); - for(int i=0; isetEndpoint( Endpoint( localAddresses, UID( base.first() | TOKEN_STREAM_FLAG, (base.second()&0xffffffff00000000LL) | index) ) ); - data[index].token() = Endpoint::Token( base.first() | TOKEN_STREAM_FLAG, (base.second()&0xffffffff00000000LL) | static_cast(streams[i].second) ); + uint64_t first = (base.first()+(i<<32)) | TOKEN_STREAM_FLAG; + streams[i].first->setEndpoint( Endpoint( localAddresses, UID( first, (base.second()&0xffffffff00000000LL) | index) ) ); + data[index].token() = Endpoint::Token( first, (base.second()&0xffffffff00000000LL) | static_cast(streams[i].second) ); data[index].receiver = (NetworkMessageReceiver*) streams[i].first; } @@ -1277,8 +1278,8 @@ void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* rec self->endpoints.insert( receiver, endpoint.token, taskID ); } -const Endpoint& FlowTransport::addEndpoints( std::vector> const& streams ) { - return self->endpoints.insert( self->localAddresses, streams ); +void FlowTransport::addEndpoints( std::vector> const& streams ) { + self->endpoints.insert( self->localAddresses, streams ); } void FlowTransport::removeEndpoint( const Endpoint& endpoint, NetworkMessageReceiver* receiver ) { diff --git a/fdbrpc/FlowTransport.h b/fdbrpc/FlowTransport.h index 1573577ee6..0f7326b35e 100644 --- a/fdbrpc/FlowTransport.h +++ b/fdbrpc/FlowTransport.h @@ -68,23 +68,17 @@ public: Endpoint getAdjustedEndpoint( uint32_t index ) { uint32_t newIndex = token.second(); newIndex += index; - return Endpoint( addresses, UID(token.first(), (token.second()&0xffffffff00000000LL) | newIndex) ); + return Endpoint( addresses, UID(token.first()+(uint64_t(index)<<32), (token.second()&0xffffffff00000000LL) | newIndex) ); } bool operator == (Endpoint const& r) const { - 
return getPrimaryAddress() == r.getPrimaryAddress() && token == r.token; + return token == r.token && getPrimaryAddress() == r.getPrimaryAddress(); } bool operator != (Endpoint const& r) const { return !(*this == r); } - bool operator < (Endpoint const& r) const { - const NetworkAddress& left = getPrimaryAddress(); - const NetworkAddress& right = r.getPrimaryAddress(); - if (left != right) - return left < right; - else - return token < r.token; + return addresses.address < r.addresses.address || (addresses.address == r.addresses.address && token < r.token); } template @@ -109,6 +103,18 @@ public: }; #pragma pack(pop) +namespace std +{ + template <> + struct hash + { + size_t operator()(const Endpoint& ep) const + { + return ep.token.hash() + ep.addresses.address.hash(); + } + }; +} + class ArenaObjectReader; class NetworkMessageReceiver { public: @@ -186,7 +192,7 @@ public: void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID ); // Sets endpoint to be a new local endpoint which delivers messages to the given receiver - const Endpoint& addEndpoints( std::vector> const& streams ); + void addEndpoints( std::vector> const& streams ); void removeEndpoint( const Endpoint&, NetworkMessageReceiver* ); // The given local endpoint no longer delivers messages to the given receiver or uses resources diff --git a/fdbrpc/actorFuzz.py b/fdbrpc/actorFuzz.py old mode 100644 new mode 100755 index 05eb22e4de..dc83b7dbaa --- a/fdbrpc/actorFuzz.py +++ b/fdbrpc/actorFuzz.py @@ -449,7 +449,7 @@ for actor in actors: print("std::pair actorFuzzTests() {\n\tint testsOK = 0;", file=outputFile) for actor in actors: - print('\ttestsOK += testFuzzActor( &%s, "%s", (vector(),%s) );' % (actor.name, actor.name, ','.join(str(e) for e in actor.ecx.output)), + print('\ttestsOK += testFuzzActor( &%s, "%s", {%s} );' % (actor.name, actor.name, ','.join(str(e) for e in actor.ecx.output)), file=outputFile) print("\treturn std::make_pair(testsOK, %d);\n}" % len(actors), file=outputFile) print('#endif // WIN32\n', file=outputFile) diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index ddfc66c42c..6114956dc9 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -85,17 +85,6 @@ void ISimulator::displayWorkers() const return; } -namespace std { -template<> -class hash { -public: - size_t operator()(const Endpoint &s) const - { - return crc32c_append(0, (const uint8_t*)&s, sizeof(s)); - } -}; -} - const UID TOKEN_ENDPOINT_NOT_FOUND(-1, -1); ISimulator* g_pSimulator = 0; diff --git a/fdbserver/BackupProgress.actor.cpp b/fdbserver/BackupProgress.actor.cpp index 985fcb7f93..898ce31b70 100644 --- a/fdbserver/BackupProgress.actor.cpp +++ b/fdbserver/BackupProgress.actor.cpp @@ -83,21 +83,24 @@ std::map, std::map> BackupProgr auto progressIt = progress.lower_bound(epoch); if (progressIt != progress.end() && progressIt->first == epoch) { - if (progressIt != progress.begin()) { + std::set toCheck = tags; + for (auto current = progressIt; current != progress.begin() && !toCheck.empty();) { + auto prev = std::prev(current); // Previous epoch is gone, consolidate the progress. 
- auto prev = std::prev(progressIt); for (auto [tag, version] : prev->second) { - if (tags.count(tag) > 0) { + if (toCheck.count(tag) > 0) { progressIt->second[tag] = std::max(version, progressIt->second[tag]); + toCheck.erase(tag); } } + current = prev; } updateTagVersions(&tagVersions, &tags, progressIt->second, info.epochEnd, adjustedBeginVersion, epoch); } else { auto rit = std::find_if( progress.rbegin(), progress.rend(), [epoch = epoch](const std::pair>& p) { return p.first < epoch; }); - if (!(rit == progress.rend())) { + while (!(rit == progress.rend())) { // A partial recovery can result in empty epoch that copies previous // epoch's version range. In this case, we should check previous // epoch's savedVersion. @@ -112,7 +115,9 @@ std::map, std::map> BackupProgr // ASSERT(info.logRouterTags == epochTags[rit->first]); updateTagVersions(&tagVersions, &tags, rit->second, info.epochEnd, adjustedBeginVersion, epoch); + break; } + rit++; } } diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index b1d666e0f4..e1e0aee474 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -34,14 +34,17 @@ #include "flow/actorcompiler.h" // This must be the last #include. +#define SevDebugMemory SevVerbose + struct VersionedMessage { LogMessageVersion version; StringRef message; VectorRef tags; Arena arena; // Keep a reference to the memory containing the message + size_t bytes; // arena's size when inserted, which can grow afterwards VersionedMessage(LogMessageVersion v, StringRef m, const VectorRef& t, const Arena& a) - : version(v), message(m), tags(t), arena(a) {} + : version(v), message(m), tags(t), arena(a), bytes(a.getSize()) {} const Version getVersion() const { return version.version; } const uint32_t getSubVersion() const { return version.sub; } @@ -64,6 +67,10 @@ struct VersionedMessage { } }; +static bool sameArena(const Arena& a, const Arena& b) { + return a.impl.getPtr() == b.impl.getPtr(); +} + struct BackupData { const UID myId; const Tag tag; // LogRouter tag for this worker, i.e., (-2, i) @@ -84,6 +91,7 @@ struct BackupData { bool stopped = false; bool exitEarly = false; // If the worker is on an old epoch and all backups starts a version >= the endVersion AsyncVar paused; // Track if "backupPausedKey" is set. 
+ Reference lock; struct PerBackupInfo { PerBackupInfo() = default; @@ -231,12 +239,14 @@ struct BackupData { : myId(id), tag(req.routerTag), totalTags(req.totalTags), startVersion(req.startVersion), endVersion(req.endVersion), recruitedEpoch(req.recruitedEpoch), backupEpoch(req.backupEpoch), minKnownCommittedVersion(invalidVersion), savedVersion(req.startVersion - 1), popVersion(req.startVersion - 1), - cc("BackupWorker", myId.toString()), pulledVersion(0), paused(false) { + cc("BackupWorker", myId.toString()), pulledVersion(0), paused(false), + lock(new FlowLock(SERVER_KNOBS->BACKUP_LOCK_BYTES)) { cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true); specialCounter(cc, "SavedVersion", [this]() { return this->savedVersion; }); specialCounter(cc, "MinKnownCommittedVersion", [this]() { return this->minKnownCommittedVersion; }); specialCounter(cc, "MsgQ", [this]() { return this->messages.size(); }); + specialCounter(cc, "BufferedBytes", [this]() { return this->lock->activePermits(); }); logger = traceCounters("BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "BackupWorkerMetrics"); } @@ -310,6 +320,34 @@ struct BackupData { doneTrigger.trigger(); } + // Erases messages and updates lock with memory released. + void eraseMessages(int num) { + ASSERT(num <= messages.size()); + if (num == 0) return; + + if (messages.size() == num) { + messages.clear(); + TraceEvent(SevDebugMemory, "BackupWorkerMemory", myId).detail("ReleaseAll", lock->activePermits()); + lock->release(lock->activePermits()); + return; + } + + // keep track of each arena and accumulate their sizes + int64_t bytes = 0; + for (int i = 0; i < num; i++) { + const Arena& a = messages[i].arena; + const Arena& b = messages[i + 1].arena; + if (!sameArena(a, b)) { + bytes += messages[i].bytes; + TraceEvent(SevDebugMemory, "BackupWorkerMemory", myId) + .detail("Release", messages[i].bytes) + .detail("Arena", (void*)a.impl.getPtr()); + } + } + lock->release(bytes); + messages.erase(messages.begin(), messages.begin() + num); + } + void eraseMessagesAfterEndVersion() { ASSERT(endVersion.present()); const Version ver = endVersion.get(); @@ -637,6 +675,7 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int state std::vector> logFiles; state std::vector blockEnds; state std::vector activeUids; // active Backups' UIDs + state std::vector beginVersions; // logFiles' begin versions state KeyRangeMap> keyRangeMap; // range to index in logFileFutures, logFiles, & blockEnds state std::vector> mutations; state int idx; @@ -655,15 +694,20 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int const int index = logFileFutures.size(); activeUids.push_back(it->first); self->insertRanges(keyRangeMap, it->second.ranges.get(), index); + if (it->second.lastSavedVersion == invalidVersion) { if (it->second.startVersion > self->startVersion && !self->messages.empty()) { // True-up first mutation log's begin version it->second.lastSavedVersion = self->messages[0].getVersion(); } else { - it->second.lastSavedVersion = - std::max(self->popVersion, std::max(self->savedVersion, self->startVersion)); + it->second.lastSavedVersion = std::max({ self->popVersion, self->savedVersion, self->startVersion }); } + TraceEvent("BackupWorkerTrueUp", self->myId).detail("LastSavedVersion", it->second.lastSavedVersion); } + // The true-up version can be larger than first message version, so keep + // the begin versions for later mutation filtering.
+ beginVersions.push_back(it->second.lastSavedVersion); + logFileFutures.push_back(it->second.container.get().get()->writeTaggedLogFile( it->second.lastSavedVersion, popVersion + 1, blockSize, self->tag.id, self->totalTags)); it++; @@ -675,7 +719,7 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int std::transform(logFileFutures.begin(), logFileFutures.end(), std::back_inserter(logFiles), [](const Future>& f) { return f.get(); }); - ASSERT(activeUids.size() == logFiles.size()); + ASSERT(activeUids.size() == logFiles.size() && beginVersions.size() == logFiles.size()); for (int i = 0; i < logFiles.size(); i++) { TraceEvent("OpenMutationFile", self->myId) .detail("BackupID", activeUids[i]) @@ -698,7 +742,10 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int std::vector> adds; if (m.type != MutationRef::Type::ClearRange) { for (int index : keyRangeMap[m.param1]) { - adds.push_back(addMutation(logFiles[index], message, message.message, &blockEnds[index], blockSize)); + if (message.getVersion() >= beginVersions[index]) { + adds.push_back( + addMutation(logFiles[index], message, message.message, &blockEnds[index], blockSize)); + } } } else { KeyRangeRef mutationRange(m.param1, m.param2); @@ -713,8 +760,10 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int wr << subm; mutations.push_back(wr.toValue()); for (int index : range.value()) { - adds.push_back( - addMutation(logFiles[index], message, mutations.back(), &blockEnds[index], blockSize)); + if (message.getVersion() >= beginVersions[index]) { + adds.push_back( + addMutation(logFiles[index], message, mutations.back(), &blockEnds[index], blockSize)); + } } } } @@ -791,12 +840,12 @@ ACTOR Future uploadData(BackupData* self) { .detail("MsgQ", self->messages.size()); // save an empty file for old epochs so that log file versions are continuous wait(saveMutationsToFile(self, popVersion, numMsg)); - self->messages.erase(self->messages.begin(), self->messages.begin() + numMsg); + self->eraseMessages(numMsg); } // If transition into NOOP mode, should clear messages if (!self->pulling) { - self->messages.clear(); + self->eraseMessages(self->messages.size()); } if (popVersion > self->savedVersion && popVersion > self->popVersion) { @@ -810,7 +859,7 @@ ACTOR Future uploadData(BackupData* self) { } if (self->allMessageSaved()) { - self->messages.clear(); + self->eraseMessages(self->messages.size()); return Void(); } @@ -825,6 +874,7 @@ ACTOR Future pullAsyncData(BackupData* self) { state Future logSystemChange = Void(); state Reference r; state Version tagAt = std::max(self->pulledVersion.get(), std::max(self->startVersion, self->savedVersion)); + state Arena prev; TraceEvent("BackupWorkerPull", self->myId); loop { @@ -850,6 +900,15 @@ ACTOR Future pullAsyncData(BackupData* self) { // Note we aggressively peek (uncommitted) messages, but only committed // messages/mutations will be flushed to disk/blob in uploadData(). 
while (r->hasMessage()) { + if (!sameArena(prev, r->arena())) { + TraceEvent(SevDebugMemory, "BackupWorkerMemory", self->myId) + .detail("Take", r->arena().getSize()) + .detail("Arena", (void*)r->arena().impl.getPtr()) + .detail("Current", self->lock->activePermits()); + + wait(self->lock->take(TaskPriority::DefaultYield, r->arena().getSize())); + prev = r->arena(); + } self->messages.emplace_back(r->version(), r->getMessage(), r->getTags(), r->arena()); r->nextMessage(); } diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 0a12adedf4..52a08a6ef4 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -134,6 +134,7 @@ set(FDBSERVER_SRCS workloads/ConsistencyCheck.actor.cpp workloads/CpuProfiler.actor.cpp workloads/Cycle.actor.cpp + workloads/DataDistributionMetrics.actor.cpp workloads/DDBalance.actor.cpp workloads/DDMetrics.actor.cpp workloads/DDMetricsExclude.actor.cpp diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 7f4549ed76..65bbe60f3b 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4429,7 +4429,7 @@ ACTOR Future monitorBatchLimitedTime(Reference> db, } } -ACTOR Future dataDistribution(Reference self) +ACTOR Future dataDistribution(Reference self, PromiseStream getShardMetricsList) { state double lastLimited = 0; self->addActor.send( monitorBatchLimitedTime(self->dbInfo, &lastLimited) ); @@ -4605,7 +4605,7 @@ ACTOR Future dataDistribution(Reference self) } actors.push_back( pollMoveKeysLock(cx, lock) ); - actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId ), "DDTracker", self->ddId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getShardMetricsList, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId ), "DDTracker", self->ddId, &normalDDQueueErrors() ) ); actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, configuration.storageTeamSize, &lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); vector teamCollectionsPtrs; @@ -4856,6 +4856,7 @@ ACTOR Future ddExclusionSafetyCheck(DistributorExclusionSafetyCheckRequest ACTOR Future dataDistributor(DataDistributorInterface di, Reference> db ) { state Reference self( new DataDistributorData(db, di.id()) ); state Future collection = actorCollection( self->addActor.getFuture() ); + state PromiseStream getShardMetricsList; state Database cx = openDBOnServer(db, TaskPriority::DefaultDelay, true, true); state ActorCollection actors(false); self->addActor.send(actors.getResult()); @@ -4864,7 +4865,7 @@ ACTOR Future dataDistributor(DataDistributorInterface di, ReferenceaddActor.send( waitFailureServer(di.waitFailure.getFuture()) ); - state Future distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); + state Future distributor = reportErrorsExcept( dataDistribution(self, getShardMetricsList), "DataDistribution", di.id(), &normalDataDistributorErrors() ); loop choose { when ( wait(distributor || collection) ) { @@ -4876,6 +4877,17 @@ ACTOR Future 
dataDistributor(DataDistributorInterface di, Reference>> result = wait(errorOr(brokenPromiseToNever( + getShardMetricsList.getReply(GetMetricsListRequest(req.keys, req.shardLimit))))); + if ( result.isError() ) { + req.reply.sendError(result.getError()); + } else { + GetDataDistributorMetricsReply rep; + rep.storageMetricsList = result.get(); + req.reply.send(rep); + } + } when(DistributorSnapRequest snapReq = waitNext(di.distributorSnapReq.getFuture())) { actors.add(ddSnapCreate(snapReq, db)); } diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index f07a15dbfd..116d6a9234 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -107,6 +107,15 @@ struct GetMetricsRequest { GetMetricsRequest( KeyRange const& keys ) : keys(keys) {} }; +struct GetMetricsListRequest { + KeyRange keys; + int shardLimit; + Promise>> reply; + + GetMetricsListRequest() {} + GetMetricsListRequest( KeyRange const& keys, const int shardLimit ) : keys(keys), shardLimit(shardLimit) {} +}; + struct TeamCollectionInterface { PromiseStream< GetTeamRequest > getTeam; }; @@ -203,6 +212,7 @@ Future dataDistributionTracker( PromiseStream const& output, Reference const& shardsAffectedByTeamFailure, PromiseStream const& getShardMetrics, + PromiseStream const& getShardMetricsList, FutureStream> const& getAverageShardBytes, Promise const& readyToStart, Reference> const& zeroHealthyTeams, diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index dae7942057..01d2a34ab9 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -813,12 +813,60 @@ ACTOR Future fetchShardMetrics( DataDistributionTracker* self, GetMetricsR return Void(); } + +ACTOR Future fetchShardMetricsList_impl( DataDistributionTracker* self, GetMetricsListRequest req ) { + try { + loop { + // used to control shard limit + int shardNum = 0; + // list of metrics, regenerate on loop when full range unsuccessful + Standalone> result; + Future onChange; + for (auto t : self->shards.containedRanges(req.keys)) { + auto &stats = t.value().stats; + if( !stats->get().present() ) { + onChange = stats->onChange(); + break; + } + result.push_back_deep(result.arena(), + DDMetricsRef(stats->get().get().metrics.bytes, KeyRef(t.begin().toString()))); + ++shardNum; + if (shardNum >= req.shardLimit) { + break; + } + } + + if( !onChange.isValid() ) { + req.reply.send( result ); + return Void(); + } + + wait( onChange ); + } + } catch( Error &e ) { + if( e.code() != error_code_actor_cancelled && !req.reply.isSet() ) + req.reply.sendError(e); + throw; + } +} + +ACTOR Future fetchShardMetricsList( DataDistributionTracker* self, GetMetricsListRequest req ) { + choose { + when( wait( fetchShardMetricsList_impl( self, req ) ) ) {} + when( wait( delay( SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT ) ) ) { + req.reply.sendError(timed_out()); + } + } + return Void(); +} + ACTOR Future dataDistributionTracker( Reference initData, Database cx, PromiseStream output, Reference shardsAffectedByTeamFailure, PromiseStream getShardMetrics, + PromiseStream getShardMetricsList, FutureStream> getAverageShardBytes, Promise readyToStart, Reference> anyZeroHealthyTeams, @@ -847,6 +895,9 @@ ACTOR Future dataDistributionTracker( when( GetMetricsRequest req = waitNext( getShardMetrics.getFuture() ) ) { self.sizeChanges.add( fetchShardMetrics( &self, req ) ); } + when( GetMetricsListRequest req = waitNext( getShardMetricsList.getFuture() ) ) { + 
self.sizeChanges.add( fetchShardMetricsList( &self, req ) ); + } when( wait( self.sizeChanges.getResult() ) ) {} } } catch (Error& e) { diff --git a/fdbserver/DataDistributorInterface.h b/fdbserver/DataDistributorInterface.h index a1e0ffb35e..063772f02b 100644 --- a/fdbserver/DataDistributorInterface.h +++ b/fdbserver/DataDistributorInterface.h @@ -32,6 +32,7 @@ struct DataDistributorInterface { struct LocalityData locality; RequestStream distributorSnapReq; RequestStream distributorExclCheckReq; + RequestStream dataDistributorMetrics; DataDistributorInterface() {} explicit DataDistributorInterface(const struct LocalityData& l) : locality(l) {} @@ -48,7 +49,7 @@ struct DataDistributorInterface { template void serialize(Archive& ar) { - serializer(ar, waitFailure, haltDataDistributor, locality, distributorSnapReq, distributorExclCheckReq); + serializer(ar, waitFailure, haltDataDistributor, locality, distributorSnapReq, distributorExclCheckReq, dataDistributorMetrics); } }; @@ -66,6 +67,33 @@ struct HaltDataDistributorRequest { } }; +struct GetDataDistributorMetricsReply { + constexpr static FileIdentifier file_identifier = 1284337; + Standalone> storageMetricsList; + + GetDataDistributorMetricsReply() {} + + template + void serialize(Ar& ar) { + serializer(ar,storageMetricsList); + } +}; + +struct GetDataDistributorMetricsRequest { + constexpr static FileIdentifier file_identifier = 1059267; + KeyRange keys; + int shardLimit; + ReplyPromise reply; + + GetDataDistributorMetricsRequest() {} + explicit GetDataDistributorMetricsRequest(KeyRange const& keys, const int shardLimit) : keys(keys), shardLimit(shardLimit) {} + + template + void serialize(Ar& ar) { + serializer(ar, keys, shardLimit, reply); + } +}; + struct DistributorSnapRequest { constexpr static FileIdentifier file_identifier = 22204900; diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index 0aa2332fc2..69d73c24d4 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -7,7 +7,7 @@ #include "fdbserver/FDBExecHelper.actor.h" #include "flow/Trace.h" #include "flow/flow.h" -#include "fdbclient/IncludeVersions.h" +#include "fdbclient/versions.h" #include "fdbserver/Knobs.h" #include "flow/actorcompiler.h" // This must be the last #include. 
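The hunks above wire a new shard-metrics listing path: the tracker walks the shards in the requested range, stops after `shardLimit` entries, and if any shard's stats are not yet sampled it waits for a change notification and rebuilds the whole list before replying (or times out). The sketch below is not part of this patch; it is a simplified, self-contained illustration of that collect-or-retry idea in plain C++, and the names `Shard`, `DDMetricsEntry`, and `collectShardMetrics` are hypothetical.

```
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

// Illustrative stand-in for a tracked shard whose size stats may not be sampled yet.
struct Shard {
	std::string beginKey;
	std::optional<int64_t> bytes; // empty until the tracker has metrics for this shard
};

// Illustrative stand-in for the per-shard metrics entries returned to the client.
struct DDMetricsEntry {
	int64_t shardBytes;
	std::string beginKey;
};

// Collect up to maxShards entries; return nullopt if any shard in the range has no stats yet,
// in which case the caller should wait for an update and rebuild the whole list.
std::optional<std::vector<DDMetricsEntry>> collectShardMetrics(const std::vector<Shard>& shards, int maxShards) {
	std::vector<DDMetricsEntry> result;
	for (const Shard& s : shards) {
		if (!s.bytes.has_value()) {
			return std::nullopt; // stats not ready; retry after the next change notification
		}
		result.push_back({ s.bytes.value(), s.beginKey });
		if ((int)result.size() >= maxShards) {
			break; // respect the requested shard limit
		}
	}
	return result;
}
```

A caller would loop: call `collectShardMetrics`, and when it returns `nullopt`, wait for the shard stats to change and try again, which mirrors how `fetchShardMetricsList_impl` uses `onChange` above.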
diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index f1b1b26034..12fdadf4b3 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -387,7 +387,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( BACKUP_TIMEOUT, 0.4 ); init( BACKUP_NOOP_POP_DELAY, 5.0 ); init( BACKUP_FILE_BLOCK_BYTES, 1024 * 1024 ); - init( BACKUP_UPLOAD_DELAY, 10.0 ); if( randomize && BUGGIFY ) BACKUP_UPLOAD_DELAY = deterministicRandom()->random01() * 20; // TODO: Increase delay range + init( BACKUP_LOCK_BYTES, 3e9 ); if(randomize && BUGGIFY) BACKUP_LOCK_BYTES = deterministicRandom()->randomInt(1024, 4096) * 1024; + init( BACKUP_UPLOAD_DELAY, 10.0 ); if(randomize && BUGGIFY) BACKUP_UPLOAD_DELAY = deterministicRandom()->random01() * 60; //Cluster Controller init( CLUSTER_CONTROLLER_LOGGING_DELAY, 5.0 ); @@ -629,6 +630,13 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( REDWOOD_DEFAULT_PAGE_SIZE, 4096 ); init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 ); init( REDWOOD_PAGE_REBUILD_FILL_FACTOR, 0.66 ); + init( REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES, 10 ); + init( REDWOOD_LAZY_CLEAR_MIN_PAGES, 0 ); + init( REDWOOD_LAZY_CLEAR_MAX_PAGES, 1e6 ); + init( REDWOOD_REMAP_CLEANUP_BATCH_SIZE, 5000 ); + init( REDWOOD_REMAP_CLEANUP_VERSION_LAG_MIN, 4 ); + init( REDWOOD_REMAP_CLEANUP_VERSION_LAG_MAX, 15 ); + init( REDWOOD_LOGGING_INTERVAL, 5.0 ); // clang-format on diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 0569660c40..e91a03a42b 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -179,7 +179,7 @@ public: int64_t DD_SS_FAILURE_VERSIONLAG; // Allowed SS version lag from the current read version before marking it as failed. int64_t DD_SS_ALLOWED_VERSIONLAG; // SS will be marked as healthy if it's version lag goes below this value. double DD_SS_STUCK_TIME_LIMIT; // If a storage server is not getting new versions for this amount of time, then it becomes undesired. - + // TeamRemover to remove redundant teams bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor double TR_REMOVE_MACHINE_TEAM_DELAY; // wait for the specified time before try to remove next machine team @@ -313,6 +313,7 @@ public: double BACKUP_TIMEOUT; // master's reaction time for backup failure double BACKUP_NOOP_POP_DELAY; int BACKUP_FILE_BLOCK_BYTES; + int64_t BACKUP_LOCK_BYTES; double BACKUP_UPLOAD_DELAY; //Cluster Controller @@ -561,6 +562,13 @@ public: int REDWOOD_DEFAULT_PAGE_SIZE; // Page size for new Redwood files int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress. double REDWOOD_PAGE_REBUILD_FILL_FACTOR; // When rebuilding pages, start a new page after this capacity + int REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; // Number of pages to try to pop from the lazy delete queue and process at once + int REDWOOD_LAZY_CLEAR_MIN_PAGES; // Minimum number of pages to free before ending a lazy clear cycle, unless the queue is empty + int REDWOOD_LAZY_CLEAR_MAX_PAGES; // Maximum number of pages to free before ending a lazy clear cycle, unless the queue is empty + int REDWOOD_REMAP_CLEANUP_BATCH_SIZE; // Number of queue entries for remap cleanup to process and potentially coalesce at once. 
+ int REDWOOD_REMAP_CLEANUP_VERSION_LAG_MIN; // Number of versions between head of remap queue and oldest retained version before remap cleanup starts + int REDWOOD_REMAP_CLEANUP_VERSION_LAG_MAX; // Number of versions between head of remap queue and oldest retained version before remap cleanup may stop + double REDWOOD_LOGGING_INTERVAL; ServerKnobs(); void initialize(bool randomize = false, ClientKnobs* clientKnobs = NULL, bool isSimulated = false); diff --git a/fdbserver/MasterInterface.h b/fdbserver/MasterInterface.h index ccf08e4f70..df76a8fcf0 100644 --- a/fdbserver/MasterInterface.h +++ b/fdbserver/MasterInterface.h @@ -33,7 +33,6 @@ typedef uint64_t DBRecoveryCount; struct MasterInterface { constexpr static FileIdentifier file_identifier = 5979145; LocalityData locality; - Endpoint base; RequestStream< ReplyPromise > waitFailure; RequestStream< struct TLogRejoinRequest > tlogRejoin; // sent by tlog (whether or not rebooted) to communicate with a new master RequestStream< struct ChangeCoordinatorsRequest > changeCoordinators; @@ -49,13 +48,12 @@ struct MasterInterface { if constexpr (!is_fb_function) { ASSERT(ar.protocolVersion().isValid()); } - serializer(ar, locality, base); + serializer(ar, locality, waitFailure); if( Archive::isDeserializing ) { - waitFailure = RequestStream< ReplyPromise >( base.getAdjustedEndpoint(0) ); - tlogRejoin = RequestStream< struct TLogRejoinRequest >( base.getAdjustedEndpoint(1) ); - changeCoordinators = RequestStream< struct ChangeCoordinatorsRequest >( base.getAdjustedEndpoint(2) ); - getCommitVersion = RequestStream< struct GetCommitVersionRequest >( base.getAdjustedEndpoint(3) ); - notifyBackupWorkerDone = RequestStream( base.getAdjustedEndpoint(4) ); + tlogRejoin = RequestStream< struct TLogRejoinRequest >( waitFailure.getEndpoint().getAdjustedEndpoint(1) ); + changeCoordinators = RequestStream< struct ChangeCoordinatorsRequest >( waitFailure.getEndpoint().getAdjustedEndpoint(2) ); + getCommitVersion = RequestStream< struct GetCommitVersionRequest >( waitFailure.getEndpoint().getAdjustedEndpoint(3) ); + notifyBackupWorkerDone = RequestStream( waitFailure.getEndpoint().getAdjustedEndpoint(4) ); } } @@ -66,7 +64,7 @@ struct MasterInterface { streams.push_back(changeCoordinators.getReceiver()); streams.push_back(getCommitVersion.getReceiver(TaskPriority::GetConsistentReadVersion)); streams.push_back(notifyBackupWorkerDone.getReceiver()); - base = FlowTransport::transport().addEndpoints(streams); + FlowTransport::transport().addEndpoints(streams); } }; diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index fc70d975fe..af91c46105 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -1756,6 +1756,25 @@ ACTOR Future healthMetricsRequestServer(MasterProxyInterface proxy, GetHea } } +ACTOR Future ddMetricsRequestServer(MasterProxyInterface proxy, Reference> db) +{ + loop { + choose { + when(state GetDDMetricsRequest req = waitNext(proxy.getDDMetrics.getFuture())) + { + ErrorOr reply = wait(errorOr(db->get().distributor.get().dataDistributorMetrics.getReply(GetDataDistributorMetricsRequest(req.keys, req.shardLimit)))); + if ( reply.isError() ) { + req.reply.sendError(reply.getError()); + } else { + GetDDMetricsReply newReply; + newReply.storageMetricsList = reply.get().storageMetricsList; + req.reply.send(newReply); + } + } + } + } +} + ACTOR Future monitorRemoteCommitted(ProxyCommitData* self) { loop { wait(delay(0)); //allow this actor to be cancelled if we are removed 
after db changes. @@ -1996,6 +2015,7 @@ ACTOR Future masterProxyServerCore( addActor.send(readRequestServer(proxy, addActor, &commitData)); addActor.send(rejoinServer(proxy, &commitData)); addActor.send(healthMetricsRequestServer(proxy, &healthMetricsReply, &detailedHealthMetricsReply)); + addActor.send(ddMetricsRequestServer(proxy, db)); // wait for txnStateStore recovery wait(success(commitData.txnStateStore->readValue(StringRef()))); diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 6f3ce7c2d7..b65fe21f52 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -782,7 +782,7 @@ ACTOR Future monitorThrottlingChanges(RatekeeperData *self) { TransactionTag tag = *tagKey.tags.begin(); Optional oldLimits = self->throttledTags.getManualTagThrottleLimits(tag, tagKey.priority); - if(tagKey.autoThrottled) { + if(tagKey.throttleType == TagThrottleType::AUTO) { updatedTagThrottles.autoThrottleTag(self->id, tag, 0, tagValue.tpsRate, tagValue.expirationTime); } else { @@ -819,7 +819,7 @@ void tryAutoThrottleTag(RatekeeperData *self, StorageQueueInfo const& ss, RkTagT TagSet tags; tags.addTag(ss.busiestTag.get()); - self->addActor.send(ThrottleApi::throttleTags(self->db, tags, clientRate.get(), SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION, true, TransactionPriority::DEFAULT, now() + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION)); + self->addActor.send(ThrottleApi::throttleTags(self->db, tags, clientRate.get(), SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION, TagThrottleType::AUTO, TransactionPriority::DEFAULT, now() + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION)); } } } diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 570d46d4bd..ccc1dfa9f3 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -217,6 +217,9 @@ ACTOR static Future _parsePartitionedLogFileOnLoader( VersionedMutationsMap::iterator it; bool inserted; std::tie(it, inserted) = kvOps.emplace(msgVersion, MutationsVec()); + // A clear mutation can be split into multiple mutations with the same (version, sub). + // See saveMutationsToFile(). Current tests only use one key range per backup, thus + // only one clear mutation is generated (i.e., always inserted). ASSERT(inserted); ArenaReader rd(buf.arena(), StringRef(message, msgSize), AssumeVersion(currentProtocolVersion)); diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 0a5ad07a37..ed381389c6 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -31,7 +31,7 @@ #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/BackupAgent.actor.h" -#include "fdbclient/IncludeVersions.h" +#include "fdbclient/versions.h" #include "flow/actorcompiler.h" // This must be the last #include. 
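The Ratekeeper hunk above replaces the bare `autoThrottled` boolean with an explicit `TagThrottleType`, so call sites such as `throttleTags` read as AUTO or MANUAL rather than an opaque true/false. A minimal standalone C++ illustration of that design choice follows; `applyThrottle` and its arguments are hypothetical and not part of the FoundationDB API.

```
#include <iostream>
#include <string>

// An explicit enum in place of a bool flag; callers state intent at the call site.
enum class TagThrottleType { MANUAL, AUTO };

void applyThrottle(const std::string& tag, double tpsRate, TagThrottleType type) {
	const char* kind = (type == TagThrottleType::AUTO) ? "auto" : "manual";
	std::cout << "throttle tag " << tag << " at " << tpsRate << " tps (" << kind << ")\n";
}

int main() {
	// Compare with applyThrottle("busy_tag", 100.0, true), where 'true' says nothing about what it toggles.
	applyThrottle("busy_tag", 100.0, TagThrottleType::AUTO);
	return 0;
}
```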
#undef max diff --git a/fdbserver/SkipList.cpp b/fdbserver/SkipList.cpp index 0cb3636aed..8cf6a5977e 100644 --- a/fdbserver/SkipList.cpp +++ b/fdbserver/SkipList.cpp @@ -728,7 +728,7 @@ StringRef setK(Arena& arena, int i) { #include "fdbserver/ConflictSet.h" struct ConflictSet { - ConflictSet() : oldestVersion(0) {} + ConflictSet() : oldestVersion(0), removalKey(makeString(0)) {} ~ConflictSet() {} SkipList versionHistory; diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index e1db7b9512..2e7f17b21c 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -377,9 +377,9 @@ JsonBuilderObject getLagObject(int64_t versions) { struct MachineMemoryInfo { double memoryUsage; - double numProcesses; + double aggregateLimit; - MachineMemoryInfo() : memoryUsage(0), numProcesses(0) {} + MachineMemoryInfo() : memoryUsage(0), aggregateLimit(0) {} bool valid() { return memoryUsage >= 0; } void invalidate() { memoryUsage = -1; } @@ -613,11 +613,12 @@ ACTOR static Future processStatusFetcher( try { ASSERT(pMetrics.count(workerItr->interf.address())); const TraceEventFields& processMetrics = pMetrics[workerItr->interf.address()]; + const TraceEventFields& programStart = programStarts[workerItr->interf.address()]; if(memInfo->second.valid()) { - if(processMetrics.size() > 0) { + if(processMetrics.size() > 0 && programStart.size() > 0) { memInfo->second.memoryUsage += processMetrics.getDouble("Memory"); - ++memInfo->second.numProcesses; + memInfo->second.aggregateLimit += programStart.getDouble("MemoryLimit"); } else memInfo->second.invalidate(); @@ -789,19 +790,21 @@ ACTOR static Future processStatusFetcher( memoryObj.setKeyRawNumber("unused_allocated_memory", processMetrics.getValue("UnusedAllocatedMemory")); } + int64_t memoryLimit = 0; if (programStarts.count(address)) { - auto const& psxml = programStarts.at(address); + auto const& programStartEvent = programStarts.at(address); - if(psxml.size() > 0) { - memoryObj.setKeyRawNumber("limit_bytes",psxml.getValue("MemoryLimit")); + if(programStartEvent.size() > 0) { + memoryLimit = programStartEvent.getInt64("MemoryLimit"); + memoryObj.setKey("limit_bytes", memoryLimit); std::string version; - if (psxml.tryGetValue("Version", version)) { + if (programStartEvent.tryGetValue("Version", version)) { statusObj["version"] = version; } std::string commandLine; - if (psxml.tryGetValue("CommandLine", commandLine)) { + if (programStartEvent.tryGetValue("CommandLine", commandLine)) { statusObj["command_line"] = commandLine; } } @@ -813,10 +816,10 @@ ACTOR static Future processStatusFetcher( availableMemory = mMetrics[address].getDouble("AvailableMemory"); auto machineMemInfo = machineMemoryUsage[workerItr->interf.locality.machineId()]; - if (machineMemInfo.valid()) { - ASSERT(machineMemInfo.numProcesses > 0); - int64_t memory = (availableMemory + machineMemInfo.memoryUsage) / machineMemInfo.numProcesses; - memoryObj["available_bytes"] = std::max(memory, 0); + if (machineMemInfo.valid() && memoryLimit > 0) { + ASSERT(machineMemInfo.aggregateLimit > 0); + int64_t memory = (availableMemory + machineMemInfo.memoryUsage) * memoryLimit / machineMemInfo.aggregateLimit; + memoryObj["available_bytes"] = std::min(std::max(memory, 0), memoryLimit); } } @@ -1725,10 +1728,6 @@ ACTOR static Future workloadStatusFetcher(Reference peekMessages; RequestStream< struct TLogPopRequest > popMessages; @@ -75,7 +74,7 @@ struct TLogInterface { streams.push_back(disablePopRequest.getReceiver()); streams.push_back(enablePopRequest.getReceiver()); 
streams.push_back(snapRequest.getReceiver()); - base = FlowTransport::transport().addEndpoints(streams); + FlowTransport::transport().addEndpoints(streams); } template @@ -83,19 +82,18 @@ struct TLogInterface { if constexpr (!is_fb_function) { ASSERT(ar.isDeserializing || uniqueID != UID()); } - serializer(ar, uniqueID, sharedTLogID, filteredLocality, base); + serializer(ar, uniqueID, sharedTLogID, filteredLocality, peekMessages); if( Ar::isDeserializing ) { - peekMessages = RequestStream< struct TLogPeekRequest >( base.getAdjustedEndpoint(0) ); - popMessages = RequestStream< struct TLogPopRequest >( base.getAdjustedEndpoint(1) ); - commit = RequestStream< struct TLogCommitRequest >( base.getAdjustedEndpoint(2) ); - lock = RequestStream< ReplyPromise< struct TLogLockResult > >( base.getAdjustedEndpoint(3) ); - getQueuingMetrics = RequestStream< struct TLogQueuingMetricsRequest >( base.getAdjustedEndpoint(4) ); - confirmRunning = RequestStream< struct TLogConfirmRunningRequest >( base.getAdjustedEndpoint(5) ); - waitFailure = RequestStream< ReplyPromise >( base.getAdjustedEndpoint(6) ); - recoveryFinished = RequestStream< struct TLogRecoveryFinishedRequest >( base.getAdjustedEndpoint(7) ); - disablePopRequest = RequestStream< struct TLogDisablePopRequest >( base.getAdjustedEndpoint(8) ); - enablePopRequest = RequestStream< struct TLogEnablePopRequest >( base.getAdjustedEndpoint(9) ); - snapRequest = RequestStream< struct TLogSnapRequest >( base.getAdjustedEndpoint(10) ); + popMessages = RequestStream< struct TLogPopRequest >( peekMessages.getEndpoint().getAdjustedEndpoint(1) ); + commit = RequestStream< struct TLogCommitRequest >( peekMessages.getEndpoint().getAdjustedEndpoint(2) ); + lock = RequestStream< ReplyPromise< struct TLogLockResult > >( peekMessages.getEndpoint().getAdjustedEndpoint(3) ); + getQueuingMetrics = RequestStream< struct TLogQueuingMetricsRequest >( peekMessages.getEndpoint().getAdjustedEndpoint(4) ); + confirmRunning = RequestStream< struct TLogConfirmRunningRequest >( peekMessages.getEndpoint().getAdjustedEndpoint(5) ); + waitFailure = RequestStream< ReplyPromise >( peekMessages.getEndpoint().getAdjustedEndpoint(6) ); + recoveryFinished = RequestStream< struct TLogRecoveryFinishedRequest >( peekMessages.getEndpoint().getAdjustedEndpoint(7) ); + disablePopRequest = RequestStream< struct TLogDisablePopRequest >( peekMessages.getEndpoint().getAdjustedEndpoint(8) ); + enablePopRequest = RequestStream< struct TLogEnablePopRequest >( peekMessages.getEndpoint().getAdjustedEndpoint(9) ); + snapRequest = RequestStream< struct TLogSnapRequest >( peekMessages.getEndpoint().getAdjustedEndpoint(10) ); } } }; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index ded6da82f2..2959c06b2a 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -542,6 +542,16 @@ public: Future>> peekAll() { return peekAll_impl(this); } + ACTOR static Future> peek_impl(FIFOQueue* self) { + state Cursor c; + c.initReadOnly(self->headReader); + + Optional x = wait(c.readNext()); + return x; + } + + Future> peek() { return peek_impl(this); } + // Pop the next item on front of queue if it is <= upperBound or if upperBound is not present Future> pop(Optional upperBound = {}) { return headReader.readNext(upperBound); } @@ -730,6 +740,201 @@ private: uint8_t* buffer; }; +struct RedwoodMetrics { + static constexpr int btreeLevels = 5; + + RedwoodMetrics() { clear(); } + + void clear() { + memset(this, 0, sizeof(RedwoodMetrics)); + for 
(auto& level : levels) { + level = {}; + } + startTime = g_network ? now() : 0; + } + + struct Level { + unsigned int pageRead; + unsigned int pageReadExt; + unsigned int pageBuild; + unsigned int pageBuildExt; + unsigned int pageCommitStart; + unsigned int pageModify; + unsigned int pageModifyExt; + unsigned int lazyClearRequeue; + unsigned int lazyClearRequeueExt; + unsigned int lazyClearFree; + unsigned int lazyClearFreeExt; + double buildStoredPct; + double buildFillPct; + unsigned int buildItemCount; + double modifyStoredPct; + double modifyFillPct; + unsigned int modifyItemCount; + }; + + Level levels[btreeLevels]; + + unsigned int opSet; + unsigned int opSetKeyBytes; + unsigned int opSetValueBytes; + unsigned int opClear; + unsigned int opClearKey; + unsigned int opCommit; + unsigned int opGet; + unsigned int opGetRange; + unsigned int pagerDiskWrite; + unsigned int pagerDiskRead; + unsigned int pagerRemapFree; + unsigned int pagerRemapCopy; + unsigned int pagerRemapSkip; + unsigned int pagerCacheHit; + unsigned int pagerCacheMiss; + unsigned int pagerProbeHit; + unsigned int pagerProbeMiss; + unsigned int pagerEvictUnhit; + unsigned int pagerEvictFail; + unsigned int btreeLeafPreload; + unsigned int btreeLeafPreloadExt; + + double startTime; + + Level& level(unsigned int level) { + static Level outOfBound; + if (level == 0 || level > btreeLevels) { + return outOfBound; + } + return levels[level - 1]; + } + + // This will populate a trace event and/or a string with Redwood metrics. The string is a + // reasonably well formatted page of information + void getFields(TraceEvent* e, std::string* s = nullptr) { + std::pair metrics[] = { { "BTreePreload", btreeLeafPreload }, + { "BTreePreloadExt", btreeLeafPreloadExt }, + { "", 0 }, + { "OpSet", opSet }, + { "OpSetKeyBytes", opSetKeyBytes }, + { "OpSetValueBytes", opSetValueBytes }, + { "OpClear", opClear }, + { "OpClearKey", opClearKey }, + { "", 0 }, + { "OpGet", opGet }, + { "OpGetRange", opGetRange }, + { "OpCommit", opCommit }, + { "", 0 }, + { "PagerDiskWrite", pagerDiskWrite }, + { "PagerDiskRead", pagerDiskRead }, + { "PagerCacheHit", pagerCacheHit }, + { "PagerCacheMiss", pagerCacheMiss }, + { "", 0 }, + { "PagerProbeHit", pagerProbeHit }, + { "PagerProbeMiss", pagerProbeMiss }, + { "PagerEvictUnhit", pagerEvictUnhit }, + { "PagerEvictFail", pagerEvictFail }, + { "", 0 }, + { "PagerRemapFree", pagerRemapFree }, + { "PagerRemapCopy", pagerRemapCopy }, + { "PagerRemapSkip", pagerRemapSkip } }; + double elapsed = now() - startTime; + for (auto& m : metrics) { + if (*m.first == '\0') { + if (s != nullptr) { + *s += "\n"; + } + } else { + if (s != nullptr) { + *s += format("%-15s %-8u %8u/s ", m.first, m.second, int(m.second / elapsed)); + } + if (e != nullptr) { + e->detail(m.first, m.second); + } + } + } + + for (int i = 0; i < btreeLevels; ++i) { + auto& level = levels[i]; + std::pair metrics[] = { + { "PageBuild", level.pageBuild }, + { "PageBuildExt", level.pageBuildExt }, + { "PageModify", level.pageModify }, + { "PageModifyExt", level.pageModifyExt }, + { "", 0 }, + { "PageRead", level.pageRead }, + { "PageReadExt", level.pageReadExt }, + { "PageCommitStart", level.pageCommitStart }, + { "", 0 }, + { "LazyClearInt", level.lazyClearRequeue }, + { "LazyClearIntExt", level.lazyClearRequeueExt }, + { "LazyClear", level.lazyClearFree }, + { "LazyClearExt", level.lazyClearFreeExt }, + { "", 0 }, + { "-BldAvgCount", level.pageBuild ? level.buildItemCount / level.pageBuild : 0 }, + { "-BldAvgFillPct", level.pageBuild ? 
level.buildFillPct / level.pageBuild * 100 : 0 }, + { "-BldAvgStoredPct", level.pageBuild ? level.buildStoredPct / level.pageBuild * 100 : 0 }, + { "", 0 }, + { "-ModAvgCount", level.pageModify ? level.modifyItemCount / level.pageModify : 0 }, + { "-ModAvgFillPct", level.pageModify ? level.modifyFillPct / level.pageModify * 100 : 0 }, + { "-ModAvgStoredPct", level.pageModify ? level.modifyStoredPct / level.pageModify * 100 : 0 } + }; + + if (s != nullptr) { + *s += format("\nLevel %d\n\t", i + 1); + } + for (auto& m : metrics) { + const char* name = m.first; + bool rate = elapsed != 0; + if (*name == '-') { + ++name; + rate = false; + } + + if (*name == '\0') { + if (s != nullptr) { + *s += "\n\t"; + } + } else { + if (s != nullptr) { + *s += format("%-15s %8u %8u/s ", name, m.second, rate ? int(m.second / elapsed) : 0); + } + if (e != nullptr) { + e->detail(format("L%d%s", i + 1, name), m.second); + } + } + } + } + } + + std::string toString(bool clearAfter) { + std::string s; + getFields(nullptr, &s); + + if (clearAfter) { + clear(); + } + + return s; + } +}; + +// Using a global for Redwood metrics because a single process shouldn't normally have multiple storage engines +RedwoodMetrics g_redwoodMetrics = {}; +Future g_redwoodMetricsActor; + +ACTOR Future redwoodMetricsLogger() { + g_redwoodMetrics.clear(); + + loop { + wait(delay(SERVER_KNOBS->REDWOOD_LOGGING_INTERVAL)); + + TraceEvent e("RedwoodMetrics"); + double elapsed = now() - g_redwoodMetrics.startTime; + e.detail("Elapsed", elapsed); + g_redwoodMetrics.getFields(&e); + g_redwoodMetrics.clear(); + } +} + // Holds an index of recently used objects. // ObjectType must have the methods // bool evictable() const; // return true if the entry can be evicted @@ -748,8 +953,7 @@ class ObjectCache : NonCopyable { typedef boost::intrusive::list EvictionOrderT; public: - ObjectCache(int sizeLimit = 1) - : sizeLimit(sizeLimit), cacheHits(0), cacheMisses(0), noHitEvictions(0), failedEvictions(0) {} + ObjectCache(int sizeLimit = 1) : sizeLimit(sizeLimit) {} void setSizeLimit(int n) { ASSERT(n > 0); @@ -762,52 +966,64 @@ public: auto i = cache.find(index); if (i != cache.end()) { ++i->second.hits; + ++g_redwoodMetrics.pagerProbeHit; return &i->second.item; } + ++g_redwoodMetrics.pagerProbeMiss; return nullptr; } // Get the object for i or create a new one. // After a get(), the object for i is the last in evictionOrder. - ObjectType& get(const IndexType& index, bool noHit = false) { + // If noHit is set, do not consider this access to be cache hit if the object is present + // If noMiss is set, do not consider this access to be a cache miss if the object is not present + ObjectType& get(const IndexType& index, bool noHit = false, bool noMiss = false) { Entry& entry = cache[index]; // If entry is linked into evictionOrder then move it to the back of the order if (entry.is_linked()) { if (!noHit) { ++entry.hits; - ++cacheHits; + ++g_redwoodMetrics.pagerCacheHit; + + // Move the entry to the back of the eviction order + evictionOrder.erase(evictionOrder.iterator_to(entry)); + evictionOrder.push_back(entry); } - // Move the entry to the back of the eviction order - evictionOrder.erase(evictionOrder.iterator_to(entry)); - evictionOrder.push_back(entry); } else { - ++cacheMisses; + if (!noMiss) { + ++g_redwoodMetrics.pagerCacheMiss; + } // Finish initializing entry entry.index = index; - entry.hits = noHit ? 
0 : 1; + entry.hits = 0; // Insert the newly created Entry at the back of the eviction order evictionOrder.push_back(entry); // While the cache is too big, evict the oldest entry until the oldest entry can't be evicted. while (cache.size() > sizeLimit) { Entry& toEvict = evictionOrder.front(); + + // It's critical that we do not evict the item we just added because it would cause the reference + // returned to be invalid. An eviction could happen with a no-hit access to a cache resident page + // that is currently evictable and exists in the oversized portion of the cache eviction order due + // to previously failed evictions. + if (&entry == &toEvict) { + debug_printf("Cannot evict target index %s\n", toString(index).c_str()); + break; + } + debug_printf("Trying to evict %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str()); - // It's critical that we do not evict the item we just added (or the reference we return would be - // invalid) but since sizeLimit must be > 0, entry was just added to the end of the evictionOrder, and - // this loop will end if we move anything to the end of the eviction order, we can be guaraunted that - // entry != toEvict, so we do not need to check. If the item is not evictable then move it to the back - // of the eviction order and stop. if (!toEvict.item.evictable()) { evictionOrder.erase(evictionOrder.iterator_to(toEvict)); evictionOrder.push_back(toEvict); - ++failedEvictions; + ++g_redwoodMetrics.pagerEvictFail; break; } else { if (toEvict.hits == 0) { - ++noHitEvictions; + ++g_redwoodMetrics.pagerEvictUnhit; } debug_printf("Evicting %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str()); @@ -858,10 +1074,6 @@ public: private: int64_t sizeLimit; - int64_t cacheHits; - int64_t cacheMisses; - int64_t noHitEvictions; - int64_t failedEvictions; CacheT cache; EvictionOrderT evictionOrder; @@ -910,6 +1122,9 @@ public: }; struct RemappedPage { + RemappedPage() : version(invalidVersion) {} + RemappedPage(Version v, LogicalPageID o, LogicalPageID n) : version(v), originalPageID(o), newPageID(n) {} + Version version; LogicalPageID originalPageID; LogicalPageID newPageID; @@ -933,6 +1148,11 @@ public: DWALPager(int desiredPageSize, std::string filename, int64_t pageCacheSizeBytes, bool memoryOnly = false) : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes), memoryOnly(memoryOnly) { + + if (!g_redwoodMetricsActor.isValid()) { + g_redwoodMetricsActor = redwoodMetricsLogger(); + } + if (pageCacheBytes == 0) { pageCacheBytes = g_network->isSimulated() ? (BUGGIFY ? FLOW_KNOBS->BUGGIFY_SIM_PAGE_CACHE_4K : FLOW_KNOBS->SIM_PAGE_CACHE_4K) @@ -961,7 +1181,6 @@ public: ACTOR static Future recover(DWALPager* self) { ASSERT(!self->recoverFuture.isValid()); - self->remapUndoFuture = Void(); state bool exists = false; if (!self->memoryOnly) { @@ -1067,6 +1286,7 @@ public: // header) self->updateCommittedHeader(); self->addLatestSnapshot(); + self->remapCleanupFuture = remapCleanup(self); } else { // Note: If the file contains less than 2 pages but more than 0 bytes then the pager was never successfully // committed. A new pager will be created in its place. @@ -1111,6 +1331,7 @@ public: // Since there is no previously committed header use the initial header for the initial commit. 
self->updateCommittedHeader(); + self->remapCleanupFuture = Void(); wait(self->commit()); } @@ -1170,6 +1391,7 @@ public: debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), (header ? "writePhysicalHeader" : "writePhysical"), toString(pageID).c_str(), page->begin()); + ++g_redwoodMetrics.pagerDiskWrite; VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); ((Page*)page.getPtr())->updateChecksum(pageID); @@ -1196,7 +1418,8 @@ public: void updatePage(LogicalPageID pageID, Reference data) override { // Get the cache entry for this page, without counting it as a cache hit as we're replacing its contents now - PageCacheEntry& cacheEntry = pageCache.get(pageID, true); + // or as a cache miss because there is no benefit to the page already being in cache + PageCacheEntry& cacheEntry = pageCache.get(pageID, true, true); debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), @@ -1234,7 +1457,6 @@ public: Future atomicUpdatePage(LogicalPageID pageID, Reference data, Version v) override { debug_printf("DWALPager(%s) op=writeAtomic %s @%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v); - // This pager does not support atomic update, so it always allocates and uses a new pageID Future f = map(newPageID(), [=](LogicalPageID newPageID) { updatePage(newPageID, data); // TODO: Possibly limit size of remap queue since it must be recovered on cold start @@ -1249,16 +1471,7 @@ public: return f; } - void freePage(LogicalPageID pageID, Version v) override { - // If pageID has been remapped, then it can't be freed until all existing remaps for that page have been undone, - // so queue it for later deletion - if (remappedPages.find(pageID) != remappedPages.end()) { - debug_printf("DWALPager(%s) op=freeRemapped %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), - toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); - remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID }); - return; - } - + void freeUnmappedPage(LogicalPageID pageID, Version v) { // If v is older than the oldest version still readable then mark pageID as free as of the next commit if (v < effectiveOldestVersion()) { debug_printf("DWALPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), @@ -1270,6 +1483,19 @@ public: toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); delayedFreeList.pushBack({ v, pageID }); } + } + + void freePage(LogicalPageID pageID, Version v) override { + // If pageID has been remapped, then it can't be freed until all existing remaps for that page have been undone, + // so queue it for later deletion + if (remappedPages.find(pageID) != remappedPages.end()) { + debug_printf("DWALPager(%s) op=freeRemapped %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), + toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID }); + return; + } + + freeUnmappedPage(pageID, v); }; // Read a physical page from the page file. 
Note that header pages use a page size of smallestPhysicalBlock @@ -1278,6 +1504,7 @@ public: ACTOR static Future> readPhysicalPage(DWALPager* self, PhysicalPageID pageID, bool header = false) { ASSERT(!self->memoryOnly); + ++g_redwoodMetrics.pagerDiskRead; if (g_network->getCurrentTask() > TaskPriority::DiskRead) { wait(delay(0, TaskPriority::DiskRead)); @@ -1320,7 +1547,8 @@ public: return readPhysicalPage(self, pageID, true); } - // Reads the most recent version of pageID either committed or written using updatePage() + // Reads the most recent version of pageID, either previously committed or written using updatePage() in the current + // commit Future> readPage(LogicalPageID pageID, bool cacheable, bool noHit = false) override { // Use cached page if present, without triggering a cache hit. // Otherwise, read the page and return it but don't add it to the cache @@ -1393,56 +1621,134 @@ public: return std::min(pLastCommittedHeader->oldestVersion, snapshots.front().version); } - ACTOR static Future undoRemaps(DWALPager* self) { + ACTOR static Future remapCopyAndFree(DWALPager* self, RemappedPage m) { + debug_printf("DWALPager(%s) remapCleanup copyAndFree %s\n", self->filename.c_str(), m.toString().c_str()); + + // Read the data from the page that the original was mapped to + Reference data = wait(self->readPage(m.newPageID, false)); + + // Write the data to the original page so it can be read using its original pageID + self->updatePage(m.originalPageID, data); + ++g_redwoodMetrics.pagerRemapCopy; + + // Remove all remaps for the original page ID up through version + auto i = self->remappedPages.find(m.originalPageID); + i->second.erase(i->second.begin(), i->second.upper_bound(m.version)); + // If the version map for this page is now empty, erase it + if (i->second.empty()) { + self->remappedPages.erase(i); + } + + // Now that the remap has been undone nothing will read this page so it can be freed as of the next + // commit. + self->freeUnmappedPage(m.newPageID, 0); + ++g_redwoodMetrics.pagerRemapFree; + + return Void(); + } + + ACTOR static Future getRemapLag(DWALPager* self) { + Optional head = wait(self->remapQueue.peek()); + if (head.present()) { + return self->effectiveOldestVersion() - head.get().version; + } + return 0; + } + + ACTOR static Future remapCleanup(DWALPager* self) { + self->remapCleanupStop = false; + + // Cutoff is the version we can pop to state RemappedPage cutoff; cutoff.version = self->effectiveOldestVersion(); - // TODO: Use parallel reads - // TODO: One run of this actor might write to the same original page more than once, in which case just unmap - // the latest + // Each page is only updated at most once per version, so in order to coalesce multiple updates + // to the same page and skip some page writes we have to accumulate multiple versions worth of + // poppable entries. + Version lag = wait(getRemapLag(self)); + debug_printf("DWALPager(%s) remapCleanup versionLag=%" PRId64 "\n", self->filename.c_str(), lag); + if (lag < SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_VERSION_LAG_MIN) { + debug_printf("DWALPager(%s) not starting, lag too low\n", self->filename.c_str()); + return Void(); + } + loop { - if (self->remapUndoStop) { - break; - } - state Optional p = wait(self->remapQueue.pop(cutoff)); - if (!p.present()) { - break; - } - debug_printf("DWALPager(%s) undoRemaps popped %s\n", self->filename.c_str(), p.get().toString().c_str()); + // Pop up to the pop size limit from the queue, but only keep the latest remap queue entry per + // original page ID. 
This will coalesce multiple remaps of the same LogicalPageID within the + // interval of pages being unmapped to a single page copy. + state int toPop = SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_BATCH_SIZE; + state std::unordered_map toCopy; + toCopy.reserve(toPop); - if (p.get().newPageID == invalidLogicalPageID) { - debug_printf("DWALPager(%s) undoRemaps freeing %s\n", self->filename.c_str(), - p.get().toString().c_str()); - self->freePage(p.get().originalPageID, p.get().version); - } else { - // Read the data from the page that the original was mapped to - Reference data = wait(self->readPage(p.get().newPageID, false)); - - // Write the data to the original page so it can be read using its original pageID - self->updatePage(p.get().originalPageID, data); - - // Remove the remap from this page, deleting the entry for the pageID if its map becomes empty - auto i = self->remappedPages.find(p.get().originalPageID); - if (i->second.size() == 1) { - self->remappedPages.erase(i); - } else { - i->second.erase(p.get().version); + // Take up to batch size pages from front of queue + while (toPop > 0) { + state Optional p = wait(self->remapQueue.pop(cutoff)); + debug_printf("DWALPager(%s) remapCleanup popped %s\n", self->filename.c_str(), ::toString(p).c_str()); + if (!p.present()) { + break; } - // Now that the remap has been undone nothing will read this page so it can be freed as of the next - // commit. - self->freePage(p.get().newPageID, 0); + // Get the existing remap entry for the original page, which could be newly initialized + auto& m = toCopy[p.get().originalPageID]; + // If version is invalid then this is a newly constructed RemappedPage, so copy p.get() over it + if (m.version != invalidVersion) { + ASSERT(m.version < p.get().version); + ASSERT(m.newPageID != invalidLogicalPageID); + // We're replacing a previously popped item so we can avoid copying it over the original. + debug_printf("DWALPager(%s) remapCleanup elided %s\n", self->filename.c_str(), + m.toString().c_str()); + // The remapped pages entries will be cleaned up below. 
+ self->freeUnmappedPage(m.newPageID, 0); + ++g_redwoodMetrics.pagerRemapFree; + ++g_redwoodMetrics.pagerRemapSkip; + } + m = p.get(); + + --toPop; + } + + std::vector> copies; + + for (auto& e : toCopy) { + const RemappedPage& m = e.second; + // If newPageID is invalid, originalPageID page was freed at version, not remapped + if (m.newPageID == invalidLogicalPageID) { + debug_printf("DWALPager(%s) remapCleanup freeNoCopy %s\n", self->filename.c_str(), + m.toString().c_str()); + self->remappedPages.erase(m.originalPageID); + self->freeUnmappedPage(m.originalPageID, 0); + ++g_redwoodMetrics.pagerRemapFree; + } else { + copies.push_back(remapCopyAndFree(self, m)); + } + } + + wait(waitForAll(copies)); + + // Stop if there was nothing more that could be popped + if (toPop > 0) { + break; + } + + // If the stop flag is set then stop but only if the remap lag is below the maximum allowed + if (self->remapCleanupStop) { + Version lag = wait(getRemapLag(self)); + if (lag <= SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_VERSION_LAG_MAX) { + break; + } else { + debug_printf("DWALPager(%s) remapCleanup refusing to stop, versionLag=%" PRId64 "\n", + self->filename.c_str(), lag); + } } } - debug_printf("DWALPager(%s) undoRemaps stopped, remapQueue size is %d\n", self->filename.c_str(), - self->remapQueue.numEntries); + debug_printf("DWALPager(%s) remapCleanup stopped (stop=%d)\n", self->filename.c_str(), self->remapCleanupStop); return Void(); } // Flush all queues so they have no operations pending. ACTOR static Future flushQueues(DWALPager* self) { - ASSERT(self->remapUndoFuture.isReady()); + ASSERT(self->remapCleanupFuture.isReady()); // Flush remap queue separately, it's not involved in free page management wait(self->remapQueue.flush()); @@ -1472,8 +1778,8 @@ public: self->writeHeaderPage(1, self->lastCommittedHeaderPage); // Trigger the remap eraser to stop and then wait for it. 
- self->remapUndoStop = true; - wait(self->remapUndoFuture); + self->remapCleanupStop = true; + wait(self->remapCleanupFuture); wait(flushQueues(self)); @@ -1518,8 +1824,7 @@ public: self->expireSnapshots(self->pHeader->oldestVersion); // Start unmapping pages for expired versions - self->remapUndoStop = false; - self->remapUndoFuture = undoRemaps(self); + self->remapCleanupFuture = remapCleanup(self); return Void(); } @@ -1543,7 +1848,7 @@ public: debug_printf("DWALPager(%s) shutdown cancel commit\n", self->filename.c_str()); self->commitFuture.cancel(); debug_printf("DWALPager(%s) shutdown cancel remap\n", self->filename.c_str()); - self->remapUndoFuture.cancel(); + self->remapCleanupFuture.cancel(); if (self->errorPromise.canBeSet()) { debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str()); @@ -1601,7 +1906,7 @@ public: ACTOR static Future getUserPageCount_cleanup(DWALPager* self) { // Wait for the remap eraser to finish all of its work (not triggering stop) - wait(self->remapUndoFuture); + wait(self->remapCleanupFuture); // Flush queues so there are no pending freelist operations wait(flushQueues(self)); @@ -1712,8 +2017,8 @@ private: Future commitFuture; SignalableActorCollection operations; Future recoverFuture; - Future remapUndoFuture; - bool remapUndoStop; + Future remapCleanupFuture; + bool remapCleanupStop; Reference pageFile; @@ -1984,6 +2289,7 @@ struct RedwoodRecordRef { Version version; int expectedSize() const { return key.expectedSize() + value.expectedSize(); } + int kvBytes() const { return expectedSize(); } class Reader { public: @@ -2433,8 +2739,8 @@ struct BTreePage { #pragma pack(pop) int size() const { - const BinaryTree* t = &tree(); - return (uint8_t*)t - (uint8_t*)this + t->size(); + auto& t = tree(); + return (uint8_t*)&t - (uint8_t*)this + t.size(); } bool isLeaf() const { return height == 1; } @@ -2557,11 +2863,11 @@ public: // A record which is greater than the last possible record in the tree static RedwoodRecordRef dbEnd; - struct LazyDeleteQueueEntry { + struct LazyClearQueueEntry { Version version; Standalone pageID; - bool operator<(const LazyDeleteQueueEntry& rhs) const { return version < rhs.version; } + bool operator<(const LazyClearQueueEntry& rhs) const { return version < rhs.version; } int readFromBytes(const uint8_t* src) { version = *(Version*)src; @@ -2584,15 +2890,15 @@ public: std::string toString() const { return format("{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); } }; - typedef FIFOQueue LazyDeleteQueueT; + typedef FIFOQueue LazyClearQueueT; #pragma pack(push, 1) struct MetaKey { - static constexpr int FORMAT_VERSION = 7; + static constexpr int FORMAT_VERSION = 8; // This serves as the format version for the entire tree, individual pages will not be versioned uint16_t formatVersion; uint8_t height; - LazyDeleteQueueT::QueueState lazyDeleteQueue; + LazyClearQueueT::QueueState lazyDeleteQueue; InPlaceArray root; KeyRef asKeyRef() const { return KeyRef((uint8_t*)this, sizeof(MetaKey) + root.extraSize()); } @@ -2609,68 +2915,6 @@ public: }; #pragma pack(pop) - struct Counts { - Counts() { - memset(this, 0, sizeof(Counts)); - startTime = g_network ? 
now() : 0; - } - - void clear() { *this = Counts(); } - - int64_t pageReads; - int64_t extPageReads; - int64_t pagePreloads; - int64_t extPagePreloads; - int64_t setBytes; - int64_t pageWrites; - int64_t extPageWrites; - int64_t sets; - int64_t clears; - int64_t clearSingleKey; - int64_t commits; - int64_t gets; - int64_t getRanges; - int64_t commitSubtreeStart; - int64_t pageUpdates; - double startTime; - - std::string toString(bool clearAfter = false) { - const char* labels[] = { "set", - "clear", - "clearSingleKey", - "get", - "getRange", - "commit", - "pageReads", - "extPageRead", - "pagePreloads", - "extPagePreloads", - "pageWrites", - "pageUpdates", - "extPageWrites", - "commitSubtreeStart" }; - const int64_t values[] = { - sets, clears, clearSingleKey, gets, getRanges, commits, pageReads, - extPageReads, pagePreloads, extPagePreloads, pageWrites, pageUpdates, extPageWrites, commitSubtreeStart - }; - - double elapsed = now() - startTime; - std::string s; - for (int i = 0; i < sizeof(values) / sizeof(int64_t); ++i) { - s += format("%s=%" PRId64 " (%d/s) ", labels[i], values[i], int(values[i] / elapsed)); - } - - if (clearAfter) { - clear(); - } - - return s; - } - }; - - // Using a static for metrics because a single process shouldn't normally have multiple storage engines - static Counts counts; - // All async opts on the btree are based on pager reads, writes, and commits, so // we can mostly forward these next few functions to the pager Future getError() { return m_pager->getError(); } @@ -2700,7 +2944,9 @@ public: // setWriteVersion() A write shall not become durable until the following call to commit() begins, and shall be // durable once the following call to commit() returns void set(KeyValueRef keyValue) { - ++counts.sets; + ++g_redwoodMetrics.opSet; + ++g_redwoodMetrics.opSetKeyBytes += keyValue.key.size(); + ++g_redwoodMetrics.opSetValueBytes += keyValue.value.size(); m_pBuffer->insert(keyValue.key).mutation().setBoundaryValue(m_pBuffer->copyToArena(keyValue.value)); } @@ -2708,13 +2954,13 @@ public: // Optimization for single key clears to create just one mutation boundary instead of two if (clearedRange.begin.size() == clearedRange.end.size() - 1 && clearedRange.end[clearedRange.end.size() - 1] == 0 && clearedRange.end.startsWith(clearedRange.begin)) { - ++counts.clears; - ++counts.clearSingleKey; + ++g_redwoodMetrics.opClear; + ++g_redwoodMetrics.opClearKey; m_pBuffer->insert(clearedRange.begin).mutation().clearBoundary(); return; } - ++counts.clears; + ++g_redwoodMetrics.opClear; MutationBuffer::iterator iBegin = m_pBuffer->insert(clearedRange.begin); MutationBuffer::iterator iEnd = m_pBuffer->insert(clearedRange.end); @@ -2743,42 +2989,46 @@ public: VersionedBTree(IPager2* pager, std::string name) : m_pager(pager), m_writeVersion(invalidVersion), m_lastCommittedVersion(invalidVersion), m_pBuffer(nullptr), m_name(name) { + m_lazyClearActor = 0; m_init = init_impl(this); m_latestCommit = m_init; } - ACTOR static Future incrementalSubtreeClear(VersionedBTree* self, bool* pStop = nullptr, int batchSize = 10, - unsigned int minPages = 0, - int maxPages = std::numeric_limits::max()) { + ACTOR static Future incrementalLazyClear(VersionedBTree* self) { + ASSERT(self->m_lazyClearActor.isReady()); + self->m_lazyClearStop = false; + // TODO: Is it contractually okay to always to read at the latest version? 
state Reference snapshot = self->m_pager->getReadSnapshot(self->m_pager->getLatestVersion()); state int freedPages = 0; loop { - state std::vector>>> entries; + state int toPop = SERVER_KNOBS->REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; + state std::vector>>> entries; + entries.reserve(toPop); // Take up to batchSize pages from front of queue - while (entries.size() < batchSize) { - Optional q = wait(self->m_lazyDeleteQueue.pop()); - debug_printf("LazyDelete: popped %s\n", toString(q).c_str()); + while (toPop > 0) { + Optional q = wait(self->m_lazyClearQueue.pop()); + debug_printf("LazyClear: popped %s\n", toString(q).c_str()); if (!q.present()) { break; } // Start reading the page, without caching entries.push_back( std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, nullptr, nullptr, true))); - } - if (entries.empty()) { - break; + --toPop; } state int i; for (i = 0; i < entries.size(); ++i) { Reference p = wait(entries[i].second); - const LazyDeleteQueueEntry& entry = entries[i].first; + const LazyClearQueueEntry& entry = entries[i].first; const BTreePage& btPage = *(BTreePage*)p->begin(); - debug_printf("LazyDelete: processing %s\n", toString(entry).c_str()); + auto& metrics = g_redwoodMetrics.level(btPage.height); + + debug_printf("LazyClear: processing %s\n", toString(entry).c_str()); // Level 1 (leaf) nodes should never be in the lazy delete queue ASSERT(btPage.height > 1); @@ -2792,15 +3042,19 @@ public: while (1) { if (c.get().value.present()) { BTreePageIDRef btChildPageID = c.get().getChildPage(); - // If this page is height 2, then the children are leaves so free + // If this page is height 2, then the children are leaves so free them directly if (btPage.height == 2) { - debug_printf("LazyDelete: freeing child %s\n", toString(btChildPageID).c_str()); + debug_printf("LazyClear: freeing child %s\n", toString(btChildPageID).c_str()); self->freeBtreePage(btChildPageID, v); freedPages += btChildPageID.size(); + metrics.lazyClearFree += 1; + metrics.lazyClearFreeExt += (btChildPageID.size() - 1); } else { // Otherwise, queue them for lazy delete. - debug_printf("LazyDelete: queuing child %s\n", toString(btChildPageID).c_str()); - self->m_lazyDeleteQueue.pushFront(LazyDeleteQueueEntry{ v, btChildPageID }); + debug_printf("LazyClear: queuing child %s\n", toString(btChildPageID).c_str()); + self->m_lazyClearQueue.pushFront(LazyClearQueueEntry{ v, btChildPageID }); + metrics.lazyClearRequeue += 1; + metrics.lazyClearRequeueExt += (btChildPageID.size() - 1); } } if (!c.moveNext()) { @@ -2809,25 +3063,32 @@ public: } // Free the page, now that its children have either been freed or queued - debug_printf("LazyDelete: freeing queue entry %s\n", toString(entry.pageID).c_str()); + debug_printf("LazyClear: freeing queue entry %s\n", toString(entry.pageID).c_str()); self->freeBtreePage(entry.pageID, v); freedPages += entry.pageID.size(); + metrics.lazyClearFree += 1; + metrics.lazyClearFreeExt += entry.pageID.size() - 1; } - // If stop is set and we've freed the minimum number of pages required, or the maximum is exceeded, return. 
- if ((freedPages >= minPages && pStop != nullptr && *pStop) || freedPages >= maxPages) { + // Stop if + // - the poppable items in the queue have already been exhausted + // - stop flag is set and we've freed the minimum number of pages required + // - maximum number of pages to free met or exceeded + if (toPop > 0 || (freedPages >= SERVER_KNOBS->REDWOOD_LAZY_CLEAR_MIN_PAGES && self->m_lazyClearStop) || + (freedPages >= SERVER_KNOBS->REDWOOD_LAZY_CLEAR_MAX_PAGES)) { break; } } - debug_printf("LazyDelete: freed %d pages, %s has %" PRId64 " entries\n", freedPages, - self->m_lazyDeleteQueue.name.c_str(), self->m_lazyDeleteQueue.numEntries); + debug_printf("LazyClear: freed %d pages, %s has %" PRId64 " entries\n", freedPages, + self->m_lazyClearQueue.name.c_str(), self->m_lazyClearQueue.numEntries); return freedPages; } ACTOR static Future init_impl(VersionedBTree* self) { wait(self->m_pager->init()); + self->m_blockSize = self->m_pager->getUsablePageSize(); state Version latest = self->m_pager->getLatestVersion(); self->m_newOldestVersion = self->m_pager->getOldestVersion(); @@ -2849,19 +3110,20 @@ public: self->m_pager->setCommitVersion(latest); LogicalPageID newQueuePage = wait(self->m_pager->newPageID()); - self->m_lazyDeleteQueue.create(self->m_pager, newQueuePage, "LazyDeleteQueue"); - self->m_header.lazyDeleteQueue = self->m_lazyDeleteQueue.getState(); + self->m_lazyClearQueue.create(self->m_pager, newQueuePage, "LazyClearQueue"); + self->m_header.lazyDeleteQueue = self->m_lazyClearQueue.getState(); self->m_pager->setMetaKey(self->m_header.asKeyRef()); wait(self->m_pager->commit()); debug_printf("Committed initial commit.\n"); } else { self->m_header.fromKeyRef(meta); - self->m_lazyDeleteQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyDeleteQueueRecovered"); + self->m_lazyClearQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyClearQueueRecovered"); } debug_printf("Recovered btree at version %" PRId64 ": %s\n", latest, self->m_header.toString().c_str()); self->m_lastCommittedVersion = latest; + self->m_lazyClearActor = incrementalLazyClear(self); return Void(); } @@ -2910,15 +3172,24 @@ public: ACTOR static Future destroyAndCheckSanity_impl(VersionedBTree* self) { ASSERT(g_network->isSimulated()); + // This isn't pretty but remap cleanup is controlled by knobs and for this test we need the entire remap queue + // to be processed. + const_cast(SERVER_KNOBS)->REDWOOD_REMAP_CLEANUP_VERSION_LAG_MIN = 0; + const_cast(SERVER_KNOBS)->REDWOOD_REMAP_CLEANUP_VERSION_LAG_MAX = 0; + debug_printf("Clearing tree.\n"); self->setWriteVersion(self->getLatestVersion() + 1); self->clear(KeyRangeRef(dbBegin.key, dbEnd.key)); + wait(self->commit()); + // Loop commits until the lazy delete queue is completely processed. loop { - state int freedPages = wait(self->incrementalSubtreeClear(self)); wait(self->commit()); - // Keep looping until the last commit doesn't do anything at all - if (self->m_lazyDeleteQueue.numEntries == 0 && freedPages == 0) { + + // If the lazy delete queue is completely processed then the last time the lazy delete actor + // was started, after the last commit, it would exit immediately and do no work, so its + // future would be ready and its value would be 0. + if (self->m_lazyClearActor.isReady() && self->m_lazyClearActor.get() == 0) { break; } self->setWriteVersion(self->getLatestVersion() + 1); @@ -2932,7 +3203,7 @@ public: // The lazy delete queue should now be empty and contain only the new page to start writing to // on the next commit.
- LazyDeleteQueueT::QueueState s = self->m_lazyDeleteQueue.getState(); + LazyClearQueueT::QueueState s = self->m_lazyClearQueue.getState(); ASSERT(s.numEntries == 0); ASSERT(s.numPages == 1); @@ -3167,6 +3438,7 @@ private: Future m_latestCommit; Future m_init; std::string m_name; + int m_blockSize; // MetaKey changes size so allocate space for it to expand into union { @@ -3174,7 +3446,9 @@ private: MetaKey m_header; }; - LazyDeleteQueueT m_lazyDeleteQueue; + LazyClearQueueT m_lazyClearQueue; + Future m_lazyClearActor; + bool m_lazyClearStop; // Writes entries to 1 or more pages and return a vector of boundary keys with their IPage(s) ACTOR static Future>> writePages( @@ -3184,7 +3458,7 @@ private: state Standalone> records; // This is how much space for the binary tree exists in the page, after the header - state int blockSize = self->m_pager->getUsablePageSize(); + state int blockSize = self->m_blockSize; state int pageSize = blockSize - sizeof(BTreePage); state int pageFillTarget = pageSize * SERVER_KNOBS->REDWOOD_PAGE_REBUILD_FILL_FACTOR; state int blockCount = 1; @@ -3223,14 +3497,12 @@ private: // overhead for the delta size must be assumed. int deltaSize = entry.deltaSize(base, skip, true); - int keySize = entry.key.size(); - int valueSize = entry.value.present() ? entry.value.get().size() : 0; - int nodeSize = BTreePage::BinaryTree::Node::headerSize(largeTree) + deltaSize; debug_printf("Adding %3d of %3lu (i=%3d) klen %4d vlen %5d nodeSize %5d deltaSize %5d page usage: " "%d/%d (%.2f%%) record=%s\n", - i + 1, entries.size(), i, keySize, valueSize, nodeSize, deltaSize, compressedBytes, - pageSize, (float)compressedBytes / pageSize * 100, entry.toString(height == 1).c_str()); + i + 1, entries.size(), i, entry.key.size(), entry.value.orDefault(StringRef()).size(), + nodeSize, deltaSize, compressedBytes, pageSize, (float)compressedBytes / pageSize * 100, + entry.toString(height == 1).c_str()); // While the node doesn't fit, expand the page. // This is a loop because if the page size moves into "large" range for DeltaTree @@ -3261,7 +3533,7 @@ private: pageFillTarget = pageSize * SERVER_KNOBS->REDWOOD_PAGE_REBUILD_FILL_FACTOR; } - kvBytes += keySize + valueSize; + kvBytes += entry.kvBytes(); compressedBytes += nodeSize; ++i; } @@ -3289,14 +3561,14 @@ private: state std::vector> pages; BTreePage* btPage; + int capacity = blockSize * blockCount; if (blockCount == 1) { Reference page = self->m_pager->newPageBuffer(); btPage = (BTreePage*)page->mutate(); pages.push_back(std::move(page)); } else { ASSERT(blockCount > 1); - int size = blockSize * blockCount; - btPage = (BTreePage*)new uint8_t[size]; + btPage = (BTreePage*)new uint8_t[capacity]; } btPage->height = height; @@ -3318,6 +3590,13 @@ private: ASSERT(false); } + auto& metrics = g_redwoodMetrics.level(btPage->height); + metrics.pageBuild += 1; + metrics.pageBuildExt += blockCount - 1; + metrics.buildFillPct += (double)written / capacity; + metrics.buildStoredPct += (double)btPage->kvBytes / capacity; + metrics.buildItemCount += btPage->tree().numItems; + // Create chunked pages // TODO: Avoid copying page bytes, but this is not trivial due to how pager checksums are currently handled. 
if (blockCount != 1) { @@ -3362,12 +3641,6 @@ private: wait(yield()); - // Update activity counts - ++counts.pageWrites; - if (pages.size() > 1) { - counts.extPageWrites += pages.size() - 1; - } - debug_printf("Flushing %s lastPage=%d original=%s start=%d i=%d count=%d page usage: %d/%d (%.2f%%) " "bytes\nlower: %s\nupper: %s\n", toString(childPageID).c_str(), isLastPage, toString(previousID).c_str(), start, i, i - start, @@ -3466,8 +3739,8 @@ private: ACTOR static Future> readPage(Reference snapshot, BTreePageIDRef id, const RedwoodRecordRef* lowerBound, const RedwoodRecordRef* upperBound, - bool forLazyDelete = false) { - if (!forLazyDelete) { + bool forLazyClear = false) { + if (!forLazyClear) { debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString(false).c_str(), upperBound->toString(false).c_str()); @@ -3480,16 +3753,14 @@ private: state Reference page; - ++counts.pageReads; if (id.size() == 1) { - Reference p = wait(snapshot->getPhysicalPage(id.front(), !forLazyDelete, false)); + Reference p = wait(snapshot->getPhysicalPage(id.front(), !forLazyClear, false)); page = p; } else { ASSERT(!id.empty()); - counts.extPageReads += (id.size() - 1); std::vector>> reads; for (auto& pageID : id) { - reads.push_back(snapshot->getPhysicalPage(pageID, !forLazyDelete, false)); + reads.push_back(snapshot->getPhysicalPage(pageID, !forLazyClear, false)); } std::vector> pages = wait(getAll(reads)); // TODO: Cache reconstituted super pages somehow, perhaps with help from the Pager. @@ -3498,8 +3769,11 @@ private: debug_printf("readPage() op=readComplete %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); const BTreePage* pTreePage = (const BTreePage*)page->begin(); + auto& metrics = g_redwoodMetrics.level(pTreePage->height); + metrics.pageRead += 1; + metrics.pageReadExt += (id.size() - 1); - if (!forLazyDelete && page->userData == nullptr) { + if (!forLazyClear && page->userData == nullptr) { debug_printf("readPage() Creating Reader for %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString(false).c_str(), upperBound->toString(false).c_str()); @@ -3507,7 +3781,7 @@ private: page->userDataDestructor = [](void* ptr) { delete (BTreePage::BinaryTree::Mirror*)ptr; }; } - if (!forLazyDelete) { + if (!forLazyClear) { debug_printf("readPage() %s\n", pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); } @@ -3516,8 +3790,8 @@ private: } static void preLoadPage(IPagerSnapshot* snapshot, BTreePageIDRef id) { - ++counts.pagePreloads; - counts.extPagePreloads += (id.size() - 1); + g_redwoodMetrics.btreeLeafPreload += 1; + g_redwoodMetrics.btreeLeafPreloadExt += (id.size() - 1); for (auto pageID : id) { snapshot->getPhysicalPage(pageID, true, true); @@ -3563,12 +3837,6 @@ private: } } - // Update activity counts - ++counts.pageWrites; - if (newID.size() > 1) { - counts.extPageWrites += newID.size() - 1; - } - return newID; } @@ -3646,7 +3914,14 @@ private: } // Page was updated in-place through edits and written to maybeNewID - void updatedInPlace(BTreePageIDRef maybeNewID) { + void updatedInPlace(BTreePageIDRef maybeNewID, BTreePage* btPage, int capacity) { + auto& metrics = g_redwoodMetrics.level(btPage->height); + metrics.pageModify += 1; + metrics.pageModify += (maybeNewID.size() - 1); + metrics.modifyFillPct += (double)btPage->size() / capacity; + metrics.modifyStoredPct += (double)btPage->kvBytes / capacity; + 
metrics.modifyItemCount += btPage->tree().numItems; + // The boundaries can't have changed, but the child page link may have. if (maybeNewID != decodeLowerBound->getChildPage()) { // Add page's decode lower bound to newLinks set without its child page, intially @@ -3704,10 +3979,11 @@ private: struct InternalPageModifier { InternalPageModifier() {} - InternalPageModifier(BTreePage::BinaryTree::Mirror* m, bool updating) - : m(m), updating(updating), changesMade(false) {} + InternalPageModifier(BTreePage* p, BTreePage::BinaryTree::Mirror* m, bool updating) + : btPage(p), m(m), updating(updating), changesMade(false) {} bool updating; + BTreePage* btPage; BTreePage::BinaryTree::Mirror* m; Standalone> rebuild; bool changesMade; @@ -3747,6 +4023,7 @@ private: updating = false; break; } + btPage->kvBytes += rec.kvBytes(); ++i; } } @@ -3789,6 +4066,7 @@ private: auto c = u.cBegin; while (c != u.cEnd) { debug_printf("internal page (updating) erasing: %s\n", c.get().toString(false).c_str()); + btPage->kvBytes -= c.get().kvBytes(); c.erase(); } // [cBegin, cEnd) is now erased, and cBegin is invalid, so cEnd represents the end @@ -3847,12 +4125,12 @@ private: debug_printf("%s -------------------------------------\n", context.c_str()); } - ++self->counts.commitSubtreeStart; state Version writeVersion = self->getLastCommittedVersion() + 1; state Reference page = wait(readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound)); state BTreePage* btPage = (BTreePage*)page->begin(); ASSERT(isLeaf == btPage->isLeaf()); + g_redwoodMetrics.level(btPage->height).pageCommitStart += 1; // TODO: Decide if it is okay to update if the subtree boundaries are expanded. It can result in // records in a DeltaTree being outside its decode boundary range, which isn't actually invalid @@ -3943,6 +4221,7 @@ private: if (updating) { debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str()); + btPage->kvBytes -= cursor.get().kvBytes(); cursor.erase(); } else { debug_printf("%s Skipped %s [existing, boundary start]\n", context.c_str(), @@ -3964,6 +4243,7 @@ private: // If updating, add to the page, else add to the output set if (updating) { if (cursor.mirror->insert(rec, update->skipLen, maxHeightAllowed)) { + btPage->kvBytes += rec.kvBytes(); debug_printf("%s Inserted %s [mutation, boundary start]\n", context.c_str(), rec.toString().c_str()); } else { @@ -4012,6 +4292,7 @@ private: if (updating) { debug_printf("%s Erasing %s [existing, boundary start]\n", context.c_str(), cursor.get().toString().c_str()); + btPage->kvBytes -= cursor.get().kvBytes(); cursor.erase(); changesMade = true; } else { @@ -4046,6 +4327,7 @@ private: debug_printf( "%s Erasing %s and beyond [existing, matches changed upper mutation boundary]\n", context.c_str(), cursor.get().toString().c_str()); + btPage->kvBytes -= cursor.get().kvBytes(); cursor.erase(); } else { merged.push_back(merged.arena(), cursor.get()); @@ -4086,8 +4368,7 @@ private: BTreePageIDRef newID = wait(self->updateBtreePage(self, rootID, &update->newLinks.arena(), page.castTo(), writeVersion)); - update->updatedInPlace(newID); - ++counts.pageUpdates; + update->updatedInPlace(newID, btPage, newID.size() * self->m_blockSize); debug_printf("%s Page updated in-place, returning %s\n", context.c_str(), toString(*update).c_str()); } @@ -4123,8 +4404,10 @@ private: cursor.moveFirst(); bool first = true; + while (cursor.valid()) { InternalPageSliceUpdate& u = *new (arena) InternalPageSliceUpdate(); + slices.push_back(&u); // 
At this point we should never be at a null child page entry because the first entry of a page // can't be null and this loop will skip over null entries that come after non-null entries. @@ -4136,8 +4419,16 @@ private: if (first) { u.subtreeLowerBound = update->subtreeLowerBound; first = false; + // mbegin is already the first mutation that could affect this subtree described by update } else { u.subtreeLowerBound = u.decodeLowerBound; + mBegin = mEnd; + // mBegin is either at or greater than subtreeLowerBound->key, which was the subtreeUpperBound->key + // for the previous subtree slice. But we need it to be at or *before* subtreeLowerBound->key + // so if mBegin.key() is not exactly the subtree lower bound key then decrement it. + if (mBegin.key() != u.subtreeLowerBound->key) { + --mBegin; + } } BTreePageIDRef pageID = cursor.get().getChildPage(); @@ -4166,28 +4457,21 @@ private: } u.subtreeUpperBound = cursor.valid() ? &cursor.get() : update->subtreeUpperBound; u.cEnd = cursor; - u.skipLen = 0; // TODO: set this - slices.push_back(&u); - // Find the mutation buffer range that includes all changes to the range described by u - MutationBuffer::const_iterator mBegin = mutationBuffer->upper_bound(u.subtreeLowerBound->key); - MutationBuffer::const_iterator mEnd = mutationBuffer->lower_bound(u.subtreeUpperBound->key); + mEnd = mutationBuffer->lower_bound(u.subtreeUpperBound->key); - // If mutation boundaries are the same, the range is fully described by (mBegin - 1).mutation() - bool fullyCovered = (mBegin == mEnd); - --mBegin; - - // If mBegin describes the entire subtree range, see if there are either no changes or if the entire - // range is cleared. - if (fullyCovered) { + // If the mutation range described by mBegin extends to mEnd, then see if the part of that range + // that overlaps with u's subtree range is being fully cleared or fully unchanged. + auto next = mBegin; + ++next; + if (next == mEnd) { + // Check for uniform clearedness or unchangedness for the range mutation where it overlaps u's + // subtree + const KeyRef& mutationBoundaryKey = mBegin.key(); const RangeMutation& range = mBegin.mutation(); - - // Check for uniform clearedness or unchangedness for the range mutation - KeyRef mutationBoundaryKey = mBegin.key(); bool uniform; - if (range.clearAfterBoundary) { // If the mutation range after the boundary key is cleared, then the mutation boundary key must // be cleared or must be different than the subtree lower bound key so that it doesn't matter @@ -4199,11 +4483,13 @@ private: uniform = !range.boundaryChanged || mutationBoundaryKey != u.subtreeLowerBound->key; } - // If the subtree range described by u is either uniformly changed or unchanged + // If u's subtree is either all cleared or all unchanged if (uniform) { - // See if we can expand the subtree range to include more subtrees which are also covered by the - // same mutation range - if (cursor.valid() && mEnd.key() != cursor.get().key) { + // We do not need to recurse to this subtree. Next, let's see if we can embiggen u's range to + // include sibling subtrees also covered by (mBegin, mEnd) so we can not recurse to those, too. + // If the cursor is valid, u.subtreeUpperBound is the cursor's position, which is >= mEnd.key(). + // If equal, no range expansion is possible. 
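// The seek below attempts that expansion: it advances the cursor to the last child link at or
// before mEnd.key(), so sibling subtrees wholly covered by this same mutation range can be
// folded into u and handled here without recursing into them either.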
+ if (cursor.valid() && mEnd.key() != u.subtreeUpperBound->key) { cursor.seekLessThanOrEqual(mEnd.key(), update->skipLen, &cursor, 1); // If this seek moved us ahead, to something other than cEnd, then update subtree range @@ -4250,8 +4536,8 @@ private: } else { debug_printf("%s: queuing subtree deletion cleared subtree range: %s\n", context.c_str(), ::toString(rec.getChildPage()).c_str()); - self->m_lazyDeleteQueue.pushFront( - LazyDeleteQueueEntry{ writeVersion, rec.getChildPage() }); + self->m_lazyClearQueue.pushFront( + LazyClearQueueEntry{ writeVersion, rec.getChildPage() }); } } c.moveNext(); @@ -4260,15 +4546,18 @@ private: // Subtree range unchanged } - debug_printf("%s: MutationBuffer covers this range in a single mutation: %s\n", context.c_str(), - u.toString().c_str()); + debug_printf("%s: MutationBuffer covers this range in a single mutation, not recursing: %s\n", + context.c_str(), u.toString().c_str()); + + // u has already been initialized with the correct result, no recursion needed, so restart the + // loop. continue; } } // If this page has height of 2 then its children are leaf nodes - recursions.push_back(self->commitSubtree(self, snapshot, mutationBuffer, pageID, btPage->height == 2, - mBegin, mEnd, slices.back())); + recursions.push_back( + self->commitSubtree(self, snapshot, mutationBuffer, pageID, btPage->height == 2, mBegin, mEnd, &u)); } debug_printf( @@ -4279,7 +4568,7 @@ private: wait(waitForAll(recursions)); debug_printf("%s Recursions done, processing slice updates.\n", context.c_str()); - state InternalPageModifier m(cursor.mirror, tryToUpdate); + state InternalPageModifier m(btPage, cursor.mirror, tryToUpdate); // Apply the possible changes for each subtree range recursed to, except the last one. // For each range, the expected next record, if any, is checked against the first boundary @@ -4310,8 +4599,7 @@ private: BTreePageIDRef newID = wait(self->updateBtreePage(self, rootID, &update->newLinks.arena(), page.castTo(), writeVersion)); - update->updatedInPlace(newID); - ++counts.pageUpdates; + update->updatedInPlace(newID, btPage, newID.size() * self->m_blockSize); debug_printf("%s Internal page updated in-place, returning %s\n", context.c_str(), toString(*update).c_str()); } else { @@ -4357,9 +4645,6 @@ private: debug_printf("%s: Beginning commit of version %" PRId64 ", new oldest version set to %" PRId64 "\n", self->m_name.c_str(), writeVersion, self->m_newOldestVersion); - state bool lazyDeleteStop = false; - state Future lazyDelete = incrementalSubtreeClear(self, &lazyDeleteStop); - // Get the latest version from the pager, which is what we will read at state Version latestVersion = self->m_pager->getLatestVersion(); debug_printf("%s: pager latestVersion %" PRId64 "\n", self->m_name.c_str(), latestVersion); @@ -4405,14 +4690,14 @@ private: self->m_header.root.set(rootPageID, sizeof(headerSpace) - sizeof(m_header)); - lazyDeleteStop = true; - wait(success(lazyDelete)); - debug_printf("Lazy delete freed %u pages\n", lazyDelete.get()); + self->m_lazyClearStop = true; + wait(success(self->m_lazyClearActor)); + debug_printf("Lazy delete freed %u pages\n", self->m_lazyClearActor.get()); self->m_pager->setCommitVersion(writeVersion); - wait(self->m_lazyDeleteQueue.flush()); - self->m_header.lazyDeleteQueue = self->m_lazyDeleteQueue.getState(); + wait(self->m_lazyClearQueue.flush()); + self->m_header.lazyDeleteQueue = self->m_lazyClearQueue.getState(); debug_printf("Setting metakey\n"); self->m_pager->setMetaKey(self->m_header.asKeyRef()); @@ -4427,9 +4712,10 @@ 
private: self->m_mutationBuffers.erase(self->m_mutationBuffers.begin()); self->m_lastCommittedVersion = writeVersion; - ++counts.commits; - committed.send(Void()); + ++g_redwoodMetrics.opCommit; + self->m_lazyClearActor = incrementalLazyClear(self); + committed.send(Void()); return Void(); } @@ -4899,9 +5185,8 @@ public: #include "fdbserver/art_impl.h" -RedwoodRecordRef VersionedBTree::dbBegin(StringRef(), 0); +RedwoodRecordRef VersionedBTree::dbBegin(LiteralStringRef("")); RedwoodRecordRef VersionedBTree::dbEnd(LiteralStringRef("\xff\xff\xff\xff\xff")); -VersionedBTree::Counts VersionedBTree::counts; class KeyValueStoreRedwoodUnversioned : public IKeyValueStore { public: @@ -4982,7 +5267,7 @@ public: wait(self->m_concurrentReads.take()); state FlowLock::Releaser releaser(self->m_concurrentReads); - self->m_tree->counts.getRanges++; + ++g_redwoodMetrics.opGetRange; state Standalone result; state int accumulatedBytes = 0; ASSERT(byteLimit > 0); @@ -5034,7 +5319,7 @@ public: wait(self->m_concurrentReads.take()); state FlowLock::Releaser releaser(self->m_concurrentReads); - self->m_tree->counts.gets++; + ++g_redwoodMetrics.opGet; state Reference cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion()); wait(cur->findEqual(key)); @@ -5053,7 +5338,7 @@ public: wait(self->m_concurrentReads.take()); state FlowLock::Releaser releaser(self->m_concurrentReads); - self->m_tree->counts.gets++; + ++g_redwoodMetrics.opGet; state Reference cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion()); wait(cur->findEqual(key)); @@ -6184,6 +6469,9 @@ TEST_CASE("!/redwood/performance/mutationBuffer") { } TEST_CASE("!/redwood/correctness/btree") { + g_redwoodMetricsActor = Void(); // Prevent trace event metrics from starting + g_redwoodMetrics.clear(); + state std::string pagerFile = "unittest_pageFile.redwood"; IPager2* pager; @@ -6229,6 +6517,7 @@ TEST_CASE("!/redwood/correctness/btree") { printf("Initializing...\n"); state double startTime = now(); + pager = new DWALPager(pageSize, pagerFile, cacheSizeBytes, pagerMemoryOnly); state VersionedBTree* btree = new VersionedBTree(pager, pagerFile); wait(btree->init()); @@ -6379,7 +6668,7 @@ TEST_CASE("!/redwood/correctness/btree") { } commit = map(btree->commit(), [=](Void) { - printf("Committed: %s\n", VersionedBTree::counts.toString(true).c_str()); + printf("Committed:\n%s\n", g_redwoodMetrics.toString(true).c_str()); // Notify the background verifier that version is committed and therefore readable committedVersions.send(v); return Void(); @@ -6533,7 +6822,9 @@ TEST_CASE("!/redwood/correctness/pager/cow") { TEST_CASE("!/redwood/performance/set") { state SignalableActorCollection actors; - VersionedBTree::counts.clear(); + + g_redwoodMetricsActor = Void(); // Prevent trace event metrics from starting + g_redwoodMetrics.clear(); // If a test file is passed in by environment then don't write new data to it. 
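// reload == true means no TESTFILE was supplied: the default pager file is deleted and
// repopulated below. With TESTFILE set, the existing file is reused and no new data is loaded.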
state bool reload = getenv("TESTFILE") == nullptr; @@ -6544,7 +6835,7 @@ TEST_CASE("!/redwood/performance/set") { deleteFile(pagerFile); } - state int pageSize = 4096; + state int pageSize = SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE; state int64_t pageCacheBytes = FLOW_KNOBS->PAGE_CACHE_4K; DWALPager* pager = new DWALPager(pageSize, pagerFile, pageCacheBytes); state VersionedBTree* btree = new VersionedBTree(pager, pagerFile); @@ -6594,7 +6885,8 @@ TEST_CASE("!/redwood/performance/set") { Version lastVer = btree->getLatestVersion(); state Version version = lastVer + 1; btree->setWriteVersion(version); - int changesThisVersion = deterministicRandom()->randomInt(0, maxRecordsPerCommit - recordsThisCommit + 1); + state int changesThisVersion = + deterministicRandom()->randomInt(0, maxRecordsPerCommit - recordsThisCommit + 1); while (changesThisVersion > 0 && kvBytesThisCommit < maxKVBytesPerCommit) { KeyValue kv; @@ -6617,6 +6909,8 @@ TEST_CASE("!/redwood/performance/set") { kvBytesThisCommit += kv.key.size() + kv.value.size(); ++recordsThisCommit; } + + wait(yield()); } if (kvBytesThisCommit >= maxKVBytesPerCommit || recordsThisCommit >= maxRecordsPerCommit) { @@ -6634,7 +6928,7 @@ TEST_CASE("!/redwood/performance/set") { double* pIntervalStart = &intervalStart; commit = map(btree->commit(), [=](Void result) { - printf("Committed: %s\n", VersionedBTree::counts.toString(true).c_str()); + printf("Committed:\n%s\n", g_redwoodMetrics.toString(true).c_str()); double elapsed = timer() - *pIntervalStart; printf("Committed %d keyValueBytes in %d records in %f seconds, %.2f MB/s\n", kvb, recs, elapsed, kvb / elapsed / 1e6); @@ -6659,46 +6953,46 @@ TEST_CASE("!/redwood/performance/set") { actors.add(randomSeeks(btree, seeks / 3, firstKeyChar, lastKeyChar)); actors.add(randomSeeks(btree, seeks / 3, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); - printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); state int ops = 10000; printf("Serial scans with adaptive readAhead...\n"); actors.add(randomScans(btree, ops, 50, -1, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); - printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Serial scans with readAhead 3 pages...\n"); actors.add(randomScans(btree, ops, 50, 12000, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); - printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Serial scans with readAhead 2 pages...\n"); actors.add(randomScans(btree, ops, 50, 8000, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); - printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Serial scans with readAhead 1 page...\n"); actors.add(randomScans(btree, ops, 50, 4000, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); - printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Serial scans...\n"); actors.add(randomScans(btree, ops, 50, 0, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); - printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Serial seeks...\n"); actors.add(randomSeeks(btree, ops, 
firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); - printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Parallel seeks...\n"); actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); - printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); Future closedFuture = btree->onClosed(); btree->close(); @@ -6991,7 +7285,6 @@ Future closeKVS(IKeyValueStore* kvs) { ACTOR Future doPrefixInsertComparison(int suffixSize, int valueSize, int recordCountTarget, bool usePrefixesInOrder, KVSource source) { - VersionedBTree::counts.clear(); deleteFile("test.redwood"); wait(delay(5)); diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index a23bfcb5a6..b43ddf11b3 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -54,7 +54,7 @@ #include "fdbrpc/AsyncFileCached.actor.h" #include "fdbserver/CoroFlow.h" #include "flow/TLSConfig.actor.h" -#include "fdbclient/IncludeVersions.h" +#include "fdbclient/versions.h" #include "fdbmonitor/SimpleIni.h" diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 2cdd37a1e1..3c335d753e 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -4101,3 +4101,4 @@ void versionedMapTest() { printf("Memory used: %f MB\n", (after - before)/ 1e6); } + diff --git a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp index 283a0d7210..16794b749f 100644 --- a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp @@ -399,9 +399,11 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { if (!self->locked && BUGGIFY) { TraceEvent("BARW_SubmitBackup2", randomID).detail("Tag", printable(self->backupTag)); try { + // Note the "partitionedLog" must be false, because we change + // the configuration to disable backup workers before restore. 
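// The trailing 'false' in the call below is that partitionedLog flag (previously it passed
// self->usePartitionedLogs).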
extraBackup = backupAgent.submitBackup( cx, LiteralStringRef("file://simfdb/backups/"), deterministicRandom()->randomInt(0, 100), - self->backupTag.toString(), self->backupRanges, true, self->usePartitionedLogs); + self->backupTag.toString(), self->backupRanges, true, false); } catch (Error& e) { TraceEvent("BARW_SubmitBackup2Exception", randomID) .error(e) diff --git a/fdbserver/workloads/ClientTransactionProfileCorrectness.actor.cpp b/fdbserver/workloads/ClientTransactionProfileCorrectness.actor.cpp index 7269457492..4ec2110abc 100644 --- a/fdbserver/workloads/ClientTransactionProfileCorrectness.actor.cpp +++ b/fdbserver/workloads/ClientTransactionProfileCorrectness.actor.cpp @@ -138,11 +138,9 @@ bool checkTxInfoEntryFormat(BinaryReader &reader) { while (!reader.empty()) { // Get EventType and timestamp - FdbClientLogEvents::EventType event; + FdbClientLogEvents::Event event; reader >> event; - double timeStamp; - reader >> timeStamp; - switch (event) + switch (event.type) { case FdbClientLogEvents::GET_VERSION_LATENCY: parser->parseGetVersion(reader); @@ -166,7 +164,7 @@ bool checkTxInfoEntryFormat(BinaryReader &reader) { parser->parseErrorCommit(reader); break; default: - TraceEvent(SevError, "ClientTransactionProfilingUnknownEvent").detail("EventType", event); + TraceEvent(SevError, "ClientTransactionProfilingUnknownEvent").detail("EventType", event.type); return false; } } diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp index e17ead6c94..4349e09619 100644 --- a/fdbserver/workloads/ConfigureDatabase.actor.cpp +++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp @@ -34,6 +34,7 @@ static const char* logTypes[] = { "log_version:=2", "log_version:=3", "log_version:=4" }; static const char* redundancies[] = { "single", "double", "triple" }; +static const char* backupTypes[] = { "backup_worker_enabled:=0", "backup_worker_enabled:=1" }; std::string generateRegions() { std::string result; @@ -271,7 +272,7 @@ struct ConfigureDatabaseWorkload : TestWorkload { if(g_simulator.speedUpSimulation) { return Void(); } - state int randomChoice = deterministicRandom()->randomInt(0, 7); + state int randomChoice = deterministicRandom()->randomInt(0, 8); if( randomChoice == 0 ) { wait( success( runRYWTransaction(cx, [=](Reference tr) -> Future> @@ -322,6 +323,10 @@ struct ConfigureDatabaseWorkload : TestWorkload { else if ( randomChoice == 6 ) { // Some configurations will be invalid, and that's fine. wait(success( IssueConfigurationChange( cx, logTypes[deterministicRandom()->randomInt( 0, sizeof(logTypes)/sizeof(logTypes[0]))], false ) )); + } else if (randomChoice == 7) { + wait(success(IssueConfigurationChange( + cx, backupTypes[deterministicRandom()->randomInt(0, sizeof(backupTypes) / sizeof(backupTypes[0]))], + false))); } else { ASSERT(false); } diff --git a/fdbserver/workloads/DataDistributionMetrics.actor.cpp b/fdbserver/workloads/DataDistributionMetrics.actor.cpp new file mode 100644 index 0000000000..96a0d37510 --- /dev/null +++ b/fdbserver/workloads/DataDistributionMetrics.actor.cpp @@ -0,0 +1,108 @@ +/* + * DataDistributionMetrics.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "fdbclient/ReadYourWrites.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "flow/actorcompiler.h" // This must be the last include + +struct DataDistributionMetricsWorkload : KVWorkload { + + int numTransactions; + int writesPerTransaction; + int transactionsCommitted; + int numShards; + int64_t avgBytes; + + DataDistributionMetricsWorkload(WorkloadContext const& wcx) + : KVWorkload(wcx), transactionsCommitted(0), numShards(0), avgBytes(0) { + numTransactions = getOption(options, LiteralStringRef("numTransactions"), 100); + writesPerTransaction = getOption(options, LiteralStringRef("writesPerTransaction"), 1000); + } + + static Value getRandomValue() { + return Standalone(format("Value/%08d", deterministicRandom()->randomInt(0, 10e6))); + } + + ACTOR static Future _start(Database cx, DataDistributionMetricsWorkload* self) { + state int tNum; + for (tNum = 0; tNum < self->numTransactions; ++tNum) { + loop { + state ReadYourWritesTransaction tr(cx); + try { + state int i; + for (i = 0; i < self->writesPerTransaction; ++i) { + tr.set(StringRef(format("Key/%08d", tNum * self->writesPerTransaction + i)), getRandomValue()); + } + wait(tr.commit()); + ++self->transactionsCommitted; + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + return Void(); + } + + ACTOR static Future _check(Database cx, DataDistributionMetricsWorkload* self) { + if (self->transactionsCommitted == 0) { + TraceEvent(SevError, "NoTransactionsCommitted"); + return false; + } + state Reference tr = + Reference(new ReadYourWritesTransaction(cx)); + try { + state Standalone result = wait(tr->getRange(ddStatsRange, 100)); + ASSERT(!result.more); + self->numShards = result.size(); + if (self->numShards < 1) return false; + state int64_t totalBytes = 0; + for (int i = 0; i < result.size(); ++i) { + ASSERT(result[i].key.startsWith(ddStatsRange.begin)); + totalBytes += readJSONStrictly(result[i].value.toString()).get_obj()["ShardBytes"].get_int64(); + } + self->avgBytes = totalBytes / self->numShards; + // fetch data-distribution stats for a smalller range + state int idx = deterministicRandom()->randomInt(0, result.size()); + Standalone res = wait(tr->getRange( + KeyRangeRef(result[idx].key, idx + 1 < result.size() ? result[idx + 1].key : ddStatsRange.end), 100)); + ASSERT_WE_THINK(res.size() == 1 && + res[0] == result[idx]); // It works good now. 
However, not sure in any case of data-distribution, the number changes + } catch (Error& e) { + TraceEvent(SevError, "FailedToRetrieveDDMetrics").detail("Error", e.what()); + return false; + } + return true; + } + + virtual std::string description() { return "DataDistributionMetrics"; } + virtual Future setup(Database const& cx) { return Void(); } + virtual Future start(Database const& cx) { return _start(cx, this); } + virtual Future check(Database const& cx) { return _check(cx, this); } + + virtual void getMetrics(vector& m) { + m.push_back(PerfMetric("NumShards", numShards, true)); + m.push_back(PerfMetric("AvgBytes", avgBytes, true)); + } +}; + +WorkloadFactory DataDistributionMetricsWorkloadFactory("DataDistributionMetrics"); diff --git a/fdbserver/workloads/TagThrottleApi.actor.cpp b/fdbserver/workloads/TagThrottleApi.actor.cpp index 7ec90868c9..812f369882 100644 --- a/fdbserver/workloads/TagThrottleApi.actor.cpp +++ b/fdbserver/workloads/TagThrottleApi.actor.cpp @@ -50,6 +50,22 @@ struct TagThrottleApiWorkload : TestWorkload { virtual void getMetrics(vector& m) {} + static Optional randomTagThrottleType() { + Optional throttleType; + switch(deterministicRandom()->randomInt(0, 3)) { + case 0: + throttleType = TagThrottleType::AUTO; + break; + case 1: + throttleType = TagThrottleType::MANUAL; + break; + default: + break; + } + + return throttleType; + } + ACTOR Future throttleTag(Database cx, std::map, TagThrottleInfo> *manuallyThrottledTags) { state TransactionTag tag = TransactionTagRef(deterministicRandom()->randomChoice(DatabaseContext::debugTransactionTagChoices)); state TransactionPriority priority = deterministicRandom()->randomChoice(allTransactionPriorities); @@ -60,7 +76,7 @@ struct TagThrottleApiWorkload : TestWorkload { tagSet.addTag(tag); try { - wait(ThrottleApi::throttleTags(cx, tagSet, rate, duration, false, priority)); + wait(ThrottleApi::throttleTags(cx, tagSet, rate, duration, TagThrottleType::MANUAL, priority)); } catch(Error &e) { state Error err = e; @@ -72,7 +88,7 @@ struct TagThrottleApiWorkload : TestWorkload { throw err; } - manuallyThrottledTags->insert_or_assign(std::make_pair(tag, priority), TagThrottleInfo(tag, false, priority, rate, now() + duration, duration)); + manuallyThrottledTags->insert_or_assign(std::make_pair(tag, priority), TagThrottleInfo(tag, TagThrottleType::MANUAL, priority, rate, now() + duration, duration)); return Void(); } @@ -82,26 +98,30 @@ struct TagThrottleApiWorkload : TestWorkload { TagSet tagSet; tagSet.addTag(tag); - state bool autoThrottled = deterministicRandom()->coinflip(); - TransactionPriority priority = deterministicRandom()->randomChoice(allTransactionPriorities); + state Optional throttleType = TagThrottleApiWorkload::randomTagThrottleType(); + Optional priority = deterministicRandom()->coinflip() ? 
Optional() : deterministicRandom()->randomChoice(allTransactionPriorities); state bool erased = false; - state double expiration = 0; - if(!autoThrottled) { - auto itr = manuallyThrottledTags->find(std::make_pair(tag, priority)); - if(itr != manuallyThrottledTags->end()) { - expiration = itr->second.expirationTime; - erased = true; - manuallyThrottledTags->erase(itr); + state double maxExpiration = 0; + if(!throttleType.present() || throttleType.get() == TagThrottleType::MANUAL) { + for(auto p : allTransactionPriorities) { + if(!priority.present() || priority.get() == p) { + auto itr = manuallyThrottledTags->find(std::make_pair(tag, p)); + if(itr != manuallyThrottledTags->end()) { + maxExpiration = std::max(maxExpiration, itr->second.expirationTime); + erased = true; + manuallyThrottledTags->erase(itr); + } + } } } - bool removed = wait(ThrottleApi::unthrottleTags(cx, tagSet, autoThrottled, priority)); + bool removed = wait(ThrottleApi::unthrottleTags(cx, tagSet, throttleType, priority)); if(removed) { - ASSERT(erased || autoThrottled); + ASSERT(erased || !throttleType.present() || throttleType.get() == TagThrottleType::AUTO); } else { - ASSERT(expiration < now()); + ASSERT(maxExpiration < now()); } return Void(); @@ -113,7 +133,7 @@ struct TagThrottleApiWorkload : TestWorkload { int manualThrottledTags = 0; int activeAutoThrottledTags = 0; for(auto &tag : tags) { - if(!tag.autoThrottled) { + if(tag.throttleType == TagThrottleType::MANUAL) { ASSERT(manuallyThrottledTags->find(std::make_pair(tag.tag, tag.priority)) != manuallyThrottledTags->end()); ++manualThrottledTags; } @@ -139,34 +159,32 @@ struct TagThrottleApiWorkload : TestWorkload { } ACTOR Future unthrottleTagGroup(Database cx, std::map, TagThrottleInfo> *manuallyThrottledTags) { - state int choice = deterministicRandom()->randomInt(0, 3); + state Optional throttleType = TagThrottleApiWorkload::randomTagThrottleType(); + state Optional priority = deterministicRandom()->coinflip() ? 
Optional() : deterministicRandom()->randomChoice(allTransactionPriorities); - if(choice == 0) { - bool unthrottled = wait(ThrottleApi::unthrottleAll(cx)); + bool unthrottled = wait(ThrottleApi::unthrottleAll(cx, throttleType, priority)); + if(!throttleType.present() || throttleType.get() == TagThrottleType::MANUAL) { bool unthrottleExpected = false; - for(auto itr = manuallyThrottledTags->begin(); itr != manuallyThrottledTags->end(); ++itr) { - if(itr->second.expirationTime > now()) { - unthrottleExpected = true; + bool empty = manuallyThrottledTags->empty(); + for(auto itr = manuallyThrottledTags->begin(); itr != manuallyThrottledTags->end();) { + if(!priority.present() || priority.get() == itr->first.second) { + if(itr->second.expirationTime > now()) { + unthrottleExpected = true; + } + + itr = manuallyThrottledTags->erase(itr); + } + else { + ++itr; } } - ASSERT(!unthrottleExpected || unthrottled); - manuallyThrottledTags->clear(); - } - else if(choice == 1) { - bool unthrottled = wait(ThrottleApi::unthrottleManual(cx)); - bool unthrottleExpected = false; - for(auto itr = manuallyThrottledTags->begin(); itr != manuallyThrottledTags->end(); ++itr) { - if(itr->second.expirationTime > now()) { - unthrottleExpected = true; - } + if(throttleType.present()) { + ASSERT((unthrottled && !empty) || (!unthrottled && !unthrottleExpected)); + } + else { + ASSERT(unthrottled || !unthrottleExpected); } - - ASSERT((unthrottled && !manuallyThrottledTags->empty()) || (!unthrottled && !unthrottleExpected)); - manuallyThrottledTags->clear(); - } - else { - bool unthrottled = wait(ThrottleApi::unthrottleAuto(cx)); } return Void(); @@ -176,7 +194,7 @@ struct TagThrottleApiWorkload : TestWorkload { if(deterministicRandom()->coinflip()) { wait(ThrottleApi::enableAuto(cx, true)); if(deterministicRandom()->coinflip()) { - bool unthrottled = wait(ThrottleApi::unthrottleAuto(cx)); + bool unthrottled = wait(ThrottleApi::unthrottleAll(cx, TagThrottleType::AUTO, Optional())); } } else { diff --git a/fdbservice/FDBService.cpp b/fdbservice/FDBService.cpp index 59ef5c8045..fe761a0109 100644 --- a/fdbservice/FDBService.cpp +++ b/fdbservice/FDBService.cpp @@ -30,7 +30,7 @@ #include "flow/SimpleOpt.h" #include "fdbmonitor/SimpleIni.h" -#include "fdbclient/IncludeVersions.h" +#include "fdbclient/versions.h" // For PathFileExists #include "Shlwapi.h" diff --git a/flow/IThreadPool.cpp b/flow/IThreadPool.cpp index 362eee4598..6dc79f8c05 100644 --- a/flow/IThreadPool.cpp +++ b/flow/IThreadPool.cpp @@ -73,7 +73,7 @@ class ThreadPool : public IThreadPool, public ReferenceCounted { void operator()() { Thread::dispatch(action); action = NULL; } ~ActionWrapper() { if (action) { action->cancel(); } } private: - void operator=(ActionWrapper const&); + ActionWrapper &operator=(ActionWrapper const&); }; public: ThreadPool() : dontstop(ios), mode(Run) {} diff --git a/flow/TLSConfig.actor.cpp b/flow/TLSConfig.actor.cpp index 73a336e38a..0b33550104 100644 --- a/flow/TLSConfig.actor.cpp +++ b/flow/TLSConfig.actor.cpp @@ -287,7 +287,7 @@ ACTOR static Future readEntireFile( std::string filename, std::string* des throw file_too_large(); } destination->resize(filesize); - wait(success(file->read(const_cast(destination->c_str()), filesize, 0))); + wait(success(file->read(&destination[0], filesize, 0))); return Void(); } diff --git a/flow/network.h b/flow/network.h index 92980c97f4..0ce7b7d5fb 100644 --- a/flow/network.h +++ b/flow/network.h @@ -235,6 +235,17 @@ struct NetworkAddress { bool isTLS() const { return (flags & FLAG_TLS) != 0; } bool 
isV6() const { return ip.isV6(); } + size_t hash() const { + size_t result = 0; + if (ip.isV6()) { + uint16_t* ptr = (uint16_t*)ip.toV6().data(); + result = ((size_t)ptr[5] << 32) | ((size_t)ptr[6] << 16) | ptr[7]; + } else { + result = ip.toV4(); + } + return (result << 16) + port; + } + static NetworkAddress parse(std::string const&); // May throw connection_string_invalid static Optional parseOptional(std::string const&); static std::vector parseList( std::string const& ); @@ -270,14 +281,7 @@ namespace std { size_t operator()(const NetworkAddress& na) const { - size_t result = 0; - if (na.ip.isV6()) { - uint16_t* ptr = (uint16_t*)na.ip.toV6().data(); - result = ((size_t)ptr[5] << 32) | ((size_t)ptr[6] << 16) | ptr[7]; - } else { - result = na.ip.toV4(); - } - return (result << 16) + na.port; + return na.hash(); } }; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d60859be79..c4e8697fb7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -46,6 +46,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES BlobStore.txt IGNORE) add_fdb_test(TEST_FILES ConsistencyCheck.txt IGNORE) add_fdb_test(TEST_FILES DDMetricsExclude.txt IGNORE) + add_fdb_test(TEST_FILES DataDistributionMetrics.txt IGNORE) add_fdb_test(TEST_FILES DiskDurability.txt IGNORE) add_fdb_test(TEST_FILES FileSystem.txt IGNORE) add_fdb_test(TEST_FILES Happy.txt IGNORE) diff --git a/tests/DataDistributionMetrics.txt b/tests/DataDistributionMetrics.txt new file mode 100644 index 0000000000..77c83b0eb6 --- /dev/null +++ b/tests/DataDistributionMetrics.txt @@ -0,0 +1,21 @@ +testTitle=DataDistributionMetrics + testName=Cycle + transactionsPerSecond=2500.0 + testDuration=10.0 + expectedRate=0.025 + + testName=DataDistributionMetrics + numTransactions=100 + writesPerTransaction=1000 + + testName=Attrition + machinesToKill=1 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=Attrition + machinesToKill=1 + machinesToLeave=3 + reboot=true + testDuration=10.0 \ No newline at end of file
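As a closing note on the flow/network.h change above: std::hash<NetworkAddress> now simply delegates to the new NetworkAddress::hash() member, whose IPv4 branch shifts the 32-bit address left by 16 bits and adds the port; presumably the motivation is to let other call sites hash an address without going through std::hash. A minimal standalone sketch of that folding, using plain integers instead of the real IPAddress/NetworkAddress types (the names below are illustrative only, not FDB API):

```cpp
#include <cstdint>
#include <cstdio>

// Toy re-implementation of the IPv4 branch of NetworkAddress::hash():
// the address occupies the high bits and the port the low 16 bits, so two
// addresses that differ only in port still produce distinct hash values.
static size_t toyAddressHash(uint32_t ipv4, uint16_t port) {
    size_t result = ipv4;
    return (result << 16) + port;
}

int main() {
    uint32_t ip = (10u << 24) | 1u; // 10.0.0.1
    printf("%zx\n", toyAddressHash(ip, 4500));
    printf("%zx\n", toyAddressHash(ip, 4501)); // differs from the line above only in the low 16 bits
    return 0;
}
```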