1
0
mirror of https://github.com/apple/foundationdb.git synced 2025-06-01 02:37:02 +08:00

Merge branch 'main' into features/authz

This commit is contained in:
Junhyun Shim 2022-07-27 20:51:32 +02:00 committed by GitHub
commit c6342a6e5b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
45 changed files with 2622 additions and 572 deletions

@ -601,7 +601,7 @@ def tenants(logger):
output = run_fdbcli_command('createtenant tenant')
assert output == 'The tenant `tenant\' has been created'
output = run_fdbcli_command('createtenant tenant2')
output = run_fdbcli_command('createtenant tenant2 tenant_group=tenant_group2')
assert output == 'The tenant `tenant2\' has been created'
output = run_fdbcli_command('listtenants')
@ -636,6 +636,58 @@ def tenants(logger):
assert('printable' in json_output['tenant']['prefix'])
assert(json_output['tenant']['tenant_state'] == 'ready')
output = run_fdbcli_command('gettenant tenant2')
lines = output.split('\n')
assert len(lines) == 4
assert lines[0].strip().startswith('id: ')
assert lines[1].strip().startswith('prefix: ')
assert lines[2].strip() == 'tenant state: ready'
assert lines[3].strip() == 'tenant group: tenant_group2'
output = run_fdbcli_command('gettenant tenant2 JSON')
json_output = json.loads(output, strict=False)
assert(len(json_output) == 2)
assert('tenant' in json_output)
assert(json_output['type'] == 'success')
assert(len(json_output['tenant']) == 4)
assert('id' in json_output['tenant'])
assert('prefix' in json_output['tenant'])
assert(json_output['tenant']['tenant_state'] == 'ready')
assert('tenant_group' in json_output['tenant'])
assert(len(json_output['tenant']['tenant_group']) == 2)
assert('base64' in json_output['tenant']['tenant_group'])
assert(json_output['tenant']['tenant_group']['printable'] == 'tenant_group2')
output = run_fdbcli_command('configuretenant tenant tenant_group=tenant_group1')
assert output == 'The configuration for tenant `tenant\' has been updated'
output = run_fdbcli_command('gettenant tenant')
lines = output.split('\n')
assert len(lines) == 4
assert lines[3].strip() == 'tenant group: tenant_group1'
output = run_fdbcli_command('configuretenant tenant unset tenant_group')
assert output == 'The configuration for tenant `tenant\' has been updated'
output = run_fdbcli_command('gettenant tenant')
lines = output.split('\n')
assert len(lines) == 3
output = run_fdbcli_command_and_get_error('configuretenant tenant tenant_group=tenant_group1 tenant_group=tenant_group2')
assert output == 'ERROR: configuration parameter `tenant_group\' specified more than once.'
output = run_fdbcli_command_and_get_error('configuretenant tenant unset')
assert output == 'ERROR: `unset\' specified without a configuration parameter.'
output = run_fdbcli_command_and_get_error('configuretenant tenant unset tenant_group=tenant_group1')
assert output == 'ERROR: unrecognized configuration parameter `tenant_group=tenant_group1\'.'
output = run_fdbcli_command_and_get_error('configuretenant tenant tenant_group')
assert output == 'ERROR: invalid configuration string `tenant_group\'. String must specify a value using `=\'.'
output = run_fdbcli_command_and_get_error('configuretenant tenant3 tenant_group=tenant_group1')
assert output == 'ERROR: Tenant does not exist (2131)'
output = run_fdbcli_command('usetenant')
assert output == 'Using the default tenant'

@ -9,6 +9,7 @@ For details, see http://sourceforge.net/projects/libb64
#define BASE64_DECODE_H
#include <iostream>
#include "libb64/encode.h"
namespace base64 {
extern "C" {

@ -2,6 +2,22 @@
Release Notes
#############
7.1.17
======
* Same as 7.1.16 release with AVX enabled.
7.1.16
======
* Released with AVX disabled.
* Fixed a crash bug when cluster controller shuts down. `(PR #7706) <https://github.com/apple/foundationdb/pull/7706>`_
* Fixed a storage server failure when getReadVersion returns an error. `(PR #7688) <https://github.com/apple/foundationdb/pull/7688>`_
* Fixed unbounded status json generation. `(PR #7680) <https://github.com/apple/foundationdb/pull/7680>`_
* Fixed ScopeEventFieldTypeMismatch error for TLogMetrics. `(PR #7640) <https://github.com/apple/foundationdb/pull/7640>`_
* Added getMappedRange latency metrics. `(PR #7632) <https://github.com/apple/foundationdb/pull/7632>`_
* Fixed a version vector performance bug due to not updating client side tag cache. `(PR #7616) <https://github.com/apple/foundationdb/pull/7616>`_
* Fixed DiskReadSeconds and DiskWriteSeconds calculation in ProcessMetrics. `(PR #7609) <https://github.com/apple/foundationdb/pull/7609>`_
* Added Rocksdb compression and data size stats. `(PR #7596) <https://github.com/apple/foundationdb/pull/7596>`_
7.1.15
======
* Same as 7.1.14 release with AVX enabled.

@ -35,20 +35,104 @@
namespace fdb_cli {
const KeyRangeRef tenantSpecialKeyRange(LiteralStringRef("\xff\xff/management/tenant/map/"),
LiteralStringRef("\xff\xff/management/tenant/map0"));
const KeyRangeRef tenantMapSpecialKeyRange720("\xff\xff/management/tenant/map/"_sr,
"\xff\xff/management/tenant/map0"_sr);
const KeyRangeRef tenantConfigSpecialKeyRange("\xff\xff/management/tenant/configure/"_sr,
"\xff\xff/management/tenant/configure0"_sr);
const KeyRangeRef tenantMapSpecialKeyRange710("\xff\xff/management/tenant_map/"_sr,
"\xff\xff/management/tenant_map0"_sr);
// Returns the tenant-map special-key range appropriate for the client's API
// version: the key layout moved between API versions 710 and 720.
KeyRangeRef const& tenantMapSpecialKeyRange(int apiVersion) {
	return (apiVersion >= 720) ? tenantMapSpecialKeyRange720 : tenantMapSpecialKeyRange710;
}
// Parses tenant configuration parameters from `tokens', starting at `startIndex'.
// Each token is either `name=value' or, when allowUnset is true, the two-token
// form `unset <name>' (which maps the parameter to an empty Optional so the
// caller can clear it). On any malformed input, repeated parameter, or
// unrecognized parameter name, prints an error to stderr and returns an empty
// Optional; otherwise returns the parsed parameter map.
Optional<std::map<Standalone<StringRef>, Optional<Value>>>
parseTenantConfiguration(std::vector<StringRef> const& tokens, int startIndex, bool allowUnset) {
	std::map<Standalone<StringRef>, Optional<Value>> configParams;
	for (int tokenNum = startIndex; tokenNum < tokens.size(); ++tokenNum) {
		Optional<Value> value;
		StringRef token = tokens[tokenNum];
		StringRef param;
		if (allowUnset && token == "unset"_sr) {
			// `unset' consumes the following token as the parameter name; `value'
			// is left empty to signal removal.
			if (++tokenNum == tokens.size()) {
				fmt::print(stderr, "ERROR: `unset' specified without a configuration parameter.\n");
				return {};
			}
			param = tokens[tokenNum];
		} else {
			// Split `name=value'; eat() consumes up to `=' and leaves the
			// remainder (the value) in `token'.
			bool foundEquals;
			param = token.eat("=", &foundEquals);
			if (!foundEquals) {
				fmt::print(stderr,
				           "ERROR: invalid configuration string `{}'. String must specify a value using `='.\n",
				           param.toString().c_str());
				return {};
			}
			value = token;
		}
		// Each parameter may be specified at most once.
		if (configParams.count(param)) {
			fmt::print(
			    stderr, "ERROR: configuration parameter `{}' specified more than once.\n", param.toString().c_str());
			return {};
		}
		// `tenant_group' is currently the only recognized configuration parameter.
		if (tokencmp(param, "tenant_group")) {
			configParams[param] = value;
		} else {
			fmt::print(stderr, "ERROR: unrecognized configuration parameter `{}'.\n", param.toString().c_str());
			return {};
		}
	}
	return configParams;
}
// Builds the special key used to read/write one tenant configuration
// parameter: the configure-range prefix followed by the packed
// (tenantName, configName) tuple.
Key makeConfigKey(TenantNameRef tenantName, StringRef configName) {
	Tuple configTuple = Tuple().append(tenantName).append(configName);
	return tenantConfigSpecialKeyRange.begin.withSuffix(configTuple.pack());
}
// Applies the given tenant configuration changes to the transaction via the
// configuration special keys: a present value sets the parameter, an absent
// value clears it. The transaction is not committed here.
void applyConfiguration(Reference<ITransaction> tr,
                        TenantNameRef tenantName,
                        std::map<Standalone<StringRef>, Optional<Value>> configuration) {
	// Iterate by const reference: the original `for (auto [k, v] : ...)` copied
	// each (Standalone<StringRef>, Optional<Value>) pair on every iteration.
	for (auto const& [configName, value] : configuration) {
		if (value.present()) {
			tr->set(makeConfigKey(tenantName, configName), value.get());
		} else {
			tr->clear(makeConfigKey(tenantName, configName));
		}
	}
}
// createtenant command
ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
if (tokens.size() != 2) {
ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion) {
if (tokens.size() < 2 || tokens.size() > 3) {
printUsage(tokens[0]);
return false;
}
state Key tenantNameKey = fdb_cli::tenantSpecialKeyRange.begin.withSuffix(tokens[1]);
state Key tenantNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]);
state Reference<ITransaction> tr = db->createTransaction();
state bool doneExistenceCheck = false;
state Optional<std::map<Standalone<StringRef>, Optional<Value>>> configuration =
parseTenantConfiguration(tokens, 2, false);
if (!configuration.present()) {
return false;
}
if (apiVersion < 720 && !configuration.get().empty()) {
fmt::print(stderr, "ERROR: tenants do not accept configuration options before API version 720.\n");
return false;
}
loop {
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
try {
@ -63,12 +147,13 @@ ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector
}
tr->set(tenantNameKey, ValueRef());
applyConfiguration(tr, tokens[1], configuration.get());
wait(safeThreadFutureToFuture(tr->commit()));
break;
} catch (Error& e) {
state Error err(e);
if (e.code() == error_code_special_keys_api_failure) {
std::string errorMsgStr = wait(fdb_cli::getSpecialKeysFailureErrorMessage(tr));
std::string errorMsgStr = wait(getSpecialKeysFailureErrorMessage(tr));
fmt::print(stderr, "ERROR: {}\n", errorMsgStr.c_str());
return false;
}
@ -81,18 +166,18 @@ ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector
}
CommandFactory createTenantFactory("createtenant",
CommandHelp("createtenant <TENANT_NAME>",
CommandHelp("createtenant <TENANT_NAME> [tenant_group=<TENANT_GROUP>]",
"creates a new tenant in the cluster",
"Creates a new tenant in the cluster with the specified name."));
// deletetenant command
ACTOR Future<bool> deleteTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
ACTOR Future<bool> deleteTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion) {
if (tokens.size() != 2) {
printUsage(tokens[0]);
return false;
}
state Key tenantNameKey = fdb_cli::tenantSpecialKeyRange.begin.withSuffix(tokens[1]);
state Key tenantNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]);
state Reference<ITransaction> tr = db->createTransaction();
state bool doneExistenceCheck = false;
@ -115,7 +200,7 @@ ACTOR Future<bool> deleteTenantCommandActor(Reference<IDatabase> db, std::vector
} catch (Error& e) {
state Error err(e);
if (e.code() == error_code_special_keys_api_failure) {
std::string errorMsgStr = wait(fdb_cli::getSpecialKeysFailureErrorMessage(tr));
std::string errorMsgStr = wait(getSpecialKeysFailureErrorMessage(tr));
fmt::print(stderr, "ERROR: {}\n", errorMsgStr.c_str());
return false;
}
@ -135,7 +220,7 @@ CommandFactory deleteTenantFactory(
"Deletes a tenant from the cluster. Deletion will be allowed only if the specified tenant contains no data."));
// listtenants command
ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion) {
if (tokens.size() > 4) {
printUsage(tokens[0]);
return false;
@ -157,14 +242,14 @@ ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<
}
if (tokens.size() == 4) {
int n = 0;
if (sscanf(tokens[3].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[3].size()) {
fmt::print(stderr, "ERROR: invalid limit {}\n", tokens[3].toString().c_str());
if (sscanf(tokens[3].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[3].size() || limit <= 0) {
fmt::print(stderr, "ERROR: invalid limit `{}'\n", tokens[3].toString().c_str());
return false;
}
}
state Key beginTenantKey = fdb_cli::tenantSpecialKeyRange.begin.withSuffix(beginTenant);
state Key endTenantKey = fdb_cli::tenantSpecialKeyRange.begin.withSuffix(endTenant);
state Key beginTenantKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(beginTenant);
state Key endTenantKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(endTenant);
state Reference<ITransaction> tr = db->createTransaction();
loop {
@ -186,14 +271,14 @@ ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<
for (auto tenant : tenants) {
fmt::print(" {}. {}\n",
++index,
printable(tenant.key.removePrefix(fdb_cli::tenantSpecialKeyRange.begin)).c_str());
printable(tenant.key.removePrefix(tenantMapSpecialKeyRange(apiVersion).begin)).c_str());
}
return true;
} catch (Error& e) {
state Error err(e);
if (e.code() == error_code_special_keys_api_failure) {
std::string errorMsgStr = wait(fdb_cli::getSpecialKeysFailureErrorMessage(tr));
std::string errorMsgStr = wait(getSpecialKeysFailureErrorMessage(tr));
fmt::print(stderr, "ERROR: {}\n", errorMsgStr.c_str());
return false;
}
@ -217,7 +302,7 @@ ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<St
}
state bool useJson = tokens.size() == 3;
state Key tenantNameKey = fdb_cli::tenantSpecialKeyRange.begin.withSuffix(tokens[1]);
state Key tenantNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]);
state Reference<ITransaction> tr = db->createTransaction();
loop {
@ -245,6 +330,7 @@ ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<St
int64_t id;
std::string prefix;
std::string tenantState;
std::string tenantGroup;
doc.get("id", id);
@ -255,10 +341,14 @@ ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<St
}
doc.get("tenant_state", tenantState);
bool hasTenantGroup = doc.tryGet("tenant_group.printable", tenantGroup);
fmt::print(" id: {}\n", id);
fmt::print(" prefix: {}\n", printable(prefix).c_str());
fmt::print(" tenant state: {}\n", printable(tenantState).c_str());
if (hasTenantGroup) {
fmt::print(" tenant group: {}\n", tenantGroup.c_str());
}
}
return true;
@ -299,6 +389,50 @@ CommandFactory getTenantFactory(
"prints the metadata for a tenant",
"Prints the metadata for a tenant. If JSON is specified, then the output will be in JSON format."));
// configuretenant command
// Updates a tenant's configuration via the configuration special keys.
// Usage: configuretenant <TENANT_NAME> <[unset] tenant_group[=<GROUP_NAME>]> ...
// Returns true on success; prints an error and returns false otherwise.
ACTOR Future<bool> configureTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
	if (tokens.size() < 3) {
		printUsage(tokens[0]);
		return false;
	}
	// Parse configuration tokens starting after the tenant name; `unset' is
	// allowed for this command.
	state Optional<std::map<Standalone<StringRef>, Optional<Value>>> configuration =
	    parseTenantConfiguration(tokens, 2, true);
	if (!configuration.present()) {
		// parseTenantConfiguration already printed the error.
		return false;
	}
	state Reference<ITransaction> tr = db->createTransaction();
	loop {
		tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
		try {
			applyConfiguration(tr, tokens[1], configuration.get());
			wait(safeThreadFutureToFuture(tr->commit()));
			break;
		} catch (Error& e) {
			state Error err(e);
			if (e.code() == error_code_special_keys_api_failure) {
				// Surface the special-key-space error message rather than the
				// generic error code.
				std::string errorMsgStr = wait(getSpecialKeysFailureErrorMessage(tr));
				fmt::print(stderr, "ERROR: {}\n", errorMsgStr.c_str());
				return false;
			}
			wait(safeThreadFutureToFuture(tr->onError(err)));
		}
	}
	fmt::print("The configuration for tenant `{}' has been updated\n", printable(tokens[1]).c_str());
	return true;
}
CommandFactory configureTenantFactory(
"configuretenant",
CommandHelp("configuretenant <TENANT_NAME> <[unset] tenant_group[=<GROUP_NAME>]> ...",
"updates the configuration for a tenant",
"Updates the configuration for a tenant. Use `tenant_group=<GROUP_NAME>' to change the tenant group "
"that a tenant is assigned to or `unset tenant_group' to remove a tenant from its tenant group."));
// renametenant command
ACTOR Future<bool> renameTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
if (tokens.size() != 3) {
@ -316,6 +450,6 @@ CommandFactory renameTenantFactory(
"renametenant",
CommandHelp(
"renametenant <OLD_NAME> <NEW_NAME>",
"renames a tenant in the cluster.",
"renames a tenant in the cluster",
"Renames a tenant in the cluster. The old name must exist and the new name must not exist in the cluster."));
} // namespace fdb_cli

@ -1909,14 +1909,14 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
}
if (tokencmp(tokens[0], "createtenant")) {
bool _result = wait(makeInterruptable(createTenantCommandActor(db, tokens)));
bool _result = wait(makeInterruptable(createTenantCommandActor(db, tokens, opt.apiVersion)));
if (!_result)
is_error = true;
continue;
}
if (tokencmp(tokens[0], "deletetenant")) {
bool _result = wait(makeInterruptable(deleteTenantCommandActor(db, tokens)));
bool _result = wait(makeInterruptable(deleteTenantCommandActor(db, tokens, opt.apiVersion)));
if (!_result)
is_error = true;
else if (tenantName.present() && tokens[1] == tenantName.get()) {
@ -1928,7 +1928,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
}
if (tokencmp(tokens[0], "listtenants")) {
bool _result = wait(makeInterruptable(listTenantsCommandActor(db, tokens)));
bool _result = wait(makeInterruptable(listTenantsCommandActor(db, tokens, opt.apiVersion)));
if (!_result)
is_error = true;
continue;
@ -1941,7 +1941,26 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
continue;
}
if (tokencmp(tokens[0], "configuretenant")) {
if (opt.apiVersion < 720) {
fmt::print(stderr, "ERROR: tenants cannot be configured before API version 720.\n");
is_error = true;
continue;
}
bool _result = wait(makeInterruptable(configureTenantCommandActor(db, tokens)));
if (!_result)
is_error = true;
continue;
}
if (tokencmp(tokens[0], "renametenant")) {
if (opt.apiVersion < 720) {
fmt::print(stderr, "ERROR: tenants cannot be renamed before API version 720.\n");
is_error = true;
continue;
}
bool _result = wait(makeInterruptable(renameTenantCommandActor(db, tokens)));
if (!_result)
is_error = true;

@ -157,6 +157,8 @@ ACTOR Future<bool> configureCommandActor(Reference<IDatabase> db,
std::vector<StringRef> tokens,
LineNoise* linenoise,
Future<Void> warn);
// configuretenant command
ACTOR Future<bool> configureTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
// consistency command
ACTOR Future<bool> consistencyCheckCommandActor(Reference<ITransaction> tr,
std::vector<StringRef> tokens,
@ -164,11 +166,11 @@ ACTOR Future<bool> consistencyCheckCommandActor(Reference<ITransaction> tr,
// coordinators command
ACTOR Future<bool> coordinatorsCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
// createtenant command
ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion);
// datadistribution command
ACTOR Future<bool> dataDistributionCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
// deletetenant command
ACTOR Future<bool> deleteTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
ACTOR Future<bool> deleteTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion);
// exclude command
ACTOR Future<bool> excludeCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, Future<Void> warn);
// expensive_data_check command
@ -194,7 +196,7 @@ ACTOR Future<bool> killCommandActor(Reference<IDatabase> db,
std::vector<StringRef> tokens,
std::map<Key, std::pair<Value, ClientLeaderRegInterface>>* address_interface);
// listtenants command
ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion);
// lock/unlock command
ACTOR Future<bool> lockCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
ACTOR Future<bool> unlockDatabaseActor(Reference<IDatabase> db, UID uid);

@ -1642,16 +1642,6 @@ TEST_CASE("/blobgranule/files/deltaAtVersion") {
return Void();
}
// Picks a number between 2^minExp and 2^maxExp, but uniformly distributed over
// exponential buckets [2^n, 2^(n+1)) so that every magnitude is equally likely.
int randomExp(int minExp, int maxExp) {
	if (minExp == maxExp) { // N=2, case
		return 1 << minExp;
	}
	// Choose the bucket first, then a value uniformly within [val, 2*val).
	int val = 1 << deterministicRandom()->randomInt(minExp, maxExp);
	ASSERT(val > 0);
	return deterministicRandom()->randomInt(val, val * 2);
}
void checkSnapshotEmpty(const Value& serialized, Key begin, Key end, Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
std::map<KeyRef, ValueRef> result;
Arena ar = loadSnapshotFile(serialized, KeyRangeRef(begin, end), result, cipherKeysCtx);
@ -1725,7 +1715,7 @@ struct KeyValueGen {
int sharedPrefixLen = deterministicRandom()->randomInt(0, uidSize);
targetKeyLength = deterministicRandom()->randomInt(4, uidSize);
sharedPrefix = sharedPrefix.substr(0, sharedPrefixLen) + "_";
targetValueLength = randomExp(0, 12);
targetValueLength = deterministicRandom()->randomExp(0, 12);
allRange = KeyRangeRef(StringRef(sharedPrefix), LiteralStringRef("\xff"));
if (deterministicRandom()->coinflip()) {
@ -1748,13 +1738,13 @@ struct KeyValueGen {
minVersionIncrease = 1;
maxVersionIncrease = 2;
} else {
minVersionIncrease = randomExp(0, 25);
maxVersionIncrease = minVersionIncrease + randomExp(0, 25);
minVersionIncrease = deterministicRandom()->randomExp(0, 25);
maxVersionIncrease = minVersionIncrease + deterministicRandom()->randomExp(0, 25);
}
if (deterministicRandom()->coinflip()) {
targetMutationsPerDelta = 1;
} else {
targetMutationsPerDelta = randomExp(1, 5);
targetMutationsPerDelta = deterministicRandom()->randomExp(1, 5);
}
if (deterministicRandom()->coinflip()) {
@ -1918,8 +1908,8 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
// snapshot files are likely to have a non-trivial shared prefix since they're for a small contiguous key range
KeyValueGen kvGen;
int targetChunks = randomExp(0, 9);
int targetDataBytes = randomExp(0, 25);
int targetChunks = deterministicRandom()->randomExp(0, 9);
int targetDataBytes = deterministicRandom()->randomExp(0, 25);
int targetChunkSize = targetDataBytes / targetChunks;
Standalone<GranuleSnapshot> data = genSnapshot(kvGen, targetDataBytes);
@ -1954,7 +1944,7 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
if (data.size() > 1) {
for (int i = 0; i < std::min(100, data.size() * 2); i++) {
int width = randomExp(0, maxExp);
int width = deterministicRandom()->randomExp(0, maxExp);
ASSERT(width <= data.size());
int start = deterministicRandom()->randomInt(0, data.size() - width);
checkSnapshotRead(data, serialized, start, start + width, kvGen.cipherKeys);
@ -2056,8 +2046,8 @@ Standalone<GranuleDeltas> genDeltas(KeyValueGen& kvGen, int targetBytes) {
TEST_CASE("/blobgranule/files/deltaFormatUnitTest") {
KeyValueGen kvGen;
int targetChunks = randomExp(0, 8);
int targetDataBytes = randomExp(0, 21);
int targetChunks = deterministicRandom()->randomExp(0, 8);
int targetDataBytes = deterministicRandom()->randomExp(0, 21);
int targetChunkSize = targetDataBytes / targetChunks;
@ -2161,9 +2151,9 @@ void checkGranuleRead(const KeyValueGen& kvGen,
TEST_CASE("/blobgranule/files/granuleReadUnitTest") {
KeyValueGen kvGen;
int targetSnapshotChunks = randomExp(0, 9);
int targetDeltaChunks = randomExp(0, 8);
int targetDataBytes = randomExp(12, 25);
int targetSnapshotChunks = deterministicRandom()->randomExp(0, 9);
int targetDeltaChunks = deterministicRandom()->randomExp(0, 8);
int targetDataBytes = deterministicRandom()->randomExp(12, 25);
int targetSnapshotBytes = (int)(deterministicRandom()->randomInt(0, targetDataBytes));
int targetDeltaBytes = targetDataBytes - targetSnapshotBytes;

@ -22,6 +22,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/Tenant.h"
#include "flow/IRandom.h"
#include "flow/UnitTest.h"
#define init(...) KNOB_FN(__VA_ARGS__, INIT_ATOMIC_KNOB, INIT_KNOB)(__VA_ARGS__)
@ -289,6 +290,9 @@ void ClientKnobs::initialize(Randomize randomize) {
init( CHANGE_QUORUM_BAD_STATE_RETRY_TIMES, 3 );
init( CHANGE_QUORUM_BAD_STATE_RETRY_DELAY, 2.0 );
// Tenants and Metacluster
init( MAX_TENANTS_PER_CLUSTER, 1e6 ); if ( randomize && BUGGIFY ) MAX_TENANTS_PER_CLUSTER = deterministicRandom()->randomInt(20, 100);
// clang-format on
}

@ -9424,11 +9424,20 @@ Future<Void> DatabaseContext::getChangeFeedStream(Reference<ChangeFeedData> resu
Reference<DatabaseContext>::addRef(this), results, rangeID, begin, end, range, replyBufferSize, canReadPopped);
}
ACTOR Future<std::vector<OverlappingChangeFeedEntry>> singleLocationOverlappingChangeFeeds(
Database cx,
Reference<LocationInfo> location,
KeyRangeRef range,
Version minVersion) {
// Returns the highest recorded feed metadata version among the stored
// (range, version) pairs that intersect `range', or invalidVersion if none do.
Version OverlappingChangeFeedsInfo::getFeedMetadataVersion(const KeyRangeRef& range) const {
	Version latest = invalidVersion;
	for (const auto& [feedRange, metadataVersion] : feedMetadataVersions) {
		if (metadataVersion > latest && feedRange.intersects(range)) {
			latest = metadataVersion;
		}
	}
	return latest;
}
ACTOR Future<OverlappingChangeFeedsReply> singleLocationOverlappingChangeFeeds(Database cx,
Reference<LocationInfo> location,
KeyRangeRef range,
Version minVersion) {
state OverlappingChangeFeedsRequest req;
req.range = range;
req.minVersion = minVersion;
@ -9440,16 +9449,16 @@ ACTOR Future<std::vector<OverlappingChangeFeedEntry>> singleLocationOverlappingC
TaskPriority::DefaultPromiseEndpoint,
AtMostOnce::False,
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr));
return rep.rangeIds;
return rep;
}
bool compareChangeFeedResult(const OverlappingChangeFeedEntry& i, const OverlappingChangeFeedEntry& j) {
return i.rangeId < j.rangeId;
return i.feedId < j.feedId;
}
ACTOR Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeedsActor(Reference<DatabaseContext> db,
KeyRangeRef range,
Version minVersion) {
ACTOR Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeedsActor(Reference<DatabaseContext> db,
KeyRangeRef range,
Version minVersion) {
state Database cx(db);
state Span span("NAPI:GetOverlappingChangeFeeds"_loc);
@ -9475,19 +9484,33 @@ ACTOR Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeedsA
throw all_alternatives_failed();
}
state std::vector<Future<std::vector<OverlappingChangeFeedEntry>>> allOverlappingRequests;
state std::vector<Future<OverlappingChangeFeedsReply>> allOverlappingRequests;
for (auto& it : locations) {
allOverlappingRequests.push_back(
singleLocationOverlappingChangeFeeds(cx, it.locations, it.range & range, minVersion));
}
wait(waitForAll(allOverlappingRequests));
std::vector<OverlappingChangeFeedEntry> result;
for (auto& it : allOverlappingRequests) {
result.insert(result.end(), it.get().begin(), it.get().end());
OverlappingChangeFeedsInfo result;
std::unordered_map<KeyRef, OverlappingChangeFeedEntry> latestFeedMetadata;
for (int i = 0; i < locations.size(); i++) {
result.arena.dependsOn(allOverlappingRequests[i].get().arena);
result.arena.dependsOn(locations[i].range.arena());
result.feedMetadataVersions.push_back(
{ locations[i].range, allOverlappingRequests[i].get().feedMetadataVersion });
for (auto& it : allOverlappingRequests[i].get().feeds) {
auto res = latestFeedMetadata.insert({ it.feedId, it });
if (!res.second) {
CODE_PROBE(true, "deduping fetched overlapping feed by higher metadata version");
if (res.first->second.feedMetadataVersion < it.feedMetadataVersion) {
res.first->second = it;
}
}
}
}
for (auto& it : latestFeedMetadata) {
result.feeds.push_back(result.arena, it.second);
}
std::sort(result.begin(), result.end(), compareChangeFeedResult);
result.resize(std::unique(result.begin(), result.end()) - result.begin());
return result;
} catch (Error& e) {
if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
@ -9500,8 +9523,7 @@ ACTOR Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeedsA
}
}
Future<std::vector<OverlappingChangeFeedEntry>> DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range,
Version minVersion) {
Future<OverlappingChangeFeedsInfo> DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range, Version minVersion) {
return getOverlappingChangeFeedsActor(Reference<DatabaseContext>::addRef(this), range, minVersion);
}

@ -1627,11 +1627,6 @@ BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value) {
return interf;
}
const KeyRangeRef tenantMapKeys("\xff/tenant/map/"_sr, "\xff/tenant/map0"_sr);
const KeyRef tenantMapPrefix = tenantMapKeys.begin;
const KeyRef tenantMapPrivatePrefix = "\xff\xff/tenant/map/"_sr;
const KeyRef tenantLastIdKey = "\xff/tenant/lastId"_sr;
const KeyRangeRef storageQuotaKeys(LiteralStringRef("\xff/storageQuota/"), LiteralStringRef("\xff/storageQuota0"));
const KeyRef storageQuotaPrefix = storageQuotaKeys.begin;

@ -74,6 +74,11 @@ TenantMapEntry::TenantMapEntry() {}
TenantMapEntry::TenantMapEntry(int64_t id, TenantState tenantState) : tenantState(tenantState) {
setId(id);
}
// Constructs a tenant entry with the given id, state, and optional tenant
// group. Delegates prefix derivation to setId.
TenantMapEntry::TenantMapEntry(int64_t id, TenantState tenantState, Optional<TenantGroupName> tenantGroup)
  : tenantState(tenantState), tenantGroup(tenantGroup) {
	setId(id);
}
void TenantMapEntry::setId(int64_t id) {
ASSERT(id >= 0);
this->id = id;
@ -100,9 +105,33 @@ std::string TenantMapEntry::toJson(int apiVersion) const {
tenantEntry["tenant_state"] = TenantMapEntry::tenantStateToString(tenantState);
if (tenantGroup.present()) {
json_spirit::mObject tenantGroupObject;
std::string encodedTenantGroup = base64::encoder::from_string(tenantGroup.get().toString());
// Remove trailing newline
encodedTenantGroup.resize(encodedTenantGroup.size() - 1);
tenantGroupObject["base64"] = encodedTenantGroup;
tenantGroupObject["printable"] = printable(tenantGroup.get());
tenantEntry["tenant_group"] = tenantGroupObject;
}
return json_spirit::write_string(json_spirit::mValue(tenantEntry));
}
// Returns true if this entry has the same configurable properties as `other'.
// Tenant group is currently the only configurable property compared.
bool TenantMapEntry::matchesConfiguration(TenantMapEntry const& other) const {
	return tenantGroup == other.tenantGroup;
}
// Applies a single configuration parameter to this entry. Only `tenant_group'
// is recognized; any other parameter is logged and rejected with
// invalid_tenant_configuration.
void TenantMapEntry::configure(Standalone<StringRef> parameter, Optional<Value> value) {
	// Guard clause: reject unknown parameters before touching any state.
	if (parameter != "tenant_group"_sr) {
		TraceEvent(SevWarnAlways, "UnknownTenantConfigurationParameter").detail("Parameter", parameter);
		throw invalid_tenant_configuration();
	}
	tenantGroup = value;
}
TEST_CASE("/fdbclient/TenantMapEntry/Serialization") {
TenantMapEntry entry1(1, TenantState::READY);
ASSERT(entry1.prefix == "\x00\x00\x00\x00\x00\x00\x00\x01"_sr);

@ -31,3 +31,13 @@ const KeyRangeRef TenantRangeImpl<false>::submoduleRange = KeyRangeRef(""_sr, "\
template <>
const KeyRangeRef TenantRangeImpl<false>::mapSubRange = KeyRangeRef("tenant_map/"_sr, "tenant_map0"_sr);
// Returns whether `subRange' intersects the queried `range' for the
// ranges-enabled (true) specialization: a plain geometric intersection test.
template <>
bool TenantRangeImpl<true>::subRangeIntersects(KeyRangeRef subRange, KeyRangeRef range) {
	return subRange.intersects(range);
}
// Specialization for the legacy (false) module: matches only when `subRange'
// is exactly the tenant-map subrange. NOTE(review): the `range' parameter is
// deliberately unused here — presumably the legacy module exposes only
// mapSubRange; confirm against TenantRangeImpl's callers.
template <>
bool TenantRangeImpl<false>::subRangeIntersects(KeyRangeRef subRange, KeyRangeRef range) {
	return subRange == mapSubRange;
}

@ -284,6 +284,9 @@ public:
int CHANGE_QUORUM_BAD_STATE_RETRY_TIMES;
double CHANGE_QUORUM_BAD_STATE_RETRY_DELAY;
// Tenants and Metacluster
int MAX_TENANTS_PER_CLUSTER;
ClientKnobs(Randomize randomize);
void initialize(Randomize randomize);
};

@ -207,6 +207,16 @@ struct KeyRangeLocationInfo {
: tenantEntry(tenantEntry), range(range), locations(locations) {}
};
struct OverlappingChangeFeedsInfo {
Arena arena;
VectorRef<OverlappingChangeFeedEntry> feeds;
// would prefer to use key range map but it complicates copy/move constructors
std::vector<std::pair<KeyRangeRef, Version>> feedMetadataVersions;
// for a feed that wasn't present, returns the metadata version it would have been fetched at.
Version getFeedMetadataVersion(const KeyRangeRef& feedRange) const;
};
class DatabaseContext : public ReferenceCounted<DatabaseContext>, public FastAllocated<DatabaseContext>, NonCopyable {
public:
static DatabaseContext* allocateOnForeignThread() {
@ -361,7 +371,7 @@ public:
int replyBufferSize = -1,
bool canReadPopped = true);
Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<Void> popChangeFeedMutations(Key rangeID, Version version);
Future<Key> purgeBlobGranules(KeyRange keyRange,

@ -968,39 +968,51 @@ struct FetchCheckpointKeyValuesRequest {
};
struct OverlappingChangeFeedEntry {
Key rangeId;
KeyRange range;
KeyRef feedId;
KeyRangeRef range;
Version emptyVersion;
Version stopVersion;
Version feedMetadataVersion;
bool operator==(const OverlappingChangeFeedEntry& r) const {
return rangeId == r.rangeId && range == r.range && emptyVersion == r.emptyVersion &&
stopVersion == r.stopVersion;
return feedId == r.feedId && range == r.range && emptyVersion == r.emptyVersion &&
stopVersion == r.stopVersion && feedMetadataVersion == r.feedMetadataVersion;
}
OverlappingChangeFeedEntry() {}
OverlappingChangeFeedEntry(Key const& rangeId, KeyRange const& range, Version emptyVersion, Version stopVersion)
: rangeId(rangeId), range(range), emptyVersion(emptyVersion), stopVersion(stopVersion) {}
OverlappingChangeFeedEntry(KeyRef const& feedId,
KeyRangeRef const& range,
Version emptyVersion,
Version stopVersion,
Version feedMetadataVersion)
: feedId(feedId), range(range), emptyVersion(emptyVersion), stopVersion(stopVersion),
feedMetadataVersion(feedMetadataVersion) {}
OverlappingChangeFeedEntry(Arena& arena, const OverlappingChangeFeedEntry& rhs)
: feedId(arena, rhs.feedId), range(arena, rhs.range), emptyVersion(rhs.emptyVersion),
stopVersion(rhs.stopVersion), feedMetadataVersion(rhs.feedMetadataVersion) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeId, range, emptyVersion, stopVersion);
serializer(ar, feedId, range, emptyVersion, stopVersion, feedMetadataVersion);
}
};
struct OverlappingChangeFeedsReply {
constexpr static FileIdentifier file_identifier = 11815134;
std::vector<OverlappingChangeFeedEntry> rangeIds;
VectorRef<OverlappingChangeFeedEntry> feeds;
bool cached;
Arena arena;
Version feedMetadataVersion;
OverlappingChangeFeedsReply() : cached(false) {}
explicit OverlappingChangeFeedsReply(std::vector<OverlappingChangeFeedEntry> const& rangeIds)
: rangeIds(rangeIds), cached(false) {}
OverlappingChangeFeedsReply() : cached(false), feedMetadataVersion(invalidVersion) {}
explicit OverlappingChangeFeedsReply(VectorRef<OverlappingChangeFeedEntry> const& feeds,
Version feedMetadataVersion)
: feeds(feeds), cached(false), feedMetadataVersion(feedMetadataVersion) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeIds, arena);
serializer(ar, feeds, arena, feedMetadataVersion);
}
};

@ -30,6 +30,8 @@
// Identifier aliases for tenants and tenant groups. The *Ref forms are non-owning
// views; the Standalone forms carry their own backing memory.
typedef StringRef TenantNameRef;
typedef Standalone<TenantNameRef> TenantName;
typedef StringRef TenantGroupNameRef;
typedef Standalone<TenantGroupNameRef> TenantGroupName;

// Lifecycle state of a tenant, stored in TenantMapEntry::tenantState.
enum class TenantState { REGISTERING, READY, REMOVING, UPDATING_CONFIGURATION, ERROR };
@ -45,28 +47,29 @@ struct TenantMapEntry {
int64_t id = -1;
Key prefix;
TenantState tenantState = TenantState::READY;
Optional<TenantGroupName> tenantGroup;
constexpr static int PREFIX_SIZE = sizeof(id);
public:
TenantMapEntry();
TenantMapEntry(int64_t id, TenantState tenantState);
TenantMapEntry(int64_t id, TenantState tenantState, Optional<TenantGroupName> tenantGroup);
void setId(int64_t id);
std::string toJson(int apiVersion) const;
Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion(ProtocolVersion::withTenants())); }
bool matchesConfiguration(TenantMapEntry const& other) const;
void configure(Standalone<StringRef> parameter, Optional<Value> value);
Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); }
static TenantMapEntry decode(ValueRef const& value) {
TenantMapEntry entry;
ObjectReader reader(value.begin(), IncludeVersion());
reader.deserialize(entry);
return entry;
return ObjectReader::fromStringRef<TenantMapEntry>(value, IncludeVersion());
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, id, tenantState);
serializer(ar, id, tenantState, tenantGroup);
if constexpr (Ar::isDeserializing) {
if (id >= 0) {
prefix = idToPrefix(id);
@ -76,15 +79,36 @@ public:
}
};
// Value type stored in TenantMetadata::tenantGroupMap. It currently carries no
// payload (the serializer writes no fields); the entry's presence alone records
// that the tenant group exists.
struct TenantGroupEntry {
	constexpr static FileIdentifier file_identifier = 10764222;

	TenantGroupEntry() = default;

	// Serialize/deserialize with a protocol-version prefix so the on-disk format
	// can evolve if fields are added later.
	Value encode() { return ObjectWriter::toValue(*this, IncludeVersion()); }
	static TenantGroupEntry decode(ValueRef const& value) {
		return ObjectReader::fromStringRef<TenantGroupEntry>(value, IncludeVersion());
	}

	template <class Ar>
	void serialize(Ar& ar) {
		// No fields yet.
		serializer(ar);
	}
};
struct TenantMetadataSpecification {
static KeyRef subspace;
KeyBackedObjectMap<TenantName, TenantMapEntry, decltype(IncludeVersion()), NullCodec> tenantMap;
KeyBackedProperty<int64_t> lastTenantId;
KeyBackedBinaryValue<int64_t> tenantCount;
KeyBackedSet<Tuple> tenantGroupTenantIndex;
KeyBackedObjectMap<TenantGroupName, TenantGroupEntry, decltype(IncludeVersion()), NullCodec> tenantGroupMap;
TenantMetadataSpecification(KeyRef subspace)
: tenantMap(subspace.withSuffix("tenant/map/"_sr), IncludeVersion(ProtocolVersion::withTenants())),
lastTenantId(subspace.withSuffix("tenant/lastId"_sr)) {}
: tenantMap(subspace.withSuffix("tenant/map/"_sr), IncludeVersion()),
lastTenantId(subspace.withSuffix("tenant/lastId"_sr)), tenantCount(subspace.withSuffix("tenant/count"_sr)),
tenantGroupTenantIndex(subspace.withSuffix("tenant/tenantGroup/tenantIndex/"_sr)),
tenantGroupMap(subspace.withSuffix("tenant/tenantGroup/map/"_sr), IncludeVersion()) {}
};
struct TenantMetadata {
@ -94,6 +118,9 @@ private:
public:
static inline auto& tenantMap = instance.tenantMap;
static inline auto& lastTenantId = instance.lastTenantId;
static inline auto& tenantCount = instance.tenantCount;
static inline auto& tenantGroupTenantIndex = instance.tenantGroupTenantIndex;
static inline auto& tenantGroupMap = instance.tenantGroupMap;
static inline Key tenantMapPrivatePrefix = "\xff"_sr.withSuffix(tenantMap.subspace.begin);
};

@ -19,6 +19,7 @@
*/
#pragma once
#include "fdbclient/ClientBooleanParams.h"
#include "flow/IRandom.h"
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H)
#define FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H
@ -102,11 +103,18 @@ Future<std::pair<Optional<TenantMapEntry>, bool>> createTenantTransaction(Transa
if (name.startsWith("\xff"_sr)) {
throw invalid_tenant_name();
}
if (tenantEntry.tenantGroup.present() && tenantEntry.tenantGroup.get().startsWith("\xff"_sr)) {
throw invalid_tenant_group_name();
}
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
state Future<Optional<TenantMapEntry>> existingEntryFuture = tryGetTenantTransaction(tr, name);
wait(checkTenantMode(tr));
state Future<Optional<TenantGroupEntry>> existingTenantGroupEntryFuture;
if (tenantEntry.tenantGroup.present()) {
existingTenantGroupEntryFuture = TenantMetadata::tenantGroupMap.get(tr, tenantEntry.tenantGroup.get());
}
Optional<TenantMapEntry> existingEntry = wait(existingEntryFuture);
if (existingEntry.present()) {
@ -123,6 +131,25 @@ Future<std::pair<Optional<TenantMapEntry>, bool>> createTenantTransaction(Transa
tenantEntry.tenantState = TenantState::READY;
TenantMetadata::tenantMap.set(tr, name, tenantEntry);
if (tenantEntry.tenantGroup.present()) {
TenantMetadata::tenantGroupTenantIndex.insert(tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), name));
// Create the tenant group associated with this tenant if it doesn't already exist
Optional<TenantGroupEntry> existingTenantGroup = wait(existingTenantGroupEntryFuture);
if (!existingTenantGroup.present()) {
TenantMetadata::tenantGroupMap.set(tr, tenantEntry.tenantGroup.get(), TenantGroupEntry());
}
}
// This is idempotent because we only add an entry to the tenant map if it isn't already there
TenantMetadata::tenantCount.atomicOp(tr, 1, MutationRef::AddValue);
// Read the tenant count after incrementing the counter so that simultaneous attempts to create
// tenants in the same transaction are properly reflected.
int64_t tenantCount = wait(TenantMetadata::tenantCount.getD(tr, Snapshot::False, 0));
if (tenantCount > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) {
throw cluster_no_capacity();
}
return std::make_pair(tenantEntry, true);
}
@ -182,6 +209,7 @@ Future<Optional<TenantMapEntry>> createTenant(Reference<DB> db,
.detail("Tenant", name)
.detail("TenantId", newTenant.first.get().id)
.detail("Prefix", newTenant.first.get().prefix)
.detail("TenantGroup", tenantEntry.tenantGroup)
.detail("Version", tr->getCommittedVersion());
}
@ -215,7 +243,23 @@ Future<Void> deleteTenantTransaction(Transaction tr,
throw tenant_not_empty();
}
// This is idempotent because we only erase an entry from the tenant map if it is present
TenantMetadata::tenantMap.erase(tr, name);
TenantMetadata::tenantCount.atomicOp(tr, -1, MutationRef::AddValue);
if (tenantEntry.get().tenantGroup.present()) {
TenantMetadata::tenantGroupTenantIndex.erase(tr,
Tuple::makeTuple(tenantEntry.get().tenantGroup.get(), name));
KeyBackedSet<Tuple>::RangeResultType tenantsInGroup = wait(TenantMetadata::tenantGroupTenantIndex.getRange(
tr,
Tuple::makeTuple(tenantEntry.get().tenantGroup.get()),
Tuple::makeTuple(keyAfter(tenantEntry.get().tenantGroup.get())),
2));
if (tenantsInGroup.results.empty() ||
(tenantsInGroup.results.size() == 1 && tenantsInGroup.results[0].getString(1) == name)) {
TenantMetadata::tenantGroupMap.erase(tr, tenantEntry.get().tenantGroup.get());
}
}
}
return Void();
@ -256,6 +300,56 @@ Future<Void> deleteTenant(Reference<DB> db, TenantName name, Optional<int64_t> t
}
}
// This should only be called from a transaction that has already confirmed that the tenant entry
// is present. The tenantEntry should start with the existing entry and modify only those fields that need
// to be changed. This must only be called on a non-management cluster.
//
// Writes the updated entry to the tenant map and, if the tenant group changed,
// keeps the tenant-group index and tenant-group map consistent: the tenant is
// moved between group indexes, an emptied old group is erased, and a new group
// is created on demand.
ACTOR template <class Transaction>
Future<Void> configureTenantTransaction(Transaction tr,
                                        TenantNameRef tenantName,
                                        TenantMapEntry originalEntry,
                                        TenantMapEntry updatedTenantEntry) {
	tr->setOption(FDBTransactionOptions::RAW_ACCESS);
	TenantMetadata::tenantMap.set(tr, tenantName, updatedTenantEntry);

	// If the tenant group was changed, we need to update the tenant group metadata structures
	if (originalEntry.tenantGroup != updatedTenantEntry.tenantGroup) {
		// Group names in the system-key prefix are reserved.
		if (updatedTenantEntry.tenantGroup.present() && updatedTenantEntry.tenantGroup.get().startsWith("\xff"_sr)) {
			throw invalid_tenant_group_name();
		}
		if (originalEntry.tenantGroup.present()) {
			// Remove this tenant from the original tenant group index
			TenantMetadata::tenantGroupTenantIndex.erase(tr,
			                                             Tuple::makeTuple(originalEntry.tenantGroup.get(), tenantName));

			// Check if the original tenant group is now empty. If so, remove the tenant group.
			// Read up to 2 entries: the erase above may not be visible to this read, so the
			// group counts as empty when the only result (if any) is this tenant's own entry.
			KeyBackedSet<Tuple>::RangeResultType tenants = wait(TenantMetadata::tenantGroupTenantIndex.getRange(
			    tr,
			    Tuple::makeTuple(originalEntry.tenantGroup.get()),
			    Tuple::makeTuple(keyAfter(originalEntry.tenantGroup.get())),
			    2));

			if (tenants.results.empty() ||
			    (tenants.results.size() == 1 && tenants.results[0].getString(1) == tenantName)) {
				TenantMetadata::tenantGroupMap.erase(tr, originalEntry.tenantGroup.get());
			}
		}
		if (updatedTenantEntry.tenantGroup.present()) {
			// If this is creating a new tenant group, add it to the tenant group map
			Optional<TenantGroupEntry> entry =
			    wait(TenantMetadata::tenantGroupMap.get(tr, updatedTenantEntry.tenantGroup.get()));
			if (!entry.present()) {
				TenantMetadata::tenantGroupMap.set(tr, updatedTenantEntry.tenantGroup.get(), TenantGroupEntry());
			}

			// Insert this tenant in the tenant group index
			TenantMetadata::tenantGroupTenantIndex.insert(
			    tr, Tuple::makeTuple(updatedTenantEntry.tenantGroup.get(), tenantName));
		}
	}

	return Void();
}
ACTOR template <class Transaction>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantsTransaction(Transaction tr,
TenantNameRef begin,
@ -339,6 +433,14 @@ Future<Void> renameTenant(Reference<DB> db, TenantName oldName, TenantName newNa
TenantMetadata::tenantMap.erase(tr, oldName);
TenantMetadata::tenantMap.set(tr, newName, oldEntry.get());
// Update the tenant group index to reflect the new tenant name
if (oldEntry.get().tenantGroup.present()) {
TenantMetadata::tenantGroupTenantIndex.erase(
tr, Tuple::makeTuple(oldEntry.get().tenantGroup.get(), oldName));
TenantMetadata::tenantGroupTenantIndex.insert(
tr, Tuple::makeTuple(oldEntry.get().tenantGroup.get(), newName));
}
wait(safeThreadFutureToFuture(tr->commit()));
TraceEvent("RenameTenantSuccess").detail("OldName", oldName).detail("NewName", newName);
return Void();

@ -31,14 +31,16 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/TenantManagement.actor.h"
#include "libb64/encode.h"
#include "fdbclient/Tuple.h"
#include "flow/Arena.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // This must be the last #include.
template <bool HasSubRanges = true>
template <bool HasSubRanges>
class TenantRangeImpl : public SpecialKeyRangeRWImpl {
private:
static bool subRangeIntersects(KeyRangeRef subRange, KeyRangeRef range);
static KeyRangeRef removePrefix(KeyRangeRef range, KeyRef prefix, KeyRef defaultEnd) {
KeyRef begin = range.begin.removePrefix(prefix);
KeyRef end;
@ -53,15 +55,14 @@ private:
static KeyRef withTenantMapPrefix(KeyRef key, Arena& ar) {
int keySize = SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin.size() +
TenantRangeImpl::submoduleRange.begin.size() + TenantRangeImpl::mapSubRange.begin.size() +
key.size();
submoduleRange.begin.size() + mapSubRange.begin.size() + key.size();
KeyRef prefixedKey = makeString(keySize, ar);
uint8_t* mutableKey = mutateString(prefixedKey);
mutableKey = SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin.copyTo(mutableKey);
mutableKey = TenantRangeImpl::submoduleRange.begin.copyTo(mutableKey);
mutableKey = TenantRangeImpl::mapSubRange.begin.copyTo(mutableKey);
mutableKey = submoduleRange.begin.copyTo(mutableKey);
mutableKey = mapSubRange.begin.copyTo(mutableKey);
key.copyTo(mutableKey);
return prefixedKey;
@ -84,20 +85,21 @@ private:
return Void();
}
ACTOR static Future<RangeResult> getTenantRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) {
ACTOR template <bool B>
static Future<RangeResult> getTenantRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) {
state RangeResult results;
kr = kr.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)
.removePrefix(TenantRangeImpl::submoduleRange.begin);
.removePrefix(TenantRangeImpl<B>::submoduleRange.begin);
if (kr.intersects(TenantRangeImpl::mapSubRange)) {
if (kr.intersects(TenantRangeImpl<B>::mapSubRange)) {
GetRangeLimits limits = limitsHint;
limits.decrement(results);
wait(getTenantList(
ryw,
removePrefix(kr & TenantRangeImpl::mapSubRange, TenantRangeImpl::mapSubRange.begin, "\xff"_sr),
removePrefix(kr & TenantRangeImpl<B>::mapSubRange, TenantRangeImpl<B>::mapSubRange.begin, "\xff"_sr),
&results,
limits));
}
@ -105,25 +107,106 @@ private:
return results;
}
ACTOR static Future<Void> createTenants(ReadYourWritesTransaction* ryw, std::vector<TenantNameRef> tenants) {
int64_t _nextId = wait(TenantAPI::getNextTenantId(&ryw->getTransaction()));
int64_t nextId = _nextId;
// Returns true if the tenant was created, false if it already existed
ACTOR static Future<bool> createTenant(
ReadYourWritesTransaction* ryw,
TenantNameRef tenantName,
std::vector<std::pair<Standalone<StringRef>, Optional<Value>>> configMutations,
int64_t tenantId,
std::map<TenantGroupName, int>* tenantGroupNetTenantDelta) {
state TenantMapEntry tenantEntry;
tenantEntry.setId(tenantId);
std::vector<Future<Void>> createFutures;
for (auto tenant : tenants) {
state TenantMapEntry tenantEntry(nextId++, TenantState::READY);
createFutures.push_back(
success(TenantAPI::createTenantTransaction(&ryw->getTransaction(), tenant, tenantEntry)));
for (auto const& [name, value] : configMutations) {
tenantEntry.configure(name, value);
}
if (tenantEntry.tenantGroup.present()) {
(*tenantGroupNetTenantDelta)[tenantEntry.tenantGroup.get()]++;
}
std::pair<Optional<TenantMapEntry>, bool> entry =
wait(TenantAPI::createTenantTransaction(&ryw->getTransaction(), tenantName, tenantEntry));
return entry.second;
}
// Creates every tenant in `tenants` (name -> its configuration mutations) inside the
// current special-keys transaction. Tenant ids are allocated as a contiguous run
// starting at the next available id, and lastTenantId is advanced past the run.
// Enforces MAX_TENANTS_PER_CLUSTER here because the per-tenant check in
// createTenantTransaction cannot observe this transaction's own writes (no RYW).
ACTOR static Future<Void> createTenants(
    ReadYourWritesTransaction* ryw,
    std::map<TenantName, std::vector<std::pair<Standalone<StringRef>, Optional<Value>>>> tenants,
    std::map<TenantGroupName, int>* tenantGroupNetTenantDelta) {
	// Kick off the tenant-count read early; it is awaited after the creates.
	state Future<int64_t> tenantCountFuture =
	    TenantMetadata::tenantCount.getD(&ryw->getTransaction(), Snapshot::False, 0);
	int64_t _nextId = wait(TenantAPI::getNextTenantId(&ryw->getTransaction()));
	state int64_t nextId = _nextId;

	state std::vector<Future<bool>> createFutures;
	for (auto const& [tenant, config] : tenants) {
		createFutures.push_back(createTenant(ryw, tenant, config, nextId++, tenantGroupNetTenantDelta));
	}

	TenantMetadata::lastTenantId.set(&ryw->getTransaction(), nextId - 1);
	wait(waitForAll(createFutures));

	// Each future reports whether it actually created a tenant (false = already existed).
	state int numCreatedTenants = 0;
	for (auto f : createFutures) {
		if (f.get()) {
			++numCreatedTenants;
		}
	}

	// Check the tenant count here rather than rely on the createTenantTransaction check because we don't have RYW
	int64_t tenantCount = wait(tenantCountFuture);
	if (tenantCount + numCreatedTenants > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) {
		throw cluster_no_capacity();
	}

	return Void();
}
// Applies a list of configuration mutations to an existing tenant. Throws (via
// getTenantTransaction) if the tenant does not exist. When the mutations move the
// tenant between tenant groups, the net tenant-count delta per group is recorded in
// *tenantGroupNetTenantDelta so the caller can later erase groups that become empty.
ACTOR static Future<Void> changeTenantConfig(
    ReadYourWritesTransaction* ryw,
    TenantName tenantName,
    std::vector<std::pair<Standalone<StringRef>, Optional<Value>>> configEntries,
    std::map<TenantGroupName, int>* tenantGroupNetTenantDelta) {
	TenantMapEntry originalEntry = wait(TenantAPI::getTenantTransaction(&ryw->getTransaction(), tenantName));
	// Start from the existing entry and overlay each (name, value) mutation.
	TenantMapEntry updatedEntry = originalEntry;
	for (auto const& [name, value] : configEntries) {
		updatedEntry.configure(name, value);
	}

	// Track the group membership change: one fewer tenant in the old group,
	// one more in the new group.
	if (originalEntry.tenantGroup != updatedEntry.tenantGroup) {
		if (originalEntry.tenantGroup.present()) {
			(*tenantGroupNetTenantDelta)[originalEntry.tenantGroup.get()]--;
		}
		if (updatedEntry.tenantGroup.present()) {
			(*tenantGroupNetTenantDelta)[updatedEntry.tenantGroup.get()]++;
		}
	}

	wait(TenantAPI::configureTenantTransaction(&ryw->getTransaction(), tenantName, originalEntry, updatedEntry));
	return Void();
}
// Deletes one tenant if it exists (a missing tenant is a no-op, not an error).
// If the tenant belonged to a tenant group, decrements that group's entry in
// *tenantGroupNetTenantDelta so the caller can erase the group if it empties out.
ACTOR static Future<Void> deleteSingleTenant(ReadYourWritesTransaction* ryw,
                                             TenantName tenantName,
                                             std::map<TenantGroupName, int>* tenantGroupNetTenantDelta) {
	state Optional<TenantMapEntry> tenantEntry =
	    wait(TenantAPI::tryGetTenantTransaction(&ryw->getTransaction(), tenantName));
	if (tenantEntry.present()) {
		wait(TenantAPI::deleteTenantTransaction(&ryw->getTransaction(), tenantName));
		if (tenantEntry.get().tenantGroup.present()) {
			(*tenantGroupNetTenantDelta)[tenantEntry.get().tenantGroup.get()]--;
		}
	}

	return Void();
}
ACTOR static Future<Void> deleteTenantRange(ReadYourWritesTransaction* ryw,
TenantName beginTenant,
TenantName endTenant) {
TenantName endTenant,
std::map<TenantGroupName, int>* tenantGroupNetTenantDelta) {
state std::vector<std::pair<TenantName, TenantMapEntry>> tenants = wait(
TenantAPI::listTenantsTransaction(&ryw->getTransaction(), beginTenant, endTenant, CLIENT_KNOBS->TOO_MANY));
@ -139,69 +222,154 @@ private:
std::vector<Future<Void>> deleteFutures;
for (auto tenant : tenants) {
deleteFutures.push_back(TenantAPI::deleteTenantTransaction(&ryw->getTransaction(), tenant.first));
if (tenant.second.tenantGroup.present()) {
(*tenantGroupNetTenantDelta)[tenant.second.tenantGroup.get()]--;
}
}
wait(waitForAll(deleteFutures));
return Void();
}
// Check if the number of tenants in the tenant group is equal to the net reduction in the number of tenants.
// If it is, then we can delete the tenant group.
//
// Because the underlying reads cannot see this transaction's own index erases (no RYW),
// the index still reports the removed tenants; reading removedTenants + 1 entries is
// therefore enough to tell whether any tenant beyond the removed ones remains.
ACTOR static Future<Void> checkAndRemoveTenantGroup(ReadYourWritesTransaction* ryw,
                                                    TenantGroupName tenantGroup,
                                                    int tenantDelta) {
	ASSERT(tenantDelta < 0);
	state int removedTenants = -tenantDelta;
	KeyBackedSet<Tuple>::RangeResultType tenantsInGroup =
	    wait(TenantMetadata::tenantGroupTenantIndex.getRange(&ryw->getTransaction(),
	                                                         Tuple::makeTuple(tenantGroup),
	                                                         Tuple::makeTuple(keyAfter(tenantGroup)),
	                                                         removedTenants + 1));

	// The snapshot must contain at least the tenants we removed; exactly that many
	// means the group is now empty and can be erased.
	ASSERT(tenantsInGroup.results.size() >= removedTenants);
	if (tenantsInGroup.results.size() == removedTenants) {
		TenantMetadata::tenantGroupMap.erase(&ryw->getTransaction(), tenantGroup);
	}

	return Void();
}
public:
// These ranges vary based on the template parameter
const static KeyRangeRef submoduleRange;
const static KeyRangeRef mapSubRange;
// These sub-ranges should only be used if HasSubRanges=true
const inline static KeyRangeRef configureSubRange = KeyRangeRef("configure/"_sr, "configure0"_sr);
explicit TenantRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const override {
return getTenantRange(ryw, kr, limitsHint);
return getTenantRange<HasSubRanges>(ryw, kr, limitsHint);
}
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override {
auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(range);
std::vector<Future<Void>> tenantManagementFutures;
ACTOR static Future<Optional<std::string>> commitImpl(TenantRangeImpl* self, ReadYourWritesTransaction* ryw) {
state std::vector<Future<Void>> tenantManagementFutures;
std::vector<std::pair<KeyRangeRef, Optional<Value>>> mapMutations;
// This map is an ugly workaround to the fact that we cannot use RYW in these transactions.
// It tracks the net change to the number of tenants in a tenant group, and at the end we can compare
// that with how many tenants the tenant group started with. If we removed all of the tenants, then we
// delete the tenant group.
//
// SOMEDAY: enable RYW support in special keys and remove this complexity.
state std::map<TenantGroupName, int> tenantGroupNetTenantDelta;
state KeyRangeMap<std::pair<bool, Optional<Value>>>::Ranges ranges =
ryw->getSpecialKeySpaceWriteMap().containedRanges(self->range);
state std::vector<std::pair<KeyRangeRef, Optional<Value>>> mapMutations;
state std::map<TenantName, std::vector<std::pair<Standalone<StringRef>, Optional<Value>>>> configMutations;
tenantManagementFutures.push_back(TenantAPI::checkTenantMode(&ryw->getTransaction()));
for (auto range : ranges) {
if (!range.value().first) {
continue;
}
KeyRangeRef adjustedRange =
state KeyRangeRef adjustedRange =
range.range()
.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)
.removePrefix(submoduleRange.begin);
if (mapSubRange.intersects(adjustedRange)) {
if (subRangeIntersects(mapSubRange, adjustedRange)) {
adjustedRange = mapSubRange & adjustedRange;
adjustedRange = removePrefix(adjustedRange, mapSubRange.begin, "\xff"_sr);
mapMutations.push_back(std::make_pair(adjustedRange, range.value().second));
} else if (subRangeIntersects(configureSubRange, adjustedRange) && adjustedRange.singleKeyRange()) {
StringRef configTupleStr = adjustedRange.begin.removePrefix(configureSubRange.begin);
try {
Tuple tuple = Tuple::unpack(configTupleStr);
if (tuple.size() != 2) {
throw invalid_tuple_index();
}
configMutations[tuple.getString(0)].push_back(
std::make_pair(tuple.getString(1), range.value().second));
} catch (Error& e) {
TraceEvent(SevWarn, "InvalidTenantConfigurationKey").error(e).detail("Key", adjustedRange.begin);
ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString(
false, "configure tenant", "invalid tenant configuration key"));
throw special_keys_api_failure();
}
}
}
std::vector<TenantNameRef> tenantsToCreate;
std::map<TenantName, std::vector<std::pair<Standalone<StringRef>, Optional<Value>>>> tenantsToCreate;
for (auto mapMutation : mapMutations) {
TenantNameRef tenantName = mapMutation.first.begin;
if (mapMutation.second.present()) {
tenantsToCreate.push_back(tenantName);
std::vector<std::pair<Standalone<StringRef>, Optional<Value>>> createMutations;
auto itr = configMutations.find(tenantName);
if (itr != configMutations.end()) {
createMutations = itr->second;
configMutations.erase(itr);
}
tenantsToCreate[tenantName] = createMutations;
} else {
// For a single key clear, just issue the delete
if (mapMutation.first.singleKeyRange()) {
tenantManagementFutures.push_back(
TenantAPI::deleteTenantTransaction(&ryw->getTransaction(), tenantName));
tenantManagementFutures.push_back(deleteSingleTenant(ryw, tenantName, &tenantGroupNetTenantDelta));
// Configuration changes made to a deleted tenant are discarded
configMutations.erase(tenantName);
} else {
tenantManagementFutures.push_back(deleteTenantRange(ryw, tenantName, mapMutation.first.end));
tenantManagementFutures.push_back(
deleteTenantRange(ryw, tenantName, mapMutation.first.end, &tenantGroupNetTenantDelta));
// Configuration changes made to a deleted tenant are discarded
configMutations.erase(configMutations.lower_bound(tenantName),
configMutations.lower_bound(mapMutation.first.end));
}
}
}
if (!tenantsToCreate.empty()) {
tenantManagementFutures.push_back(createTenants(ryw, tenantsToCreate));
tenantManagementFutures.push_back(createTenants(ryw, tenantsToCreate, &tenantGroupNetTenantDelta));
}
for (auto configMutation : configMutations) {
tenantManagementFutures.push_back(
changeTenantConfig(ryw, configMutation.first, configMutation.second, &tenantGroupNetTenantDelta));
}
return tag(waitForAll(tenantManagementFutures), Optional<std::string>());
wait(waitForAll(tenantManagementFutures));
state std::vector<Future<Void>> tenantGroupUpdateFutures;
for (auto [tenantGroup, count] : tenantGroupNetTenantDelta) {
if (count < 0) {
tenantGroupUpdateFutures.push_back(checkAndRemoveTenantGroup(ryw, tenantGroup, count));
}
}
wait(waitForAll(tenantGroupUpdateFutures));
return Optional<std::string>();
}
// SpecialKeyRangeRWImpl hook: delegates all buffered tenant-module writes to commitImpl.
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override { return commitImpl(this, ryw); }
};
#include "flow/unactorcompiler.h"

@ -24,6 +24,7 @@
#include <algorithm>
#include <string>
#include <random>
#include <limits>
#include "flow/flow.h"
#include "flow/Histogram.h"
@ -472,6 +473,8 @@ public:
bool setDiffProtocol; // true if a process with a different protocol version has been started
bool allowStorageMigrationTypeChange = false;
double injectTargetedSSRestartTime = std::numeric_limits<double>::max();
double injectSSDelayTime = std::numeric_limits<double>::max();
std::unordered_map<Standalone<StringRef>, PrivateKey> authKeys;

@ -967,19 +967,12 @@ ACTOR Future<Void> monitorClientRanges(Reference<BlobManagerData> bmData) {
} else {
state KeyBackedRangeResult<std::pair<TenantName, TenantMapEntry>> tenantResults;
wait(store(tenantResults,
TenantMetadata::tenantMap.getRange(
tr, Optional<TenantName>(), Optional<TenantName>(), CLIENT_KNOBS->TOO_MANY)));
ASSERT_WE_THINK(!tenantResults.more && tenantResults.results.size() < CLIENT_KNOBS->TOO_MANY);
if (tenantResults.more || tenantResults.results.size() >= CLIENT_KNOBS->TOO_MANY) {
TraceEvent(SevError, "BlobManagerTooManyTenants", bmData->id)
.detail("Epoch", bmData->epoch)
.detail("TenantCount", tenantResults.results.size());
wait(delay(600));
if (bmData->iAmReplaced.canBeSet()) {
bmData->iAmReplaced.sendError(internal_error());
}
throw internal_error();
}
TenantMetadata::tenantMap.getRange(tr,
Optional<TenantName>(),
Optional<TenantName>(),
CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)));
ASSERT(tenantResults.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER &&
!tenantResults.more);
std::vector<Key> prefixes;
for (auto& it : tenantResults.results) {
@ -1650,7 +1643,9 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
state Key lockKey = blobGranuleLockKeyFor(parentRange);
state Future<Optional<Value>> oldLockFuture = tr->get(lockKey);
wait(updateChangeFeed(tr,
// This has to be
// TODO: fix this better! (privatize change feed key clear)
wait(updateChangeFeed(&tr->getTransaction(),
granuleIDToCFKey(parentGranuleIDs[parentIdx]),
ChangeFeedStatus::CHANGE_FEED_DESTROY,
parentRange));

@ -3999,18 +3999,11 @@ ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData) {
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state KeyBackedRangeResult<std::pair<TenantName, TenantMapEntry>> tenantResults;
wait(store(tenantResults,
TenantMetadata::tenantMap.getRange(
tr, Optional<TenantName>(), Optional<TenantName>(), CLIENT_KNOBS->TOO_MANY)));
ASSERT_WE_THINK(!tenantResults.more && tenantResults.results.size() < CLIENT_KNOBS->TOO_MANY);
if (tenantResults.more || tenantResults.results.size() >= CLIENT_KNOBS->TOO_MANY) {
TraceEvent(SevError, "BlobWorkerTooManyTenants", bwData->id)
.detail("TenantCount", tenantResults.results.size());
wait(delay(600));
if (bwData->fatalError.canBeSet()) {
bwData->fatalError.sendError(internal_error());
}
throw internal_error();
}
TenantMetadata::tenantMap.getRange(tr,
Optional<TenantName>(),
Optional<TenantName>(),
CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)));
ASSERT(tenantResults.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER && !tenantResults.more);
std::vector<std::pair<TenantName, TenantMapEntry>> tenants;
for (auto& it : tenantResults.results) {

@ -110,9 +110,9 @@ class RocksDBErrorListener : public rocksdb::EventListener {
public:
RocksDBErrorListener(){};
void OnBackgroundError(rocksdb::BackgroundErrorReason reason, rocksdb::Status* bg_error) override {
TraceEvent(SevError, "RocksDBBGError")
TraceEvent(SevError, "ShardedRocksDBBGError")
.detail("Reason", getErrorReason(reason))
.detail("RocksDBSeverity", bg_error->severity())
.detail("ShardedRocksDBSeverity", bg_error->severity())
.detail("Status", bg_error->ToString());
std::unique_lock<std::mutex> lock(mutex);
if (!errorPromise.isValid())
@ -186,8 +186,8 @@ std::vector<std::pair<KeyRange, std::string>> decodeShardMapping(const RangeResu
void logRocksDBError(const rocksdb::Status& status, const std::string& method) {
auto level = status.IsTimedOut() ? SevWarn : SevError;
TraceEvent e(level, "RocksDBError");
e.detail("Error", status.ToString()).detail("Method", method).detail("RocksDBSeverity", status.severity());
TraceEvent e(level, "ShardedRocksDBError");
e.detail("Error", status.ToString()).detail("Method", method).detail("ShardedRocksDBSeverity", status.severity());
if (status.IsIOError()) {
e.detail("SubCode", status.subcode());
}
@ -219,7 +219,7 @@ const char* ShardOpToString(ShardOp op) {
}
}
void logShardEvent(StringRef name, ShardOp op, Severity severity = SevInfo, const std::string& message = "") {
TraceEvent e(severity, "KVSShardEvent");
TraceEvent e(severity, "ShardedRocksKVSShardEvent");
e.detail("Name", name).detail("Action", ShardOpToString(op));
if (!message.empty()) {
e.detail("Message", message);
@ -230,7 +230,7 @@ void logShardEvent(StringRef name,
ShardOp op,
Severity severity = SevInfo,
const std::string& message = "") {
TraceEvent e(severity, "KVSShardEvent");
TraceEvent e(severity, "ShardedRocksKVSShardEvent");
e.detail("Name", name).detail("Action", ShardOpToString(op)).detail("Begin", range.begin).detail("End", range.end);
if (message != "") {
e.detail("Message", message);
@ -343,7 +343,7 @@ public:
ASSERT(cf);
readRangeOptions.background_purge_on_iterator_cleanup = true;
readRangeOptions.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0);
TraceEvent(SevDebug, "ReadIteratorPool")
TraceEvent(SevVerbose, "ShardedRocksReadIteratorPool")
.detail("Path", path)
.detail("KnobRocksDBReadRangeReuseIterators", SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS)
.detail("KnobRocksDBPrefixLen", SERVER_KNOBS->ROCKSDB_PREFIX_LEN);
@ -425,7 +425,7 @@ private:
ACTOR Future<Void> flowLockLogger(const FlowLock* readLock, const FlowLock* fetchLock) {
loop {
wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY));
TraceEvent e("RocksDBFlowLock");
TraceEvent e("ShardedRocksDBFlowLock");
e.detail("ReadAvailable", readLock->available());
e.detail("ReadActivePermits", readLock->activePermits());
e.detail("ReadWaiters", readLock->waiters());
@ -588,13 +588,13 @@ public:
if (rState->closing) {
break;
}
TraceEvent(SevInfo, "KVSPhysialShardMetrics")
TraceEvent(SevInfo, "ShardedRocksKVSPhysialShardMetrics")
.detail("NumActiveShards", shardManager->numActiveShards())
.detail("TotalPhysicalShards", shardManager->numPhysicalShards());
}
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevError, "ShardMetricsLoggerError").errorUnsuppressed(e);
TraceEvent(SevError, "ShardedRocksShardMetricsLoggerError").errorUnsuppressed(e);
}
}
return Void();
@ -602,7 +602,7 @@ public:
rocksdb::Status init() {
// Open instance.
TraceEvent(SevVerbose, "ShardManagerInitBegin", this->logId).detail("DataPath", path);
TraceEvent(SevInfo, "ShardedRocksShardManagerInitBegin", this->logId).detail("DataPath", path);
std::vector<std::string> columnFamilies;
rocksdb::Options options = getOptions();
rocksdb::Status status = rocksdb::DB::ListColumnFamilies(options, path, &columnFamilies);
@ -632,6 +632,8 @@ public:
}
if (foundMetadata) {
TraceEvent(SevInfo, "ShardedRocksInitLoadPhysicalShards", this->logId)
.detail("PhysicalShardCount", handles.size());
for (auto handle : handles) {
if (handle->GetName() == "kvs-metadata") {
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata", handle);
@ -639,7 +641,8 @@ public:
physicalShards[handle->GetName()] = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
}
columnFamilyMap[handle->GetID()] = handle;
TraceEvent(SevInfo, "ShardedRocskDB").detail("FoundShard", handle->GetName()).detail("Action", "Init");
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId)
.detail("PhysicalShard", handle->GetName());
}
RangeResult metadata;
readRangeInDb(metadataShard.get(), prefixRange(shardMappingPrefix), UINT16_MAX, UINT16_MAX, &metadata);
@ -647,7 +650,7 @@ public:
std::vector<std::pair<KeyRange, std::string>> mapping = decodeShardMapping(metadata, shardMappingPrefix);
for (const auto& [range, name] : mapping) {
TraceEvent(SevDebug, "ShardedRocksLoadPhysicalShard", this->logId)
TraceEvent(SevVerbose, "ShardedRocksLoadRange", this->logId)
.detail("Range", range)
.detail("PhysicalShard", name);
auto it = physicalShards.find(name);
@ -662,10 +665,10 @@ public:
activePhysicalShardIds.emplace(name);
}
// TODO: remove unused column families.
} else {
// DB is opened with default shard.
ASSERT(handles.size() == 1);
// Add SpecialKeys range. This range should not be modified.
std::shared_ptr<PhysicalShard> defaultShard = std::make_shared<PhysicalShard>(db, "default", handles[0]);
columnFamilyMap[defaultShard->cf->GetID()] = defaultShard->cf;
@ -688,7 +691,7 @@ public:
return status;
}
metadataShard->readIterPool->update();
TraceEvent(SevInfo, "InitializeMetaDataShard", this->logId)
TraceEvent(SevInfo, "ShardedRocksInitializeMetaDataShard", this->logId)
.detail("MetadataShardCF", metadataShard->cf->GetID());
}
physicalShards["kvs-metadata"] = metadataShard;
@ -696,7 +699,7 @@ public:
writeBatch = std::make_unique<rocksdb::WriteBatch>();
dirtyShards = std::make_unique<std::set<PhysicalShard*>>();
TraceEvent(SevDebug, "ShardManagerInitEnd", this->logId).detail("DataPath", path);
TraceEvent(SevInfo, "ShardedRocksShardManagerInitEnd", this->logId).detail("DataPath", path);
return status;
}
@ -712,7 +715,7 @@ public:
for (auto it = rangeIterator.begin(); it != rangeIterator.end(); ++it) {
if (it.value() == nullptr) {
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevVerbose, "ShardedRocksDB")
.detail("Info", "ShardNotFound")
.detail("BeginKey", range.begin)
.detail("EndKey", range.end);
@ -724,9 +727,10 @@ public:
}
PhysicalShard* addRange(KeyRange range, std::string id) {
TraceEvent(SevVerbose, "ShardedRocksAddRangeBegin", this->logId)
TraceEvent(SevInfo, "ShardedRocksAddRangeBegin", this->logId)
.detail("Range", range)
.detail("PhysicalShardID", id);
// Newly added range should not overlap with any existing range.
auto ranges = dataShardMap.intersectingRanges(range);
@ -750,7 +754,7 @@ public:
validate();
TraceEvent(SevVerbose, "ShardedRocksAddRangeEnd", this->logId)
TraceEvent(SevInfo, "ShardedRocksAddRangeEnd", this->logId)
.detail("Range", range)
.detail("PhysicalShardID", id);
@ -758,7 +762,7 @@ public:
}
std::vector<std::string> removeRange(KeyRange range) {
TraceEvent(SevVerbose, "ShardedRocksRemoveRangeBegin", this->logId).detail("Range", range);
TraceEvent(SevInfo, "ShardedRocksRemoveRangeBegin", this->logId).detail("Range", range);
std::vector<std::string> shardIds;
@ -796,6 +800,7 @@ public:
}
continue;
}
// Range modification could result in more than one segments. Remove the original segment key here.
existingShard->dataShards.erase(shardRange.begin.toString());
if (shardRange.begin < range.begin) {
@ -826,7 +831,7 @@ public:
validate();
TraceEvent(SevVerbose, "ShardedRocksRemoveRangeEnd", this->logId).detail("Range", range);
TraceEvent(SevInfo, "ShardedRocksRemoveRangeEnd", this->logId).detail("Range", range);
return shardIds;
}
@ -849,7 +854,7 @@ public:
TraceEvent(SevError, "ShardedRocksDB").detail("Error", "write to non-exist shard").detail("WriteKey", key);
return;
}
TraceEvent(SevVerbose, "ShardManagerPut", this->logId)
TraceEvent(SevVerbose, "ShardedRocksShardManagerPut", this->logId)
.detail("WriteKey", key)
.detail("Value", value)
.detail("MapRange", it.range())
@ -859,7 +864,9 @@ public:
ASSERT(dirtyShards != nullptr);
writeBatch->Put(it.value()->physicalShard->cf, toSlice(key), toSlice(value));
dirtyShards->insert(it.value()->physicalShard);
TraceEvent(SevVerbose, "ShardManagerPutEnd", this->logId).detail("WriteKey", key).detail("Value", value);
TraceEvent(SevVerbose, "ShardedRocksShardManagerPutEnd", this->logId)
.detail("WriteKey", key)
.detail("Value", value);
}
void clear(KeyRef key) {
@ -884,7 +891,7 @@ public:
}
void persistRangeMapping(KeyRangeRef range, bool isAdd) {
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Info", "RangeToPersist")
.detail("BeginKey", range.begin)
.detail("EndKey", range.end);
@ -902,7 +909,7 @@ public:
writeBatch->Put(metadataShard->cf,
getShardMappingKey(it.range().begin, shardMappingPrefix),
it.value()->physicalShard->id);
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Action", "PersistRangeMapping")
.detail("BeginKey", it.range().begin)
.detail("EndKey", it.range().end)
@ -911,7 +918,7 @@ public:
} else {
// Empty range.
writeBatch->Put(metadataShard->cf, getShardMappingKey(it.range().begin, shardMappingPrefix), "");
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Action", "PersistRangeMapping")
.detail("BeginKey", it.range().begin)
.detail("EndKey", it.range().end)
@ -921,7 +928,7 @@ public:
}
} else {
writeBatch->Put(metadataShard->cf, getShardMappingKey(range.begin, shardMappingPrefix), "");
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Action", "PersistRangeMapping")
.detail("RemoveRange", "True")
.detail("BeginKey", range.begin)
@ -972,7 +979,7 @@ public:
if (!s.ok()) {
logRocksDBError(s, "DestroyDB");
}
TraceEvent("RocksDB").detail("Info", "DBDestroyed");
TraceEvent("ShardedRocksDB", this->logId).detail("Info", "DBDestroyed");
}
rocksdb::DB* getDb() const { return db; }
@ -997,9 +1004,9 @@ public:
}
void validate() {
TraceEvent(SevVerbose, "ValidateShardManager", this->logId);
TraceEvent(SevVerbose, "ShardedRocksValidateShardManager", this->logId);
for (auto s = dataShardMap.ranges().begin(); s != dataShardMap.ranges().end(); ++s) {
TraceEvent e(SevVerbose, "ValidateDataShardMap", this->logId);
TraceEvent e(SevVerbose, "ShardedRocksValidateDataShardMap", this->logId);
e.detail("Range", s->range());
const DataShard* shard = s->value();
e.detail("ShardAddress", reinterpret_cast<std::uintptr_t>(shard));
@ -1008,6 +1015,13 @@ public:
} else {
e.detail("Shard", "Empty");
}
if (shard != nullptr) {
ASSERT(shard->range == static_cast<KeyRangeRef>(s->range()));
ASSERT(shard->physicalShard != nullptr);
auto it = shard->physicalShard->dataShards.find(shard->range.begin.toString());
ASSERT(it != shard->physicalShard->dataShards.end());
ASSERT(it->second.get() == shard);
}
}
}
@ -1338,7 +1352,7 @@ std::shared_ptr<rocksdb::Statistics> RocksDBMetrics::getStatsObjForRocksDB() {
}
void RocksDBMetrics::logStats(rocksdb::DB* db) {
TraceEvent e("RocksDBMetrics");
TraceEvent e("ShardedRocksDBMetrics");
uint64_t stat;
for (auto& [name, ticker, cumulation] : tickerStats) {
stat = stats->getTickerCount(ticker);
@ -1361,7 +1375,7 @@ void RocksDBMetrics::logStats(rocksdb::DB* db) {
}
void RocksDBMetrics::logMemUsagePerShard(std::string shardName, rocksdb::DB* db) {
TraceEvent e("RocksDBShardMemMetrics");
TraceEvent e("ShardedRocksDBShardMemMetrics");
uint64_t stat;
ASSERT(db != nullptr);
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kBlockCacheUsage, &stat));
@ -1387,7 +1401,7 @@ void RocksDBMetrics::setPerfContext(int index) {
}
void RocksDBMetrics::logPerfContext(bool ignoreZeroMetric) {
TraceEvent e("RocksDBPerfContextMetrics");
TraceEvent e("ShardedRocksDBPerfContextMetrics");
e.setMaxEventLength(20000);
for (auto& [name, metric, vals] : perfContextMetrics) {
uint64_t s = 0;
@ -1650,7 +1664,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
return;
}
TraceEvent(SevInfo, "RocksDB").detail("Method", "Open");
TraceEvent(SevInfo, "ShardedRocksDB").detail("Method", "Open");
a.done.send(Void());
}
@ -1841,7 +1855,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
} else {
a.shardManager->closeAllShards();
}
TraceEvent(SevInfo, "RocksDB").detail("Method", "Close");
TraceEvent(SevInfo, "ShardedRocksDB").detail("Method", "Close");
a.done.send(Void());
}
};
@ -1908,7 +1922,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
traceBatch.get().addEvent("GetValueDebug", a.debugID.get().first(), "Reader.Before");
}
if (readBeginTime - a.startTime > readValueTimeout) {
TraceEvent(SevWarn, "RocksDBError")
TraceEvent(SevWarn, "ShardedRocksDBError")
.detail("Error", "Read value request timedout")
.detail("Method", "ReadValueAction")
.detail("Timeout value", readValueTimeout);
@ -1995,7 +2009,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
"Reader.Before"); //.detail("TaskID", g_network->getCurrentTask());
}
if (readBeginTime - a.startTime > readValuePrefixTimeout) {
TraceEvent(SevWarn, "RocksDBError")
TraceEvent(SevWarn, "ShardedRocksDBError")
.detail("Error", "Read value prefix request timedout")
.detail("Method", "ReadValuePrefixAction")
.detail("Timeout value", readValuePrefixTimeout);
@ -2080,7 +2094,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
rocksDBMetrics->getReadRangeQueueWaitHistogram(threadIndex)->sampleSeconds(readBeginTime - a.startTime);
}
if (readBeginTime - a.startTime > readRangeTimeout) {
TraceEvent(SevWarn, "KVSReadTimeout")
TraceEvent(SevWarn, "ShardedRocksKVSReadTimeout")
.detail("Error", "Read range request timedout")
.detail("Method", "ReadRangeAction")
.detail("Timeout value", readRangeTimeout);
@ -2127,10 +2141,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
}
}
Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, "ShardedRocksDBNumShardsInRangeRead"_sr, Histogram::Unit::countLinear)
->sample(numShards);
result.more =
(result.size() == a.rowLimit) || (result.size() == -a.rowLimit) || (accumulatedBytes >= a.byteLimit);
if (result.more) {
@ -2184,7 +2194,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
readThreads = createGenericThreadPool();
}
writeThread->addThread(new Writer(id, 0, shardManager.getColumnFamilyMap(), rocksDBMetrics), "fdb-rocksdb-wr");
TraceEvent("RocksDBReadThreads").detail("KnobRocksDBReadParallelism", SERVER_KNOBS->ROCKSDB_READ_PARALLELISM);
TraceEvent("ShardedRocksDBReadThreads", id)
.detail("KnobRocksDBReadParallelism", SERVER_KNOBS->ROCKSDB_READ_PARALLELISM);
for (unsigned i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; ++i) {
readThreads->addThread(new Reader(id, i, rocksDBMetrics), "fdb-rocksdb-re");
}
@ -2302,7 +2313,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
auto* shard = shardManager.getDataShard(key);
if (shard == nullptr || !shard->physicalShard->initialized()) {
// TODO: read non-exist system key range should not cause an error.
TraceEvent(SevWarnAlways, "ShardedRocksDB")
TraceEvent(SevWarnAlways, "ShardedRocksDB", this->id)
.detail("Detail", "Read non-exist key range")
.detail("ReadKey", key);
return Optional<Value>();
@ -2330,7 +2341,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
auto* shard = shardManager.getDataShard(key);
if (shard == nullptr || !shard->physicalShard->initialized()) {
// TODO: read non-exist system key range should not cause an error.
TraceEvent(SevWarnAlways, "ShardedRocksDB")
TraceEvent(SevWarnAlways, "ShardedRocksDB", this->id)
.detail("Detail", "Read non-exist key range")
.detail("ReadKey", key);
return Optional<Value>();
@ -2452,7 +2463,7 @@ IKeyValueStore* keyValueStoreShardedRocksDB(std::string const& path,
#ifdef SSD_ROCKSDB_EXPERIMENTAL
return new ShardedRocksDBKeyValueStore(path, logID);
#else
TraceEvent(SevError, "RocksDBEngineInitFailure").detail("Reason", "Built without RocksDB");
TraceEvent(SevError, "ShardedRocksDBEngineInitFailure").detail("Reason", "Built without RocksDB");
ASSERT(false);
return nullptr;
#endif // SSD_ROCKSDB_EXPERIMENTAL

@ -281,6 +281,13 @@ class TestConfig {
if (attrib == "blobGranulesEnabled") {
blobGranulesEnabled = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "injectSSTargetedRestart") {
injectTargetedSSRestart = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "injectSSDelay") {
injectSSDelay = strcmp(value.c_str(), "true") == 0;
}
}
ifs.close();
@ -328,6 +335,8 @@ public:
bool allowDefaultTenant = true;
bool allowDisablingTenants = true;
bool injectTargetedSSRestart = false;
bool injectSSDelay = false;
ConfigDBType getConfigDBType() const { return configDBType; }
@ -385,7 +394,9 @@ public:
.add("blobGranulesEnabled", &blobGranulesEnabled)
.add("allowDefaultTenant", &allowDefaultTenant)
.add("allowDisablingTenants", &allowDisablingTenants)
.add("randomlyRenameZoneId", &randomlyRenameZoneId);
.add("randomlyRenameZoneId", &randomlyRenameZoneId)
.add("injectTargetedSSRestart", &injectTargetedSSRestart)
.add("injectSSDelay", &injectSSDelay);
try {
auto file = toml::parse(testFile);
if (file.contains("configuration") && toml::find(file, "configuration").is_table()) {
@ -1388,7 +1399,7 @@ void SimulationConfig::setDatacenters(const TestConfig& testConfig) {
void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
// Using [0, 4) to disable the RocksDB storage engine.
// TODO: Figure out what is broken with the RocksDB engine in simulation.
int storage_engine_type = deterministicRandom()->randomInt(0, 4);
int storage_engine_type = deterministicRandom()->randomInt(0, 6);
if (testConfig.storageEngineType.present()) {
storage_engine_type = testConfig.storageEngineType.get();
} else {
@ -1396,7 +1407,7 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
while (std::find(testConfig.storageEngineExcludeTypes.begin(),
testConfig.storageEngineExcludeTypes.end(),
storage_engine_type) != testConfig.storageEngineExcludeTypes.end()) {
storage_engine_type = deterministicRandom()->randomInt(0, 5);
storage_engine_type = deterministicRandom()->randomInt(0, 6);
}
}
@ -1439,6 +1450,8 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
TraceEvent(SevWarnAlways, "RocksDBNonDeterminism")
.detail("Explanation", "The Sharded RocksDB storage engine is threaded and non-deterministic");
noUnseed = true;
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
g_knobs.setKnob("shard_encode_location_metadata", KnobValueRef::create(bool{ true }));
break;
}
default:
@ -2368,6 +2381,13 @@ ACTOR void setupAndRun(std::string dataFolder,
testConfig.readFromConfig(testFile);
g_simulator.hasDiffProtocolProcess = testConfig.startIncompatibleProcess;
g_simulator.setDiffProtocol = false;
if (testConfig.injectTargetedSSRestart && deterministicRandom()->random01() < 0.25) {
g_simulator.injectTargetedSSRestartTime = 60.0 + 340.0 * deterministicRandom()->random01();
}
if (testConfig.injectSSDelay && deterministicRandom()->random01() < 0.25) {
g_simulator.injectSSDelayTime = 60.0 + 240.0 * deterministicRandom()->random01();
}
// Build simulator allow list
allowList.addTrustedSubnet("0.0.0.0/2"sv);
@ -2381,6 +2401,7 @@ ACTOR void setupAndRun(std::string dataFolder,
// https://github.com/apple/foundationdb/issues/5155
if (std::string_view(testFile).find("restarting") != std::string_view::npos) {
testConfig.storageEngineExcludeTypes.push_back(4);
testConfig.storageEngineExcludeTypes.push_back(5);
// Disable the default tenant in restarting tests for now
// TODO: persist the chosen default tenant in the restartInfo.ini file for the second test
@ -2393,6 +2414,7 @@ ACTOR void setupAndRun(std::string dataFolder,
// Re-enable the backup and restore related simulation tests when the tests are passing again.
if (std::string_view(testFile).find("Backup") != std::string_view::npos) {
testConfig.storageEngineExcludeTypes.push_back(4);
testConfig.storageEngineExcludeTypes.push_back(5);
}
// Disable the default tenant in backup and DR tests for now. This is because backup does not currently duplicate
@ -2406,6 +2428,7 @@ ACTOR void setupAndRun(std::string dataFolder,
// in the build.
if (!rocksDBEnabled) {
testConfig.storageEngineExcludeTypes.push_back(4);
testConfig.storageEngineExcludeTypes.push_back(5);
}
state ProtocolVersion protocolVersion = currentProtocolVersion;

@ -33,8 +33,8 @@ class TenantCacheImpl {
KeyBackedRangeResult<std::pair<TenantName, TenantMapEntry>> tenantList =
wait(TenantMetadata::tenantMap.getRange(
tr, Optional<TenantName>(), Optional<TenantName>(), CLIENT_KNOBS->TOO_MANY));
ASSERT(!tenantList.more && tenantList.results.size() < CLIENT_KNOBS->TOO_MANY);
tr, Optional<TenantName>(), Optional<TenantName>(), CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1));
ASSERT(tenantList.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER && !tenantList.more);
return tenantList.results;
}

@ -536,6 +536,9 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
Version storageVersion = invalidVersion; // The version between the storage version and the durable version are
// being written to disk as part of the current commit in updateStorage.
Version durableVersion = invalidVersion; // All versions before the durable version are durable on disk
// FIXME: this needs to get persisted to disk to still fix same races across restart!
Version metadataVersion = invalidVersion; // Last update to the change feed metadata. Used for reasoning about
// fetched metadata vs local metadata
Version emptyVersion = 0; // The change feed does not have any mutations before emptyVersion
KeyRange range;
Key id;
@ -551,8 +554,6 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
bool removing = false;
bool destroyed = false;
bool possiblyDestroyed = false;
bool refreshInProgress = false;
KeyRangeMap<std::unordered_map<UID, Promise<Void>>> moveTriggers;
@ -587,12 +588,21 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
}
void destroy(Version destroyVersion) {
updateMetadataVersion(destroyVersion);
removing = true;
destroyed = true;
refreshInProgress = false;
moved(range);
newMutations.trigger();
}
bool updateMetadataVersion(Version version) {
// don't update metadata version if removing, so that metadata version remains the moved away version
if (!removing && version > metadataVersion) {
metadataVersion = version;
return true;
}
return false;
}
};
class ServerWatchMetadata : public ReferenceCounted<ServerWatchMetadata> {
@ -895,7 +905,7 @@ public:
KeyRangeMap<std::vector<Reference<ChangeFeedInfo>>> keyChangeFeed;
std::map<Key, Reference<ChangeFeedInfo>> uidChangeFeed;
Deque<std::pair<std::vector<Key>, Version>> changeFeedVersions;
std::map<UID, PromiseStream<Key>> changeFeedRemovals;
std::map<UID, PromiseStream<Key>> changeFeedDestroys;
std::set<Key> currentChangeFeeds;
std::set<Key> fetchingChangeFeeds;
std::unordered_map<NetworkAddress, std::map<UID, Version>> changeFeedClientVersions;
@ -1400,6 +1410,28 @@ public:
req.reply.sendError(e);
}
}
void maybeInjectTargetedRestart(Version v) {
// inject an SS restart at most once per test
if (g_network->isSimulated() && !g_simulator.speedUpSimulation &&
now() > g_simulator.injectTargetedSSRestartTime &&
rebootAfterDurableVersion == std::numeric_limits<Version>::max()) {
CODE_PROBE(true, "Injecting SS targeted restart");
TraceEvent("SimSSInjectTargetedRestart", thisServerID).detail("Version", v);
rebootAfterDurableVersion = v;
g_simulator.injectTargetedSSRestartTime = std::numeric_limits<double>::max();
}
}
bool maybeInjectDelay() {
if (g_network->isSimulated() && !g_simulator.speedUpSimulation && now() > g_simulator.injectSSDelayTime) {
CODE_PROBE(true, "Injecting SS targeted delay");
TraceEvent("SimSSInjectDelay", thisServerID);
g_simulator.injectSSDelayTime = std::numeric_limits<double>::max();
return true;
}
return false;
}
};
const StringRef StorageServer::CurrentRunningFetchKeys::emptyString = LiteralStringRef("");
@ -2212,46 +2244,54 @@ ACTOR Future<Void> overlappingChangeFeedsQ(StorageServer* data, OverlappingChang
return Void();
}
Version metadataVersion = invalidVersion;
Version metadataWaitVersion = invalidVersion;
auto ranges = data->keyChangeFeed.intersectingRanges(req.range);
std::map<Key, std::tuple<KeyRange, Version, Version>> rangeIds;
std::map<Key, std::tuple<KeyRange, Version, Version, Version>> rangeIds;
for (auto r : ranges) {
for (auto& it : r.value()) {
if (!it->removing) {
// Can't tell other SS about a change feed create or stopVersion that may get rolled back, and we only
// need to tell it about the metadata if req.minVersion > metadataVersion, since it will get the
// information from its own private mutations if it hasn't processed up that version yet
metadataVersion = std::max(metadataVersion, it->metadataCreateVersion);
metadataWaitVersion = std::max(metadataWaitVersion, it->metadataCreateVersion);
// don't wait for all it->metadataVersion updates, if metadata was fetched from elsewhere it's already
// durable, and some updates are unecessary to wait for
Version stopVersion;
if (it->stopVersion != MAX_VERSION && req.minVersion > it->stopVersion) {
stopVersion = it->stopVersion;
metadataVersion = std::max(metadataVersion, stopVersion);
metadataWaitVersion = std::max(metadataWaitVersion, stopVersion);
} else {
stopVersion = MAX_VERSION;
}
rangeIds[it->id] = std::tuple(it->range, it->emptyVersion, stopVersion);
rangeIds[it->id] = std::tuple(it->range, it->emptyVersion, stopVersion, it->metadataVersion);
}
}
}
state OverlappingChangeFeedsReply reply;
reply.feedMetadataVersion = data->version.get();
for (auto& it : rangeIds) {
reply.rangeIds.push_back(OverlappingChangeFeedEntry(
it.first, std::get<0>(it.second), std::get<1>(it.second), std::get<2>(it.second)));
reply.feeds.push_back_deep(reply.arena,
OverlappingChangeFeedEntry(it.first,
std::get<0>(it.second),
std::get<1>(it.second),
std::get<2>(it.second),
std::get<3>(it.second)));
TraceEvent(SevDebug, "OverlappingChangeFeedEntry", data->thisServerID)
.detail("MinVersion", req.minVersion)
.detail("FeedID", it.first)
.detail("Range", std::get<0>(it.second))
.detail("EmptyVersion", std::get<1>(it.second))
.detail("StopVersion", std::get<2>(it.second));
.detail("StopVersion", std::get<2>(it.second))
.detail("FeedMetadataVersion", std::get<3>(it.second));
}
// Make sure all of the metadata we are sending won't get rolled back
if (metadataVersion != invalidVersion && metadataVersion > data->knownCommittedVersion.get()) {
if (metadataWaitVersion != invalidVersion && metadataWaitVersion > data->knownCommittedVersion.get()) {
CODE_PROBE(true, "overlapping change feeds waiting for metadata version to be committed");
wait(data->desiredOldestVersion.whenAtLeast(metadataVersion));
wait(data->desiredOldestVersion.whenAtLeast(metadataWaitVersion));
}
req.reply.send(reply);
return Void();
@ -2380,12 +2420,10 @@ static std::deque<Standalone<MutationsAndVersionRef>>::const_iterator searchChan
break;
}
lastEnd = currentEnd + 1;
jump = std::min((int)(currentEnd - mutations.begin()), jump);
currentEnd -= jump;
jump <<= 1;
}
if (currentEnd < mutations.begin()) {
currentEnd = mutations.begin();
}
auto ret = std::lower_bound(currentEnd, lastEnd, searchKey, MutationsAndVersionRef::OrderByVersion());
// TODO REMOVE: for validation
if (ret != mutations.end()) {
@ -2584,21 +2622,37 @@ ACTOR Future<std::pair<ChangeFeedStreamReply, bool>> getChangeFeedMutations(Stor
}
} else if (memoryVerifyIdx < memoryReply.mutations.size() &&
version == memoryReply.mutations[memoryVerifyIdx].version) {
fmt::print("ERROR: SS {0} CF {1} SQ {2} has mutation at {3} in memory but all filtered out on disk!\n",
data->thisServerID.toString().substr(0, 4),
req.rangeID.printable().substr(0, 6),
streamUID.toString().substr(0, 8),
version);
if (version > feedInfo->storageVersion && version > feedInfo->fetchVersion) {
// Another validation case - feed was popped, data was fetched, fetched data was persisted but pop
// wasn't yet, then SS restarted. Now SS has the data without the popped version. This looks wrong
// here but is fine.
memoryVerifyIdx++;
} else {
fmt::print(
"ERROR: SS {0} CF {1} SQ {2} has mutation at {3} in memory but all filtered out on disk!\n",
data->thisServerID.toString().substr(0, 4),
req.rangeID.printable().substr(0, 6),
streamUID.toString().substr(0, 8),
version);
fmt::print(" Memory: ({})\n", memoryReply.mutations[memoryVerifyIdx].mutations.size());
for (auto& it : memoryReply.mutations[memoryVerifyIdx].mutations) {
if (it.type == MutationRef::SetValue) {
fmt::print(" {}=\n", it.param1.printable().c_str());
} else {
fmt::print(" {} - {}\n", it.param1.printable().c_str(), it.param2.printable().c_str());
fmt::print(" Memory: ({})\n", memoryReply.mutations[memoryVerifyIdx].mutations.size());
for (auto& it : memoryReply.mutations[memoryVerifyIdx].mutations) {
if (it.type == MutationRef::SetValue) {
fmt::print(" {}=\n", it.param1.printable().c_str());
} else {
fmt::print(" {} - {}\n", it.param1.printable().c_str(), it.param2.printable().c_str());
}
}
fmt::print(" Disk(pre-filter): ({})\n", mutations.size());
for (auto& it : mutations) {
if (it.type == MutationRef::SetValue) {
fmt::print(" {}=\n", it.param1.printable().c_str());
} else {
fmt::print(" {} - {}\n", it.param1.printable().c_str(), it.param2.printable().c_str());
}
}
ASSERT(false);
}
ASSERT(false);
}
remainingDurableBytes -=
sizeof(KeyValueRef) +
@ -5375,22 +5429,27 @@ ACTOR Future<Void> tryGetRange(PromiseStream<RangeResult> results, Transaction*
// We have to store the version the change feed was stopped at in the SS instead of just the stopped status
// In addition to simplifying stopping logic, it enables communicating stopped status when fetching change feeds
// from other SS correctly
const Value changeFeedSSValue(KeyRangeRef const& range, Version popVersion, Version stopVersion) {
const Value changeFeedSSValue(KeyRangeRef const& range,
Version popVersion,
Version stopVersion,
Version metadataVersion) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withChangeFeed()));
wr << range;
wr << popVersion;
wr << stopVersion;
wr << metadataVersion;
return wr.toValue();
}
std::tuple<KeyRange, Version, Version> decodeChangeFeedSSValue(ValueRef const& value) {
std::tuple<KeyRange, Version, Version, Version> decodeChangeFeedSSValue(ValueRef const& value) {
KeyRange range;
Version popVersion, stopVersion;
Version popVersion, stopVersion, metadataVersion;
BinaryReader reader(value, IncludeVersion());
reader >> range;
reader >> popVersion;
reader >> stopVersion;
return std::make_tuple(range, popVersion, stopVersion);
reader >> metadataVersion;
return std::make_tuple(range, popVersion, stopVersion, metadataVersion);
}
ACTOR Future<Void> changeFeedPopQ(StorageServer* self, ChangeFeedPopRequest req) {
@ -5424,10 +5483,12 @@ ACTOR Future<Void> changeFeedPopQ(StorageServer* self, ChangeFeedPopRequest req)
auto& mLV = self->addVersionToMutationLog(durableVersion);
self->addMutationToMutationLog(
mLV,
MutationRef(
MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + feed->second->id.toString(),
changeFeedSSValue(feed->second->range, feed->second->emptyVersion + 1, feed->second->stopVersion)));
MutationRef(MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + feed->second->id.toString(),
changeFeedSSValue(feed->second->range,
feed->second->emptyVersion + 1,
feed->second->stopVersion,
feed->second->metadataVersion)));
if (feed->second->storageVersion != invalidVersion) {
++self->counters.kvSystemClearRanges;
self->addMutationToMutationLog(mLV,
@ -5519,7 +5580,8 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
persistChangeFeedKeys.begin.toString() + changeFeedInfo->id.toString(),
changeFeedSSValue(changeFeedInfo->range,
changeFeedInfo->emptyVersion + 1,
changeFeedInfo->stopVersion)));
changeFeedInfo->stopVersion,
changeFeedInfo->metadataVersion)));
data->addMutationToMutationLog(
mLV,
MutationRef(MutationRef::ClearRange,
@ -5638,8 +5700,10 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
mLV,
MutationRef(MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + changeFeedInfo->id.toString(),
changeFeedSSValue(
changeFeedInfo->range, changeFeedInfo->emptyVersion + 1, changeFeedInfo->stopVersion)));
changeFeedSSValue(changeFeedInfo->range,
changeFeedInfo->emptyVersion + 1,
changeFeedInfo->stopVersion,
changeFeedInfo->metadataVersion)));
data->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange,
changeFeedDurableKey(changeFeedInfo->id, 0),
@ -5736,13 +5800,6 @@ ACTOR Future<Version> fetchChangeFeed(StorageServer* data,
}
}
/*fmt::print("DBG: SS {} Feed {} possibly destroyed {}, {} metadata create, {} desired committed\n",
data->thisServerID.toString().substr(0, 4),
changeFeedInfo->id.printable(),
changeFeedInfo->possiblyDestroyed,
changeFeedInfo->metadataCreateVersion,
data->desiredOldestVersion.get());*/
// There are two reasons for change_feed_not_registered:
// 1. The feed was just created, but the ss mutation stream is ahead of the GRV that fetchChangeFeedApplier
// uses to read the change feed data from the database. In this case we need to wait and retry
@ -5781,7 +5838,7 @@ ACTOR Future<Version> fetchChangeFeed(StorageServer* data,
data->changeFeedCleanupDurable[changeFeedInfo->id] = cleanupVersion;
}
for (auto& it : data->changeFeedRemovals) {
for (auto& it : data->changeFeedDestroys) {
it.second.send(changeFeedInfo->id);
}
@ -5797,7 +5854,7 @@ ACTOR Future<Version> fetchChangeFeed(StorageServer* data,
ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
KeyRange keys,
PromiseStream<Key> removals,
PromiseStream<Key> destroyedFeeds,
UID fetchKeysID) {
// Wait for current TLog batch to finish to ensure that we're fetching metadata at a version >= the version of the
@ -5811,82 +5868,55 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
.detail("FetchVersion", fetchVersion)
.detail("FKID", fetchKeysID);
state std::set<Key> refreshedFeedIds;
state std::set<Key> destroyedFeedIds;
// before fetching feeds from other SS's, refresh any feeds we already have that are being marked as removed
state OverlappingChangeFeedsInfo feedMetadata = wait(data->cx->getOverlappingChangeFeeds(keys, fetchVersion));
// rest of this actor needs to happen without waits that might yield to scheduler, to avoid races in feed metadata.
// Find set of feeds we currently have that were not present in fetch, to infer that they may have been destroyed.
state std::unordered_map<Key, Version> missingFeeds;
auto ranges = data->keyChangeFeed.intersectingRanges(keys);
for (auto& r : ranges) {
for (auto& cfInfo : r.value()) {
auto feedCleanup = data->changeFeedCleanupDurable.find(cfInfo->id);
if (feedCleanup != data->changeFeedCleanupDurable.end() && cfInfo->removing && !cfInfo->destroyed) {
CODE_PROBE(true, "re-fetching feed scheduled for deletion! Un-mark it as removing");
destroyedFeedIds.insert(cfInfo->id);
cfInfo->removing = false;
// because we now have a gap in the metadata, it's possible this feed was destroyed
cfInfo->possiblyDestroyed = true;
// Set refreshInProgress, so that if this actor is replaced by an expanded move actor, the new actor
// picks up the refresh
cfInfo->refreshInProgress = true;
// reset fetch versions because everything previously fetched was cleaned up
cfInfo->fetchVersion = invalidVersion;
cfInfo->durableFetchVersion = NotifiedVersion();
TraceEvent(SevDebug, "ResetChangeFeedInfo", data->thisServerID)
.detail("RangeID", cfInfo->id)
.detail("Range", cfInfo->range)
.detail("FetchVersion", fetchVersion)
.detail("EmptyVersion", cfInfo->emptyVersion)
.detail("StopVersion", cfInfo->stopVersion)
.detail("FKID", fetchKeysID);
} else if (cfInfo->refreshInProgress) {
CODE_PROBE(true, "Racing refreshes for same change feed in fetch");
destroyedFeedIds.insert(cfInfo->id);
if (cfInfo->removing && !cfInfo->destroyed) {
missingFeeds.insert({ cfInfo->id, cfInfo->metadataVersion });
}
}
}
state std::vector<OverlappingChangeFeedEntry> feeds = wait(data->cx->getOverlappingChangeFeeds(keys, fetchVersion));
// handle change feeds removed while fetching overlapping
while (removals.getFuture().isReady()) {
Key remove = waitNext(removals.getFuture());
for (int i = 0; i < feeds.size(); i++) {
if (feeds[i].rangeId == remove) {
swapAndPop(&feeds, i--);
// handle change feeds destroyed while fetching overlapping info
while (destroyedFeeds.getFuture().isReady()) {
Key destroyed = waitNext(destroyedFeeds.getFuture());
for (int i = 0; i < feedMetadata.feeds.size(); i++) {
if (feedMetadata.feeds[i].feedId == destroyed) {
missingFeeds.erase(destroyed); // feed definitely destroyed, no need to infer
swapAndPop(&feedMetadata.feeds, i--);
}
}
}
std::vector<Key> feedIds;
feedIds.reserve(feeds.size());
feedIds.reserve(feedMetadata.feeds.size());
// create change feed metadata if it does not exist
for (auto& cfEntry : feeds) {
auto cleanupEntry = data->changeFeedCleanupDurable.find(cfEntry.rangeId);
for (auto& cfEntry : feedMetadata.feeds) {
auto cleanupEntry = data->changeFeedCleanupDurable.find(cfEntry.feedId);
bool cleanupPending = cleanupEntry != data->changeFeedCleanupDurable.end();
feedIds.push_back(cfEntry.rangeId);
auto existingEntry = data->uidChangeFeed.find(cfEntry.rangeId);
auto existingEntry = data->uidChangeFeed.find(cfEntry.feedId);
bool existing = existingEntry != data->uidChangeFeed.end();
TraceEvent(SevDebug, "FetchedChangeFeedInfo", data->thisServerID)
.detail("RangeID", cfEntry.rangeId)
.detail("RangeID", cfEntry.feedId)
.detail("Range", cfEntry.range)
.detail("FetchVersion", fetchVersion)
.detail("EmptyVersion", cfEntry.emptyVersion)
.detail("StopVersion", cfEntry.stopVersion)
.detail("FeedMetadataVersion", cfEntry.feedMetadataVersion)
.detail("Existing", existing)
.detail("ExistingMetadataVersion", existing ? existingEntry->second->metadataVersion : invalidVersion)
.detail("CleanupPendingVersion", cleanupPending ? cleanupEntry->second : invalidVersion)
.detail("FKID", fetchKeysID);
bool addMutationToLog = false;
Reference<ChangeFeedInfo> changeFeedInfo;
auto fid = destroyedFeedIds.find(cfEntry.rangeId);
if (fid != destroyedFeedIds.end()) {
refreshedFeedIds.insert(cfEntry.rangeId);
destroyedFeedIds.erase(fid);
}
if (!existing) {
CODE_PROBE(cleanupPending,
"Fetch change feed which is cleanup pending. This means there was a move away and a move back, "
@ -5894,24 +5924,51 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
changeFeedInfo = Reference<ChangeFeedInfo>(new ChangeFeedInfo());
changeFeedInfo->range = cfEntry.range;
changeFeedInfo->id = cfEntry.rangeId;
changeFeedInfo->id = cfEntry.feedId;
changeFeedInfo->emptyVersion = cfEntry.emptyVersion;
changeFeedInfo->stopVersion = cfEntry.stopVersion;
data->uidChangeFeed[cfEntry.rangeId] = changeFeedInfo;
data->uidChangeFeed[cfEntry.feedId] = changeFeedInfo;
auto rs = data->keyChangeFeed.modify(cfEntry.range);
for (auto r = rs.begin(); r != rs.end(); ++r) {
r->value().push_back(changeFeedInfo);
}
data->keyChangeFeed.coalesce(cfEntry.range.contents());
data->keyChangeFeed.coalesce(cfEntry.range);
addMutationToLog = true;
} else {
changeFeedInfo = existingEntry->second;
CODE_PROBE(cfEntry.feedMetadataVersion > data->version.get(),
"Change Feed fetched future metadata version");
auto fid = missingFeeds.find(cfEntry.feedId);
if (fid != missingFeeds.end()) {
TraceEvent(SevDebug, "ResetChangeFeedInfo", data->thisServerID)
.detail("RangeID", changeFeedInfo->id.printable())
.detail("Range", changeFeedInfo->range)
.detail("FetchVersion", fetchVersion)
.detail("EmptyVersion", changeFeedInfo->emptyVersion)
.detail("StopVersion", changeFeedInfo->stopVersion)
.detail("PreviousMetadataVersion", changeFeedInfo->metadataVersion)
.detail("NewMetadataVersion", cfEntry.feedMetadataVersion)
.detail("FKID", fetchKeysID);
missingFeeds.erase(fid);
ASSERT(!changeFeedInfo->destroyed);
ASSERT(changeFeedInfo->removing);
CODE_PROBE(true, "re-fetching feed scheduled for deletion! Un-mark it as removing");
changeFeedInfo->removing = false;
// reset fetch versions because everything previously fetched was cleaned up
changeFeedInfo->fetchVersion = invalidVersion;
changeFeedInfo->durableFetchVersion = NotifiedVersion();
addMutationToLog = true;
}
if (changeFeedInfo->destroyed) {
// race where multiple feeds fetched overlapping change feed, one realized feed was missing and marked
// it removed+destroyed, then this one fetched the same info
CODE_PROBE(true, "Change feed fetched and destroyed by other fetch while fetching metadata");
continue;
}
@ -5931,82 +5988,63 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
addMutationToLog = true;
}
}
feedIds.push_back(cfEntry.feedId);
addMutationToLog |= changeFeedInfo->updateMetadataVersion(cfEntry.feedMetadataVersion);
if (addMutationToLog) {
ASSERT(changeFeedInfo.isValid());
auto& mLV = data->addVersionToMutationLog(data->data().getLatestVersion());
Version logV = data->data().getLatestVersion();
auto& mLV = data->addVersionToMutationLog(logV);
data->addMutationToMutationLog(
mLV,
MutationRef(
MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + cfEntry.rangeId.toString(),
changeFeedSSValue(cfEntry.range, changeFeedInfo->emptyVersion + 1, changeFeedInfo->stopVersion)));
MutationRef(MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + cfEntry.feedId.toString(),
changeFeedSSValue(cfEntry.range,
changeFeedInfo->emptyVersion + 1,
changeFeedInfo->stopVersion,
changeFeedInfo->metadataVersion)));
// if we updated pop version, remove mutations
while (!changeFeedInfo->mutations.empty() &&
changeFeedInfo->mutations.front().version <= changeFeedInfo->emptyVersion) {
changeFeedInfo->mutations.pop_front();
}
if (BUGGIFY) {
data->maybeInjectTargetedRestart(logV);
}
}
}
CODE_PROBE(!refreshedFeedIds.empty(), "Feed refreshed between move away and move back");
CODE_PROBE(!destroyedFeedIds.empty(), "Feed destroyed between move away and move back");
for (auto& feedId : refreshedFeedIds) {
auto existingEntry = data->uidChangeFeed.find(feedId);
if (existingEntry == data->uidChangeFeed.end() || existingEntry->second->destroyed ||
!existingEntry->second->refreshInProgress) {
CODE_PROBE(true, "feed refreshed");
for (auto& feed : missingFeeds) {
auto existingEntry = data->uidChangeFeed.find(feed.first);
ASSERT(existingEntry != data->uidChangeFeed.end());
ASSERT(existingEntry->second->removing);
ASSERT(!existingEntry->second->destroyed);
Version fetchedMetadataVersion = feedMetadata.getFeedMetadataVersion(existingEntry->second->range);
Version lastMetadataVersion = feed.second;
// Look for case where feed's range was moved away, feed was destroyed, and then feed's range was moved back.
// This happens where feed is removing, the fetch metadata is higher than the moved away version, and the feed
// isn't in the fetched response. In that case, the feed must have been destroyed between lastMetadataVersion
// and fetchedMetadataVersion
if (lastMetadataVersion >= fetchedMetadataVersion) {
CODE_PROBE(true, "Change Feed fetched higher metadata version before moved away");
continue;
}
// Since cleanup put a mutation in the log to delete the change feed data, put one in the log to restore
// it
// We may just want to refactor this so updateStorage does explicit deletes based on
// changeFeedCleanupDurable and not use the mutation log at all for the change feed metadata cleanup.
// Then we wouldn't have to reset anything here or above
// Do the mutation log update here instead of above to ensure we only add it back to the mutation log if we're
// sure it wasn't deleted in the metadata gap
Version metadataVersion = data->data().getLatestVersion();
auto& mLV = data->addVersionToMutationLog(metadataVersion);
data->addMutationToMutationLog(
mLV,
MutationRef(MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + existingEntry->second->id.toString(),
changeFeedSSValue(existingEntry->second->range,
existingEntry->second->emptyVersion + 1,
existingEntry->second->stopVersion)));
TraceEvent(SevDebug, "PersistingResetChangeFeedInfo", data->thisServerID)
.detail("RangeID", existingEntry->second->id)
.detail("Range", existingEntry->second->range)
.detail("FetchVersion", fetchVersion)
.detail("EmptyVersion", existingEntry->second->emptyVersion)
.detail("StopVersion", existingEntry->second->stopVersion)
.detail("FKID", fetchKeysID)
.detail("MetadataVersion", metadataVersion);
existingEntry->second->refreshInProgress = false;
}
for (auto& feedId : destroyedFeedIds) {
auto existingEntry = data->uidChangeFeed.find(feedId);
if (existingEntry == data->uidChangeFeed.end() || existingEntry->second->destroyed) {
CODE_PROBE(true, "feed refreshed but then destroyed elsewhere");
continue;
}
/*fmt::print("DBG: SS {} fetching feed {} was refreshed but not present!! assuming destroyed\n",
data->thisServerID.toString().substr(0, 4),
feedId.printable());*/
Version cleanupVersion = data->data().getLatestVersion();
CODE_PROBE(true, "Destroying change feed from fetch metadata"); //
TraceEvent(SevDebug, "DestroyingChangeFeedFromFetchMetadata", data->thisServerID)
.detail("RangeID", feedId)
.detail("RangeID", feed.first)
.detail("Range", existingEntry->second->range)
.detail("Version", cleanupVersion)
.detail("FKID", fetchKeysID);
if (g_network->isSimulated()) {
ASSERT(g_simulator.validationData.allDestroyedChangeFeedIDs.count(feedId.toString()));
// verify that the feed was actually destroyed and it's not an error in this inference logic
ASSERT(g_simulator.validationData.allDestroyedChangeFeedIDs.count(feed.first.toString()));
}
Key beginClearKey = feedId.withPrefix(persistChangeFeedKeys.begin);
Key beginClearKey = feed.first.withPrefix(persistChangeFeedKeys.begin);
auto& mLV = data->addVersionToMutationLog(cleanupVersion);
data->addMutationToMutationLog(mLV,
@ -6014,15 +6052,18 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
++data->counters.kvSystemClearRanges;
data->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange,
changeFeedDurableKey(feedId, 0),
changeFeedDurableKey(feedId, cleanupVersion)));
changeFeedDurableKey(feed.first, 0),
changeFeedDurableKey(feed.first, cleanupVersion)));
++data->counters.kvSystemClearRanges;
existingEntry->second->destroy(cleanupVersion);
data->changeFeedCleanupDurable[feedId] = cleanupVersion;
data->changeFeedCleanupDurable[feed.first] = cleanupVersion;
for (auto& it : data->changeFeedRemovals) {
it.second.send(feedId);
for (auto& it : data->changeFeedDestroys) {
it.second.send(feed.first);
}
if (BUGGIFY) {
data->maybeInjectTargetedRestart(cleanupVersion);
}
}
return feedIds;
@ -6035,7 +6076,7 @@ ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer
KeyRange keys,
Version beginVersion,
Version endVersion,
PromiseStream<Key> removals,
PromiseStream<Key> destroyedFeeds,
std::vector<Key>* feedIds,
std::unordered_set<Key> newFeedIds) {
state std::unordered_map<Key, Version> feedMaxFetched;
@ -6064,7 +6105,7 @@ ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer
loop {
Future<Version> nextFeed = Never();
if (!removals.getFuture().isReady()) {
if (!destroyedFeeds.getFuture().isReady()) {
bool done = true;
while (!feedFetches.empty()) {
if (feedFetches.begin()->second.isReady()) {
@ -6084,11 +6125,11 @@ ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer
}
}
choose {
when(state Key remove = waitNext(removals.getFuture())) {
when(state Key destroyed = waitNext(destroyedFeeds.getFuture())) {
wait(delay(0));
feedFetches.erase(remove);
feedFetches.erase(destroyed);
for (int i = 0; i < feedIds->size(); i++) {
if ((*feedIds)[i] == remove) {
if ((*feedIds)[i] == destroyed) {
swapAndPop(feedIds, i--);
}
}
@ -6099,7 +6140,7 @@ ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer
} catch (Error& e) {
if (!data->shuttingDown) {
data->changeFeedRemovals.erase(fetchKeysID);
data->changeFeedDestroys.erase(fetchKeysID);
}
throw;
}
@ -6112,6 +6153,8 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
state Future<Void> warningLogger = logFetchKeysWarning(shard);
state const double startTime = now();
state Version fetchVersion = invalidVersion;
state PromiseStream<Key> destroyedFeeds;
state FetchKeysMetricReporter metricReporter(fetchKeysID,
startTime,
keys,
@ -6120,17 +6163,27 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
data->counters.bytesFetched,
data->counters.kvFetched);
// need to set this at the very start of the fetch, to handle any private change feed destroy mutations we get for
// this key range, that apply to change feeds we don't know about yet because their metadata hasn't been fetched yet
data->changeFeedDestroys[fetchKeysID] = destroyedFeeds;
// delay(0) to force a return to the run loop before the work of fetchKeys is started.
// This allows adding->start() to be called inline with CSK.
wait(data->coreStarted.getFuture() && delay(0));
try {
wait(data->coreStarted.getFuture() && delay(0));
// On SS Reboot, durableVersion == latestVersion, so any mutations we add to the mutation log would be skipped if
// added before latest version advances.
// To ensure this doesn't happen, we wait for version to increase by one if this fetchKeys was initiated by a
// changeServerKeys from restoreDurableState
if (data->version.get() == data->durableVersion.get()) {
wait(data->version.whenAtLeast(data->version.get() + 1));
wait(delay(0));
// On SS Reboot, durableVersion == latestVersion, so any mutations we add to the mutation log would be skipped
// if added before latest version advances. To ensure this doesn't happen, we wait for version to increase by
// one if this fetchKeys was initiated by a changeServerKeys from restoreDurableState
if (data->version.get() == data->durableVersion.get()) {
wait(data->version.whenAtLeast(data->version.get() + 1));
wait(delay(0));
}
} catch (Error& e) {
if (!data->shuttingDown) {
data->changeFeedDestroys.erase(fetchKeysID);
}
throw e;
}
try {
@ -6142,9 +6195,8 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
.detail("Version", data->version.get())
.detail("FKID", fetchKeysID);
state PromiseStream<Key> removals;
data->changeFeedRemovals[fetchKeysID] = removals;
state Future<std::vector<Key>> fetchCFMetadata = fetchChangeFeedMetadata(data, keys, removals, fetchKeysID);
state Future<std::vector<Key>> fetchCFMetadata =
fetchChangeFeedMetadata(data, keys, destroyedFeeds, fetchKeysID);
validate(data);
@ -6401,8 +6453,14 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
// being recovered. Instead we wait for the updateStorage loop to commit something (and consequently also what
// we have written)
state Future<std::unordered_map<Key, Version>> feedFetchMain = dispatchChangeFeeds(
data, fetchKeysID, keys, 0, fetchVersion + 1, removals, &changeFeedsToFetch, std::unordered_set<Key>());
state Future<std::unordered_map<Key, Version>> feedFetchMain = dispatchChangeFeeds(data,
fetchKeysID,
keys,
0,
fetchVersion + 1,
destroyedFeeds,
&changeFeedsToFetch,
std::unordered_set<Key>());
state Future<Void> fetchDurable = data->durableVersion.whenAtLeast(data->storageVersion() + 1);
state Future<Void> dataArrive = data->version.whenAtLeast(fetchVersion);
@ -6465,7 +6523,7 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
keys,
fetchVersion + 1,
shard->transferredVersion,
removals,
destroyedFeeds,
&changeFeedsToFetch,
newChangeFeeds);
@ -6519,7 +6577,7 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
}
}
data->changeFeedRemovals.erase(fetchKeysID);
data->changeFeedDestroys.erase(fetchKeysID);
shard->phase = AddingShard::Waiting;
@ -6575,7 +6633,7 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
.errorUnsuppressed(e)
.detail("Version", data->version.get());
if (!data->shuttingDown) {
data->changeFeedRemovals.erase(fetchKeysID);
data->changeFeedDestroys.erase(fetchKeysID);
}
if (e.code() == error_code_actor_cancelled && !data->shuttingDown && shard->phase >= AddingShard::Fetching) {
if (shard->phase < AddingShard::FetchingCF) {
@ -6828,11 +6886,15 @@ void cleanUpChangeFeeds(StorageServer* data, const KeyRangeRef& keys, Version ve
auto feed = data->uidChangeFeed.find(f.first);
if (feed != data->uidChangeFeed.end()) {
feed->second->updateMetadataVersion(version);
feed->second->removing = true;
feed->second->refreshInProgress = false;
feed->second->moved(feed->second->range);
feed->second->newMutations.trigger();
}
if (BUGGIFY) {
data->maybeInjectTargetedRestart(durableVersion);
}
} else {
// if just part of feed's range is moved away
auto feed = data->uidChangeFeed.find(f.first);
@ -7453,7 +7515,7 @@ private:
.detail("Status", status);
// Because of data moves, we can get mutations operating on a change feed we don't yet know about, because
// the fetch hasn't started yet
// the metadata fetch hasn't started yet
bool createdFeed = false;
if (feed == data->uidChangeFeed.end() && status != ChangeFeedStatus::CHANGE_FEED_DESTROY) {
createdFeed = true;
@ -7485,6 +7547,9 @@ private:
}
data->keyChangeFeed.coalesce(changeFeedRange.contents());
}
if (feed != data->uidChangeFeed.end()) {
feed->second->updateMetadataVersion(currentVersion);
}
bool popMutationLog = false;
bool addMutationToLog = false;
@ -7546,22 +7611,29 @@ private:
feed->second->destroy(currentVersion);
data->changeFeedCleanupDurable[feed->first] = cleanupVersion;
if (BUGGIFY) {
data->maybeInjectTargetedRestart(cleanupVersion);
}
}
if (status == ChangeFeedStatus::CHANGE_FEED_DESTROY) {
for (auto& it : data->changeFeedRemovals) {
for (auto& it : data->changeFeedDestroys) {
it.second.send(changeFeedId);
}
}
if (addMutationToLog) {
auto& mLV = data->addVersionToMutationLog(data->data().getLatestVersion());
Version logV = data->data().getLatestVersion();
auto& mLV = data->addVersionToMutationLog(logV);
data->addMutationToMutationLog(
mLV,
MutationRef(MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + changeFeedId.toString(),
changeFeedSSValue(
feed->second->range, feed->second->emptyVersion + 1, feed->second->stopVersion)));
changeFeedSSValue(feed->second->range,
feed->second->emptyVersion + 1,
feed->second->stopVersion,
feed->second->metadataVersion)));
if (popMutationLog) {
++data->counters.kvSystemClearRanges;
data->addMutationToMutationLog(mLV,
@ -7569,6 +7641,9 @@ private:
changeFeedDurableKey(feed->second->id, 0),
changeFeedDurableKey(feed->second->id, popVersion)));
}
if (BUGGIFY) {
data->maybeInjectTargetedRestart(logV);
}
}
} else if ((m.type == MutationRef::SetValue || m.type == MutationRef::ClearRange) &&
m.param1.startsWith(TenantMetadata::tenantMapPrivatePrefix)) {
@ -7781,6 +7856,10 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
}
}
if (data->maybeInjectDelay()) {
wait(delay(deterministicRandom()->random01() * 10.0));
}
while (data->byteSampleClearsTooLarge.get()) {
wait(data->byteSampleClearsTooLarge.onChange());
}
@ -8530,6 +8609,7 @@ ACTOR Future<Void> updateStorage(StorageServer* data) {
TraceEvent("RebootWhenDurableTriggered", data->thisServerID)
.detail("NewOldestVersion", newOldestVersion)
.detail("RebootAfterDurableVersion", data->rebootAfterDurableVersion);
CODE_PROBE(true, "SS rebooting after durable");
// To avoid brokenPromise error, which is caused by the sender of the durableInProgress (i.e., this
// process) never sets durableInProgress, we should set durableInProgress before send the
// please_reboot() error. Otherwise, in the race situation when storage server receives both reboot and
@ -8678,7 +8758,8 @@ void setAvailableStatus(StorageServer* self, KeyRangeRef keys, bool available) {
// ASSERT( self->debug_inApplyUpdate );
ASSERT(!keys.empty());
auto& mLV = self->addVersionToMutationLog(self->data().getLatestVersion());
Version logV = self->data().getLatestVersion();
auto& mLV = self->addVersionToMutationLog(logV);
KeyRange availableKeys = KeyRangeRef(persistShardAvailableKeys.begin.toString() + keys.begin.toString(),
persistShardAvailableKeys.begin.toString() + keys.end.toString());
@ -8714,6 +8795,10 @@ void setAvailableStatus(StorageServer* self, KeyRangeRef keys, bool available) {
.detail("DeleteVersion", mLV.version + 1);
}
}
if (BUGGIFY) {
self->maybeInjectTargetedRestart(logV);
}
}
void updateStorageShard(StorageServer* data, StorageServerShard shard) {
@ -8750,7 +8835,8 @@ void updateStorageShard(StorageServer* data, StorageServerShard shard) {
void setAssignedStatus(StorageServer* self, KeyRangeRef keys, bool nowAssigned) {
ASSERT(!keys.empty());
auto& mLV = self->addVersionToMutationLog(self->data().getLatestVersion());
Version logV = self->data().getLatestVersion();
auto& mLV = self->addVersionToMutationLog(logV);
KeyRange assignedKeys = KeyRangeRef(persistShardAssignedKeys.begin.toString() + keys.begin.toString(),
persistShardAssignedKeys.begin.toString() + keys.end.toString());
//TraceEvent("SetAssignedStatus", self->thisServerID).detail("Version", mLV.version).detail("RangeBegin", assignedKeys.begin).detail("RangeEnd", assignedKeys.end);
@ -8767,6 +8853,10 @@ void setAssignedStatus(StorageServer* self, KeyRangeRef keys, bool nowAssigned)
assignedKeys.end,
endAssigned ? LiteralStringRef("1") : LiteralStringRef("0")));
}
if (BUGGIFY) {
self->maybeInjectTargetedRestart(logV);
}
}
void StorageServerDisk::clearRange(KeyRangeRef keys) {
@ -9170,13 +9260,15 @@ ACTOR Future<bool> restoreDurableState(StorageServer* data, IKeyValueStore* stor
for (feedLoc = 0; feedLoc < changeFeeds.size(); feedLoc++) {
Key changeFeedId = changeFeeds[feedLoc].key.removePrefix(persistChangeFeedKeys.begin);
KeyRange changeFeedRange;
Version popVersion, stopVersion;
std::tie(changeFeedRange, popVersion, stopVersion) = decodeChangeFeedSSValue(changeFeeds[feedLoc].value);
Version popVersion, stopVersion, metadataVersion;
std::tie(changeFeedRange, popVersion, stopVersion, metadataVersion) =
decodeChangeFeedSSValue(changeFeeds[feedLoc].value);
TraceEvent(SevDebug, "RestoringChangeFeed", data->thisServerID)
.detail("RangeID", changeFeedId)
.detail("Range", changeFeedRange)
.detail("StopVersion", stopVersion)
.detail("PopVer", popVersion);
.detail("PopVer", popVersion)
.detail("MetadataVersion", metadataVersion);
Reference<ChangeFeedInfo> changeFeedInfo(new ChangeFeedInfo());
changeFeedInfo->range = changeFeedRange;
changeFeedInfo->id = changeFeedId;
@ -9184,6 +9276,7 @@ ACTOR Future<bool> restoreDurableState(StorageServer* data, IKeyValueStore* stor
changeFeedInfo->storageVersion = version;
changeFeedInfo->emptyVersion = popVersion - 1;
changeFeedInfo->stopVersion = stopVersion;
changeFeedInfo->metadataVersion = metadataVersion;
data->uidChangeFeed[changeFeedId] = changeFeedInfo;
auto rs = data->keyChangeFeed.modify(changeFeedRange);
for (auto r = rs.begin(); r != rs.end(); ++r) {
@ -10038,7 +10131,8 @@ ACTOR Future<Void> initTenantMap(StorageServer* self) {
// when SSs store only the local tenants
KeyBackedRangeResult<std::pair<TenantName, TenantMapEntry>> entries =
wait(TenantMetadata::tenantMap.getRange(
tr, Optional<TenantName>(), Optional<TenantName>(), CLIENT_KNOBS->TOO_MANY));
tr, Optional<TenantName>(), Optional<TenantName>(), CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1));
ASSERT(entries.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER && !entries.more);
TraceEvent("InitTenantMap", self->thisServerID)
.detail("Version", version)

@ -1636,8 +1636,12 @@ ACTOR Future<Void> runTests(Reference<AsyncVar<Optional<struct ClusterController
if (useDB) {
std::vector<Future<Void>> tenantFutures;
for (auto tenant : tenantsToCreate) {
TraceEvent("CreatingTenant").detail("Tenant", tenant);
tenantFutures.push_back(success(TenantAPI::createTenant(cx.getReference(), tenant)));
TenantMapEntry entry;
if (deterministicRandom()->coinflip()) {
entry.tenantGroup = "TestTenantGroup"_sr;
}
TraceEvent("CreatingTenant").detail("Tenant", tenant).detail("TenantGroup", entry.tenantGroup);
tenantFutures.push_back(success(TenantAPI::createTenant(cx.getReference(), tenant, entry)));
}
wait(waitForAll(tenantFutures));

@ -0,0 +1,767 @@
/*
* ChangeFeedOperations.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/Util.h"
#include "flow/serialize.h"
#include <cstring>
#include <limits>
#include "flow/actorcompiler.h" // This must be the last #include.
// enable to debug specific operations for a given change feed
#define DEBUG_KEY ""_sr
#define DEBUG_CF(feedKey) (feedKey.printable() == DEBUG_KEY)
// Discard all change feed mutations for `feedID` below `version`, then record
// the furthest version popped so far in *doneOut so the checker knows which
// buffered data may legally disappear from subsequent reads.
ACTOR Future<Void> doPop(Database cx, Key key, Key feedID, Version version, Version* doneOut) {
	wait(cx->popChangeFeedMutations(feedID, version));
	// Advance the high-water mark only if this pop went further than any prior one.
	if (version > *doneOut) {
		*doneOut = version;
	}
	if (DEBUG_CF(key)) {
		fmt::print("DBG) {0} Popped through {1}\n", key.printable(), version);
	}
	// TODO: could strengthen pop checking by validating that a read immediately after the pop completes has no data
	return Void();
}
// Per-key bookkeeping for one change feed under test: the committed writes we
// expect the feed to deliver, the pop/stop/destroy state the workload has driven
// it into, and the live reader that validates the stream against expectations.
struct FeedTestData : ReferenceCounted<FeedTestData>, NonCopyable {
	Key key; // the single data key this feed covers
	KeyRange keyRange; // [key, keyAfter(key)) — the feed's range
	Key feedID; // feed identifier: key prefixed with "CF"
	int nextVal; // monotonically increasing counter used to generate unique values
	Future<Void> liveReader; // background actor validating the feed stream
	bool lastCleared = false;
	std::vector<Future<Void>> pops; // outstanding doPop futures
	Version poppingVersion; // highest version a pop has been *requested* through
	Version poppedVersion; // highest version a pop has *completed* through
	Optional<Version> stopVersion; // set once the feed is stopped; no reads expected past it
	bool destroying;
	bool destroyed;
	bool complete; // test is done; reader drains remaining checks and exits
	int popWindow; // log2-ish batch size between pops; -1 disables popping
	int popDelayWindow; // extra delay (in writes) before issuing a pop; -1 disables
	std::deque<std::pair<Version, Optional<Value>>> writesByVersion;
	// these were all committed
	std::deque<std::pair<Version, Optional<Value>>> pendingCheck; // committed writes not yet verified by the reader
	NotifiedVersion checkVersion; // wakes the reader when new committed writes arrive

	// doPops toggles whether this feed exercises popChangeFeedMutations at all;
	// pop cadence is randomized per feed for coverage.
	FeedTestData(Key key, bool doPops)
	  : key(key), keyRange(KeyRangeRef(key, keyAfter(key))), feedID(key.withPrefix(LiteralStringRef("CF"))), nextVal(0),
	    lastCleared(false), poppingVersion(0), poppedVersion(0), destroying(false), destroyed(false), complete(false),
	    checkVersion(0) {
		if (doPops) {
			popWindow = deterministicRandom()->randomExp(1, 8);
			popDelayWindow = deterministicRandom()->randomInt(0, 2) * deterministicRandom()->randomExp(1, 4);
		} else {
			popWindow = -1;
			popDelayWindow = -1;
		}
	}

	// Produce the next unique value to write for this key.
	Value nextValue() {
		std::string v = std::to_string(nextVal);
		nextVal++;
		return Value(v);
	}

	// Record a committed write (set or clear) at `version` and wake the reader.
	void update(Version version, Optional<Value> value) {
		if (!stopVersion.present()) {
			// if feed is stopped, value should not get read
			writesByVersion.push_back({ version, value });
			pendingCheck.push_back(writesByVersion.back());
			checkVersion.set(version);
		}
	}

	// Signal the reader that no more writes are coming so it can finish.
	void testComplete() {
		complete = true;
		checkVersion.set(checkVersion.get() + 1);
	}

	// Request a pop through version v: drop local expectations below v and kick
	// off the async pop, which updates poppedVersion on completion.
	void pop(Database cx, Version v) {
		if (DEBUG_CF(key)) {
			fmt::print("DBG) {0} Popping through {1}\n", key.printable(), v);
		}
		ASSERT(poppingVersion < v);
		poppingVersion = v;
		while (!writesByVersion.empty() && v > writesByVersion.front().first) {
			writesByVersion.pop_front();
		}
		while (!pendingCheck.empty() && v > pendingCheck.front().first) {
			pendingCheck.pop_front();
		}
		pops.push_back(doPop(cx, key, feedID, v, &poppedVersion));
	}
};
// Apply a recovery rollback to the locally-buffered feed history: the rollback
// mutation's param2 encodes the version to roll back to, and every buffered
// entry newer than that target must be discarded (it was never durable).
static void rollbackFeed(Key key,
                         std::deque<Standalone<MutationsAndVersionRef>>& buffered,
                         Version version,
                         MutationRef rollbackMutation) {
	Version targetVersion;
	BinaryReader reader(rollbackMutation.param2, Unversioned());
	reader >> targetVersion;
	TraceEvent("ChangeFeedRollback").detail("Key", key).detail("Ver", version).detail("RollbackVer", targetVersion);
	if (DEBUG_CF(key)) {
		fmt::print("DBG) {0} Rolling back {1} -> {2}\n", key.printable(), version, targetVersion);
	}
	// Drop entries newest-first until everything remaining is at or below the target.
	while (!buffered.empty() && buffered.back().version > targetVersion) {
		TraceEvent("ChangeFeedRollbackVer").detail("Ver", buffered.back().version);
		buffered.pop_back();
	}
}
// Validate the oldest buffered feed mutation for `key` against the oldest
// not-yet-verified committed write in `checkData`, then consume the buffered
// entry (and the check entry, if their versions match exactly).
static void checkNextResult(Key key,
                            std::deque<Standalone<MutationsAndVersionRef>>& buffered,
                            std::deque<std::pair<Version, Optional<Value>>>& checkData) {
	// First asserts are checking data is in the form the test is supposed to produce
	ASSERT(!buffered.empty());
	ASSERT(buffered.front().mutations.size() == 1);
	ASSERT(buffered.front().mutations[0].param1 == key);

	// Below asserts are correctness of change feed invariants.

	// Handle case where txn retried and wrote same value twice. checkData's version is the committed one, so the same
	// update may appear at an earlier version. This is fine, as long as it then actually appears at the committed
	// version
	// TODO: could strengthen this check a bit and only allow it to appear at the lower version if the txn retried on
	// commit_unknown_result?
	// Print the full diagnostic context before the ASSERT below fires, so a
	// violation is debuggable from the test log.
	if (checkData.front().first < buffered.front().version) {
		fmt::print("ERROR. {0} Check version {1} != {2}.\n  Check: {3} {4}\n  Buffered: {5} {6}\n",
		           key.printable(),
		           checkData.front().first,
		           buffered.front().version,
		           checkData.front().second.present() ? "SET" : "CLEAR",
		           checkData.front().second.present() ? checkData.front().second.get().printable()
		                                              : keyAfter(key).printable(),
		           buffered.front().mutations[0].type == MutationRef::SetValue ? "SET" : "CLEAR",
		           buffered.front().mutations[0].param2.printable());
	}
	ASSERT(checkData.front().first >= buffered.front().version);

	// The mutation's type and payload must match the expected write: a present
	// value means a SET with that value; an absent value means a CLEAR of the
	// single-key range [key, keyAfter(key)).
	if (checkData.front().second.present()) {
		ASSERT(buffered.front().mutations[0].type == MutationRef::SetValue);
		ASSERT(buffered.front().mutations[0].param2 == checkData.front().second.get());
	} else {
		ASSERT(buffered.front().mutations[0].type == MutationRef::ClearRange);
		ASSERT(buffered.front().mutations[0].param2 == keyAfter(key));
	}

	// Only retire the expected write once it is seen at its committed version;
	// an earlier-version duplicate (retried txn) just consumes the buffered entry.
	if (checkData.front().first == buffered.front().version) {
		checkData.pop_front();
	}
	buffered.pop_front();
}
// Continuously streams one feed's mutations from `begin` onward and validates them
// against the writes the workload recorded in FeedTestData.
// All received mutation groups are buffered; whenever the writer publishes a new
// check point (data->checkVersion / data->pendingCheck) and the stream has caught
// up to it, the buffered data is checked via checkNextResult().
// Returns once the feed is marked complete and every pending check has been consumed.
ACTOR Future<Void> liveReader(Database cx, Reference<FeedTestData> data, Version begin) {
	state Version lastCheckVersion = 0;
	state Version nextCheckVersion = 0;
	state std::deque<Standalone<MutationsAndVersionRef>> buffered;
	state Reference<ChangeFeedData> results = makeReference<ChangeFeedData>();
	// Open-ended stream: read until the workload signals completion.
	state Future<Void> stream =
	    cx->getChangeFeedStream(results, data->feedID, begin, std::numeric_limits<Version>::max(), data->keyRange);
	try {
		loop {
			// Done once the writer marked the test complete and all checks have been performed.
			if (data->complete && data->pendingCheck.empty()) {
				return Void();
			}
			nextCheckVersion = data->pendingCheck.empty() ? invalidVersion : data->pendingCheck.front().first;
			choose {
				// New mutation groups arrived on the stream: buffer them (or apply a rollback).
				when(Standalone<VectorRef<MutationsAndVersionRef>> res = waitNext(results->mutations.getFuture())) {
					for (auto& it : res) {
						// A lone mutation on lastEpochEndPrivateKey signals a recovery rollback.
						if (it.mutations.size() == 1 && it.mutations.back().param1 == lastEpochEndPrivateKey) {
							rollbackFeed(data->key, buffered, it.version, it.mutations.back());
						} else {
							if (it.mutations.size() == 0) {
								// FIXME: THIS SHOULD NOT HAPPEN
								// FIXME: these are also getting sent past stopVersion!!
							} else {
								// Nothing should be delivered past the version the feed was stopped at.
								if (data->stopVersion.present()) {
									if (it.version > data->stopVersion.get()) {
										fmt::print("DBG) {0} Read data with version {1} > stop version {2} ({3})\n",
										           data->key.printable(),
										           it.version,
										           data->stopVersion.get(),
										           it.mutations.size());
									}
									ASSERT(it.version <= data->stopVersion.get());
								}
								buffered.push_back(Standalone<MutationsAndVersionRef>(it));
								if (DEBUG_CF(data->key)) {
									fmt::print("DBG) {0} Live read through {1} ({2})\n",
									           data->key.printable(),
									           it.version,
									           it.mutations.size());
								}
							}
						}
					}
				}
				when(wait(data->checkVersion.whenAtLeast(lastCheckVersion + 1))) {
					// wake loop and start new whenAtLeast whenever checkVersion is set
					lastCheckVersion = data->checkVersion.get();
				}
				// The stream caught up to the next pending check version: validate buffered data.
				when(wait(data->pendingCheck.empty() ? Never()
				                                     : results->whenAtLeast(data->pendingCheck.front().first))) {
					if (data->pendingCheck.empty() || data->pendingCheck.front().first > nextCheckVersion) {
						// pendingCheck wasn't empty before whenAtLeast, and nextCheckVersion = the front version, so if
						// either of these are true, the data was popped concurrently and we can move on to checking the
						// next value
						CODE_PROBE(true, "popped while waiting for whenAtLeast to check next value");
						continue;
					}
					// Data below the popping version may legitimately be missing; discard it.
					while (!buffered.empty() && buffered.front().version < data->poppingVersion) {
						CODE_PROBE(true, "live reader ignoring data that is being popped");
						buffered.pop_front();
					}
					if (buffered.empty()) {
						// An empty buffer here is only acceptable if the checked data was popped.
						if (data->poppingVersion < data->pendingCheck.front().first) {
							fmt::print("DBG) {0} Buffered empty after ready for check, and data not popped! popped "
							           "{1}, popping {2}, check {3}\n",
							           data->key.printable(),
							           data->poppedVersion,
							           data->poppingVersion,
							           data->pendingCheck.front().first);
						}
						ASSERT(data->poppingVersion >= data->pendingCheck.front().first);
						data->pendingCheck.pop_front();
					} else {
						Version v = buffered.front().version;
						if (DEBUG_CF(data->key)) {
							fmt::print("DBG) {0} Live checking through {1}\n",
							           data->key.printable(),
							           data->pendingCheck.front().first);
						}
						checkNextResult(data->key, buffered, data->pendingCheck);
						if (DEBUG_CF(data->key)) {
							fmt::print("DBG) {0} Live Checked through {1}\n", data->key.printable(), v);
						}

						// NOTE(review): appears to be the read-driven pop mode — once writesByVersion has
						// accumulated popWindow + popDelayWindow entries, pop through the window boundary.
						// popWindow/popDelayWindow are set outside this view; confirm against FeedTestData.
						if (data->popDelayWindow >= 0 && data->popWindow >= 0 &&
						    data->writesByVersion.size() == data->popWindow + data->popDelayWindow) {
							data->pop(cx, data->writesByVersion[data->popWindow - 1].first + 1);
							ASSERT(data->writesByVersion.size() == data->popDelayWindow);
						}
					}
				}
			}
		}
	} catch (Error& e) {
		// Re-thrown unchanged; no cleanup needed beyond actor cancellation of `stream`.
		throw e;
	}
}
// Performs a one-shot historical read of the feed over [begin, end) and validates it
// against the subset of data->writesByVersion falling in that range.
// If skipPopped is true, both the expected writes and the received mutations below
// data->poppingVersion are discarded before comparison (popped data may be absent).
// Asserts that the feed returned exactly the expected writes: nothing missing
// (checkData empty) and nothing extra (buffered empty).
ACTOR Future<Void> historicReader(Database cx,
                                  Reference<FeedTestData> data,
                                  Version begin,
                                  Version end,
                                  bool skipPopped) {
	state std::deque<std::pair<Version, Optional<Value>>> checkData;
	state std::deque<Standalone<MutationsAndVersionRef>> buffered;
	state Reference<ChangeFeedData> results = makeReference<ChangeFeedData>();
	state Future<Void> stream = cx->getChangeFeedStream(results, data->feedID, begin, end, data->keyRange);
	state Version poppedVersionAtStart = data->poppedVersion;

	if (DEBUG_CF(data->key)) {
		fmt::print("DBG) {0} Starting historical read {1} - {2}\n", data->key.printable(), begin, end);
	}

	// Collect the expected writes in [begin, end).
	// TODO could cpu optimize this
	for (auto& it : data->writesByVersion) {
		if (it.first >= end) {
			break;
		}
		if (it.first >= begin) {
			checkData.push_back(it);
		}
	}

	try {
		// Drain the stream until end_of_stream, buffering everything received.
		loop {
			Standalone<VectorRef<MutationsAndVersionRef>> res = waitNext(results->mutations.getFuture());
			for (auto& it : res) {
				// A lone mutation on lastEpochEndPrivateKey signals a recovery rollback.
				if (it.mutations.size() == 1 && it.mutations.back().param1 == lastEpochEndPrivateKey) {
					rollbackFeed(data->key, buffered, it.version, it.mutations.back());
				} else {
					if (it.mutations.size() == 0) {
						// FIXME: THIS SHOULD NOT HAPPEN
						// FIXME: these are also getting sent past stopVersion!!
					} else {
						if (data->stopVersion.present()) {
							ASSERT(it.version <= data->stopVersion.get());
						}
						buffered.push_back(Standalone<MutationsAndVersionRef>(it));
					}
				}
			}
		}
	} catch (Error& e) {
		// end_of_stream is the normal termination of a bounded feed read.
		if (e.code() != error_code_end_of_stream) {
			throw;
		}
	}

	if (skipPopped) {
		while (!buffered.empty() && buffered.front().version < data->poppingVersion) {
			// ignore data
			buffered.pop_front();
		}
		while (!checkData.empty() && checkData.front().first < data->poppingVersion) {
			checkData.pop_front();
		}
	}

	// Pairwise-compare expected writes against received mutations.
	while (!checkData.empty() && !buffered.empty()) {
		checkNextResult(data->key, buffered, checkData);
	}
	// Change feed missing data it should have
	ASSERT(checkData.empty());
	// Change feed read extra data it shouldn't have
	ASSERT(buffered.empty());

	// check pop version of cursor
	// TODO: this check might not always work if read is for old data and SS is way behind
	// FIXME: this check doesn't work for now, probably due to above comment
	/*if (data->poppingVersion != 0) {
	    ASSERT(results->popVersion >= poppedVersionAtStart && results->popVersion <= data->poppingVersion);
	}*/

	return Void();
}
// Operations the workload can perform against a change feed.
// Must remain an unscoped enum with contiguous values starting at 0:
// pickRandomOp() casts an int back to Op, and opWeights is indexed by Op
// with OP_COUNT as the array size.
enum Op {
	CREATE_DELETE = 0, // create a new feed, or destroy an existing one
	READ = 1,          // historical read of a random version window
	UPDATE_CLEAR = 2,  // set or clear the feed's key
	STOP = 3,          // stop the feed
	POP = 4,           // pop the feed (op-driven pop mode)
	OP_COUNT = 5 /* keep this last */
};
// Simulation workload that exercises change feed operations (create/destroy, historic
// reads, updates/clears, stop, pop) against single-key feeds, with a live reader per
// feed continuously validating streamed mutations.
struct ChangeFeedOperationsWorkload : TestWorkload {
	// test settings
	double testDuration; // seconds the client loop runs before check()
	int operationsPerSecond; // poisson rate of operations in the client loop
	int targetFeeds; // number of feeds created at setup, per client
	bool clientsDisjointKeyspace; // if true, each client works in its own key range
	bool clearKeyWhenDestroy; // if true, destroying a feed also clears its key
	double clearFrequency; // probability that an update is a clear rather than a set
	int popMode; // 0=none, 1=read-driven, 2=op-driven

	int opWeights[Op::OP_COUNT]; // random weight per op; 0 disables the op
	int totalOpWeight; // sum of opWeights, used for weighted selection

	Future<Void> client;
	std::unordered_set<Key> usedKeys; // keys already assigned to feeds (alive or destroyed)
	std::vector<Reference<FeedTestData>> data; // currently live feeds

	ChangeFeedOperationsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
		testDuration = getOption(options, "testDuration"_sr, 60.0);
		operationsPerSecond = getOption(options, "opsPerSecond"_sr, 100.0);
		// Derive deterministic test parameters by peeling digits off the shared random
		// number, so all clients agree on the configuration.
		int64_t rand = wcx.sharedRandomNumber;
		targetFeeds = deterministicRandom()->randomExp(1, 1 + rand % 10);
		// Jitter the target by +/-20%, then split across clients (minimum one feed each).
		targetFeeds *= (0.8 + (deterministicRandom()->random01() * 0.4));
		targetFeeds = std::max(1, targetFeeds / clientCount);
		rand /= 10;
		clientsDisjointKeyspace = rand % 2;
		rand /= 2;
		clearKeyWhenDestroy = rand % 2;
		rand /= 2;
		bool doStops = rand % 2;
		rand /= 2;
		bool noCreateDelete = rand % 10 == 0;
		rand /= 10;
		popMode = rand % 3; // 0=none, 1=read-driven, 2=op-driven
		rand /= 3;

		ASSERT(clientId >= 0);
		ASSERT(clientId < clientCount);
		// clientCount must fit in one byte: newRandomKey() stores it in a key's last byte.
		ASSERT(clientCount < 255);

		clearFrequency = deterministicRandom()->random01();

		// Random positive weight per op; some ops are then disabled by zeroing their weight.
		for (int i = 0; i < Op::OP_COUNT; i++) {
			int randWeight = deterministicRandom()->randomExp(0, 5);
			ASSERT(randWeight > 0);
			opWeights[i] = randWeight;
		}

		if (!doStops) {
			opWeights[Op::STOP] = 0;
		}
		if (noCreateDelete) {
			opWeights[Op::CREATE_DELETE] = 0;
		}
		// Explicit pops only happen in op-driven pop mode.
		if (popMode != 2) {
			opWeights[Op::POP] = 0;
		}

		std::string weightString = "|";
		totalOpWeight = 0;
		for (int i = 0; i < Op::OP_COUNT; i++) {
			totalOpWeight += opWeights[i];
			weightString += std::to_string(opWeights[i]) + "|";
		}

		TraceEvent("ChangeFeedOperationsInit")
		    .detail("TargetFeeds", targetFeeds)
		    .detail("DisjointKeyspace", clientsDisjointKeyspace)
		    .detail("ClearWhenDestroy", clearKeyWhenDestroy)
		    .detail("DoStops", doStops)
		    .detail("NoCreateDelete", noCreateDelete)
		    .detail("Weights", weightString);
	}

	// Returns a random key that has never been handed out before (retries on collision).
	Key unusedNewRandomKey() {
		while (true) {
			Key k = newRandomKey();
			if (usedKeys.insert(k).second) {
				return k;
			}
		}
	}

	// Generates a random key guaranteed not to collide across clients: either by
	// partitioning the [0,1) key space per client, or by stamping the clientId into
	// the key's last byte.
	Key newRandomKey() {
		if (clientsDisjointKeyspace) {
			double keyspaceRange = (1.0 / clientCount);
			double randPartOfRange = deterministicRandom()->random01() * (keyspaceRange - 0.0001);
			double randomDouble = clientId * keyspaceRange + 0.0001 + randPartOfRange;
			return doubleToTestKey(randomDouble);
		} else {
			// this is kinda hacky but it guarantees disjoint keys per client
			Key ret = doubleToTestKey(deterministicRandom()->random01());
			std::string str = ret.toString();
			str.back() = (uint8_t)clientId;
			return Key(str);
		}
	}

	// Pick op with weighted average
	Op pickRandomOp() {
		int r = deterministicRandom()->randomInt(0, totalOpWeight);
		int i = 0;
		// Walk the weights, skipping zero-weight (disabled) ops.
		while (i < Op::OP_COUNT && (opWeights[i] <= r || opWeights[i] == 0)) {
			r -= opWeights[i];
			i++;
		}
		ASSERT(i < Op::OP_COUNT);
		return (Op)i;
	}

	// Creates a new feed: writes an initial value and registers the feed in one
	// transaction, then starts its live reader and adds it to `data`.
	ACTOR Future<Void> createNewFeed(Database cx, ChangeFeedOperationsWorkload* self) {
		state Transaction tr(cx);
		state Key key = self->unusedNewRandomKey();
		// popMode == 1 enables read-driven popping inside the feed's live reader.
		state Reference<FeedTestData> feedData = makeReference<FeedTestData>(key, self->popMode == 1);
		state Value initialValue = feedData->nextValue();

		if (DEBUG_CF(key)) {
			fmt::print("DBG) Creating {0}\n", key.printable());
		}

		loop {
			try {
				tr.set(key, initialValue);
				wait(updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_CREATE, feedData->keyRange));
				wait(tr.commit());

				Version createVersion = tr.getCommittedVersion();
				if (DEBUG_CF(key)) {
					fmt::print("DBG) Created {0} @ {1}\n", key.printable(), createVersion);
				}
				feedData->update(createVersion, initialValue);
				feedData->liveReader = liveReader(cx, feedData, createVersion);

				self->data.push_back(feedData);

				return Void();
			} catch (Error& e) {
				wait(tr.onError(e));
			}
		}
	}

	std::string description() const override { return "ChangeFeedOperationsWorkload"; }
	Future<Void> setup(Database const& cx) override { return _setup(cx, this); }

	// Creates the initial targetFeeds feeds sequentially.
	ACTOR Future<Void> _setup(Database cx, ChangeFeedOperationsWorkload* self) {
		// create initial targetFeeds feeds
		TraceEvent("ChangeFeedOperationsSetup").detail("InitialFeeds", self->targetFeeds).log();
		state int i;
		for (i = 0; i < self->targetFeeds; i++) {
			wait(self->createNewFeed(cx, self));
		}
		TraceEvent("ChangeFeedOperationsSetupComplete");
		return Void();
	}

	Future<Void> start(Database const& cx) override {
		client = changeFeedOperationsClient(cx->clone(), this);
		return delay(testDuration);
	}
	Future<bool> check(Database const& cx) override {
		// Cancel the client loop, then run the final validation.
		client = Future<Void>();
		return _check(cx, this);
	}

	// Final validation for one feed: wait for its live reader and pops, re-read all
	// non-popped data, and verify reading the popped prefix returns nothing.
	ACTOR Future<Void> checkFeed(Database cx, ChangeFeedOperationsWorkload* self, Reference<FeedTestData> feedData) {
		state int popIdx;
		feedData->testComplete();

		if (DEBUG_CF(feedData->key)) {
			fmt::print("Final check {0} waiting on live reader\n", feedData->key.printable());
		}
		// wait on live reader and pops to make sure they complete without error
		wait(feedData->liveReader);
		if (DEBUG_CF(feedData->key)) {
			fmt::print("Final check {0} waiting on {1} pops\n", feedData->key.printable(), feedData->pops.size());
		}
		for (popIdx = 0; popIdx < feedData->pops.size(); popIdx++) {
			wait(feedData->pops[popIdx]);
		}

		// do final check, read everything not popped
		if (DEBUG_CF(feedData->key)) {
			// NOTE(review): the pops.size() argument is unreferenced by this format string.
			fmt::print("Final check {0} waiting on data check\n", feedData->key.printable(), feedData->pops.size());
		}
		wait(self->doRead(cx, feedData, feedData->writesByVersion.size()));

		// ensure reading [0, poppedVersion) returns no results
		if (feedData->poppedVersion > 0) {
			if (DEBUG_CF(feedData->key)) {
				// NOTE(review): the pops.size() argument is unreferenced by this format string.
				fmt::print(
				    "Final check {0} waiting on read popped check\n", feedData->key.printable(), feedData->pops.size());
			}
			// FIXME: re-enable checking for popped data by changing skipPopped back to false!
			wait(historicReader(cx, feedData, 0, feedData->poppedVersion, true));
		}

		return Void();
	}

	// Runs checkFeed for every feed that is not mid-destroy, in parallel.
	ACTOR Future<bool> _check(Database cx, ChangeFeedOperationsWorkload* self) {
		TraceEvent("ChangeFeedOperationsCheck").detail("FeedCount", self->data.size()).log();
		fmt::print("Checking {0} feeds\n", self->data.size()); // TODO REMOVE
		state std::vector<Future<Void>> feedChecks;
		for (int i = 0; i < self->data.size(); i++) {
			if (self->data[i]->destroying) {
				continue;
			}
			if (DEBUG_CF(self->data[i]->key)) {
				fmt::print("Final check {0}\n", self->data[i]->key.printable());
			}
			feedChecks.push_back(self->checkFeed(cx, self, self->data[i]));
		}
		wait(waitForAll(feedChecks));
		// FIXME: check that all destroyed feeds are actually destroyed?
		TraceEvent("ChangeFeedOperationsCheckComplete");
		return true;
	}

	void getMetrics(std::vector<PerfMetric>& m) override {}

	// Stops the feed (no new mutations after the committed version); records the first
	// successful stop version.
	ACTOR Future<Void> stopFeed(Database cx, Reference<FeedTestData> feedData) {
		state Transaction tr(cx);
		if (DEBUG_CF(feedData->key)) {
			fmt::print("DBG) {0} Stopping\n", feedData->key.printable());
		}
		loop {
			try {
				wait(updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_STOP, feedData->keyRange));
				wait(tr.commit());

				Version stopVersion = tr.getCommittedVersion();
				// Only record the first stop; retries/repeat stops keep the original version.
				if (!feedData->stopVersion.present()) {
					feedData->stopVersion = stopVersion;
				}
				if (DEBUG_CF(feedData->key)) {
					fmt::print("DBG) {0} Stopped @ {1}\n", feedData->key.printable(), stopVersion);
				}
				return Void();
			} catch (Error& e) {
				wait(tr.onError(e));
			}
		}
	}

	// Op-driven pop: pop everything up to and including the feed's oldest write.
	void popFeed(Database cx, Reference<FeedTestData> feedData) {
		if (!feedData->writesByVersion.empty()) {
			feedData->pop(cx, feedData->writesByVersion.front().first + 1);
		}
	}

	// Destroys the feed (optionally clearing its key) and removes it from `data`.
	ACTOR Future<Void> destroyFeed(Database cx, ChangeFeedOperationsWorkload* self, int feedIdx) {
		state Reference<FeedTestData> feedData = self->data[feedIdx];
		state Transaction tr(cx);
		// Mark destroying first so a concurrent _check skips this feed.
		feedData->destroying = true;
		if (DEBUG_CF(feedData->key)) {
			fmt::print("DBG) {0} Destroying\n", feedData->key.printable());
		}
		loop {
			try {
				wait(
				    updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_DESTROY, feedData->keyRange));
				if (self->clearKeyWhenDestroy) {
					tr.clear(feedData->key);
				}
				wait(tr.commit());

				feedData->destroyed = true;
				// remove feed from list
				ASSERT(self->data[feedIdx]->key == feedData->key);
				swapAndPop(&self->data, feedIdx);
				if (DEBUG_CF(feedData->key)) {
					fmt::print("DBG) {0} Destroyed @ {1}\n", feedData->key.printable(), tr.getCommittedVersion());
				}
				return Void();
			} catch (Error& e) {
				wait(tr.onError(e));
			}
		}
	}

	// Historical read over a random window of ~targetReadWidth committed writes.
	// Window bounds are chosen so both endpoints align with committed versions.
	ACTOR Future<Void> doRead(Database cx, Reference<FeedTestData> feedData, int targetReadWidth) {
		if (feedData->writesByVersion.empty()) {
			return Void();
		}
		Version beginVersion;
		Version endVersion;
		if (targetReadWidth >= feedData->writesByVersion.size()) {
			// Read the whole history.
			beginVersion = feedData->writesByVersion.front().first;
			endVersion = feedData->writesByVersion.back().first + 1;
		} else {
			// either up to or including end
			int randStart = deterministicRandom()->randomInt(0, feedData->writesByVersion.size() - targetReadWidth);
			beginVersion = feedData->writesByVersion[randStart].first;
			int end = randStart + targetReadWidth;
			if (end == feedData->writesByVersion.size()) {
				endVersion = feedData->writesByVersion.back().first + 1;
			} else {
				// Make sure last included value (end version -1) is a committed version for checking
				endVersion = feedData->writesByVersion[end].first + 1;
			}
		}

		if (DEBUG_CF(feedData->key)) {
			fmt::print("DBG) {0} Reading @ {1} - {2}\n", feedData->key.printable(), beginVersion, endVersion);
		}

		// FIXME: this sometimes reads popped data!
		wait(historicReader(cx, feedData, beginVersion, endVersion, true));

		if (DEBUG_CF(feedData->key)) {
			fmt::print("DBG) {0} Read complete\n", feedData->key.printable());
		}

		return Void();
	}

	// Sets the feed's key to a fresh value, or clears it (with probability
	// clearFrequency, and never twice in a row), then records the write.
	ACTOR Future<Void> doUpdateClear(Database cx,
	                                 ChangeFeedOperationsWorkload* self,
	                                 Reference<FeedTestData> feedData) {
		state Transaction tr(cx);
		state Optional<Value> updateValue;

		// if value is already not set, don't do a clear, otherwise pick either
		if (feedData->lastCleared || deterministicRandom()->random01() > self->clearFrequency) {
			updateValue = feedData->nextValue();
			if (DEBUG_CF(feedData->key)) {
				fmt::print("DBG) {0} Setting {1}\n", feedData->key.printable(), updateValue.get().printable());
			}
		} else if (DEBUG_CF(feedData->key)) {
			fmt::print("DBG) {0} Clearing\n", feedData->key.printable());
		}
		loop {
			try {
				if (updateValue.present()) {
					tr.set(feedData->key, updateValue.get());
				} else {
					tr.clear(feedData->key);
				}

				wait(tr.commit());

				Version writtenVersion = tr.getCommittedVersion();

				if (DEBUG_CF(feedData->key) && updateValue.present()) {
					fmt::print("DBG) {0} Set {1} @ {2}\n",
					           feedData->key.printable(),
					           updateValue.get().printable(),
					           writtenVersion);
				}
				if (DEBUG_CF(feedData->key) && !updateValue.present()) {
					fmt::print("DBG) {0} Cleared @ {1}\n", feedData->key.printable(), writtenVersion);
				}

				feedData->update(writtenVersion, updateValue);
				return Void();
			} catch (Error& e) {
				wait(tr.onError(e));
			}
		}
	}

	// Main client loop: at operationsPerSecond, picks a weighted-random op and a
	// random feed and executes the op. Runs until cancelled by check().
	ACTOR Future<Void> changeFeedOperationsClient(Database cx, ChangeFeedOperationsWorkload* self) {
		state double last = now();
		loop {
			state Future<Void> waitNextOp = poisson(&last, 1.0 / self->operationsPerSecond);
			Op op = self->pickRandomOp();
			int feedIdx = deterministicRandom()->randomInt(0, self->data.size());
			if (op == Op::CREATE_DELETE) {
				// bundle these together so random creates/deletes keep about the target number of feeds
				if (deterministicRandom()->random01() < 0.5 || self->data.size() == 1) {
					wait(self->createNewFeed(cx, self));
				} else {
					wait(self->destroyFeed(cx, self, feedIdx));
				}
			} else if (op == Op::READ) {
				// relatively small random read
				wait(self->doRead(cx, self->data[feedIdx], deterministicRandom()->randomExp(2, 8)));
			} else if (op == Op::UPDATE_CLEAR) {
				wait(self->doUpdateClear(cx, self, self->data[feedIdx]));
			} else if (op == Op::STOP) {
				wait(self->stopFeed(cx, self->data[feedIdx]));
			} else if (op == Op::POP) {
				self->popFeed(cx, self->data[feedIdx]);
			} else {
				ASSERT(false);
			}

			wait(waitNextOp);
		}
	}
};
// Register this workload under the test name "ChangeFeedOperations".
WorkloadFactory<ChangeFeedOperationsWorkload> ChangeFeedOperationsWorkloadFactory("ChangeFeedOperations");

@ -131,6 +131,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
std::vector<Reference<ITenant>> tenants;
std::set<TenantName> createdTenants;
int numTenants;
int numTenantGroups;
// Map from tenant number to key prefix
std::map<int, std::string> keyPrefixes;
@ -154,6 +155,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
int maxTenants = getOption(options, "numTenants"_sr, 4);
numTenants = deterministicRandom()->randomInt(0, maxTenants + 1);
int maxTenantGroups = getOption(options, "numTenantGroups"_sr, numTenants);
numTenantGroups = deterministicRandom()->randomInt(0, maxTenantGroups + 1);
// See https://github.com/apple/foundationdb/issues/2424
if (BUGGIFY) {
enableBuggify(true, BuggifyType::Client);
@ -206,6 +210,14 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
std::string description() const override { return "FuzzApiCorrectness"; }
static TenantName getTenant(int num) { return TenantNameRef(format("tenant_%d", num)); }
Optional<TenantGroupName> getTenantGroup(int num) {
int groupNum = num % (numTenantGroups + 1);
if (groupNum == numTenantGroups - 1) {
return Optional<TenantGroupName>();
} else {
return TenantGroupNameRef(format("tenantgroup_%d", groupNum));
}
}
bool canUseTenant(Optional<TenantName> tenant) { return !tenant.present() || createdTenants.count(tenant.get()); }
Future<Void> setup(Database const& cx) override {
@ -226,7 +238,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
// The last tenant will not be created
if (i < self->numTenants) {
tenantFutures.push_back(::success(TenantAPI::createTenant(cx.getReference(), tenantName)));
TenantMapEntry entry;
entry.tenantGroup = self->getTenantGroup(i);
tenantFutures.push_back(::success(TenantAPI::createTenant(cx.getReference(), tenantName, entry)));
self->createdTenants.insert(tenantName);
}
}

@ -325,6 +325,7 @@ struct PhysicalShardMoveWorkLoad : TestWorkload {
TraceEvent("TestCancelDataMoveEnd").detail("DataMove", dataMove.toString());
}
TraceEvent("TestMoveShardStartMoveKeys").detail("DataMove", dataMoveId);
wait(moveKeys(cx,
dataMoveId,
keys,

File diff suppressed because it is too large Load Diff

@ -174,6 +174,19 @@ public:
}
bool coinflip() { return (this->random01() < 0.5); }
// Picks a number between 2^minExp and 2^maxExp, but uniformly distributed over exponential buckets 2^n - 2^n+1
// For example, randomExp(0, 4) would have a 25% chance of returning 1, a 25% chance of returning 2-3, a 25% chance
// of returning 4-7, and a 25% chance of returning 8-15
// Similar in Expected Value to doing 1 << randomInt(minExp, maxExp+1), except numbers returned aren't just powers
// of 2
int randomExp(int minExp, int maxExp) {
if (minExp == maxExp) { // N=2, case
return 1 << minExp;
}
int val = 1 << this->randomInt(minExp, maxExp);
return this->randomInt(val, val * 2);
}
};
extern FILE* randLog;

@ -228,11 +228,14 @@ ERROR( tenant_name_required, 2130, "Tenant name must be specified to access data
ERROR( tenant_not_found, 2131, "Tenant does not exist" )
ERROR( tenant_already_exists, 2132, "A tenant with the given name already exists" )
ERROR( tenant_not_empty, 2133, "Cannot delete a non-empty tenant" )
ERROR( invalid_tenant_name, 2134, "Tenant name cannot begin with \\xff");
ERROR( tenant_prefix_allocator_conflict, 2135, "The database already has keys stored at the prefix allocated for the tenant");
ERROR( tenants_disabled, 2136, "Tenants have been disabled in the cluster");
ERROR( unknown_tenant, 2137, "Tenant is not available from this server")
ERROR( illegal_tenant_access, 2138, "Illegal tenant access")
ERROR( invalid_tenant_name, 2134, "Tenant name cannot begin with \\xff" )
ERROR( tenant_prefix_allocator_conflict, 2135, "The database already has keys stored at the prefix allocated for the tenant" )
ERROR( tenants_disabled, 2136, "Tenants have been disabled in the cluster" )
ERROR( unknown_tenant, 2137, "Tenant is not available from this server" )
ERROR( illegal_tenant_access, 2138, "Illegal tenant access" )
ERROR( invalid_tenant_group_name, 2139, "Tenant group name cannot begin with \\xff" )
ERROR( invalid_tenant_configuration, 2140, "Tenant configuration is invalid" )
ERROR( cluster_no_capacity, 2141, "Cluster does not have capacity to perform the specified operation" )
// 2200 - errors from bindings and official APIs
ERROR( api_version_unset, 2200, "API version is not set" )

@ -130,8 +130,7 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES fast/BackupToDBCorrectnessClean.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifySmall.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifySmallClean.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifyAtomicOps.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifyCycle.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleMoveVerifyCycle.toml)
add_fdb_test(TEST_FILES fast/CacheTest.toml)
add_fdb_test(TEST_FILES fast/CloggedSideband.toml)
add_fdb_test(TEST_FILES fast/CompressionUtilsUnit.toml)
@ -140,6 +139,7 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES fast/CycleAndLock.toml)
add_fdb_test(TEST_FILES fast/CycleTest.toml)
add_fdb_test(TEST_FILES fast/ChangeFeeds.toml)
add_fdb_test(TEST_FILES fast/ChangeFeedOperations.toml)
add_fdb_test(TEST_FILES fast/DataLossRecovery.toml)
add_fdb_test(TEST_FILES fast/EncryptionOps.toml)
# TODO: fix failures and renable the test
@ -200,6 +200,8 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml IGNORE)
add_fdb_test(TEST_FILES fast/StorageServerCheckpointRestore.toml IGNORE)
endif()
add_fdb_test(TEST_FILES rare/BlobGranuleVerifyAtomicOps.toml)
add_fdb_test(TEST_FILES rare/BlobGranuleVerifyCycle.toml)
add_fdb_test(TEST_FILES rare/CheckRelocation.toml)
add_fdb_test(TEST_FILES rare/ClogUnclog.toml)
add_fdb_test(TEST_FILES rare/CloggedCycleWithKills.toml)

@ -0,0 +1,48 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
[[knobs]]
bg_range_source = "blobRangeKeys"
[[test]]
testTitle = 'BlobGranuleMoveVerifyCycle'
[[test.workload]]
testName = 'Cycle'
transactionsPerSecond = 250.0
testDuration = 60.0
expectedRate = 0
[[test.workload]]
testName = 'RandomMoveKeys'
testDuration = 60.0
[[test.workload]]
testName = 'BlobGranuleVerifier'
testDuration = 60.0
[[test.workload]]
testName = 'RandomClogging'
testDuration = 60.0
[[test.workload]]
testName = 'Rollback'
meanDelay = 60.0
testDuration = 60.0
[[test.workload]]
testName = 'Attrition'
machinesToKill = 10
machinesToLeave = 3
reboot = true
testDuration = 60.0
[[test.workload]]
testName = 'Attrition'
machinesToKill = 10
machinesToLeave = 3
reboot = true
testDuration = 60.0

@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [3, 4, 5]

@ -3,7 +3,7 @@ blobGranulesEnabled = true
allowDefaultTenant = false
# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [3, 4]
storageEngineExcludeTypes = [3, 4, 5]
[[knobs]]
bg_range_source = "blobRangeKeys"

@ -0,0 +1,10 @@
[configuration]
allowDefaultTenant = false
# TODO add failure events, and then add a version that also supports randomMoveKeys
[[test]]
testTitle = 'ChangeFeedOperationsTest'
[[test.workload]]
testName = 'ChangeFeedOperations'

@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]

@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]

@ -7,4 +7,4 @@ startDelay = 0
[[test.workload]]
testName = 'UnitTests'
maxTestCases = 1
testsMatching = '/blobgranule/files/deltaFormatUnitTest'
testsMatching = '/'

@ -2,6 +2,8 @@
blobGranulesEnabled = true
allowDefaultTenant = false
allowDisablingTenants = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]

@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]

@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]

@ -10,7 +10,7 @@ runSetup = true
[[test.workload]]
testName = 'TenantManagement'
maxTenants = 1000
maxTenants = 1000
testDuration = 60
[[test.workload]]

@ -10,5 +10,5 @@ runSetup = true
[[test.workload]]
testName = 'TenantManagement'
maxTenants = 1000
testDuration = 60
maxTenants = 1000
testDuration = 60