/* * ManagementAPI.actor.cpp * * This source file is part of the FoundationDB open source project * * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include "fdbclient/GenericManagementAPI.actor.h" #include "fmt/format.h" #include "fdbclient/Knobs.h" #include "flow/Arena.h" #include "fdbclient/ClusterConnectionMemoryRecord.h" #include "fdbclient/FDBOptions.g.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/ReadYourWrites.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/SystemData.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/CoordinationInterface.h" #include "fdbclient/DatabaseContext.h" #include "fdbrpc/simulator.h" #include "fdbclient/StatusClient.h" #include "flow/Trace.h" #include "flow/UnitTest.h" #include "fdbrpc/ReplicationPolicy.h" #include "fdbrpc/Replication.h" #include "fdbclient/Schemas.h" #include "flow/actorcompiler.h" // This must be the last #include. 
// Returns true iff the whole string parses as a base-10 integer: strtol must
// consume every character (the end pointer lands on the terminating NUL).
// Empty strings are rejected.
bool isInteger(const std::string& s) {
    if (s.empty())
        return false;
    char* p;
    strtol(s.c_str(), &p, 10);
    return (*p == 0);
}

// Defines the mapping between configuration names (as exposed by fdbcli, buildConfiguration()) and actual configuration
// parameters.
// Returns an empty map when the token is not recognized (callers treat that as UNKNOWN_OPTION).
// NOTE(review): template argument lists appear to have been stripped from this file by extraction —
// e.g. this should presumably read std::map<std::string, std::string>. Restore from upstream before building.
std::map configForToken(std::string const& mode) {
    std::map out;
    std::string p = configKeysPrefix.toString();

    if (mode == "new") {
        out[p + "initialized"] = "1";
        return out;
    }

    if (mode == "tss") {
        // Set temporary marker in config map to mark that this is a tss configuration and not a normal storage/log
        // configuration. A bit of a hack but reuses the parsing code nicely.
        out[p + "istss"] = "1";
        return out;
    }

    if (mode == "locked") {
        // Setting this key is interpreted as an instruction to use the normal version-stamp-based mechanism for
        // locking the database.
        out[databaseLockedKey.toString()] = deterministicRandom()->randomUniqueID().toString();
        return out;
    }

    size_t pos;

    // key:=value is unvalidated and unchecked
    pos = mode.find(":=");
    if (pos != std::string::npos) {
        out[p + mode.substr(0, pos)] = mode.substr(pos + 2);
        return out;
    }

    // key=value is constrained to a limited set of options and basic validation is performed
    pos = mode.find("=");
    if (pos != std::string::npos) {
        std::string key = mode.substr(0, pos);
        std::string value = mode.substr(pos + 1);

        if (key == "proxies" && isInteger(value)) {
            // Legacy combined "proxies" count: split it into GRV and commit proxies below.
            // NOTE(review): this string literal contains a raw line break introduced by extraction —
            // confirm the exact wording against upstream.
            printf("Warning: Proxy role is being split into GRV Proxy and Commit Proxy, now prefer configuring "
                   "'grv_proxies' and 'commit_proxies' separately. 
Generally we should follow that 'commit_proxies'"
                   " is three times of 'grv_proxies' count and 'grv_proxies' should be not more than 4.\n");
            int proxiesCount = atoi(value.c_str());
            if (proxiesCount == -1) {
                // -1 means "auto": derive the total from the knob-driven defaults.
                proxiesCount = CLIENT_KNOBS->DEFAULT_AUTO_GRV_PROXIES + CLIENT_KNOBS->DEFAULT_AUTO_COMMIT_PROXIES;
                ASSERT_WE_THINK(proxiesCount >= 2);
            }
            if (proxiesCount < 2) {
                printf("Error: At least 2 proxies (1 GRV proxy and 1 Commit proxy) are required.\n");
                return out;
            }

            // Split the combined count using the configured commit:GRV ratio, clamped to [1, DEFAULT_MAX_GRV_PROXIES].
            int grvProxyCount =
                std::max(1,
                         std::min(CLIENT_KNOBS->DEFAULT_MAX_GRV_PROXIES,
                                  proxiesCount / (CLIENT_KNOBS->DEFAULT_COMMIT_GRV_PROXIES_RATIO + 1)));
            int commitProxyCount = proxiesCount - grvProxyCount;
            ASSERT_WE_THINK(grvProxyCount >= 1 && commitProxyCount >= 1);

            out[p + "grv_proxies"] = std::to_string(grvProxyCount);
            out[p + "commit_proxies"] = std::to_string(commitProxyCount);
            printf("%d proxies are automatically converted into %d GRV proxies and %d Commit proxies.\n",
                   proxiesCount,
                   grvProxyCount,
                   commitProxyCount);

            TraceEvent("DatabaseConfigurationProxiesSpecified")
                .detail("SpecifiedProxies", atoi(value.c_str()))
                .detail("EffectiveSpecifiedProxies", proxiesCount)
                .detail("ConvertedGrvProxies", grvProxyCount)
                .detail("ConvertedCommitProxies", commitProxyCount);
        }

        // Plain integer-valued configuration keys are passed straight through.
        if ((key == "logs" || key == "commit_proxies" || key == "grv_proxies" || key == "resolvers" ||
             key == "remote_logs" || key == "log_routers" || key == "usable_regions" ||
             key == "repopulate_anti_quorum" || key == "count") &&
            isInteger(value)) {
            out[p + key] = value;
        }

        if (key == "regions") {
            // The region description arrives as JSON; wrap it in a status object and serialize it
            // with the region-configuration protocol version.
            json_spirit::mValue mv;
            json_spirit::read_string(value, mv);
            StatusObject regionObj;
            regionObj["regions"] = mv;
            out[p + key] =
                BinaryWriter::toValue(regionObj, IncludeVersion(ProtocolVersion::withRegionConfiguration())).toString();
        }

        if (key == "perpetual_storage_wiggle" && isInteger(value)) {
            int ppWiggle = std::stoi(value);
            if (ppWiggle >= 2 || ppWiggle < 0) {
                printf("Error: Only 0 and 1 are valid values of perpetual_storage_wiggle at present.\n");
                return out;
            }
            out[p + key] = value;
        }

        if (key == "perpetual_storage_wiggle_locality") {
            if (!isValidPerpetualStorageWiggleLocality(value)) {
                // NOTE(review): the angle-bracketed "<locality_key>:<locality_value>" portion of this message
                // appears to have been stripped by extraction — confirm against upstream.
                printf("Error: perpetual_storage_wiggle_locality should be in : "
                       "format or enter 0 to disable the locality match for wiggling.\n");
                return out;
            }
            out[p + key] = value;
        }

        if (key == "storage_migration_type") {
            StorageMigrationType type;
            if (value == "disabled") {
                type = StorageMigrationType::DISABLED;
            } else if (value == "aggressive") {
                type = StorageMigrationType::AGGRESSIVE;
            } else if (value == "gradual") {
                type = StorageMigrationType::GRADUAL;
            } else {
                printf("Error: Only disabled|aggressive|gradual are valid for storage_migration_type.\n");
                return out;
            }
            // Stored as the numeric enum value.
            out[p + key] = format("%d", type);
        }

        if (key == "blob_granules_enabled") {
            int enabled = std::stoi(value);
            if (enabled != 0 && enabled != 1) {
                printf("Error: Only 0 or 1 are valid values for blob_granules_enabled. "
                       "1 enables blob granules and 0 disables them.\n");
                return out;
            }
            out[p + key] = value;
        }

        if (key == "tenant_mode") {
            TenantMode tenantMode;
            if (value == "disabled") {
                tenantMode = TenantMode::DISABLED;
            } else if (value == "optional_experimental") {
                tenantMode = TenantMode::OPTIONAL_TENANT;
            } else if (value == "required_experimental") {
                tenantMode = TenantMode::REQUIRED;
            } else {
                printf("Error: Only disabled|optional_experimental|required_experimental are valid for tenant_mode.\n");
                return out;
            }
            // Stored as the numeric enum value.
            out[p + key] = format("%d", tenantMode);
        }
        return out;
    }

    // Storage/log engine selection modes. Each mode picks a (tlog engine, storage engine) pair.
    // NOTE(review): presumably Optional<KeyValueStoreType>; template args stripped by extraction.
    Optional logType;
    Optional storeType;
    if (mode == "ssd-1") {
        logType = KeyValueStoreType::SSD_BTREE_V1;
        storeType = KeyValueStoreType::SSD_BTREE_V1;
    } else if (mode == "ssd" || mode == "ssd-2") {
        logType = KeyValueStoreType::SSD_BTREE_V2;
        storeType = KeyValueStoreType::SSD_BTREE_V2;
    } else if (mode == "ssd-redwood-1-experimental") {
        logType = KeyValueStoreType::SSD_BTREE_V2;
        storeType = KeyValueStoreType::SSD_REDWOOD_V1;
    } else if (mode == "ssd-rocksdb-v1") {
        logType = KeyValueStoreType::SSD_BTREE_V2;
        storeType =
        KeyValueStoreType::SSD_ROCKSDB_V1;
    } else if (mode == "ssd-sharded-rocksdb") {
        logType = KeyValueStoreType::SSD_BTREE_V2;
        storeType = KeyValueStoreType::SSD_SHARDED_ROCKSDB;
    } else if (mode == "memory" || mode == "memory-2") {
        logType = KeyValueStoreType::SSD_BTREE_V2;
        storeType = KeyValueStoreType::MEMORY;
    } else if (mode == "memory-1") {
        logType = KeyValueStoreType::MEMORY;
        storeType = KeyValueStoreType::MEMORY;
    } else if (mode == "memory-radixtree-beta") {
        logType = KeyValueStoreType::SSD_BTREE_V2;
        storeType = KeyValueStoreType::MEMORY_RADIXTREE;
    }
    // Add any new store types to fdbserver/workloads/ConfigureDatabase, too

    if (storeType.present()) {
        // Engines are stored as their numeric store-type codes.
        out[p + "log_engine"] = format("%d", logType.get().storeType());
        out[p + "storage_engine"] = format("%d", KeyValueStoreType::StoreType(storeType.get()));
        return out;
    }

    // Redundancy modes: each sets the storage replica count, the tlog replica count, and the
    // matching replication policies (serialized below).
    // NOTE(review): presumably Reference<IReplicationPolicy>; template args stripped by extraction.
    std::string redundancy, log_replicas;
    Reference storagePolicy;
    Reference tLogPolicy;

    bool redundancySpecified = true;
    if (mode == "single") {
        redundancy = "1";
        log_replicas = "1";
        storagePolicy = tLogPolicy = Reference(new PolicyOne());
    } else if (mode == "double" || mode == "fast_recovery_double") {
        redundancy = "2";
        log_replicas = "2";
        storagePolicy = tLogPolicy =
            Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne())));
    } else if (mode == "triple" || mode == "fast_recovery_triple") {
        redundancy = "3";
        log_replicas = "3";
        storagePolicy = tLogPolicy =
            Reference(new PolicyAcross(3, "zoneid", Reference(new PolicyOne())));
    } else if (mode == "three_datacenter" || mode == "multi_dc") {
        redundancy = "6";
        log_replicas = "4";
        // Storage: 2 replicas in each of 3 DCs; tlogs: 2 replicas in each of 2 DCs.
        storagePolicy = Reference(
            new PolicyAcross(3, "dcid", Reference(
                new PolicyAcross(2, "zoneid", Reference(new PolicyOne())))));
        tLogPolicy = Reference(
            new PolicyAcross(2, "dcid", Reference(
                new PolicyAcross(2, "zoneid", Reference(new PolicyOne())))));
    } else if (mode == "three_datacenter_fallback") {
        redundancy = "4";
        log_replicas = "4";
        storagePolicy = tLogPolicy = Reference(
            new PolicyAcross(2, "dcid", Reference(
                new PolicyAcross(2, "zoneid", Reference(new PolicyOne())))));
    } else if (mode == "three_data_hall") {
        redundancy = "3";
        log_replicas = "4";
        storagePolicy = Reference(
            new PolicyAcross(3, "data_hall", Reference(new PolicyOne())));
        tLogPolicy = Reference(
            new PolicyAcross(2, "data_hall", Reference(
                new PolicyAcross(2, "zoneid", Reference(new PolicyOne())))));
    } else if (mode == "three_data_hall_fallback") {
        redundancy = "2";
        log_replicas = "4";
        storagePolicy = Reference(
            new PolicyAcross(2, "data_hall", Reference(new PolicyOne())));
        tLogPolicy = Reference(
            new PolicyAcross(2, "data_hall", Reference(
                new PolicyAcross(2, "zoneid", Reference(new PolicyOne())))));
    } else
        redundancySpecified = false;

    if (redundancySpecified) {
        out[p + "storage_replicas"] = redundancy;
        out[p + "log_replicas"] = log_replicas;
        out[p + "log_anti_quorum"] = "0";

        // Serialize both policies with the replication-policy protocol version.
        BinaryWriter policyWriter(IncludeVersion(ProtocolVersion::withReplicationPolicy()));
        serializeReplicationPolicy(policyWriter, storagePolicy);
        out[p + "storage_replication_policy"] = policyWriter.toValue().toString();

        policyWriter = BinaryWriter(IncludeVersion(ProtocolVersion::withReplicationPolicy()));
        serializeReplicationPolicy(policyWriter, tLogPolicy);
        out[p + "log_replication_policy"] = policyWriter.toValue().toString();
        return out;
    }

    // Remote (secondary-region) redundancy modes.
    std::string remote_redundancy, remote_log_replicas;
    Reference remoteTLogPolicy;
    bool remoteRedundancySpecified = true;
    if (mode == "remote_default") {
        remote_redundancy = "0";
        remote_log_replicas = "0";
        remoteTLogPolicy = Reference();
    } else if (mode == "remote_single") {
        remote_redundancy = "1";
        remote_log_replicas = "1";
        remoteTLogPolicy = Reference(new PolicyOne());
    } else if (mode == "remote_double") {
        remote_redundancy = "2";
        remote_log_replicas = "2";
        remoteTLogPolicy =
            Reference(new PolicyAcross(2, "zoneid", Reference(new PolicyOne())));
    } else if (mode == "remote_triple") {
        remote_redundancy = "3";
        remote_log_replicas = "3";
        remoteTLogPolicy =
            Reference(new PolicyAcross(3, "zoneid", Reference(new PolicyOne())));
    } else if (mode == "remote_three_data_hall") { // FIXME: not tested in simulation
        remote_redundancy = "3";
        remote_log_replicas = "4";
        remoteTLogPolicy = Reference(
            new PolicyAcross(2, "data_hall", Reference(
                new PolicyAcross(2, "zoneid", Reference(new PolicyOne())))));
    } else
        remoteRedundancySpecified = false;

    if (remoteRedundancySpecified) {
        out[p + "remote_log_replicas"] = remote_log_replicas;

        BinaryWriter policyWriter(IncludeVersion(ProtocolVersion::withReplicationPolicy()));
        serializeReplicationPolicy(policyWriter, remoteTLogPolicy);
        out[p + "remote_log_policy"] = policyWriter.toValue().toString();
        return out;
    }

    // Unrecognized token: empty map signals UNKNOWN_OPTION to the caller.
    return out;
}

// Merges the per-token maps from configForToken() into outConf, rejecting unknown tokens and
// keys specified twice. Afterwards, synthesizes default zone-based replication policies for any
// replica counts that were given without an explicit policy, and rewrites "istss" configurations
// into tss_count / tss_storage_engine parameters.
ConfigurationResult buildConfiguration(std::vector const& modeTokens, std::map& outConf) {
    for (auto it : modeTokens) {
        std::string mode = it.toString();
        auto m = configForToken(mode);
        if (!m.size()) {
            TraceEvent(SevWarnAlways, "UnknownOption").detail("Option", mode);
            return ConfigurationResult::UNKNOWN_OPTION;
        }

        for (auto t = m.begin(); t != m.end(); ++t) {
            if (outConf.count(t->first)) {
                TraceEvent(SevWarnAlways, "ConflictingOption").detail("Option", t->first);
                return ConfigurationResult::CONFLICTING_OPTIONS;
            }
            outConf[t->first] = t->second;
        }
    }

    auto p = configKeysPrefix.toString();
    if (!outConf.count(p + "storage_replication_policy") && outConf.count(p + "storage_replicas")) {
        // Storage replica count given without a policy: default to N-across-zoneid.
        int storageCount = stoi(outConf[p + "storage_replicas"]);
        Reference storagePolicy = Reference(
            new PolicyAcross(storageCount, "zoneid", Reference(new PolicyOne())));
        BinaryWriter policyWriter(IncludeVersion(ProtocolVersion::withReplicationPolicy()));
        serializeReplicationPolicy(policyWriter, storagePolicy);
        outConf[p + "storage_replication_policy"] = policyWriter.toValue().toString();
    }

    if (!outConf.count(p + "log_replication_policy") && outConf.count(p + "log_replicas")) {
        // Same defaulting for tlogs.
        int logCount = stoi(outConf[p + "log_replicas"]);
        Reference logPolicy = Reference(
            new PolicyAcross(logCount, "zoneid", Reference(new PolicyOne())));
        BinaryWriter
        policyWriter(IncludeVersion(ProtocolVersion::withReplicationPolicy()));
        serializeReplicationPolicy(policyWriter, logPolicy);
        outConf[p + "log_replication_policy"] = policyWriter.toValue().toString();
    }

    if (outConf.count(p + "istss")) {
        // redo config parameters to be tss config instead of normal config

        // save param values from parsing as a normal config
        bool isNew = outConf.count(p + "initialized");
        // NOTE(review): presumably Optional<std::string>; template args stripped by extraction.
        Optional count;
        Optional storageEngine;
        if (outConf.count(p + "count")) {
            count = Optional(outConf[p + "count"]);
        }
        if (outConf.count(p + "storage_engine")) {
            storageEngine = Optional(outConf[p + "storage_engine"]);
        }

        // A new tss setup must have count + storage engine. An adjustment must have at least one.
        if ((isNew && (!count.present() || !storageEngine.present())) ||
            (!isNew && !count.present() && !storageEngine.present())) {
            return ConfigurationResult::INCOMPLETE_CONFIGURATION;
        }

        // clear map and only reset tss parameters
        outConf.clear();
        if (count.present()) {
            outConf[p + "tss_count"] = count.get();
        }
        if (storageEngine.present()) {
            outConf[p + "tss_storage_engine"] = storageEngine.get();
        }
    }

    return ConfigurationResult::SUCCESS;
}

// Convenience overload: splits a space-separated mode string into tokens and delegates to the
// vector overload above.
ConfigurationResult buildConfiguration(std::string const& configMode, std::map& outConf) {
    std::vector modes;
    int p = 0;
    while (p < configMode.size()) {
        int end = configMode.find_first_of(' ', p);
        if (end == configMode.npos)
            end = configMode.size();
        modes.push_back(StringRef(configMode).substr(p, end - p));
        p = end + 1;
    }

    return buildConfiguration(modes, outConf);
}

// True iff the option map carries every key required for a brand-new database configuration:
// log replicas, log anti-quorum, storage replicas, and both engines.
bool isCompleteConfiguration(std::map const& options) {
    std::string p = configKeysPrefix.toString();

    return options.count(p + "log_replicas") == 1 && options.count(p + "log_anti_quorum") == 1 &&
           options.count(p + "storage_replicas") == 1 && options.count(p + "log_engine") == 1 &&
           options.count(p + "storage_engine") == 1;
}

// Reads the full \xff/conf/ key range inside the caller's transaction and decodes it into a
// DatabaseConfiguration. Lock-aware so it works on locked databases.
ACTOR Future getDatabaseConfiguration(Transaction* tr) {
    tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
    tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
    RangeResult res = wait(tr->getRange(configKeys, CLIENT_KNOBS->TOO_MANY));
    ASSERT(res.size() < CLIENT_KNOBS->TOO_MANY);
    DatabaseConfiguration config;
    // NOTE(review): cast target presumably VectorRef<KeyValueRef>; template args stripped by extraction.
    config.fromKeyValues((VectorRef)res);
    return config;
}

// Retry-loop wrapper around the Transaction* overload above.
ACTOR Future getDatabaseConfiguration(Database cx) {
    state Transaction tr(cx);
    loop {
        try {
            DatabaseConfiguration config = wait(getDatabaseConfiguration(&tr));
            return config;
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}

// Derives an automatic configuration ("configure auto") proposal from a status document:
// desired proxy/resolver/log counts and per-process class reassignments. Returns a
// default-constructed ConfigureAutoResult whenever the status document is missing required
// fields or the cluster shape rules out an automatic proposal.
ConfigureAutoResult parseConfig(StatusObject const& status) {
    ConfigureAutoResult result;
    StatusObjectReader statusObj(status);

    StatusObjectReader statusObjCluster;
    if (!statusObj.get("cluster", statusObjCluster))
        return ConfigureAutoResult();

    StatusObjectReader statusObjConfig;
    if (!statusObjCluster.get("configuration", statusObjConfig))
        return ConfigureAutoResult();

    if (!statusObjConfig.get("redundancy.factor", result.old_replication))
        return ConfigureAutoResult();

    result.auto_replication = result.old_replication;

    // Map the current redundancy mode to its replica counts ("single" is upgraded to "double").
    [[maybe_unused]] int storage_replication;
    int log_replication;
    if (result.old_replication == "single") {
        result.auto_replication = "double";
        storage_replication = 2;
        log_replication = 2;
    } else if (result.old_replication == "double" || result.old_replication == "fast_recovery_double") {
        storage_replication = 2;
        log_replication = 2;
    } else if (result.old_replication == "triple" || result.old_replication == "fast_recovery_triple") {
        storage_replication = 3;
        log_replication = 3;
    } else if (result.old_replication == "three_datacenter") {
        storage_replication = 6;
        log_replication = 4;
    } else if (result.old_replication == "three_datacenter_fallback") {
        storage_replication = 4;
        log_replication = 4;
    } else if (result.old_replication == "three_data_hall") {
        storage_replication = 3;
        log_replication = 4;
    } else if (result.old_replication == "three_data_hall_fallback") {
        storage_replication = 2;
        log_replication = 4;
    } else
        return ConfigureAutoResult();

    StatusObjectReader machinesMap;
    if
        (!statusObjCluster.get("machines", machinesMap))
        return ConfigureAutoResult();

    // Count machines and collect datacenter ids; auto-configure only supports single-DC clusters.
    // NOTE(review): container template args stripped by extraction throughout this block.
    std::map machineid_dcid;
    std::set datacenters;
    int machineCount = 0;
    for (auto mach : machinesMap.obj()) {
        StatusObjectReader machine(mach.second);
        std::string dcId;
        if (machine.get("datacenter_id", dcId)) {
            machineid_dcid[mach.first] = dcId;
            datacenters.insert(dcId);
        }
        machineCount++;
    }

    result.machines = machineCount;

    if (datacenters.size() > 1)
        return ConfigureAutoResult();

    StatusObjectReader processesMap;
    if (!statusObjCluster.get("processes", processesMap))
        return ConfigureAutoResult();

    // Scan all non-excluded processes, recording their class assignments and which machines
    // currently host transaction-role processes.
    std::set oldMachinesWithTransaction;
    int oldTransactionProcesses = 0;
    std::map>> machine_processes;
    int processCount = 0;
    for (auto proc : processesMap.obj()) {
        StatusObjectReader process(proc.second);
        if (!process.has("excluded") || !process.last().get_bool()) {
            std::string addrStr;
            if (!process.get("address", addrStr))
                return ConfigureAutoResult();
            std::string class_source;
            if (!process.get("class_source", class_source))
                return ConfigureAutoResult();
            std::string class_type;
            if (!process.get("class_type", class_type))
                return ConfigureAutoResult();
            std::string machineId;
            if (!process.get("machine_id", machineId))
                return ConfigureAutoResult();

            NetworkAddress addr = NetworkAddress::parse(addrStr);
            ProcessClass processClass(class_type, class_source);

            if (processClass.classType() == ProcessClass::TransactionClass ||
                processClass.classType() == ProcessClass::LogClass) {
                oldMachinesWithTransaction.insert(machineId);
            }

            // Any stateless/transaction-adjacent class counts toward the existing transaction-process total.
            if (processClass.classType() == ProcessClass::TransactionClass ||
                processClass.classType() == ProcessClass::CommitProxyClass ||
                processClass.classType() == ProcessClass::GrvProxyClass ||
                processClass.classType() == ProcessClass::ResolutionClass ||
                processClass.classType() == ProcessClass::StatelessClass ||
                processClass.classType() == ProcessClass::LogClass) {
                oldTransactionProcesses++;
            }

            if (processClass.classSource() == ProcessClass::AutoSource) {
                // Reset previously auto-assigned classes so this run can re-decide them.
                processClass = ProcessClass(ProcessClass::UnsetClass, ProcessClass::CommandLineSource);
                result.address_class[addr] = processClass;
            }

            if (processClass.classType() != ProcessClass::TesterClass) {
                machine_processes[machineId].emplace_back(addr, processClass);
                processCount++;
            }
        }
    }

    result.processes = processCount;
    result.old_processes_with_transaction = oldTransactionProcesses;
    result.old_machines_with_transaction = oldMachinesWithTransaction.size();

    // Index machines by (process count, machine id) so later passes visit larger machines first
    // in a deterministic order.
    std::map, std::vector>> count_processes;
    for (auto& it : machine_processes) {
        count_processes[std::make_pair(it.second.size(), it.first)] = it.second;
    }

    std::set machinesWithTransaction;
    std::set machinesWithStorage;
    int totalTransactionProcesses = 0;
    int existingProxyCount = 0;
    int existingGrvProxyCount = 0;
    int existingResolverCount = 0;
    int existingStatelessCount = 0;
    for (auto& it : machine_processes) {
        for (auto& proc : it.second) {
            if (proc.second == ProcessClass::TransactionClass || proc.second == ProcessClass::LogClass) {
                totalTransactionProcesses++;
                machinesWithTransaction.insert(it.first);
            }
            if (proc.second == ProcessClass::StatelessClass) {
                existingStatelessCount++;
            }
            if (proc.second == ProcessClass::CommitProxyClass) {
                existingProxyCount++;
            }
            if (proc.second == ProcessClass::GrvProxyClass) {
                existingGrvProxyCount++;
            }
            if (proc.second == ProcessClass::ResolutionClass) {
                existingResolverCount++;
            }
            if (proc.second == ProcessClass::StorageClass) {
                machinesWithStorage.insert(it.first);
            }
            // Unset-by-database processes are treated as storage hosts.
            if (proc.second == ProcessClass::UnsetClass && proc.second.classSource() == ProcessClass::DBSource) {
                machinesWithStorage.insert(it.first);
            }
        }
    }

    // Auto-configure requires at least 10 usable processes.
    if (processCount < 10)
        return ConfigureAutoResult();

    // For each role, prefer the explicitly configured count; otherwise fall back to the
    // knob default / previously auto-chosen value and propose the desired count.
    result.desired_resolvers = 1;
    int resolverCount;
    if (!statusObjConfig.get("resolvers", result.old_resolvers)) {
        result.old_resolvers = CLIENT_KNOBS->DEFAULT_AUTO_RESOLVERS;
        statusObjConfig.get("auto_resolvers", result.old_resolvers);
        result.auto_resolvers = result.desired_resolvers;
        resolverCount = result.auto_resolvers;
    } else {
        result.auto_resolvers = result.old_resolvers;
        resolverCount = result.old_resolvers;
    }

    // Commit proxies scale with cluster size, capped at 12.
    result.desired_commit_proxies = std::max(std::min(12, processCount / 15), 1);
    int proxyCount;
    if (!statusObjConfig.get("commit_proxies", result.old_commit_proxies)) {
        result.old_commit_proxies = CLIENT_KNOBS->DEFAULT_AUTO_COMMIT_PROXIES;
        statusObjConfig.get("auto_commit_proxies", result.old_commit_proxies);
        result.auto_commit_proxies = result.desired_commit_proxies;
        proxyCount = result.auto_commit_proxies;
    } else {
        result.auto_commit_proxies = result.old_commit_proxies;
        proxyCount = result.old_commit_proxies;
    }

    // GRV proxies scale with cluster size, capped at 4.
    result.desired_grv_proxies = std::max(std::min(4, processCount / 20), 1);
    int grvProxyCount;
    if (!statusObjConfig.get("grv_proxies", result.old_grv_proxies)) {
        result.old_grv_proxies = CLIENT_KNOBS->DEFAULT_AUTO_GRV_PROXIES;
        statusObjConfig.get("auto_grv_proxies", result.old_grv_proxies);
        result.auto_grv_proxies = result.desired_grv_proxies;
        grvProxyCount = result.auto_grv_proxies;
    } else {
        result.auto_grv_proxies = result.old_grv_proxies;
        grvProxyCount = result.old_grv_proxies;
    }

    // Logs scale with cluster size, bounded below by replication+1 and above by machine count.
    result.desired_logs = std::min(12, processCount / 20);
    result.desired_logs = std::max(result.desired_logs, log_replication + 1);
    result.desired_logs = std::min(result.desired_logs, machine_processes.size());
    int logCount;
    if (!statusObjConfig.get("logs", result.old_logs)) {
        result.old_logs = CLIENT_KNOBS->DEFAULT_AUTO_LOGS;
        statusObjConfig.get("auto_logs", result.old_logs);
        result.auto_logs = result.desired_logs;
        logCount = result.auto_logs;
    } else {
        result.auto_logs = result.old_logs;
        logCount = result.old_logs;
    }

    logCount = std::max(logCount, log_replication);

    totalTransactionProcesses += std::min(existingProxyCount, proxyCount);
    totalTransactionProcesses += std::min(existingGrvProxyCount, grvProxyCount);
    totalTransactionProcesses += std::min(existingResolverCount, resolverCount);
    totalTransactionProcesses += existingStatelessCount;

    // if one process on a machine is transaction class, make them all transaction class
    for (auto& it : count_processes) {
        if (machinesWithTransaction.count(it.first.second) && !machinesWithStorage.count(it.first.second)) {
            for (auto& proc : it.second) {
                if (proc.second == ProcessClass::UnsetClass &&
                    proc.second.classSource() == ProcessClass::CommandLineSource) {
                    result.address_class[proc.first] =
                        ProcessClass(ProcessClass::TransactionClass, ProcessClass::AutoSource);
                    totalTransactionProcesses++;
                }
            }
        }
    }

    int desiredTotalTransactionProcesses = logCount + resolverCount + proxyCount + grvProxyCount;

    // add machines with all transaction class until we have enough processes and enough machines
    for (auto& it : count_processes) {
        if (machinesWithTransaction.size() >= logCount && totalTransactionProcesses >= desiredTotalTransactionProcesses)
            break;

        if (!machinesWithTransaction.count(it.first.second) && !machinesWithStorage.count(it.first.second)) {
            for (auto& proc : it.second) {
                if (proc.second == ProcessClass::UnsetClass &&
                    proc.second.classSource() == ProcessClass::CommandLineSource) {
                    ASSERT(proc.second != ProcessClass::TransactionClass);
                    result.address_class[proc.first] =
                        ProcessClass(ProcessClass::TransactionClass, ProcessClass::AutoSource);
                    totalTransactionProcesses++;
                    machinesWithTransaction.insert(it.first.second);
                }
            }
        }
    }

    if (machinesWithTransaction.size() < logCount || totalTransactionProcesses < desiredTotalTransactionProcesses)
        return ConfigureAutoResult();

    result.auto_processes_with_transaction = totalTransactionProcesses;
    result.auto_machines_with_transaction = machinesWithTransaction.size();

    // Refuse the proposal when transaction processes would exceed a third of the cluster.
    if (3 * totalTransactionProcesses > processCount)
        return ConfigureAutoResult();

    return result;
}

// Reads the worker list and process-class overrides in the caller's transaction, applies DB-set
// class overrides, and returns all non-tester workers.
// NOTE(review): presumably Future<std::vector<ProcessData>>; template args stripped by extraction.
ACTOR Future> getWorkers(Transaction* tr) {
    state Future processClasses = tr->getRange(processClassKeys, CLIENT_KNOBS->TOO_MANY);
    state Future processData = tr->getRange(workerListKeys, CLIENT_KNOBS->TOO_MANY);
    wait(success(processClasses) && success(processData));
    ASSERT(!processClasses.get().more && processClasses.get().size() < CLIENT_KNOBS->TOO_MANY);
    ASSERT(!processData.get().more && processData.get().size() < CLIENT_KNOBS->TOO_MANY);

    // Decode the explicit class assignments keyed by process id.
    std::map>, ProcessClass> id_class;
    for (int i = 0; i < processClasses.get().size(); i++) {
        id_class[decodeProcessClassKey(processClasses.get()[i].key)] =
            decodeProcessClassValue(processClasses.get()[i].value);
    }

    std::vector results;
    for (int i = 0; i < processData.get().size(); i++) {
        ProcessData data = decodeWorkerListValue(processData.get()[i].value);
        ProcessClass processClass = id_class[data.locality.processId()];

        // A database-sourced override (or an unset worker class) takes the stored class.
        if (processClass.classSource() == ProcessClass::DBSource ||
            data.processClass.classType() == ProcessClass::UnsetClass)
            data.processClass = processClass;

        if (data.processClass.classType() != ProcessClass::TesterClass)
            results.push_back(data);
    }

    return results;
}

// Retry-loop wrapper around the Transaction* overload above.
ACTOR Future> getWorkers(Database cx) {
    state Transaction tr(cx);
    loop {
        try {
            tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); // necessary?
            tr.setOption(FDBTransactionOptions::LOCK_AWARE);
            std::vector workers = wait(getWorkers(&tr));
            return workers;
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}

// Reads the cluster connection string stored under coordinatorsKey; empty Optional when the
// key is absent.
ACTOR Future> getConnectionString(Database cx) {
    state Transaction tr(cx);
    loop {
        try {
            tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
            tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
            Optional currentKey = wait(tr.get(coordinatorsKey));
            if (!currentKey.present())
                return Optional();
            return ClusterConnectionString(currentKey.get().toString());
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}

// History of connection strings issued by quorum-change requests; used in simulation to detect
// commit_unknown_result violations (see getClusterConnectionStringFromStorageServer).
static std::vector connectionStrings;

namespace {

// Reads the coordinators connection string from the storage servers (not the coordinators
// themselves), retrying a bad/missing state up to CHANGE_QUORUM_BAD_STATE_RETRY_TIMES before
// giving up with an empty Optional.
ACTOR Future> getClusterConnectionStringFromStorageServer(Transaction* tr) {
    tr->setOption(FDBTransactionOptions::LOCK_AWARE);
    tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
    tr->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
    tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
    state int retryTimes = 0;
    loop {
        if (retryTimes >=
            CLIENT_KNOBS->CHANGE_QUORUM_BAD_STATE_RETRY_TIMES) {
            return Optional();
        }

        Version readVersion = wait(tr->getReadVersion());
        state Optional currentKey = wait(tr->get(coordinatorsKey));

        if (g_network->isSimulated() && currentKey.present()) {
            // If the change coordinators request succeeded, the coordinators
            // should have changed to the connection string of the most
            // recently issued request. If instead the connection string is
            // equal to one of the previously issued requests, there is a bug
            // and we are breaking the promises we make with
            // commit_unknown_result (the transaction must no longer be in
            // progress when receiving this error).
            int n = connectionStrings.size() > 0 ? connectionStrings.size() - 1 : 0; // avoid underflow
            for (int i = 0; i < n; ++i) {
                ASSERT(currentKey.get() != connectionStrings.at(i));
            }
        }

        if (!currentKey.present()) {
            // Someone deleted this key entirely?
            ++retryTimes;
            wait(delay(CLIENT_KNOBS->CHANGE_QUORUM_BAD_STATE_RETRY_DELAY));
            continue;
        }

        state ClusterConnectionString clusterConnectionString(currentKey.get().toString());
        if (tr->getDatabase()->getConnectionRecord() &&
            clusterConnectionString.clusterKeyName().toString() !=
                tr->getDatabase()->getConnectionRecord()->getConnectionString().clusterKeyName()) {
            // Someone changed the "name" of the database??
            ++retryTimes;
            wait(delay(CLIENT_KNOBS->CHANGE_QUORUM_BAD_STATE_RETRY_DELAY));
            continue;
        }

        return clusterConnectionString;
    }
}

} // namespace

// Validates and stages a coordinator-set change inside the caller's transaction: resolves the
// requested coordinators, verifies they are reachable (leader-server ping with a 5s timeout),
// generates a fresh cluster key, and sets coordinatorsKey. Returns an empty Optional on success
// (the caller commits), or a CoordinatorsResult describing why the change cannot proceed.
// NOTE(review): presumably Future<Optional<CoordinatorsResult>>; template args stripped by extraction.
ACTOR Future> changeQuorumChecker(Transaction* tr, ClusterConnectionString* conn, std::string newName) {
    state Optional clusterConnectionStringOptional =
        wait(getClusterConnectionStringFromStorageServer(tr));

    if (!clusterConnectionStringOptional.present()) {
        return CoordinatorsResult::BAD_DATABASE_STATE;
    }

    // The cluster connection string stored in the storage server
    state ClusterConnectionString old = clusterConnectionStringOptional.get();

    if (conn->hostnames.size() + conn->coords.size() == 0) {
        // No coordinators requested: keep the current set (e.g. a rename-only change).
        conn->hostnames = old.hostnames;
        conn->coords = old.coords;
    }
    std::vector desiredCoordinators = wait(conn->tryResolveHostnames());
    if (desiredCoordinators.size() != conn->hostnames.size() + conn->coords.size()) {
        TraceEvent("ChangeQuorumCheckerEarlyTermination")
            .detail("Reason", "One or more hostnames are unresolvable")
            .backtrace();
        return CoordinatorsResult::COORDINATOR_UNREACHABLE;
    }

    if (newName.empty()) {
        newName = old.clusterKeyName().toString();
    }
    // Sort both sides so set equality can be checked positionally.
    std::sort(conn->hostnames.begin(), conn->hostnames.end());
    std::sort(conn->coords.begin(), conn->coords.end());
    std::sort(old.hostnames.begin(), old.hostnames.end());
    std::sort(old.coords.begin(), old.coords.end());
    if (conn->hostnames == old.hostnames && conn->coords == old.coords && old.clusterKeyName() == newName) {
        // No-op change: reset the simulation history and report "same addresses".
        connectionStrings.clear();
        return CoordinatorsResult::SAME_NETWORK_ADDRESSES;
    }

    conn->parseKey(newName + ':' + deterministicRandom()->randomAlphaNumeric(32));
    connectionStrings.push_back(conn->toString());

    if (g_network->isSimulated()) {
        // Protect a majority of the new coordinators from being killed by the simulator.
        int i = 0;
        int protectedCount = 0;
        while ((protectedCount < ((desiredCoordinators.size() / 2) + 1)) && (i < desiredCoordinators.size())) {
            auto process = g_simulator.getProcessByAddress(desiredCoordinators[i]);
            auto addresses = process->addresses;

            if (!process->isReliable()) {
                i++;
                continue;
            }
            g_simulator.protectedAddresses.insert(process->addresses.address);
            if (addresses.secondaryAddress.present()) {
                g_simulator.protectedAddresses.insert(process->addresses.secondaryAddress.get());
            }
            TraceEvent("ProtectCoordinator").detail("Address", desiredCoordinators[i]).backtrace();
            protectedCount++;
            i++;
        }
    }

    // Confirm every requested coordinator responds to a leader request within 5 seconds before
    // committing the new connection string.
    std::vector>> leaderServers;
    ClientCoordinators coord(Reference(new ClusterConnectionMemoryRecord(*conn)));

    leaderServers.reserve(coord.clientLeaderServers.size());
    for (int i = 0; i < coord.clientLeaderServers.size(); i++) {
        if (coord.clientLeaderServers[i].hostname.present()) {
            leaderServers.push_back(retryGetReplyFromHostname(GetLeaderRequest(coord.clusterKey, UID()),
                                                              coord.clientLeaderServers[i].hostname.get(),
                                                              WLTOKEN_CLIENTLEADERREG_GETLEADER,
                                                              TaskPriority::CoordinationReply));
        } else {
            leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader,
                                                       GetLeaderRequest(coord.clusterKey, UID()),
                                                       TaskPriority::CoordinationReply));
        }
    }

    choose {
        when(wait(waitForAll(leaderServers))) {}
        when(wait(delay(5.0))) {
            return CoordinatorsResult::COORDINATOR_UNREACHABLE;
        }
    }
    tr->set(coordinatorsKey, conn->toString());
    return Optional();
}

// Drives a full coordinator change to completion: repeatedly computes the desired coordinator
// set via the IQuorumChange strategy, verifies reachability, and commits the new connection
// string. The commit is expected to fail with commit_unknown_result (the coordinators key
// changing invalidates the transaction), so success is detected on a later iteration via
// retries/SAME_NETWORK_ADDRESSES.
ACTOR Future changeQuorum(Database cx, Reference change) {
    state Transaction tr(cx);
    state int retries = 0;
    state std::vector desiredCoordinators;
    state int notEnoughMachineResults = 0;

    loop {
        try {
            state Optional clusterConnectionStringOptional =
                wait(getClusterConnectionStringFromStorageServer(&tr));

            if (!clusterConnectionStringOptional.present()) {
                return CoordinatorsResult::BAD_DATABASE_STATE;
            }

            // The cluster connection string stored in the storage server
            state ClusterConnectionString oldClusterConnectionString = clusterConnectionStringOptional.get();
            state Key oldClusterKeyName = oldClusterConnectionString.clusterKeyName();

            state std::vector oldCoordinators = wait(oldClusterConnectionString.tryResolveHostnames());

            state CoordinatorsResult result = CoordinatorsResult::SUCCESS;
            if (!desiredCoordinators.size()) {
                std::vector _desiredCoordinators = wait(
                    change->getDesiredCoordinators(&tr,
                                                   oldCoordinators,
                                                   Reference(
                                                       new ClusterConnectionMemoryRecord(oldClusterConnectionString)),
                                                   result));
                desiredCoordinators = _desiredCoordinators;
            }

            if (result == CoordinatorsResult::NOT_ENOUGH_MACHINES && notEnoughMachineResults < 1) {
                // we could get not_enough_machines if we happen to see the database while the cluster controller is
                // updating the worker list, so make sure it happens twice before returning a failure
                notEnoughMachineResults++;
                wait(delay(1.0));
                tr.reset();
                continue;
            }
            if (result != CoordinatorsResult::SUCCESS)
                return result;
            if (!desiredCoordinators.size())
                return CoordinatorsResult::INVALID_NETWORK_ADDRESSES;
            std::sort(desiredCoordinators.begin(), desiredCoordinators.end());

            std::string newName = change->getDesiredClusterKeyName();
            if (newName.empty())
                newName = oldClusterKeyName.toString();

            // On a retry, an identical set means the earlier (unacknowledged) commit actually applied.
            if (oldCoordinators == desiredCoordinators && oldClusterKeyName == newName)
                return retries ? CoordinatorsResult::SUCCESS : CoordinatorsResult::SAME_NETWORK_ADDRESSES;

            state ClusterConnectionString newClusterConnectionString(
                desiredCoordinators, StringRef(newName + ':' + deterministicRandom()->randomAlphaNumeric(32)));
            state Key newClusterKeyName = newClusterConnectionString.clusterKeyName();

            if (g_network->isSimulated()) {
                // Protect a majority of the new coordinators from simulated kills.
                for (int i = 0; i < (desiredCoordinators.size() / 2) + 1; i++) {
                    auto process = g_simulator.getProcessByAddress(desiredCoordinators[i]);
                    ASSERT(process->isReliable() || process->rebooting);

                    g_simulator.protectedAddresses.insert(process->addresses.address);
                    if (process->addresses.secondaryAddress.present()) {
                        g_simulator.protectedAddresses.insert(process->addresses.secondaryAddress.get());
                    }
                    TraceEvent("ProtectCoordinator").detail("Address", desiredCoordinators[i]).backtrace();
                }
            }

            TraceEvent("AttemptingQuorumChange")
                .detail("FromCS", oldClusterConnectionString.toString())
                .detail("ToCS", newClusterConnectionString.toString());
            CODE_PROBE(oldClusterKeyName != newClusterKeyName, "Quorum change with new name");
            CODE_PROBE(oldClusterKeyName == newClusterKeyName, "Quorum change with unchanged name");

            state std::vector>> leaderServers;
            state ClientCoordinators coord(Reference(
                new ClusterConnectionMemoryRecord(newClusterConnectionString)));
            // check if allowed to modify the cluster descriptor
            if (!change->getDesiredClusterKeyName().empty()) {
                CheckDescriptorMutableReply mutabilityReply =
                    wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply(CheckDescriptorMutableRequest()));
                if (!mutabilityReply.isMutable) {
                    return CoordinatorsResult::BAD_DATABASE_STATE;
                }
            }
            leaderServers.reserve(coord.clientLeaderServers.size());
            for (int i = 0; i < coord.clientLeaderServers.size(); i++)
                leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader,
                                                           GetLeaderRequest(coord.clusterKey, UID()),
                                                           TaskPriority::CoordinationReply));
            choose {
                when(wait(waitForAll(leaderServers))) {}
                when(wait(delay(5.0))) {
                    return CoordinatorsResult::COORDINATOR_UNREACHABLE;
                }
            }

            tr.set(coordinatorsKey, newClusterConnectionString.toString());

            wait(tr.commit());
            ASSERT(false); // commit should fail, but the value has changed
        } catch (Error& e) {
            TraceEvent("RetryQuorumChange").error(e).detail("Retries", retries);
            wait(tr.onError(e));
            ++retries;
        }
    }
}

// IQuorumChange decorator that delegates coordinator selection to another change while
// overriding the desired cluster key name.
struct NameQuorumChange final : IQuorumChange {
    std::string newName;
    Reference otherChange;
    explicit NameQuorumChange(std::string const& newName, Reference const& otherChange)
      : newName(newName), otherChange(otherChange) {}
    Future> getDesiredCoordinators(Transaction* tr,
                                   std::vector oldCoordinators,
                                   Reference ccr,
                                   CoordinatorsResult& t) override {
        return otherChange->getDesiredCoordinators(tr, oldCoordinators, ccr, t);
    }
    std::string getDesiredClusterKeyName() const override { return newName; }
};

// Factory for NameQuorumChange.
Reference nameQuorumChange(std::string const& name, Reference const& other) {
    return Reference(new NameQuorumChange(name, other));
}

struct AutoQuorumChange final : IQuorumChange {
    int desired;
explicit AutoQuorumChange(int desired) : desired(desired) {} Future> getDesiredCoordinators(Transaction* tr, std::vector oldCoordinators, Reference ccr, CoordinatorsResult& err) override { return getDesired(Reference::addRef(this), tr, oldCoordinators, ccr, &err); } ACTOR static Future getRedundancy(AutoQuorumChange* self, Transaction* tr) { state Future> fStorageReplicas = tr->get(LiteralStringRef("storage_replicas").withPrefix(configKeysPrefix)); state Future> fLogReplicas = tr->get(LiteralStringRef("log_replicas").withPrefix(configKeysPrefix)); wait(success(fStorageReplicas) && success(fLogReplicas)); int redundancy = std::min(atoi(fStorageReplicas.get().get().toString().c_str()), atoi(fLogReplicas.get().get().toString().c_str())); return redundancy; } ACTOR static Future isAcceptable(AutoQuorumChange* self, Transaction* tr, std::vector oldCoordinators, Reference ccr, int desiredCount, std::set* excluded) { ClusterConnectionString cs = ccr->getConnectionString(); if (oldCoordinators.size() != cs.hostnames.size() + cs.coords.size()) { return false; } // Are there enough coordinators for the redundancy level? if (oldCoordinators.size() < desiredCount) return false; if (oldCoordinators.size() % 2 != 1) return false; // Check exclusions for (auto& c : oldCoordinators) { if (addressExcluded(*excluded, c)) return false; } // Check locality // FIXME: Actual locality! 
std::sort(oldCoordinators.begin(), oldCoordinators.end()); for (int i = 1; i < oldCoordinators.size(); i++) if (oldCoordinators[i - 1].ip == oldCoordinators[i].ip) return false; // Multiple coordinators share an IP // Check availability ClientCoordinators coord(ccr); std::vector>> leaderServers; leaderServers.reserve(coord.clientLeaderServers.size()); for (int i = 0; i < coord.clientLeaderServers.size(); i++) { if (coord.clientLeaderServers[i].hostname.present()) { leaderServers.push_back(retryGetReplyFromHostname(GetLeaderRequest(coord.clusterKey, UID()), coord.clientLeaderServers[i].hostname.get(), WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskPriority::CoordinationReply)); } else { leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, GetLeaderRequest(coord.clusterKey, UID()), TaskPriority::CoordinationReply)); } } Optional>> results = wait(timeout(getAll(leaderServers), CLIENT_KNOBS->IS_ACCEPTABLE_DELAY)); if (!results.present()) { return false; } // Not all responded for (auto& r : results.get()) { if (!r.present()) { return false; // Coordinator doesn't know about this database? 
} } return true; // The status quo seems fine } ACTOR static Future> getDesired(Reference self, Transaction* tr, std::vector oldCoordinators, Reference ccr, CoordinatorsResult* err) { state int desiredCount = self->desired; if (desiredCount == -1) { int redundancy = wait(getRedundancy(self.getPtr(), tr)); desiredCount = redundancy * 2 - 1; } std::vector excl = wait(getExcludedServers(tr)); state std::set excluded(excl.begin(), excl.end()); std::vector _workers = wait(getWorkers(tr)); state std::vector workers = _workers; std::map addr_locality; for (auto w : workers) addr_locality[w.address] = w.locality; // since we don't have the locality data for oldCoordinators: // check if every old coordinator is in the workers vector and // check if multiple old coordinators map to the same locality data (same machine) bool checkAcceptable = true; std::set>> checkDuplicates; for (auto addr : oldCoordinators) { auto findResult = addr_locality.find(addr); if (findResult == addr_locality.end() || checkDuplicates.count(findResult->second.zoneId())) { checkAcceptable = false; break; } checkDuplicates.insert(findResult->second.zoneId()); } if (checkAcceptable) { bool ok = wait(isAcceptable(self.getPtr(), tr, oldCoordinators, ccr, desiredCount, &excluded)); if (ok) { *err = CoordinatorsResult::SAME_NETWORK_ADDRESSES; return oldCoordinators; } } std::vector chosen; self->addDesiredWorkers(chosen, workers, desiredCount, excluded); if (chosen.size() < desiredCount) { if (chosen.empty() || chosen.size() < oldCoordinators.size()) { TraceEvent("NotEnoughMachinesForCoordinators") .detail("EligibleWorkers", workers.size()) .detail("ChosenWorkers", chosen.size()) .detail("DesiredCoordinators", desiredCount) .detail("CurrentCoordinators", oldCoordinators.size()); *err = CoordinatorsResult::NOT_ENOUGH_MACHINES; return std::vector(); } chosen.resize((chosen.size() - 1) | 1); } return chosen; } // Select a desired set of workers such that // (1) the number of workers at each locality type 
(e.g., dcid) <= desiredCount; and // (2) prefer workers at a locality where less workers has been chosen than other localities: evenly distribute // workers. void addDesiredWorkers(std::vector& chosen, const std::vector& workers, int desiredCount, const std::set& excluded) { std::vector remainingWorkers(workers); deterministicRandom()->randomShuffle(remainingWorkers); std::partition(remainingWorkers.begin(), remainingWorkers.end(), [](const ProcessData& data) { return (data.processClass == ProcessClass::CoordinatorClass); }); TraceEvent(SevDebug, "AutoSelectCoordinators").detail("CandidateWorkers", remainingWorkers.size()); for (auto worker = remainingWorkers.begin(); worker != remainingWorkers.end(); worker++) { TraceEvent(SevDebug, "AutoSelectCoordinators") .detail("Worker", worker->processClass.toString()) .detail("Address", worker->address.toString()) .detail("Locality", worker->locality.toString()); } TraceEvent(SevDebug, "AutoSelectCoordinators").detail("ExcludedAddress", excluded.size()); for (auto& excludedAddr : excluded) { TraceEvent(SevDebug, "AutoSelectCoordinators").detail("ExcludedAddress", excludedAddr.toString()); } std::map maxCounts; std::map> currentCounts; std::map hardLimits; std::vector fields({ LiteralStringRef("dcid"), LiteralStringRef("data_hall"), LiteralStringRef("zoneid"), LiteralStringRef("machineid") }); for (auto field = fields.begin(); field != fields.end(); field++) { if (field->toString() == "zoneid") { hardLimits[*field] = 1; } else { hardLimits[*field] = desiredCount; } } while (chosen.size() < desiredCount) { bool found = false; for (auto worker = remainingWorkers.begin(); worker != remainingWorkers.end(); worker++) { if (addressExcluded(excluded, worker->address)) { continue; } // Exclude faulty node due to machine assassination if (g_network->isSimulated() && !g_simulator.getProcessByAddress(worker->address)->isReliable()) { TraceEvent("AutoSelectCoordinators").detail("SkipUnreliableWorker", worker->address.toString()); 
continue; } bool valid = true; for (auto field = fields.begin(); field != fields.end(); field++) { if (maxCounts[*field] == 0) { maxCounts[*field] = 1; } auto value = worker->locality.get(*field).orDefault(LiteralStringRef("")); auto currentCount = currentCounts[*field][value]; if (currentCount >= maxCounts[*field]) { valid = false; break; } } if (valid) { for (auto field = fields.begin(); field != fields.end(); field++) { auto value = worker->locality.get(*field).orDefault(LiteralStringRef("")); currentCounts[*field][value] += 1; } chosen.push_back(worker->address); remainingWorkers.erase(worker); found = true; break; } } if (!found) { bool canIncrement = false; for (auto field = fields.begin(); field != fields.end(); field++) { if (maxCounts[*field] < hardLimits[*field]) { maxCounts[*field] += 1; canIncrement = true; break; } } if (!canIncrement) { break; } } } } }; Reference autoQuorumChange(int desired) { return Reference(new AutoQuorumChange(desired)); } void excludeServers(Transaction& tr, std::vector& servers, bool failed) { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES); std::string excludeVersionKey = deterministicRandom()->randomUniqueID().toString(); auto serversVersionKey = failed ? 
failedServersVersionKey : excludedServersVersionKey; tr.addReadConflictRange(singleKeyRange(serversVersionKey)); // To conflict with parallel includeServers tr.set(serversVersionKey, excludeVersionKey); for (auto& s : servers) { if (failed) { tr.set(encodeFailedServersKey(s), StringRef()); } else { tr.set(encodeExcludedServersKey(s), StringRef()); } } TraceEvent("ExcludeServersCommit").detail("Servers", describe(servers)).detail("ExcludeFailed", failed); } ACTOR Future excludeServers(Database cx, std::vector servers, bool failed) { if (cx->apiVersionAtLeast(700)) { state ReadYourWritesTransaction ryw(cx); loop { try { ryw.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); ryw.set( SpecialKeySpace::getManagementApiCommandOptionSpecialKey(failed ? "failed" : "excluded", "force"), ValueRef()); for (auto& s : servers) { Key addr = failed ? SpecialKeySpace::getManagementApiCommandPrefix("failed").withSuffix(s.toString()) : SpecialKeySpace::getManagementApiCommandPrefix("exclude").withSuffix(s.toString()); ryw.set(addr, ValueRef()); } TraceEvent("ExcludeServersSpecialKeySpaceCommit") .detail("Servers", describe(servers)) .detail("ExcludeFailed", failed); wait(ryw.commit()); return Void(); } catch (Error& e) { TraceEvent("ExcludeServersError").errorUnsuppressed(e); wait(ryw.onError(e)); } } } else { state Transaction tr(cx); loop { try { excludeServers(tr, servers, failed); wait(tr.commit()); return Void(); } catch (Error& e) { TraceEvent("ExcludeServersError").errorUnsuppressed(e); wait(tr.onError(e)); } } } } // excludes localities by setting the keys in api version below 7.0 void excludeLocalities(Transaction& tr, std::unordered_set localities, bool failed) { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES); std::string excludeVersionKey = 
deterministicRandom()->randomUniqueID().toString(); auto localityVersionKey = failed ? failedLocalityVersionKey : excludedLocalityVersionKey; tr.addReadConflictRange(singleKeyRange(localityVersionKey)); // To conflict with parallel includeLocalities tr.set(localityVersionKey, excludeVersionKey); for (const auto& l : localities) { if (failed) { tr.set(encodeFailedLocalityKey(l), StringRef()); } else { tr.set(encodeExcludedLocalityKey(l), StringRef()); } } TraceEvent("ExcludeLocalitiesCommit").detail("Localities", describe(localities)).detail("ExcludeFailed", failed); } // Exclude the servers matching the given set of localities from use as state servers. // excludes localities by setting the keys. ACTOR Future excludeLocalities(Database cx, std::unordered_set localities, bool failed) { if (cx->apiVersionAtLeast(700)) { state ReadYourWritesTransaction ryw(cx); loop { try { ryw.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); ryw.set(SpecialKeySpace::getManagementApiCommandOptionSpecialKey( failed ? "failed_locality" : "excluded_locality", "force"), ValueRef()); for (const auto& l : localities) { Key addr = failed ? 
SpecialKeySpace::getManagementApiCommandPrefix("failedlocality").withSuffix(l) : SpecialKeySpace::getManagementApiCommandPrefix("excludedlocality").withSuffix(l); ryw.set(addr, ValueRef()); } TraceEvent("ExcludeLocalitiesSpecialKeySpaceCommit") .detail("Localities", describe(localities)) .detail("ExcludeFailed", failed); wait(ryw.commit()); return Void(); } catch (Error& e) { TraceEvent("ExcludeLocalitiesError").errorUnsuppressed(e); wait(ryw.onError(e)); } } } else { state Transaction tr(cx); loop { try { excludeLocalities(tr, localities, failed); wait(tr.commit()); return Void(); } catch (Error& e) { TraceEvent("ExcludeLocalitiesError").errorUnsuppressed(e); wait(tr.onError(e)); } } } } ACTOR Future includeServers(Database cx, std::vector servers, bool failed) { state std::string versionKey = deterministicRandom()->randomUniqueID().toString(); if (cx->apiVersionAtLeast(700)) { state ReadYourWritesTransaction ryw(cx); loop { try { ryw.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); for (auto& s : servers) { if (!s.isValid()) { if (failed) { ryw.clear(SpecialKeySpace::getManagementApiCommandRange("failed")); } else { ryw.clear(SpecialKeySpace::getManagementApiCommandRange("exclude")); } } else { Key addr = failed ? SpecialKeySpace::getManagementApiCommandPrefix("failed").withSuffix(s.toString()) : SpecialKeySpace::getManagementApiCommandPrefix("exclude").withSuffix(s.toString()); ryw.clear(addr); // Eliminate both any ip-level exclusion (1.2.3.4) and any // port-level exclusions (1.2.3.4:5) // The range ['IP', 'IP;'] was originally deleted. ';' is // char(':' + 1). This does not work, as other for all // x between 0 and 9, 'IPx' will also be in this range. // // This is why we now make two clears: first only of the ip // address, the second will delete all ports. 
if (s.isWholeMachine()) ryw.clear(KeyRangeRef(addr.withSuffix(LiteralStringRef(":")), addr.withSuffix(LiteralStringRef(";")))); } } TraceEvent("IncludeServersCommit").detail("Servers", describe(servers)).detail("Failed", failed); wait(ryw.commit()); return Void(); } catch (Error& e) { TraceEvent("IncludeServersError").errorUnsuppressed(e); wait(ryw.onError(e)); } } } else { state Transaction tr(cx); loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES); // includeServers might be used in an emergency transaction, so make sure it is retry-self-conflicting // and CAUSAL_WRITE_RISKY tr.setOption(FDBTransactionOptions::CAUSAL_WRITE_RISKY); if (failed) { tr.addReadConflictRange(singleKeyRange(failedServersVersionKey)); tr.set(failedServersVersionKey, versionKey); } else { tr.addReadConflictRange(singleKeyRange(excludedServersVersionKey)); tr.set(excludedServersVersionKey, versionKey); } for (auto& s : servers) { if (!s.isValid()) { if (failed) { tr.clear(failedServersKeys); } else { tr.clear(excludedServersKeys); } } else if (s.isWholeMachine()) { // Eliminate both any ip-level exclusion (1.2.3.4) and any // port-level exclusions (1.2.3.4:5) // The range ['IP', 'IP;'] was originally deleted. ';' is // char(':' + 1). This does not work, as other for all // x between 0 and 9, 'IPx' will also be in this range. // // This is why we now make two clears: first only of the ip // address, the second will delete all ports. auto addr = failed ? 
encodeFailedServersKey(s) : encodeExcludedServersKey(s); tr.clear(singleKeyRange(addr)); tr.clear(KeyRangeRef(addr + ':', addr + char(':' + 1))); } else { if (failed) { tr.clear(encodeFailedServersKey(s)); } else { tr.clear(encodeExcludedServersKey(s)); } } } TraceEvent("IncludeServersCommit").detail("Servers", describe(servers)).detail("Failed", failed); wait(tr.commit()); return Void(); } catch (Error& e) { TraceEvent("IncludeServersError").errorUnsuppressed(e); wait(tr.onError(e)); } } } } // Remove the given localities from the exclusion list. // include localities by clearing the keys. ACTOR Future includeLocalities(Database cx, std::vector localities, bool failed, bool includeAll) { state std::string versionKey = deterministicRandom()->randomUniqueID().toString(); if (cx->apiVersionAtLeast(700)) { state ReadYourWritesTransaction ryw(cx); loop { try { ryw.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); if (includeAll) { if (failed) { ryw.clear(SpecialKeySpace::getManagementApiCommandRange("failedlocality")); } else { ryw.clear(SpecialKeySpace::getManagementApiCommandRange("excludedlocality")); } } else { for (const auto& l : localities) { Key locality = failed ? 
SpecialKeySpace::getManagementApiCommandPrefix("failedlocality").withSuffix(l) : SpecialKeySpace::getManagementApiCommandPrefix("excludedlocality").withSuffix(l); ryw.clear(locality); } } TraceEvent("IncludeLocalitiesCommit") .detail("Localities", describe(localities)) .detail("Failed", failed) .detail("IncludeAll", includeAll); wait(ryw.commit()); return Void(); } catch (Error& e) { TraceEvent("IncludeLocalitiesError").errorUnsuppressed(e); wait(ryw.onError(e)); } } } else { state Transaction tr(cx); loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES); // includeLocalities might be used in an emergency transaction, so make sure it is // retry-self-conflicting and CAUSAL_WRITE_RISKY tr.setOption(FDBTransactionOptions::CAUSAL_WRITE_RISKY); if (failed) { tr.addReadConflictRange(singleKeyRange(failedLocalityVersionKey)); tr.set(failedLocalityVersionKey, versionKey); } else { tr.addReadConflictRange(singleKeyRange(excludedLocalityVersionKey)); tr.set(excludedLocalityVersionKey, versionKey); } if (includeAll) { if (failed) { tr.clear(failedLocalityKeys); } else { tr.clear(excludedLocalityKeys); } } else { for (const auto& l : localities) { if (failed) { tr.clear(encodeFailedLocalityKey(l)); } else { tr.clear(encodeExcludedLocalityKey(l)); } } } TraceEvent("IncludeLocalitiesCommit") .detail("Localities", describe(localities)) .detail("Failed", failed) .detail("IncludeAll", includeAll); wait(tr.commit()); return Void(); } catch (Error& e) { TraceEvent("IncludeLocalitiesError").errorUnsuppressed(e); wait(tr.onError(e)); } } } } ACTOR Future setClass(Database cx, AddressExclusion server, ProcessClass processClass) { state Transaction tr(cx); loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); 
tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES); std::vector workers = wait(getWorkers(&tr)); bool foundChange = false; for (int i = 0; i < workers.size(); i++) { if (server.excludes(workers[i].address)) { if (processClass.classType() != ProcessClass::InvalidClass) tr.set(processClassKeyFor(workers[i].locality.processId().get()), processClassValue(processClass)); else tr.clear(processClassKeyFor(workers[i].locality.processId().get())); foundChange = true; } } if (foundChange) tr.set(processClassChangeKey, deterministicRandom()->randomUniqueID().toString()); wait(tr.commit()); return Void(); } catch (Error& e) { wait(tr.onError(e)); } } } ACTOR Future> getExcludedServers(Transaction* tr) { state RangeResult r = wait(tr->getRange(excludedServersKeys, CLIENT_KNOBS->TOO_MANY)); ASSERT(!r.more && r.size() < CLIENT_KNOBS->TOO_MANY); state RangeResult r2 = wait(tr->getRange(failedServersKeys, CLIENT_KNOBS->TOO_MANY)); ASSERT(!r2.more && r2.size() < CLIENT_KNOBS->TOO_MANY); std::vector exclusions; for (auto i = r.begin(); i != r.end(); ++i) { auto a = decodeExcludedServersKey(i->key); if (a.isValid()) exclusions.push_back(a); } for (auto i = r2.begin(); i != r2.end(); ++i) { auto a = decodeFailedServersKey(i->key); if (a.isValid()) exclusions.push_back(a); } uniquify(exclusions); return exclusions; } ACTOR Future> getExcludedServers(Database cx) { state Transaction tr(cx); loop { try { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); // necessary? tr.setOption(FDBTransactionOptions::LOCK_AWARE); std::vector exclusions = wait(getExcludedServers(&tr)); return exclusions; } catch (Error& e) { wait(tr.onError(e)); } } } // Get the current list of excluded localities by reading the keys. 
ACTOR Future> getExcludedLocalities(Transaction* tr) { state RangeResult r = wait(tr->getRange(excludedLocalityKeys, CLIENT_KNOBS->TOO_MANY)); ASSERT(!r.more && r.size() < CLIENT_KNOBS->TOO_MANY); state RangeResult r2 = wait(tr->getRange(failedLocalityKeys, CLIENT_KNOBS->TOO_MANY)); ASSERT(!r2.more && r2.size() < CLIENT_KNOBS->TOO_MANY); std::vector excludedLocalities; for (const auto& i : r) { auto a = decodeExcludedLocalityKey(i.key); excludedLocalities.push_back(a); } for (const auto& i : r2) { auto a = decodeFailedLocalityKey(i.key); excludedLocalities.push_back(a); } uniquify(excludedLocalities); return excludedLocalities; } // Get the list of excluded localities by reading the keys. ACTOR Future> getExcludedLocalities(Database cx) { state Transaction tr(cx); loop { try { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr.setOption(FDBTransactionOptions::LOCK_AWARE); std::vector exclusions = wait(getExcludedLocalities(&tr)); return exclusions; } catch (Error& e) { wait(tr.onError(e)); } } } // Decodes the locality string to a pair of locality prefix and its value. // The prefix could be dcid, processid, machineid, processid. std::pair decodeLocality(const std::string& locality) { StringRef localityRef((const uint8_t*)(locality.c_str()), locality.size()); std::string localityKeyValue = localityRef.removePrefix(LocalityData::ExcludeLocalityPrefix).toString(); int split = localityKeyValue.find(':'); if (split != std::string::npos) { return std::make_pair(localityKeyValue.substr(0, split), localityKeyValue.substr(split + 1)); } return std::make_pair("", ""); } // Returns the list of IPAddresses of the workers that match the given locality. // Example: locality="dcid:primary" returns all the ip addresses of the workers in the primary dc. 
std::set getAddressesByLocality(const std::vector& workers, const std::string& locality) { std::pair localityKeyValue = decodeLocality(locality); std::set localityAddresses; for (int i = 0; i < workers.size(); i++) { if (workers[i].locality.isPresent(localityKeyValue.first) && workers[i].locality.get(localityKeyValue.first) == localityKeyValue.second) { localityAddresses.insert(AddressExclusion(workers[i].address.ip, workers[i].address.port)); } } return localityAddresses; } ACTOR Future printHealthyZone(Database cx) { state Transaction tr(cx); loop { try { tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); Optional val = wait(tr.get(healthyZoneKey)); if (val.present() && decodeHealthyZoneValue(val.get()).first == ignoreSSFailuresZoneString) { printf("Data distribution has been disabled for all storage server failures in this cluster and thus " "maintenance mode is not active.\n"); } else if (!val.present() || decodeHealthyZoneValue(val.get()).second <= tr.getReadVersion().get()) { printf("No ongoing maintenance.\n"); } else { auto healthyZone = decodeHealthyZoneValue(val.get()); fmt::print("Maintenance for zone {0} will continue for {1} seconds.\n", healthyZone.first.toString(), (healthyZone.second - tr.getReadVersion().get()) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND); } return Void(); } catch (Error& e) { wait(tr.onError(e)); } } } ACTOR Future clearHealthyZone(Database cx, bool printWarning, bool clearSSFailureZoneString) { state Transaction tr(cx); TraceEvent("ClearHealthyZone").detail("ClearSSFailureZoneString", clearSSFailureZoneString); loop { try { tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); Optional val = wait(tr.get(healthyZoneKey)); if (!clearSSFailureZoneString && val.present() && decodeHealthyZoneValue(val.get()).first == ignoreSSFailuresZoneString) { if (printWarning) { 
printf("ERROR: Maintenance mode cannot be used while data distribution is disabled for storage " "server failures. Use 'datadistribution on' to reenable data distribution.\n"); } return false; } tr.clear(healthyZoneKey); wait(tr.commit()); return true; } catch (Error& e) { wait(tr.onError(e)); } } } ACTOR Future setHealthyZone(Database cx, StringRef zoneId, double seconds, bool printWarning) { state Transaction tr(cx); TraceEvent("SetHealthyZone").detail("Zone", zoneId).detail("DurationSeconds", seconds); loop { try { tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); Optional val = wait(tr.get(healthyZoneKey)); if (val.present() && decodeHealthyZoneValue(val.get()).first == ignoreSSFailuresZoneString) { if (printWarning) { printf("ERROR: Maintenance mode cannot be used while data distribution is disabled for storage " "server failures. Use 'datadistribution on' to reenable data distribution.\n"); } return false; } Version readVersion = wait(tr.getReadVersion()); tr.set(healthyZoneKey, healthyZoneValue(zoneId, readVersion + (seconds * CLIENT_KNOBS->CORE_VERSIONSPERSECOND))); wait(tr.commit()); return true; } catch (Error& e) { wait(tr.onError(e)); } } } ACTOR Future setDDMode(Database cx, int mode) { state Transaction tr(cx); state int oldMode = -1; state BinaryWriter wr(Unversioned()); wr << mode; loop { try { tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); Optional old = wait(tr.get(dataDistributionModeKey)); if (oldMode < 0) { oldMode = 1; if (old.present()) { BinaryReader rd(old.get(), Unversioned()); rd >> oldMode; } } BinaryWriter wrMyOwner(Unversioned()); wrMyOwner << dataDistributionModeLock; tr.set(moveKeysLockOwnerKey, wrMyOwner.toValue()); BinaryWriter wrLastWrite(Unversioned()); wrLastWrite << deterministicRandom()->randomUniqueID(); tr.set(moveKeysLockWriteKey, 
// NOTE(review): this chunk begins mid-way through setDDMode(); the first line below is the
// tail of a tr.set(...) statement whose opening lines precede this view.
wrLastWrite.toValue());
	tr.set(dataDistributionModeKey, wr.toValue());
	if (mode) {
		// set DDMode to 1 will enable all disabled parts, for instance the SS failure monitors.
		Optional currentHealthyZoneValue = wait(tr.get(healthyZoneKey));
		if (currentHealthyZoneValue.present() &&
		    decodeHealthyZoneValue(currentHealthyZoneValue.get()).first == ignoreSSFailuresZoneString) {
			// only clear the key if it is currently being used to disable all SS failure data movement
			tr.clear(healthyZoneKey);
		}
		tr.clear(rebalanceDDIgnoreKey);
	}
	wait(tr.commit());
	return oldMode;
} catch (Error& e) {
	TraceEvent("SetDDModeRetrying").error(e);
	wait(tr.onError(e)); // standard retry loop: back off and retry the transaction
}
}
}

// Checks, inside the caller-supplied transaction, whether any storage server or tlog still
// uses an address in *exclusions.  Returns true when nothing excluded remains in use;
// otherwise returns false and fills *inProgressExclusion with the addresses still draining.
ACTOR Future checkForExcludingServersTxActor(ReadYourWritesTransaction* tr,
                                             std::set* exclusions,
                                             std::set* inProgressExclusion) {
	// TODO : replace using ExclusionInProgressRangeImpl in special key space
	ASSERT(inProgressExclusion->size() == 0); // Make sure every time it is cleared beforehand
	if (!exclusions->size())
		return true; // nothing excluded, trivially complete
	tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); // necessary?
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);

	// Just getting a consistent read version proves that a set of tlogs satisfying the exclusions has completed
	// recovery

	// Check that there aren't any storage servers with addresses violating the exclusions
	RangeResult serverList = wait(tr->getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY));
	ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY);

	state bool ok = true;
	for (auto& s : serverList) {
		auto addresses = decodeServerListValue(s.value).getKeyValues.getEndpoint().addresses;
		if (addressExcluded(*exclusions, addresses.address)) {
			ok = false;
			inProgressExclusion->insert(addresses.address);
		}
		if (addresses.secondaryAddress.present() && addressExcluded(*exclusions, addresses.secondaryAddress.get())) {
			ok = false;
			inProgressExclusion->insert(addresses.secondaryAddress.get());
		}
	}

	if (ok) {
		// No storage servers remain on excluded addresses; also verify the current tlog set.
		Optional> value = wait(tr->get(logsKey));
		ASSERT(value.present());
		auto logs = decodeLogsValue(value.get());
		for (auto const& log : logs.first) {
			if (log.second == NetworkAddress() || addressExcluded(*exclusions, log.second)) {
				ok = false;
				inProgressExclusion->insert(log.second);
			}
		}
		for (auto const& log : logs.second) {
			if (log.second == NetworkAddress() || addressExcluded(*exclusions, log.second)) {
				ok = false;
				inProgressExclusion->insert(log.second);
			}
		}
	}

	return ok;
}

// Polls the cluster until the given exclusions are fully drained (when waitForAllExcluded),
// or performs a single check otherwise.  Returns the set of addresses whose exclusion is
// still in progress (empty means the exclusion is complete).
ACTOR Future> checkForExcludingServers(Database cx,
                                       std::vector excl,
                                       bool waitForAllExcluded) {
	state std::set exclusions(excl.begin(), excl.end());
	state std::set inProgressExclusion;
	loop {
		state ReadYourWritesTransaction tr(cx);
		inProgressExclusion.clear(); // repopulated on every iteration/retry
		try {
			bool ok = wait(checkForExcludingServersTxActor(&tr, &exclusions, &inProgressExclusion));
			if (ok)
				return inProgressExclusion; // empty: all excluded servers are drained
			if (!waitForAllExcluded)
				break; // caller only wanted a snapshot of the in-progress set

			wait(delayJittered(1.0)); // SOMEDAY: watches!
		} catch (Error& e) {
			TraceEvent("CheckForExcludingServersError").error(e);
			wait(tr.onError(e));
		}
	}
	return inProgressExclusion;
}

// Requests a disk snapshot with the given command/UID; logs the outcome and rethrows on
// failure so the caller observes the error.
ACTOR Future mgmtSnapCreate(Database cx, Standalone snapCmd, UID snapUID) {
	try {
		wait(snapCreate(cx, snapCmd, snapUID));
		TraceEvent("SnapCreateSucceeded").detail("snapUID", snapUID);
		return Void();
	} catch (Error& e) {
		TraceEvent(SevWarn, "SnapCreateFailed").error(e).detail("snapUID", snapUID);
		throw;
	}
}

// Blocks until every configured region reports a replica count of at least the configured
// storage team size, using watches on the per-datacenter replica keys to avoid busy polling.
ACTOR Future waitForFullReplication(Database cx) {
	state ReadYourWritesTransaction tr(cx);
	loop {
		try {
			tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);

			RangeResult confResults = wait(tr.getRange(configKeys, CLIENT_KNOBS->TOO_MANY));
			ASSERT(!confResults.more && confResults.size() < CLIENT_KNOBS->TOO_MANY);
			state DatabaseConfiguration config;
			config.fromKeyValues((VectorRef)confResults);

			state std::vector>> replicasFutures;
			for (auto& region : config.regions) {
				replicasFutures.push_back(tr.get(datacenterReplicasKeyFor(region.dcId)));
			}
			wait(waitForAll(replicasFutures));

			// Watch every region whose replica count is still below the storage team size.
			state std::vector> watchFutures;
			for (int i = 0; i < config.regions.size(); i++) {
				if (!replicasFutures[i].get().present() ||
				    decodeDatacenterReplicasValue(replicasFutures[i].get().get()) < config.storageTeamSize) {
					watchFutures.push_back(tr.watch(datacenterReplicasKeyFor(config.regions[i].dcId)));
				}
			}

			// Done when nothing is under-replicated, or (single usable region) when at least
			// one region is fully replicated.
			if (!watchFutures.size() || (config.usableRegions == 1 && watchFutures.size() < config.regions.size())) {
				return Void();
			}

			wait(tr.commit()); // commit registers the watches
			wait(waitForAny(watchFutures));
			tr.reset();
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}

// Writes the timekeeper-disable key, turning off the cluster's time keeper; retries until
// the transaction commits.
ACTOR Future timeKeeperSetDisable(Database cx) {
	loop {
		state Transaction tr(cx);
		try {
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			tr.set(timeKeeperDisableKey, StringRef()); // empty value acts as the disable flag
			wait(tr.commit());
			return Void();
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}

ACTOR
// Acquires the database lock inside an existing transaction.  Idempotent for the same id;
// throws database_locked() if a different id already holds the lock.
// (Continues the ACTOR keyword emitted immediately before this chunk boundary.)
Future lockDatabase(Transaction* tr, UID id) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);
	Optional val = wait(tr->get(databaseLockedKey));

	if (val.present()) {
		// substr(10) skips the 10-byte "0123456789" prefix written below, leaving the owner UID.
		if (BinaryReader::fromStringRef(val.get().substr(10), Unversioned()) == id) {
			return Void(); // we already hold the lock
		} else {
			//TraceEvent("DBA_LockLocked").detail("Expecting", id).detail("Lock", BinaryReader::fromStringRef(val.get().substr(10), Unversioned()));
			throw database_locked();
		}
	}

	// Value layout: "0123456789" + id + 4 zero bytes; written as a versionstamped value.
	tr->atomicOp(databaseLockedKey,
	             BinaryWriter::toValue(id, Unversioned())
	                 .withPrefix(LiteralStringRef("0123456789"))
	                 .withSuffix(LiteralStringRef("\x00\x00\x00\x00")),
	             MutationRef::SetVersionstampedValue);
	// Conflict with all normal keys so concurrent user transactions abort against the lock.
	tr->addWriteConflictRange(normalKeys);
	return Void();
}

// Same as above, but for a ReadYourWritesTransaction held by reference.
ACTOR Future lockDatabase(Reference tr, UID id) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);
	Optional val = wait(tr->get(databaseLockedKey));

	if (val.present()) {
		if (BinaryReader::fromStringRef(val.get().substr(10), Unversioned()) == id) {
			return Void();
		} else {
			//TraceEvent("DBA_LockLocked").detail("Expecting", id).detail("Lock", BinaryReader::fromStringRef(val.get().substr(10), Unversioned()));
			throw database_locked();
		}
	}

	tr->atomicOp(databaseLockedKey,
	             BinaryWriter::toValue(id, Unversioned())
	                 .withPrefix(LiteralStringRef("0123456789"))
	                 .withSuffix(LiteralStringRef("\x00\x00\x00\x00")),
	             MutationRef::SetVersionstampedValue);
	tr->addWriteConflictRange(normalKeys);
	return Void();
}

// Convenience wrapper: runs lockDatabase in its own retry loop.  database_locked is
// surfaced to the caller rather than retried.
ACTOR Future lockDatabase(Database cx, UID id) {
	state Transaction tr(cx);
	loop {
		try {
			wait(lockDatabase(&tr, id));
			wait(tr.commit());
			return Void();
		} catch (Error& e) {
			if (e.code() == error_code_database_locked)
				throw e; // not retryable: someone else holds the lock
			wait(tr.onError(e));
		}
	}
}

// Releases the database lock held by `id` inside an existing transaction.  No-op if the
// database is not locked; throws database_locked() if a different id holds the lock.
ACTOR Future unlockDatabase(Transaction* tr, UID id) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);
	Optional val = wait(tr->get(databaseLockedKey));

	if (!val.present())
		return Void(); // already unlocked

	if (val.present() && BinaryReader::fromStringRef(val.get().substr(10), Unversioned()) != id) {
		//TraceEvent("DBA_UnlockLocked").detail("Expecting", id).detail("Lock", BinaryReader::fromStringRef(val.get().substr(10), Unversioned()));
		throw database_locked();
	}

	tr->clear(singleKeyRange(databaseLockedKey));
	return Void();
}

// ReadYourWritesTransaction flavor of unlockDatabase (same semantics as above).
ACTOR Future unlockDatabase(Reference tr, UID id) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);
	Optional val = wait(tr->get(databaseLockedKey));

	if (!val.present())
		return Void();

	if (val.present() && BinaryReader::fromStringRef(val.get().substr(10), Unversioned()) != id) {
		//TraceEvent("DBA_UnlockLocked").detail("Expecting", id).detail("Lock", BinaryReader::fromStringRef(val.get().substr(10), Unversioned()));
		throw database_locked();
	}

	tr->clear(singleKeyRange(databaseLockedKey));
	return Void();
}

// Convenience wrapper: unlock with its own retry loop; database_locked is not retried.
ACTOR Future unlockDatabase(Database cx, UID id) {
	state Transaction tr(cx);
	loop {
		try {
			wait(unlockDatabase(&tr, id));
			wait(tr.commit());
			return Void();
		} catch (Error& e) {
			if (e.code() == error_code_database_locked)
				throw e;
			wait(tr.onError(e));
		}
	}
}

// Verifies that the database is either unlocked or locked by `id`; otherwise throws
// database_locked().  Read-only: makes no modifications.
ACTOR Future checkDatabaseLock(Transaction* tr, UID id) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);
	Optional val = wait(tr->get(databaseLockedKey));

	if (val.present() && BinaryReader::fromStringRef(val.get().substr(10), Unversioned()) != id) {
		//TraceEvent("DBA_CheckLocked").detail("Expecting", id).detail("Lock", BinaryReader::fromStringRef(val.get().substr(10), Unversioned())).backtrace();
		throw database_locked();
	}

	return Void();
}

// ReadYourWritesTransaction flavor of checkDatabaseLock (body continues past this chunk boundary).
ACTOR Future checkDatabaseLock(Reference tr, UID id) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);
	Optional val = wait(tr->get(databaseLockedKey));

	if (val.present() && BinaryReader::fromStringRef(val.get().substr(10), Unversioned()) != id) {
		//TraceEvent("DBA_CheckLocked").detail("Expecting", id).detail("Lock", BinaryReader::fromStringRef(val.get().substr(10), Unversioned())).backtrace();
		throw database_locked();
	}

	return Void();
}

// Applies a change-feed lifecycle transition (CREATE/STOP/DESTROY) to the feed metadata key
// rangeID under changeFeedPrefix, inside the caller's transaction.  CREATE is idempotent for
// the same range; CREATE over a different range, or STOP of a non-existent feed, throws
// unsupported_operation().
ACTOR Future updateChangeFeed(Transaction* tr, Key rangeID, ChangeFeedStatus status, KeyRange range) {
	state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix);
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	Optional val = wait(tr->get(rangeIDKey));
	if (status == ChangeFeedStatus::CHANGE_FEED_CREATE) {
		if (!val.present()) {
			tr->set(rangeIDKey, changeFeedValue(range, invalidVersion, status));
		} else if (std::get<0>(decodeChangeFeedValue(val.get())) != range) {
			// A feed with this id already exists over a different range.
			throw unsupported_operation();
		}
	} else if (status == ChangeFeedStatus::CHANGE_FEED_STOP) {
		if (val.present()) {
			// Rewrite the existing value with the same range/version but STOP status.
			tr->set(rangeIDKey,
			        changeFeedValue(std::get<0>(decodeChangeFeedValue(val.get())),
			                        std::get<1>(decodeChangeFeedValue(val.get())),
			                        status));
		} else {
			throw unsupported_operation();
		}
	} else if (status == ChangeFeedStatus::CHANGE_FEED_DESTROY) {
		if (val.present()) {
			if (g_network->isSimulated()) {
				// Record the destroyed feed id for simulation validation checks.
				g_simulator.validationData.allDestroyedChangeFeedIDs.insert(rangeID.toString());
			}
			// NOTE(review): set followed by clear — presumably the DESTROY status must be
			// written (and observed via the mutation stream) before the key is removed; confirm.
			tr->set(rangeIDKey,
			        changeFeedValue(std::get<0>(decodeChangeFeedValue(val.get())),
			                        std::get<1>(decodeChangeFeedValue(val.get())),
			                        status));
			tr->clear(rangeIDKey);
		}
	}
	return Void();
}

// ReadYourWritesTransaction flavor of updateChangeFeed (same semantics as above).
ACTOR Future updateChangeFeed(Reference tr,
                              Key rangeID,
                              ChangeFeedStatus status,
                              KeyRange range) {
	state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix);
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	Optional val = wait(tr->get(rangeIDKey));
	if (status == ChangeFeedStatus::CHANGE_FEED_CREATE) {
		if (!val.present()) {
			tr->set(rangeIDKey, changeFeedValue(range, invalidVersion, status));
		} else if (std::get<0>(decodeChangeFeedValue(val.get())) != range) {
			throw unsupported_operation();
		}
	} else if (status == ChangeFeedStatus::CHANGE_FEED_STOP) {
		if (val.present()) {
			tr->set(rangeIDKey,
			        changeFeedValue(std::get<0>(decodeChangeFeedValue(val.get())),
			                        std::get<1>(decodeChangeFeedValue(val.get())),
			                        status));
		} else {
			throw unsupported_operation();
		}
	} else if (status == ChangeFeedStatus::CHANGE_FEED_DESTROY) {
		if (val.present()) {
			if (g_network->isSimulated()) {
				g_simulator.validationData.allDestroyedChangeFeedIDs.insert(rangeID.toString());
			}
			tr->set(rangeIDKey,
			        changeFeedValue(std::get<0>(decodeChangeFeedValue(val.get())),
			                        std::get<1>(decodeChangeFeedValue(val.get())),
			                        status));
			tr->clear(rangeIDKey);
		}
	}
	return Void();
}

// Convenience wrapper: applies the change-feed update in its own retry loop.
ACTOR Future updateChangeFeed(Database cx, Key rangeID, ChangeFeedStatus status, KeyRange range) {
	state Transaction tr(cx);
	loop {
		try {
			wait(updateChangeFeed(&tr, rangeID, status, range));
			wait(tr.commit());
			return Void();
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}

// Forces the cluster's commit version past v by raising the minimum required commit version;
// loops until a read version greater than v is observed.
ACTOR Future advanceVersion(Database cx, Version v) {
	state Transaction tr(cx);
	loop {
		tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr.setOption(FDBTransactionOptions::LOCK_AWARE);
		try {
			Version rv = wait(tr.getReadVersion());
			if (rv <= v) {
				// Ask for v + 1 so the next commit is strictly beyond the requested version.
				tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(v + 1, Unversioned()));
				wait(tr.commit());
			} else {
				fmt::print("Current read version is {}\n", rv);
				return Void();
			}
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}

// Asks the cluster controller to force a recovery into datacenter dcId; keeps retrying
// against the latest leader until the request is acknowledged.
ACTOR Future forceRecovery(Reference clusterFile, Key dcId) {
	state Reference>> clusterInterface(new AsyncVar>);
	state Future leaderMon = monitorLeader(clusterFile, clusterInterface);

	loop {
		choose {
			when(wait(clusterInterface->get().present() ?
			              // brokenPromiseToNever: leader changes mid-request look like "keep waiting",
			              // not errors; the onChange branch below re-issues against the new leader.
			              brokenPromiseToNever(
			                  clusterInterface->get().get().forceRecovery.getReply(ForceRecoveryRequest(dcId))) :
			              Never())) {
				return Void();
			}
			when(wait(clusterInterface->onChange())) {}
		}
	}
}

// Blocks until the cluster reports dcId as the primary datacenter, using a watch on
// primaryDatacenterKey between checks.
ACTOR Future waitForPrimaryDC(Database cx, StringRef dcId) {
	state ReadYourWritesTransaction tr(cx);

	loop {
		try {
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			Optional res = wait(tr.get(primaryDatacenterKey));
			if (res.present() && res.get() == dcId) {
				return Void();
			}

			state Future watchFuture = tr.watch(primaryDatacenterKey);
			wait(tr.commit()); // commit registers the watch
			wait(watchFuture);
			tr.reset();
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}

// Collapses int to real so that schema/result numeric types compare as equivalent.
json_spirit::Value_type normJSONType(json_spirit::Value_type type) {
	if (type == json_spirit::int_type)
		return json_spirit::real_type;
	return type;
}

// Emits a one-time CodeCoverage trace event per (covered, schema path) pair, so status-schema
// coverage can be analyzed like source-line coverage.
void schemaCoverage(std::string const& spath, bool covered) {
	// Paths already reported, bucketed by covered/uncovered.
	static std::map> coveredSchemaPaths;
	if (coveredSchemaPaths[covered].insert(spath).second) {
		TraceEvent ev(SevInfo, "CodeCoverage");
		ev.detail("File", "documentation/StatusSchema.json/" + spath).detail("Line", 0);
		if (!covered)
			ev.detail("Covered", 0);
	}
}

// Recursively validates resultValue against schemaValue, appending human-readable problems
// to errorStr and tracing each mismatch at severity sev.  Supports "$enum" (allowed value
// list) and "$map" (homogeneous object values) schema directives.  Returns true iff
// everything in the result is permitted by the schema.
bool schemaMatch(json_spirit::mValue const& schemaValue,
                 json_spirit::mValue const& resultValue,
                 std::string& errorStr,
                 Severity sev,
                 bool checkCoverage,
                 std::string path,
                 std::string schemaPath) {
	// Returns true if everything in `result` is permitted by `schema`
	bool ok = true;
	try {
		if (normJSONType(schemaValue.type()) != normJSONType(resultValue.type())) {
			errorStr += format("ERROR: Incorrect value type for key `%s'\n", path.c_str());
			TraceEvent(sev, "SchemaMismatch")
			    .detail("Path", path)
			    .detail("SchemaType", schemaValue.type())
			    .detail("ValueType", resultValue.type());
			return false;
		}

		if (resultValue.type() == json_spirit::obj_type) {
			auto& result = resultValue.get_obj();
			auto& schema = schemaValue.get_obj();

			for (auto& rkv : result) {
				auto& key = rkv.first;
				auto& rv = rkv.second;
				std::string kpath = path + "." + key;
				std::string spath = schemaPath + "." + key;

				if (checkCoverage) {
					// NOTE(review): called with one argument while the definition above shows two
					// parameters — presumably `covered` has a default of true; confirm upstream.
					schemaCoverage(spath);
				}

				if (!schema.count(key)) {
					errorStr += format("ERROR: Unknown key `%s'\n", kpath.c_str());
					TraceEvent(sev, "SchemaMismatch").detail("Path", kpath).detail("SchemaPath", spath);
					ok = false;
					continue;
				}
				auto& sv = schema.at(key);

				if (sv.type() == json_spirit::obj_type && sv.get_obj().count("$enum")) {
					// Value must equal one of the listed enum items.
					auto& enum_values = sv.get_obj().at("$enum").get_array();

					bool any_match = false;
					for (auto& enum_item : enum_values)
						if (enum_item == rv) {
							any_match = true;
							if (checkCoverage) {
								schemaCoverage(spath + ".$enum." + enum_item.get_str());
							}
							break;
						}
					if (!any_match) {
						errorStr += format("ERROR: Unknown value `%s' for key `%s'\n",
						                   json_spirit::write_string(rv).c_str(),
						                   kpath.c_str());
						TraceEvent(sev, "SchemaMismatch")
						    .detail("Path", kpath)
						    .detail("SchemaEnumItems", enum_values.size())
						    .detail("Value", json_spirit::write_string(rv));
						if (checkCoverage) {
							schemaCoverage(spath + ".$enum." + json_spirit::write_string(rv));
						}
						ok = false;
					}
				} else if (sv.type() == json_spirit::obj_type && sv.get_obj().count("$map")) {
					// Object with arbitrary keys; every value must match the "$map" sub-schema.
					if (rv.type() != json_spirit::obj_type) {
						errorStr += format("ERROR: Expected an object as the value for key `%s'\n", kpath.c_str());
						TraceEvent(sev, "SchemaMismatch")
						    .detail("Path", kpath)
						    .detail("SchemaType", sv.type())
						    .detail("ValueType", rv.type());
						ok = false;
						continue;
					}
					if (sv.get_obj().at("$map").type() != json_spirit::obj_type) {
						continue; // malformed $map entry: nothing to validate against
					}
					auto& schemaVal = sv.get_obj().at("$map");
					auto& valueObj = rv.get_obj();

					if (checkCoverage) {
						schemaCoverage(spath + ".$map");
					}

					for (auto& valuePair : valueObj) {
						auto vpath = kpath + "[" + valuePair.first + "]";
						auto upath = spath + ".$map";
						if (valuePair.second.type() != json_spirit::obj_type) {
							errorStr += format("ERROR: Expected an object for `%s'\n", vpath.c_str());
							TraceEvent(sev, "SchemaMismatch")
							    .detail("Path", vpath)
							    .detail("ValueType", valuePair.second.type());
							ok = false;
							continue;
						}
						if (!schemaMatch(schemaVal, valuePair.second, errorStr, sev, checkCoverage, vpath, upath)) {
							ok = false;
						}
					}
				} else {
					// Plain nested value: recurse with the corresponding schema entry.
					if (!schemaMatch(sv, rv, errorStr, sev, checkCoverage, kpath, spath)) {
						ok = false;
					}
				}
			}
		} else if (resultValue.type() == json_spirit::array_type) {
			auto& valueArray = resultValue.get_array();
			auto& schemaArray = schemaValue.get_array();
			if (!schemaArray.size()) {
				// An empty schema array means that the value array is required to be empty
				if (valueArray.size()) {
					errorStr += format("ERROR: Expected an empty array for key `%s'\n", path.c_str());
					TraceEvent(sev, "SchemaMismatch")
					    .detail("Path", path)
					    .detail("SchemaSize", schemaArray.size())
					    .detail("ValueSize", valueArray.size());
					return false;
				}
			} else if (schemaArray.size() == 1) {
				// A one item schema array means that all items in the value must match the first item in the schema
				int index = 0;
				for (auto& valueItem : valueArray) {
					if (!schemaMatch(schemaArray[0],
					                 valueItem,
					                 errorStr,
					                 sev,
					                 checkCoverage,
					                 path + format("[%d]", index),
					                 schemaPath + "[0]")) {
						ok = false;
					}
					index++;
				}
			} else {
				ASSERT(false); // Schema doesn't make sense
			}
		}
		return ok;
	} catch (std::exception& e) {
		TraceEvent(SevError, "SchemaMatchException")
		    .detail("What", e.what())
		    .detail("Path", path)
		    .detail("SchemaPath", schemaPath);
		throw unknown_error();
	}
}

// Writes the storage quota (in bytes) for the given tenant into the system keyspace.
void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota) {
	tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	auto key = storageQuotaKey(tenantName);
	tr.set(key, BinaryWriter::toValue(quota, Unversioned()));
}

// Reads the storage quota for the given tenant; empty Optional when no quota is set.
ACTOR Future> getStorageQuota(Transaction* tr, StringRef tenantName) {
	tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
	state Optional v = wait(tr->get(storageQuotaKey(tenantName)));
	if (!v.present()) {
		return Optional();
	}
	return BinaryReader::fromStringRef(v.get(), Unversioned());
}

// Maps a CoordinatorsResult code to the user-facing message fdbcli prints.
std::string ManagementAPI::generateErrorMessage(const CoordinatorsResult& res) {
	// Note: the error message here should not be changed if possible
	// If you do change the message here,
	// please update the
// corresponding fdbcli code to support both the old and the new message
	std::string msg;
	switch (res) {
	case CoordinatorsResult::INVALID_NETWORK_ADDRESSES:
		msg = "The specified network addresses are invalid";
		break;
	case CoordinatorsResult::SAME_NETWORK_ADDRESSES:
		msg = "No change (existing configuration satisfies request)";
		break;
	case CoordinatorsResult::NOT_COORDINATORS:
		msg = "Coordination servers are not running on the specified network addresses";
		break;
	case CoordinatorsResult::DATABASE_UNREACHABLE:
		msg = "Database unreachable";
		break;
	case CoordinatorsResult::BAD_DATABASE_STATE:
		msg = "The database is in an unexpected state from which changing coordinators might be unsafe";
		break;
	case CoordinatorsResult::COORDINATOR_UNREACHABLE:
		msg = "One of the specified coordinators is unreachable";
		break;
	case CoordinatorsResult::NOT_ENOUGH_MACHINES:
		msg = "Too few fdbserver machines to provide coordination at the current redundancy level";
		break;
	default:
		break; // e.g. SUCCESS: no message
	}
	return msg;
}

// Unit test: AutoQuorumChange must spread 5 chosen coordinators across localities.
// Builds 10 synthetic workers whose dcid/data_hall/rack/zoneid/machineid are derived from the
// index (2 DCs x 2 halls x 2 racks), then asserts the chosen set covers 2 DCs, 4 halls, and
// 5 distinct zones/machines, and includes the one worker explicitly classed as a coordinator.
TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") {
	wait(Future(Void()));

	std::vector workers;
	std::vector chosen;
	std::set excluded;
	AutoQuorumChange change(5);

	for (int i = 0; i < 10; i++) {
		ProcessData data;
		auto dataCenter = std::to_string(i / 4 % 2);
		auto dataHall = dataCenter + std::to_string(i / 2 % 2);
		auto rack = dataHall + std::to_string(i % 2);
		auto machineId = rack + std::to_string(i);
		data.locality.set(LiteralStringRef("dcid"), StringRef(dataCenter));
		data.locality.set(LiteralStringRef("data_hall"), StringRef(dataHall));
		data.locality.set(LiteralStringRef("rack"), StringRef(rack));
		data.locality.set(LiteralStringRef("zoneid"), StringRef(rack));
		data.locality.set(LiteralStringRef("machineid"), StringRef(machineId));
		// IP encodes the worker index so the assertions below can map address -> locality.
		data.address.ip = IPAddress(i);

		if (g_network->isSimulated()) {
			g_simulator.newProcess("TestCoordinator",
			                       data.address.ip,
			                       data.address.port,
			                       false,
			                       1,
			                       data.locality,
			                       ProcessClass(ProcessClass::CoordinatorClass, ProcessClass::CommandLineSource),
			                       "",
			                       "",
			                       currentProtocolVersion);
		}

		workers.push_back(data);
	}

	auto noAssignIndex = deterministicRandom()->randomInt(0, workers.size());
	workers[noAssignIndex].processClass._class = ProcessClass::CoordinatorClass;

	change.addDesiredWorkers(chosen, workers, 5, excluded);
	std::map> chosenValues;

	ASSERT(chosen.size() == 5);
	std::vector fields({ LiteralStringRef("dcid"),
	                     LiteralStringRef("data_hall"),
	                     LiteralStringRef("zoneid"),
	                     LiteralStringRef("machineid") });
	for (auto worker = chosen.begin(); worker != chosen.end(); worker++) {
		ASSERT(worker->ip.toV4() < workers.size());
		LocalityData data = workers[worker->ip.toV4()].locality;
		for (auto field = fields.begin(); field != fields.end(); field++) {
			chosenValues[*field].insert(data.get(*field).get());
		}
	}

	ASSERT(chosenValues[LiteralStringRef("dcid")].size() == 2);
	ASSERT(chosenValues[LiteralStringRef("data_hall")].size() == 4);
	ASSERT(chosenValues[LiteralStringRef("zoneid")].size() == 5);
	ASSERT(chosenValues[LiteralStringRef("machineid")].size() == 5);
	// The explicitly coordinator-classed worker must be preferred into the chosen set.
	ASSERT(std::find(chosen.begin(), chosen.end(), workers[noAssignIndex].address) != chosen.end());

	return Void();
}