foundationdb/fdbserver/workloads/SaveAndKill.actor.cpp
Chaoguang Lin 7d365bd1bb
Remote ikvs debugging (#6465)
* initial structure for remote IKVS server

* moved struct to .h file, added new files to CMakeList

* happy path implementation, connection error when testing

* saved minor local change

* changed tracing to debug

* fixed onClosed and getError being called before init is finished

* fix spawn process bug, now use absolute path

* added server knob to set ikvs process port number

* added server knob for remote/local kv store

* implement simulator remote process spawning

* fixed bug for simulator timeout

* commit all changes

* removed print lines in trace

* added FlowProcess implementation by Markus

* initial debug of FlowProcess, stuck at parent sending OpenKVStoreRequest to child

* temporary fix for process factory throwing segfault on create

* specify public address in command

* change remote kv store knob to false for jenkins build

* made port 0 open random unused port

* change remote store knob to true for benchmark

* set listening port to randomly opened port

* added print lines for jenkins run open kv store timeout debug

* removed most tracing and print lines

* removed tutorial changes

* update handleIOErrors error handling to handle remote-ikvs cases

* Push all debugging changes

* A version where worker bug exists

* A version where restarting tests fail

* Use both the name and the port to determine the child process

* Remove unnecessary update on local address

* Disable remote-kvs for DiskFailureCycle test

* A version where restarting stuck

* A version where most restarting tests green

* Reset connection with child process explicitly

* Remove change on unnecessary files

* Unify flags from _ to -

* fix merging unexpected changes

* fix trace.error to .errorUnsuppressed

* Add license header

* Remove unnecessary header in FlowProcess.actor.cpp

* Fix Windows build

* Fix Windows build, add missing ;

* Fix a stupid bug caused by code dropped by code merging

* Disable remote kvs by default

* Pass the conn_file path to the flow process; it is not strictly needed, but buildNetwork is difficult to tune

* serialization change on readrange

* Update traces

* Refactor the RemoteIKVS interface

* Format files

* Update sim2 interface to not clog connections between parent and child processes in simulation

* Update comments; remove debugging symbols; Add error handling for remote_kvs_cancelled

* Add comments, format files

* Change method name from isBuggifyDisabled to isStableConnection; decrease latency (0.1x) for stable connections

* Commit the IConnection interface change, forgot in previous commit

* Fix the issue that onClosed request is cancelled by ActorCollection

* Enable the remote kv store knob

* Remove FlowProcess.actor.cpp and move functions to RemoteIKeyValueStore.actor.cpp; Add remote kv store delay to avoid race; Bind the child process to die with parent process

* Fix the bug where one process starts storage server more than once

* Add a please_reboot_remote_kv_store error to restart the storage server worker if remote kvs died abnormally

* Remove unreachable code path and add comments

* Clang format the code

* Fix a simple wait error

* Clang format after merging the main branch

* Testing mixed mode in simulation if remote_kvs knob is enabled, setting the default to false

* Disable remote kvs for PhysicalShardMove which is for RocksDB

* Cleanup #include orders, remove debugging traces

* Revert the reorder in fdbserver.actor.cpp, which fails the gcc build

Co-authored-by: Lincoln <lincoln.xiao@snowflake.com>
2022-03-31 17:08:59 -07:00

151 lines
6.2 KiB
C++

/*
* SaveAndKill.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/NativeAPI.actor.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbrpc/simulator.h"
#include "boost/algorithm/string/predicate.hpp"
#undef state
#include "fdbclient/SimpleIni.h"
#define state
#undef max
#undef min
#include "flow/actorcompiler.h" // This must be the last #include.
struct SaveAndKillWorkload : TestWorkload {
std::string restartInfo;
double testDuration;
int isRestoring;
SaveAndKillWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
restartInfo = getOption(options, "restartInfoLocation"_sr, "simfdb/restartInfo.ini"_sr).toString();
testDuration = getOption(options, "testDuration"_sr, 10.0);
isRestoring = getOption(options, "isRestoring"_sr, 0);
}
std::string description() const override { return "SaveAndKillWorkload"; }
Future<Void> setup(Database const& cx) override {
g_simulator.disableSwapsToAll();
return Void();
}
Future<Void> start(Database const& cx) override { return _start(this); }
ACTOR Future<Void> _start(SaveAndKillWorkload* self) {
state int i;
wait(delay(deterministicRandom()->random01() * self->testDuration));
CSimpleIni ini;
ini.SetUnicode();
ini.LoadFile(self->restartInfo.c_str());
ini.SetValue("RESTORE", "isRestoring", format("%d", self->isRestoring).c_str());
ini.SetValue("META", "processesPerMachine", format("%d", g_simulator.processesPerMachine).c_str());
ini.SetValue("META", "listenersPerProcess", format("%d", g_simulator.listenersPerProcess).c_str());
ini.SetValue("META", "desiredCoordinators", format("%d", g_simulator.desiredCoordinators).c_str());
ini.SetValue("META", "connectionString", g_simulator.connectionString.c_str());
ini.SetValue("META", "testerCount", format("%d", g_simulator.testerCount).c_str());
ini.SetValue("META", "tssMode", format("%d", g_simulator.tssMode).c_str());
ini.SetValue("META", "mockDNS", INetworkConnections::net()->convertMockDNSToString().c_str());
std::vector<ISimulator::ProcessInfo*> processes = g_simulator.getAllProcesses();
std::map<NetworkAddress, ISimulator::ProcessInfo*> rebootingProcesses = g_simulator.currentlyRebootingProcesses;
std::map<std::string, ISimulator::ProcessInfo*> allProcessesMap;
for (const auto& [_, process] : rebootingProcesses) {
if (allProcessesMap.find(process->dataFolder) == allProcessesMap.end() &&
std::string(process->name) != "remote flow process") {
allProcessesMap[process->dataFolder] = process;
}
}
for (const auto& process : processes) {
if (allProcessesMap.find(process->dataFolder) == allProcessesMap.end() &&
std::string(process->name) != "remote flow process") {
allProcessesMap[process->dataFolder] = process;
}
}
ini.SetValue("META", "processCount", format("%d", allProcessesMap.size() - 1).c_str());
std::map<std::string, int> machines;
int j = 0;
for (const auto& [_, process] : allProcessesMap) {
std::string machineId = printable(process->locality.machineId());
const char* machineIdString = machineId.c_str();
if (strcmp(process->name, "TestSystem") != 0) {
if (machines.find(machineId) == machines.end()) {
machines.insert(std::pair<std::string, int>(machineId, 1));
ini.SetValue("META", format("%d", j).c_str(), machineIdString);
ini.SetValue(
machineIdString,
"dcUID",
(process->locality.dcId().present()) ? process->locality.dcId().get().printable().c_str() : "");
ini.SetValue(machineIdString,
"zoneId",
(process->locality.zoneId().present())
? process->locality.zoneId().get().printable().c_str()
: "");
ini.SetValue(machineIdString, "mClass", format("%d", process->startingClass.classType()).c_str());
ini.SetValue(machineIdString,
format("ipAddr%d", process->address.port - 1).c_str(),
process->address.ip.toString().c_str());
ini.SetValue(machineIdString, format("%d", process->address.port - 1).c_str(), process->dataFolder);
ini.SetValue(
machineIdString, format("c%d", process->address.port - 1).c_str(), process->coordinationFolder);
j++;
} else {
ini.SetValue(machineIdString,
format("ipAddr%d", process->address.port - 1).c_str(),
process->address.ip.toString().c_str());
int oldValue = machines.find(machineId)->second;
ini.SetValue(machineIdString, format("%d", process->address.port - 1).c_str(), process->dataFolder);
ini.SetValue(
machineIdString, format("c%d", process->address.port - 1).c_str(), process->coordinationFolder);
machines.erase(machines.find(machineId));
machines.insert(std::pair<std::string, int>(machineId, oldValue + 1));
}
}
}
for (auto entry = machines.begin(); entry != machines.end(); entry++) {
ini.SetValue((*entry).first.c_str(), "processes", format("%d", (*entry).second).c_str());
}
ini.SetValue("META", "machineCount", format("%d", machines.size()).c_str());
ini.SaveFile(self->restartInfo.c_str());
for (auto process = allProcessesMap.begin(); process != allProcessesMap.end(); process++) {
g_simulator.killProcess(process->second, ISimulator::Reboot);
}
for (i = 0; i < 100; i++) {
wait(delay(0.0));
}
g_simulator.stop();
return Void();
}
Future<bool> check(Database const& cx) override { return true; }
void getMetrics(std::vector<PerfMetric>&) override {}
};
// Global factory object for SaveAndKillWorkload, constructed with the name "SaveAndKill"
// (presumably the name by which test specifications select this workload -- confirm against WorkloadFactory).
WorkloadFactory<SaveAndKillWorkload> SaveAndKillWorkloadFactory("SaveAndKill");