Fix globalconfig refresh hang issue

CC sets a version to int_max in ClientDBInfo indicating a refresh, however,
proxy server would reject this version for the error of future_version.

This change fixes this issue by not sending int_max, instead maintaining a
lastKnown in memory and send it to grvproxy to get latest globalconfig.

this change also fixes some java tests that were used to test the fix
This commit is contained in:
hao fu 2024-05-14 15:26:44 -07:00
parent ede22972c5
commit 6b782c10f6
7 changed files with 69 additions and 63 deletions

View File

@ -21,15 +21,13 @@ package com.apple.foundationdb;
import java.util.ArrayList;
import java.util.Collection;
import org.junit.jupiter.api.extension.AfterEachCallback;
import org.junit.jupiter.api.extension.BeforeAllCallback;
import org.junit.jupiter.api.extension.ExtensionContext;
import org.junit.jupiter.api.extension.BeforeAllCallback;
/**
* Callback to help define a multi-client scenario and ensure that
* the clients can be configured properly.
*/
public class MultiClientHelper implements BeforeAllCallback,AfterEachCallback{
public class MultiClientHelper implements BeforeAllCallback {
private String[] clusterFiles;
private Collection<Database> openDatabases;
@ -67,16 +65,4 @@ public class MultiClientHelper implements BeforeAllCallback,AfterEachCallback{
public void beforeAll(ExtensionContext arg0) throws Exception {
clusterFiles = readClusterFromEnv();
}
@Override
public void afterEach(ExtensionContext arg0) throws Exception {
//close any databases that have been opened
if(openDatabases!=null){
for(Database db : openDatabases){
db.close();
}
}
openDatabases = null;
}
}

View File

@ -154,8 +154,9 @@ void GlobalConfig::erase(KeyRangeRef range) {
}
// Updates local copy of global configuration by reading the entire key-range
// from storage (proxied through the GrvProxies).
ACTOR Future<Void> GlobalConfig::refresh(GlobalConfig* self, Version lastKnown) {
// from storage (proxied through the GrvProxies). Returns the version of the
// refreshed data.
ACTOR Future<Version> GlobalConfig::refresh(GlobalConfig* self, Version lastKnown, Version largestSeen) {
// TraceEvent trace(SevInfo, "GlobalConfigRefresh");
self->erase(KeyRangeRef(""_sr, "\xff"_sr));
@ -171,7 +172,11 @@ ACTOR Future<Void> GlobalConfig::refresh(GlobalConfig* self, Version lastKnown)
KeyRef systemKey = kv.key.removePrefix(globalConfigKeysPrefix);
self->insert(systemKey, kv.value);
}
return Void();
if (reply.version >= largestSeen || largestSeen == std::numeric_limits<Version>::max()) {
return reply.version;
} else {
wait(delay(0.25));
}
} catch (Error& e) {
wait(backoff.onError());
}
@ -186,32 +191,38 @@ ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, const ClientDBInfo*
if (self->initialized.canBeSet()) {
wait(self->cx->onConnected());
wait(self->refresh(self, -1));
Version version = wait(self->refresh(self, -1, 0));
self->lastUpdate = version;
self->cx->addref();
self->initialized.send(Void());
self->cx->delref();
}
loop {
try {
wait(self->dbInfoChanged.onTrigger());
auto& history = dbInfo->history;
if (history.size() == 0) {
continue;
}
if (self->lastUpdate < history[0].version) {
// run one iteration at the beginning
wait(delay(0));
if (dbInfo->history.size() > 0) {
if (self->lastUpdate < dbInfo->history[0].version) {
// This process missed too many global configuration
// history updates or the protocol version changed, so it
// must re-read the entire configuration range.
wait(self->refresh(self, history.back().version));
if (dbInfo->history.size() > 0) {
self->lastUpdate = dbInfo->history.back().version;
Version version =
wait(self->refresh(self, self->lastUpdate, dbInfo->history.back().version));
self->lastUpdate = version;
// DBInfo could have changed after the wait. If
// changes are present, re-run the loop to make
// sure they are applied.
if (dbInfo->history.size() > 0 &&
dbInfo->history[0].version != std::numeric_limits<Version>::max()) {
continue;
}
} else {
// Apply history in order, from lowest version to highest
// version. Mutation history should already be stored in
// ascending version order.
for (const auto& vh : history) {
for (const auto& vh : dbInfo->history) {
if (vh.version <= self->lastUpdate) {
continue; // already applied this mutation
}
@ -230,8 +241,11 @@ ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, const ClientDBInfo*
self->lastUpdate = vh.version;
}
}
self->configChanged.trigger();
}
// In case this actor is canceled in the d'tor of GlobalConfig we can exit here.
wait(delay(0));
wait(self->dbInfoChanged.onTrigger());
} catch (Error& e) {
throw;
}

View File

@ -9075,8 +9075,9 @@ void Transaction::checkDeferredError() const {
Reference<TransactionLogInfo> Transaction::createTrLogInfoProbabilistically(const Database& cx) {
if (!cx->isError()) {
double clientSamplingProbability =
cx->globalConfig->get<double>(fdbClientInfoTxnSampleRate, CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY);
double sampleRate =
cx->globalConfig->get<double>(fdbClientInfoTxnSampleRate, std::numeric_limits<double>::infinity());
double clientSamplingProbability = std::isinf(sampleRate) ? CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY : sampleRate;
if (((networkOptions.logClientInfo.present() && networkOptions.logClientInfo.get()) || BUGGIFY) &&
deterministicRandom()->random01() < clientSamplingProbability &&
(!g_network->isSimulated() || !g_simulator->speedUpSimulation)) {

View File

@ -727,14 +727,16 @@ struct ExclusionSafetyCheckRequest {
struct GlobalConfigRefreshReply {
constexpr static FileIdentifier file_identifier = 12680327;
Arena arena;
Version version;
RangeResultRef result;
GlobalConfigRefreshReply() {}
GlobalConfigRefreshReply(Arena const& arena, RangeResultRef result) : arena(arena), result(result) {}
GlobalConfigRefreshReply(Arena const& arena, Version version, RangeResultRef result)
: arena(arena), version(version), result(result) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, result, arena);
serializer(ar, result, version, arena);
}
};

View File

@ -163,7 +163,7 @@ private:
// of the global configuration keyspace.
void erase(KeyRangeRef range);
ACTOR static Future<Void> refresh(GlobalConfig* self, Version lastKnown);
ACTOR static Future<Version> refresh(GlobalConfig* self, Version lastKnown, Version largestSeen);
ACTOR static Future<Void> updater(GlobalConfig* self, const ClientDBInfo* dbInfo);
DatabaseContext* cx;

View File

@ -1077,6 +1077,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
.detail("GrvProxies", db->clientInfo->get().grvProxies)
.detail("ReqGrvProxies", req.grvProxies)
.detail("CommitProxies", db->clientInfo->get().commitProxies)
.detail("GlobalConfigHistorySize", db->clientInfo->get().history.size())
.detail("ReqCPs", req.commitProxies)
.detail("TenantMode", db->clientInfo->get().tenantMode.toString())
.detail("ReqTenantMode", db->config.tenantMode.toString())
@ -1093,6 +1094,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
clientInfo.id = deterministicRandom()->randomUniqueID();
clientInfo.commitProxies = req.commitProxies;
clientInfo.grvProxies = req.grvProxies;
clientInfo.history = db->clientInfo->get().history;
clientInfo.tenantMode = TenantAPI::tenantModeForClusterType(db->clusterType, db->config.tenantMode);
clientInfo.clusterId = db->serverInfo->get().client.clusterId;
clientInfo.clusterType = db->clusterType;
@ -1742,12 +1744,12 @@ ACTOR Future<Void> monitorStorageMetadata(ClusterControllerData* self) {
ACTOR Future<Void> monitorGlobalConfig(ClusterControllerData::DBInfo* db) {
loop {
state ReadYourWritesTransaction tr(db->db);
state ClientDBInfo clientInfo;
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Optional<Value> globalConfigVersion = wait(tr.get(globalConfigVersionKey));
state ClientDBInfo clientInfo = db->serverInfo->get().client;
if (globalConfigVersion.present()) {
// Since the history keys end with versionstamps, they
@ -1758,11 +1760,12 @@ ACTOR Future<Void> monitorGlobalConfig(ClusterControllerData::DBInfo* db) {
// If the global configuration version key has been set,
// the history should contain at least one item.
ASSERT(globalConfigHistory.size() > 0);
clientInfo = db->serverInfo->get().client;
clientInfo.history.clear();
for (const auto& kv : globalConfigHistory) {
ObjectReader reader(kv.value.begin(), IncludeVersion());
if (reader.protocolVersion() != g_network->protocolVersion()) {
if (reader.protocolVersion() != g_network->protocolVersion() || BUGGIFY_WITH_PROB(0.01)) {
// If the protocol version has changed, the
// GlobalConfig actor should refresh its view by
// reading the entire global configuration key

View File

@ -366,7 +366,7 @@ ACTOR Future<Void> globalConfigRequestServer(GrvProxyData* grvProxyData, GrvProx
// point of view. The client learns the version through a
// ClientDBInfo update).
if (refresh.lastKnown <= cachedVersion) {
refresh.reply.send(GlobalConfigRefreshReply{ cachedData.arena(), cachedData });
refresh.reply.send(GlobalConfigRefreshReply{ cachedData.arena(), cachedVersion, cachedData });
} else {
refresh.reply.sendError(future_version());
}