foundationdb/fdbclient/AuditUtils.actor.cpp
2024-08-02 09:40:11 -07:00

1142 lines
46 KiB
C++

/*
* AuditUtils.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2024 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/AuditUtils.actor.h"
#include "fdbclient/Audit.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/ClientKnobs.h"
#include <fmt/format.h>
#include "flow/actorcompiler.h" // has to be last include
// Clears, within the caller's transaction, the progress metadata persisted for the audit
// identified by (auditType, auditId).
// Progress metadata lives in one of two key spaces, selected by the audit type:
//   - ValidateStorageServerShard: auditServerBasedProgressRangeFor
//   - ValidateHA / ValidateReplica / ValidateLocationMetadata: auditRangeBasedProgressRangeFor
// Any other audit type is treated as unreachable.
void clearAuditProgressMetadata(Transaction* tr, AuditType auditType, UID auditId) {
	switch (auditType) {
	case AuditType::ValidateStorageServerShard:
		// SSShard audits always write their progress to the server-based space
		tr->clear(auditServerBasedProgressRangeFor(auditType, auditId));
		return;
	case AuditType::ValidateHA:
	case AuditType::ValidateReplica:
	case AuditType::ValidateLocationMetadata:
		// replica/ha/locationMetadata audits always write their progress to the range-based space
		tr->clear(auditRangeBasedProgressRangeFor(auditType, auditId));
		return;
	default:
		UNREACHABLE();
	}
}
// Returns true iff there is no value stored under serverListKeyFor(ssid), i.e. the storage
// server no longer has a server list entry. The read is retried via tr.onError on error.
ACTOR Future<bool> checkStorageServerRemoved(Database cx, UID ssid) {
	state bool removed = false;
	state Transaction tr(cx);
	TraceEvent(SevDebug, "AuditUtilStorageServerRemovedStart").detail("StorageServer", ssid);
	loop {
		try {
			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			Optional<Value> serverEntry = wait(tr.get(serverListKeyFor(ssid)));
			// A missing server list entry means the SS is removed
			removed = !serverEntry.present();
			break;
		} catch (Error& e) {
			TraceEvent(SevDebug, "AuditUtilStorageServerRemovedError")
			    .errorUnsuppressed(e)
			    .detail("StorageServer", ssid);
			wait(tr.onError(e));
		}
	}
	TraceEvent(SevDebug, "AuditUtilStorageServerRemovedEnd")
	    .detail("StorageServer", ssid)
	    .detail("Removed", removed);
	return removed;
}
// Cancels the audit identified by (auditType, auditId): in a single transaction, the stored
// audit state is rewritten with phase Failed and the audit's progress metadata is cleared.
// If nothing is stored under the audit key, there is nothing to cancel and the actor returns.
// Throws:
//   - actor_cancelled, propagated unchanged (consistent with persistNewAuditState)
//   - cancel_audit_storage_failed for any other error that escapes the retry loop
ACTOR Future<Void> cancelAuditMetadata(Database cx, AuditType auditType, UID auditId) {
	try {
		state Transaction tr(cx);
		TraceEvent(SevInfo, "AuditUtilCancelAuditMetadataStart", auditId)
		    .detail("AuditKey", auditKey(auditType, auditId));
		loop {
			try {
				tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
				tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr.setOption(FDBTransactionOptions::LOCK_AWARE);
				Optional<Value> res_ = wait(tr.get(auditKey(auditType, auditId)));
				if (!res_.present()) { // has been cancelled
					break; // Nothing to cancel
				}
				state AuditStorageState toCancelState = decodeAuditStorageState(res_.get());
				// For a zombie audit, it is in running state
				ASSERT(toCancelState.id == auditId && toCancelState.getType() == auditType);
				toCancelState.setPhase(AuditPhase::Failed);
				tr.set(auditKey(toCancelState.getType(), toCancelState.id), auditStorageStateValue(toCancelState));
				clearAuditProgressMetadata(&tr, toCancelState.getType(), toCancelState.id);
				wait(tr.commit());
				TraceEvent(SevInfo, "AuditUtilCancelAuditMetadataEnd", auditId)
				    .detail("AuditKey", auditKey(auditType, auditId));
				break;
			} catch (Error& e) {
				// Record which error triggered the retry, as the other audit actors in this file do
				TraceEvent(SevWarn, "AuditUtilCancelAuditMetadataError", auditId)
				    .errorUnsuppressed(e)
				    .detail("AuditKey", auditKey(auditType, auditId));
				wait(tr.onError(e));
			}
		}
	} catch (Error& e) {
		// Cancellation of this actor is not a failure of the cancel operation; do not mask it
		if (e.code() == error_code_actor_cancelled) {
			throw e;
		}
		throw cancel_audit_storage_failed();
	}
	return Void();
}
// Parses an audit phase name, ignoring case.
// Recognized names are "running", "complete", "failed" and "error";
// any other input yields AuditPhase::Invalid.
AuditPhase stringToAuditPhase(std::string auditPhaseStr) {
	// Normalize to lower case in place (the argument is a by-value copy)
	for (char& ch : auditPhaseStr) {
		const unsigned char raw = ch;
		ch = std::tolower(raw);
	}
	if (auditPhaseStr == "running") {
		return AuditPhase::Running;
	}
	if (auditPhaseStr == "complete") {
		return AuditPhase::Complete;
	}
	if (auditPhaseStr == "failed") {
		return AuditPhase::Failed;
	}
	if (auditPhaseStr == "error") {
		return AuditPhase::Error;
	}
	return AuditPhase::Invalid;
}
// Reads the persisted audit states of the given auditType.
//   - newFirst: read the audit key range in reverse, so states with larger keys come first
//   - num:      if present, return at most num states (num == 0 returns an empty list without reading)
//   - phase:    if present, only states in this phase are returned
// This is not transactional: the range is read in batches and the transaction is reset between
// batches, so the result is not guaranteed to be a single-snapshot view.
// On error the whole read restarts from scratch after tr.onError.
ACTOR Future<std::vector<AuditStorageState>> getAuditStates(Database cx,
                                                            AuditType auditType,
                                                            bool newFirst,
                                                            Optional<int> num,
                                                            Optional<AuditPhase> phase) {
	state Transaction tr(cx);
	state std::vector<AuditStorageState> auditStates;
	state Key readBegin;
	state Key readEnd;
	state Reverse reverse = newFirst ? Reverse::True : Reverse::False;
	if (num.present() && num.get() == 0) {
		return auditStates;
	}
	loop {
		try {
			// (Re)start from the whole range; partial results of a failed attempt are discarded
			readBegin = auditKeyRange(auditType).begin;
			readEnd = auditKeyRange(auditType).end;
			auditStates.clear();
			while (true) {
				tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
				tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
				tr.setOption(FDBTransactionOptions::LOCK_AWARE);
				KeyRangeRef rangeToRead(readBegin, readEnd);
				state RangeResult res = wait(tr.getRange(rangeToRead,
				                                         num.present() ? GetRangeLimits(num.get()) : GetRangeLimits(),
				                                         Snapshot::False,
				                                         reverse));
				for (int i = 0; i < res.size(); ++i) {
					const AuditStorageState auditState = decodeAuditStorageState(res[i].value);
					if (phase.present() && auditState.getPhase() != phase.get()) {
						continue;
					}
					auditStates.push_back(auditState);
					if (num.present() && static_cast<int>(auditStates.size()) == num.get()) {
						return auditStates; // since res.more is not reliable when GetRangeLimits is set to 1
					}
				}
				if (!res.more) {
					break;
				}
				if (newFirst) {
					// A reverse read returns keys in descending order, so the smallest key read so far
					// is res.back(). Continue strictly below it. Using res.front() (the largest key of
					// the batch) would re-read almost the whole batch and return duplicated states.
					readEnd = res.back().key;
				} else {
					readBegin = keyAfter(res.back().key);
				}
				tr.reset();
			}
			break;
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
	return auditStates;
}
// Best-effort cleanup of finished ("Complete" or "Failed") audits of one audit type.
// Only audits whose id.first() does not exceed maxAuditIdToClear.first() are considered.
// With F such finished audits, the first (F - numFinishAuditToKeep) of them, in the order
// returned by getAuditStates(newFirst=false), have their audit metadata cleared; Failed audits
// additionally get their progress metadata cleared.
// Reading the audit states and clearing them are not atomic.
// Any error escaping the retry loop is traced and swallowed, so this actor never throws.
ACTOR Future<Void> clearAuditMetadataForType(Database cx,
                                             AuditType auditType,
                                             UID maxAuditIdToClear,
                                             int numFinishAuditToKeep) {
	state Transaction tr(cx);
	state int numFinishAuditCleaned = 0; // We regard "Complete" and "Failed" audits as finish audits
	TraceEvent(SevDebug, "AuditUtilClearAuditMetadataForTypeStart")
	    .detail("AuditType", auditType)
	    .detail("MaxAuditIdToClear", maxAuditIdToClear);
	try {
		loop { // Cleanup until succeed or facing unretriable error
			try {
				// auditStates has ascending order of auditIds
				state std::vector<AuditStorageState> auditStates =
				    wait(getAuditStates(cx, auditType, /*newFirst=*/false));

				// Pass 1: count the finished audits at or below the id threshold
				int finishedCount = 0;
				for (const auto& candidate : auditStates) {
					const bool withinThreshold = candidate.id.first() <= maxAuditIdToClear.first();
					if (withinThreshold && (candidate.getPhase() == AuditPhase::Complete ||
					                        candidate.getPhase() == AuditPhase::Failed)) {
						finishedCount++;
					}
				}
				const int numFinishAuditToClean = finishedCount - numFinishAuditToKeep;
				numFinishAuditCleaned = 0;

				// Pass 2: clear the oldest finished audits until the quota is reached
				tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
				tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr.setOption(FDBTransactionOptions::LOCK_AWARE);
				for (const auto& candidate : auditStates) {
					if (candidate.id.first() > maxAuditIdToClear.first()) {
						continue; // ignore any audit with a larger auditId than the input threshold
					}
					ASSERT(candidate.getType() == auditType);
					const auto candidatePhase = candidate.getPhase();
					const bool finished =
					    candidatePhase == AuditPhase::Complete || candidatePhase == AuditPhase::Failed;
					if (!finished || numFinishAuditCleaned >= numFinishAuditToClean) {
						continue; // Running (possibly zombie) audits and audits beyond the quota are kept
					}
					// Clear audit metadata
					tr.clear(auditKey(auditType, candidate.id));
					if (candidatePhase == AuditPhase::Failed) {
						// Progress metadata of Complete audits has already been cleared when the
						// Complete phase was persisted; only Failed audits still carry it
						clearAuditProgressMetadata(&tr, auditType, candidate.id);
					}
					numFinishAuditCleaned++;
				}
				wait(tr.commit());
				TraceEvent(SevDebug, "AuditUtilClearAuditMetadataForTypeEnd")
				    .detail("AuditType", auditType)
				    .detail("NumCleanedFinishAudits", numFinishAuditCleaned);
				break;
			} catch (Error& e) {
				wait(tr.onError(e));
			}
		}
	} catch (Error& e) {
		TraceEvent(SevInfo, "AuditUtilClearAuditMetadataForTypeError")
		    .detail("AuditType", auditType)
		    .errorUnsuppressed(e);
		// We do not want audit cleanup to affect DD
	}
	return Void();
}
// Verifies, inside the caller's transaction, that the caller may act under the move keys lock.
//   - isDDEnabled == false: throws movekeys_conflict without reading anything
//   - stored owner == lock.prevOwner: the stored last-write id must still equal lock.prevWrite,
//     otherwise movekeys_conflict; if isWrite, the lock is taken by writing lock.myOwner as the
//     owner and a fresh random id as the last write
//   - stored owner == lock.myOwner: if isWrite, the lock is touched by writing a fresh random
//     last-write id and the transaction is made self-conflicting
//   - any other owner: throws movekeys_conflict
// A missing owner / last-write value is treated as the default UID().
ACTOR static Future<Void> checkMoveKeysLockForAudit(Transaction* tr,
                                                    MoveKeyLockInfo lock,
                                                    bool isDDEnabled,
                                                    bool isWrite = true) {
	tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
	if (!isDDEnabled) {
		TraceEvent(SevDebug, "AuditUtilDisabledByInMemoryCheck").log();
		throw movekeys_conflict(); // need a new name
	}

	Optional<Value> ownerVal = wait(tr->get(moveKeysLockOwnerKey));
	UID currentOwner = ownerVal.present() ? BinaryReader::fromStringRef<UID>(ownerVal.get(), Unversioned()) : UID();

	// Someone else (neither the previous owner nor us) holds the lock
	if (currentOwner != lock.prevOwner && currentOwner != lock.myOwner) {
		TraceEvent(SevDebug, "AuditUtilConflictWithNewOwner")
		    .detail("CurrentOwner", currentOwner.toString())
		    .detail("PrevOwner", lock.prevOwner.toString())
		    .detail("PrevWrite", lock.prevWrite.toString())
		    .detail("MyOwner", lock.myOwner.toString());
		throw movekeys_conflict(); // need a new name
	}

	// We already own the lock (and the owner is not the previous owner)
	if (currentOwner != lock.prevOwner) {
		if (isWrite) {
			// Touch the lock, preventing overlapping attempts to take it
			BinaryWriter touchWriter(Unversioned());
			touchWriter << deterministicRandom()->randomUniqueID();
			tr->set(moveKeysLockWriteKey, touchWriter.toValue());
			// Make this transaction self-conflicting so the database will not execute it twice with the same write key
			tr->makeSelfConflicting();
		}
		return Void();
	}

	// The previous owner still holds the lock:
	// check that the previous owner hasn't touched the lock since we took it
	Optional<Value> lastWriteVal = wait(tr->get(moveKeysLockWriteKey));
	UID lastWrite =
	    lastWriteVal.present() ? BinaryReader::fromStringRef<UID>(lastWriteVal.get(), Unversioned()) : UID();
	if (lastWrite != lock.prevWrite) {
		TraceEvent(SevDebug, "ConflictWithPreviousOwner");
		throw movekeys_conflict(); // need a new name
	}
	if (isWrite) {
		// Take the lock
		BinaryWriter ownerWriter(Unversioned());
		ownerWriter << lock.myOwner;
		tr->set(moveKeysLockOwnerKey, ownerWriter.toValue());
		BinaryWriter lastWriteWriter(Unversioned());
		UID lastWriter = deterministicRandom()->randomUniqueID();
		lastWriteWriter << lastWriter;
		tr->set(moveKeysLockWriteKey, lastWriteWriter.toValue());
		TraceEvent("AuditUtilCheckMoveKeysLock")
		    .detail("PrevOwner", lock.prevOwner.toString())
		    .detail("PrevWrite", lock.prevWrite.toString())
		    .detail("MyOwner", lock.myOwner.toString())
		    .detail("Writer", lastWriter.toString());
	}
	return Void();
}
// Persists a brand-new audit state and returns the audit id selected for it.
// The input auditState must not carry a valid id yet; the id is chosen here as
// UID(latestExistingId.first() + 1, 0), or UID(1, 0) if no audit of this type is stored.
// The move keys lock is checked (and written, isWrite=true) in the same transaction.
// Throws:
//   - actor_cancelled, propagated unchanged
//   - persist_new_audit_metadata_error for any other error escaping the retry loop
ACTOR Future<UID> persistNewAuditState(Database cx,
AuditStorageState auditState,
MoveKeyLockInfo lock,
bool ddEnabled) {
ASSERT(!auditState.id.isValid());
state Transaction tr(cx);
// auditId survives retries: once an id has been selected, a later attempt uses it to tell
// whether the previous commit attempt actually went through
state UID auditId;
state AuditStorageState latestExistingAuditState;
// NOTE: auditId is still default-constructed when this event is emitted
TraceEvent(SevDebug, "AuditUtilPersistedNewAuditStateStart", auditId);
try {
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
wait(checkMoveKeysLockForAudit(&tr, lock, ddEnabled, true));
// Reverse read with limit 1: fetch the last entry of this audit type's key range
RangeResult res =
wait(tr.getRange(auditKeyRange(auditState.getType()), 1, Snapshot::False, Reverse::True));
ASSERT(res.size() == 0 || res.size() == 1);
uint64_t nextId = 1;
if (!res.empty()) {
latestExistingAuditState = decodeAuditStorageState(res[0].value);
if (auditId.isValid()) { // new audit state persist gets failed last time
// Check to confirm no other actor can persist new audit state
ASSERT(latestExistingAuditState.id.first() <= auditId.first());
if (latestExistingAuditState.id.first() == auditId.first()) {
// The new audit Id has been successfully persisted
// No more action needed
return auditId;
} else {
// When latestExistingAuditState.id.first() < auditId
// The new audit Id is failed to persist
// Check to confirm no other actor can persist new audit state
ASSERT(auditId.first() == latestExistingAuditState.id.first() + 1);
}
}
nextId = latestExistingAuditState.id.first() + 1;
}
auditId = UID(nextId, 0LL);
auditState.id = auditId;
TraceEvent(SevVerbose, "AuditUtilPersistedNewAuditStateIdSelected", auditId)
.detail("AuditKey", auditKey(auditState.getType(), auditId));
tr.set(auditKey(auditState.getType(), auditId), auditStorageStateValue(auditState));
wait(tr.commit());
TraceEvent(SevDebug, "AuditUtilPersistedNewAuditState", auditId)
.detail("AuditKey", auditKey(auditState.getType(), auditId));
break;
} catch (Error& e) {
TraceEvent(SevDebug, "AuditUtilPersistedNewAuditStateError", auditId)
.errorUnsuppressed(e)
.detail("AuditKey", auditKey(auditState.getType(), auditId));
// NOTE(review): presumably tr.onError rethrows errors it cannot retry, which is how
// control reaches the outer catch below — confirm against Transaction::onError
wait(tr.onError(e));
}
}
} catch (Error& e) {
TraceEvent(SevWarn, "AuditUtilPersistedNewAuditStateUnretriableError", auditId)
.errorUnsuppressed(e)
.detail("AuditKey", auditKey(auditState.getType(), auditId));
// Only cancellation and a lost move keys lock are expected to end up here
ASSERT_WE_THINK(e.code() == error_code_actor_cancelled || e.code() == error_code_movekeys_conflict);
if (e.code() == error_code_actor_cancelled) {
throw e;
} else {
// Every other error is reported to the caller as a single error type
throw persist_new_audit_metadata_error();
}
}
return auditId;
}
// Persists the final state (Complete, Failed or Error) of an existing audit.
// In one transaction: checks/writes the move keys lock, clears the audit's progress metadata
// if the new phase is Complete, verifies the audit still exists and has not been marked Failed,
// then overwrites the stored audit state.
// Throws audit_storage_cancelled if the audit key is missing or the stored phase is Failed
// (the error goes through tr.onError like any other error raised in the retry loop).
// `context` is only used for tracing.
ACTOR Future<Void> persistAuditState(Database cx,
                                     AuditStorageState auditState,
                                     std::string context,
                                     MoveKeyLockInfo lock,
                                     bool ddEnabled) {
	state Transaction tr(cx);
	state AuditPhase auditPhase = auditState.getPhase();
	ASSERT(auditPhase == AuditPhase::Complete || auditPhase == AuditPhase::Failed || auditPhase == AuditPhase::Error);
	loop {
		try {
			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			wait(checkMoveKeysLockForAudit(&tr, lock, ddEnabled, true));

			// Progress metadata is only dropped for Complete audits;
			// Failed and Error audits keep theirs for further investigations
			if (auditPhase == AuditPhase::Complete) {
				clearAuditProgressMetadata(&tr, auditState.getType(), auditState.id);
			}

			// Check existing state
			Optional<Value> stored = wait(tr.get(auditKey(auditState.getType(), auditState.id)));
			if (!stored.present()) { // has been cancelled
				throw audit_storage_cancelled();
			}
			const AuditStorageState currentState = decodeAuditStorageState(stored.get());
			ASSERT(currentState.id == auditState.id && currentState.getType() == auditState.getType());
			if (currentState.getPhase() == AuditPhase::Failed) {
				throw audit_storage_cancelled();
			}

			// Persist audit result
			tr.set(auditKey(auditState.getType(), auditState.id), auditStorageStateValue(auditState));
			wait(tr.commit());
			TraceEvent(SevInfo, "AuditUtilPersistAuditState", auditState.id)
			    .detail("AuditID", auditState.id)
			    .detail("AuditType", auditState.getType())
			    .detail("AuditPhase", auditPhase)
			    .detail("AuditKey", auditKey(auditState.getType(), auditState.id))
			    .detail("Context", context);
			break;
		} catch (Error& e) {
			TraceEvent(SevWarn, "AuditUtilPersistAuditStateError", auditState.id)
			    .errorUnsuppressed(e)
			    .detail("AuditID", auditState.id)
			    .detail("AuditType", auditState.getType())
			    .detail("AuditPhase", auditPhase)
			    .detail("AuditKey", auditKey(auditState.getType(), auditState.id))
			    .detail("Context", context);
			wait(tr.onError(e));
		}
	}
	return Void();
}
// Reads and decodes the audit state stored under auditKey(type, id).
// The read is retried via tr.onError on error.
// Throws key_not_found if nothing is stored under that key.
ACTOR Future<AuditStorageState> getAuditState(Database cx, AuditType type, UID id) {
	state Transaction tr(cx);
	state Optional<Value> storedValue;
	loop {
		try {
			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			Optional<Value> readValue = wait(tr.get(auditKey(type, id)));
			// Copy into the state variable so the value outlives the retry loop
			storedValue = readValue;
			TraceEvent(SevDebug, "AuditUtilReadAuditState", id)
			    .detail("AuditID", id)
			    .detail("AuditType", type)
			    .detail("AuditKey", auditKey(type, id));
			break;
		} catch (Error& e) {
			TraceEvent(SevDebug, "AuditUtilReadAuditStateError", id)
			    .errorUnsuppressed(e)
			    .detail("AuditID", id)
			    .detail("AuditType", type)
			    .detail("AuditKey", auditKey(type, id));
			wait(tr.onError(e));
		}
	}
	if (!storedValue.present()) {
		throw key_not_found();
	}
	return decodeAuditStorageState(storedValue.get());
}
// Persists the progress of an audit task for auditState.range into the range-based progress
// space of the audit, provided the audit-level state stored by DD still allows it:
//   - audit key missing or stored phase Failed: throws audit_storage_cancelled
//   - stored ddId differs from auditState.ddId: throws audit_storage_task_outdated
//   - stored phase Complete: returns silently without writing
// Errors raised inside the retry loop go through tr.onError.
ACTOR Future<Void> persistAuditStateByRange(Database cx, AuditStorageState auditState) {
	state Transaction tr(cx);
	loop {
		try {
			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			Optional<Value> storedDDState = wait(tr.get(auditKey(auditState.getType(), auditState.id)));
			if (!storedDDState.present()) {
				throw audit_storage_cancelled();
			}
			AuditStorageState ddAuditState = decodeAuditStorageState(storedDDState.get());
			ASSERT(ddAuditState.ddId.isValid());
			if (ddAuditState.ddId != auditState.ddId) {
				throw audit_storage_task_outdated(); // a new dd starts and this audit task is outdated
			}
			const auto ddPhase = ddAuditState.getPhase();
			// It is possible ddAuditState is complete while some progress is about to persist
			// Since doAuditOnStorageServer may repeatedly issue multiple requests (see getReplyUnlessFailedFor)
			// For this case, no need to proceed. Silently exit
			if (ddPhase == AuditPhase::Complete) {
				break;
			}
			// If this is the same dd, the phase must be one of the following
			ASSERT(ddPhase == AuditPhase::Running || ddPhase == AuditPhase::Failed);
			if (ddPhase == AuditPhase::Failed) {
				throw audit_storage_cancelled();
			}
			wait(krmSetRange(&tr,
			                 auditRangeBasedProgressPrefixFor(auditState.getType(), auditState.id),
			                 auditState.range,
			                 auditStorageStateValue(auditState)));
			wait(tr.commit());
			break;
		} catch (Error& e) {
			TraceEvent(SevDebug, "AuditUtilPersistAuditStateByRangeError")
			    .errorUnsuppressed(e)
			    .detail("AuditID", auditState.id)
			    .detail("AuditType", auditState.getType())
			    .detail("AuditPhase", auditState.getPhase());
			wait(tr.onError(e));
		}
	}
	return Void();
}
// Reads the range-based progress of audit (type, auditId) within `range` and returns one
// AuditStorageState per consecutive pair of boundaries read from the progress key-range map.
// A sub-range with a stored value gets the stored phase and error; a sub-range without a stored
// value keeps the state as constructed from (auditId, range, type), which the callers treat as
// not yet audited (Invalid phase).
// The read is bounded by KRM_GET_RANGE_LIMIT / KRM_GET_RANGE_LIMIT_BYTES.
ACTOR Future<std::vector<AuditStorageState>> getAuditStateByRange(Database cx,
                                                                  AuditType type,
                                                                  UID auditId,
                                                                  KeyRange range) {
	state RangeResult boundaries;
	state Transaction tr(cx);
	loop {
		try {
			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			RangeResult readResult = wait(krmGetRanges(&tr,
			                                           auditRangeBasedProgressPrefixFor(type, auditId),
			                                           range,
			                                           CLIENT_KNOBS->KRM_GET_RANGE_LIMIT,
			                                           CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES));
			boundaries = readResult;
			break;
		} catch (Error& e) {
			TraceEvent(SevDebug, "AuditUtilGetAuditStateForRangeError").errorUnsuppressed(e).detail("AuditID", auditId);
			wait(tr.onError(e));
		}
	}
	// Entry i describes the sub-range [boundaries[i].key, boundaries[i + 1].key):
	//  - with a value: the progress persisted by an audit server for that sub-range
	//  - without a value: no progress yet (DD will start audit for these ranges)
	std::vector<AuditStorageState> progress;
	for (int idx = 0; idx + 1 < boundaries.size(); ++idx) {
		KeyRange subRange = KeyRangeRef(boundaries[idx].key, boundaries[idx + 1].key);
		AuditStorageState subState(auditId, subRange, type);
		if (!boundaries[idx].value.empty()) {
			AuditStorageState persisted = decodeAuditStorageState(boundaries[idx].value);
			subState.setPhase(persisted.getPhase());
			subState.error = persisted.error;
		}
		progress.push_back(subState);
	}
	return progress;
}
// Persists the progress of an audit task for auditState.range into the server-based progress
// space of (audit type, audit id, auditState.auditServerId), provided the audit-level state
// stored by DD still allows it:
//   - audit key missing or stored phase Failed: throws audit_storage_cancelled
//   - stored ddId differs from auditState.ddId: throws audit_storage_task_outdated
//   - stored phase Complete: returns silently without writing
// Errors raised inside the retry loop go through tr.onError.
ACTOR Future<Void> persistAuditStateByServer(Database cx, AuditStorageState auditState) {
	state Transaction tr(cx);
	loop {
		try {
			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			Optional<Value> ddAuditState_ = wait(tr.get(auditKey(auditState.getType(), auditState.id)));
			if (!ddAuditState_.present()) {
				throw audit_storage_cancelled();
			}
			AuditStorageState ddAuditState = decodeAuditStorageState(ddAuditState_.get());
			ASSERT(ddAuditState.ddId.isValid());
			if (ddAuditState.ddId != auditState.ddId) {
				throw audit_storage_task_outdated(); // a new dd starts and this audit task is outdated
			}
			// It is possible ddAuditState is complete while some progress is about to persist
			// Since doAuditOnStorageServer may repeatedly issue multiple requests (see getReplyUnlessFailedFor)
			// For this case, no need to proceed. Silently exit
			if (ddAuditState.getPhase() == AuditPhase::Complete) {
				break;
			}
			// If this is the same dd, the phase must be one of the following
			ASSERT(ddAuditState.getPhase() == AuditPhase::Running || ddAuditState.getPhase() == AuditPhase::Failed);
			if (ddAuditState.getPhase() == AuditPhase::Failed) {
				throw audit_storage_cancelled();
			}
			wait(krmSetRange(
			    &tr,
			    auditServerBasedProgressPrefixFor(auditState.getType(), auditState.id, auditState.auditServerId),
			    auditState.range,
			    auditStorageStateValue(auditState)));
			wait(tr.commit());
			break;
		} catch (Error& e) {
			// Use a ByServer-specific event name so that failures of this actor can be told apart
			// from those of persistAuditStateByRange in the trace logs
			TraceEvent(SevDebug, "AuditUtilPersistAuditStateByServerError")
			    .errorUnsuppressed(e)
			    .detail("AuditID", auditState.id)
			    .detail("AuditType", auditState.getType())
			    .detail("AuditPhase", auditState.getPhase())
			    .detail("AuditServerID", auditState.auditServerId);
			wait(tr.onError(e));
		}
	}
	return Void();
}
// Reads the server-based progress of audit (type, auditId) recorded for auditServerId within
// `range` and returns one AuditStorageState per consecutive pair of boundaries read from the
// progress key-range map.
// A sub-range with a stored value gets the stored phase and error; a sub-range without a stored
// value keeps the state as constructed from (auditId, range, type), which the callers treat as
// not yet audited (Invalid phase).
// The read is bounded by KRM_GET_RANGE_LIMIT / KRM_GET_RANGE_LIMIT_BYTES.
ACTOR Future<std::vector<AuditStorageState>> getAuditStateByServer(Database cx,
                                                                   AuditType type,
                                                                   UID auditId,
                                                                   UID auditServerId,
                                                                   KeyRange range) {
	state RangeResult boundaries;
	state Transaction tr(cx);
	loop {
		try {
			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			RangeResult readResult = wait(krmGetRanges(&tr,
			                                           auditServerBasedProgressPrefixFor(type, auditId, auditServerId),
			                                           range,
			                                           CLIENT_KNOBS->KRM_GET_RANGE_LIMIT,
			                                           CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES));
			boundaries = readResult;
			break;
		} catch (Error& e) {
			TraceEvent(SevDebug, "AuditUtilGetAuditStateForRangeError")
			    .errorUnsuppressed(e)
			    .detail("AuditID", auditId)
			    .detail("AuditType", type)
			    .detail("AuditServerID", auditServerId);
			wait(tr.onError(e));
		}
	}
	// Entry i describes the sub-range [boundaries[i].key, boundaries[i + 1].key):
	//  - with a value: the progress persisted by an audit server for that sub-range
	//  - without a value: no progress yet (DD will start audit for these ranges)
	std::vector<AuditStorageState> progress;
	for (int idx = 0; idx + 1 < boundaries.size(); ++idx) {
		KeyRange subRange = KeyRangeRef(boundaries[idx].key, boundaries[idx + 1].key);
		AuditStorageState subState(auditId, subRange, type);
		if (!boundaries[idx].value.empty()) {
			AuditStorageState persisted = decodeAuditStorageState(boundaries[idx].value);
			subState.setPhase(persisted.getPhase());
			subState.error = persisted.error;
		}
		progress.push_back(subState);
	}
	return progress;
}
// Returns true iff every sub-range of auditRange has persisted range-based progress, i.e. no
// sub-range reported by getAuditStateByRange is in the Invalid phase; returns false as soon as
// one such sub-range is found.
// Only valid for ValidateHA, ValidateReplica and ValidateLocationMetadata audits.
// auditRange is scanned in batches, each batch starting where the previous one ended.
// Any error other than actor_cancelled is retried after a 0.5s delay; the retry counter is
// shared by the whole scan and audit_storage_failed is thrown once it exceeds 30.
ACTOR Future<bool> checkAuditProgressCompleteByRange(Database cx,
                                                     AuditType auditType,
                                                     UID auditId,
                                                     KeyRange auditRange) {
	ASSERT(auditType == AuditType::ValidateHA || auditType == AuditType::ValidateReplica ||
	       auditType == AuditType::ValidateLocationMetadata);
	state KeyRange remainingRange = auditRange;
	state Key nextBegin = auditRange.begin;
	state int retryCount = 0;
	while (nextBegin < auditRange.end) {
		loop {
			try {
				remainingRange = KeyRangeRef(nextBegin, auditRange.end);
				state std::vector<AuditStorageState> batch =
				    wait(getAuditStateByRange(cx, auditType, auditId, remainingRange));
				for (const auto& rangeState : batch) {
					if (rangeState.getPhase() != AuditPhase::Invalid) {
						continue;
					}
					// A sub-range without persisted progress: the audit is not finished
					TraceEvent(SevWarn, "AuditUtilCheckAuditProgressNotFinished")
					    .detail("AuditID", auditId)
					    .detail("AuditRange", auditRange)
					    .detail("AuditType", auditType)
					    .detail("UnfinishedRange", rangeState.range);
					return false;
				}
				// Continue the scan right after the last sub-range of this batch
				nextBegin = batch.back().range.end;
				break;
			} catch (Error& e) {
				if (e.code() == error_code_actor_cancelled) {
					throw e;
				}
				if (retryCount > 30) {
					TraceEvent(SevWarn, "AuditUtilCheckAuditProgressFailed")
					    .detail("AuditID", auditId)
					    .detail("AuditRange", auditRange)
					    .detail("AuditType", auditType);
					throw audit_storage_failed();
				}
				wait(delay(0.5));
				retryCount++;
			}
		}
	}
	TraceEvent(SevInfo, "AuditUtilCheckAuditProgressFinish")
	    .detail("AuditID", auditId)
	    .detail("AuditRange", auditRange)
	    .detail("AuditType", auditType);
	return true;
}
// Returns true iff every sub-range of auditRange has persisted server-based progress for
// serverId, i.e. no sub-range reported by getAuditStateByServer is in the Invalid phase; returns
// false as soon as one such sub-range is found.
// Only valid for ValidateStorageServerShard audits.
// checkProgressBudget is incremented by one on every exit path handled here (true, false, or
// audit_storage_failed); it is not touched when actor_cancelled is rethrown.
// Any error other than actor_cancelled is retried after a 0.5s delay; the retry counter is
// shared by the whole scan and audit_storage_failed is thrown once it exceeds 30.
ACTOR Future<bool> checkAuditProgressCompleteByServer(Database cx,
                                                      AuditType auditType,
                                                      UID auditId,
                                                      KeyRange auditRange,
                                                      UID serverId,
                                                      std::shared_ptr<AsyncVar<int>> checkProgressBudget) {
	ASSERT(auditType == AuditType::ValidateStorageServerShard);
	state KeyRange remainingRange = auditRange;
	state Key nextBegin = auditRange.begin;
	state int retryCount = 0;
	while (nextBegin < auditRange.end) {
		loop {
			try {
				remainingRange = KeyRangeRef(nextBegin, auditRange.end);
				state std::vector<AuditStorageState> batch =
				    wait(getAuditStateByServer(cx, auditType, auditId, serverId, remainingRange));
				for (const auto& rangeState : batch) {
					if (rangeState.getPhase() != AuditPhase::Invalid) {
						continue;
					}
					// A sub-range without persisted progress: the audit is not finished
					TraceEvent(SevWarn, "AuditUtilCheckAuditProgressNotFinished")
					    .detail("ServerID", serverId)
					    .detail("AuditID", auditId)
					    .detail("AuditRange", auditRange)
					    .detail("AuditType", auditType)
					    .detail("UnfinishedRange", rangeState.range);
					checkProgressBudget->set(checkProgressBudget->get() + 1);
					return false;
				}
				// Continue the scan right after the last sub-range of this batch
				nextBegin = batch.back().range.end;
				break;
			} catch (Error& e) {
				if (e.code() == error_code_actor_cancelled) {
					throw e;
				}
				if (retryCount > 30) {
					TraceEvent(SevWarn, "AuditUtilCheckAuditProgressFailed")
					    .detail("ServerID", serverId)
					    .detail("AuditID", auditId)
					    .detail("AuditRange", auditRange)
					    .detail("AuditType", auditType);
					checkProgressBudget->set(checkProgressBudget->get() + 1);
					throw audit_storage_failed();
				}
				wait(delay(0.5));
				retryCount++;
			}
		}
	}
	checkProgressBudget->set(checkProgressBudget->get() + 1);
	TraceEvent(SevInfo, "AuditUtilCheckAuditProgressFinish")
	    .detail("ServerID", serverId)
	    .detail("AuditID", auditId)
	    .detail("AuditRange", auditRange)
	    .detail("AuditType", auditType);
	return true;
}
// Loads all persisted audit states in one transaction and prepares them for this data distributor:
//   - every Running audit state is rewritten with ddId = dataDistributorId
//   - per audit type, the oldest finished (Complete/Failed) audits are cleared so that at most
//     persistFinishAuditCount of them remain; Failed audits also get their progress metadata cleared
// Returns the Running audit states to resume. The returned states are the ones decoded from
// storage, i.e. they still carry the ddId that was stored, not dataDistributorId.
// The move keys lock is checked (and written, isWrite=true) in the same transaction.
// Throws actor_cancelled and movekeys_conflict; other errors are retried (see the catch block).
ACTOR Future<std::vector<AuditStorageState>> initAuditMetadata(Database cx,
MoveKeyLockInfo lock,
bool ddEnabled,
UID dataDistributorId,
int persistFinishAuditCount) {
state std::unordered_map<AuditType, std::vector<AuditStorageState>> existingAuditStates;
state std::vector<AuditStorageState> auditStatesToResume;
state Transaction tr(cx);
state int retryCount = 0;
loop {
try {
// Load existing audit states and update ddId in audit states
// Both collections are rebuilt from scratch on every attempt
existingAuditStates.clear();
auditStatesToResume.clear();
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
wait(checkMoveKeysLockForAudit(&tr, lock, ddEnabled, true));
RangeResult result = wait(tr.getRange(auditKeys, CLIENT_KNOBS->TOO_MANY));
// An incomplete read is only reported (SevError in simulation); processing continues
// with whatever was read
if (result.more || result.size() >= CLIENT_KNOBS->TOO_MANY) {
TraceEvent(g_network->isSimulated() ? SevError : SevWarnAlways,
"AuditUtilLoadMetadataIncomplete",
dataDistributorId)
.detail("ResMore", result.more)
.detail("ResSize", result.size());
}
for (int i = 0; i < result.size(); ++i) {
auto auditState = decodeAuditStorageState(result[i].value);
TraceEvent(SevVerbose, "AuditUtilLoadMetadataEach", dataDistributorId)
.detail("CurrentDDID", dataDistributorId)
.detail("AuditDDID", auditState.ddId)
.detail("AuditType", auditState.getType())
.detail("AuditID", auditState.id)
.detail("AuditPhase", auditState.getPhase());
if (auditState.getPhase() == AuditPhase::Running) {
// Hand the running audit over to this DD by persisting the new ddId
AuditStorageState toUpdate = auditState;
toUpdate.ddId = dataDistributorId;
tr.set(auditKey(toUpdate.getType(), toUpdate.id), auditStorageStateValue(toUpdate));
}
// Note: the unmodified decoded state (with the stored ddId) is what gets collected
existingAuditStates[auditState.getType()].push_back(auditState);
}
// Cleanup Complete/Failed audit metadata for each type separately
for (const auto& [auditType, _] : existingAuditStates) {
int numFinishAudit = 0; // "finish" audits include Complete/Failed audits
for (const auto& auditState : existingAuditStates[auditType]) {
if (auditState.getPhase() == AuditPhase::Complete || auditState.getPhase() == AuditPhase::Failed) {
numFinishAudit++;
}
}
// Non-positive when there are no more finished audits than we want to keep,
// in which case nothing is cleared below
const int numFinishAuditsToClear = numFinishAudit - persistFinishAuditCount;
int numFinishAuditsCleared = 0;
std::sort(existingAuditStates[auditType].begin(),
existingAuditStates[auditType].end(),
[](AuditStorageState a, AuditStorageState b) {
return a.id < b.id; // In-place sort in ascending order of audit id
});
// Walk the audits from the smallest id: clear finished ones until the quota is reached,
// and collect Running ones to resume
for (const auto& auditState : existingAuditStates[auditType]) {
if (auditState.getPhase() == AuditPhase::Failed) {
if (numFinishAuditsCleared < numFinishAuditsToClear) {
// Clear both audit metadata and corresponding progress metadata
tr.clear(auditKey(auditState.getType(), auditState.id));
clearAuditProgressMetadata(&tr, auditState.getType(), auditState.id);
numFinishAuditsCleared++;
TraceEvent(SevInfo, "AuditUtilMetadataCleared", dataDistributorId)
.detail("AuditID", auditState.id)
.detail("AuditType", auditState.getType())
.detail("AuditRange", auditState.range);
}
} else if (auditState.getPhase() == AuditPhase::Complete) {
if (numFinishAuditsCleared < numFinishAuditsToClear) {
// Clear audit metadata only
// No need to clear the corresponding progress metadata
// since it has been cleared for Complete audits
tr.clear(auditKey(auditState.getType(), auditState.id));
numFinishAuditsCleared++;
TraceEvent(SevInfo, "AuditUtilMetadataCleared", dataDistributorId)
.detail("AuditID", auditState.id)
.detail("AuditType", auditState.getType())
.detail("AuditRange", auditState.range);
}
} else if (auditState.getPhase() == AuditPhase::Running) {
auditStatesToResume.push_back(auditState);
TraceEvent(SevInfo, "AuditUtilMetadataAddedToResume", dataDistributorId)
.detail("AuditID", auditState.id)
.detail("AuditType", auditState.getType())
.detail("AuditRange", auditState.range);
}
}
}
wait(tr.commit());
break;
} catch (Error& e) {
// Cancellation and a lost move keys lock are reported to the caller as-is
if (e.code() == error_code_actor_cancelled || e.code() == error_code_movekeys_conflict) {
throw e;
}
// NOTE(review): when the retry budget is exhausted the loop is left without a successful
// commit, and auditStatesToResume is returned as populated by the failed attempt (possibly
// partially filled or empty) — confirm callers tolerate this
if (retryCount > 50) {
TraceEvent(SevWarnAlways, "AuditUtilInitAuditMetadataExceedRetryMax", dataDistributorId)
.errorUnsuppressed(e);
break;
}
try {
wait(tr.onError(e));
} catch (Error& e) {
// retryCount only grows when tr.onError itself throws; in that case the transaction
// is reset and the whole load is attempted again
retryCount++;
tr.reset();
}
}
}
return auditStatesToResume;
}
// Check if any pair of ranges are exclusive with each other
// This is not a part in consistency check of audit metadata
// This is used for checking the validity of inputs to rangesSame()
bool elementsAreExclusiveWithEachOther(std::vector<KeyRange> ranges) {
ASSERT(std::is_sorted(ranges.begin(), ranges.end(), KeyRangeRef::ArbitraryOrder()));
for (int i = 0; i < ranges.size() - 1; ++i) {
if (ranges[i].end > ranges[i + 1].begin) {
TraceEvent(SevError, "AuditUtilElementsAreNotExclusiveWithEachOther").detail("Ranges", describe(ranges));
return false;
}
}
return true;
}
// Check if any range is empty in the given list of ranges
// This is not a part in consistency check of audit metadata
// This is used for checking the validity of inputs to rangesSame()
bool noEmptyRangeInRanges(std::vector<KeyRange> ranges) {
for (const auto& range : ranges) {
if (range.empty()) {
return false;
}
}
return true;
}
// Given a list of ranges, where ranges can overlap with each other
// Return a list of exclusive ranges which covers the ranges exactly
// the same as the input list of ranges
std::vector<KeyRange> coalesceRangeList(std::vector<KeyRange> ranges) {
std::sort(ranges.begin(), ranges.end(), [](KeyRange a, KeyRange b) { return a.begin < b.begin; });
std::vector<KeyRange> res;
for (const auto& range : ranges) {
if (res.empty()) {
res.push_back(range);
continue;
}
if (range.begin <= res.back().end) {
if (range.end > res.back().end) { // update res.back if current range extends the back range
KeyRange newBack = Standalone(KeyRangeRef(res.back().begin, range.end));
res.pop_back();
res.push_back(newBack);
}
} else {
res.push_back(range);
}
}
return res;
}
// Given two lists of ranges --- rangesA and rangesB, check if the two lists cover the same keys.
// The two lists may be split at different boundaries: e.g. { [a, c) } and { [a, b), [b, c) }
// are treated as the same.
// Input: each list is expected to contain no empty range and no overlapping ranges. This is
// asserted in simulation only. Both lists are sorted by begin key inside this function.
// Return: an empty Optional if there is no mismatch. Otherwise, a pair of the mismatched ranges
// of rangesA and rangesB respectively. When exactly one list is empty, the pair contains a
// default-constructed range for the empty list and the first range of the other list.
Optional<std::pair<KeyRange, KeyRange>> rangesSame(std::vector<KeyRange> rangesA, std::vector<KeyRange> rangesB) {
	if (g_network->isSimulated()) {
		ASSERT(noEmptyRangeInRanges(rangesA));
		ASSERT(noEmptyRangeInRanges(rangesB));
	}
	KeyRange emptyRange;
	if (rangesA.empty() && rangesB.empty()) { // no mismatch
		return Optional<std::pair<KeyRange, KeyRange>>();
	} else if (rangesA.empty() && !rangesB.empty()) { // rangesA is empty while rangesB has a range
		return std::make_pair(emptyRange, rangesB.front());
	} else if (!rangesA.empty() && rangesB.empty()) { // rangesB is empty while rangesA has a range
		return std::make_pair(rangesA.front(), emptyRange);
	}
	TraceEvent(SevVerbose, "AuditUtilRangesSameBeforeSort").detail("RangesA", rangesA).detail("Rangesb", rangesB);
	// sort in ascending order of the begin key
	std::sort(rangesA.begin(), rangesA.end(), [](const KeyRange& a, const KeyRange& b) { return a.begin < b.begin; });
	std::sort(rangesB.begin(), rangesB.end(), [](const KeyRange& a, const KeyRange& b) { return a.begin < b.begin; });
	TraceEvent(SevVerbose, "AuditUtilRangesSameAfterSort").detail("RangesA", rangesA).detail("Rangesb", rangesB);
	if (g_network->isSimulated()) {
		ASSERT(elementsAreExclusiveWithEachOther(rangesA));
		ASSERT(elementsAreExclusiveWithEachOther(rangesB));
	}
	if (rangesA.front().begin != rangesB.front().begin) { // rangeList heads mismatch
		return std::make_pair(rangesA.front(), rangesB.front());
	} else if (rangesA.back().end != rangesB.back().end) { // rangeList backs mismatch
		return std::make_pair(rangesA.back(), rangesB.back());
	}
	// Walk both lists in lockstep.
	// rangeA/rangeB: the not-yet-matched remainder of the current element of each list.
	// lastRangeA/lastRangeB: the complete current element of each list, reported on mismatch.
	size_t ia = 0;
	size_t ib = 0;
	KeyRangeRef rangeA = rangesA[0];
	KeyRangeRef rangeB = rangesB[0];
	KeyRange lastRangeA = Standalone(rangeA);
	KeyRange lastRangeB = Standalone(rangeB);
	while (true) {
		if (rangeA.begin != rangeB.begin) {
			// One list has a gap or extra coverage that the other does not have
			return std::make_pair(lastRangeA, lastRangeB);
		}
		// The bounds checks below can only trigger when an input list has overlapping ranges,
		// which is asserted against in simulation only. Outside simulation, report the current
		// pair as a mismatch instead of indexing past the end of a list.
		if (rangeA.end == rangeB.end) {
			if (rangeA.end == rangesA.back().end) {
				break; // reached the common end of both lists
			}
			if (ia + 1 >= rangesA.size() || ib + 1 >= rangesB.size()) {
				return std::make_pair(lastRangeA, lastRangeB);
			}
			++ia;
			++ib;
			rangeA = rangesA[ia];
			rangeB = rangesB[ib];
			lastRangeA = Standalone(rangeA);
			lastRangeB = Standalone(rangeB);
		} else if (rangeA.end > rangeB.end) {
			if (ib + 1 >= rangesB.size()) {
				return std::make_pair(lastRangeA, lastRangeB);
			}
			// rangeB is fully matched; keep the unmatched tail of rangeA and move to the next B
			rangeA = KeyRangeRef(rangeB.end, rangeA.end);
			++ib;
			rangeB = rangesB[ib];
			lastRangeB = Standalone(rangeB);
		} else {
			if (ia + 1 >= rangesA.size()) {
				return std::make_pair(lastRangeA, lastRangeB);
			}
			// rangeA is fully matched; keep the unmatched tail of rangeB and move to the next A
			rangeB = KeyRangeRef(rangeA.end, rangeB.end);
			++ia;
			rangeA = rangesA[ia];
			lastRangeA = Standalone(rangeA);
		}
	}
	return Optional<std::pair<KeyRange, KeyRange>>();
}
// Given an input server, get ranges within the input range via the input transaction
// from the perspective of ServerKeys system key space
// Input: (1) SS id; (2) transaction tr; (3) within range
// Return AuditGetServerKeysRes, including: (1) complete range by a single read range;
// (2) version of the read; (3) ranges of the input SS
// Throws audit_storage_cancelled if the read version of tr is not immediately available
// after the read. Any error raised inside is logged at SevDebug and rethrown to the caller.
ACTOR Future<AuditGetServerKeysRes> getThisServerKeysFromServerKeys(UID serverID, Transaction* tr, KeyRange range) {
	state RangeResult readResult;
	state AuditGetServerKeysRes res;
	try {
		// A single, size-limited read of the server's key map within the input range
		wait(store(readResult,
		           krmGetRanges(tr,
		                        serverKeysPrefixFor(serverID),
		                        range,
		                        CLIENT_KNOBS->KRM_GET_RANGE_LIMIT,
		                        CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES)));
		// The read version future is required to be ready at this point without waiting
		// (presumably because the read above already acquired it); otherwise give up
		Future<Version> grvF = tr->getReadVersion();
		if (!grvF.isReady()) {
			TraceEvent(SevWarnAlways, "AuditUtilReadServerKeysGRVError", serverID);
			throw audit_storage_cancelled();
		}
		Version readAtVersion = grvF.get();
		TraceEvent(SevVerbose, "AuditUtilGetThisServerKeysFromServerKeysReadDone", serverID)
		    .detail("Range", range)
		    .detail("Prefix", serverKeysPrefixFor(serverID))
		    .detail("ResultSize", readResult.size())
		    .detail("AuditServerID", serverID);
		// Each pair of consecutive keys in readResult bounds one range, and the value stored at
		// the first key of the pair tells whether the server has that range
		std::vector<KeyRange> ownRanges;
		for (int i = 0; i < readResult.size() - 1; ++i) {
			TraceEvent(SevVerbose, "AuditUtilGetThisServerKeysFromServerKeysAddToResult", serverID)
			    .detail("ValueIsServerKeysFalse", readResult[i].value == serverKeysFalse)
			    .detail("ServerHasKey", serverHasKey(readResult[i].value))
			    .detail("Range", KeyRangeRef(readResult[i].key, readResult[i + 1].key))
			    .detail("AuditServerID", serverID);
			if (serverHasKey(readResult[i].value)) {
				ownRanges.push_back(Standalone(KeyRangeRef(readResult[i].key, readResult[i + 1].key)));
			}
		}
		// The read may stop before range.end, so the range completed by this read ends at the last returned key
		const KeyRange completeRange = Standalone(KeyRangeRef(range.begin, readResult.back().key));
		TraceEvent(SevVerbose, "AuditUtilGetThisServerKeysFromServerKeysEnd", serverID)
		    .detail("AuditServerID", serverID)
		    .detail("Range", range)
		    .detail("Prefix", serverKeysPrefixFor(serverID))
		    .detail("ReadAtVersion", readAtVersion)
		    .detail("CompleteRange", completeRange)
		    .detail("ResultSize", ownRanges.size());
		res = AuditGetServerKeysRes(completeRange, readAtVersion, serverID, ownRanges, readResult.logicalSize());
	} catch (Error& e) {
		TraceEvent(SevDebug, "AuditUtilGetThisServerKeysError", serverID)
		    .errorUnsuppressed(e)
		    .detail("AuditServerID", serverID);
		throw e;
	}
	return res;
}
// Read the KeyServers system key space within the input range via the input transaction and
// build, for every storage server found there, the list of ranges associated with it.
// Input: (1) Audit Server ID (used for logging only); (2) transaction tr; (3) within range
// Return AuditGetKeyServersRes, including: (1) complete range covered by this single read;
// (2) version of the read; (3) map from each SS to its ranges --- in KeyServers space,
// one range corresponds to multiple SSes (both source and destination servers are included)
// Throws audit_storage_cancelled if the server tag list cannot be read completely or if the
// read version is not immediately available. Any error is logged at SevDebug and rethrown.
ACTOR Future<AuditGetKeyServersRes> getShardMapFromKeyServers(UID auditServerId, Transaction* tr, KeyRange range) {
	state AuditGetKeyServersRes shardMapRes;
	state std::vector<Future<Void>> readActors;
	state RangeResult keyServersKVs;
	state RangeResult serverTagKVs;
	state int64_t shardCount = 0;
	state int64_t anonymousShardCount = 0;
	try {
		// Issue the KeyServers read and the server tag read together, then wait for both
		readActors.push_back(store(keyServersKVs,
		                           krmGetRanges(tr,
		                                        keyServersPrefix,
		                                        range,
		                                        CLIENT_KNOBS->KRM_GET_RANGE_LIMIT,
		                                        CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES)));
		readActors.push_back(store(serverTagKVs, tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY)));
		wait(waitForAll(readActors));

		// The tag list is needed in full to decode the KeyServers values
		const bool tagListIncomplete = serverTagKVs.more || serverTagKVs.size() >= CLIENT_KNOBS->TOO_MANY;
		if (tagListIncomplete) {
			TraceEvent(g_network->isSimulated() ? SevError : SevWarnAlways,
			           "AuditUtilReadKeyServersReadTagError",
			           auditServerId);
			throw audit_storage_cancelled();
		}

		// The read version future must be ready without waiting; otherwise give up
		Future<Version> readVersionF = tr->getReadVersion();
		if (!readVersionF.isReady()) {
			TraceEvent(SevWarnAlways, "AuditUtilReadKeyServersGRVError", auditServerId);
			throw audit_storage_cancelled();
		}
		Version readAtVersion = readVersionF.get();
		TraceEvent(SevVerbose, "AuditUtilGetThisServerKeysFromKeyServersReadDone", auditServerId)
		    .detail("Range", range)
		    .detail("ResultSize", keyServersKVs.size())
		    .detail("AuditServerID", auditServerId);

		// Produce the result: every pair of consecutive keys bounds one shard, and the value
		// stored at the first key of the pair encodes the servers of that shard
		std::unordered_map<UID, std::vector<KeyRange>> rangesByServer;
		const int boundaryCount = keyServersKVs.size();
		for (int idx = 0; idx + 1 < boundaryCount; ++idx) {
			std::vector<UID> srcServers;
			std::vector<UID> destServers;
			UID srcShardId;
			UID destShardId;
			decodeKeyServersValue(
			    serverTagKVs, keyServersKVs[idx].value, srcServers, destServers, srcShardId, destShardId);
			++shardCount;
			if (srcShardId == anonymousShardId) {
				++anonymousShardCount;
			}
			// Combine source and destination servers into a single list
			std::vector<UID> shardServers(srcServers.size() + destServers.size());
			std::merge(
			    srcServers.begin(), srcServers.end(), destServers.begin(), destServers.end(), shardServers.begin());
			const KeyRangeRef shardBounds = KeyRangeRef(keyServersKVs[idx].key, keyServersKVs[idx + 1].key);
			for (auto& ssid : shardServers) {
				rangesByServer[ssid].push_back(Standalone(shardBounds));
			}
		}

		// The read may stop before range.end, so the range completed by this read ends at the last returned key
		const KeyRange completeRange = Standalone(KeyRangeRef(range.begin, keyServersKVs.back().key));
		TraceEvent(SevInfo, "AuditUtilGetThisServerKeysFromKeyServersEnd", auditServerId)
		    .detail("Range", range)
		    .detail("CompleteRange", completeRange)
		    .detail("AtVersion", readAtVersion)
		    .detail("ShardsInAnonymousPhysicalShardCount", anonymousShardCount)
		    .detail("TotalShardsCount", shardCount);
		shardMapRes =
		    AuditGetKeyServersRes(completeRange, readAtVersion, rangesByServer, keyServersKVs.logicalSize());
	} catch (Error& e) {
		TraceEvent(SevDebug, "AuditUtilGetThisServerKeysFromKeyServersError", auditServerId)
		    .errorUnsuppressed(e)
		    .detail("AuditServerId", auditServerId);
		throw e;
	}
	return shardMapRes;
}