mirror of
https://github.com/apple/foundationdb.git
synced 2025-05-31 10:14:52 +08:00
support reads from range feeds that span multiple storage servers
This commit is contained in:
parent
a278d2977a
commit
27151b0831
@ -6579,6 +6579,115 @@ Future<Standalone<VectorRef<MutationsAndVersionRef>>> DatabaseContext::getChange
|
||||
return getChangeFeedMutationsActor(Reference<DatabaseContext>::addRef(this), rangeID, begin, end, range);
|
||||
}
|
||||
|
||||
// Streams change feed mutations for [begin, end) from a single storage server into
// `results`, transparently retrying on shard movement and connectivity errors.
// Returns Void() once the feed has been read up to `end`; a non-retryable error is
// forwarded to the consumer through `results` instead of being thrown.
ACTOR Future<Void> singleChangeFeedStream(StorageServerInterface interf,
                                          PromiseStream<Standalone<MutationsAndVersionRef>> results,
                                          Key rangeID,
                                          Version begin,
                                          Version end,
                                          KeyRange range) {
	loop {
		try {
			state ChangeFeedStreamRequest req;
			req.rangeID = rangeID;
			req.begin = begin;
			req.end = end;
			req.range = range;

			state ReplyPromiseStream<ChangeFeedStreamReply> replyStream = interf.changeFeedStream.getReplyStream(req);

			loop {
				state ChangeFeedStreamReply rep = waitNext(replyStream.getFuture());
				// Advance the resume point so a retry after an error does not re-read
				// versions already received.
				// NOTE(review): assumes rep.mutations is never empty -- back() would be
				// UB otherwise; confirm the server always sends at least one entry.
				begin = rep.mutations.back().version + 1;
				state int resultLoc = 0;
				while (resultLoc < rep.mutations.size()) {
					// Forward only non-empty mutation batches, plus the final entry at
					// version end-1 so the consumer observes stream completion.
					if (rep.mutations[resultLoc].mutations.size() || rep.mutations[resultLoc].version + 1 == end) {
						// Back-pressure: wait for the consumer to drain before sending.
						wait(results.onEmpty());
						results.send(rep.mutations[resultLoc]);
					}
					resultLoc++;
				}
				if (begin == end) {
					return Void();
				}
			}
		} catch (Error& e) {
			if (e.code() == error_code_actor_cancelled) {
				throw;
			}
			// Retryable: the shard may have moved or the connection dropped; back off
			// briefly and re-issue the request from the updated `begin`.
			if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed ||
			    e.code() == error_code_connection_failed) {
				wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY));
			} else {
				// Non-retryable: hand the error to the consumer and finish.
				results.sendError(e);
				return Void();
			}
		}
	}
}
|
||||
|
||||
struct MutationAndVersionStream {
|
||||
Standalone<MutationsAndVersionRef> next;
|
||||
PromiseStream<Standalone<MutationsAndVersionRef>> results;
|
||||
|
||||
bool operator<(MutationAndVersionStream const& rhs) const { return next.version < rhs.next.version; }
|
||||
};
|
||||
|
||||
// Merges the per-storage-server change feed streams for `interfs` into a single
// version-ordered stream of mutation batches delivered through `results`.
// Entries sharing a commit version are coalesced into one batch. `*begin` is
// advanced past each fully-emitted version so the caller can resume correctly
// after a failure. Signals normal completion by throwing end_of_stream().
ACTOR Future<Void> mergeChangeFeedStream(std::vector<std::pair<StorageServerInterface, KeyRange>> interfs,
                                         PromiseStream<Standalone<VectorRef<MutationsAndVersionRef>>> results,
                                         Key rangeID,
                                         Version* begin,
                                         Version end) {
	// NOTE(review): std::priority_queue is a max-heap; this ascending-version
	// merge requires MutationAndVersionStream::operator< to order by *descending*
	// version (min-heap behavior). Verify the comparator, otherwise the ASSERT
	// below can fire as soon as two streams carry different versions.
	state std::priority_queue<MutationAndVersionStream, std::vector<MutationAndVersionStream>> mutations;
	state std::vector<Future<Void>> fetchers(interfs.size());
	state std::vector<MutationAndVersionStream> streams(interfs.size());
	// Start one fetcher per storage server, each writing into its own stream.
	for (int i = 0; i < interfs.size(); i++) {
		fetchers[i] =
		    singleChangeFeedStream(interfs[i].first, streams[i].results, rangeID, *begin, end, interfs[i].second);
	}
	// Prime the heap with the first entry from every stream; a stream that ends
	// immediately (end_of_stream) simply never joins the merge.
	state int interfNum = 0;
	while (interfNum < interfs.size()) {
		try {
			Standalone<MutationsAndVersionRef> res = waitNext(streams[interfNum].results.getFuture());
			streams[interfNum].next = res;
			mutations.push(streams[interfNum]);
		} catch (Error& e) {
			if (e.code() != error_code_end_of_stream) {
				throw e;
			}
		}
		interfNum++;
	}
	state Version checkVersion = invalidVersion;
	// Batch currently being accumulated for checkVersion.
	state Standalone<VectorRef<MutationsAndVersionRef>> nextOut;
	while (mutations.size()) {
		// Take the stream whose pending entry should be emitted next.
		state MutationAndVersionStream nextStream = mutations.top();
		mutations.pop();
		ASSERT(nextStream.next.version >= checkVersion);
		if (nextStream.next.version != checkVersion) {
			// Version boundary: flush the completed batch before starting a new one,
			// and record the resume point for the caller.
			if (nextOut.size()) {
				*begin = checkVersion + 1;
				results.send(nextOut);
				nextOut = Standalone<VectorRef<MutationsAndVersionRef>>();
			}
			checkVersion = nextStream.next.version;
		}
		nextOut.push_back_deep(nextOut.arena(), nextStream.next);
		// Refill from the stream just consumed; drop it once it ends.
		try {
			Standalone<MutationsAndVersionRef> res = waitNext(nextStream.results.getFuture());
			nextStream.next = res;
			mutations.push(nextStream);
		} catch (Error& e) {
			if (e.code() != error_code_end_of_stream) {
				throw e;
			}
		}
	}
	// Flush the final partially-accumulated batch.
	if (nextOut.size()) {
		results.send(nextOut);
	}
	throw end_of_stream();
}
|
||||
|
||||
ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
|
||||
PromiseStream<Standalone<VectorRef<MutationsAndVersionRef>>> results,
|
||||
StringRef rangeID,
|
||||
@ -6609,26 +6718,27 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
|
||||
state vector<pair<KeyRange, Reference<LocationInfo>>> locations =
|
||||
wait(getKeyRangeLocations(cx,
|
||||
keys,
|
||||
100,
|
||||
1000,
|
||||
Reverse::False,
|
||||
&StorageServerInterface::changeFeed,
|
||||
TransactionInfo(TaskPriority::DefaultEndpoint, span.context)));
|
||||
|
||||
if (locations.size() > 1) {
|
||||
if (locations.size() >= 1000) {
|
||||
results.sendError(unsupported_operation());
|
||||
return Void();
|
||||
}
|
||||
|
||||
state int useIdx = -1;
|
||||
|
||||
loop {
|
||||
state std::vector<int> chosenLocations(locations.size());
|
||||
state int loc = 0;
|
||||
while (loc < locations.size()) {
|
||||
// FIXME: create a load balance function for this code so future users of reply streams do not have
|
||||
// to duplicate this code
|
||||
int count = 0;
|
||||
for (int i = 0; i < locations[0].second->size(); i++) {
|
||||
int useIdx = -1;
|
||||
for (int i = 0; i < locations[loc].second->size(); i++) {
|
||||
if (!IFailureMonitor::failureMonitor()
|
||||
.getState(
|
||||
locations[0].second->get(i, &StorageServerInterface::changeFeedStream).getEndpoint())
|
||||
locations[loc].second->get(i, &StorageServerInterface::changeFeedStream).getEndpoint())
|
||||
.failed) {
|
||||
if (deterministicRandom()->random01() <= 1.0 / ++count) {
|
||||
useIdx = i;
|
||||
@ -6637,13 +6747,15 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
|
||||
}
|
||||
|
||||
if (useIdx >= 0) {
|
||||
break;
|
||||
chosenLocations[loc] = useIdx;
|
||||
loc++;
|
||||
continue;
|
||||
}
|
||||
|
||||
vector<Future<Void>> ok(locations[0].second->size());
|
||||
vector<Future<Void>> ok(locations[loc].second->size());
|
||||
for (int i = 0; i < ok.size(); i++) {
|
||||
ok[i] = IFailureMonitor::failureMonitor().onStateEqual(
|
||||
locations[0].second->get(i, &StorageServerInterface::changeFeedStream).getEndpoint(),
|
||||
locations[loc].second->get(i, &StorageServerInterface::changeFeedStream).getEndpoint(),
|
||||
FailureStatus(false));
|
||||
}
|
||||
|
||||
@ -6654,23 +6766,36 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
|
||||
}
|
||||
|
||||
wait(allAlternativesFailedDelay(quorum(ok, 1)));
|
||||
loc = 0;
|
||||
}
|
||||
|
||||
state ChangeFeedStreamRequest req;
|
||||
req.rangeID = rangeID;
|
||||
req.begin = begin;
|
||||
req.end = end;
|
||||
if (locations.size() > 1) {
|
||||
std::vector<std::pair<StorageServerInterface, KeyRange>> interfs;
|
||||
for (int i = 0; i < locations.size(); i++) {
|
||||
interfs.push_back(
|
||||
std::make_pair(locations[i].second->getInterface(chosenLocations[i]), locations[i].first));
|
||||
}
|
||||
wait(mergeChangeFeedStream(interfs, results, rangeID, &begin, end) || cx->connectionFileChanged());
|
||||
} else {
|
||||
state ChangeFeedStreamRequest req;
|
||||
req.rangeID = rangeID;
|
||||
req.begin = begin;
|
||||
req.end = end;
|
||||
req.range = range;
|
||||
|
||||
state ReplyPromiseStream<ChangeFeedStreamReply> replyStream =
|
||||
locations[0].second->get(useIdx, &StorageServerInterface::changeFeedStream).getReplyStream(req);
|
||||
state ReplyPromiseStream<ChangeFeedStreamReply> replyStream =
|
||||
locations[0]
|
||||
.second->get(chosenLocations[0], &StorageServerInterface::changeFeedStream)
|
||||
.getReplyStream(req);
|
||||
|
||||
loop {
|
||||
wait(results.onEmpty());
|
||||
choose {
|
||||
when(wait(cx->connectionFileChanged())) { break; }
|
||||
when(ChangeFeedStreamReply rep = waitNext(replyStream.getFuture())) {
|
||||
begin = rep.mutations.back().version + 1;
|
||||
results.send(Standalone<VectorRef<MutationsAndVersionRef>>(rep.mutations, rep.arena));
|
||||
loop {
|
||||
wait(results.onEmpty());
|
||||
choose {
|
||||
when(wait(cx->connectionFileChanged())) { break; }
|
||||
when(ChangeFeedStreamReply rep = waitNext(replyStream.getFuture())) {
|
||||
begin = rep.mutations.back().version + 1;
|
||||
results.send(Standalone<VectorRef<MutationsAndVersionRef>>(rep.mutations, rep.arena));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -6699,6 +6824,29 @@ Future<Void> DatabaseContext::getChangeFeedStream(
|
||||
return getChangeFeedStreamActor(Reference<DatabaseContext>::addRef(this), results, rangeID, begin, end, range);
|
||||
}
|
||||
|
||||
// Asks one storage-server location for the change feeds overlapping `range` that
// are visible at or after `minVersion`. Returns (feed id, feed range) pairs as
// reported by that location.
ACTOR Future<std::vector<std::pair<Key, KeyRange>>> singleLocationOverlappingChangeFeeds(
    Database cx,
    Reference<LocationInfo> location,
    KeyRangeRef range,
    Version minVersion) {
	state OverlappingChangeFeedsRequest req;
	req.range = range;
	req.minVersion = minVersion;

	// Load-balance the request across the replicas at this location.
	OverlappingChangeFeedsReply rep = wait(loadBalance(cx.getPtr(),
	                                                   location,
	                                                   &StorageServerInterface::overlappingChangeFeeds,
	                                                   req,
	                                                   TaskPriority::DefaultPromiseEndpoint,
	                                                   AtMostOnce::False,
	                                                   cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr));
	return rep.rangeIds;
}
|
||||
|
||||
bool compareChangeFeedResult(const std::pair<Key, KeyRange>& i, const std::pair<Key, KeyRange>& j) {
|
||||
return i.first < j.first;
|
||||
}
|
||||
|
||||
ACTOR Future<std::vector<std::pair<Key, KeyRange>>> getOverlappingChangeFeedsActor(Reference<DatabaseContext> db,
|
||||
KeyRangeRef range,
|
||||
Version minVersion) {
|
||||
@ -6708,26 +6856,28 @@ ACTOR Future<std::vector<std::pair<Key, KeyRange>>> getOverlappingChangeFeedsAct
|
||||
state vector<pair<KeyRange, Reference<LocationInfo>>> locations =
|
||||
wait(getKeyRangeLocations(cx,
|
||||
range,
|
||||
100,
|
||||
1000,
|
||||
Reverse::False,
|
||||
&StorageServerInterface::changeFeed,
|
||||
TransactionInfo(TaskPriority::DefaultEndpoint, span.context)));
|
||||
|
||||
if (locations.size() > 1) {
|
||||
if (locations.size() >= 1000) {
|
||||
throw unsupported_operation();
|
||||
}
|
||||
|
||||
state OverlappingChangeFeedsRequest req;
|
||||
req.range = range;
|
||||
state std::vector<Future<std::vector<std::pair<Key, KeyRange>>>> allOverlappingRequests;
|
||||
for (auto& it : locations) {
|
||||
allOverlappingRequests.push_back(singleLocationOverlappingChangeFeeds(cx, it.second, range, minVersion));
|
||||
}
|
||||
wait(waitForAll(allOverlappingRequests));
|
||||
|
||||
OverlappingChangeFeedsReply rep = wait(loadBalance(cx.getPtr(),
|
||||
locations[0].second,
|
||||
&StorageServerInterface::overlappingChangeFeeds,
|
||||
req,
|
||||
TaskPriority::DefaultPromiseEndpoint,
|
||||
AtMostOnce::False,
|
||||
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr));
|
||||
return rep.rangeIds;
|
||||
std::vector<std::pair<Key, KeyRange>> result;
|
||||
for (auto& it : allOverlappingRequests) {
|
||||
result.insert(result.end(), it.get().begin(), it.get().end());
|
||||
}
|
||||
std::sort(result.begin(), result.end(), compareChangeFeedResult);
|
||||
result.resize(std::unique(result.begin(), result.end()) - result.begin());
|
||||
return result;
|
||||
}
|
||||
|
||||
Future<std::vector<std::pair<Key, KeyRange>>> DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range,
|
||||
@ -6748,20 +6898,27 @@ ACTOR Future<Void> popChangeFeedMutationsActor(Reference<DatabaseContext> db, St
|
||||
state vector<pair<KeyRange, Reference<LocationInfo>>> locations =
|
||||
wait(getKeyRangeLocations(cx,
|
||||
keys,
|
||||
100,
|
||||
1000,
|
||||
Reverse::False,
|
||||
&StorageServerInterface::changeFeed,
|
||||
TransactionInfo(TaskPriority::DefaultEndpoint, span.context)));
|
||||
|
||||
if (locations.size() > 1) {
|
||||
if (locations.size() >= 1000) {
|
||||
throw unsupported_operation();
|
||||
}
|
||||
|
||||
state std::vector<StorageServerInterface> allInterfs;
|
||||
for (auto& it : locations) {
|
||||
for (int i = 0; i < it.second->size(); i++) {
|
||||
allInterfs.push_back(it.second->getInterface(i));
|
||||
}
|
||||
}
|
||||
uniquify(allInterfs);
|
||||
|
||||
// FIXME: lookup both the src and dest shards as of the pop version to ensure all locations are popped
|
||||
state std::vector<Future<Void>> popRequests;
|
||||
for (int i = 0; i < locations[0].second->size(); i++) {
|
||||
popRequests.push_back(
|
||||
locations[0].second->getInterface(i).changeFeedPop.getReply(ChangeFeedPopRequest(rangeID, version)));
|
||||
for (int i = 0; i < allInterfs.size(); i++) {
|
||||
popRequests.push_back(allInterfs[i].changeFeedPop.getReply(ChangeFeedPopRequest(rangeID, version)));
|
||||
}
|
||||
wait(waitForAll(popRequests));
|
||||
return Void();
|
||||
|
@ -709,12 +709,13 @@ struct ChangeFeedStreamRequest {
|
||||
Key rangeID;
|
||||
Version begin = 0;
|
||||
Version end = 0;
|
||||
KeyRange range;
|
||||
ReplyPromiseStream<ChangeFeedStreamReply> reply;
|
||||
|
||||
ChangeFeedStreamRequest() {}
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, rangeID, begin, end, reply, spanContext, arena);
|
||||
serializer(ar, rangeID, begin, end, range, reply, spanContext, arena);
|
||||
}
|
||||
};
|
||||
|
||||
@ -751,6 +752,7 @@ struct OverlappingChangeFeedsReply {
|
||||
struct OverlappingChangeFeedsRequest {
|
||||
constexpr static FileIdentifier file_identifier = 10726174;
|
||||
KeyRange range;
|
||||
Version minVersion;
|
||||
ReplyPromise<OverlappingChangeFeedsReply> reply;
|
||||
|
||||
OverlappingChangeFeedsRequest() {}
|
||||
@ -758,7 +760,7 @@ struct OverlappingChangeFeedsRequest {
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, range, reply);
|
||||
serializer(ar, range, minVersion, reply);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1538,6 +1538,7 @@ ACTOR Future<Void> changeFeedPopQ(StorageServer* self, ChangeFeedPopRequest req)
|
||||
|
||||
ACTOR Future<Void> overlappingChangeFeedsQ(StorageServer* data, OverlappingChangeFeedsRequest req) {
|
||||
wait(delay(0));
|
||||
wait(data->version.whenAtLeast(req.minVersion));
|
||||
auto ranges = data->keyChangeFeed.intersectingRanges(req.range);
|
||||
std::map<Key, KeyRange> rangeIds;
|
||||
for (auto r : ranges) {
|
||||
@ -1671,8 +1672,6 @@ ACTOR Future<Void> changeFeedQ(StorageServer* data, ChangeFeedRequest req) {
|
||||
}
|
||||
|
||||
ACTOR Future<Void> changeFeedStreamQ(StorageServer* data, ChangeFeedStreamRequest req)
|
||||
// Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large
|
||||
// selector offset prevents all data from being read in one range read
|
||||
{
|
||||
state Span span("SS:getChangeFeedStream"_loc, { req.spanContext });
|
||||
state Version begin = req.begin;
|
||||
@ -1680,6 +1679,11 @@ ACTOR Future<Void> changeFeedStreamQ(StorageServer* data, ChangeFeedStreamReques
|
||||
|
||||
wait(delay(0, TaskPriority::DefaultEndpoint));
|
||||
|
||||
state uint64_t changeCounter = data->shardChangeCounter;
|
||||
if (!data->isReadable(req.range)) {
|
||||
throw wrong_shard_server();
|
||||
}
|
||||
|
||||
try {
|
||||
loop {
|
||||
wait(req.reply.onReady());
|
||||
@ -1688,6 +1692,9 @@ ACTOR Future<Void> changeFeedStreamQ(StorageServer* data, ChangeFeedStreamReques
|
||||
feedRequest.begin = begin;
|
||||
feedRequest.end = req.end;
|
||||
ChangeFeedReply feedReply = wait(getChangeFeedMutations(data, feedRequest));
|
||||
|
||||
data->checkChangeCounter(changeCounter, req.range);
|
||||
|
||||
begin = feedReply.mutations.back().version + 1;
|
||||
req.reply.send(ChangeFeedStreamReply(feedReply));
|
||||
if (feedReply.mutations.back().version == req.end - 1) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user