foundationdb/fdbclient/BulkLoading.cpp
Michael Stack ff22876247
Add multiparting to s3client. (#11920)
* Add multiparting to s3client.
Fix boost::urls::parse_uri 's dislike of credentialed blobstore urls.

* fdbclient/BulkLoading.cpp
 Add blobstore regex to extract credentials before feeding the boost
 parse_uri.

* fdbclient/include/fdbclient/S3BlobStore.h
* fdbclient/S3BlobStore.actor.cpp
 Add cleanup of failed multipart -- abortMultiPartUpload l(s3 will do
 this in the background eventually but lets clean up after ourselves).
 Also add  getObjectRangeMD5 so can do multipart checksumming.

* fdbclient/S3Client.actor.cpp
 Change upload file and download file to do multipart always.
 Retry too.

* fdbclient/S3Client_cli.actor.cpp
 Add command line to trace rather than output.

* Address Zhe review

* More logging around part upload and download

* Undo assert that proved incorrect; restore the old length math
doing copy in readObject.

Cleanup around TraceEvents in HTTTP.actor.

* Undo commented out cleanup -- for debugging

* formatting

---------

Co-authored-by: stack <stack@duboce.com>
2025-02-13 09:06:17 -08:00

182 lines
7.4 KiB
C++

/*
* BulkLoading.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2024 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/BulkLoading.h"
#include "fdbclient/SystemData.h"
#include "flow/Error.h"
#include <boost/url/url.hpp>
#include <boost/url/parse.hpp>
#include <boost/url/error_types.hpp>
#include <boost/url/string_view.hpp>
bool getConductBulkLoadFromDataMoveId(const UID& dataMoveId) {
bool nowAssigned = false;
bool emptyRange = false;
DataMoveType dataMoveType = DataMoveType::LOGICAL;
DataMovementReason dataMoveReason = DataMovementReason::INVALID;
decodeDataMoveId(dataMoveId, nowAssigned, emptyRange, dataMoveType, dataMoveReason);
bool conductBulkLoad =
dataMoveType == DataMoveType::LOGICAL_BULKLOAD || dataMoveType == DataMoveType::PHYSICAL_BULKLOAD;
if (conductBulkLoad) {
ASSERT(!emptyRange && dataMoveIdIsValidForBulkLoad(dataMoveId));
ASSERT(nowAssigned);
}
if (!nowAssigned) {
ASSERT(!conductBulkLoad);
}
return conductBulkLoad;
}
bool dataMoveIdIsValidForBulkLoad(const UID& dataMoveId) {
return dataMoveId.isValid() && dataMoveId != anonymousShardId;
}
std::string stringRemovePrefix(std::string str, const std::string& prefix) {
if (str.compare(0, prefix.length(), prefix) == 0) {
str.erase(0, prefix.length());
} else {
throw bulkload_manifest_decode_error();
}
return str;
}
Key getKeyFromHexString(const std::string& hexRawString) {
if (hexRawString.empty()) {
return Key();
}
// Here is an example of the input hexRawString:
// "01 02 03". This raw string should be convered to the Key: "\x01\x02\x03".
// Note that the space is not added for the last byte in the original string.
ASSERT((hexRawString.size() + 1) % 3 == 0);
std::string res;
res.resize((hexRawString.size() + 1) / 3);
for (size_t i = 0; i < hexRawString.size(); i += 3) {
std::string byteString = hexRawString.substr(i, 2);
uint8_t byte = static_cast<uint8_t>(std::stoul(byteString, nullptr, 16));
res[i / 3] = byte;
ASSERT(i + 2 >= hexRawString.size() || hexRawString[i + 2] == ' ');
}
return Standalone(StringRef(res));
}
std::string getBulkLoadJobManifestFileName() {
return "job-manifest.txt";
}
std::string generateBulkLoadBytesSampleFileNameFromDataFileName(const std::string& dataFileName) {
return dataFileName + "-sample.sst";
}
std::string generateEmptyManifestFileName() {
return "manifest-empty.sst";
}
// Generate the bulkload job manifest file. Here is an example.
// Assuming the job manifest file is in the folder: "/tmp".
// Row 0: [FormatVersion]: 1, [ManifestCount]: 3;
// Row 1: "", "01", 100, 9000, "range1", "manifest1.txt"
// Row 2: "01", "02 ff", 200, 0, "range2", "manifest2.txt"
// Row 3: "02 ff", "ff", 300, 8100, "range3", "manifest3.txt"
// In this example, the job manifest file is in the format of version 1.
// The file contains three ranges: "" ~ "\x01", "\x01" ~ "\x02\xff", and "\x02\xff" ~ "\xff".
// For the first range, the data version is at 100, the data size is 9KB, the manifest file path is
// "/tmp/range1/manifest1.txt". For the second range, the data version is at 200, the data size is 0 indicating this is
// an empty range. The manifest file path is "/tmp/range2/manifest2.txt". For the third range, the data version is at
// 300, the data size is 8.1KB, the manifest file path is "/tmp/range1/manifest3.txt".
std::string generateBulkLoadJobManifestFileContent(const std::map<Key, BulkLoadManifest>& manifests) {
std::string res = BulkLoadJobManifestFileHeader(bulkLoadManifestFormatVersion, manifests.size()).toString() + "\n";
for (const auto& [beginKey, manifest] : manifests) {
res = res + BulkLoadJobFileManifestEntry(manifest).toString() + "\n";
}
return res;
}
// TODO(BulkLoad): Support file:// urls, etc.
// For now, we only support blobstore:// urls.
// 'blobstore://' is the first match, credentials including '@' are optional and second regex match.
// The third match is the host + path, etc. of the url.
static const std::regex BLOBSTORE_URL_PATTERN(R"((blobstore://)([A-Z0-9]+:[A-Za-z0-9+/=]+:[A-Za-z0-9+/=]+@)?(.+)$)");
std::string getPath(const std::string& path) {
std::smatch matches;
if (!std::regex_match(path, matches, BLOBSTORE_URL_PATTERN)) {
return path;
}
// We want boost::url to parse out the path but it cannot digest credentials. Strip them out
// before passing to boost::url.
try {
return boost::urls::parse_uri(matches[1].str() + matches[3].str()).value().path();
} catch (std::system_error& e) {
TraceEvent(SevError, "BulkLoadGetPathError")
.detail("Path", path)
.detail("Error", e.what())
.detail("Matches", matches.str());
throw std::invalid_argument("Invalid url " + path + " " + e.what());
}
}
// TODO(BulkLoad): use this everywhere
std::string appendToPath(const std::string& path, const std::string& append) {
std::smatch matches;
if (!std::regex_match(path, matches, BLOBSTORE_URL_PATTERN)) {
return joinPath(path, append);
}
// We want boost::url to parse out the path but it cannot digest credentials. Strip them out
// before passing to boost::url.
try {
boost::urls::url url = boost::urls::parse_uri(matches[1].str() + matches[3].str()).value();
auto newUrl = std::string(url.set_path(joinPath(url.path(), append)).buffer());
return matches[1].str() + matches[2].str() + newUrl.substr(matches[1].str().length());
} catch (std::system_error& e) {
TraceEvent(SevError, "BulkLoadAppendToPathError")
.detail("Path", path)
.detail("Error", e.what())
.detail("Matches", matches.str());
throw std::invalid_argument("Invalid url " + path + " " + e.what());
}
}
std::string getBulkLoadJobRoot(const std::string& root, const UID& jobId) {
return appendToPath(root, jobId.toString());
}
// For submitting a task manually (for testing)
BulkLoadTaskState createBulkLoadTask(const UID& jobId,
const KeyRange& range,
const BulkLoadFileSet& fileSet,
const BulkLoadByteSampleSetting& byteSampleSetting,
const Version& snapshotVersion,
const int64_t& bytes,
const int64_t& keyCount,
const BulkLoadType& type,
const BulkLoadTransportMethod& transportMethod) {
BulkLoadManifest manifest(
fileSet, range.begin, range.end, snapshotVersion, bytes, keyCount, byteSampleSetting, type, transportMethod);
return BulkLoadTaskState(jobId, manifest);
}
BulkLoadJobState createBulkLoadJob(const UID& dumpJobIdToLoad,
const KeyRange& range,
const std::string& jobRoot,
const BulkLoadTransportMethod& transportMethod) {
return BulkLoadJobState(dumpJobIdToLoad, jobRoot, range, transportMethod);
}