Skip to content

Commit 47a3bf1

Browse files
authored
Merge pull request ClickHouse#80035 from vitlibar/s3-slow-down-after-network-error
Slow down S3 client after network error
2 parents 44e8cf0 + 13a8c91 commit 47a3bf1

File tree

14 files changed

+83
-6
lines changed

14 files changed

+83
-6
lines changed

src/Backups/BackupIO_S3.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ namespace Setting
3333
extern const SettingsBool s3_disable_checksum;
3434
extern const SettingsUInt64 s3_max_connections;
3535
extern const SettingsUInt64 s3_max_redirects;
36+
extern const SettingsBool s3_slow_all_threads_after_network_error;
3637
}
3738

3839
namespace S3AuthSetting
@@ -88,6 +89,7 @@ namespace
8889
context->getRemoteHostFilter(),
8990
static_cast<unsigned>(local_settings[Setting::s3_max_redirects]),
9091
static_cast<unsigned>(local_settings[Setting::backup_restore_s3_retry_attempts]),
92+
local_settings[Setting::s3_slow_all_threads_after_network_error],
9193
local_settings[Setting::enable_s3_requests_logging],
9294
/* for_disk_s3 = */ false,
9395
request_settings.get_request_throttler,

src/Coordination/KeeperSnapshotManagerS3.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo
104104

105105
static constexpr size_t s3_max_redirects = 10;
106106
static constexpr size_t s3_retry_attempts = 10;
107+
static constexpr bool s3_slow_all_threads_after_network_error = true;
107108
static constexpr bool enable_s3_requests_logging = false;
108109

109110
if (!new_uri.key.empty())
@@ -114,7 +115,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo
114115

115116
S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration(
116117
auth_settings[S3AuthSetting::region],
117-
RemoteHostFilter(), s3_max_redirects, s3_retry_attempts,
118+
RemoteHostFilter(), s3_max_redirects, s3_retry_attempts, s3_slow_all_threads_after_network_error,
118119
enable_s3_requests_logging,
119120
/* for_disk_s3 = */ false, /* get_request_throttler = */ {}, /* put_request_throttler = */ {},
120121
new_uri.uri.getScheme());

src/Core/Settings.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,11 @@ Maximum number of files that could be returned in batch by ListObject request
430430
DECLARE(Bool, s3_use_adaptive_timeouts, S3::DEFAULT_USE_ADAPTIVE_TIMEOUTS, R"(
431431
When set to `true` than for all s3 requests first two attempts are made with low send and receive timeouts.
432432
When set to `false` than all attempts are made with identical timeouts.
433+
)", 0) \
434+
DECLARE(Bool, s3_slow_all_threads_after_network_error, true, R"(
435+
When set to `true` than all threads executing s3 requests to the same endpoint get slow down for a while
436+
after one s3 request fails with a retryable network error.
437+
When set to `false` than each thread executing s3 request uses an independent set of backoffs on network errors.
433438
)", 0) \
434439
DECLARE(UInt64, azure_list_object_keys_size, 1000, R"(
435440
Maximum number of files that could be returned in batch by ListObject request

src/Core/SettingsChangesHistory.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
105105
{"compile_expressions", false, true, "We believe that the LLVM infrastructure behind the JIT compiler is stable enough to enable this setting by default."},
106106
{"use_legacy_to_time", false, false, "New setting. Allows for user to use the old function logic for toTime, which works as toTimeWithFixedDate."},
107107
{"input_format_parquet_allow_geoparquet_parser", false, true, "A new setting to use geo columns in parquet file"},
108+
{"s3_slow_all_threads_after_network_error", false, true, "New setting"},
108109
});
109110
addSettingsChanges(settings_changes_history, "25.4",
110111
{

src/Databases/DataLake/GlueCatalog.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ namespace DB::Setting
4343
extern const SettingsUInt64 s3_max_connections;
4444
extern const SettingsUInt64 s3_max_redirects;
4545
extern const SettingsUInt64 s3_retry_attempts;
46+
extern const SettingsBool s3_slow_all_threads_after_network_error;
4647
extern const SettingsBool enable_s3_requests_logging;
4748
extern const SettingsUInt64 s3_connect_timeout_ms;
4849
extern const SettingsUInt64 s3_request_timeout_ms;
@@ -75,12 +76,15 @@ GlueCatalog::GlueCatalog(
7576

7677
int s3_max_redirects = static_cast<int>(global_settings[DB::Setting::s3_max_redirects]);
7778
int s3_retry_attempts = static_cast<int>(global_settings[DB::Setting::s3_retry_attempts]);
79+
bool s3_slow_all_threads_after_network_error = global_settings[DB::Setting::s3_slow_all_threads_after_network_error];
7880
bool enable_s3_requests_logging = global_settings[DB::Setting::enable_s3_requests_logging];
81+
7982
DB::S3::PocoHTTPClientConfiguration poco_config = DB::S3::ClientFactory::instance().createClientConfiguration(
8083
region,
8184
getContext()->getRemoteHostFilter(),
8285
s3_max_redirects,
8386
s3_retry_attempts,
87+
s3_slow_all_threads_after_network_error,
8488
enable_s3_requests_logging,
8589
false,
8690
nullptr,

src/Disks/ObjectStorages/S3/diskSettings.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ namespace Setting
3131
extern const SettingsBool enable_s3_requests_logging;
3232
extern const SettingsUInt64 s3_max_redirects;
3333
extern const SettingsUInt64 s3_retry_attempts;
34+
extern const SettingsBool s3_slow_all_threads_after_network_error;
3435
}
3536

3637
namespace S3AuthSetting
@@ -123,6 +124,10 @@ std::unique_ptr<S3::Client> getClient(
123124
if (!for_disk_s3 && local_settings.isChanged("s3_retry_attempts"))
124125
s3_retry_attempts = static_cast<int>(local_settings[Setting::s3_retry_attempts]);
125126

127+
bool s3_slow_all_threads_after_network_error = static_cast<int>(global_settings[Setting::s3_slow_all_threads_after_network_error]);
128+
if (!for_disk_s3 && local_settings.isChanged("s3_slow_all_threads_after_network_error"))
129+
s3_slow_all_threads_after_network_error = static_cast<int>(local_settings[Setting::s3_slow_all_threads_after_network_error]);
130+
126131
bool enable_s3_requests_logging = global_settings[Setting::enable_s3_requests_logging];
127132
if (!for_disk_s3 && local_settings.isChanged("enable_s3_requests_logging"))
128133
enable_s3_requests_logging = local_settings[Setting::enable_s3_requests_logging];
@@ -132,6 +137,7 @@ std::unique_ptr<S3::Client> getClient(
132137
context->getRemoteHostFilter(),
133138
s3_max_redirects,
134139
s3_retry_attempts,
140+
s3_slow_all_threads_after_network_error,
135141
enable_s3_requests_logging,
136142
for_disk_s3,
137143
request_settings.get_request_throttler,

src/IO/S3/Client.cpp

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,9 @@ Client::doRequestWithRetryNetworkErrors(RequestType & request, RequestFn request
653653
std::exception_ptr last_exception = nullptr;
654654
for (Int64 attempt_no = 0; attempt_no < max_attempts; ++attempt_no)
655655
{
656+
/// Sometimes we need to slow down because other requests failed with network errors to free the S3 server a bit.
657+
slowDownAfterNetworkError();
658+
656659
try
657660
{
658661
/// S3 does retries network errors actually.
@@ -695,10 +698,7 @@ Client::doRequestWithRetryNetworkErrors(RequestType & request, RequestFn request
695698
if (!client_configuration.retryStrategy->ShouldRetry(error, attempt_no))
696699
break;
697700

698-
auto sleep_ms = client_configuration.retryStrategy->CalculateDelayBeforeNextRetry(error, attempt_no);
699-
LOG_WARNING(log, "Request failed, now waiting {} ms before attempting again", sleep_ms);
700-
sleepForMilliseconds(sleep_ms);
701-
701+
sleepAfterNetworkError(error, attempt_no);
702702
continue;
703703
}
704704
}
@@ -730,6 +730,44 @@ RequestResult Client::processRequestResult(RequestResult && outcome) const
730730
return RequestResult(error);
731731
}
732732

733+
void Client::sleepAfterNetworkError(Aws::Client::AWSError<Aws::Client::CoreErrors> error, Int64 attempt_no) const
734+
{
735+
auto sleep_ms = client_configuration.retryStrategy->CalculateDelayBeforeNextRetry(error, attempt_no);
736+
if (!client_configuration.s3_slow_all_threads_after_network_error)
737+
{
738+
LOG_WARNING(log, "Request failed, now waiting {} ms before attempting again", sleep_ms);
739+
sleepForMilliseconds(sleep_ms);
740+
return;
741+
}
742+
743+
/// Set the time other s3 requests must wait until.
744+
UInt64 current_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now().time_since_epoch()).count();
745+
UInt64 next_time_ms = current_time_ms + sleep_ms;
746+
/// next_time_to_retry_after_network_error = std::max(next_time_to_retry_after_network_error, next_time_ms)
747+
for (UInt64 stored_next_time = next_time_to_retry_after_network_error;
748+
(stored_next_time < next_time_ms) && !next_time_to_retry_after_network_error.compare_exchange_weak(stored_next_time, next_time_ms);)
749+
{
750+
}
751+
}
752+
753+
void Client::slowDownAfterNetworkError() const
754+
{
755+
if (!client_configuration.s3_slow_all_threads_after_network_error)
756+
return;
757+
758+
/// Wait until `next_time_to_retry_after_network_error`.
759+
for (;;)
760+
{
761+
UInt64 current_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now().time_since_epoch()).count();
762+
UInt64 next_time_ms = next_time_to_retry_after_network_error.load();
763+
if (current_time_ms >= next_time_ms)
764+
break;
765+
UInt64 sleep_ms = next_time_ms - current_time_ms;
766+
LOG_WARNING(log, "Some request failed, now waiting {} ms before executing a request", sleep_ms);
767+
sleepForMilliseconds(sleep_ms);
768+
}
769+
}
770+
733771
bool Client::supportsMultiPartCopy() const
734772
{
735773
return provider_type != ProviderType::GCS;
@@ -990,6 +1028,7 @@ PocoHTTPClientConfiguration ClientFactory::createClientConfiguration( // NOLINT
9901028
const RemoteHostFilter & remote_host_filter,
9911029
unsigned int s3_max_redirects,
9921030
unsigned int s3_retry_attempts,
1031+
bool s3_slow_all_threads_after_network_error,
9931032
bool enable_s3_requests_logging,
9941033
bool for_disk_s3,
9951034
const ThrottlerPtr & get_request_throttler,
@@ -1009,6 +1048,7 @@ PocoHTTPClientConfiguration ClientFactory::createClientConfiguration( // NOLINT
10091048
remote_host_filter,
10101049
s3_max_redirects,
10111050
s3_retry_attempts,
1051+
s3_slow_all_threads_after_network_error,
10121052
enable_s3_requests_logging,
10131053
for_disk_s3,
10141054
context->getGlobalContext()->getSettingsRef()[Setting::s3_use_adaptive_timeouts],

src/IO/S3/Client.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,9 @@ class Client : private Aws::S3::S3Client
279279
template <typename RequestResult>
280280
RequestResult processRequestResult(RequestResult && outcome) const;
281281

282+
void sleepAfterNetworkError(Aws::Client::AWSError<Aws::Client::CoreErrors> error, Int64 attempt_no) const;
283+
void slowDownAfterNetworkError() const;
284+
282285
String initial_endpoint;
283286
std::shared_ptr<Aws::Auth::AWSCredentialsProvider> credentials_provider;
284287
PocoHTTPClientConfiguration client_configuration;
@@ -299,6 +302,9 @@ class Client : private Aws::S3::S3Client
299302

300303
const size_t max_redirects;
301304

305+
/// S3 requests must wait until this time because some s3 request fails with a retryable network error.
306+
mutable std::atomic<UInt64> next_time_to_retry_after_network_error = 0;
307+
302308
const ServerSideEncryptionKMSConfig sse_kms_config;
303309

304310
LoggerPtr log;
@@ -327,6 +333,7 @@ class ClientFactory
327333
const RemoteHostFilter & remote_host_filter,
328334
unsigned int s3_max_redirects,
329335
unsigned int s3_retry_attempts,
336+
bool s3_slow_all_threads_after_network_error,
330337
bool enable_s3_requests_logging,
331338
bool for_disk_s3,
332339
const ThrottlerPtr & get_request_throttler,

src/IO/S3/Credentials.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,7 @@ S3CredentialsProviderChain::S3CredentialsProviderChain(
712712
configuration.remote_host_filter,
713713
configuration.s3_max_redirects,
714714
configuration.s3_retry_attempts,
715+
configuration.s3_slow_all_threads_after_network_error,
715716
configuration.enable_s3_requests_logging,
716717
configuration.for_disk_s3,
717718
configuration.get_request_throttler,
@@ -727,6 +728,7 @@ S3CredentialsProviderChain::S3CredentialsProviderChain(
727728
configuration.remote_host_filter,
728729
configuration.s3_max_redirects,
729730
configuration.s3_retry_attempts,
731+
configuration.s3_slow_all_threads_after_network_error,
730732
configuration.enable_s3_requests_logging,
731733
configuration.for_disk_s3,
732734
configuration.get_request_throttler,
@@ -785,6 +787,7 @@ S3CredentialsProviderChain::S3CredentialsProviderChain(
785787
configuration.remote_host_filter,
786788
configuration.s3_max_redirects,
787789
configuration.s3_retry_attempts,
790+
configuration.s3_slow_all_threads_after_network_error,
788791
configuration.enable_s3_requests_logging,
789792
configuration.for_disk_s3,
790793
configuration.get_request_throttler,

src/IO/S3/PocoHTTPClient.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ PocoHTTPClientConfiguration::PocoHTTPClientConfiguration(
117117
const RemoteHostFilter & remote_host_filter_,
118118
unsigned int s3_max_redirects_,
119119
unsigned int s3_retry_attempts_,
120+
bool s3_slow_all_threads_after_network_error_,
120121
bool enable_s3_requests_logging_,
121122
bool for_disk_s3_,
122123
bool s3_use_adaptive_timeouts_,
@@ -128,6 +129,7 @@ PocoHTTPClientConfiguration::PocoHTTPClientConfiguration(
128129
, remote_host_filter(remote_host_filter_)
129130
, s3_max_redirects(s3_max_redirects_)
130131
, s3_retry_attempts(s3_retry_attempts_)
132+
, s3_slow_all_threads_after_network_error(s3_slow_all_threads_after_network_error_)
131133
, enable_s3_requests_logging(enable_s3_requests_logging_)
132134
, for_disk_s3(for_disk_s3_)
133135
, get_request_throttler(get_request_throttler_)

0 commit comments

Comments
 (0)