Skip to content

Commit ed45826

Browse files
authored
stable-25-3: add backoff on retry when s3 is unavailable (#26406)
2 parents bc84aa8 + 6ee6488 commit ed45826

File tree

5 files changed

+77
-14
lines changed

5 files changed

+77
-14
lines changed

ydb/core/kqp/ut/olap/tiering_ut.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -457,11 +457,9 @@ Y_UNIT_TEST_SUITE(KqpOlapTiering) {
457457
putController->WaitActualization(TDuration::Seconds(5));
458458
Sleep(TDuration::Seconds(5));
459459

460-
UNIT_ASSERT_C(putController->GetAbortedWrites() > 20,
461-
"Expected load spike, but only " << putController->GetAbortedWrites() << " PutObject requests recorded"); // comment after fix
462-
// UNIT_ASSERT_C(putController->GetAbortedWrites() < 10,
463-
// "Expected load spike, but was "
464-
// << putController->GetAbortedWrites() << " PutObject requests recorded"); // uncomment after fix
460+
UNIT_ASSERT_C(putController->GetAbortedWrites() < 10,
461+
"Expected reduced load, but was "
462+
<< putController->GetAbortedWrites() << " PutObject requests recorded");
465463
}
466464

467465
Y_UNIT_TEST(ReconfigureTier) {

ydb/core/wrappers/abstract.h

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
#pragma once
22

33
#include <ydb/library/actors/core/actorsystem.h>
4+
#include <ydb/library/actors/core/log.h>
5+
#include <ydb/core/util/backoff.h>
6+
#include <ydb/core/wrappers/retry_policy.h>
7+
#include <util/system/mutex.h>
48

59
#include <ydb/core/protos/s3_settings.pb.h>
610
#include <ydb/core/wrappers/events/abstract.h>
@@ -41,6 +45,25 @@ struct TEvExternalStorage {
4145

4246
namespace NExternalStorage {
4347

48+
class TThreadSafeBackoff {
49+
private:
50+
mutable TMutex Mutex;
51+
NKikimr::TBackoff Policy;
52+
public:
53+
TThreadSafeBackoff(size_t maxRetries = 100, TDuration initial = TDuration::Seconds(3), TDuration max = TDuration::Seconds(10))
54+
: Policy(maxRetries, initial, max) {}
55+
void Reset() {
56+
with_lock(Mutex) {
57+
Policy.Reset();
58+
}
59+
}
60+
TDuration Next() {
61+
with_lock(Mutex) {
62+
return Policy.Next();
63+
}
64+
}
65+
};
66+
4467
class IReplyAdapter {
4568
private:
4669
std::optional<NActors::TActorId> CustomRecipient;
@@ -69,10 +92,12 @@ class IReplyAdapter {
6992
class TReplyAdapterContainer {
7093
private:
7194
IReplyAdapter::TPtr Adapter;
95+
std::shared_ptr<TThreadSafeBackoff> Backoff;
7296
public:
7397
TReplyAdapterContainer() = default;
74-
TReplyAdapterContainer(IReplyAdapter::TPtr adapter)
75-
: Adapter(adapter) {
98+
TReplyAdapterContainer(IReplyAdapter::TPtr adapter, std::shared_ptr<TThreadSafeBackoff> backoff = {})
99+
: Adapter(adapter)
100+
, Backoff(std::move(backoff)) {
76101

77102
}
78103

@@ -95,10 +120,31 @@ class TReplyAdapterContainer {
95120

96121
template <class TBaseEventObject>
97122
void Reply(const NActors::TActorId& recipientId, std::unique_ptr<TBaseEventObject>&& ev) const {
123+
bool doBackoff = false;
124+
TDuration delay = TDuration::Zero();
125+
126+
if (ev->IsSuccess()) {
127+
128+
Backoff->Reset();
129+
} else if (NWrappers::ShouldBackoff(ev->GetError())) {
130+
AFL_VERIFY(Backoff);
131+
delay = Backoff->Next();
132+
doBackoff = delay > TDuration::Zero();
133+
}
134+
135+
std::unique_ptr<NActors::IEventBase> finalEvent;
98136
if (Adapter) {
99-
TlsActivationContext->ActorSystem()->Send(Adapter->GetRecipient(recipientId), Adapter->RebuildReplyEvent(std::move(ev)).release());
137+
finalEvent = Adapter->RebuildReplyEvent(std::move(ev));
138+
} else {
139+
finalEvent.reset(ev.release());
140+
}
141+
142+
const NActors::TActorId recipient = Adapter ? Adapter->GetRecipient(recipientId) : recipientId;
143+
if (doBackoff) {
144+
auto* handle = new NActors::IEventHandle(recipient, NActors::TActorId(), finalEvent.release());
145+
TlsActivationContext->ActorSystem()->Schedule(delay, handle);
100146
} else {
101-
TlsActivationContext->ActorSystem()->Send(recipientId, ev.release());
147+
TlsActivationContext->ActorSystem()->Send(recipient, finalEvent.release());
102148
}
103149

104150
}
@@ -108,14 +154,15 @@ class TReplyAdapterContainer {
108154
class IExternalStorageOperator {
109155
protected:
110156
TReplyAdapterContainer ReplyAdapter;
157+
std::shared_ptr<TThreadSafeBackoff> BackoffPolicy = std::make_shared<TThreadSafeBackoff>();
111158
virtual TString DoDebugString() const {
112159
return "";
113160
}
114161
public:
115162
using TPtr = std::shared_ptr<IExternalStorageOperator>;
116163
void InitReplyAdapter(IReplyAdapter::TPtr adapter) {
117164
Y_ABORT_UNLESS(!ReplyAdapter);
118-
ReplyAdapter = TReplyAdapterContainer(adapter);
165+
ReplyAdapter = TReplyAdapterContainer(adapter, BackoffPolicy);
119166
}
120167

121168

ydb/core/wrappers/retry_policy.cpp

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,30 @@ bool ShouldRetry(const Aws::S3::S3Error& error) {
1414
}
1515

1616
const auto& exceptionName = error.GetExceptionName();
17-
if ("TooManyRequests" == exceptionName) {
18-
return true;
19-
} else if ("OperationAborted" == exceptionName) {
17+
if (exceptionName == "TooManyRequests" ||
18+
exceptionName == "OperationAborted") {
2019
return true;
2120
}
2221

2322
return false;
2423
}
2524

25+
bool ShouldBackoff(const Aws::S3::S3Error& error) {
26+
if (ShouldRetry(error)) {
27+
return true;
28+
}
29+
30+
const auto& exceptionName = error.GetExceptionName();
31+
if (exceptionName == "AccessDenied" ||
32+
exceptionName == "InvalidAccessKeyId" ||
33+
exceptionName == "InvalidToken" ||
34+
exceptionName == "ExpiredToken" ||
35+
exceptionName == "AuthFailure" ||
36+
exceptionName == "ServiceUnavailable")
37+
{
38+
return true;
39+
}
40+
41+
return false;
42+
}
2643
}

ydb/core/wrappers/retry_policy.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,6 @@ namespace Aws::S3 {
77
namespace NKikimr::NWrappers {
88

99
bool ShouldRetry(const Aws::S3::S3Error& error);
10+
bool ShouldBackoff(const Aws::S3::S3Error& error);
1011

1112
}

ydb/core/wrappers/unavailable_storage.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class TUnavailableExternalStorageOperator: public IExternalStorageOperator {
1313
template <class TResponse, class TRequestPtr>
1414
void ExecuteImpl(TRequestPtr& ev) const {
1515
const Aws::S3::S3Error error = Aws::S3::S3Error(
16-
Aws::Client::AWSError<Aws::Client::CoreErrors>(Aws::Client::CoreErrors::SERVICE_UNAVAILABLE, Exception, Reason, false));
16+
Aws::Client::AWSError<Aws::Client::CoreErrors>(Aws::Client::CoreErrors::SERVICE_UNAVAILABLE, Exception, Reason, true));
1717
std::unique_ptr<TResponse> response;
1818
constexpr bool hasKey = requires(const TRequestPtr& r) { r->Get()->GetRequest().GetKey(); };
1919
constexpr bool hasRange = std::is_same_v<TResponse, TEvGetObjectResponse>;

0 commit comments

Comments
 (0)