Skip to content

Commit e1137d4

Browse files
authored
Merge pull request #3706 from buildkite/bm/secret_retry
Add retry logic to secret getting
2 parents 0c7c9ba + d70f0c9 commit e1137d4

File tree

4 files changed

+144
-13
lines changed

4 files changed

+144
-13
lines changed

clicommand/secret_get.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ Examples:
9494
}
9595

9696
agentClient := api.NewClient(l, loadAPIClientConfig(cfg, "AgentAccessToken"))
97-
secrets, errs := secrets.FetchSecrets(ctx, agentClient, cfg.Job, cfg.Keys, 10)
97+
secrets, errs := secrets.FetchSecrets(ctx, l, agentClient, cfg.Job, cfg.Keys, 10)
9898
if len(errs) > 0 {
9999
sb := &strings.Builder{}
100100
sb.WriteString("Failed to fetch some secrets:\n")

internal/job/executor.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -909,13 +909,14 @@ func (e *Executor) fetchAndSetSecrets(ctx context.Context) error {
909909
}
910910

911911
// Create API client for fetching secrets.
912-
apiClient := api.NewClient(logger.NewBuffer(), api.Config{
912+
secretLogger := logger.NewBuffer()
913+
apiClient := api.NewClient(secretLogger, api.Config{
913914
Endpoint: e.shell.Env.GetString("BUILDKITE_AGENT_ENDPOINT", ""),
914915
Token: e.shell.Env.GetString("BUILDKITE_AGENT_ACCESS_TOKEN", ""),
915916
})
916917

917918
// Fetch all secrets
918-
fetchedSecrets, errs := secrets.FetchSecrets(ctx, apiClient, e.JobID, keys, 10)
919+
fetchedSecrets, errs := secrets.FetchSecrets(ctx, secretLogger, apiClient, e.JobID, keys, 10)
919920
if len(errs) > 0 {
920921
var errorMsg strings.Builder
921922
for _, err := range errs {

internal/secrets/secret.go

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@ import (
44
"context"
55
"fmt"
66
"sync"
7+
"time"
78

89
"github.com/buildkite/agent/v3/api"
10+
"github.com/buildkite/agent/v3/logger"
11+
"github.com/buildkite/roko"
912
"golang.org/x/sync/semaphore"
1013
)
1114

@@ -34,9 +37,31 @@ func (e *SecretError) Unwrap() error {
3437
return e.Err
3538
}
3639

37-
// FetchSecrets retrieves all secret values from the API sequentially.
38-
// If any secret fails, returns error with details of all failed secrets.
39-
func FetchSecrets(ctx context.Context, client APIClient, jobID string, keys []string, concurrency int) ([]Secret, []error) {
40+
// FetchSecretsOpt is a functional option for FetchSecrets.
41+
type FetchSecretsOpt func(*fetchSecretsConfig)
42+
43+
type fetchSecretsConfig struct {
44+
retrySleepFunc func(time.Duration)
45+
}
46+
47+
// WithRetrySleepFunc overrides the sleep function used between retries.
48+
// This is primarily useful for unit tests.
49+
func WithRetrySleepFunc(f func(time.Duration)) FetchSecretsOpt {
50+
return func(c *fetchSecretsConfig) {
51+
c.retrySleepFunc = f
52+
}
53+
}
54+
55+
// FetchSecrets retrieves all secret values from the API concurrently.
56+
// Each individual secret fetch is retried up to 3 times with exponential
57+
// backoff on retryable errors (TLS handshake failures, timeouts, 5xx, 429).
58+
// If any secret fails after retries, returns error with details of all failed secrets.
59+
func FetchSecrets(ctx context.Context, l logger.Logger, client APIClient, jobID string, keys []string, concurrency int, opts ...FetchSecretsOpt) ([]Secret, []error) {
60+
var cfg fetchSecretsConfig
61+
for _, opt := range opts {
62+
opt(&cfg)
63+
}
64+
4065
secrets := make([]Secret, 0, len(keys))
4166
secretsMu := sync.Mutex{}
4267

@@ -55,7 +80,33 @@ func FetchSecrets(ctx context.Context, client APIClient, jobID string, keys []st
5580

5681
go func() {
5782
defer sem.Release(1)
58-
apiSecret, _, err := client.GetSecret(ctx, &api.GetSecretRequest{Key: key, JobID: jobID})
83+
84+
r := roko.NewRetrier(
85+
roko.WithMaxAttempts(3),
86+
roko.WithStrategy(roko.Exponential(2*time.Second, 0)),
87+
roko.WithJitterRange(-1*time.Second, 5*time.Second),
88+
roko.WithSleepFunc(cfg.retrySleepFunc),
89+
)
90+
91+
apiSecret, err := roko.DoFunc(ctx, r, func(r *roko.Retrier) (*api.Secret, error) {
92+
secret, resp, err := client.GetSecret(ctx, &api.GetSecretRequest{Key: key, JobID: jobID})
93+
if err != nil {
94+
if resp != nil && api.IsRetryableStatus(resp) {
95+
l.Warn("Retrying secret %q fetch after retryable HTTP status %d (%s)", key, resp.StatusCode, r)
96+
return nil, err
97+
}
98+
99+
if api.IsRetryableError(err) {
100+
l.Warn("Retrying secret %q fetch after retryable error: %v (%s)", key, err, r)
101+
return nil, err
102+
}
103+
104+
// Non-retryable error, stop retrying
105+
r.Break()
106+
return nil, err
107+
}
108+
return secret, nil
109+
})
59110
if err != nil {
60111
errsMu.Lock()
61112
errs = append(errs, &SecretError{

internal/secrets/secret_test.go

Lines changed: 85 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@ import (
88
"net/http/httptest"
99
"runtime"
1010
"strings"
11+
"sync/atomic"
1112
"syscall"
1213
"testing"
14+
"time"
1315

1416
"github.com/google/go-cmp/cmp"
1517
"github.com/google/go-cmp/cmp/cmpopts"
@@ -18,6 +20,8 @@ import (
1820
"github.com/buildkite/agent/v3/logger"
1921
)
2022

23+
var noSleep = WithRetrySleepFunc(func(time.Duration) {})
24+
2125
func TestFetchSecrets_Success(t *testing.T) {
2226
t.Parallel()
2327

@@ -44,7 +48,7 @@ func TestFetchSecrets_Success(t *testing.T) {
4448
Token: "llamas",
4549
})
4650

47-
secrets, errs := FetchSecrets(t.Context(), apiClient, "test-job-id", []string{"DATABASE_URL", "API_TOKEN"}, 10)
51+
secrets, errs := FetchSecrets(t.Context(), logger.Discard, apiClient, "test-job-id", []string{"DATABASE_URL", "API_TOKEN"}, 10)
4852
if len(errs) > 0 {
4953
t.Fatalf("expected no errors, got: %v", errs)
5054
}
@@ -73,7 +77,7 @@ func TestFetchSecrets_EmptyKeys(t *testing.T) {
7377
t.Cleanup(server.Close)
7478

7579
apiClient := api.NewClient(logger.Discard, api.Config{Endpoint: server.URL, Token: "llamas"})
76-
secrets, errs := FetchSecrets(t.Context(), apiClient, "test-job-id", []string{}, 10)
80+
secrets, errs := FetchSecrets(t.Context(), logger.Discard, apiClient, "test-job-id", []string{}, 10)
7781

7882
if len(errs) > 0 {
7983
t.Fatalf("expected no errors, got: %v", errs)
@@ -95,7 +99,7 @@ func TestFetchSecrets_NilKeys(t *testing.T) {
9599

96100
apiClient := api.NewClient(logger.Discard, api.Config{Endpoint: server.URL, Token: "llamas"})
97101

98-
secrets, errs := FetchSecrets(t.Context(), apiClient, "test-job-id", nil, 10)
102+
secrets, errs := FetchSecrets(t.Context(), logger.Discard, apiClient, "test-job-id", nil, 10)
99103

100104
if len(errs) > 0 {
101105
t.Fatalf("expected no errors, got: %v", errs)
@@ -133,7 +137,7 @@ func TestFetchSecrets_SomeSecretsFail(t *testing.T) {
133137
})
134138

135139
keys := []string{"DATABASE_URL", "MISSING"}
136-
secrets, errs := FetchSecrets(t.Context(), apiClient, "test-job-id", keys, 10)
140+
secrets, errs := FetchSecrets(t.Context(), logger.Discard, apiClient, "test-job-id", keys, 10)
137141

138142
if len(errs) != 1 {
139143
t.Fatalf("expected 1 errors, got %d: %v", len(errs), errs)
@@ -177,7 +181,7 @@ func TestFetchSecrets_AllSecretsFail(t *testing.T) {
177181
})
178182

179183
keys := []string{"API_TOKEN", "DATABASE_URL"}
180-
secrets, errs := FetchSecrets(t.Context(), apiClient, "test-job-id", keys, 10)
184+
secrets, errs := FetchSecrets(t.Context(), logger.Discard, apiClient, "test-job-id", keys, 10)
181185

182186
if len(errs) != 2 {
183187
t.Fatalf("expected 2 errors, got %d: %v", len(errs), errs)
@@ -218,7 +222,7 @@ func TestFetchSecrets_APIClientError(t *testing.T) {
218222
})
219223

220224
keys := []string{"TEST_SECRET"}
221-
secrets, errs := FetchSecrets(t.Context(), apiClient, "test-job-id", keys, 10)
225+
secrets, errs := FetchSecrets(t.Context(), logger.Discard, apiClient, "test-job-id", keys, 10, noSleep)
222226

223227
if len(errs) != 1 {
224228
t.Fatalf("expected 1 error, got %d: %v", len(errs), errs)
@@ -266,3 +270,78 @@ func TestFetchSecrets_APIClientError(t *testing.T) {
266270
t.Errorf("expected connection refused error, got: %v (type: %T)", netErr.Err, netErr.Err)
267271
}
268272
}
273+
274+
func TestFetchSecrets_RetriesOnServerError(t *testing.T) {
275+
t.Parallel()
276+
277+
var attempts atomic.Int32
278+
279+
server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
280+
n := attempts.Add(1)
281+
if n <= 2 {
282+
// First two attempts return 502 Bad Gateway (retryable)
283+
rw.WriteHeader(http.StatusBadGateway)
284+
_, _ = fmt.Fprintf(rw, `{"message": "bad gateway"}`)
285+
return
286+
}
287+
// Third attempt succeeds
288+
rw.WriteHeader(http.StatusOK)
289+
_, _ = fmt.Fprintf(rw, `{"key": "MY_SECRET", "value": "secret-value"}`)
290+
}))
291+
t.Cleanup(server.Close)
292+
293+
apiClient := api.NewClient(logger.Discard, api.Config{
294+
Endpoint: server.URL,
295+
Token: "llamas",
296+
})
297+
298+
secrets, errs := FetchSecrets(t.Context(), logger.Discard, apiClient, "test-job-id", []string{"MY_SECRET"}, 10, noSleep)
299+
if len(errs) > 0 {
300+
t.Fatalf("expected no errors after retries, got: %v", errs)
301+
}
302+
303+
if len(secrets) != 1 {
304+
t.Fatalf("expected 1 secret, got %d", len(secrets))
305+
}
306+
307+
if secrets[0].Value != "secret-value" {
308+
t.Errorf("expected secret value %q, got %q", "secret-value", secrets[0].Value)
309+
}
310+
311+
if got := attempts.Load(); got != 3 {
312+
t.Errorf("expected 3 attempts (2 failures + 1 success), got %d", got)
313+
}
314+
}
315+
316+
func TestFetchSecrets_NoRetryOnNonRetryableStatus(t *testing.T) {
317+
t.Parallel()
318+
319+
var attempts atomic.Int32
320+
321+
server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
322+
attempts.Add(1)
323+
// 404 is not retryable
324+
rw.WriteHeader(http.StatusNotFound)
325+
_, _ = fmt.Fprintf(rw, `{"message": "secret not found"}`)
326+
}))
327+
t.Cleanup(server.Close)
328+
329+
apiClient := api.NewClient(logger.Discard, api.Config{
330+
Endpoint: server.URL,
331+
Token: "llamas",
332+
})
333+
334+
secrets, errs := FetchSecrets(t.Context(), logger.Discard, apiClient, "test-job-id", []string{"MISSING"}, 10, noSleep)
335+
if len(errs) != 1 {
336+
t.Fatalf("expected 1 error, got %d: %v", len(errs), errs)
337+
}
338+
339+
if secrets != nil {
340+
t.Errorf("expected nil secrets, got: %v", secrets)
341+
}
342+
343+
// Should have only attempted once since 404 is not retryable
344+
if got := attempts.Load(); got != 1 {
345+
t.Errorf("expected 1 attempt (no retries for 404), got %d", got)
346+
}
347+
}

0 commit comments

Comments
 (0)