Skip to content

Commit 1b11fc9

Browse files
authored
[auth] Hot-reload registry auth via nydusd config API (#718)
* [auth] Hot-reload registry auth via nydusd config API Hook together the token renewal subsystem with the nydusd config API. Move renewal orchestration from pkg/auth to snapshot to avoid circular dependencies as the renewal logic now needs access to the dameons client while the daemon package already indirectly imports the auth package. pkg/auth/renewal.go becomes a pure credential cache with exported InitCredentialStore, RenewCredential, and EvictStaleCredentials. The reconciliation loop formerly in auth/renewal.go and now in snapshot/renewal.go walks all managers->daemons->rags and renews credentials, and hot-reloads daemons directly. It also updates the daemon config file so that they can use fresh creds when they restart. * [auth] Start credential renewal after daemon recovery Move startCredentialRenewal after filesystem.NewFileSystem, which calls Manager.Recover() to populate daemon caches from the DB. Previously the initial reconciliation ran against empty managers, missing all recovered daemons on restart. * [auth] Skip hot-reload when credentials are unchanged Compare the stored credential against the freshly renewed one using ToBase64 and skip UpdateAuthConfig when they match. This avoids unnecessary config file writes and nydusd API calls on each renewal tick when the token has not rotated.
1 parent 36b8200 commit 1b11fc9

File tree

10 files changed

+585
-118
lines changed

10 files changed

+585
-118
lines changed

cmd/containerd-nydus-grpc/snapshotter.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,6 @@ func Start(ctx context.Context, cfg *config.SnapshotterConfig) error {
5959
}
6060
}
6161

62-
if interval := cfg.RemoteConfig.AuthConfig.CredentialRenewalInterval; interval > 0 {
63-
auth.InitCredentialRenewal(ctx, interval)
64-
}
65-
6662
return Serve(ctx, rs, opt, stopSignal)
6763
}
6864

pkg/auth/renewal.go

Lines changed: 37 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -7,98 +7,66 @@
77
package auth
88

99
import (
10-
"context"
1110
"sync"
1211
"time"
1312

1413
"github.com/containerd/log"
1514
"github.com/containerd/nydus-snapshotter/pkg/metrics/data"
16-
"github.com/containerd/nydus-snapshotter/pkg/rafs"
1715
)
1816

1917
// renewalStore is the global credential store. It is nil when credential
2018
// renewal is disabled (the default).
2119
var renewalStore *credentialStore
2220

23-
// InitCredentialRenewal initializes the credential renewal subsystem.
24-
// It creates the global store, runs an initial reconciliation pass to seed
25-
// entries from live RAFS instances, and starts a background goroutine that
26-
// reconciles and renews credentials at the given interval.
27-
func InitCredentialRenewal(ctx context.Context, interval time.Duration) {
21+
// InitCredentialStore creates the global credential store without starting
22+
// any background goroutine. The caller is responsible for driving renewal
23+
// (e.g., from snapshot/renewal.go).
24+
func InitCredentialStore(interval time.Duration) {
2825
renewalStore = newCredentialStore(interval)
29-
// Reconcile existing credentials a first time.
30-
renewalStore.reconcile()
31-
32-
log.G(ctx).WithField("interval", interval).Info("credential renewal initialized")
33-
go renewLoop(ctx, renewalStore)
34-
}
35-
36-
// renewLoop is the background goroutine that periodically reconciles and
37-
// renews credentials.
38-
func renewLoop(ctx context.Context, store *credentialStore) {
39-
ticker := time.NewTicker(store.renewInterval)
40-
defer ticker.Stop()
41-
42-
for {
43-
select {
44-
case <-ctx.Done():
45-
return
46-
case <-ticker.C:
47-
store.reconcile()
48-
}
49-
}
5026
}
5127

52-
// reconcile reconciles the credential store against the live RAFS instance
53-
// list and renews all entries that should be active.
54-
// 4 different situations are possible:
55-
// - entry in store, live in RAFS: renew
56-
// - entry in store, not live in RAFS: evict
57-
// - entry not in store, not live in RAFS: nothing
58-
// - entry not in store, live in RAFS: add
59-
func (s *credentialStore) reconcile() {
60-
live := make(map[string]struct{})
61-
for _, r := range rafs.RafsGlobalCache.List() {
62-
if r.ImageID != "" {
63-
live[r.ImageID] = struct{}{}
64-
}
65-
}
66-
67-
for _, entry := range s.Entries() {
68-
if _, inRAFS := live[entry.ref]; inRAFS {
69-
log.L.WithField("ref", entry.ref).Debug("renewing credential entry")
70-
s.renewEntry(entry.ref)
71-
} else if time.Since(entry.renewedAt) >= s.renewInterval/2 {
72-
s.Remove(entry.ref)
73-
}
74-
// Grace period: the entry was added recently (within interval/2) but
75-
// has no RAFS entry yet. There is a possible race between a concurrent
76-
// image pull and this renewal tick: GetRegistryKeyChain adds the ref to the
77-
// store on the first layer fetch, but the RAFS entry is only created later
78-
// when the mount completes. Evicting here would cause redundant provider
79-
// lookups for every remaining layer fetch in the pull. We leave the
80-
// entry intact; the next tick will either find it in RAFS (normal) or
81-
// evict it (pull abandoned or failed).
82-
}
83-
84-
for ref := range live {
85-
if s.Get(ref) == nil {
86-
s.renewEntry(ref)
87-
}
28+
// GetStoredCredential returns the cached keychain for ref from the global
29+
// store, or nil if not present or the store is not initialized.
30+
func GetStoredCredential(ref string) *PassKeyChain {
31+
if renewalStore == nil {
32+
return nil
8833
}
34+
return renewalStore.Get(ref)
8935
}
9036

91-
// renewEntry fetches fresh credentials for ref from the renewable provider
92-
// list. fetchFromProviders writes to the renewal store on success, so
93-
// renewEntry only needs to update metrics.
94-
func (s *credentialStore) renewEntry(ref string) {
95-
kc := fetchFromProviders(&AuthRequest{Ref: ref, ValidUntil: time.Now().Add(s.renewInterval)}, renewableProviders())
37+
// RenewCredential fetches fresh credentials for ref from the renewable
38+
// provider list and caches them in the global store. Returns the keychain
39+
// on success or nil on failure. Emits renewal metrics.
40+
func RenewCredential(ref string) *PassKeyChain {
41+
kc := fetchFromProviders(
42+
&AuthRequest{Ref: ref, ValidUntil: time.Now().Add(renewalStore.renewInterval)},
43+
renewableProviders(),
44+
)
9645
if kc != nil {
9746
data.CredentialRenewals.WithLabelValues(ref, "success").Inc()
9847
} else {
9948
log.L.WithField("ref", ref).Warn("credential renewal returned no credentials from any provider")
10049
data.CredentialRenewals.WithLabelValues(ref, "failure").Inc()
10150
}
51+
return kc
52+
}
53+
54+
// EvictStaleCredentials removes store entries whose ref is not present in
55+
// liveRefs. Entries added recently (within interval/2) are kept to avoid
56+
// racing with a concurrent image pull: GetRegistryKeyChain adds the ref to
57+
// the store on the first layer fetch, but the RAFS entry is only created
58+
// later when the mount completes. Evicting here would cause redundant
59+
// provider lookups for every remaining layer fetch in the pull.
60+
func EvictStaleCredentials(liveRefs map[string]struct{}) {
61+
if renewalStore == nil {
62+
return
63+
}
64+
grace := renewalStore.renewInterval / 2
65+
for _, entry := range renewalStore.Entries() {
66+
if _, live := liveRefs[entry.ref]; !live && time.Since(entry.renewedAt) >= grace {
67+
renewalStore.Remove(entry.ref)
68+
}
69+
}
10270
}
10371

10472
// --- credentialEntry ---

pkg/auth/renewal_test.go

Lines changed: 139 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
package auth
88

99
import (
10-
"context"
1110
"fmt"
1211
"sync"
1312
"sync/atomic"
@@ -206,16 +205,17 @@ func TestCredentialStoreConcurrency(t *testing.T) {
206205
wg.Wait()
207206
}
208207

209-
// --- renewEntry ---
208+
// --- RenewCredential ---
210209

211-
func TestRenewEntry(t *testing.T) {
210+
func TestRenewCredential(t *testing.T) {
212211
const ref = "docker.io/library/nginx:latest"
213212

214213
tests := []struct {
215214
name string
216215
provider *trackingProvider
217216
wantUser string
218217
wantCall int32
218+
wantNil bool
219219
}{
220220
{
221221
name: "updates store on success",
@@ -224,24 +224,14 @@ func TestRenewEntry(t *testing.T) {
224224
wantCall: 1,
225225
},
226226
{
227-
name: "leaves existing entry unchanged on failure",
227+
name: "returns nil on failure",
228228
provider: func() *trackingProvider {
229229
p := &trackingProvider{}
230230
p.failNext.Store(true)
231231
return p
232232
}(),
233-
wantUser: "original",
234-
wantCall: 1,
235-
},
236-
{
237-
name: "leaves existing entry unchanged when provider returns nil",
238-
provider: func() *trackingProvider {
239-
p := &trackingProvider{}
240-
p.nilNext.Store(true)
241-
return p
242-
}(),
243-
wantUser: "original",
244233
wantCall: 1,
234+
wantNil: true,
245235
},
246236
}
247237

@@ -254,21 +244,147 @@ func TestRenewEntry(t *testing.T) {
254244
renewalStore = oldStore
255245
}()
256246
renewableProviders = func() []AuthProvider { return []AuthProvider{tt.provider} }
247+
renewalStore = newCredentialStore(5 * time.Minute)
257248

258-
store := newCredentialStore(5 * time.Minute)
259-
store.Add(ref, &PassKeyChain{Username: "original", Password: "original"})
260-
renewalStore = store
261-
262-
store.renewEntry(ref)
249+
got := RenewCredential(ref)
263250

264251
assert.Equal(t, tt.wantCall, tt.provider.calls.Load())
265-
got := store.Get(ref)
266-
require.NotNil(t, got)
267-
assert.Equal(t, tt.wantUser, got.Username)
252+
if tt.wantNil {
253+
assert.Nil(t, got)
254+
} else {
255+
require.NotNil(t, got)
256+
assert.Equal(t, tt.wantUser, got.Username)
257+
}
268258
})
269259
}
270260
}
271261

262+
// --- EvictStaleCredentials ---
263+
264+
func TestEvictStaleCredentials(t *testing.T) {
265+
tests := []struct {
266+
name string
267+
stored []string
268+
live map[string]struct{}
269+
wantRefs []string
270+
}{
271+
{
272+
name: "evicts refs not in live set",
273+
stored: []string{"ref-a", "ref-b", "ref-c"},
274+
live: map[string]struct{}{"ref-a": {}, "ref-c": {}},
275+
wantRefs: []string{"ref-a", "ref-c"},
276+
},
277+
{
278+
name: "evicts all when live set is empty",
279+
stored: []string{"ref-a", "ref-b"},
280+
live: map[string]struct{}{},
281+
wantRefs: nil,
282+
},
283+
{
284+
name: "keeps all when all are live",
285+
stored: []string{"ref-a", "ref-b"},
286+
live: map[string]struct{}{"ref-a": {}, "ref-b": {}},
287+
wantRefs: []string{"ref-a", "ref-b"},
288+
},
289+
}
290+
291+
for _, tt := range tests {
292+
t.Run(tt.name, func(t *testing.T) {
293+
oldStore := renewalStore
294+
defer func() { renewalStore = oldStore }()
295+
296+
// Use a tiny interval so grace period (interval/2) is effectively zero.
297+
renewalStore = newCredentialStore(time.Millisecond)
298+
for _, ref := range tt.stored {
299+
renewalStore.Add(ref, &PassKeyChain{Username: "u", Password: "p"})
300+
}
301+
time.Sleep(time.Millisecond)
302+
303+
EvictStaleCredentials(tt.live)
304+
305+
entries := renewalStore.Entries()
306+
gotRefs := make([]string, 0, len(entries))
307+
for _, e := range entries {
308+
gotRefs = append(gotRefs, e.ref)
309+
}
310+
assert.ElementsMatch(t, tt.wantRefs, gotRefs)
311+
})
312+
}
313+
}
314+
315+
func TestEvictStaleCredentials_GracePeriod(t *testing.T) {
316+
oldStore := renewalStore
317+
defer func() { renewalStore = oldStore }()
318+
319+
// Grace period is interval/2 = 2.5 minutes. A freshly added entry
320+
// should survive eviction even when not in the live set.
321+
renewalStore = newCredentialStore(5 * time.Minute)
322+
renewalStore.Add("recent-ref", &PassKeyChain{Username: "u", Password: "p"})
323+
324+
EvictStaleCredentials(map[string]struct{}{})
325+
326+
assert.NotNil(t, renewalStore.Get("recent-ref"), "entry within grace period should not be evicted")
327+
}
328+
329+
func TestGetStoredCredential(t *testing.T) {
330+
tests := []struct {
331+
name string
332+
initStore bool
333+
addEntry bool
334+
wantNil bool
335+
}{
336+
{
337+
name: "returns keychain when present",
338+
initStore: true,
339+
addEntry: true,
340+
},
341+
{
342+
name: "returns nil when not present",
343+
initStore: true,
344+
wantNil: true,
345+
},
346+
{
347+
name: "returns nil when store is nil",
348+
wantNil: true,
349+
},
350+
}
351+
352+
const ref = "docker.io/library/nginx:latest"
353+
354+
for _, tt := range tests {
355+
t.Run(tt.name, func(t *testing.T) {
356+
oldStore := renewalStore
357+
defer func() { renewalStore = oldStore }()
358+
359+
if tt.initStore {
360+
renewalStore = newCredentialStore(5 * time.Minute)
361+
if tt.addEntry {
362+
renewalStore.Add(ref, &PassKeyChain{Username: "user", Password: "pass"})
363+
}
364+
} else {
365+
renewalStore = nil
366+
}
367+
368+
got := GetStoredCredential(ref)
369+
if tt.wantNil {
370+
assert.Nil(t, got)
371+
} else {
372+
require.NotNil(t, got)
373+
assert.Equal(t, "user", got.Username)
374+
}
375+
})
376+
}
377+
}
378+
379+
func TestEvictStaleCredentials_NilStore(t *testing.T) {
380+
oldStore := renewalStore
381+
defer func() { renewalStore = oldStore }()
382+
renewalStore = nil
383+
384+
// Should not panic.
385+
EvictStaleCredentials(map[string]struct{}{"ref": {}})
386+
}
387+
272388
// --- getRegistryKeyChainFromProviders ---
273389

274390
func TestGetRegistryKeyChainFromProviders(t *testing.T) {
@@ -347,25 +463,3 @@ func TestGetRegistryKeyChainFromProviders(t *testing.T) {
347463
})
348464
}
349465
}
350-
351-
// --- InitCredentialRenewal ---
352-
353-
// TestInitCredentialRenewalLifecycle verifies that the renewal goroutine
354-
// starts, ticks, and stops cleanly on context cancellation. The RAFS cache
355-
// is empty in the test environment so reconcile is a no-op each tick.
356-
func TestInitCredentialRenewalLifecycle(t *testing.T) {
357-
oldStore := renewalStore
358-
defer func() { renewalStore = oldStore }()
359-
360-
ctx, cancel := context.WithCancel(t.Context())
361-
362-
InitCredentialRenewal(ctx, 30*time.Millisecond)
363-
require.NotNil(t, renewalStore)
364-
365-
// Let it tick a few times without error.
366-
time.Sleep(100 * time.Millisecond)
367-
368-
cancel()
369-
// Give goroutine time to observe cancellation.
370-
time.Sleep(60 * time.Millisecond)
371-
}

0 commit comments

Comments
 (0)