@@ -18,6 +18,7 @@ package manager
18
18
19
19
import (
20
20
"context"
21
+ "crypto"
21
22
"crypto/x509"
22
23
"encoding/pem"
23
24
"errors"
@@ -40,6 +41,7 @@ import (
40
41
"k8s.io/apimachinery/pkg/types"
41
42
"k8s.io/apimachinery/pkg/util/uuid"
42
43
"k8s.io/apimachinery/pkg/util/wait"
44
+ "k8s.io/client-go/tools/cache"
43
45
"k8s.io/utils/clock"
44
46
45
47
internalapi "github.com/cert-manager/csi-lib/internal/api"
@@ -157,14 +159,73 @@ func NewManager(opts Options) (*Manager, error) {
157
159
return nil , fmt .Errorf ("building node name label selector: %w" , err )
158
160
}
159
161
162
+ // construct the requestToPrivateKeyMap so we can use event handlers below to manage it
163
+ var requestToPrivateKeyLock sync.Mutex
164
+ requestToPrivateKeyMap := make (map [types.UID ]crypto.PrivateKey )
160
165
// Create an informer factory
161
166
informerFactory := cminformers .NewSharedInformerFactoryWithOptions (opts .Client , 0 , cminformers .WithTweakListOptions (func (opts * metav1.ListOptions ) {
162
167
opts .LabelSelector = labels .NewSelector ().Add (* nodeNameReq ).String ()
163
168
}))
164
169
// Fetch the lister before calling Start() to ensure this informer is
165
170
// registered with the factory
166
171
lister := informerFactory .Certmanager ().V1 ().CertificateRequests ().Lister ()
172
+ informerFactory .Certmanager ().V1 ().CertificateRequests ().Informer ().AddEventHandler (cache.ResourceEventHandlerFuncs {
173
+ DeleteFunc : func (obj interface {}) {
174
+ requestToPrivateKeyLock .Lock ()
175
+ defer requestToPrivateKeyLock .Unlock ()
176
+ key , ok := obj .(string )
177
+ if ! ok {
178
+ return
179
+ }
180
+ namespace , name , err := cache .SplitMetaNamespaceKey (key )
181
+ if err != nil {
182
+ return
183
+ }
184
+ req , err := lister .CertificateRequests (namespace ).Get (name )
185
+ if err != nil {
186
+ // we no longer have a copy of this request, so we can't work out its UID.
187
+ // instead the associated pending request entry for this request will be cleaned up by the periodic 'janitor' task.
188
+ return
189
+ }
190
+
191
+ if _ , ok := requestToPrivateKeyMap [req .UID ]; ok {
192
+ delete (requestToPrivateKeyMap , req .UID )
193
+ }
194
+ },
195
+ })
196
+
197
+ // create a stop channel that manages all sub goroutines
167
198
stopCh := make (chan struct {})
199
+ // begin a background routine which periodically checks to ensure all members of the pending request map actually
200
+ // have corresponding CertificateRequest objects in the apiserver.
201
+ // This avoids leaking memory if we don't observe a request being deleted, or we observe it after the lister has purged
202
+ // the request data from its cache.
203
+ // this routine must be careful to not delete entries from this map that have JUST been added to the map, but haven't
204
+ // been observed by the lister yet (else it may purge data we want to keep, causing a whole new request cycle).
205
+ // for now, to avoid this case, we only run the routine every 5 minutes. It would be better if we recorded the time we
206
+ // added the entry to the map instead, and only purged items from the map that are older that N duration (TBD).
207
+ janitorLogger := opts .Log .WithName ("pending_request_janitor" )
208
+ go wait .Until (func () {
209
+ requestToPrivateKeyLock .Lock ()
210
+ defer requestToPrivateKeyLock .Unlock ()
211
+ reqs , err := lister .List (labels .Everything ())
212
+ if err != nil {
213
+ janitorLogger .Error (err , "failed listing existing requests" )
214
+ return
215
+ }
216
+
217
+ existsMap := make (map [types.UID ]struct {})
218
+ for _ , req := range reqs {
219
+ existsMap [req .UID ] = struct {}{}
220
+ }
221
+
222
+ for uid := range requestToPrivateKeyMap {
223
+ if _ , ok := existsMap [uid ]; ! ok {
224
+ // purge the item from the map as it does not exist in the apiserver
225
+ delete (requestToPrivateKeyMap , uid )
226
+ }
227
+ }
228
+ }, time .Minute * 5 , stopCh )
168
229
// Begin watching the API
169
230
informerFactory .Start (stopCh )
170
231
informerFactory .WaitForCacheSync (stopCh )
@@ -173,9 +234,13 @@ func NewManager(opts Options) (*Manager, error) {
173
234
client : opts .Client ,
174
235
clientForMetadata : opts .ClientForMetadata ,
175
236
lister : lister ,
176
- metadataReader : opts .MetadataReader ,
177
- clock : opts .Clock ,
178
- log : * opts .Log ,
237
+ // we pass a pointer to the mutex as the janitor routine above also uses this lock,
238
+ // so we want to avoid making a copy of it
239
+ requestToPrivateKeyLock : & requestToPrivateKeyLock ,
240
+ requestToPrivateKeyMap : requestToPrivateKeyMap ,
241
+ metadataReader : opts .MetadataReader ,
242
+ clock : opts .Clock ,
243
+ log : * opts .Log ,
179
244
180
245
generatePrivateKey : opts .GeneratePrivateKey ,
181
246
generateRequest : opts .GenerateRequest ,
@@ -260,6 +325,10 @@ type Manager struct {
260
325
// lister is used as a read-only cache of CertificateRequest resources
261
326
lister cmlisters.CertificateRequestLister
262
327
328
+ // A map that associates a CertificateRequest's UID with its private key.
329
+ requestToPrivateKeyLock * sync.Mutex
330
+ requestToPrivateKeyMap map [types.UID ]crypto.PrivateKey
331
+
263
332
// used to read metadata from the store
264
333
metadataReader storage.MetadataReader
265
334
@@ -298,10 +367,23 @@ type Manager struct {
298
367
// requestNameGenerator generates a new random name for a certificaterequest object
299
368
// Defaults to uuid.NewUUID() from k8s.io/apimachinery/pkg/util/uuid.
300
369
requestNameGenerator func () string
370
+
371
+ // doNotUse_CallOnEachIssue is a field used SOLELY for testing, and cannot be configured by external package consumers.
372
+ // It is used to perform some action (e.g. counting) each time issue() is called.
373
+ // It will be removed as soon as we have actual metrics support in csi-lib, which will allow us to measure
374
+ // things like the number of times issue() is called.
375
+ // No thread safety is added around this field, and it MUST NOT be used for any implementation logic.
376
+ // It should not be used full-stop :).
377
+ doNotUse_CallOnEachIssue func ()
301
378
}
302
379
303
380
// issue will step through the entire issuance flow for a volume.
304
381
func (m * Manager ) issue (ctx context.Context , volumeID string ) error {
382
+ // TODO: remove this code and replace with actual metrics support
383
+ if m .doNotUse_CallOnEachIssue != nil {
384
+ m .doNotUse_CallOnEachIssue ()
385
+ }
386
+
305
387
log := m .log .WithValues ("volume_id" , volumeID )
306
388
log .Info ("Processing issuance" )
307
389
@@ -315,10 +397,32 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {
315
397
}
316
398
log .V (2 ).Info ("Read metadata" , "metadata" , meta )
317
399
400
+ // check if there is already a pending request in-flight for this volume.
401
+ // if there is, and we still have a copy of its private key in memory, we can resume the existing request and
402
+ // avoid creating additional CertificateRequest objects.
403
+ existingReq , err := m .findPendingRequest (meta )
404
+ if err != nil {
405
+ return fmt .Errorf ("failed when checking if an existing request exists: %w" , err )
406
+ }
407
+ // if there is an existing in-flight request, attempt to 'resume' it (i.e. re-check to see if it is ready)
408
+ if existingReq != nil {
409
+ // we can only resume a request if we still have a reference to its private key, so look that up in our pending
410
+ // requests map
411
+ if key , ok := m .readPendingRequestPrivateKey (existingReq .UID ); ok {
412
+ log .V (4 ).Info ("Re-using existing certificaterequest" )
413
+ return m .handleRequest (ctx , volumeID , meta , key , existingReq )
414
+ }
415
+
416
+ // if we don't have a copy of the associated private key, delete the currently in-flight request
417
+ log .V (2 ).Info ("Found existing request that we don't have corresponding private key for - restarting request process" )
418
+ if err := m .client .CertmanagerV1 ().CertificateRequests (existingReq .Namespace ).Delete (ctx , existingReq .Name , metav1.DeleteOptions {}); err != nil {
419
+ return fmt .Errorf ("failed to delete existing in-flight request: %w" , err )
420
+ }
421
+ }
422
+
318
423
if ready , reason := m .readyToRequest (meta ); ! ready {
319
424
return fmt .Errorf ("driver is not ready to request a certificate for this volume: %v" , reason )
320
425
}
321
-
322
426
key , err := m .generatePrivateKey (meta )
323
427
if err != nil {
324
428
return fmt .Errorf ("generating private key: %w" , err )
@@ -343,11 +447,78 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {
343
447
}
344
448
log .Info ("Created new CertificateRequest resource" )
345
449
346
- // Poll every 1s for the CertificateRequest to be ready
450
+ // persist the reference to the private key in memory so we can resume this request in future if it doesn't complete
451
+ // the first time.
452
+ m .writePendingRequestPrivateKey (req .UID , key )
453
+ return m .handleRequest (ctx , volumeID , meta , key , req )
454
+ }
455
+
456
+ func (m * Manager ) readPendingRequestPrivateKey (uid types.UID ) (crypto.PrivateKey , bool ) {
457
+ m .requestToPrivateKeyLock .Lock ()
458
+ defer m .requestToPrivateKeyLock .Unlock ()
459
+ key , ok := m .requestToPrivateKeyMap [uid ]
460
+ return key , ok
461
+ }
462
+
463
+ func (m * Manager ) writePendingRequestPrivateKey (uid types.UID , key crypto.PrivateKey ) {
464
+ m .requestToPrivateKeyLock .Lock ()
465
+ defer m .requestToPrivateKeyLock .Unlock ()
466
+ m .requestToPrivateKeyMap [uid ] = key
467
+ }
468
+
469
+ func (m * Manager ) deletePendingRequestPrivateKey (uid types.UID ) {
470
+ m .requestToPrivateKeyLock .Lock ()
471
+ defer m .requestToPrivateKeyLock .Unlock ()
472
+ delete (m .requestToPrivateKeyMap , uid )
473
+ }
474
+
475
+ func (m * Manager ) findPendingRequest (meta metadata.Metadata ) (* cmapi.CertificateRequest , error ) {
476
+ reqs , err := m .listAllRequestsForVolume (meta .VolumeID )
477
+ if err != nil {
478
+ return nil , err
479
+ }
480
+
481
+ if len (reqs ) == 0 {
482
+ return nil , nil
483
+ }
484
+
485
+ // only consider the newest request - we will never resume an older request
486
+ req := reqs [0 ]
487
+ if ! certificateRequestCanBeResumed (req ) {
488
+ return nil , nil
489
+ }
490
+
491
+ // TODO: check if this request is still actually valid for the input metadata
492
+ return req , nil
493
+ }
494
+
495
+ func certificateRequestCanBeResumed (req * cmapi.CertificateRequest ) bool {
496
+ for _ , cond := range req .Status .Conditions {
497
+ if cond .Type == cmapi .CertificateRequestConditionReady {
498
+ switch cond .Reason {
499
+ case cmapi .CertificateRequestReasonPending , cmapi .CertificateRequestReasonIssued , "" :
500
+ // either explicit Pending, Issued or empty is considered re-sumable
501
+ return true
502
+ default :
503
+ // any other state is a terminal failed state and means the request has failed
504
+ return false
505
+ }
506
+ }
507
+ }
508
+ // if there is no Ready condition, the request is still pending processing
509
+ return true
510
+ }
511
+
512
+ func (m * Manager ) handleRequest (ctx context.Context , volumeID string , meta metadata.Metadata , key crypto.PrivateKey , req * cmapi.CertificateRequest ) error {
513
+ log := m .log .WithValues ("volume_id" , volumeID )
514
+
515
+ // Poll every 200ms for the CertificateRequest to be ready
347
516
lastFailureReason := ""
348
- if err := wait .PollUntilWithContext (ctx , time .Second , func (ctx context.Context ) (done bool , err error ) {
517
+ if err := wait .PollUntilWithContext (ctx , time .Millisecond * 200 , func (ctx context.Context ) (done bool , err error ) {
518
+ log .V (4 ).Info ("Reading CertificateRequest from lister cache" )
349
519
updatedReq , err := m .lister .CertificateRequests (req .Namespace ).Get (req .Name )
350
520
if apierrors .IsNotFound (err ) {
521
+ log .V (4 ).Info ("Failed to read CertificateRequest from lister cache" , "error" , err )
351
522
// A NotFound error implies something deleted the resource - fail
352
523
// early to allow a retry to occur at a later time if needed.
353
524
return false , err
@@ -371,6 +542,7 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {
371
542
372
543
isApproved := apiutil .CertificateRequestIsApproved (updatedReq )
373
544
if ! isApproved {
545
+ log .V (4 ).Info ("CertificateRequest is not explicitly approved - continuing to check if the request has been issued anyway" )
374
546
lastFailureReason = fmt .Sprintf ("request %q has not yet been approved by approval plugin" , updatedReq .Name )
375
547
// we don't stop execution here, as some versions of cert-manager (and some external issuer plugins)
376
548
// may not be aware/utilise approval.
@@ -380,6 +552,7 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {
380
552
381
553
readyCondition := apiutil .GetCertificateRequestCondition (updatedReq , cmapi .CertificateRequestConditionReady )
382
554
if readyCondition == nil {
555
+ log .V (4 ).Info ("Ready condition not found - will recheck..." )
383
556
// only overwrite the approval failure message if the request is actually approved
384
557
// otherwise we may hide more useful information from the user by accident.
385
558
if isApproved {
@@ -390,10 +563,12 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {
390
563
391
564
switch readyCondition .Reason {
392
565
case cmapi .CertificateRequestReasonIssued :
566
+ log .V (4 ).Info ("CertificateRequest has been issued!" )
393
567
break
394
568
case cmapi .CertificateRequestReasonFailed :
395
569
return false , fmt .Errorf ("request %q has failed: %s" , updatedReq .Name , readyCondition .Message )
396
570
case cmapi .CertificateRequestReasonPending :
571
+ log .V (4 ).Info ("CertificateRequest is still pending..." )
397
572
if isApproved {
398
573
lastFailureReason = fmt .Sprintf ("request %q is pending: %v" , updatedReq .Name , readyCondition .Message )
399
574
}
@@ -425,37 +600,55 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {
425
600
return fmt .Errorf ("calculating next issuance time: %w" , err )
426
601
}
427
602
meta .NextIssuanceTime = & renewalPoint
603
+ log .V (4 ).Info ("Persisting next issuance time to metadata store" , "next_issuance_time" , renewalPoint )
428
604
429
605
if err := m .writeKeypair (meta , key , req .Status .Certificate , req .Status .CA ); err != nil {
430
606
return fmt .Errorf ("writing keypair: %w" , err )
431
607
}
432
608
log .V (2 ).Info ("Wrote new keypair to storage" )
433
609
610
+ // We must explicitly delete the private key from the pending requests map so that the existing Completed
611
+ // request will not be re-used upon renewal.
612
+ // Without this, the renewal would pick up the existing issued certificate and re-issue, rather than requesting
613
+ // a new certificate.
614
+ m .deletePendingRequestPrivateKey (req .UID )
615
+ log .V (4 ).Info ("Removed pending request private key from internal cache" )
616
+
434
617
return nil
435
618
}
436
619
437
- func (m * Manager ) cleanupStaleRequests (ctx context.Context , log logr.Logger , volumeID string ) error {
620
+ // returns a list of all pending certificaterequest objects for the given volumeID.
621
+ // the returned slice will be ordered with the most recent request FIRST.
622
+ func (m * Manager ) listAllRequestsForVolume (volumeID string ) ([]* cmapi.CertificateRequest , error ) {
438
623
sel , err := m .labelSelectorForVolume (volumeID )
439
624
if err != nil {
440
- return fmt .Errorf ("internal error building label selector - this is a bug, please open an issue: %w" , err )
625
+ return nil , fmt .Errorf ("internal error building label selector - this is a bug, please open an issue: %w" , err )
441
626
}
442
627
443
628
reqs , err := m .lister .List (sel )
444
629
if err != nil {
445
- return fmt .Errorf ("listing certificaterequests: %w" , err )
446
- }
447
-
448
- if len (reqs ) < m .maxRequestsPerVolume {
449
- return nil
630
+ return nil , fmt .Errorf ("listing certificaterequests: %w" , err )
450
631
}
451
632
452
633
// sort newest first to oldest last
453
634
sort .Slice (reqs , func (i , j int ) bool {
454
635
return reqs [i ].CreationTimestamp .After (reqs [j ].CreationTimestamp .Time )
455
636
})
456
637
638
+ return reqs , nil
639
+ }
640
+
641
+ func (m * Manager ) cleanupStaleRequests (ctx context.Context , log logr.Logger , volumeID string ) error {
642
+ reqs , err := m .listAllRequestsForVolume (volumeID )
643
+ if err != nil {
644
+ return err
645
+ }
646
+ if len (reqs ) <= m .maxRequestsPerVolume {
647
+ return nil
648
+ }
649
+
457
650
// start at the end of the slice and work back to maxRequestsPerVolume
458
- for i := len (reqs ) - 1 ; i >= m .maxRequestsPerVolume - 1 ; i -- {
651
+ for i := len (reqs ) - 1 ; i > m .maxRequestsPerVolume - 1 ; i -- {
459
652
toDelete := reqs [i ]
460
653
if err := m .client .CertmanagerV1 ().CertificateRequests (toDelete .Namespace ).Delete (ctx , toDelete .Name , metav1.DeleteOptions {}); err != nil {
461
654
if apierrors .IsNotFound (err ) {
0 commit comments