
Commit 4d748d6

refactor nodeserver to only attempt issuance once in a single NodePublishVolume call
Signed-off-by: James Munnelly <[email protected]>
1 parent 9baa172 commit 4d748d6

File tree

3 files changed: +111 -39 lines changed


driver/nodeserver.go

Lines changed: 24 additions & 25 deletions
@@ -26,7 +26,6 @@ import (
 	"github.com/go-logr/logr"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/status"
-	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/mount-utils"
 
 	"github.com/cert-manager/csi-lib/manager"
@@ -79,32 +78,31 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
 		}
 	}
 
-	if err := ns.manager.ManageVolume(req.GetVolumeId()); err != nil {
-		return nil, err
-	}
-
-	log.Info("Volume registered for management")
-
-	// Only wait for the volume to be ready if it is in a state of 'ready to request'
-	// already. This allows implementors to defer actually requesting certificates
-	// until later in the pod lifecycle (e.g. after CNI has run & an IP address has been
-	// allocated, if a user wants to embed pod IPs into their requests).
-	isReadyToRequest, reason := ns.manager.IsVolumeReadyToRequest(req.GetVolumeId())
-	if !isReadyToRequest {
-		log.Info("Unable to request a certificate right now, will be retried", "reason", reason)
-	}
-	if isReadyToRequest || !ns.continueOnNotReady {
-		log.Info("Waiting for certificate to be issued...")
-		if err := wait.PollUntil(time.Second, func() (done bool, err error) {
-			return ns.manager.IsVolumeReady(req.GetVolumeId()), nil
-		}, ctx.Done()); err != nil {
-			return nil, err
+	if !ns.manager.IsVolumeReady(req.GetVolumeId()) {
+		// Only wait for the volume to be ready if it is in a state of 'ready to request'
+		// already. This allows implementors to defer actually requesting certificates
+		// until later in the pod lifecycle (e.g. after CNI has run & an IP address has been
+		// allocated, if a user wants to embed pod IPs into their requests).
+		isReadyToRequest, reason := ns.manager.IsVolumeReadyToRequest(req.GetVolumeId())
+		if isReadyToRequest {
+			log.V(4).Info("Waiting for certificate to be issued...")
+			if _, err := ns.manager.ManageVolumeImmediate(ctx, req.GetVolumeId()); err != nil {
+				return nil, err
+			}
+			log.Info("Volume registered for management")
+		} else {
+			if ns.continueOnNotReady {
+				log.V(4).Info("Skipping waiting for certificate to be issued")
+				ns.manager.ManageVolume(req.GetVolumeId())
+				log.V(4).Info("Volume registered for management")
+			} else {
+				log.Info("Unable to request a certificate right now, will be retried", "reason", reason)
+				return nil, fmt.Errorf("volume is not yet ready to be setup, will be retried: %s", reason)
+			}
 		}
-	} else {
-		log.Info("Skipping waiting for certificate to be issued")
 	}
 
-	log.Info("Volume ready for mounting")
+	log.Info("Ensuring data directory for volume is mounted into pod...")
 	notMnt, err := mount.IsNotMountPoint(ns.mounter, req.GetTargetPath())
 	switch {
 	case os.IsNotExist(err):
@@ -118,11 +116,12 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
 
 	if !notMnt {
 		// Nothing more to do if the targetPath is already a bind mount
+		log.Info("Volume already mounted to pod, nothing to do")
 		success = true
 		return &csi.NodePublishVolumeResponse{}, nil
 	}
 
-	log.Info("Bind mounting data directory to the targetPath")
+	log.Info("Bind mounting data directory to the pod's mount namespace")
 	// bind mount the targetPath to the data directory
 	if err := ns.mounter.Mount(ns.store.PathForVolume(req.GetVolumeId()), req.GetTargetPath(), "", []string{"bind", "ro"}); err != nil {
 		return nil, err
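
Taken together, the hunks above replace the old poll-until-ready loop with a single decision per NodePublishVolume call. As a rough summary (a standalone sketch with hypothetical names, not code from the repository), the branch logic reduces to:

package main

import "fmt"

// publishAction mirrors the branch structure the diff introduces into
// NodePublishVolume: issue at most once per call, hand off to background
// management when allowed, or fail so the node plugin retries the call.
func publishAction(volumeReady, readyToRequest, continueOnNotReady bool) string {
	switch {
	case volumeReady:
		return "certificate already issued: go straight to bind-mounting"
	case readyToRequest:
		return "call ManageVolumeImmediate once; return its error if issuance fails"
	case continueOnNotReady:
		return "call ManageVolume and let the background routine issue later"
	default:
		return "return an error so NodePublishVolume is retried"
	}
}

func main() {
	fmt.Println(publishAction(false, true, false))  // single immediate issuance attempt
	fmt.Println(publishAction(false, false, true))  // deferred issuance
	fmt.Println(publishAction(false, false, false)) // caller retries
}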

manager/manager.go

Lines changed: 85 additions & 12 deletions
@@ -454,22 +454,67 @@ func (m *Manager) submitRequest(ctx context.Context, meta metadata.Metadata, csr
 	return req, nil
 }
 
-// ManageVolume will initiate management of data for the given volumeID.
-func (m *Manager) ManageVolume(volumeID string) error {
+// ManageVolumeImmediate will register a volume for management and immediately attempt a single issuance.
+// This
+func (m *Manager) ManageVolumeImmediate(ctx context.Context, volumeID string) (managed bool, err error) {
+	if !m.manageVolumeIfNotManaged(volumeID) {
+		return false, nil
+	}
+
+	meta, err := m.metadataReader.ReadMetadata(volumeID)
+	if err != nil {
+		return true, fmt.Errorf("reading metadata: %w", err)
+	}
+
+	// Only attempt issuance immediately if there isn't already an issued certificate
+	if meta.NextIssuanceTime == nil {
+		// If issuance fails, immediately return without retrying so the caller can decide
+		// how to proceed depending on the context this method was called within.
+		if err := m.issue(ctx, volumeID); err != nil {
+			return true, err
+		}
+	}
+
+	if !m.startRenewalRoutine(volumeID) {
+		return true, fmt.Errorf("unexpected state: renewal routine not started, please open an issue at https://github.com/cert-manager/csi-lib")
+	}
+
+	return true, nil
+}
+
+// manageVolumeIfNotManaged will ensure the named volume has been registered for management.
+// It returns 'true' if the volume was not previously managed, and false if the volume was already managed.
+func (m *Manager) manageVolumeIfNotManaged(volumeID string) (managed bool) {
 	m.lock.Lock()
 	defer m.lock.Unlock()
 	log := m.log.WithValues("volume_id", volumeID)
 
 	// if the volume is already managed, return early
-	if _, ok := m.managedVolumes[volumeID]; ok {
+	if _, managed := m.managedVolumes[volumeID]; managed {
 		log.V(2).Info("Volume already registered for management")
-		return nil
+		return false
 	}
 
 	// construct a new channel used to stop management of the volume
 	stopCh := make(chan struct{})
 	m.managedVolumes[volumeID] = stopCh
 
+	return true
+}
+
+// startRenewalRoutine will begin the background issuance goroutine for the given volumeID.
+// It is the caller's responsibility to ensure this is only called once per volume.
+func (m *Manager) startRenewalRoutine(volumeID string) (started bool) {
+	m.lock.Lock()
+	defer m.lock.Unlock()
+	log := m.log.WithValues("volume_id", volumeID)
+
+	stopCh, ok := m.managedVolumes[volumeID]
+	if !ok {
+		log.Info("Volume not registered for management, cannot start renewal routine...")
+		return false
+	}
+
 	// Create a context that will be cancelled when the stopCh is closed
 	ctx, cancel := context.WithCancel(context.Background())
 	go func() {
@@ -481,7 +526,6 @@ func (m *Manager) ManageVolume(volumeID string) error {
 
 	go func() {
 		// check every volume once per second
-		// TODO: optimise this to not check so often
 		ticker := time.NewTicker(time.Second)
 		for {
 			select {
@@ -496,9 +540,14 @@
 				}
 
 				if meta.NextIssuanceTime == nil || m.clock.Now().After(*meta.NextIssuanceTime) {
-					wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
-						// 2s is the 'base' amount of time for the backoff
-						Duration: time.Second * 2,
+					// If issuing a certificate fails, we don't go around the outer for loop again (as we'd then be creating
+					// a new CertificateRequest every second).
+					// Instead, retry within the same iteration of the for loop and apply an exponential backoff.
+					// Because we pass ctx through to the 'wait' package, if the stopCh is closed/context is cancelled,
+					// we'll immediately stop waiting and 'continue' which will then hit the `case <-stopCh` case in the `select`.
+					if err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
+						// 8s is the 'base' amount of time for the backoff
+						Duration: time.Second * 8,
 						// We multiple the 'duration' by 2.0 if the attempt fails/errors
 						Factor: 2.0,
 						// Add a jitter of +/- 1s (0.5 of the 'duration')
@@ -507,22 +556,39 @@
 						// reset back to the 'base duration'. Set this to the MaxInt32, as we never want to
 						// reset this unless we get a successful attempt.
 						Steps: math.MaxInt32,
-						// The maximum time between calls will be 1 minute
-						Cap: time.Minute,
+						// The maximum time between calls will be 5 minutes
+						Cap: time.Minute * 5,
 					}, func() (bool, error) {
 						log.Info("Triggering new issuance")
 						if err := m.issue(ctx, volumeID); err != nil {
 							log.Error(err, "Failed to issue certificate, retrying after applying exponential backoff")
 							return false, nil
 						}
 						return true, nil
-					})
+					}); err != nil {
+						if errors.Is(err, wait.ErrWaitTimeout) || errors.Is(err, context.DeadlineExceeded) {
+							continue
+						}
+						// this should never happen as the function above never actually returns errors
+						log.Error(err, "unexpected error")
+					}
 				}
 			}
 		}
 	}()
+	return true
+}
 
-	return nil
+// ManageVolume will initiate management of data for the given volumeID.
+func (m *Manager) ManageVolume(volumeID string) (managed bool) {
+	log := m.log.WithValues("volume_id", volumeID)
+	if managed := m.manageVolumeIfNotManaged(volumeID); !managed {
+		return false
+	}
+	if started := m.startRenewalRoutine(volumeID); !started {
+		log.Info("unexpected state: renewal routine not started, please open an issue at https://github.com/cert-manager/csi-lib")
+	}
+	return true
 }
 
 func (m *Manager) UnmanageVolume(volumeID string) {
@@ -546,6 +612,13 @@ func (m *Manager) IsVolumeReadyToRequest(volumeID string) (bool, string) {
 }
 
 func (m *Manager) IsVolumeReady(volumeID string) bool {
+	m.lock.Lock()
+	defer m.lock.Unlock()
+	// a volume is not classed as Ready if it is not managed
+	if _, managed := m.managedVolumes[volumeID]; !managed {
+		return false
+	}
+
 	meta, err := m.metadataReader.ReadMetadata(volumeID)
 	if err != nil {
 		m.log.Error(err, "failed to read metadata", "volume_id", volumeID)
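
After this change, ManageVolumeImmediate performs the first issuance synchronously and only then starts the renewal routine, while ManageVolume registers the volume and leaves all issuance to that routine. The routine now retries a failed issuance inside the same tick, backing off from 8 seconds up to a 5 minute cap. The standalone sketch below (ignoring the +/- 50% jitter, and not using the wait package itself) prints the approximate delay schedule those Backoff values produce:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Mirrors the Backoff settings in the diff above: Duration 8s, Factor 2.0,
	// Cap 5 minutes. The +/- 0.5x jitter is omitted here, so real delays will
	// vary around these values.
	delay := 8 * time.Second
	maxDelay := 5 * time.Minute

	for attempt := 1; attempt <= 8; attempt++ {
		fmt.Printf("failed attempt %d: next retry in ~%s\n", attempt, delay)
		delay *= 2
		if delay > maxDelay {
			delay = maxDelay
		}
	}
	// Prints 8s, 16s, 32s, 1m4s, 2m8s, 4m16s, then 5m0s for every later attempt.
}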

test/integration/ready_to_request_test.go

Lines changed: 2 additions & 2 deletions
@@ -180,8 +180,8 @@ func TestFailsIfNotReadyToRequest_ContinueOnNotReadyDisabled(t *testing.T) {
 		TargetPath: tmpDir,
 		Readonly: true,
 	})
-	if status.Code(err) != codes.DeadlineExceeded {
-		t.Errorf("Expected timeout to be returned from NodePublishVolume but got: %v", err)
+	if status.Code(err) != codes.Unknown || err.Error() != "rpc error: code = Unknown desc = volume is not yet ready to be setup, will be retried: never ready" {
+		t.Errorf("unexpected error: %v", err)
 	}
 
 	// allow 1s for the cleanup functions in NodePublishVolume to be run
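
The updated assertion relies on grpc-go's default error mapping: a handler that returns a plain Go error (the fmt.Errorf added to NodePublishVolume above) surfaces to clients as codes.Unknown, whereas the old test expected codes.DeadlineExceeded from the timed-out poll. A minimal illustration of that mapping (standalone, not part of the test suite):

package main

import (
	"errors"
	"fmt"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

func main() {
	// An error with no gRPC status attached is reported as codes.Unknown.
	plain := errors.New("volume is not yet ready to be setup, will be retried: never ready")
	fmt.Println(status.Code(plain)) // Unknown

	// An explicit status error keeps its code, e.g. the DeadlineExceeded the old
	// test expected once the wait.PollUntil loop ran out of time.
	deadline := status.Error(codes.DeadlineExceeded, "context deadline exceeded")
	fmt.Println(status.Code(deadline)) // DeadlineExceeded
}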
