@@ -30,9 +30,12 @@ import (
3030 "sync"
3131 "time"
3232
33+ "github.com/kubernetes-csi/csi-lib-utils/slowset"
3334 "github.com/prometheus/client_golang/prometheus"
3435 "github.com/prometheus/client_golang/prometheus/promhttp"
3536 "golang.org/x/time/rate"
37+ "google.golang.org/grpc/codes"
38+ "google.golang.org/grpc/status"
3639 v1 "k8s.io/api/core/v1"
3740 storage "k8s.io/api/storage/v1"
3841 storagebeta "k8s.io/api/storage/v1beta1"
@@ -183,6 +186,10 @@ type ProvisionController struct {
183186 volumeStore VolumeStore
184187
185188 volumeNameHook VolumeNameHook
189+
190+ slowSet * slowset.SlowSet
191+
192+ retryIntervalMax time.Duration
186193}
187194
188195const (
@@ -216,6 +223,8 @@ const (
216223 DefaultMetricsPath = "/metrics"
217224 // DefaultAddFinalizer is used when option function AddFinalizer is omitted
218225 DefaultAddFinalizer = false
226+ // DefaultRetryIntervalMax is used when option function RetryIntervalMax is omitted
227+ DefaultRetryIntervalMax = 5 * time .Minute
219228)
220229
221230var errRuntime = fmt .Errorf ("cannot call option functions after controller has Run" )
@@ -451,6 +460,18 @@ func RetryPeriod(retryPeriod time.Duration) func(*ProvisionController) error {
451460 }
452461}
453462
463+ // RetryIntervalMax is the maximum retry interval of failed provisioning or deletion.
464+ // Defaults to 5 minutes.
465+ func RetryIntervalMax (retryIntervalMax time.Duration ) func (* ProvisionController ) error {
466+ return func (c * ProvisionController ) error {
467+ if c .HasRun () {
468+ return errRuntime
469+ }
470+ c .retryIntervalMax = retryIntervalMax
471+ return nil
472+ }
473+ }
474+
454475// ClaimsInformer sets the informer to use for accessing PersistentVolumeClaims.
455476// Defaults to using a internal informer.
456477func ClaimsInformer (informer cache.SharedIndexInformer ) func (* ProvisionController ) error {
@@ -667,8 +688,11 @@ func NewProvisionController(
667688 hasRun : false ,
668689 hasRunLock : & sync.Mutex {},
669690 volumeNameHook : getProvisionedVolumeNameForClaim ,
691+ retryIntervalMax : DefaultRetryIntervalMax ,
670692 }
671693
694+ controller .slowSet = slowset .NewSlowSet (controller .retryIntervalMax )
695+
672696 for _ , option := range options {
673697 err := option (controller )
674698 if err != nil {
@@ -840,6 +864,8 @@ func (ctrl *ProvisionController) Run(ctx context.Context) {
840864 defer ctrl .claimQueue .ShutDown ()
841865 defer ctrl .volumeQueue .ShutDown ()
842866
867+ go ctrl .slowSet .Run (ctx .Done ())
868+
843869 ctrl .hasRunLock .Lock ()
844870 ctrl .hasRun = true
845871 ctrl .hasRunLock .Unlock ()
@@ -1085,6 +1111,10 @@ func (ctrl *ProvisionController) syncClaim(ctx context.Context, obj interface{})
10851111 return fmt .Errorf ("expected claim but got %+v" , obj )
10861112 }
10871113
1114+ if err := ctrl .delayProvisioningIfRecentlyInfeasible (claim ); err != nil {
1115+ return err
1116+ }
1117+
10881118 should , err := ctrl .shouldProvision (ctx , claim )
10891119 if err != nil {
10901120 ctrl .updateProvisionStats (claim , err , time.Time {})
@@ -1494,7 +1524,20 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
14941524 }
14951525
14961526 ctx2 := klog .NewContext (ctx , logger )
1497- err = fmt .Errorf ("failed to provision volume with StorageClass %q: %v" , claimClass , err )
1527+
1528+ if isInfeasibleError (err ) {
1529+ logger .V (2 ).Info ("Detected infeasible volume provisioning request" ,
1530+ "error" , err ,
1531+ "claim" , klog .KObj (claim ))
1532+
1533+ ctrl .markForSlowRetry (ctx , claim , err )
1534+
1535+ ctrl .eventRecorder .Event (claim , v1 .EventTypeWarning , "ProvisioningFailed" ,
1536+ fmt .Sprintf ("Volume provisioning failed with infeasible error. Retries will be delayed. %v" , err ))
1537+
1538+ return ProvisioningFinished , err
1539+ }
1540+
14981541 return ctrl .provisionVolumeErrorHandling (ctx2 , result , err , claim )
14991542 }
15001543
@@ -1519,6 +1562,62 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
15191562 return ProvisioningFinished , nil
15201563}
15211564
1565+ func (ctrl * ProvisionController ) delayProvisioningIfRecentlyInfeasible (claim * v1.PersistentVolumeClaim ) error {
1566+ key := string (claim .UID )
1567+
1568+ claimClass := util .GetPersistentVolumeClaimClass (claim )
1569+ currentClass , err := ctrl .getStorageClass (claimClass )
1570+ if err != nil {
1571+ return nil
1572+ }
1573+
1574+ if info , exists := ctrl .slowSet .Get (key ); exists {
1575+ if info .StorageClassUID != string (currentClass .UID ) {
1576+ ctrl .slowSet .Remove (key )
1577+ return nil
1578+ }
1579+ }
1580+ if delay := ctrl .slowSet .TimeRemaining (key ); delay > 0 {
1581+ return util .NewDelayRetryError (fmt .Sprintf ("skipping volume provisioning for pvc %s, because provisioning previously failed with infeasible error" , key ))
1582+ }
1583+ return nil
1584+ }
1585+
1586+ func (ctrl * ProvisionController ) markForSlowRetry (ctx context.Context , claim * v1.PersistentVolumeClaim , err error ) {
1587+ if isInfeasibleError (err ) {
1588+ key := string (claim .UID )
1589+
1590+ claimClass := util .GetPersistentVolumeClaimClass (claim )
1591+ class , err := ctrl .getStorageClass (claimClass )
1592+ if err != nil {
1593+ logger := klog .FromContext (ctx )
1594+ logger .Error (err , "Failed to get StorageClass for delay tracking" ,
1595+ "PVC" , klog .KObj (claim ))
1596+ return
1597+ }
1598+
1599+ info := slowset.ObjectData {
1600+ Timestamp : time .Now (),
1601+ StorageClassUID : string (class .UID ),
1602+ }
1603+ ctrl .slowSet .Add (key , info )
1604+ }
1605+ }
1606+
1607+ func isInfeasibleError (err error ) bool {
1608+
1609+ st , ok := status .FromError (err )
1610+ if ! ok {
1611+ return false
1612+ }
1613+
1614+ switch st .Code () {
1615+ case codes .InvalidArgument :
1616+ return true
1617+ }
1618+ return false
1619+ }
1620+
15221621func (ctrl * ProvisionController ) provisionVolumeErrorHandling (ctx context.Context , result ProvisioningState , err error , claim * v1.PersistentVolumeClaim ) (ProvisioningState , error ) {
15231622 logger := klog .FromContext (ctx )
15241623 ctrl .eventRecorder .Event (claim , v1 .EventTypeWarning , "ProvisioningFailed" , err .Error ())
0 commit comments