Skip to content

Commit a9849f2

Browse files
committed
nfd-master: fix retry of node updates
This patch addresses issues with slow node status (extended resources) updates. Previously we did just a few retries in quick succession, which could result in the node update failing simply because the node status was updated more slowly than our retry window allowed. The patch mitigates the issue by increasing the number of tries to 15. In addition, it creates a rate limiter with a longer per-item (per-node) base delay. The patch also fixes the e2e-tests to expose the issue.
1 parent b6231b6 commit a9849f2

File tree

3 files changed

+13
-3
lines changed

3 files changed

+13
-3
lines changed

pkg/nfd-master/node-updater-pool.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ package nfdmaster
1818

1919
import (
2020
"sync"
21+
"time"
2122

23+
"golang.org/x/time/rate"
2224
"k8s.io/client-go/util/workqueue"
2325
"k8s.io/klog/v2"
2426
)
@@ -48,7 +50,7 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI
4850

4951
nodeUpdateRequests.Inc()
5052
if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName.(string)); err != nil {
51-
if queue.NumRequeues(nodeName) < 5 {
53+
if queue.NumRequeues(nodeName) < 15 {
5254
klog.InfoS("retrying node update", "nodeName", nodeName)
5355
queue.AddRateLimited(nodeName)
5456
return true
@@ -77,7 +79,14 @@ func (u *nodeUpdaterPool) start(parallelism int) {
7779
}
7880

7981
klog.InfoS("starting the NFD master node updater pool", "parallelism", parallelism)
80-
u.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
82+
83+
// Create ratelimiter. Mimic workqueue.DefaultControllerRateLimiter() but
84+
// with modified per-item (node) rate limiting parameters.
85+
rl := workqueue.NewMaxOfRateLimiter(
86+
workqueue.NewItemExponentialFailureRateLimiter(50*time.Millisecond, 100*time.Second),
87+
&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
88+
)
89+
u.queue = workqueue.NewRateLimitingQueue(rl)
8190

8291
for i := 0; i < parallelism; i++ {
8392
u.wg.Add(1)

test/e2e/gomega.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ type k8sAnnotations map[string]string
3939
func eventuallyNonControlPlaneNodes(ctx context.Context, cli clientset.Interface) AsyncAssertion {
4040
return Eventually(func(g Gomega, ctx context.Context) ([]corev1.Node, error) {
4141
return getNonControlPlaneNodes(ctx, cli)
42-
}).WithPolling(1 * time.Second).WithTimeout(10 * time.Second).WithContext(ctx)
42+
}).WithPolling(1 * time.Second).WithTimeout(20 * time.Second).WithContext(ctx)
4343
}
4444

4545
// MatchLabels returns a specialized Gomega matcher for checking if a list of

test/e2e/node_feature_discovery_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -793,6 +793,7 @@ core:
793793
Expect(err).NotTo(HaveOccurred())
794794

795795
By("Verfiying node status capacity from NodeFeatureRules #4")
796+
expectedCapacity = map[string]corev1.ResourceList{"*": {}}
796797
eventuallyNonControlPlaneNodes(ctx, f.ClientSet).Should(MatchCapacity(expectedCapacity, nodes, false))
797798

798799
By("Deleting nfd-worker daemonset")

0 commit comments

Comments
 (0)