Skip to content

Commit a2068f7

Browse files
committed
nfd-master: tweak list options for NodeFeature informer
Fix cache syncing problems on big clusters with thousands of NodeFeature objects. On the initial list (sync) the client-go cache reflector sets the ResourceVersion to "0" (instead of leaving it empty). This causes problems in the API server, with (apiserver) logs like:

E writers.go:122] apiserver was unable to write a JSON response: http: Handler timeout
E status.go:71] apiserver received an error that is not an metav1.Status: &errors.errorString{s:"http: Handler timeout"}: http: Handler timeout

On the nfd-master side we see corresponding log snippets like:

W reflector.go:547] failed to list *v1alpha1.NodeFeature: stream error when reading response body, may be caused by closed connection. Please retry. Original error: stream error: stream ID 1521; INTERNAL_ERROR; received from peer
I trace.go:236] "Reflector ListAndWatch" name:*** (***) (total time: 61126ms): ---"Objects listed" error:stream error when reading response body, may be caused by closed connection. Please retry. Original error: stream error: stream ID 1521; INTERNAL_ERROR; received from peer 61126ms (***)

Decreasing the page size (opts.Limit) does not have any effect on the timeouts. However, setting ResourceVersion to an empty value seems to get the paging back on track, eliminating the timeouts.

TODO: investigate in Kubernetes upstream the root cause of the timeouts with ResourceVersion="0".
1 parent bd8d74d commit a2068f7

File tree

1 file changed

+12
-1
lines changed

1 file changed

+12
-1
lines changed

pkg/nfd-master/nfd-api-controller.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
nfdclientset "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned"
3030
nfdscheme "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned/scheme"
3131
nfdinformers "sigs.k8s.io/node-feature-discovery/api/generated/informers/externalversions"
32+
nfdinformersv1alpha1 "sigs.k8s.io/node-feature-discovery/api/generated/informers/externalversions/nfd/v1alpha1"
3233
nfdlisters "sigs.k8s.io/node-feature-discovery/api/generated/listers/nfd/v1alpha1"
3334
nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/api/nfd/v1alpha1"
3435
"sigs.k8s.io/node-feature-discovery/pkg/utils"
@@ -67,13 +68,23 @@ func newNfdController(config *restclient.Config, nfdApiControllerOptions nfdApiC
6768
}
6869

6970
nfdClient := nfdclientset.NewForConfigOrDie(config)
71+
7072
klog.V(2).InfoS("initializing new NFD API controller", "options", utils.DelayedDumper(nfdApiControllerOptions))
7173

7274
informerFactory := nfdinformers.NewSharedInformerFactory(nfdClient, nfdApiControllerOptions.ResyncPeriod)
7375

7476
// Add informer for NodeFeature objects
7577
if !nfdApiControllerOptions.DisableNodeFeature {
76-
featureInformer := informerFactory.Nfd().V1alpha1().NodeFeatures()
78+
tweakListOpts := func(opts *metav1.ListOptions) {
79+
// Tweak list opts on initial sync to avoid timeouts on the apiserver.
80+
// NodeFeature objects are huge and the Kubernetes apiserver
81+
// (v1.30) experiences http handler timeouts when the resource
82+
// version is set to "0" instead of being left empty (TODO: find out why).
83+
if opts.ResourceVersion == "0" {
84+
opts.ResourceVersion = ""
85+
}
86+
}
87+
featureInformer := nfdinformersv1alpha1.New(informerFactory, "", tweakListOpts).NodeFeatures()
7788
if _, err := featureInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
7889
AddFunc: func(obj interface{}) {
7990
nfr := obj.(*nfdv1alpha1.NodeFeature)

0 commit comments

Comments
 (0)