nfd-master: tweak list options for NodeFeature informer

marquiz · marquiz · commit a2068f7ce339 · 2024-07-25T16:29:05.000+03:00
Fix cache syncing problems on big clusters with thousands of NodeFeature
objects.

On the initial list (sync) the client-go cache reflector sets the
ResourceVersion to "0" (instead of leaving it empty). This causes
problems in the api server with (apiserver) logs like:

E writers.go:122] apiserver was unable to write a JSON response: http:
                  Handler timeout
E status.go:71] apiserver received an error that is not an
                metav1.Status: &errors.errorString{s:"http: Handler timeout"}:
                http: Handler timeout

On the nfd-master side we see corresponding log snippets like:

W reflector.go:547] failed to list *v1alpha1.NodeFeature: stream error
                    when reading response body, may be caused by closed
                    connection. Please retry. Original error: stream
                    error: stream ID 1521; INTERNAL_ERROR; received from
                    peer
I trace.go:236] "Reflector ListAndWatch" name:*** (***) (total time:
                61126ms): ---"Objects listed" error:stream error when
                reading response body, may be caused by closed
                connection. Please retry. Original error: stream
                error: stream ID 1521; INTERNAL_ERROR; received from
                peer 61126ms (***)

Decreasing the page size (opts.Limit) does not have any effect on the
timeouts. However, setting ResourceVersion to an empty value seems to
get the paging back on track, eliminating the timeouts.

TODO: investigate in Kubernetes upstream the root cause of the timeouts
with ResourceVersion="0".
diff --git a/pkg/nfd-master/nfd-api-controller.go b/pkg/nfd-master/nfd-api-controller.go
@@ -29,6 +29,7 @@ import (
 	nfdclientset "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned"
 	nfdscheme "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned/scheme"
 	nfdinformers "sigs.k8s.io/node-feature-discovery/api/generated/informers/externalversions"
+	nfdinformersv1alpha1 "sigs.k8s.io/node-feature-discovery/api/generated/informers/externalversions/nfd/v1alpha1"
 	nfdlisters "sigs.k8s.io/node-feature-discovery/api/generated/listers/nfd/v1alpha1"
 	nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/api/nfd/v1alpha1"
 	"sigs.k8s.io/node-feature-discovery/pkg/utils"
@@ -67,13 +68,23 @@ func newNfdController(config *restclient.Config, nfdApiControllerOptions nfdApiC
 	}
 
 	nfdClient := nfdclientset.NewForConfigOrDie(config)
+
 	klog.V(2).InfoS("initializing new NFD API controller", "options", utils.DelayedDumper(nfdApiControllerOptions))
 
 	informerFactory := nfdinformers.NewSharedInformerFactory(nfdClient, nfdApiControllerOptions.ResyncPeriod)
 
 	// Add informer for NodeFeature objects
 	if !nfdApiControllerOptions.DisableNodeFeature {
-		featureInformer := informerFactory.Nfd().V1alpha1().NodeFeatures()
+		tweakListOpts := func(opts *metav1.ListOptions) {
+			// Tweak list opts on initial sync to avoid timeouts on the apiserver.
+			// NodeFeature objects are huge and the Kubernetes apiserver
+			// (v1.30) experiences http handler timeouts when the resource
+			// version is set to some non-empty value (TODO: find out why).
+			if opts.ResourceVersion == "0" {
+				opts.ResourceVersion = ""
+			}
+		}
+		featureInformer := nfdinformersv1alpha1.New(informerFactory, "", tweakListOpts).NodeFeatures()
 		if _, err := featureInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 			AddFunc: func(obj interface{}) {
 				nfr := obj.(*nfdv1alpha1.NodeFeature)