Skip to content

Commit 5402115

Browse files
authored
Merge pull request #2000 from ivelichkovich/master
add configurable pagination to nfd-master
2 parents 5b20c77 + e37c949 commit 5402115

File tree

8 files changed

+60
-1
lines changed

8 files changed

+60
-1
lines changed

cmd/nfd-master/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
140140
flagset.Var(overrides.ResyncPeriod, "resync-period", "Specify the NFD API controller resync period.")
141141
overrides.NfdApiParallelism = flagset.Int("nfd-api-parallelism", 10, "Defines the maximum number of goroutines responsible of updating nodes. "+
142142
"Can be used for the throttling mechanism.")
143+
overrides.InformerPageSize = flagset.Int64("informer-page-size", 200,
144+
"The list size to use when listing NodeFeature objects to sync informer cache.")
143145

144146
return args, overrides
145147
}

deployment/components/master-config/nfd-master.conf.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# extraLabelNs: ["added.ns.io","added.kubernets.io"]
33
# denyLabelNs: ["denied.ns.io","denied.kubernetes.io"]
44
# enableTaints: false
5+
# informerPageSize: 200
56
# labelWhiteList: "foo"
67
# resyncPeriod: "2h"
78
# restrictions:

deployment/helm/node-feature-discovery/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ master:
2828
# extraLabelNs: ["added.ns.io","added.kubernets.io"]
2929
# denyLabelNs: ["denied.ns.io","denied.kubernetes.io"]
3030
# enableTaints: false
31+
# informerPageSize: 200
3132
# labelWhiteList: "foo"
3233
# resyncPeriod: "2h"
3334
# restrictions:

docs/reference/master-commandline-reference.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,21 @@ Example:
173173
nfd-master -deny-label-ns=*.vendor.com,vendor-2.io
174174
```
175175

176+
### -informer-page-size
177+
178+
The `-informer-page-size` flag is used to control pagination
179+
during informer cache sync on nfd-master startup.
180+
This is useful to control load on api-server/etcd as listing
181+
NodeFeature objects can be expensive, especially in large clusters.
182+
183+
Default: 200
184+
185+
Example:
186+
187+
```bash
188+
nfd-master -informer-page-size=20
189+
```
190+
176191
### -config
177192

178193
The `-config` flag specifies the path of the nfd-master configuration file to

docs/reference/master-configuration-reference.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,21 @@ Example:
180180
nfdApiParallelism: 1
181181
```
182182

183+
## informerPageSize
184+
185+
The `informerPageSize` option is used to control pagination
186+
during informer cache sync on nfd-master startup.
187+
This is useful to control load on api-server/etcd as listing
188+
NodeFeature objects can be expensive, especially in large clusters.
189+
190+
Default: 200
191+
192+
Example:
193+
194+
```yaml
195+
informerPageSize: 50
196+
```
197+
183198
## klog
184199

185200
The following options specify the logger configuration. Most of which can be

docs/usage/nfd-master.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,3 +84,11 @@ If you have RBAC authorization enabled (as is the default e.g. with clusters
8484
initialized with kubeadm) you need to configure the appropriate ClusterRoles,
8585
ClusterRoleBindings and a ServiceAccount for NFD to create node
8686
labels. The provided template will configure these for you.
87+
88+
## Informer List Pagination
89+
90+
When NFD Master starts up it starts an informer on the nodefeatures resources.
91+
These resources can be large and in a large cluster this initial list call
92+
to sync the informer cache can be expensive and heavy on api-server/etcd.
93+
You can use the `-informer-page-size` command line flag (or the `informerPageSize` configuration option) of NFD master to
94+
control pagination size to help control the load during NFD-Master restart.

pkg/nfd-master/nfd-api-controller.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ type nfdApiControllerOptions struct {
5656
ResyncPeriod time.Duration
5757
K8sClient k8sclient.Interface
5858
NodeFeatureNamespaceSelector *metav1.LabelSelector
59+
ListSize int64
5960
}
6061

6162
func init() {
@@ -95,11 +96,19 @@ func newNfdController(config *restclient.Config, nfdApiControllerOptions nfdApiC
9596
// Tweak list opts on initial sync to avoid timeouts on the apiserver.
9697
// NodeFeature objects are huge and the Kubernetes apiserver
9798
// (v1.30) experiences http handler timeouts when the resource
98-
// version is set to some non-empty value (TODO: find out why).
99+
// version is set to some non-empty value
100+
// https://github.com/kubernetes/kubernetes/blob/ace55542575fb098b3e413692bbe2bc20d2348ba/staging/src/k8s.io/apiserver/pkg/storage/cacher/cacher.go#L600-L616 if you set resource version to 0
101+
// it serves the request from the apiserver's cache and doesn't use pagination; otherwise pagination will default to 500,
102+
// so that's why this is required on large clusters
103+
// So by clearing the resource version we make the request go to etcd instead of the api-server cache. There's some WIP in k/k
104+
// that seems to imply they're working on improving this behavior where you'll be able to paginate from apiserver cache
105+
// It's not supported yet (as of 2/2025); it would be good to track this in kubernetes/kubernetes#108003.
99106
if opts.ResourceVersion == "0" {
100107
opts.ResourceVersion = ""
101108
}
109+
opts.Limit = nfdApiControllerOptions.ListSize // value of 0 disables pagination
102110
}
111+
103112
featureInformer := nfdinformersv1alpha1.New(informerFactory, "", tweakListOpts).NodeFeatures()
104113
if _, err := featureInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
105114
AddFunc: func(obj interface{}) {

pkg/nfd-master/nfd-master.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ type NFDConfig struct {
9191
NfdApiParallelism int
9292
Klog klogutils.KlogConfigOpts
9393
Restrictions Restrictions
94+
InformerPageSize int64
9495
}
9596

9697
// LeaderElectionConfig contains the configuration for leader election
@@ -109,6 +110,7 @@ type ConfigOverrideArgs struct {
109110
NoPublish *bool
110111
ResyncPeriod *utils.DurationVal
111112
NfdApiParallelism *int
113+
InformerPageSize *int64
112114
}
113115

114116
// Args holds command line arguments
@@ -121,6 +123,7 @@ type Args struct {
121123
Prune bool
122124
Options string
123125
EnableLeaderElection bool
126+
MetricsPort int
124127

125128
Overrides ConfigOverrideArgs
126129
}
@@ -243,6 +246,7 @@ func newDefaultConfig() *NFDConfig {
243246
NfdApiParallelism: 10,
244247
EnableTaints: false,
245248
ResyncPeriod: utils.DurationVal{Duration: time.Duration(1) * time.Hour},
249+
InformerPageSize: 200,
246250
LeaderElection: LeaderElectionConfig{
247251
LeaseDuration: utils.DurationVal{Duration: time.Duration(15) * time.Second},
248252
RetryPeriod: utils.DurationVal{Duration: time.Duration(2) * time.Second},
@@ -1187,6 +1191,9 @@ func (m *nfdMaster) configure(filepath string, overrides string) error {
11871191
if m.args.Overrides.NfdApiParallelism != nil {
11881192
c.NfdApiParallelism = *m.args.Overrides.NfdApiParallelism
11891193
}
1194+
if m.args.Overrides.InformerPageSize != nil {
1195+
c.InformerPageSize = *m.args.Overrides.InformerPageSize
1196+
}
11901197

11911198
if c.NfdApiParallelism <= 0 {
11921199
return fmt.Errorf("the maximum number of concurrent labelers should be a non-zero positive number")
@@ -1293,6 +1300,7 @@ func (m *nfdMaster) startNfdApiController() error {
12931300
K8sClient: m.k8sClient,
12941301
NodeFeatureNamespaceSelector: m.config.Restrictions.NodeFeatureNamespaceSelector,
12951302
DisableNodeFeatureGroup: !nfdfeatures.NFDFeatureGate.Enabled(nfdfeatures.NodeFeatureGroupAPI),
1303+
ListSize: m.config.InformerPageSize,
12961304
})
12971305
if err != nil {
12981306
return fmt.Errorf("failed to initialize CRD controller: %w", err)

0 commit comments

Comments
 (0)