Skip to content

Commit 5f7022b

Browse files
authored
Merge pull request #994 from ykulazhenkov/pr-block-device-plugin-code
feat: block device plugin until SR-IOV config applied
2 parents eb1149b + f1e5508 commit 5f7022b

File tree

18 files changed

+1232
-168
lines changed

18 files changed

+1232
-168
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,10 @@ Feature gates are used to enable or disable specific features in the operator.
419419
- **Description:** Enables the firmware reset via `mstfwreset` before a system reboot. This feature is specific to Mellanox network devices and is used to ensure that the firmware is properly reset during system maintenance.
420420
- **Default:** Disabled
421421

422+
6. **Block Device Plugin Until Configured** (`blockDevicePluginUntilConfigured`)
423+
- **Description:** Prevents the SR-IOV device plugin from starting until the sriov-config-daemon has applied the SR-IOV configuration for the node. When enabled, the device plugin daemonset runs an init container that sets a wait-for-config annotation on its pod and waits until the sriov-config-daemon removes this annotation after applying the configuration. This addresses the race condition where the device plugin starts and reports available resources before the configuration is actually applied, which can lead to pods being scheduled prematurely.
424+
- **Default:** Enabled
425+
422426
### Enabling Feature Gates
423427

424428
To enable a feature gate, add it to your configuration file or command line with the desired state. For example, to enable the `resourceInjectorMatchCondition` feature gate, you would specify:

bindata/manifests/plugins/002-rbac.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,27 @@ subjects:
3333
- kind: ServiceAccount
3434
name: sriov-device-plugin
3535
namespace: {{.Namespace}}
36+
---
37+
# Namespaced Role granting read (get/list/watch) and write (update/patch)
# access to core-group pods in the templated namespace.
# NOTE(review): presumably required by the wait-for-config init container,
# which reads, annotates and watches its own pod - confirm that every verb
# listed here is actually needed.
apiVersion: rbac.authorization.k8s.io/v1
38+
kind: Role
39+
metadata:
40+
name: sriov-device-plugin-pod-access
41+
namespace: {{.Namespace}}
42+
rules:
43+
# The empty string selects the core API group, where pods live.
- apiGroups: [""]
44+
resources: ["pods"]
45+
verbs: ["get", "list", "watch", "update", "patch"]
46+
---
47+
# Binds the sriov-device-plugin-pod-access Role to the sriov-device-plugin
# ServiceAccount in the same templated namespace.
apiVersion: rbac.authorization.k8s.io/v1
48+
kind: RoleBinding
49+
metadata:
50+
name: sriov-device-plugin-pod-access
51+
namespace: {{.Namespace}}
52+
roleRef:
53+
apiGroup: rbac.authorization.k8s.io
54+
kind: Role
55+
name: sriov-device-plugin-pod-access
56+
subjects:
57+
# The subject that receives the pod permissions defined by the Role.
- kind: ServiceAccount
58+
name: sriov-device-plugin
59+
namespace: {{.Namespace}}

bindata/manifests/plugins/sriov-device-plugin.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,14 @@ spec:
2323
component: network
2424
type: infra
2525
openshift.io/component: network
26+
# NOTE: The controller uses equality.Semantic.DeepDerivative(in.Spec, ds.Spec)
27+
# to detect changes in the DaemonSet's spec and decide if an update is needed.
28+
# To ensure the init container is properly managed when toggling the
29+
# BlockDevicePluginUntilConfigured feature gate, we define an explicit field
30+
# (init-container-enabled) that is always present in the DaemonSet's pod template labels.
31+
# Its value reflects the state of the feature gate and guarantees spec changes
32+
# are propagated, ensuring the init container is added or removed as required.
33+
init-container-enabled: "{{ .BlockDevicePluginUntilConfigured }}"
2634
spec:
2735
hostNetwork: true
2836
nodeSelector:
@@ -39,6 +47,25 @@ spec:
3947
- name: {{ . }}
4048
{{- end }}
4149
{{- end }}
50+
{{- if .BlockDevicePluginUntilConfigured }}
51+
initContainers:
52+
- name: sriov-device-plugin-init
53+
image: {{.SRIOVNetworkConfigDaemonImage}}
54+
command:
55+
- sriov-network-config-daemon
56+
- wait-for-config
57+
- --pod-name=$(POD_NAME)
58+
- --pod-namespace=$(POD_NAMESPACE)
59+
env:
60+
- name: POD_NAME
61+
valueFrom:
62+
fieldRef:
63+
fieldPath: metadata.name
64+
- name: POD_NAMESPACE
65+
valueFrom:
66+
fieldRef:
67+
fieldPath: metadata.namespace
68+
{{- end }}
4269
containers:
4370
- name: sriov-device-plugin
4471
image: {{.SRIOVDevicePluginImage}}

cmd/sriov-network-config-daemon/start.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ func initFeatureGates(defaultConfig *sriovnetworkv1.SriovOperatorConfig) (featur
185185
featureGates := featuregate.New()
186186
featureGates.Init(defaultConfig.Spec.FeatureGates)
187187
fnLogger.Info("Enabled featureGates", "featureGates", featureGates.String())
188-
188+
vars.FeatureGate = featureGates
189189
return featureGates, nil
190190
}
191191

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
/*
2+
Copyright 2025.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
package main
17+
18+
import (
19+
"context"
20+
"fmt"
21+
"time"
22+
23+
"github.com/go-logr/logr"
24+
"github.com/spf13/cobra"
25+
corev1 "k8s.io/api/core/v1"
26+
"k8s.io/apimachinery/pkg/fields"
27+
"k8s.io/apimachinery/pkg/types"
28+
"k8s.io/apimachinery/pkg/util/wait"
29+
"k8s.io/client-go/rest"
30+
ctrl "sigs.k8s.io/controller-runtime"
31+
"sigs.k8s.io/controller-runtime/pkg/cache"
32+
"sigs.k8s.io/controller-runtime/pkg/client"
33+
"sigs.k8s.io/controller-runtime/pkg/log"
34+
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
35+
36+
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
37+
snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log"
38+
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils"
39+
)
40+
41+
var (
42+
waitForConfigCmd = &cobra.Command{
43+
Use: "wait-for-config",
44+
Short: "Wait for SR-IOV configuration to be applied",
45+
Long: "Init container command that sets annotation on pod and waits for " +
46+
"sriov-config-daemon to apply configuration and remove the annotation",
47+
RunE: runWaitForConfigCmd,
48+
}
49+
50+
waitForConfigOpts struct {
51+
podName string
52+
podNamespace string
53+
}
54+
)
55+
56+
func init() {
57+
rootCmd.AddCommand(waitForConfigCmd)
58+
waitForConfigCmd.PersistentFlags().StringVar(&waitForConfigOpts.podName, "pod-name", "",
59+
"kubernetes pod name of the device plugin")
60+
waitForConfigCmd.PersistentFlags().StringVar(&waitForConfigOpts.podNamespace, "pod-namespace", "",
61+
"kubernetes namespace where the device plugin pod is running")
62+
}
63+
64+
// WaitForConfigReconciler watches a single pod and stops the surrounding
// manager once the wait-for-config annotation is no longer present on it.
type WaitForConfigReconciler struct {
65+
// Client is the embedded Kubernetes client used by Reconcile to fetch the pod.
client.Client
66+
// Pod identifies the only pod this reconciler acts on; requests for any
// other object are ignored.
Pod types.NamespacedName
67+
// Cancel is invoked by Reconcile when the annotation has been removed.
Cancel context.CancelFunc
68+
}
69+
70+
func (r *WaitForConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
71+
logger := log.FromContext(ctx)
72+
73+
// This check is currently redundant since the cache is configured to watch only the target pod object.
74+
// However, it is intentionally included to document this assumption and to provide a safeguard in case
75+
// the cache configuration is modified in the future to watch additional pods.
76+
if r.Pod != req.NamespacedName {
77+
return ctrl.Result{}, nil
78+
}
79+
80+
pod := &corev1.Pod{}
81+
if err := r.Get(ctx, req.NamespacedName, pod); err != nil {
82+
logger.Error(err, "Failed to get pod")
83+
return ctrl.Result{}, client.IgnoreNotFound(err)
84+
}
85+
86+
if !utils.ObjectHasAnnotationKey(pod, consts.DevicePluginWaitConfigAnnotation) {
87+
logger.Info("Annotation removed, device plugin can proceed")
88+
r.Cancel()
89+
return ctrl.Result{}, nil
90+
}
91+
92+
logger.Info("Annotation still present, waiting...")
93+
return ctrl.Result{RequeueAfter: consts.DaemonRequeueTime}, nil
94+
}
95+
96+
func (r *WaitForConfigReconciler) SetupWithManager(mgr ctrl.Manager) error {
97+
return ctrl.NewControllerManagedBy(mgr).
98+
For(&corev1.Pod{}).
99+
Complete(r)
100+
}
101+
102+
func validateWaitForConfigOpts() error {
103+
if waitForConfigOpts.podName == "" {
104+
return fmt.Errorf("--pod-name is required")
105+
}
106+
if waitForConfigOpts.podNamespace == "" {
107+
return fmt.Errorf("--pod-namespace is required")
108+
}
109+
return nil
110+
}
111+
112+
func runWaitForConfigCmd(cmd *cobra.Command, args []string) error {
113+
snolog.InitLog()
114+
setupLog := log.Log.WithName("wait-for-config")
115+
116+
if err := validateWaitForConfigOpts(); err != nil {
117+
setupLog.Error(err, "invalid command line arguments")
118+
return err
119+
}
120+
121+
config, err := rest.InClusterConfig()
122+
if err != nil {
123+
setupLog.Error(err, "failed to get in-cluster config")
124+
return err
125+
}
126+
127+
return startWaitForConfigManager(setupLog, config, types.NamespacedName{Name: waitForConfigOpts.podName, Namespace: waitForConfigOpts.podNamespace})
128+
}
129+
130+
func startWaitForConfigManager(setupLog logr.Logger, config *rest.Config, podName types.NamespacedName) error {
131+
ctx, cancel := context.WithCancel(ctrl.SetupSignalHandler())
132+
defer cancel()
133+
134+
setupLog.Info("Starting wait-for-config", "pod", podName)
135+
136+
// Create a temporary client to set the annotation immediately
137+
tempClient, err := client.New(config, client.Options{})
138+
if err != nil {
139+
setupLog.Error(err, "failed to create kubernetes client")
140+
return err
141+
}
142+
143+
// Set annotation on pod to signal that we are waiting for config
144+
setupLog.Info("Setting annotation on pod", "annotation", consts.DevicePluginWaitConfigAnnotation)
145+
err = setAnnotationOnPod(ctx, setupLog, tempClient, podName)
146+
if err != nil {
147+
setupLog.Error(err, "failed to set annotation on pod")
148+
return err
149+
}
150+
setupLog.Info("Annotation set successfully, waiting for removal")
151+
152+
// Configure Manager
153+
// Watch only specific pod object
154+
selector := fields.SelectorFromSet(fields.Set{"metadata.name": podName.Name})
155+
mgr, err := ctrl.NewManager(config, ctrl.Options{
156+
Metrics: metricsserver.Options{BindAddress: "0"},
157+
Cache: cache.Options{
158+
DefaultNamespaces: map[string]cache.Config{podName.Namespace: {}},
159+
ByObject: map[client.Object]cache.ByObject{
160+
&corev1.Pod{}: {Field: selector},
161+
},
162+
},
163+
})
164+
if err != nil {
165+
setupLog.Error(err, "unable to start manager")
166+
return err
167+
}
168+
169+
if err = (&WaitForConfigReconciler{
170+
Client: mgr.GetClient(),
171+
Pod: podName,
172+
Cancel: cancel,
173+
}).SetupWithManager(mgr); err != nil {
174+
setupLog.Error(err, "unable to create controller")
175+
return err
176+
}
177+
178+
setupLog.Info("Starting manager")
179+
if err := mgr.Start(ctx); err != nil {
180+
setupLog.Error(err, "problem running manager")
181+
return err
182+
}
183+
return nil
184+
}
185+
186+
// setAnnotationOnPod sets the wait-for-config annotation on the pod
187+
func setAnnotationOnPod(ctx context.Context, logger logr.Logger, c client.Client, podName types.NamespacedName) error {
188+
return wait.ExponentialBackoff(wait.Backoff{
189+
Steps: 10,
190+
Duration: 1 * time.Second,
191+
Factor: 2.0,
192+
Jitter: 0.1,
193+
Cap: 30 * time.Second,
194+
}, func() (bool, error) {
195+
pod := &corev1.Pod{}
196+
if err := c.Get(ctx, podName, pod); err != nil {
197+
logger.Error(err, "failed to get pod, retrying")
198+
return false, nil
199+
}
200+
if err := utils.AnnotateObject(ctx, pod, consts.DevicePluginWaitConfigAnnotation, "true", c); err != nil {
201+
logger.Error(err, "failed to annotate pod, retrying")
202+
return false, nil
203+
}
204+
return true, nil
205+
})
206+
}

0 commit comments

Comments
 (0)