@@ -9,89 +9,78 @@ import (
99 "k8s.io/apimachinery/pkg/watch"
1010)
1111
12- type k8sClients []k8sClient
13-
12+ // A k8sClient represents a client that can be stoped or restarted.
1413type k8sClient interface {
1514 Stop () error
1615 Restart () error
1716}
1817
18+ // k8sClients represents a set of clients that can be stopped or restarted
19+ // together.
20+ type k8sClients []k8sClient
21+
22+ var _ k8sClient = (k8sClient )(nil )
23+
24+ // a withNoStopClient wraps the specified client so that the Stop function is a no-op.
1925type withNoStopClient struct {
2026 k8sClient
2127}
2228
29+ // a withNoRestartClient wraps the specified client so that the Restart function is a no-op.
2330type withNoRestartClient struct {
2431 k8sClient
2532}
2633
34+ // A pod represents a kubernetes pod with a specified app= label.
2735type pod struct {
2836 app string
2937 // manager stores a reference to the mig manager managing these clients.
3038 manager * migManager
3139}
3240
33- // An operand is a GPU client that is controlled by a deploy label.
41+ // An operand is an operand of the GPU Operator that is represented by a pod
42+ // and constolled by a deploy label.
3443type operand struct {
35- pod
44+ * pod
3645 deployLabel string
3746 lastValue string
3847}
3948
40- func (m * migManager ) getK8sClients (opts * reconfigureMIGOptions ) k8sClients {
49+ // getK8sClients returns the k8sClients managed by the specified mig manager.
50+ // TODO: This should be configurable so that it can be used as-is from the vgpu-device-manager
51+ // where k8s-clients are not considered.
52+ func (m * migManager ) getK8sClients () k8sClients {
4153 k8sGPUClients := k8sClients {
42- & operand {
43- pod : pod {
44- app : "nvidia-device-plugin-daemonset" ,
45- manager : m ,
46- },
47- deployLabel : "nvidia.com/gpu.deploy.device-plugin" ,
48- },
49- & operand {
50- pod : pod {
51- app : "gpu-feature-discovery" ,
52- manager : m ,
53- },
54- deployLabel : "nvidia.com/gpu.deploy.gpu-feature-discovery" ,
55- },
56- & operand {
57- pod : pod {
58- app : "nvidia-dcgm-exporter" ,
59- manager : m ,
60- },
61- deployLabel : "nvidia.com/gpu.deploy.dcgm-exporter" ,
62- },
63- & operand {
64- pod : pod {
65- app : "nvidia-dcgm" ,
66- manager : m ,
67- },
68- deployLabel : "nvidia.com/gpu.deploy.dcgm" ,
69- },
70- & operand {
71- // TODO: Why don't we wait for the following pd deletion.
72- pod : pod {
73- app : "" ,
74- manager : m ,
75- },
76- deployLabel : "nvidia.com/gpu.deploy.nvsm" ,
77- },
78- withNoRestart (& pod {
79- app : "nvidia-cuda-validator" ,
80- manager : m ,
81- }),
82- withNoRestart (& pod {
83- app : "nvidia-device-plugin-validator" ,
84- manager : m ,
85- }),
86- withNoStop (& pod {
87- app : "nvidia-operator-validator" ,
88- manager : m ,
89- }),
54+ m .newOperand ("nvidia-device-plugin-daemonset" , "nvidia.com/gpu.deploy.device-plugin" ),
55+ m .newOperand ("gpu-feature-discovery" , "nvidia.com/gpu.deploy.gpu-feature-discovery" ),
56+ m .newOperand ("nvidia-dcgm-exporter" , "nvidia.com/gpu.deploy.dcgm-exporter" ),
57+ m .newOperand ("nvidia-dcgm" , "nvidia.com/gpu.deploy.dcgm" ),
58+ // TODO: Why don't we wait for the following pod deletion.
59+ m .newOperand ("" , "nvidia.com/gpu.deploy.nvsm" ),
60+ withNoRestart (m .newPod ("nvidia-cuda-validator" )),
61+ withNoRestart (m .newPod ("nvidia-device-plugin-validator" )),
62+ withNoStop (m .newPod ("nvidia-operator-validator" )),
9063 }
9164
9265 return k8sGPUClients
9366}
9467
68+ func (m * migManager ) newOperand (app string , deployLabel string ) * operand {
69+ return & operand {
70+ pod : m .newPod (app ),
71+ deployLabel : deployLabel ,
72+ }
73+ }
74+
75+ func (m * migManager ) newPod (app string ) * pod {
76+ return & pod {
77+ manager : m ,
78+ app : app ,
79+ }
80+ }
81+
82+ // Restart restarts each of a set of k8s clients.
83+ // The first error encountered is returned and not further clients are restarted.
9584func (o k8sClients ) Restart () error {
9685 for _ , c := range o {
9786 if c == nil {
@@ -104,6 +93,8 @@ func (o k8sClients) Restart() error {
10493 return nil
10594}
10695
96+ // Stop stops each of a set of k8s clients.
97+ // The first error encountered is returned and not further clients are stopped.
10798func (o k8sClients ) Stop () error {
10899 for _ , c := range o {
109100 if c == nil {
@@ -116,10 +107,12 @@ func (o k8sClients) Stop() error {
116107 return nil
117108}
118109
110+ // withNoRestart wraps the specified client so that restarts are disabled.
119111func withNoRestart (k k8sClient ) k8sClient {
120112 return & withNoRestartClient {k }
121113}
122114
115+ // withNoStop wraps the specified client so that stopss are disabled.
123116func withNoStop (k k8sClient ) k8sClient {
124117 return & withNoStopClient {k }
125118}
@@ -132,26 +125,17 @@ func (o *withNoStopClient) Stop() error {
132125 return nil
133126}
134127
128+ // Restart the specified pod.
135129func (o * pod ) Restart () error {
136- // TODO: We need to add this namespace to the options.
137- err := o .manager .clientset .CoreV1 ().Pods (o .manager .Namespace ).DeleteCollection (
138- context .TODO (),
139- metav1.DeleteOptions {},
140- metav1.ListOptions {
141- FieldSelector : fmt .Sprintf ("spec.nodeName=%s" , o .manager .NodeName ),
142- LabelSelector : fmt .Sprintf ("app=%s" , o .app ),
143- },
144- )
145- if err != nil {
146- return fmt .Errorf ("unable to delete pods for app %s: %w" , o .app , err )
147- }
148- return nil
130+ return o .delete ()
149131}
150132
133+ // Stop the specified pod.
151134func (o * pod ) Stop () error {
152- if o .app == "" {
153- return nil
154- }
135+ return o .delete ()
136+ }
137+
138+ func (o * pod ) delete () error {
155139 err := o .manager .clientset .CoreV1 ().Pods (o .manager .Namespace ).DeleteCollection (
156140 context .TODO (),
157141 metav1.DeleteOptions {},
@@ -167,9 +151,6 @@ func (o *pod) Stop() error {
167151}
168152
169153func (o * pod ) waitForDeletion () error {
170- if o .app == "" {
171- return nil
172- }
173154 timeout := 5 * time .Minute
174155 ctx , cancel := context .WithTimeout (context .Background (), timeout )
175156 defer cancel ()
@@ -207,8 +188,11 @@ func (o *pod) waitForDeletion() error {
207188 }
208189}
209190
191+ // Restart the specified operand by setting its deployLabel to 'true'
192+ // If the deploy label is already set to false, this is assumed to be controlled
193+ // by an external entity and no changes are made.
210194func (o * operand ) Restart () error {
211- if o .deployLabel == "" || o . lastValue == "false" {
195+ if o .lastValue == "false" {
212196 return nil
213197 }
214198 err := o .manager .setNodeLabelValue (o .deployLabel , "true" )
@@ -218,20 +202,23 @@ func (o *operand) Restart() error {
218202 return nil
219203}
220204
205+ // Stop the specified operand by setting its deploy label to 'paused-for-mig-change'.
206+ // If the deploy label is already set to false, this is assumed to be controlled
207+ // by an external entity and no changes are made.
221208func (o * operand ) Stop () error {
222209 value , err := o .manager .getNodeLabelValue (o .deployLabel )
223210 if err != nil {
224211 return fmt .Errorf ("unable to get the value of the %q label: %w" , o .deployLabel , err )
225212 }
226213 o .lastValue = value
227- // Only set 'paused-*' if the current value is not 'false'.
228- // It should only be 'false' if some external entity has forced it to
229- // this value, at which point we want to honor it's existing value and
230- // not change it.
231214 if value != "false" {
232215 if err := o .manager .setNodeLabelValue (o .deployLabel , "paused-for-mig-change" ); err != nil {
233216 return err
234217 }
235218 }
219+ // TODO: For the nvidia.com/gpu.deploy.nvsm label we have no associated app name.
220+ if o .app == "" {
221+ return nil
222+ }
236223 return o .waitForDeletion ()
237224}
0 commit comments