Skip to content

Commit 3d5d277

Browse files
committed
Cleanup k8sclients
Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent b1403d8 commit 3d5d277

File tree

2 files changed

+65
-78
lines changed

2 files changed

+65
-78
lines changed

cmd/nvidia-mig-manager/clients.go

Lines changed: 64 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -9,89 +9,78 @@ import (
99
"k8s.io/apimachinery/pkg/watch"
1010
)
1111

12-
type k8sClients []k8sClient
13-
12+
// A k8sClient represents a client that can be stoped or restarted.
1413
type k8sClient interface {
1514
Stop() error
1615
Restart() error
1716
}
1817

18+
// k8sClients represents a set of clients that can be stopped or restarted
19+
// together.
20+
type k8sClients []k8sClient
21+
22+
var _ k8sClient = (k8sClient)(nil)
23+
24+
// a withNoStopClient wraps the specified client so that the Stop function is a no-op.
1925
type withNoStopClient struct {
2026
k8sClient
2127
}
2228

29+
// a withNoRestartClient wraps the specified client so that the Restart function is a no-op.
2330
type withNoRestartClient struct {
2431
k8sClient
2532
}
2633

34+
// A pod represents a kubernetes pod with a specified app= label.
2735
type pod struct {
2836
app string
2937
// manager stores a reference to the mig manager managing these clients.
3038
manager *migManager
3139
}
3240

33-
// An operand is a GPU client that is controlled by a deploy label.
41+
// An operand is an operand of the GPU Operator that is represented by a pod
42+
// and constolled by a deploy label.
3443
type operand struct {
35-
pod
44+
*pod
3645
deployLabel string
3746
lastValue string
3847
}
3948

40-
func (m *migManager) getK8sClients(opts *reconfigureMIGOptions) k8sClients {
49+
// getK8sClients returns the k8sClients managed by the specified mig manager.
50+
// TODO: This should be configurable so that it can be used as-is from the vgpu-device-manager
51+
// where k8s-clients are not considered.
52+
func (m *migManager) getK8sClients() k8sClients {
4153
k8sGPUClients := k8sClients{
42-
&operand{
43-
pod: pod{
44-
app: "nvidia-device-plugin-daemonset",
45-
manager: m,
46-
},
47-
deployLabel: "nvidia.com/gpu.deploy.device-plugin",
48-
},
49-
&operand{
50-
pod: pod{
51-
app: "gpu-feature-discovery",
52-
manager: m,
53-
},
54-
deployLabel: "nvidia.com/gpu.deploy.gpu-feature-discovery",
55-
},
56-
&operand{
57-
pod: pod{
58-
app: "nvidia-dcgm-exporter",
59-
manager: m,
60-
},
61-
deployLabel: "nvidia.com/gpu.deploy.dcgm-exporter",
62-
},
63-
&operand{
64-
pod: pod{
65-
app: "nvidia-dcgm",
66-
manager: m,
67-
},
68-
deployLabel: "nvidia.com/gpu.deploy.dcgm",
69-
},
70-
&operand{
71-
// TODO: Why don't we wait for the following pd deletion.
72-
pod: pod{
73-
app: "",
74-
manager: m,
75-
},
76-
deployLabel: "nvidia.com/gpu.deploy.nvsm",
77-
},
78-
withNoRestart(&pod{
79-
app: "nvidia-cuda-validator",
80-
manager: m,
81-
}),
82-
withNoRestart(&pod{
83-
app: "nvidia-device-plugin-validator",
84-
manager: m,
85-
}),
86-
withNoStop(&pod{
87-
app: "nvidia-operator-validator",
88-
manager: m,
89-
}),
54+
m.newOperand("nvidia-device-plugin-daemonset", "nvidia.com/gpu.deploy.device-plugin"),
55+
m.newOperand("gpu-feature-discovery", "nvidia.com/gpu.deploy.gpu-feature-discovery"),
56+
m.newOperand("nvidia-dcgm-exporter", "nvidia.com/gpu.deploy.dcgm-exporter"),
57+
m.newOperand("nvidia-dcgm", "nvidia.com/gpu.deploy.dcgm"),
58+
// TODO: Why don't we wait for the following pod deletion.
59+
m.newOperand("", "nvidia.com/gpu.deploy.nvsm"),
60+
withNoRestart(m.newPod("nvidia-cuda-validator")),
61+
withNoRestart(m.newPod("nvidia-device-plugin-validator")),
62+
withNoStop(m.newPod("nvidia-operator-validator")),
9063
}
9164

9265
return k8sGPUClients
9366
}
9467

68+
func (m *migManager) newOperand(app string, deployLabel string) *operand {
69+
return &operand{
70+
pod: m.newPod(app),
71+
deployLabel: deployLabel,
72+
}
73+
}
74+
75+
func (m *migManager) newPod(app string) *pod {
76+
return &pod{
77+
manager: m,
78+
app: app,
79+
}
80+
}
81+
82+
// Restart restarts each of a set of k8s clients.
83+
// The first error encountered is returned and not further clients are restarted.
9584
func (o k8sClients) Restart() error {
9685
for _, c := range o {
9786
if c == nil {
@@ -104,6 +93,8 @@ func (o k8sClients) Restart() error {
10493
return nil
10594
}
10695

96+
// Stop stops each of a set of k8s clients.
97+
// The first error encountered is returned and not further clients are stopped.
10798
func (o k8sClients) Stop() error {
10899
for _, c := range o {
109100
if c == nil {
@@ -116,10 +107,12 @@ func (o k8sClients) Stop() error {
116107
return nil
117108
}
118109

110+
// withNoRestart wraps the specified client so that restarts are disabled.
119111
func withNoRestart(k k8sClient) k8sClient {
120112
return &withNoRestartClient{k}
121113
}
122114

115+
// withNoStop wraps the specified client so that stopss are disabled.
123116
func withNoStop(k k8sClient) k8sClient {
124117
return &withNoStopClient{k}
125118
}
@@ -132,26 +125,17 @@ func (o *withNoStopClient) Stop() error {
132125
return nil
133126
}
134127

128+
// Restart the specified pod.
135129
func (o *pod) Restart() error {
136-
// TODO: We need to add this namespace to the options.
137-
err := o.manager.clientset.CoreV1().Pods(o.manager.Namespace).DeleteCollection(
138-
context.TODO(),
139-
metav1.DeleteOptions{},
140-
metav1.ListOptions{
141-
FieldSelector: fmt.Sprintf("spec.nodeName=%s", o.manager.NodeName),
142-
LabelSelector: fmt.Sprintf("app=%s", o.app),
143-
},
144-
)
145-
if err != nil {
146-
return fmt.Errorf("unable to delete pods for app %s: %w", o.app, err)
147-
}
148-
return nil
130+
return o.delete()
149131
}
150132

133+
// Stop the specified pod.
151134
func (o *pod) Stop() error {
152-
if o.app == "" {
153-
return nil
154-
}
135+
return o.delete()
136+
}
137+
138+
func (o *pod) delete() error {
155139
err := o.manager.clientset.CoreV1().Pods(o.manager.Namespace).DeleteCollection(
156140
context.TODO(),
157141
metav1.DeleteOptions{},
@@ -167,9 +151,6 @@ func (o *pod) Stop() error {
167151
}
168152

169153
func (o *pod) waitForDeletion() error {
170-
if o.app == "" {
171-
return nil
172-
}
173154
timeout := 5 * time.Minute
174155
ctx, cancel := context.WithTimeout(context.Background(), timeout)
175156
defer cancel()
@@ -207,8 +188,11 @@ func (o *pod) waitForDeletion() error {
207188
}
208189
}
209190

191+
// Restart the specified operand by setting its deployLabel to 'true'
192+
// If the deploy label is already set to false, this is assumed to be controlled
193+
// by an external entity and no changes are made.
210194
func (o *operand) Restart() error {
211-
if o.deployLabel == "" || o.lastValue == "false" {
195+
if o.lastValue == "false" {
212196
return nil
213197
}
214198
err := o.manager.setNodeLabelValue(o.deployLabel, "true")
@@ -218,20 +202,23 @@ func (o *operand) Restart() error {
218202
return nil
219203
}
220204

205+
// Stop the specified operand by setting its deploy label to 'paused-for-mig-change'.
206+
// If the deploy label is already set to false, this is assumed to be controlled
207+
// by an external entity and no changes are made.
221208
func (o *operand) Stop() error {
222209
value, err := o.manager.getNodeLabelValue(o.deployLabel)
223210
if err != nil {
224211
return fmt.Errorf("unable to get the value of the %q label: %w", o.deployLabel, err)
225212
}
226213
o.lastValue = value
227-
// Only set 'paused-*' if the current value is not 'false'.
228-
// It should only be 'false' if some external entity has forced it to
229-
// this value, at which point we want to honor it's existing value and
230-
// not change it.
231214
if value != "false" {
232215
if err := o.manager.setNodeLabelValue(o.deployLabel, "paused-for-mig-change"); err != nil {
233216
return err
234217
}
235218
}
219+
// TODO: For the nvidia.com/gpu.deploy.nvsm label we have no associated app name.
220+
if o.app == "" {
221+
return nil
222+
}
236223
return o.waitForDeletion()
237224
}

cmd/nvidia-mig-manager/reconfigure_mig.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ func (m *migManager) reconfigureMIG(opts *reconfigureMIGOptions) error {
157157
return fmt.Errorf("unable to set the value of %q to %q: %w", opts.ConfigStateLabel, "pending", err)
158158
}
159159

160-
k8sClients := m.getK8sClients(opts)
160+
k8sClients := m.getK8sClients()
161161
if err := k8sClients.Stop(); err != nil {
162162
return fmt.Errorf("unable to shutdown k8s GPU clients: %w", err)
163163
}

0 commit comments

Comments
 (0)