Skip to content

Commit 64105d2

Browse files
committed
Add validating admission webhook to verify opaque configs
This is a straight copy of this PR kubernetes-sigs/dra-example-driver#75, with minimal changes to make it work in this repo. Signed-off-by: Kevin Klues <kklues@nvidia.com>
1 parent 8eb6fdd commit 64105d2

23 files changed

+3250
-20
lines changed

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,22 @@ As of today, the recommended installation method is via Helm.
4646
Detailed instructions can (for now) be found [here](https://github.com/NVIDIA/k8s-dra-driver-gpu/discussions/249).
4747
In the future, this driver will be included in the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) and will no longer need to be installed separately.
4848

49+
### Validating Admission Webhook
50+
51+
The validating admission webhook is disabled by default. To enable it, install cert-manager and its CRDs, then set `webhook.enabled=true` when installing the nvidia-dra-driver-gpu chart.
52+
53+
```bash
54+
helm install \
55+
--repo https://charts.jetstack.io \
56+
--version v1.16.3 \
57+
--create-namespace \
58+
--namespace cert-manager \
59+
--wait \
60+
--set crds.enabled=true \
61+
cert-manager \
62+
cert-manager
63+
```
64+
4965
## A (kind) demo
5066

5167
Below, we demonstrate a basic use case: sharing a single GPU across two containers running in the same Kubernetes pod.

cmd/webhook/main.go

Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
Copyright 2025 NVIDIA Corporation.
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
*/
17+
18+
package main
19+
20+
import (
21+
"encoding/json"
22+
"fmt"
23+
"io"
24+
"net/http"
25+
"os"
26+
"strings"
27+
28+
"github.com/urfave/cli/v2"
29+
30+
admissionv1 "k8s.io/api/admission/v1"
31+
resourceapi "k8s.io/api/resource/v1beta1"
32+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
33+
"k8s.io/apimachinery/pkg/runtime"
34+
"k8s.io/apimachinery/pkg/runtime/serializer"
35+
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
36+
"k8s.io/klog/v2"
37+
38+
configapi "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
39+
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
40+
)
41+
42+
// DriverName is the DRA driver name this webhook validates opaque device
// configurations for; configs addressed to any other driver are skipped.
const DriverName = "gpu.nvidia.com"
45+
46+
var (
47+
resourceClaimResource = metav1.GroupVersionResource{
48+
Group: resourceapi.SchemeGroupVersion.Group,
49+
Version: resourceapi.SchemeGroupVersion.Version,
50+
Resource: "resourceclaims",
51+
}
52+
resourceClaimTemplateResource = metav1.GroupVersionResource{
53+
Group: resourceapi.SchemeGroupVersion.Group,
54+
Version: resourceapi.SchemeGroupVersion.Version,
55+
Resource: "resourceclaimtemplates",
56+
}
57+
)
58+
59+
type Flags struct {
60+
loggingConfig *flags.LoggingConfig
61+
62+
certFile string
63+
keyFile string
64+
port int
65+
}
66+
67+
var scheme = runtime.NewScheme()
68+
var codecs = serializer.NewCodecFactory(scheme)
69+
70+
func init() {
71+
utilruntime.Must(admissionv1.AddToScheme(scheme))
72+
}
73+
74+
func main() {
75+
if err := newApp().Run(os.Args); err != nil {
76+
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
77+
os.Exit(1)
78+
}
79+
}
80+
81+
func newApp() *cli.App {
82+
flags := &Flags{
83+
loggingConfig: flags.NewLoggingConfig(),
84+
}
85+
cliFlags := []cli.Flag{
86+
&cli.StringFlag{
87+
Name: "tls-cert-file",
88+
Usage: "File containing the default x509 Certificate for HTTPS. (CA cert, if any, concatenated after server cert).",
89+
Destination: &flags.certFile,
90+
Required: true,
91+
},
92+
&cli.StringFlag{
93+
Name: "tls-private-key-file",
94+
Usage: "File containing the default x509 private key matching --tls-cert-file.",
95+
Destination: &flags.keyFile,
96+
Required: true,
97+
},
98+
&cli.IntFlag{
99+
Name: "port",
100+
Usage: "Secure port that the webhook listens on",
101+
Value: 443,
102+
Destination: &flags.port,
103+
},
104+
}
105+
cliFlags = append(cliFlags, flags.loggingConfig.Flags()...)
106+
107+
app := &cli.App{
108+
Name: "webhook",
109+
Usage: "webhook implements a validating admission webhook complementing a DRA driver plugin.",
110+
ArgsUsage: " ",
111+
HideHelpCommand: true,
112+
Flags: cliFlags,
113+
Before: func(c *cli.Context) error {
114+
if c.Args().Len() > 0 {
115+
return fmt.Errorf("arguments not supported: %v", c.Args().Slice())
116+
}
117+
return flags.loggingConfig.Apply()
118+
},
119+
Action: func(c *cli.Context) error {
120+
server := &http.Server{
121+
Handler: newMux(),
122+
Addr: fmt.Sprintf(":%d", flags.port),
123+
}
124+
klog.Info("starting webhook server on", server.Addr)
125+
return server.ListenAndServeTLS(flags.certFile, flags.keyFile)
126+
},
127+
}
128+
129+
return app
130+
}
131+
132+
func newMux() *http.ServeMux {
133+
mux := http.NewServeMux()
134+
mux.HandleFunc("/validate-resource-claim-parameters", serveResourceClaim)
135+
mux.HandleFunc("/readyz", func(w http.ResponseWriter, req *http.Request) {
136+
_, err := w.Write([]byte("ok"))
137+
if err != nil {
138+
http.Error(w, err.Error(), http.StatusInternalServerError)
139+
return
140+
}
141+
})
142+
return mux
143+
}
144+
145+
func serveResourceClaim(w http.ResponseWriter, r *http.Request) {
146+
serve(w, r, admitResourceClaimParameters)
147+
}
148+
149+
// serve handles the http portion of a request prior to handing to an admit
150+
// function.
151+
func serve(w http.ResponseWriter, r *http.Request, admit func(admissionv1.AdmissionReview) *admissionv1.AdmissionResponse) {
152+
var body []byte
153+
if r.Body != nil {
154+
data, err := io.ReadAll(r.Body)
155+
if err != nil {
156+
klog.Error(err)
157+
http.Error(w, err.Error(), http.StatusInternalServerError)
158+
return
159+
}
160+
body = data
161+
}
162+
163+
// verify the content type is accurate
164+
contentType := r.Header.Get("Content-Type")
165+
if contentType != "application/json" {
166+
msg := fmt.Sprintf("contentType=%s, expected application/json", contentType)
167+
klog.Error(msg)
168+
http.Error(w, msg, http.StatusUnsupportedMediaType)
169+
return
170+
}
171+
172+
klog.V(2).Infof("handling request: %s", body)
173+
174+
requestedAdmissionReview, err := readAdmissionReview(body)
175+
if err != nil {
176+
msg := fmt.Sprintf("failed to read AdmissionReview from request body: %v", err)
177+
klog.Error(msg)
178+
http.Error(w, msg, http.StatusBadRequest)
179+
return
180+
}
181+
responseAdmissionReview := &admissionv1.AdmissionReview{}
182+
responseAdmissionReview.SetGroupVersionKind(requestedAdmissionReview.GroupVersionKind())
183+
responseAdmissionReview.Response = admit(*requestedAdmissionReview)
184+
responseAdmissionReview.Response.UID = requestedAdmissionReview.Request.UID
185+
186+
klog.V(2).Infof("sending response: %v", responseAdmissionReview)
187+
respBytes, err := json.Marshal(responseAdmissionReview)
188+
if err != nil {
189+
klog.Error(err)
190+
http.Error(w, err.Error(), http.StatusInternalServerError)
191+
return
192+
}
193+
w.Header().Set("Content-Type", "application/json")
194+
if _, err := w.Write(respBytes); err != nil {
195+
klog.Error(err)
196+
}
197+
}
198+
199+
func readAdmissionReview(data []byte) (*admissionv1.AdmissionReview, error) {
200+
deserializer := codecs.UniversalDeserializer()
201+
obj, gvk, err := deserializer.Decode(data, nil, nil)
202+
if err != nil {
203+
return nil, fmt.Errorf("request could not be decoded: %w", err)
204+
}
205+
206+
if *gvk != admissionv1.SchemeGroupVersion.WithKind("AdmissionReview") {
207+
return nil, fmt.Errorf("unsupported group version kind: %v", gvk)
208+
}
209+
210+
requestedAdmissionReview, ok := obj.(*admissionv1.AdmissionReview)
211+
if !ok {
212+
return nil, fmt.Errorf("expected v1.AdmissionReview but got: %T", obj)
213+
}
214+
215+
return requestedAdmissionReview, nil
216+
}
217+
218+
// admitResourceClaimParameters accepts both ResourceClaims and ResourceClaimTemplates and validates their
219+
// opaque device configuration parameters for this driver.
220+
func admitResourceClaimParameters(ar admissionv1.AdmissionReview) *admissionv1.AdmissionResponse {
221+
klog.V(2).Info("admitting resource claim parameters")
222+
223+
var deviceConfigs []resourceapi.DeviceClaimConfiguration
224+
var specPath string
225+
226+
raw := ar.Request.Object.Raw
227+
deserializer := codecs.UniversalDeserializer()
228+
229+
switch ar.Request.Resource {
230+
case resourceClaimResource:
231+
claim := resourceapi.ResourceClaim{}
232+
if _, _, err := deserializer.Decode(raw, nil, &claim); err != nil {
233+
klog.Error(err)
234+
return &admissionv1.AdmissionResponse{
235+
Result: &metav1.Status{
236+
Message: err.Error(),
237+
Reason: metav1.StatusReasonBadRequest,
238+
},
239+
}
240+
}
241+
deviceConfigs = claim.Spec.Devices.Config
242+
specPath = "spec"
243+
case resourceClaimTemplateResource:
244+
claimTemplate := resourceapi.ResourceClaimTemplate{}
245+
if _, _, err := deserializer.Decode(raw, nil, &claimTemplate); err != nil {
246+
klog.Error(err)
247+
return &admissionv1.AdmissionResponse{
248+
Result: &metav1.Status{
249+
Message: err.Error(),
250+
Reason: metav1.StatusReasonBadRequest,
251+
},
252+
}
253+
}
254+
deviceConfigs = claimTemplate.Spec.Spec.Devices.Config
255+
specPath = "spec.spec"
256+
default:
257+
msg := fmt.Sprintf("expected resource to be %s or %s, got %s", resourceClaimResource, resourceClaimTemplateResource, ar.Request.Resource)
258+
klog.Error(msg)
259+
return &admissionv1.AdmissionResponse{
260+
Result: &metav1.Status{
261+
Message: msg,
262+
Reason: metav1.StatusReasonBadRequest,
263+
},
264+
}
265+
}
266+
267+
var errs []error
268+
for configIndex, config := range deviceConfigs {
269+
if config.Opaque == nil || config.Opaque.Driver != DriverName {
270+
continue
271+
}
272+
273+
fieldPath := fmt.Sprintf("%s.devices.config[%d].opaque.parameters", specPath, configIndex)
274+
decodedConfig, err := runtime.Decode(configapi.Decoder, config.Opaque.Parameters.Raw)
275+
if err != nil {
276+
errs = append(errs, fmt.Errorf("error decoding object at %s: %w", fieldPath, err))
277+
continue
278+
}
279+
gpuConfig, ok := decodedConfig.(*configapi.GpuConfig)
280+
if !ok {
281+
errs = append(errs, fmt.Errorf("expected v1beta1.GpuConfig at %s but got: %T", fieldPath, decodedConfig))
282+
continue
283+
}
284+
err = gpuConfig.Validate()
285+
if err != nil {
286+
errs = append(errs, fmt.Errorf("object at %s is invalid: %w", fieldPath, err))
287+
}
288+
}
289+
290+
if len(errs) > 0 {
291+
var errMsgs []string
292+
for _, err := range errs {
293+
errMsgs = append(errMsgs, err.Error())
294+
}
295+
msg := fmt.Sprintf("%d configs failed to validate: %s", len(errs), strings.Join(errMsgs, "; "))
296+
klog.Error(msg)
297+
return &admissionv1.AdmissionResponse{
298+
Result: &metav1.Status{
299+
Message: msg,
300+
Reason: metav1.StatusReason(metav1.StatusReasonInvalid),
301+
},
302+
}
303+
}
304+
305+
return &admissionv1.AdmissionResponse{
306+
Allowed: true,
307+
}
308+
}

0 commit comments

Comments
 (0)