Skip to content

Commit fdac825

Browse files
committed
add talos support
Signed-off-by: hydazz <alexanderhyde@icloud.com>
2 parents 2762688 + 151c766 commit fdac825

File tree

9 files changed

+45
-49
lines changed

9 files changed

+45
-49
lines changed

.github/workflows/image.yaml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
steps:
3232
- uses: actions/checkout@v5
3333
name: Check out code
34-
34+
3535
- name: Install Go
3636
uses: actions/setup-go@v6
3737
with:
@@ -61,3 +61,9 @@ jobs:
6161
run: |
6262
echo "${VERSION}"
6363
make -f deployments/container/Makefile build
64+
# Push the image again, using a shorter tag (only the first 8 characters of the
65+
# commit hash). That's for consumption in downstream CI (for copying
66+
# the image from GHCR to elsewhere). See
67+
# https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/688
68+
export IMAGE_TAG="$(echo $GITHUB_SHA | cut -c1-8)"
69+
make -f deployments/container/Makefile build

.nvidia-ci.yml

Lines changed: 3 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,11 @@ variables:
3838
# Define the public staging registry
3939
STAGING_REGISTRY: ghcr.io/nvidia
4040
STAGING_VERSION: ${CI_COMMIT_SHORT_SHA}
41-
ARTIFACTORY_REPO_BASE: "https://urm.nvidia.com/artifactory/sw-gpu-cloudnative"
4241
KITMAKER_RELEASE_FOLDER: "kitmaker"
4342
PACKAGE_ARCHIVE_RELEASE_FOLDER: "releases"
4443

4544
stages:
4645
- pull
47-
- scan
4846
- release
4947
- ngc-publish
5048

@@ -69,7 +67,7 @@ workflow:
6967
# Download the regctl binary for use in the release steps
7068
.regctl-setup:
7169
before_script:
72-
- export REGCTL_VERSION=v0.4.5
70+
- export REGCTL_VERSION=v0.4.8
7371
- apk add --no-cache curl
7472
- mkdir -p bin
7573
- curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64
@@ -146,48 +144,11 @@ pull-images:
146144
script:
147145
- echo "Skipped in internal CI"
148146

149-
# The .scan step forms the base of the image scan operation performed before releasing
150-
# images.
151-
scan-images:
152-
stage: scan
153-
needs:
154-
- pull-images
155-
image: "${PULSE_IMAGE}"
156-
parallel:
157-
matrix:
158-
PLATFORM: ["linux/amd64", "linux/arm64"]
159-
variables:
160-
IMAGE: "${CI_REGISTRY_IMAGE}/k8s-dra-driver-gpu:${CI_COMMIT_SHORT_SHA}"
161-
IMAGE_ARCHIVE: "k8s-dra-driver-gpu-${CI_JOB_ID}.tar"
162-
allow_failure: true
163-
script:
164-
- |
165-
docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
166-
echo "Scanning image ${IMAGE} for ${PLATFORM}"
167-
docker pull --platform="${PLATFORM}" "${IMAGE}"
168-
docker save "${IMAGE}" -o "${IMAGE_ARCHIVE}"
169-
AuthHeader=$(echo -n $SSA_CLIENT_ID:$SSA_CLIENT_SECRET | base64 -w0)
170-
export SSA_TOKEN=$(curl --request POST --header "Authorization: Basic $AuthHeader" --header "Content-Type: application/x-www-form-urlencoded" ${SSA_ISSUER_URL} | jq ".access_token" | tr -d '"')
171-
if [ -z "$SSA_TOKEN" ]; then exit 1; else echo "SSA_TOKEN set!"; fi
172-
173-
pulse-cli -n $NSPECT_ID --ssa $SSA_TOKEN scan -i $IMAGE_ARCHIVE -p $CONTAINER_POLICY -o
174-
rm -f "${IMAGE_ARCHIVE}"
175-
artifacts:
176-
when: always
177-
expire_in: 1 week
178-
paths:
179-
- pulse-cli.log
180-
- licenses.json
181-
- sbom.json
182-
- vulns.json
183-
- policy_evaluation.json
184147

185148
push-images-to-staging:
186149
extends:
187150
- .copy-images
188151
stage: release
189-
needs:
190-
- scan-images
191152
variables:
192153
IN_REGISTRY: "${CI_REGISTRY}"
193154
IN_REGISTRY_USER: "${CI_REGISTRY_USER}"
@@ -204,7 +165,6 @@ push-images-to-staging:
204165
.publish-images:
205166
stage: ngc-publish
206167
needs:
207-
- scan-images
208168
- push-images-to-staging
209169
image:
210170
name: "${CNT_NGC_PUBLISH_IMAGE}"
@@ -254,15 +214,15 @@ push-images-to-staging:
254214
- "${PROJECT_NAME}.yaml"
255215

256216

257-
publish-images-to-ngc:
217+
create-ngc-publish-mr:
258218
extends:
259219
- .publish-images
260220
rules:
261221
- if: $CI_COMMIT_TAG
262222

263223
# We create a dummy MR that exercises the publishing logic.
264224
# TODO: This MR should be closed automatically.
265-
publish-images-dummy:
225+
create-ngc-publish-mr-dummy:
266226
extends:
267227
- .publish-images
268228
variables:

cmd/compute-domain-kubelet-plugin/cdi.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@ const (
4646
defaultCDIRoot = "/var/run/cdi"
4747
)
4848

49+
// getTalosLibrarySearchPaths returns the fixed list of extra library
// directories, all located under /driver-root/usr/local/glibc, that
// NewCDIHandler hands to nvcdi.WithLibrarySearchPaths when it builds the CDI
// libraries for devices and for claims.
//
// NOTE(review): the "/driver-root" prefix is hard-coded here rather than
// derived from a configured driver root, and the visible call sites pass these
// paths unconditionally (not only on Talos hosts) — confirm both are intended.
func getTalosLibrarySearchPaths() []string {
50+
return []string{
51+
"/driver-root/usr/local/glibc/usr/lib",
52+
"/driver-root/usr/local/glibc/lib",
53+
"/driver-root/usr/local/glibc/lib64",
54+
}
55+
}
56+
4957
type CDIHandler struct {
5058
logger *logrus.Logger
5159
nvml nvml.Interface
@@ -103,6 +111,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) {
103111
nvcdi.WithVendor(h.vendor),
104112
nvcdi.WithClass(h.deviceClass),
105113
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath),
114+
nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()),
106115
)
107116
if err != nil {
108117
return nil, fmt.Errorf("unable to create CDI library for devices: %w", err)
@@ -120,6 +129,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) {
120129
nvcdi.WithVendor(h.vendor),
121130
nvcdi.WithClass(h.claimClass),
122131
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath),
132+
nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()),
123133
)
124134
if err != nil {
125135
return nil, fmt.Errorf("unable to create CDI library for claims: %w", err)

cmd/compute-domain-kubelet-plugin/root.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ func (r root) getDriverLibraryPath() (string, error) {
3434
"/lib64",
3535
"/lib/x86_64-linux-gnu",
3636
"/lib/aarch64-linux-gnu",
37+
"/usr/local/glibc/usr/lib",
3738
}
3839

3940
libraryPath, err := r.findFile("libnvidia-ml.so.1", librarySearchPaths...)
@@ -51,6 +52,7 @@ func (r root) getNvidiaSMIPath() (string, error) {
5152
"/usr/sbin",
5253
"/bin",
5354
"/sbin",
55+
"/usr/local/bin",
5456
}
5557

5658
binaryPath, err := r.findFile("nvidia-smi", binarySearchPaths...)

cmd/gpu-kubelet-plugin/cdi.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@ const (
4646
defaultCDIRoot = "/var/run/cdi"
4747
)
4848

49+
// getTalosLibrarySearchPaths lists the additional directories (usr/lib, lib
// and lib64 beneath /driver-root/usr/local/glibc) that are supplied to
// nvcdi.WithLibrarySearchPaths by NewCDIHandler for both the device and the
// claim CDI libraries.
//
// NOTE(review): the paths embed the "/driver-root" container mount point as a
// literal; presumably this must stay in sync with the driver-root volume mount
// in the kubelet plugin deployment — verify before changing either.
func getTalosLibrarySearchPaths() []string {
50+
return []string{
51+
"/driver-root/usr/local/glibc/usr/lib",
52+
"/driver-root/usr/local/glibc/lib",
53+
"/driver-root/usr/local/glibc/lib64",
54+
}
55+
}
56+
4957
type CDIHandler struct {
5058
logger *logrus.Logger
5159
nvml nvml.Interface
@@ -103,6 +111,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) {
103111
nvcdi.WithVendor(h.vendor),
104112
nvcdi.WithClass(h.deviceClass),
105113
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath),
114+
nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()),
106115
)
107116
if err != nil {
108117
return nil, fmt.Errorf("unable to create CDI library for devices: %w", err)
@@ -120,6 +129,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) {
120129
nvcdi.WithVendor(h.vendor),
121130
nvcdi.WithClass(h.claimClass),
122131
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath),
132+
nvcdi.WithLibrarySearchPaths(getTalosLibrarySearchPaths()),
123133
)
124134
if err != nil {
125135
return nil, fmt.Errorf("unable to create CDI library for claims: %w", err)

cmd/gpu-kubelet-plugin/root.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ func (r root) getDriverLibraryPath() (string, error) {
3434
"/lib64",
3535
"/lib/x86_64-linux-gnu",
3636
"/lib/aarch64-linux-gnu",
37+
"/usr/local/glibc/usr/lib",
3738
}
3839

3940
libraryPath, err := r.findFile("libnvidia-ml.so.1", librarySearchPaths...)
@@ -51,6 +52,7 @@ func (r root) getNvidiaSMIPath() (string, error) {
5152
"/usr/sbin",
5253
"/bin",
5354
"/sbin",
55+
"/usr/local/bin",
5456
}
5557

5658
binaryPath, err := r.findFile("nvidia-smi", binarySearchPaths...)

deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ spec:
129129
- name: NVIDIA_VISIBLE_DEVICES
130130
value: void
131131
- name: CDI_ROOT
132-
value: /var/run/cdi
132+
value: {{ .Values.cdiRoot | quote }}
133133
- name: NVIDIA_MIG_CONFIG_DEVICES
134134
value: all
135135
- name: NODE_NAME
@@ -166,7 +166,7 @@ spec:
166166
mountPath: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }}
167167
mountPropagation: Bidirectional
168168
- name: cdi
169-
mountPath: /var/run/cdi
169+
mountPath: {{ .Values.cdiRoot | quote }}
170170
- name: driver-root
171171
mountPath: /driver-root
172172
readOnly: true
@@ -220,7 +220,7 @@ spec:
220220
- name: NVIDIA_VISIBLE_DEVICES
221221
value: void
222222
- name: CDI_ROOT
223-
value: /var/run/cdi
223+
value: {{ .Values.cdiRoot | quote }}
224224
- name: NVIDIA_MIG_CONFIG_DEVICES
225225
value: all
226226
- name: NODE_NAME
@@ -259,7 +259,7 @@ spec:
259259
mountPath: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }}
260260
mountPropagation: Bidirectional
261261
- name: cdi
262-
mountPath: /var/run/cdi
262+
mountPath: {{ .Values.cdiRoot | quote }}
263263
- name: driver-root
264264
mountPath: /driver-root
265265
readOnly: true
@@ -274,7 +274,7 @@ spec:
274274
path: {{ .Values.kubeletPlugin.kubeletPluginsDirectoryPath | quote }}
275275
- name: cdi
276276
hostPath:
277-
path: /var/run/cdi
277+
path: {{ .Values.cdiRoot | quote }}
278278
- name: driver-root-parent
279279
hostPath:
280280
# If nvidiaDriverRoot == "/" then its parent is itself. Otherwise, get

deployments/helm/nvidia-dra-driver-gpu/values.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ nvidiaDriverRoot: /
2626
# If not specified, the default path inferred from the nvidia-container-toolkit library version will be used.
2727
nvidiaCDIHookPath: ""
2828

29+
# CDI root directory path.
30+
# This is where CDI spec files are stored and accessed by the runtime.
31+
cdiRoot: "/var/run/cdi"
32+
2933
nameOverride: ""
3034
fullnameOverride: ""
3135
namespaceOverride: ""

hack/kubelet-plugin-prestart.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ validate_and_exit_on_success () {
4646
/driver-root/bin \
4747
/driver-root/sbin \
4848
/driver-root/usr/bin \
49+
/driver-root/usr/local/bin \
4950
/driver-root/sbin \
5051
-maxdepth 1 -type f -name "nvidia-smi" 2> /dev/null | head -n1
5152
)
@@ -59,6 +60,7 @@ validate_and_exit_on_success () {
5960
/driver-root/usr/lib64 \
6061
/driver-root/usr/lib/x86_64-linux-gnu \
6162
/driver-root/usr/lib/aarch64-linux-gnu \
63+
/driver-root/usr/local/glibc/usr/lib \
6264
/driver-root/lib64 \
6365
/driver-root/lib/x86_64-linux-gnu \
6466
/driver-root/lib/aarch64-linux-gnu \

0 commit comments

Comments
 (0)