Skip to content

Commit 33d8b7f

Browse files
authored
Add NVIDIA GPU jobs on 4.18 (openshift#56508)
Use 4.17 catalog sources until the NFD and NVIDIA GPU operators are available in the 4.18 catalog.
1 parent 1ac4b8f commit 33d8b7f

File tree

4 files changed

+419
-0
lines changed

4 files changed

+419
-0
lines changed
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
build_root:
2+
image_stream_tag:
3+
name: release
4+
namespace: openshift
5+
tag: golang-1.18
6+
images:
7+
- dockerfile_path: Containerfile
8+
to: nvidia-ci
9+
releases:
10+
latest:
11+
candidate:
12+
product: ocp
13+
stream: nightly
14+
version: "4.18"
15+
resources:
16+
'*':
17+
limits:
18+
memory: 2Gi
19+
requests:
20+
cpu: 2000m
21+
memory: 2Gi
22+
tests:
23+
- as: nvidia-gpu-operator-e2e-master
24+
cron: 30 00 * * 3
25+
steps:
26+
cluster_profile: aws-edge-infra
27+
env:
28+
BASE_DOMAIN: edge-sro.rhecoeng.com
29+
post:
30+
- chain: ipi-aws-post
31+
pre:
32+
- chain: ipi-conf-aws
33+
- ref: single-node-conf-aws
34+
- chain: ipi-install
35+
test:
36+
- as: gpu-operator-e2e
37+
commands: NVIDIAGPU_CLEANUP=false NVIDIAGPU_DEPLOY_FROM_BUNDLE=true TEST_FEATURES="nvidiagpu"
38+
NVIDIAGPU_GPU_MACHINESET_INSTANCE_TYPE="g4dn.xlarge" NVIDIAGPU_NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE="registry.redhat.io/redhat/redhat-operator-index:v4.17"
39+
make run-tests
40+
from: nvidia-ci
41+
resources:
42+
requests:
43+
cpu: 2000m
44+
memory: 2Gi
45+
- as: nvidia-gpu-operator-e2e-24-9-x
46+
cron: 30 02 * * 3
47+
steps:
48+
cluster_profile: aws-edge-infra
49+
env:
50+
BASE_DOMAIN: edge-sro.rhecoeng.com
51+
post:
52+
- chain: ipi-aws-post
53+
pre:
54+
- chain: ipi-conf-aws
55+
- ref: single-node-conf-aws
56+
- chain: ipi-install
57+
test:
58+
- as: gpu-operator-e2e
59+
commands: NVIDIAGPU_CLEANUP=false NVIDIAGPU_SUBSCRIPTION_CHANNEL="v24.9" TEST_FEATURES="nvidiagpu"
60+
NVIDIAGPU_GPU_MACHINESET_INSTANCE_TYPE="g4dn.xlarge" NVIDIAGPU_NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE="registry.redhat.io/redhat/redhat-operator-index:v4.17"
61+
NVIDIAGPU_GPU_FALLBACK_CATALOGSOURCE_INDEX_IMAGE="registry.redhat.io/redhat/certified-operator-index:v4.17"
62+
make run-tests
63+
from: nvidia-ci
64+
resources:
65+
requests:
66+
cpu: 2000m
67+
memory: 2Gi
68+
- as: nvidia-gpu-operator-e2e-24-6-x
69+
cron: 30 04 * * 3
70+
steps:
71+
cluster_profile: aws-edge-infra
72+
env:
73+
BASE_DOMAIN: edge-sro.rhecoeng.com
74+
post:
75+
- chain: ipi-aws-post
76+
pre:
77+
- chain: ipi-conf-aws
78+
- ref: single-node-conf-aws
79+
- chain: ipi-install
80+
test:
81+
- as: gpu-operator-e2e
82+
commands: NVIDIAGPU_CLEANUP=false NVIDIAGPU_SUBSCRIPTION_CHANNEL="v24.6" TEST_FEATURES="nvidiagpu"
83+
NVIDIAGPU_GPU_MACHINESET_INSTANCE_TYPE="g4dn.xlarge" NVIDIAGPU_NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE="registry.redhat.io/redhat/redhat-operator-index:v4.17"
84+
NVIDIAGPU_GPU_FALLBACK_CATALOGSOURCE_INDEX_IMAGE="registry.redhat.io/redhat/certified-operator-index:v4.17"
85+
make run-tests
86+
from: nvidia-ci
87+
resources:
88+
requests:
89+
cpu: 2000m
90+
memory: 2Gi
91+
zz_generated_metadata:
92+
branch: main
93+
org: rh-ecosystem-edge
94+
repo: nvidia-ci
95+
variant: "4.18"
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
This table should help us space out the builds and reduce
2+
the chances of hitting GPU instance availability issues.
3+
4+
| | | 24.6 | 24.9 | master |
5+
|------|-----------|-------|-------|--------|
6+
| 4.12 | Tuesday | 04:30 | 02:30 | 00:30 |
7+
| 4.14 | Friday | 04:30 | 02:30 | 00:30 |
8+
| 4.15 | Monday | 04:30 | 02:30 | 00:30 |
9+
| 4.16 | Mon-Fri | 06:00 | 04:00 | 02:00 |
10+
| 4.17 | Mon-Fri | 05:00 | 03:00 | 01:00 |
11+
| 4.18 | Wednesday | 04:30 | 02:30 | 00:30 |

ci-operator/jobs/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main-periodics.yaml

Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1274,3 +1274,258 @@ periodics:
12741274
- name: result-aggregator
12751275
secret:
12761276
secretName: result-aggregator
1277+
- agent: kubernetes
1278+
cluster: build05
1279+
cron: 30 04 * * 3
1280+
decorate: true
1281+
decoration_config:
1282+
skip_cloning: true
1283+
extra_refs:
1284+
- base_ref: main
1285+
org: rh-ecosystem-edge
1286+
repo: nvidia-ci
1287+
labels:
1288+
ci-operator.openshift.io/cloud: aws
1289+
ci-operator.openshift.io/cloud-cluster-profile: aws-edge-infra
1290+
ci-operator.openshift.io/variant: "4.18"
1291+
ci.openshift.io/generator: prowgen
1292+
job-release: "4.18"
1293+
pj-rehearse.openshift.io/can-be-rehearsed: "true"
1294+
name: periodic-ci-rh-ecosystem-edge-nvidia-ci-main-4.18-nvidia-gpu-operator-e2e-24-6-x
1295+
reporter_config:
1296+
slack:
1297+
channel: '#wg-edge-nvidia-ci'
1298+
job_states_to_report:
1299+
- failure
1300+
- error
1301+
report_template: '{{if eq .Status.State "success"}} :white_check_mark: Job *{{.Spec.Job}}*
1302+
ended with *{{.Status.State}}*. <{{.Status.URL}}|View logs> :white_check_mark:
1303+
{{else}} :warning: Job *{{.Spec.Job}}* ended with *{{.Status.State}}*. <{{.Status.URL}}|View
1304+
logs> :warning: {{end}}'
1305+
spec:
1306+
containers:
1307+
- args:
1308+
- --gcs-upload-secret=/secrets/gcs/service-account.json
1309+
- --image-import-pull-secret=/etc/pull-secret/.dockerconfigjson
1310+
- --lease-server-credentials-file=/etc/boskos/credentials
1311+
- --report-credentials-file=/etc/report/credentials
1312+
- --secret-dir=/secrets/ci-pull-credentials
1313+
- --target=nvidia-gpu-operator-e2e-24-6-x
1314+
- --variant=4.18
1315+
command:
1316+
- ci-operator
1317+
image: ci-operator:latest
1318+
imagePullPolicy: Always
1319+
name: ""
1320+
resources:
1321+
requests:
1322+
cpu: 10m
1323+
volumeMounts:
1324+
- mountPath: /etc/boskos
1325+
name: boskos
1326+
readOnly: true
1327+
- mountPath: /secrets/ci-pull-credentials
1328+
name: ci-pull-credentials
1329+
readOnly: true
1330+
- mountPath: /secrets/gcs
1331+
name: gcs-credentials
1332+
readOnly: true
1333+
- mountPath: /secrets/manifest-tool
1334+
name: manifest-tool-local-pusher
1335+
readOnly: true
1336+
- mountPath: /etc/pull-secret
1337+
name: pull-secret
1338+
readOnly: true
1339+
- mountPath: /etc/report
1340+
name: result-aggregator
1341+
readOnly: true
1342+
serviceAccountName: ci-operator
1343+
volumes:
1344+
- name: boskos
1345+
secret:
1346+
items:
1347+
- key: credentials
1348+
path: credentials
1349+
secretName: boskos-credentials
1350+
- name: ci-pull-credentials
1351+
secret:
1352+
secretName: ci-pull-credentials
1353+
- name: manifest-tool-local-pusher
1354+
secret:
1355+
secretName: manifest-tool-local-pusher
1356+
- name: pull-secret
1357+
secret:
1358+
secretName: registry-pull-credentials
1359+
- name: result-aggregator
1360+
secret:
1361+
secretName: result-aggregator
1362+
- agent: kubernetes
1363+
cluster: build05
1364+
cron: 30 02 * * 3
1365+
decorate: true
1366+
decoration_config:
1367+
skip_cloning: true
1368+
extra_refs:
1369+
- base_ref: main
1370+
org: rh-ecosystem-edge
1371+
repo: nvidia-ci
1372+
labels:
1373+
ci-operator.openshift.io/cloud: aws
1374+
ci-operator.openshift.io/cloud-cluster-profile: aws-edge-infra
1375+
ci-operator.openshift.io/variant: "4.18"
1376+
ci.openshift.io/generator: prowgen
1377+
job-release: "4.18"
1378+
pj-rehearse.openshift.io/can-be-rehearsed: "true"
1379+
name: periodic-ci-rh-ecosystem-edge-nvidia-ci-main-4.18-nvidia-gpu-operator-e2e-24-9-x
1380+
reporter_config:
1381+
slack:
1382+
channel: '#wg-edge-nvidia-ci'
1383+
job_states_to_report:
1384+
- failure
1385+
- error
1386+
report_template: '{{if eq .Status.State "success"}} :white_check_mark: Job *{{.Spec.Job}}*
1387+
ended with *{{.Status.State}}*. <{{.Status.URL}}|View logs> :white_check_mark:
1388+
{{else}} :warning: Job *{{.Spec.Job}}* ended with *{{.Status.State}}*. <{{.Status.URL}}|View
1389+
logs> :warning: {{end}}'
1390+
spec:
1391+
containers:
1392+
- args:
1393+
- --gcs-upload-secret=/secrets/gcs/service-account.json
1394+
- --image-import-pull-secret=/etc/pull-secret/.dockerconfigjson
1395+
- --lease-server-credentials-file=/etc/boskos/credentials
1396+
- --report-credentials-file=/etc/report/credentials
1397+
- --secret-dir=/secrets/ci-pull-credentials
1398+
- --target=nvidia-gpu-operator-e2e-24-9-x
1399+
- --variant=4.18
1400+
command:
1401+
- ci-operator
1402+
image: ci-operator:latest
1403+
imagePullPolicy: Always
1404+
name: ""
1405+
resources:
1406+
requests:
1407+
cpu: 10m
1408+
volumeMounts:
1409+
- mountPath: /etc/boskos
1410+
name: boskos
1411+
readOnly: true
1412+
- mountPath: /secrets/ci-pull-credentials
1413+
name: ci-pull-credentials
1414+
readOnly: true
1415+
- mountPath: /secrets/gcs
1416+
name: gcs-credentials
1417+
readOnly: true
1418+
- mountPath: /secrets/manifest-tool
1419+
name: manifest-tool-local-pusher
1420+
readOnly: true
1421+
- mountPath: /etc/pull-secret
1422+
name: pull-secret
1423+
readOnly: true
1424+
- mountPath: /etc/report
1425+
name: result-aggregator
1426+
readOnly: true
1427+
serviceAccountName: ci-operator
1428+
volumes:
1429+
- name: boskos
1430+
secret:
1431+
items:
1432+
- key: credentials
1433+
path: credentials
1434+
secretName: boskos-credentials
1435+
- name: ci-pull-credentials
1436+
secret:
1437+
secretName: ci-pull-credentials
1438+
- name: manifest-tool-local-pusher
1439+
secret:
1440+
secretName: manifest-tool-local-pusher
1441+
- name: pull-secret
1442+
secret:
1443+
secretName: registry-pull-credentials
1444+
- name: result-aggregator
1445+
secret:
1446+
secretName: result-aggregator
1447+
- agent: kubernetes
1448+
cluster: build05
1449+
cron: 30 00 * * 3
1450+
decorate: true
1451+
decoration_config:
1452+
skip_cloning: true
1453+
extra_refs:
1454+
- base_ref: main
1455+
org: rh-ecosystem-edge
1456+
repo: nvidia-ci
1457+
labels:
1458+
ci-operator.openshift.io/cloud: aws
1459+
ci-operator.openshift.io/cloud-cluster-profile: aws-edge-infra
1460+
ci-operator.openshift.io/variant: "4.18"
1461+
ci.openshift.io/generator: prowgen
1462+
job-release: "4.18"
1463+
pj-rehearse.openshift.io/can-be-rehearsed: "true"
1464+
name: periodic-ci-rh-ecosystem-edge-nvidia-ci-main-4.18-nvidia-gpu-operator-e2e-master
1465+
reporter_config:
1466+
slack:
1467+
channel: '#wg-edge-nvidia-ci'
1468+
job_states_to_report:
1469+
- failure
1470+
- error
1471+
report_template: '{{if eq .Status.State "success"}} :white_check_mark: Job *{{.Spec.Job}}*
1472+
ended with *{{.Status.State}}*. <{{.Status.URL}}|View logs> :white_check_mark:
1473+
{{else}} :warning: Job *{{.Spec.Job}}* ended with *{{.Status.State}}*. <{{.Status.URL}}|View
1474+
logs> :warning: {{end}}'
1475+
spec:
1476+
containers:
1477+
- args:
1478+
- --gcs-upload-secret=/secrets/gcs/service-account.json
1479+
- --image-import-pull-secret=/etc/pull-secret/.dockerconfigjson
1480+
- --lease-server-credentials-file=/etc/boskos/credentials
1481+
- --report-credentials-file=/etc/report/credentials
1482+
- --secret-dir=/secrets/ci-pull-credentials
1483+
- --target=nvidia-gpu-operator-e2e-master
1484+
- --variant=4.18
1485+
command:
1486+
- ci-operator
1487+
image: ci-operator:latest
1488+
imagePullPolicy: Always
1489+
name: ""
1490+
resources:
1491+
requests:
1492+
cpu: 10m
1493+
volumeMounts:
1494+
- mountPath: /etc/boskos
1495+
name: boskos
1496+
readOnly: true
1497+
- mountPath: /secrets/ci-pull-credentials
1498+
name: ci-pull-credentials
1499+
readOnly: true
1500+
- mountPath: /secrets/gcs
1501+
name: gcs-credentials
1502+
readOnly: true
1503+
- mountPath: /secrets/manifest-tool
1504+
name: manifest-tool-local-pusher
1505+
readOnly: true
1506+
- mountPath: /etc/pull-secret
1507+
name: pull-secret
1508+
readOnly: true
1509+
- mountPath: /etc/report
1510+
name: result-aggregator
1511+
readOnly: true
1512+
serviceAccountName: ci-operator
1513+
volumes:
1514+
- name: boskos
1515+
secret:
1516+
items:
1517+
- key: credentials
1518+
path: credentials
1519+
secretName: boskos-credentials
1520+
- name: ci-pull-credentials
1521+
secret:
1522+
secretName: ci-pull-credentials
1523+
- name: manifest-tool-local-pusher
1524+
secret:
1525+
secretName: manifest-tool-local-pusher
1526+
- name: pull-secret
1527+
secret:
1528+
secretName: registry-pull-credentials
1529+
- name: result-aggregator
1530+
secret:
1531+
secretName: result-aggregator

0 commit comments

Comments
 (0)