Skip to content

Commit 7d8797f

Browse files
authored
Merge branch 'main' into sms-deploy
2 parents a0fb65d + 6ec3a73 commit 7d8797f

File tree

6 files changed

+35
-26
lines changed

6 files changed

+35
-26
lines changed

.github/workflows/fatimage.yml

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,20 @@ jobs:
1919
name: openstack-imagebuild
2020
runs-on: ubuntu-22.04
2121
strategy:
22-
matrix:
22+
fail-fast: false # allow other matrix jobs to continue even if one fails
23+
matrix: # build RL8, RL9+OFED, RL9+CUDA versions
2324
os_version:
2425
- RL8
2526
- RL9
2627
build:
2728
- openstack.openhpc
2829
- openstack.openhpc-ofed
30+
- openstack.openhpc-cuda
2931
exclude:
3032
- os_version: RL8
3133
build: openstack.openhpc-ofed
34+
- os_version: RL8
35+
build: openstack.openhpc-cuda
3236
- os_version: RL9
3337
build: openstack.openhpc
3438
env:
@@ -94,7 +98,9 @@ jobs:
9498
- name: Download image
9599
run: |
96100
. venv/bin/activate
97-
openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}
101+
sudo mkdir /mnt/images
102+
sudo chmod 777 /mnt/images
103+
openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}
98104
99105
- name: Set up QEMU
100106
uses: docker/setup-qemu-action@v3
@@ -108,13 +114,13 @@ jobs:
108114
run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}'
109115

110116
- name: mount qcow2 file
111-
run: sudo guestmount -a ${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
117+
run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
112118

113119
- name: Run Trivy vulnerability scanner
114120
uses: aquasecurity/trivy-action@<version>  # version tag lost to email obfuscation in the page scrape
115121
with:
116122
scan-type: fs
117-
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
123+
scan-ref: "${{ steps.manifest.outputs.image-name }}"
118124
scanners: "vuln"
119125
format: sarif
120126
output: "${{ steps.manifest.outputs.image-name }}.sarif"
@@ -130,7 +136,7 @@ jobs:
130136
uses: aquasecurity/trivy-action@<version>  # version tag lost to email obfuscation in the page scrape
131137
with:
132138
scan-type: fs
133-
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
139+
scan-ref: "${{ steps.manifest.outputs.image-name }}"
134140
scanners: "vuln"
135141
format: table
136142
exit-code: '1'

ansible/extras.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
- name: Setup CUDA
2222
hosts: cuda
2323
become: yes
24-
gather_facts: no
24+
gather_facts: yes
2525
tags: cuda
2626
tasks:
2727
- import_role:

ansible/roles/cuda/defaults/main.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
cuda_distro: rhel8
1+
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
22
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
33
cuda_driver_stream: default
4+
cuda_package_version: 'latest'
45
cuda_packages:
5-
- cuda
6+
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
67
- nvidia-gds
78
# _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
8-
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}"
9+
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}"
910
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
1011
cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples"
1112
cuda_samples_programs:

ansible/roles/cuda/tasks/main.yml

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,13 @@
2424
failed_when: false
2525
register: _cuda_driver_module_enabled
2626

27-
- name: List nvidia driver dnf module stream versions
28-
shell:
29-
cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V
30-
# Output of interest from command is something like (some whitespace removed):
31-
# "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch "
32-
changed_when: false
33-
register: _cuda_driver_module_streams
34-
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
35-
3627
- name: Enable nvidia driver module
37-
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}"
28+
ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
3829
register: _cuda_driver_module_enable
3930
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
4031
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
4132

42-
- name: Install nvidia drivers # TODO: make removal possible?
33+
- name: Install nvidia drivers
4334
ansible.builtin.command: dnf module install -y nvidia-driver
4435
register: _cuda_driver_install
4536
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"

environments/.stackhpc/terraform/main.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ variable "cluster_image" {
3030
type = map(string)
3131
default = {
3232
# https://github.com/stackhpc/ansible-slurm-appliance/pull/413
33-
RL8: "openhpc-RL8-240813-1317-1b370a36"
34-
RL9: "openhpc-ofed-RL9-240813-1317-1b370a36"
33+
RL8: "openhpc-RL8-240904-1509-1687368f"
34+
RL9: "openhpc-ofed-RL9-240904-1509-1687368f"
3535
}
3636
}
3737

packer/openstack.pkr.hcl

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,13 @@ variable "volume_type" {
129129
}
130130

131131
variable "volume_size" {
132-
type = number
133-
default = 15
132+
type = map(number)
133+
default = {
134+
# fat image builds, GB:
135+
openhpc = 15
136+
openhpc-ofed = 15
137+
openhpc-cuda = 30
138+
}
134139
}
135140

136141
variable "image_disk_format" {
@@ -150,6 +155,7 @@ variable "groups" {
150155
# fat image builds:
151156
openhpc = ["control", "compute", "login"]
152157
openhpc-ofed = ["control", "compute", "login", "ofed"]
158+
openhpc-cuda = ["control", "compute", "login", "ofed", "cuda"]
153159
}
154160
}
155161

@@ -158,11 +164,11 @@ source "openstack" "openhpc" {
158164
flavor = var.flavor
159165
use_blockstorage_volume = var.use_blockstorage_volume
160166
volume_type = var.volume_type
167+
volume_size = var.volume_size[source.name]
161168
metadata = var.metadata
162169
networks = var.networks
163170
floating_ip_network = var.floating_ip_network
164171
security_groups = var.security_groups
165-
volume_size = var.volume_size
166172

167173
# Input image:
168174
source_image = "${var.source_image[var.os_version]}"
@@ -178,7 +184,7 @@ source "openstack" "openhpc" {
178184
ssh_bastion_private_key_file = var.ssh_bastion_private_key_file
179185

180186
# Output image:
181-
image_disk_format = var.image_disk_format
187+
image_disk_format = "qcow2"
182188
image_visibility = var.image_visibility
183189
image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}"
184190
}
@@ -195,6 +201,11 @@ build {
195201
name = "openhpc-ofed"
196202
}
197203

204+
# CUDA fat image:
205+
source "source.openstack.openhpc" {
206+
name = "openhpc-cuda"
207+
}
208+
198209
# Extended site-specific image, built on fat image:
199210
source "source.openstack.openhpc" {
200211
name = "openhpc-extra"

0 commit comments

Comments
 (0)