Skip to content

Commit 22a4e6d

Browse files
authored
Merge branch 'main' into feature/k3s-ansible-init
2 parents 0975257 + 17a2432 commit 22a4e6d

File tree

20 files changed

+238
-101
lines changed

20 files changed

+238
-101
lines changed

.github/workflows/fatimage.yml

Lines changed: 31 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,39 +1,51 @@
11

22
name: Build fat image
3-
'on':
3+
on:
44
workflow_dispatch:
5-
concurrency:
6-
group: ${{ github.ref }}-{{ matrix.os_version }}-{{ matrix.build }} # to branch/PR + OS + build
7-
cancel-in-progress: true
5+
inputs:
6+
ci_cloud:
7+
description: 'Select the CI_CLOUD'
8+
required: true
9+
type: choice
10+
options:
11+
- LEAFCLOUD
12+
- SMS
13+
- ARCUS
814
jobs:
915
openstack:
1016
name: openstack-imagebuild
17+
concurrency:
18+
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
19+
cancel-in-progress: true
1120
runs-on: ubuntu-22.04
1221
strategy:
13-
matrix:
22+
fail-fast: false # allow other matrix jobs to continue even if one fails
23+
matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions
1424
os_version:
1525
- RL8
1626
- RL9
1727
build:
18-
- openstack.openhpc
1928
- openstack.openhpc-ofed
29+
- openstack.openhpc-cuda
2030
exclude:
2131
- os_version: RL8
22-
build: openstack.openhpc-ofed
23-
- os_version: RL9
24-
build: openstack.openhpc
32+
build: openstack.openhpc-cuda
2533
env:
2634
ANSIBLE_FORCE_COLOR: True
2735
OS_CLOUD: openstack
28-
CI_CLOUD: ${{ vars.CI_CLOUD }}
36+
CI_CLOUD: ${{ github.event.inputs.ci_cloud }}
2937
steps:
3038
- uses: actions/checkout@v2
3139

40+
- name: Record settings for CI cloud
41+
run: |
42+
echo CI_CLOUD: ${{ env.CI_CLOUD }}
43+
3244
- name: Setup ssh
3345
run: |
3446
set -x
3547
mkdir ~/.ssh
36-
echo "${{ secrets[format('{0}_SSH_KEY', vars.CI_CLOUD)] }}" > ~/.ssh/id_rsa
48+
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
3749
chmod 0600 ~/.ssh/id_rsa
3850
shell: bash
3951

@@ -47,7 +59,7 @@ jobs:
4759
- name: Write clouds.yaml
4860
run: |
4961
mkdir -p ~/.config/openstack/
50-
echo "${{ secrets[format('{0}_CLOUDS_YAML', vars.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
62+
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
5163
shell: bash
5264

5365
- name: Setup environment
@@ -62,7 +74,7 @@ jobs:
6274
. environments/.stackhpc/activate
6375
cd packer/
6476
packer init .
65-
PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -only=${{ matrix.build }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
77+
PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -only=${{ matrix.build }} -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
6678
env:
6779
PKR_VAR_os_version: ${{ matrix.os_version }}
6880

@@ -81,7 +93,9 @@ jobs:
8193
- name: Download image
8294
run: |
8395
. venv/bin/activate
84-
openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}
96+
sudo mkdir /mnt/images
97+
sudo chmod 777 /mnt/images
98+
openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}
8599
86100
- name: Set up QEMU
87101
uses: docker/setup-qemu-action@v3
@@ -95,13 +109,13 @@ jobs:
95109
run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}'
96110

97111
- name: mount qcow2 file
98-
run: sudo guestmount -a ${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
112+
run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
99113

100114
- name: Run Trivy vulnerability scanner
101115
uses: aquasecurity/[email protected]
102116
with:
103117
scan-type: fs
104-
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
118+
scan-ref: "${{ steps.manifest.outputs.image-name }}"
105119
scanners: "vuln"
106120
format: sarif
107121
output: "${{ steps.manifest.outputs.image-name }}.sarif"
@@ -117,7 +131,7 @@ jobs:
117131
uses: aquasecurity/[email protected]
118132
with:
119133
scan-type: fs
120-
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
134+
scan-ref: "${{ steps.manifest.outputs.image-name }}"
121135
scanners: "vuln"
122136
format: table
123137
exit-code: '1'

.github/workflows/stackhpc.yml

Lines changed: 51 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2,75 +2,92 @@
22
name: Test deployment and reimage on OpenStack
33
on:
44
workflow_dispatch:
5-
inputs:
6-
use_RL8:
7-
required: true
8-
description: Include RL8 tests
9-
type: boolean
10-
default: false
115
push:
126
branches:
137
- main
8+
paths:
9+
- '**'
10+
- '!dev/**'
11+
- 'dev/setup-env.sh'
12+
- '!docs/**'
13+
- '!README.md'
14+
- '!.gitignore'
1415
pull_request:
16+
paths:
17+
- '**'
18+
- '!dev/**'
19+
- 'dev/setup-env.sh'
20+
- '!docs/**'
21+
- '!README.md'
22+
- '!.gitignore'
1523
jobs:
1624
openstack:
1725
name: openstack-ci
18-
concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS
26+
concurrency:
27+
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS
28+
cancel-in-progress: true
1929
runs-on: ubuntu-22.04
2030
strategy:
31+
fail-fast: false # allow other matrix jobs to continue even if one fails
2132
matrix:
22-
os_version: [RL8, RL9]
23-
rl8_selected:
24-
- ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch
25-
rl8_branch:
26-
- ${{ startsWith(github.head_ref, 'rl8') == true }} # only potentially for pull_request, always false on merge
27-
rl8_label:
28-
- ${{ contains(github.event.pull_request.labels.*.name, 'RL8') }} # NB: needs a new commit if added after PR created
29-
exclude:
30-
- os_version: RL8
31-
rl8_selected: false
32-
rl8_branch: false
33-
rl8_label: false
33+
os_version:
34+
- RL8
35+
- RL9
3436
env:
3537
ANSIBLE_FORCE_COLOR: True
3638
OS_CLOUD: openstack
3739
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
38-
CI_CLOUD: ${{ vars.CI_CLOUD }}
40+
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
41+
TF_VAR_os_version: ${{ matrix.os_version }}
3942
steps:
4043
- uses: actions/checkout@v2
4144

45+
- name: Override CI_CLOUD if PR label is present
46+
if: ${{ github.event_name == 'pull_request' }}
47+
run: |
48+
# Iterate over the labels
49+
labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name')
50+
echo $labels
51+
for label in $labels; do
52+
if [[ $label == CI_CLOUD=* ]]; then
53+
# Extract the value after 'CI_CLOUD='
54+
CI_CLOUD_OVERRIDE=${label#CI_CLOUD=}
55+
echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV
56+
fi
57+
done
58+
4259
- name: Record settings for CI cloud
4360
run: |
44-
echo CI_CLOUD: ${{ vars.CI_CLOUD }}
61+
echo CI_CLOUD: ${{ env.CI_CLOUD }}
4562
4663
- name: Setup ssh
4764
run: |
4865
set -x
4966
mkdir ~/.ssh
50-
echo "${{ secrets[format('{0}_SSH_KEY', vars.CI_CLOUD)] }}" > ~/.ssh/id_rsa
67+
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
5168
chmod 0600 ~/.ssh/id_rsa
5269
shell: bash
53-
70+
5471
- name: Add bastion's ssh key to known_hosts
5572
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
5673
shell: bash
57-
74+
5875
- name: Install ansible etc
5976
run: dev/setup-env.sh
6077

6178
- name: Install OpenTofu
6279
uses: opentofu/setup-opentofu@v1
6380
with:
6481
tofu_version: 1.6.2
65-
82+
6683
- name: Initialise terraform
6784
run: terraform init
6885
working-directory: ${{ github.workspace }}/environments/.stackhpc/terraform
69-
86+
7087
- name: Write clouds.yaml
7188
run: |
7289
mkdir -p ~/.config/openstack/
73-
echo "${{ secrets[format('{0}_CLOUDS_YAML', vars.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
90+
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
7491
shell: bash
7592

7693
- name: Setup environment-specific inventory/terraform inputs
@@ -88,19 +105,15 @@ jobs:
88105
. venv/bin/activate
89106
. environments/.stackhpc/activate
90107
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
91-
terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
92-
env:
93-
TF_VAR_os_version: ${{ matrix.os_version }}
108+
terraform apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
94109
95110
- name: Delete infrastructure if provisioning failed
96111
run: |
97112
. venv/bin/activate
98113
. environments/.stackhpc/activate
99114
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
100-
terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
115+
terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
101116
if: failure() && steps.provision_servers.outcome == 'failure'
102-
env:
103-
TF_VAR_os_version: ${{ matrix.os_version }}
104117

105118
- name: Configure cluster
106119
run: |
@@ -126,14 +139,14 @@ jobs:
126139
run: |
127140
. venv/bin/activate
128141
. environments/.stackhpc/activate
129-
142+
130143
# load ansible variables into shell:
131144
ansible-playbook ansible/ci/output_vars.yml \
132145
-e output_vars_hosts=openondemand \
133146
-e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \
134147
-e output_vars_items=bastion_ip,bastion_user,openondemand_servername
135148
source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt
136-
149+
137150
# setup ssh proxying:
138151
sudo apt-get --yes install proxychains
139152
echo proxychains installed
@@ -170,7 +183,7 @@ jobs:
170183
# ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
171184
# ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
172185
# ansible-playbook -v ansible/ci/check_slurm.yml
173-
186+
174187
- name: Test reimage of login and control nodes (via rebuild adhoc)
175188
run: |
176189
. venv/bin/activate
@@ -179,7 +192,7 @@ jobs:
179192
ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
180193
ansible-playbook -v ansible/site.yml
181194
ansible-playbook -v ansible/ci/check_slurm.yml
182-
195+
183196
- name: Check sacct state survived reimage
184197
run: |
185198
. venv/bin/activate
@@ -197,10 +210,8 @@ jobs:
197210
. venv/bin/activate
198211
. environments/.stackhpc/activate
199212
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
200-
terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
213+
terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
201214
if: ${{ success() || cancelled() }}
202-
env:
203-
TF_VAR_os_version: ${{ matrix.os_version }}
204215

205216
# - name: Delete images
206217
# run: |

ansible/bootstrap.yml

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -227,24 +227,25 @@
227227
- update
228228
tasks:
229229
- name: Check for pending reboot from package updates
230-
stat:
231-
path: /var/run/reboot-required
230+
command:
231+
cmd: dnf needs-restarting -r
232232
register: update_reboot_required
233-
- debug:
234-
msg: "setstatus:{{ (sestatus.reboot_required | default(false)) }} packages: {{ (update_reboot_required.stat.exists | bool) }}"
235-
- name: Reboot if required from SELinux state change or package upgrades
233+
failed_when: "update_reboot_required.rc not in [0, 1]"
234+
changed_when: false
235+
- name: Reboot to cover SELinux state change or package upgrades
236236
reboot:
237237
post_reboot_delay: 30
238-
when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.stat.exists | bool)
238+
when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.rc == 1)
239239
- name: Wait for hosts to be reachable
240240
wait_for_connection:
241241
sleep: 15
242-
- name: update facts
242+
- name: Clear facts
243+
meta: clear_facts
244+
- name: Update facts
243245
setup:
244-
when: (sestatus.changed | default(false)) or (sestatus.reboot_required | default(false))
245246

246247
- hosts: ofed
247-
gather_facts: no
248+
gather_facts: yes
248249
become: yes
249250
tags: ofed
250251
tasks:

ansible/extras.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
- name: Setup CUDA
2222
hosts: cuda
2323
become: yes
24-
gather_facts: no
24+
gather_facts: yes
2525
tags: cuda
2626
tasks:
2727
- import_role:

ansible/roles/cuda/defaults/main.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
cuda_distro: rhel8
1+
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
22
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
33
cuda_driver_stream: default
4+
cuda_package_version: 'latest'
45
cuda_packages:
5-
- cuda
6+
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
67
- nvidia-gds
78
# _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
8-
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}"
9+
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}"
910
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
1011
cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples"
1112
cuda_samples_programs:

ansible/roles/cuda/tasks/main.yml

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,13 @@
2424
failed_when: false
2525
register: _cuda_driver_module_enabled
2626

27-
- name: List nvidia driver dnf module stream versions
28-
shell:
29-
cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V
30-
# Output of interest from command is something like (some whitespace removed):
31-
# "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch "
32-
changed_when: false
33-
register: _cuda_driver_module_streams
34-
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
35-
3627
- name: Enable nvidia driver module
37-
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}"
28+
ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
3829
register: _cuda_driver_module_enable
3930
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
4031
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
4132

42-
- name: Install nvidia drivers # TODO: make removal possible?
33+
- name: Install nvidia drivers
4334
ansible.builtin.command: dnf module install -y nvidia-driver
4435
register: _cuda_driver_install
4536
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"

0 commit comments

Comments
 (0)