Commit d79f3ea

Merge branch 'main' into refactor/toplevel-playbooks
2 parents: b398135 + ede7cb3

File tree

11 files changed: +66 −22 lines changed

.github/workflows/extra.yml

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ jobs:
       - name: Make image usable for further builds
         run: |
           . venv/bin/activate
-          openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
+          openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" || true
 
       - name: Delete image for automatically-run workflows
         run: |
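
The appended `|| true` makes this cleanup step tolerant of failure: if the image has no `signature_verified` property to unset, `openstack image unset` exits non-zero and would otherwise fail the job, since GitHub Actions runs `run:` blocks with bash's exit-on-error flag. A minimal sketch of the pattern, runnable in any shell:

    set -e                     # abort on any failing command, as in Actions run steps
    false || true              # the `|| true` guard converts a non-zero exit into success
    echo "still running ($?)"  # prints: still running (0)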

.github/workflows/fatimage.yml

Lines changed: 3 additions & 3 deletions
@@ -23,10 +23,10 @@ jobs:
       matrix: # build RL8, RL9
         build:
           - image_name: openhpc-RL8
-            source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2
+            source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.raw
             inventory_groups: control,compute,login,update
           - image_name: openhpc-RL9
-            source_image_name: Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.qcow2
+            source_image_name: Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.raw
             inventory_groups: control,compute,login,update
     env:
       ANSIBLE_FORCE_COLOR: True
@@ -102,7 +102,7 @@ jobs:
       - name: Make image usable for further builds
         run: |
           . venv/bin/activate
-          openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
+          openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" || true
 
       - name: Upload manifest artifact
         uses: actions/upload-artifact@v4
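
Both Rocky source images move from `.qcow2` to `.raw`, consistent with the `s3-image-sync.yml` changes below, where images are handled as raw in Glance and only converted to QCOW2 for distribution. A quick way to confirm what format Glance records for a source image (a sketch; the image name is the RL9 entry from the matrix above):

    openstack image show -f value -c disk_format Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.raw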

.github/workflows/nightly-cleanup.yml

Lines changed: 27 additions & 3 deletions
@@ -59,10 +59,34 @@ jobs:
       - name: Delete CI clusters
         run: |
           . venv/bin/activate
-          if [[ -z "${ci_clusters}" ]]; then
+          if [[ -z ${ci_clusters} ]]; then
             echo "No clusters to delete."
             exit 0
           fi
-          echo "Deleting clusters: ${ci_clusters}"
-          ./dev/delete-cluster.py ${ci_clusters} --force
+
+          for cluster_prefix in ${ci_clusters}
+          do
+            echo "Processing cluster: $cluster_prefix"
+
+            # Get all servers with the matching name for control node
+            CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json)
+
+            # Get unique server names to avoid duplicate cleanup
+            UNIQUE_NAMES=$(echo "$CONTROL_SERVERS" | jq -r '.[].Name' | sort | uniq)
+            for name in $UNIQUE_NAMES; do
+              echo "Deleting cluster with control node: $name"
+
+              # Get the first matching server ID by name
+              server=$(echo "$CONTROL_SERVERS" | jq -r '.[] | select(.Name=="'"$name"'") | .ID' | head -n1)
+
+              # Make sure server still exists (wasn't deleted earlier)
+              if ! openstack server show "$server" &>/dev/null; then
+                echo "Server $server no longer exists, skipping $name."
+                continue
+              fi
+
+              echo "Deleting cluster $cluster_prefix (server $server)..."
+              ./dev/delete-cluster.py $cluster_prefix --force
+            done
+          done
         shell: bash
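
Rather than passing every prefix to a single `./dev/delete-cluster.py` invocation, the loop now checks that each cluster's control node still exists before attempting deletion, so one already-deleted cluster no longer aborts the rest of the cleanup. The `jq` selection used above can be tried standalone with mock data (names and IDs invented for illustration):

    CONTROL_SERVERS='[{"Name":"ci-42-control","ID":"aaa"},{"Name":"ci-42-control","ID":"bbb"}]'
    name=ci-42-control
    echo "$CONTROL_SERVERS" | jq -r '.[] | select(.Name=="'"$name"'") | .ID' | head -n1  # prints: aaa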

.github/workflows/s3-image-sync.yml

Lines changed: 22 additions & 3 deletions
@@ -26,6 +26,7 @@ jobs:
 
       - name: Install s3cmd
         run: |
+          sudo apt-get update
           sudo apt-get --yes install s3cmd
 
       - name: Cleanup S3 bucket
@@ -75,20 +76,38 @@ jobs:
           echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg
         shell: bash
 
-      - name: Install s3cmd
+      - name: Install s3cmd and qemu-utils
         run: |
-          sudo apt-get --yes install s3cmd
+          sudo apt-get update
+          sudo apt-get --yes install s3cmd qemu-utils
 
       - name: Retrieve image name
         run: |
           TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}")
           echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV"
         shell: bash
 
+      - name: Clear up some space on runner
+        run: |
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          sudo apt-get clean
+          df -h
+
       - name: Download image to runner
         run: |
           . venv/bin/activate
-          openstack image save --file ${{ env.TARGET_IMAGE }} ${{ env.TARGET_IMAGE }}
+          openstack image save --file "${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}"
+          df -h
+        shell: bash
+
+      - name: Convert image to QCOW2
+        run: |
+          . venv/bin/activate
+          qemu-img convert -f raw -O qcow2 -c "${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}"
         shell: bash
 
       - name: Upload Image to S3
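
The image is now pulled from Glance as raw and compressed into QCOW2 on the runner (`qemu-img convert -c`), which is why `qemu-utils` is installed, why disk space is freed beforehand (the raw file and the converted copy must coexist on the runner), and why `df -h` is logged around the large file operations. The conversion can be reproduced locally; the file names here are illustrative:

    qemu-img info openhpc-RL9.raw        # reports format: raw
    qemu-img convert -f raw -O qcow2 -c openhpc-RL9.raw openhpc-RL9
    qemu-img info openhpc-RL9            # reports format: qcow2, compressed and smaller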

ansible/roles/hpctests/defaults/main.yml

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 ---
 hpctests_user: "{{ ansible_user }}"
+hpctests_group: "{{ ansible_user }}"
 hpctests_rootdir: "/home/{{ hpctests_user }}/hpctests"
 hpctests_pre_cmd: ''
 hpctests_pingmatrix_modules: [gnu12 openmpi4]

ansible/roles/hpctests/tasks/setup.yml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
     path: "{{ hpctests_rootdir }}"
     state: directory
     owner: "{{ hpctests_user }}"
-    group: "{{ hpctests_user }}"
+    group: "{{ hpctests_group }}"
 
 - name: Set fact for UCX_NET_DEVICES
   set_fact:
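
Previously the test directory's group was hard-wired to the user name; the new `hpctests_group` default preserves that behaviour while letting sites whose `ansible_user` has a primary group other than its username override it. A sketch of overriding at run time (the playbook path and group name are hypothetical):

    ansible-playbook ansible/adhoc/hpctests.yml -e hpctests_group=cloud-users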

docs/experimental/slurm-controlled-rebuild.md

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ The configuration of this is complex and involves:
    be in:
    - `environments/site/tofu/variables.tf`: `cluster_image_id` for the default
      cluster image.
-   - `enviroments/$ENV/tofu/main.tf`: parameter `image_id` in node groups
+   - `environments/$ENV/tofu/main.tf`: parameter `image_id` in node groups
      defined in the `compute` or `login` variables, to override the default
      image for specific node groups.
 

docs/operations.md

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ See [Reconfiguring Slurm](#Reconfiguring-Slurm) to apply changes.
 
 # Modifying Slurm Partition-specific Configuration
 
-Modify the `openhpc_slurm_partitions` mapping usually in `enviroments/$SITE_ENV/inventory/group_vars/all/openhpc.yml` as described for [stackhpc.openhpc:slurmconf](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) (note the relevant version of this role is defined in the `requirements.yml`)
+Modify the `openhpc_slurm_partitions` mapping usually in `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml` as described for [stackhpc.openhpc:slurmconf](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) (note the relevant version of this role is defined in the `requirements.yml`)
 
 Note an Ansible inventory group for the partition is required. This is generally auto-defined by a template in the OpenTofu configuration.
 

environments/common/inventory/group_vars/all/defaults.yml

Lines changed: 7 additions & 7 deletions
@@ -42,7 +42,7 @@ appliances_local_users_default:
     home: /var/lib/{{ appliances_local_users_ansible_user_name }}
     move_home: true
     local: true
-
+
   - user: "{{ appliances_local_users_podman }}"
     enable: "{{ 'podman' in group_names }}"
 
@@ -53,7 +53,7 @@ appliances_local_users_default:
     shell: /sbin/nologin
     uid: 202
     system: true
-
+
   - group:
       name: prometheus
      gid: 976
@@ -64,7 +64,7 @@ appliances_local_users_default:
     shell: /usr/sbin/nologin
     system: true
     enable: "{{ 'prometheus' in group_names }}"
-
+
   - group:
       name: grafana
      gid: 979
@@ -79,7 +79,7 @@ appliances_local_users_default:
 
 # Overide this to add extra users whilst keeping the defaults.
 appliances_local_users_extra: [] # see format of appliances_local_users_default above
-appliances_local_users: "{{ appliances_local_users_default + appliances_local_users_extra }}"
+appliances_local_users: "{{ (appliances_local_users_default + appliances_local_users_extra) | select | list }}"
 
 ################## bootstrap: extra package installs ######################################
 
@@ -94,7 +94,7 @@ appliances_extra_packages_default:
   - postfix
   - git
   - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}"
-
+
 appliances_extra_packages_other: []
-
-appliances_extra_packages: "{{ appliances_extra_packages_default + appliances_extra_packages_other }}"
+
+appliances_extra_packages: "{{ (appliances_extra_packages_default + appliances_extra_packages_other) | select | list }}"
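
Piping the concatenated lists through `select | list` drops falsy entries, so list items that render as the empty string (for example a default blanked out by an override) disappear from the merged list instead of being installed as `''`. The filter's behaviour is easy to check with an ad-hoc Ansible call (values invented for illustration):

    ansible localhost -m debug -a "msg={{ ['postfix', '', 'git'] | select | list }}"
    # => "msg": ["postfix", "git"]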

environments/common/inventory/group_vars/all/openhpc.yml

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ openhpc_packages_default:
   - apptainer
   - podman-compose
 openhpc_packages_extra: []
-openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}"
+openhpc_packages: "{{ (openhpc_packages_default + openhpc_packages_extra) | select | list }}"
 openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}"
 openhpc_login_only_nodes: login
 openhpc_config_default:
