Skip to content

Commit 42d6124

Browse files
committed
Merge branch 'main' into feat/nhc-v2
2 parents fe4c25a + 13fa5c2 commit 42d6124

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+845
-367
lines changed

.github/workflows/extra.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ jobs:
121121
- name: Make image usable for further builds
122122
run: |
123123
. venv/bin/activate
124-
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
124+
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" || true
125125
126126
- name: Delete image for automatically-run workflows
127127
run: |

.github/workflows/fatimage.yml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ on:
1010
- LEAFCLOUD
1111
- SMS
1212
- ARCUS
13+
cleanup_on_failure:
14+
description: Cleanup Packer resources on failure
15+
type: boolean
16+
required: true
17+
default: true
1318

1419
jobs:
1520
openstack:
@@ -23,10 +28,10 @@ jobs:
2328
matrix: # build RL8, RL9
2429
build:
2530
- image_name: openhpc-RL8
26-
source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2
31+
source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.raw
2732
inventory_groups: control,compute,login,update
2833
- image_name: openhpc-RL9
29-
source_image_name: Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.qcow2
34+
source_image_name: Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.raw
3035
inventory_groups: control,compute,login,update
3136
env:
3237
ANSIBLE_FORCE_COLOR: True
@@ -78,7 +83,7 @@ jobs:
7883
packer init .
7984
8085
PACKER_LOG=1 packer build \
81-
-on-error=${{ vars.PACKER_ON_ERROR }} \
86+
-on-error=${{ github.event.inputs.cleanup_on_failure && 'cleanup' || 'abort' }} \
8287
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
8388
-var "source_image_name=${{ matrix.build.source_image_name }}" \
8489
-var "image_name=${{ matrix.build.image_name }}" \
@@ -102,7 +107,7 @@ jobs:
102107
- name: Make image usable for further builds
103108
run: |
104109
. venv/bin/activate
105-
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
110+
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" || true
106111
107112
- name: Upload manifest artifact
108113
uses: actions/upload-artifact@v4

.github/workflows/nightly-cleanup.yml

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,34 @@ jobs:
5959
- name: Delete CI clusters
6060
run: |
6161
. venv/bin/activate
62-
if [[ -z "${ci_clusters}" ]]; then
62+
if [[ -z ${ci_clusters} ]]; then
6363
echo "No clusters to delete."
6464
exit 0
6565
fi
66-
echo "Deleting clusters: ${ci_clusters}"
67-
./dev/delete-cluster.py ${ci_clusters} --force
66+
67+
for cluster_prefix in ${ci_clusters}
68+
do
69+
echo "Processing cluster: $cluster_prefix"
70+
71+
# Get all servers with the matching name for control node
72+
CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json)
73+
74+
# Get unique server names to avoid duplicate cleanup
75+
UNIQUE_NAMES=$(echo "$CONTROL_SERVERS" | jq -r '.[].Name' | sort | uniq)
76+
for name in $UNIQUE_NAMES; do
77+
echo "Deleting cluster with control node: $name"
78+
79+
# Get the first matching server ID by name
80+
server=$(echo "$CONTROL_SERVERS" | jq -r '.[] | select(.Name=="'"$name"'") | .ID' | head -n1)
81+
82+
# Make sure server still exists (wasn't deleted earlier)
83+
if ! openstack server show "$server" &>/dev/null; then
84+
echo "Server $server no longer exists, skipping $name."
85+
continue
86+
fi
87+
88+
echo "Deleting cluster $cluster_prefix (server $server)..."
89+
./dev/delete-cluster.py $cluster_prefix --force
90+
done
91+
done
6892
shell: bash
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
# Release images workflow.
#
# For each build in the matrix (RL8, RL9) this looks up the image name recorded
# under `.cluster_image[<build>]` in IMAGE_PATH and copies that object from the
# pre-release S3 bucket to the release bucket using s3cmd.
name: Release images
on:
  workflow_dispatch:
  release:
    types:
      - published # should work for both pre-releases and releases
env:
  IMAGE_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
jobs:
  ci-image-release:
    name: ci-image-release
    runs-on: ubuntu-22.04
    # The matrix value is part of the group name so that the RL8 and RL9 jobs
    # do not share one concurrency group (which would serialise them and let a
    # queued job be cancelled by a newer run of the same workflow/ref).
    concurrency: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }}
    strategy:
      fail-fast: false
      matrix:
        build:
          - RL8
          - RL9
    steps:
      - uses: actions/checkout@v4

      - name: Write s3cmd configuration
        # The secret is handed over via the environment instead of being
        # expanded into the script text, so quotes, `$` or backticks inside the
        # configuration are not interpreted by the shell.
        run: printf '%s\n' "$ARCUS_S3_CFG" > ~/.s3cfg
        env:
          ARCUS_S3_CFG: ${{ secrets.ARCUS_S3_CFG }}

      - name: Install s3cmd
        run: |
          sudo apt-get update
          sudo apt-get --yes install s3cmd

      - name: Retrieve image name
        run: |
          # `-e` makes jq exit non-zero when the lookup yields null, so a build
          # missing from IMAGE_PATH fails this step instead of exporting the
          # literal string "null" as the image name.
          TARGET_IMAGE=$(jq -er --arg version "${{ matrix.build }}" '.cluster_image[$version]' "${{ env.IMAGE_PATH }}")
          echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV"

      - name: Copy image from pre-release to release bucket
        # TARGET_IMAGE was exported through GITHUB_ENV by the previous step.
        run: s3cmd cp "s3://openhpc-images-prerelease/${TARGET_IMAGE}" s3://openhpc-images

.github/workflows/s3-image-sync.yml

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ jobs:
2626

2727
- name: Install s3cmd
2828
run: |
29+
sudo apt-get update
2930
sudo apt-get --yes install s3cmd
3031
3132
- name: Cleanup S3 bucket
@@ -75,20 +76,38 @@ jobs:
7576
echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg
7677
shell: bash
7778

78-
- name: Install s3cmd
79+
- name: Install s3cmd and qemu-utils
7980
run: |
80-
sudo apt-get --yes install s3cmd
81+
sudo apt-get update
82+
sudo apt-get --yes install s3cmd qemu-utils
8183
8284
- name: Retrieve image name
8385
run: |
8486
TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}")
8587
echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV"
8688
shell: bash
8789

90+
- name: Clear up some space on runner
91+
run: |
92+
df -h
93+
sudo rm -rf /usr/share/dotnet
94+
sudo rm -rf /opt/ghc
95+
sudo rm -rf "/usr/local/share/boost"
96+
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
97+
sudo apt-get clean
98+
df -h
99+
88100
- name: Download image to runner
89101
run: |
90102
. venv/bin/activate
91-
openstack image save --file ${{ env.TARGET_IMAGE }} ${{ env.TARGET_IMAGE }}
103+
openstack image save --file "${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}"
104+
df -h
105+
shell: bash
106+
107+
- name: Convert image to QCOW2
108+
run: |
109+
. venv/bin/activate
110+
qemu-img convert -f raw -O qcow2 -c "${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}"
92111
shell: bash
93112

94113
- name: Upload Image to S3

.github/workflows/stackhpc.yml

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,9 @@ jobs:
9191
run: dev/setup-env.sh
9292

9393
- name: Install OpenTofu
94-
uses: opentofu/setup-opentofu@v1
94+
uses: opentofu/setup-opentofu@v1.0.5
9595
with:
96-
tofu_version: 1.6.2
96+
tofu_version: 1.9.0
9797

9898
- name: Initialise tofu
9999
run: tofu init
@@ -230,6 +230,16 @@ jobs:
230230
env:
231231
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
232232

233+
- name: Delete possible volume snapshot from slurm upgrade
234+
run: |
235+
. venv/bin/activate
236+
. environments/.stackhpc/activate
237+
if [ -n "$SNAPSHOT" ]
238+
then
239+
echo Deleting $SNAPSHOT
240+
openstack volume snapshot delete $SNAPSHOT
241+
fi
242+
233243
- name: Delete infrastructure
234244
run: |
235245
. venv/bin/activate

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ venv
55
*.pyc
66
packer/openhpc2
77
.vscode
8+
requirements.yml.last

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@ The default configuration in this repository may be used to create a cluster to
2525
- Persistent state backed by an OpenStack volume.
2626
- NFS-based shared file system backed by another OpenStack volume.
2727

28-
Note that the Open OnDemand portal and its remote apps are not usable with this default configuration.
29-
3028
It requires an OpenStack cloud, and an Ansible "deploy host" with access to that cloud.
3129

3230
Before starting ensure that:

ansible/bootstrap.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@
144144
- appliances_mode == 'configure'
145145
- not (dnf_repos_allow_insecure_creds | default(false)) # useful for development
146146

147-
- hosts: cacerts:!builder
147+
- hosts: cacerts
148148
tags: cacerts
149149
gather_facts: false
150150
tasks:

ansible/roles/cacerts/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Configure CA certificates and trusts.
44

55
## Role variables
66

7-
- `ca-certificates`: Optional str. Path to directory containing certificates
7+
- `cacerts_cert_dir`: Optional str. Path to directory containing certificates
88
in PEM or DER format. Any files here will be added to the list of CAs trusted
99
by the system.
1010

0 commit comments

Comments
 (0)