Skip to content

Commit 35693fb

Browse files
committed
Merge remote-tracking branch 'origin/main' into HEAD
2 parents 54ed6b6 + b99b1e9 commit 35693fb

File tree

47 files changed

+278
-148
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+278
-148
lines changed

.github/workflows/fatimage.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ on:
1010
- LEAFCLOUD
1111
- SMS
1212
- ARCUS
13+
cleanup_on_failure:
14+
description: Cleanup Packer resources on failure
15+
type: boolean
16+
required: true
17+
default: true
1318

1419
jobs:
1520
openstack:
@@ -78,7 +83,7 @@ jobs:
7883
packer init .
7984
8085
PACKER_LOG=1 packer build \
81-
-on-error=${{ vars.PACKER_ON_ERROR }} \
86+
-on-error=${{ inputs.cleanup_on_failure && 'cleanup' || 'abort' }} \
8287
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
8388
-var "source_image_name=${{ matrix.build.source_image_name }}" \
8489
-var "image_name=${{ matrix.build.image_name }}" \

.github/workflows/nightly-cleanup.yml

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,34 @@ jobs:
5959
- name: Delete CI clusters
6060
run: |
6161
. venv/bin/activate
62-
if [[ -z "${ci_clusters}" ]]; then
62+
if [[ -z ${ci_clusters} ]]; then
6363
echo "No clusters to delete."
6464
exit 0
6565
fi
66-
echo "Deleting clusters: ${ci_clusters}"
67-
./dev/delete-cluster.py ${ci_clusters} --force
66+
67+
for cluster_prefix in ${ci_clusters}
68+
do
69+
echo "Processing cluster: $cluster_prefix"
70+
71+
# Get all servers with the matching name for control node
72+
CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json)
73+
74+
# Get unique server names to avoid duplicate cleanup
75+
UNIQUE_NAMES=$(echo "$CONTROL_SERVERS" | jq -r '.[].Name' | sort | uniq)
76+
for name in $UNIQUE_NAMES; do
77+
echo "Deleting cluster with control node: $name"
78+
79+
# Get the first matching server ID by name
80+
server=$(echo "$CONTROL_SERVERS" | jq -r '.[] | select(.Name=="'"$name"'") | .ID' | head -n1)
81+
82+
# Make sure server still exists (wasn't deleted earlier)
83+
if ! openstack server show "$server" &>/dev/null; then
84+
echo "Server $server no longer exists, skipping $name."
85+
continue
86+
fi
87+
88+
echo "Deleting cluster $cluster_prefix (server $server)..."
89+
./dev/delete-cluster.py $cluster_prefix --force
90+
done
91+
done
6892
shell: bash
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
# Promote the cluster images named in IMAGE_PATH from the pre-release S3
# bucket to the release bucket. Triggered when a GitHub release (or
# pre-release) is published, or manually via workflow_dispatch.
name: Release images
on:
  workflow_dispatch:
  release:
    types:
      - published # should work for both pre-releases and releases
env:
  IMAGE_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
jobs:
  ci-image-release:
    name: ci-image-release
    runs-on: ubuntu-22.04
    # NOTE(review): the group does not include matrix.build, so the RL8 and
    # RL9 jobs of one run share it and should execute one at a time - confirm
    # this serialisation is intended.
    concurrency: ${{ github.workflow }}-${{ github.ref }}
    strategy:
      fail-fast: false
      matrix:
        build:
          - RL8
          - RL9
    steps:
      - uses: actions/checkout@v4

      - name: Write s3cmd configuration
        # Hand the secret over via the environment rather than expanding it
        # into the script text, so quotes, `$` or backticks inside the config
        # are never interpreted by the shell.
        env:
          ARCUS_S3_CFG: ${{ secrets.ARCUS_S3_CFG }}
        run: |
          # Credentials file: create it readable by the runner user only.
          umask 077
          printf '%s\n' "$ARCUS_S3_CFG" > ~/.s3cfg

      - name: Install s3cmd
        run: |
          sudo apt-get update
          sudo apt-get --yes install s3cmd

      - name: Retrieve image name
        run: |
          # -e makes jq exit non-zero when the lookup yields null, so a missing
          # key fails this step instead of exporting the literal string "null".
          TARGET_IMAGE=$(jq -e --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "$IMAGE_PATH")
          echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV"

      - name: Copy image from pre-release to release bucket
        run: s3cmd cp "s3://openhpc-images-prerelease/${TARGET_IMAGE}" s3://openhpc-images

.github/workflows/stackhpc.yml

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,9 @@ jobs:
9191
run: dev/setup-env.sh
9292

9393
- name: Install OpenTofu
94-
uses: opentofu/setup-opentofu@v1
94+
uses: opentofu/setup-opentofu@v1.0.5
9595
with:
96-
tofu_version: 1.6.2
96+
tofu_version: 1.9.0
9797

9898
- name: Initialise tofu
9999
run: tofu init
@@ -230,6 +230,16 @@ jobs:
230230
env:
231231
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
232232

233+
- name: Delete possible volume snapshot from slurm upgrade
234+
run: |
235+
. venv/bin/activate
236+
. environments/.stackhpc/activate
237+
if [ -n "$SNAPSHOT" ]
238+
then
239+
echo Deleting $SNAPSHOT
240+
openstack volume snapshot delete $SNAPSHOT
241+
fi
242+
233243
- name: Delete infrastructure
234244
run: |
235245
. venv/bin/activate

ansible/disable-repos.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
ansible.builtin.include_role:
66
name: dnf_repos
77
tasks_from: disable_repos.yml
8+
when: not dnf_repos_enabled | default(false) | bool

ansible/roles/cluster_infra/templates/outputs.tf.j2

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@ output "cluster_nodes" {
3232
}
3333
}
3434
],
35-
{% for partition in openhpc_slurm_partitions %}
35+
{% for nodegroup in openhpc_nodegroups %}
3636
[
37-
for compute in openstack_compute_instance_v2.{{ partition.name }}: {
37+
for compute in openstack_compute_instance_v2.{{ nodegroup.name }}: {
3838
name = compute.name
3939
ip = compute.network[0].fixed_ip_v4
40-
groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"],
40+
groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ nodegroup.name }}"],
4141
facts = {
4242
openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
4343
}

ansible/roles/cluster_infra/templates/resources.tf.j2

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -282,11 +282,11 @@ resource "openstack_networking_port_v2" "control_storage" {
282282
###
283283
# Workers
284284
###
285-
{% for partition in openhpc_slurm_partitions %}
285+
{% for nodegroup in openhpc_nodegroups %}
286286
# Primary network
287-
resource "openstack_networking_port_v2" "{{ partition.name }}" {
288-
count = {{ partition.count }}
289-
name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}"
287+
resource "openstack_networking_port_v2" "{{ nodegroup.name }}" {
288+
count = {{ nodegroup.count }}
289+
name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-${count.index}"
290290
network_id = "${data.openstack_networking_network_v2.cluster_network.id}"
291291
admin_state_up = "true"
292292

@@ -305,9 +305,9 @@ resource "openstack_networking_port_v2" "{{ partition.name }}" {
305305

306306
# Storage network
307307
{% if cluster_storage_network is defined %}
308-
resource "openstack_networking_port_v2" "{{ partition.name }}_storage" {
309-
count = {{ partition.count }}
310-
name = "{{ cluster_name }}-compute-{{ partition.name }}-storage-${count.index}"
308+
resource "openstack_networking_port_v2" "{{ nodegroup.name }}_storage" {
309+
count = {{ nodegroup.count }}
310+
name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-storage-${count.index}"
311311
network_id = data.openstack_networking_network_v2.cluster_storage.id
312312
admin_state_up = "true"
313313

@@ -499,25 +499,25 @@ resource "openstack_compute_instance_v2" "control" {
499499
}
500500
}
501501

502-
{% for partition in openhpc_slurm_partitions %}
503-
resource "openstack_compute_instance_v2" "{{ partition.name }}" {
504-
count = {{ partition.count }}
502+
{% for nodegroup in openhpc_nodegroups %}
503+
resource "openstack_compute_instance_v2" "{{ nodegroup.name }}" {
504+
count = {{ nodegroup.count }}
505505

506-
name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}"
506+
name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-${count.index}"
507507
image_id = "{{ cluster_image }}"
508-
{% if 'flavor_name' in partition %}
509-
flavor_name = "{{ partition.flavor_name }}"
508+
{% if 'flavor_name' in nodegroup %}
509+
flavor_name = "{{ nodegroup.flavor_name }}"
510510
{% else %}
511-
flavor_id = "{{ partition.flavor }}"
511+
flavor_id = "{{ nodegroup.flavor }}"
512512
{% endif %}
513513

514514
network {
515-
port = openstack_networking_port_v2.{{ partition.name }}[count.index].id
515+
port = openstack_networking_port_v2.{{ nodegroup.name }}[count.index].id
516516
}
517517

518518
{% if cluster_storage_network is defined %}
519519
network {
520-
port = openstack_networking_port_v2.{{ partition.name }}_storage[count.index].id
520+
port = openstack_networking_port_v2.{{ nodegroup.name }}_storage[count.index].id
521521
}
522522
{% endif %}
523523

ansible/roles/dnf_repos/defaults/main.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@ dnf_repos_filenames:
88
appstream: 'Rocky-AppStream'
99
crb: 'Rocky-PowerTools'
1010
extras: 'Rocky-Extras'
11+
grafana: 'grafana'
1112
'9':
1213
baseos: 'rocky'
1314
appstream: 'rocky'
1415
crb: 'rocky'
1516
extras: 'rocky-extras'
17+
grafana: 'grafana'
1618

1719
dnf_repos_version_filenames: "{{ dnf_repos_filenames[ansible_distribution_major_version] }}"
1820

@@ -33,6 +35,9 @@ dnf_repos_default_repolist:
3335
- file: ceph
3436
name: Ceph
3537
base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.ceph[ansible_distribution_major_version] | appliances_repo_to_subpath }}"
38+
- file: "{{ dnf_repos_version_filenames.grafana }}"
39+
name: grafana
40+
base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.grafana[ansible_distribution_major_version] | appliances_repo_to_subpath }}"
3641

3742
dnf_repos_openhpc_repolist:
3843
- name: OpenHPC

ansible/roles/hpctests/defaults/main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
---
22
hpctests_user: "{{ ansible_user }}"
3+
hpctests_group: "{{ hpctests_user }}"
34
hpctests_rootdir: "/home/{{ hpctests_user }}/hpctests"
45
hpctests_pre_cmd: ''
56
hpctests_pingmatrix_modules: [gnu12 openmpi4]

ansible/roles/hpctests/tasks/setup.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
path: "{{ hpctests_rootdir }}"
2727
state: directory
2828
owner: "{{ hpctests_user }}"
29-
group: "{{ hpctests_user }}"
29+
group: "{{ hpctests_group }}"
3030

3131
- name: Set fact for UCX_NET_DEVICES
3232
set_fact:

0 commit comments

Comments
 (0)