Skip to content

Commit feab4cf

Browse files
authored
Merge branch 'main' into feat/compute-script
2 parents 998ebf1 + 969c6f2 commit feab4cf

File tree

30 files changed

+624
-231
lines changed

30 files changed

+624
-231
lines changed

.github/workflows/fatimage.yml

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,29 +20,23 @@ jobs:
2020
runs-on: ubuntu-22.04
2121
strategy:
2222
fail-fast: false # allow other matrix jobs to continue even if one fails
23-
matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions
23+
matrix: # build RL8, RL9
2424
os_version:
2525
- RL8
2626
- RL9
2727
build:
2828
- openstack.openhpc
29-
- openstack.openhpc-cuda
30-
exclude:
31-
- os_version: RL8
32-
build: openstack.openhpc-cuda
3329
env:
3430
ANSIBLE_FORCE_COLOR: True
3531
OS_CLOUD: openstack
3632
CI_CLOUD: ${{ github.event.inputs.ci_cloud }}
3733
SOURCE_IMAGES_MAP: |
3834
{
3935
"RL8": {
40-
"openstack.openhpc": "rocky-latest-RL8",
41-
"openstack.openhpc-cuda": "rocky-latest-cuda-RL8"
36+
"openstack.openhpc": "rocky-latest-RL8"
4237
},
4338
"RL9": {
44-
"openstack.openhpc": "rocky-latest-RL9",
45-
"openstack.openhpc-cuda": "rocky-latest-cuda-RL9"
39+
"openstack.openhpc": "rocky-latest-RL9"
4640
}
4741
}
4842
@@ -117,4 +111,4 @@ jobs:
117111
path: |
118112
./image-id.txt
119113
./image-name.txt
120-
overwrite: true
114+
overwrite: true
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
name: Cleanup CI clusters
2+
on:
3+
workflow_dispatch:
4+
inputs:
5+
ci_cloud:
6+
description: 'Select the CI_CLOUD'
7+
required: true
8+
type: choice
9+
options:
10+
- LEAFCLOUD
11+
- SMS
12+
- ARCUS
13+
schedule:
14+
- cron: '0 20 * * *' # Run at 8PM - image sync runs at midnight
15+
16+
jobs:
17+
ci_cleanup:
18+
name: ci-cleanup
19+
concurrency: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.cloud }}
20+
strategy:
21+
fail-fast: false
22+
matrix:
23+
cloud:
24+
- LEAFCLOUD
25+
- SMS
26+
- ARCUS
27+
runs-on: ubuntu-22.04
28+
env:
29+
OS_CLOUD: openstack
30+
CI_CLOUD: ${{ matrix.cloud }}
31+
steps:
32+
- uses: actions/checkout@v2
33+
34+
- name: Record which cloud CI is running on
35+
run: |
36+
echo CI_CLOUD: ${{ env.CI_CLOUD }}
37+
38+
- name: Setup environment
39+
run: |
40+
python3 -m venv venv
41+
. venv/bin/activate
42+
pip install -U pip
43+
pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)
44+
shell: bash
45+
46+
- name: Write clouds.yaml
47+
run: |
48+
mkdir -p ~/.config/openstack/
49+
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
50+
shell: bash
51+
52+
- name: Find CI clusters
53+
run: |
54+
. venv/bin/activate
55+
CI_CLUSTERS=$(openstack server list | { grep --only-matching 'slurmci-RL.-[0-9]\+' || true; } | sort | uniq)  # grep exits 1 when nothing matches; tolerate that so a cloud with no CI clusters does not fail the step under bash -eo pipefail (openstack errors still propagate)
56+
echo "CI_CLUSTERS=$(echo "${CI_CLUSTERS}" | tr '\n' ' ')" >> "$GITHUB_ENV"  # append to the runner's env file (not a file literally named GITHUB_ENV), as one space-separated line, under the name the delete step reads
57+
shell: bash
58+
59+
- name: Delete clusters if control node not tagged with keep
60+
run: |
61+
. venv/bin/activate
62+
for cluster_prefix in ${CI_CLUSTERS}
63+
do
64+
TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value)
65+
if [[ $TAGS =~ "keep" ]]; then
66+
echo "Skipping ${cluster_prefix} - control instance is tagged as keep"
67+
else
68+
./dev/delete-cluster.py ${cluster_prefix} < <(yes)  # feed confirmations via process substitution: with `yes | ...` under bash -eo pipefail, yes dying on the closed pipe makes the pipeline non-zero and aborts the loop
69+
fi
70+
done
71+
shell: bash

.github/workflows/nightlybuild.yml

Lines changed: 2 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,12 @@ jobs:
2222
runs-on: ubuntu-22.04
2323
strategy:
2424
fail-fast: false # allow other matrix jobs to continue even if one fails
25-
matrix: # build RL8, RL9, RL9+CUDA versions
25+
matrix: # build RL8, RL9
2626
os_version:
2727
- RL8
2828
- RL9
2929
build:
3030
- openstack.rocky-latest
31-
- openstack.rocky-latest-cuda
32-
exclude:
33-
- os_version: RL8
34-
build: openstack.rocky-latest-cuda
35-
3631
env:
3732
ANSIBLE_FORCE_COLOR: True
3833
OS_CLOUD: openstack
@@ -108,68 +103,12 @@ jobs:
108103
echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
109104
echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
110105
111-
- name: Download image
106+
- name: Make image usable for further builds
112107
run: |
113108
. venv/bin/activate
114-
sudo mkdir /mnt/images
115-
sudo chmod 777 /mnt/images
116109
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
117-
openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-id }}
118-
119-
- name: Set up QEMU
120-
uses: docker/setup-qemu-action@v3
121-
122-
- name: install libguestfs
123-
run: |
124-
sudo apt -y update
125-
sudo apt -y install libguestfs-tools
126-
127-
- name: mkdir for mount
128-
run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}'
129-
130-
- name: mount qcow2 file
131-
run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
132-
133-
- name: Run Trivy vulnerability scanner
134-
uses: aquasecurity/[email protected]
135-
with:
136-
scan-type: fs
137-
scan-ref: "${{ steps.manifest.outputs.image-name }}"
138-
scanners: "vuln"
139-
format: sarif
140-
output: "${{ steps.manifest.outputs.image-name }}.sarif"
141-
# turn off secret scanning to speed things up
142-
env:
143-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
144-
145-
- name: Upload Trivy scan results to GitHub Security tab
146-
uses: github/codeql-action/upload-sarif@v3
147-
with:
148-
sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif"
149-
category: "${{ matrix.os_version }}-${{ matrix.build }}"
150-
151-
- name: Fail if scan has CRITICAL vulnerabilities
152-
uses: aquasecurity/[email protected]
153-
with:
154-
scan-type: fs
155-
scan-ref: "${{ steps.manifest.outputs.image-name }}"
156-
scanners: "vuln"
157-
format: table
158-
exit-code: '1'
159-
severity: 'CRITICAL'
160-
ignore-unfixed: true
161-
env:
162-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
163-
164-
- name: Delete new image if Trivy scan fails
165-
if: failure() && steps.packer_build.outcome == 'success' # Runs if the Trivy scan found crit vulnerabilities or failed
166-
run: |
167-
. venv/bin/activate
168-
echo "Deleting new image due to critical vulnerabilities or scan failure ..."
169-
openstack image delete "${{ steps.manifest.outputs.image-id }}"
170110
171111
- name: Delete old latest image
172-
if: success() # Runs only if Trivy scan passed
173112
run: |
174113
. venv/bin/activate
175114
IMAGE_COUNT=$(openstack image list --name ${{ steps.manifest.outputs.image-name }} -f value -c ID | wc -l)
@@ -200,10 +139,7 @@ jobs:
200139
- RL9
201140
image:
202141
- rocky-latest
203-
- rocky-latest-cuda
204142
exclude:
205-
- os_version: RL8
206-
image: rocky-latest-cuda
207143
- target_cloud: LEAFCLOUD
208144
env:
209145
OS_CLOUD: openstack

.github/workflows/s3-image-sync.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ jobs:
4242
build:
4343
- RL8
4444
- RL9
45-
- RL9-cuda
4645
env:
4746
ANSIBLE_FORCE_COLOR: True
4847
OS_CLOUD: openstack
@@ -112,7 +111,6 @@ jobs:
112111
build:
113112
- RL8
114113
- RL9
115-
- RL9-cuda
116114
exclude:
117115
- cloud: ${{ needs.image_upload.outputs.ci_cloud }}
118116

.github/workflows/stackhpc.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ on:
1212
- '!docs/**'
1313
- '!README.md'
1414
- '!.gitignore'
15+
- '!.github/workflows/**'  # ignore changes to other workflows; a glob is needed to match files under the directory
16+
- '.github/workflows/stackhpc.yml'  # but still run when this workflow itself changes (full filename required to match)
1517
pull_request:
1618
paths:
1719
- '**'
@@ -20,6 +22,8 @@ on:
2022
- '!docs/**'
2123
- '!README.md'
2224
- '!.gitignore'
25+
- '!.github/workflows/**'  # ignore changes to other workflows; a glob is needed to match files under the directory
26+
- '.github/workflows/stackhpc.yml'  # but still run when this workflow itself changes (full filename required to match)
2327
jobs:
2428
openstack:
2529
name: openstack-ci

.github/workflows/trivyscan.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ on:
1010
jobs:
1111
scan:
1212
concurrency:
13-
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} # to branch/PR + OS + build
13+
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} # to branch/PR + build
1414
cancel-in-progress: true
1515
runs-on: ubuntu-latest
1616
strategy:
1717
fail-fast: false
1818
matrix:
19-
build: ["RL8", "RL9", "RL9-cuda"]
19+
build: ["RL8", "RL9"]
2020
env:
2121
JSON_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
2222
OS_CLOUD: openstack
@@ -94,12 +94,13 @@ jobs:
9494
timeout: 15m
9595
env:
9696
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
97+
TRIVY_DB_REPOSITORY: ghcr.io/azimuth-cloud/trivy-db:2
9798

9899
- name: Upload Trivy scan results to GitHub Security tab
99100
uses: github/codeql-action/upload-sarif@v3
100101
with:
101102
sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif"
102-
category: "${{ matrix.os_version }}-${{ matrix.build }}"
103+
category: "${{ matrix.build }}"
103104

104105
- name: Fail if scan has CRITICAL vulnerabilities
105106
uses: aquasecurity/[email protected]
@@ -114,3 +115,4 @@ jobs:
114115
timeout: 15m
115116
env:
116117
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
118+
TRIVY_DB_REPOSITORY: ghcr.io/azimuth-cloud/trivy-db:2

README.md

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,21 +55,28 @@ You will also need to install [OpenTofu](https://opentofu.org/docs/intro/install
5555

5656
### Create a new environment
5757

58-
Use the `cookiecutter` template to create a new environment to hold your configuration. In the repository root run:
58+
Run the following from the repository root to activate the venv:
5959

6060
. venv/bin/activate
61+
62+
Use the `cookiecutter` template to create a new environment to hold your configuration:
63+
6164
cd environments
6265
cookiecutter skeleton
6366

6467
and follow the prompts to complete the environment name and description.
6568

6669
**NB:** In subsequent sections this new environment is referred to as `$ENV`.
6770

68-
Now generate secrets for this environment:
71+
Activate the new environment:
72+
73+
. environments/$ENV/activate
74+
75+
And generate secrets for it:
6976

7077
ansible-playbook ansible/adhoc/generate-passwords.yml
7178

72-
### Define infrastructure configuration
79+
### Define and deploy infrastructure
7380

7481
Create an OpenTofu variables file to define the required infrastructure, e.g.:
7582

@@ -91,20 +98,28 @@ Create an OpenTofu variables file to define the required infrastructure, e.g.:
9198
}
9299
}
93100

94-
Variables marked `*` refer to OpenStack resources which must already exist. The above is a minimal configuration - for all variables
95-
and descriptions see `environments/$ENV/terraform/terraform.tfvars`.
101+
Variables marked `*` refer to OpenStack resources which must already exist. The above is a minimal configuration - for all variables and descriptions see `environments/$ENV/terraform/terraform.tfvars`.
102+
103+
To deploy this infrastructure, ensure the venv and the environment are [activated](#create-a-new-environment) and run:
96104

97-
### Deploy appliance
105+
export OS_CLOUD=openstack
106+
cd environments/$ENV/terraform/
107+
tofu apply
108+
109+
and follow the prompts. Note the OS_CLOUD environment variable assumes that OpenStack credentials are defined using a [clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) file in a default location with the default cloud name of `openstack`.
110+
111+
### Configure appliance
112+
113+
To configure the appliance, ensure the venv and the environment are [activated](#create-a-new-environment) and run:
98114

99115
ansible-playbook ansible/site.yml
100116

101-
You can now log in to the cluster using:
117+
Once it completes you can log in to the cluster using:
102118

103119
ssh rocky@$login_ip
104120

105121
where the IP of the login node is given in `environments/$ENV/inventory/hosts.yml`
106122

107-
108123
## Overview of directory structure
109124

110125
- `environments/`: See [docs/environments.md](docs/environments.md).

ansible/.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,5 @@ roles/*
6060
!roles/tuned/**
6161
!roles/compute_init/
6262
!roles/compute_init/**
63-
63+
!roles/lustre/
64+
!roles/lustre/**

0 commit comments

Comments
 (0)