Skip to content

Commit 3045637

Browse files
authored
Merge branch 'feature/k3s-monitoring' into feat/loki
2 parents 1b3e94d + 8ca0407 commit 3045637

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+607
-268
lines changed

.github/workflows/fatimage.yml

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,29 +20,23 @@ jobs:
2020
runs-on: ubuntu-22.04
2121
strategy:
2222
fail-fast: false # allow other matrix jobs to continue even if one fails
23-
matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions
23+
matrix: # build RL8, RL9
2424
os_version:
2525
- RL8
2626
- RL9
2727
build:
2828
- openstack.openhpc
29-
- openstack.openhpc-cuda
30-
exclude:
31-
- os_version: RL8
32-
build: openstack.openhpc-cuda
3329
env:
3430
ANSIBLE_FORCE_COLOR: True
3531
OS_CLOUD: openstack
3632
CI_CLOUD: ${{ github.event.inputs.ci_cloud }}
3733
SOURCE_IMAGES_MAP: |
3834
{
3935
"RL8": {
40-
"openstack.openhpc": "rocky-latest-RL8",
41-
"openstack.openhpc-cuda": "rocky-latest-cuda-RL8"
36+
"openstack.openhpc": "rocky-latest-RL8"
4237
},
4338
"RL9": {
44-
"openstack.openhpc": "rocky-latest-RL9",
45-
"openstack.openhpc-cuda": "rocky-latest-cuda-RL9"
39+
"openstack.openhpc": "rocky-latest-RL9"
4640
}
4741
}
4842
@@ -117,4 +111,4 @@ jobs:
117111
path: |
118112
./image-id.txt
119113
./image-name.txt
120-
overwrite: true
114+
overwrite: true

.github/workflows/nightlybuild.yml

Lines changed: 2 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,12 @@ jobs:
2222
runs-on: ubuntu-22.04
2323
strategy:
2424
fail-fast: false # allow other matrix jobs to continue even if one fails
25-
matrix: # build RL8, RL9, RL9+CUDA versions
25+
matrix: # build RL8, RL9
2626
os_version:
2727
- RL8
2828
- RL9
2929
build:
3030
- openstack.rocky-latest
31-
- openstack.rocky-latest-cuda
32-
exclude:
33-
- os_version: RL8
34-
build: openstack.rocky-latest-cuda
35-
3631
env:
3732
ANSIBLE_FORCE_COLOR: True
3833
OS_CLOUD: openstack
@@ -108,68 +103,12 @@ jobs:
108103
echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
109104
echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
110105
111-
- name: Download image
106+
- name: Make image usable for further builds
112107
run: |
113108
. venv/bin/activate
114-
sudo mkdir /mnt/images
115-
sudo chmod 777 /mnt/images
116109
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
117-
openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-id }}
118-
119-
- name: Set up QEMU
120-
uses: docker/setup-qemu-action@v3
121-
122-
- name: install libguestfs
123-
run: |
124-
sudo apt -y update
125-
sudo apt -y install libguestfs-tools
126-
127-
- name: mkdir for mount
128-
run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}'
129-
130-
- name: mount qcow2 file
131-
run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
132-
133-
- name: Run Trivy vulnerability scanner
134-
uses: aquasecurity/[email protected]
135-
with:
136-
scan-type: fs
137-
scan-ref: "${{ steps.manifest.outputs.image-name }}"
138-
scanners: "vuln"
139-
format: sarif
140-
output: "${{ steps.manifest.outputs.image-name }}.sarif"
141-
# turn off secret scanning to speed things up
142-
env:
143-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
144-
145-
- name: Upload Trivy scan results to GitHub Security tab
146-
uses: github/codeql-action/upload-sarif@v3
147-
with:
148-
sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif"
149-
category: "${{ matrix.os_version }}-${{ matrix.build }}"
150-
151-
- name: Fail if scan has CRITICAL vulnerabilities
152-
uses: aquasecurity/[email protected]
153-
with:
154-
scan-type: fs
155-
scan-ref: "${{ steps.manifest.outputs.image-name }}"
156-
scanners: "vuln"
157-
format: table
158-
exit-code: '1'
159-
severity: 'CRITICAL'
160-
ignore-unfixed: true
161-
env:
162-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
163-
164-
- name: Delete new image if Trivy scan fails
165-
if: failure() && steps.packer_build.outcome == 'success' # Runs if the Trivy scan found crit vulnerabilities or failed
166-
run: |
167-
. venv/bin/activate
168-
echo "Deleting new image due to critical vulnerabilities or scan failure ..."
169-
openstack image delete "${{ steps.manifest.outputs.image-id }}"
170110
171111
- name: Delete old latest image
172-
if: success() # Runs only if Trivy scan passed
173112
run: |
174113
. venv/bin/activate
175114
IMAGE_COUNT=$(openstack image list --name ${{ steps.manifest.outputs.image-name }} -f value -c ID | wc -l)
@@ -200,10 +139,7 @@ jobs:
200139
- RL9
201140
image:
202141
- rocky-latest
203-
- rocky-latest-cuda
204142
exclude:
205-
- os_version: RL8
206-
image: rocky-latest-cuda
207143
- target_cloud: LEAFCLOUD
208144
env:
209145
OS_CLOUD: openstack

.github/workflows/s3-image-sync.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ jobs:
4242
build:
4343
- RL8
4444
- RL9
45-
- RL9-cuda
4645
env:
4746
ANSIBLE_FORCE_COLOR: True
4847
OS_CLOUD: openstack
@@ -112,7 +111,6 @@ jobs:
112111
build:
113112
- RL8
114113
- RL9
115-
- RL9-cuda
116114
exclude:
117115
- cloud: ${{ needs.image_upload.outputs.ci_cloud }}
118116

.github/workflows/trivyscan.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ on:
1010
jobs:
1111
scan:
1212
concurrency:
13-
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} # to branch/PR + OS + build
13+
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} # to branch/PR + build
1414
cancel-in-progress: true
1515
runs-on: ubuntu-latest
1616
strategy:
1717
fail-fast: false
1818
matrix:
19-
build: ["RL8", "RL9", "RL9-cuda"]
19+
build: ["RL8", "RL9"]
2020
env:
2121
JSON_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
2222
OS_CLOUD: openstack
@@ -94,12 +94,13 @@ jobs:
9494
timeout: 15m
9595
env:
9696
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
97+
TRIVY_DB_REPOSITORY: ghcr.io/azimuth-cloud/trivy-db:2
9798

9899
- name: Upload Trivy scan results to GitHub Security tab
99100
uses: github/codeql-action/upload-sarif@v3
100101
with:
101102
sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif"
102-
category: "${{ matrix.os_version }}-${{ matrix.build }}"
103+
category: "${{ matrix.build }}"
103104

104105
- name: Fail if scan has CRITICAL vulnerabilities
105106
uses: aquasecurity/[email protected]
@@ -114,3 +115,4 @@ jobs:
114115
timeout: 15m
115116
env:
116117
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
118+
TRIVY_DB_REPOSITORY: ghcr.io/azimuth-cloud/trivy-db:2

README.md

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,21 +55,28 @@ You will also need to install [OpenTofu](https://opentofu.org/docs/intro/install
5555

5656
### Create a new environment
5757

58-
Use the `cookiecutter` template to create a new environment to hold your configuration. In the repository root run:
58+
Run the following from the repository root to activate the venv:
5959

6060
. venv/bin/activate
61+
62+
Use the `cookiecutter` template to create a new environment to hold your configuration:
63+
6164
cd environments
6265
cookiecutter skeleton
6366

6467
and follow the prompts to complete the environment name and description.
6568

6669
**NB:** In subsequent sections this new environment is referred to as `$ENV`.
6770

68-
Now generate secrets for this environment:
71+
Activate the new environment:
72+
73+
. environments/$ENV/activate
74+
75+
And generate secrets for it:
6976

7077
ansible-playbook ansible/adhoc/generate-passwords.yml
7178

72-
### Define infrastructure configuration
79+
### Define and deploy infrastructure
7380

7481
Create an OpenTofu variables file to define the required infrastructure, e.g.:
7582

@@ -91,20 +98,28 @@ Create an OpenTofu variables file to define the required infrastructure, e.g.:
9198
}
9299
}
93100

94-
Variables marked `*` refer to OpenStack resources which must already exist. The above is a minimal configuration - for all variables
95-
and descriptions see `environments/$ENV/terraform/terraform.tfvars`.
101+
Variables marked `*` refer to OpenStack resources which must already exist. The above is a minimal configuration - for all variables and descriptions see `environments/$ENV/terraform/terraform.tfvars`.
102+
103+
To deploy this infrastructure, ensure the venv and the environment are [activated](#create-a-new-environment) and run:
96104

97-
### Deploy appliance
105+
export OS_CLOUD=openstack
106+
cd environments/$ENV/terraform/
107+
tofu apply
108+
109+
and follow the prompts. Note that setting the `OS_CLOUD` environment variable to `openstack` assumes that OpenStack credentials are defined using a [clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) file in a default location with the default cloud name of `openstack`.
110+
111+
### Configure appliance
112+
113+
To configure the appliance, ensure the venv and the environment are [activated](#create-a-new-environment) and run:
98114

99115
ansible-playbook ansible/site.yml
100116

101-
You can now log in to the cluster using:
117+
Once it completes you can log in to the cluster using:
102118

103119
ssh rocky@$login_ip
104120

105121
where the IP of the login node is given in `environments/$ENV/inventory/hosts.yml`
106122

107-
108123
## Overview of directory structure
109124

110125
- `environments/`: See [docs/environments.md](docs/environments.md).

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,5 @@ roles/*
6464
!roles/k9s/**
6565
!roles/kube_prometheus_stack
6666
!roles/kube_prometheus_stack/**
67+
!roles/lustre/
68+
!roles/lustre/**

ansible/bootstrap.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,3 +266,4 @@
266266
tasks:
267267
- ansible.builtin.include_role:
268268
name: k3s
269+
tasks_from: install.yml

ansible/cleanup.yml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,6 @@
3838

3939
- name: Cleanup /tmp
4040
command : rm -rf /tmp/*
41-
42-
- name: Delete ansible-init sentinel file created if ansible-init has run during build
43-
ansible.builtin.file:
44-
path: /var/lib/ansible-init.done
45-
state: absent
4641

4742
- name: Get package facts
4843
package_facts:

ansible/fatimage.yml

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
- hosts: builder
2727
become: yes
28-
gather_facts: no
28+
gather_facts: yes
2929
tasks:
3030
# - import_playbook: iam.yml
3131
- name: Install FreeIPA client
@@ -44,6 +44,11 @@
4444
name: stackhpc.os-manila-mount
4545
tasks_from: install.yml
4646
when: "'manila' in group_names"
47+
- name: Install Lustre packages
48+
include_role:
49+
name: lustre
50+
tasks_from: install.yml
51+
when: "'lustre' in group_names"
4752

4853
- import_playbook: extras.yml
4954

@@ -57,6 +62,7 @@
5762
name: mysql
5863
tasks_from: install.yml
5964
when: "'mysql' in group_names"
65+
6066
- name: OpenHPC
6167
import_role:
6268
name: stackhpc.openhpc
@@ -83,18 +89,21 @@
8389
import_role:
8490
name: openondemand
8591
tasks_from: vnc_compute.yml
92+
8693
when: "'openondemand_desktop' in group_names"
94+
8795
- name: Open Ondemand jupyter node
8896
import_role:
8997
name: openondemand
9098
tasks_from: jupyter_compute.yml
91-
when: "'openondemand' in group_names"
99+
when: "'openondemand_jupyter' in group_names"
92100

93101
# - import_playbook: monitoring.yml:
94102
- import_role:
95103
name: opensearch
96104
tasks_from: install.yml
97105
when: "'opensearch' in group_names"
106+
98107
# slurm_stats - nothing to do
99108
- import_role:
100109
name: filebeat
@@ -114,6 +123,10 @@
114123
slurm_exporter_state: stopped
115124
when: "'slurm_exporter' in group_names"
116125

126+
- hosts: prometheus
127+
become: yes
128+
gather_facts: yes
129+
tasks:
117130
- name: kube prometheus stack
118131
import_role:
119132
name: kube_prometheus_stack

ansible/filesystems.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,13 @@
2424
tasks:
2525
- include_role:
2626
name: stackhpc.os-manila-mount
27+
28+
- name: Setup Lustre clients
29+
hosts: lustre
30+
become: true
31+
tags: lustre
32+
tasks:
33+
- include_role:
34+
name: lustre
35+
# NB install is ONLY run in builder
36+
tasks_from: configure.yml

0 commit comments

Comments
 (0)