Skip to content

Commit 3adc8e4

Browse files
committed
Merge branch 'main' into cuda
2 parents 96ad27f + 999cfc8 commit 3adc8e4

File tree

36 files changed

+459
-79
lines changed

36 files changed

+459
-79
lines changed

.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
./.github/CODEOWNERS
1+
* @stackhpc/batch

.github/workflows/fatimage.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,14 @@ jobs:
4848
. venv/bin/activate
4949
. environments/.stackhpc/activate
5050
cd packer/
51-
packer init
51+
packer init .
5252
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
5353
5454
- name: Get created image name from manifest
5555
id: manifest
5656
run: |
5757
. venv/bin/activate
58-
IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer-manifest.json)
58+
IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
5959
while ! openstack image show -f value -c name $IMAGE_ID; do
6060
sleep 30
6161
done

.github/workflows/stackhpc.yml

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -66,24 +66,13 @@ jobs:
6666
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
6767
terraform apply -auto-approve
6868
69-
- name: Get server provisioning failure messages
70-
id: provision_failure
71-
run: |
72-
. venv/bin/activate
73-
. environments/.stackhpc/activate
74-
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
75-
TF_FAIL_MSGS="$(../../skeleton/\{\{cookiecutter.environment\}\}/terraform/getfaults.py $PWD)"
76-
echo TF failure messages: $TF_FAIL_MSGS
77-
echo "::set-output name=messages::${TF_FAIL_MSGS}"
78-
if: always() && steps.provision_servers.outcome == 'failure'
79-
80-
- name: Delete infrastructure if failed due to lack of hosts
69+
- name: Delete infrastructure if provisioning failed
8170
run: |
8271
. venv/bin/activate
8372
. environments/.stackhpc/activate
8473
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
8574
terraform destroy -auto-approve
86-
if: ${{ always() && steps.provision_servers.outcome == 'failure' && contains(steps.provision_failure.messages, 'not enough hosts available') }}
75+
if: failure() && steps.provision_servers.outcome == 'failure'
8776

8877
- name: Configure cluster
8978
run: |
@@ -99,6 +88,12 @@ jobs:
9988
. environments/.stackhpc/activate
10089
ansible-playbook -vv ansible/adhoc/hpctests.yml
10190
91+
- name: Run EESSI tests
92+
run: |
93+
. venv/bin/activate
94+
. environments/.stackhpc/activate
95+
ansible-playbook -vv ansible/ci/check_eessi.yml
96+
10297
- name: Confirm Open Ondemand is up (via SOCKS proxy)
10398
run: |
10499
. venv/bin/activate

ansible/.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,9 @@ roles/*
3636
!roles/systemd/**
3737
!roles/cuda/
3838
!roles/cuda/**
39+
!roles/freeipa/
40+
!roles/freeipa/**
41+
!roles/proxy/
42+
!roles/proxy/**
43+
!roles/resolv_conf/
44+
!roles/resolv_conf/**

ansible/bootstrap.yml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,14 @@
1313
to update these variable names. ** NB: The actual secrets will not be changed.**
1414
when: "'secrets_openhpc_' in (hostvars[inventory_hostname] | join)"
1515

16+
- hosts: resolv_conf
17+
become: yes
18+
gather_facts: false
19+
tags: resolv_conf
20+
tasks:
21+
- import_role:
22+
name: resolv_conf
23+
1624
- hosts: etc_hosts
1725
gather_facts: false
1826
tags: etc_hosts
@@ -21,6 +29,14 @@
2129
- import_role:
2230
name: etc_hosts
2331

32+
- hosts: proxy
33+
gather_facts: false
34+
tags: proxy
35+
become: yes
36+
tasks:
37+
- import_role:
38+
name: proxy
39+
2440
- hosts: cluster
2541
gather_facts: false
2642
tasks:
@@ -96,6 +112,16 @@
96112
tasks_from: config.yml
97113
tags: config
98114

115+
- name: Setup EESSI
116+
hosts: eessi
117+
tags: eessi
118+
become: true
119+
gather_facts: false
120+
tasks:
121+
- name: Install and configure EESSI
122+
import_role:
123+
name: eessi
124+
99125
- hosts: update
100126
gather_facts: false
101127
become: yes

ansible/ci/check_eessi.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
---
2+
- name: Run EESSI test job
3+
hosts: login[0]
4+
vars:
5+
eessi_test_rootdir: /home/eessi_test
6+
tasks:
7+
- name: Create test root directory
8+
file:
9+
path: "{{ eessi_test_rootdir }}"
10+
state: directory
11+
owner: "{{ ansible_user }}"
12+
group: "{{ ansible_user }}"
13+
become: true
14+
15+
- name: Clone eessi-demo repo
16+
ansible.builtin.git:
17+
repo: "https://github.com/eessi/eessi-demo.git"
18+
dest: "{{ eessi_test_rootdir }}/eessi-demo"
19+
20+
- name: Run test job
21+
ansible.builtin.shell:
22+
cmd: |
23+
source /cvmfs/pilot.eessi-hpc.org/latest/init/bash
24+
srun ./run.sh
25+
chdir: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow"
26+
executable: /bin/bash
27+
register: job_output
28+
29+
- name: Fail if job output contains error
30+
fail:
31+
# Note: Job prints live progress bar to terminal, so use regex filter to remove this from stdout
32+
msg: "Test job using EESSI modules failed. Job output was: {{ job_output.stdout | regex_replace('\b', '') }}"
33+
when: '"Epoch 5/5" not in job_output.stdout'
34+

ansible/ci/check_sacct_hpctests.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
gather_facts: false
33
become: true
44
vars:
5-
sacct_stdout_expected: |- # based on CI running hpctests as the first job - NB note no trailing newline
5+
sacct_stdout_expected: |- # based on CI running hpctests as the first job
66
JobID,JobName,State
77
1,pingpong.sh,COMPLETED
88
2,pingmatrix.sh,COMPLETED
@@ -18,10 +18,10 @@
1818
register: sacct
1919
- name: Check info for ended jobs
2020
assert:
21-
that: sacct.stdout == sacct_stdout_expected
21+
that: sacct_stdout_expected in sacct.stdout
2222
fail_msg: |
2323
Expected:
2424
--{{ sacct_stdout_expected }}--
2525
Got:
2626
--{{ sacct.stdout }}--
27-
success_msg: sacct shows hpctests jobs as first and only jobs
27+
success_msg: sacct shows hpctests jobs as first jobs in list

ansible/cleanup.yml

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,28 @@
55
- name: Remove dnf caches
66
command: dnf clean all
77

8+
# If image build happens on a Neutron subnet with property dns_nameservers defined, then cloud-init
9+
# disables NetworkManager's control of /etc/resolv.conf and appends nameservers itself.
10+
# We don't want network configuration during instance boot to depend on the configuration
11+
# of the network the builder was on, so we reset these aspects.
812
- name: Delete /etc/resolv.conf
9-
# required as if cloud-init (rather than network manager) controls this on next boot it won't be entirely overrwritten
1013
file:
1114
path: /etc/resolv.conf
1215
state: absent
16+
when: "'resolv_conf' not in group_names" # if it's been overridden, deleting it is the wrong thing to do
1317

14-
- name: Delete any injected ssh config for rocky
18+
- name: Reenable NetworkManager control of resolv.conf
19+
# NB: This *doesn't* delete the 90-dns-none.conf file created by the resolv_conf role
20+
# as if nameservers are explicitly being set by that role we don't want to allow NM
21+
# to override it again.
1522
file:
16-
path: /home/rocky/.ssh/
23+
path: /etc/NetworkManager/conf.d/99-cloud-init.conf
24+
state: absent
25+
26+
- name: Delete any injected ssh config for ansible_user
27+
file:
28+
path: "/home/{{ ansible_user }}/.ssh/"
1729
state: absent
1830

1931
- name: Run cloud-init cleanup
2032
command: cloud-init clean --logs --seed
21-

ansible/fatimage.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,21 @@
11
# Builder version of site.yml just installing binaries
22

3+
- name: Run pre.yml hook
4+
vars:
5+
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
6+
hook_path: "{{ appliances_environment_root }}/hooks/pre.yml"
7+
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
8+
when: hook_path | exists
9+
310
- import_playbook: bootstrap.yml
411

12+
- name: Run post-bootstrap.yml hook
13+
vars:
14+
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
15+
hook_path: "{{ appliances_environment_root }}/hooks/post-bootstrap.yml"
16+
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
17+
when: hook_path | exists
18+
519
- hosts: builder
620
become: yes
721
gather_facts: no
@@ -100,6 +114,7 @@
100114
- name: unpack prometheus binaries
101115
become: false
102116
unarchive:
117+
remote_src: yes
103118
src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz"
104119
dest: "/tmp"
105120
creates: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}/prometheus"
@@ -124,6 +139,18 @@
124139

125140
# - import_playbook: iam.yml - nothing to do
126141

142+
- name: Run post.yml hook
143+
vars:
144+
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
145+
hook_path: "{{ appliances_environment_root }}/hooks/post.yml"
146+
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
147+
when: hook_path | exists
148+
149+
- hosts: builder
150+
become: yes
151+
gather_facts: no
152+
tasks:
153+
# - meta: end_here
127154
- name: Cleanup image
128155
import_tasks: cleanup.yml
129156

ansible/roles/eessi/README.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
EESSI
2+
=====
3+
4+
Configure the EESSI pilot repository for use on given hosts.
5+
6+
Requirements
7+
------------
8+
9+
None.
10+
11+
Role Variables
12+
--------------
13+
14+
- `cvmfs_quota_limit_mb`: Optional int. Maximum size of local package cache on each node in MB.
15+
- `cvmfs_config_overrides`: Optional dict. Set of key-value pairs for additional CernVM-FS settings; see the [official docs](https://cvmfs.readthedocs.io/en/stable/cpt-configure.html) for the list of options. Each dict key should correspond to a valid config variable (e.g. `CVMFS_HTTP_PROXY`) and the corresponding dict value will be set as the variable value (e.g. `https://my-proxy.com`). These configuration parameters will be written to the `/etc/cvmfs/default.local` config file on each host in the form `KEY=VALUE`.
16+
17+
Dependencies
18+
------------
19+
20+
None.
21+
22+
Example Playbook
23+
----------------
24+
25+
```yaml
26+
- name: Setup EESSI
27+
hosts: eessi
28+
tags: eessi
29+
become: true
30+
tasks:
31+
- name: Install and configure EESSI
32+
import_role:
33+
name: eessi
34+
```

0 commit comments

Comments
 (0)