Test deployment and reimage on OpenStack #2835
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# CI workflow: deploy a cluster on OpenStack, then reimage it.
# It can be run standalone or as part of the main CI workflow;
# see the workflow file 'main.yml' for how CI triggers it.
name: Test deployment and reimage on OpenStack

on:
  workflow_call:
  workflow_dispatch:

permissions:
  contents: read
  packages: write
  # Needed to report GitHub Actions status checks
  statuses: write
jobs:
  openstack:
    name: openstack-ci
    # One concurrency group per workflow + branch/PR + OS; a newer run in the
    # same group cancels the one in progress.
    concurrency:
      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}
      cancel-in-progress: true
    runs-on: ubuntu-22.04
    strategy:
      fail-fast: false  # allow other matrix jobs to continue even if one fails
      matrix:
        os_version:
          # Temporarily disable Rocky 8 to speed up testing and reduce CI resources
          # - RL8
          - RL9
    env:
      # Quoted so the value is passed as a string, not parsed as a YAML boolean.
      ANSIBLE_FORCE_COLOR: "True"
      OS_CLOUD: openstack
      # Cluster name is unique per OS version and workflow run number.
      TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
      CI_CLOUD: ${{ vars.CI_CLOUD }}  # default from repo settings
      TF_VAR_os_version: ${{ matrix.os_version }}
      STACKHPC_TF_DIR: environments/.stackhpc/tofu
    steps:
# Look up the tag of the latest GitHub release and export it to later steps
# as LATEST_RELEASE_TAG. The step fails if the API call fails or returns no
# tag, rather than exporting an empty or "null" value.
- name: Find the latest release
  shell: bash
  run: |
    set -euo pipefail
    latest_tag=$(curl --fail --silent --show-error --location \
      https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest \
      | jq -r '.tag_name // empty')
    if [[ -z "$latest_tag" ]]; then
      echo "Could not determine the latest release tag" >&2
      exit 1
    fi
    echo "LATEST_RELEASE_TAG=${latest_tag}" >> "$GITHUB_ENV"
# Check out the ref stored in LATEST_RELEASE_TAG; fetch-depth 0 fetches the
# full history rather than a shallow clone.
- name: Checkout latest release
  uses: actions/checkout@v4
  with:
    fetch-depth: 0
    ref: ${{ env.LATEST_RELEASE_TAG }}
# On pull_request events, a label of the form 'CI_CLOUD=<value>' overrides the
# CI_CLOUD environment variable for all later steps.
- name: Override CI_CLOUD if PR label is present
  if: ${{ github.event_name == 'pull_request' }}
  env:
    # Passed through the environment so label text is never parsed as shell.
    PR_LABELS_JSON: ${{ toJSON(github.event.pull_request.labels) }}
  run: |
    label_names=$(jq -r '.[].name' <<< "$PR_LABELS_JSON")
    echo "$label_names"
    # Iterate over the labels, one per line
    while IFS= read -r label; do
      if [[ $label == CI_CLOUD=* ]]; then
        # Extract the value after 'CI_CLOUD='
        CI_CLOUD_OVERRIDE=${label#CI_CLOUD=}
        echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> "$GITHUB_ENV"
      fi
    done <<< "$label_names"
# Print the release tag and target cloud chosen by the earlier steps.
- name: Record debug info
  run: |
    printf 'LATEST_RELEASE_TAG: %s\n' "$LATEST_RELEASE_TAG"
    printf 'CI_CLOUD: %s\n' "$CI_CLOUD"
# Write the private key from the '<CI_CLOUD>_SSH_KEY' secret to ~/.ssh/id_rsa.
- name: Setup ssh
  shell: bash
  env:
    # Passed through the environment so the key is not pasted into the script.
    SSH_PRIVATE_KEY: ${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}
  run: |
    # No 'set -x' here: command tracing would echo the key write to the log.
    mkdir -p ~/.ssh
    printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
    chmod 0600 ~/.ssh/id_rsa
# Append the repository's bastion fingerprints file to the runner's known_hosts.
- name: Add bastion's ssh key to known_hosts
  shell: bash
  run: |
    cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
# Run the repository's environment setup script (as checked out at the release tag).
- name: Install ansible, pip and galaxy requirements
  run: |
    dev/setup-env.sh
# NOTE(review): the action ref on the 'uses' line was garbled to
# '[email protected]' in the source; restored to the v1 major tag of
# opentofu/setup-opentofu. Confirm the exact pinned version.
- name: Install OpenTofu
  uses: opentofu/setup-opentofu@v1
  with:
    # Quoted so the version is always treated as a string.
    tofu_version: "1.9.0"
# Run 'tofu init' inside the directory named by STACKHPC_TF_DIR.
- name: Initialise tofu
  working-directory: ${{ env.STACKHPC_TF_DIR }}
  run: |
    tofu init
# Write the '<CI_CLOUD>_CLOUDS_YAML' secret to ~/.config/openstack/clouds.yaml.
- name: Write clouds.yaml
  shell: bash
  env:
    # Passed through the environment so quotes, '$' or backticks in the secret
    # are written verbatim instead of being interpreted by the shell.
    CLOUDS_YAML: ${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}
  run: |
    mkdir -p ~/.config/openstack/
    printf '%s\n' "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml
    # The file holds credentials: keep it readable by the owner only.
    chmod 0600 ~/.config/openstack/clouds.yaml
# Write the demo user's password into the environment's inventory group_vars.
- name: Setup environment-specific inventory/tofu inputs
  env:
    DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
  run: |
    source venv/bin/activate
    source environments/.stackhpc/activate
    test_user_vars="$APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml"
    echo "vault_demo_user_password: $DEMO_USER_PASSWORD" > "$test_user_vars"
# Apply the tofu configuration with the tfvars file for the selected cloud.
# The step id is referenced by the cleanup step that runs if this one fails.
- name: Provision nodes using latest release image
  id: provision_servers
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    cd "$STACKHPC_TF_DIR"
    # CI_CLOUD is read as a shell variable: it may come from a PR label, so it
    # must not be pasted into the script text by expression interpolation.
    tofu apply -auto-approve -var-file="${CI_CLOUD}.tfvars"
# Cleanup that runs only when the 'provision_servers' step itself failed.
- name: Delete infrastructure if provisioning failed
  if: failure() && steps.provision_servers.outcome == 'failure'
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    cd "$STACKHPC_TF_DIR"
    # CI_CLOUD is read as a shell variable: it may come from a PR label, so it
    # must not be pasted into the script text by expression interpolation.
    tofu destroy -auto-approve -var-file="${CI_CLOUD}.tfvars"
# Wait for every host to accept connections, generate passwords, run the
# site playbook and then the Slurm check playbook.
- name: Configure cluster at latest release
  run: |
    source venv/bin/activate
    source environments/.stackhpc/activate
    ansible all -m wait_for_connection
    ansible-playbook ansible/adhoc/generate-passwords.yml
    ansible-playbook -v ansible/site.yml
    ansible-playbook -v ansible/ci/check_slurm.yml
# Run the hpctests playbook, restricted to the 'pingpong' tag.
- name: Run MPI-based tests at latest release
  run: |
    source venv/bin/activate
    source environments/.stackhpc/activate
    ansible-playbook -vv ansible/adhoc/hpctests.yml --tags pingpong
# The EESSI test step below is currently disabled.
# - name: Run EESSI tests
#   run: |
#     . venv/bin/activate
#     . environments/.stackhpc/activate
#     ansible-playbook -vv ansible/ci/check_eessi.yml
# Switch the working tree from the release tag to the branch under test:
# the PR head branch if set, otherwise the ref that triggered the run.
- name: Checkout current branch
  env:
    # Branch names are user-controlled: pass the value through the environment
    # and quote it so it is never interpreted as shell syntax.
    CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }}
  run: git checkout "$CURRENT_BRANCH"
# Re-run the environment setup script, now from the current branch's checkout.
- name: Update ansible, pip and galaxy requirements
  run: |
    dev/setup-env.sh
# Re-initialise and re-apply the tofu configuration from the current branch.
- name: Reimage login and control nodes to image in current branch
  id: reimage_non_compute
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    cd "$STACKHPC_TF_DIR"
    tofu init
    # CI_CLOUD is read as a shell variable: it may come from a PR label, so it
    # must not be pasted into the script text by expression interpolation.
    tofu apply -auto-approve -var-file="${CI_CLOUD}.tfvars"
# Same configuration sequence as for the release, run from the current branch:
# wait for hosts, generate passwords, site playbook, Slurm check.
- name: Configure cluster using current branch
  run: |
    source venv/bin/activate
    source environments/.stackhpc/activate
    ansible all -m wait_for_connection
    ansible-playbook ansible/adhoc/generate-passwords.yml
    ansible-playbook -v ansible/site.yml
    ansible-playbook -v ansible/ci/check_slurm.yml
# Run the rebuild-via-slurm playbook, then re-run the Slurm check playbook.
- name: Reimage compute nodes to image in current branch using slurm
  run: |
    source venv/bin/activate
    source environments/.stackhpc/activate
    ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml
    ansible-playbook -v ansible/ci/check_slurm.yml
# Run the check_sacct_hpctests playbook after the reimage.
- name: Check sacct state survived reimage to current branch
  run: |
    source venv/bin/activate
    source environments/.stackhpc/activate
    ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
# Run the check_grafana playbook.
- name: Check MPI-based tests are shown in Grafana
  run: |
    source venv/bin/activate
    source environments/.stackhpc/activate
    ansible-playbook -vv ansible/ci/check_grafana.yml
# Run the full hpctests playbook (no tag filter this time).
- name: Run MPI-based tests again in current branch
  run: |
    source venv/bin/activate
    source environments/.stackhpc/activate
    ansible-playbook -vv ansible/adhoc/hpctests.yml
# Open a SOCKS proxy through the bastion and check that the Open OnDemand
# server answers an authenticated request with "200 OK".
- name: Confirm Open Ondemand is up (via SOCKS proxy)
  env:
    DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    # load ansible variables into shell:
    ansible-playbook ansible/ci/output_vars.yml \
      -e output_vars_hosts=openondemand \
      -e output_vars_path="$APPLIANCES_ENVIRONMENT_ROOT/vars.txt" \
      -e output_vars_items=bastion_ip,bastion_user,openondemand_servername
    source "$APPLIANCES_ENVIRONMENT_ROOT/vars.txt"
    # setup ssh proxying:
    # refresh the package index first so the install does not hit stale mirrors
    sudo apt-get update
    sudo apt-get --yes install proxychains
    echo proxychains installed
    ssh -v -fN -D 9050 "${bastion_user}@${bastion_ip}"
    echo port 9050 forwarded
    # check OOD server returns 200:
    # '|| true' keeps the script alive when wget exits non-zero, so the
    # response is still printed by the check below before the step fails.
    statuscode=$(proxychains wget \
      --quiet \
      --spider \
      --server-response \
      --no-check-certificate \
      --http-user=demo_user \
      --http-password="${DEMO_USER_PASSWORD}" "https://${openondemand_servername}" \
      2>&1) || true
    (echo "$statuscode" | grep "200 OK") || (echo "$statuscode" && exit 1)
# Delete the volume snapshot named by SNAPSHOT, if that variable is non-empty.
# NOTE(review): no step visible in this workflow sets SNAPSHOT; presumably it
# is provided by the sourced environment or elsewhere - confirm.
- name: Delete possible volume snapshot from slurm upgrade
  run: |
    source venv/bin/activate
    source environments/.stackhpc/activate
    if [ -n "$SNAPSHOT" ]; then
      echo "Deleting $SNAPSHOT"
      openstack volume snapshot delete "$SNAPSHOT"
    fi
# Final teardown. Runs when the job succeeded or was cancelled, but not when an
# earlier step failed. A failing destroy is reported without failing the step.
- name: Delete infrastructure
  if: ${{ success() || cancelled() }}
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    cd "$STACKHPC_TF_DIR"
    # CI_CLOUD is read as a shell variable: it may come from a PR label, so it
    # must not be pasted into the script text by expression interpolation.
    tofu destroy -auto-approve -var-file="${CI_CLOUD}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR"