diff --git a/.github/extra_vars/arcus.yml b/.github/extra_vars/arcus.yml
new file mode 100644
index 00000000..1f149b96
--- /dev/null
+++ b/.github/extra_vars/arcus.yml
@@ -0,0 +1,37 @@
+# Extra vars loaded by the CI workflow via `-e @${EXTRA_VARS_FILE}`
+
+# Cluster instance vars
+cluster_id: "{{ cluster_name }}"
+openhpc_slurm_partitions:
+  - name: "small"
+    count: 2
+    flavor_name: "vm.ska.cpu.general.small"
+    default: "YES"
+cluster_run_validation: true
+cluster_user_ssh_public_key: ""
+home_volume_size: 20
+cluster_use_root_volumes: true
+
+# Cloud vars
+cluster_external_network: "CUDN-Internet"
+login_flavor_name: "vm.ska.cpu.general.small"
+control_flavor_name: "vm.ska.cpu.general.small"
+metrics_db_maximum_size: 5
+
+# Image build
+image_build_manage_infra: false
+image_build_use_blockstorage_volume: true
+image_build_image_disk_format: "raw"
+image_build_volume_size: 10
+image_build_metadata:
+  hw_vif_multiqueue_enabled: "yes"
+  hw_scsi_model: "virtio-scsi"
+  hw_disk_bus: "scsi"
+  hw_qemu_guest_agent: "yes"
+  os_require_quiesce: "yes"
+image_build_flavor_name: "vm.ska.cpu.general.small"
+image_build_network_id: "4b6b2722-ee5b-40ec-8e52-a6610e14cc51"
+image_build_attach_floating_ip: true
+image_build_floating_ip_network: "CUDN-Internet"
+image_build_source_image_id: "2a77064b-be40-4065-b0f4-4d5417a4460a"
+image_build_security_group_id: "486dfc85-099b-4bbb-9375-60f320a7de18"
diff --git a/.github/workflows/build-image-deploy.yml b/.github/workflows/build-image-deploy.yml
new file mode 100644
index 00000000..2689a7e5
--- /dev/null
+++ b/.github/workflows/build-image-deploy.yml
@@ -0,0 +1,133 @@
+---
+# CI pipeline: build an OpenHPC image, deploy a throwaway cluster from it,
+# tear everything down, then delete or promote the image.
+name: Build, deploy and promote a new OHPC image
+on:
+  workflow_dispatch:
+    inputs:
+      promote_community:
+        description: 'Set the community property on a successfully tested image'
+        required: true
+        default: false
+        type: boolean
+  pull_request:
+  push:
+    branches:
+      - main
+    tags:
+      - '*'
+jobs:
+  build-deploy-promote:
+    name: Build, deploy and promote a new OHPC image
+    if: github.repository == 'stackhpc/caas-slurm-appliance'  # skip in other repositories
+    concurrency: ${{ github.ref }}  # one run at a time per ref
+    runs-on: ubuntu-20.04
+    env:
+      PACKER_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      ANSIBLE_FORCE_COLOR: "true"
+      OS_CLOUD: openstack
+      OS_CLIENT_CONFIG_FILE: ${{ github.workspace }}/clouds.yaml
+      EXTRA_VARS_FILE: .github/extra_vars/arcus.yml
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+
+      - name: Install ansible etc
+        run: dev/setup-env.sh
+
+      - name: Write clouds.yaml
+        run: |
+          echo "${CLOUDS_YAML}" > "${OS_CLIENT_CONFIG_FILE}"
+        shell: bash
+        env:
+          CLOUDS_YAML: ${{ secrets.CLOUDS_YAML }}
+
+      - name: Build OHPC image
+        id: build
+        run: |
+          source venv/bin/activate
+          ansible-playbook \
+            -i image-build/hosts \
+            -e "@${EXTRA_VARS_FILE}" \
+            -e '{"write_cluster_image_uuid_file": true}' \
+            -e image_build_cluster_name="image-build-${GITHUB_SHA::7}" \
+            image-build.yml
+          echo "CLUSTER_IMAGE=$(cat cluster_image_uuid.txt)" >> "${GITHUB_OUTPUT}"
+        env:
+          PACKER_LOG_PATH: ${{ github.workspace }}/packer-build.log
+
+      - name: Remove image build infra
+        run: |
+          source venv/bin/activate
+          ansible-playbook \
+            -i image-build/hosts \
+            -e "@${EXTRA_VARS_FILE}" \
+            -e cluster_state=absent \
+            -e image_build_cluster_name="image-build-${GITHUB_SHA::7}" \
+            image-build.yml
+        if: always()  # clean up even if the build failed or was cancelled
+
+      - name: Deploy a cluster based on the new OHPC image
+        id: deploy
+        run: |
+          source venv/bin/activate
+          ansible-playbook \
+            -i image-build/hosts \
+            -e "@${EXTRA_VARS_FILE}" \
+            -e cluster_image=${{ steps.build.outputs.CLUSTER_IMAGE }} \
+            -e cluster_name="caas-ci-${GITHUB_SHA::7}" \
+            slurm-infra.yml
+        env:
+          SLURM_INFRA_HIDE_DEBUG_OUTPUT: "true"
+        if: success()
+
+      - name: Remove cluster based on the new OHPC image
+        run: |
+          source venv/bin/activate
+          ansible-playbook \
+            -i image-build/hosts \
+            -e "@${EXTRA_VARS_FILE}" \
+            -e cluster_image=${{ steps.build.outputs.CLUSTER_IMAGE }} \
+            -e cluster_state=absent \
+            -e cluster_name="caas-ci-${GITHUB_SHA::7}" \
+            slurm-infra.yml
+        if: |
+          always() &&
+          steps.build.outcome == 'success'
+
+      - name: Delete built image after testing
+        run: |
+          source venv/bin/activate
+          ansible-playbook \
+            -i image-build/hosts \
+            -e cluster_image=${{ steps.build.outputs.CLUSTER_IMAGE }} \
+            -e '{"cluster_image_delete": true}' \
+            image-build/image-delete-or-promote.yml
+        if: |
+          always() &&
+          steps.build.outcome == 'success' &&
+          github.event_name == 'pull_request'
+
+      - name: Promote built image from Private to Community after testing
+        run: |
+          source venv/bin/activate
+          ansible-playbook \
+            -i image-build/hosts \
+            -e cluster_image=${{ steps.build.outputs.CLUSTER_IMAGE }} \
+            -e '{"cluster_image_promote_community": true}' \
+            image-build/image-delete-or-promote.yml
+        if: |
+          success() &&
+          steps.build.outcome == 'success' &&
+          steps.deploy.outcome == 'success' &&
+          (( github.event_name == 'workflow_dispatch' && inputs.promote_community == true )
+          || github.event_name == 'push' )
+
+      - name: Upload packer build log artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: packer-build-log
+          path: ${{ github.workspace }}/packer-build.log
+        if: always()  # keep the log whatever the job outcome
diff --git a/dev/setup-env.sh b/dev/setup-env.sh
new file mode 100755
index 00000000..d4fb4749
--- /dev/null
+++ b/dev/setup-env.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Create a virtualenv in ./venv and install the Python and Ansible Galaxy dependencies.
+set -euo pipefail
+
+/usr/bin/python3.8 -m venv venv
+source venv/bin/activate
+pip install -U pip
+pip install -r requirements.txt
+ansible --version
+# Install ansible dependencies ...
+ansible-galaxy role install -r requirements.yml --force
+ansible-galaxy collection install -r requirements.yml --force
diff --git a/image-build.yml b/image-build.yml
index ceb6c374..4ff9b6ed 100644
--- a/image-build.yml
+++ b/image-build.yml
@@ -21,4 +21,13 @@
         - name: Print cluster_image UUID
           debug:
             msg: "{{ cluster_image }}"
+
+        # Lets callers (e.g. the CI workflow) read the image UUID back from disk
+        - name: Write cluster_image UUID to file
+          copy:
+            dest: "{{ playbook_dir }}/cluster_image_uuid.txt"
+            content: "{{ cluster_image }}"
+          when:
+            - write_cluster_image_uuid_file is defined
+            - write_cluster_image_uuid_file | bool
       when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent")
diff --git a/image-build/image-delete-or-promote.yml b/image-build/image-delete-or-promote.yml
new file mode 100644
index 00000000..ec414c4e
--- /dev/null
+++ b/image-build/image-delete-or-promote.yml
@@ -0,0 +1,26 @@
+---
+# Delete an existing image or set its community property, depending on the
+# cluster_image_delete / cluster_image_promote_community flags.
+- hosts: openstack
+  tasks:
+    - block:
+        # Use command module because openstack.cloud.image doesn't
+        # support setting the community property
+        - name: Set image community property
+          command:
+            cmd: >-
+              openstack image set --community {{ cluster_image }}
+          changed_when: true
+          when:
+            - cluster_image_promote_community is defined
+            - cluster_image_promote_community | bool
+
+        - name: Delete image
+          openstack.cloud.image:
+            name: "{{ cluster_image }}"
+            state: absent
+          when:
+            - cluster_image_delete is defined
+            - cluster_image_delete | bool
+
+      when: cluster_image is defined
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..1a441e18
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+ansible==6.0.0
+openstacksdk<0.99.0
+python-openstackclient
+jmespath
+passlib[bcrypt]==1.7.4
+cookiecutter
+selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3
+netaddr
+matplotlib
diff --git a/roles/image_build/defaults/main.yml b/roles/image_build/defaults/main.yml
index 37a3503d..d8348908 100644
--- a/roles/image_build/defaults/main.yml
+++ b/roles/image_build/defaults/main.yml
@@ -14,6 +14,9 @@ image_build_metadata: {}
 # The directory that contains the openstack.pkr.hcl to build the Slurm image
 image_build_packer_root_path: "{{ playbook_dir }}/vendor/stackhpc/ansible-slurm-appliance/packer"
 
+# Extra args to pass to the packer build command
+image_build_packer_extra_args: ""
+
 # The appliances_environment_root directory. This may contain a hooks directory
 # optionally containing pre.yml, post-bootstrap.yml and post.yml playbooks, to
 # run during the image-build process
diff --git a/roles/image_build/tasks/main.yml b/roles/image_build/tasks/main.yml
index 5dc9b645..1c48b91b 100644
--- a/roles/image_build/tasks/main.yml
+++ b/roles/image_build/tasks/main.yml
@@ -67,7 +67,7 @@
 - name: Build image with packer
   command:
     cmd: |
-      packer build -only openstack.openhpc -var-file={{ pkrvars_hcl_file.path }} openstack.pkr.hcl
+      packer build {{ image_build_packer_extra_args }} -only openstack.openhpc -var-file={{ pkrvars_hcl_file.path }} openstack.pkr.hcl
     chdir: "{{ image_build_packer_root_path }}"
   environment:
     APPLIANCES_ENVIRONMENT_ROOT: "{{ image_build_appliances_environment_root }}"
diff --git a/roles/image_build_infra/defaults/main.yml b/roles/image_build_infra/defaults/main.yml
index adce2f82..d8db9a1a 100644
--- a/roles/image_build_infra/defaults/main.yml
+++ b/roles/image_build_infra/defaults/main.yml
@@ -2,6 +2,8 @@
 image_build_terraform_project_path: "{{ playbook_dir }}/terraform-caas-image-build"
 
 image_build_cluster_name: "caas-image-build"
+cluster_name: "{{ image_build_cluster_name }}"
+cluster_id: "{{ image_build_cluster_name }}"
 
 # Regex to capture existing cloud image names to use as the
 # OpenHPC Slurm base-image
diff --git a/slurm-infra.yml b/slurm-infra.yml
index 95a309a8..be4d77a9 100644
--- a/slurm-infra.yml
+++ b/slurm-infra.yml
@@ -121,8 +121,12 @@
 
 # Write the outputs as the final task
 - hosts: localhost
+  vars:
+    hide_debug_outputs: "{{ lookup('ansible.builtin.env', 'SLURM_INFRA_HIDE_DEBUG_OUTPUT', default=false) | bool }}"
   tasks:
     - debug: var=outputs
+      when:
+        - not hide_debug_outputs
       vars:
         # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain,
         # so we have to repeat logic here unfortunately
diff --git a/vendor/stackhpc/ansible-slurm-appliance b/vendor/stackhpc/ansible-slurm-appliance
index b9c474fa..56829020 160000
--- a/vendor/stackhpc/ansible-slurm-appliance
+++ b/vendor/stackhpc/ansible-slurm-appliance
@@ -1 +1 @@
-Subproject commit b9c474fa5132da68ed5cc2ad64d60f9dd9f89c5a
+Subproject commit 56829020478bdaf723b50ee303f65458daa65a13