diff --git a/.github/workflows/s3-image-sync.yml b/.github/workflows/s3-image-sync.yml index a7a681bb7..0ffaae954 100644 --- a/.github/workflows/s3-image-sync.yml +++ b/.github/workflows/s3-image-sync.yml @@ -23,3 +23,143 @@ jobs: run: | echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg shell: bash + + - name: Install s3cmd + run: | + sudo apt-get --yes install s3cmd + + - name: Cleanup S3 bucket + run: | + s3cmd rm s3://${{ env.S3_BUCKET }} --recursive --force + + image_upload: + runs-on: ubuntu-22.04 + needs: s3_cleanup + concurrency: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} + strategy: + fail-fast: false + matrix: + build: + - RL8 + - RL9 + - RL9-cuda + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + CI_CLOUD: ${{ vars.CI_CLOUD }} + outputs: + ci_cloud: ${{ steps.ci.outputs.CI_CLOUD }} + steps: + - uses: actions/checkout@v2 + + - name: Record which cloud CI is running on + id: ci + run: | + echo "CI_CLOUD=${{ env.CI_CLOUD }}" >> "$GITHUB_OUTPUT" + + - name: Setup environment + run: | + python3 -m venv venv + . venv/bin/activate + pip install -U pip + pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + shell: bash + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Write s3cmd configuration + run: | + echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg + shell: bash + + - name: Install s3cmd + run: | + sudo apt-get --yes install s3cmd + + - name: Retrieve image name + run: | + TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") + echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" + shell: bash + + - name: Download image to runner + run: | + . 
venv/bin/activate + openstack image save --file ${{ env.TARGET_IMAGE }} ${{ env.TARGET_IMAGE }} + shell: bash + + - name: Upload Image to S3 + run: | + echo "Uploading Image: ${{ env.TARGET_IMAGE }} to S3..." + s3cmd --multipart-chunk-size-mb=150 put ${{ env.TARGET_IMAGE }} s3://${{ env.S3_BUCKET }} + shell: bash + + image_sync: + needs: image_upload + runs-on: ubuntu-22.04 + concurrency: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.cloud }}-${{ matrix.build }} + strategy: + fail-fast: false + matrix: + cloud: + - LEAFCLOUD + - SMS + - ARCUS + build: + - RL8 + - RL9 + - RL9-cuda + exclude: + - cloud: ${{ needs.image_upload.outputs.ci_cloud }} + + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + CI_CLOUD: ${{ matrix.cloud }} + steps: + - uses: actions/checkout@v2 + + - name: Record which cloud CI is running on + run: | + echo CI_CLOUD: ${{ env.CI_CLOUD }} + + - name: Setup environment + run: | + python3 -m venv venv + . venv/bin/activate + pip install -U pip + pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + shell: bash + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Retrieve image name + run: | + TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}") + echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" + + - name: Download latest image if missing + run: | + . venv/bin/activate + bash .github/bin/get-s3-image.sh ${{ env.TARGET_IMAGE }} ${{ env.S3_BUCKET }} + + - name: Cleanup OpenStack Image (on error or cancellation) + if: cancelled() || failure() + run: | + . 
venv/bin/activate + image_hanging=$(openstack image list --name ${{ env.TARGET_IMAGE }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') + if [ -n "$image_hanging" ]; then + echo "Cleaning up OpenStack image with ID: $image_hanging" + openstack image delete $image_hanging + else + echo "No image ID found, skipping cleanup." + fi + shell: bash diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index 264a96143..0b123bcf4 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -53,7 +53,7 @@ jobs: bash .github/bin/get-s3-image.sh ${{ inputs.image_name }} ${{ inputs.bucket_name }} - name: Cleanup OpenStack Image (on error or cancellation) - if: cancelled() + if: cancelled() || failure() run: | . venv/bin/activate image_hanging=$(openstack image list --name ${{ inputs.image_name }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') diff --git a/packer/README.md b/packer/README.md index 3bc188c7e..5e1d57dc2 100644 --- a/packer/README.md +++ b/packer/README.md @@ -7,9 +7,9 @@ The Packer configuration defined here builds "fat images" which contain binaries - Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). - Improves deployment speed by reducing the number of package downloads to improve deployment speed. -By default, a fat image build starts from a RockyLinux GenericCloud image and updates all DNF packages already present. +By default, a fat image build starts from a nightly image build containing Mellanox OFED, and updates all DNF packages already present. The 'latest' nightly build itself is from a RockyLinux GenericCloud image. -The fat images StackHPC builds and test in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). 
However with some additional configuration it is also possible to: +The fat images StackHPC builds and tests in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: 1. Build site-specific fat images from scratch. 2. Extend an existing fat image with additional software. @@ -39,9 +39,9 @@ The steps for building site-specific fat images or extending an existing fat ima cd packer/ PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - Note that the `-only` flag here restricts the build to the non-OFED fat image "source" (in Packer terminology). Other + Note that the `-only` flag here restricts the build to the non-CUDA fat image "source" (in Packer terminology). Other source options are: - - `-only=openstack.openhpc-ofed`: Build a fat image including Mellanox OFED + - `-only=openstack.openhpc-cuda`: Build a fat image including CUDA packages. - `-only=openstack.openhpc-extra`: Build an image which extends an existing fat image - in this case the variable `source_image` or `source_image_name}` must also be set in the Packer variables file. 5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash. @@ -70,7 +70,7 @@ What is Slurm Appliance-specific are the details of how Ansible is run: openhpc-extra = ["foo"] } - the build VM uses an existing "fat image" (rather than a RockyLinyux GenericCloud one) and is added to the `builder` and `foo` groups. This means only code targeting `builder` and `foo` groups runs. In this way an existing image can be extended with site-specific code, without modifying the part of the image which has already been tested in the StackHPC CI. 
+ the build VM uses an existing "fat image" (rather than a 'latest' nightly one) and is added to the `builder` and `foo` groups. This means only code targeting `builder` and `foo` groups runs. In this way an existing image can be extended with site-specific code, without modifying the part of the image which has already been tested in the StackHPC CI. - The playbook `ansible/fatimage.yml` is run which is only a subset of `ansible/site.yml`. This allows restricting the code which runs during build for cases where setting `builder` groupvars is not sufficient (e.g. a role always attempts to configure or start services). This may eventually be removed. @@ -82,5 +82,5 @@ There are some things to be aware of when developing Ansible to run in a Packer - Build VM hostnames are not the same as for equivalent "real" hosts and do not contain `login`, `control` etc. Therefore variables used by the build VM must be defined as groupvars not hostvars. - Ansible may need to proxy to real compute nodes. If Packer should not use the same proxy to connect to the build VMs (e.g. build happens on a different network), proxy configuration should not be added to the `all` group. - - Currently two fat image "sources" are defined, with and without OFED. This simplifies CI configuration by allowing the + - Currently two fat image "sources" are defined, with and without CUDA. This simplifies CI configuration by allowing the default source images to be defined in the `openstack.pkr.hcl` definition.