From cc8950f15935714b26b4fd4835d8bdb379701c0d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 18 Jul 2024 11:34:00 +0000 Subject: [PATCH 01/21] add trivy image scanning --- .github/workflows/fatimage.yml | 43 +++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index e6727948b..60b40811f 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -77,5 +77,46 @@ jobs: sleep 5 done IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) - echo $IMAGE_NAME + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" done + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: install libguestfs + run: sudo apt-get -y install libguestfs-tools + + - name: mkdir for mount + run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' + + - name: mount qcow2 file + run: sudo guestmount -a ${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' + + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.17.0 + with: + scan-type: fs + scan-ref: "./${{ steps.manifest.outputs.image-name }}" + scanners: "vuln" + format: sarif + output: "${{ steps.manifest.outputs.image-name }}.sarif" + # turn off secret scanning to speed things up + + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif" + category: "${{ matrix.os_version }}" + + - name: Fail if scan has CRITICAL vulnerabilities + uses: aquasecurity/trivy-action@0.16.1 + with: + scan-type: fs + scan-ref: "./${{ steps.manifest.outputs.image-name }}" + scanners: "vuln" + format: table + exit-code: '1' + severity: 'CRITICAL' + ignore-unfixed: true From 327fbfe98d72eb5e20f7d65567a6d4f8b4a94509 Mon Sep 17 00:00:00 2001 From: Steve 
Brasier Date: Thu, 18 Jul 2024 12:21:05 +0000 Subject: [PATCH 02/21] bump fatimage workflow to ubuntu 22.04 --- .github/workflows/fatimage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 60b40811f..b2be6f077 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -14,7 +14,7 @@ concurrency: jobs: openstack: name: openstack-imagebuild - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: os_version: [RL8, RL9] From 82b88c13321f46c33420f842fd07dee32eaff1c9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 18 Jul 2024 13:06:55 +0000 Subject: [PATCH 03/21] make setup script work in CI TODO: FIXME --- dev/setup-env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/setup-env.sh b/dev/setup-env.sh index e47b3d8a9..7c26e31e6 100755 --- a/dev/setup-env.sh +++ b/dev/setup-env.sh @@ -3,7 +3,7 @@ set -euo pipefail if [[ ! -d "venv" ]]; then - /usr/bin/python3.8 -m venv venv # use `sudo yum install python38` on Rocky Linux 8 to install this + /usr/bin/python3 -m venv venv # use `sudo yum install python38` on Rocky Linux 8 to install this fi . 
venv/bin/activate pip install -U pip From e0a00d81c0ab6f7831c3e69e383c07d292f5d014 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 18 Jul 2024 15:13:44 +0000 Subject: [PATCH 04/21] fix libguestfs install --- .github/workflows/fatimage.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index b2be6f077..ac3c6797d 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -85,7 +85,9 @@ jobs: uses: docker/setup-qemu-action@v3 - name: install libguestfs - run: sudo apt-get -y install libguestfs-tools + run: | + sudo apt -y update + sudo apt -y install libguestfs-tools - name: mkdir for mount run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' From 6f8b72d6adfcbc605433bc2db2841faf1d38a7b8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 18 Jul 2024 16:00:00 +0000 Subject: [PATCH 05/21] run only 1x build per matrix entry, & only builds required --- .github/workflows/fatimage.yml | 43 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index ac3c6797d..97470713a 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -2,14 +2,8 @@ name: Build fat image 'on': workflow_dispatch: - inputs: - use_RL8: - required: true - description: Include RL8 image build - type: boolean - default: false concurrency: - group: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS + group: ${{ github.ref }}-{{ matrix.os_version }}-{{ matrix.build }} # to branch/PR + OS + build cancel-in-progress: true jobs: openstack: @@ -17,12 +11,17 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - os_version: [RL8, RL9] - rl8_selected: - - ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch + os_version: + - RL8 + - RL9 + build: + - openstack.openhpc + - openstack.openhpc-ofed exclude: - os_version: RL8 
- rl8_selected: false + build: openstack.openhpc-ofed + - os_version: RL9 + build: openstack.openhpc env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -63,7 +62,7 @@ jobs: . environments/.stackhpc/activate cd packer/ packer init . - PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -except=openstack.openhpc-extra -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl + PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -only={{ matrix.build }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl env: PKR_VAR_os_version: ${{ matrix.os_version }} @@ -71,15 +70,16 @@ jobs: id: manifest run: | . venv/bin/activate - for IMAGE_ID in $(jq --raw-output '.builds[].artifact_id' packer/packer-manifest.json) - do - while ! openstack image show -f value -c name $IMAGE_ID; do - sleep 5 - done - IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) - echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" - echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" + IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) + while ! 
openstack image show -f value -c name $IMAGE_ID; do + sleep 5 done + IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" + + - name: Download image + run: openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }} - name: Set up QEMU uses: docker/setup-qemu-action@v3 @@ -95,7 +95,6 @@ jobs: - name: mount qcow2 file run: sudo guestmount -a ${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' - - name: Run Trivy vulnerability scanner uses: aquasecurity/trivy-action@0.17.0 with: @@ -110,7 +109,7 @@ jobs: uses: github/codeql-action/upload-sarif@v3 with: sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif" - category: "${{ matrix.os_version }}" + category: "${{ matrix.os_version }}-${{ matrix.build }}" - name: Fail if scan has CRITICAL vulnerabilities uses: aquasecurity/trivy-action@0.16.1 From 0a44b2371b59c85972a361badf6ed602ea3f425f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 18 Jul 2024 16:00:13 +0000 Subject: [PATCH 06/21] fix packer README --- .github/workflows/fatimage.yml | 2 +- packer/README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 97470713a..dbe276b71 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -62,7 +62,7 @@ jobs: . environments/.stackhpc/activate cd packer/ packer init . 
- PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -only={{ matrix.build }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl + PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -only=${{ matrix.build }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl env: PKR_VAR_os_version: ${{ matrix.os_version }} diff --git a/packer/README.md b/packer/README.md index 597cfd4f9..3bc188c7e 100644 --- a/packer/README.md +++ b/packer/README.md @@ -41,8 +41,8 @@ The steps for building site-specific fat images or extending an existing fat ima Note that the `-only` flag here restricts the build to the non-OFED fat image "source" (in Packer terminology). Other source options are: - - `-only=openhpc-ofed`: Build a fat image including Mellanox OFED - - `-only=openhpc-extra`: Build an image which extends an existing fat image - in this case the variable `source_image` or `source_image_name}` must also be set in the Packer variables file. + - `-only=openstack.openhpc-ofed`: Build a fat image including Mellanox OFED + - `-only=openstack.openhpc-extra`: Build an image which extends an existing fat image - in this case the variable `source_image` or `source_image_name` must also be set in the Packer variables file. 5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash. 
From edbf17dc18d6dd50a93a6a2bf76b7d7cf4195c7a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 19 Jul 2024 09:40:40 +0000 Subject: [PATCH 07/21] fix image download --- .github/workflows/fatimage.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index dbe276b71..31fcc789a 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -79,7 +79,9 @@ jobs: echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" - name: Download image - run: openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }} + run: | + . venv/bin/activate + openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }} - name: Set up QEMU uses: docker/setup-qemu-action@v3 From 38a580006f7dfd2d1cf173f3ba6fe091bd9d48d9 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:25:52 +0100 Subject: [PATCH 08/21] Use shorter names for CI clusters (#415) * use run_number as a shorter ID for CI * slurmci group name warning * Revert "slurmci group name warning" - underscores not valid linux hostname and stripped in host, leading to slurmdbd config failure This reverts commit 61dfad6716ff144a4b3769319d02394d61df5675. 
--------- Co-authored-by: Bertie --- .github/workflows/stackhpc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index d0f74ad1c..401530fb4 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -34,7 +34,7 @@ jobs: env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_id }} + TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} CI_CLOUD: ${{ vars.CI_CLOUD }} steps: - uses: actions/checkout@v2 From 406af42063ccae52f5709c59f5748c9321ab8341 Mon Sep 17 00:00:00 2001 From: Bertie Date: Fri, 19 Jul 2024 16:06:18 +0000 Subject: [PATCH 09/21] install ood apps in fatimage --- ansible/fatimage.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 0764477b3..82c1ecc07 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -62,12 +62,16 @@ tasks_from: install.yml # - import_playbook: portal.yml - - name: Open Ondemand server + - name: Open Ondemand server (packages) include_role: name: osc.ood tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" # # FUTURE: install-apps.yml - this is git clones + - name: Open Ondemand server (apps) + include_role: + name: osc.ood + tasks_from: install-apps.yml - name: Open Ondemand remote desktop import_role: name: openondemand From 2ee66da0b91cec505318b0b5bbd8d8bd79317c9c Mon Sep 17 00:00:00 2001 From: Bertie Date: Mon, 22 Jul 2024 09:27:49 +0000 Subject: [PATCH 10/21] add ood jupyter install to fatimage --- ansible/fatimage.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 82c1ecc07..25968cfe5 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -72,6 +72,7 @@ include_role: name: osc.ood tasks_from: install-apps.yml + vars_from: "Rocky/{{ 
ansible_distribution_major_version }}.yml" - name: Open Ondemand remote desktop import_role: name: openondemand From dd2a7ace19db072611c1e6aca78e53b01db7cb0f Mon Sep 17 00:00:00 2001 From: Bertie Date: Mon, 22 Jul 2024 12:40:51 +0000 Subject: [PATCH 11/21] jupyter_compute ood into fatimage --- ansible/fatimage.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 25968cfe5..35f1b10ab 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -77,6 +77,10 @@ import_role: name: openondemand tasks_from: vnc_compute.yml + - name: Open Ondemand jupyter node + import_role: + name: openondemand + tasks_from: jupyter_compute.yml # - import_playbook: monitoring.yml: - import_role: From c65973930e71fb4b8b2d65c51d95e0d8b18ad71a Mon Sep 17 00:00:00 2001 From: Bertie Date: Tue, 23 Jul 2024 10:11:32 +0000 Subject: [PATCH 12/21] bump fatimage --- environments/.stackhpc/terraform/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index b98818653..faf87c0fb 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/410 - RL8: "openhpc-RL8-240712-1426-6830f97b" - RL9: "openhpc-ofed-RL9-240712-1425-6830f97b" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/414 + RL8: "openhpc-RL8-240723-0907-b560bf4c" + RL9: "openhpc-ofed-RL9-240723-0907-b560bf4c" } } From 83d2c79212757ebae5ca945a64bff9c0740404d9 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:30:08 +0100 Subject: [PATCH 13/21] allow items in compute mapping to have different keys e.g. 
only specify image_id for some compute groups (#412) --- .../{{cookiecutter.environment}}/terraform/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index ba0dbfb20..289de3fef 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -40,7 +40,7 @@ variable "cluster_image_id" { } variable "compute" { - type = map + type = any description = <<-EOF Mapping defining compute infrastructure. Keys are names of groups. Values are a mapping as follows: From 81c8ca29783260aafa096b51f99d373a4df3080e Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:06:37 +0100 Subject: [PATCH 14/21] Support ansible-init for remote collections (#411) * Add ansible-init role to requirements.yml * Add ansible-init to groups and plays * Configure cluster_infra ansible-init metadata * Only run site.yml once ansible-init has completed * Wait for ansible init to finish before running bootstrap * revert to using cluster_infra metadata defaults * update image * revert sausage bastion changes * set ansible_init_wait as common var * use run_number as a shorter ID for CI * install ood apps in fatimage * add ood jupyter install to fatimage * bump image * jupyter_compute ood into fatimage * bump fatimage for jupyter_compute ood * Update stackhpc.yml * duplicate tuned inventory group name * Fix invalid group name for slurmci * Update stackhpc.yml undo groupname changes * slurmci group name warning * rm ood changes * bump fatimage * change azimuth collection in bootstrap * update azimuth image utils version * update requirements * Update bastion.yml * Use azimuth image utils collection for ansible-init * bump fatimage --------- Co-authored-by: bertie 
Co-authored-by: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> --- ansible/bootstrap.yml | 23 +++++++++ ansible/fatimage.yml | 2 + ansible/roles/cluster_infra/defaults/main.yml | 2 + .../cluster_infra/templates/resources.tf.j2 | 51 +++++++++++++++++++ environments/.stackhpc/terraform/main.tf | 6 +-- .../inventory/group_vars/all/ansible_init.yml | 1 + environments/common/inventory/groups | 3 ++ environments/common/layouts/everything | 4 ++ requirements.yml | 3 ++ 9 files changed, 92 insertions(+), 3 deletions(-) create mode 100644 ansible/roles/cluster_infra/defaults/main.yml create mode 100644 environments/common/inventory/group_vars/all/ansible_init.yml diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index b53a4f29a..e8e2713a5 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -1,5 +1,20 @@ --- +- hosts: cluster + gather_facts: false + become: yes + tasks: + - name: Check if ansible-init is installed + stat: + path: /etc/systemd/system/ansible-init.service + register: _stat_ansible_init_unitfile + + - name: Wait for ansible-init to finish + wait_for: + path: /var/lib/ansible-init.done + timeout: "{{ ansible_init_wait }}" # seconds + when: _stat_ansible_init_unitfile.stat.exists + - hosts: localhost gather_facts: false become: false @@ -235,3 +250,11 @@ tasks: - include_role: name: ofed + +- hosts: ansible_init + gather_facts: yes + become: yes + tags: linux_ansible_init + tasks: + - include_role: + name: azimuth_cloud.image_utils.linux_ansible_init diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 35f1b10ab..58e1d72c7 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -68,11 +68,13 @@ tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" # # FUTURE: install-apps.yml - this is git clones + - name: Open Ondemand server (apps) include_role: name: osc.ood tasks_from: install-apps.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" + - name: Open 
Ondemand remote desktop import_role: name: openondemand diff --git a/ansible/roles/cluster_infra/defaults/main.yml b/ansible/roles/cluster_infra/defaults/main.yml new file mode 100644 index 000000000..f2f9637b9 --- /dev/null +++ b/ansible/roles/cluster_infra/defaults/main.yml @@ -0,0 +1,2 @@ +ansible_init_collections: [] +ansible_init_playbooks: [] diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 03eab5afb..4c7534d62 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -370,6 +370,23 @@ resource "openstack_compute_instance_v2" "login" { - "${openstack_compute_keypair_v2.cluster_keypair.public_key}" {%- endif %} EOF + + metadata = { + {% for playbook in ansible_init_playbooks %} + ansible_init_pb_{{ loop.index0 }}_name = "{{ playbook.name }}" + {% if playbook.stage is defined %} + ansible_init_pb_{{ loop.index0 }}_stage = "{{ playbook.stage }}" + {% endif %} + {% endfor %} + {% for collection in ansible_init_collections %} + ansible_init_coll_{{ loop.index0 }}_name = "{{ collection.name }}" + ansible_init_coll_{{ loop.index0 }}_type = "{{ collection.type }}" + ansible_init_coll_{{ loop.index0 }}_version = "{{ collection.version }}" + {% if collection.source is defined %} + ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" + {% endif %} + {% endfor %} + } } resource "openstack_compute_instance_v2" "control" { @@ -446,6 +463,23 @@ resource "openstack_compute_instance_v2" "control" { - [LABEL=home, /exports/home, auto] {% endif %} EOF + + metadata = { + {% for playbook in ansible_init_playbooks %} + ansible_init_pb_{{ loop.index0 }}_name = "{{ playbook.name }}" + {% if playbook.stage is defined %} + ansible_init_pb_{{ loop.index0 }}_stage = "{{ playbook.stage }}" + {% endif %} + {% endfor %} + {% for collection in ansible_init_collections %} + ansible_init_coll_{{ loop.index0 }}_name = "{{ 
collection.name }}" + ansible_init_coll_{{ loop.index0 }}_type = "{{ collection.type }}" + ansible_init_coll_{{ loop.index0 }}_version = "{{ collection.version }}" + {% if collection.source is defined %} + ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" + {% endif %} + {% endfor %} + } } {% for partition in openhpc_slurm_partitions %} @@ -498,6 +532,23 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" { - "${openstack_compute_keypair_v2.cluster_keypair.public_key}" {%- endif %} EOF + + metadata = { + {% for playbook in ansible_init_playbooks %} + ansible_init_pb_{{ loop.index0 }}_name = "{{ playbook.name }}" + {% if playbook.stage is defined %} + ansible_init_pb_{{ loop.index0 }}_stage = "{{ playbook.stage }}" + {% endif %} + {% endfor %} + {% for collection in ansible_init_collections %} + ansible_init_coll_{{ loop.index0 }}_name = "{{ collection.name }}" + ansible_init_coll_{{ loop.index0 }}_type = "{{ collection.type }}" + ansible_init_coll_{{ loop.index0 }}_version = "{{ collection.version }}" + {% if collection.source is defined %} + ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" + {% endif %} + {% endfor %} + } } {% endfor %} diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index faf87c0fb..f6168dcb7 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/414 - RL8: "openhpc-RL8-240723-0907-b560bf4c" - RL9: "openhpc-ofed-RL9-240723-0907-b560bf4c" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/411 + RL8: "openhpc-RL8-240725-1710-325c7b47" + RL9: "openhpc-ofed-RL9-240725-1710-325c7b47" } } diff --git a/environments/common/inventory/group_vars/all/ansible_init.yml 
b/environments/common/inventory/group_vars/all/ansible_init.yml new file mode 100644 index 000000000..be68dbe8c --- /dev/null +++ b/environments/common/inventory/group_vars/all/ansible_init.yml @@ -0,0 +1 @@ +ansible_init_wait: 1200 # seconds \ No newline at end of file diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index a48e6823f..ea0bebebc 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -132,3 +132,6 @@ freeipa_client [tuned] # Hosts to run TuneD configuration + +[ansible_init] +# Hosts to run linux-ansible-init \ No newline at end of file diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index e9523eec9..85af46c06 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -78,3 +78,7 @@ openhpc [tuned:children] # Hosts to run TuneD configuration + +[ansible_init:children] +# Hosts to run ansible-init +cluster \ No newline at end of file diff --git a/requirements.yml b/requirements.yml index 757c851d5..da6ac5d29 100644 --- a/requirements.yml +++ b/requirements.yml @@ -46,4 +46,7 @@ collections: - name: https://github.com/stackhpc/ansible-collection-terraform type: git version: 0.2.0 + - name: https://github.com/azimuth-cloud/ansible-collection-image-utils + type: git + version: main # update on release ... From be69b8ad15d87c6d0f24049c3424aa955e1316dc Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:41:30 +0100 Subject: [PATCH 15/21] avoid python-openstackclient v7 due to rebuild bug (#420) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index badb1a94b..bf5a43430 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ ansible==6.0.0 openstacksdk -python-openstackclient +python-openstackclient==6.6.1 # v7.0.0 has a bug re. 
rebuild python-manilaclient jmespath passlib[bcrypt]==1.7.4 From deec81c6513349585580a378536cdf99ffb367a4 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 7 Aug 2024 15:27:13 +0100 Subject: [PATCH 16/21] Update hpctests to obey UCX_NET_DEVICES when RoCE devices present (#421) * Turn off higher priority MPI net devices * Update pingmatrix.sh.j2 * Update pingmatrix.sh.j2 * Update pingpong.sh.j2 * Replace j2 comments with bash * Update pingpong.sh.j2 --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/hpctests/templates/pingmatrix.sh.j2 | 6 +++++- ansible/roles/hpctests/templates/pingpong.sh.j2 | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 index d886e9ac8..990018d85 100644 --- a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 +++ b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 @@ -16,4 +16,8 @@ echo UCX_NET_DEVICES: $UCX_NET_DEVICES module load {{ hpctests_pingmatrix_modules | join(' ' ) }} mpicc -o nxnlatbw mpi_nxnlatbw.c -mpirun nxnlatbw + +# mpirun flags force using UCX TCP transports, overriding higher +# priority of OpenMPI btl/openib component, which is also using RDMA +# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 +mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any nxnlatbw diff --git a/ansible/roles/hpctests/templates/pingpong.sh.j2 b/ansible/roles/hpctests/templates/pingpong.sh.j2 index 4dc2eebd5..dad4499b1 100644 --- a/ansible/roles/hpctests/templates/pingpong.sh.j2 +++ b/ansible/roles/hpctests/templates/pingpong.sh.j2 @@ -16,4 +16,8 @@ echo UCX_NET_DEVICES: $UCX_NET_DEVICES module load {{ hpctests_pingpong_modules | join(' ' ) }} #srun --mpi=pmi2 IMB-MPI1 pingpong # doesn't work in ohpc v2.1 -mpirun IMB-MPI1 pingpong + +# mpirun flags force using UCX TCP transports, overriding higher +# priority of 
OpenMPI btl/openib component, which is also using RDMA +# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 +mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any IMB-MPI1 pingpong From 813bf0e8059fae624ebed5b477be568291a13c0f Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 9 Aug 2024 15:25:37 +0000 Subject: [PATCH 17/21] delete trivy scanned vulnerabilities --- environments/.stackhpc/hooks/post.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 environments/.stackhpc/hooks/post.yml diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml new file mode 100644 index 000000000..5afe5b091 --- /dev/null +++ b/environments/.stackhpc/hooks/post.yml @@ -0,0 +1,12 @@ +- hosts: openondemand + become: yes + gather_facts: false + tasks: + - name: Delete trivy vulnerable files + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_items: + - /opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/3.1.7-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock + - /opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/3.1.7-1/gems/bootstrap_form-4.5.0/demo/yarn.lock + - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock \ No newline at end of file From 1b370a36e1c79351d28415944d8f35f7e761c474 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 13 Aug 2024 13:13:14 +0000 Subject: [PATCH 18/21] update grafana --- environments/common/inventory/group_vars/all/grafana.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 8222a3cca..90ef51c59 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -2,7 +2,7 @@ # See: https://github.com/cloudalchemy/ansible-grafana # for variable definitions. 
-grafana_version: '9.0.3' +grafana_version: '9.5.21' # need to copy some role defaults here so we can use in inventory: grafana_port: 3000 From 1ce39e90fff782fe91f52fba9d50c5a60e67e59d Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 13 Aug 2024 14:14:51 +0000 Subject: [PATCH 19/21] bump image --- environments/.stackhpc/terraform/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index f6168dcb7..ac588930c 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/411 - RL8: "openhpc-RL8-240725-1710-325c7b47" - RL9: "openhpc-ofed-RL9-240725-1710-325c7b47" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/413 + RL8: "openhpc-RL8-240813-1317-1b370a36" + RL9: "openhpc-ofed-RL9-240813-1317-1b370a36" } } From bdccb0a7c6df779533b7ff5516a8200946d6be84 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Tue, 13 Aug 2024 17:45:14 +0100 Subject: [PATCH 20/21] Update environments/.stackhpc/hooks/post.yml Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- environments/.stackhpc/hooks/post.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml index 5afe5b091..eceadcbd8 100644 --- a/environments/.stackhpc/hooks/post.yml +++ b/environments/.stackhpc/hooks/post.yml @@ -2,7 +2,9 @@ become: yes gather_facts: false tasks: - - name: Delete trivy vulnerable files + - name: Delete ondemand files causing Trivy scan false-positives + # Raised at https://github.com/OSC/ondemand/security/advisories/GHSA-f7j8-ppqm-m5vw + # All 
declared not to be an issue by Open Ondemand as relevant packages not installed ansible.builtin.file: path: "{{ item }}" state: absent From 73a4e5ed0ca2f1c2ddfa25a836ebd4b0c1bcf7eb Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 14 Aug 2024 11:49:38 +0100 Subject: [PATCH 21/21] Update setup-env.sh --- dev/setup-env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/setup-env.sh b/dev/setup-env.sh index 7c26e31e6..e47b3d8a9 100755 --- a/dev/setup-env.sh +++ b/dev/setup-env.sh @@ -3,7 +3,7 @@ set -euo pipefail if [[ ! -d "venv" ]]; then - /usr/bin/python3 -m venv venv # use `sudo yum install python38` on Rocky Linux 8 to install this + /usr/bin/python3.8 -m venv venv # use `sudo yum install python38` on Rocky Linux 8 to install this fi . venv/bin/activate pip install -U pip