From 6e650af0d36314d82e12257cca08f2912ad0beeb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 10 Dec 2024 20:37:32 +0000 Subject: [PATCH 01/23] add doca role run by fatimage --- ansible/.gitignore | 2 + ansible/fatimage.yml | 8 +++ ansible/roles/doca/defaults/main.yml | 3 ++ .../roles/doca/tasks/install-kernel-devel.yml | 24 +++++++++ ansible/roles/doca/tasks/install.yml | 53 +++++++++++++++++++ 5 files changed, 90 insertions(+) create mode 100644 ansible/roles/doca/defaults/main.yml create mode 100644 ansible/roles/doca/tasks/install-kernel-devel.yml create mode 100644 ansible/roles/doca/tasks/install.yml diff --git a/ansible/.gitignore b/ansible/.gitignore index 48c917c4f..3fef64ecc 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -66,3 +66,5 @@ roles/* !roles/lustre/** !roles/dnf_repos/ !roles/dnf_repos/** +!roles/doca/ +!roles/doca/** diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index b28e4f308..b0f428c90 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -199,6 +199,14 @@ name: cloudalchemy.grafana tasks_from: install.yml +- hosts: doca + become: yes + gather_facts: yes + tasks: + - name: Install Mellanox DOCA + include_role: + name: doca + - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/roles/doca/defaults/main.yml b/ansible/roles/doca/defaults/main.yml new file mode 100644 index 000000000..66437cd04 --- /dev/null +++ b/ansible/roles/doca/defaults/main.yml @@ -0,0 +1,3 @@ +doca_version: '2.9.1' # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates +doca_profile: doca-ofed +doca_repo_url: "https://linux.mellanox.com/public/repo/doca/{{ doca_version }}/rhel{{ ansible_distribution_version }}/{{ ansible_architecture }}/" diff --git a/ansible/roles/doca/tasks/install-kernel-devel.yml b/ansible/roles/doca/tasks/install-kernel-devel.yml new file mode 100644 index 000000000..6a1943a32 --- /dev/null +++ b/ansible/roles/doca/tasks/install-kernel-devel.yml @@ -0,0 +1,24 @@ +- name: Get installed kernels + command: dnf list --installed kernel + register: _ofed_dnf_kernels + changed_when: false + +- name: Determine running kernel + command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64 + register: _ofed_loaded_kernel + changed_when: false + +- name: Check current kernel is newest installed + assert: + that: _ofed_kernel_current == _ofed_dnf_kernels_newest + fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" + vars: + _ofed_kernel_current: >- + {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} + _ofed_dnf_kernels_newest: >- + {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} + # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " + +- name: Install matching kernel-devel package + dnf: + name: "kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}" diff --git a/ansible/roles/doca/tasks/install.yml b/ansible/roles/doca/tasks/install.yml new file mode 100644 index 000000000..9d297e946 --- /dev/null +++ b/ansible/roles/doca/tasks/install.yml @@ -0,0 +1,53 @@ +- import_tasks: install-kernel-devel.yml + +- name: Install DOCA repo + ansible.builtin.yum_repository: + name: doca + file: doca + description: DOCA Online Repo + baseurl: "{{ doca_repo_url }}" + enabled: true + gpgcheck: false + +- name: Install doca-extra package + ansible.builtin.dnf: + name: doca-extra + +- name: Build DOCA kernel modules + ansible.builtin.shell: + cmd: /opt/mellanox/doca/tools/doca-kernel-support + register: _doca_kernel_build + + +- name: Find generated doca-kernel-repo + ansible.builtin.shell: 'find /tmp/DOCA.* -name doca-kernel-repo-*' + register: _doca_kernel_repo # e.g. /tmp/DOCA.WVMchs2QWo/doca-kernel-repo-24.10.1.1.4.0-1.kver.5.14.0.427.31.1.el9.4.x86.64.x86_64.rpm + changed_when: false + +- name: Create dnf cache + ansible.builtin.command: dnf makecache + +- name: Install DOCA repository package + ansible.builtin.dnf: + name: "{{ _doca_kernel_repo.stdout }}" + disable_gpg_check: true + +- name: Install DOCA packages + ansible.builtin.dnf: + name: "{{ doca_profile }}" + +- name: Cleanup DOCA build directories + ansible.builtin.file: + state: absent + path: "{{ (_doca_kernel_repo.stdout | split('/'))[:2] | join('/') }}" + +- name: Update initramfs + ansible.builtin.command: + cmd: dracut -f --tmpdir /var/tmp + environment: + TMPDIR: /var/tmp + register: _doca_dracut + failed_when: _doca_dracut.stderr != '' # appears rc is always 0 + +- name: Load the new driver + ansible.builtin.command: /etc/init.d/openibd restart From 505d5f46d09ae178bc32d9d715650b7bdfaacf34 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 10 Dec 2024 21:06:00 +0000 Subject: [PATCH 02/23] add workflow to test doca build --- .github/workflows/doca.yml | 117 +++++++++++++++++++++++++++++++++++++ packer/openstack.pkr.hcl | 70 +++++++++------------- 2 files changed, 145 insertions(+), 42 deletions(-) create mode 100644 .github/workflows/doca.yml diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml new file mode 100644 index 000000000..491ab0d04 --- /dev/null +++ b/.github/workflows/doca.yml @@ -0,0 +1,117 @@ +name: Test DOCA extra build +on: + workflow_dispatch: + push: + branches: + - main + paths: + - '**' + - '!dev/**' + - 'dev/setup-env.sh' + - '!docs/**' + - '!README.md' + - '!.gitignore' + - '!.github/workflows/' + - '.github/workflows/doca' + pull_request: + paths: + - '**' + - '!dev/**' + - 'dev/setup-env.sh' + - '!docs/**' + - '!README.md' + - '!.gitignore' + - '!.github/workflows/' + - '.github/workflows/doca' + +jobs: + openstack: + name: openstack-docabuild + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.label }} # to branch/PR + OS + cancel-in-progress: true + runs-on: ubuntu-22.04 + strategy: + fail-fast: false # allow other matrix jobs to continue even if one fails + matrix: # build RL8, RL9 + build: + - label: RL8 + source_image_name: rocky-latest-RL8 + - label: RL9 + source_image_name: rocky-latest-RL9 + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings + ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} + + steps: + - uses: actions/checkout@v2 + + - name: Record settings for CI cloud + run: | + echo CI_CLOUD: ${{ env.CI_CLOUD }} + + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + shell: bash + + - name: Add bastion's ssh key to known_hosts + run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts + shell: bash + + - name: Install ansible etc + run: dev/setup-env.sh + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Setup environment + run: | + . venv/bin/activate + . environments/.stackhpc/activate + + - name: Build fat image with packer + id: packer_build + run: | + set -x + . venv/bin/activate + . environments/.stackhpc/activate + cd packer/ + packer init . + + PACKER_LOG=1 packer build \ + -on-error=${{ vars.PACKER_ON_ERROR }} \ + -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ + -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "image_name=openhpc-doca" \ + -var "groups=doca" \ + openstack.pkr.hcl + + - name: Get created image names from manifest + id: manifest + run: | + . venv/bin/activate + IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) + while ! openstack image show -f value -c name $IMAGE_ID; do + sleep 5 + done + IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo $IMAGE_ID > image-id.txt + echo $IMAGE_NAME > image-name.txt + + - name: Upload manifest artifact + uses: actions/upload-artifact@v4 + with: + name: image-details-openhpc-${{ matrix.label }} + path: | + ./image-id.txt + ./image-name.txt + overwrite: true diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 52202ead1..d46546665 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -23,6 +23,7 @@ data "git-commit" "cwd-head" { } locals { git_commit = data.git-commit.cwd-head.hash timestamp = formatdate("YYMMDD-hhmm", timestamp()) + image_name_version = var.image_name_version == "auto" ? "-${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_name_version } # Path pointing to root of repository - automatically set by environment variable PKR_VAR_repo_root @@ -39,12 +40,6 @@ variable "networks" { type = list(string) } -variable "os_version" { - type = string - description = "'RL8' or 'RL9' with default source_image_* mappings" - default = "RL9" -} - # Must supply either source_image_name or source_image_id variable "source_image_name" { type = string @@ -123,15 +118,6 @@ variable "volume_type" { } variable "volume_size" { - type = map(number) - default = { - # fat image builds, GB: - rocky-latest = 15 - openhpc = 15 - } -} - -variable "extra_build_volume_size" { type = number default = 15 } @@ -147,24 +133,23 @@ variable "metadata" { } variable "groups" { - type = map(list(string)) - description = "Additional inventory groups (other than 'builder') to add build VM to, keyed by source name" - default = { - # fat image builds: - rocky-latest = ["update"] - openhpc = ["control", "compute", "login"] - } + type = string + description = "Comma-separated list of additional inventory groups (other than 'builder') to add build VM to" + default = "" # this is + # rocky-latest = ["update"] + # openhpc = ["control", "compute", "login"] } -variable "extra_build_groups" { - type = list(string) - default = [] +variable "image_name" { + type = string + description = "Name of image" + default = "openhpc" } -variable "extra_build_image_name" { +variable "image_name_version" { type = string - description = "Infix for 'extra' build image name" - default = "extra" + description = "Suffix for image name giving version. Default of 'auto' appends timestamp + short commit" + default = "auto" } source "openstack" "openhpc" { @@ -172,9 +157,11 @@ source "openstack" "openhpc" { flavor = var.flavor use_blockstorage_volume = var.use_blockstorage_volume volume_type = var.volume_type - volume_size = lookup(var.volume_size, source.name, var.extra_build_volume_size) + volume_size = var.volume_size metadata = var.metadata - instance_metadata = {ansible_init_disable = "true"} + instance_metadata = { + ansible_init_disable = "true" + } networks = var.networks floating_ip_network = var.floating_ip_network security_groups = var.security_groups @@ -201,26 +188,25 @@ source "openstack" "openhpc" { build { # latest nightly image: - source "source.openstack.openhpc" { - name = "rocky-latest" - image_name = "${source.name}-${var.os_version}" - } + # source "source.openstack.openhpc" { + # name = "rocky-latest" + # image_name = "${source.name}-${var.os_version}" + # } # fat image: source "source.openstack.openhpc" { - name = "openhpc" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" + image_name = "${var.image_name}${local.image_name_version}" } - # Extended site-specific image, built on fat image: - source "source.openstack.openhpc" { - name = "openhpc-extra" - image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" - } + # # Extended site-specific image, built on fat image: + # source "source.openstack.openhpc" { + # name = "openhpc-extra" + # image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" + # } provisioner "ansible" { playbook_file = "${var.repo_root}/ansible/fatimage.yml" - groups = concat(["builder"], lookup(var.groups, source.name, var.extra_build_groups)) + groups = concat(["builder"], split(",", var.groups)) keep_inventory_file = true # for debugging use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting extra_arguments = [ From f9f1105e07487d1e3046ad744eb31e0400bb02b6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 08:13:20 +0000 Subject: [PATCH 03/23] make packer inventory groups clearer and allow defining no extra --- .github/workflows/doca.yml | 2 +- packer/openstack.pkr.hcl | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml index 491ab0d04..a1696747e 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/doca.yml @@ -92,7 +92,7 @@ jobs: -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ -var "image_name=openhpc-doca" \ - -var "groups=doca" \ + -var "inventory_groups=doca" \ openstack.pkr.hcl - name: Get created image names from manifest diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index d46546665..2d7dede35 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -132,11 +132,11 @@ variable "metadata" { default = {} } -variable "groups" { +variable "inventory_groups" { type = string - description = "Comma-separated list of additional inventory groups (other than 'builder') to add build VM to" - default = "" # this is - # rocky-latest = ["update"] + description = "Comma-separated list of additional inventory groups (other than 'builder') to add build VM to. Default is none." + default = "" + # rocky-latest = ["update"] # TODO: fix this in workflow # openhpc = ["control", "compute", "login"] } @@ -206,7 +206,7 @@ build { provisioner "ansible" { playbook_file = "${var.repo_root}/ansible/fatimage.yml" - groups = concat(["builder"], split(",", var.groups)) + groups = concat(["builder"], var.inventory_groups == "" ? [] : split(",", var.inventory_groups)) keep_inventory_file = true # for debugging use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting extra_arguments = [ From 02765734f69fe3c3cf6b6d7898333a1d5b76156b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 08:36:10 +0000 Subject: [PATCH 04/23] update packer workflows for new packer config --- .github/workflows/doca.yml | 4 ++-- .github/workflows/fatimage.yml | 29 +++++++---------------- .github/workflows/nightlybuild.yml | 38 +++++++++++------------------- packer/openstack.pkr.hcl | 2 -- 4 files changed, 25 insertions(+), 48 deletions(-) diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml index a1696747e..497b8d872 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/doca.yml @@ -91,7 +91,7 @@ jobs: -on-error=${{ vars.PACKER_ON_ERROR }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ - -var "image_name=openhpc-doca" \ + -var "image_name=openhpc-doca-${{ matrix.build.label }}" \ -var "inventory_groups=doca" \ openstack.pkr.hcl @@ -110,7 +110,7 @@ jobs: - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: - name: image-details-openhpc-${{ matrix.label }} + name: image-details-${{ matrix.build.label }} path: | ./image-id.txt ./image-name.txt diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 217b09c22..aed674862 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -15,30 +15,21 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.label }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 - os_version: - - RL8 - - RL9 build: - - openstack.openhpc + - label: RL8 + source_image_name: rocky-latest-RL8 + - label: RL9 + source_image_name: rocky-latest-RL9 env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud }} - SOURCE_IMAGES_MAP: | - { - "RL8": { - "openstack.openhpc": "rocky-latest-RL8" - }, - "RL9": { - "openstack.openhpc": "rocky-latest-RL9" - } - } ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: @@ -85,13 +76,11 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "image_name=openhpc-${{ matrix.build.label }}" \ + -var "inventory_groups=control,compute,login" \ openstack.pkr.hcl - env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }} - name: Get created image names from manifest id: manifest @@ -108,7 +97,7 @@ jobs: - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: - name: image-details-${{ matrix.build }}-${{ matrix.os_version }} + name: image-details-${{ matrix.build.label }} path: | ./image-id.txt ./image-name.txt diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 9f45b0890..3a58fa46c 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -11,32 +11,27 @@ on: - SMS - ARCUS schedule: - - cron: '0 0 * * *' # Run at midnight + - cron: '0 0 * * *' # Run at midnight on default branch jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.label }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 - os_version: - - RL8 - - RL9 build: - - openstack.rocky-latest + - label: RL8 + source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + - label: RL9 + source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} - SOURCE_IMAGES_MAP: | - { - "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2", - "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" - } ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: @@ -83,15 +78,12 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "image_name=rocky-latest-${{ matrix.build.label }}" \ + -var "inventory_groups=update" \ openstack.pkr.hcl - env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }} - - name: Get created image names from manifest id: manifest run: | @@ -125,7 +117,7 @@ jobs: name: upload-nightly-targets needs: openstack concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.label }}-${{ matrix.target_cloud }} cancel-in-progress: true runs-on: ubuntu-22.04 strategy: @@ -135,18 +127,16 @@ jobs: - LEAFCLOUD - SMS - ARCUS - os_version: - - RL8 - - RL9 - image: - - rocky-latest + build: + - label: RL8 + - label: RL9 exclude: - target_cloud: LEAFCLOUD env: OS_CLOUD: openstack SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} TARGET_CLOUD: ${{ matrix.target_cloud }} - IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}" + IMAGE_NAME: "rocky-latest-${{ matrix.build.label }}" steps: - uses: actions/checkout@v2 diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 2d7dede35..dbfa0709e 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -136,8 +136,6 @@ variable "inventory_groups" { type = string description = "Comma-separated list of additional inventory groups (other than 'builder') to add build VM to. Default is none." default = "" - # rocky-latest = ["update"] # TODO: fix this in workflow - # openhpc = ["control", "compute", "login"] } variable "image_name" { From d15ffce4fa22cdd3c348c43ba5b041a748b32d15 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 10:38:26 +0000 Subject: [PATCH 05/23] define builds entirely via matrix --- .github/workflows/doca.yml | 14 ++++++++------ .github/workflows/fatimage.yml | 12 +++++++----- .github/workflows/nightlybuild.yml | 10 ++++++---- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml index 497b8d872..2135de14b 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/doca.yml @@ -35,10 +35,12 @@ jobs: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 build: - - label: RL8 - source_image_name: rocky-latest-RL8 - - label: RL9 - source_image_name: rocky-latest-RL9 + - image_name: openhpc-doca-RL8 + source_image_name: openhpc-RL8 # TODO: needs to be injected from environments/.stackhpc/terraform/cluster_image.auto.tfvars.json + inventory_groups: doca + - image_name: openhpc-doca-RL9 + source_image_name: openhpc-RL9 # TODO: as above + inventory_groups: doca env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -91,8 +93,8 @@ jobs: -on-error=${{ vars.PACKER_ON_ERROR }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ - -var "image_name=openhpc-doca-${{ matrix.build.label }}" \ - -var "inventory_groups=doca" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl - name: Get created image names from manifest diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index aed674862..2760360f4 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -22,10 +22,12 @@ jobs: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 build: - - label: RL8 + - image_name: openhpc-RL8 source_image_name: rocky-latest-RL8 - - label: RL9 + inventory_groups: control,compute,login + - image_name: openhpc-RL9 source_image_name: rocky-latest-RL9 + inventory_groups: control,compute,login env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -78,8 +80,8 @@ jobs: -on-error=${{ vars.PACKER_ON_ERROR }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ - -var "image_name=openhpc-${{ matrix.build.label }}" \ - -var "inventory_groups=control,compute,login" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl - name: Get created image names from manifest @@ -97,7 +99,7 @@ jobs: - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: - name: image-details-${{ matrix.build.label }} + name: image-details-${{ matrix.build.image_name }} path: | ./image-id.txt ./image-name.txt diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 3a58fa46c..7b79a6839 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -24,10 +24,12 @@ jobs: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 build: - - label: RL8 + - image_name: rocky-latest-RL8 source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 - - label: RL9 + inventory_groups: update + - image_name: rocky-latest-RL9 source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: update env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -80,8 +82,8 @@ jobs: -on-error=${{ vars.PACKER_ON_ERROR }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ - -var "image_name=rocky-latest-${{ matrix.build.label }}" \ - -var "inventory_groups=update" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl - name: Get created image names from manifest From 0dc6549044ec86b4084a3d8740d0801897bb3aca Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 10:56:35 +0000 Subject: [PATCH 06/23] WIP: do DOCA CI build on top of current fat image --- .github/workflows/doca.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml index 2135de14b..4e07268cc 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/doca.yml @@ -50,9 +50,19 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Record settings for CI cloud + - name: Load current fat images into GITHUB_ENV + # see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string + run: | + { + echo 'FAT_IMAGES<> "$GITHUB_ENV" + + - name: Record settings run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} + echo FAT_IMAGES: ${FAT_IMAGES} - name: Setup ssh run: | From 1d2213b328e77dc51d5c312855816c67eacc4d2c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 11:01:25 +0000 Subject: [PATCH 07/23] fixup matrix for changes --- .github/workflows/doca.yml | 4 ++-- .github/workflows/fatimage.yml | 2 +- .github/workflows/nightlybuild.yml | 22 ++++++++-------------- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml index 4e07268cc..d0776f3ad 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/doca.yml @@ -28,7 +28,7 @@ jobs: openstack: name: openstack-docabuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.label }} # to branch/PR + OS + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: @@ -122,7 +122,7 @@ jobs: - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: - name: image-details-${{ matrix.build.label }} + name: image-details-${{ matrix.build.image_name }} path: | ./image-id.txt ./image-name.txt diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 2760360f4..ea704bc2d 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -15,7 +15,7 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.label }} # to branch/PR + OS + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 7b79a6839..9187ab4ce 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -17,7 +17,7 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.label }} # to branch/PR + OS + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: @@ -130,15 +130,14 @@ jobs: - SMS - ARCUS build: - - label: RL8 - - label: RL9 + - image_name: rocky-latest-RL8 + - image_name: rocky-latest-RL9 exclude: - target_cloud: LEAFCLOUD env: OS_CLOUD: openstack SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} TARGET_CLOUD: ${{ matrix.target_cloud }} - IMAGE_NAME: "rocky-latest-${{ matrix.build.label }}" steps: - uses: actions/checkout@v2 @@ -153,42 +152,37 @@ jobs: . venv/bin/activate pip install -U pip pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) - shell: bash - name: Write clouds.yaml run: | mkdir -p ~/.config/openstack/ echo "${{ secrets[format('{0}_CLOUDS_YAML', env.SOURCE_CLOUD)] }}" > ~/.config/openstack/source_clouds.yaml echo "${{ secrets[format('{0}_CLOUDS_YAML', env.TARGET_CLOUD)] }}" > ~/.config/openstack/target_clouds.yaml - shell: bash - name: Download source image run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml - openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }} - shell: bash + openstack image save --file ${{ matrix.build.image_name }} ${{ matrix.build.image_name }} - name: Upload to target cloud run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - openstack image create "${{ env.IMAGE_NAME }}" \ - --file "${{ env.IMAGE_NAME }}" \ + openstack image create "${{ matrix.build.image_name }}" \ + --file "${{ matrix.build.image_name }}" \ --disk-format qcow2 \ - shell: bash - name: Delete old latest image from target cloud run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l) + IMAGE_COUNT=$(openstack image list --name ${{ matrix.build.image_name }} -f value -c ID | wc -l) if [ "$IMAGE_COUNT" -gt 1 ]; then - OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1) + OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ matrix.build.image_name }}" -f value -c ID | head -n 1) openstack image delete "$OLD_IMAGE_ID" else echo "Only one image exists, skipping deletion." fi - shell: bash From 4831b9c7ddfc8576a58d679ddb93f1e21a76ffc8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 11:02:49 +0000 Subject: [PATCH 08/23] fix doca workflow typo --- .github/workflows/doca.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml index d0776f3ad..4e1131810 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/doca.yml @@ -55,7 +55,7 @@ jobs: run: | { echo 'FAT_IMAGES<> "$GITHUB_ENV" From 8620c1b8ebf3c70d2365706295d99d59e7722998 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 11:05:53 +0000 Subject: [PATCH 09/23] use current fatimage for doca test build --- .github/workflows/doca.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml index 4e1131810..3cf2045a6 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/doca.yml @@ -36,10 +36,10 @@ jobs: matrix: # build RL8, RL9 build: - image_name: openhpc-doca-RL8 - source_image_name: openhpc-RL8 # TODO: needs to be injected from environments/.stackhpc/terraform/cluster_image.auto.tfvars.json + source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json inventory_groups: doca - image_name: openhpc-doca-RL9 - source_image_name: openhpc-RL9 # TODO: as above + source_image_name_key: RL9 inventory_groups: doca env: ANSIBLE_FORCE_COLOR: True @@ -102,7 +102,7 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \ -var "image_name=${{ matrix.build.image_name }}" \ -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl From ded60c2c049acf60bbc4dcbbb1937545bae2ac7a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 12:27:47 +0000 Subject: [PATCH 10/23] enable fatimage to be used for volume-backed builds --- .github/workflows/fatimage.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index ea704bc2d..da933c91d 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -93,9 +93,16 @@ jobs: sleep 5 done IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" echo $IMAGE_ID > image-id.txt echo $IMAGE_NAME > image-name.txt + - name: Make image usable for further builds + run: | + . venv/bin/activate + openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" + - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: From d694f1de793d6f6f98c5cde3ab93d507cafd8666 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 14:21:53 +0000 Subject: [PATCH 11/23] bump CI image --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 14c997596..5b9d845ef 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241203-1659-b0558b95", - "RL9": "openhpc-RL9-241203-1659-b0558b95" + "RL8": "openhpc-RL8-241211-1322-ded60c2c", + "RL9": "openhpc-RL9-241211-1322-ded60c2c" } } From 1ec02622cec497912cc61f9eb898abec840a4b25 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 14:22:54 +0000 Subject: [PATCH 12/23] doca workflow: clean up image and only run on relevant changes --- .github/workflows/doca.yml | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml index 3cf2045a6..cfd3bb982 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/doca.yml @@ -5,28 +5,18 @@ on: branches: - main paths: - - '**' - - '!dev/**' - - 'dev/setup-env.sh' - - '!docs/**' - - '!README.md' - - '!.gitignore' - - '!.github/workflows/' + - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' - '.github/workflows/doca' pull_request: paths: - - '**' - - '!dev/**' - - 'dev/setup-env.sh' - - '!docs/**' - - '!README.md' - - '!.gitignore' - - '!.github/workflows/' + - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' - '.github/workflows/doca' jobs: - openstack: - name: openstack-docabuild + doca: + name: doca-build concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true @@ -116,9 +106,22 @@ jobs: sleep 5 done IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" echo $IMAGE_ID > image-id.txt echo $IMAGE_NAME > image-name.txt + - name: Make image usable for further builds + run: | + . venv/bin/activate + openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" + + - name: Delete image for automatically-run workflows + run: | + . venv/bin/activate + openstack image delete "${{ steps.manifest.outputs.image-id }}" + if: ${{ github.event_name != 'workflow_dispatch' }} + - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: From 642925c603b2c75df6bc9e56191c4e7a579b6eac Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 14:45:58 +0000 Subject: [PATCH 13/23] remove commented-out code --- packer/openstack.pkr.hcl | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index dbfa0709e..2ba0a1e63 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -185,23 +185,10 @@ source "openstack" "openhpc" { build { - # latest nightly image: - # source "source.openstack.openhpc" { - # name = "rocky-latest" - # image_name = "${source.name}-${var.os_version}" - # } - - # fat image: source "source.openstack.openhpc" { image_name = "${var.image_name}${local.image_name_version}" } - # # Extended site-specific image, built on fat image: - # source "source.openstack.openhpc" { - # name = "openhpc-extra" - # image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" - # } - provisioner "ansible" { playbook_file = "${var.repo_root}/ansible/fatimage.yml" groups = concat(["builder"], var.inventory_groups == "" ? [] : split(",", var.inventory_groups)) From 532da615500cf65b9d85204d8317ea3c93bee7f4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 14:53:28 +0000 Subject: [PATCH 14/23] add DOCA README --- ansible/roles/doca/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 ansible/roles/doca/README.md diff --git a/ansible/roles/doca/README.md b/ansible/roles/doca/README.md new file mode 100644 index 000000000..5f898add5 --- /dev/null +++ b/ansible/roles/doca/README.md @@ -0,0 +1,12 @@ +# doca + +Install [NVIDIA DOCA](https://docs.nvidia.com/doca/sdk/index.html). + +This role is not idempotent and is only intended to be run during an image build. It builds DOCA kernel modules to match the installed kernel and then installs these +plus the selected DOCA packages. + +## Role Variables + +- `doca_version`: Optional. String giving doca version. +- `doca_profile`: Optional. Name of [profile](https://docs.nvidia.com/doca/sdk/nvidia+doca+profiles/index.html) defining subset of DOCA to install. Default is `doca-ofed`. +- `doca_repo_url`: Optional. URL of DOCA repository. Default is appropriate upstream public repository for DOCA version, distro version and architecture. From 80f8a3c218c0c0c78f5892a958a31a3223a4a934 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 15:28:36 +0000 Subject: [PATCH 15/23] fix DOCA role actually running --- ansible/roles/doca/tasks/main.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 ansible/roles/doca/tasks/main.yml diff --git a/ansible/roles/doca/tasks/main.yml b/ansible/roles/doca/tasks/main.yml new file mode 100644 index 000000000..e7a272f38 --- /dev/null +++ b/ansible/roles/doca/tasks/main.yml @@ -0,0 +1 @@ +- include_tasks: install.yml From b2001585d35da9aea8cc6a2aa178135f20077239 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 15:29:01 +0000 Subject: [PATCH 16/23] tidyup DOCA play --- ansible/fatimage.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index b0f428c90..c8cbe7cb2 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -6,6 +6,9 @@ tasks: - name: Report hostname (= final image name) command: hostname + - name: Report inventory groups + debug: + var: group_names - name: Run pre.yml hook vars: @@ -203,9 +206,10 @@ become: yes gather_facts: yes tasks: - - name: Install Mellanox DOCA - include_role: + - name: Install NVIDIA DOCA + import_role: name: doca + - meta: end_here - name: Run post.yml hook vars: From cc4fb92ec2670e42811b7edb4ee7ccee767abe5f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 15:32:18 +0000 Subject: [PATCH 17/23] include doca packages in image summary --- ansible/cleanup.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index cf9b0bdab..3f059d157 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -61,5 +61,10 @@ os: "{{ ansible_distribution }} {{ ansible_distribution_version }}" kernel: "{{ ansible_kernel }}" ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}" + doca: "{{ ansible_facts.packages[doca_profile | default('doca-ofed') ].0.version | default('-') }}" cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}" slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" + +- name: Show image summary + debug: + var: image_info From 64b231e7e78768c5eaa270f5d31038651edc8ff4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 15:33:18 +0000 Subject: [PATCH 18/23] fix squid being selected for any stackhopc build VM --- environments/.stackhpc/inventory/extra_groups | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index 7c9a7c774..f8bdd613b 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -30,4 +30,4 @@ compute [squid:children] # Install squid into fat image -builder +control From f8d6f0ac8dfa4da709c76a7ce60c6224640f07eb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 15:33:40 +0000 Subject: [PATCH 19/23] fix nightly build concurrency --- .github/workflows/nightlybuild.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 9187ab4ce..a0e78cd0b 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -119,7 +119,7 @@ jobs: name: upload-nightly-targets needs: openstack concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.label }}-${{ matrix.target_cloud }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }}-${{ matrix.target_cloud }} cancel-in-progress: true runs-on: ubuntu-22.04 strategy: From 00a1d06b213de9233f25ef14bbe25b945711f0a8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 15:52:14 +0000 Subject: [PATCH 20/23] re-add squid back to Stackhpc builder group --- environments/.stackhpc/inventory/extra_groups | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index f8bdd613b..7c9a7c774 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -30,4 +30,4 @@ compute [squid:children] # Install squid into fat image -control +builder From 5a00228b5eec322384f4185f3581e0d4b9e50977 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 16:39:29 +0000 Subject: [PATCH 21/23] remove debugging exit --- ansible/fatimage.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index c8cbe7cb2..439c50e70 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -209,7 +209,6 @@ - name: Install NVIDIA DOCA import_role: name: doca - - meta: end_here - name: Run post.yml hook vars: From 03c85777ec43f1d65936ceda4e2bcbdfbc40db6c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 11 Dec 2024 17:23:00 +0000 Subject: [PATCH 22/23] update image build docs --- docs/image-build.md | 57 ++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 39 deletions(-) diff --git a/docs/image-build.md b/docs/image-build.md index 4896bde57..b07c27778 100644 --- a/docs/image-build.md +++ b/docs/image-build.md @@ -24,65 +24,44 @@ The steps for building site-specific fat images or extending an existing fat ima ```hcl flavor = "general.v1.small" # VM flavor to use for builder VMs networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to + source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e. starting image + inventory_groups = "control,login,compute" # Comma-separated list of inventory groups to add build VM to, in addition to "builder" group + ``` Note that: - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). - For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`. - - For an example of configuration for extending an existing fat image see below. + - The `control,login,compute` inventory groups mean that the resultant image contains packages for all nodes in the cluster - this produces + a site-specific fat image. 3. Activate the venv and the relevant environment. 4. Build images using the relevant variable definition file, e.g.: cd packer/ - PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - - Note that the `-only` flag here restricts Packer to a single specific "build" definition (in Packer terminology). Options here are: - - `-only=openstack.openhpc`: Build a fat image including Mellanox OFED - - `-only=openstack.openhpc-cuda`: Build a fat image including Mellanox OFED, Nvidia drivers and CUDA - - `-only=openstack.openhpc-extra`: Build an image which *extends* an existing fat image - -5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash. - -# Defining an "extra" image build - -An "extra" image build starts with an existing fat image (e.g. one provided by StackHPC) rather than a RockyLinux GenericCloud image, and only runs a specific subset of the -Ansible in the appliance. This allows adding additional functionality into site-specific images, without modifying the existing functionality in the base fat image. This is the recommended way to build site-specific images. - -To configure an "extra" image build, prepare a Packer variable definition file as described above but also including: - -- `extra_build_image_name`: A string to add into the final image name. -- `source_image` or `source_image_name`: The UUID or name of the fat image to start from (which must already be present in OpenStack). -- `extra_build_groups`: A list of Ansible inventory groups to put the build VM into, in addition to the `builder` group. This defines the roles/functionality - which are added to the image. -- `extra_build_volume_size`: A number giving the size in GB of the volume for the build VM's root disk and therefore the resulting image size. - Note this assumes the default of `use_blockstorage_volume = true`. - -E.g. to add the lustre client to an RockyLinux 9 image: - - # environments/site/lustre.pkvars.hcl + PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - extra_build_image_name = "lustre" # output image name will be like "openhpc-lustre-RL9-$timestamp-$commit" - source_image_name = "openhpc-ofed-RL9-240906-1041-32568dbb" # e.g. current StackHPC RL9 image - extra_build_groups = ["lustre"] # only run lustre role during this extra build - extra_build_volume_size = 15 # default non-CUDA build image size has enough free space + **NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: - # ... define flavor, network, etc as normal + openstack image show $SOURCE_IMAGE + If it does, remove this property: -Then, reference this build and variables file in the Packer build command: + openstack image unset --property signature_verified $SOURCE_IMAGE - PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc-extra --on-error=ask -var-file=environments/site/lustre.pkvars.hcl openstack.pkr.hcl + then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). -**NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: - openstack image show $SOURCE_IMAGE +5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash. -If it does, remove this property: +# Extending an existing image - openstack image unset --property signature_verified $SOURCE_IMAGE +Extending an existing images uses the same process as described above is followed, but the Packer variable definition file should: + - Set `source_image_name` to be an existing fat image (e.g. one provided by StackHPC) rather than a RockyLinux GenericCloud image + - Set `inventory_groups` should only include the additional functionality (= role name), e.g. `lustre` + - Probably set the variable `image_name`, e.g. to `openhpc-lustre` to distinguish it from an existing fat image -then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). +Setting the inventory groups in this way allows adding additional functionality into images from StackHPC, without modifying the existing packages in the image (which have been tested in CI). This is the recommended way to modify images to add site-specific functionality. # Build Process From c88b050b9f50d31b7374da12eed28ae806bb5894 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 12 Dec 2024 11:55:29 +0000 Subject: [PATCH 23/23] update packer docs --- docs/image-build.md | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/docs/image-build.md b/docs/image-build.md index b07c27778..a7d2e951b 100644 --- a/docs/image-build.md +++ b/docs/image-build.md @@ -2,37 +2,38 @@ The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. -The Packer configuration defined here builds "fat images" which contain binaries for all nodes, but no cluster-specific configuration. Using these: +The Packer configuration defined here builds "fat images" which contain packages, binaries and container images but no cluster-specific configuration. Using these: - Enables the image to be tested in CI before production use. - Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). - Improves deployment speed by reducing the number of package downloads to improve deployment speed. -By default, a fat image build starts from a nightly image build containing Mellanox OFED, and updates all DNF packages already present. The 'latest' nightly build itself is from a RockyLinux GenericCloud image. - -The fat images StackHPC builds and test in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: +The fat images StackHPC builds and tests in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: 1. Build site-specific fat images from scratch. -2. Extend an existing fat image with additional software. +2. Extend an existing fat image with additional functionality. # Usage -The steps for building site-specific fat images or extending an existing fat image are the same: +To build either a site-specific fat image from scratch, or to extend an existing StackHPC fat image: 1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). -2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum e.g.: +2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum: ```hcl flavor = "general.v1.small" # VM flavor to use for builder VMs networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e. starting image - inventory_groups = "control,login,compute" # Comma-separated list of inventory groups to add build VM to, in addition to "builder" group + inventory_groups = "control,login,compute" # Additional inventory groups to add build VM to ``` + Note that: - - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). - - For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`. - - The `control,login,compute` inventory groups mean that the resultant image contains packages for all nodes in the cluster - this produces - a site-specific fat image. + - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). + - The flavor used must have sufficent memory for the build tasks, but otherwise does not need to match the final cluster nodes. Usually 8GB is sufficent. By default, the build VM is volume-backed to allow control of the root disk size (and hence final image size) so the flavor disk size does not matter. + - The source image should be either a RockyLinux GenericCloud image for a site-specific image build from scratch, or a StackHPC fat image if extending an existing image. + - The `inventory_groups` variable takes a comma-separated list of Ansible inventory groups to add the build VM to. This is in addition to the `builder` group which it is always added to. This controls which Ansible roles and functionality run during build, and hence what gets added to the image. All possible groups are listed in `environments/common/groups` but common options for this variable will be: + - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch. + - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality. 3. Activate the venv and the relevant environment. @@ -51,18 +52,8 @@ The steps for building site-specific fat images or extending an existing fat ima then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). - 5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash. -# Extending an existing image - -Extending an existing images uses the same process as described above is followed, but the Packer variable definition file should: - - Set `source_image_name` to be an existing fat image (e.g. one provided by StackHPC) rather than a RockyLinux GenericCloud image - - Set `inventory_groups` should only include the additional functionality (= role name), e.g. `lustre` - - Probably set the variable `image_name`, e.g. to `openhpc-lustre` to distinguish it from an existing fat image - -Setting the inventory groups in this way allows adding additional functionality into images from StackHPC, without modifying the existing packages in the image (which have been tested in CI). This is the recommended way to modify images to add site-specific functionality. - # Build Process In summary, Packer creates an OpenStack VM, runs Ansible on that, shuts it down, then creates an image from the root disk.