From 3ead8d84f383e74431c0204f587147ab00e42483 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Nov 2024 09:34:50 +0000 Subject: [PATCH 01/22] bump OFED to 24.07 --- ansible/roles/ofed/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml index 0d040b55e..46902ded9 100644 --- a/ansible/roles/ofed/defaults/main.yml +++ b/ansible/roles/ofed/defaults/main.yml @@ -1,4 +1,4 @@ -ofed_version: '23.10-3.2.2.0' # LTS +ofed_version: '24.07-0.6.1.0' ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz ofed_distro: rhel # NB: not expected to work on other distros due to installation differences ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9' From 24d1ee08128c8df88a24e1729331f1dddd5e7520 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Nov 2024 09:37:38 +0000 Subject: [PATCH 02/22] allow adding suffix to image names --- packer/openstack.pkr.hcl | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index fae0bf7b2..67b12c61b 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -171,6 +171,12 @@ variable "extra_build_image_name" { default = "extra" } +variable "image_name_suffx" { + type = string + description = "Suffix for all build image names" + default = "" +} + source "openstack" "openhpc" { # Build VM: flavor = var.flavor @@ -207,31 +213,31 @@ build { # latest nightly image: source "source.openstack.openhpc" { name = "rocky-latest" - image_name = "${source.name}-${var.os_version}" + image_name = "${source.name}-${var.os_version}${var.image_name_suffx}" } # latest nightly cuda image: source "source.openstack.openhpc" { name = "rocky-latest-cuda" - image_name = "${source.name}-${var.os_version}" + image_name = "${source.name}-${var.os_version}${var.image_name_suffx}" } # OFED fat image: source "source.openstack.openhpc" { name = "openhpc" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" + image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffx}" } # CUDA fat image: source "source.openstack.openhpc" { name = "openhpc-cuda" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" + image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffx}" } # Extended site-specific image, built on fat image: source "source.openstack.openhpc" { name = "openhpc-extra" - image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" + image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffx}" } provisioner "ansible" { From f693e88dc93fc03dad3dc24492b63ead6271b5e9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Nov 2024 09:39:28 +0000 Subject: [PATCH 03/22] DEBUG: build non-cuda images using OFED24 w/ suffix --- .github/workflows/nightlybuild.yml | 2 +- environments/.stackhpc/LEAFCLOUD.pkrvars.hcl | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 333550c53..5c068635a 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -28,7 +28,7 @@ jobs: - RL9 build: - openstack.rocky-latest - - openstack.rocky-latest-cuda + # - openstack.rocky-latest-cuda exclude: - os_version: RL8 build: openstack.rocky-latest-cuda diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index 5adf4199c..8c05e9151 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -5,3 +5,4 @@ ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] floating_ip_network = "external" +image_name_suffx = "-ofed24" From 64161fbed931be19f184280f151378f48b2e16c8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Nov 2024 13:14:58 +0000 Subject: [PATCH 04/22] fix image name suffix typo --- packer/openstack.pkr.hcl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 67b12c61b..8deb5df33 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -171,7 +171,7 @@ variable "extra_build_image_name" { default = "extra" } -variable "image_name_suffx" { +variable "image_name_suffix" { type = string description = "Suffix for all build image names" default = "" @@ -213,31 +213,31 @@ build { # latest nightly image: source "source.openstack.openhpc" { name = "rocky-latest" - image_name = "${source.name}-${var.os_version}${var.image_name_suffx}" + image_name = "${source.name}-${var.os_version}${var.image_name_suffix}" } # latest nightly cuda image: source "source.openstack.openhpc" { name = "rocky-latest-cuda" - image_name = "${source.name}-${var.os_version}${var.image_name_suffx}" + image_name = "${source.name}-${var.os_version}${var.image_name_suffix}" } # OFED fat image: source "source.openstack.openhpc" { name = "openhpc" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffx}" + image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffix}" } # CUDA fat image: source "source.openstack.openhpc" { name = "openhpc-cuda" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffx}" + image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffix}" } # Extended site-specific image, built on fat image: source "source.openstack.openhpc" { name = "openhpc-extra" - image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffx}" + image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffix}" } provisioner "ansible" { From 986306fe256c59c42fb8b368417e9b14f78c02db Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Nov 2024 13:15:49 +0000 Subject: [PATCH 05/22] fix image name suffix for leafcloud build --- environments/.stackhpc/LEAFCLOUD.pkrvars.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index 8c05e9151..06bc27ebf 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -5,4 +5,4 @@ ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] floating_ip_network = "external" -image_name_suffx = "-ofed24" +image_name_suffix = "-ofed24" From 1bd2bc4843ce1c31b80a68b227023983119a39eb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 5 Nov 2024 14:58:58 +0000 Subject: [PATCH 06/22] WIP: bump CI image to OFED24-based fatimage --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 9f396e964..9e5b2fdfb 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241024-1439-177083b1", - "RL9": "openhpc-RL9-241024-1438-177083b1", - "RL9-cuda": "openhpc-cuda-RL9-241024-1628-177083b1" + "RL8": "openhpc-RL8-241101-1318-986306fe-ofed24", + "RL9": "openhpc-RL9-241101-1318-986306fe-ofed24", + "RL9-cuda": "openhpc-cuda-RL9-241101-1319-986306fe-ofed24" } } \ No newline at end of file From f8d65053f1b0c8828d6241c260004b1cbbff3d44 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 5 Nov 2024 17:09:25 +0000 Subject: [PATCH 07/22] WIP: allow latest build to include branch name --- .github/workflows/nightlybuild.yml | 41 +++++++------- packer/openstack.pkr.hcl | 86 +++++++++--------------------- 2 files changed, 45 insertions(+), 82 deletions(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 5c068635a..c97a18e51 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -22,26 +22,23 @@ jobs: runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails - matrix: # build RL8, RL9, RL9+CUDA versions - os_version: - - RL8 - - RL9 - build: - - openstack.rocky-latest - # - openstack.rocky-latest-cuda + matrix: + source_image_name: + - Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + - Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: + - ["update", "ofed"] + - ["update", "ofed", "cuda"] exclude: - - os_version: RL8 - build: openstack.rocky-latest-cuda - + - source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + inventory_groups: ["update", "ofed", "cuda"] env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} - SOURCE_IMAGES_MAP: | - { - "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2", - "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" - } + # set the image suffix to -latest for cron jobs or a branch name if manually-triggered + IMAGE_NAME_SUFFIX: ${{ github.event_name == 'schedule' && 'latest' || github.ref_name }} + steps: - uses: actions/checkout@v2 @@ -49,6 +46,7 @@ jobs: - name: Record settings for CI cloud run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} + echo IMAGE_NAME_SUFFIX: ${{ env.IMAGE_NAME_SUFFIX }} - name: Setup ssh run: | @@ -85,16 +83,17 @@ jobs: cd packer/ packer init . - PACKER_LOG=1 packer build \ + packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var source_image_name=${{ matrix.source_image_name }} \ + -var image_name_prefix=rocky-${{ matrix.os_version }} \ + -var image_name_suffix=${{ env.IMAGE_NAME_SUFFIX }} \ + -var 'inventory_groups=${{ toJSON(matrix.inventory_groups) }}' \ openstack.pkr.hcl - env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }} + PACKER_LOG: '1' + - name: Get created image names from manifest id: manifest diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 8deb5df33..6fb04986b 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -23,6 +23,7 @@ data "git-commit" "cwd-head" { } locals { git_commit = data.git-commit.cwd-head.hash timestamp = formatdate("YYMMDD-hhmm", timestamp()) + image_name_suffix = var.image_name_suffix == "" ? "${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_name_suffix } # Path pointing to root of repository - automatically set by environment variable PKR_VAR_repo_root @@ -39,12 +40,6 @@ variable "networks" { type = list(string) } -variable "os_version" { - type = string - description = "'RL8' or 'RL9' with default source_image_* mappings" - default = "RL9" -} - # Must supply either source_image_name or source_image_id variable "source_image_name" { type = string @@ -123,19 +118,13 @@ variable "volume_type" { } variable "volume_size" { - type = map(number) - default = { - # fat image builds, GB: - rocky-latest = 15 - rocky-latest-cuda = 30 - openhpc = 15 - openhpc-cuda = 30 - } + type = number + default = 15 # same as default non-CUDA build } -variable "extra_build_volume_size" { +variable "volume_size_cuda" { type = number - default = 15 # same as default non-CUDA build + default = 30 } variable "image_disk_format" { @@ -148,32 +137,32 @@ variable "metadata" { default = {} } -variable "groups" { - type = map(list(string)) - description = "Additional inventory groups (other than 'builder') to add build VM to, keyed by source name" - default = { - # fat image builds: - rocky-latest = ["update", "ofed"] - rocky-latest-cuda = ["update", "ofed", "cuda"] - openhpc = ["control", "compute", "login"] - openhpc-cuda = ["control", "compute", "login"] - } -} - -variable "extra_build_groups" { +variable "inventory_groups" { type = list(string) default = [] } -variable "extra_build_image_name" { +# variable "groups" { +# type = map(list(string)) +# description = "Additional inventory groups (other than 'builder') to add build VM to, keyed by source name" +# default = { + # fat image builds: + # rocky-latest = ["update", "ofed"] + # rocky-latest-cuda = ["update", "ofed", "cuda"] + # openhpc = ["control", "compute", "login"] + # openhpc-cuda = ["control", "compute", "login"] +# } +# } + +variable "image_name_prefix" { type = string - description = "Infix for 'extra' build image name" - default = "extra" + description = "Prefix for built image names" + default = "openhpc" } variable "image_name_suffix" { type = string - description = "Suffix for all build image names" + description = "Suffix for built image names. If not supplied a timestamp+git commit is used" default = "" } @@ -182,7 +171,7 @@ source "openstack" "openhpc" { flavor = var.flavor use_blockstorage_volume = var.use_blockstorage_volume volume_type = var.volume_type - volume_size = lookup(var.volume_size, source.name, var.extra_build_volume_size) + volume_size = contains(var.inventory_groups, "cuda") ? var.volume_size_cuda : var.volume_size metadata = var.metadata instance_metadata = {ansible_init_disable = "true"} networks = var.networks @@ -210,39 +199,14 @@ source "openstack" "openhpc" { build { - # latest nightly image: source "source.openstack.openhpc" { name = "rocky-latest" - image_name = "${source.name}-${var.os_version}${var.image_name_suffix}" - } - - # latest nightly cuda image: - source "source.openstack.openhpc" { - name = "rocky-latest-cuda" - image_name = "${source.name}-${var.os_version}${var.image_name_suffix}" - } - - # OFED fat image: - source "source.openstack.openhpc" { - name = "openhpc" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffix}" - } - - # CUDA fat image: - source "source.openstack.openhpc" { - name = "openhpc-cuda" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffix}" - } - - # Extended site-specific image, built on fat image: - source "source.openstack.openhpc" { - name = "openhpc-extra" - image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}${var.image_name_suffix}" + image_name = "${var.image_name_prefix}-${local.image_name_suffix}" } provisioner "ansible" { playbook_file = "${var.repo_root}/ansible/fatimage.yml" - groups = concat(["builder"], lookup(var.groups, source.name, var.extra_build_groups)) + groups = concat(["builder"], var.inventory_groups) keep_inventory_file = true # for debugging use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting extra_arguments = [ From a9be849f5d912abfb20b01c95bed503bdf53b99f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 5 Nov 2024 19:53:15 +0000 Subject: [PATCH 08/22] simplify packer groups --- .github/workflows/nightlybuild.yml | 4 ++-- packer/openstack.pkr.hcl | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index c97a18e51..9ab347d0f 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -31,7 +31,7 @@ jobs: - ["update", "ofed", "cuda"] exclude: - source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 - inventory_groups: ["update", "ofed", "cuda"] + inventory_groups: 'update,ofed,cuda' env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -89,7 +89,7 @@ jobs: -var source_image_name=${{ matrix.source_image_name }} \ -var image_name_prefix=rocky-${{ matrix.os_version }} \ -var image_name_suffix=${{ env.IMAGE_NAME_SUFFIX }} \ - -var 'inventory_groups=${{ toJSON(matrix.inventory_groups) }}' \ + -var inventory_groups=${{ matrix.inventory_groups }} \ openstack.pkr.hcl env: PACKER_LOG: '1' diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 6fb04986b..8a25185c3 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -138,8 +138,9 @@ variable "metadata" { } variable "inventory_groups" { - type = list(string) - default = [] + type = string + description = "comma-separated list of inventory groups, in addition to 'builder'" + default = '' } # variable "groups" { @@ -157,6 +158,7 @@ variable "inventory_groups" { variable "image_name_prefix" { type = string description = "Prefix for built image names" + # TODO: maybe we can just make this default to the first two parts of the source image name? default = "openhpc" } @@ -171,7 +173,7 @@ source "openstack" "openhpc" { flavor = var.flavor use_blockstorage_volume = var.use_blockstorage_volume volume_type = var.volume_type - volume_size = contains(var.inventory_groups, "cuda") ? var.volume_size_cuda : var.volume_size + volume_size = contains(split(",", var.inventory_groups), "cuda") ? var.volume_size_cuda : var.volume_size metadata = var.metadata instance_metadata = {ansible_init_disable = "true"} networks = var.networks @@ -206,7 +208,7 @@ build { provisioner "ansible" { playbook_file = "${var.repo_root}/ansible/fatimage.yml" - groups = concat(["builder"], var.inventory_groups) + groups = concat(["builder"], split(",", var.inventory_groups)) keep_inventory_file = true # for debugging use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting extra_arguments = [ From fa16e2d16f6d42bf568671198b771321cbed6af6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 5 Nov 2024 21:13:02 +0000 Subject: [PATCH 09/22] swap to structured matrix vars --- .github/workflows/nightlybuild.yml | 28 +++++++++++++++------------- packer/openstack.pkr.hcl | 7 +++---- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 9ab347d0f..2da602d51 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -17,28 +17,31 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.source_image.name }}-${{ matrix.inventory_groups.label }} # to branch/PR + OS + build cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: - source_image_name: - - Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 - - Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + source_image: + - label: RL8 + name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + - label: RL9 + name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 inventory_groups: - - ["update", "ofed"] - - ["update", "ofed", "cuda"] + - label: ofed + list: 'update,ofed' + - label: cuda + list: 'update,ofed,cuda' exclude: - - source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 - inventory_groups: 'update,ofed,cuda' + - source_image: {label: RL8} + inventory_groups: {label: cuda} env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} # set the image suffix to -latest for cron jobs or a branch name if manually-triggered IMAGE_NAME_SUFFIX: ${{ github.event_name == 'schedule' && 'latest' || github.ref_name }} - steps: - uses: actions/checkout@v2 @@ -46,7 +49,6 @@ jobs: - name: Record settings for CI cloud run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} - echo IMAGE_NAME_SUFFIX: ${{ env.IMAGE_NAME_SUFFIX }} - name: Setup ssh run: | @@ -86,10 +88,10 @@ jobs: packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var source_image_name=${{ matrix.source_image_name }} \ - -var image_name_prefix=rocky-${{ matrix.os_version }} \ + -var source_image_name=${{ matrix.source_image.name }} \ + -var image_name=rocky-${{ matrix.source_image.label }}-${{ matrix.inventory_groups.label }} \ -var image_name_suffix=${{ env.IMAGE_NAME_SUFFIX }} \ - -var inventory_groups=${{ matrix.inventory_groups }} \ + -var inventory_groups=${{ matrix.inventory_groups.list }} \ openstack.pkr.hcl env: PACKER_LOG: '1' diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 8a25185c3..6fa9e6c76 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -155,10 +155,9 @@ variable "inventory_groups" { # } # } -variable "image_name_prefix" { +variable "image_name" { type = string - description = "Prefix for built image names" - # TODO: maybe we can just make this default to the first two parts of the source image name? + description = "Built image name" default = "openhpc" } @@ -203,7 +202,7 @@ build { source "source.openstack.openhpc" { name = "rocky-latest" - image_name = "${var.image_name_prefix}-${local.image_name_suffix}" + image_name = "${var.image_name}-${local.image_name_suffix}" } provisioner "ansible" { From c7e9ebad40b7fcb7759e89bef948792008a9be74 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 5 Nov 2024 21:23:50 +0000 Subject: [PATCH 10/22] simplify build spec --- .github/workflows/nightlybuild.yml | 36 +++++++++++++----------------- packer/openstack.pkr.hcl | 8 +++---- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 2da602d51..f73312738 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -17,31 +17,28 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.source_image.name }}-${{ matrix.inventory_groups.label }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.builds.label }} cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: - source_image: - - label: RL8 - name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 - - label: RL9 - name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 - inventory_groups: - - label: ofed - list: 'update,ofed' - - label: cuda - list: 'update,ofed,cuda' - exclude: - - source_image: {label: RL8} - inventory_groups: {label: cuda} + builds: + - label: RL8-ofed + source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + inventory_groups: 'update,ofed' + - label: RL9-ofed + source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: 'update,ofed' + - label: RL9-cuda + source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: 'update,ofed,cuda' env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} # set the image suffix to -latest for cron jobs or a branch name if manually-triggered - IMAGE_NAME_SUFFIX: ${{ github.event_name == 'schedule' && 'latest' || github.ref_name }} + IMAGE_SUFFIX: ${{ github.event_name == 'schedule' && 'latest' || github.ref_name }} steps: - uses: actions/checkout@v2 @@ -88,14 +85,13 @@ jobs: packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var source_image_name=${{ matrix.source_image.name }} \ - -var image_name=rocky-${{ matrix.source_image.label }}-${{ matrix.inventory_groups.label }} \ - -var image_name_suffix=${{ env.IMAGE_NAME_SUFFIX }} \ - -var inventory_groups=${{ matrix.inventory_groups.list }} \ + -var source_image_name=${{ matrix.builds.source_image_name }} \ + -var image_name=${{ matrix.builds.label }} \ + -var image_version=${{ env.IMAGE_SUFFIX }} \ + -var inventory_groups=${{ matrix.builds.inventory_groups }} \ openstack.pkr.hcl env: PACKER_LOG: '1' - - name: Get created image names from manifest id: manifest diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 6fa9e6c76..6c1f7d12f 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -23,7 +23,7 @@ data "git-commit" "cwd-head" { } locals { git_commit = data.git-commit.cwd-head.hash timestamp = formatdate("YYMMDD-hhmm", timestamp()) - image_name_suffix = var.image_name_suffix == "" ? "${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_name_suffix + image_version = var.image_version == "" ? "${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_version } # Path pointing to root of repository - automatically set by environment variable PKR_VAR_repo_root @@ -140,7 +140,7 @@ variable "metadata" { variable "inventory_groups" { type = string description = "comma-separated list of inventory groups, in addition to 'builder'" - default = '' + default = "" } # variable "groups" { @@ -161,7 +161,7 @@ variable "image_name" { default = "openhpc" } -variable "image_name_suffix" { +variable "image_version" { type = string description = "Suffix for built image names. If not supplied a timestamp+git commit is used" default = "" @@ -202,7 +202,7 @@ build { source "source.openstack.openhpc" { name = "rocky-latest" - image_name = "${var.image_name}-${local.image_name_suffix}" + image_name = "${var.image_name}-${local.image_version}" } provisioner "ansible" { From 1caa2882546a3476b6f68d141d9fdf78ca49114e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 08:19:01 +0000 Subject: [PATCH 11/22] make built image name/version smarter --- packer/openstack.pkr.hcl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 6c1f7d12f..816dd1207 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -23,7 +23,8 @@ data "git-commit" "cwd-head" { } locals { git_commit = data.git-commit.cwd-head.hash timestamp = formatdate("YYMMDD-hhmm", timestamp()) - image_version = var.image_version == "" ? "${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_version + image_version = var.image_version == "auto" ? "${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_version + image_name = replace(var.image_name, '/', '-') } # Path pointing to root of repository - automatically set by environment variable PKR_VAR_repo_root @@ -157,14 +158,14 @@ variable "inventory_groups" { variable "image_name" { type = string - description = "Built image name" + description = "Built image name. Any '/' are replaced with '-'." default = "openhpc" } variable "image_version" { type = string - description = "Suffix for built image names. If not supplied a timestamp+git commit is used" - default = "" + description = "Suffix for built image names. Default special value 'auto' uses a timestamp+git commit" + default = "auto" } source "openstack" "openhpc" { @@ -202,7 +203,7 @@ build { source "source.openstack.openhpc" { name = "rocky-latest" - image_name = "${var.image_name}-${local.image_version}" + image_name = join("-", [local.image_name, local.image_version]) } provisioner "ansible" { From 9bcc95fdaad53c7975f5220af58f965bae526065 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 08:57:47 +0000 Subject: [PATCH 12/22] only scan nightly builds when run on schedule --- .github/workflows/nightlybuild.yml | 71 ++++++++++++++++-------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index f73312738..6c38544fa 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -1,3 +1,5 @@ +# NB: When run via workflow_dispatch, image scanning and distribution to other clouds does not happen +# on the basis that in this case a fatimage must be built and will be scanned. name: Build nightly image on: workflow_dispatch: @@ -14,8 +16,8 @@ on: - cron: '0 0 * * *' # Run at midnight jobs: - openstack: - name: openstack-imagebuild + build: + name: nightly-imagebuild concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.builds.label }} cancel-in-progress: true @@ -37,8 +39,7 @@ jobs: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} - # set the image suffix to -latest for cron jobs or a branch name if manually-triggered - IMAGE_SUFFIX: ${{ github.event_name == 'schedule' && 'latest' || github.ref_name }} + IMAGE_VERSION: ${{ github.event_name == 'schedule' && 'latest' || github.ref_name }} steps: - uses: actions/checkout@v2 @@ -87,7 +88,7 @@ jobs: -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var source_image_name=${{ matrix.builds.source_image_name }} \ -var image_name=${{ matrix.builds.label }} \ - -var image_version=${{ env.IMAGE_SUFFIX }} \ + -var image_version=${{ env.IMAGE_VERSION }} \ -var inventory_groups=${{ matrix.builds.inventory_groups }} \ openstack.pkr.hcl env: @@ -102,10 +103,12 @@ jobs: sleep 5 done IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo image: ${IMAGE_NAME} ${IMAGE_ID} echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" - name: Download image + if: github.event_name == 'schedule' run: | . venv/bin/activate sudo mkdir /mnt/images @@ -114,20 +117,23 @@ jobs: openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-id }} - name: Set up QEMU + if: github.event_name == 'schedule' uses: docker/setup-qemu-action@v3 - - name: install libguestfs + - name: Install libguestfs run: | sudo apt -y update sudo apt -y install libguestfs-tools + if: github.event_name == 'schedule' - - name: mkdir for mount - run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' - - - name: mount qcow2 file - run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' - + - name: Mount image + if: github.event_name == 'schedule' + run: | + sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' + sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' + - name: Run Trivy vulnerability scanner + if: github.event_name == 'schedule' uses: aquasecurity/trivy-action@0.17.0 with: scan-type: fs @@ -140,12 +146,14 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Upload Trivy scan results to GitHub Security tab + if: github.event_name == 'schedule' uses: github/codeql-action/upload-sarif@v3 with: sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif" - category: "${{ matrix.os_version }}-${{ matrix.build }}" + category: "${{ matrix.build.label }}" - name: Fail if scan has CRITICAL vulnerabilities + if: github.event_name == 'schedule' uses: aquasecurity/trivy-action@0.16.1 with: scan-type: fs @@ -157,16 +165,17 @@ jobs: ignore-unfixed: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Delete new image if Trivy scan fails - if: failure() && steps.packer_build.outcome == 'success' # Runs if the Trivy scan found crit vulnerabilities or failed + if: github.event_name == 'schedule' && failure() && steps.packer_build.outcome == 'success' # Runs if the Trivy scan found crit vulnerabilities or failed run: | . venv/bin/activate echo "Deleting new image due to critical vulnerabilities or scan failure ..." openstack image delete "${{ steps.manifest.outputs.image-id }}" - - name: Delete old latest image - if: success() # Runs only if Trivy scan passed + - name: Delete old image + if: github.event_name == 'schedule' run: | . venv/bin/activate IMAGE_COUNT=$(openstack image list --name ${{ steps.manifest.outputs.image-name }} -f value -c ID | wc -l) @@ -180,9 +189,10 @@ jobs: upload: name: upload-nightly-targets - needs: openstack + needs: build + if: github.event_name == 'schedule' concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.builds.label }}-${{ matrix.target_cloud }} cancel-in-progress: true runs-on: ubuntu-22.04 strategy: @@ -192,21 +202,16 @@ jobs: - LEAFCLOUD - SMS - ARCUS - os_version: - - RL8 - - RL9 - image: - - rocky-latest - - rocky-latest-cuda + builds: + - image: RL8-ofed-latest + - image: RL9-ofed-latest + - image: RL9-cuda-latest exclude: - - os_version: RL8 - image: rocky-latest-cuda - - target_cloud: LEAFCLOUD + - target_cloud: LEAFCLOUD # why?? Should this not be source_cloud/vars.CI_CLOUD env: OS_CLOUD: openstack SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} TARGET_CLOUD: ${{ matrix.target_cloud }} - IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}" steps: - uses: actions/checkout@v2 @@ -234,7 +239,7 @@ jobs: run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml - openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }} + openstack image save --file ${{ matrix.builds.image }} ${{ matrix.builds.image }} shell: bash - name: Upload to target cloud @@ -242,8 +247,8 @@ jobs: . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - openstack image create "${{ env.IMAGE_NAME }}" \ - --file "${{ env.IMAGE_NAME }}" \ + openstack image create "${{ matrix.builds.image }}" \ + --file "${{ matrix.builds.image }}" \ --disk-format qcow2 \ shell: bash @@ -252,9 +257,9 @@ jobs: . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l) + IMAGE_COUNT=$(openstack image list --name ${{ matrix.builds.image }} -f value -c ID | wc -l) if [ "$IMAGE_COUNT" -gt 1 ]; then - OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1) + OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ matrix.builds.image }}" -f value -c ID | head -n 1) openstack image delete "$OLD_IMAGE_ID" else echo "Only one image exists, skipping deletion." From e662e2880bb0955d8dc3b57fba651cf11b1e7336 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 09:12:14 +0000 Subject: [PATCH 13/22] fix sanitising of image name --- packer/openstack.pkr.hcl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 816dd1207..d3a0e24e0 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -24,7 +24,6 @@ locals { git_commit = data.git-commit.cwd-head.hash timestamp = formatdate("YYMMDD-hhmm", timestamp()) image_version = var.image_version == "auto" ? "${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_version - image_name = replace(var.image_name, '/', '-') } # Path pointing to root of repository - automatically set by environment variable PKR_VAR_repo_root @@ -158,7 +157,7 @@ variable "inventory_groups" { variable "image_name" { type = string - description = "Built image name. Any '/' are replaced with '-'." + description = "Built image name." default = "openhpc" } @@ -203,7 +202,7 @@ build { source "source.openstack.openhpc" { name = "rocky-latest" - image_name = join("-", [local.image_name, local.image_version]) + image_name = replace(join("-", [var.image_name, local.image_version]), "/", "-") } provisioner "ansible" { From 9e717a95998c4468cede4a5a06986cb315e7a7b2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 09:33:30 +0000 Subject: [PATCH 14/22] swap no-scan logic for nightlybuild to branch, not event --- .github/workflows/nightlybuild.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 6c38544fa..bd97fcb8c 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -1,4 +1,4 @@ -# NB: When run via workflow_dispatch, image scanning and distribution to other clouds does not happen +# NB: When run in a non-main branch (via workflow_dispatch), image scanning and distribution to other clouds does not happen # on the basis that in this case a fatimage must be built and will be scanned. name: Build nightly image on: @@ -108,7 +108,7 @@ jobs: echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" - name: Download image - if: github.event_name == 'schedule' + if: github.ref_name == 'main' run: | . venv/bin/activate sudo mkdir /mnt/images @@ -117,23 +117,23 @@ jobs: openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-id }} - name: Set up QEMU - if: github.event_name == 'schedule' + if: github.ref_name == 'main' uses: docker/setup-qemu-action@v3 - name: Install libguestfs run: | sudo apt -y update sudo apt -y install libguestfs-tools - if: github.event_name == 'schedule' + if: github.ref_name == 'main' - name: Mount image - if: github.event_name == 'schedule' + if: github.ref_name == 'main' run: | sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' - name: Run Trivy vulnerability scanner - if: github.event_name == 'schedule' + if: github.ref_name == 'main' uses: aquasecurity/trivy-action@0.17.0 with: scan-type: fs @@ -146,14 +146,14 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Upload Trivy scan results to GitHub Security tab - if: github.event_name == 'schedule' + if: github.ref_name == 'main' uses: github/codeql-action/upload-sarif@v3 with: sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif" category: "${{ matrix.build.label }}" - name: Fail if scan has CRITICAL vulnerabilities - if: github.event_name == 'schedule' + if: github.ref_name == 'main' uses: aquasecurity/trivy-action@0.16.1 with: scan-type: fs @@ -168,14 +168,14 @@ jobs: - name: Delete new image if Trivy scan fails - if: github.event_name == 'schedule' && failure() && steps.packer_build.outcome == 'success' # Runs if the Trivy scan found crit vulnerabilities or failed + if: github.ref_name == 'main' && failure() && steps.packer_build.outcome == 'success' # Runs if the Trivy scan found crit vulnerabilities or failed run: | . venv/bin/activate echo "Deleting new image due to critical vulnerabilities or scan failure ..." openstack image delete "${{ steps.manifest.outputs.image-id }}" - name: Delete old image - if: github.event_name == 'schedule' + if: github.ref_name == 'main' run: | . venv/bin/activate IMAGE_COUNT=$(openstack image list --name ${{ steps.manifest.outputs.image-name }} -f value -c ID | wc -l) @@ -190,7 +190,7 @@ jobs: upload: name: upload-nightly-targets needs: build - if: github.event_name == 'schedule' + if: github.ref_name == 'main' concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.builds.label }}-${{ matrix.target_cloud }} cancel-in-progress: true From 1f393e378abe4ba177256b4d11ccc218b2e65842 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 10:14:33 +0000 Subject: [PATCH 15/22] update fatimage for new packer/flow --- .github/workflows/fatimage.yml | 57 ++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 947f9410f..35f017b43 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -21,31 +21,20 @@ jobs: strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions - os_version: - - RL8 - - RL9 - build: - - openstack.openhpc - - openstack.openhpc-cuda - exclude: - - os_version: RL8 - build: openstack.openhpc-cuda + builds: + - label: openhpc-RL8-ofed + source_image_name: RL8-ofed + inventory_groups: 'control,login,compute' + - label: openhpc-RL9-ofed + source_image_name: RL9-ofed + inventory_groups: 'control,login,compute' + - label: openhpc-RL9-cuda + source_image_name: RL9-cuda + inventory_groups: 'control,login,compute' env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud }} - SOURCE_IMAGES_MAP: | - { - "RL8": { - "openstack.openhpc": "rocky-latest-RL8", - "openstack.openhpc-cuda": "rocky-latest-cuda-RL8" - }, - "RL9": { - "openstack.openhpc": "rocky-latest-RL9", - "openstack.openhpc-cuda": "rocky-latest-cuda-RL9" - } - } - steps: - uses: actions/checkout@v2 @@ -79,6 +68,20 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate + - name: Select branch-specific or latest nightly image + id: select_source_image + run: | + . venv/bin/activate + . environments/.stackhpc/activate + BRANCH=${{ github.ref_name }} + BRANCH_VERSION=${BRANCH//\/-} + NIGHTLY_IMAGE_ID=$( \ + openstack image show -c id -f value ${{ matrix.builds.source_image_name }}-${BRANCH_VERSION} || \ + openstack image show -c id -f value ${{ matrix.builds.source_image_name }}-latest \ + ) + echo selected source_image $NIGHTLY_IMAGE_ID: $(openstack image show -c name -f value $NIGHTLY_IMAGE_ID) + echo "source_image_id=$NIGHTLY_IMAGE_ID" >> "$GITHUB_OUTPUT" + - name: Build fat image with packer id: packer_build run: | @@ -88,15 +91,15 @@ jobs: cd packer/ packer init . - PACKER_LOG=1 packer build \ + packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var source_image=${{ steps.select_source_image.outputs.source_image_id }} + -var image_name=${{ matrix.builds.label }} \ + -var inventory_groups=${{ matrix.builds.inventory_groups }} \ openstack.pkr.hcl env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }} + PACKER_LOG: '1' - name: Get created image names from manifest id: manifest @@ -113,7 +116,7 @@ jobs: - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: - name: image-details-${{ matrix.build }}-${{ matrix.os_version }} + name: image-details-${{ matrix.builds.label }} path: | ./image-id.txt ./image-name.txt From be002ea0556880773c6efd6b611a30723df83206 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 10:21:55 +0000 Subject: [PATCH 16/22] fix branch name sanitising in fatimage --- .github/workflows/fatimage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 35f017b43..148d018cc 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -74,7 +74,7 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate BRANCH=${{ github.ref_name }} - BRANCH_VERSION=${BRANCH//\/-} + BRANCH_VERSION=${BRANCH//\//-} # replace '/' with '-' using bash parameter expansion NIGHTLY_IMAGE_ID=$( \ openstack image show -c id -f value ${{ matrix.builds.source_image_name }}-${BRANCH_VERSION} || \ openstack image show -c id -f value ${{ matrix.builds.source_image_name }}-latest \ From 5835cd28a166a5404457598aceec0e0cd0a32dca Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 10:23:55 +0000 Subject: [PATCH 17/22] fix fatimage concurrency --- .github/workflows/fatimage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 148d018cc..36c26fc33 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -15,7 +15,7 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.builds.label }} cancel-in-progress: true runs-on: ubuntu-22.04 strategy: From a0b3dc4361367e495297023852a0f074ddd47c79 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 10:32:34 +0000 Subject: [PATCH 18/22] move image finalisation into role --- ansible/.gitignore | 2 + ansible/fatimage.yml | 6 +- ansible/roles/builder/defaults/main.yml | 1 + ansible/roles/builder/tasks/finalise.yml | 75 ++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 ansible/roles/builder/defaults/main.yml create mode 100644 ansible/roles/builder/tasks/finalise.yml diff --git a/ansible/.gitignore b/ansible/.gitignore index f6f5c5f4d..c8296cc8c 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -60,3 +60,5 @@ roles/* !roles/tuned/** !roles/lustre/ !roles/lustre/** +!roles/builder/ +!roles/builder/** diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 7cad2dc59..3d54723c9 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -207,8 +207,10 @@ gather_facts: yes tags: finalise tasks: - - name: Cleanup image - import_tasks: cleanup.yml + - name: Finalise image + import_role: + name: builder + tasks_from: finalise.yml - name: Shutdown Packer VM community.general.shutdown: diff --git a/ansible/roles/builder/defaults/main.yml b/ansible/roles/builder/defaults/main.yml new file mode 100644 index 000000000..f7a647423 --- /dev/null +++ b/ansible/roles/builder/defaults/main.yml @@ -0,0 +1 @@ +builder_delete_syslog: true \ No newline at end of file diff --git a/ansible/roles/builder/tasks/finalise.yml b/ansible/roles/builder/tasks/finalise.yml new file mode 100644 index 000000000..5f46f1161 --- /dev/null +++ b/ansible/roles/builder/tasks/finalise.yml @@ -0,0 +1,75 @@ +# Finalise a Packer build VM + +- meta: flush_handlers + +- name: Remove dnf caches + command: dnf clean all + +# If image build happens on a Neutron subnet with property dns_namservers defined, then cloud-init +# disables NetworkManager's control of /etc/resolv.conf and appends nameservers itself. +# We don't want network configuration during instance boot to depend on the configuration +# of the network the builder was on, so we reset these aspects. +- name: Delete /etc/resolv.conf + file: + path: /etc/resolv.conf + state: absent + when: "'resolv_conf' not in group_names" # if its been overriden, deleting it is the wrong thing to do + +- name: Reenable NetworkManager control of resolv.conf + # NB: This *doesn't* delete the 90-dns-none.conf file created by the resolv_conf role + # as if nameservers are explicitly being set by that role we don't want to allow NM + # to override it again. + file: + path: /etc/NetworkManager/conf.d/99-cloud-init.conf + state: absent + +- name: Get remote environment for ansible_user + setup: + gather_subset: env + become: no + +- name: Delete any injected ssh config for ansible_user + file: + path: "{{ ansible_env.HOME }}/.ssh/" + state: absent + +- name: Run cloud-init cleanup + command: cloud-init clean --logs --seed + +- name: Cleanup /tmp + command : rm -rf /tmp/* + +- name: Get package facts + package_facts: + +- name: Ensure image summary directory exists + file: + path: /var/lib/image/ + state: directory + owner: root + group: root + mode: u=rwX,go=rX + +- name: Write image summary + copy: + content: "{{ image_info | to_nice_json }}" + dest: /var/lib/image/image.json + vars: + image_info: + branch: "{{ lookup('pipe', 'git rev-parse --abbrev-ref HEAD') }}" + build: "{{ ansible_nodename | split('.') | first }}" # hostname is image name, which contains build info + os: "{{ ansible_distribution }} {{ ansible_distribution_version }}" + kernel: "{{ ansible_kernel }}" + ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}" + cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}" + slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" + ondemand: "{{ ansible_facts.packages['ondemand'].0.version | default('-') }}" + +- name: Clear system logs + file: + path: /var/log/messages + state: absent + when: "{{ builder_delete_syslog | bool }}" + +- name: Shutdown Packer VM + community.general.shutdown: From 8bf014bf77f5031e64b271e1a8628eb312067c2c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 10:33:48 +0000 Subject: [PATCH 19/22] fix fatimage packer CLI args --- .github/workflows/fatimage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 36c26fc33..e15937099 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -94,7 +94,7 @@ jobs: packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var source_image=${{ steps.select_source_image.outputs.source_image_id }} + -var source_image=${{ steps.select_source_image.outputs.source_image_id }} \ -var image_name=${{ matrix.builds.label }} \ -var inventory_groups=${{ matrix.builds.inventory_groups }} \ openstack.pkr.hcl From a9e9afd0b81de59316ff6646ffee58aab362b43a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 11:15:34 +0000 Subject: [PATCH 20/22] fix signature verification on nightly image --- .github/workflows/nightlybuild.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index bd97fcb8c..ecad5fcee 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -94,7 +94,7 @@ jobs: env: PACKER_LOG: '1' - - name: Get created image names from manifest + - name: Get image info and ensure it can be used for subsequent builds id: manifest run: | . venv/bin/activate @@ -106,6 +106,7 @@ jobs: echo image: ${IMAGE_NAME} ${IMAGE_ID} echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" + openstack image unset --property signature_verified $IMAGE_ID - name: Download image if: github.ref_name == 'main' @@ -113,7 +114,6 @@ jobs: . venv/bin/activate sudo mkdir /mnt/images sudo chmod 777 /mnt/images - openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-id }} - name: Set up QEMU From 838d67d6dac41da2467a883ddb6396acfcaaa6c8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 15:06:25 +0000 Subject: [PATCH 21/22] add checks for ofed packages --- ansible/fatimage.yml | 4 ++++ ansible/roles/builder/tasks/checks.yml | 29 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 ansible/roles/builder/tasks/checks.yml diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 3d54723c9..0b0577289 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -207,6 +207,10 @@ gather_facts: yes tags: finalise tasks: + - name: Carry out checks on image + import_role: + name: builder + tasks_from: checks.yml - name: Finalise image import_role: name: builder diff --git a/ansible/roles/builder/tasks/checks.yml b/ansible/roles/builder/tasks/checks.yml new file mode 100644 index 000000000..4452c384c --- /dev/null +++ b/ansible/roles/builder/tasks/checks.yml @@ -0,0 +1,29 @@ +- name: Check whether OFED is installed + command: ofed_info + changed_when: false + failed_when: + - _ofed_info.rc > 0 + - "'No such file or directory' not in _ofed_info.msg" + register: _ofed_info + +- name: Get package facts + package_facts: + +- name: Check e.g. libfabric package hasn't downgraded OFED-installed packages + assert: + that: "'mlnx' in ansible_facts.packages[item].0.version" + fail_msg: "OFED is installed but package {{ item }} has a non-OFED version: {{ ansible_facts.packages[item].0.version }}" + when: "'MLNX_OFED_LINUX-' in _ofed_info.stdout" + loop: "{{ builder_ofed_check_packages }}" + vars: + builder_ofed_check_packages: + - ibacm + - infiniband-diags + - libibumad + - libibverbs + - libibverbs-utils + - librdmacm + - librdmacm-utils + - rdma-core-devel + - rdma-core # didn't actually see this one get downgraded + From cfafed76c34be1b2d3f8c2c2746d776e565ad3b4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Nov 2024 15:08:26 +0000 Subject: [PATCH 22/22] try not removing syslog during build --- ansible/roles/builder/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/builder/defaults/main.yml b/ansible/roles/builder/defaults/main.yml index f7a647423..605761c04 100644 --- a/ansible/roles/builder/defaults/main.yml +++ b/ansible/roles/builder/defaults/main.yml @@ -1 +1 @@ -builder_delete_syslog: true \ No newline at end of file +builder_delete_syslog: false \ No newline at end of file