diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index 6507caf08..4d77c6fb5 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -6,7 +6,7 @@ shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name register: sinfo changed_when: false - until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout) + until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout or "down" in sinfo.stdout) retries: 10 delay: 5 - name: Check nodes have expected slurm state diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index c7c1d4d8c..8b5d5e389 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -2,6 +2,18 @@ See the role README.md +# Changes to image / tofu state + +When a compute group has the `ignore_image_changes` parameter set to true, +changes to the `image_id` parameter (which defaults to `cluster_image_id`) are +ignored by OpenTofu. + +Regardless of whether `ignore_image_changes` is set, OpenTofu templates out the +`image_id` into the Ansible inventory for each compute node. The `compute_init` +role templates out hostvars to the control node, which means the "target" image +ID is then available on the control node. Subsequent work will use this to +rebuild the node via Slurm. 
+ # CI workflow The compute node rebuild is tested in CI after the tests for rebuilding the diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf index a08d8299f..5c11b4bb4 100644 --- a/environments/.stackhpc/tofu/main.tf +++ b/environments/.stackhpc/tofu/main.tf @@ -81,6 +81,7 @@ module "cluster" { nodes: ["compute-0", "compute-1"] flavor: var.other_node_flavor compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"] + # ignore_image_changes: true } # Example of how to add another partition: # extra: { diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf index baf28aaf9..6b2e9aaad 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf @@ -21,6 +21,7 @@ module "compute" { extra_volumes = lookup(each.value, "extra_volumes", {}) compute_init_enable = lookup(each.value, "compute_init_enable", []) + ignore_image_changes = lookup(each.value, "ignore_image_changes", false) key_pair = var.key_pair environment_root = var.environment_root diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/nodes.tf index 9bb75466e..07b9dfe65 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/nodes.tf @@ -9,6 +9,9 @@ locals { # this is a mapping with # keys "compute-0-vol-a", "compute-0-vol-b" ... # values which are a mapping e.g. {"node"="compute-0", "volume"="vol-a"} + + # Workaround for lifecycle meta-argument only taking static values + compute_instances = var.ignore_image_changes ? 
openstack_compute_instance_v2.compute_fixed_image : openstack_compute_instance_v2.compute } resource "openstack_blockstorage_volume_v3" "compute" { @@ -24,7 +27,7 @@ resource "openstack_compute_volume_attach_v2" "compute" { for_each = local.all_compute_volumes - instance_id = openstack_compute_instance_v2.compute["${each.value.node}"].id + instance_id = local.compute_instances["${each.value.node}"].id volume_id = openstack_blockstorage_volume_v3.compute["${each.key}"].id } @@ -48,9 +51,57 @@ resource "openstack_networking_port_v2" "compute" { } } +resource "openstack_compute_instance_v2" "compute_fixed_image" { + + for_each = var.ignore_image_changes ? toset(var.nodes) : [] + + name = "${var.cluster_name}-${each.key}" + image_id = var.image_id + flavor_name = var.flavor + key_pair = var.key_pair + + dynamic "block_device" { + for_each = var.volume_backed_instances ? [1]: [] + content { + uuid = var.image_id + source_type = "image" + destination_type = "volume" + volume_size = var.root_volume_size + boot_index = 0 + delete_on_termination = true + } + } + + network { + port = openstack_networking_port_v2.compute[each.key].id + access_network = true + } + + metadata = merge( + { + environment_root = var.environment_root + k3s_token = var.k3s_token + control_address = var.control_address + }, + {for e in var.compute_init_enable: e => true} + ) + + user_data = <<-EOF + #cloud-config + fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix} + EOF + + lifecycle { + ignore_changes = [ + image_id, + ] + } + +} + resource "openstack_compute_instance_v2" "compute" { - for_each = toset(var.nodes) + for_each = var.ignore_image_changes ? 
[] : toset(var.nodes) name = "${var.cluster_name}-${each.key}" image_id = var.image_id @@ -91,5 +142,5 @@ resource "openstack_compute_instance_v2" "compute" { } output "compute_instances" { - value = openstack_compute_instance_v2.compute + value = local.compute_instances } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/variables.tf index b0e489017..fbb2c73ce 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute/variables.tf @@ -93,4 +93,10 @@ variable "compute_init_enable" { type = list(string) description = "Groups to activate for ansible-init compute rebuilds" default = [] +} + +variable "ignore_image_changes" { + type = bool + description = "Whether to ignore changes to the image_id parameter" + default = false } \ No newline at end of file diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl b/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl index 22642e9a5..812f52825 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl @@ -28,6 +28,7 @@ ${cluster_name}_${group_name}: ${ node.name }: ansible_host: ${node.access_ip_v4} instance_id: ${ node.id } + image_id: ${ node.image_id } %{ endfor ~} %{ endfor ~} diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf index 7b0b695d3..a7fd02609 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf @@ -53,6 +53,7 @@ variable "compute" { vnic_type: Overrides variable vnic_type vnic_profile: Overrides variable vnic_profile compute_init_enable: Toggles compute-init 
rebuild (see compute-init role docs) + ignore_image_changes: Ignore changes to the image_id parameter (see docs/experimental/compute-init.md) volume_backed_instances: Overrides variable volume_backed_instances root_volume_size: Overrides variable root_volume_size extra_volumes: Mapping defining additional volumes to create and attach