From ab65245a5a5ac4e2436ec7cd0c7437e92f7c99c9 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Tue, 10 Sep 2024 13:48:55 +0100
Subject: [PATCH 001/268] added k3s installation to bootstrap

---
 ansible/bootstrap.yml                | 55 ++++++++++++++++++++++++++++
 environments/common/inventory/groups | 15 +++++++-
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml
index e8e2713a5..c43e8f01e 100644
--- a/ansible/bootstrap.yml
+++ b/ansible/bootstrap.yml
@@ -1,5 +1,60 @@
 ---
 
+- hosts: k3s
+  become: yes
+  tasks:
+    - name: Download k3s install script
+      ansible.builtin.get_url:
+        url: https://get.k3s.io/
+        timeout: 120
+        dest: /usr/local/bin/k3s-install.sh
+        owner: root
+        group: root
+        mode: "0755"
+
+- hosts: k3s_server
+  become: yes
+  tasks:
+    - name: Install k3s server
+      ansible.builtin.shell:
+        cmd: /usr/local/bin/k3s-install.sh
+      environment:
+        INSTALL_K3S_VERSION: "v1.31.0+k3s1"
+        INSTALL_K3S_EXEC: "server"
+        INSTALL_K3S_SKIP_START: true
+      changed_when: true
+
+- hosts: k3s_agent
+  become: yes
+  tasks:
+    - name: Install k3s agent
+      ansible.builtin.shell:
+        cmd: /usr/local/bin/k3s-install.sh
+      environment:
+        INSTALL_K3S_VERSION: "v1.31.0+k3s1"
+        INSTALL_K3S_EXEC: "agent"
+        INSTALL_K3S_SKIP_START: true
+      changed_when: true
+
+- hosts: k3s
+  become: yes
+  tasks:
+    - name: Creating directory on root path
+      ansible.builtin.file:
+        path: /root/bin
+        state: directory
+        owner: root
+    - name: Adding symlinks to k3s binaries for root
+      ansible.builtin.file:
+        src: /usr/local/bin/k3s
+        dest: "/root/bin/{{ item }}"
+        state: link
+        owner: root
+      with_items:
+        - k3s
+        - kubectl
+
+
 - hosts: cluster
   gather_facts: false
   become: yes
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index ea0bebebc..8202d109f 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -134,4 +134,17 @@ freeipa_client
 # Hosts to run TuneD configuration
 
 [ansible_init]
-# Hosts to run linux-anisble-init
\ No newline at end of file
+# Hosts to run linux-anisble-init
+
+[k3s_server]
+# Host to run k3s server
+control
+
+[k3s_agent]
+# Hosts to run as k3s agents
+login
+compute
+
+[k3s:children]
+k3s_server
+k3s_agent

From 321fdb4bf0333e6867ad210820e8a4c096309ca6 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Tue, 10 Sep 2024 15:17:22 +0100
Subject: [PATCH 002/268] added k3s token to terraform

---
 .../{{cookiecutter.environment}}/terraform/compute.tf        | 1 +
 .../{{cookiecutter.environment}}/terraform/compute/nodes.tf  | 1 +
 .../terraform/compute/variables.tf                           | 5 +++++
 .../skeleton/{{cookiecutter.environment}}/terraform/nodes.tf | 2 ++
 .../{{cookiecutter.environment}}/terraform/variables.tf      | 5 +++++
 5 files changed, 14 insertions(+)

diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf
index c8907c836..a1b33aa78 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf
@@ -15,5 +15,6 @@ module "compute" {
   vnic_profile = lookup(each.value, "vnic_profile", var.vnic_profile)
   key_pair = var.key_pair
   environment_root = var.environment_root
+  k3s_token = var.k3s_token
   security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id]
 }
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf 
b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index 006f802c7..7c3f6b1f6 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -46,6 +46,7 @@ resource "openstack_compute_instance_v2" "compute" { metadata = { environment_root = var.environment_root + k3s_token = var.k3s_token } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index 5696a94eb..2f6f17cb9 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -67,3 +67,8 @@ variable "root_volume_size" { variable "security_group_ids" { type = list } + +variable "k3s_token" { + description = "Random cryptographically secure string for K3s token" + type = string +} diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf index 376d3da0e..26c305430 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf @@ -76,6 +76,7 @@ resource "openstack_compute_instance_v2" "control" { metadata = { environment_root = var.environment_root + k3s_token = var.k3s_token } user_data = <<-EOF @@ -124,6 +125,7 @@ resource "openstack_compute_instance_v2" "login" { metadata = { environment_root = var.environment_root + k3s_token = var.k3s_token } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 289de3fef..7bf80472b 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -131,3 +131,8 @@ variable "root_volume_size" { type = number default = 40 } + +variable "k3s_token" { + description = "Random cryptographically secure string for K3s token" + type = string +} From 915c4dd61f09b15cb7f610f3e02c871744a9d6df Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 11 Sep 2024 16:13:57 +0100 Subject: [PATCH 003/268] added role to install playbooks for ansible-init --- ansible/bootstrap.yml | 11 ++++++++-- ansible/roles/k3s/files/start_k3s_agent.yml | 21 +++++++++++++++++++ ansible/roles/k3s/files/start_k3s_server.yml | 19 +++++++++++++++++ ansible/roles/k3s/tasks/main.yml | 18 ++++++++++++++++ environments/common/inventory/groups | 4 ++-- .../terraform/compute/nodes.tf | 1 + .../terraform/nodes.tf | 1 + 7 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 ansible/roles/k3s/files/start_k3s_agent.yml create mode 100644 ansible/roles/k3s/files/start_k3s_server.yml create mode 100644 ansible/roles/k3s/tasks/main.yml diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index c43e8f01e..95299d5c8 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -21,7 +21,8 @@ environment: INSTALL_K3S_VERSION: "v1.31.0+k3s1" INSTALL_K3S_EXEC: "server" - INSTALL_K3S_SKIP_START: true + INSTALL_K3S_SKIP_START: "true" + INSTALL_K3S_SKIP_ENABLE: "true" changed_when: true - hosts: k3s_agent @@ -33,7 +34,8 @@ environment: INSTALL_K3S_VERSION: "v1.31.0+k3s1" INSTALL_K3S_EXEC: "agent" - 
INSTALL_K3S_SKIP_START: true + INSTALL_K3S_SKIP_START: "true" + INSTALL_K3S_SKIP_ENABLE: "true" changed_when: true - hosts: k3s @@ -54,6 +56,11 @@ - k3s - kubectl +- hosts: k3s + become: yes + tasks: + - include_role: + name: k3s - hosts: cluster gather_facts: false diff --git a/ansible/roles/k3s/files/start_k3s_agent.yml b/ansible/roles/k3s/files/start_k3s_agent.yml new file mode 100644 index 000000000..3f34b0b66 --- /dev/null +++ b/ansible/roles/k3s/files/start_k3s_agent.yml @@ -0,0 +1,21 @@ +- hosts: localhost + become: true + vars: + os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" + k3s_token: "{{ os_metadata.meta.k3s_token }}" + k3s_server_name: "{{ os_metadata.meta.k3s_server }}" + tasks: + - name: Add the token for joining the cluster to the environment + no_log: true # avoid logging the server token + ansible.builtin.lineinfile: + path: "/etc/systemd/system/k3s-agent.service.env" + line: "{{ item }}" + with_items: + - "K3S_TOKEN={{ k3s_token }}" + - "K3S_URL=https://{{ k3s_server_name }}:6443" + - name: Start k3s service + ansible.builtin.systemd: + name: k3s-agent + daemon_reload: true + state: started + enabled: true diff --git a/ansible/roles/k3s/files/start_k3s_server.yml b/ansible/roles/k3s/files/start_k3s_server.yml new file mode 100644 index 000000000..7272a0059 --- /dev/null +++ b/ansible/roles/k3s/files/start_k3s_server.yml @@ -0,0 +1,19 @@ +- hosts: localhost + become: true + vars: + os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" + k3s_token: "{{ os_metadata.meta.k3s_token }}" + tasks: + - name: Add the token for joining the cluster to the environment + no_log: true # avoid logging the server token + ansible.builtin.lineinfile: + path: "/etc/systemd/system/k3s.service.env" + line: "{{ item }}" + with_items: + - "K3S_TOKEN={{ k3s_token }}" + - name: Start k3s service + ansible.builtin.systemd: + name: k3s + daemon_reload: true + state: started + enabled: true diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml new file mode 100644 index 000000000..a84a91ee3 --- /dev/null +++ b/ansible/roles/k3s/tasks/main.yml @@ -0,0 +1,18 @@ +--- + +- name: Create ansible-init playbook install directory + file: + path: "/etc/ansible-init/playbooks" + state: directory + +- name: Install k3s server ansible-init playbook + copy: + src: start_k3s_server.yml + dest: /etc/ansible-init/playbooks/0-start-k3s.yml + when: inventory_hostname in groups["k3s_server"] + +- name: Install k3s agent ansible-init playbook + copy: + src: start_k3s_agent.yml + dest: /etc/ansible-init/playbooks/0-start-k3s.yml + when: inventory_hostname in groups["k3s_agent"] diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 8202d109f..eb8bab8f4 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -136,11 +136,11 @@ freeipa_client [ansible_init] # Hosts to run linux-anisble-init -[k3s_server] +[k3s_server:children] # Host to run k3s server control -[k3s_agent] +[k3s_agent:children] # Hosts to run as k3s agents login compute diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index 7c3f6b1f6..2c69bbbb0 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -47,6 
+47,7 @@ resource "openstack_compute_instance_v2" "compute" { metadata = { environment_root = var.environment_root k3s_token = var.k3s_token + k3s_server = "${var.cluster_name}-control" } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf index 26c305430..de918f199 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf @@ -126,6 +126,7 @@ resource "openstack_compute_instance_v2" "login" { metadata = { environment_root = var.environment_root k3s_token = var.k3s_token + k3s_server = "${var.cluster_name}-control" } user_data = <<-EOF From 824e1176944081ddb46c8d059deec8a58587f002 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Sep 2024 13:46:18 +0100 Subject: [PATCH 004/268] Refactored so that agent or server is determined by metadata --- ansible/bootstrap.yml | 60 +------------------ ansible/roles/k3s/files/start_k3s.yml | 26 ++++++++ ansible/roles/k3s/files/start_k3s_agent.yml | 21 ------- ansible/roles/k3s/files/start_k3s_server.yml | 19 ------ ansible/roles/k3s/tasks/main.yml | 49 ++++++++++++--- environments/common/inventory/groups | 13 ---- .../terraform/compute/nodes.tf | 1 + .../terraform/nodes.tf | 3 + 8 files changed, 72 insertions(+), 120 deletions(-) create mode 100644 ansible/roles/k3s/files/start_k3s.yml delete mode 100644 ansible/roles/k3s/files/start_k3s_agent.yml delete mode 100644 ansible/roles/k3s/files/start_k3s_server.yml diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 95299d5c8..6b734e379 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -1,65 +1,9 @@ --- -- hosts: k3s - become: yes - tasks: - - name: Download k3s install script - ansible.builtin.get_url: - url: https://get.k3s.io/ - timeout: 120 - dest: /usr/local/bin/k3s-install.sh - owner: root - group: root - mode: "0755" - -- hosts: k3s_server - become: yes - tasks: - - name: Install k3s server - ansible.builtin.shell: - cmd: /usr/local/bin/k3s-install.sh - environment: - INSTALL_K3S_VERSION: "v1.31.0+k3s1" - INSTALL_K3S_EXEC: "server" - INSTALL_K3S_SKIP_START: "true" - INSTALL_K3S_SKIP_ENABLE: "true" - changed_when: true - -- hosts: k3s_agent - become: yes - tasks: - - name: Install k3s agent - ansible.builtin.shell: - cmd: /usr/local/bin/k3s-install.sh - environment: - INSTALL_K3S_VERSION: "v1.31.0+k3s1" - INSTALL_K3S_EXEC: "agent" - INSTALL_K3S_SKIP_START: "true" - INSTALL_K3S_SKIP_ENABLE: "true" - changed_when: true - -- hosts: k3s - become: yes - tasks: - - name: Creating directory on root path - ansible.builtin.file: - path: /root/bin - state: directory - owner: root - - name: Adding symlinks to k3s binaries for root - ansible.builtin.file: - src: /usr/local/bin/k3s - dest: "/root/bin/{{ item }}" - state: link - owner: root - with_items: - - k3s - - kubectl - -- hosts: k3s +- hosts: cluster become: yes tasks: - - include_role: + - ansible.builtin.include_role: name: k3s - hosts: cluster diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml new file mode 100644 index 000000000..6f512719d --- /dev/null +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -0,0 +1,26 @@ +- hosts: localhost + become: true + vars: + os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" + k3s_token: "{{ os_metadata.meta.k3s_token }}" + k3s_server_name: "{{ os_metadata.meta.k3s_server }}" + 
k3s_node_type: "{{ os_metadata.meta.k3s_node_type }}" + service_name: "{{ 'k3s-agent' if k3s_node_type == 'agent' else 'k3s' }}" + tasks: + - name: Add the token for joining the cluster to the environment + no_log: false # avoid logging the server token + ansible.builtin.lineinfile: + path: "/etc/systemd/system/{{ service_name }}.service.env" + line: "K3S_TOKEN={{ k3s_token }}" + - name: Add server url to agents + ansible.builtin.lineinfile: + path: "/etc/systemd/system/{{ service_name }}.service.env" + line: "K3S_URL=https://{{ k3s_server_name }}:6443" + when: k3s_node_type == "agent" + - name: Start k3s service + ansible.builtin.systemd: + name: "{{ service_name }}" + daemon_reload: true + state: started + enabled: true + when: k3s_node_type != "none" diff --git a/ansible/roles/k3s/files/start_k3s_agent.yml b/ansible/roles/k3s/files/start_k3s_agent.yml deleted file mode 100644 index 3f34b0b66..000000000 --- a/ansible/roles/k3s/files/start_k3s_agent.yml +++ /dev/null @@ -1,21 +0,0 @@ -- hosts: localhost - become: true - vars: - os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" - k3s_token: "{{ os_metadata.meta.k3s_token }}" - k3s_server_name: "{{ os_metadata.meta.k3s_server }}" - tasks: - - name: Add the token for joining the cluster to the environment - no_log: true # avoid logging the server token - ansible.builtin.lineinfile: - path: "/etc/systemd/system/k3s-agent.service.env" - line: "{{ item }}" - with_items: - - "K3S_TOKEN={{ k3s_token }}" - - "K3S_URL=https://{{ k3s_server_name }}:6443" - - name: Start k3s service - ansible.builtin.systemd: - name: k3s-agent - daemon_reload: true - state: started - enabled: true diff --git a/ansible/roles/k3s/files/start_k3s_server.yml b/ansible/roles/k3s/files/start_k3s_server.yml deleted file mode 100644 index 7272a0059..000000000 --- a/ansible/roles/k3s/files/start_k3s_server.yml +++ /dev/null @@ -1,19 +0,0 @@ -- hosts: localhost - become: true - vars: - os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" - k3s_token: "{{ os_metadata.meta.k3s_token }}" - tasks: - - name: Add the token for joining the cluster to the environment - no_log: true # avoid logging the server token - ansible.builtin.lineinfile: - path: "/etc/systemd/system/k3s.service.env" - line: "{{ item }}" - with_items: - - "K3S_TOKEN={{ k3s_token }}" - - name: Start k3s service - ansible.builtin.systemd: - name: k3s - daemon_reload: true - state: started - enabled: true diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index a84a91ee3..b3e2bc70f 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -1,18 +1,49 @@ --- -- name: Create ansible-init playbook install directory +- name: Download k3s install script + ansible.builtin.get_url: + url: https://get.k3s.io/ + timeout: 120 + dest: /usr/local/bin/k3s-install.sh + owner: root + group: root + mode: "0755" + +- name: Install k3s server + ansible.builtin.shell: + cmd: /usr/local/bin/k3s-install.sh + environment: + INSTALL_K3S_VERSION: "v1.31.0+k3s1" + INSTALL_K3S_EXEC: "{{ item }}" + INSTALL_K3S_SKIP_START: "true" + INSTALL_K3S_SKIP_ENABLE: "true" + changed_when: true + with_items: + - server + - agent + +- name: Creating directory on root path + ansible.builtin.file: + path: /root/bin + state: directory + owner: root + +- name: Adding symlinks to k3s binaries for root + ansible.builtin.file: + src: /usr/local/bin/k3s + dest: "/root/bin/{{ item }}" + state: link + 
owner: root + with_items: + - k3s + - kubectl + +- name: Create ansible-init playbook install directory #TODO: move into ansible-init file: path: "/etc/ansible-init/playbooks" state: directory - name: Install k3s server ansible-init playbook copy: - src: start_k3s_server.yml - dest: /etc/ansible-init/playbooks/0-start-k3s.yml - when: inventory_hostname in groups["k3s_server"] - -- name: Install k3s agent ansible-init playbook - copy: - src: start_k3s_agent.yml + src: start_k3s.yml dest: /etc/ansible-init/playbooks/0-start-k3s.yml - when: inventory_hostname in groups["k3s_agent"] diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index eb8bab8f4..e96575ef4 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -135,16 +135,3 @@ freeipa_client [ansible_init] # Hosts to run linux-anisble-init - -[k3s_server:children] -# Host to run k3s server -control - -[k3s_agent:children] -# Hosts to run as k3s agents -login -compute - -[k3s:children] -k3s_server -k3s_agent diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index 2c69bbbb0..610e435e9 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -48,6 +48,7 @@ resource "openstack_compute_instance_v2" "compute" { environment_root = var.environment_root k3s_token = var.k3s_token k3s_server = "${var.cluster_name}-control" + k3s_node_type = "agent" } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf index de918f199..3bd23639d 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf @@ -77,6 +77,8 @@ resource "openstack_compute_instance_v2" "control" { metadata = { environment_root = var.environment_root k3s_token = var.k3s_token + k3s_server = "${var.cluster_name}-control" + k3s_node_type = "server" } user_data = <<-EOF @@ -127,6 +129,7 @@ resource "openstack_compute_instance_v2" "login" { environment_root = var.environment_root k3s_token = var.k3s_token k3s_server = "${var.cluster_name}-control" + k3s_node_type = "agent" } user_data = <<-EOF From b49b22cd28bb9c284fe89253443279b83c9fec61 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Sep 2024 15:38:29 +0100 Subject: [PATCH 005/268] Added (very hacky) k3s token generation --- ansible/roles/k3s/files/start_k3s.yml | 2 ++ ansible/roles/k3s/tasks/main.yml | 2 +- ansible/roles/passwords/tasks/main.yml | 15 ++++++++++++++- .../terraform/compute/variables.tf | 2 +- .../terraform/variables.tf | 1 + 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 6f512719d..4b7ea25be 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -12,11 +12,13 @@ ansible.builtin.lineinfile: path: "/etc/systemd/system/{{ service_name }}.service.env" line: "K3S_TOKEN={{ k3s_token }}" + - name: Add server url to agents ansible.builtin.lineinfile: path: "/etc/systemd/system/{{ service_name }}.service.env" line: "K3S_URL=https://{{ k3s_server_name }}:6443" when: k3s_node_type == "agent" + - name: Start k3s service ansible.builtin.systemd: name: "{{ 
service_name }}" diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index b3e2bc70f..6af70fef9 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -9,7 +9,7 @@ group: root mode: "0755" -- name: Install k3s server +- name: Install k3s ansible.builtin.shell: cmd: /usr/local/bin/k3s-install.sh environment: diff --git a/ansible/roles/passwords/tasks/main.yml b/ansible/roles/passwords/tasks/main.yml index 21b10f780..cf76cf6e4 100644 --- a/ansible/roles/passwords/tasks/main.yml +++ b/ansible/roles/passwords/tasks/main.yml @@ -7,6 +7,19 @@ delegate_to: localhost run_once: true +- name: Generate k3s token + ansible.builtin.set_fact: + k3s_token_secret: "" + +- name: Generate k3s token and add to terraform + vars: + token: "{{ lookup('ansible.builtin.password', '/dev/null', length=64) }}" + replace: + path: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/terraform/variables.tf" + regexp: "k3s_token_replace_me" + replace: "{{ token }}" + + # - name: Ensure munge key directory exists # file: # state: directory @@ -17,4 +30,4 @@ # copy: # content: "{{ lookup('password', '/dev/null chars=ascii_letters,digits,hexdigits,punctuation') }}" # dest: "{{ openhpc_passwords_mungekey_output_path }}" -# force: false \ No newline at end of file +# force: false diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index 2f6f17cb9..e3ae8e9db 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -69,6 +69,6 @@ variable "security_group_ids" { } variable "k3s_token" { - description = "Random cryptographically secure string for K3s token" + description = "Random cryptographically secure string for K3s token (must be set by ../compute.tf)" type = string } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 7bf80472b..4b8bca214 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -135,4 +135,5 @@ variable "root_volume_size" { variable "k3s_token" { description = "Random cryptographically secure string for K3s token" type = string + default = "k3s_token_replace_me" } From 99c00282cbb3b982db69907f10c097bfeb25ff06 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Sep 2024 09:35:02 +0100 Subject: [PATCH 006/268] Added k9s role --- ansible/roles/k3s/files/start_k3s.yml | 2 +- ansible/roles/k3s/tasks/main.yml | 5 ++++ ansible/roles/k9s/tasks/main.yml | 33 +++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 ansible/roles/k9s/tasks/main.yml diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 4b7ea25be..304d538ba 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -18,7 +18,7 @@ path: "/etc/systemd/system/{{ service_name }}.service.env" line: "K3S_URL=https://{{ k3s_server_name }}:6443" when: k3s_node_type == "agent" - + - name: Start k3s service ansible.builtin.systemd: name: "{{ service_name }}" diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index 6af70fef9..1b5b7e965 100644 --- 
a/ansible/roles/k3s/tasks/main.yml
+++ b/ansible/roles/k3s/tasks/main.yml
@@ -37,6 +37,11 @@
   with_items:
   - k3s
   - kubectl
+
+- name: k9s install
+  ansible.builtin.include_role:
+    name: k9s
 
 - name: Create ansible-init playbook install directory #TODO: move into ansible-init
   file:
     path: "/etc/ansible-init/playbooks"
     state: directory
diff --git a/ansible/roles/k9s/tasks/main.yml b/ansible/roles/k9s/tasks/main.yml
new file mode 100644
index 000000000..e431c65a5
--- /dev/null
+++ b/ansible/roles/k9s/tasks/main.yml
@@ -0,0 +1,33 @@
+---
+ - name: Create install directory
+   ansible.builtin.file:
+     path: /root/k9s-temp
+     state: directory
+
+ - name: Download k9s
+   ansible.builtin.get_url:
+     url: https://github.com/derailed/k9s/releases/download/v0.32.5/k9s_Linux_amd64.tar.gz
+     dest: /root/k9s-temp/k9s_Linux_amd64.tar.gz
+
+ - name: Unpack k9s binary
+   ansible.builtin.unarchive:
+     src: /root/k9s-temp/k9s_Linux_amd64.tar.gz
+     dest: /root/k9s-temp
+     remote_src: yes
+
+ - name: Add k9s to root path
+   ansible.builtin.copy:
+     src: /root/k9s-temp/k9s
+     dest: /root/bin/k9s
+     mode: u+rwx
+     remote_src: yes
+
+ - name: Add k3s kubeconfig as environment variable
+   ansible.builtin.lineinfile:
+     path: /etc/environment
+     line: "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"
+
+ - name: Cleanup k9s install directory
+   ansible.builtin.file:
+     path: /root/k9s-temp
+     state: absent

From 68feb768ddbec9088990b593fbb0e679b11c1094 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 13 Sep 2024 10:24:54 +0100
Subject: [PATCH 007/268] Moved k3s install to after network setup

---
 ansible/bootstrap.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml
index 6b734e379..cb747dca9 100644
--- a/ansible/bootstrap.yml
+++ b/ansible/bootstrap.yml
@@ -1,11 +1,5 @@
 ---
 
-- hosts: cluster
-  become: yes
-  tasks:
-    - ansible.builtin.include_role:
-        name: k3s
-
 - hosts: cluster
   gather_facts: false
   become: yes
@@ -264,3 +258,9 @@
   tasks:
     - include_role:
         name: azimuth_cloud.image_utils.linux_ansible_init
+
+- hosts: cluster
+  become: yes
+  tasks:
+    - ansible.builtin.include_role:
+        name: k3s

From 90c7a78765fae7e8c48838b511ff254b4f4ccee8 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Mon, 16 Sep 2024 12:54:18 +0100
Subject: [PATCH 008/268] Added separate k3s group

---
 ansible/bootstrap.yml                  | 2 +-
 ansible/roles/k3s/tasks/main.yml       | 6 ------
 environments/common/inventory/groups   | 3 +++
 environments/common/layouts/everything | 6 +++++-
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml
index cb747dca9..113e828cc 100644
--- a/ansible/bootstrap.yml
+++ b/ansible/bootstrap.yml
@@ -259,7 +259,7 @@
     - include_role:
         name: azimuth_cloud.image_utils.linux_ansible_init
 
-- hosts: cluster
+- hosts: k3s
   become: yes
   tasks:
     - ansible.builtin.include_role:
diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml
index 1b5b7e965..11eaa183b 100644
--- a/ansible/roles/k3s/tasks/main.yml
+++ b/ansible/roles/k3s/tasks/main.yml
@@ -41,12 +41,6 @@
 - name: k9s install
   ansible.builtin.include_role:
     name: k9s
-
-- name: Create ansible-init playbook install directory #TODO: move into ansible-init
-  file:
-    path: "/etc/ansible-init/playbooks"
-    state: directory
 
 - name: Install k3s server ansible-init playbook
   copy:
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index e96575ef4..ec487416c 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -135,3 +135,6 @@
 
 [ansible_init]
 # 
Hosts to run linux-anisble-init + +[k3s] +# Hosts to run k3s server/agent diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 85af46c06..3525fc6fe 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -81,4 +81,8 @@ openhpc [ansible_init:children] # Hosts to run ansible-init -cluster \ No newline at end of file +cluster + +[k3s:children] +# Hosts to run k3s server/agent +cluster From cbcf76261c73141efbedaf844e3dc9b7bc0b0613 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 16 Sep 2024 14:32:16 +0100 Subject: [PATCH 009/268] Added helm --- ansible/roles/k3s/tasks/main.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index 11eaa183b..eda54087d 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -38,6 +38,16 @@ - k3s - kubectl +- name: Installing helm + unarchive: + src: https://get.helm.sh/helm-v3.11.0-linux-amd64.tar.gz + dest: /root/bin + extra_opts: "--strip-components=1" + owner: root + group: root + mode: 0755 + remote_src: true + - name: k9s install ansible.builtin.include_role: name: k9s From 370b1887d046dd96d43b06733f60769c30558ff0 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 17 Sep 2024 10:16:07 +0100 Subject: [PATCH 010/268] Fixed ansible-init sentinel being created in packer build --- ansible/cleanup.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index e0fabf5e1..0b6b4e084 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -38,3 +38,8 @@ - name: Cleanup /tmp command : rm -rf /tmp/* + +- name: Delete ansible-init sentintel file created during build + ansible.builtin.file: + path: /var/lib/ansible-init.done + state: absent From df02a664d83ec5fa64ac4dc01563b88056812872 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 17 Sep 2024 12:26:53 +0100 Subject: [PATCH 011/268] Moved helm install --- ansible/roles/k3s/tasks/main.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index eda54087d..bedf93857 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -41,13 +41,20 @@ - name: Installing helm unarchive: src: https://get.helm.sh/helm-v3.11.0-linux-amd64.tar.gz - dest: /root/bin + dest: /usr/local/bin extra_opts: "--strip-components=1" owner: root group: root mode: 0755 remote_src: true +- name: Add helm symlink in root path + ansible.builtin.file: + src: /usr/local/bin/helm + dest: "/root/bin/helm" + state: link + owner: root + - name: k9s install ansible.builtin.include_role: name: k9s From 327c645c242d493b8b8896714865f1fd5baa23c0 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 17 Sep 2024 12:39:16 +0100 Subject: [PATCH 012/268] Added kube roles to gitignore --- ansible/.gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ansible/.gitignore b/ansible/.gitignore index 2ceeb596b..f2268c478 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -58,4 +58,7 @@ roles/* !roles/squid/** !roles/tuned/ !roles/tuned/** - +!roles/k3s/ +!roles/k3s/** +!roles/k9s/ +!roles/k9s/** From ce82f598542e68a186738a9dcb7f2e3e4fba7ecf Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 17 Sep 2024 14:51:49 +0100 Subject: [PATCH 013/268] moved installs to usr/bin --- ansible/roles/k3s/tasks/main.yml | 30 ++++-------------------------- 
ansible/roles/k9s/tasks/main.yml | 2 +- 2 files changed, 5 insertions(+), 27 deletions(-) diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index bedf93857..c52c47ba6 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -4,57 +4,35 @@ ansible.builtin.get_url: url: https://get.k3s.io/ timeout: 120 - dest: /usr/local/bin/k3s-install.sh + dest: /usr/bin/k3s-install.sh owner: root group: root mode: "0755" - name: Install k3s ansible.builtin.shell: - cmd: /usr/local/bin/k3s-install.sh + cmd: /usr/bin/k3s-install.sh environment: INSTALL_K3S_VERSION: "v1.31.0+k3s1" INSTALL_K3S_EXEC: "{{ item }}" INSTALL_K3S_SKIP_START: "true" INSTALL_K3S_SKIP_ENABLE: "true" + INSTALL_K3S_BIN_DIR: "/usr/bin" changed_when: true with_items: - server - agent -- name: Creating directory on root path - ansible.builtin.file: - path: /root/bin - state: directory - owner: root - -- name: Adding symlinks to k3s binaries for root - ansible.builtin.file: - src: /usr/local/bin/k3s - dest: "/root/bin/{{ item }}" - state: link - owner: root - with_items: - - k3s - - kubectl - - name: Installing helm unarchive: src: https://get.helm.sh/helm-v3.11.0-linux-amd64.tar.gz - dest: /usr/local/bin + dest: /usr/bin extra_opts: "--strip-components=1" owner: root group: root mode: 0755 remote_src: true -- name: Add helm symlink in root path - ansible.builtin.file: - src: /usr/local/bin/helm - dest: "/root/bin/helm" - state: link - owner: root - - name: k9s install ansible.builtin.include_role: name: k9s diff --git a/ansible/roles/k9s/tasks/main.yml b/ansible/roles/k9s/tasks/main.yml index e431c65a5..bb23ec161 100644 --- a/ansible/roles/k9s/tasks/main.yml +++ b/ansible/roles/k9s/tasks/main.yml @@ -18,7 +18,7 @@ - name: Add k9s to root path ansible.builtin.copy: src: /root/k9s-temp/k9s - dest: /root/bin/k9s + dest: /usr/bin/k9s mode: u+rwx remote_src: yes From 250b4c7ba344a7239c833770428ce4050e9ab587 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 19 Sep 2024 12:54:52 +0000 Subject: [PATCH 014/268] remove local DNS as a dependency for k3s --- .../{{cookiecutter.environment}}/terraform/compute.tf | 1 + .../{{cookiecutter.environment}}/terraform/compute/nodes.tf | 2 +- .../terraform/compute/variables.tf | 5 +++++ .../skeleton/{{cookiecutter.environment}}/terraform/nodes.tf | 4 ++-- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index a1b33aa78..eb2139eba 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -16,5 +16,6 @@ module "compute" { key_pair = var.key_pair environment_root = var.environment_root k3s_token = var.k3s_token + k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index 610e435e9..7b37c7625 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -47,7 +47,7 @@ resource "openstack_compute_instance_v2" "compute" { metadata = { environment_root = 
var.environment_root
     k3s_token = var.k3s_token
-    k3s_server = "${var.cluster_name}-control"
+    k3s_server = var.k3s_server
     k3s_node_type = "agent"
   }
 
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf
index e3ae8e9db..400b8fd02 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf
@@ -72,3 +72,8 @@
 variable "k3s_token" {
   description = "Random cryptographically secure string for K3s token (must be set by ../compute.tf)"
   type = string
 }
+
+variable "k3s_server" {
+  description = "Name/address of k3s server"
+  type = string
+}
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf
index 3bd23639d..695e7ff45 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf
@@ -77,7 +77,7 @@ resource "openstack_compute_instance_v2" "control" {
   metadata = {
     environment_root = var.environment_root
     k3s_token = var.k3s_token
-    k3s_server = "${var.cluster_name}-control"
+    k3s_server = "" # think this can go?
     k3s_node_type = "server"
   }
 
@@ -128,7 +128,7 @@ resource "openstack_compute_instance_v2" "login" {
   metadata = {
     environment_root = var.environment_root
     k3s_token = var.k3s_token
-    k3s_server = "${var.cluster_name}-control"
+    k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0]
     k3s_node_type = "agent"
   }

From 2f26fa19cf6d452164daec3c8ab2162edae255d4 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Thu, 19 Sep 2024 13:48:55 +0100
Subject: [PATCH 015/268] Agent/server config now based on whether server name
 is defined

---
 ansible/roles/k3s/files/start_k3s.yml                       | 6 ++----
 .../{{cookiecutter.environment}}/terraform/compute/nodes.tf | 1 -
 .../{{cookiecutter.environment}}/terraform/nodes.tf         | 4 +---
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml
index 304d538ba..458e4a0da 100644
--- a/ansible/roles/k3s/files/start_k3s.yml
+++ b/ansible/roles/k3s/files/start_k3s.yml
@@ -4,8 +4,7 @@
     os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
     k3s_token: "{{ os_metadata.meta.k3s_token }}"
     k3s_server_name: "{{ os_metadata.meta.k3s_server }}"
-    k3s_node_type: "{{ os_metadata.meta.k3s_node_type }}"
-    service_name: "{{ 'k3s-agent' if k3s_node_type == 'agent' else 'k3s' }}"
+    service_name: "{{ 'k3s' if k3s_server_name == 'none' else 'k3s-agent' }}"
   tasks:
     - name: Add the token for joining the cluster to the environment
       no_log: false # avoid logging the server token
@@ -17,7 +16,7 @@
       ansible.builtin.lineinfile:
         path: "/etc/systemd/system/{{ service_name }}.service.env"
         line: "K3S_URL=https://{{ k3s_server_name }}:6443"
-      when: k3s_node_type == "agent"
+      when: k3s_server_name != "none"
 
     - name: Start k3s service
       ansible.builtin.systemd:
@@ -25,4 +24,3 @@
       daemon_reload: true
      state: started
       enabled: true
-      when: k3s_node_type != "none"
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf
index 610e435e9..2c69bbbb0 100644
--- 
a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -48,7 +48,6 @@ resource "openstack_compute_instance_v2" "compute" { environment_root = var.environment_root k3s_token = var.k3s_token k3s_server = "${var.cluster_name}-control" - k3s_node_type = "agent" } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf index 3bd23639d..2e4c322f0 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf @@ -77,8 +77,7 @@ resource "openstack_compute_instance_v2" "control" { metadata = { environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = "${var.cluster_name}-control" - k3s_node_type = "server" + k3s_server = "none" } user_data = <<-EOF @@ -129,7 +128,6 @@ resource "openstack_compute_instance_v2" "login" { environment_root = var.environment_root k3s_token = var.k3s_token k3s_server = "${var.cluster_name}-control" - k3s_node_type = "agent" } user_data = <<-EOF From a8d4e173daae1fb197a9db424bd6e2c64eb35e14 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Sep 2024 15:10:06 +0100 Subject: [PATCH 016/268] k3s token now templated into terraform vars --- ansible/roles/passwords/defaults/main.yml | 3 +++ ansible/roles/passwords/tasks/main.yml | 15 +++++---------- .../templates/k3s-token.auto.tfvars.json | 1 + .../terraform/compute/variables.tf | 1 - .../terraform/variables.tf | 2 -- 5 files changed, 9 insertions(+), 13 deletions(-) create mode 100644 ansible/roles/passwords/templates/k3s-token.auto.tfvars.json diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index 55680ae37..12eade641 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -12,4 +12,7 @@ slurm_appliance_secrets: secrets_openhpc_mungekey_default: content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}" +k3s_secrets: + k3s_token: "{{ lookup('ansible.builtin.password', '/dev/null', length=64) }}" + openhpc_passwords_output_path: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') | default(undefined, true) | mandatory('You must define the APPLIANCES_ENVIRONMENT_ROOT environment variable') }}/inventory/group_vars/all/secrets.yml" diff --git a/ansible/roles/passwords/tasks/main.yml b/ansible/roles/passwords/tasks/main.yml index cf76cf6e4..b2b9a3ba8 100644 --- a/ansible/roles/passwords/tasks/main.yml +++ b/ansible/roles/passwords/tasks/main.yml @@ -7,17 +7,12 @@ delegate_to: localhost run_once: true -- name: Generate k3s token - ansible.builtin.set_fact: - k3s_token_secret: "" - - name: Generate k3s token and add to terraform - vars: - token: "{{ lookup('ansible.builtin.password', '/dev/null', length=64) }}" - replace: - path: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/terraform/variables.tf" - regexp: "k3s_token_replace_me" - replace: "{{ token }}" + template: + src: k3s-token.auto.tfvars.json + dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/terraform/k3s-token.auto.tfvars.json" + delegate_to: localhost + run_once: true # - name: Ensure munge key directory exists diff --git a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json new file mode 100644 index 
000000000..6454ea90f --- /dev/null +++ b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json @@ -0,0 +1 @@ +{{ k3s_secrets | to_nice_json }} \ No newline at end of file diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index e3ae8e9db..f0b75d0b5 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -69,6 +69,5 @@ variable "security_group_ids" { } variable "k3s_token" { - description = "Random cryptographically secure string for K3s token (must be set by ../compute.tf)" type = string } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 4b8bca214..930972d35 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -133,7 +133,5 @@ variable "root_volume_size" { } variable "k3s_token" { - description = "Random cryptographically secure string for K3s token" type = string - default = "k3s_token_replace_me" } From 510115f8c410aaff7bc5829e277367c36f82402e Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:14:10 +0100 Subject: [PATCH 017/268] Name and label suggestions from review Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/bootstrap.yml | 1 + ansible/cleanup.yml | 2 +- ansible/roles/k3s/tasks/main.yml | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 113e828cc..e6bab9cf8 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -261,6 +261,7 @@ - hosts: k3s become: yes + tags: k3s tasks: - ansible.builtin.include_role: name: k3s diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index 0b6b4e084..06681879b 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -39,7 +39,7 @@ - name: Cleanup /tmp command : rm -rf /tmp/* -- name: Delete ansible-init sentintel file created during build +- name: Delete ansible-init sentinel file created if ansible-init has run during build ansible.builtin.file: path: /var/lib/ansible-init.done state: absent diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index c52c47ba6..bfdf57953 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -23,7 +23,7 @@ - server - agent -- name: Installing helm +- name: Install helm unarchive: src: https://get.helm.sh/helm-v3.11.0-linux-amd64.tar.gz dest: /usr/bin @@ -33,7 +33,7 @@ mode: 0755 remote_src: true -- name: k9s install +- name: Install k9s ansible.builtin.include_role: name: k9s From 56c0d67e42026175d153b25dbc31db390e232f3b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Sep 2024 15:47:41 +0100 Subject: [PATCH 018/268] Refactor + group changes --- ansible/roles/k3s/defaults/main.yml | 1 + ansible/roles/k3s/tasks/main.yml | 6 +++--- environments/common/layouts/everything | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 ansible/roles/k3s/defaults/main.yml diff --git a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml new file mode 100644 index 000000000..af40af4e8 --- /dev/null +++ b/ansible/roles/k3s/defaults/main.yml @@ 
-0,0 +1 @@ +k3s_version: "v1.31.0+k3s1" \ No newline at end of file diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index c52c47ba6..5a7219ac0 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -13,13 +13,13 @@ ansible.builtin.shell: cmd: /usr/bin/k3s-install.sh environment: - INSTALL_K3S_VERSION: "v1.31.0+k3s1" + INSTALL_K3S_VERSION: "{{ k3s_version }}" INSTALL_K3S_EXEC: "{{ item }}" INSTALL_K3S_SKIP_START: "true" INSTALL_K3S_SKIP_ENABLE: "true" INSTALL_K3S_BIN_DIR: "/usr/bin" changed_when: true - with_items: + loop: - server - agent @@ -37,7 +37,7 @@ ansible.builtin.include_role: name: k9s -- name: Install k3s server ansible-init playbook +- name: Install ansible-init playbook for k3s agent or server activation copy: src: start_k3s.yml dest: /etc/ansible-init/playbooks/0-start-k3s.yml diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 3525fc6fe..74d5b58ad 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -85,4 +85,4 @@ cluster [k3s:children] # Hosts to run k3s server/agent -cluster +openhpc From 6d6bd2d4c0abe1d57d6b5737ecb4b5116d8e9903 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Sep 2024 16:22:54 +0100 Subject: [PATCH 019/268] Refactored k9s install --- ansible/extras.yml | 7 +++ ansible/roles/k3s/tasks/main.yml | 7 +-- ansible/roles/k9s/tasks/main.yml | 65 +++++++++++++++----------- environments/common/inventory/groups | 3 ++ environments/common/layouts/everything | 3 ++ 5 files changed, 55 insertions(+), 30 deletions(-) diff --git a/ansible/extras.yml b/ansible/extras.yml index 445a0cc16..fd4199875 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -36,3 +36,10 @@ tasks: - import_role: name: persist_hostkeys + +- name: Install k9s + become: yes + hosts: k9s + tasks: + - import_role: + name: k9s diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index 5a7219ac0..48058ece4 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -33,9 +33,10 @@ mode: 0755 remote_src: true -- name: k9s install - ansible.builtin.include_role: - name: k9s +- name: Add k3s kubeconfig as environment variable + ansible.builtin.lineinfile: + path: /etc/environment + line: "KUBECONFIG=/etc/rancher/k3s/k3s.yaml" - name: Install ansible-init playbook for k3s agent or server activation copy: diff --git a/ansible/roles/k9s/tasks/main.yml b/ansible/roles/k9s/tasks/main.yml index bb23ec161..5ca111a2d 100644 --- a/ansible/roles/k9s/tasks/main.yml +++ b/ansible/roles/k9s/tasks/main.yml @@ -1,33 +1,44 @@ --- - - name: Create install directory - ansible.builtin.file: - path: /root/k9s-temp - state: directory + + - name: Check if k9s is installed + ansible.builtin.stat: + path: "/usr/bin/k9s" + register: result - - name: Download k9s - ansible.builtin.get_url: - url: https://github.com/derailed/k9s/releases/download/v0.32.5/k9s_Linux_amd64.tar.gz - dest: /root/k9s-temp/k9s_Linux_amd64.tar.gz + - name: Install k9s and clean up temporary files + block: + - name: Create install directory + ansible.builtin.file: + path: /tmp/k9s + state: directory + owner: root + group: root + mode: "744" + when: not result.stat.exists - - name: Unpack k9s binary - ansible.builtin.unarchive: - src: /root/k9s-temp/k9s_Linux_amd64.tar.gz - dest: /root/k9s-temp - remote_src: yes + - name: Download k9s + ansible.builtin.get_url: + url: 
https://github.com/derailed/k9s/releases/download/v0.32.5/k9s_Linux_amd64.tar.gz + dest: /tmp/k9s/k9s_Linux_amd64.tar.gz + owner: root + group: root + mode: "744" - - name: Add k9s to root path - ansible.builtin.copy: - src: /root/k9s-temp/k9s - dest: /usr/bin/k9s - mode: u+rwx - remote_src: yes + - name: Unpack k9s binary + ansible.builtin.unarchive: + src: /tmp/k9s/k9s_Linux_amd64.tar.gz + dest: /tmp/k9s + remote_src: yes - - name: Add k3s kubeconfig as environment variable - ansible.builtin.lineinfile: - path: /etc/environment - line: "KUBECONFIG=/etc/rancher/k3s/k3s.yaml" + - name: Add k9s to root path + ansible.builtin.copy: + src: /tmp/k9s/k9s + dest: /usr/bin/k9s + mode: u+rwx + remote_src: yes - - name: Cleanup k9s install directory - ansible.builtin.file: - path: /root/k9s-temp - state: absent + - name: Cleanup k9s install directory + ansible.builtin.file: + path: /tmp/k9s + state: absent + when: not result.stat.exists diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index ec487416c..353ffab4b 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -138,3 +138,6 @@ freeipa_client [k3s] # Hosts to run k3s server/agent + +[k9s] +# Hosts to install k9s on diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 74d5b58ad..51092802f 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -86,3 +86,6 @@ cluster [k3s:children] # Hosts to run k3s server/agent openhpc + +[k9s:children] +control From 2b4f1f616b9757230c5f91e5d157434d31a7679a Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Sep 2024 16:58:54 +0100 Subject: [PATCH 020/268] Removed server from control terraform and changed ansible-init file to accomadate --- ansible/roles/k3s/files/start_k3s.yml | 6 ++---- .../{{cookiecutter.environment}}/terraform/nodes.tf | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 304d538ba..8cf12200a 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -4,8 +4,7 @@ os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" k3s_token: "{{ os_metadata.meta.k3s_token }}" k3s_server_name: "{{ os_metadata.meta.k3s_server }}" - k3s_node_type: "{{ os_metadata.meta.k3s_node_type }}" - service_name: "{{ 'k3s-agent' if k3s_node_type == 'agent' else 'k3s' }}" + service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}" tasks: - name: Add the token for joining the cluster to the environment no_log: false # avoid logging the server token @@ -17,7 +16,7 @@ ansible.builtin.lineinfile: path: "/etc/systemd/system/{{ service_name }}.service.env" line: "K3S_URL=https://{{ k3s_server_name }}:6443" - when: k3s_node_type == "agent" + when: k3s_server_name is defined - name: Start k3s service ansible.builtin.systemd: @@ -25,4 +24,3 @@ daemon_reload: true state: started enabled: true - when: k3s_node_type != "none" diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf index 695e7ff45..f95dfdf3d 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf @@ -77,7 +77,6 @@ resource "openstack_compute_instance_v2" "control" { metadata = { 
environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = "" # think this can go? k3s_node_type = "server" } From c642866b85c2a6d2f5270571e818ff0a6d609b68 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 20 Sep 2024 09:28:10 +0100 Subject: [PATCH 021/268] name update --- ansible/roles/passwords/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/passwords/tasks/main.yml b/ansible/roles/passwords/tasks/main.yml index b2b9a3ba8..c6360fdc2 100644 --- a/ansible/roles/passwords/tasks/main.yml +++ b/ansible/roles/passwords/tasks/main.yml @@ -7,7 +7,7 @@ delegate_to: localhost run_once: true -- name: Generate k3s token and add to terraform +- name: Template k3s token to terraform template: src: k3s-token.auto.tfvars.json dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/terraform/k3s-token.auto.tfvars.json" From 0975257bb4d0bcdd054204e747a3169b5bf6a38d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 20 Sep 2024 09:43:49 +0100 Subject: [PATCH 022/268] Updated .stackhpc env with k3s token --- environments/.stackhpc/terraform/main.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index ac588930c..9e6f15ce4 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -59,6 +59,10 @@ variable "volume_backed_instances" { default = false } +variable "k3s_token" { + type = string +} + data "openstack_images_image_v2" "cluster" { name = var.cluster_image[var.os_version] most_recent = true @@ -74,6 +78,7 @@ module "cluster" { key_pair = "slurm-app-ci" cluster_image_id = data.openstack_images_image_v2.cluster.id control_node_flavor = var.control_node_flavor + k3s_token = var.k3s_token login_nodes = { login-0: var.other_node_flavor From 79383feaf802d42ce1b9d01115348f99d12ba86a Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 20 Sep 2024 11:27:02 +0100 Subject: [PATCH 023/268] added k3s readme --- ansible/roles/k3s/README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 ansible/roles/k3s/README.md diff --git a/ansible/roles/k3s/README.md b/ansible/roles/k3s/README.md new file mode 100644 index 000000000..8c4c91c2f --- /dev/null +++ b/ansible/roles/k3s/README.md @@ -0,0 +1,16 @@ +k3s +===== + +Installs k3s agent and server services on nodes and an ansible-init playbook to activate them. The service that each node will activate on init is determined by OpenStack metadata. Also includes Helm install. Currently only supports a single k3s-server +(i.e one control node). Install based on the [official k3s ansible role](https://github.com/k3s-io/k3s-ansible). + + +Requirements +------------ + +`azimuth_cloud.image_utils.linux_ansible_init` must have been run previously on targeted nodes + +Role Variables +-------------- + +- `k3s_version`: Optional str. K3s version to install, see [official releases](https://github.com/k3s-io/k3s/releases/). 
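For orientation alongside the README added above, the following is a minimal sketch (editorial, not part of the patch series) of how the role ends up wired in: it assumes the `k3s` inventory group added to `environments/common/inventory/groups` and an image built with `azimuth_cloud.image_utils.linux_ansible_init`, and mirrors the play these patches add to `ansible/bootstrap.yml`.

```yaml
# Sketch only -- mirrors the k3s play in ansible/bootstrap.yml under the
# assumptions stated above.
- hosts: k3s
  become: yes
  tags: k3s
  tasks:
    - ansible.builtin.include_role:
        name: k3s
      vars:
        # Role default from ansible/roles/k3s/defaults/main.yml;
        # override to pin a different k3s release.
        k3s_version: "v1.31.0+k3s1"
```

At boot, the installed ansible-init playbook (`/etc/ansible-init/playbooks/0-start-k3s.yml`) then reads `k3s_token` and `k3s_server` from the OpenStack metadata service at `http://169.254.169.254` to decide whether to start the `k3s` (server) or `k3s-agent` service.
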
From fa955dd14269e87c59e1a52cd41784089efaf0de Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 20 Sep 2024 13:23:09 +0100 Subject: [PATCH 024/268] bump images --- environments/.stackhpc/terraform/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 6f81609cc..1c0a33260 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/427 - RL8: "openhpc-ofed-RL8-240906-1042-32568dbb" - RL9: "openhpc-ofed-RL9-240906-1041-32568dbb" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/441 + RL8: "openhpc-ofed-RL8-240920-1001-22a4e6de" + RL9: "openhpc-ofed-RL9-240920-1001-22a4e6de" } } From a8569dae3c02aee71723b02d618e0bf8a551ed67 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 30 Sep 2024 16:46:50 +0100 Subject: [PATCH 025/268] Disabled traefik for non-server nodes --- ansible/roles/k3s/files/start_k3s.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 8cf12200a..b5d000afa 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -18,6 +18,22 @@ line: "K3S_URL=https://{{ k3s_server_name }}:6443" when: k3s_server_name is defined + - name: Add loadbalancer label to server + when: k3s_server_name is undefined + block: + - name: Create override directory + ansible.builtin.file: + state: directory + path: "/etc/systemd/system/{{ service_name }}.service.d" + + - name: Add label override + ansible.builtin.copy: + dest: "/etc/systemd/system/{{ service_name }}.service.d/override.conf" + content: | + [Service] + ExecStart= + ExecStart=/usr/bin/k3s server --node-label svccontroller.k3s.cattle.io/enablelb=true + - name: Start k3s service ansible.builtin.systemd: name: "{{ service_name }}" From 825048304ff3ccd5412d5d87b34d7f88ccec5f63 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 30 Sep 2024 16:50:40 +0100 Subject: [PATCH 026/268] Revert images for clean build --- environments/.stackhpc/terraform/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 1c0a33260..6f81609cc 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/441 - RL8: "openhpc-ofed-RL8-240920-1001-22a4e6de" - RL9: "openhpc-ofed-RL9-240920-1001-22a4e6de" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/427 + RL8: "openhpc-ofed-RL8-240906-1042-32568dbb" + RL9: "openhpc-ofed-RL9-240906-1041-32568dbb" } } From 46401bf28fa76d18ac758f81b085e157e5c04816 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 1 Oct 2024 09:04:14 +0100 Subject: [PATCH 027/268] bump images --- environments/.stackhpc/terraform/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf 
b/environments/.stackhpc/terraform/main.tf index 6f81609cc..1b7006fc1 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/427 - RL8: "openhpc-ofed-RL8-240906-1042-32568dbb" - RL9: "openhpc-ofed-RL9-240906-1041-32568dbb" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/441 + RL8: "openhpc-ofed-RL8-240930-1721-82504830" + RL9: "openhpc-ofed-RL9-240930-1555-82504830" } } From 15d35144002736b069601c2b30f7315becb5a61f Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 2 Oct 2024 11:06:52 +0100 Subject: [PATCH 028/268] Apply suggestions from code review Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/k3s/files/start_k3s.yml | 5 +++-- environments/common/layouts/everything | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index b5d000afa..6154cbeb6 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -18,7 +18,8 @@ line: "K3S_URL=https://{{ k3s_server_name }}:6443" when: k3s_server_name is defined - - name: Add loadbalancer label to server + - name: Only run loadbalancer on k3s server + # avoids problems with Ondemand https server when: k3s_server_name is undefined block: - name: Create override directory @@ -26,7 +27,7 @@ state: directory path: "/etc/systemd/system/{{ service_name }}.service.d" - - name: Add label override + - name: Set loadbalancer label on k3s server ansible.builtin.copy: dest: "/etc/systemd/system/{{ service_name }}.service.d/override.conf" content: | diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 51092802f..a8b37fc3c 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -89,3 +89,4 @@ openhpc [k9s:children] control + From bc3706445158d07611ddfffa8b841d938257c4b6 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 2 Oct 2024 11:24:26 +0100 Subject: [PATCH 029/268] Code review tweaks --- ansible/roles/k3s/files/start_k3s.yml | 2 +- ansible/roles/passwords/tasks/main.yml | 15 +-------------- ....tfvars.json => k3s-token.auto.tfvars.json.j2} | 0 .../terraform/variables.tf | 1 + 4 files changed, 3 insertions(+), 15 deletions(-) rename ansible/roles/passwords/templates/{k3s-token.auto.tfvars.json => k3s-token.auto.tfvars.json.j2} (100%) diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 6154cbeb6..56ef20313 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -7,7 +7,7 @@ service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}" tasks: - name: Add the token for joining the cluster to the environment - no_log: false # avoid logging the server token + no_log: true # avoid logging the server token ansible.builtin.lineinfile: path: "/etc/systemd/system/{{ service_name }}.service.env" line: "K3S_TOKEN={{ k3s_token }}" diff --git a/ansible/roles/passwords/tasks/main.yml b/ansible/roles/passwords/tasks/main.yml index c6360fdc2..09603e184 100644 --- a/ansible/roles/passwords/tasks/main.yml +++ b/ansible/roles/passwords/tasks/main.yml @@ -9,20 +9,7 @@ - name: 
Template k3s token to terraform template: - src: k3s-token.auto.tfvars.json + src: k3s-token.auto.tfvars.json.j2 dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/terraform/k3s-token.auto.tfvars.json" delegate_to: localhost run_once: true - - -# - name: Ensure munge key directory exists -# file: -# state: directory -# recurse: true -# path: "{{ openhpc_passwords_mungekey_output_path | dirname }}" - -# - name: Create a munge key -# copy: -# content: "{{ lookup('password', '/dev/null chars=ascii_letters,digits,hexdigits,punctuation') }}" -# dest: "{{ openhpc_passwords_mungekey_output_path }}" -# force: false diff --git a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 similarity index 100% rename from ansible/roles/passwords/templates/k3s-token.auto.tfvars.json rename to ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 930972d35..0f5eefa18 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -133,5 +133,6 @@ variable "root_volume_size" { } variable "k3s_token" { + description = "K3s cluster authentication token, set automatically by Ansible" type = string } From 2ab8e524329cb7bb8d38665a8a267af4510ec2be Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 2 Oct 2024 11:49:00 +0100 Subject: [PATCH 030/268] Moved k3s token to be with rest of appliance secrets --- ansible/roles/passwords/defaults/main.yml | 4 +--- .../roles/passwords/templates/k3s-token.auto.tfvars.json.j2 | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index 12eade641..3beb83507 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -8,11 +8,9 @@ slurm_appliance_secrets: vault_openhpc_mungekey: "{{ secrets_openhpc_mungekey | default(vault_openhpc_mungekey | default(secrets_openhpc_mungekey_default)) }}" vault_freeipa_ds_password: "{{ vault_freeipa_ds_password | default(lookup('password', '/dev/null')) }}" vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}" + k3s_token: "{{ lookup('ansible.builtin.password', '/dev/null', length=64) }}" secrets_openhpc_mungekey_default: content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}" -k3s_secrets: - k3s_token: "{{ lookup('ansible.builtin.password', '/dev/null', length=64) }}" - openhpc_passwords_output_path: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') | default(undefined, true) | mandatory('You must define the APPLIANCES_ENVIRONMENT_ROOT environment variable') }}/inventory/group_vars/all/secrets.yml" diff --git a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 index 6454ea90f..f97663411 100644 --- a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 +++ b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 @@ -1 +1,3 @@ -{{ k3s_secrets | to_nice_json }} \ No newline at end of file +{ + "k3s_token": "{{ slurm_appliance_secrets.k3s_token }}" +} \ No newline at end of file From 26a0f89e80de75859c44aa712cb4a8da5a293d2b Mon Sep 17 00:00:00 2001 From: 
wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 2 Oct 2024 14:30:51 +0100 Subject: [PATCH 031/268] bump images --- environments/.stackhpc/terraform/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 1b7006fc1..8aba2ecc3 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -30,8 +30,8 @@ variable "cluster_image" { type = map(string) default = { # https://github.com/stackhpc/ansible-slurm-appliance/pull/441 - RL8: "openhpc-ofed-RL8-240930-1721-82504830" - RL9: "openhpc-ofed-RL9-240930-1555-82504830" + RL8: "openhpc-ofed-RL8-241002-1144-2ab8e524" + RL9: "openhpc-ofed-RL9-241002-1145-2ab8e524" } } From cfb5514ea77f6e41bfd33030787b6ce1f72ca25d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 4 Oct 2024 09:22:32 +0100 Subject: [PATCH 032/268] updated caas for k3s --- .../cluster_infra/templates/outputs.tf.j2 | 4 ++-- .../cluster_infra/templates/resources.tf.j2 | 20 +++++++++++++++++++ environments/.caas/hooks/pre.yml | 7 +++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/ansible/roles/cluster_infra/templates/outputs.tf.j2 b/ansible/roles/cluster_infra/templates/outputs.tf.j2 index 4d894a1dd..90556bf7a 100644 --- a/ansible/roles/cluster_infra/templates/outputs.tf.j2 +++ b/ansible/roles/cluster_infra/templates/outputs.tf.j2 @@ -24,8 +24,8 @@ output "cluster_nodes" { } }, { - name = openstack_compute_instance_v2.control.name - ip = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 + name = openstack_compute_instance_v2.control["control"].name + ip = openstack_compute_instance_v2.control["control"].network[0].fixed_ip_v4 groups = ["control", "{{ cluster_name }}_control"], facts = { openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 344137b62..e7cd05fe7 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -7,6 +7,19 @@ data "openstack_identity_auth_scope_v3" "scope" { name = "{{ cluster_name }}" } +#### +#### Data resources +#### + +resource "terraform_data" "k3s_token" { + input = "{{ k3s_token }}" + lifecycle { + ignore_changes = [ + input, + ] + } +} + ##### ##### Security groups for the cluster ##### @@ -386,6 +399,8 @@ resource "openstack_compute_instance_v2" "login" { ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" {% endif %} {% endfor %} + k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + k3s_token = "{{ k3s_token }}" } } @@ -397,9 +412,11 @@ resource "openstack_compute_instance_v2" "control" { {% else %} flavor_id = "{{ control_flavor }}" {% endif %} + for_each = toset(["control"]) network { port = openstack_networking_port_v2.control.id + access_network = true } {% if cluster_storage_network is defined %} @@ -479,6 +496,7 @@ resource "openstack_compute_instance_v2" "control" { ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" {% endif %} {% endfor %} + k3s_token = "{{ k3s_token }}" } } @@ -548,6 +566,8 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" { ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" {% endif %} {% endfor %} + k3s_server = [for n in 
openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + k3s_token = "{{ k3s_token }}" } } diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 05b0255c8..59b74a3af 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -1,5 +1,12 @@ --- +# Generate k3s token +- name: Generate k3s token + hosts: openstack + tasks: + - ansible.builtin.set_fact: + k3s_token: "{{ lookup('ansible.builtin.password', '/dev/null', length=64) }}" + # Provision the infrastructure using Terraform - name: Provision infrastructure hosts: openstack From 8b7941d7c4ee247a1354d6bae169875cb1b0d3eb Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 4 Oct 2024 10:53:59 +0100 Subject: [PATCH 033/268] fixed k3s install overwriting ansible-init changes --- ansible/roles/k3s/defaults/main.yml | 2 +- ansible/roles/k3s/tasks/main.yml | 50 +++++++++++++++++------------ 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml index af40af4e8..ed039e2e0 100644 --- a/ansible/roles/k3s/defaults/main.yml +++ b/ansible/roles/k3s/defaults/main.yml @@ -1 +1 @@ -k3s_version: "v1.31.0+k3s1" \ No newline at end of file +k3s_version: "v1.31.0+k3s1" # Warning: changes to this variable won't be reflected in the cluster/image if k3s is already installed \ No newline at end of file diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index 97ac292d0..99cb22144 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -1,27 +1,35 @@ --- -- name: Download k3s install script - ansible.builtin.get_url: - url: https://get.k3s.io/ - timeout: 120 - dest: /usr/bin/k3s-install.sh - owner: root - group: root - mode: "0755" +- name: Check for existing k3s binaries + stat: + path: /usr/bin/k3s + register: stat_result + +- name: Download and install k3s + when: not stat_result.stat.exists + block: + - name: Download k3s install script + ansible.builtin.get_url: + url: https://get.k3s.io/ + timeout: 120 + dest: /usr/bin/k3s-install.sh + owner: root + group: root + mode: "0755" -- name: Install k3s - ansible.builtin.shell: - cmd: /usr/bin/k3s-install.sh - environment: - INSTALL_K3S_VERSION: "{{ k3s_version }}" - INSTALL_K3S_EXEC: "{{ item }}" - INSTALL_K3S_SKIP_START: "true" - INSTALL_K3S_SKIP_ENABLE: "true" - INSTALL_K3S_BIN_DIR: "/usr/bin" - changed_when: true - loop: - - server - - agent + - name: Install k3s + ansible.builtin.shell: + cmd: /usr/bin/k3s-install.sh + environment: + INSTALL_K3S_VERSION: "{{ k3s_version }}" + INSTALL_K3S_EXEC: "{{ item }}" + INSTALL_K3S_SKIP_START: "true" + INSTALL_K3S_SKIP_ENABLE: "true" + INSTALL_K3S_BIN_DIR: "/usr/bin" + changed_when: true + loop: + - server + - agent - name: Install helm unarchive: From 5dfec0d2180daacb4880556a8cd5601d026eb253 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:52:55 +0100 Subject: [PATCH 034/268] bump images --- environments/.stackhpc/terraform/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 8aba2ecc3..745bcdb2e 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -30,8 +30,8 @@ variable "cluster_image" { type = map(string) default = { # https://github.com/stackhpc/ansible-slurm-appliance/pull/441 - RL8: 
"openhpc-ofed-RL8-241002-1144-2ab8e524" - RL9: "openhpc-ofed-RL9-241002-1145-2ab8e524" + RL8: "openhpc-ofed-RL8-241004-0959-8b7941d7" + RL9: "openhpc-ofed-RL9-241004-0959-8b7941d7" } } From 1035460551d09a3d1ac03742a817bea112c3bab7 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 8 Oct 2024 10:47:08 +0100 Subject: [PATCH 035/268] removed k3s ingress --- ansible/roles/k3s/files/start_k3s.yml | 17 ----------------- ansible/roles/k3s/tasks/main.yml | 2 +- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 56ef20313..78f950a59 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -18,23 +18,6 @@ line: "K3S_URL=https://{{ k3s_server_name }}:6443" when: k3s_server_name is defined - - name: Only run loadbalancer on k3s server - # avoids problems with Ondemand https server - when: k3s_server_name is undefined - block: - - name: Create override directory - ansible.builtin.file: - state: directory - path: "/etc/systemd/system/{{ service_name }}.service.d" - - - name: Set loadbalancer label on k3s server - ansible.builtin.copy: - dest: "/etc/systemd/system/{{ service_name }}.service.d/override.conf" - content: | - [Service] - ExecStart= - ExecStart=/usr/bin/k3s server --node-label svccontroller.k3s.cattle.io/enablelb=true - - name: Start k3s service ansible.builtin.systemd: name: "{{ service_name }}" diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index 99cb22144..51e05aa9c 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -28,7 +28,7 @@ INSTALL_K3S_BIN_DIR: "/usr/bin" changed_when: true loop: - - server + - server --disable=traefik - agent - name: Install helm From 7eb48218a13657a4445b1559b272fb8e7b83df98 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 8 Oct 2024 16:20:55 +0100 Subject: [PATCH 036/268] added k3s docs --- docs/k3s.README.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 docs/k3s.README.md diff --git a/docs/k3s.README.md b/docs/k3s.README.md new file mode 100644 index 000000000..20e7c0e44 --- /dev/null +++ b/docs/k3s.README.md @@ -0,0 +1,8 @@ +# Overview +A K3s cluster is deployed with the Slurm cluster. Both an agent and server instance of K3s is installed during image build and the correct service (determined by OpenStack metadata) will be +enabled during boot. Nodes with the `k3s_server` metadata field defined will be configured as K3s agents (this field gives them the address of the server). The Slurm control node is currently configured as a server while all other nodes configured as agents. It should be noted that running multiple K3s servers isn't supported. Currently only the root user on the control node has +access to the Kubernetes API. The `k3s` role installs Helm for package management. K9s is also installed in the image and can be used by the root user. + +# Idempotency +K3s is intended to only be installed during image build as it is configured by the appliance on first boot with `azimuth_cloud.image_utils.linux_ansible_init`. Therefore, the `k3s` role isn't +idempotent and changes to variables will not be reflected in the image when running `site.yml`. An additional consequence of this is that for changes to role variables to be correctly applied during build, a base image which has `ansible-init` installed but not existing K3s instances must be used. 
From e6dd8719b7ed1dea44d56b47b73fd3b056254561 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:45:00 +0100 Subject: [PATCH 037/268] bump images --- environments/.stackhpc/terraform/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 745bcdb2e..4b119d368 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -30,8 +30,8 @@ variable "cluster_image" { type = map(string) default = { # https://github.com/stackhpc/ansible-slurm-appliance/pull/441 - RL8: "openhpc-ofed-RL8-241004-0959-8b7941d7" - RL9: "openhpc-ofed-RL9-241004-0959-8b7941d7" + RL8: "openhpc-ofed-RL8-241008-1531-2861edba" + RL9: "openhpc-ofed-RL9-241008-1531-2861edba" } } From 7b3b115f6df91bbcdb1c6edac9c8fd743ba568e3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 10 Oct 2024 15:09:32 +0100 Subject: [PATCH 038/268] Fixed node passwords changing on reimage --- ansible/roles/k3s/files/start_k3s.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 78f950a59..1a5d21e72 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -6,6 +6,11 @@ k3s_server_name: "{{ os_metadata.meta.k3s_server }}" service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}" tasks: + - name: Set agent node password as token # uses token to keep password consistent between reimages + ansible.builtin.copy: + dest: /etc/rancher/node/password + content: "{{ k3s_token }}" + - name: Add the token for joining the cluster to the environment no_log: true # avoid logging the server token ansible.builtin.lineinfile: From a0d947b411286a7f08acc447cdc7753f871e727e Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 11 Oct 2024 09:11:45 +0100 Subject: [PATCH 039/268] fixed missing directory --- ansible/roles/k3s/files/start_k3s.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 1a5d21e72..9c44f407c 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -6,6 +6,11 @@ k3s_server_name: "{{ os_metadata.meta.k3s_server }}" service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}" tasks: + - name: Ensure password directory exists + ansible.builtin.file: + path: /etc/rancher/node" + state: directory + - name: Set agent node password as token # uses token to keep password consistent between reimages ansible.builtin.copy: dest: /etc/rancher/node/password From 1d1e77784d254524dc1e6052ff0340f5d07b3702 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 11 Oct 2024 09:12:18 +0100 Subject: [PATCH 040/268] typo --- ansible/roles/k3s/files/start_k3s.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 9c44f407c..8ee0e6114 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -8,7 +8,7 @@ tasks: - name: Ensure password directory exists ansible.builtin.file: - path: /etc/rancher/node" + path: "/etc/rancher/node" state: directory - name: Set agent node password as token # uses token to keep password consistent between 
reimages From 440f20c9177e3da1decc8e6a550b8ac99b86edf0 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 11 Oct 2024 13:36:21 +0100 Subject: [PATCH 041/268] moved CI image definition --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- environments/.stackhpc/terraform/main.tf | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index f62c8886e..4a434e02f 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241009-1523-354b048a", - "RL9": "openhpc-RL9-241009-1523-354b048a", + "RL8": "openhpc-ofed-RL8-241008-1531-2861edba", + "RL9": "openhpc-ofed-RL9-241008-1531-2861edba", "RL9-cuda": "openhpc-cuda-RL9-241009-1523-354b048a" } } \ No newline at end of file diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 4b119d368..4284ec132 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -28,11 +28,6 @@ variable "os_version" { variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) - default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/441 - RL8: "openhpc-ofed-RL8-241008-1531-2861edba" - RL9: "openhpc-ofed-RL9-241008-1531-2861edba" - } } variable "cluster_net" {} From d69033affed35369cf8686e2997f6b1a7a3361fe Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:15:31 +0100 Subject: [PATCH 042/268] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 4a434e02f..cb7834a6a 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-ofed-RL8-241008-1531-2861edba", - "RL9": "openhpc-ofed-RL9-241008-1531-2861edba", - "RL9-cuda": "openhpc-cuda-RL9-241009-1523-354b048a" + "RL8": "openhpc-RL8-241011-1241-440f20c9", + "RL9": "openhpc-RL9-241011-1241-440f20c9" } -} \ No newline at end of file +} From 3d2e2cd89d133b28215222c11181eac3be39a7f1 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 11 Oct 2024 16:26:43 +0100 Subject: [PATCH 043/268] added cuda image for ci --- environments/.stackhpc/terraform/cluster_image.auto.tfvars.json | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index cb7834a6a..9eedba1f0 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -2,5 +2,6 @@ "cluster_image": { "RL8": "openhpc-RL8-241011-1241-440f20c9", "RL9": "openhpc-RL9-241011-1241-440f20c9" + "RL9-cuda": "openhpc-cuda-RL9-241011-1344-440f20c9" } } From 2efa193739d9b2b80917f1e7d7ffe9b93876ef68 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 11 Oct 2024 
16:34:47 +0100 Subject: [PATCH 044/268] typo --- environments/.stackhpc/terraform/cluster_image.auto.tfvars.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 9eedba1f0..70fa3e227 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { "RL8": "openhpc-RL8-241011-1241-440f20c9", - "RL9": "openhpc-RL9-241011-1241-440f20c9" + "RL9": "openhpc-RL9-241011-1241-440f20c9", "RL9-cuda": "openhpc-cuda-RL9-241011-1344-440f20c9" } } From ba1d212a45b86f929af06802791279bdfd19c1da Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:30:36 +0100 Subject: [PATCH 045/268] corrected docs --- docs/k3s.README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/k3s.README.md b/docs/k3s.README.md index 20e7c0e44..547455132 100644 --- a/docs/k3s.README.md +++ b/docs/k3s.README.md @@ -5,4 +5,4 @@ access to the Kubernetes API. The `k3s` role installs Helm for package managemen # Idempotency K3s is intended to only be installed during image build as it is configured by the appliance on first boot with `azimuth_cloud.image_utils.linux_ansible_init`. Therefore, the `k3s` role isn't -idempotent and changes to variables will not be reflected in the image when running `site.yml`. An additional consequence of this is that for changes to role variables to be correctly applied during build, a base image which has `ansible-init` installed but not existing K3s instances must be used. +idempotent and changes to variables will not be reflected in the image when running `site.yml`. An additional consequence of this is that for changes to role variables to be correctly applied when extending a base image with a Packer `openhpc-extra` build, the base image must have `ansible-init` installed but not existing K3s instances. 
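A minimal sketch of a sanity check for such a base image (not part of the appliance): it fails fast if K3s is already baked in, using the same path the role's `tasks/main.yml` checks.

```yaml
# Run against a test VM booted from the candidate base image
- hosts: builder
  become: true
  tasks:
    - name: Check for an existing K3s installation
      ansible.builtin.stat:
        path: /var/lib/rancher/k3s
      register: _k3s_stat

    - name: Fail if K3s is already installed
      ansible.builtin.assert:
        that: not _k3s_stat.stat.exists
        fail_msg: Base image already contains K3s, so k3s role variable changes will not apply
```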
From 904df8ae35d87b7215b40dc2da3879a00d9d1846 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 22 Oct 2024 17:23:36 +0100 Subject: [PATCH 046/268] k3s install now air-gapped --- ansible/roles/k3s/defaults/main.yml | 6 +++++- ansible/roles/k3s/tasks/main.yml | 31 ++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml index ed039e2e0..f8933217b 100644 --- a/ansible/roles/k3s/defaults/main.yml +++ b/ansible/roles/k3s/defaults/main.yml @@ -1 +1,5 @@ -k3s_version: "v1.31.0+k3s1" # Warning: changes to this variable won't be reflected in the cluster/image if k3s is already installed \ No newline at end of file +# Warning: changes to these variables won't be reflected in the cluster/image if k3s is already installed +k3s_version: "v1.31.0+k3s1" +k3s_selinux_release: v1.6.latest.1 +k3s_selinux_rpm_version: 1.6-1 +rocky_version: "{{ ansible_distribution_major_version }}" diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index 51e05aa9c..3ac19c3e4 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -1,13 +1,37 @@ --- -- name: Check for existing k3s binaries +- name: Check for existing k3s installation stat: - path: /usr/bin/k3s + path: /var/lib/rancher/k3s register: stat_result -- name: Download and install k3s +- name: Download and air-gapped installation of k3s when: not stat_result.stat.exists block: + + - name: Download k3s binary + ansible.builtin.get_url: + url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s" + dest: /usr/bin/k3s + owner: root + group: root + mode: "0755" + + - name: Install k3s SELinux policy package + yum: + name: "https://github.com/k3s-io/k3s-selinux/releases/download/{{ k3s_selinux_release }}/k3s-selinux-{{ k3s_selinux_rpm_version }}.el{{ rocky_version }}.noarch.rpm" + disable_gpg_check: true + + - name: Create image directory + ansible.builtin.file: + path: "/var/lib/rancher/k3s/agent/images" + state: directory + + - name: Install k3s' internal images + ansible.builtin.get_url: + url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s-airgap-images-amd64.tar.zst" + dest: /var/lib/rancher/k3s/agent/images/k3s-airgap-images-amd64.tar.zst + - name: Download k3s install script ansible.builtin.get_url: url: https://get.k3s.io/ @@ -26,6 +50,7 @@ INSTALL_K3S_SKIP_START: "true" INSTALL_K3S_SKIP_ENABLE: "true" INSTALL_K3S_BIN_DIR: "/usr/bin" + INSTALL_K3S_SKIP_DOWNLOAD: "true" changed_when: true loop: - server --disable=traefik From a67ffd32601741bd23e2d90d115aebaa1ec4ef85 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 23 Oct 2024 09:51:39 +0100 Subject: [PATCH 047/268] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 70fa3e227..83b49afd5 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241011-1241-440f20c9", - "RL9": "openhpc-RL9-241011-1241-440f20c9", - "RL9-cuda": "openhpc-cuda-RL9-241011-1344-440f20c9" + "RL8": "openhpc-RL8-241022-1626-904df8ae", + "RL9": "openhpc-RL9-241022-1626-904df8ae", + 
"RL9-cuda": "openhpc-cuda-RL9-241023-0809-904df8ae" } } From 7008500b74e3a30f6c7df025092565aebb864e6c Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 25 Oct 2024 16:31:00 +0100 Subject: [PATCH 048/268] ci images bumped up to date with main --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 83b49afd5..e1a0166e6 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241022-1626-904df8ae", - "RL9": "openhpc-RL9-241022-1626-904df8ae", - "RL9-cuda": "openhpc-cuda-RL9-241023-0809-904df8ae" + "RL8": "openhpc-RL8-241025-1446-04db97d3", + "RL9": "openhpc-RL9-241025-1446-04db97d3", + "RL9-cuda": "openhpc-cuda-RL9-241025-1446-04db97d3" } } From 054cb73a3eb16d2f5aaa09dd07a08ff40d421395 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 29 Oct 2024 15:49:42 +0000 Subject: [PATCH 049/268] copy /etc/hosts to /exports/hosts/hosts --- ansible/roles/etc_hosts/tasks/main.yml | 23 +++++++++++++++++++ .../common/inventory/group_vars/all/nfs.yml | 6 +++++ 2 files changed, 29 insertions(+) diff --git a/ansible/roles/etc_hosts/tasks/main.yml b/ansible/roles/etc_hosts/tasks/main.yml index 6fdabf57c..1d04ebf7c 100644 --- a/ansible/roles/etc_hosts/tasks/main.yml +++ b/ansible/roles/etc_hosts/tasks/main.yml @@ -6,3 +6,26 @@ group: root mode: 0644 become: yes + +- name: Ensure /exports/hosts directory exists and copy /etc/hosts + block: + - name: Ensure the /exports/hosts directory exists + file: + path: /exports/hosts + state: directory + owner: root + group: root + mode: 0755 + become: yes + delegate_to: "{{ groups['control'] | first }}" + + - name: Copy /etc/hosts to NFS exported directory + copy: + src: /etc/hosts + dest: /exports/hosts/hosts + owner: root + group: root + mode: 0644 + remote_src: true + become: yes + delegate_to: "{{ groups['control'] | first }}" \ No newline at end of file diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index bd340b190..110a1383c 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -15,3 +15,9 @@ nfs_configurations: nfs_server: "{{ nfs_server_default }}" nfs_export: "/exports/home" # assumes skeleton TF is being used nfs_client_mnt_point: "/home" + + - comment: Export /etc/hosts copy from Slurm control node + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: false + nfs_export: "/exports/hosts" # control node has to copy in /etc/hosts to here \ No newline at end of file From f7e77602bfa3b010c421e3aad4fad1069ca0c398 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 30 Oct 2024 08:05:02 +0000 Subject: [PATCH 050/268] Support lustre client (#447) * WIP: add lustre role * allow definition of multiple lustre_mounts * fix lustre build for 2.15.5 release candidate * simplify lustre defaults * allow lustre install during build to get kernel version * allow extending fat images with site-specific groups * fix packer build so only roles for defined groups run * enable control of 'extra' build image name * bump to release lustre * add lnet 
configuration * simplify lustre mount logic * provide lnet config * autodetermine lustre interface * WIP: validation needs fixing for lustre_mounts removal * add working lnet.conf template * refactor lustre role for multiple mounts, selectable lnet label * remove unneeded comments from lustre taskfiles * fix lustre net type * fixup opensearch install permissions * add docs for extra builds * fix packer volume size definition * fix missing image name for cuda build * bump CI image * update packer README for modified image vars * move packer docs into docs/ * make packer extra build directly configurable * tidy packer docs * fix build error 'Error: Unset variable extra_build_volume_size' * fix error with null default during volume size lookup * note lnet protocol limitation * bump CI image to test --- .github/workflows/fatimage.yml | 2 +- ansible/.gitignore | 3 +- ansible/fatimage.yml | 13 +- ansible/filesystems.yml | 10 ++ ansible/roles/lustre/README.md | 27 +++++ ansible/roles/lustre/defaults/main.yml | 36 ++++++ ansible/roles/lustre/tasks/configure.yml | 47 ++++++++ ansible/roles/lustre/tasks/install.yml | 70 +++++++++++ ansible/roles/lustre/tasks/validate.yml | 27 +++++ ansible/roles/lustre/templates/lnet.conf.j2 | 6 + ansible/validate.yml | 8 ++ docs/image-build.md | 113 ++++++++++++++++++ .../terraform/cluster_image.auto.tfvars.json | 6 +- environments/common/inventory/groups | 5 +- environments/common/layouts/everything | 5 +- packer/README.md | 86 ------------- packer/openstack.pkr.hcl | 23 +++- 17 files changed, 389 insertions(+), 98 deletions(-) create mode 100644 ansible/roles/lustre/README.md create mode 100644 ansible/roles/lustre/defaults/main.yml create mode 100644 ansible/roles/lustre/tasks/configure.yml create mode 100644 ansible/roles/lustre/tasks/install.yml create mode 100644 ansible/roles/lustre/tasks/validate.yml create mode 100644 ansible/roles/lustre/templates/lnet.conf.j2 create mode 100644 docs/image-build.md delete mode 100644 packer/README.md diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 5425eb4e3..947f9410f 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -117,4 +117,4 @@ jobs: path: | ./image-id.txt ./image-name.txt - overwrite: true \ No newline at end of file + overwrite: true diff --git a/ansible/.gitignore b/ansible/.gitignore index 2ceeb596b..f6f5c5f4d 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -58,4 +58,5 @@ roles/* !roles/squid/** !roles/tuned/ !roles/tuned/** - +!roles/lustre/ +!roles/lustre/** diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index e623c2794..7cad2dc59 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -25,7 +25,7 @@ - hosts: builder become: yes - gather_facts: no + gather_facts: yes tasks: # - import_playbook: iam.yml - name: Install FreeIPA client @@ -44,6 +44,11 @@ name: stackhpc.os-manila-mount tasks_from: install.yml when: "'manila' in group_names" + - name: Install Lustre packages + include_role: + name: lustre + tasks_from: install.yml + when: "'lustre' in group_names" - import_playbook: extras.yml @@ -57,6 +62,7 @@ name: mysql tasks_from: install.yml when: "'mysql' in group_names" + - name: OpenHPC import_role: name: stackhpc.openhpc @@ -83,18 +89,21 @@ import_role: name: openondemand tasks_from: vnc_compute.yml + when: "'openondemand_desktop' in group_names" + - name: Open Ondemand jupyter node import_role: name: openondemand tasks_from: jupyter_compute.yml - when: "'openondemand' in group_names" + when: 
"'openondemand_jupyter' in group_names" # - import_playbook: monitoring.yml: - import_role: name: opensearch tasks_from: install.yml when: "'opensearch' in group_names" + # slurm_stats - nothing to do - import_role: name: filebeat diff --git a/ansible/filesystems.yml b/ansible/filesystems.yml index e1a782bad..4665c0f8f 100644 --- a/ansible/filesystems.yml +++ b/ansible/filesystems.yml @@ -24,3 +24,13 @@ tasks: - include_role: name: stackhpc.os-manila-mount + +- name: Setup Lustre clients + hosts: lustre + become: true + tags: lustre + tasks: + - include_role: + name: lustre + # NB install is ONLY run in builder + tasks_from: configure.yml diff --git a/ansible/roles/lustre/README.md b/ansible/roles/lustre/README.md new file mode 100644 index 000000000..c0a25e037 --- /dev/null +++ b/ansible/roles/lustre/README.md @@ -0,0 +1,27 @@ +# lustre + +Install and configure a Lustre client. This builds RPM packages from source. + +**NB:** The `install.yml` playbook in this role should only be run during image build and is not idempotent. This will install the `kernel-devel` package; if not already installed (e.g. from an `ofed` installation), this may require enabling update of DNF packages during build using `update_enable=true`, which will upgrade the kernel as well. + +**NB:** Currently this only supports RockyLinux 9. + +## Role Variables + +- `lustre_version`: Optional str. Version of lustre to build, default `2.15.5` which is the first version with EL9 support +- `lustre_lnet_label`: Optional str. The "lnet label" part of the host's NID, e.g. `tcp0`. Only the `tcp` protocol type is currently supported. Default `tcp`. +- `lustre_mgs_nid`: Required str. The NID(s) for the MGS, e.g. `192.168.227.11@tcp1` (separate mutiple MGS NIDs using `:`). +- `lustre_mounts`: Required list. Define Lustre filesystems and mountpoints as a list of dicts with keys: + - `fs_name`: Required str. The name of the filesystem to mount + - `mount_point`: Required str. Path to mount filesystem at. + - `mount_state`: Optional mount state, as for [ansible.posix.mount](https://docs.ansible.com/ansible/latest/collections/ansible/posix/mount_module.html#parameter-state). Default is `lustre_mount_state`. + - `mount_options`: Optional mount options. Default is `lustre_mount_options`. +- `lustre_mount_state`. Optional default mount state for all mounts, as for [ansible.posix.mount](https://docs.ansible.com/ansible/latest/collections/ansible/posix/mount_module.html#parameter-state). Default is `mounted`. +- `lustre_mount_options`. Optional default mount options. Default values are systemd defaults from [Lustre client docs](http://wiki.lustre.org/Mounting_a_Lustre_File_System_on_Client_Nodes). + +The following variables control the package build and and install and should not generally be required: +- `lustre_build_packages`: Optional list. Prerequisite packages required to build Lustre. See `defaults/main.yml`. +- `lustre_build_dir`: Optional str. Path to build lustre at, default `/tmp/lustre-release`. +- `lustre_configure_opts`: Optional list. Options to `./configure` command. Default builds client rpms supporting Mellanox OFED, without support for GSS keys. +- `lustre_rpm_globs`: Optional list. Shell glob patterns for rpms to install. Note order is important as the built RPMs are not in a yum repo. Default is just the `kmod-lustre-client` and `lustre-client` packages. +- `lustre_build_cleanup`: Optional bool. Whether to uninstall prerequisite packages and delete the build directories etc. Default `true`. 
diff --git a/ansible/roles/lustre/defaults/main.yml b/ansible/roles/lustre/defaults/main.yml new file mode 100644 index 000000000..be008ad55 --- /dev/null +++ b/ansible/roles/lustre/defaults/main.yml @@ -0,0 +1,36 @@ +lustre_version: '2.15.5' # https://www.lustre.org/lustre-2-15-5-released/ +lustre_lnet_label: tcp +#lustre_mgs_nid: +lustre_mounts: [] +lustre_mount_state: mounted +lustre_mount_options: 'defaults,_netdev,noauto,x-systemd.automount,x-systemd.requires=lnet.service' + +# below variables are for build and should not generally require changes +lustre_build_packages: + - "kernel-devel-{{ ansible_kernel }}" + - git + - gcc + - libtool + - python3 + - python3-devel + - openmpi + - elfutils-libelf-devel + - libmount-devel + - libnl3-devel + - libyaml-devel + - rpm-build + - kernel-abi-stablelists + - libaio + - libaio-devel +lustre_build_dir: /tmp/lustre-release +lustre_configure_opts: + - --disable-server + - --with-linux=/usr/src/kernels/* + - --with-o2ib=/usr/src/ofa_kernel/default + - --disable-maintainer-mode + - --disable-gss-keyring + - --enable-mpitests=no +lustre_rpm_globs: # NB: order is important here, as not installing from a repo + - "kmod-lustre-client-{{ lustre_version | split('.') | first }}*" # only take part of the version as -RC versions produce _RC rpms + - "lustre-client-{{ lustre_version | split('.') | first }}*" +lustre_build_cleanup: true diff --git a/ansible/roles/lustre/tasks/configure.yml b/ansible/roles/lustre/tasks/configure.yml new file mode 100644 index 000000000..b77e02ed9 --- /dev/null +++ b/ansible/roles/lustre/tasks/configure.yml @@ -0,0 +1,47 @@ +- name: Gather Lustre interface info + shell: + cmd: | + ip r get {{ _lustre_mgs_ip }} + changed_when: false + register: _lustre_ip_r_mgs + vars: + _lustre_mgs_ip: "{{ lustre_mgs_nid | split('@') | first }}" + +- name: Set facts for Lustre interface + set_fact: + _lustre_interface: "{{ _lustre_ip_r_mgs_info[4] }}" + _lustre_ip: "{{ _lustre_ip_r_mgs_info[6] }}" + vars: + _lustre_ip_r_mgs_info: "{{ _lustre_ip_r_mgs.stdout_lines.0 | split }}" + # first line e.g. 
"10.167.128.1 via 10.179.0.2 dev eth0 src 10.179.3.149 uid 1000" + +- name: Write LNet configuration file + template: + src: lnet.conf.j2 + dest: /etc/lnet.conf # exists from package install, expected by lnet service + owner: root + group: root + mode: u=rw,go=r # from package install + register: _lnet_conf + +- name: Ensure lnet service state + systemd: + name: lnet + state: "{{ 'restarted' if _lnet_conf.changed else 'started' }}" + +- name: Ensure mount points exist + ansible.builtin.file: + path: "{{ item.mount_point }}" + state: directory + loop: "{{ lustre_mounts }}" + when: "(item.mount_state | default(lustre_mount_state)) != 'absent'" + +- name: Mount lustre filesystem + ansible.posix.mount: + fstype: lustre + src: "{{ lustre_mgs_nid }}:/{{ item.fs_name }}" + path: "{{ item.mount_point }}" + state: "{{ (item.mount_state | default(lustre_mount_state)) }}" + opts: "{{ item.mount_options | default(lustre_mount_options) }}" + loop: "{{ lustre_mounts }}" + \ No newline at end of file diff --git a/ansible/roles/lustre/tasks/install.yml b/ansible/roles/lustre/tasks/install.yml new file mode 100644 index 000000000..e0af857cf --- /dev/null +++ b/ansible/roles/lustre/tasks/install.yml @@ -0,0 +1,70 @@ +- name: Install lustre build prerequisites + ansible.builtin.dnf: + name: "{{ lustre_build_packages }}" + register: _lustre_dnf_build_packages + +- name: Clone lustre git repo + # https://git.whamcloud.com/?p=fs/lustre-release.git;a=summary + ansible.builtin.git: + repo: git://git.whamcloud.com/fs/lustre-release.git + dest: "{{ lustre_build_dir }}" + version: "{{ lustre_version }}" + +- name: Prepare for lustre configuration + ansible.builtin.command: + cmd: sh ./autogen.sh + chdir: "{{ lustre_build_dir }}" + +- name: Configure lustre build + ansible.builtin.command: + cmd: "./configure {{ lustre_configure_opts | join(' ') }}" + chdir: "{{ lustre_build_dir }}" + +- name: Build lustre + ansible.builtin.command: + cmd: make rpms + chdir: "{{ lustre_build_dir }}" + +- name: Find rpms + ansible.builtin.find: + paths: "{{ lustre_build_dir }}" + patterns: "{{ lustre_rpm_globs }}" + use_regex: false + register: _lustre_find_rpms + +- name: Check rpms found + assert: + that: _lustre_find_rpms.files | length + fail_msg: "No lustre repos found with lustre_rpm_globs = {{ lustre_rpm_globs }}" + +- name: Install lustre rpms + ansible.builtin.dnf: + name: "{{ _lustre_find_rpms.files | map(attribute='path')}}" + disable_gpg_check: yes + +- block: + - name: Remove lustre build prerequisites + # NB Only remove ones this role installed which weren't upgrades + ansible.builtin.dnf: + name: "{{ _new_pkgs }}" + state: absent + vars: + _installed_pkgs: | + {{ + _lustre_dnf_build_packages.results | + select('match', 'Installed:') | + map('regex_replace', '^Installed: (.+?)-[0-9].*$', '\1') + }} + _removed_pkgs: | + {{ + _lustre_dnf_build_packages.results | + select('match', 'Removed:') | + map('regex_replace', '^Removed: (.+?)-[0-9].*$', '\1') + }} + _new_pkgs: "{{ _installed_pkgs | difference(_removed_pkgs) }}" + + - name: Delete lustre build dir + file: + path: "{{ lustre_build_dir }}" + state: absent + when: lustre_build_cleanup | bool diff --git a/ansible/roles/lustre/tasks/validate.yml b/ansible/roles/lustre/tasks/validate.yml new file mode 100644 index 000000000..fe65a4d1a --- /dev/null +++ b/ansible/roles/lustre/tasks/validate.yml @@ -0,0 +1,27 @@ +- name: Assert using RockyLinux 9 + assert: + that: ansible_distribution_major_version | int == 9 + fail_msg: The 'lustre' role requires RockyLinux 9 + +- name: 
Check kernel-devel package is installed + command: "dnf list --installed kernel-devel-{{ ansible_kernel }}" + changed_when: false + # NB: we don't check here the kernel will remain the same after reboot etc, see ofed/install.yml + +- name: Ensure SELinux in permissive mode + assert: + that: selinux_state in ['permissive', 'disabled'] + fail_msg: "SELinux must be permissive for Lustre not '{{ selinux_state }}'; see variable selinux_state" + +- name: Ensure lustre_mgs_nid is defined + assert: + that: lustre_mgs_nid is defined + fail_msg: Variable lustre_mgs_nid must be defined + +- name: Ensure lustre_mounts entries define filesystem name and mount point + assert: + that: + - item.fs_name is defined + - item.mount_point is defined + fail_msg: All lustre_mounts entries must specify fs_name and mount_point + loop: "{{ lustre_mounts }}" diff --git a/ansible/roles/lustre/templates/lnet.conf.j2 b/ansible/roles/lustre/templates/lnet.conf.j2 new file mode 100644 index 000000000..363308e32 --- /dev/null +++ b/ansible/roles/lustre/templates/lnet.conf.j2 @@ -0,0 +1,6 @@ +net: + - net type: {{ lustre_lnet_label }} + local NI(s): + - nid: {{ _lustre_ip }}@{{ lustre_lnet_label }} + interfaces: + 0: {{ _lustre_interface }} diff --git a/ansible/validate.yml b/ansible/validate.yml index fae9c2f68..d02caac60 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -85,3 +85,11 @@ - import_role: name: freeipa tasks_from: validate.yml + +- name: Validate lustre configuration + hosts: lustre + tags: lustre + tasks: + - import_role: + name: lustre + tasks_from: validate.yml diff --git a/docs/image-build.md b/docs/image-build.md new file mode 100644 index 000000000..4896bde57 --- /dev/null +++ b/docs/image-build.md @@ -0,0 +1,113 @@ +# Packer-based image build + +The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. + +The Packer configuration defined here builds "fat images" which contain binaries for all nodes, but no cluster-specific configuration. Using these: +- Enables the image to be tested in CI before production use. +- Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). +- Improves deployment speed by reducing the number of package downloads to improve deployment speed. + +By default, a fat image build starts from a nightly image build containing Mellanox OFED, and updates all DNF packages already present. The 'latest' nightly build itself is from a RockyLinux GenericCloud image. + +The fat images StackHPC builds and test in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: +1. Build site-specific fat images from scratch. +2. Extend an existing fat image with additional software. + + +# Usage + +The steps for building site-specific fat images or extending an existing fat image are the same: + +1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). +2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. 
`environments//builder.pkrvars.hcl` containing at a minimum e.g.: + + ```hcl + flavor = "general.v1.small" # VM flavor to use for builder VMs + networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to + ``` + Note that: + - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). + - For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`. + - For an example of configuration for extending an existing fat image see below. + +3. Activate the venv and the relevant environment. + +4. Build images using the relevant variable definition file, e.g.: + + cd packer/ + PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl + + Note that the `-only` flag here restricts Packer to a single specific "build" definition (in Packer terminology). Options here are: + - `-only=openstack.openhpc`: Build a fat image including Mellanox OFED + - `-only=openstack.openhpc-cuda`: Build a fat image including Mellanox OFED, Nvidia drivers and CUDA + - `-only=openstack.openhpc-extra`: Build an image which *extends* an existing fat image + +5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash. + +# Defining an "extra" image build + +An "extra" image build starts with an existing fat image (e.g. one provided by StackHPC) rather than a RockyLinux GenericCloud image, and only runs a specific subset of the +Ansible in the appliance. This allows adding additional functionality into site-specific images, without modifying the existing functionality in the base fat image. This is the recommended way to build site-specific images. + +To configure an "extra" image build, prepare a Packer variable definition file as described above but also including: + +- `extra_build_image_name`: A string to add into the final image name. +- `source_image` or `source_image_name`: The UUID or name of the fat image to start from (which must already be present in OpenStack). +- `extra_build_groups`: A list of Ansible inventory groups to put the build VM into, in addition to the `builder` group. This defines the roles/functionality + which are added to the image. +- `extra_build_volume_size`: A number giving the size in GB of the volume for the build VM's root disk and therefore the resulting image size. + Note this assumes the default of `use_blockstorage_volume = true`. + +E.g. to add the lustre client to an RockyLinux 9 image: + + # environments/site/lustre.pkvars.hcl + + extra_build_image_name = "lustre" # output image name will be like "openhpc-lustre-RL9-$timestamp-$commit" + source_image_name = "openhpc-ofed-RL9-240906-1041-32568dbb" # e.g. current StackHPC RL9 image + extra_build_groups = ["lustre"] # only run lustre role during this extra build + extra_build_volume_size = 15 # default non-CUDA build image size has enough free space + + # ... 
define flavor, network, etc as normal
+
+
+Then, reference this build and variables file in the Packer build command:
+
+    PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc-extra --on-error=ask -var-file=environments/site/lustre.pkvars.hcl openstack.pkr.hcl
+
+**NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property:
+
+    openstack image show $SOURCE_IMAGE
+
+If it does, remove this property:
+
+    openstack image unset --property signature_verified $SOURCE_IMAGE
+
+then delete the failed volume, choose to cancel the build when Packer prompts, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445).
+
+# Build Process
+
+In summary, Packer creates an OpenStack VM, runs Ansible on that, shuts it down, then creates an image from the root disk.
+
+Many of the Packer variables defined in `openstack.pkr.hcl` control the definition of the build VM and how to SSH to it to run Ansible. These are generic OpenStack builder options
+and are not specific to the Slurm Appliance. Packer variables can be set in a file at any convenient path; the build example above
+shows the use of the environment variable `$PKR_VAR_environment_root` (which itself sets the Packer variable
+`environment_root`) to automatically select a variable file from the current environment, but for site-specific builds
+using a path in a "parent" environment is likely to be more appropriate (as builds should not be environment-specific to allow testing before deployment to a production environment).
+
+What is Slurm Appliance-specific are the details of how Ansible is run:
+- The build VM is always added to the `builder` inventory group, which differentiates it from nodes in a cluster. This allows
+  Ansible variables to be set differently during Packer builds, e.g. to prevent services starting. The defaults for this are in `environments/common/inventory/group_vars/builder/`, which could be extended or overridden for site-specific fat image builds using `builder` groupvars for the relevant environment. It also runs some builder-specific code (e.g. to clean up the image).
+- The default fat image builds also add the build VM to the "top-level" `compute`, `control` and `login` groups. This ensures
+  the Ansible specific to all of these types of nodes runs. Note other inventory groups are constructed from these by the `environments/common/inventory/groups` file - this is not builder-specific.
+- As noted above, for "extra" builds the additional groups can be specified directly. In this way an existing image can be extended with site-specific Ansible, without modifying the
+  part of the image which has already been tested in the StackHPC CI.
+- The playbook `ansible/fatimage.yml` is run, which is only a subset of `ansible/site.yml`. This allows restricting the code which runs during build for cases where setting `builder`
+  groupvars is not sufficient (e.g. a role always attempts to configure or start services).
+
+There are some things to be aware of when developing Ansible to run in a Packer build VM:
+ - Only some tasks make sense. E.g. any services with a reliance on the network cannot be started, and should not be enabled if, when creating an instance with the resulting image, the remote service will not be immediately present.
+ - Nothing should be written to the persistent state directory `appliances_state_dir`, as this is on the root filesystem rather than an OpenStack volume.
+ - Care should be taken not to leave data on the root filesystem which is not wanted in the final image (e.g secrets). + - Build VM hostnames are not the same as for equivalent "real" hosts and do not contain `login`, `control` etc. Therefore variables used by the build VM must be defined as groupvars not hostvars. + - Ansible may need to use a proxyjump to reach cluster nodes, which can be defined via Ansible's `ansible_ssh_common_args` variable. If Packer should not use the same proxy + to connect to build VMs (e.g. because build happens on a different network), this proxy configuration should not be added to the `all` group. diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index c07b2c4ac..9f396e964 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241022-0441-a5affa58", - "RL9": "openhpc-RL9-241022-0038-a5affa58", - "RL9-cuda": "openhpc-cuda-RL9-241022-0441-a5affa58" + "RL8": "openhpc-RL8-241024-1439-177083b1", + "RL9": "openhpc-RL9-241024-1438-177083b1", + "RL9-cuda": "openhpc-cuda-RL9-241024-1628-177083b1" } } \ No newline at end of file diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index ea0bebebc..cd9a5cb0c 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -134,4 +134,7 @@ freeipa_client # Hosts to run TuneD configuration [ansible_init] -# Hosts to run linux-anisble-init \ No newline at end of file +# Hosts to run linux-anisble-init + +[lustre] +# Hosts to run lustre client diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 205f1d334..be0b3d1b7 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -80,4 +80,7 @@ openhpc [ansible_init:children] # Hosts to run ansible-init -cluster \ No newline at end of file +cluster + +[lustre] +# Hosts to run lustre client diff --git a/packer/README.md b/packer/README.md deleted file mode 100644 index 5e1d57dc2..000000000 --- a/packer/README.md +++ /dev/null @@ -1,86 +0,0 @@ -# Packer-based image build - -The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. - -The Packer configuration defined here builds "fat images" which contain binaries for all nodes, but no cluster-specific configuration. Using these: -- Enables the image to be tested in CI before production use. -- Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). -- Improves deployment speed by reducing the number of package downloads to improve deployment speed. - -By default, a fat image build starts from a nightly image build containing Mellanox OFED, and updates all DNF packages already present. The 'latest' nightly build itself is from a RockyLinux GenericCloud image. - -The fat images StackHPC builds and test in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: -1. Build site-specific fat images from scratch. -2. 
Extend an existing fat image with additional software. - - -# Usage - -The steps for building site-specific fat images or extending an existing fat image are the same: - -1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). -2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum e.g.: - - ```hcl - flavor = "general.v1.small" # VM flavor to use for builder VMs - networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to - ``` - - - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). - - - For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`. - - - For an example of configuration for extending an existing fat image see below. - -3. Activate the venv and the relevant environment. - -4. Build images using the relevant variable definition file, e.g.: - - cd packer/ - PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - - Note that the `-only` flag here restricts the build to the non-CUDA fat image "source" (in Packer terminology). Other - source options are: - - `-only=openstack.openhpc-cuda`: Build a fat image including CUDA packages. - - `-only=openstack.openhpc-extra`: Build an image which extends an existing fat image - in this case the variable `source_image` or `source_image_name}` must also be set in the Packer variables file. - -5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash. - -# Build Process - -In summary, Packer creates an OpenStack VM, runs Ansible on that, shuts it down, then creates an image from the root disk. - -Many of the Packer variables defined in `openstack.pkr.hcl` control the definition of the build VM and how to SSH to it to run Ansible, which are generic OpenStack builder options. Packer varibles can be set in a file at any convenient path; the above -example shows the use of the environment variable `$PKR_VAR_environment_root` (which itself sets the Packer variable -`environment_root`) to automatically select a variable file from the current environment, but for site-specific builds -using a path in a "parent" environment is likely to be more appropriate (as builds should not be environment-specific, to allow testing). - -What is Slurm Appliance-specific are the details of how Ansible is run: -- The build VM is always added to the `builder` inventory group, which differentiates it from "real" nodes. This allows - variables to be set differently during Packer builds, e.g. to prevent services starting. The defaults for this are in `environments/common/inventory/group_vars/builder/`, which could be extended or overriden for site-specific fat image builds using `builder` groupvars for the relevant environment. It also runs some builder-specific code (e.g. to ensure Packer's SSH - keys are removed from the image). 
-- The default fat image build also adds the build VM to the "top-level" `compute`, `control` and `login` groups. This ensures - the Ansible specific to all of these types of nodes run (other inventory groups are constructed from these by `environments/common/inventory/groups file` - this is not builder-specific). -- Which groups the build VM is added to is controlled by the Packer `groups` variable. This can be redefined for builds using the `openhpc-extra` source to add the build VM into specific groups. E.g. with a Packer variable file: - - source_image_name = { - RL9 = "openhpc-ofed-RL9-240619-0949-66c0e540" - } - groups = { - openhpc-extra = ["foo"] - } - - the build VM uses an existing "fat image" (rather than a 'latest' nightly one) and is added to the `builder` and `foo` groups. This means only code targeting `builder` and `foo` groups runs. In this way an existing image can be extended with site-specific code, without modifying the part of the image which has already been tested in the StackHPC CI. - - - The playbook `ansible/fatimage.yml` is run which is only a subset of `ansible/site.yml`. This allows restricting the code - which runs during build for cases where setting `builder` groupvars is not sufficient (e.g. a role always attempts to configure or start services). This may eventually be removed. - -There are some things to be aware of when developing Ansible to run in a Packer build VM: - - Only some tasks make sense. E.g. any services with a reliance on the network cannot be started, and may not be able to be enabled if when creating an instance with the resulting image the remote service will not be immediately present. - - Nothing should be written to the persistent state directory `appliances_state_dir`, as this is on the root filesystem rather than an OpenStack volume. - - Care should be taken not to leave data on the root filesystem which is not wanted in the final image, (e.g secrets). - - Build VM hostnames are not the same as for equivalent "real" hosts and do not contain `login`, `control` etc. Therefore variables used by the build VM must be defined as groupvars not hostvars. - - Ansible may need to proxy to real compute nodes. If Packer should not use the same proxy to connect to the - build VMs (e.g. build happens on a different network), proxy configuration should not be added to the `all` group. - - Currently two fat image "sources" are defined, with and without CUDA. This simplifies CI configuration by allowing the - default source images to be defined in the `openstack.pkr.hcl` definition. 
diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index fe922c78e..fae0bf7b2 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -48,6 +48,7 @@ variable "os_version" { # Must supply either source_image_name or source_image_id variable "source_image_name" { type = string + default = null description = "name of source image" } @@ -132,6 +133,11 @@ variable "volume_size" { } } +variable "extra_build_volume_size" { + type = number + default = 15 # same as default non-CUDA build +} + variable "image_disk_format" { type = string default = "qcow2" @@ -154,12 +160,23 @@ variable "groups" { } } +variable "extra_build_groups" { + type = list(string) + default = [] +} + +variable "extra_build_image_name" { + type = string + description = "Infix for 'extra' build image name" + default = "extra" +} + source "openstack" "openhpc" { # Build VM: flavor = var.flavor use_blockstorage_volume = var.use_blockstorage_volume volume_type = var.volume_type - volume_size = var.volume_size[source.name] + volume_size = lookup(var.volume_size, source.name, var.extra_build_volume_size) metadata = var.metadata instance_metadata = {ansible_init_disable = "true"} networks = var.networks @@ -214,12 +231,12 @@ build { # Extended site-specific image, built on fat image: source "source.openstack.openhpc" { name = "openhpc-extra" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" + image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" } provisioner "ansible" { playbook_file = "${var.repo_root}/ansible/fatimage.yml" - groups = concat(["builder"], var.groups[source.name]) + groups = concat(["builder"], lookup(var.groups, source.name, var.extra_build_groups)) keep_inventory_file = true # for debugging use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting extra_arguments = [ From 36de79680823f62779595f7cb086a9375f1811c7 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 31 Oct 2024 11:51:13 +0000 Subject: [PATCH 051/268] add resolv_conf role to compute script --- ansible/.gitignore | 2 + ansible/extras.yml | 8 +++ .../roles/compute_init/files/compute-init.yml | 59 +++++++++++++++++++ ansible/roles/compute_init/tasks/main.yml | 45 ++++++++++++++ environments/common/inventory/groups | 5 +- environments/common/layouts/everything | 6 +- 6 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 ansible/roles/compute_init/files/compute-init.yml create mode 100644 ansible/roles/compute_init/tasks/main.yml diff --git a/ansible/.gitignore b/ansible/.gitignore index 2ceeb596b..677b4c31f 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -58,4 +58,6 @@ roles/* !roles/squid/** !roles/tuned/ !roles/tuned/** +!roles/compute_init/ +!roles/compute_init/** diff --git a/ansible/extras.yml b/ansible/extras.yml index c32f51c32..18bad1dfd 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -36,3 +36,11 @@ tasks: - import_role: name: persist_hostkeys + +- name: Inject ansible-init compute script + hosts: compute_init + tags: compute_init + become: yes + tasks: + - import_role: + name: compute_init \ No newline at end of file diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml new file mode 100644 index 000000000..ce797c1cf --- /dev/null +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -0,0 +1,59 @@ +--- + +- name: Compute node initialisation + hosts: localhost 
+ become: yes + vars: + control_node_ip: "172.16.1.228" + nfs_export: "/exports/hosts" + resolv_conf_nameservers: [] + + tasks: + - name: Configure resolve.conf + block: + - name: Set nameservers in /etc/resolv.conf + ansible.builtin.template: + src: /etc/ansible-init/templates/resolv.conf.j2 + dest: /etc/resolv.conf + owner: root + group: root + mode: u=rw,og=r + + - name: Disable NetworkManager control of resolv.conf + ansible.builtin.copy: + src: /etc/ansible-init/files/NetworkManager-dns-none.conf + dest: /etc/NetworkManager/conf.d/90-dns-none.conf + owner: root + group: root + mode: u=rw,og=r + register: _copy_nm_config + + - name: Reload NetworkManager + ansible.builtin.systemd: + name: NetworkManager + state: reloaded + when: _copy_nm_config.changed | default(false) + + - name: Mount /etc/hosts on compute nodes + block: + - name: Ensure the mount directory exists + file: + path: /mnt/hosts + state: directory + mode: 0755 + + - name: Mount NFS export + mount: + path: /mnt/hosts + src: "{{ vars.control_node_ip }}:{{ nfs_export }}" + fstype: nfs + opts: rw,sync + state: mounted + + - name: Copy /exports/hosts contents to /etc/hosts + copy: + src: /mnt/hosts/hosts + dest: /etc/hosts + owner: root + group: root + mode: 0644 diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml new file mode 100644 index 000000000..49b7d37e8 --- /dev/null +++ b/ansible/roles/compute_init/tasks/main.yml @@ -0,0 +1,45 @@ +--- + +- name: Ensure templates directory exists + file: + path: /etc/ansible-init/templates + state: directory + owner: root + group: root + mode: 0644 + +- name: Inject templates + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/templates/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../resolv_conf/templates/resolv.conf.j2 + +- name: Ensure files directory exists + file: + path: /etc/ansible-init/files + state: directory + owner: root + group: root + mode: 0644 + +- name: Inject files + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/files/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../resolv_conf/files/NetworkManager-dns-none.conf + +- name: Inject compute initialisation playbook + copy: + src: compute-init.yml + dest: /etc/ansible-init/playbooks/compute-init.yml + owner: root + group: root + mode: 0644 \ No newline at end of file diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index ea0bebebc..62a1fb0d2 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -134,4 +134,7 @@ freeipa_client # Hosts to run TuneD configuration [ansible_init] -# Hosts to run linux-anisble-init \ No newline at end of file +# Hosts to run linux-anisble-init + +[compute_init] +# Hosts to deploy compute initialisation ansible-init script to. \ No newline at end of file diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 205f1d334..19880ddef 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -80,4 +80,8 @@ openhpc [ansible_init:children] # Hosts to run ansible-init -cluster \ No newline at end of file +cluster + +[compute_init:children] +# Hosts to deploy compute initialisation ansible-init script to. 
+compute \ No newline at end of file From 64a1e90704fa5678bbab3b6a8fc48f20e2fdf1dc Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 1 Nov 2024 13:18:46 +0000 Subject: [PATCH 052/268] Add missing bits re. initial setup to refactored README (#464) * Add missing bits re. initial setup to refactored README * try to clarify activation --- README.md | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b54cd110a..f61bf8df4 100644 --- a/README.md +++ b/README.md @@ -55,9 +55,12 @@ You will also need to install [OpenTofu](https://opentofu.org/docs/intro/install ### Create a new environment -Use the `cookiecutter` template to create a new environment to hold your configuration. In the repository root run: +Run the following from the repository root to activate the venv: . venv/bin/activate + +Use the `cookiecutter` template to create a new environment to hold your configuration: + cd environments cookiecutter skeleton @@ -65,11 +68,15 @@ and follow the prompts to complete the environment name and description. **NB:** In subsequent sections this new environment is refered to as `$ENV`. -Now generate secrets for this environment: +Activate the new environment: + + . environments/$ENV/activate + +And generate secrets for it: ansible-playbook ansible/adhoc/generate-passwords.yml -### Define infrastructure configuration +### Define and deploy infrastructure Create an OpenTofu variables file to define the required infrastructure, e.g.: @@ -91,20 +98,28 @@ Create an OpenTofu variables file to define the required infrastructure, e.g.: } } -Variables marked `*` refer to OpenStack resources which must already exist. The above is a minimal configuration - for all variables -and descriptions see `environments/$ENV/terraform/terraform.tfvars`. +Variables marked `*` refer to OpenStack resources which must already exist. The above is a minimal configuration - for all variables and descriptions see `environments/$ENV/terraform/terraform.tfvars`. + +To deploy this infrastructure, ensure the venv and the environment are [activated](#create-a-new-environment) and run: -### Deploy appliance + export OS_CLOUD=openstack + cd environments/$ENV/terraform/ + tofu apply + +and follow the prompts. Note the OS_CLOUD environment variable assumes that OpenStack credentials are defined using a [clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) file in a default location with the default cloud name of `openstack`. + +### Configure appliance + +To configure the appliance, ensure the venv and the environment are [activated](#create-a-new-environment) and run: ansible-playbook ansible/site.yml -You can now log in to the cluster using: +Once it completes you can log in to the cluster using: ssh rocky@$login_ip where the IP of the login node is given in `environments/$ENV/inventory/hosts.yml` - ## Overview of directory structure - `environments/`: See [docs/environments.md](docs/environments.md). 
From 79433b822f6c26b9c33764a65304ce6d934523ff Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 4 Nov 2024 08:57:55 +0000 Subject: [PATCH 053/268] fixed k3s token idempotency issues --- ansible/roles/passwords/defaults/main.yml | 2 +- ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index 3beb83507..d9a339efd 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -8,7 +8,7 @@ slurm_appliance_secrets: vault_openhpc_mungekey: "{{ secrets_openhpc_mungekey | default(vault_openhpc_mungekey | default(secrets_openhpc_mungekey_default)) }}" vault_freeipa_ds_password: "{{ vault_freeipa_ds_password | default(lookup('password', '/dev/null')) }}" vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}" - k3s_token: "{{ lookup('ansible.builtin.password', '/dev/null', length=64) }}" + vault_k3s_token: "{{ vault_k3s_token | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}" secrets_openhpc_mungekey_default: content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}" diff --git a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 index f97663411..2a8fabba8 100644 --- a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 +++ b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 @@ -1,3 +1,3 @@ { - "k3s_token": "{{ slurm_appliance_secrets.k3s_token }}" + "k3s_token": "{{ vault_k3s_token }}" } \ No newline at end of file From bba95bb795b5f88f2fc56479c5454e0cec136650 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 4 Nov 2024 09:05:45 +0000 Subject: [PATCH 054/268] Comment + doc changes from review Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/cluster_infra/templates/resources.tf.j2 | 2 +- ansible/roles/k3s/tasks/main.yml | 1 + docs/k3s.README.md | 2 +- environments/.caas/hooks/pre.yml | 1 + environments/common/layouts/everything | 1 + 5 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index e7cd05fe7..b10af94f9 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -15,7 +15,7 @@ resource "terraform_data" "k3s_token" { input = "{{ k3s_token }}" lifecycle { ignore_changes = [ - input, + input, # makes it a write-once value (set via Ansible) ] } } diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index 3ac19c3e4..9efc9a255 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -6,6 +6,7 @@ register: stat_result - name: Download and air-gapped installation of k3s + # Using air-gapped install so containers are pre-installed to avoid rate-limiting from registries on cluster startup when: not stat_result.stat.exists block: diff --git a/docs/k3s.README.md b/docs/k3s.README.md index 547455132..92a2addd0 100644 --- a/docs/k3s.README.md +++ b/docs/k3s.README.md @@ -1,6 +1,6 @@ # Overview A K3s cluster is deployed with the Slurm cluster. 
Both an agent and server instance of K3s is installed during image build and the correct service (determined by OpenStack metadata) will be -enabled during boot. Nodes with the `k3s_server` metadata field defined will be configured as K3s agents (this field gives them the address of the server). The Slurm control node is currently configured as a server while all other nodes configured as agents. It should be noted that running multiple K3s servers isn't supported. Currently only the root user on the control node has +enabled during boot. Nodes with the `k3s_server` metadata field defined will be configured as K3s agents (this field gives them the address of the server). The Slurm control node is currently configured as a server while all other nodes are configured as agents. Using multiple K3s servers isn't supported. Currently only the root user on the control node has access to the Kubernetes API. The `k3s` role installs Helm for package management. K9s is also installed in the image and can be used by the root user. # Idempotency diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 59b74a3af..c8951a4af 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -2,6 +2,7 @@ # Generate k3s token - name: Generate k3s token + # NB: Although this generates a new token on each run, the actual token set in metadata is retrieved from a set-once tofu resource, hence only the first value ever generated is relevant. hosts: openstack tasks: - ansible.builtin.set_fact: diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 613d89497..17292b1c5 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -87,5 +87,6 @@ cluster openhpc [k9s:children] +# Hosts to install k9s on control From 3f599c6e2dfe877b6cc375bd39328a55f12fda2f Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 4 Nov 2024 09:06:11 +0000 Subject: [PATCH 055/268] play rename Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/k3s/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index 9efc9a255..a985aab98 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -5,7 +5,7 @@ path: /var/lib/rancher/k3s register: stat_result -- name: Download and air-gapped installation of k3s +- name: Perform air-gapped installation of k3s # Using air-gapped install so containers are pre-installed to avoid rate-limiting from registries on cluster startup when: not stat_result.stat.exists block: From a6f01376cc46db0ce0aff843cbcabebce0e20e8e Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 4 Nov 2024 09:11:31 +0000 Subject: [PATCH 056/268] removed sentinel cleanup --- ansible/cleanup.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index 2aa070921..cf9b0bdab 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -38,11 +38,6 @@ - name: Cleanup /tmp command : rm -rf /tmp/* - -- name: Delete ansible-init sentinel file created if ansible-init has run during build - ansible.builtin.file: - path: /var/lib/ansible-init.done - state: absent - name: Get package facts package_facts: From 03fe568373b6b0c4699fe10c5b31c77e926329d4 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 4 Nov 2024 09:25:30 +0000 Subject: [PATCH 057/268] k3s role 
refactor --- ansible/roles/k3s/defaults/main.yml | 2 +- ansible/roles/k3s/tasks/main.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml index f8933217b..ba9a1a899 100644 --- a/ansible/roles/k3s/defaults/main.yml +++ b/ansible/roles/k3s/defaults/main.yml @@ -2,4 +2,4 @@ k3s_version: "v1.31.0+k3s1" k3s_selinux_release: v1.6.latest.1 k3s_selinux_rpm_version: 1.6-1 -rocky_version: "{{ ansible_distribution_major_version }}" +k3s_helm_version: v3.11.0 diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index 3ac19c3e4..d9fd01a09 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -19,7 +19,7 @@ - name: Install k3s SELinux policy package yum: - name: "https://github.com/k3s-io/k3s-selinux/releases/download/{{ k3s_selinux_release }}/k3s-selinux-{{ k3s_selinux_rpm_version }}.el{{ rocky_version }}.noarch.rpm" + name: "https://github.com/k3s-io/k3s-selinux/releases/download/{{ k3s_selinux_release }}/k3s-selinux-{{ k3s_selinux_rpm_version }}.el{{ ansible_distribution_major_version }}.noarch.rpm" disable_gpg_check: true - name: Create image directory @@ -58,7 +58,7 @@ - name: Install helm unarchive: - src: https://get.helm.sh/helm-v3.11.0-linux-amd64.tar.gz + src: "https://get.helm.sh/helm-{{ k3s_helm_version }}-linux-amd64.tar.gz" dest: /usr/bin extra_opts: "--strip-components=1" owner: root From d96eddd05e8575124eb512cafc06d4bf2b3c8dc3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 4 Nov 2024 09:34:48 +0000 Subject: [PATCH 058/268] updated k3s docs --- ansible/roles/k3s/README.md | 2 +- docs/k3s.README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/k3s/README.md b/ansible/roles/k3s/README.md index 8c4c91c2f..68e8e2410 100644 --- a/ansible/roles/k3s/README.md +++ b/ansible/roles/k3s/README.md @@ -8,7 +8,7 @@ Installs k3s agent and server services on nodes and an ansible-init playbook to Requirements ------------ -`azimuth_cloud.image_utils.linux_ansible_init` must have been run previously on targeted nodes +`azimuth_cloud.image_utils.linux_ansible_init` must have been run previously on targeted nodes during image build. Role Variables -------------- diff --git a/docs/k3s.README.md b/docs/k3s.README.md index 547455132..3269c3073 100644 --- a/docs/k3s.README.md +++ b/docs/k3s.README.md @@ -5,4 +5,4 @@ access to the Kubernetes API. The `k3s` role installs Helm for package managemen # Idempotency K3s is intended to only be installed during image build as it is configured by the appliance on first boot with `azimuth_cloud.image_utils.linux_ansible_init`. Therefore, the `k3s` role isn't -idempotent and changes to variables will not be reflected in the image when running `site.yml`. An additional consequence of this is that for changes to role variables to be correctly applied when extending a base image with a Packer `openhpc-extra` build, the base image must have `ansible-init` installed but not existing K3s instances. +idempotent and changes to variables will not be reflected in the image when running `site.yml`. 
From bf4703594be05075a32282c5c3019fb21ca77747 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 4 Nov 2024 11:39:21 +0000 Subject: [PATCH 059/268] bumped images up to date with main --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index e1a0166e6..5bdc7c9e3 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241025-1446-04db97d3", - "RL9": "openhpc-RL9-241025-1446-04db97d3", - "RL9-cuda": "openhpc-cuda-RL9-241025-1446-04db97d3" + "RL8": "openhpc-RL8-241104-1102-4f457012", + "RL9": "openhpc-RL9-241104-1102-4f457012", + "RL9-cuda": "openhpc-cuda-RL9-241104-1102-4f457012" } } From ad84877ec4bb870fcccc30b0253c792de1b3c24d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 4 Nov 2024 11:56:10 +0000 Subject: [PATCH 060/268] fixed k3s token generation --- ansible/roles/passwords/tasks/main.yml | 5 +++++ .../roles/passwords/templates/k3s-token.auto.tfvars.json.j2 | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ansible/roles/passwords/tasks/main.yml b/ansible/roles/passwords/tasks/main.yml index 09603e184..573867772 100644 --- a/ansible/roles/passwords/tasks/main.yml +++ b/ansible/roles/passwords/tasks/main.yml @@ -7,6 +7,11 @@ delegate_to: localhost run_once: true +- name: Get templated passwords from target environment + ansible.builtin.include_vars: + file: "{{ openhpc_passwords_output_path }}" + name: templated_secrets + - name: Template k3s token to terraform template: src: k3s-token.auto.tfvars.json.j2 diff --git a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 index 2a8fabba8..9ed32a3af 100644 --- a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 +++ b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 @@ -1,3 +1,3 @@ { - "k3s_token": "{{ vault_k3s_token }}" + "k3s_token": "{{ templated_secrets.vault_k3s_token }}" } \ No newline at end of file From c1065b3dcd4ed8a460f078e1ab8a23fe7e7afabb Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 6 Nov 2024 11:47:14 +0000 Subject: [PATCH 061/268] add manila to compute script --- .../roles/compute_init/files/compute-init.yml | 171 +++++++++++++++++- ansible/roles/compute_init/tasks/main.yml | 48 +++++ .../common/inventory/group_vars/all/nfs.yml | 11 +- 3 files changed, 223 insertions(+), 7 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index ce797c1cf..179b56fbf 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -5,8 +5,55 @@ become: yes vars: control_node_ip: "172.16.1.228" - nfs_export: "/exports/hosts" - resolv_conf_nameservers: [] + nfs_export_hosts: "/exports/hosts" + resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] + + # block device (disk) on which to create the exported filesystem. + # if the disk is not defined, formatting and mounting will not be done. 
+ nfs_disk_location: + + # Path to exported filesystem mountpoint on nfs servers + nfs_export: "/exports/home" + + # nfs client mount options + nfs_client_mnt_options: + + # Path to mountpoint on nfs clients + nfs_client_mnt_point: "/home" + nfs_client_mnt_state: mounted + + nfs_server: "{{ control_node_ip }}" + + + os_manila_mount_shares: [] + os_manila_mount_state: mounted + os_manila_mount_opts: + - x-systemd.device-timeout=30 + - x-systemd.mount-timeout=30 + - noatime + - _netdev # prevents mount blocking early boot before networking available + - rw + os_manila_mount_share_info: [] # populated by lookup mode + os_manila_mount_ceph_conf_path: /etc/ceph + + + basic_users_manage_homedir: false + + basic_users_userdefaults: + state: present + create_home: "{{ basic_users_manage_homedir }}" + generate_ssh_key: "{{ basic_users_manage_homedir }}" + ssh_key_comment: "{{ item.name }}" + + test_user_password: "zXpcWyGQL7jtZnqylQra4g==" + + basic_users_users: + - name: testuser # can't use rocky as $HOME isn't shared! + password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent + uid: 1005 + state: present + + basic_users_groups: [] tasks: - name: Configure resolve.conf @@ -34,6 +81,7 @@ state: reloaded when: _copy_nm_config.changed | default(false) + - name: Mount /etc/hosts on compute nodes block: - name: Ensure the mount directory exists @@ -42,18 +90,131 @@ state: directory mode: 0755 - - name: Mount NFS export + - name: Mount /mnt/hosts mount: path: /mnt/hosts - src: "{{ vars.control_node_ip }}:{{ nfs_export }}" + src: "{{ vars.control_node_ip }}:{{ nfs_export_hosts }}" fstype: nfs opts: rw,sync state: mounted - - name: Copy /exports/hosts contents to /etc/hosts + - name: Copy /mnt/hosts/hosts contents to /etc/hosts copy: src: /mnt/hosts/hosts dest: /etc/hosts owner: root group: root mode: 0644 + + + - name: NFS client mount + block: + - name: ensure mount directory exists + file: + path: "{{ nfs_client_mnt_point }}" + state: directory + + - name: mount the filesystem + mount: + path: "{{ nfs_client_mnt_point }}" + src: "{{ nfs_server }}:{{ nfs_export }}" + fstype: nfs + state: "{{ nfs_client_mnt_state }}" + + + - name: Manila mount + block: + - name: Read manila share from nfs file + slurp: + src: "/mnt/cluster/manila_share_info.yml" + register: manila_share_info_file + + - name: Parse and set fact for manila share info + set_fact: + os_manila_mount_share_info: "{{ manila_share_info_file.content | b64decode | from_yaml }}" + + - name: Ensure Ceph configuration directory exists + ansible.builtin.file: + path: "{{ os_manila_mount_ceph_conf_path }}" + state: directory + mode: "0755" + owner: root + group: root + + - name: Configure ceph.conf using os_manila_mount_host + ansible.builtin.template: + src: /etc/ansible-init/templates/ceph.conf.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.conf" + owner: root + group: root + mode: "0600" + + - name: Ensure mount directory exists + ansible.builtin.file: + path: "{{ item.mount_path }}" + state: directory + owner: "{{ item.mount_user | default(omit) }}" + group: "{{ item.mount_group | default(omit) }}" + mode: "{{ item.mount_mode | default(omit) }}" + loop: "{{ os_manila_mount_shares }}" + loop_control: + label: "{{ item.share_name }}" + + - name: Write Ceph client keyring + ansible.builtin.template: + src: /etc/ansible-init/templates/ceph.keyring.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.client.{{ item.share_user }}.keyring" + mode: "0600" + owner: root + group: 
root + loop: "{{ os_manila_mount_share_info }}" + loop_control: + label: "{{ item.share_name }}" + + - name: Mount the Ceph share + ansible.posix.mount: + path: "{{ item[0].mount_path }}" + src: "{{ item[1].host }}:{{ item[1].export }}" + fstype: ceph + opts: "name={{ item[1].share_user }},{{ (item[0].mount_opts | default(os_manila_mount_opts)) | join(',') }}" + # NB share_user is looked up here in case of autodetection + state: "{{ item[0].mount_state | default(os_manila_mount_state) }}" + loop: "{{ os_manila_mount_shares | zip(os_manila_mount_share_info) }}" + loop_control: + label: "{{ item[0].share_name }}" + + - name: Ensure mounted directory has correct permissions + ansible.builtin.file: + path: "{{ item.mount_path }}" + state: directory + owner: "{{ item.mount_user | default(omit) }}" + group: "{{ item.mount_group | default(omit) }}" + mode: "{{ item.mount_mode | default(omit) }}" + loop: "{{ os_manila_mount_shares }}" + loop_control: + label: "{{ item.share_name }}" + when: item.mount_state | default(os_manila_mount_state) in ['mounted' or 'ephemeral'] + + + - name: Basic users setup + block: + - name: Create groups + ansible.builtin.group: "{{ item }}" + loop: "{{ basic_users_groups }}" + + - name: Create users + user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}" + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }} [{{ item.state | default('present') }}]" + register: basic_users_info + + - name: Write sudo rules + blockinfile: + path: /etc/sudoers.d/80-{{ item.name}}-user + block: "{{ item.sudo }}" + create: true + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }}" + when: "'sudo' in item" \ No newline at end of file diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index 49b7d37e8..812ed84ff 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -17,6 +17,8 @@ mode: 0644 loop: - ../../resolv_conf/templates/resolv.conf.j2 + - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 + - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 - name: Ensure files directory exists file: @@ -36,6 +38,52 @@ loop: - ../../resolv_conf/files/NetworkManager-dns-none.conf +- name: Ensure library directory exists + file: + path: /etc/ansible-init/library + state: directory + owner: root + group: root + mode: 0644 + +- name: Inject files + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/library/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../basic_users/library/terminate_user_sessions.py + - ../../stackhpc.os-manila-mount/library/os_manila_share.py + +- name: Ensure filter_plugins directory exists + file: + path: /etc/ansible-init/filter_plugins + state: directory + owner: root + group: root + mode: 0644 + +- name: Inject filter_plugins + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/filter_plugins/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../basic_users/filter_plugins/filter_keys.py + +- name: Add filter_plugins ansible.cfg + lineinfile: + path: /etc/ansible-init/ansible.cfg + line: "filter_plugins = /etc/ansible-init/filter_plugins" + state: present + owner: root + group: root + mode: 0644 + - name: Inject compute initialisation playbook copy: src: compute-init.yml diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index 110a1383c..036850847 100644 --- 
a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -18,6 +18,13 @@ nfs_configurations: - comment: Export /etc/hosts copy from Slurm control node nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" + server: "{{ inventory_hostname in groups['control'] }}" clients: false - nfs_export: "/exports/hosts" # control node has to copy in /etc/hosts to here \ No newline at end of file + nfs_export: "/exports/hosts" # control node has to copy in /etc/hosts to here + + - comment: Export cluster info from control node + nfs_enable: + server: "{{ inventory_hostname in groups['control']}}" + clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" + nfs_server: "{{ nfs_server_default }}" + nfs_export: "/exports/cluster" \ No newline at end of file From fce13ede9a62884be2def320fc37aee07ea559d5 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 6 Nov 2024 15:02:55 +0000 Subject: [PATCH 062/268] Compute script: configure EESSI --- .../roles/compute_init/files/compute-init.yml | 70 ++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 179b56fbf..23f865f52 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -11,17 +11,13 @@ # block device (disk) on which to create the exported filesystem. # if the disk is not defined, formatting and mounting will not be done. nfs_disk_location: - # Path to exported filesystem mountpoint on nfs servers nfs_export: "/exports/home" - # nfs client mount options nfs_client_mnt_options: - # Path to mountpoint on nfs clients nfs_client_mnt_point: "/home" nfs_client_mnt_state: mounted - nfs_server: "{{ control_node_ip }}" @@ -38,23 +34,29 @@ basic_users_manage_homedir: false - basic_users_userdefaults: state: present create_home: "{{ basic_users_manage_homedir }}" generate_ssh_key: "{{ basic_users_manage_homedir }}" ssh_key_comment: "{{ item.name }}" - test_user_password: "zXpcWyGQL7jtZnqylQra4g==" - basic_users_users: - name: testuser # can't use rocky as $HOME isn't shared! 
password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent uid: 1005 state: present - basic_users_groups: [] + + # Default to 10GB + cvmfs_quota_limit_mb: 10000 + cvmfs_config_default: + CVMFS_CLIENT_PROFILE: single + CVMFS_QUOTA_LIMIT: "{{ cvmfs_quota_limit_mb }}" + cvmfs_config_overrides: {} + cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" + + tasks: - name: Configure resolve.conf block: @@ -217,4 +219,54 @@ loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" - when: "'sudo' in item" \ No newline at end of file + when: "'sudo' in item" + + + - name: Configure EESSI + gather_facts: false + block: + - name: Download Cern GPG key + ansible.builtin.get_url: + url: http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM + dest: ./cvmfs-key.gpg + + - name: Import downloaded GPG key + command: rpm --import cvmfs-key.gpg + + - name: Add CVMFS repo + dnf: + name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm + + - name: Install CVMFS + dnf: + name: cvmfs + + - name: Install EESSI CVMFS config + dnf: + name: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi-latest.noarch.rpm + # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok? + disable_gpg_check: true + + # Alternative version using official repo - still no GPG key :( + # - name: Add EESSI repo + # dnf: + # name: http://repo.eessi-infra.org/eessi/rhel/8/noarch/eessi-release-0-1.noarch.rpm + + # - name: Install EESSI CVMFS config + # dnf: + # name: cvmfs-config-eessi + + - name: Add base CVMFS config + community.general.ini_file: + dest: /etc/cvmfs/default.local + section: null + option: "{{ item.key }}" + value: "{{ item.value }}" + no_extra_spaces: true + loop: "{{ cvmfs_config | dict2items }}" + + + # NOTE: Not clear how to make this idempotent + - name: Ensure CVMFS config is setup + command: + cmd: "cvmfs_config setup" \ No newline at end of file From 36a36cd1775cffd13589f4d0a23655ba28cb5963 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:17:51 +0000 Subject: [PATCH 063/268] don't trivy scan nightly builds (#467) --- .github/workflows/nightlybuild.yml | 61 ------------------------------ 1 file changed, 61 deletions(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 333550c53..607dabd2e 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -108,68 +108,7 @@ jobs: echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" - - name: Download image - run: | - . 
venv/bin/activate - sudo mkdir /mnt/images - sudo chmod 777 /mnt/images - openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" - openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-id }} - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: install libguestfs - run: | - sudo apt -y update - sudo apt -y install libguestfs-tools - - - name: mkdir for mount - run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}' - - - name: mount qcow2 file - run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' - - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@0.17.0 - with: - scan-type: fs - scan-ref: "${{ steps.manifest.outputs.image-name }}" - scanners: "vuln" - format: sarif - output: "${{ steps.manifest.outputs.image-name }}.sarif" - # turn off secret scanning to speed things up - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@v3 - with: - sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif" - category: "${{ matrix.os_version }}-${{ matrix.build }}" - - - name: Fail if scan has CRITICAL vulnerabilities - uses: aquasecurity/trivy-action@0.16.1 - with: - scan-type: fs - scan-ref: "${{ steps.manifest.outputs.image-name }}" - scanners: "vuln" - format: table - exit-code: '1' - severity: 'CRITICAL' - ignore-unfixed: true - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Delete new image if Trivy scan fails - if: failure() && steps.packer_build.outcome == 'success' # Runs if the Trivy scan found crit vulnerabilities or failed - run: | - . venv/bin/activate - echo "Deleting new image due to critical vulnerabilities or scan failure ..." - openstack image delete "${{ steps.manifest.outputs.image-id }}" - - name: Delete old latest image - if: success() # Runs only if Trivy scan passed run: | . venv/bin/activate IMAGE_COUNT=$(openstack image list --name ${{ steps.manifest.outputs.image-name }} -f value -c ID | wc -l) From 812f1a0d3381ec7fd2323a7fea19fb7abffca89e Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:18:10 +0000 Subject: [PATCH 064/268] Add generic upgrade docs (#462) * add upgrade docs * link to generic image build docs from upgrade docs * address minor upgrade docs issues * fix upgrade merge tag command * fix upgrade docs typo --- docs/upgrades.md | 103 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 docs/upgrades.md diff --git a/docs/upgrades.md b/docs/upgrades.md new file mode 100644 index 000000000..6e398934e --- /dev/null +++ b/docs/upgrades.md @@ -0,0 +1,103 @@ +# Upgrades + +This document explains the generic steps required to upgrade a deployment of the Slurm Appliance with upstream changes from StackHPC. +Generally, upstream releases will happen roughly monthly. Releases may contain new functionality and/or updated images. + +Any site-specific instructions in [docs/site/README.md](site/README.md) should be reviewed in tandem with this. + +This document assumes the deployment repository has: +1. Remotes: + - `origin` referring to the site-specific remote repository. + - `stackhpc` referring to the StackHPC repository at https://github.com/stackhpc/ansible-slurm-appliance.git. +2. 
Branches: + - `main` - following `main/origin`, the current site-specific code deployed to production. + - `upstream` - following `main/stackhpc`, i.e. the upstream `main` branch from `stackhpc`. +3. The following environments: + - `$PRODUCTION`: a production environment, as defined by e.g. `environments/production/`. + - `$STAGING`: a production environment, as defined by e.g. `environments/staging/`. + - `$SITE_ENV`: a base site-specific environment, as defined by e.g. `environments/mysite/`. + +**NB:** Commands which should be run on the Slurm login node are shown below prefixed `[LOGIN]$`. +All other commands should be run on the Ansible deploy host. + +1. Update the `upstream` branch from the `stackhpc` remote, including tags: + + git fetch stackhpc main --tags + +1. Identify the latest release from the [Slurm appliance release page](https://github.com/stackhpc/ansible-slurm-appliance/releases). Below this release is shown as `vX.Y`. + +1. Ensure your local site branch is up to date and create a new branch from it for the + site-specfic release code: + + git checkout main + git pull --prune + git checkout -b update/vX.Y + +1. Merge the upstream code into your release branch: + + git merge vX.Y + + It is possible this will introduce merge conflicts; fix these following the usual git + prompts. Generally merge conflicts should only exist where functionality which was added + for your site (not in a hook) has subsequently been merged upstream. + +1. Push this branch and create a PR: + + git push + # follow instructions + +1. Review the PR to see if any added/changed functionality requires alteration of + site-specific configuration. In general changes to existing functionality will aim to be + backward compatible. Alteration of site-specific configuration will usually only be + necessary to use new functionality or where functionality has been upstreamed as above. + + Make changes as necessary. + +1. Identify image(s) from the relevant [Slurm appliance release](https://github.com/stackhpc/ansible-slurm-appliance/releases), and download + using the link on the release plus the image name, e.g. for an image `openhpc-ofed-RL8-240906-1042-32568dbb`: + + wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/openhpc-ofed-RL8-240906-1042-32568dbb + + Note that some releases may not include new images. In this case use the image from the latest previous release with new images. + +1. If required, build an "extra" image with local modifications, see [docs/image-build.md](./image-build.md). + +1. Modify your site-specific environment to use this image, e.g. via `cluster_image_id` in `environments/$SITE_ENV/terraform/variables.tf`. + +1. Test this in your staging cluster. + +1. Commit changes and push to the PR created above. + +1. Declare a future outage window to cluster users. A [Slurm reservation](https://slurm.schedmd.com/scontrol.html#lbAQ) can be + used to prevent jobs running during that window, e.g.: + + [LOGIN]$ sudo scontrol create reservation Flags=MAINT ReservationName="upgrade-vX.Y" StartTime=2024-10-16T08:00:00 EndTime=2024-10-16T10:00:00 Nodes=ALL Users=root + + Note a reservation cannot be created if it may overlap with currently running jobs (defined by job or partition time limits). + +1. At the outage window, check there are no jobs running: + + [LOGIN]$ squeue + +1. Deploy the branch created above to production, i.e. 
activate the production environment, run OpenTofu to reimage or +delete/recreate instances with the new images (depending on how the root disk is defined), and run Ansible's `site.yml` +playbook to reconfigure the cluster, e.g. as described in the main [README.md](../README.md). + +1. Check slurm is up: + + [LOGIN]$ sinfo -R + + The `-R` shows the reason for any nodes being down. + +1. If the above shows nodes done for having been "unexpectedly rebooted", set them up again: + + [LOGIN]$ sudo scontrol update state=RESUME nodename=$HOSTLIST_EXPR + + where the hostlist expression might look like e.g. `general-[0-1]` to reset state for nodes 0 and 1 of the general partition. + +1. Delete the reservation: + + [LOGIN]$ sudo scontrol delete ReservationName="upgrade-slurm-v1.160" + +1. Tell users the cluster is available again. + From 36f1e170f04fad5ca68a4fff6e876c3bd5b9780a Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 7 Nov 2024 13:19:51 +0000 Subject: [PATCH 065/268] testing openhpc in compute script --- .../roles/compute_init/files/compute-init.yml | 221 +++++++++++++++++- ansible/roles/compute_init/tasks/main.yml | 2 + 2 files changed, 221 insertions(+), 2 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 23f865f52..bdbe8ab08 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -264,9 +264,226 @@ value: "{{ item.value }}" no_extra_spaces: true loop: "{{ cvmfs_config | dict2items }}" - # NOTE: Not clear how to make this idempotent - name: Ensure CVMFS config is setup command: - cmd: "cvmfs_config setup" \ No newline at end of file + cmd: "cvmfs_config setup" + + + - name: Configure openhpc + block: + - name: Check openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions exist + assert: + that: + - openhpc_slurm_control_host is defined + - openhpc_cluster_name is defined + - openhpc_cluster_name != '' + - openhpc_slurm_partitions is defined + fail_msg: "Undefined openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions." 
+ + - name: Fail if control host not in play and munge key not specified + fail: + msg: "Either the slurm control node must be in the play or `openhpc_munge_key` must be set" + when: + - openhpc_slurm_control_host not in ansible_play_hosts + - not openhpc_munge_key + + # - name: Ensure Slurm directories exists + # file: + # path: "{{ openhpc_state_save_location }}" + # owner: slurm + # group: slurm + # mode: 0755 + # state: directory + # when: inventory_hostname == openhpc_slurm_control_host + + # - name: Generate a Munge key on control host + # # NB this is usually a no-op as the package install actually generates a (node-unique) one, so won't usually trigger handler + # command: "dd if=/dev/urandom of=/etc/munge/munge.key bs=1 count=1024" + # args: + # creates: "/etc/munge/munge.key" + # when: inventory_hostname == openhpc_slurm_control_host + + # - name: Retrieve Munge key from control host + # slurp: + # src: "/etc/munge/munge.key" + # register: openhpc_control_munge_key + # delegate_to: "{{ openhpc_slurm_control_host }}" + # when: openhpc_slurm_control_host in ansible_play_hosts + + - name: Fix permissions on /etc to pass Munge startup checks + # Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root) + # which fails munged startup checks + file: + path: /etc + state: directory + mode: g-w + + - name: Write Munge key + copy: + content: "{{ openhpc_munge_key or (openhpc_control_munge_key.content | b64decode) }}" + dest: "/etc/munge/munge.key" + owner: munge + group: munge + mode: 0400 + notify: + - Restart Munge service + + - name: Ensure JobComp logfile exists + file: + path: "{{ openhpc_slurm_job_comp_loc }}" + state: touch + owner: slurm + group: slurm + mode: 0644 + access_time: preserve + modification_time: preserve + when: openhpc_slurm_job_comp_type == 'jobcomp/filetxt' + + - name: Template slurmdbd.conf + template: + src: slurmdbd.conf.j2 + dest: /etc/slurm/slurmdbd.conf + mode: "0600" + owner: slurm + group: slurm + notify: Restart slurmdbd service + when: openhpc_enable.database | default(false) | bool + + - name: Make local tempfile for slurm.conf templating # ensures simultaneous runs don't clobber each other + ansible.builtin.tempfile: + register: _slurm_conf_tmpfile + delegate_to: localhost + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + changed_when: false # so molecule doesn't fail + become: no + + - name: Template basic slurm.conf + template: + src: slurm.conf.j2 + dest: "{{ _slurm_conf_tmpfile.path }}" + lstrip_blocks: true + mode: 0644 + delegate_to: localhost + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + changed_when: false # so molecule doesn't fail + become: no + + - name: Customise slurm.conf + community.general.ini_file: + path: "{{ _slurm_conf_tmpfile.path }}" + option: "{{ item.key }}" + section: '' + value: "{{ (item.value | join(',')) if (item.value is sequence and item.value is not string) else item.value }}" + no_extra_spaces: true + create: no + mode: 0644 + loop: "{{ openhpc_config | dict2items }}" + delegate_to: localhost + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + changed_when: false # so molecule doesn't fail + become: no + + - name: Create slurm.conf + copy: + src: "{{ _slurm_conf_tmpfile.path }}" + dest: /etc/slurm/slurm.conf + owner: root + group: root + mode: 0644 + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + notify: + - Restart slurmctld service + 
register: ohpc_slurm_conf + # NB uses restart rather than reload as number of nodes might have changed + + - name: Create gres.conf + template: + src: "{{ openhpc_gres_template }}" + dest: /etc/slurm/gres.conf + mode: "0600" + owner: slurm + group: slurm + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + notify: + - Restart slurmctld service + register: ohpc_gres_conf + # NB uses restart rather than reload as this is needed in some cases + + - name: Template cgroup.conf + # appears to be required even with NO cgroup plugins: https://slurm.schedmd.com/cgroups.html#cgroup_design + template: + src: cgroup.conf.j2 + dest: /etc/slurm/cgroup.conf + mode: "0644" # perms/ownership based off src from ohpc package + owner: root + group: root + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + + - name: Remove local tempfile for slurm.conf templating + ansible.builtin.file: + path: "{{ _slurm_conf_tmpfile.path }}" + state: absent + when: _slurm_conf_tmpfile.path is defined + delegate_to: localhost + changed_when: false # so molecule doesn't fail + become: no + + - name: Notify handler for slurmd restart + debug: + msg: "notifying handlers" # meta: noop doesn't support 'when' + changed_when: true + when: + - openhpc_slurm_control_host in ansible_play_hosts + - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler + notify: + - Restart slurmd service + + - name: Set slurmctld location for configless operation + lineinfile: + path: /etc/sysconfig/slurmd + line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'" + regexp: "^SLURMD_OPTIONS=" + create: yes + owner: root + group: root + mode: 0644 + when: + - openhpc_enable.batch | default(false) + - openhpc_slurm_configless + notify: + - Restart slurmd service + # Reloading is sufficent, but using a single handler means no bounce. Realistically this won't regularly change on a running slurmd so restarting is ok. + + # Munge state could be unchanged but the service is not running. + # Handle that here. 
+ - name: Configure Munge service + service: + name: munge + enabled: "{{ openhpc_slurm_service_enabled | bool }}" + state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + + - name: Flush handler + meta: flush_handlers # as then subsequent "ensure" is a no-op if slurm services bounced + + - name: Ensure slurmdbd state + service: + name: slurmdbd + enabled: "{{ openhpc_slurm_service_enabled | bool }}" + state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + when: openhpc_enable.database | default(false) | bool + + # - name: Ensure slurmctld state + # service: + # name: slurmctld + # enabled: "{{ openhpc_slurm_service_enabled | bool }}" + # state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + # when: openhpc_enable.control | default(false) | bool + + - name: Ensure slurmd state + service: + name: slurmd + enabled: "{{ openhpc_slurm_service_enabled | bool }}" + state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + when: openhpc_enable.batch | default(false) | bool \ No newline at end of file diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index 812ed84ff..d236c5e57 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -56,6 +56,7 @@ loop: - ../../basic_users/library/terminate_user_sessions.py - ../../stackhpc.os-manila-mount/library/os_manila_share.py + - ../../stackhpc.openhpc/library/sacct_cluster.py - name: Ensure filter_plugins directory exists file: @@ -74,6 +75,7 @@ mode: 0644 loop: - ../../basic_users/filter_plugins/filter_keys.py + - ../../stackhpc.openhpc/filter_plugins/slurm_conf.py - name: Add filter_plugins ansible.cfg lineinfile: From 8ab12e944e9b8253243e3e128daff7883ab2752e Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 12 Nov 2024 13:02:53 +0000 Subject: [PATCH 066/268] Passwords role now reads variables into top level vars Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/passwords/tasks/main.yml | 2 +- ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/passwords/tasks/main.yml b/ansible/roles/passwords/tasks/main.yml index 573867772..d67dc84ea 100644 --- a/ansible/roles/passwords/tasks/main.yml +++ b/ansible/roles/passwords/tasks/main.yml @@ -8,9 +8,9 @@ run_once: true - name: Get templated passwords from target environment +# inventory group/host vars created in a play cannot be accessed in the same play, even after meta: refresh_inventory ansible.builtin.include_vars: file: "{{ openhpc_passwords_output_path }}" - name: templated_secrets - name: Template k3s token to terraform template: diff --git a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 index 9ed32a3af..2a8fabba8 100644 --- a/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 +++ b/ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2 @@ -1,3 +1,3 @@ { - "k3s_token": "{{ templated_secrets.vault_k3s_token }}" + "k3s_token": "{{ vault_k3s_token }}" } \ No newline at end of file From bf165471917c102d267c08a5e3a5eaed148a5c2e Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 12 Nov 2024 13:03:39 +0000 Subject: [PATCH 067/268] moved k3s plays to install script --- ansible/bootstrap.yml | 1 + 
ansible/roles/k3s/tasks/{main.yml => install.yml} | 0 2 files changed, 1 insertion(+) rename ansible/roles/k3s/tasks/{main.yml => install.yml} (100%) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index b3fca0210..733d4b3f8 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -266,3 +266,4 @@ tasks: - ansible.builtin.include_role: name: k3s + tasks_from: install.yml diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/install.yml similarity index 100% rename from ansible/roles/k3s/tasks/main.yml rename to ansible/roles/k3s/tasks/install.yml From 5e3927b4328054db2703b8e727649881771eedbd Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 12 Nov 2024 13:22:18 +0000 Subject: [PATCH 068/268] reverted caas changes --- ansible/roles/cluster_infra/templates/outputs.tf.j2 | 4 ++-- ansible/roles/cluster_infra/templates/resources.tf.j2 | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ansible/roles/cluster_infra/templates/outputs.tf.j2 b/ansible/roles/cluster_infra/templates/outputs.tf.j2 index 90556bf7a..4d894a1dd 100644 --- a/ansible/roles/cluster_infra/templates/outputs.tf.j2 +++ b/ansible/roles/cluster_infra/templates/outputs.tf.j2 @@ -24,8 +24,8 @@ output "cluster_nodes" { } }, { - name = openstack_compute_instance_v2.control["control"].name - ip = openstack_compute_instance_v2.control["control"].network[0].fixed_ip_v4 + name = openstack_compute_instance_v2.control.name + ip = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 groups = ["control", "{{ cluster_name }}_control"], facts = { openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index b10af94f9..6c9a52ac4 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -399,7 +399,7 @@ resource "openstack_compute_instance_v2" "login" { ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" {% endif %} {% endfor %} - k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 k3s_token = "{{ k3s_token }}" } } @@ -412,11 +412,9 @@ resource "openstack_compute_instance_v2" "control" { {% else %} flavor_id = "{{ control_flavor }}" {% endif %} - for_each = toset(["control"]) network { port = openstack_networking_port_v2.control.id - access_network = true } {% if cluster_storage_network is defined %} @@ -566,7 +564,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" { ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" {% endif %} {% endfor %} - k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 k3s_token = "{{ k3s_token }}" } } From 8930d388fa9cacd206abd48563f9996daea11dd8 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 12 Nov 2024 13:51:28 +0000 Subject: [PATCH 069/268] finish transferring openhpc tasks to compute script --- .../roles/compute_init/files/compute-init.yml | 195 ++---------------- ansible/roles/compute_init/tasks/main.yml | 21 ++ 2 files changed, 35 insertions(+), 181 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 
bdbe8ab08..aaee0718b 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -56,6 +56,15 @@ cvmfs_config_overrides: {} cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" + openhpc_conf_server: control_node_ip + openhpc_gres_template: /etc/ansible-init/templates/gres.conf.j2 + openhpc_slurm_service_enabled: true + openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}" + openhpc_enable: + control: false + batch: true + database: false + runtime: true tasks: - name: Configure resolve.conf @@ -223,7 +232,6 @@ - name: Configure EESSI - gather_facts: false block: - name: Download Cern GPG key ansible.builtin.get_url: @@ -247,15 +255,6 @@ # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok? disable_gpg_check: true - # Alternative version using official repo - still no GPG key :( - # - name: Add EESSI repo - # dnf: - # name: http://repo.eessi-infra.org/eessi/rhel/8/noarch/eessi-release-0-1.noarch.rpm - - # - name: Install EESSI CVMFS config - # dnf: - # name: cvmfs-config-eessi - - name: Add base CVMFS config community.general.ini_file: dest: /etc/cvmfs/default.local @@ -273,45 +272,6 @@ - name: Configure openhpc block: - - name: Check openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions exist - assert: - that: - - openhpc_slurm_control_host is defined - - openhpc_cluster_name is defined - - openhpc_cluster_name != '' - - openhpc_slurm_partitions is defined - fail_msg: "Undefined openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions." - - - name: Fail if control host not in play and munge key not specified - fail: - msg: "Either the slurm control node must be in the play or `openhpc_munge_key` must be set" - when: - - openhpc_slurm_control_host not in ansible_play_hosts - - not openhpc_munge_key - - # - name: Ensure Slurm directories exists - # file: - # path: "{{ openhpc_state_save_location }}" - # owner: slurm - # group: slurm - # mode: 0755 - # state: directory - # when: inventory_hostname == openhpc_slurm_control_host - - # - name: Generate a Munge key on control host - # # NB this is usually a no-op as the package install actually generates a (node-unique) one, so won't usually trigger handler - # command: "dd if=/dev/urandom of=/etc/munge/munge.key bs=1 count=1024" - # args: - # creates: "/etc/munge/munge.key" - # when: inventory_hostname == openhpc_slurm_control_host - - # - name: Retrieve Munge key from control host - # slurp: - # src: "/etc/munge/munge.key" - # register: openhpc_control_munge_key - # delegate_to: "{{ openhpc_slurm_control_host }}" - # when: openhpc_slurm_control_host in ansible_play_hosts - - name: Fix permissions on /etc to pass Munge startup checks # Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root) # which fails munged startup checks @@ -320,83 +280,13 @@ state: directory mode: g-w - - name: Write Munge key + - name: Copy Munge key from NFS-mounted directory to /etc/munge copy: - content: "{{ openhpc_munge_key or (openhpc_control_munge_key.content | b64decode) }}" + src: "/mnt/openhpc_munge.key" dest: "/etc/munge/munge.key" owner: munge group: munge mode: 0400 - notify: - - Restart Munge service - - - name: Ensure JobComp logfile exists - file: - path: "{{ openhpc_slurm_job_comp_loc }}" - state: touch - owner: slurm - group: slurm - mode: 0644 - access_time: preserve - modification_time: preserve - when: 
openhpc_slurm_job_comp_type == 'jobcomp/filetxt' - - - name: Template slurmdbd.conf - template: - src: slurmdbd.conf.j2 - dest: /etc/slurm/slurmdbd.conf - mode: "0600" - owner: slurm - group: slurm - notify: Restart slurmdbd service - when: openhpc_enable.database | default(false) | bool - - - name: Make local tempfile for slurm.conf templating # ensures simultaneous runs don't clobber each other - ansible.builtin.tempfile: - register: _slurm_conf_tmpfile - delegate_to: localhost - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - changed_when: false # so molecule doesn't fail - become: no - - - name: Template basic slurm.conf - template: - src: slurm.conf.j2 - dest: "{{ _slurm_conf_tmpfile.path }}" - lstrip_blocks: true - mode: 0644 - delegate_to: localhost - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - changed_when: false # so molecule doesn't fail - become: no - - - name: Customise slurm.conf - community.general.ini_file: - path: "{{ _slurm_conf_tmpfile.path }}" - option: "{{ item.key }}" - section: '' - value: "{{ (item.value | join(',')) if (item.value is sequence and item.value is not string) else item.value }}" - no_extra_spaces: true - create: no - mode: 0644 - loop: "{{ openhpc_config | dict2items }}" - delegate_to: localhost - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - changed_when: false # so molecule doesn't fail - become: no - - - name: Create slurm.conf - copy: - src: "{{ _slurm_conf_tmpfile.path }}" - dest: /etc/slurm/slurm.conf - owner: root - group: root - mode: 0644 - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - notify: - - Restart slurmctld service - register: ohpc_slurm_conf - # NB uses restart rather than reload as number of nodes might have changed - name: Create gres.conf template: @@ -405,82 +295,25 @@ mode: "0600" owner: slurm group: slurm - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - notify: - - Restart slurmctld service + when: openhpc_enable.control | default(false) register: ohpc_gres_conf - # NB uses restart rather than reload as this is needed in some cases - - - name: Template cgroup.conf - # appears to be required even with NO cgroup plugins: https://slurm.schedmd.com/cgroups.html#cgroup_design - template: - src: cgroup.conf.j2 - dest: /etc/slurm/cgroup.conf - mode: "0644" # perms/ownership based off src from ohpc package - owner: root - group: root - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - - - name: Remove local tempfile for slurm.conf templating - ansible.builtin.file: - path: "{{ _slurm_conf_tmpfile.path }}" - state: absent - when: _slurm_conf_tmpfile.path is defined - delegate_to: localhost - changed_when: false # so molecule doesn't fail - become: no - - - name: Notify handler for slurmd restart - debug: - msg: "notifying handlers" # meta: noop doesn't support 'when' - changed_when: true - when: - - openhpc_slurm_control_host in ansible_play_hosts - - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler - notify: - - Restart slurmd service - name: Set slurmctld location for configless operation lineinfile: path: /etc/sysconfig/slurmd - line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'" + line: "SLURMD_OPTIONS='--conf-server {{ openhpc_conf_server }}'" regexp: "^SLURMD_OPTIONS=" create: yes 
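        # (illustrative: once openhpc_conf_server renders to the control node
        # address - e.g. the 172.16.1.228 used in the vars above - the line
        # written here is SLURMD_OPTIONS='--conf-server 172.16.1.228', making
        # slurmd fetch its config from slurmctld instead of a local slurm.conf)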
owner: root group: root mode: 0644 - when: - - openhpc_enable.batch | default(false) - - openhpc_slurm_configless - notify: - - Restart slurmd service - # Reloading is sufficent, but using a single handler means no bounce. Realistically this won't regularly change on a running slurmd so restarting is ok. - - # Munge state could be unchanged but the service is not running. - # Handle that here. + - name: Configure Munge service service: name: munge enabled: "{{ openhpc_slurm_service_enabled | bool }}" state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" - - name: Flush handler - meta: flush_handlers # as then subsequent "ensure" is a no-op if slurm services bounced - - - name: Ensure slurmdbd state - service: - name: slurmdbd - enabled: "{{ openhpc_slurm_service_enabled | bool }}" - state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" - when: openhpc_enable.database | default(false) | bool - - # - name: Ensure slurmctld state - # service: - # name: slurmctld - # enabled: "{{ openhpc_slurm_service_enabled | bool }}" - # state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" - # when: openhpc_enable.control | default(false) | bool - - name: Ensure slurmd state service: name: slurmd diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index d236c5e57..c5f884081 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -19,6 +19,7 @@ - ../../resolv_conf/templates/resolv.conf.j2 - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 + - ../../stackhpc.openhpc/templates/gres.conf.j2 - name: Ensure files directory exists file: @@ -86,6 +87,26 @@ group: root mode: 0644 +- name: Ensure /exports/cluster directory exists + file: + path: /exports/cluster + state: directory + owner: root + group: root + mode: 0644 + delegate_to: "{{ groups['control'] | first }}" + +- name: Write openhpc munge key + copy: + content: "{{ vault_openhpc_mungekey | b64decode }}" + dest: "/exports/cluster/openhpc_munge.key" + owner: munge + group: munge + mode: 0400 + become: true + delegate_to: "{{ groups['control'] | first }}" + + - name: Inject compute initialisation playbook copy: src: compute-init.yml From 20a8a6261b025a21658330fecbd87cb93b281715 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 12 Nov 2024 15:38:43 +0000 Subject: [PATCH 070/268] re-enabled caas access_network --- ansible/roles/cluster_infra/templates/resources.tf.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 6c9a52ac4..cab7cc7a2 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -415,6 +415,7 @@ resource "openstack_compute_instance_v2" "control" { network { port = openstack_networking_port_v2.control.id + access_network = true } {% if cluster_storage_network is defined %} From 4e4f20635e2435096bd8b810887f4d860a0e9a9d Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 12 Nov 2024 16:32:07 +0000 Subject: [PATCH 071/268] move manila share info mount to compute_init role --- ansible/roles/compute_init/files/compute-init.yml | 10 +++------- ansible/roles/compute_init/tasks/main.yml | 14 +++++++++++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git 
a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index aaee0718b..5c4bd0005 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -3,19 +3,16 @@ - name: Compute node initialisation hosts: localhost become: yes + # VARS TO BE SUPPLIED VIA CLOUD INIT METADATA vars: control_node_ip: "172.16.1.228" nfs_export_hosts: "/exports/hosts" resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] - # block device (disk) on which to create the exported filesystem. - # if the disk is not defined, formatting and mounting will not be done. + nfs_disk_location: - # Path to exported filesystem mountpoint on nfs servers nfs_export: "/exports/home" - # nfs client mount options nfs_client_mnt_options: - # Path to mountpoint on nfs clients nfs_client_mnt_point: "/home" nfs_client_mnt_state: mounted nfs_server: "{{ control_node_ip }}" @@ -48,7 +45,6 @@ basic_users_groups: [] - # Default to 10GB cvmfs_quota_limit_mb: 10000 cvmfs_config_default: CVMFS_CLIENT_PROFILE: single @@ -137,7 +133,7 @@ block: - name: Read manila share from nfs file slurp: - src: "/mnt/cluster/manila_share_info.yml" + src: "/mnt/manila_share_info.yml" register: manila_share_info_file - name: Parse and set fact for manila share info diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index c5f884081..bcc4db800 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -96,6 +96,19 @@ mode: 0644 delegate_to: "{{ groups['control'] | first }}" +- name: Ensure /exports/cluster exists on control node + ansible.builtin.file: + path: /exports/cluster + state: directory + mode: '0755' + delegate_to: "{{ groups['control'] | first }}" + +- name: Copy manila share info to /exports/cluster + copy: + content: "{{ os_manila_mount_share_info | to_nice_yaml }}" + dest: "/exports/cluster/manila_share_info.yml" + delegate_to: "{{ groups['control'] | first }}" + - name: Write openhpc munge key copy: content: "{{ vault_openhpc_mungekey | b64decode }}" @@ -103,7 +116,6 @@ owner: munge group: munge mode: 0400 - become: true delegate_to: "{{ groups['control'] | first }}" From 5b43d0efc7518e317d992d1ccc0f7d9da601a40a Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:13:05 +0000 Subject: [PATCH 072/268] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 5bdc7c9e3..82cf7fd8d 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241104-1102-4f457012", - "RL9": "openhpc-RL9-241104-1102-4f457012", - "RL9-cuda": "openhpc-cuda-RL9-241104-1102-4f457012" + "RL8": "openhpc-RL8-241113-0934-20a8a626", + "RL9": "openhpc-RL9-241113-0934-20a8a626", + "RL9-cuda": "openhpc-cuda-RL9-241113-0934-20a8a626" } } From a03d9f12387cee4e863e688a128a1fa9387e2a3e Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:21:22 +0000 Subject: [PATCH 073/268] remove signature_verified property from nightly/latest images to enable further builds ... 
(#474) accidently removed by PR 467 --- .github/workflows/nightlybuild.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 607dabd2e..5e06a3147 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -108,6 +108,11 @@ jobs: echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" + - name: Make image usable for further builds + run: | + . venv/bin/activate + openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" + - name: Delete old latest image run: | . venv/bin/activate From fda2d312fd5724e1db23578ba3b03a827b1151c9 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 13 Nov 2024 13:16:18 +0000 Subject: [PATCH 074/268] fix mounts --- ansible/filesystems.yml | 19 ++++++++++++++++ .../roles/compute_init/files/compute-init.yml | 15 +++++++++++++ ansible/roles/compute_init/tasks/main.yml | 22 ------------------- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/ansible/filesystems.yml b/ansible/filesystems.yml index e1a782bad..316ae23d8 100644 --- a/ansible/filesystems.yml +++ b/ansible/filesystems.yml @@ -24,3 +24,22 @@ tasks: - include_role: name: stackhpc.os-manila-mount + +- name: Manage /exports/cluster and Manila share info + hosts: control + become: true + tasks: + - block: + - name: Ensure /exports/cluster directory exists + file: + path: /exports/cluster + state: directory + owner: root + group: root + mode: 0755 + + - name: Copy manila share info to /exports/cluster + copy: + content: "{{ os_manila_mount_share_info | to_nice_yaml }}" + dest: "/exports/cluster/manila_share_info.yml" + when: os_manila_mount_share_info is defined \ No newline at end of file diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 5c4bd0005..0d163dbf3 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -7,6 +7,7 @@ vars: control_node_ip: "172.16.1.228" nfs_export_hosts: "/exports/hosts" + nfs_export_cluster: "/exports/cluster" resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] @@ -127,6 +128,20 @@ src: "{{ nfs_server }}:{{ nfs_export }}" fstype: nfs state: "{{ nfs_client_mnt_state }}" + + - name: Ensure the mount directory exists + file: + path: /mnt/ + state: directory + mode: 0755 + + - name: Mount /mnt/ + mount: + path: /mnt/ + src: "{{ vars.control_node_ip }}:{{ nfs_export_cluster }}" + fstype: nfs + opts: rw,sync + state: mounted - name: Manila mount diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index bcc4db800..892b8b093 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -87,28 +87,6 @@ group: root mode: 0644 -- name: Ensure /exports/cluster directory exists - file: - path: /exports/cluster - state: directory - owner: root - group: root - mode: 0644 - delegate_to: "{{ groups['control'] | first }}" - -- name: Ensure /exports/cluster exists on control node - ansible.builtin.file: - path: /exports/cluster - state: directory - mode: '0755' - delegate_to: "{{ groups['control'] | first }}" - -- name: Copy manila share info to /exports/cluster - copy: - content: "{{ os_manila_mount_share_info | to_nice_yaml }}" - dest: "/exports/cluster/manila_share_info.yml" - delegate_to: "{{ groups['control'] | first }}" - - name: Write openhpc munge key copy: content: "{{ 
vault_openhpc_mungekey | b64decode }}" From 998ebf184445df2c9239d31ffa535702eda1d849 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 13 Nov 2024 16:46:46 +0000 Subject: [PATCH 075/268] address review comments --- ansible/filesystems.yml | 21 +----- .../roles/compute_init/files/compute-init.yml | 36 +++------ ansible/roles/compute_init/tasks/main.yml | 75 ++++++++++--------- ansible/roles/etc_hosts/tasks/main.yml | 23 ------ .../common/inventory/group_vars/all/nfs.yml | 11 +-- 5 files changed, 53 insertions(+), 113 deletions(-) diff --git a/ansible/filesystems.yml b/ansible/filesystems.yml index 316ae23d8..cf0db407f 100644 --- a/ansible/filesystems.yml +++ b/ansible/filesystems.yml @@ -23,23 +23,4 @@ tags: manila tasks: - include_role: - name: stackhpc.os-manila-mount - -- name: Manage /exports/cluster and Manila share info - hosts: control - become: true - tasks: - - block: - - name: Ensure /exports/cluster directory exists - file: - path: /exports/cluster - state: directory - owner: root - group: root - mode: 0755 - - - name: Copy manila share info to /exports/cluster - copy: - content: "{{ os_manila_mount_share_info | to_nice_yaml }}" - dest: "/exports/cluster/manila_share_info.yml" - when: os_manila_mount_share_info is defined \ No newline at end of file + name: stackhpc.os-manila-mount \ No newline at end of file diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 0d163dbf3..00043f0e8 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -6,8 +6,6 @@ # VARS TO BE SUPPLIED VIA CLOUD INIT METADATA vars: control_node_ip: "172.16.1.228" - nfs_export_hosts: "/exports/hosts" - nfs_export_cluster: "/exports/cluster" resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] @@ -53,7 +51,7 @@ cvmfs_config_overrides: {} cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" - openhpc_conf_server: control_node_ip + openhpc_conf_server: "{{ control_node_ip }}" openhpc_gres_template: /etc/ansible-init/templates/gres.conf.j2 openhpc_slurm_service_enabled: true openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}" @@ -90,25 +88,25 @@ when: _copy_nm_config.changed | default(false) - - name: Mount /etc/hosts on compute nodes + - name: Mount /mnt/cluster on compute nodes and copy hosts to /etc/hosts block: - name: Ensure the mount directory exists file: - path: /mnt/hosts + path: /mnt/cluster state: directory mode: 0755 - - name: Mount /mnt/hosts + - name: Mount /mnt/cluster mount: - path: /mnt/hosts - src: "{{ vars.control_node_ip }}:{{ nfs_export_hosts }}" + path: /mnt/cluster + src: "{{ vars.control_node_ip }}:/exports/cluster" fstype: nfs opts: rw,sync state: mounted - - name: Copy /mnt/hosts/hosts contents to /etc/hosts + - name: Copy /mnt/cluster/hosts contents to /etc/hosts copy: - src: /mnt/hosts/hosts + src: /mnt/cluster/hosts dest: /etc/hosts owner: root group: root @@ -128,27 +126,13 @@ src: "{{ nfs_server }}:{{ nfs_export }}" fstype: nfs state: "{{ nfs_client_mnt_state }}" - - - name: Ensure the mount directory exists - file: - path: /mnt/ - state: directory - mode: 0755 - - - name: Mount /mnt/ - mount: - path: /mnt/ - src: "{{ vars.control_node_ip }}:{{ nfs_export_cluster }}" - fstype: nfs - opts: rw,sync - state: mounted - name: Manila mount block: - name: Read manila share from nfs file slurp: - src: "/mnt/manila_share_info.yml" + src: "/mnt/cluster/manila_share_info.yml" register: manila_share_info_file - name: Parse and 
set fact for manila share info @@ -293,7 +277,7 @@ - name: Copy Munge key from NFS-mounted directory to /etc/munge copy: - src: "/mnt/openhpc_munge.key" + src: "/mnt/cluster/openhpc_munge.key" dest: "/etc/munge/munge.key" owner: munge group: munge diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index 892b8b093..40e30efae 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -1,12 +1,17 @@ --- -- name: Ensure templates directory exists +- name: Ensure directories exist file: - path: /etc/ansible-init/templates + path: "/etc/ansible-init/{{ item.directory }}" state: directory owner: root group: root - mode: 0644 + mode: 0755 + loop: + - { directory: "templates" } + - { directory: "files" } + - { directory: "library" } + - { directory: "filter_plugins" } - name: Inject templates copy: @@ -21,14 +26,6 @@ - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 - ../../stackhpc.openhpc/templates/gres.conf.j2 -- name: Ensure files directory exists - file: - path: /etc/ansible-init/files - state: directory - owner: root - group: root - mode: 0644 - - name: Inject files copy: src: '{{ item }}' @@ -39,14 +36,6 @@ loop: - ../../resolv_conf/files/NetworkManager-dns-none.conf -- name: Ensure library directory exists - file: - path: /etc/ansible-init/library - state: directory - owner: root - group: root - mode: 0644 - - name: Inject files copy: src: '{{ item }}' @@ -59,14 +48,6 @@ - ../../stackhpc.os-manila-mount/library/os_manila_share.py - ../../stackhpc.openhpc/library/sacct_cluster.py -- name: Ensure filter_plugins directory exists - file: - path: /etc/ansible-init/filter_plugins - state: directory - owner: root - group: root - mode: 0644 - - name: Inject filter_plugins copy: src: '{{ item }}' @@ -87,15 +68,39 @@ group: root mode: 0644 -- name: Write openhpc munge key - copy: - content: "{{ vault_openhpc_mungekey | b64decode }}" - dest: "/exports/cluster/openhpc_munge.key" - owner: munge - group: munge - mode: 0400 - delegate_to: "{{ groups['control'] | first }}" +- name: Ensure nfs /exports/cluster configured + block: + - name: Ensure the /exports/hosts directory exists + file: + path: /exports/cluster + state: directory + owner: root + group: root + mode: 0755 + - name: Copy /etc/hosts to /exports/cluster + copy: + src: /etc/hosts + dest: /exports/cluster/hosts + owner: root + group: root + mode: 0644 + remote_src: true + + - name: Copy manila share info to /exports/cluster + copy: + content: "{{ os_manila_mount_share_info | to_nice_yaml }}" + dest: "/exports/cluster/manila_share_info.yml" + when: os_manila_mount_share_info is defined + + - name: Write openhpc munge key + copy: + content: "{{ vault_openhpc_mungekey | b64decode }}" + dest: "/exports/cluster/openhpc_munge.key" + owner: munge + group: munge + mode: 0400 + delegate_to: "{{ groups['control'] | first }}" - name: Inject compute initialisation playbook copy: diff --git a/ansible/roles/etc_hosts/tasks/main.yml b/ansible/roles/etc_hosts/tasks/main.yml index 1d04ebf7c..6fdabf57c 100644 --- a/ansible/roles/etc_hosts/tasks/main.yml +++ b/ansible/roles/etc_hosts/tasks/main.yml @@ -6,26 +6,3 @@ group: root mode: 0644 become: yes - -- name: Ensure /exports/hosts directory exists and copy /etc/hosts - block: - - name: Ensure the /exports/hosts directory exists - file: - path: /exports/hosts - state: directory - owner: root - group: root - mode: 0755 - become: yes - delegate_to: "{{ groups['control'] | first }}" - - - name: Copy /etc/hosts to 
NFS exported directory
-      copy:
-        src: /etc/hosts
-        dest: /exports/hosts/hosts
-        owner: root
-        group: root
-        mode: 0644
-        remote_src: true
-      become: yes
-      delegate_to: "{{ groups['control'] | first }}"
\ No newline at end of file
diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml
index 036850847..84371c99a 100644
--- a/environments/common/inventory/group_vars/all/nfs.yml
+++ b/environments/common/inventory/group_vars/all/nfs.yml
@@ -16,15 +16,8 @@ nfs_configurations:
     nfs_export: "/exports/home" # assumes skeleton TF is being used
     nfs_client_mnt_point: "/home"
 
-  - comment: Export /etc/hosts copy from Slurm control node
+  - comment: Export /exports/cluster from Slurm control node
     nfs_enable:
       server: "{{ inventory_hostname in groups['control'] }}"
       clients: false
-    nfs_export: "/exports/hosts" # control node has to copy in /etc/hosts to here
-
-  - comment: Export cluster info from control node
-    nfs_enable:
-      server: "{{ inventory_hostname in groups['control']}}"
-      clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}"
-    nfs_server: "{{ nfs_server_default }}"
-    nfs_export: "/exports/cluster"
\ No newline at end of file
+    nfs_export: "/exports/cluster" # control node has to copy in /etc/hosts to here
From a8f87fef1fe4e6f012855c815b084b28b89a29fb Mon Sep 17 00:00:00 2001
From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
Date: Thu, 14 Nov 2024 17:52:19 +0000
Subject: [PATCH 076/268] Upgrade RL8 ceph to quincy + trivy rate limit and OOD
 false positives fix (#477)

* bumped os-manila-mount to test branch
* bump images
* bumped role to new release
* trivy now uses mirrored db
* pinned ood fatimage version
* bump images
* fixed trivy rate limiting
---
 .github/workflows/trivyscan.yml                        | 2 ++
 environments/.stackhpc/hooks/post.yml                  | 6 ++----
 .../inventory/group_vars/openondemand/overrides.yml    | 2 ++
 .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 8 ++++----
 requirements.yml                                       | 2 +-
 5 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/trivyscan.yml b/.github/workflows/trivyscan.yml
index 2957b22ee..d1c789a19 100644
--- a/.github/workflows/trivyscan.yml
+++ b/.github/workflows/trivyscan.yml
@@ -94,6 +94,7 @@ jobs:
           timeout: 15m
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          TRIVY_DB_REPOSITORY: ghcr.io/azimuth-cloud/trivy-db:2
 
       - name: Upload Trivy scan results to GitHub Security tab
         uses: github/codeql-action/upload-sarif@v3
@@ -114,3 +115,4 @@ jobs:
           timeout: 15m
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          TRIVY_DB_REPOSITORY: ghcr.io/azimuth-cloud/trivy-db:2
diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml
index 5032f0ecc..bd60015d9 100644
--- a/environments/.stackhpc/hooks/post.yml
+++ b/environments/.stackhpc/hooks/post.yml
@@ -9,8 +9,6 @@
         path: "{{ item }}"
         state: absent
       with_items:
-        - /opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/3.1.7-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock
-        - /opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/3.1.9-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock
-        - /opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/3.1.7-1/gems/bootstrap_form-4.5.0/demo/yarn.lock
-        - /opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/3.1.9-1/gems/bootstrap_form-4.5.0/demo/yarn.lock
+        - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock"
+        - 
"/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock" - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock \ No newline at end of file diff --git a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml b/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml index 735da25df..72b6cf476 100644 --- a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml +++ b/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml @@ -4,3 +4,5 @@ openondemand_desktop_partition: standard #openondemand_dashboard_support_url: #openondemand_dashboard_docs_url: #openondemand_filesapp_paths: +ondemand_package: ondemand-"{{ ondemand_package_version }}" +ondemand_package_version: '3.1.10' diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 9f396e964..cca779082 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241024-1439-177083b1", - "RL9": "openhpc-RL9-241024-1438-177083b1", - "RL9-cuda": "openhpc-cuda-RL9-241024-1628-177083b1" + "RL8": "openhpc-RL8-241114-1531-6f0a3a02", + "RL9": "openhpc-RL9-241114-1531-6f0a3a02", + "RL9-cuda": "openhpc-cuda-RL9-241114-1531-6f0a3a02" } -} \ No newline at end of file +} diff --git a/requirements.yml b/requirements.yml index cd8197beb..3d8c44011 100644 --- a/requirements.yml +++ b/requirements.yml @@ -21,7 +21,7 @@ roles: version: v3.1.5 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount - version: v24.5.1 # Support ceph quincy for RL9 + version: v24.11.0 # Support ceph quincy for RL9 collections: - name: containers.podman From f2be3fb3c799432c12f7a104523ce57580757975 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 14 Nov 2024 11:33:48 +0000 Subject: [PATCH 077/268] remove cuda from CI and ofed from packer config --- .github/workflows/fatimage.yml | 10 ++-------- .github/workflows/nightlybuild.yml | 8 -------- .github/workflows/s3-image-sync.yml | 2 -- .github/workflows/trivyscan.yml | 2 +- packer/openstack.pkr.hcl | 4 ++-- 5 files changed, 5 insertions(+), 21 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 947f9410f..37d402c72 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -26,10 +26,6 @@ jobs: - RL9 build: - openstack.openhpc - - openstack.openhpc-cuda - exclude: - - os_version: RL8 - build: openstack.openhpc-cuda env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -37,12 +33,10 @@ jobs: SOURCE_IMAGES_MAP: | { "RL8": { - "openstack.openhpc": "rocky-latest-RL8", - "openstack.openhpc-cuda": "rocky-latest-cuda-RL8" + "openstack.openhpc": "rocky-latest-RL8" }, "RL9": { - "openstack.openhpc": "rocky-latest-RL9", - "openstack.openhpc-cuda": "rocky-latest-cuda-RL9" + "openstack.openhpc": "rocky-latest-RL9" } } diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 5e06a3147..45b0e142e 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -28,11 +28,6 @@ jobs: - RL9 build: - openstack.rocky-latest - - openstack.rocky-latest-cuda - exclude: - - os_version: RL8 - build: openstack.rocky-latest-cuda - env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -144,10 +139,7 @@ 
jobs: - RL9 image: - rocky-latest - - rocky-latest-cuda exclude: - - os_version: RL8 - image: rocky-latest-cuda - target_cloud: LEAFCLOUD env: OS_CLOUD: openstack diff --git a/.github/workflows/s3-image-sync.yml b/.github/workflows/s3-image-sync.yml index 0ffaae954..85e0c2fad 100644 --- a/.github/workflows/s3-image-sync.yml +++ b/.github/workflows/s3-image-sync.yml @@ -42,7 +42,6 @@ jobs: build: - RL8 - RL9 - - RL9-cuda env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -112,7 +111,6 @@ jobs: build: - RL8 - RL9 - - RL9-cuda exclude: - cloud: ${{ needs.image_upload.outputs.ci_cloud }} diff --git a/.github/workflows/trivyscan.yml b/.github/workflows/trivyscan.yml index d1c789a19..625a4746b 100644 --- a/.github/workflows/trivyscan.yml +++ b/.github/workflows/trivyscan.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - build: ["RL8", "RL9", "RL9-cuda"] + build: ["RL8", "RL9"] env: JSON_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json OS_CLOUD: openstack diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index fae0bf7b2..00b40d243 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -153,8 +153,8 @@ variable "groups" { description = "Additional inventory groups (other than 'builder') to add build VM to, keyed by source name" default = { # fat image builds: - rocky-latest = ["update", "ofed"] - rocky-latest-cuda = ["update", "ofed", "cuda"] + rocky-latest = ["update"] + rocky-latest-cuda = ["update", "cuda"] openhpc = ["control", "compute", "login"] openhpc-cuda = ["control", "compute", "login"] } From 4a3f1958f5dea077160f82734c5870c740bbe7e9 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 15 Nov 2024 09:57:11 +0000 Subject: [PATCH 078/268] remove all reference to cuda/ofed in CI and packer config --- .github/workflows/fatimage.yml | 13 ++++++------- .github/workflows/nightlybuild.yml | 9 ++++----- .github/workflows/trivyscan.yml | 4 ++-- packer/openstack.pkr.hcl | 20 ++------------------ 4 files changed, 14 insertions(+), 32 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 37d402c72..893a8f7e9 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -15,17 +15,15 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails - matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions + matrix: # build RL8, RL9 os_version: - RL8 - RL9 - build: - - openstack.openhpc env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -39,6 +37,7 @@ jobs: "openstack.openhpc": "rocky-latest-RL9" } } + BUILD: openstack.openhpc steps: - uses: actions/checkout@v2 @@ -84,13 +83,13 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ + -only=${{ env.BUILD }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ openstack.pkr.hcl env: PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }} + SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][env.BUILD] }} - name: Get created image names from manifest id: manifest @@ 
-107,7 +106,7 @@ jobs: - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: - name: image-details-${{ matrix.build }}-${{ matrix.os_version }} + name: image-details-${{ env.BUILD }}-${{ matrix.os_version }} path: | ./image-id.txt ./image-name.txt diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 45b0e142e..4d51aa2f9 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -17,17 +17,15 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails - matrix: # build RL8, RL9, RL9+CUDA versions + matrix: # build RL8, RL9 os_version: - RL8 - RL9 - build: - - openstack.rocky-latest env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -37,6 +35,7 @@ jobs: "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2", "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" } + BUILD: openstack.openhpc steps: - uses: actions/checkout@v2 @@ -82,7 +81,7 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ + -only=${{ env.BUILD }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ openstack.pkr.hcl diff --git a/.github/workflows/trivyscan.yml b/.github/workflows/trivyscan.yml index 625a4746b..4c090b85a 100644 --- a/.github/workflows/trivyscan.yml +++ b/.github/workflows/trivyscan.yml @@ -10,7 +10,7 @@ on: jobs: scan: concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} # to branch/PR + build cancel-in-progress: true runs-on: ubuntu-latest strategy: @@ -100,7 +100,7 @@ jobs: uses: github/codeql-action/upload-sarif@v3 with: sarif_file: "${{ steps.manifest.outputs.image-name }}.sarif" - category: "${{ matrix.os_version }}-${{ matrix.build }}" + category: "${{ matrix.build }}" - name: Fail if scan has CRITICAL vulnerabilities uses: aquasecurity/trivy-action@0.24.0 diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 00b40d243..52202ead1 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -127,15 +127,13 @@ variable "volume_size" { default = { # fat image builds, GB: rocky-latest = 15 - rocky-latest-cuda = 30 openhpc = 15 - openhpc-cuda = 30 } } variable "extra_build_volume_size" { type = number - default = 15 # same as default non-CUDA build + default = 15 } variable "image_disk_format" { @@ -154,9 +152,7 @@ variable "groups" { default = { # fat image builds: rocky-latest = ["update"] - rocky-latest-cuda = ["update", "cuda"] openhpc = ["control", "compute", "login"] - openhpc-cuda = ["control", "compute", "login"] } } @@ -210,24 +206,12 @@ build { image_name = "${source.name}-${var.os_version}" } - # latest nightly cuda image: - source "source.openstack.openhpc" { - name = "rocky-latest-cuda" - image_name = "${source.name}-${var.os_version}" - } - - # OFED fat image: + # fat image: source "source.openstack.openhpc" { name = "openhpc" image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" } - # CUDA fat image: - source 
"source.openstack.openhpc" { - name = "openhpc-cuda" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" - } - # Extended site-specific image, built on fat image: source "source.openstack.openhpc" { name = "openhpc-extra" From 097cdae12b9ffabd9fcc7965adeb5038a64619ab Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 15 Nov 2024 11:37:24 +0000 Subject: [PATCH 079/268] fix nightly building fatimage in ci, revert to matrix.build --- .github/workflows/fatimage.yml | 11 ++++++----- .github/workflows/nightlybuild.yml | 7 ++++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 893a8f7e9..a8d3dbe29 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -15,7 +15,7 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build cancel-in-progress: true runs-on: ubuntu-22.04 strategy: @@ -24,6 +24,8 @@ jobs: os_version: - RL8 - RL9 + build: + - openstack.openhpc env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -37,7 +39,6 @@ jobs: "openstack.openhpc": "rocky-latest-RL9" } } - BUILD: openstack.openhpc steps: - uses: actions/checkout@v2 @@ -83,13 +84,13 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ env.BUILD }} \ + -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ openstack.pkr.hcl env: PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][env.BUILD] }} + SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }} - name: Get created image names from manifest id: manifest @@ -106,7 +107,7 @@ jobs: - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: - name: image-details-${{ env.BUILD }}-${{ matrix.os_version }} + name: image-details-${{ matrix.build }}-${{ matrix.os_version }} path: | ./image-id.txt ./image-name.txt diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 4d51aa2f9..da3de4ea5 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -17,7 +17,7 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build cancel-in-progress: true runs-on: ubuntu-22.04 strategy: @@ -26,6 +26,8 @@ jobs: os_version: - RL8 - RL9 + build: + - openstack.rocky-latest env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -35,7 +37,6 @@ jobs: "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2", "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" } - BUILD: openstack.openhpc steps: - uses: actions/checkout@v2 @@ -81,7 +82,7 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ env.BUILD }} \ + -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ openstack.pkr.hcl From eaa40f6d9ef69f2187d223f6be69156544f22eff Mon Sep 17 00:00:00 2001 From: bertiethorpe 
Date: Fri, 15 Nov 2024 13:02:08 +0000 Subject: [PATCH 080/268] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index cca779082..87f5c46cd 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241114-1531-6f0a3a02", - "RL9": "openhpc-RL9-241114-1531-6f0a3a02", - "RL9-cuda": "openhpc-cuda-RL9-241114-1531-6f0a3a02" + "RL8": "openhpc-RL8-241115-1209-097cdae1", + "RL9": "openhpc-RL9-241115-1209-097cdae1" } } From 100632f8a606c7f57a81155d6a6a3f7e0802286e Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 18 Nov 2024 09:48:17 +0000 Subject: [PATCH 081/268] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 87f5c46cd..f9a2087c8 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241115-1209-097cdae1", - "RL9": "openhpc-RL9-241115-1209-097cdae1" + "RL8": "openhpc-RL8-241118-0918-4538c6df", + "RL9": "openhpc-RL9-241118-0918-4538c6df" } } From 882ed7ec326f98bbed55bbaf523a46bf0a196af4 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 18 Nov 2024 10:55:19 +0000 Subject: [PATCH 082/268] Don't fail cluster cleanup when prefix not found --- .github/workflows/nightly-cleanup.yml | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index 9bfc5230a..27c30d65d 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -11,7 +11,7 @@ on: - SMS - ARCUS schedule: - - cron: '0 20 * * *' # Run at 8PM - image sync runs at midnight + - cron: '0 21 * * *' # Run at 9PM - image sync runs at midnight jobs: ci_cleanup: @@ -52,13 +52,23 @@ jobs: - name: Find CI clusters run: | . venv/bin/activate - CI_CLUSTERS=$(openstack server list | grep --only-matching 'slurmci-RL.-[0-9]\+' | sort | uniq) - echo "ci_clusters=${CI_CLUSTERS}" >> GITHUB_ENV + CI_CLUSTERS=$(openstack server list | grep --only-matching 'slurmci-RL.-[0-9]\+' | sort | uniq || true) + if [[ -z "$CI_CLUSTERS" ]]; then + echo "No matching CI clusters found." + else + echo "Found clusters: $CI_CLUSTERS" + echo "ci_clusters=${CI_CLUSTERS}" >> GITHUB_ENV + fi shell: bash - name: Delete clusters if control node not tagged with keep run: | . venv/bin/activate + if [[ -z "${CI_CLUSTERS}" ]]; then + echo "No clusters to delete." 
+ exit 0 + fi + for cluster_prefix in ${CI_CLUSTERS} do TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value) From 4710699d7e43392886a855ab16dbd0d8dad2dbc0 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 18 Nov 2024 11:51:46 +0000 Subject: [PATCH 083/268] Update nightly-cleanup.yml --- .github/workflows/nightly-cleanup.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index 27c30d65d..4e40a514b 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -64,12 +64,12 @@ jobs: - name: Delete clusters if control node not tagged with keep run: | . venv/bin/activate - if [[ -z "${CI_CLUSTERS}" ]]; then + if [[ -z "${{ env.ci_clusters }}" ]]; then echo "No clusters to delete." exit 0 fi - for cluster_prefix in ${CI_CLUSTERS} + for cluster_prefix in ${{ env.ci_clusters }} do TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value) if [[ $TAGS =~ "keep" ]]; then From 8b6bf9714c405638035e5f5bc73bcfec49f289da Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:00:02 +0000 Subject: [PATCH 084/268] Update nightly-cleanup.yml --- .github/workflows/nightly-cleanup.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index 4e40a514b..d768dcc21 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -64,6 +64,7 @@ jobs: - name: Delete clusters if control node not tagged with keep run: | . venv/bin/activate + echo "Found clusters: ${{ env.ci_clusters }}" if [[ -z "${{ env.ci_clusters }}" ]]; then echo "No clusters to delete." exit 0 From 35d41008e7809de74a76033f6f318280a7155e9b Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:10:35 +0000 Subject: [PATCH 085/268] Update nightly-cleanup.yml --- .github/workflows/nightly-cleanup.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index d768dcc21..086efea93 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -57,7 +57,7 @@ jobs: echo "No matching CI clusters found." else echo "Found clusters: $CI_CLUSTERS" - echo "ci_clusters=${CI_CLUSTERS}" >> GITHUB_ENV + echo "ci_clusters=${CI_CLUSTERS}" >> $GITHUB_ENV fi shell: bash From 127316b9132496789fa7ed92d3c35a8627477887 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:17:57 +0000 Subject: [PATCH 086/268] Update nightly-cleanup.yml --- .github/workflows/nightly-cleanup.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index 086efea93..3d9ee5159 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -57,7 +57,7 @@ jobs: echo "No matching CI clusters found." 
else echo "Found clusters: $CI_CLUSTERS" - echo "ci_clusters=${CI_CLUSTERS}" >> $GITHUB_ENV + echo "ci_clusters=$CI_CLUSTERS" >> $GITHUB_ENV fi shell: bash From b35925d5d61a6ba004f3c9fddde144c8e5aeffaa Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:41:10 +0000 Subject: [PATCH 087/268] Update nightly-cleanup.yml --- .github/workflows/nightly-cleanup.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index 3d9ee5159..4ae2d6920 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -65,12 +65,12 @@ jobs: run: | . venv/bin/activate echo "Found clusters: ${{ env.ci_clusters }}" - if [[ -z "${{ env.ci_clusters }}" ]]; then + if [[ -z ${ci_clusters} ]]; then echo "No clusters to delete." exit 0 fi - for cluster_prefix in ${{ env.ci_clusters }} + for cluster_prefix in ${ci_clusters} do TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value) if [[ $TAGS =~ "keep" ]]; then From 8aea048da3480d16ac27e21c8562a0ce24ea7ecd Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:42:43 +0000 Subject: [PATCH 088/268] Update nightly-cleanup.yml --- .github/workflows/nightly-cleanup.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index 4ae2d6920..272f1b38b 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -1,15 +1,6 @@ name: Cleanup CI clusters on: workflow_dispatch: - inputs: - ci_cloud: - description: 'Select the CI_CLOUD' - required: true - type: choice - options: - - LEAFCLOUD - - SMS - - ARCUS schedule: - cron: '0 21 * * *' # Run at 9PM - image sync runs at midnight From a40b77c083bc43c0fc304242482d73e5cd93d4a5 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 18 Nov 2024 13:14:33 +0000 Subject: [PATCH 089/268] flatten multiline list of clusters --- .github/workflows/nightly-cleanup.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index 272f1b38b..b98ea74bd 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -44,18 +44,21 @@ jobs: run: | . venv/bin/activate CI_CLUSTERS=$(openstack server list | grep --only-matching 'slurmci-RL.-[0-9]\+' | sort | uniq || true) + echo "DEBUG: Raw CI clusters: $CI_CLUSTERS" + if [[ -z "$CI_CLUSTERS" ]]; then echo "No matching CI clusters found." else - echo "Found clusters: $CI_CLUSTERS" - echo "ci_clusters=$CI_CLUSTERS" >> $GITHUB_ENV + # Flatten multiline value so can be passed as env var + CI_CLUSTERS_FORMATTED=$(echo "$CI_CLUSTERS" | tr '\n' ' ' | sed 's/ $//') + echo "DEBUG: Formatted CI clusters: $CI_CLUSTERS_FORMATTED" + echo "ci_clusters=$CI_CLUSTERS_FORMATTED" >> $GITHUB_ENV fi shell: bash - name: Delete clusters if control node not tagged with keep run: | . venv/bin/activate - echo "Found clusters: ${{ env.ci_clusters }}" if [[ -z ${ci_clusters} ]]; then echo "No clusters to delete." 
exit 0 @@ -67,7 +70,7 @@ jobs: if [[ $TAGS =~ "keep" ]]; then echo "Skipping ${cluster_prefix} - control instance is tagged as keep" else - yes | ./dev/delete-cluster.py ${cluster_prefix} + ./dev/delete-cluster.py ${cluster_prefix} --force fi done shell: bash From 90016269f6e750f6cb36251d89834b7f45f25e75 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 18 Nov 2024 13:19:16 +0000 Subject: [PATCH 090/268] Update nightly-cleanup.yml --- .github/workflows/nightly-cleanup.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index b98ea74bd..f76bd51a9 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -66,6 +66,7 @@ jobs: for cluster_prefix in ${ci_clusters} do + echo "Processing cluster: $cluster_prefix" TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value) if [[ $TAGS =~ "keep" ]]; then echo "Skipping ${cluster_prefix} - control instance is tagged as keep" From f949553a0dcf601b9525fb3ca96794088257bbf9 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 18 Nov 2024 13:24:18 +0000 Subject: [PATCH 091/268] Update delete-cluster.py to allow --force flag --- dev/delete-cluster.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/dev/delete-cluster.py b/dev/delete-cluster.py index 861396efd..05f53fbfa 100755 --- a/dev/delete-cluster.py +++ b/dev/delete-cluster.py @@ -4,18 +4,18 @@ Delete infrastructure for a cluster without using Terraform. Useful for CI clusters. Usage: - delete-cluster.py PREFIX + delete-cluster.py PREFIX [--force] Where PREFIX is the string at the start of the resource's names. -It will list matching resources and prompt to confirm deletion. +If --force is provided, it will delete all resources without confirmation. 
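
Example (illustrative - the prefix form matches the slurmci-* CI clusters
created by this repository's workflows):

    ./dev/delete-cluster.py slurmci-RL9-1234 --force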
""" -import sys, json, subprocess, pprint +import sys, json, subprocess CLUSTER_RESOURCES = ['server', 'port', 'volume'] -def delete_cluster(cluster_prefix): +def delete_cluster(cluster_prefix, force=False): to_delete = {} for resource_type in CLUSTER_RESOURCES: to_delete[resource_type] = [] @@ -29,7 +29,8 @@ def delete_cluster(cluster_prefix): except: print(resource_type, item) raise - if input('Delete these (y/n)?:') == 'y': + + if force or input('Delete these (y/n)?:') == 'y': for resource_type in CLUSTER_RESOURCES: items = [v['ID'] for v in to_delete[resource_type]] if items: @@ -40,7 +41,10 @@ def delete_cluster(cluster_prefix): print('Cancelled - no resources deleted') if __name__ == '__main__': - if len(sys.argv) != 2: + if len(sys.argv) < 2 or len(sys.argv) > 3: print('ERROR: Incorrect argument(s).\n' + __doc__) exit(1) - delete_cluster(sys.argv[1]) + force_flag = '--force' in sys.argv + cluster_prefix = sys.argv[1] + delete_cluster(cluster_prefix, force_flag) + From 0c17410b42090fb330c96667dd9a4963cda76612 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 19 Nov 2024 09:22:56 +0000 Subject: [PATCH 092/268] k9s tags and variable renames Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/extras.yml | 1 + ansible/roles/k9s/tasks/main.yml | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ansible/extras.yml b/ansible/extras.yml index cb6ded130..107f85252 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -40,6 +40,7 @@ - name: Install k9s become: yes hosts: k9s + tags: k9s tasks: - import_role: name: k9s diff --git a/ansible/roles/k9s/tasks/main.yml b/ansible/roles/k9s/tasks/main.yml index 5ca111a2d..674b4dffb 100644 --- a/ansible/roles/k9s/tasks/main.yml +++ b/ansible/roles/k9s/tasks/main.yml @@ -3,7 +3,7 @@ - name: Check if k9s is installed ansible.builtin.stat: path: "/usr/bin/k9s" - register: result + register: _k9s_stat_result - name: Install k9s and clean up temporary files block: @@ -14,7 +14,7 @@ owner: root group: root mode: "744" - when: not result.stat.exists + when: not _k9s_stat_result.stat.exists - name: Download k9s ansible.builtin.get_url: @@ -41,4 +41,4 @@ ansible.builtin.file: path: /tmp/k9s state: absent - when: not result.stat.exists + when: not _k9s_stat_result.stat.exists From e44e08469e749b88474ca333a16017c208042ee4 Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Tue, 19 Nov 2024 13:33:27 +0000 Subject: [PATCH 093/268] Make block device detection work on ESXi (#481) Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/cluster_infra/templates/resources.tf.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index cab7cc7a2..453f01a7e 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -471,7 +471,7 @@ resource "openstack_compute_instance_v2" "control" { {%- endif %} bootcmd: %{for volume in [openstack_blockstorage_volume_v3.state, {% if not cluster_home_manila_share | bool %} openstack_blockstorage_volume_v3.home {% endif %}]} - - BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${substr(volume.id, 0, 20)}* | head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV + - BLKDEV=$(readlink -f $(ls /dev/disk/by-id/*${replace(substr(volume.id, 0, 20), "-", "*")}* | 
head -n1 )); blkid -o value -s TYPE $BLKDEV || mke2fs -t ext4 -L ${lower(split(" ", volume.description)[0])} $BLKDEV %{endfor} mounts: - [LABEL=state, {{ appliances_state_dir }}, auto] From 53e43c2ddb45282d5ce463ac42b224946c7ab9b5 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 20 Nov 2024 10:15:05 +0000 Subject: [PATCH 094/268] Fix adhoc/rebuild wait_for_connection race condition When rebuilding nodes, a race condition can result in ssh connection being re-established before rebuild has begun. Delay of 60s fixes this. --- ansible/adhoc/rebuild.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ansible/adhoc/rebuild.yml b/ansible/adhoc/rebuild.yml index c30737fd6..9e7a3a770 100644 --- a/ansible/adhoc/rebuild.yml +++ b/ansible/adhoc/rebuild.yml @@ -16,3 +16,6 @@ - command: "openstack server rebuild {{ instance_id | default(inventory_hostname) }}{% if rebuild_image is defined %} --image {{ rebuild_image }}{% endif %}" delegate_to: localhost - wait_for_connection: + delay: 60 + timeout: 600 + From a32e3099020e48bc13162a6f20a23a7d86ae57f5 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 20 Nov 2024 10:48:27 +0000 Subject: [PATCH 095/268] remove gres.conf - no-op --- ansible/roles/compute_init/files/compute-init.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 00043f0e8..bd756c38a 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -52,7 +52,6 @@ cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" openhpc_conf_server: "{{ control_node_ip }}" - openhpc_gres_template: /etc/ansible-init/templates/gres.conf.j2 openhpc_slurm_service_enabled: true openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}" openhpc_enable: @@ -283,16 +282,6 @@ group: munge mode: 0400 - - name: Create gres.conf - template: - src: "{{ openhpc_gres_template }}" - dest: /etc/slurm/gres.conf - mode: "0600" - owner: slurm - group: slurm - when: openhpc_enable.control | default(false) - register: ohpc_gres_conf - - name: Set slurmctld location for configless operation lineinfile: path: /etc/sysconfig/slurmd From a1f71b6244570f0f18705ec627bd2aea81ba6ab5 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 20 Nov 2024 12:51:52 +0000 Subject: [PATCH 096/268] remove or hardcode some vars, make resolv_conf block conditional --- .../roles/compute_init/files/compute-init.yml | 19 +++++-------------- ansible/roles/compute_init/tasks/main.yml | 1 - 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index bd756c38a..2a42d4b7c 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -9,7 +9,6 @@ resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] - nfs_disk_location: nfs_export: "/exports/home" nfs_client_mnt_options: nfs_client_mnt_point: "/home" @@ -25,7 +24,6 @@ - noatime - _netdev # prevents mount blocking early boot before networking available - rw - os_manila_mount_share_info: [] # populated by lookup mode os_manila_mount_ceph_conf_path: /etc/ceph @@ -52,13 +50,6 @@ cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" openhpc_conf_server: "{{ control_node_ip }}" - openhpc_slurm_service_enabled: true - openhpc_slurm_service_started: "{{ 
openhpc_slurm_service_enabled }}" - openhpc_enable: - control: false - batch: true - database: false - runtime: true tasks: - name: Configure resolve.conf @@ -85,6 +76,7 @@ name: NetworkManager state: reloaded when: _copy_nm_config.changed | default(false) + when: resolv_conf_nameservers is defined and resolv_conf_nameservers | length > 0 - name: Mount /mnt/cluster on compute nodes and copy hosts to /etc/hosts @@ -295,12 +287,11 @@ - name: Configure Munge service service: name: munge - enabled: "{{ openhpc_slurm_service_enabled | bool }}" - state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + enabled: true + state: started - name: Ensure slurmd state service: name: slurmd - enabled: "{{ openhpc_slurm_service_enabled | bool }}" - state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" - when: openhpc_enable.batch | default(false) | bool \ No newline at end of file + enabled: true + state: started \ No newline at end of file diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index 40e30efae..ca0a006a8 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -24,7 +24,6 @@ - ../../resolv_conf/templates/resolv.conf.j2 - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 - - ../../stackhpc.openhpc/templates/gres.conf.j2 - name: Inject files copy: From 61392edc02a4c896f334b7cee6c8a4f15e0d9185 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 20 Nov 2024 14:02:47 +0000 Subject: [PATCH 097/268] move EESSI CVMFS install and config to nfs export --- .../roles/compute_init/files/compute-init.yml | 42 +++---------------- ansible/roles/compute_init/tasks/main.yml | 19 ++++++++- 2 files changed, 24 insertions(+), 37 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 2a42d4b7c..65a05e1da 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -42,13 +42,6 @@ basic_users_groups: [] - cvmfs_quota_limit_mb: 10000 - cvmfs_config_default: - CVMFS_CLIENT_PROFILE: single - CVMFS_QUOTA_LIMIT: "{{ cvmfs_quota_limit_mb }}" - cvmfs_config_overrides: {} - cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" - openhpc_conf_server: "{{ control_node_ip }}" tasks: @@ -219,36 +212,13 @@ - name: Configure EESSI block: - - name: Download Cern GPG key - ansible.builtin.get_url: - url: http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM - dest: ./cvmfs-key.gpg - - - name: Import downloaded GPG key - command: rpm --import cvmfs-key.gpg - - - name: Add CVMFS repo - dnf: - name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm - - - name: Install CVMFS - dnf: - name: cvmfs - - - name: Install EESSI CVMFS config - dnf: - name: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi-latest.noarch.rpm - # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok? 
- disable_gpg_check: true - - - name: Add base CVMFS config - community.general.ini_file: + - name: Copy /mnt/cluster/cvmfs/default.local contents to /etc/cvmfs/default.local + copy: + src: /mnt/cluster/cvmfs/default.local dest: /etc/cvmfs/default.local - section: null - option: "{{ item.key }}" - value: "{{ item.value }}" - no_extra_spaces: true - loop: "{{ cvmfs_config | dict2items }}" + owner: root + group: root + mode: 0644 # NOTE: Not clear how to make this idempotent - name: Ensure CVMFS config is setup diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index ca0a006a8..0e52d8892 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -69,7 +69,7 @@ - name: Ensure nfs /exports/cluster configured block: - - name: Ensure the /exports/hosts directory exists + - name: Ensure the /exports/cluster directory exists file: path: /exports/cluster state: directory @@ -92,6 +92,23 @@ dest: "/exports/cluster/manila_share_info.yml" when: os_manila_mount_share_info is defined + - name: Ensure /exports/cluster/cvmfs directory exists + file: + path: /exports/cluster/cvmfs + state: directory + owner: root + group: root + mode: 0755 + + - name: Copy EESSI CVMFS config to /exports/cluster + copy: + src: /etc/cvmfs/default.local + dest: /exports/cluster/cvmfs/default.local + owner: root + group: root + mode: 0644 + remote_src: true + - name: Write openhpc munge key copy: content: "{{ vault_openhpc_mungekey | b64decode }}" From 51b02d3f556f5c886a8ba1a3ae8f9de637ad14a6 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 20 Nov 2024 15:28:42 +0000 Subject: [PATCH 098/268] move manila mount share to nfs export --- .../roles/compute_init/files/compute-init.yml | 18 +++++++++++------- ansible/roles/compute_init/tasks/main.yml | 6 ++++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 65a05e1da..f78bbe9b7 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -8,15 +8,12 @@ control_node_ip: "172.16.1.228" resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] - nfs_export: "/exports/home" nfs_client_mnt_options: nfs_client_mnt_point: "/home" nfs_client_mnt_state: mounted nfs_server: "{{ control_node_ip }}" - - os_manila_mount_shares: [] os_manila_mount_state: mounted os_manila_mount_opts: - x-systemd.device-timeout=30 @@ -26,7 +23,6 @@ - rw os_manila_mount_ceph_conf_path: /etc/ceph - basic_users_manage_homedir: false basic_users_userdefaults: state: present @@ -38,10 +34,8 @@ - name: testuser # can't use rocky as $HOME isn't shared! 
password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent uid: 1005 - state: present basic_users_groups: [] - openhpc_conf_server: "{{ control_node_ip }}" tasks: @@ -114,15 +108,25 @@ - name: Manila mount block: - - name: Read manila share from nfs file + - name: Read manila share info from nfs file slurp: src: "/mnt/cluster/manila_share_info.yml" register: manila_share_info_file + no_log: true - name: Parse and set fact for manila share info set_fact: os_manila_mount_share_info: "{{ manila_share_info_file.content | b64decode | from_yaml }}" + - name: Read manila shares from nfs file + slurp: + src: "/mnt/cluster/manila_shares.yml" + register: manila_shares_file + + - name: Parse and set fact for manila shares + set_fact: + os_manila_mount_shares: "{{ manila_shares_file.content | b64decode | from_yaml }}" + - name: Ensure Ceph configuration directory exists ansible.builtin.file: path: "{{ os_manila_mount_ceph_conf_path }}" diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index 0e52d8892..f2bbfb72d 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -91,6 +91,12 @@ content: "{{ os_manila_mount_share_info | to_nice_yaml }}" dest: "/exports/cluster/manila_share_info.yml" when: os_manila_mount_share_info is defined + + - name: Copy manila mount shares to /exports/cluster + copy: + content: "{{ os_manila_mount_shares | to_nice_yaml }}" + dest: "/exports/cluster/manila_shares.yml" + when: os_manila_mount_shares is defined - name: Ensure /exports/cluster/cvmfs directory exists file: From 134515d2348fe67aa02a162fa05c4a3111530092 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 20 Nov 2024 15:37:48 +0000 Subject: [PATCH 099/268] Pause CI testing for branch feat/compute-script --- .github/workflows/stackhpc.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b08854adb..848517bb8 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -24,6 +24,8 @@ on: - '!.gitignore' - '!.github/workflows/' - '.github/workflows/stackhpc' + branches: + - '!feat/compute-script' jobs: openstack: name: openstack-ci From 40d9e1fe108b146c9d7b680d5834c10c940b9191 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 22 Nov 2024 16:09:47 +0000 Subject: [PATCH 100/268] replaces system repos with ark repos during ci --- .github/workflows/fatimage.yml | 2 + .github/workflows/nightlybuild.yml | 2 + ansible/.gitignore | 2 + ansible/roles/release_train/defaults/main.yml | 8 ++ .../release_train/tasks/revert_repos.yml | 19 ++++ .../roles/release_train/tasks/set_repos.yml | 22 +++++ .../templates/rocky-extras.repo.j2 | 65 +++++++++++++ .../release_train/templates/rocky.repo.j2 | 93 +++++++++++++++++++ environments/.stackhpc/hooks/post.yml | 10 +- environments/.stackhpc/hooks/pre.yml | 8 ++ packer/openstack.pkr.hcl | 6 ++ 11 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 ansible/roles/release_train/defaults/main.yml create mode 100644 ansible/roles/release_train/tasks/revert_repos.yml create mode 100644 ansible/roles/release_train/tasks/set_repos.yml create mode 100644 ansible/roles/release_train/templates/rocky-extras.repo.j2 create mode 100644 ansible/roles/release_train/templates/rocky.repo.j2 diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 
a8d3dbe29..cca652ef6 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -39,6 +39,7 @@ jobs: "openstack.openhpc": "rocky-latest-RL9" } } + ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: - uses: actions/checkout@v2 @@ -87,6 +88,7 @@ jobs: -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "ark_password=${ARK_PASSWORD}" \ openstack.pkr.hcl env: PKR_VAR_os_version: ${{ matrix.os_version }} diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index da3de4ea5..7fab8ebec 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -37,6 +37,7 @@ jobs: "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2", "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" } + ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: - uses: actions/checkout@v2 @@ -85,6 +86,7 @@ jobs: -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "ark_password=${ARK_PASSWORD}" \ openstack.pkr.hcl env: diff --git a/ansible/.gitignore b/ansible/.gitignore index 8edcc4360..ad841dc38 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -64,3 +64,5 @@ roles/* !roles/k9s/** !roles/lustre/ !roles/lustre/** +!roles/release_train/ +!roles/release_train/** diff --git a/ansible/roles/release_train/defaults/main.yml b/ansible/roles/release_train/defaults/main.yml new file mode 100644 index 000000000..dbae5e3b9 --- /dev/null +++ b/ansible/roles/release_train/defaults/main.yml @@ -0,0 +1,8 @@ +release_train_url_prefix: https://ark.stackhpc.com/pulp/content/rocky/9.4 +release_train_url_suffix: "x86_64/os/{{ release_train_timestamp }}/" +# most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml +# note that some timestamps can't be used because not all repos have snapshots for them +release_train_timestamp: 20240816T002610 +release_train_auth: | + username = slurm-app-ci + password = {{ _github_secrets_ark_password }} \ No newline at end of file diff --git a/ansible/roles/release_train/tasks/revert_repos.yml b/ansible/roles/release_train/tasks/revert_repos.yml new file mode 100644 index 000000000..8780ac13c --- /dev/null +++ b/ansible/roles/release_train/tasks/revert_repos.yml @@ -0,0 +1,19 @@ +--- + +- name: Check for backup folder exists + stat: + path: /etc/yum.repos.d.backup + register: _stat_yum_backup_file + +- name: Fail if backup folder doesn't exist + assert: + that: _stat_yum_backup_file.stat.exists + +- name: Remove ark repos + ansible.builtin.file: + state: absent + path: /etc/yum.repos.d + +- name: Restore backup repos + ansible.builtin.shell: + cmd: mv /etc/yum.repos.d.backup /etc/yum.repos.d diff --git a/ansible/roles/release_train/tasks/set_repos.yml b/ansible/roles/release_train/tasks/set_repos.yml new file mode 100644 index 000000000..f527a85d8 --- /dev/null +++ b/ansible/roles/release_train/tasks/set_repos.yml @@ -0,0 +1,22 @@ +--- + +- name: Check for existing backup folder + stat: + path: /etc/yum.repos.d.backup + register: _stat_yum_backup_file + +- name: Backup existing package repos + ansible.builtin.copy: + remote_src: true + src: /etc/yum.repos.d/ + dest: /etc/yum.repos.d.backup + when: not _stat_yum_backup_file.stat.exists + +- name: Replace package repos with release train repos + no_log: true + ansible.builtin.template: + 
src: "{{ item }}.j2" + dest: /etc/yum.repos.d/{{ item }} + loop: + - rocky-extras.repo + - rocky.repo \ No newline at end of file diff --git a/ansible/roles/release_train/templates/rocky-extras.repo.j2 b/ansible/roles/release_train/templates/rocky-extras.repo.j2 new file mode 100644 index 000000000..78bed03d5 --- /dev/null +++ b/ansible/roles/release_train/templates/rocky-extras.repo.j2 @@ -0,0 +1,65 @@ +# rocky-extras.repo +# +# The mirrorlist system uses the connecting IP address of the client and the +# update status of each mirror to pick current mirrors that are geographically +# close to the client. You should use this for Rocky updates unless you are +# manually picking other mirrors. +# +# If the mirrorlist does not work for you, you can try the commented out +# baseurl line instead. + +[extras] +name=Rocky Linux $releasever - Extras +baseurl={{ release_train_url_prefix }}/extras/{{ release_train_url_suffix }} +gpgcheck=1 +enabled=1 +countme=1 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 +{{ release_train_auth }} + +[extras-debuginfo] +name=Rocky Linux $releasever - Extras Debug +mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=extras-$releasever-debug$rltype +#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/extras/$basearch/debug/tree/ +gpgcheck=1 +enabled=0 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 + +[extras-source] +name=Rocky Linux $releasever - Extras Source +mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=extras-$releasever-source$rltype +#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/extras/source/tree/ +gpgcheck=1 +enabled=0 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 + +[plus] +name=Rocky Linux $releasever - Plus +mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=plus-$releasever$rltype +#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/$basearch/os/ +gpgcheck=1 +enabled=0 +countme=1 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 + +[plus-debuginfo] +name=Rocky Linux $releasever - Plus - Debug +mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=plus-$releasever-debug$rltype +#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/$basearch/debug/tree/ +gpgcheck=1 +enabled=0 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 + +[plus-source] +name=Rocky Linux $releasever - Plus - Source +mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=plus-$releasever-source$rltype +#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/source/tree/ +gpgcheck=1 +enabled=0 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 diff --git a/ansible/roles/release_train/templates/rocky.repo.j2 b/ansible/roles/release_train/templates/rocky.repo.j2 new file mode 100644 index 000000000..29d6aee42 --- /dev/null +++ b/ansible/roles/release_train/templates/rocky.repo.j2 @@ -0,0 +1,93 @@ +# rocky.repo +# +# The mirrorlist system uses the connecting IP address of the client and the +# update status of each mirror to pick current mirrors that are geographically +# close to the client. You should use this for Rocky updates unless you are +# manually picking other mirrors. +# +# If the mirrorlist does not work for you, you can try the commented out +# baseurl line instead. 
+ +[baseos] +name=Rocky Linux $releasever - BaseOS +baseurl={{ release_train_url_prefix }}/BaseOS/{{ release_train_url_suffix }} +gpgcheck=1 +enabled=1 +countme=1 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 +{{ release_train_auth }} + +[baseos-debuginfo] +name=Rocky Linux $releasever - BaseOS - Debug +mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=BaseOS-$releasever-debug$rltype +#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/BaseOS/$basearch/debug/tree/ +gpgcheck=1 +enabled=0 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 + +[baseos-source] +name=Rocky Linux $releasever - BaseOS - Source +mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=BaseOS-$releasever-source$rltype +#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/BaseOS/source/tree/ +gpgcheck=1 +enabled=0 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 + +[appstream] +name=Rocky Linux $releasever - AppStream +baseurl={{ release_train_url_prefix }}/AppStream/{{ release_train_url_suffix }} +gpgcheck=1 +enabled=1 +countme=1 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 +{{ release_train_auth }} + +[appstream-debuginfo] +name=Rocky Linux $releasever - AppStream - Debug +mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=AppStream-$releasever-debug$rltype +#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/AppStream/$basearch/debug/tree/ +gpgcheck=1 +enabled=0 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 + +[appstream-source] +name=Rocky Linux $releasever - AppStream - Source +mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=AppStream-$releasever-source$rltype +#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/AppStream/source/tree/ +gpgcheck=1 +enabled=0 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 + +[crb] +name=Rocky Linux $releasever - CRB +baseurl={{ release_train_url_prefix }}/CRB/{{ release_train_url_suffix }} +gpgcheck=1 +enabled=0 +countme=1 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 +{{ release_train_auth }} + +[crb-debuginfo] +name=Rocky Linux $releasever - CRB - Debug +mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=CRB-$releasever-debug$rltype +#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/CRB/$basearch/debug/tree/ +gpgcheck=1 +enabled=0 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 + +[crb-source] +name=Rocky Linux $releasever - CRB - Source +mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=CRB-$releasever-source$rltype +#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/CRB/source/tree/ +gpgcheck=1 +enabled=0 +metadata_expire=6h +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml index bd60015d9..71ed02af8 100644 --- a/environments/.stackhpc/hooks/post.yml +++ b/environments/.stackhpc/hooks/post.yml @@ -11,4 +11,12 @@ with_items: - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock" - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock" - - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock \ No newline at end of file + - 
/var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock + +- hosts: builder + become: yes + tasks: + - name: Revert ark repos + ansible.builtin.include_role: + name: release_train + tasks_from: revert_repos.yml diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 0fdbf9f60..51e4bb5f0 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -17,3 +17,11 @@ - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts.yml" - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml" - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml" + +- hosts: builder + become: yes + tasks: + - name: Replace system repos with ark + ansible.builtin.include_role: + name: release_train + tasks_from: set_repos.yml diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 52202ead1..b5e6e4790 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -167,6 +167,11 @@ variable "extra_build_image_name" { default = "extra" } +variable "ark_password" { + type = string + default = "none" +} + source "openstack" "openhpc" { # Build VM: flavor = var.flavor @@ -228,6 +233,7 @@ build { "-i", "${var.repo_root}/packer/ansible-inventory.sh", "-vv", "-e", "@${var.repo_root}/packer/openhpc_extravars.yml", # not overridable by environments + "-e", "_github_secrets_ark_password=${var.ark_password}", ] } From 9ef7d69563c39c86c0791a5b327ca1cb898fe10f Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 25 Nov 2024 08:45:01 +0000 Subject: [PATCH 101/268] now uses lookup instead of packer args --- .github/workflows/fatimage.yml | 1 - .github/workflows/nightlybuild.yml | 1 - ansible/roles/release_train/defaults/main.yml | 2 +- packer/openstack.pkr.hcl | 6 ------ 4 files changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index cca652ef6..217b09c22 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -88,7 +88,6 @@ jobs: -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ - -var "ark_password=${ARK_PASSWORD}" \ openstack.pkr.hcl env: PKR_VAR_os_version: ${{ matrix.os_version }} diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 7fab8ebec..9f45b0890 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -86,7 +86,6 @@ jobs: -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ - -var "ark_password=${ARK_PASSWORD}" \ openstack.pkr.hcl env: diff --git a/ansible/roles/release_train/defaults/main.yml b/ansible/roles/release_train/defaults/main.yml index dbae5e3b9..7c007c59a 100644 --- a/ansible/roles/release_train/defaults/main.yml +++ b/ansible/roles/release_train/defaults/main.yml @@ -5,4 +5,4 @@ release_train_url_suffix: "x86_64/os/{{ release_train_timestamp }}/" release_train_timestamp: 20240816T002610 release_train_auth: | username = slurm-app-ci - password = {{ _github_secrets_ark_password }} \ No newline at end of file + password = {{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }} diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index b5e6e4790..52202ead1 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -167,11 +167,6 @@ 
variable "extra_build_image_name" { default = "extra" } -variable "ark_password" { - type = string - default = "none" -} - source "openstack" "openhpc" { # Build VM: flavor = var.flavor @@ -233,7 +228,6 @@ build { "-i", "${var.repo_root}/packer/ansible-inventory.sh", "-vv", "-e", "@${var.repo_root}/packer/openhpc_extravars.yml", # not overridable by environments - "-e", "_github_secrets_ark_password=${var.ark_password}", ] } From a6e12438d2920f3b2d928441f3e57f01dfb78ec7 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 25 Nov 2024 08:55:04 +0000 Subject: [PATCH 102/268] only applies to RL9 for now --- environments/.stackhpc/hooks/post.yml | 1 + environments/.stackhpc/hooks/pre.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml index 71ed02af8..9622797ef 100644 --- a/environments/.stackhpc/hooks/post.yml +++ b/environments/.stackhpc/hooks/post.yml @@ -20,3 +20,4 @@ ansible.builtin.include_role: name: release_train tasks_from: revert_repos.yml + when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 51e4bb5f0..a15df2cd4 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -25,3 +25,4 @@ ansible.builtin.include_role: name: release_train tasks_from: set_repos.yml + when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided From 3e80268714e8a50c9d78a773e02f12daeac6240f Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 25 Nov 2024 12:17:16 +0000 Subject: [PATCH 103/268] set up rocky-latest-test builds and ci --- .github/workflows/fatimage.yml | 4 ++-- .github/workflows/nightlybuild.yml | 4 ++-- packer/openstack.pkr.hcl | 8 ++++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index a8d3dbe29..d40504168 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -33,10 +33,10 @@ jobs: SOURCE_IMAGES_MAP: | { "RL8": { - "openstack.openhpc": "rocky-latest-RL8" + "openstack.openhpc": "rocky-latest-test-RL8" }, "RL9": { - "openstack.openhpc": "rocky-latest-RL9" + "openstack.openhpc": "rocky-latest-test-RL9" } } diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index da3de4ea5..66cbe8ba7 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -27,7 +27,7 @@ jobs: - RL8 - RL9 build: - - openstack.rocky-latest + - openstack.rocky-latest-test env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -138,7 +138,7 @@ jobs: - RL8 - RL9 image: - - rocky-latest + - rocky-latest-test exclude: - target_cloud: LEAFCLOUD env: diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 52202ead1..6fb1ff633 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -127,6 +127,7 @@ variable "volume_size" { default = { # fat image builds, GB: rocky-latest = 15 + rocky-latest-test = 15 openhpc = 15 } } @@ -152,6 +153,7 @@ variable "groups" { default = { # fat image builds: rocky-latest = ["update"] + rocky-latest-test = ["update"] openhpc = ["control", "compute", "login"] } } @@ -206,6 +208,12 @@ build { image_name = "${source.name}-${var.os_version}" } + # latest nightly image test: + source "source.openstack.openhpc" { + name = "rocky-latest-test" + image_name = "${source.name}-${var.os_version}" + } + # fat image: source 
"source.openstack.openhpc" { name = "openhpc" From 151746cad8d4d3abad63eab9df712392b2968e88 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 25 Nov 2024 13:25:32 +0000 Subject: [PATCH 104/268] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index f9a2087c8..71a9162f8 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241118-0918-4538c6df", - "RL9": "openhpc-RL9-241118-0918-4538c6df" + "RL8": "openhpc-RL8-241125-1232-3e802687", + "RL9": "openhpc-RL9-241125-1232-3e802687" } } From 9c3301c28ebdccc8bb5574b50c12de7e75ef971a Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 25 Nov 2024 13:28:26 +0000 Subject: [PATCH 105/268] CI_CLOUD PR label override for trivy scan --- .github/workflows/trivyscan.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/trivyscan.yml b/.github/workflows/trivyscan.yml index 4c090b85a..5b65baca1 100644 --- a/.github/workflows/trivyscan.yml +++ b/.github/workflows/trivyscan.yml @@ -25,6 +25,20 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Override CI_CLOUD if PR label is present + if: ${{ github.event_name == 'pull_request' }} + run: | + # Iterate over the labels + labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name') + echo $labels + for label in $labels; do + if [[ $label == CI_CLOUD=* ]]; then + # Extract the value after 'CI_CLOUD=' + CI_CLOUD_OVERRIDE=${label#CI_CLOUD=} + echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV + fi + done + - name: Record settings for CI cloud run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} From b2b21603b4246266a72f6d7b304a1ca086eaa762 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 25 Nov 2024 14:19:33 +0000 Subject: [PATCH 106/268] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 71a9162f8..70422736e 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241125-1232-3e802687", - "RL9": "openhpc-RL9-241125-1232-3e802687" + "RL8": "openhpc-RL8-241125-1349-9c3301c2", + "RL9": "openhpc-RL9-241125-1349-9c3301c2" } } From 0da074ba5b52fd0dfb63e824f88743f6e53bf562 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 25 Nov 2024 15:31:55 +0000 Subject: [PATCH 107/268] bump containers.podman collection version --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 3d8c44011..142f377e5 100644 --- a/requirements.yml +++ b/requirements.yml @@ -25,7 +25,7 @@ roles: collections: - name: containers.podman - version: 1.10.2 + version: 1.16.2 - name: community.grafana version: 1.5.4 - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools From 5ae1888fd7a70559c7b6575368adefaa25e6ddbc Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 25 Nov 2024 22:30:00 +0000 Subject: [PATCH 108/268] bump images --- 
.../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 70422736e..a4b65df6e 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241125-1349-9c3301c2", - "RL9": "openhpc-RL9-241125-1349-9c3301c2" + "RL8": "openhpc-RL8-241125-1804-0da074ba", + "RL9": "openhpc-RL9-241125-1804-0da074ba" } } From b4d2d19d22fa6fb18c34bf4b1551ef4eae38569b Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 26 Nov 2024 09:57:57 +0000 Subject: [PATCH 109/268] debug site.yml --- .github/workflows/stackhpc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b08854adb..17b142713 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -124,7 +124,7 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate ansible all -m wait_for_connection - ansible-playbook -v ansible/site.yml + ansible-playbook -vvv ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - name: Run MPI-based tests From 88e23de80019c30513b358b6c6d80bbd6c8b5acf Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 26 Nov 2024 10:21:37 +0000 Subject: [PATCH 110/268] mysql latest From 1eeef3790b91fa844290a968d438947d0f48e2ee Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:45:01 +0000 Subject: [PATCH 111/268] Bump openhpc role for slurm restart, templating and nodes in multiple groups --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 3d8c44011..579cdd5d5 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ roles: - src: stackhpc.nfs version: v23.12.1 # Tolerate state nfs file handles - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.26.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/168 + version: v0.27.0 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From 6671d69c44de8dc7d5cc1ed15ad4b136eed58215 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 26 Nov 2024 12:00:52 +0000 Subject: [PATCH 112/268] bump mysql From f66feb9ba5c1b15c5a8d3e6ca25e748549a755c5 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 27 Nov 2024 11:51:33 +0000 Subject: [PATCH 113/268] simplify slurm-init file injection loop --- ansible/roles/compute_init/tasks/main.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index f2bbfb72d..15ba586d1 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -2,16 +2,16 @@ - name: Ensure directories exist file: - path: "/etc/ansible-init/{{ item.directory }}" + path: "/etc/ansible-init/{{ item }}" state: directory owner: root group: root mode: 0755 loop: - - { directory: "templates" } - - { directory: "files" } - - { directory: "library" } - - { directory: "filter_plugins" } + - templates + - files + - library + - filter_plugins - name: Inject templates copy: @@ -35,7 +35,7 @@ loop: - ../../resolv_conf/files/NetworkManager-dns-none.conf -- name: 
Inject files +- name: Inject libraries copy: src: '{{ item }}' dest: '/etc/ansible-init/library/{{ item | basename }}' From 6a8266c37c0aa5e6c321eabbcd539927d12e599f Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 27 Nov 2024 16:37:32 +0000 Subject: [PATCH 114/268] clear podman temp files on startup --- ansible/roles/podman/tasks/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/podman/tasks/config.yml b/ansible/roles/podman/tasks/config.yml index 5fea3c2e0..74cf1d576 100644 --- a/ansible/roles/podman/tasks/config.yml +++ b/ansible/roles/podman/tasks/config.yml @@ -55,6 +55,7 @@ # Type Path Mode User Group Age Argument R! /tmp/containers-user-* R! /tmp/podman-run-* + R! /tmp/storage-run-* dest: /etc/tmpfiles.d/podman-local.conf owner: root group: root From 33ffa655daafe55bf7d613664fdf8547716d94a2 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 27 Nov 2024 20:13:35 +0000 Subject: [PATCH 115/268] bump new images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index a4b65df6e..830a96499 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241125-1804-0da074ba", - "RL9": "openhpc-RL9-241125-1804-0da074ba" + "RL8": "openhpc-RL8-241127-1704-6a8266c3", + "RL9": "openhpc-RL9-241127-1704-6a8266c3" } } From f4c5cfe639b7b6145bea5f796cf2b9fdc7d96718 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 28 Nov 2024 10:52:22 +0000 Subject: [PATCH 116/268] stop using rocky-latest-test images in CI --- .github/workflows/fatimage.yml | 4 ++-- .github/workflows/nightlybuild.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index d40504168..a8d3dbe29 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -33,10 +33,10 @@ jobs: SOURCE_IMAGES_MAP: | { "RL8": { - "openstack.openhpc": "rocky-latest-test-RL8" + "openstack.openhpc": "rocky-latest-RL8" }, "RL9": { - "openstack.openhpc": "rocky-latest-test-RL9" + "openstack.openhpc": "rocky-latest-RL9" } } diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 66cbe8ba7..da3de4ea5 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -27,7 +27,7 @@ jobs: - RL8 - RL9 build: - - openstack.rocky-latest-test + - openstack.rocky-latest env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -138,7 +138,7 @@ jobs: - RL8 - RL9 image: - - rocky-latest-test + - rocky-latest exclude: - target_cloud: LEAFCLOUD env: From d7a8dd20110abaf759a1bc0bea3d706282e2d242 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 28 Nov 2024 10:54:34 +0000 Subject: [PATCH 117/268] low verbosity CI site.yml --- .github/workflows/stackhpc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 17b142713..b08854adb 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -124,7 +124,7 @@ jobs: . venv/bin/activate . 
environments/.stackhpc/activate ansible all -m wait_for_connection - ansible-playbook -vvv ansible/site.yml + ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - name: Run MPI-based tests From 6faf91958fefdd424a0330956a70c0aca2d1a53c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 29 Nov 2024 15:48:07 +0000 Subject: [PATCH 118/268] refactored ark role, disabled repos at end of build and modified site to work with disabled repos --- ansible/.gitignore | 4 +- ansible/fatimage.yml | 4 + ansible/roles/dnf_repos/defaults/main.yml | 24 +++++ .../roles/dnf_repos/tasks/disable_repos.yml | 18 ++++ ansible/roles/dnf_repos/tasks/set_repos.yml | 25 +++++ ansible/roles/openondemand/tasks/main.yml | 1 + ansible/roles/openondemand/tasks/pam_auth.yml | 3 - ansible/roles/release_train/defaults/main.yml | 8 -- .../release_train/tasks/revert_repos.yml | 19 ---- .../roles/release_train/tasks/set_repos.yml | 22 ----- .../templates/rocky-extras.repo.j2 | 65 ------------- .../release_train/templates/rocky.repo.j2 | 93 ------------------- ansible/slurm.yml | 1 + environments/.stackhpc/hooks/post.yml | 6 +- environments/.stackhpc/hooks/pre.yml | 2 +- .../inventory/group_vars/all/defaults.yml | 1 + .../inventory/group_vars/all/openhpc.yml | 10 ++ packer/openhpc_extravars.yml | 1 + 18 files changed, 91 insertions(+), 216 deletions(-) create mode 100644 ansible/roles/dnf_repos/defaults/main.yml create mode 100644 ansible/roles/dnf_repos/tasks/disable_repos.yml create mode 100644 ansible/roles/dnf_repos/tasks/set_repos.yml delete mode 100644 ansible/roles/release_train/defaults/main.yml delete mode 100644 ansible/roles/release_train/tasks/revert_repos.yml delete mode 100644 ansible/roles/release_train/tasks/set_repos.yml delete mode 100644 ansible/roles/release_train/templates/rocky-extras.repo.j2 delete mode 100644 ansible/roles/release_train/templates/rocky.repo.j2 diff --git a/ansible/.gitignore b/ansible/.gitignore index ad841dc38..48c917c4f 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -64,5 +64,5 @@ roles/* !roles/k9s/** !roles/lustre/ !roles/lustre/** -!roles/release_train/ -!roles/release_train/** +!roles/dnf_repos/ +!roles/dnf_repos/** diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 7cad2dc59..ec0d4dd74 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -69,6 +69,10 @@ tasks_from: install.yml when: "'openhpc' in group_names" + - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build + yum: + name: mod_authnz_pam + # - import_playbook: portal.yml - name: Open Ondemand server (packages) include_role: diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml new file mode 100644 index 000000000..00778533c --- /dev/null +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -0,0 +1,24 @@ +dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/9.4 +dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_ark_timestamp }}/" +# most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml +# note that some timestamps can't be used because not all repos have snapshots for them +dnf_repos_ark_timestamp: 20240816T002610 +dnf_repos_username: slurm-app-ci +dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" + +# epel installed separately +dnf_repos_repolist: +- file: rocky + name: 
baseos + base_url: "{{ dnf_repos_rocky_ark_prefix }}/BaseOS/{{ dnf_repos_rocky_ark_suffix }}" +- file: rocky + name: appstream + base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}" +- file: rocky + name: crb + base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}" +- file: rocky-extras + name: extras + base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}" + +dnf_repos_epel_baseurl: https://ark.stackhpc.com/pulp/content/epel/9/Everything/x86_64/20240902T080424 diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml new file mode 100644 index 000000000..f8997b741 --- /dev/null +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -0,0 +1,18 @@ +--- +- name: Disable Pulp repos and remove creds + ansible.builtin.yum_repository: + file: "{{ item.file }}" + name: "{{ item.name }}" + baseurl: "{{ item.base_url }}" + description: "{{ item.name }}" + enabled: false + loop: "{{ dnf_repos_repolist }}" + +- name: Disable EPEL repo and remove creds + ansible.builtin.yum_repository: + name: epel + file: epel + description: epel + baseurl: "{{ dnf_repos_epel_baseurl }}" + gpgcheck: false + enabled: false diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml new file mode 100644 index 000000000..2c51b96ae --- /dev/null +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -0,0 +1,25 @@ +--- + +- name: Replace system repos with Pulp repos + ansible.builtin.yum_repository: + file: "{{ item.file }}" + name: "{{ item.name }}" + baseurl: "{{ item.base_url }}" + description: "{{ item.name }}" + username: "{{ dnf_repos_username }}" + password: "{{ dnf_repos_password }}" + loop: "{{ dnf_repos_repolist }}" + +- name: Install epel-release + ansible.builtin.dnf: + name: epel-release + +- name: Use Pulp EPEL repo + ansible.builtin.yum_repository: + name: epel + file: epel + description: epel + gpgcheck: false + username: "{{ dnf_repos_username }}" + password: "{{ dnf_repos_password }}" + baseurl: "{{ dnf_repos_epel_baseurl }}" diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml index 86184f13c..a9b975c5b 100644 --- a/ansible/roles/openondemand/tasks/main.yml +++ b/ansible/roles/openondemand/tasks/main.yml @@ -12,6 +12,7 @@ tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" public: yes # Expose the vars from this role to the rest of the play + when: appliances_mode != 'configure' # can't set vars: from a dict hence the workaround above - include_tasks: diff --git a/ansible/roles/openondemand/tasks/pam_auth.yml b/ansible/roles/openondemand/tasks/pam_auth.yml index 0edce622f..3ede2d3ce 100644 --- a/ansible/roles/openondemand/tasks/pam_auth.yml +++ b/ansible/roles/openondemand/tasks/pam_auth.yml @@ -1,8 +1,5 @@ # https://osc.github.io/ood-documentation/latest/authentication/pam.html --- -- name: Install Apache PAM module - yum: - name: mod_authnz_pam - name: Enable Apache PAM module lineinfile: diff --git a/ansible/roles/release_train/defaults/main.yml b/ansible/roles/release_train/defaults/main.yml deleted file mode 100644 index 7c007c59a..000000000 --- a/ansible/roles/release_train/defaults/main.yml +++ /dev/null @@ -1,8 +0,0 @@ -release_train_url_prefix: https://ark.stackhpc.com/pulp/content/rocky/9.4 -release_train_url_suffix: "x86_64/os/{{ release_train_timestamp }}/" -# most stable from 
https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml -# note that some timestamps can't be used because not all repos have snapshots for them -release_train_timestamp: 20240816T002610 -release_train_auth: | - username = slurm-app-ci - password = {{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }} diff --git a/ansible/roles/release_train/tasks/revert_repos.yml b/ansible/roles/release_train/tasks/revert_repos.yml deleted file mode 100644 index 8780ac13c..000000000 --- a/ansible/roles/release_train/tasks/revert_repos.yml +++ /dev/null @@ -1,19 +0,0 @@ ---- - -- name: Check for backup folder exists - stat: - path: /etc/yum.repos.d.backup - register: _stat_yum_backup_file - -- name: Fail if backup folder doesn't exist - assert: - that: _stat_yum_backup_file.stat.exists - -- name: Remove ark repos - ansible.builtin.file: - state: absent - path: /etc/yum.repos.d - -- name: Restore backup repos - ansible.builtin.shell: - cmd: mv /etc/yum.repos.d.backup /etc/yum.repos.d diff --git a/ansible/roles/release_train/tasks/set_repos.yml b/ansible/roles/release_train/tasks/set_repos.yml deleted file mode 100644 index f527a85d8..000000000 --- a/ansible/roles/release_train/tasks/set_repos.yml +++ /dev/null @@ -1,22 +0,0 @@ ---- - -- name: Check for existing backup folder - stat: - path: /etc/yum.repos.d.backup - register: _stat_yum_backup_file - -- name: Backup existing package repos - ansible.builtin.copy: - remote_src: true - src: /etc/yum.repos.d/ - dest: /etc/yum.repos.d.backup - when: not _stat_yum_backup_file.stat.exists - -- name: Replace package repos with release train repos - no_log: true - ansible.builtin.template: - src: "{{ item }}.j2" - dest: /etc/yum.repos.d/{{ item }} - loop: - - rocky-extras.repo - - rocky.repo \ No newline at end of file diff --git a/ansible/roles/release_train/templates/rocky-extras.repo.j2 b/ansible/roles/release_train/templates/rocky-extras.repo.j2 deleted file mode 100644 index 78bed03d5..000000000 --- a/ansible/roles/release_train/templates/rocky-extras.repo.j2 +++ /dev/null @@ -1,65 +0,0 @@ -# rocky-extras.repo -# -# The mirrorlist system uses the connecting IP address of the client and the -# update status of each mirror to pick current mirrors that are geographically -# close to the client. You should use this for Rocky updates unless you are -# manually picking other mirrors. -# -# If the mirrorlist does not work for you, you can try the commented out -# baseurl line instead. 
- -[extras] -name=Rocky Linux $releasever - Extras -baseurl={{ release_train_url_prefix }}/extras/{{ release_train_url_suffix }} -gpgcheck=1 -enabled=1 -countme=1 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 -{{ release_train_auth }} - -[extras-debuginfo] -name=Rocky Linux $releasever - Extras Debug -mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=extras-$releasever-debug$rltype -#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/extras/$basearch/debug/tree/ -gpgcheck=1 -enabled=0 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 - -[extras-source] -name=Rocky Linux $releasever - Extras Source -mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=extras-$releasever-source$rltype -#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/extras/source/tree/ -gpgcheck=1 -enabled=0 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 - -[plus] -name=Rocky Linux $releasever - Plus -mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=plus-$releasever$rltype -#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/$basearch/os/ -gpgcheck=1 -enabled=0 -countme=1 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 - -[plus-debuginfo] -name=Rocky Linux $releasever - Plus - Debug -mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=plus-$releasever-debug$rltype -#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/$basearch/debug/tree/ -gpgcheck=1 -enabled=0 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 - -[plus-source] -name=Rocky Linux $releasever - Plus - Source -mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=plus-$releasever-source$rltype -#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/source/tree/ -gpgcheck=1 -enabled=0 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 diff --git a/ansible/roles/release_train/templates/rocky.repo.j2 b/ansible/roles/release_train/templates/rocky.repo.j2 deleted file mode 100644 index 29d6aee42..000000000 --- a/ansible/roles/release_train/templates/rocky.repo.j2 +++ /dev/null @@ -1,93 +0,0 @@ -# rocky.repo -# -# The mirrorlist system uses the connecting IP address of the client and the -# update status of each mirror to pick current mirrors that are geographically -# close to the client. You should use this for Rocky updates unless you are -# manually picking other mirrors. -# -# If the mirrorlist does not work for you, you can try the commented out -# baseurl line instead. 
- -[baseos] -name=Rocky Linux $releasever - BaseOS -baseurl={{ release_train_url_prefix }}/BaseOS/{{ release_train_url_suffix }} -gpgcheck=1 -enabled=1 -countme=1 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 -{{ release_train_auth }} - -[baseos-debuginfo] -name=Rocky Linux $releasever - BaseOS - Debug -mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=BaseOS-$releasever-debug$rltype -#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/BaseOS/$basearch/debug/tree/ -gpgcheck=1 -enabled=0 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 - -[baseos-source] -name=Rocky Linux $releasever - BaseOS - Source -mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=BaseOS-$releasever-source$rltype -#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/BaseOS/source/tree/ -gpgcheck=1 -enabled=0 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 - -[appstream] -name=Rocky Linux $releasever - AppStream -baseurl={{ release_train_url_prefix }}/AppStream/{{ release_train_url_suffix }} -gpgcheck=1 -enabled=1 -countme=1 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 -{{ release_train_auth }} - -[appstream-debuginfo] -name=Rocky Linux $releasever - AppStream - Debug -mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=AppStream-$releasever-debug$rltype -#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/AppStream/$basearch/debug/tree/ -gpgcheck=1 -enabled=0 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 - -[appstream-source] -name=Rocky Linux $releasever - AppStream - Source -mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=AppStream-$releasever-source$rltype -#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/AppStream/source/tree/ -gpgcheck=1 -enabled=0 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 - -[crb] -name=Rocky Linux $releasever - CRB -baseurl={{ release_train_url_prefix }}/CRB/{{ release_train_url_suffix }} -gpgcheck=1 -enabled=0 -countme=1 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 -{{ release_train_auth }} - -[crb-debuginfo] -name=Rocky Linux $releasever - CRB - Debug -mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=CRB-$releasever-debug$rltype -#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/CRB/$basearch/debug/tree/ -gpgcheck=1 -enabled=0 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 - -[crb-source] -name=Rocky Linux $releasever - CRB - Source -mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=CRB-$releasever-source$rltype -#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/CRB/source/tree/ -gpgcheck=1 -enabled=0 -metadata_expire=6h -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9 diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 0b7397242..f2d37a60c 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -27,6 +27,7 @@ tasks: - import_role: name: stackhpc.openhpc + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" - name: Set locked memory limits on user-facing nodes hosts: diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml index 9622797ef..98e366304 100644 --- a/environments/.stackhpc/hooks/post.yml +++ b/environments/.stackhpc/hooks/post.yml @@ -16,8 +16,8 @@ - hosts: builder become: yes tasks: - - name: Revert ark repos + - name: Disable ark 
       ansible.builtin.include_role:
-        name: release_train
-        tasks_from: revert_repos.yml
+        name: dnf_repos
+        tasks_from: disable_repos.yml
       when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml
index a15df2cd4..9ea84740d 100644
--- a/environments/.stackhpc/hooks/pre.yml
+++ b/environments/.stackhpc/hooks/pre.yml
@@ -23,6 +23,6 @@
   tasks:
     - name: Replace system repos with ark
       ansible.builtin.include_role:
-        name: release_train
+        name: dnf_repos
         tasks_from: set_repos.yml
       when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index 15340820f..2a88f035d 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -6,6 +6,7 @@ appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}
 appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only
 appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it
 #appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform
+appliances_mode: configure
 
 # Address(ip/dns) for internal communication between services. This is
 # normally traffic you do no want to expose to users.
diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml
index c613fc697..a23bc77ba 100644
--- a/environments/common/inventory/group_vars/all/openhpc.yml
+++ b/environments/common/inventory/group_vars/all/openhpc.yml
@@ -38,3 +38,13 @@ openhpc_config_default:
 openhpc_config_extra: {}
 openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}"
 openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}"
+
+ohpc_default_extra_repos:
+  "9": [] #overriding to ensure doesn't overwrite ark epel repo
+  "8":
+    - name: epel
+      file: epel
+      description: "Extra Packages for Enterprise Linux 8 - $basearch"
+      metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir"
+      gpgcheck: true
+      gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8"
diff --git a/packer/openhpc_extravars.yml b/packer/openhpc_extravars.yml
index 66f668649..e68741c01 100644
--- a/packer/openhpc_extravars.yml
+++ b/packer/openhpc_extravars.yml
@@ -1 +1,2 @@
 workaround_ansible_issue_61497: yes # extravars files can't be empty
+appliances_mode: "build"

From 0bc473c27b5c29fa15d87da059ef88d438d58766 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Tue, 3 Dec 2024 14:12:42 +0000
Subject: [PATCH 119/268] fixed ood install with disabled repos + fixed ark CRB typo

---
 ansible/fatimage.yml                             | 8 ++++----
 ansible/roles/dnf_repos/defaults/main.yml        | 2 +-
 ansible/roles/openondemand/tasks/main.yml        | 8 +++++++-
 ansible/roles/openondemand/tasks/pam_auth.yml    | 3 +++
 ansible/roles/openondemand/tasks/vnc_compute.yml | 1 +
 5 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml
index ec0d4dd74..b28e4f308 100644
--- a/ansible/fatimage.yml
+++ b/ansible/fatimage.yml
@@ -69,10 +69,6 @@
         tasks_from: install.yml
       when: "'openhpc' in group_names"
 
-    - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build
-      yum:
-        name: mod_authnz_pam
-
     # - import_playbook: portal.yml
     - name: Open Ondemand server (packages)
       include_role:
@@ -102,6 +98,10 @@
         tasks_from: jupyter_compute.yml
       when: "'openondemand_jupyter' in group_names"
 
+    - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build
+      yum:
+        name: mod_authnz_pam
+
     # - import_playbook: monitoring.yml:
     - import_role:
         name: opensearch
diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml
index 00778533c..000ae3524 100644
--- a/ansible/roles/dnf_repos/defaults/main.yml
+++ b/ansible/roles/dnf_repos/defaults/main.yml
@@ -16,7 +16,7 @@ dnf_repos_repolist:
   base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}"
 - file: rocky
   name: crb
-  base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}"
+  base_url: "{{ dnf_repos_rocky_ark_prefix }}/CRB/{{ dnf_repos_rocky_ark_suffix }}"
 - file: rocky-extras
   name: extras
   base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}"
diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml
index a9b975c5b..bd5706ecb 100644
--- a/ansible/roles/openondemand/tasks/main.yml
+++ b/ansible/roles/openondemand/tasks/main.yml
@@ -6,12 +6,18 @@
   loop: "{{ openondemand_osc_ood_defaults | dict2items }}"
   when: (item.key in hostvars[inventory_hostname]) or (item.value)
 
+# osc.ood variables are exposed to play here instead of setting 'public' in include role so that they will still be exposed during runtime
+- ansible.builtin.include_vars:
+    dir: "{{ playbook_dir }}/roles/osc.ood/defaults/main"
+
+- ansible.builtin.include_vars:
+    file: "{{ playbook_dir }}/roles/osc.ood/vars/Rocky/{{ ansible_distribution_major_version }}.yml"
+
 # if using PAM auth we need apache installed but NOT started so split the osc.ood role up:
 - include_role:
     name: osc.ood
     tasks_from: install-package.yml
     vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml"
-    public: yes # Expose the vars from this role to the rest of the play
   when: appliances_mode != 'configure'
   # can't set vars: from a dict hence the workaround above
 - include_tasks:
diff --git a/ansible/roles/openondemand/tasks/pam_auth.yml b/ansible/roles/openondemand/tasks/pam_auth.yml
index 3ede2d3ce..6bc4bda36 100644
--- a/ansible/roles/openondemand/tasks/pam_auth.yml
+++ b/ansible/roles/openondemand/tasks/pam_auth.yml
@@ -1,5 +1,8 @@
 # https://osc.github.io/ood-documentation/latest/authentication/pam.html
 ---
+- name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build
+  yum:
+    name: mod_authnz_pam
 
 - name: Enable Apache PAM module
   lineinfile:
diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml
index 388e3b3c5..6ec340249 100644
--- a/ansible/roles/openondemand/tasks/vnc_compute.yml
+++ b/ansible/roles/openondemand/tasks/vnc_compute.yml
@@ -48,6 +48,7 @@
   tags: install
   yum:
     name: '@Xfce'
+  when: appliances_mode != 'configure' # dnf group/module installs aren't idempotent so only run during build
 
 # - name: Ensure python3.9 installed
 #   dnf:

From 364ec79252f11d707b8705068676e23e876357aa Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Tue, 3 Dec 2024 16:12:18 +0000
Subject: [PATCH 120/268] fixed eessi install and slurm not loading appliances_mode

---
 ansible/roles/eessi/tasks/main.yaml | 1 +
 ansible/slurm.yml                   | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ansible/roles/eessi/tasks/main.yaml b/ansible/roles/eessi/tasks/main.yaml
index d121b6fdd..c61625b0e 100644
--- a/ansible/roles/eessi/tasks/main.yaml
+++ b/ansible/roles/eessi/tasks/main.yaml
@@ -10,6 +10,7 @@
 - name: Add CVMFS repo
   dnf:
     name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm
+    disable_gpg_check: true
 
 - name: Install CVMFS
   dnf:
diff --git a/ansible/slurm.yml b/ansible/slurm.yml
index f2d37a60c..cf282f786 100644
--- a/ansible/slurm.yml
+++ b/ansible/slurm.yml
@@ -25,7 +25,7 @@
   tags:
     - openhpc
   tasks:
-    - import_role:
+    - include_role:
         name: stackhpc.openhpc
         tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}"
 

From b0558b95a162064d3a058d43fa012da2d3660a5a Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Tue, 3 Dec 2024 16:29:57 +0000
Subject: [PATCH 121/268] variable renames + more ansible facts in dnf_repos

---
 ansible/roles/dnf_repos/defaults/main.yml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml
index 000ae3524..a3e05d0e1 100644
--- a/ansible/roles/dnf_repos/defaults/main.yml
+++ b/ansible/roles/dnf_repos/defaults/main.yml
@@ -1,8 +1,8 @@
-dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/9.4
-dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_ark_timestamp }}/"
+dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}
+dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_rocky_ark_timestamp }}/"
 # most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml
 # note that some timestamps can't be used because not all repos have snapshots for them
-dnf_repos_ark_timestamp: 20240816T002610
+dnf_repos_rocky_ark_timestamp: 20240816T002610
 dnf_repos_username: slurm-app-ci
 dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
 
@@ -21,4 +21,5 @@
   name: extras
   base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}"
 
-dnf_repos_epel_baseurl: https://ark.stackhpc.com/pulp/content/epel/9/Everything/x86_64/20240902T080424
+dnf_repos_epel_timestamp: 20240902T080424
+dnf_repos_epel_baseurl: "https://ark.stackhpc.com/pulp/content/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ dnf_repos_epel_timestamp }}"

From 3131bd6d600c13f73bcd2336c3f74bda07d65af9 Mon Sep 17 00:00:00 2001
From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
Date: Tue, 3 Dec 2024 18:17:33 +0000
Subject: [PATCH 122/268] bump images

---
 .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index f9a2087c8..14c997596 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241118-0918-4538c6df",
-        "RL9": "openhpc-RL9-241118-0918-4538c6df"
+        "RL8": "openhpc-RL8-241203-1659-b0558b95",
+        "RL9": "openhpc-RL9-241203-1659-b0558b95"
     }
 }

From 1be9c6b7697e1d0a292f27ca4ee5f2702c8612fd Mon Sep 17 00:00:00 2001
From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
Date: Wed, 4 Dec 2024 10:00:44 +0000
Subject: [PATCH 123/268] added review comment

Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com>
---
 ansible/roles/dnf_repos/tasks/set_repos.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml
index 2c51b96ae..f8cca5600 100644
--- a/ansible/roles/dnf_repos/tasks/set_repos.yml
+++ b/ansible/roles/dnf_repos/tasks/set_repos.yml
@@ -11,6 +11,7 @@
   loop: "{{ dnf_repos_repolist }}"
 
 - name: Install epel-release
+  # done so that roles installing epel via epel-release don't over-write our changes to the epel repo
   ansible.builtin.dnf:
     name: epel-release
 

From b7670e94d371118f0eb1c5084d1a7a4044f6665a Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Wed, 4 Dec 2024 10:10:00 +0000
Subject: [PATCH 124/268] moved config into builder and .stackhpc

---
 .../inventory/group_vars/openhpc/overrides.yml        | 10 ++++++++++
 .../common/inventory/group_vars/all/openhpc.yml       | 10 ----------
 .../common/inventory/group_vars/builder/defaults.yml  |  1 +
 packer/openhpc_extravars.yml                          |  1 -
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml
index 5aac5f8ad..858dfd9d3 100644
--- a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml
+++ b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml
@@ -1,3 +1,13 @@
 openhpc_config_extra:
   SlurmctldDebug: debug
   SlurmdDebug: debug
+
+ohpc_default_extra_repos:
+  "9": [] #overriding to ensure doesn't overwrite ark epel repo
+  "8":
+    - name: epel
+      file: epel
+      description: "Extra Packages for Enterprise Linux 8 - $basearch"
+      metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir"
+      gpgcheck: true
+      gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8"
diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml
index a23bc77ba..c613fc697 100644
--- a/environments/common/inventory/group_vars/all/openhpc.yml
+++ b/environments/common/inventory/group_vars/all/openhpc.yml
@@ -38,13 +38,3 @@ openhpc_config_default:
 openhpc_config_extra: {}
 openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}"
 openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}"
-
-ohpc_default_extra_repos:
-  "9": [] #overriding to ensure doesn't overwrite ark epel repo
-  "8":
-    - name: epel
-      file: epel
-      description: "Extra Packages for Enterprise Linux 8 - $basearch"
-      metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir"
-      gpgcheck: true
-      gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8"
diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml
index 22042c1bf..b43d9f03c 100644
--- a/environments/common/inventory/group_vars/builder/defaults.yml
+++ b/environments/common/inventory/group_vars/builder/defaults.yml
@@ -22,3 +22,4 @@ squid_cache_disk: 0 # just needs to be defined
 squid_cache_mem: 0
 tuned_started: false
 tuned_enabled: false
+appliances_mode: build
diff --git a/packer/openhpc_extravars.yml b/packer/openhpc_extravars.yml
index e68741c01..66f668649 100644
--- a/packer/openhpc_extravars.yml
+++ b/packer/openhpc_extravars.yml
@@ -1,2 +1 @@
 workaround_ansible_issue_61497: yes # extravars files can't be empty
-appliances_mode: "build"

From 2230bb8af6b8bede2ea1f712913d76e59a70f79b Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Wed, 4 Dec 2024 10:17:26 +0000
Subject: [PATCH 125/268] overriding openhpc extra repos in common

---
 .../inventory/group_vars/openhpc/overrides.yml  | 10 ----------
 .../common/inventory/group_vars/all/openhpc.yml | 10 ++++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml
index 858dfd9d3..5aac5f8ad 100644
--- a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml
+++ b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml
@@ -1,13 +1,3 @@
 openhpc_config_extra:
   SlurmctldDebug: debug
   SlurmdDebug: debug
-
-ohpc_default_extra_repos:
-  "9": [] #overriding to ensure doesn't overwrite ark epel repo
-  "8":
-    - name: epel
-      file: epel
-      description: "Extra Packages for Enterprise Linux 8 - $basearch"
-      metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir"
-      gpgcheck: true
-      gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8"
diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml
index c613fc697..a23bc77ba 100644
--- a/environments/common/inventory/group_vars/all/openhpc.yml
+++ b/environments/common/inventory/group_vars/all/openhpc.yml
@@ -38,3 +38,13 @@ openhpc_config_default:
 openhpc_config_extra: {}
 openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}"
 openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}"
+
+ohpc_default_extra_repos:
+  "9": [] #overriding to ensure doesn't overwrite ark epel repo
+  "8":
+    - name: epel
+      file: epel
+      description: "Extra Packages for Enterprise Linux 8 - $basearch"
+      metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir"
+      gpgcheck: true
+      gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8"

From 4de581c71f2ef9976aa044286b4ed12c29b729cd Mon Sep 17 00:00:00 2001
From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
Date: Wed, 4 Dec 2024 16:50:21 +0000
Subject: [PATCH 126/268] Use rocky 9.4 release train snapshots for builds (#486)

* replaces system repos with ark repos during ci

* now uses lookup instead of packer args

* only applies to RL9 for now

* refactored ark role, disabled repos at end of build and modified site to work with disabled repos

* fixed ood install with disabled repos + fixed ark CRB typo

* fixed eessi install and slurm not loading appliances_mode

* variable renames + more ansible facts in dnf_repos

* bump images

* added review comment

Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com>

* moved config into builder and .stackhpc

---------

Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com>
---
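The squashed diff below is where image builds switch from the upstream Rocky mirrors to the authenticated StackHPC Ark snapshot repos, with build-only steps gated on the new appliances_mode variable. As a rough sketch of how a downstream environment might pin different snapshots, using only variable names defined in ansible/roles/dnf_repos/defaults/main.yml below (the timestamp values shown are simply the role defaults, not tested alternatives):

# hypothetical override in an environment's inventory/group_vars/builder.yml
dnf_repos_rocky_ark_timestamp: 20240816T002610  # Rocky snapshot; note not every timestamp exists for every repo
dnf_repos_epel_timestamp: 20240902T080424       # EPEL snapshot
dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"  # Ark credentials, supplied via CI secret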
 .github/workflows/fatimage.yml                   |  1 +
 .github/workflows/nightlybuild.yml               |  1 +
 ansible/.gitignore                               |  2 ++
 ansible/fatimage.yml                             |  4 +++
 ansible/roles/dnf_repos/defaults/main.yml        | 25 ++++++++++++++++++
 .../roles/dnf_repos/tasks/disable_repos.yml      | 18 +++++++++++++
 ansible/roles/dnf_repos/tasks/set_repos.yml      | 26 +++++++++++++++++++
 ansible/roles/eessi/tasks/main.yaml              |  1 +
 ansible/roles/openondemand/tasks/main.yml        |  9 ++++++-
 ansible/roles/openondemand/tasks/pam_auth.yml    |  2 +-
 .../roles/openondemand/tasks/vnc_compute.yml     |  1 +
 ansible/slurm.yml                                |  3 ++-
 environments/.stackhpc/hooks/post.yml            | 11 +++++++-
 environments/.stackhpc/hooks/pre.yml             |  9 +++++++
 .../group_vars/openhpc/overrides.yml             | 10 +++++++
 .../terraform/cluster_image.auto.tfvars.json     |  4 +--
 .../inventory/group_vars/all/defaults.yml        |  1 +
 .../inventory/group_vars/builder/defaults.yml    |  1 +
 18 files changed, 123 insertions(+), 6 deletions(-)
 create mode 100644 ansible/roles/dnf_repos/defaults/main.yml
 create mode 100644 ansible/roles/dnf_repos/tasks/disable_repos.yml
 create mode 100644 ansible/roles/dnf_repos/tasks/set_repos.yml

diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index a8d3dbe29..217b09c22 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -39,6 +39,7 @@ jobs:
           "openstack.openhpc": "rocky-latest-RL9"
         }
       }
+      ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
 
     steps:
       - uses: actions/checkout@v2
diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml
index da3de4ea5..9f45b0890 100644
--- a/.github/workflows/nightlybuild.yml
+++ b/.github/workflows/nightlybuild.yml
@@ -37,6 +37,7 @@ jobs:
         "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2",
         "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2"
       }
+      ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
 
     steps:
       - uses: actions/checkout@v2
diff --git a/ansible/.gitignore b/ansible/.gitignore
index 8edcc4360..48c917c4f 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -64,3 +64,5 @@ roles/*
 !roles/k9s/**
 !roles/lustre/
 !roles/lustre/**
+!roles/dnf_repos/
+!roles/dnf_repos/**
diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml
index 7cad2dc59..b28e4f308 100644
--- a/ansible/fatimage.yml
+++ b/ansible/fatimage.yml
@@ -98,6 +98,10 @@
         tasks_from: jupyter_compute.yml
       when: "'openondemand_jupyter' in group_names"
 
+    - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build
+      yum:
+        name: mod_authnz_pam
+
     # - import_playbook: monitoring.yml:
     - import_role:
         name: opensearch
diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml
new file mode 100644
index 000000000..a3e05d0e1
--- /dev/null
+++ b/ansible/roles/dnf_repos/defaults/main.yml
@@ -0,0 +1,25 @@
+dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}
+dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_rocky_ark_timestamp }}/"
+# most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml
+# note that some timestamps can't be used because not all repos have snapshots for them
+dnf_repos_rocky_ark_timestamp: 20240816T002610
+dnf_repos_username: slurm-app-ci
+dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
+
+# epel installed separately
+dnf_repos_repolist:
+- file: rocky
+  name: baseos
+  base_url: "{{ dnf_repos_rocky_ark_prefix }}/BaseOS/{{ dnf_repos_rocky_ark_suffix }}"
+- file: rocky
+  name: appstream
+  base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}"
+- file: rocky
+  name: crb
+  base_url: "{{ dnf_repos_rocky_ark_prefix }}/CRB/{{ dnf_repos_rocky_ark_suffix }}"
+- file: rocky-extras
+  name: extras
+  base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}"
+
+dnf_repos_epel_timestamp: 20240902T080424
+dnf_repos_epel_baseurl: "https://ark.stackhpc.com/pulp/content/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ dnf_repos_epel_timestamp }}"
diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml
new file mode 100644
index 000000000..f8997b741
--- /dev/null
+++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml
@@ -0,0 +1,18 @@
+---
+- name: Disable Pulp repos and remove creds
+  ansible.builtin.yum_repository:
+    file: "{{ item.file }}"
+    name: "{{ item.name }}"
+    baseurl: "{{ item.base_url }}"
+    description: "{{ item.name }}"
+    enabled: false
+  loop: "{{ dnf_repos_repolist }}"
+
+- name: Disable EPEL repo and remove creds
+  ansible.builtin.yum_repository:
+    name: epel
+    file: epel
+    description: epel
+    baseurl: "{{ dnf_repos_epel_baseurl }}"
+    gpgcheck: false
+    enabled: false
diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml
new file mode 100644
index 000000000..f8cca5600
--- /dev/null
+++ b/ansible/roles/dnf_repos/tasks/set_repos.yml
@@ -0,0 +1,26 @@
+---
+
+- name: Replace system repos with Pulp repos
+  ansible.builtin.yum_repository:
+    file: "{{ item.file }}"
+    name: "{{ item.name }}"
+    baseurl: "{{ item.base_url }}"
+    description: "{{ item.name }}"
+    username: "{{ dnf_repos_username }}"
+    password: "{{ dnf_repos_password }}"
+  loop: "{{ dnf_repos_repolist }}"
+
+- name: Install epel-release
+  # done so that roles installing epel via epel-release don't over-write our changes to the epel repo
+  ansible.builtin.dnf:
+    name: epel-release
+
+- name: Use Pulp EPEL repo
+  ansible.builtin.yum_repository:
+    name: epel
+    file: epel
+    description: epel
+    gpgcheck: false
+    username: "{{ dnf_repos_username }}"
+    password: "{{ dnf_repos_password }}"
+    baseurl: "{{ dnf_repos_epel_baseurl }}"
diff --git a/ansible/roles/eessi/tasks/main.yaml b/ansible/roles/eessi/tasks/main.yaml
index d121b6fdd..c61625b0e 100644
--- a/ansible/roles/eessi/tasks/main.yaml
+++ b/ansible/roles/eessi/tasks/main.yaml
@@ -10,6 +10,7 @@
 - name: Add CVMFS repo
   dnf:
     name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm
+    disable_gpg_check: true
 
 - name: Install CVMFS
   dnf:
diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml
index 86184f13c..bd5706ecb 100644
--- a/ansible/roles/openondemand/tasks/main.yml
+++ b/ansible/roles/openondemand/tasks/main.yml
@@ -6,12 +6,19 @@
   loop: "{{ openondemand_osc_ood_defaults | dict2items }}"
   when: (item.key in hostvars[inventory_hostname]) or (item.value)
 
+# osc.ood variables are exposed to play here instead of setting 'public' in include role so that they will still be exposed during runtime
+- ansible.builtin.include_vars:
+    dir: "{{ playbook_dir }}/roles/osc.ood/defaults/main"
+
+- ansible.builtin.include_vars:
+    file: "{{ playbook_dir }}/roles/osc.ood/vars/Rocky/{{ ansible_distribution_major_version }}.yml"
+
 # if using PAM auth we need apache installed but NOT started so split the osc.ood role up:
 - include_role:
     name: osc.ood
     tasks_from: install-package.yml
     vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml"
-    public: yes # Expose the vars from this role to the rest of the play
+  when: appliances_mode != 'configure'
   # can't set vars: from a dict hence the workaround above
 - include_tasks:
diff --git a/ansible/roles/openondemand/tasks/pam_auth.yml b/ansible/roles/openondemand/tasks/pam_auth.yml
index 0edce622f..6bc4bda36 100644
--- a/ansible/roles/openondemand/tasks/pam_auth.yml
+++ b/ansible/roles/openondemand/tasks/pam_auth.yml
@@ -1,6 +1,6 @@
 # https://osc.github.io/ood-documentation/latest/authentication/pam.html
 ---
-- name: Install Apache PAM module
+- name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build
   yum:
     name: mod_authnz_pam
 
diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml
index 388e3b3c5..6ec340249 100644
--- a/ansible/roles/openondemand/tasks/vnc_compute.yml
+++ b/ansible/roles/openondemand/tasks/vnc_compute.yml
@@ -48,6 +48,7 @@
   tags: install
   yum:
     name: '@Xfce'
+  when: appliances_mode != 'configure' # dnf group/module installs aren't idempotent so only run during build
 
 # - name: Ensure python3.9 installed
 #   dnf:
diff --git a/ansible/slurm.yml b/ansible/slurm.yml
index 0b7397242..cf282f786 100644
--- a/ansible/slurm.yml
+++ b/ansible/slurm.yml
@@ -25,8 +25,9 @@
   tags:
     - openhpc
   tasks:
-    - import_role:
+    - include_role:
         name: stackhpc.openhpc
+        tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}"
 
 - name: Set locked memory limits on user-facing nodes
   hosts:
diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml
index bd60015d9..98e366304 100644
--- a/environments/.stackhpc/hooks/post.yml
+++ b/environments/.stackhpc/hooks/post.yml
@@ -11,4 +11,13 @@
     with_items:
       - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock"
       - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock"
-      - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock
\ No newline at end of file
+      - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock
+
+- hosts: builder
+  become: yes
+  tasks:
+    - name: Disable ark repos
+      ansible.builtin.include_role:
+        name: dnf_repos
+        tasks_from: disable_repos.yml
+      when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml
index 0fdbf9f60..9ea84740d 100644
--- a/environments/.stackhpc/hooks/pre.yml
+++ b/environments/.stackhpc/hooks/pre.yml
@@ -17,3 +17,12 @@
       - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts.yml"
       - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml"
       - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml"
+
+- hosts: builder
+  become: yes
+  tasks:
+    - name: Replace system repos with ark
+      ansible.builtin.include_role:
+        name: dnf_repos
+        tasks_from: set_repos.yml
+      when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
diff --git a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml
index 5aac5f8ad..858dfd9d3 100644
--- a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml
+++ b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml
@@ -1,3 +1,13 @@
 openhpc_config_extra:
   SlurmctldDebug: debug
   SlurmdDebug: debug
+
+ohpc_default_extra_repos:
+  "9": [] #overriding to ensure doesn't overwrite ark epel repo
+  "8":
+    - name: epel
+      file: epel
+      description: "Extra Packages for Enterprise Linux 8 - $basearch"
+      metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir"
+      gpgcheck: true
+      gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8"
diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index f9a2087c8..14c997596 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241118-0918-4538c6df",
-        "RL9": "openhpc-RL9-241118-0918-4538c6df"
+        "RL8": "openhpc-RL8-241203-1659-b0558b95",
+        "RL9": "openhpc-RL9-241203-1659-b0558b95"
     }
 }
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index 15340820f..2a88f035d 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -6,6 +6,7 @@ appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}
 appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only
 appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it
 #appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform
+appliances_mode: configure
 
 # Address(ip/dns) for internal communication between services. This is
 # normally traffic you do no want to expose to users.
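The appliances_mode default added above is what lets roles distinguish Packer image builds (where the builder group overrides it to 'build') from runtime configuration of a deployed cluster. A minimal sketch of the gating pattern used throughout this PR; the task and package name here are illustrative, not taken from the diff:

- name: Install a package only during image build   # hypothetical task
  ansible.builtin.dnf:
    name: example-package   # hypothetical package name
  when: appliances_mode != 'configure'   # true while building images, false when re-running site.yml on a cluster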
diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml
index 22042c1bf..b43d9f03c 100644
--- a/environments/common/inventory/group_vars/builder/defaults.yml
+++ b/environments/common/inventory/group_vars/builder/defaults.yml
@@ -22,3 +22,4 @@ squid_cache_disk: 0 # just needs to be defined
 squid_cache_mem: 0
 tuned_started: false
 tuned_enabled: false
+appliances_mode: build

From 9723782e7fcb284945e67a17aec1a756f708f89b Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 6 Dec 2024 13:04:29 +0000
Subject: [PATCH 127/268] testing builds with leafcloud pulp

---
 ansible/roles/dnf_repos/defaults/main.yml   | 48 ++++++++++++++-----
 ansible/roles/dnf_repos/tasks/set_repos.yml |  4 ----
 .../inventory/group_vars/builder.yml        |  1 +
 3 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml
index a3e05d0e1..b997605ea 100644
--- a/ansible/roles/dnf_repos/defaults/main.yml
+++ b/ansible/roles/dnf_repos/defaults/main.yml
@@ -1,25 +1,47 @@
-dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}
-dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_rocky_ark_timestamp }}/"
-# most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml
-# note that some timestamps can't be used because not all repos have snapshots for them
-dnf_repos_rocky_ark_timestamp: 20240816T002610
-dnf_repos_username: slurm-app-ci
-dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
+# dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}
+# dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_rocky_ark_timestamp }}/"
+# # most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml
+# # note that some timestamps can't be used because not all repos have snapshots for them
+# dnf_repos_rocky_ark_timestamp: 20240816T002610
+# dnf_repos_username: slurm-app-ci
+# dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
+
+# # epel installed separately
+# dnf_repos_repolist:
+# - file: rocky
+#   name: baseos
+#   base_url: "{{ dnf_repos_rocky_ark_prefix }}/BaseOS/{{ dnf_repos_rocky_ark_suffix }}"
+# - file: rocky
+#   name: appstream
+#   base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}"
+# - file: rocky
+#   name: crb
+#   base_url: "{{ dnf_repos_rocky_ark_prefix }}/CRB/{{ dnf_repos_rocky_ark_suffix }}"
+# - file: rocky-extras
+#   name: extras
+#   base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}"
+
+# dnf_repos_epel_timestamp: 20240902T080424
+# dnf_repos_epel_baseurl: "https://ark.stackhpc.com/pulp/content/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ dnf_repos_epel_timestamp }}"
+
+dnf_repos_pulp_url: # required
+dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content"
+dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}"
+dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}"
 
 # epel installed separately
 dnf_repos_repolist:
 - file: rocky
   name: baseos
-  base_url: "{{ dnf_repos_rocky_ark_prefix }}/BaseOS/{{ dnf_repos_rocky_ark_suffix }}"
+  base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/baseos"
 - file: rocky
   name: appstream
-  base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}"
+  base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/appstream"
 - file: rocky
   name: crb
-  base_url: "{{ dnf_repos_rocky_ark_prefix }}/CRB/{{ dnf_repos_rocky_ark_suffix }}"
+  base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/crb"
 - file: rocky-extras
   name: extras
-  base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}"
+  base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras"
 
-dnf_repos_epel_timestamp: 20240902T080424
-dnf_repos_epel_baseurl: "https://ark.stackhpc.com/pulp/content/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ dnf_repos_epel_timestamp }}"
+dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}"
diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml
index f8cca5600..8a8364097 100644
--- a/ansible/roles/dnf_repos/tasks/set_repos.yml
+++ b/ansible/roles/dnf_repos/tasks/set_repos.yml
@@ -6,8 +6,6 @@
     name: "{{ item.name }}"
     baseurl: "{{ item.base_url }}"
     description: "{{ item.name }}"
-    username: "{{ dnf_repos_username }}"
-    password: "{{ dnf_repos_password }}"
   loop: "{{ dnf_repos_repolist }}"
 
 - name: Install epel-release
@@ -21,6 +19,4 @@
     file: epel
     description: epel
     gpgcheck: false
-    username: "{{ dnf_repos_username }}"
-    password: "{{ dnf_repos_password }}"
     baseurl: "{{ dnf_repos_epel_baseurl }}"
diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml
index 8d7ee98d2..1a65daa48 100644
--- a/environments/.stackhpc/inventory/group_vars/builder.yml
+++ b/environments/.stackhpc/inventory/group_vars/builder.yml
@@ -1 +1,2 @@
 #update_enable: false # Can uncomment for speed debugging non-update related build issues
+dnf_repos_pulp_url: http://192.168.10.157:8080

From 127b79210af6d806c82674d4a0cbe64eb07e3fff Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 6 Dec 2024 15:12:50 +0000
Subject: [PATCH 128/268] pulp integration

---
 ansible/.gitignore                            |  2 +
 ansible/adhoc/deploy-pulp.yml                 | 25 +++++++
 ansible/bootstrap.yml                         | 17 +++++
 ansible/roles/dnf_repos/defaults/main.yml     | 28 +------
 ansible/roles/passwords/defaults/main.yml     |  1 +
 ansible/roles/pulp_site/defaults/main.yml     | 75 +++++++++++++++++++
 ansible/roles/pulp_site/tasks/install.yml     | 43 +++++++++++
 ansible/roles/pulp_site/tasks/sync.yml        | 73 ++++++++++++++++++
 ansible/roles/pulp_site/templates/cli.toml.j2 | 14 ++++
 .../roles/pulp_site/templates/settings.py.j2  |  2 +
 ansible/site.yml                              |  9 +++
 environments/.stackhpc/hooks/post.yml         |  9 ---
 environments/.stackhpc/hooks/pre.yml          |  9 ---
 .../inventory/group_vars/builder.yml          |  2 +-
 .../inventory/group_vars/all/defaults.yml     |  1 +
 environments/common/inventory/groups          |  4 +
 requirements.txt                              |  3 +-
 requirements.yml                              |  2 +
 18 files changed, 272 insertions(+), 47 deletions(-)
 create mode 100644 ansible/adhoc/deploy-pulp.yml
 create mode 100644 ansible/roles/pulp_site/defaults/main.yml
 create mode 100644 ansible/roles/pulp_site/tasks/install.yml
 create mode 100644 ansible/roles/pulp_site/tasks/sync.yml
 create mode 100644 ansible/roles/pulp_site/templates/cli.toml.j2
 create mode 100644 ansible/roles/pulp_site/templates/settings.py.j2

diff --git a/ansible/.gitignore b/ansible/.gitignore
index 48c917c4f..4eba25fa9 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -66,3 +66,5 @@ roles/*
 !roles/lustre/**
 !roles/dnf_repos/
 !roles/dnf_repos/**
+!roles/pulp_site/
+!roles/pulp_site/**
diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml
new file mode 100644
index 000000000..291da7f59
--- /dev/null
+++ b/ansible/adhoc/deploy-pulp.yml
@@ -0,0 +1,25 @@
+# Usage: ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="
+
+- name: Add temporary pulp server host
+  hosts: localhost
+  tasks:
+    - ansible.builtin.add_host:
+        name: "{{ pulp_server }}"
+        group: "_pulp_host"
+
+- name: Install pulp on server
+  become: yes
+  hosts: _pulp_host
+  tasks:
+    - ansible.builtin.import_role:
+        name: pulp_site
+        tasks_from: install.yml
+
+- name: Add pulp host to environment
+  hosts: localhost
+  tasks:
+    - ansible.builtin.copy:
+        dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml"
+        content: |
+          # ansible managed
+          appliances_pulp_server: "http://{{ pulp_server }}"
diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml
index 733d4b3f8..cc3cf7a12 100644
--- a/ansible/bootstrap.yml
+++ b/ansible/bootstrap.yml
@@ -110,6 +110,23 @@
         policy: "{{ selinux_policy }}"
     register: sestatus
 
+- name: Sync pulp repos with upstream
+  hosts: localhost
+  tasks:
+    - ansible.builtin.include_role:
+        name: pulp_site
+        tasks_from: sync.yml
+      when: appliances_mode != 'configure'
+
+- hosts: dnf_repos
+  become: yes
+  tasks:
+    - name: Replace system repos with pulp repos
+      ansible.builtin.include_role:
+        name: dnf_repos
+        tasks_from: set_repos.yml
+      when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
+
 # --- tasks after here require access to package repos ---
 - hosts: squid
   tags: squid
diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml
index b997605ea..24bb4852b 100644
--- a/ansible/roles/dnf_repos/defaults/main.yml
+++ b/ansible/roles/dnf_repos/defaults/main.yml
@@ -1,30 +1,4 @@
-# dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}
-# dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_rocky_ark_timestamp }}/"
-# # most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml
-# # note that some timestamps can't be used because not all repos have snapshots for them
-# dnf_repos_rocky_ark_timestamp: 20240816T002610
-# dnf_repos_username: slurm-app-ci
-# dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
-
-# # epel installed separately
-# dnf_repos_repolist:
-# - file: rocky
-#   name: baseos
-#   base_url: "{{ dnf_repos_rocky_ark_prefix }}/BaseOS/{{ dnf_repos_rocky_ark_suffix }}"
-# - file: rocky
-#   name: appstream
-#   base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}"
-# - file: rocky
-#   name: crb
-#   base_url: "{{ dnf_repos_rocky_ark_prefix }}/CRB/{{ dnf_repos_rocky_ark_suffix }}"
-# - file: rocky-extras
-#   name: extras
-#   base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}"
-
-# dnf_repos_epel_timestamp: 20240902T080424
-# dnf_repos_epel_baseurl: "https://ark.stackhpc.com/pulp/content/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ dnf_repos_epel_timestamp }}"
-
-dnf_repos_pulp_url: # required
+dnf_repos_pulp_url: "{{ appliances_pulp_url }}"
 dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content"
 dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}"
 dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}"
diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml
index d9a339efd..2587e8499 100644
--- a/ansible/roles/passwords/defaults/main.yml
+++ b/ansible/roles/passwords/defaults/main.yml
@@ -9,6 +9,7 @@ slurm_appliance_secrets:
   vault_freeipa_ds_password: "{{ vault_freeipa_ds_password | default(lookup('password', '/dev/null')) }}"
   vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}"
   vault_k3s_token: "{{ vault_k3s_token | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}"
+  vault_pulp_admin_password: "{{ vault_pulp_admin_password | default(lookup('password', '/dev/null', chars=['ascii_letters', 'digits'])) }}"
 
 secrets_openhpc_mungekey_default:
   content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}"
diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml
new file mode 100644
index 000000000..077871263
--- /dev/null
+++ b/ansible/roles/pulp_site/defaults/main.yml
@@ -0,0 +1,75 @@
+pulp_site_url: "http://{{ appliances_pulp_url }}:{{ pulp_site_port }}"
+pulp_site_port: 8080
+pulp_site_username: admin # shouldn't be changed
+pulp_site_upstream_username: slurm-app-ci
+pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
+pulp_site_password: "{{ vault_pulp_admin_password }}"
+pulp_site_validate_certs: false
+pulp_site_install_dir: '/home/rocky/pulp'
+pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}"
+
+pulp_site_rpm_repos:
+  - name: baseos
+    url: https://ark.stackhpc.com/pulp/content/rocky/9.4/BaseOS/x86_64/os/20240816T002610
+    remote_username: "{{ pulp_site_upstream_username }}"
+    remote_password: "{{ pulp_site_upstream_password }}"
+    policy: on_demand
+    state: present
+  - name: appstream
+    url: https://ark.stackhpc.com/pulp/content/rocky/9.4/AppStream/x86_64/os/20240816T002610
+    remote_username: "{{ pulp_site_upstream_username }}"
+    remote_password: "{{ pulp_site_upstream_password }}"
+    policy: on_demand
+    state: present
+  - name: crb
+    url: https://ark.stackhpc.com/pulp/content/rocky/9.4/CRB/x86_64/os/20240816T002610
+    remote_username: "{{ pulp_site_upstream_username }}"
+    remote_password: "{{ pulp_site_upstream_password }}"
+    policy: on_demand
+    state: present
+  - name: extras
+    url: https://ark.stackhpc.com/pulp/content/rocky/9.4/extras/x86_64/os/20240816T002610
+    remote_username: "{{ pulp_site_upstream_username }}"
+    remote_password: "{{ pulp_site_upstream_password }}"
+    policy: on_demand
+    state: present
+  - name: epel
+    url: https://ark.stackhpc.com/pulp/content/epel/9/Everything/x86_64/20240902T080424
+    remote_username: "{{ pulp_site_upstream_username }}"
+    remote_password: "{{ pulp_site_upstream_password }}"
+    policy: on_demand
+    state: present
+
+pulp_site_rpm_publications:
+- repository: baseos
+  state: present
+- repository: appstream
+  state: present
+- repository: crb
+  state: present
+- repository: extras
+  state: present
+- repository: epel
+  state: present
+
+pulp_site_rpm_distributions:
+- name: baseos
+  base_path: rocky/9.4/baseos
+  repository: baseos
+  state: present
+- name: appstream
+  base_path: rocky/9.4/appstream
+  repository: appstream
+  state: present
+- name: crb
+  base_path: rocky/9.4/crb
+  repository: crb
+  state: present
+- name: extras
+  base_path: rocky/9.4/extras
+  repository: extras
+  state: present
+- name: epel
+  base_path: epel/9
+  repository: epel
+  state: present
diff --git a/ansible/roles/pulp_site/tasks/install.yml b/ansible/roles/pulp_site/tasks/install.yml
new file mode 100644
index 000000000..39b4fcd97
--- /dev/null
+++ b/ansible/roles/pulp_site/tasks/install.yml
@@ -0,0 +1,43 @@
+---
+
+- name: Install packages
+  dnf:
+    name:
+      - podman
+
+- name: Create install directories
+  ansible.builtin.file:
+    state: directory
+    path: "{{ pulp_site_install_dir }}/{{ item }}"
+  loop:
+    - settings/certs
+    - pulp_storage
+    - pgsql
+    - containers
+
+- name: Template settings file
+  ansible.builtin.template:
+    src: settings.py.j2
+    dest: "{{ pulp_site_install_dir }}/settings/settings.py"
+
+- name: Install pulp podman container
+  containers.podman.podman_container:
+    name: pulp
+    publish:
+      - "{{ pulp_site_port }}:80"
+    volume:
+      - "{{ pulp_site_install_dir }}/settings:/etc/pulp{{ pulp_site_selinux_suffix }}"
+      - "{{ pulp_site_install_dir }}/pulp_storage:/var/lib/pulp{{ pulp_site_selinux_suffix }}"
+      - "{{ pulp_site_install_dir }}/pgsql:/var/lib/pgsql{{ pulp_site_selinux_suffix }}"
+      - "{{ pulp_site_install_dir }}/containers:/var/lib/containers{{ pulp_site_selinux_suffix }}"
+    device: /dev/fuse
+    image: docker.io/pulp/pulp:3.68.1
+
+- name: Reset admin password once container has initialised
+  no_log: true
+  ansible.builtin.shell:
+    cmd: "podman exec pulp bash -c 'pulpcore-manager reset-admin-password -p {{ pulp_site_password }}'"
+  register: _admin_reset_output
+  until: 0 == _admin_reset_output.rc
+  retries: 6
+  delay: 30
diff --git a/ansible/roles/pulp_site/tasks/sync.yml b/ansible/roles/pulp_site/tasks/sync.yml
new file mode 100644
index 000000000..62395f0f3
--- /dev/null
+++ b/ansible/roles/pulp_site/tasks/sync.yml
@@ -0,0 +1,73 @@
+---
+
+- name: Wait for Pulp server
+  pulp.squeezer.status:
+    pulp_url: "{{ pulp_site_url }}"
+    username: "{{ pulp_site_username }}"
+    password: "{{ pulp_site_password }}"
+  register: _pulp_status
+  until: _pulp_status.failed == false
+  retries: 30
+  delay: 20
+
+- name: Ensure Pulp CLI config directory exists
+  ansible.builtin.file:
+    path: ~/.config/pulp
+    state: directory
+
+- name: Create config file
+  no_log: true
+  ansible.builtin.template:
+    src: cli.toml.j2
+    dest: ~/.config/pulp/cli.toml
+    mode: '0644'
+
+- block:
+    - name: Ensure squeezer cache exists
+      ansible.builtin.file:
+        path: "{{ _cache_dir }}"
+        state: directory
+
+    - name: Check if squeezer cache is populated
+      ansible.builtin.stat:
+        path: "{{ _cache_dir }}/api.json"
+      register: _cache_stat
+
+    - name: Prepopulate squeezer cache # workaround for race on the cache
+      ansible.builtin.get_url:
+        url: "{{ pulp_site_url }}/pulp/api/v3/docs/api.json"
+        dest: "{{ _cache_dir }}/api.json"
+        timeout: 40
+      when: not _cache_stat.stat.exists
+  vars:
+    _cache_dir: "~/.cache/squeezer/{{ pulp_site_url | regex_replace( ':|/' , '_' ) }}"
+
+- name: Get Pulp repos from release train
+  ansible.builtin.include_role:
+    name: stackhpc.pulp.pulp_repository
+    tasks_from: rpm.yml
+  vars:
+    pulp_url: "{{ pulp_site_url }}"
+    pulp_username: "{{ pulp_site_username }}"
+    pulp_password: "{{ pulp_site_password }}"
+    pulp_repository_rpm_repos: "{{ pulp_site_rpm_repos }}"
+
+- name: Create Pulp publications
+  ansible.builtin.include_role:
+    name: stackhpc.pulp.pulp_publication
+    tasks_from: rpm.yml
+  vars:
+    pulp_url: "{{ pulp_site_url }}"
+    pulp_username: "{{ pulp_site_username }}"
+    pulp_password: "{{ pulp_site_password }}"
+    pulp_publication_rpm: "{{ pulp_site_rpm_publications }}"
+
+- name: Create Pulp distributions
+  ansible.builtin.include_role:
+    name: stackhpc.pulp.pulp_distribution
+    tasks_from: rpm.yml
+  vars:
+    pulp_url: "{{ pulp_site_url }}"
+    pulp_username: "{{ pulp_site_username }}"
+    pulp_password: "{{ pulp_site_password }}"
+    pulp_distribution_rpm: "{{ pulp_site_rpm_distributions }}"
diff --git a/ansible/roles/pulp_site/templates/cli.toml.j2 b/ansible/roles/pulp_site/templates/cli.toml.j2
new file mode 100644
index 000000000..06867902f
--- /dev/null
+++ b/ansible/roles/pulp_site/templates/cli.toml.j2
@@ -0,0 +1,14 @@
+[cli]
+base_url = "{{ pulp_site_url }}"
+username = "{{ pulp_site_username }}"
+password = "{{ pulp_site_password }}"
+api_root = "/pulp/"
+domain = "default"
+headers = []
+cert = ""
+key = ""
+verify_ssl = true
+format = "json"
+dry_run = false
+timeout = 0
+verbose = 0
diff --git a/ansible/roles/pulp_site/templates/settings.py.j2 b/ansible/roles/pulp_site/templates/settings.py.j2
new file mode 100644
index 000000000..200212e2c
--- /dev/null
+++ b/ansible/roles/pulp_site/templates/settings.py.j2
@@ -0,0 +1,2 @@
+CONTENT_ORIGIN='http://{{ ansible_fqdn }}:{{ pulp_site_port }}'
+TOKEN_AUTH_DISABLED=True
diff --git a/ansible/site.yml b/ansible/site.yml
index bb379399d..a09d5a510 100644
--- a/ansible/site.yml
+++ b/ansible/site.yml
@@ -28,6 +28,15 @@
 - import_playbook: portal.yml
 - import_playbook: monitoring.yml
 
+- hosts: dnf_repos
+  become: yes
+  tasks:
+    - name: Disable pulp repos
+      ansible.builtin.include_role:
+        name: dnf_repos
+        tasks_from: disable_repos.yml
+      when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
+
 - name: Run post.yml hook
   vars:
     # hostvars not available here, so have to recalculate environment root:
diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml
index 98e366304..9d506d725 100644
--- a/environments/.stackhpc/hooks/post.yml
+++ b/environments/.stackhpc/hooks/post.yml
@@ -12,12 +12,3 @@
     - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock"
     - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock"
     - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock
-
-- hosts: builder
-  become: yes
-  tasks:
-    - name: Disable ark repos
-      ansible.builtin.include_role:
-        name: dnf_repos
-        tasks_from: disable_repos.yml
-      when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml
index 9ea84740d..0fdbf9f60 100644
--- a/environments/.stackhpc/hooks/pre.yml
+++ b/environments/.stackhpc/hooks/pre.yml
@@ -17,12 +17,3 @@
       - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts.yml"
       - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml"
       - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml"
-
-- hosts: builder
-  become: yes
-  tasks:
-    - name: Replace system repos with ark
-      ansible.builtin.include_role:
-        name: dnf_repos
-        tasks_from: set_repos.yml
-      when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml
index 1a65daa48..50ef3d76c 100644
--- a/environments/.stackhpc/inventory/group_vars/builder.yml
+++ b/environments/.stackhpc/inventory/group_vars/builder.yml
@@ -1,2 +1,2 @@
 #update_enable: false # Can uncomment for speed debugging non-update related build issues
-dnf_repos_pulp_url: http://192.168.10.157:8080
+dnf_repos_pulp_url: http://192.168.10.157
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index 2a88f035d..a7bb92ee3 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -7,6 +7,7 @@ appliances_environment_name: "{{ appliances_environment_root | basename | regex_
 appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it
 #appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform
 appliances_mode: configure
+#appliances_pulp_url: #override required
 
 # Address(ip/dns) for internal communication between services. This is
 # normally traffic you do no want to expose to users.
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index 9b9aa5bf0..a88ccf338 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -144,3 +144,7 @@ freeipa_client
 
 [lustre]
 # Hosts to run lustre client
+
+[dnf_repos:children]
+# Hosts to replace system repos with Pulp repos
+cluster
diff --git a/requirements.txt b/requirements.txt
index 6651506fb..7d81f3285 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-ansible==6.0.0
+ansible==8.0.0
 openstacksdk
 python-openstackclient==6.6.1 # v7.0.0 has a bug re. rebuild
 python-manilaclient
@@ -9,3 +9,4 @@ cookiecutter
 selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3
 netaddr
 matplotlib
+pulp-cli==0.29.2
diff --git a/requirements.yml b/requirements.yml
index 3d8c44011..6a461a6fa 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -49,4 +49,6 @@ collections:
   - name: https://github.com/azimuth-cloud/ansible-collection-image-utils
     type: git
     version: 0.4.0
+  - name: stackhpc.pulp
+    version: 0.5.5
 ...
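Patch 128 above wires the self-hosted Pulp workflow together: the adhoc deploy-pulp.yml playbook stands up a Pulp container on a designated host and records its URL in the environment, bootstrap.yml then syncs that Pulp server from Ark and points dnf_repos hosts at it, and site.yml disables those repos again afterwards. As an illustration of the generated config only (the host address below is hypothetical, and the URL format shown is the one reached after the fixes in patches 129 and 132 that follow):

# environments/<env>/inventory/group_vars/all/pulp_server.yml, as written by the adhoc playbook
# ansible managed
appliances_pulp_url: "http://10.0.0.5:8080"   # 10.0.0.5 is a hypothetical Pulp host; the port is pulp_site_port (default 8080)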
From 0d8a440e742fdbe985925dd5073c341135999567 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 6 Dec 2024 15:19:21 +0000
Subject: [PATCH 129/268] typos

---
 ansible/adhoc/deploy-pulp.yml                           | 2 +-
 environments/.stackhpc/inventory/group_vars/builder.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml
index 291da7f59..cbb6bb6f6 100644
--- a/ansible/adhoc/deploy-pulp.yml
+++ b/ansible/adhoc/deploy-pulp.yml
@@ -22,4 +22,4 @@
         dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml"
         content: |
           # ansible managed
-          appliances_pulp_server: "http://{{ pulp_server }}"
+          appliances_pulp_url: "http://{{ pulp_server }}"
diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml
index 50ef3d76c..f32bd2928 100644
--- a/environments/.stackhpc/inventory/group_vars/builder.yml
+++ b/environments/.stackhpc/inventory/group_vars/builder.yml
@@ -1,2 +1,2 @@
 #update_enable: false # Can uncomment for speed debugging non-update related build issues
-dnf_repos_pulp_url: http://192.168.10.157
+appliances_pulp_url: http://192.168.10.157

From 90a33fa3563c70c912dcaf821bfe91763c8cad9c Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 6 Dec 2024 15:26:58 +0000
Subject: [PATCH 130/268] missed merge conflict

---
 environments/.stackhpc/hooks/post.yml | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml
index 31dafd8c1..9d506d725 100644
--- a/environments/.stackhpc/hooks/post.yml
+++ b/environments/.stackhpc/hooks/post.yml
@@ -12,15 +12,3 @@
     - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock"
     - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock"
     - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock
-<<<<<<< HEAD
-=======
-
-- hosts: builder
-  become: yes
-  tasks:
-    - name: Disable ark repos
-      ansible.builtin.include_role:
-        name: dnf_repos
-        tasks_from: disable_repos.yml
-      when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
->>>>>>> main

From eaa3680596fb1a717b9fbca3342ce58e43a43ca9 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 6 Dec 2024 15:39:00 +0000
Subject: [PATCH 131/268] moved pulp port into url

---
 ansible/roles/pulp_site/defaults/main.yml               | 2 +-
 environments/.stackhpc/inventory/group_vars/builder.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml
index 077871263..d4fa8aef6 100644
--- a/ansible/roles/pulp_site/defaults/main.yml
+++ b/ansible/roles/pulp_site/defaults/main.yml
@@ -1,4 +1,4 @@
-pulp_site_url: "http://{{ appliances_pulp_url }}:{{ pulp_site_port }}"
+pulp_site_url: "http://{{ appliances_pulp_url }}"
 pulp_site_port: 8080
 pulp_site_username: admin # shouldn't be changed
 pulp_site_upstream_username: slurm-app-ci
diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml
index f32bd2928..609e5a0c4 100644
--- a/environments/.stackhpc/inventory/group_vars/builder.yml
+++ b/environments/.stackhpc/inventory/group_vars/builder.yml
@@ -1,2 +1,2 @@
 #update_enable: false # Can uncomment for speed debugging non-update related build issues
-appliances_pulp_url: http://192.168.10.157
+appliances_pulp_url: http://192.168.10.157:8080

From 9a75656497096cfc5acafbdb4bdefd1ac80e7b8e Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 6 Dec 2024 16:07:40 +0000
Subject: [PATCH 132/268] fixed port not getting added in adhoc

---
 ansible/adhoc/deploy-pulp.yml | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml
index cbb6bb6f6..ad453e3f7 100644
--- a/ansible/adhoc/deploy-pulp.yml
+++ b/ansible/adhoc/deploy-pulp.yml
@@ -7,19 +7,31 @@
         name: "{{ pulp_server }}"
         group: "_pulp_host"
 
-- name: Install pulp on server
+- name: Install pulp on server and add to config
   become: yes
   hosts: _pulp_host
  tasks:
-    - ansible.builtin.import_role:
+
+    - name: Install pulp
+      ansible.builtin.include_role:
         name: pulp_site
         tasks_from: install.yml
+        public: true
 
-- name: Add pulp host to environment
-  hosts: localhost
-  tasks:
-    - ansible.builtin.copy:
+    - name: Add pulp host to environment
+      become: no
+      delegate_to: localhost
+      ansible.builtin.copy:
         dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml"
         content: |
           # ansible managed
-          appliances_pulp_url: "http://{{ pulp_server }}"
+          appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}"
+
+# - name: Add pulp host to environment
+#   hosts: localhost
+#   tasks:
+#     - ansible.builtin.copy:
+#         dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml"
+#         content: |
+#           # ansible managed
+#           appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}"

From 741872a7ee74e143241e5afd621b6442f99623e8 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 6 Dec 2024 16:15:23 +0000
Subject: [PATCH 133/268] bump

---
 .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 14c997596..5e71beebd 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241203-1659-b0558b95",
-        "RL9": "openhpc-RL9-241203-1659-b0558b95"
+        "RL8": "openhpc-RL8-241206-1541-eaa36805",
+        "RL9": "openhpc-RL9-241206-1541-eaa36805"
     }
 }

From 39cf55682d8324733fbe12cdd0d3291e6f312fed Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 6 Dec 2024 16:38:34 +0000
Subject: [PATCH 134/268] cleaned up disabling repos + now optional

---
 ansible/roles/dnf_repos/defaults/main.yml   |  1 +
 .../roles/dnf_repos/tasks/disable_repos.yml | 20 ++++---------------
 2 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml
index 24bb4852b..359814e47 100644
--- a/ansible/roles/dnf_repos/defaults/main.yml
+++ b/ansible/roles/dnf_repos/defaults/main.yml
@@ -2,6 +2,7 @@ dnf_repos_pulp_url: "{{ appliances_pulp_url }}"
 dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content"
 dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}"
 dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}"
+dnf_repos_disable: true
 
 # epel installed separately
 dnf_repos_repolist:
diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml
index f8997b741..69aed3b6b 100644
--- a/ansible/roles/dnf_repos/tasks/disable_repos.yml
+++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml
@@ -1,18 +1,6 @@
 ---
 - name: Disable Pulp repos and remove creds
-  ansible.builtin.yum_repository:
-    file: "{{ item.file }}"
-    name: "{{ item.name }}"
-    baseurl: "{{ item.base_url }}"
-    description: "{{ item.name }}"
-    enabled: false
-  loop: "{{ dnf_repos_repolist }}"
-
-- name: Disable EPEL repo and remove creds
-  ansible.builtin.yum_repository:
-    name: epel
-    file: epel
-    description: epel
-    baseurl: "{{ dnf_repos_epel_baseurl }}"
-    gpgcheck: false
-    enabled: false
+  ansible.builtin.yum:
+    disablerepo: "{{ item.name }}"
+  loop: "{{ dnf_repos_repolist + [epel] }}"
+  when: dnf_repos_disable

From 25644c362b93e1a8242aaca3e992c79ecd01d3bc Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Mon, 9 Dec 2024 12:05:48 +0000
Subject: [PATCH 135/268] typo

---
 ansible/roles/pulp_site/defaults/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml
index d4fa8aef6..f648696e3 100644
--- a/ansible/roles/pulp_site/defaults/main.yml
+++ b/ansible/roles/pulp_site/defaults/main.yml
@@ -1,4 +1,4 @@
-pulp_site_url: "http://{{ appliances_pulp_url }}"
+pulp_site_url: "{{ appliances_pulp_url }}"
 pulp_site_port: 8080
 pulp_site_username: admin # shouldn't be changed
 pulp_site_upstream_username: slurm-app-ci

From fef3d566dcc40f1dad2cbab5fc2fb7d07d66eff3 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Wed, 11 Dec 2024 14:53:15 +0000
Subject: [PATCH 136/268] repos now timestamped + synced at bootstrap

---
 .github/workflows/fatimage.yml            |  1 +
 .github/workflows/nightlybuild.yml        |  1 +
 ansible/adhoc/deploy-pulp.yml             |  9 --
 ansible/bootstrap.yml                     |  6 +-
 ansible/roles/dnf_repos/defaults/main.yml | 11 ++-
 ansible/roles/pulp_site/defaults/main.yml | 99 +++++++++----------
 environments/.stackhpc/hooks/pre.yml      |  9 --
 .../inventory/group_vars/builder.yml      | 10 +-
 .../inventory/group_vars/all/defaults.yml |  8 ++
 environments/common/inventory/groups      |  5 +
 10 files changed, 82 insertions(+), 77 deletions(-)

diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index 217b09c22..3a32f47b2 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -40,6 +40,7 @@ jobs:
         }
       }
       ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
+      LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }}
 
     steps:
       - uses: actions/checkout@v2
diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml
index 9f45b0890..ee2b4b6f8 100644
--- a/.github/workflows/nightlybuild.yml
+++ b/.github/workflows/nightlybuild.yml
@@ -38,6 +38,7 @@ jobs:
         "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2"
       }
       ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
+      LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }}
 
     steps:
       - uses: actions/checkout@v2
diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml
index ad453e3f7..89c51922a 100644
--- a/ansible/adhoc/deploy-pulp.yml
+++ b/ansible/adhoc/deploy-pulp.yml
@@ -26,12 +26,3 @@
         content: |
           # ansible managed
           appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}"
-
-# - name: Add pulp host to environment
-#   hosts: localhost
-#   tasks:
-#     - ansible.builtin.copy:
-#         dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml"
-#         content: |
-#           # ansible managed
-#           appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}"
diff --git
a/ansible/bootstrap.yml b/ansible/bootstrap.yml index cc3cf7a12..dfe212d02 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -111,11 +111,15 @@ register: sestatus - name: Sync pulp repos with upstream - hosts: localhost + hosts: pulp tasks: + - debug: + var: hostvars[groups['builder'][0]]['ansible_facts'] - ansible.builtin.include_role: name: pulp_site tasks_from: sync.yml + apply: + delegate_to: localhost when: appliances_mode != 'configure' - hosts: dnf_repos diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 359814e47..0a09e5f3a 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -3,20 +3,21 @@ dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content" dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" dnf_repos_disable: true +dnf_repos_version_timestamps: "{{ appliances_repo_timestamps[ansible_distribution_version] }}" # epel installed separately dnf_repos_repolist: - file: rocky name: baseos - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/baseos" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/baseos/{{ dnf_repos_version_timestamps.baseos }}" - file: rocky name: appstream - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/appstream" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/appstream/{{ dnf_repos_version_timestamps.appstream }}" - file: rocky name: crb - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/crb" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/crb/{{ dnf_repos_version_timestamps.crb }}" - file: rocky-extras name: extras - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ dnf_repos_version_timestamps.extras }}" -dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}" +dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_version }}/{{ dnf_repos_version_timestamps.epel }}" diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index f648696e3..0fc92859a 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -3,73 +3,70 @@ pulp_site_port: 8080 pulp_site_username: admin # shouldn't be changed pulp_site_upstream_username: slurm-app-ci pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" +pulp_site_default_upstream_prefix: "https://ark.stackhpc.com/pulp/content/{{ pulp_site_target_distribution }}/{{ pulp_site_target_distribution_version }}" +pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" pulp_site_password: "{{ vault_pulp_admin_password }}" pulp_site_validate_certs: false pulp_site_install_dir: '/home/rocky/pulp' pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" +pulp_site_target_facts: "{{ hostvars[groups['builder'][0]]['ansible_facts'] }}" +pulp_site_target_arch: "{{ pulp_site_target_facts['architecture'] }}" +pulp_site_target_distribution: "{{ pulp_site_target_facts['distribution'] | lower }}" +pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" 
+pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" +pulp_site_version_timestamps: "{{ appliances_repo_timestamps[pulp_site_target_distribution_version] }}" -pulp_site_rpm_repos: - - name: baseos - url: https://ark.stackhpc.com/pulp/content/rocky/9.4/BaseOS/x86_64/os/20240816T002610 - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present - - name: appstream - url: https://ark.stackhpc.com/pulp/content/rocky/9.4/AppStream/x86_64/os/20240816T002610 - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present - - name: crb - url: https://ark.stackhpc.com/pulp/content/rocky/9.4/CRB/x86_64/os/20240816T002610 - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present - - name: extras - url: https://ark.stackhpc.com/pulp/content/rocky/9.4/extras/x86_64/os/20240816T002610 - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present - - name: epel - url: https://ark.stackhpc.com/pulp/content/epel/9/Everything/x86_64/20240902T080424 - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present + +pulp_site_rpm_repo_list: + - name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" + url: "{{ pulp_site_default_upstream_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}" + - name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" + url: "{{ pulp_site_default_upstream_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}" + - name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" + url: "{{ pulp_site_default_upstream_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}" + - name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" + url: "{{ pulp_site_default_upstream_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" + - name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" + url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_version_timestamps.epel }}" + +pulp_site_defaults: + remote_username: "{{ pulp_site_upstream_username }}" + remote_password: "{{ pulp_site_upstream_password }}" + policy: on_demand + state: present + +pulp_site_rpm_repos: "{{ pulp_site_rpm_repo_list | map('combine', pulp_site_defaults) }}" pulp_site_rpm_publications: -- repository: baseos +- repository: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" state: present -- repository: appstream +- repository: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" state: present -- repository: crb +- repository: "crb-{{ pulp_site_target_distribution_version }}-{{ 
pulp_site_version_timestamps.crb }}" state: present -- repository: extras +- repository: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" state: present -- repository: epel +- repository: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" state: present pulp_site_rpm_distributions: -- name: baseos - base_path: rocky/9.4/baseos - repository: baseos +- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" + base_path: "rocky/{{ pulp_site_target_distribution_version }}/baseos/{{ pulp_site_version_timestamps.baseos }}" + repository: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" state: present -- name: appstream - base_path: rocky/9.4/appstream - repository: appstream +- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" + base_path: "rocky/{{ pulp_site_target_distribution_version }}/appstream/{{ pulp_site_version_timestamps.appstream }}" + repository: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" state: present -- name: crb - base_path: rocky/9.4/crb - repository: crb +- name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" + base_path: "rocky/{{ pulp_site_target_distribution_version }}/crb/{{ pulp_site_version_timestamps.crb }}" + repository: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" state: present -- name: extras - base_path: rocky/9.4/extras - repository: extras +- name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" + base_path: "rocky/{{ pulp_site_target_distribution_version }}/extras/{{ pulp_site_version_timestamps.extras }}" + repository: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" state: present -- name: epel - base_path: epel/9 - repository: epel +- name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" + base_path: "epel/{{ pulp_site_target_distribution_version }}/{{ pulp_site_version_timestamps.epel }}" + repository: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" state: present diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 9ea84740d..0fdbf9f60 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -17,12 +17,3 @@ - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts.yml" - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml" - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml" - -- hosts: builder - become: yes - tasks: - - name: Replace system repos with ark - ansible.builtin.include_role: - name: dnf_repos - tasks_from: set_repos.yml - when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 609e5a0c4..0fd19e1f9 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -1,2 +1,8 @@ -#update_enable: false # Can uncomment for speed debugging non-update related build issues -appliances_pulp_url: 
http://192.168.10.157:8080 +# update_enable: false # Can uncomment for speed debugging non-update related build issues +pulp_server_config: + LEAFCLOUD: + url: http://192.168.10.157:8080 + password: "{{ lookup('env','LEAFCLOUD_PULP_PASSWORD') }}" + +appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}" +pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index a7bb92ee3..9d8a7ab33 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -81,3 +81,11 @@ appliances_local_users_extra: [] # see format of appliances_local_users_default appliances_local_users: "{{ appliances_local_users_default + appliances_local_users_extra }}" ########################################################################################### + +appliances_repo_timestamps: + '9.4': + baseos: 20240816T002610 + appstream: 20240816T002610 + crb: 20240816T002610 + extras: 20240816T002610 + epel: 20240902T080424 diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index a88ccf338..fbfcfa0ca 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -148,3 +148,8 @@ freeipa_client [dnf_repos:children] # Hosts to replace system repos with Pulp repos cluster +builder + +[pulp:children] +# Hosts used to run Pulp API commands +builder From 1c4a511eeb9b7102941cf116fba78c978bd68c48 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 11 Dec 2024 17:08:16 +0000 Subject: [PATCH 137/268] refactored pulp_site list --- ansible/roles/pulp_site/defaults/main.yml | 60 ++++++------------- .../filter_plugins/pulp-list-filters.py | 31 ++++++++++ 2 files changed, 48 insertions(+), 43 deletions(-) create mode 100644 ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index 0fc92859a..6a9e98d74 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -16,57 +16,31 @@ pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_ pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" pulp_site_version_timestamps: "{{ appliances_repo_timestamps[pulp_site_target_distribution_version] }}" - -pulp_site_rpm_repo_list: - - name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" - url: "{{ pulp_site_default_upstream_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}" - - name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" - url: "{{ pulp_site_default_upstream_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}" - - name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" - url: "{{ pulp_site_default_upstream_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}" - - name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" - url: "{{ pulp_site_default_upstream_prefix }}/extras/{{ 
pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" - - name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" - url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_version_timestamps.epel }}" - -pulp_site_defaults: - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present - -pulp_site_rpm_repos: "{{ pulp_site_rpm_repo_list | map('combine', pulp_site_defaults) }}" - -pulp_site_rpm_publications: -- repository: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" - state: present -- repository: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" - state: present -- repository: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" - state: present -- repository: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" - state: present -- repository: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" - state: present - -pulp_site_rpm_distributions: +pulp_site_rpm_info: - name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" + url: "{{ pulp_site_default_upstream_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}" base_path: "rocky/{{ pulp_site_target_distribution_version }}/baseos/{{ pulp_site_version_timestamps.baseos }}" - repository: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" - state: present - name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" + url: "{{ pulp_site_default_upstream_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}" base_path: "rocky/{{ pulp_site_target_distribution_version }}/appstream/{{ pulp_site_version_timestamps.appstream }}" - repository: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" - state: present - name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" + url: "{{ pulp_site_default_upstream_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}" base_path: "rocky/{{ pulp_site_target_distribution_version }}/crb/{{ pulp_site_version_timestamps.crb }}" - repository: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" - state: present - name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" + url: "{{ pulp_site_default_upstream_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" base_path: "rocky/{{ pulp_site_target_distribution_version }}/extras/{{ pulp_site_version_timestamps.extras }}" - repository: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" - state: present - name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" + url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ 
pulp_site_target_arch }}/{{ pulp_site_version_timestamps.epel }}" base_path: "epel/{{ pulp_site_target_distribution_version }}/{{ pulp_site_version_timestamps.epel }}" + +pulp_site_rpm_repo_defaults: + remote_username: "{{ pulp_site_upstream_username }}" + remote_password: "{{ pulp_site_upstream_password }}" + policy: on_demand state: present + +_pulp_site_rpm_info_all: "{{ pulp_site_rpm_info | map('combine', pulp_site_rpm_repo_defaults) }}" + +pulp_site_rpm_repos: "{{ _pulp_site_rpm_info_all | to_rpm_repos }}" +pulp_site_rpm_publications: "{{ _pulp_site_rpm_info_all | to_rpm_pubs }}" +pulp_site_rpm_distributions: "{{ _pulp_site_rpm_info_all | to_rpm_distros }}" diff --git a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py new file mode 100644 index 000000000..94d89d184 --- /dev/null +++ b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py @@ -0,0 +1,31 @@ +class FilterModule(object): + def filters(self): + return { + 'to_rpm_repos': self.to_rpm_repos, + 'to_rpm_pubs': self.to_rpm_pubs, + 'to_rpm_distros': self.to_rpm_distros + } + + def to_rpm_repos(self, list): + repo_list = map(lambda x: { + 'name': x['name'], + 'url': x['url'], + 'remote_username': x['remote_username'], + 'remote_password': x['remote_password'], + 'policy': x['policy'], + 'state': x['state'] }, list) + return repo_list + + def to_rpm_pubs(self, list): + pub_list = map(lambda x: { + 'repository': x['name'], + 'state': x['state'] }, list) + return pub_list + + def to_rpm_distros(self, list): + distro_list = map(lambda x: { + 'name': x['name'], + 'repository': x['name'], + 'base_path': x['base_path'], + 'state': x['state'] }, list) + return distro_list \ No newline at end of file From 558874b3949253a34baec5e91d917d7965710725 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 11:46:24 +0000 Subject: [PATCH 138/268] Added extra package installs to bootstrap --- ansible/bootstrap.yml | 9 ++++++++ .../inventory/group_vars/all/defaults.yml | 21 ++++++++++++++++++- environments/common/inventory/groups | 5 +++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 733d4b3f8..432a2a319 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -216,6 +216,15 @@ msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details" when: "update_enable | default('false') | bool" +- hosts: extra_packages + become: yes + tags: + - extra_packages + tasks: + - dnf: + name: "{{ appliances_extra_packages }}" + when: appliances_mode != 'configure' or appliances_packages_during_configure + - hosts: - selinux - update diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 2a88f035d..c6bf8564b 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -79,4 +79,23 @@ appliances_local_users_default: appliances_local_users_extra: [] # see format of appliances_local_users_default above appliances_local_users: "{{ appliances_local_users_default + appliances_local_users_extra }}" -########################################################################################### +################## bootstrap: extra package installs ###################################### + 
+appliances_default_extra_packages: + - htop + - nano + - screen + - tmux + - wget + - bind-utils + - net-tools + - postfix + - git + - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}" + + +appliances_packages_during_configure: false + +appliances_other_extra_packages: [] + +appliances_extra_packages: "{{ appliances_default_extra_packages + appliances_other_extra_packages }}" diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 9b9aa5bf0..d8ad503fe 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -144,3 +144,8 @@ freeipa_client [lustre] # Hosts to run lustre client + +[extra_packages:children] +# Hosts to install specified additional packages on +cluster +builder From 187bc40b898b42f1cb67fea653687cb6c4499dd3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 13:13:02 +0000 Subject: [PATCH 139/268] added pulp sync adhoc and temporarily moved out of ci --- ansible/adhoc/sync-pulp.yml | 11 +++++++++++ ansible/roles/pulp_site/tasks/sync.yml | 5 +++++ environments/.stackhpc/inventory/extra_groups | 2 ++ 3 files changed, 18 insertions(+) create mode 100644 ansible/adhoc/sync-pulp.yml diff --git a/ansible/adhoc/sync-pulp.yml b/ansible/adhoc/sync-pulp.yml new file mode 100644 index 000000000..9c7684445 --- /dev/null +++ b/ansible/adhoc/sync-pulp.yml @@ -0,0 +1,11 @@ +- hosts: localhost + tasks: + - ansible.builtin.include_role: + name: pulp_site + tasks_from: sync.yml + vars: + pulp_site_target_arch: "x86_64" + pulp_site_target_distribution: "rocky" + pulp_site_target_distribution_version: "9.4" + pulp_site_target_distribution_version_major: "9" + pulp_site_version_timestamps: "{{ appliances_repo_timestamps[pulp_site_target_distribution_version] }}" diff --git a/ansible/roles/pulp_site/tasks/sync.yml b/ansible/roles/pulp_site/tasks/sync.yml index 62395f0f3..5ef2bc5f1 100644 --- a/ansible/roles/pulp_site/tasks/sync.yml +++ b/ansible/roles/pulp_site/tasks/sync.yml @@ -1,5 +1,10 @@ --- +- ansible.builtin.assert: + that: pulp_site_upstream_password != '' + quiet: true + fail_msg: "Upstream password not set. Either set env var ARK_PASSWORD or override pulp_site_upstream_password." 
+ - name: Wait for Pulp server pulp.squeezer.status: pulp_url: "{{ pulp_site_url }}" diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index 7c9a7c774..c2002c59f 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -31,3 +31,5 @@ compute [squid:children] # Install squid into fat image builder + +[pulp:children] From 580b0b3b943af7272c85f46950a8e3382cdbca34 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 13:28:39 +0000 Subject: [PATCH 140/268] fixed disabling for ci --- ansible/bootstrap.yml | 4 +--- environments/.stackhpc/inventory/extra_groups | 2 -- environments/.stackhpc/inventory/group_vars/builder.yml | 1 + environments/common/inventory/group_vars/all/defaults.yml | 1 + 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index dfe212d02..8c46c5e24 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -113,14 +113,12 @@ - name: Sync pulp repos with upstream hosts: pulp tasks: - - debug: - var: hostvars[groups['builder'][0]]['ansible_facts'] - ansible.builtin.include_role: name: pulp_site tasks_from: sync.yml apply: delegate_to: localhost - when: appliances_mode != 'configure' + when: appliances_mode != 'configure' and appliances_sync_pulp_on_build - hosts: dnf_repos become: yes diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index c2002c59f..7c9a7c774 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -31,5 +31,3 @@ compute [squid:children] # Install squid into fat image builder - -[pulp:children] diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 0fd19e1f9..c4b01b03f 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -6,3 +6,4 @@ pulp_server_config: appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}" pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}" +appliances_sync_pulp_on_build: false diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 9d8a7ab33..f2a6723ad 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -82,6 +82,7 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us ########################################################################################### +appliances_sync_pulp_on_build: true appliances_repo_timestamps: '9.4': baseos: 20240816T002610 From 2ed66742bb9b665d879af8c0b5e6e6aa6d434163 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 13:39:54 +0000 Subject: [PATCH 141/268] made dnf epel repo more configurable --- ansible/roles/dnf_repos/defaults/main.yml | 1 + ansible/roles/dnf_repos/tasks/set_repos.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 0a09e5f3a..d4c80b0c9 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -21,3 +21,4 @@ dnf_repos_repolist: base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ 
dnf_repos_version_timestamps.extras }}" dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_version }}/{{ dnf_repos_version_timestamps.epel }}" +dnf_repos_epel_description: "epel" diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index 8a8364097..dea803902 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -17,6 +17,6 @@ ansible.builtin.yum_repository: name: epel file: epel - description: epel + description: "{{ dnf_repos_epel_description }}" gpgcheck: false baseurl: "{{ dnf_repos_epel_baseurl }}" From efd2883211fbe9563568ff13a2c2759ef9ef31a3 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:40:41 +0000 Subject: [PATCH 142/268] Add role to install NVIDIA DOCA on top of an existing "fat" image (#492) * add doca role run by fatimage * add workflow to test doca build * make packer inventory groups clearer and allow defining no extra * update packer workflows for new packer config * define builds entirely via matrix * WIP: do DOCA CI build on top of current fat image * fixup matrix for changes * fix doca workflow typo * use current fatimage for doca test build * enable fatimage to be used for volume-backed builds * bump CI image * doca workflow: clean up image and only run on relevant changes * remove commented-out code * add DOCA README * fix DOCA role actually running * tidyup DOCA play * include doca packages in image summary * fix squid being selected for any stackhopc build VM * fix nightly build concurrency * re-add squid back to Stackhpc builder group * remove debugging exit * update image build docs * update packer docs --- .github/workflows/doca.yml | 132 ++++++++++++++++++ .github/workflows/fatimage.yml | 38 +++-- .github/workflows/nightlybuild.yml | 54 +++---- ansible/.gitignore | 2 + ansible/cleanup.yml | 5 + ansible/fatimage.yml | 11 ++ ansible/roles/doca/README.md | 12 ++ ansible/roles/doca/defaults/main.yml | 3 + .../roles/doca/tasks/install-kernel-devel.yml | 24 ++++ ansible/roles/doca/tasks/install.yml | 53 +++++++ ansible/roles/doca/tasks/main.yml | 1 + docs/image-build.md | 74 +++------- .../terraform/cluster_image.auto.tfvars.json | 4 +- packer/openstack.pkr.hcl | 65 +++------ 14 files changed, 323 insertions(+), 155 deletions(-) create mode 100644 .github/workflows/doca.yml create mode 100644 ansible/roles/doca/README.md create mode 100644 ansible/roles/doca/defaults/main.yml create mode 100644 ansible/roles/doca/tasks/install-kernel-devel.yml create mode 100644 ansible/roles/doca/tasks/install.yml create mode 100644 ansible/roles/doca/tasks/main.yml diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml new file mode 100644 index 000000000..cfd3bb982 --- /dev/null +++ b/.github/workflows/doca.yml @@ -0,0 +1,132 @@ +name: Test DOCA extra build +on: + workflow_dispatch: + push: + branches: + - main + paths: + - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' + - '.github/workflows/doca' + pull_request: + paths: + - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' + - '.github/workflows/doca' + +jobs: + doca: + name: doca-build + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS + cancel-in-progress: true + runs-on: ubuntu-22.04 + strategy: + fail-fast: false # allow other matrix jobs to continue even if 
one fails + matrix: # build RL8, RL9 + build: + - image_name: openhpc-doca-RL8 + source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json + inventory_groups: doca + - image_name: openhpc-doca-RL9 + source_image_name_key: RL9 + inventory_groups: doca + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings + ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} + + steps: + - uses: actions/checkout@v2 + + - name: Load current fat images into GITHUB_ENV + # see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string + run: | + { + echo 'FAT_IMAGES<<EOF' + cat environments/.stackhpc/terraform/cluster_image.auto.tfvars.json + echo EOF + } >> "$GITHUB_ENV" + + - name: Record settings + run: | + echo CI_CLOUD: ${{ env.CI_CLOUD }} + echo FAT_IMAGES: ${FAT_IMAGES} + + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + shell: bash + + - name: Add bastion's ssh key to known_hosts + run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts + shell: bash + + - name: Install ansible etc + run: dev/setup-env.sh + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Setup environment + run: | + . venv/bin/activate + . environments/.stackhpc/activate + + - name: Build fat image with packer + id: packer_build + run: | + set -x + . venv/bin/activate + . environments/.stackhpc/activate + cd packer/ + packer init . + + PACKER_LOG=1 packer build \ + -on-error=${{ vars.PACKER_ON_ERROR }} \ + -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ + -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ + openstack.pkr.hcl + + - name: Get created image names from manifest + id: manifest + run: | + . venv/bin/activate + IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) + while ! openstack image show -f value -c name $IMAGE_ID; do + sleep 5 + done + IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" + echo $IMAGE_ID > image-id.txt + echo $IMAGE_NAME > image-name.txt + + - name: Make image usable for further builds + run: | + . venv/bin/activate + openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" + + - name: Delete image for automatically-run workflows + run: | + . 
venv/bin/activate + openstack image delete "${{ steps.manifest.outputs.image-id }}" + if: ${{ github.event_name != 'workflow_dispatch' }} + + - name: Upload manifest artifact + uses: actions/upload-artifact@v4 + with: + name: image-details-${{ matrix.build.image_name }} + path: | + ./image-id.txt + ./image-name.txt + overwrite: true diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 217b09c22..da933c91d 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -15,30 +15,23 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 - os_version: - - RL8 - - RL9 build: - - openstack.openhpc + - image_name: openhpc-RL8 + source_image_name: rocky-latest-RL8 + inventory_groups: control,compute,login + - image_name: openhpc-RL9 + source_image_name: rocky-latest-RL9 + inventory_groups: control,compute,login env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud }} - SOURCE_IMAGES_MAP: | - { - "RL8": { - "openstack.openhpc": "rocky-latest-RL8" - }, - "RL9": { - "openstack.openhpc": "rocky-latest-RL9" - } - } ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: @@ -85,13 +78,11 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl - env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }} - name: Get created image names from manifest id: manifest @@ -102,13 +93,20 @@ jobs: sleep 5 done IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" echo $IMAGE_ID > image-id.txt echo $IMAGE_NAME > image-name.txt + - name: Make image usable for further builds + run: | + . 
venv/bin/activate + openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" + - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: - name: image-details-${{ matrix.build }}-${{ matrix.os_version }} + name: image-details-${{ matrix.build.image_name }} path: | ./image-id.txt ./image-name.txt diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 9f45b0890..a0e78cd0b 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -11,32 +11,29 @@ on: - SMS - ARCUS schedule: - - cron: '0 0 * * *' # Run at midnight + - cron: '0 0 * * *' # Run at midnight on default branch jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 - os_version: - - RL8 - - RL9 build: - - openstack.rocky-latest + - image_name: rocky-latest-RL8 + source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + inventory_groups: update + - image_name: rocky-latest-RL9 + source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: update env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} - SOURCE_IMAGES_MAP: | - { - "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2", - "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" - } ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: @@ -83,15 +80,12 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl - env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }} - - name: Get created image names from manifest id: manifest run: | @@ -125,7 +119,7 @@ jobs: name: upload-nightly-targets needs: openstack concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }}-${{ matrix.target_cloud }} cancel-in-progress: true runs-on: ubuntu-22.04 strategy: @@ -135,18 +129,15 @@ jobs: - LEAFCLOUD - SMS - ARCUS - os_version: - - RL8 - - RL9 - image: - - rocky-latest + build: + - image_name: rocky-latest-RL8 + - image_name: rocky-latest-RL9 exclude: - target_cloud: LEAFCLOUD env: OS_CLOUD: openstack SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} TARGET_CLOUD: ${{ matrix.target_cloud }} - IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}" steps: - uses: actions/checkout@v2 @@ -161,42 +152,37 @@ jobs: . 
venv/bin/activate pip install -U pip pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) - shell: bash - name: Write clouds.yaml run: | mkdir -p ~/.config/openstack/ echo "${{ secrets[format('{0}_CLOUDS_YAML', env.SOURCE_CLOUD)] }}" > ~/.config/openstack/source_clouds.yaml echo "${{ secrets[format('{0}_CLOUDS_YAML', env.TARGET_CLOUD)] }}" > ~/.config/openstack/target_clouds.yaml - shell: bash - name: Download source image run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml - openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }} - shell: bash + openstack image save --file ${{ matrix.build.image_name }} ${{ matrix.build.image_name }} - name: Upload to target cloud run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - openstack image create "${{ env.IMAGE_NAME }}" \ - --file "${{ env.IMAGE_NAME }}" \ + openstack image create "${{ matrix.build.image_name }}" \ + --file "${{ matrix.build.image_name }}" \ --disk-format qcow2 \ - shell: bash - name: Delete old latest image from target cloud run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l) + IMAGE_COUNT=$(openstack image list --name ${{ matrix.build.image_name }} -f value -c ID | wc -l) if [ "$IMAGE_COUNT" -gt 1 ]; then - OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1) + OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ matrix.build.image_name }}" -f value -c ID | head -n 1) openstack image delete "$OLD_IMAGE_ID" else echo "Only one image exists, skipping deletion." 
fi - shell: bash diff --git a/ansible/.gitignore b/ansible/.gitignore index 48c917c4f..3fef64ecc 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -66,3 +66,5 @@ roles/* !roles/lustre/** !roles/dnf_repos/ !roles/dnf_repos/** +!roles/doca/ +!roles/doca/** diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index cf9b0bdab..3f059d157 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -61,5 +61,10 @@ os: "{{ ansible_distribution }} {{ ansible_distribution_version }}" kernel: "{{ ansible_kernel }}" ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}" + doca: "{{ ansible_facts.packages[doca_profile | default('doca-ofed') ].0.version | default('-') }}" cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}" slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" + +- name: Show image summary + debug: + var: image_info diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index b28e4f308..439c50e70 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -6,6 +6,9 @@ tasks: - name: Report hostname (= final image name) command: hostname + - name: Report inventory groups + debug: + var: group_names - name: Run pre.yml hook vars: @@ -199,6 +202,14 @@ name: cloudalchemy.grafana tasks_from: install.yml +- hosts: doca + become: yes + gather_facts: yes + tasks: + - name: Install NVIDIA DOCA + import_role: + name: doca + - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/roles/doca/README.md b/ansible/roles/doca/README.md new file mode 100644 index 000000000..5f898add5 --- /dev/null +++ b/ansible/roles/doca/README.md @@ -0,0 +1,12 @@ +# doca + +Install [NVIDIA DOCA](https://docs.nvidia.com/doca/sdk/index.html). + +This role is not idempotent and is only intended to be run during an image build. It builds DOCA kernel modules to match the installed kernel and then installs these +plus the selected DOCA packages. + +## Role Variables + +- `doca_version`: Optional. String giving doca version. +- `doca_profile`: Optional. Name of [profile](https://docs.nvidia.com/doca/sdk/nvidia+doca+profiles/index.html) defining subset of DOCA to install. Default is `doca-ofed`. +- `doca_repo_url`: Optional. URL of DOCA repository. Default is appropriate upstream public repository for DOCA version, distro version and architecture. diff --git a/ansible/roles/doca/defaults/main.yml b/ansible/roles/doca/defaults/main.yml new file mode 100644 index 000000000..66437cd04 --- /dev/null +++ b/ansible/roles/doca/defaults/main.yml @@ -0,0 +1,3 @@ +doca_version: '2.9.1' # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates +doca_profile: doca-ofed +doca_repo_url: "https://linux.mellanox.com/public/repo/doca/{{ doca_version }}/rhel{{ ansible_distribution_version }}/{{ ansible_architecture }}/" diff --git a/ansible/roles/doca/tasks/install-kernel-devel.yml b/ansible/roles/doca/tasks/install-kernel-devel.yml new file mode 100644 index 000000000..6a1943a32 --- /dev/null +++ b/ansible/roles/doca/tasks/install-kernel-devel.yml @@ -0,0 +1,24 @@ +- name: Get installed kernels + command: dnf list --installed kernel + register: _ofed_dnf_kernels + changed_when: false + +- name: Determine running kernel + command: uname -r # e.g. 
4.18.0-513.18.1.el8_9.x86_64 + register: _ofed_loaded_kernel + changed_when: false + +- name: Check current kernel is newest installed + assert: + that: _ofed_kernel_current == _ofed_dnf_kernels_newest + fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" + vars: + _ofed_kernel_current: >- + {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} + _ofed_dnf_kernels_newest: >- + {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} + # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " + +- name: Install matching kernel-devel package + dnf: + name: "kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}" diff --git a/ansible/roles/doca/tasks/install.yml b/ansible/roles/doca/tasks/install.yml new file mode 100644 index 000000000..9d297e946 --- /dev/null +++ b/ansible/roles/doca/tasks/install.yml @@ -0,0 +1,53 @@ +- import_tasks: install-kernel-devel.yml + +- name: Install DOCA repo + ansible.builtin.yum_repository: + name: doca + file: doca + description: DOCA Online Repo + baseurl: "{{ doca_repo_url }}" + enabled: true + gpgcheck: false + +- name: Install doca-extra package + ansible.builtin.dnf: + name: doca-extra + +- name: Build DOCA kernel modules + ansible.builtin.shell: + cmd: /opt/mellanox/doca/tools/doca-kernel-support + register: _doca_kernel_build + + +- name: Find generated doca-kernel-repo + ansible.builtin.shell: 'find /tmp/DOCA.* -name doca-kernel-repo-*' + register: _doca_kernel_repo # e.g. /tmp/DOCA.WVMchs2QWo/doca-kernel-repo-24.10.1.1.4.0-1.kver.5.14.0.427.31.1.el9.4.x86.64.x86_64.rpm + changed_when: false + +- name: Create dnf cache + ansible.builtin.command: dnf makecache + +- name: Install DOCA repository package + ansible.builtin.dnf: + name: "{{ _doca_kernel_repo.stdout }}" + disable_gpg_check: true + +- name: Install DOCA packages + ansible.builtin.dnf: + name: "{{ doca_profile }}" + +- name: Cleanup DOCA build directories + ansible.builtin.file: + state: absent + path: "{{ (_doca_kernel_repo.stdout | split('/'))[:3] | join('/') }}" + +- name: Update initramfs + ansible.builtin.command: + cmd: dracut -f --tmpdir /var/tmp + environment: + TMPDIR: /var/tmp + register: _doca_dracut + failed_when: _doca_dracut.stderr != '' # appears rc is always 0 + +- name: Load the new driver + ansible.builtin.command: /etc/init.d/openibd restart diff --git a/ansible/roles/doca/tasks/main.yml b/ansible/roles/doca/tasks/main.yml new file mode 100644 index 000000000..e7a272f38 --- /dev/null +++ b/ansible/roles/doca/tasks/main.yml @@ -0,0 +1 @@ +- include_tasks: install.yml diff --git a/docs/image-build.md b/docs/image-build.md index 4896bde57..a7d2e951b 100644 --- a/docs/image-build.md +++ b/docs/image-build.md @@ -2,87 +2,57 @@ The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. -The Packer configuration defined here builds "fat images" which contain binaries for all nodes, but no cluster-specific configuration. Using these: +The Packer configuration defined here builds "fat images" which contain packages, binaries and container images but no cluster-specific configuration. Using these: - Enables the image to be tested in CI before production use. 
- Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). - Improves deployment speed by reducing the number of package downloads. -By default, a fat image build starts from a nightly image build containing Mellanox OFED, and updates all DNF packages already present. The 'latest' nightly build itself is from a RockyLinux GenericCloud image. - -The fat images StackHPC builds and test in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: +The fat images StackHPC builds and tests in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: 1. Build site-specific fat images from scratch. -2. Extend an existing fat image with additional software. +2. Extend an existing fat image with additional functionality. # Usage -The steps for building site-specific fat images or extending an existing fat image are the same: +To build either a site-specific fat image from scratch, or to extend an existing StackHPC fat image: 1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). -2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments/<environment>/builder.pkrvars.hcl` containing at a minimum e.g.: +2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments/<environment>/builder.pkrvars.hcl` containing at a minimum: ```hcl flavor = "general.v1.small" # VM flavor to use for builder VMs networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to + source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e. starting image + inventory_groups = "control,login,compute" # Additional inventory groups to add build VM to + ``` + Note that: - - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). - - For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`. - - For an example of configuration for extending an existing fat image see below. + - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). + - The flavor used must have sufficient memory for the build tasks, but otherwise does not need to match the final cluster nodes. Usually 8GB is sufficient. By default, the build VM is volume-backed to allow control of the root disk size (and hence final image size) so the flavor disk size does not matter. + - The source image should be either a RockyLinux GenericCloud image for a site-specific image build from scratch, or a StackHPC fat image if extending an existing image. 
+ - The `inventory_groups` variable takes a comma-separated list of Ansible inventory groups to add the build VM to. This is in addition to the `builder` group which it is always added to. This controls which Ansible roles and functionality run during build, and hence what gets added to the image. All possible groups are listed in `environments/common/groups` but common options for this variable will be: + - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch. + - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality. 3. Activate the venv and the relevant environment. 4. Build images using the relevant variable definition file, e.g.: cd packer/ - PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - - Note that the `-only` flag here restricts Packer to a single specific "build" definition (in Packer terminology). Options here are: - - `-only=openstack.openhpc`: Build a fat image including Mellanox OFED - - `-only=openstack.openhpc-cuda`: Build a fat image including Mellanox OFED, Nvidia drivers and CUDA - - `-only=openstack.openhpc-extra`: Build an image which *extends* an existing fat image - -5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash. - -# Defining an "extra" image build - -An "extra" image build starts with an existing fat image (e.g. one provided by StackHPC) rather than a RockyLinux GenericCloud image, and only runs a specific subset of the -Ansible in the appliance. This allows adding additional functionality into site-specific images, without modifying the existing functionality in the base fat image. This is the recommended way to build site-specific images. - -To configure an "extra" image build, prepare a Packer variable definition file as described above but also including: - -- `extra_build_image_name`: A string to add into the final image name. -- `source_image` or `source_image_name`: The UUID or name of the fat image to start from (which must already be present in OpenStack). -- `extra_build_groups`: A list of Ansible inventory groups to put the build VM into, in addition to the `builder` group. This defines the roles/functionality - which are added to the image. -- `extra_build_volume_size`: A number giving the size in GB of the volume for the build VM's root disk and therefore the resulting image size. - Note this assumes the default of `use_blockstorage_volume = true`. - -E.g. to add the lustre client to an RockyLinux 9 image: - - # environments/site/lustre.pkvars.hcl - - extra_build_image_name = "lustre" # output image name will be like "openhpc-lustre-RL9-$timestamp-$commit" - source_image_name = "openhpc-ofed-RL9-240906-1041-32568dbb" # e.g. current StackHPC RL9 image - extra_build_groups = ["lustre"] # only run lustre role during this extra build - extra_build_volume_size = 15 # default non-CUDA build image size has enough free space - - # ... 
define flavor, network, etc as normal
-
-
-Then, reference this build and variables file in the Packer build command:
+      PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl

-      PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc-extra --on-error=ask -var-file=environments/site/lustre.pkvars.hcl openstack.pkr.hcl

-**NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property:
+   **NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property:

-    openstack image show $SOURCE_IMAGE
+      openstack image show $SOURCE_IMAGE

-If it does, remove this property:
+   If it does, remove this property:

-    openstack image unset --property signature_verified $SOURCE_IMAGE
+      openstack image unset --property signature_verified $SOURCE_IMAGE

-then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445).
+   then delete the failed volume, select 'cancel' when Packer asks whether to continue, and then retry. This is [OpenStack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445).

+5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash.

 # Build Process

diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 14c997596..5b9d845ef 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241203-1659-b0558b95",
-        "RL9": "openhpc-RL9-241203-1659-b0558b95"
+        "RL8": "openhpc-RL8-241211-1322-ded60c2c",
+        "RL9": "openhpc-RL9-241211-1322-ded60c2c"
     }
 }
diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl
index 52202ead1..2ba0a1e63 100644
--- a/packer/openstack.pkr.hcl
+++ b/packer/openstack.pkr.hcl
@@ -23,6 +23,7 @@ data "git-commit" "cwd-head" { }
 locals {
   git_commit = data.git-commit.cwd-head.hash
   timestamp = formatdate("YYMMDD-hhmm", timestamp())
+  image_name_version = var.image_name_version == "auto" ?
"-${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_name_version } # Path pointing to root of repository - automatically set by environment variable PKR_VAR_repo_root @@ -39,12 +40,6 @@ variable "networks" { type = list(string) } -variable "os_version" { - type = string - description = "'RL8' or 'RL9' with default source_image_* mappings" - default = "RL9" -} - # Must supply either source_image_name or source_image_id variable "source_image_name" { type = string @@ -123,15 +118,6 @@ variable "volume_type" { } variable "volume_size" { - type = map(number) - default = { - # fat image builds, GB: - rocky-latest = 15 - openhpc = 15 - } -} - -variable "extra_build_volume_size" { type = number default = 15 } @@ -146,25 +132,22 @@ variable "metadata" { default = {} } -variable "groups" { - type = map(list(string)) - description = "Additional inventory groups (other than 'builder') to add build VM to, keyed by source name" - default = { - # fat image builds: - rocky-latest = ["update"] - openhpc = ["control", "compute", "login"] - } +variable "inventory_groups" { + type = string + description = "Comma-separated list of additional inventory groups (other than 'builder') to add build VM to. Default is none." + default = "" } -variable "extra_build_groups" { - type = list(string) - default = [] +variable "image_name" { + type = string + description = "Name of image" + default = "openhpc" } -variable "extra_build_image_name" { +variable "image_name_version" { type = string - description = "Infix for 'extra' build image name" - default = "extra" + description = "Suffix for image name giving version. Default of 'auto' appends timestamp + short commit" + default = "auto" } source "openstack" "openhpc" { @@ -172,9 +155,11 @@ source "openstack" "openhpc" { flavor = var.flavor use_blockstorage_volume = var.use_blockstorage_volume volume_type = var.volume_type - volume_size = lookup(var.volume_size, source.name, var.extra_build_volume_size) + volume_size = var.volume_size metadata = var.metadata - instance_metadata = {ansible_init_disable = "true"} + instance_metadata = { + ansible_init_disable = "true" + } networks = var.networks floating_ip_network = var.floating_ip_network security_groups = var.security_groups @@ -200,27 +185,13 @@ source "openstack" "openhpc" { build { - # latest nightly image: - source "source.openstack.openhpc" { - name = "rocky-latest" - image_name = "${source.name}-${var.os_version}" - } - - # fat image: - source "source.openstack.openhpc" { - name = "openhpc" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" - } - - # Extended site-specific image, built on fat image: source "source.openstack.openhpc" { - name = "openhpc-extra" - image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" + image_name = "${var.image_name}${local.image_name_version}" } provisioner "ansible" { playbook_file = "${var.repo_root}/ansible/fatimage.yml" - groups = concat(["builder"], lookup(var.groups, source.name, var.extra_build_groups)) + groups = concat(["builder"], var.inventory_groups == "" ? 
[] : split(",", var.inventory_groups)) keep_inventory_file = true # for debugging use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting extra_arguments = [ From d12083a6953499ae8c116660ec45aca0183239b0 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 14:10:27 +0000 Subject: [PATCH 143/268] moved repo enable/disable into fatimage --- ansible/bootstrap.yml | 19 ------------------- ansible/fatimage.yml | 28 ++++++++++++++++++++++++++++ ansible/site.yml | 9 --------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 8c46c5e24..733d4b3f8 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -110,25 +110,6 @@ policy: "{{ selinux_policy }}" register: sestatus -- name: Sync pulp repos with upstream - hosts: pulp - tasks: - - ansible.builtin.include_role: - name: pulp_site - tasks_from: sync.yml - apply: - delegate_to: localhost - when: appliances_mode != 'configure' and appliances_sync_pulp_on_build - -- hosts: dnf_repos - become: yes - tasks: - - name: Replace system repos with pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: set_repos.yml - when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided - # --- tasks after here require access to package repos --- - hosts: squid tags: squid diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index b28e4f308..7c83fc2a2 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -14,6 +14,25 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- name: Sync pulp repos with upstream + hosts: pulp + tasks: + - ansible.builtin.include_role: + name: pulp_site + tasks_from: sync.yml + apply: + delegate_to: localhost + when: appliances_mode != 'configure' and appliances_sync_pulp_on_build + +- hosts: dnf_repos + become: yes + tasks: + - name: Replace system repos with pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: set_repos.yml + when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided + - import_playbook: bootstrap.yml - name: Run post-bootstrap.yml hook @@ -199,6 +218,15 @@ name: cloudalchemy.grafana tasks_from: install.yml +- hosts: dnf_repos + become: yes + tasks: + - name: Disable pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: disable_repos.yml + when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided + - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/site.yml b/ansible/site.yml index a09d5a510..bb379399d 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -28,15 +28,6 @@ - import_playbook: portal.yml - import_playbook: monitoring.yml -- hosts: dnf_repos - become: yes - tasks: - - name: Disable pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: disable_repos.yml - when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided - - name: Run post.yml hook vars: # hostvars not available here, so have to recalculate environment root: From 07dc9b796ff32002e83cae21b29ded39d688a750 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 14:28:31 +0000 Subject: [PATCH 144/268] fixed disable repos task --- 
ansible/roles/dnf_repos/tasks/disable_repos.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml index 69aed3b6b..53459ce49 100644 --- a/ansible/roles/dnf_repos/tasks/disable_repos.yml +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -1,6 +1,6 @@ --- - name: Disable Pulp repos and remove creds ansible.builtin.yum: - disablerepo: "{{ item.name }}" - loop: "{{ dnf_repos_repolist + [epel] }}" + disablerepo: "{{ item }}" + loop: "{{ dnf_repos_repolist | map(attribute='name') + ['epel'] }}" when: dnf_repos_disable From 3088f8375dcd5e7b4bb98b7dab008f59f36fda1c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 15:28:08 +0000 Subject: [PATCH 145/268] reverted disable repos task --- .../roles/dnf_repos/tasks/disable_repos.yml | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml index 53459ce49..2dbacc262 100644 --- a/ansible/roles/dnf_repos/tasks/disable_repos.yml +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -1,6 +1,18 @@ --- -- name: Disable Pulp repos and remove creds - ansible.builtin.yum: - disablerepo: "{{ item }}" - loop: "{{ dnf_repos_repolist | map(attribute='name') + ['epel'] }}" - when: dnf_repos_disable +- name: Disable Pulp repos + ansible.builtin.yum_repository: + file: "{{ item.file }}" + name: "{{ item.name }}" + baseurl: "{{ item.base_url }}" + description: "{{ item.name }}" + enabled: false + loop: "{{ dnf_repos_repolist }}" + +- name: Disable EPEL repo + ansible.builtin.yum_repository: + name: epel + file: epel + description: "{{ dnf_repos_epel_description }}" + baseurl: "{{ dnf_repos_epel_baseurl }}" + gpgcheck: false + enabled: false From c74360bf325c615b11db342a367538da5467cc1d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 15:50:24 +0000 Subject: [PATCH 146/268] fatimage with test latest (REVERT LATER) --- .github/workflows/fatimage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 331035001..d368d86cd 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -26,7 +26,7 @@ jobs: source_image_name: rocky-latest-RL8 inventory_groups: control,compute,login - image_name: openhpc-RL9 - source_image_name: rocky-latest-RL9 + source_image_name: rocky-latest-RL9-241212-1532-3088f837 inventory_groups: control,compute,login env: ANSIBLE_FORCE_COLOR: True From 67ce24bc5dda26b1ab2539e2627f6dfb59eb1b3b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 16:36:45 +0000 Subject: [PATCH 147/268] refactored pulp deploy and added pulp docs --- README.md | 2 +- ansible/adhoc/deploy-pulp.yml | 13 ++++++------- docs/experimental/pulp.md | 17 +++++++++++++++++ .../common/inventory/group_vars/all/pulp.yml | 1 + 4 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 docs/experimental/pulp.md create mode 100644 environments/common/inventory/group_vars/all/pulp.yml diff --git a/README.md b/README.md index f61bf8df4..f66441915 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ It requires an OpenStack cloud, and an Ansible "deploy host" with access to that Before starting ensure that: - You have root access on the deploy host. - You can create instances using a Rocky 9 GenericCloud image (or an image based on that). 
- - **NB**: In general it is recommended to use the [latest released image](https://github.com/stackhpc/ansible-slurm-appliance/releases) which already contains the required packages. This is built and tested in StackHPC's CI. However the appliance will install the necessary packages if a GenericCloud image is used.
+ - **NB**: In general it is recommended to use the [latest released image](https://github.com/stackhpc/ansible-slurm-appliance/releases) which already contains the required packages. This is built and tested in StackHPC's CI.
 - You have a SSH keypair defined in OpenStack, with the private key available on the deploy host.
 - Created instances have access to internet (note proxies can be setup through the appliance if necessary).
 - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance).
diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml
index 89c51922a..38cb79289 100644
--- a/ansible/adhoc/deploy-pulp.yml
+++ b/ansible/adhoc/deploy-pulp.yml
@@ -18,11 +18,10 @@
         tasks_from: install.yml
         public: true

-    - name: Add pulp host to environment
+    - name: Print Pulp endpoint
      become: no
-      delegate_to: localhost
-      ansible.builtin.copy:
-        dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml"
-        content: |
-          # ansible managed
-          appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}"
+      debug:
+        msg: |
+          Server configured, override 'appliances_pulp_url' with
+          appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}"
+          in your environments
diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md
new file mode 100644
index 000000000..974803030
--- /dev/null
+++ b/docs/experimental/pulp.md
@@ -0,0 +1,17 @@
+# Pulp Server
+
+In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's Ark Pulp server. The appliance will sync the relevant repositories to a local Pulp server which will be used for image builds. Using a local server can be enabled by adding `pulp` to the build groups and overriding `dnf_repos_repolist` to point at content hosted on the local server.
+
+## Deploying/configuring Pulp Server
+
+### Deploying a Pulp server
+A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml` playbook. This can be run with
+`ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="`
+This will print a Pulp endpoint which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note that this server's content isn't authenticated, so it is assumed the server is deployed behind a secure network.
+
+### Using an existing Pulp server
+An existing Pulp server can be used to host Ark repos by overriding `pulp_site_password` and `appliances_pulp_url` in the target environment. Note that this assumes the same configuration as the appliance-deployed Pulp, i.e. no content authentication.
+
+## Syncing Pulp content with Ark
+
+By default, the appliance will sync repos for the targeted distribution during build (can be disabled by setting `appliances_sync_pulp_on_build` to `false`). You must supply your Ark credentials, either by overriding `pulp_site_upstream_password` or setting the environment variable `ARK_PASSWORD`.
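+For example, a minimal sketch of wiring the credentials into an environment (the file path here is illustrative; the environment-variable lookup mirrors the `pulp_site` role default):
+
+```yaml
+# e.g. environments/site/inventory/group_vars/all/pulp.yml (illustrative path)
+# Pass Ark credentials in from the shell environment rather than committing them:
+pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
+```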
Content can also be synced by running `ansible/adhoc/sync-pulp.yml`, optionally setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`.
diff --git a/environments/common/inventory/group_vars/all/pulp.yml b/environments/common/inventory/group_vars/all/pulp.yml
new file mode 100644
index 000000000..02b7aa816
--- /dev/null
+++ b/environments/common/inventory/group_vars/all/pulp.yml
@@ -0,0 +1 @@
+pulp_site_port: 8080

From c4336055ef0f641d2f210ca8e1c345e28ec7ed4d Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Thu, 12 Dec 2024 16:41:24 +0000
Subject: [PATCH 148/268] testing image using site pulp

---
 .github/workflows/fatimage.yml | 2 +-
 .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index d368d86cd..331035001 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -26,7 +26,7 @@ jobs:
             source_image_name: rocky-latest-RL8
             inventory_groups: control,compute,login
           - image_name: openhpc-RL9
-            source_image_name: rocky-latest-RL9-241212-1532-3088f837
+            source_image_name: rocky-latest-RL9
             inventory_groups: control,compute,login
     env:
       ANSIBLE_FORCE_COLOR: True
diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 5e71beebd..5c100f999 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241206-1541-eaa36805",
-        "RL9": "openhpc-RL9-241206-1541-eaa36805"
+        "RL8": "openhpc-RL8-241212-1553-c74360bf",
+        "RL9": "openhpc-RL9-241212-1554-c74360bf"
     }
 }

From bda3f7e7568648d03bb50ede8f11ded5e933f0cb Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 13 Dec 2024 10:12:40 +0000
Subject: [PATCH 149/268] Pointed dnf repos back at ark for now + refactor

---
 ansible/adhoc/sync-pulp.yml | 1 -
 ansible/fatimage.yml | 2 +-
 ansible/roles/dnf_repos/defaults/main.yml | 15 ++++++-------
 ansible/roles/dnf_repos/tasks/set_repos.yml | 4 ++++
 ansible/roles/pulp_site/defaults/main.yml | 5 +++--
 docs/experimental/pulp.md | 2 +-
 .../inventory/group_vars/builder.yml | 21 ++++++++++++-------
 .../inventory/group_vars/all/defaults.yml | 8 ++++---
 environments/common/inventory/groups | 2 --
 9 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/ansible/adhoc/sync-pulp.yml b/ansible/adhoc/sync-pulp.yml
index 9c7684445..f26149bba 100644
--- a/ansible/adhoc/sync-pulp.yml
+++ b/ansible/adhoc/sync-pulp.yml
@@ -8,4 +8,3 @@
       pulp_site_target_distribution: "rocky"
       pulp_site_target_distribution_version: "9.4"
       pulp_site_target_distribution_version_major: "9"
-      pulp_site_version_timestamps: "{{ appliances_repo_timestamps[pulp_site_target_distribution_version] }}"
diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml
index eaa5215a5..5d84fcf90 100644
--- a/ansible/fatimage.yml
+++ b/ansible/fatimage.yml
@@ -25,7 +25,7 @@
         tasks_from: sync.yml
       apply:
         delegate_to: localhost
-  when: appliances_mode != 'configure' and appliances_sync_pulp_on_build
+  when: appliances_mode != 'configure'

- hosts: dnf_repos
  become: yes
diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml
index d4c80b0c9..19a5d4986 100644
--- a/ansible/roles/dnf_repos/defaults/main.yml
+++
b/ansible/roles/dnf_repos/defaults/main.yml @@ -2,23 +2,24 @@ dnf_repos_pulp_url: "{{ appliances_pulp_url }}" dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content" dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" -dnf_repos_disable: true -dnf_repos_version_timestamps: "{{ appliances_repo_timestamps[ansible_distribution_version] }}" +dnf_repos_version_timestamps: "{{ appliances_repo_minor_timestamps[ansible_distribution_version] }}" +dnf_repos_username: "{{ omit }}" +dnf_repos_password: "{{ omit }}" # epel installed separately dnf_repos_repolist: - file: rocky name: baseos - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/baseos/{{ dnf_repos_version_timestamps.baseos }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/BaseOS/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].baseos }}" - file: rocky name: appstream - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/appstream/{{ dnf_repos_version_timestamps.appstream }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].appstream }}" - file: rocky name: crb - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/crb/{{ dnf_repos_version_timestamps.crb }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].crb }}" - file: rocky-extras name: extras - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ dnf_repos_version_timestamps.extras }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].extras }}" -dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_version }}/{{ dnf_repos_version_timestamps.epel }}" +dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ appliances_repo_major_timestamps[ansible_distribution_major_version].epel }}" dnf_repos_epel_description: "epel" diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index dea803902..fe5e2c02c 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -6,6 +6,8 @@ name: "{{ item.name }}" baseurl: "{{ item.base_url }}" description: "{{ item.name }}" + username: "{{ dnf_repos_username }}" + password: "{{ dnf_repos_password }}" loop: "{{ dnf_repos_repolist }}" - name: Install epel-release @@ -20,3 +22,5 @@ description: "{{ dnf_repos_epel_description }}" gpgcheck: false baseurl: "{{ dnf_repos_epel_baseurl }}" + username: "{{ dnf_repos_username }}" + password: "{{ dnf_repos_password }}" diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index 6a9e98d74..c342ea46f 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -14,7 +14,8 @@ pulp_site_target_arch: "{{ pulp_site_target_facts['architecture'] }}" pulp_site_target_distribution: "{{ pulp_site_target_facts['distribution'] | lower }}" 
pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}"
 pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}"
-pulp_site_version_timestamps: "{{ appliances_repo_timestamps[pulp_site_target_distribution_version] }}"
+pulp_site_version_timestamps: "{{ appliances_repo_minor_timestamps[pulp_site_target_distribution_version] }}"
+pulp_site_major_version_timestamps: "{{ appliances_repo_major_timestamps[pulp_site_target_distribution_version_major] }}"

 pulp_site_rpm_info:
 - name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}"
@@ -30,7 +31,7 @@ pulp_site_rpm_info:
   url: "{{ pulp_site_default_upstream_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}"
   base_path: "rocky/{{ pulp_site_target_distribution_version }}/extras/{{ pulp_site_version_timestamps.extras }}"
 - name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}"
-  url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_version_timestamps.epel }}"
+  url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_major_version_timestamps.epel }}"
   base_path: "epel/{{ pulp_site_target_distribution_version }}/{{ pulp_site_version_timestamps.epel }}"

 pulp_site_rpm_repo_defaults:
diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md
index 974803030..d1a40ba52 100644
--- a/docs/experimental/pulp.md
+++ b/docs/experimental/pulp.md
@@ -14,4 +14,4 @@ An existing Pulp server can be used to host Ark repos by overriding `pulp_site_p

 ## Syncing Pulp content with Ark

-By default, the appliance will sync repos for the targeted distribution during build (can be disabled by setting `appliances_sync_pulp_on_build` to `false`). You must supply your Ark credentials, either by overriding `pulp_site_upstream_password` or setting the environment variable `ARK_PASSWORD`. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`, optionally setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`.
+If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must supply your Ark credentials, either by overriding `pulp_site_upstream_password` or setting the environment variable `ARK_PASSWORD`. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`, optionally setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`.
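+To make this concrete, a sketch of the site-level overrides this implies (the host, port and file path are illustrative; `pulp_site_port` defaults to 8080):
+
+```yaml
+# e.g. environments/site/inventory/group_vars/all/pulp.yml (illustrative path)
+# Point image builds at a local Pulp mirror instead of the Ark default:
+appliances_pulp_url: "http://192.168.10.157:8080"  # endpoint as printed by deploy-pulp.yml
+pulp_site_password: "{{ vault_pulp_admin_password }}"  # generated by generate-passwords.yml
+```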
diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml
index c4b01b03f..ce1666973 100644
--- a/environments/.stackhpc/inventory/group_vars/builder.yml
+++ b/environments/.stackhpc/inventory/group_vars/builder.yml
@@ -1,9 +1,14 @@
 # update_enable: false # Can uncomment for speed debugging non-update related build issues
-pulp_server_config:
-  LEAFCLOUD:
-    url: http://192.168.10.157:8080
-    password: lookup('env','LEAFCLOUD_PULP_PASSWORD')
-
-appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}"
-pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}"
-appliances_sync_pulp_on_build: false
+
+
+# Uncomment below to use CI pulp servers
+
+# pulp_server_config:
+#   LEAFCLOUD:
+#     url: http://192.168.10.157:8080
+#     password: lookup('env','LEAFCLOUD_PULP_PASSWORD')
+
+# appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}"
+# pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}"
+
+dnf_repos_username: slurm-app-ci
+dnf_repos_password: "{{ lookup('env','ARK_PASSWORD') }}"
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index f2a6723ad..e1acdf19b 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -7,7 +7,7 @@ appliances_environment_name: "{{ appliances_environment_root | basename | regex_
 appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it
 #appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform
 appliances_mode: configure
-#appliances_pulp_url: #override required
+appliances_pulp_url: https://ark.stackhpc.com

 # Address(ip/dns) for internal communication between services. This is
 # normally traffic you do not want to expose to users.
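For reference, the `dnf_repos` role defaults changed earlier in this commit compose their repo URLs from this `appliances_pulp_url`; a sketch of how one entry resolves (the architecture and distribution version shown are illustrative):

```yaml
# Illustrative composition using the role defaults shown above:
dnf_repos_pulp_content_url: "{{ appliances_pulp_url }}/pulp/content"  # -> https://ark.stackhpc.com/pulp/content
dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}"  # -> rocky/9.4
# so the baseos entry's base_url resolves to e.g.:
# https://ark.stackhpc.com/pulp/content/rocky/9.4/BaseOS/x86_64/os/<baseos timestamp>
```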
@@ -82,11 +82,13 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us ########################################################################################### -appliances_sync_pulp_on_build: true -appliances_repo_timestamps: +appliances_repo_minor_timestamps: '9.4': baseos: 20240816T002610 appstream: 20240816T002610 crb: 20240816T002610 extras: 20240816T002610 + +appliances_repo_major_timestamps: + '9': epel: 20240902T080424 diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index fbfcfa0ca..8f52477cd 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -147,9 +147,7 @@ freeipa_client [dnf_repos:children] # Hosts to replace system repos with Pulp repos -cluster builder [pulp:children] # Hosts used to run Pulp API commands -builder From 17d79241d7f76287d7208850672b1bed26ca422b Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:15:49 +0000 Subject: [PATCH 150/268] fix doca cleanup deleteing /tmp/ (#494) --- ansible/roles/doca/tasks/install.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ansible/roles/doca/tasks/install.yml b/ansible/roles/doca/tasks/install.yml index 9d297e946..d26fda79e 100644 --- a/ansible/roles/doca/tasks/install.yml +++ b/ansible/roles/doca/tasks/install.yml @@ -39,13 +39,11 @@ - name: Cleanup DOCA build directories ansible.builtin.file: state: absent - path: "{{ (_doca_kernel_repo.stdout | split('/'))[:2] | join('/') }}" + path: "{{ (_doca_kernel_repo.stdout | split('/'))[:3] | join('/') }}" # leading / means 1st element of split list is '' - name: Update initramfs ansible.builtin.command: - cmd: dracut -f --tmpdir /var/tmp - environment: - TMPDIR: /var/tmp + cmd: dracut -f register: _doca_dracut failed_when: _doca_dracut.stderr != '' # appears rc is always 0 From d6eabe69270a8a3c7b15d0eb2628bb2393dc35b5 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Dec 2024 10:18:23 +0000 Subject: [PATCH 151/268] unused var --- ansible/roles/dnf_repos/defaults/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 19a5d4986..3701305b6 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -2,7 +2,6 @@ dnf_repos_pulp_url: "{{ appliances_pulp_url }}" dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content" dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" -dnf_repos_version_timestamps: "{{ appliances_repo_minor_timestamps[ansible_distribution_version] }}" dnf_repos_username: "{{ omit }}" dnf_repos_password: "{{ omit }}" From 4a3074b9153d73f5cfc9c0c754546418f6e3b34a Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 13 Dec 2024 10:27:59 +0000 Subject: [PATCH 152/268] prototype script - hostvars no-op --- .../roles/compute_init/files/compute-init.yml | 40 +++++++++++++------ ansible/roles/compute_init/tasks/main.yml | 19 ++++++++- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index f78bbe9b7..e44ec32f8 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -5,14 +5,15 @@ become: yes # VARS TO BE SUPPLIED VIA CLOUD INIT 
METADATA vars: - control_node_ip: "172.16.1.228" + server_node_ip: "172.16.1.154" resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] - nfs_export: "/exports/home" - nfs_client_mnt_options: - nfs_client_mnt_point: "/home" - nfs_client_mnt_state: mounted - nfs_server: "{{ control_node_ip }}" + nfs_configurations: + - nfs_export: "/exports/home" + nfs_client_mnt_options: + nfs_client_mnt_point: "/home" + nfs_client_mnt_state: mounted + nfs_server: "{{ server_node_ip }}" os_manila_mount_state: mounted os_manila_mount_opts: @@ -36,7 +37,7 @@ uid: 1005 basic_users_groups: [] - openhpc_conf_server: "{{ control_node_ip }}" + openhpc_conf_server: "{{ server_node_ip }}" tasks: - name: Configure resolve.conf @@ -77,7 +78,7 @@ - name: Mount /mnt/cluster mount: path: /mnt/cluster - src: "{{ vars.control_node_ip }}:/exports/cluster" + src: "{{ vars.server_node_ip }}:/exports/cluster" fstype: nfs opts: rw,sync state: mounted @@ -90,20 +91,35 @@ group: root mode: 0644 + # - name: Include hostvars from NFS share + # block: + # - name: Extract short hostname using a shell block + # shell: | + # HOSTNAME=$(hostname) + # echo "${HOSTNAME%.test.invalid}" + # register: short_hostname + + # - name: Include vars from NFS mount + # include_vars: + # file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml" + - name: NFS client mount block: - name: ensure mount directory exists file: - path: "{{ nfs_client_mnt_point }}" + path: "{{ item.get('nfs_client_mnt_point', nfs_client_mnt_point) }}" state: directory + loop: "{{ nfs_configurations }}" - name: mount the filesystem mount: - path: "{{ nfs_client_mnt_point }}" - src: "{{ nfs_server }}:{{ nfs_export }}" + path: "{{ item.get('nfs_client_mnt_point', nfs_client_mnt_point) }}" + src: "{{ item.get('nfs_server', nfs_server) }}:{{ item.get('nfs_export', nfs_export) }}" + opts: "{{ item['nfs_client_mnt_options'] | default(nfs_client_mnt_options, true) | default(omit, true) }}" # for some reason items.get() here fails with "an incorrect mount option was specified" fstype: nfs - state: "{{ nfs_client_mnt_state }}" + state: "{{ item.get('nfs_client_mnt_state', nfs_client_mnt_state) }}" + loop: "{{ nfs_configurations }}" - name: Manila mount diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index 15ba586d1..f5513a80a 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Ensure directories exist - file: + file: path: "/etc/ansible-init/{{ item }}" state: directory owner: root @@ -122,6 +122,23 @@ owner: munge group: munge mode: 0400 + + # - name: Ensure /exports/cluster/inventory_hostname directory exists + # file: + # path: /exports/cluster/{{ inventory_hostname }} + # state: directory + # owner: root + # group: root + # mode: 0755 + + # - name: Template hostvars + # template: + # src: ../templates/hostvars.j2 + # dest: "/exports/cluster/{{ inventory_hostname }}/hostvars.yml" + # owner: root + # group: root + # mode: 0644 + delegate_to: "{{ groups['control'] | first }}" - name: Inject compute initialisation playbook From 5a082e7d8584a7373f60a2de9208b0c08bdc5fd9 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:44:36 +0000 Subject: [PATCH 153/268] Fix nightly images getting timestamp/git hash (#493) --- .github/workflows/nightlybuild.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index a0e78cd0b..9cb1cea27 100644 --- 
a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -83,6 +83,7 @@ jobs: -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ -var "image_name=${{ matrix.build.image_name }}" \ + -var "image_name_version="\ -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl From 91fe707db39d47436b84c1c63b07a22e2aa606e1 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:08:30 +0000 Subject: [PATCH 154/268] Update nightlybuild.yml --- .github/workflows/nightlybuild.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 9cb1cea27..596b85a05 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -83,7 +83,7 @@ jobs: -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ -var "image_name=${{ matrix.build.image_name }}" \ - -var "image_name_version="\ + -var "image_name_version=" \ -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl From e3ce4926622f3a5fabd38eb704afd2dec4048cbe Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 13 Dec 2024 10:40:31 +0000 Subject: [PATCH 155/268] use k3s_server metadata for server_ip --- ansible/roles/compute_init/files/compute-init.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index e44ec32f8..164aab8e3 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -3,9 +3,10 @@ - name: Compute node initialisation hosts: localhost become: yes - # VARS TO BE SUPPLIED VIA CLOUD INIT METADATA vars: - server_node_ip: "172.16.1.154" + os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" + server_node_ip: "{{ os_metadata.meta.k3s_server }}" + resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] nfs_configurations: @@ -78,7 +79,7 @@ - name: Mount /mnt/cluster mount: path: /mnt/cluster - src: "{{ vars.server_node_ip }}:/exports/cluster" + src: "{{ server_node_ip }}:/exports/cluster" fstype: nfs opts: rw,sync state: mounted From f0e48b90a36b16ff6e5c80740ab07587d5fcd467 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Dec 2024 13:24:27 +0000 Subject: [PATCH 156/268] pulp sync now mirrors upstream subpaths --- ansible/roles/pulp_site/.gitignore | 1 + ansible/roles/pulp_site/defaults/main.yml | 22 ++++++++----------- .../filter_plugins/pulp-list-filters.py | 6 ++--- 3 files changed, 13 insertions(+), 16 deletions(-) create mode 100644 ansible/roles/pulp_site/.gitignore diff --git a/ansible/roles/pulp_site/.gitignore b/ansible/roles/pulp_site/.gitignore new file mode 100644 index 000000000..6738e49c1 --- /dev/null +++ b/ansible/roles/pulp_site/.gitignore @@ -0,0 +1 @@ +filter_plugins/__pycache__ \ No newline at end of file diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index c342ea46f..76ad14988 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -1,9 +1,10 @@ pulp_site_url: "{{ appliances_pulp_url }}" pulp_site_port: 8080 pulp_site_username: admin # shouldn't be changed +pulp_site_upstream_content_url: 
https://ark.stackhpc.com/pulp/content pulp_site_upstream_username: slurm-app-ci pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" -pulp_site_default_upstream_prefix: "https://ark.stackhpc.com/pulp/content/{{ pulp_site_target_distribution }}/{{ pulp_site_target_distribution_version }}" +_pulp_site_rocky_prefix: "{{ pulp_site_target_distribution }}/{{ pulp_site_target_distribution_version }}" pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" pulp_site_password: "{{ vault_pulp_admin_password }}" pulp_site_validate_certs: false @@ -19,20 +20,15 @@ pulp_site_major_version_timestamps: "{{ appliances_repo_major_timestamps[pulp_si pulp_site_rpm_info: - name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" - url: "{{ pulp_site_default_upstream_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}" - base_path: "rocky/{{ pulp_site_target_distribution_version }}/baseos/{{ pulp_site_version_timestamps.baseos }}" + subpath: "{{ _pulp_site_rocky_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}" - name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" - url: "{{ pulp_site_default_upstream_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}" - base_path: "rocky/{{ pulp_site_target_distribution_version }}/appstream/{{ pulp_site_version_timestamps.appstream }}" + subpath: "{{ _pulp_site_rocky_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}" - name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" - url: "{{ pulp_site_default_upstream_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}" - base_path: "rocky/{{ pulp_site_target_distribution_version }}/crb/{{ pulp_site_version_timestamps.crb }}" + subpath: "{{ _pulp_site_rocky_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}" - name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" - url: "{{ pulp_site_default_upstream_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" - base_path: "rocky/{{ pulp_site_target_distribution_version }}/extras/{{ pulp_site_version_timestamps.extras }}" -- name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" - url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_major_version_timestamps.epel }}" - base_path: "epel/{{ pulp_site_target_distribution_version }}/{{ pulp_site_version_timestamps.epel }}" + subpath: "{{ _pulp_site_rocky_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" +- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ pulp_site_major_version_timestamps.epel }}" + subpath: "epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_major_version_timestamps.epel }}" pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" @@ 
-42,6 +38,6 @@ pulp_site_rpm_repo_defaults: _pulp_site_rpm_info_all: "{{ pulp_site_rpm_info | map('combine', pulp_site_rpm_repo_defaults) }}" -pulp_site_rpm_repos: "{{ _pulp_site_rpm_info_all | to_rpm_repos }}" +pulp_site_rpm_repos: "{{ _pulp_site_rpm_info_all | to_rpm_repos(pulp_site_upstream_content_url) }}" pulp_site_rpm_publications: "{{ _pulp_site_rpm_info_all | to_rpm_pubs }}" pulp_site_rpm_distributions: "{{ _pulp_site_rpm_info_all | to_rpm_distros }}" diff --git a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py index 94d89d184..50e912685 100644 --- a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py +++ b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py @@ -6,10 +6,10 @@ def filters(self): 'to_rpm_distros': self.to_rpm_distros } - def to_rpm_repos(self, list): + def to_rpm_repos(self, list, pulp_url): repo_list = map(lambda x: { 'name': x['name'], - 'url': x['url'], + 'url': pulp_url+'/'+x['subpath'], 'remote_username': x['remote_username'], 'remote_password': x['remote_password'], 'policy': x['policy'], @@ -26,6 +26,6 @@ def to_rpm_distros(self, list): distro_list = map(lambda x: { 'name': x['name'], 'repository': x['name'], - 'base_path': x['base_path'], + 'base_path': x['subpath'], 'state': x['state'] }, list) return distro_list \ No newline at end of file From 309bd0bd659a73a73aa3152cdcba56b32cc261a0 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Dec 2024 13:30:41 +0000 Subject: [PATCH 157/268] removed intermediate var --- ansible/roles/dnf_repos/defaults/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 3701305b6..281a57c7e 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -1,5 +1,4 @@ -dnf_repos_pulp_url: "{{ appliances_pulp_url }}" -dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content" +dnf_repos_pulp_content_url: "{{ appliances_pulp_url }}/pulp/content" dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" dnf_repos_username: "{{ omit }}" From 9065bb6d98c45170b82e8f772254e4b5cd63aa78 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Dec 2024 14:13:09 +0000 Subject: [PATCH 158/268] bumped repo timestamps to latest --- .../common/inventory/group_vars/all/defaults.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index e1acdf19b..e84f1e6d1 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -84,11 +84,11 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us appliances_repo_minor_timestamps: '9.4': - baseos: 20240816T002610 - appstream: 20240816T002610 - crb: 20240816T002610 - extras: 20240816T002610 + baseos: 20241115T011711 + appstream: 20241112T003151 + crb: 20241115T003133 + extras: 20241118T002802 appliances_repo_major_timestamps: '9': - epel: 20240902T080424 + epel: 20241213T010218 From 7d7bc7376fa81077e955dddc1a5a98eaf8956c62 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:35:56 +0000 Subject: [PATCH 159/268] bump images --- 
.../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 5c100f999..8659f3e90 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241212-1553-c74360bf", - "RL9": "openhpc-RL9-241212-1554-c74360bf" + "RL8": "openhpc-RL8-241213-1402-a2a705c9", + "RL9": "openhpc-RL9-241213-1402-a2a705c9" } } From cc81aeff7f57b265b8dcf51beb8316ae25eeeb3d Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:48:21 +0000 Subject: [PATCH 160/268] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 5c100f999..125180527 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241212-1553-c74360bf", - "RL9": "openhpc-RL9-241212-1554-c74360bf" + "RL8": "openhpc-RL8-241213-1416-9065bb6d", + "RL9": "openhpc-RL9-241213-1417-9065bb6d" } } From f343f67395a143493f2b48c5638c2b7a2e4101b3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Dec 2024 15:02:01 +0000 Subject: [PATCH 161/268] moved to later in build/site and moved groups --- ansible/bootstrap.yml | 9 --------- ansible/fatimage.yml | 2 ++ ansible/packages.yml | 10 ++++++++++ ansible/site.yml | 1 + environments/common/inventory/groups | 4 +--- environments/common/layouts/everything | 4 ++++ 6 files changed, 18 insertions(+), 12 deletions(-) create mode 100644 ansible/packages.yml diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 432a2a319..733d4b3f8 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -216,15 +216,6 @@ msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details" when: "update_enable | default('false') | bool" -- hosts: extra_packages - become: yes - tags: - - extra_packages - tasks: - dnf: - - name: "{{ appliances_extra_packages }}" - when: appliances_mode != 'configure' or appliances_packages_during_configure - - hosts: - selinux - update diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index b28e4f308..c40aca6fd 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -199,6 +199,8 @@ name: cloudalchemy.grafana tasks_from: install.yml +- ansible.builtin.import_playbook: packages.yml + - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/packages.yml b/ansible/packages.yml new file mode 100644 index 000000000..e447dcda7 --- /dev/null +++ b/ansible/packages.yml @@ -0,0 +1,10 @@ + +- hosts: extra_packages + become: yes + tags: + - extra_packages + tasks: + - name: Install additional packages + dnf: + name: "{{ appliances_extra_packages }}" + when: appliances_mode != 'configure' or appliances_packages_during_configure diff --git a/ansible/site.yml b/ansible/site.yml index bb379399d..878b15a35 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -27,6 +27,7 @@ - import_playbook: slurm.yml - import_playbook: 
portal.yml
- import_playbook: monitoring.yml
+- import_playbook: packages.yml

- name: Run post.yml hook
  vars:
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index d8ad503fe..2a6244962 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -145,7 +145,5 @@ freeipa_client
 [lustre]
 # Hosts to run lustre client

-[extra_packages:children]
+[extra_packages]
 # Hosts to install specified additional packages on
-cluster
-builder
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index ba5cbc08d..6f6f63590 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -92,3 +92,7 @@ control

 [lustre]
 # Hosts to run lustre client
+
+[extra_packages:children]
+# Hosts to install specified additional packages on
+cluster

From 07ed8223147d4cfb40fbf557766920fea02b5260 Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Fri, 13 Dec 2024 16:11:44 +0000
Subject: [PATCH 162/268] compute init node condition based off metadata

---
 .../roles/compute_init/files/compute-init.yml | 19 +++++++++++++++++--
 environments/common/layouts/everything | 2 +-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
index 164aab8e3..3e9c6e470 100644
--- a/ansible/roles/compute_init/files/compute-init.yml
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -5,6 +5,7 @@
   become: yes
   vars:
     os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
+    iam_slurm_compute: "{{ os_metadata.meta.slurm_compute | default(false) }}"
     server_node_ip: "{{ os_metadata.meta.k3s_server }}"
@@ -41,6 +42,11 @@
     openhpc_conf_server: "{{ server_node_ip }}"

   tasks:
+    - name: Skip initialization if slurm_compute metadata set to false
+      debug:
+        msg: "Skipping compute initialization"
+      when: not iam_slurm_compute | bool
+
     - name: Configure resolve.conf
       block:
         - name: Set nameservers in /etc/resolv.conf
@@ -65,7 +71,9 @@
             name: NetworkManager
             state: reloaded
           when: _copy_nm_config.changed | default(false)
-      when: resolv_conf_nameservers is defined and resolv_conf_nameservers | length > 0
+      when:
+        - resolv_conf_nameservers is defined and resolv_conf_nameservers | length > 0
+        - iam_slurm_compute | bool

     - name: Mount /mnt/cluster on compute nodes and copy hosts to /etc/hosts
@@ -91,6 +99,7 @@
           owner: root
          group: root
          mode: 0644
+      when: iam_slurm_compute | bool

     # - name: Include hostvars from NFS share
     #   block:
     #   - name: Extract short hostname using a shell block
     #     shell: |
     #       HOSTNAME=$(hostname)
     #       echo "${HOSTNAME%.test.invalid}"
     #     register: short_hostname

     #   - name: Include vars from NFS mount
     #     include_vars:
     #       file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml"
+    #   when: iam_slurm_compute | bool

     - name: NFS client mount
@@ -121,6 +131,7 @@
           fstype: nfs
           state: "{{ item.get('nfs_client_mnt_state', nfs_client_mnt_state) }}"
         loop: "{{ nfs_configurations }}"
+      when: iam_slurm_compute | bool

     - name: Manila mount
@@ -216,6 +229,7 @@
         loop_control:
           label: "{{ item.share_name }}"
         when: item.mount_state | default(os_manila_mount_state) in ['mounted', 'ephemeral']
+      when: iam_slurm_compute | bool

     - name: Basic users setup
@@ -229,6 +241,7 @@
         loop_control:
           label: "{{ item.name }}"
         when: "'sudo' in item"
+      when: iam_slurm_compute | bool

     - name: Configure EESSI
@@ -245,6 +258,7 @@
         - name: Ensure CVMFS config is setup
           command:
             cmd: "cvmfs_config setup"
+      when: iam_slurm_compute | bool

     - name: Configure openhpc
@@ -285,4 +299,5 @@
service:
         name: slurmd
         enabled: true
-        state: started
\ No newline at end of file
+        state: started
+      when: iam_slurm_compute | bool
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index 2d55c18cf..5ada017e1 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -84,7 +84,7 @@ cluster

 [compute_init:children]
 # Hosts to deploy compute initialisation ansible-init script to.
-compute
+cluster

 [k3s:children]
 # Hosts to run k3s server/agent

From a43a5f97ee62a2ef0283dee3db3df10e34429333 Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Fri, 13 Dec 2024 17:11:17 +0000
Subject: [PATCH 163/268] fail gracefully when NFS server not up

---
 .../roles/compute_init/files/compute-init.yml | 53 +++++++++++++------
 1 file changed, 37 insertions(+), 16 deletions(-)

diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
index 3e9c6e470..9b098fc14 100644
--- a/ansible/roles/compute_init/files/compute-init.yml
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -91,6 +91,13 @@
           fstype: nfs
           opts: rw,sync
           state: mounted
+        register: nfs_mount_result
+        ignore_errors: true
+
+      - name: Fail gracefully if NFS mount is not available
+        debug:
+          msg: "NFS mount failed. Skipping compute initialization. Re-image if this persists."
+        when: nfs_mount_result.failed

       - name: Copy /mnt/cluster/hosts contents to /etc/hosts
         copy:
@@ -99,20 +106,24 @@
           owner: root
           group: root
           mode: 0644
+        when: not nfs_mount_result.failed
       when: iam_slurm_compute | bool

     - name: Include hostvars from NFS share
       block:
       - name: Extract short hostname using a shell block
         shell: |
           HOSTNAME=$(hostname)
           echo "${HOSTNAME%.test.invalid}"
         register: short_hostname

-      - name: Include vars from NFS mount
-        include_vars:
-          file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml"
+      # - name: Include vars from NFS mount
+      #   include_vars:
+      #     file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml"
       when:
         - iam_slurm_compute | bool
+        - not nfs_mount_result.failed

     - name: NFS client mount
@@ -131,7 +142,9 @@
           fstype: nfs
           state: "{{ item.get('nfs_client_mnt_state', nfs_client_mnt_state) }}"
         loop: "{{ nfs_configurations }}"
-      when: iam_slurm_compute | bool
+      when:
+        - iam_slurm_compute | bool
+        - not nfs_mount_result.failed

     - name: Manila mount
@@ -216,7 +229,9 @@
         loop_control:
           label: "{{ item.share_name }}"
         when: item.mount_state | default(os_manila_mount_state) in ['mounted', 'ephemeral']
-      when: iam_slurm_compute | bool
+      when:
+        - iam_slurm_compute | bool
+        - not nfs_mount_result.failed

     - name: Basic users setup
@@ -241,7 +256,9 @@
         loop_control:
           label: "{{ item.name }}"
         when: "'sudo' in item"
-      when: iam_slurm_compute | bool
+      when:
+        - iam_slurm_compute | bool
+        - not nfs_mount_result.failed

     - name: Configure EESSI
@@ -258,7 +275,9 @@
         - name: Ensure CVMFS config is setup
           command:
             cmd: "cvmfs_config setup"
-      when: iam_slurm_compute | bool
+      when:
+        - iam_slurm_compute | bool
+        - not nfs_mount_result.failed

     - name: Configure openhpc
@@ -300,4 +319,6 @@
           name: slurmd
           enabled: true
           state: started
-      when: iam_slurm_compute | bool
+      when:
+        - iam_slurm_compute | bool
+        - not nfs_mount_result.failed
\ No newline at end of file
From
76f292e2da4d578a92434da90c98783b69aa7398 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 13 Dec 2024 17:20:34 +0000 Subject: [PATCH 164/268] rejoin node to cluster --- ansible/roles/compute_init/files/compute-init.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 9b098fc14..165700668 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -118,9 +118,9 @@ echo "${HOSTNAME%.test.invalid}" register: short_hostname - - name: Include vars from NFS mount - include_vars: - file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml" + # - name: Include vars from NFS mount + # include_vars: + # file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml" when: - iam_slurm_compute | bool - not nfs_mount_result.failed @@ -319,6 +319,9 @@ name: slurmd enabled: true state: started + + - name: Ensure node is in cluster + command: scontrol update state=resume nodename={{ short_hostname.stdout }} when: - iam_slurm_compute | bool - not nfs_mount_result.failed \ No newline at end of file From 1a400db7a5a73eb73c8707e384ceafa7b6f5f544 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 13 Dec 2024 20:38:19 +0000 Subject: [PATCH 165/268] ok: Skipping compute initialization as metadata compute_groups is empty --- ansible/extras.yml | 17 +- .../roles/compute_init/files/compute-init.yml | 385 ++++-------------- ansible/roles/compute_init/tasks/export.yml | 35 ++ ansible/roles/compute_init/tasks/install.yml | 73 ++++ ansible/roles/compute_init/tasks/main.yml | 8 - .../compute_init/templates/hostvars.yml.j2 | 1 + environments/common/layouts/everything | 5 +- 7 files changed, 197 insertions(+), 327 deletions(-) create mode 100644 ansible/roles/compute_init/tasks/export.yml create mode 100644 ansible/roles/compute_init/tasks/install.yml create mode 100644 ansible/roles/compute_init/templates/hostvars.yml.j2 diff --git a/ansible/extras.yml b/ansible/extras.yml index 4cbe931b1..85e068e89 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -37,13 +37,26 @@ - import_role: name: persist_hostkeys -- name: Inject ansible-init compute script +# TODO: I'm not convinced this is the right place +- hosts: compute_init:!builder + tags: compute_init + become: yes + name: Export hostvars + tasks: + - include_role: + name: compute_init + tasks_from: export.yml + +# TODO: really this should only run during build +# but handy not to for debugging +- name: Install compute_init script hosts: compute_init tags: compute_init become: yes tasks: - - import_role: + - include_role: name: compute_init + tasks_from: install.yml - name: Install k9s become: yes diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 165700668..53071cc48 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -5,323 +5,78 @@ become: yes vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" - iam_slurm_compute: "{{ os_metadata.meta.slurm_compute | default(false) }}" server_node_ip: "{{ os_metadata.meta.k3s_server }}" - - resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] - - nfs_configurations: - - nfs_export: "/exports/home" - nfs_client_mnt_options: - nfs_client_mnt_point: "/home" - nfs_client_mnt_state: mounted - nfs_server: "{{ server_node_ip }}" - - 
os_manila_mount_state: mounted - os_manila_mount_opts: - - x-systemd.device-timeout=30 - - x-systemd.mount-timeout=30 - - noatime - - _netdev # prevents mount blocking early boot before networking available - - rw - os_manila_mount_ceph_conf_path: /etc/ceph - - basic_users_manage_homedir: false - basic_users_userdefaults: - state: present - create_home: "{{ basic_users_manage_homedir }}" - generate_ssh_key: "{{ basic_users_manage_homedir }}" - ssh_key_comment: "{{ item.name }}" - test_user_password: "zXpcWyGQL7jtZnqylQra4g==" - basic_users_users: - - name: testuser # can't use rocky as $HOME isn't shared! - password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent - uid: 1005 - basic_users_groups: [] - - openhpc_conf_server: "{{ server_node_ip }}" + compute_groups: "{{ os_metadata.meta.compute_groups | default([]) }}" + + # TODO: "role defaults" + resolv_conf_nameservers: [] + + # nfs_configurations: + # - nfs_export: "/exports/home" + # nfs_client_mnt_options: + # nfs_client_mnt_point: "/home" + # nfs_client_mnt_state: mounted + # nfs_server: "{{ server_node_ip }}" + + # os_manila_mount_state: mounted + # os_manila_mount_opts: + # - x-systemd.device-timeout=30 + # - x-systemd.mount-timeout=30 + # - noatime + # - _netdev # prevents mount blocking early boot before networking available + # - rw + # os_manila_mount_ceph_conf_path: /etc/ceph + + # basic_users_manage_homedir: false + # basic_users_userdefaults: + # state: present + # create_home: "{{ basic_users_manage_homedir }}" + # generate_ssh_key: "{{ basic_users_manage_homedir }}" + # ssh_key_comment: "{{ item.name }}" + # test_user_password: "zXpcWyGQL7jtZnqylQra4g==" + # basic_users_users: + # - name: testuser # can't use rocky as $HOME isn't shared! 
+ # password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent + # uid: 1005 + # basic_users_groups: [] + + # openhpc_conf_server: "{{ server_node_ip }}" tasks: - - name: Skip initialization if slurm_compute metadata set to false - debug: - msg: "Skipping compute initialization" - when: not iam_slurm_compute | bool - - - name: Configure resolve.conf - block: - - name: Set nameservers in /etc/resolv.conf - ansible.builtin.template: - src: /etc/ansible-init/templates/resolv.conf.j2 - dest: /etc/resolv.conf - owner: root - group: root - mode: u=rw,og=r - - - name: Disable NetworkManager control of resolv.conf - ansible.builtin.copy: - src: /etc/ansible-init/files/NetworkManager-dns-none.conf - dest: /etc/NetworkManager/conf.d/90-dns-none.conf - owner: root - group: root - mode: u=rw,og=r - register: _copy_nm_config - - - name: Reload NetworkManager - ansible.builtin.systemd: - name: NetworkManager - state: reloaded - when: _copy_nm_config.changed | default(false) - when: - - resolv_conf_nameservers is defined and resolv_conf_nameservers | length > 0 - - iam_slurm_compute | bool - - - - name: Mount /mnt/cluster on compute nodes and copy hosts to /etc/hosts - block: - - name: Ensure the mount directory exists - file: - path: /mnt/cluster - state: directory - mode: 0755 + - block: + - name: Report skipping initialization if not compute node + # meta: end_play produces no output + debug: + msg: "Skipping compute initialization as metadata compute_groups is empty" - - name: Mount /mnt/cluster - mount: - path: /mnt/cluster - src: "{{ server_node_ip }}:/exports/cluster" - fstype: nfs - opts: rw,sync - state: mounted - register: nfs_mount_result - ignore_errors: true - - - name: Fail gracefully if NFS mount is not available + - meta: end_play + when: compute_groups | length == 0 + + - name: Ensure the mount directory exists + file: + path: /mnt/cluster + state: directory + owner: root + group: root + mode: u=rwX,go= # is sensitive + + - name: Mount /mnt/cluster + mount: + path: /mnt/cluster + src: "{{ server_node_ip }}:/exports/cluster" + fstype: nfs + opts: ro,sync + state: mounted + register: nfs_mount_result + ignore_errors: true + register: _mount_mnt_cluster + # TODO: add some retries here? + + - block: + - name: Report skipping initialization if cannot mount nfs + # meta: end_play produces no output debug: - msg: "NFS mount failed. Skipping compute initialization. Re-image if this persists." 
- when: nfs_mount_result.failed - - - name: Copy /mnt/cluster/hosts contents to /etc/hosts - copy: - src: /mnt/cluster/hosts - dest: /etc/hosts - owner: root - group: root - mode: 0644 - when: not nfs_mount_result.failed - when: iam_slurm_compute | bool - - - - name: Include hostvars from NFS share - block: - - name: Extract short hostname using a shell block - shell: | - HOSTNAME=$(hostname) - echo "${HOSTNAME%.test.invalid}" - register: short_hostname - - # - name: Include vars from NFS mount - # include_vars: - # file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml" - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed - - - - name: NFS client mount - block: - - name: ensure mount directory exists - file: - path: "{{ item.get('nfs_client_mnt_point', nfs_client_mnt_point) }}" - state: directory - loop: "{{ nfs_configurations }}" - - - name: mount the filesystem - mount: - path: "{{ item.get('nfs_client_mnt_point', nfs_client_mnt_point) }}" - src: "{{ item.get('nfs_server', nfs_server) }}:{{ item.get('nfs_export', nfs_export) }}" - opts: "{{ item['nfs_client_mnt_options'] | default(nfs_client_mnt_options, true) | default(omit, true) }}" # for some reason items.get() here fails with "an incorrect mount option was specified" - fstype: nfs - state: "{{ item.get('nfs_client_mnt_state', nfs_client_mnt_state) }}" - loop: "{{ nfs_configurations }}" - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed - - - - name: Manila mount - block: - - name: Read manila share info from nfs file - slurp: - src: "/mnt/cluster/manila_share_info.yml" - register: manila_share_info_file - no_log: true - - - name: Parse and set fact for manila share info - set_fact: - os_manila_mount_share_info: "{{ manila_share_info_file.content | b64decode | from_yaml }}" - - - name: Read manila shares from nfs file - slurp: - src: "/mnt/cluster/manila_shares.yml" - register: manila_shares_file - - - name: Parse and set fact for manila shares - set_fact: - os_manila_mount_shares: "{{ manila_shares_file.content | b64decode | from_yaml }}" - - - name: Ensure Ceph configuration directory exists - ansible.builtin.file: - path: "{{ os_manila_mount_ceph_conf_path }}" - state: directory - mode: "0755" - owner: root - group: root - - - name: Configure ceph.conf using os_manila_mount_host - ansible.builtin.template: - src: /etc/ansible-init/templates/ceph.conf.j2 - dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.conf" - owner: root - group: root - mode: "0600" - - - name: Ensure mount directory exists - ansible.builtin.file: - path: "{{ item.mount_path }}" - state: directory - owner: "{{ item.mount_user | default(omit) }}" - group: "{{ item.mount_group | default(omit) }}" - mode: "{{ item.mount_mode | default(omit) }}" - loop: "{{ os_manila_mount_shares }}" - loop_control: - label: "{{ item.share_name }}" - - - name: Write Ceph client keyring - ansible.builtin.template: - src: /etc/ansible-init/templates/ceph.keyring.j2 - dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.client.{{ item.share_user }}.keyring" - mode: "0600" - owner: root - group: root - loop: "{{ os_manila_mount_share_info }}" - loop_control: - label: "{{ item.share_name }}" - - - name: Mount the Ceph share - ansible.posix.mount: - path: "{{ item[0].mount_path }}" - src: "{{ item[1].host }}:{{ item[1].export }}" - fstype: ceph - opts: "name={{ item[1].share_user }},{{ (item[0].mount_opts | default(os_manila_mount_opts)) | join(',') }}" - # NB share_user is looked up here in case of autodetection - state: "{{ item[0].mount_state | 
default(os_manila_mount_state) }}" - loop: "{{ os_manila_mount_shares | zip(os_manila_mount_share_info) }}" - loop_control: - label: "{{ item[0].share_name }}" - - - name: Ensure mounted directory has correct permissions - ansible.builtin.file: - path: "{{ item.mount_path }}" - state: directory - owner: "{{ item.mount_user | default(omit) }}" - group: "{{ item.mount_group | default(omit) }}" - mode: "{{ item.mount_mode | default(omit) }}" - loop: "{{ os_manila_mount_shares }}" - loop_control: - label: "{{ item.share_name }}" - when: item.mount_state | default(os_manila_mount_state) in ['mounted' or 'ephemeral'] - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed - - - - name: Basic users setup - block: - - name: Create groups - ansible.builtin.group: "{{ item }}" - loop: "{{ basic_users_groups }}" - - - name: Create users - user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}" - loop: "{{ basic_users_users }}" - loop_control: - label: "{{ item.name }} [{{ item.state | default('present') }}]" - register: basic_users_info - - - name: Write sudo rules - blockinfile: - path: /etc/sudoers.d/80-{{ item.name}}-user - block: "{{ item.sudo }}" - create: true - loop: "{{ basic_users_users }}" - loop_control: - label: "{{ item.name }}" - when: "'sudo' in item" - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed - - - - name: Configure EESSI - block: - - name: Copy /mnt/cluster/cvmfs/default.local contents to /etc/cvmfs/default.local - copy: - src: /mnt/cluster/cvmfs/default.local - dest: /etc/cvmfs/default.local - owner: root - group: root - mode: 0644 - - # NOTE: Not clear how to make this idempotent - - name: Ensure CVMFS config is setup - command: - cmd: "cvmfs_config setup" - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed - - - - name: Configure openhpc - block: - - name: Fix permissions on /etc to pass Munge startup checks - # Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root) - # which fails munged startup checks - file: - path: /etc - state: directory - mode: g-w - - - name: Copy Munge key from NFS-mounted directory to /etc/munge - copy: - src: "/mnt/cluster/openhpc_munge.key" - dest: "/etc/munge/munge.key" - owner: munge - group: munge - mode: 0400 - - - name: Set slurmctld location for configless operation - lineinfile: - path: /etc/sysconfig/slurmd - line: "SLURMD_OPTIONS='--conf-server {{ openhpc_conf_server }}'" - regexp: "^SLURMD_OPTIONS=" - create: yes - owner: root - group: root - mode: 0644 - - - name: Configure Munge service - service: - name: munge - enabled: true - state: started - - - name: Ensure slurmd state - service: - name: slurmd - enabled: true - state: started - - - name: Ensure node is in cluster - command: scontrol update state=resume nodename={{ short_hostname.stdout }} - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed \ No newline at end of file + msg: "Skipping compute initialization as cannot mount exports/cluster share" + + - meta: end_play + when: _mount_mnt_cluster.failed diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml new file mode 100644 index 000000000..3e9340cb5 --- /dev/null +++ b/ansible/roles/compute_init/tasks/export.yml @@ -0,0 +1,35 @@ +- name: Ensure the /exports/cluster directory exists + file: + path: /exports/cluster + state: directory + owner: root + group: root + mode: u=rwX,go= + run_once: true + delegate_to: "{{ groups['control'] | first }}" 
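+
+# NB: run_once plus delegate_to keeps the whole export tree on the control
+# node; the hostvars tasks below are also delegated there, but run per host.
+# The nfs role then serves this path; a sketch of the resulting export entry
+# (the client range and options are an assumption, they are site-specific):
+#
+#   /exports/cluster *(ro,sync)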
+ +- name: Copy /etc/hosts to /exports/cluster + copy: + src: /etc/hosts + dest: /exports/cluster/hosts + owner: root + group: root + mode: u=rw,go= + remote_src: true + run_once: true + delegate_to: "{{ groups['control'] | first }}" + +- name: Create hostvars directory + file: + path: /exports/cluster/hostvars/{{ inventory_hostname }}/ + state: directory + mode: u=rwX,go= + # TODO: owner,mode,etc + delegate_to: "{{ groups['control'] | first }}" + +- name: Template out hostvars + template: + src: hostvars.yml.j2 + dest: /exports/cluster/hostvars/{{ inventory_hostname }}/hostvars.yml + mode: u=rw,go= + delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml new file mode 100644 index 000000000..c48ec612d --- /dev/null +++ b/ansible/roles/compute_init/tasks/install.yml @@ -0,0 +1,73 @@ +--- + +- name: Ensure directories exist + file: + path: "/etc/ansible-init/{{ item }}" + state: directory + owner: root + group: root + mode: 0755 + loop: + - templates + - files + - library + - filter_plugins + +- name: Inject templates + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/templates/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../resolv_conf/templates/resolv.conf.j2 + - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 + - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 + +- name: Inject files + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/files/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../resolv_conf/files/NetworkManager-dns-none.conf + +- name: Inject libraries + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/library/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../stackhpc.os-manila-mount/library/os_manila_share.py + +- name: Inject filter_plugins + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/filter_plugins/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../basic_users/filter_plugins/filter_keys.py + +- name: Add filter_plugins ansible.cfg + lineinfile: + path: /etc/ansible-init/ansible.cfg + line: "filter_plugins = /etc/ansible-init/filter_plugins" + state: present + owner: root + group: root + mode: 0644 + +- name: Inject compute initialisation playbook + copy: + src: compute-init.yml + dest: /etc/ansible-init/playbooks/1-compute-init.yml + owner: root + group: root + mode: 0644 \ No newline at end of file diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index f5513a80a..cb4c57d35 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -140,11 +140,3 @@ # mode: 0644 delegate_to: "{{ groups['control'] | first }}" - -- name: Inject compute initialisation playbook - copy: - src: compute-init.yml - dest: /etc/ansible-init/playbooks/compute-init.yml - owner: root - group: root - mode: 0644 \ No newline at end of file diff --git a/ansible/roles/compute_init/templates/hostvars.yml.j2 b/ansible/roles/compute_init/templates/hostvars.yml.j2 new file mode 100644 index 000000000..7d4351b44 --- /dev/null +++ b/ansible/roles/compute_init/templates/hostvars.yml.j2 @@ -0,0 +1 @@ +{{ hostvars[inventory_hostname] | to_nice_json }} \ No newline at end of file diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 5ada017e1..0fc447cf5 100644 --- a/environments/common/layouts/everything +++ 
b/environments/common/layouts/everything @@ -83,8 +83,9 @@ openhpc cluster [compute_init:children] -# Hosts to deploy compute initialisation ansible-init script to. -cluster +# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on +# TODO: actually should be empty for now +compute [k3s:children] # Hosts to run k3s server/agent From c9ebd482da31417a8db47a4101d2964fdebf75bf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 13 Dec 2024 20:53:21 +0000 Subject: [PATCH 166/268] compute-init stage 1 working --- ansible/extras.yml | 2 +- docs/experimental/compute-init.md | 100 ++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 docs/experimental/compute-init.md diff --git a/ansible/extras.yml b/ansible/extras.yml index 85e068e89..e615b1605 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -48,7 +48,7 @@ tasks_from: export.yml # TODO: really this should only run during build -# but handy not to for debugging +# but handy not to for debugging without build - name: Install compute_init script hosts: compute_init tags: compute_init diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md new file mode 100644 index 000000000..1e3f07c1f --- /dev/null +++ b/docs/experimental/compute-init.md @@ -0,0 +1,100 @@ + +To develop/debug this without actually having to build an image: + +On deploy host: + + .stackhpc/ (venv) [rocky@steveb-dev slurm-app-rl9]$ ansible-playbook ansible/extras.yml --tags compute_init + +On compute node: + + [root@rl9-compute-0 rocky]# rm /var/lib/ansible-init.done + [root@rl9-compute-0 rocky]# systemctl restart ansible-init + [root@rl9-compute-0 rocky]# systemctl status ansible-init + + +Without any metadata: + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: activating (start) since Fri 2024-12-13 20:41:16 UTC; 1min 45s ago + Main PID: 16089 (ansible-init) + Tasks: 8 (limit: 10912) + Memory: 99.5M + CPU: 11.687s + CGroup: /system.slice/ansible-init.service + ├─16089 /usr/lib/ansible-init/bin/python /usr/bin/ansible-init + ├─16273 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml + ├─16350 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml + ├─16361 /bin/sh -c "/usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py && sleep 0" + ├─16362 /usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py + ├─16363 /usr/bin/mount /mnt/cluster + └─16364 /sbin/mount.nfs 192.168.10.12:/exports/cluster /mnt/cluster -o ro,sync + + Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] + Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Report skipping initialization if not compute node] ********************** + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Ensure 
the mount directory exists] *************************************** + Dec 13 20:41:25 rl9-compute-0.rl9.invalid python3[16346]: ansible-file Invoked with path=/mnt/cluster state=directory owner=root group=root mode=u=rwX,go= recurse=False force=False follow=True modification_time_format=%Y%m%d%H%M.%S access> + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: changed: [127.0.0.1] + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Mount /mnt/cluster] ****************************************************** + Dec 13 20:41:26 rl9-compute-0.rl9.invalid python3[16362]: ansible-mount Invoked with path=/mnt/cluster src=192.168.10.12:/exports/cluster fstype=nfs opts=ro,sync state=mounted boot=True dump=0 passno=0 backup=False fstab=None + [root@rl9-compute-0 rocky]# systemctl status ansible-init + +Added metadata via horizon: + + compute_groups ["compute"] + + +OK: + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: active (exited) since Fri 2024-12-13 20:43:31 UTC; 33s ago + Process: 16089 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) + Main PID: 16089 (code=exited, status=0/SUCCESS) + CPU: 13.003s + + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] => { + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: "msg": "Skipping compute initialization as cannot mount exports/cluster share" + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: } + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: PLAY RECAP ********************************************************************* + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: 127.0.0.1 : ok=4 changed=1 unreachable=0 failed=0 skipped=1 rescued=0 ignored=1 + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] executing remote playbooks for stage - post + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] writing sentinel file /var/lib/ansible-init.done + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] ansible-init completed successfully + Dec 13 20:43:31 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. 
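
The same metadata can also be set from the CLI rather than Horizon - a
sketch, assuming the OpenStack client is available and using this server's
name:

    openstack server set --property compute_groups='["compute"]' rl9-compute-0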
+ +Now run site.yml, then restart ansible-init again: + + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: active (exited) since Fri 2024-12-13 20:50:10 UTC; 11s ago + Process: 18921 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) + Main PID: 18921 (code=exited, status=0/SUCCESS) + CPU: 8.240s + + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [Report skipping initialization if cannot mount nfs] ********************** + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [meta] ******************************************************************** + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: PLAY RECAP ********************************************************************* + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: 127.0.0.1 : ok=3 changed=1 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] executing remote playbooks for stage - post + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] writing sentinel file /var/lib/ansible-init.done + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] ansible-init completed successfully + Dec 13 20:50:10 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. + [root@rl9-compute-0 rocky]# ls /mnt/cluster/host + hosts hostvars/ + [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- + rl9-compute-0/ rl9-compute-1/ + [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- + rl9-compute-0/ rl9-compute-1/ + [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-0/ + hostvars.yml \ No newline at end of file From 3a583a95dfe52ecea141be9b47ae9630ed26c5da Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 13 Dec 2024 21:08:34 +0000 Subject: [PATCH 167/268] load hostvars --- .../roles/compute_init/files/compute-init.yml | 45 +++++-------------- docs/experimental/compute-init.md | 27 +++++++++-- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 53071cc48..5661b467d 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -8,40 +8,10 @@ server_node_ip: "{{ os_metadata.meta.k3s_server }}" compute_groups: "{{ os_metadata.meta.compute_groups | default([]) }}" - # TODO: "role defaults" + # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects + # this is a good example: common environment actually defines this (non-functional w/o compute groups), but role default is empty resolv_conf_nameservers: [] - - # nfs_configurations: - # - nfs_export: "/exports/home" - # nfs_client_mnt_options: - # nfs_client_mnt_point: "/home" - # nfs_client_mnt_state: mounted - # nfs_server: "{{ server_node_ip }}" - - # os_manila_mount_state: mounted - # os_manila_mount_opts: - # - x-systemd.device-timeout=30 - # - x-systemd.mount-timeout=30 - # - noatime - # - _netdev # prevents mount blocking early boot before networking available - # - rw - # os_manila_mount_ceph_conf_path: /etc/ceph - - # 
basic_users_manage_homedir: false - # basic_users_userdefaults: - # state: present - # create_home: "{{ basic_users_manage_homedir }}" - # generate_ssh_key: "{{ basic_users_manage_homedir }}" - # ssh_key_comment: "{{ item.name }}" - # test_user_password: "zXpcWyGQL7jtZnqylQra4g==" - # basic_users_users: - # - name: testuser # can't use rocky as $HOME isn't shared! - # password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent - # uid: 1005 - # basic_users_groups: [] - - # openhpc_conf_server: "{{ server_node_ip }}" - + tasks: - block: - name: Report skipping initialization if not compute node @@ -80,3 +50,12 @@ - meta: end_play when: _mount_mnt_cluster.failed + + - name: Load hostvars from NFS + # this is higher priority than vars block = normal ansible's hostvars + include_vars: + file: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" # can't use inventory_hostname + + - name: Demonstrate hostvars have loaded + debug: + var: prometheus_version diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index 1e3f07c1f..efc0cdcd9 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -7,8 +7,7 @@ On deploy host: On compute node: - [root@rl9-compute-0 rocky]# rm /var/lib/ansible-init.done - [root@rl9-compute-0 rocky]# systemctl restart ansible-init + [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init [root@rl9-compute-0 rocky]# systemctl status ansible-init @@ -97,4 +96,26 @@ Now run site.yml, then restart ansible-init again: [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- rl9-compute-0/ rl9-compute-1/ [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-0/ - hostvars.yml \ No newline at end of file + hostvars.yml + +This commit - shows that hostvars have loaded: + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: active (exited) since Fri 2024-12-13 21:06:20 UTC; 5s ago + Process: 27585 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) + Main PID: 27585 (code=exited, status=0/SUCCESS) + CPU: 8.161s + + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: TASK [Demonstrate hostvars have loaded] **************************************** + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: ok: [127.0.0.1] => { + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: "prometheus_version": "2.27.0" + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: } + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: PLAY RECAP ********************************************************************* + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: 127.0.0.1 : ok=5 changed=0 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] executing remote playbooks for stage - post + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] writing sentinel file /var/lib/ansible-init.done + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully + Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. 
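
For reference, the loaded value can also be read straight off the read-only
mount - a sketch, given the file is the JSON rendered by hostvars.yml.j2:

    [root@rl9-compute-0 rocky]# grep prometheus_version /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml
        "prometheus_version": "2.27.0",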
+ From 8bb90b4b61efa94196976e6ad9b69ddbd8c1f4fc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 13 Dec 2024 21:15:24 +0000 Subject: [PATCH 168/268] simplify compute-init file copy --- ansible/roles/compute_init/tasks/install.yml | 55 ++++++-------------- 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index c48ec612d..4eef5deb8 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -13,49 +13,28 @@ - library - filter_plugins -- name: Inject templates +- name: Inject files from roles copy: - src: '{{ item }}' - dest: '/etc/ansible-init/templates/{{ item | basename }}' + src: '{{ item.src }}' + dest: '/etc/ansible-init/{{ item.dest }}' owner: root group: root mode: 0644 loop: - - ../../resolv_conf/templates/resolv.conf.j2 - - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 - - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 - -- name: Inject files - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/files/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../resolv_conf/files/NetworkManager-dns-none.conf - -- name: Inject libraries - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/library/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../stackhpc.os-manila-mount/library/os_manila_share.py - -- name: Inject filter_plugins - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/filter_plugins/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../basic_users/filter_plugins/filter_keys.py + - src: ../../resolv_conf/templates/resolv.conf.j2 + dest: templates/resolv.conf.j2 + - src: ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 + dest: templates/ceph.conf.j2 + - src: ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 + dest: templates/ceph.keyring.j2 + - src: ../../resolv_conf/files/NetworkManager-dns-none.conf + dest: files/NetworkManager-dns-none.conf + - src: ../../stackhpc.os-manila-mount/library/os_manila_share.py + dest: library/os_manila_share.py + - src: ../../basic_users/filter_plugins/filter_keys.py + dest: filter_plugins/filter_keys.py -- name: Add filter_plugins ansible.cfg +- name: Add filter_plugins to ansible.cfg lineinfile: path: /etc/ansible-init/ansible.cfg line: "filter_plugins = /etc/ansible-init/filter_plugins" @@ -64,7 +43,7 @@ group: root mode: 0644 -- name: Inject compute initialisation playbook +- name: Add compute initialisation playbook copy: src: compute-init.yml dest: /etc/ansible-init/playbooks/1-compute-init.yml From 7babc210ce686a160e6461f8ecf4bd67dd963b4f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Sat, 14 Dec 2024 21:38:11 +0000 Subject: [PATCH 169/268] move compute_init tasks to right place and document --- ansible/extras.yml | 13 +- ansible/fatimage.yml | 10 ++ ansible/roles/compute_init/tasks/main.yml | 142 ------------------ docs/experimental/compute-init.md | 42 +++++- .../common/inventory/group_vars/all/nfs.yml | 2 +- 5 files changed, 49 insertions(+), 160 deletions(-) delete mode 100644 ansible/roles/compute_init/tasks/main.yml diff --git a/ansible/extras.yml b/ansible/extras.yml index e615b1605..10ca4cc9d 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -37,7 +37,7 @@ - import_role: name: persist_hostkeys -# TODO: I'm not convinced this is the right place +# TODO: Is this is the right place? 
- hosts: compute_init:!builder tags: compute_init become: yes @@ -47,17 +47,6 @@ name: compute_init tasks_from: export.yml -# TODO: really this should only run during build -# but handy not to for debugging without build -- name: Install compute_init script - hosts: compute_init - tags: compute_init - become: yes - tasks: - - include_role: - name: compute_init - tasks_from: install.yml - - name: Install k9s become: yes hosts: k9s diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 439c50e70..6063196c3 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -55,6 +55,16 @@ - import_playbook: extras.yml +# TODO: is this the right place? +- name: Install compute_init script + hosts: compute_init + tags: compute_init # tagged to allow running on cluster instances for dev + become: yes + tasks: + - include_role: + name: compute_init + tasks_from: install.yml + - hosts: builder become: yes gather_facts: yes diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml deleted file mode 100644 index cb4c57d35..000000000 --- a/ansible/roles/compute_init/tasks/main.yml +++ /dev/null @@ -1,142 +0,0 @@ ---- - -- name: Ensure directories exist - file: - path: "/etc/ansible-init/{{ item }}" - state: directory - owner: root - group: root - mode: 0755 - loop: - - templates - - files - - library - - filter_plugins - -- name: Inject templates - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/templates/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../resolv_conf/templates/resolv.conf.j2 - - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 - - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 - -- name: Inject files - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/files/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../resolv_conf/files/NetworkManager-dns-none.conf - -- name: Inject libraries - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/library/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../basic_users/library/terminate_user_sessions.py - - ../../stackhpc.os-manila-mount/library/os_manila_share.py - - ../../stackhpc.openhpc/library/sacct_cluster.py - -- name: Inject filter_plugins - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/filter_plugins/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../basic_users/filter_plugins/filter_keys.py - - ../../stackhpc.openhpc/filter_plugins/slurm_conf.py - -- name: Add filter_plugins ansible.cfg - lineinfile: - path: /etc/ansible-init/ansible.cfg - line: "filter_plugins = /etc/ansible-init/filter_plugins" - state: present - owner: root - group: root - mode: 0644 - -- name: Ensure nfs /exports/cluster configured - block: - - name: Ensure the /exports/cluster directory exists - file: - path: /exports/cluster - state: directory - owner: root - group: root - mode: 0755 - - - name: Copy /etc/hosts to /exports/cluster - copy: - src: /etc/hosts - dest: /exports/cluster/hosts - owner: root - group: root - mode: 0644 - remote_src: true - - - name: Copy manila share info to /exports/cluster - copy: - content: "{{ os_manila_mount_share_info | to_nice_yaml }}" - dest: "/exports/cluster/manila_share_info.yml" - when: os_manila_mount_share_info is defined - - - name: Copy manila mount shares to /exports/cluster - copy: - content: "{{ os_manila_mount_shares | to_nice_yaml }}" - dest: "/exports/cluster/manila_shares.yml" - when: os_manila_mount_shares is defined - - 
- name: Ensure /exports/cluster/cvmfs directory exists - file: - path: /exports/cluster/cvmfs - state: directory - owner: root - group: root - mode: 0755 - - - name: Copy EESSI CVMFS config to /exports/cluster - copy: - src: /etc/cvmfs/default.local - dest: /exports/cluster/cvmfs/default.local - owner: root - group: root - mode: 0644 - remote_src: true - - - name: Write openhpc munge key - copy: - content: "{{ vault_openhpc_mungekey | b64decode }}" - dest: "/exports/cluster/openhpc_munge.key" - owner: munge - group: munge - mode: 0400 - - # - name: Ensure /exports/cluster/inventory_hostname directory exists - # file: - # path: /exports/cluster/{{ inventory_hostname }} - # state: directory - # owner: root - # group: root - # mode: 0755 - - # - name: Template hostvars - # template: - # src: ../templates/hostvars.j2 - # dest: "/exports/cluster/{{ inventory_hostname }}/hostvars.yml" - # owner: root - # group: root - # mode: 0644 - - delegate_to: "{{ groups['control'] | first }}" diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index efc0cdcd9..64a67125e 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -1,15 +1,47 @@ +# compute-init + +TODO: describe current status. + +# Development To develop/debug this without actually having to build an image: -On deploy host: - .stackhpc/ (venv) [rocky@steveb-dev slurm-app-rl9]$ ansible-playbook ansible/extras.yml --tags compute_init +1. Add the compute nodes into the `compute_init` group: -On compute node: + cat <> $APPLIANCES_ENVIRONMENT_ROOT/inventory/extra_groups + [compute_init:children] + compute + EOF - [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init - [root@rl9-compute-0 rocky]# systemctl status ansible-init +2. Deploy a cluster using tofu and ansible/site.yml as normal. This will + additionally configure the control node to export compute hosts over NFS. + Check the cluster is up. + +3. Reimage the compute nodes: + + ansible-playbook --limit compute ansible/adhoc/rebuild + +4. Add metadata to a compute node e.g. via Horzon to turn on compute-init + playbook functionality. + +5. Fake an image build to deploy the compute-init playbook: + + ansible-playbook ansible/fatimage.yml --tags compute_init + +6. Fake a reimage of compute to run ansible-init and the compute-init playbook: + + On compute node where metadata was added: + + [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init + [root@rl9-compute-0 rocky]# systemctl status ansible-init + + Use `systemctl status ansible-init` to view stdout/stderr from Ansible. + +Steps 5/6 can be repeated with changes to the compute script. If desirable +reimage the compute node(s) first as in step 3. 
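+
+   If the service has already exited, the same output can be pulled from the
+   journal (a sketch):
+
+       journalctl -u ansible-init --no-pager | tail -n 30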
+# Results/progress Without any metadata: diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index 84371c99a..e9366da2b 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -20,4 +20,4 @@ nfs_configurations: nfs_enable: server: "{{ inventory_hostname in groups['control'] }}" clients: false - nfs_export: "/exports/cluster" # control node has to copy in /etc/hosts to here + nfs_export: "/exports/cluster" From cb21e9cf5329598fdbfc048e4aae2f3154b75c41 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Sat, 14 Dec 2024 21:47:29 +0000 Subject: [PATCH 170/268] leave compute-init turned on in everything template --- docs/experimental/compute-init.md | 19 ++++++------------- environments/common/inventory/groups | 2 +- environments/common/layouts/everything | 1 - 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index 64a67125e..fe0fe5df4 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -7,29 +7,22 @@ TODO: describe current status. To develop/debug this without actually having to build an image: -1. Add the compute nodes into the `compute_init` group: - - cat <> $APPLIANCES_ENVIRONMENT_ROOT/inventory/extra_groups - [compute_init:children] - compute - EOF - -2. Deploy a cluster using tofu and ansible/site.yml as normal. This will +1. Deploy a cluster using tofu and ansible/site.yml as normal. This will additionally configure the control node to export compute hosts over NFS. Check the cluster is up. -3. Reimage the compute nodes: +2. Reimage the compute nodes: ansible-playbook --limit compute ansible/adhoc/rebuild -4. Add metadata to a compute node e.g. via Horzon to turn on compute-init +3. Add metadata to a compute node e.g. via Horzon to turn on compute-init playbook functionality. -5. Fake an image build to deploy the compute-init playbook: +4. Fake an image build to deploy the compute-init playbook: ansible-playbook ansible/fatimage.yml --tags compute_init -6. Fake a reimage of compute to run ansible-init and the compute-init playbook: +5. Fake a reimage of compute to run ansible-init and the compute-init playbook: On compute node where metadata was added: @@ -38,7 +31,7 @@ To develop/debug this without actually having to build an image: Use `systemctl status ansible-init` to view stdout/stderr from Ansible. -Steps 5/6 can be repeated with changes to the compute script. If desirable +Steps 4/5 can be repeated with changes to the compute script. If desirable reimage the compute node(s) first as in step 3. # Results/progress diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index ba846777c..b944cccd6 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -137,7 +137,7 @@ freeipa_client # Hosts to run linux-anisble-init [compute_init] -# Hosts to deploy compute initialisation ansible-init script to. 
+# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on [k3s] # Hosts to run k3s server/agent diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 0fc447cf5..f65e14fe6 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -84,7 +84,6 @@ cluster [compute_init:children] # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on -# TODO: actually should be empty for now compute [k3s:children] From 53a7dc4fbb76abf11455d56c05df3b1701a91a8e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Sat, 14 Dec 2024 22:36:50 +0000 Subject: [PATCH 171/268] get resolv_conf, etc_hosts and stackhpc.openhpc working --- .../roles/compute_init/files/compute-init.yml | 97 +++++++++++++++++-- 1 file changed, 88 insertions(+), 9 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 5661b467d..24a5090c6 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -6,7 +6,9 @@ vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" server_node_ip: "{{ os_metadata.meta.k3s_server }}" - compute_groups: "{{ os_metadata.meta.compute_groups | default([]) }}" + enable_slurmd: "{{ os_metadata.meta.enable_slurmd | default(false) | bool }}" + enable_resolv_conf: "{{ os_metadata.meta.enable_senable_resolv_conf | default(false) | bool }}" + enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects # this is a good example: common environment actually defines this (non-functional w/o compute groups), but role default is empty @@ -17,10 +19,10 @@ - name: Report skipping initialization if not compute node # meta: end_play produces no output debug: - msg: "Skipping compute initialization as metadata compute_groups is empty" + msg: "Skipping compute initialization: Metadata enable_slurmd is not true" - meta: end_play - when: compute_groups | length == 0 + when: not enable_slurmd - name: Ensure the mount directory exists file: @@ -37,16 +39,15 @@ fstype: nfs opts: ro,sync state: mounted - register: nfs_mount_result - ignore_errors: true register: _mount_mnt_cluster + ignore_errors: true # TODO: add some retries here? - block: - name: Report skipping initialization if cannot mount nfs # meta: end_play produces no output debug: - msg: "Skipping compute initialization as cannot mount exports/cluster share" + msg: "Skipping compute initialization: Failed to mount /exports/cluster from control node {{ server_node_ip }}" - meta: end_play when: _mount_mnt_cluster.failed @@ -56,6 +57,84 @@ include_vars: file: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" # can't use inventory_hostname - - name: Demonstrate hostvars have loaded - debug: - var: prometheus_version + # TODO: should /mnt/cluster now be UNMOUNTED to avoid future hang-ups? 
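+      # A sketch of one option, left commented so behaviour is unchanged; it
+      # would have to run after the last task reading the share (e.g. the
+      # /etc/hosts copy below still uses /mnt/cluster/hosts):
+      # - name: Unmount /mnt/cluster
+      #   ansible.posix.mount:
+      #     path: /mnt/cluster
+      #     state: unmounted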
+ + - name: Configure resolve.conf + block: + - name: Set nameservers in /etc/resolv.conf + ansible.builtin.template: + src: /etc/ansible-init/templates/resolv.conf.j2 + dest: /etc/resolv.conf + owner: root + group: root + mode: u=rw,og=r + + - name: Disable NetworkManager control of resolv.conf + ansible.builtin.copy: + src: /etc/ansible-init/files/NetworkManager-dns-none.conf + dest: /etc/NetworkManager/conf.d/90-dns-none.conf + owner: root + group: root + mode: u=rw,og=r + register: _copy_nm_config + + - name: Reload NetworkManager + ansible.builtin.systemd: + name: NetworkManager + state: reloaded + when: _copy_nm_config.changed | default(false) + when: enable_resolv_conf + + - name: Copy cluster /etc/hosts + copy: + src: /mnt/cluster/hosts + dest: /etc/hosts + owner: root + group: root + mode: 0644 + when: enable_etc_hosts + + # TODO: - name: NFS client mount + + # TODO: - name: Manila mount + + # TODO: - name: Basic users setup + + # TODO: - name: Configure EESSI + + # TODO: - name: Configure openhpc + # NB: don't need conditional block on enable_slurmd as have already exited + # if not the case + - name: Write Munge key + copy: + content: "{{ openhpc_munge_key }}" + dest: "/etc/munge/munge.key" + owner: munge + group: munge + mode: 0400 + + - name: Set slurmctld location for configless operation + lineinfile: + path: /etc/sysconfig/slurmd + line: "SLURMD_OPTIONS='--conf-server {{ server_node_ip }}'" + regexp: "^SLURMD_OPTIONS=" + create: yes + owner: root + group: root + mode: 0644 + + - name: Ensure Munge service state + service: + name: munge + enabled: true + state: started + + - name: Ensure slurmd service state + service: + name: slurmd + enabled: true + state: started + + - name: Ensure node is resumed + # TODO: consider if this is always safe for all job states? + command: scontrol update state=resume nodename={{ ansible_hostname }} From 1f458516f46d223cb024447b59dfeca6b7cfcacb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Sat, 14 Dec 2024 22:38:12 +0000 Subject: [PATCH 172/268] doc problems with templating out hostvars --- docs/experimental/compute-init.md | 53 ++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index fe0fe5df4..17ed370e6 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -1,6 +1,9 @@ # compute-init -TODO: describe current status. +The following roles are currently functional: +- resolv_conf +- etc_hosts +- stackhpc.openhpc # Development @@ -8,7 +11,7 @@ To develop/debug this without actually having to build an image: 1. Deploy a cluster using tofu and ansible/site.yml as normal. This will - additionally configure the control node to export compute hosts over NFS. + additionally configure the control node to export compute hostvars over NFS. Check the cluster is up. 2. Reimage the compute nodes: @@ -22,6 +25,10 @@ To develop/debug this without actually having to build an image: ansible-playbook ansible/fatimage.yml --tags compute_init + NB: This will also re-export the compute hostvars, as the nodes are not + in the builder group, which conveniently means any changes made to that + play also get picked up. + 5. Fake a reimage of compute to run ansible-init and the compute-init playbook: On compute node where metadata was added: @@ -31,8 +38,9 @@ To develop/debug this without actually having to build an image: Use `systemctl status ansible-init` to view stdout/stderr from Ansible. 
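+   Once slurmd is up, whether the node has rejoined can be checked from the
+   control node (a sketch, node name as per this deployment):
+
+       scontrol show node rl9-compute-0 | grep State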
-Steps 4/5 can be repeated with changes to the compute script. If desirable -reimage the compute node(s) first as in step 3. +Steps 4/5 can be repeated with changes to the compute script. If required, +reimage the compute node(s) first as in step 2 and/or add additional metadata +as in step 3. # Results/progress @@ -144,3 +152,40 @@ This commit - shows that hostvars have loaded: Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. +# Design notes + +- In general, we don't want to rely on NFS export. So should e.g. copy files + from this mount ASAP in the compute-init script. TODO: +- There are a few possible approaches: + + 1. Control node copies files resulting from role into cluster exports, + compute-init copies to local disk. Only works if files are not host-specific + Examples: etc_hosts, eessi config? + + 2. Re-implement the role. Works if the role vars are not too complicated, + (else they all need to be duplicated in compute-init). Could also only + support certain subsets of role functionality or variables + Examples: resolv_conf, stackhpc.openhpc + + +# Problems with templated hostvars + +Here are all the ones which actually rely on hostvars from other nodes, +which therefore aren't available: + +``` +[root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml + "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", + "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", + "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", + "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}", + "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}", + "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}", + "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}", + "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}", + "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}", + "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" +``` + +More generally, there is nothing to stop any group var depending on a +"{{ hostvars[] }}" interpolation ... 
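+
+A quick way to gauge how widespread this is (a sketch, run against the
+read-only mount on a compute node):
+
+    grep -c '{{' /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml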
From c162e18e447410487dd481abcf982477ac8b39b4 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 16 Dec 2024 09:53:52 +0000 Subject: [PATCH 173/268] Refactored common repolist --- ansible/roles/dnf_repos/defaults/main.yml | 10 ++++----- ansible/roles/pulp_site/defaults/main.yml | 22 +++++++++---------- .../inventory/group_vars/all/defaults.yml | 21 +++++++++--------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 281a57c7e..4a0c9fd2a 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -8,16 +8,16 @@ dnf_repos_password: "{{ omit }}" dnf_repos_repolist: - file: rocky name: baseos - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/BaseOS/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].baseos }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/BaseOS/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.baseos[ansible_distribution_version] }}" - file: rocky name: appstream - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].appstream }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.appstream[ansible_distribution_version] }}" - file: rocky name: crb - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].crb }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.crb[ansible_distribution_version] }}" - file: rocky-extras name: extras - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].extras }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.extras[ansible_distribution_version] }}" -dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ appliances_repo_major_timestamps[ansible_distribution_major_version].epel }}" +dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ appliances_repo_timestamps.epel[ansible_distribution_major_version] }}" dnf_repos_epel_description: "epel" diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index 76ad14988..2c90d2968 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -15,20 +15,18 @@ pulp_site_target_arch: "{{ pulp_site_target_facts['architecture'] }}" pulp_site_target_distribution: "{{ pulp_site_target_facts['distribution'] | lower }}" pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" -pulp_site_version_timestamps: "{{ appliances_repo_minor_timestamps[pulp_site_target_distribution_version] }}" 
-pulp_site_major_version_timestamps: "{{ appliances_repo_major_timestamps[pulp_site_target_distribution_version_major] }}" pulp_site_rpm_info: -- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" - subpath: "{{ _pulp_site_rocky_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}" -- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" - subpath: "{{ _pulp_site_rocky_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}" -- name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" - subpath: "{{ _pulp_site_rocky_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}" -- name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" - subpath: "{{ _pulp_site_rocky_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" -- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ pulp_site_major_version_timestamps.epel }}" - subpath: "epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_major_version_timestamps.epel }}" +- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.baseos[pulp_site_target_distribution_version] }}" + subpath: "{{ _pulp_site_rocky_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.baseos[pulp_site_target_distribution_version] }}" +- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.appstream[pulp_site_target_distribution_version] }}" + subpath: "{{ _pulp_site_rocky_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.appstream[pulp_site_target_distribution_version] }}" +- name: "crb-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.crb[pulp_site_target_distribution_version] }}" + subpath: "{{ _pulp_site_rocky_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.crb[pulp_site_target_distribution_version] }}" +- name: "extras-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.extras[pulp_site_target_distribution_version] }}" + subpath: "{{ _pulp_site_rocky_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.extras[pulp_site_target_distribution_version] }}" +- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_repo_timestamps.epel[pulp_site_target_distribution_version_major] }}" + subpath: "epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ appliances_repo_timestamps.epel[pulp_site_target_distribution_version_major] }}" pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index e1acdf19b..1bac4590d 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -82,13 +82,14 @@ appliances_local_users: "{{ appliances_local_users_default + 
appliances_local_us ########################################################################################### -appliances_repo_minor_timestamps: - '9.4': - baseos: 20240816T002610 - appstream: 20240816T002610 - crb: 20240816T002610 - extras: 20240816T002610 - -appliances_repo_major_timestamps: - '9': - epel: 20240902T080424 +appliances_repo_timestamps: + baseos: + '9.4': 20240816T002610 + appstream: + '9.4': 20240816T002610 + crb: + '9.4': 20240816T002610 + extras: + '9.4': 20240816T002610 + epel: + '9': 20240902T080424 From bda3f0d5ad31e1c9e2faf1d6cbdfa0b293ce76c8 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 16 Dec 2024 10:09:04 +0000 Subject: [PATCH 174/268] Code review doc/comment suggestions Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/adhoc/deploy-pulp.yml | 1 - docs/experimental/pulp.md | 4 ++-- environments/.stackhpc/inventory/group_vars/builder.yml | 2 +- environments/common/inventory/groups | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml index 38cb79289..2858d032b 100644 --- a/ansible/adhoc/deploy-pulp.yml +++ b/ansible/adhoc/deploy-pulp.yml @@ -11,7 +11,6 @@ become: yes hosts: _pulp_host tasks: - - name: Install pulp ansible.builtin.include_role: name: pulp_site diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index d1a40ba52..d2bc0db72 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -1,13 +1,13 @@ # Pulp Server -In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's Ark Pulp server. The appliance will sync relevant repositories to local Pulp server which will be used for image builds. Using a local server can be enabled by adding `pulp` to the build groups and overriding `dnf_repos_repolist` to point at content hosted on the local server. +In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. Using a local Pulp can be enabled by adding `pulp` to the build groups and overriding `dnf_repos_repolist` to point at content hosted on the local server. ## Deploying/configuring Pulp Server ### Deploying a Pulp server A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml' playbook. This can be run with `ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="` -This will print a Pulp endpoint which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note that this server's content isn't authenticated so assumes the server is deployed behind a secure network. +This will print a Pulp endpoint which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network. ### Using an existing Pulp server An existing Pulp server can be used to host Ark repos by overriding `pulp_site_password` and `appliances_pulp_url` in the target environment. Note that this assumes the same configuration as the appliance deployed pulp i.e no content authentication. 
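For illustration, pointing an environment at such an existing Pulp server might look like the following sketch; the URL is a placeholder and the vaulted variable name is hypothetical:

```yaml
# Sketch: e.g. environments/$SITE_ENV/inventory/group_vars/all/pulp.yml
appliances_pulp_url: "http://pulp.example.org:8080"        # placeholder URL
pulp_site_password: "{{ vault_existing_pulp_password }}"   # hypothetical vaulted var
```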
diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index ce1666973..8d4c8b3bb 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -9,6 +9,6 @@ # appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}" # pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}" - +# Alternatively, configure to use ark directly: dnf_repos_username: slurm-app-ci dnf_repos_password: "{{ lookup('env','ARK_PASSWORD') }}" diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 8f52477cd..d49f3d6c1 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -150,4 +150,4 @@ freeipa_client builder [pulp:children] -# Hosts used to run Pulp API commands +# Add builder to this group to enable automatically syncing of pulp during image build From bc5e26efe139b50296cb9cd2a1fa47f98a9fecc7 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 16 Dec 2024 10:51:18 +0000 Subject: [PATCH 175/268] docs/groups corrections --- docs/experimental/pulp.md | 6 +++--- environments/.stackhpc/inventory/group_vars/builder.yml | 1 + environments/common/inventory/groups | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index d2bc0db72..8c9bfd615 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -1,13 +1,13 @@ # Pulp Server -In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. Using a local Pulp can be enabled by adding `pulp` to the build groups and overriding `dnf_repos_repolist` to point at content hosted on the local server. +In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. Using a local Pulp can be enabled by adding `pulp` to the build groups and overriding `appliances_pulp_url` to point at the local Pulp's URL. ## Deploying/configuring Pulp Server ### Deploying a Pulp server A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml' playbook. This can be run with -`ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="` -This will print a Pulp endpoint which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network. +`ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="` +where `target_host` is any resolvable host. This will print a Pulp URL which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network. ### Using an existing Pulp server An existing Pulp server can be used to host Ark repos by overriding `pulp_site_password` and `appliances_pulp_url` in the target environment. 
Note that this assumes the same configuration as the appliance deployed pulp i.e no content authentication. diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 8d4c8b3bb..b12e81826 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -9,6 +9,7 @@ # appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}" # pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}" + # Alternatively, configure to use ark directly: dnf_repos_username: slurm-app-ci dnf_repos_password: "{{ lookup('env','ARK_PASSWORD') }}" diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index d49f3d6c1..6f77eeab5 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -149,5 +149,6 @@ freeipa_client # Hosts to replace system repos with Pulp repos builder -[pulp:children] +[pulp] # Add builder to this group to enable automatically syncing of pulp during image build +# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` risks leaking Ark creds From 18b220e1b54d991946ebba4dbd386ed96f392993 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 16 Dec 2024 11:42:49 +0000 Subject: [PATCH 176/268] moved defaults to CI and updated docs --- ansible/roles/pulp_site/defaults/main.yml | 4 +--- docs/experimental/pulp.md | 2 +- docs/image-build.md | 9 +++++---- .../.stackhpc/inventory/group_vars/builder.yml | 4 ++++ environments/common/inventory/group_vars/all/pulp.yml | 10 ++++++++++ 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index 2c90d2968..d343d4998 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -1,12 +1,10 @@ pulp_site_url: "{{ appliances_pulp_url }}" pulp_site_port: 8080 pulp_site_username: admin # shouldn't be changed +pulp_site_password: "{{ vault_pulp_admin_password }}" pulp_site_upstream_content_url: https://ark.stackhpc.com/pulp/content -pulp_site_upstream_username: slurm-app-ci -pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" _pulp_site_rocky_prefix: "{{ pulp_site_target_distribution }}/{{ pulp_site_target_distribution_version }}" pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" -pulp_site_password: "{{ vault_pulp_admin_password }}" pulp_site_validate_certs: false pulp_site_install_dir: '/home/rocky/pulp' pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index 8c9bfd615..e0f32cdc1 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -14,4 +14,4 @@ An existing Pulp server can be used to host Ark repos by overriding `pulp_site_p ## Syncing Pulp content with Ark -If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must supply your Ark credentials, either by overriding `pulp_site_upstream_password` or setting environment variable `ARK_PASSWORD`. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`, optionally setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version`. 
+If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must authenticate with Ark by overriding `pulp_site_upstream_username` and `pulp_site_upstream_password` with your vault encrypted Ark dev credentials. `dnf_repos_username` and `dnf_repos_password` must remain unset to access content from the local Pulp. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default this syncs repositories for Rocky 9.4 with x86_64 architecture, but can be overridden by setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`.
diff --git a/docs/image-build.md b/docs/image-build.md
index a7d2e951b..db51265a3 100644
--- a/docs/image-build.md
+++ b/docs/image-build.md
@@ -17,7 +17,8 @@ The fat images StackHPC builds and tests in CI are available from [GitHub releas
 To build either a site-specific fat image from scratch, or to extend an existing StackHPC fat image:
 
 1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration).
-2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum:
+2. The provided dev credentials for StackHPC's "Ark" Pulp server must be added to the target environments. This is done by overriding `dnf_repos_username` and `dnf_repos_password` with your vault encrypted credentials in `environments//inventory/group_vars/all/pulp.yml`. See the [experimental docs](experimental/pulp.md) if you instead wish to use a local Pulp server.
+3. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum:
 
 ```hcl
 flavor = "general.v1.small" # VM flavor to use for builder VMs
@@ -35,9 +36,9 @@ To build either a site-specific fat image from scratch, or to extend an existing
 - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch.
 - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality.
 
-3. Activate the venv and the relevant environment.
+4. Activate the venv and the relevant environment.
 
-4. Build images using the relevant variable definition file, e.g.:
+5. Build images using the relevant variable definition file, e.g.:
 
     cd packer/
     PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
 
 then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is
 [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445).
 
-5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash.
+6. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash.
 
 # Build Process
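To make the new credentials step in the image-build docs above concrete, a minimal sketch of the override file might be as follows; the username matches the CI example used elsewhere in this series, and the vaulted variable name is hypothetical:

```yaml
# Sketch: environments/$SITE_ENV/inventory/group_vars/all/pulp.yml
dnf_repos_username: slurm-app-ci                 # your Ark username
dnf_repos_password: "{{ vault_ark_password }}"   # hypothetical vault-encrypted variable
```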
diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml
index b12e81826..5130e9d84 100644
--- a/environments/.stackhpc/inventory/group_vars/builder.yml
+++ b/environments/.stackhpc/inventory/group_vars/builder.yml
@@ -13,3 +13,7 @@
 # Alternatively, configure to use ark directly:
 dnf_repos_username: slurm-app-ci
 dnf_repos_password: "{{ lookup('env','ARK_PASSWORD') }}"
+
+# Can be set regardless of approach above:
+pulp_site_upstream_username: slurm-app-ci
+pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
diff --git a/environments/common/inventory/group_vars/all/pulp.yml b/environments/common/inventory/group_vars/all/pulp.yml
index 02b7aa816..22bb83216 100644
--- a/environments/common/inventory/group_vars/all/pulp.yml
+++ b/environments/common/inventory/group_vars/all/pulp.yml
@@ -1 +1,11 @@
 pulp_site_port: 8080
+
+# If using Ark directly (no local Pulp server), override the following with Ark creds
+
+# dnf_repos_username:
+# dnf_repos_password:
+
+# If instead using local Pulp server, override below with Ark creds
+
+# pulp_site_upstream_username:
+# pulp_site_upstream_password:

From 34fee1cb17a32c4d0fc52cb2997e8f1c6458f730 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Mon, 16 Dec 2024 12:26:27 +0000
Subject: [PATCH 177/268] updated docs

---
 docs/operations.md | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/docs/operations.md b/docs/operations.md
index a20d7f10c..50eef9053 100644
--- a/docs/operations.md
+++ b/docs/operations.md
@@ -63,17 +63,28 @@ This is a usually a two-step process:
 
 Deploying the additional nodes and applying these changes requires rerunning both Terraform and the Ansible site.yml playbook - follow [Deploying a Cluster](#Deploying-a-Cluster).
 
 # Adding Additional Packages
-Packages from any enabled DNF repositories (which always includes EPEL, PowerTools and OpenHPC) can be added to all nodes by defining a list `openhpc_packages_extra` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml`. For example:
-
-    # environments/foo-base/inventory/group_vars/all/openhpc.yml:
-    openhpc_packages_extra:
+By default, the following utility packages are installed during build:
+- htop
+- nano
+- screen
+- tmux
+- wget
+- bind-utils
+- net-tools
+- postfix
+- git
+- latest python version for system (3.6 for Rocky 8.9 and 3.12 for Rocky 9.4)
+Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_other_extra_package` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example:
+
+    # environments/foo-base/inventory/group_vars/all/defaults.yml:
+    appliances_other_extra_package:
     - somepackage
     - anotherpackage
 
 The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note "user-facing" OpenHPC packages such as compilers, mpi libraries etc. include corresponding `lmod` modules.
 
-To add these packages to the current cluster, run the same command as for [Reconfiguring Slurm](#Reconfiguring-Slurm). TODO: describe what's required to add these to site-specific images.
+If you wish to install packages during runtime, the `site.yml` playbook should be run with `appliances_packages_during_configure` overridden to `true` and `cluster` should be added as a child of the `dnf_repos` group in order to temporarily re-enable DNF repositories during runtime (WARNING: this should only be done if using an unauthenticated local Pulp server. If using StackHPC Ark directly, doing this WILL leak credentials to users). If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images.
 
 There are various Ansible modules which might be useful for this:
   - `ansible.builtin.yum_repository`: Add a repo from an URL providing a 'repodata' directory.
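As a sketch of the hook-based approach just described, a play along these lines could go in a site pre/post hook; the repository name and URL are placeholders, and note it deliberately does not exclude the builder group:

```yaml
# Hypothetical play for environments/$SITE_ENV/hooks/post.yml
- hosts: cluster:builder   # does not exclude builder, per the note above
  become: yes
  tasks:
    - name: Add a site-specific DNF repo
      ansible.builtin.yum_repository:
        name: site_extra                                    # placeholder
        description: Site-specific extra packages           # placeholder
        baseurl: "https://repo.example.org/el9/$basearch/"  # placeholder URL
        gpgcheck: false
```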
From 9c41725c64d1a14bd4247bbd7c06daa4835e4240 Mon Sep 17 00:00:00 2001
From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
Date: Mon, 16 Dec 2024 12:29:58 +0000
Subject: [PATCH 178/268] bump images

---
 .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 8659f3e90..989b9f9bb 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241213-1402-a2a705c9",
-        "RL9": "openhpc-RL9-241213-1402-a2a705c9"
+        "RL8": "openhpc-RL8-241216-1146-18b220e1",
+        "RL9": "openhpc-RL9-241216-1146-18b220e1"
    }
}

From a4352920dcd0dd1ccb9ab798b44a75aa4d2a1ec9 Mon Sep 17 00:00:00 2001
From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
Date: Mon, 16 Dec 2024 13:42:52 +0000
Subject: [PATCH 179/268] bump image

---
 .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 8659f3e90..44059d97c 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241213-1402-a2a705c9",
-        "RL9": "openhpc-RL9-241213-1402-a2a705c9"
+        "RL8": "openhpc-RL8-241216-1231-83161c73",
+        "RL9": "openhpc-RL9-241216-1232-83161c73"
    }
}

From 30a278ee64e7202b8ec7a3da9753d81c5c7fd42d Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Mon, 16 Dec 2024 15:37:12 +0000
Subject: [PATCH 180/268] moved to extras

---
 ansible/extras.yml   | 10 ++++++++++
 ansible/packages.yml | 10 ----------
 2 files changed, 10 insertions(+), 10 deletions(-)
 delete mode 100644 ansible/packages.yml

diff --git a/ansible/extras.yml b/ansible/extras.yml
index 107f85252..ea5c8eb12 100644
--- a/ansible/extras.yml
+++ b/ansible/extras.yml
@@ -44,3 +44,13 @@
     tasks:
       - import_role:
           name: k9s
+
+- hosts: extra_packages
+  become: yes
+  tags:
+    - extra_packages
+  tasks:
+    - name: Install additional packages
+      dnf:
name: "{{ appliances_extra_packages }}" + when: appliances_mode != 'configure' or appliances_packages_during_configure diff --git a/ansible/packages.yml b/ansible/packages.yml deleted file mode 100644 index e447dcda7..000000000 --- a/ansible/packages.yml +++ /dev/null @@ -1,10 +0,0 @@ - -- hosts: extra_packages - become: yes - tags: - - extra_packages - tasks: - - name: Install additional packages - dnf: - name: "{{ appliances_extra_packages }}" - when: appliances_mode != 'configure' or appliances_packages_during_configure From 6c74a1e15fcfe809b28a3bd7d5bc582b90175105 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 16 Dec 2024 15:51:29 +0000 Subject: [PATCH 181/268] repos now controlled by groups + possible during configure + guarded against cred leaks --- ansible/bootstrap.yml | 14 ++++++++++++++ ansible/disable-repos.yml | 8 ++++++++ ansible/fatimage.yml | 18 +----------------- ansible/site.yml | 1 + environments/common/inventory/groups | 2 +- 5 files changed, 25 insertions(+), 18 deletions(-) create mode 100644 ansible/disable-repos.yml diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 733d4b3f8..a504f3545 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -110,6 +110,20 @@ policy: "{{ selinux_policy }}" register: sestatus +- hosts: dnf_repos + become: yes + tasks: + - name: Check that creds won't be leaked to users + ansible.builtin.assert: + that: dnf_repos_password is undefined + fail_msg: Passwords should not be templated into repofiles during configure, unset 'dnf_repos_password' + when: appliances_mode == 'configure' + - name: Replace system repos with pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: set_repos.yml + when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided + # --- tasks after here require access to package repos --- - hosts: squid tags: squid diff --git a/ansible/disable-repos.yml b/ansible/disable-repos.yml new file mode 100644 index 000000000..d7dc4fd55 --- /dev/null +++ b/ansible/disable-repos.yml @@ -0,0 +1,8 @@ +- hosts: dnf_repos + become: yes + tasks: + - name: Disable pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: disable_repos.yml + when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 5d84fcf90..4c8367816 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -27,15 +27,6 @@ delegate_to: localhost when: appliances_mode != 'configure' -- hosts: dnf_repos - become: yes - tasks: - - name: Replace system repos with pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: set_repos.yml - when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided - - import_playbook: bootstrap.yml - name: Run post-bootstrap.yml hook @@ -229,14 +220,7 @@ import_role: name: doca -- hosts: dnf_repos - become: yes - tasks: - - name: Disable pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: disable_repos.yml - when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided +- import_playbook: disable_repos.yml - name: Run post.yml hook vars: diff --git a/ansible/site.yml b/ansible/site.yml index bb379399d..222ee8697 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -27,6 +27,7 @@ - import_playbook: slurm.yml - import_playbook: portal.yml - import_playbook: monitoring.yml +- 
import_playbook: disable_repos.yml - name: Run post.yml hook vars: diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 6f77eeab5..062276f76 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -147,8 +147,8 @@ freeipa_client [dnf_repos:children] # Hosts to replace system repos with Pulp repos +# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users builder [pulp] # Add builder to this group to enable automatically syncing of pulp during image build -# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` risks leaking Ark creds From 2357a730d060ad43289d022024de118093984017 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 16 Dec 2024 15:58:58 +0000 Subject: [PATCH 182/268] typo --- ansible/fatimage.yml | 2 +- ansible/site.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 4c8367816..55e56e612 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -220,7 +220,7 @@ import_role: name: doca -- import_playbook: disable_repos.yml +- import_playbook: disable-repos.yml - name: Run post.yml hook vars: diff --git a/ansible/site.yml b/ansible/site.yml index 222ee8697..d973d9cb3 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -27,7 +27,7 @@ - import_playbook: slurm.yml - import_playbook: portal.yml - import_playbook: monitoring.yml -- import_playbook: disable_repos.yml +- import_playbook: disable-repos.yml - name: Run post.yml hook vars: From bf6f3680ec49906cc48b170b003c67627e62aca4 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:59:43 +0000 Subject: [PATCH 183/268] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 989b9f9bb..7c59abf36 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241216-1146-18b220e1", - "RL9": "openhpc-RL9-241216-1146-18b220e1" + "RL8": "openhpc-RL8-241216-1607-2357a730", + "RL9": "openhpc-RL9-241216-1607-2357a730" } } From c6a6bf365e74c95e2079ce5f73753f4285b3d95b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 17 Dec 2024 11:33:31 +0000 Subject: [PATCH 184/268] re-enable CI on compute-init script branch --- .github/workflows/stackhpc.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 848517bb8..b08854adb 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -24,8 +24,6 @@ on: - '!.gitignore' - '!.github/workflows/' - '.github/workflows/stackhpc' - branches: - - '!feat/compute-script' jobs: openstack: name: openstack-ci From 5455eec66a243a295f7651b23eafb3ea52bfb65c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 17 Dec 2024 11:37:00 +0000 Subject: [PATCH 185/268] doc compute_init/export.yml ordering --- ansible/extras.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ansible/extras.yml b/ansible/extras.yml index 10ca4cc9d..ad538c58a 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -37,8 
+37,10 @@
   - import_role:
       name: persist_hostkeys
 
-# TODO: Is this is the right place?
-- hosts: compute_init:!builder
+
+- name: Setup NFS export for compute node configuration
+  hosts: compute_init:!builder
+  # NB: has to be after eessi and os-manila-mount
   tags: compute_init
   become: yes
   name: Export hostvars

From 36cf771cae4d7706d8b99308ba7f028b6d279472 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Tue, 17 Dec 2024 11:39:45 +0000
Subject: [PATCH 186/268] change name for compute-init enablement

---
 ansible/roles/compute_init/files/compute-init.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
index 24a5090c6..74face5e1 100644
--- a/ansible/roles/compute_init/files/compute-init.yml
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -6,7 +6,7 @@
   vars:
     os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
     server_node_ip: "{{ os_metadata.meta.k3s_server }}"
-    enable_slurmd: "{{ os_metadata.meta.enable_slurmd | default(false) | bool }}"
+    enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}"
     enable_resolv_conf: "{{ os_metadata.meta.enable_senable_resolv_conf | default(false) | bool }}"
     enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}"
 
@@ -19,10 +19,10 @@
       - name: Report skipping initialization if not compute node
         # meta: end_play produces no output
         debug:
-          msg: "Skipping compute initialization: Metadata enable_slurmd is not true"
+          msg: "Skipping compute initialization: Metadata enable_compute is not true"
 
       - meta: end_play
-        when: not enable_slurmd
+        when: not enable_compute
 
       - name: Ensure the mount directory exists
         file:
@@ -103,7 +103,7 @@
       # TODO: - name: Configure EESSI
       # TODO: - name: Configure openhpc
 
-      # NB: don't need conditional block on enable_slurmd as have already exited
+      # NB: don't need conditional block on enable_compute as have already exited
       # if not the case
       - name: Write Munge key
         copy:

From 5e7f809276ffd1c259e5d8ac19b61abd67be6b21 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Tue, 17 Dec 2024 12:05:17 +0000
Subject: [PATCH 187/268] move most compute-init docs to the role readme

---
 ansible/roles/compute_init/README.md | 119 +++++++++++++++++++++++++++
 docs/experimental/compute-init.md    |  80 +-----------------
 2 files changed, 120 insertions(+), 79 deletions(-)
 create mode 100644 ansible/roles/compute_init/README.md

diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md
new file mode 100644
index 000000000..dac59e2d3
--- /dev/null
+++ b/ansible/roles/compute_init/README.md
@@ -0,0 +1,119 @@
+# EXPERIMENTAL: compute-init
+
+Experimental / in-progress functionality to allow compute nodes to rejoin the
+cluster after a reboot.
+
+To enable this add compute nodes (or a subset of them) into the `compute_init`
+group.
+
+This works as follows:
+1. During image build, an ansible-init playbook and supporting files
+(e.g. templates, filters, etc) are installed.
+2. Cluster instances are created as usual; the above compute-init playbook does
+not run.
+3. The `site.yml` playbook is run as usual to configure all the instances into
+a cluster. In addition, with `compute-init` enabled, a `/exports/cluster` NFS
+share is created on the control node containing:
+   - an /etc/hosts file for the cluster
+   - Hostvars for each compute node
+4. On reboot of a compute node, ansible-init runs the compute-init playbook
+which:
+   a. Checks whether the `enable_compute` metadata flag is set, and exits if
+   not.
+   b. Tries to mount the above `/exports/cluster` NFS share from the control
+   node, and exits if it cannot.
+   c. Configures itself using the exported hostvars, depending on the
+   `enable_*` flags set in metadata.
+   d. Issues an `scontrol` command to resume the node (because Slurm will
+   consider it as "unexpectedly rebooted").
+
+The check in 4b. above is what prevents the compute-init script from trying
+to configure the node before the services on the control node are available
+(which requires running the site.yml playbook).
+
+The following roles are currently fully functional:
+- `resolv_conf`
+- `etc_hosts`
+- `stackhpc.openhpc`
+
+# Development/debugging
+
+To develop/debug this without actually having to build an image:
+
+
+1. Deploy a cluster using tofu and ansible/site.yml as normal. This will
+   additionally configure the control node to export compute hostvars over NFS.
+   Check the cluster is up.
+
+2. Reimage the compute nodes:
+
+       ansible-playbook --limit compute ansible/adhoc/rebuild
+
+3. Add metadata to a compute node e.g. via Horizon to turn on compute-init
+   playbook functionality.
+
+4. Fake an image build to deploy the compute-init playbook:
+
+       ansible-playbook ansible/fatimage.yml --tags compute_init
+
+   NB: This will also re-export the compute hostvars, as the nodes are not
+   in the builder group, which conveniently means any changes made to that
+   play also get picked up.
+
+5. Fake a reimage of compute to run ansible-init and the compute-init playbook:
+
+   On compute node where metadata was added:
+
+       [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init
+       [root@rl9-compute-0 rocky]# systemctl status ansible-init
+
+   Use `systemctl status ansible-init` to view stdout/stderr from Ansible.
+
+Steps 4/5 can be repeated with changes to the compute script. If required,
+reimage the compute node(s) first as in step 2 and/or add additional metadata
+as in step 3.
+
+
+# Design notes
+- Duplicating code in roles into the `compute-init` script is unfortunate, but
+  does allow developing this functionality without wider changes to the
+  appliance.
+
+- In general, we don't want to rely on NFS export. So should e.g. copy files
+  from this mount ASAP in the compute-init script. TODO:
+
+- There are a couple of approaches to supporting existing roles using `compute-init`:
+
+  1. Control node copies files resulting from role into cluster exports,
+  compute-init copies to local disk. Only works if files are not host-specific
+  Examples: etc_hosts, eessi config?
+
+  2. Re-implement the role. Works if the role vars are not too complicated,
+  (else they all need to be duplicated in compute-init). Could also only
+  support certain subsets of role functionality or variables
+  Examples: resolv_conf, stackhpc.openhpc
+
+- Some hostvars are templated from hostvars from other nodes, which aren't
+  available in the current approach:
+
+  ```
+  [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml
+    "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}",
+    "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}",
+    "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}",
+    "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}",
+    "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}",
+    "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}",
+    "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}",
+    "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}",
+    "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}",
+    "{{ hostvars[groups['freeipa_server'].0].ansible_host }}"
+  ```
+
+  More generally, there is nothing to stop any group var depending on a
+  "{{ hostvars[] }}" interpolation ...
+
+  Currently, the only functionality this has been problematic for is setting
+  the control node address for the slurmd node, which has been done using
+  the (metadata-provided) IP, given this is needed to do the NFS mount anyway
+  in the absence of working internal DNS.
diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md
index 17ed370e6..dae840d95 100644
--- a/docs/experimental/compute-init.md
+++ b/docs/experimental/compute-init.md
@@ -1,46 +1,6 @@
 # compute-init
 
-The following roles are currently functional:
-- resolv_conf
-- etc_hosts
-- stackhpc.openhpc
-
-# Development
-
-To develop/debug this without actually having to build an image:
-
-
-1. Deploy a cluster using tofu and ansible/site.yml as normal. This will
-   additionally configure the control node to export compute hostvars over NFS.
-   Check the cluster is up.
-
-2. Reimage the compute nodes:
-
-       ansible-playbook --limit compute ansible/adhoc/rebuild
-
-3. Add metadata to a compute node e.g. via Horzon to turn on compute-init
-   playbook functionality.
-
-4. Fake an image build to deploy the compute-init playbook:
-
-       ansible-playbook ansible/fatimage.yml --tags compute_init
-
-   NB: This will also re-export the compute hostvars, as the nodes are not
-   in the builder group, which conveniently means any changes made to that
-   play also get picked up.
-
-5. Fake a reimage of compute to run ansible-init and the compute-init playbook:
-
-   On compute node where metadata was added:
-
-       [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init
-       [root@rl9-compute-0 rocky]# systemctl status ansible-init
-
-   Use `systemctl status ansible-init` to view stdout/stderr from Ansible.
-
-Steps 4/5 can be repeated with changes to the compute script. If required,
-reimage the compute node(s) first as in step 2 and/or add additional metadata
-as in step 3.
+See the role README.md # Results/progress @@ -151,41 +111,3 @@ This commit - shows that hostvars have loaded: Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] writing sentinel file /var/lib/ansible-init.done Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. - -# Design notes - -- In general, we don't want to rely on NFS export. So should e.g. copy files - from this mount ASAP in the compute-init script. TODO: -- There are a few possible approaches: - - 1. Control node copies files resulting from role into cluster exports, - compute-init copies to local disk. Only works if files are not host-specific - Examples: etc_hosts, eessi config? - - 2. Re-implement the role. Works if the role vars are not too complicated, - (else they all need to be duplicated in compute-init). Could also only - support certain subsets of role functionality or variables - Examples: resolv_conf, stackhpc.openhpc - - -# Problems with templated hostvars - -Here are all the ones which actually rely on hostvars from other nodes, -which therefore aren't available: - -``` -[root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml - "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", - "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", - "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", - "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}", - "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}", - "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}", - "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}", - "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}", - "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}", - "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" -``` - -More generally, there is nothing to stop any group var depending on a -"{{ hostvars[] }}" interpolation ... 
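For reference, the metadata which drives the compute-init playbook in the README above might be set on an instance as in this sketch; the key names come from the playbook's `os_metadata.meta` lookups and the values are illustrative only:

```yaml
# Hypothetical OpenStack instance metadata consumed by compute-init
meta:
  enable_compute: "true"     # without this the playbook exits immediately
  enable_etc_hosts: "true"   # per-feature flags, defaulting to false
  k3s_server: "10.0.0.10"    # control node IP; illustrative value
```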
From 11580b3d4855f01bf9f6108802b4edf5b3625227 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:09:04 +0000 Subject: [PATCH 188/268] Remove use of FIPs for leafcloud packer builds (#498) --- environments/.stackhpc/LEAFCLOUD.pkrvars.hcl | 5 ++++- environments/.stackhpc/SMS.pkrvars.hcl | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index 5adf4199c..db0b28b49 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -4,4 +4,7 @@ networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] -floating_ip_network = "external" +# see environments/.stackhpc/inventory/group_vars/all/bastion.yml: +ssh_bastion_username = "slurm-app-ci" +ssh_bastion_host = "195.114.30.222" +ssh_bastion_private_key_file = "~/.ssh/id_rsa" diff --git a/environments/.stackhpc/SMS.pkrvars.hcl b/environments/.stackhpc/SMS.pkrvars.hcl index b88106fe8..3ebe734eb 100644 --- a/environments/.stackhpc/SMS.pkrvars.hcl +++ b/environments/.stackhpc/SMS.pkrvars.hcl @@ -2,6 +2,7 @@ flavor = "general.v1.small" networks = ["e2b9e59f-43da-4e1c-b558-dc9da4c0d738"] # stackhpc-ipv4-geneve ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" +# see environments/.stackhpc/inventory/group_vars/all/bastion.yml: ssh_bastion_username = "slurm-app-ci" ssh_bastion_host = "185.45.78.150" -ssh_bastion_private_key_file = "~/.ssh/id_rsa" \ No newline at end of file +ssh_bastion_private_key_file = "~/.ssh/id_rsa" From 1ba41d8bfb1b1e3ec716cb39c4a34bc3ed8f4cb1 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:54:45 +0000 Subject: [PATCH 189/268] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 7c59abf36..be9dfe5cb 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241216-1607-2357a730", - "RL9": "openhpc-RL9-241216-1607-2357a730" + "RL8": "openhpc-RL8-241217-1146-d77be652", + "RL9": "openhpc-RL9-241217-1145-d77be652" } } From a868642a8995130c187046762d00a68d109e5c0a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 17 Dec 2024 13:09:19 +0000 Subject: [PATCH 190/268] bump CI image --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 5b9d845ef..1495ce5a7 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241211-1322-ded60c2c", - "RL9": "openhpc-RL9-241211-1322-ded60c2c" + "RL8": "openhpc-RL8-241217-1210-5e7f8092", + "RL9": "openhpc-RL9-241217-1209-5e7f8092" } } From 4b0e36dd7f9a67c73e69980732cbbb908c7b5889 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 17 Dec 2024 13:33:51 +0000 Subject: 
[PATCH 191/268] now performs update in fatimage --- .github/workflows/fatimage.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 331035001..6649a3533 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -23,11 +23,11 @@ jobs: matrix: # build RL8, RL9 build: - image_name: openhpc-RL8 - source_image_name: rocky-latest-RL8 - inventory_groups: control,compute,login + source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + inventory_groups: control,compute,login,update - image_name: openhpc-RL9 - source_image_name: rocky-latest-RL9 - inventory_groups: control,compute,login + source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: control,compute,login,update env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack From bc36b78121e8828855be9b247f2ea07fe8113882 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 17 Dec 2024 14:19:08 +0000 Subject: [PATCH 192/268] testing enabling release train for 8.10 --- .github/workflows/nightlybuild.yml | 2 +- ansible/bootstrap.yml | 1 - ansible/disable-repos.yml | 1 - ansible/roles/dnf_repos/defaults/main.yml | 24 +++++++++++++++---- .../inventory/group_vars/all/defaults.yml | 5 ++++ 5 files changed, 25 insertions(+), 8 deletions(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 2485cd2df..ec920ce8d 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -25,7 +25,7 @@ jobs: matrix: # build RL8, RL9 build: - image_name: rocky-latest-RL8 - source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2 inventory_groups: update - image_name: rocky-latest-RL9 source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index a504f3545..e2497d9c6 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -122,7 +122,6 @@ ansible.builtin.include_role: name: dnf_repos tasks_from: set_repos.yml - when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided # --- tasks after here require access to package repos --- - hosts: squid diff --git a/ansible/disable-repos.yml b/ansible/disable-repos.yml index d7dc4fd55..3e8022965 100644 --- a/ansible/disable-repos.yml +++ b/ansible/disable-repos.yml @@ -5,4 +5,3 @@ ansible.builtin.include_role: name: dnf_repos tasks_from: disable_repos.yml - when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 4a0c9fd2a..eb740e084 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -4,18 +4,32 @@ dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" dnf_repos_username: "{{ omit }}" dnf_repos_password: "{{ omit }}" +dnf_repos_filenames: + '8': + baseos: 'Rocky-BaseOS' + appstream: 'Rocky-AppStream' + crb: 'Rocky-PowerTools' + extras: 'Rocky-Extras' + '9': + baseos: 'rocky' + appstream: 'rocky' + crb: 'rocky' + extras: 'rocky-extras' + +dnf_repos_version_filenames: "{{ dnf_repos_filenames[ansible_distribution_major_version] }}" + # epel installed separately dnf_repos_repolist: -- file: rocky +- file: "{{ dnf_repos_version_filenames.baseos }}" name: baseos base_url: "{{ 
dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/BaseOS/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.baseos[ansible_distribution_version] }}" -- file: rocky +- file: "{{ dnf_repos_version_filenames.appstream }}" name: appstream base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.appstream[ansible_distribution_version] }}" -- file: rocky - name: crb +- file: "{{ dnf_repos_version_filenames.crb }}" + name: "{{ 'powertools' if ansible_distribution_major_version == '8' else 'crb' }}" base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.crb[ansible_distribution_version] }}" -- file: rocky-extras +- file: "{{ dnf_repos_version_filenames.extras }}" name: extras base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.extras[ansible_distribution_version] }}" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 1bac4590d..a9b7224d8 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -85,11 +85,16 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us appliances_repo_timestamps: baseos: '9.4': 20240816T002610 + '8.10': 20241217T123729 appstream: '9.4': 20240816T002610 + '8.10': 20241217T123729 crb: '9.4': 20240816T002610 + '8.10': 20241217T123729 extras: '9.4': 20240816T002610 + '8.10': 20241217T123729 epel: '9': 20240902T080424 + '8': 20241216T235733 From a9e53ba6a79857d09f1b8b95b5fe919089a9677d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 17 Dec 2024 14:21:40 +0000 Subject: [PATCH 193/268] Temporarily (?) 
building from rocky 8 genericcloud + update in fatimage --- .github/workflows/fatimage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 331035001..7d7571133 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -23,8 +23,8 @@ jobs: matrix: # build RL8, RL9 build: - image_name: openhpc-RL8 - source_image_name: rocky-latest-RL8 - inventory_groups: control,compute,login + source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2 + inventory_groups: control,compute,login,update - image_name: openhpc-RL9 source_image_name: rocky-latest-RL9 inventory_groups: control,compute,login From 47b7bb3691d42705a8db37156b3e935a82651b30 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 17 Dec 2024 14:27:46 +0000 Subject: [PATCH 194/268] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 125180527..67e267dfb 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241213-1416-9065bb6d", - "RL9": "openhpc-RL9-241213-1417-9065bb6d" + "RL8": "openhpc-RL8-241217-1341-eeb88386", + "RL9": "openhpc-RL9-241217-1341-eeb88386" } } From 7fe3ca5b2b6ec7f005012f919c799bbe11257eec Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 17 Dec 2024 15:08:13 +0000 Subject: [PATCH 195/268] docs suggestions Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- docs/experimental/pulp.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index e0f32cdc1..6d30bec6b 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -1,11 +1,11 @@ # Pulp Server -In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. Using a local Pulp can be enabled by adding `pulp` to the build groups and overriding `appliances_pulp_url` to point at the local Pulp's URL. +In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. ## Deploying/configuring Pulp Server ### Deploying a Pulp server -A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml' playbook. This can be run with +A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml` playbook. This can be run with `ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="` where `target_host` is any resolvable host. This will print a Pulp URL which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. 
Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network. From 1faf4e523cf7db91e59327d978ab0aeffd05a41e Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 17 Dec 2024 15:10:41 +0000 Subject: [PATCH 196/268] stopped openhpc overwriting epel 8 --- environments/common/inventory/group_vars/all/openhpc.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index a23bc77ba..cf2762f17 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -41,10 +41,4 @@ openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if applianc ohpc_default_extra_repos: "9": [] #overriding to ensure doesn't overwrite ark epel repo - "8": - - name: epel - file: epel - description: "Extra Packages for Enterprise Linux 8 - $basearch" - metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir" - gpgcheck: true - gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" + "8": [] From 6ce4953d483ba8939bdd6c344b0ecc068179a258 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 18 Dec 2024 08:57:08 +0000 Subject: [PATCH 197/268] fixed broken powertools repo --- ansible/roles/dnf_repos/defaults/main.yml | 2 +- ansible/roles/dnf_repos/tasks/set_repos.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index eb740e084..89a8229f7 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -28,7 +28,7 @@ dnf_repos_repolist: base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.appstream[ansible_distribution_version] }}" - file: "{{ dnf_repos_version_filenames.crb }}" name: "{{ 'powertools' if ansible_distribution_major_version == '8' else 'crb' }}" - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.crb[ansible_distribution_version] }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/{{ 'PowerTools' if ansible_distribution_major_version == '8' else 'CRB' }}/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.crb[ansible_distribution_version] }}" - file: "{{ dnf_repos_version_filenames.extras }}" name: extras base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.extras[ansible_distribution_version] }}" diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index fe5e2c02c..c9fcb0c07 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -8,6 +8,7 @@ description: "{{ item.name }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" + gpgcheck: false loop: "{{ dnf_repos_repolist }}" - name: Install epel-release From 29a157910b3c32414899d88b22dbb7446ce57bd0 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:57:33 +0000 Subject: [PATCH 198/268] bump --- 
.../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 989b9f9bb..5e5acebeb 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241216-1146-18b220e1", - "RL9": "openhpc-RL9-241216-1146-18b220e1" + "RL8": "openhpc-RL8-241218-0900-a99d8be6", + "RL9": "openhpc-RL9-241218-0859-a99d8be6" } } From ee4ab93037e90806282c4da650a2fe816ac04b7a Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 18 Dec 2024 11:12:12 +0000 Subject: [PATCH 199/268] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 67e267dfb..db25176e2 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241217-1341-eeb88386", - "RL9": "openhpc-RL9-241217-1341-eeb88386" + "RL8": "openhpc-RL8-241218-1011-5effb3fa", + "RL9": "openhpc-RL9-241218-1011-5effb3fa" } } From 82ef12bd60e1e8ee39cfb2a852531d36965f1a5c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 12:01:04 +0000 Subject: [PATCH 200/268] support nfs for compute-init --- ansible/roles/compute_init/README.md | 18 +++++++++++------- .../roles/compute_init/files/compute-init.yml | 18 ++++++++++++++++-- ansible/roles/compute_init/tasks/install.yml | 5 ++++- .../common/inventory/group_vars/all/nfs.yml | 2 +- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index dac59e2d3..2931986ba 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -32,9 +32,12 @@ to configure the node before the services on the control node are available (which requires running the site.yml playbook). The following roles are currently fully functional: -- `resolv_conf` -- `etc_hosts` -- `stackhpc.openhpc` +- `resolv_conf`: all functionality +- `etc_hosts`: all functionality +- `nfs`: client functionality only +- `stackhpc.openhpc`: all functionality, except that the control server name + must be the control node's `inventory_hostname`; `openhpc_slurm_control_host` + and `openhpc_slurm_control_host_address` are ignored. # Development/debugging @@ -113,7 +116,8 @@ as in step 3. More generally, there is nothing to stop any group var depending on a "{{ hostvars[] }}" interpolation ... - Currently, the only functionality this has been problematic for is setting - the control node address for the slurmd node, which has been done using - the (metadata-provided) IP, given this is needed to do the NFS mount anyway - in the absence of working internal DNS. + Currently, this has been worked around for the following cases: + - The inventory hostname for the control node, indirected via `.api_address` + in the above hostvars. This is needed for the default nfs configuration + and the slurmctld namne. 
For compute-init this has been Defined using + "{{ groups['control'] | first }}" as the hostvars do include the groups. diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 74face5e1..66cf755d4 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -13,7 +13,17 @@ # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects # this is a good example: common environment actually defines this (non-functional w/o compute groups), but role default is empty resolv_conf_nameservers: [] - + + nfs_client_mnt_point: "/mnt" + nfs_client_mnt_options: + nfs_client_mnt_state: mounted + nfs_configurations: + nfs_enable: + clients: false + nfs_enable: + server: false + clients: false + tasks: - block: - name: Report skipping initialization if not compute node @@ -95,6 +105,10 @@ when: enable_etc_hosts # TODO: - name: NFS client mount + - name: If nfs-clients is present + include_tasks: nfs-clients.yml + when: nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool) + loop: "{{ nfs_configurations }}" # TODO: - name: Manila mount @@ -116,7 +130,7 @@ - name: Set slurmctld location for configless operation lineinfile: path: /etc/sysconfig/slurmd - line: "SLURMD_OPTIONS='--conf-server {{ server_node_ip }}'" + line: "SLURMD_OPTIONS='--conf-server {{ groups['control'] | first }}'" regexp: "^SLURMD_OPTIONS=" create: yes owner: root diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index 4eef5deb8..fc96d3a18 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -12,6 +12,7 @@ - files - library - filter_plugins + - playbooks - name: Inject files from roles copy: @@ -33,7 +34,9 @@ dest: library/os_manila_share.py - src: ../../basic_users/filter_plugins/filter_keys.py dest: filter_plugins/filter_keys.py - + - src: ../../stackhpc.nfs/tasks/nfs-clients.yml + dest: playbooks/nfs-clients.yml + - name: Add filter_plugins to ansible.cfg lineinfile: path: /etc/ansible-init/ansible.cfg diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index e9366da2b..7960809ca 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -3,7 +3,7 @@ # See: https://github.com/stackhpc/ansible-role-cluster-nfs # for variable definitions -nfs_server_default: "{{ hostvars[groups['control'] | first ].internal_address }}" +nfs_server_default: "{{ groups['control'] | first }}" # avoid using hostvars so nfs_configurations: - comment: Export /exports/home from Slurm control node as /home From 9049d30dc6cfdf47e378e667303c0ac5e0129f4e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 12:01:26 +0000 Subject: [PATCH 201/268] fix compute-init README typos --- ansible/roles/compute_init/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 2931986ba..733cdb80f 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -50,9 +50,9 @@ To develop/debug this without actually having to build an image: 2. Reimage the compute nodes: - ansible-playbook --limit compute ansible/adhoc/rebuild + ansible-playbook --limit compute ansible/adhoc/rebuild.yml -3. 
Add metadata to a compute node e.g. via Horzon to turn on compute-init +3. Add metadata to a compute node e.g. via Horizon to turn on compute-init playbook functionality. 4. Fake an image build to deploy the compute-init playbook: From 79f52f9bd91d1761ef5a3e46e5130c9564de9f17 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 12:01:52 +0000 Subject: [PATCH 202/268] fix typo in resolv_conf metadata --- ansible/roles/compute_init/files/compute-init.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 66cf755d4..1e902e073 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -7,7 +7,7 @@ os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" server_node_ip: "{{ os_metadata.meta.k3s_server }}" enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}" - enable_resolv_conf: "{{ os_metadata.meta.enable_senable_resolv_conf | default(false) | bool }}" + enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects From 54381501031495f894f57396ebecdd94a3bf4a0d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 18 Dec 2024 12:45:11 +0000 Subject: [PATCH 203/268] added 9.5 ark snapshots + bumped genericcloud --- .github/workflows/fatimage.yml | 2 +- ansible/adhoc/sync-pulp.yml | 2 +- docs/experimental/pulp.md | 2 +- environments/common/inventory/group_vars/all/defaults.yml | 4 ++++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 6649a3533..fb6395e45 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -26,7 +26,7 @@ jobs: source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 inventory_groups: control,compute,login,update - image_name: openhpc-RL9 - source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + source_image_name: Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.qcow2 inventory_groups: control,compute,login,update env: ANSIBLE_FORCE_COLOR: True diff --git a/ansible/adhoc/sync-pulp.yml b/ansible/adhoc/sync-pulp.yml index f26149bba..b2cd9a8c4 100644 --- a/ansible/adhoc/sync-pulp.yml +++ b/ansible/adhoc/sync-pulp.yml @@ -6,5 +6,5 @@ vars: pulp_site_target_arch: "x86_64" pulp_site_target_distribution: "rocky" - pulp_site_target_distribution_version: "9.4" + pulp_site_target_distribution_version: "9.5" pulp_site_target_distribution_version_major: "9" diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index 6d30bec6b..fb2cda023 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -14,4 +14,4 @@ An existing Pulp server can be used to host Ark repos by overriding `pulp_site_p ## Syncing Pulp content with Ark -If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must authenticate with Ark by overriding `pulp_site_upstream_username` and `pulp_site_upstream_password` with your vault encrypted Ark dev credentials. `dnf_repos_username` and `dnf_repos_password` must remain unset to access content from the local Pulp. 
Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default this syncs repositories for Rocky 9.4 with x86_64 architecture, but can be overriden by setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`. +If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must authenticate with Ark by overriding `pulp_site_upstream_username` and `pulp_site_upstream_password` with your vault encrypted Ark dev credentials. `dnf_repos_username` and `dnf_repos_password` must remain unset to access content from the local Pulp. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default this syncs repositories for Rocky 9.5 with x86_64 architecture, but can be overridden by setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`. diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 29724fb6f..3ff5ba02a 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -85,11 +85,15 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us appliances_repo_timestamps: baseos: '9.4': 20241115T011711 + '9.5': 20241216T013503 appstream: '9.4': 20241112T003151 + '9.5': 20241217T005008 crb: '9.4': 20241115T003133 + '9.5': 20241217T005008 extras: '9.4': 20241118T002802 + '9.5': 20241218T004632 epel: '9': 20241213T010218 From e7c96ad67c89c3a05d7eef0b8521998e9cb279c7 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 18 Dec 2024 13:36:39 +0000 Subject: [PATCH 204/268] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 67e267dfb..74bf0295c 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241217-1341-eeb88386", - "RL9": "openhpc-RL9-241217-1341-eeb88386" + "RL8": "openhpc-RL8-241218-1254-54381501", + "RL9": "openhpc-RL9-241218-1254-54381501" } } From 4f81b89e5ca6509bca0ab8538db35655abaa0226 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 13:51:55 +0000 Subject: [PATCH 205/268] fix nfs and make openhpc fully-capable in compute-init --- ansible/roles/compute_init/README.md | 22 ++++++++++--------- .../roles/compute_init/files/compute-init.yml | 16 ++++++++------ ansible/roles/compute_init/tasks/install.yml | 6 ++--- .../common/inventory/group_vars/all/nfs.yml | 2 +- .../inventory/group_vars/all/openhpc.yml | 2 +- 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 733cdb80f..83de9d73f 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -35,9 +35,7 @@ The following roles are currently fully functional: - `resolv_conf`: all functionality - `etc_hosts`: all functionality - `nfs`: client functionality only -- `stackhpc.openhpc`: all functionality,
except that the control server name - must be the control node's `inventory_hostname`; `openhpc_slurm_control_host` - and `openhpc_slurm_control_host_address` are ignored. +- `stackhpc.openhpc`: all functionality # Development/debugging @@ -96,8 +94,8 @@ as in step 3. support certain subsets of role functionality or variables Examples: resolv_conf, stackhpc.openhpc -- Some hostvars are tempalted from hostvars from other nodes, which aren't - available in the current approach: +- Some variables are defined using hostvars from other nodes, which aren't + available v the current approach: ``` [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml @@ -116,8 +114,12 @@ as in step 3. More generally, there is nothing to stop any group var depending on a "{{ hostvars[] }}" interpolation ... - Currently, this has been worked around for the following cases: - - The inventory hostname for the control node, indirected via `.api_address` - in the above hostvars. This is needed for the default nfs configuration - and the slurmctld namne. For compute-init this has been Defined using - "{{ groups['control'] | first }}" as the hostvars do include the groups. + Only `nfs_server_default` and `openhpc_slurm_control_host` are of concern + for compute nodes - both of these indirect via `api_address` to + `inventory_hostname`. This has been worked around by replacing this with + "{{ groups['control'] | first }}" which does result in the control node + inventory hostname when templating. + + Note that although `groups` is defined in the templated hostvars, when + the hostvars are loaded using `include_vars:` is is ignored as it is a + "magic variable" determined by ansible itself and cannot be set. diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 1e902e073..ce10a890f 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -9,9 +9,9 @@ enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}" enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" + enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects - # this is a good example: common environment actually defines this (non-functional w/o compute groups), but role default is empty resolv_conf_nameservers: [] nfs_client_mnt_point: "/mnt" @@ -20,9 +20,8 @@ nfs_configurations: nfs_enable: clients: false - nfs_enable: - server: false - clients: false + + # openhpc: no defaults required tasks: - block: @@ -106,8 +105,10 @@ # TODO: - name: NFS client mount - name: If nfs-clients is present - include_tasks: nfs-clients.yml - when: nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool) + include_tasks: ../tasks/nfs-clients.yml + when: + - enable_nfs + - nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool) loop: "{{ nfs_configurations }}" # TODO: - name: Manila mount @@ -130,7 +131,7 @@ - name: Set slurmctld location for configless operation lineinfile: path: /etc/sysconfig/slurmd - line: "SLURMD_OPTIONS='--conf-server {{ groups['control'] | first }}'" + line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'" regexp: 
"^SLURMD_OPTIONS=" create: yes owner: root @@ -152,3 +153,4 @@ - name: Ensure node is resumed # TODO: consider if this is always safe for all job states? command: scontrol update state=resume nodename={{ ansible_hostname }} + # TODO: make safe for repeated runs diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index fc96d3a18..8f36aa836 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -12,7 +12,7 @@ - files - library - filter_plugins - - playbooks + - tasks - name: Inject files from roles copy: @@ -35,7 +35,7 @@ - src: ../../basic_users/filter_plugins/filter_keys.py dest: filter_plugins/filter_keys.py - src: ../../stackhpc.nfs/tasks/nfs-clients.yml - dest: playbooks/nfs-clients.yml + dest: tasks/nfs-clients.yml - name: Add filter_plugins to ansible.cfg lineinfile: @@ -52,4 +52,4 @@ dest: /etc/ansible-init/playbooks/1-compute-init.yml owner: root group: root - mode: 0644 \ No newline at end of file + mode: 0644 diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index 7960809ca..45b7c6967 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -3,7 +3,7 @@ # See: https://github.com/stackhpc/ansible-role-cluster-nfs # for variable definitions -nfs_server_default: "{{ groups['control'] | first }}" # avoid using hostvars so +nfs_server_default: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init nfs_configurations: - comment: Export /exports/home from Slurm control node as /home diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index a23bc77ba..84fe6ef57 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -13,7 +13,7 @@ openhpc_slurm_accounting_storage_type: 'accounting_storage/slurmdbd' openhpc_slurmdbd_mysql_database: slurm_acct_db openhpc_slurmdbd_mysql_password: "{{ vault_mysql_slurm_password }}" openhpc_slurmdbd_mysql_username: slurm -openhpc_slurm_control_host: "{{ hostvars[groups['control'].0].api_address }}" +openhpc_slurm_control_host: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}" openhpc_slurm_partitions: - name: "compute" From 9859d542d5278ff595a9f53a8ae84910ff465937 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 14:02:30 +0000 Subject: [PATCH 206/268] make compute-init safe for rerunning ansible-init --- ansible/roles/compute_init/files/compute-init.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index ce10a890f..fbad53012 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -153,4 +153,7 @@ - name: Ensure node is resumed # TODO: consider if this is always safe for all job states? 
command: scontrol update state=resume nodename={{ ansible_hostname }} - # TODO: make safe for repeated runs + register: _scontrol_update + failed_when: + - _scontrol_update.rc > 0 + - "'slurm_update error: Invalid node state specified' not in _scontrol_update.stderr" From e0d0c06b126c56848cc1da7a93c9aa8a63463efd Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 14:40:10 +0000 Subject: [PATCH 207/268] support manila in compute-init --- .../roles/compute_init/files/compute-init.yml | 87 ++++++++++++++++++- ansible/roles/compute_init/tasks/export.yml | 11 +++ ansible/roles/compute_init/tasks/install.yml | 10 +-- .../inventory/group_vars/all/manila.yml | 5 ++ 4 files changed, 104 insertions(+), 9 deletions(-) create mode 100644 environments/.stackhpc/inventory/group_vars/all/manila.yml diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index fbad53012..fb853407b 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -10,6 +10,7 @@ enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}" + enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects resolv_conf_nameservers: [] @@ -23,6 +24,16 @@ # openhpc: no defaults required + os_manila_mount_shares: [] + os_manila_mount_ceph_conf_path: /etc/ceph + os_manila_mount_state: mounted + os_manila_mount_opts: + - x-systemd.device-timeout=30 + - x-systemd.mount-timeout=30 + - noatime + - _netdev # prevents mount blocking early boot before networking available + - rw + tasks: - block: - name: Report skipping initialization if not compute node @@ -103,15 +114,85 @@ mode: 0644 when: enable_etc_hosts - # TODO: - name: NFS client mount + # NFS client mount - name: If nfs-clients is present - include_tasks: ../tasks/nfs-clients.yml + include_tasks: tasks/nfs-clients.yml when: - enable_nfs - nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool) loop: "{{ nfs_configurations }}" - # TODO: - name: Manila mount + - name: Manila mounts + block: + - name: Read manila share info from nfs file + include_vars: + file: /mnt/cluster/manila_share_info.yml + no_log: true # contains secrets + + - name: Ensure Ceph configuration directory exists + ansible.builtin.file: + path: "{{ os_manila_mount_ceph_conf_path }}" + state: directory + mode: "0755" + owner: root + group: root + + - name: Configure ceph.conf using os_manila_mount_host + ansible.builtin.template: + src: ceph.conf.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.conf" + owner: root + group: root + mode: "0600" + + - name: Ensure mount directory exists + ansible.builtin.file: + path: "{{ item.mount_path }}" + state: directory + owner: "{{ item.mount_user | default(omit) }}" + group: "{{ item.mount_group | default(omit) }}" + mode: "{{ item.mount_mode | default(omit) }}" + loop: "{{ os_manila_mount_shares }}" + loop_control: + label: "{{ item.share_name }}" + + - name: Write Ceph client keyring + ansible.builtin.template: + src: ceph.keyring.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.client.{{ item.share_user }}.keyring" + mode: "0600" + owner: root + group: root + loop: "{{ os_manila_mount_share_info }}" + 
loop_control: + label: "{{ item.share_name }}" + + - name: Mount the Ceph share + ansible.posix.mount: + path: "{{ item[0].mount_path }}" + src: "{{ item[1].host }}:{{ item[1].export }}" + fstype: ceph + opts: "name={{ item[1].share_user }},{{ (item[0].mount_opts | default(os_manila_mount_opts)) | join(',') }}" + # NB share_user is looked up here in case of autodetection + state: "{{ item[0].mount_state | default(os_manila_mount_state) }}" + loop: "{{ os_manila_mount_shares | zip(os_manila_mount_share_info) }}" + loop_control: + label: "{{ item[0].share_name }}" + + - name: Ensure mounted directory has correct permissions + ansible.builtin.file: + path: "{{ item.mount_path }}" + state: directory + owner: "{{ item.mount_user | default(omit) }}" + group: "{{ item.mount_group | default(omit) }}" + mode: "{{ item.mount_mode | default(omit) }}" + loop: "{{ os_manila_mount_shares }}" + loop_control: + label: "{{ item.share_name }}" + when: item.mount_state | default(os_manila_mount_state) in ['mounted', 'ephemeral'] when: - enable_manila - os_manila_mount_shares | length > 0 # TODO: - name: Basic users setup diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml index 3e9340cb5..d1d609895 100644 --- a/ansible/roles/compute_init/tasks/export.yml +++ b/ansible/roles/compute_init/tasks/export.yml @@ -33,3 +33,14 @@ dest: /exports/cluster/hostvars/{{ inventory_hostname }}/hostvars.yml mode: u=rw,go= delegate_to: "{{ groups['control'] | first }}" + +- name: Copy manila share info to /exports/cluster + copy: + content: "{{ os_manila_mount_share_info_var | to_nice_yaml }}" + dest: /exports/cluster/manila_share_info.yml + run_once: true + delegate_to: "{{ groups['control'] | first }}" + when: os_manila_mount_share_info is defined + vars: + os_manila_mount_share_info_var: + os_manila_mount_share_info: "{{ os_manila_mount_share_info }}" diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index 8f36aa836..29a2f53e7 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -2,7 +2,7 @@ - name: Ensure directories exist file: - path: "/etc/ansible-init/{{ item }}" + path: "/etc/ansible-init/playbooks/{{ item }}" state: directory owner: root group: root @@ -17,7 +17,7 @@ - name: Inject files from roles copy: src: '{{ item.src }}' - dest: '/etc/ansible-init/{{ item.dest }}' + dest: '/etc/ansible-init/playbooks/{{ item.dest }}' owner: root group: root mode: 0644 @@ -30,10 +30,8 @@ dest: templates/ceph.keyring.j2 - src: ../../resolv_conf/files/NetworkManager-dns-none.conf dest: files/NetworkManager-dns-none.conf - - src: ../../stackhpc.os-manila-mount/library/os_manila_share.py - dest: library/os_manila_share.py - - src: ../../basic_users/filter_plugins/filter_keys.py - dest: filter_plugins/filter_keys.py + # - src: ../../basic_users/filter_plugins/filter_keys.py + # dest: filter_plugins/filter_keys.py - src: ../../stackhpc.nfs/tasks/nfs-clients.yml dest: tasks/nfs-clients.yml diff --git a/environments/.stackhpc/inventory/group_vars/all/manila.yml b/environments/.stackhpc/inventory/group_vars/all/manila.yml new file mode 100644 index 000000000..767a5dde8 --- /dev/null +++ b/environments/.stackhpc/inventory/group_vars/all/manila.yml @@ -0,0 +1,5 @@ +os_manila_mount_shares_arcus: + - share_name: slurm-v2-home + mount_path: /project + +os_manila_mount_shares: "{{ os_manila_mount_shares_arcus if ci_cloud == 'ARCUS' else [] }}" From
68bec3eaeb931ebb32312adee4c7b025f712aca5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 15:35:50 +0000 Subject: [PATCH 208/268] test manila if running on arcus --- environments/.stackhpc/inventory/group_vars/all/manila.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environments/.stackhpc/inventory/group_vars/all/manila.yml b/environments/.stackhpc/inventory/group_vars/all/manila.yml index 767a5dde8..59f935873 100644 --- a/environments/.stackhpc/inventory/group_vars/all/manila.yml +++ b/environments/.stackhpc/inventory/group_vars/all/manila.yml @@ -1,5 +1,7 @@ os_manila_mount_shares_arcus: - share_name: slurm-v2-home mount_path: /project + - share_name: slurm-scratch + mount_path: /scratch os_manila_mount_shares: "{{ os_manila_mount_shares_arcus if ci_cloud == 'ARCUS' else [] }}" From 14e7dc66d5d5254e495aee476b849d57ae187f8f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 15:52:51 +0000 Subject: [PATCH 209/268] support basic_users in compute-init --- ansible/roles/compute_init/README.md | 7 ++-- .../roles/compute_init/files/compute-init.yml | 34 ++++++++++++++++++- ansible/roles/compute_init/tasks/install.yml | 4 +-- 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 83de9d73f..94d9cd51c 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -31,11 +31,14 @@ The check in 4b. above is what prevents the compute-init script from trying to configure the node before the services on the control node are available (which requires running the site.yml playbook). -The following roles are currently fully functional: +The following roles/groups are currently fully functional: - `resolv_conf`: all functionality - `etc_hosts`: all functionality - `nfs`: client functionality only -- `stackhpc.openhpc`: all functionality +- `manila`: all functionality +- `openhpc`: all functionality +- `basic_users`: all functionality, assumes home directory already exists on + shared storage # Development/debugging diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index fb853407b..1c37bbdc9 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -11,6 +11,7 @@ enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}" enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}" + enable_basic_users: "{{ os_metadata.meta.enable_basic_users | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects resolv_conf_nameservers: [] @@ -34,6 +35,15 @@ - _netdev # prevents mount blocking early boot before networking available - rw + basic_users_groups: [] + basic_users_manage_homedir: false # homedir must already exist on shared filesystem + basic_users_userdefaults: + state: present + create_home: "{{ basic_users_manage_homedir }}" + generate_ssh_key: "{{ basic_users_manage_homedir }}" + ssh_key_comment: "{{ item.name }}" + basic_users_users: [] + tasks: - block: - name: Report skipping initialization if not compute node @@ -194,7 +204,29 @@ - enable_manila - os_manila_mount_shares | length > 0 - # TODO: - name: Basic users setup + - name: Basic users + block: + - name: Create groups + ansible.builtin.group: "{{ item }}" + loop: "{{ 
basic_users_groups }}" + + - name: Create users + user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}" + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }} [{{ item.state | default('present') }}]" + register: basic_users_info + + - name: Write sudo rules + blockinfile: + path: /etc/sudoers.d/80-{{ item.name}}-user + block: "{{ item.sudo }}" + create: true + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }}" + when: "'sudo' in item" + when: enable_basic_users # TODO: - name: Configure EESSI diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index 29a2f53e7..bbcbf133f 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -30,8 +30,8 @@ dest: templates/ceph.keyring.j2 - src: ../../resolv_conf/files/NetworkManager-dns-none.conf dest: files/NetworkManager-dns-none.conf - # - src: ../../basic_users/filter_plugins/filter_keys.py - # dest: filter_plugins/filter_keys.py + - src: ../../basic_users/filter_plugins/filter_keys.py + dest: filter_plugins/filter_keys.py - src: ../../stackhpc.nfs/tasks/nfs-clients.yml dest: tasks/nfs-clients.yml From a2418ef42998e41a415388de667b42f57f963755 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 16:09:52 +0000 Subject: [PATCH 210/268] support eessi in compute-init --- ansible/roles/compute_init/README.md | 2 ++ .../roles/compute_init/files/compute-init.yml | 17 +++++++++++++-- ansible/roles/compute_init/tasks/export.yml | 21 +++++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 94d9cd51c..e9a045342 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -39,6 +39,8 @@ The following roles/groups are currently fully functional: - `openhpc`: all functionality - `basic_users`: all functionality, assumes home directory already exists on shared storage +- `eessi`: all functionality, assumes `cvmfs_config` is the same on control + node and all compute nodes. 
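As a sketch of how this per-node functionality gets switched on: compute-init reads the `enable_*` flags from instance metadata, so the "add metadata via Horizon" step described earlier is equivalent to something like the following CLI call (the node name is illustrative, and only the flags actually wanted should be set):

```
# illustrative: enable selected compute-init stages on one compute node
openstack server set \
  --property enable_compute=true \
  --property enable_nfs=true \
  --property enable_basic_users=true \
  rl9-compute-0
```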
# Development/debugging diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 1c37bbdc9..6327151bb 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -12,6 +12,7 @@ enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}" enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}" enable_basic_users: "{{ os_metadata.meta.enable_basic_users | default(false) | bool }}" + enable_eessi: "{{ os_metadata.meta.enable_eessi | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects resolv_conf_nameservers: [] @@ -228,9 +229,21 @@ when: "'sudo' in item" when: enable_basic_users - # TODO: - name: Configure EESSI + - name: EESSI + block: + - name: Copy cvmfs config + copy: + src: /mnt/cluster/cvmfs/default.local + dest: /etc/cvmfs/default.local + owner: root + group: root + mode: 0644 + + - name: Ensure CVMFS config is setup + command: + cmd: "cvmfs_config setup" + when: enable_eessi - # TODO: - name: Configure openhpc # NB: don't need conditional block on enable_compute as have already exited # if not the case - name: Write Munge key diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml index d1d609895..12b648f6e 100644 --- a/ansible/roles/compute_init/tasks/export.yml +++ b/ansible/roles/compute_init/tasks/export.yml @@ -44,3 +44,24 @@ vars: os_manila_mount_share_info_var: os_manila_mount_share_info: "{{ os_manila_mount_share_info }}" + +- name: Ensure /exports/cluster/cvmfs directory exists + file: + path: /exports/cluster/cvmfs + state: directory + owner: root + group: root + mode: 0755 + run_once: true + delegate_to: "{{ groups['control'] | first }}" + +- name: Copy EESSI CVMFS config to /exports/cluster + copy: + src: /etc/cvmfs/default.local + dest: /exports/cluster/cvmfs/default.local + owner: root + group: root + mode: 0644 + remote_src: true + run_once: true + delegate_to: "{{ groups['control'] | first }}" From cdbf005750a979bc430f00243323a6af70eecda0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 16:19:28 +0000 Subject: [PATCH 211/268] change metadata from k3s_server to control_address --- ansible/roles/cluster_infra/templates/resources.tf.j2 | 4 ++-- ansible/roles/compute_init/files/compute-init.yml | 2 +- ansible/roles/k3s/files/start_k3s.yml | 2 +- .../{{cookiecutter.environment}}/terraform/compute.tf | 2 +- .../{{cookiecutter.environment}}/terraform/compute/nodes.tf | 2 +- .../terraform/compute/variables.tf | 4 ++-- .../skeleton/{{cookiecutter.environment}}/terraform/nodes.tf | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 453f01a7e..69d001105 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -399,7 +399,7 @@ resource "openstack_compute_instance_v2" "login" { ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" {% endif %} {% endfor %} - k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 + control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 k3s_token = "{{ k3s_token }}" } } @@ -565,7 +565,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" { ansible_init_coll_{{ loop.index0 }}_source = 
"{{ collection.source }}" {% endif %} {% endfor %} - k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 + control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 k3s_token = "{{ k3s_token }}" } } diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 6327151bb..7c2ad6ae2 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -5,7 +5,7 @@ become: yes vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" - server_node_ip: "{{ os_metadata.meta.k3s_server }}" + server_node_ip: "{{ os_metadata.meta.control_address }}" enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}" enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 8ee0e6114..b9b82f1c4 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -3,7 +3,7 @@ vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" k3s_token: "{{ os_metadata.meta.k3s_token }}" - k3s_server_name: "{{ os_metadata.meta.k3s_server }}" + k3s_server_name: "{{ os_metadata.meta.control_address }}" service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}" tasks: - name: Ensure password directory exists diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index eb2139eba..14c728a5a 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -16,6 +16,6 @@ module "compute" { key_pair = var.key_pair environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index e64a2162c..7a2a706a6 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -47,7 +47,7 @@ resource "openstack_compute_instance_v2" "compute" { metadata = { environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = var.k3s_server + control_address = var.control_address } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index 9d2c2e47c..3655c9e65 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -72,7 +72,7 @@ variable "k3s_token" { type = string } -variable "k3s_server" { - description = 
"Name/address of k3s server" +variable "control_address" { + description = "Name/address of control node" type = string } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf index bfbd1c532..8ea8cabcb 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf @@ -126,7 +126,7 @@ resource "openstack_compute_instance_v2" "login" { metadata = { environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] } user_data = <<-EOF From 3b9eb467fd859585395d1f8450f4254dcffe75a3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 16:28:23 +0000 Subject: [PATCH 212/268] fixup resolv_conf support in cloud-init --- ansible/roles/compute_init/README.md | 2 +- ansible/roles/compute_init/files/compute-init.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index e9a045342..77a127245 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -36,11 +36,11 @@ The following roles/groups are currently fully functional: - `etc_hosts`: all functionality - `nfs`: client functionality only - `manila`: all functionality -- `openhpc`: all functionality - `basic_users`: all functionality, assumes home directory already exists on shared storage - `eessi`: all functionality, assumes `cvmfs_config` is the same on control node and all compute nodes. 
+- `openhpc`: all functionality # Development/debugging diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 7c2ad6ae2..c7a9048b4 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -94,7 +94,7 @@ block: - name: Set nameservers in /etc/resolv.conf ansible.builtin.template: - src: /etc/ansible-init/templates/resolv.conf.j2 + src: resolv.conf.j2 dest: /etc/resolv.conf owner: root group: root @@ -102,7 +102,7 @@ - name: Disable NetworkManager control of resolv.conf ansible.builtin.copy: - src: /etc/ansible-init/files/NetworkManager-dns-none.conf + src: files/NetworkManager-dns-none.conf dest: /etc/NetworkManager/conf.d/90-dns-none.conf owner: root group: root From 15ed0a3880e046aacf589ab28dbfe4d5b532c7a0 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 18 Dec 2024 16:52:45 +0000 Subject: [PATCH 213/268] Bump RL9.4 repo timestamps to latest snapshots (#497) * bumped repo timestamps to latest * bump * now performs update in fatimage * bump * bump --- .github/workflows/fatimage.yml | 8 ++++---- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- .../common/inventory/group_vars/all/defaults.yml | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 331035001..6649a3533 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -23,11 +23,11 @@ jobs: matrix: # build RL8, RL9 build: - image_name: openhpc-RL8 - source_image_name: rocky-latest-RL8 - inventory_groups: control,compute,login + source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + inventory_groups: control,compute,login,update - image_name: openhpc-RL9 - source_image_name: rocky-latest-RL9 - inventory_groups: control,compute,login + source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: control,compute,login,update env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 7c59abf36..db25176e2 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241216-1607-2357a730", - "RL9": "openhpc-RL9-241216-1607-2357a730" + "RL8": "openhpc-RL8-241218-1011-5effb3fa", + "RL9": "openhpc-RL9-241218-1011-5effb3fa" } } diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 1bac4590d..29724fb6f 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -84,12 +84,12 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us appliances_repo_timestamps: baseos: - '9.4': 20240816T002610 + '9.4': 20241115T011711 appstream: - '9.4': 20240816T002610 + '9.4': 20241112T003151 crb: - '9.4': 20240816T002610 + '9.4': 20241115T003133 extras: - '9.4': 20240816T002610 + '9.4': 20241118T002802 epel: - '9': 20240902T080424 + '9': 20241213T010218 From fed2d6eb7de7d35b9609e190007afdd3d41266da Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 19 Dec 2024 10:09:21 +0000 Subject: 
[PATCH 214/268] Pin nvidia-driver and cuda packages to working packages (#496) * move cuda tasks to install * pin nvidia driver to working version and autodetect os/arch * make install of cuda packages optional * don't run cuda install tasks unless during build * move doca install before cuda * update cuda docs * add cuda to extra build test CI * add cuda runtime tasks * fix typo in extras playbook * bump extra build size to 30GB for cuda * pin both cuda package version * make cuda idempotent/restartable * allow using computed tasks_from for cuda role * fix showing image summary * rename nvidia driver version var * bump CI image --- .github/workflows/{doca.yml => extra.yml} | 21 +++++++++----- ansible/cleanup.yml | 3 +- ansible/extras.yml | 3 +- ansible/fatimage.yml | 12 ++++++-- ansible/roles/cuda/README.md | 8 ++--- ansible/roles/cuda/defaults/main.yml | 7 ++--- .../cuda/tasks/{main.yml => install.yml} | 29 ++++++++++++++----- ansible/roles/cuda/tasks/runtime.yml | 5 ++++ .../terraform/cluster_image.auto.tfvars.json | 4 +-- 9 files changed, 61 insertions(+), 31 deletions(-) rename .github/workflows/{doca.yml => extra.yml} (89%) rename ansible/roles/cuda/tasks/{main.yml => install.yml} (60%) create mode 100644 ansible/roles/cuda/tasks/runtime.yml diff --git a/.github/workflows/doca.yml b/.github/workflows/extra.yml similarity index 89% rename from .github/workflows/doca.yml rename to .github/workflows/extra.yml index cfd3bb982..dece242ce 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/extra.yml @@ -1,4 +1,4 @@ -name: Test DOCA extra build +name: Test extra build on: workflow_dispatch: push: @@ -7,16 +7,18 @@ on: paths: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - - '.github/workflows/doca' + - 'ansible/roles/cuda/**' + - '.github/workflows/extra.yml' pull_request: paths: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - - '.github/workflows/doca' + - 'ansible/roles/cuda/**' + - '.github/workflows/extra.yml' jobs: doca: - name: doca-build + name: extra-build concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true @@ -25,12 +27,14 @@ jobs: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 build: - - image_name: openhpc-doca-RL8 + - image_name: openhpc-extra-RL8 source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json - inventory_groups: doca - - image_name: openhpc-doca-RL9 + inventory_groups: doca,cuda + volume_size: 30 # needed for cuda + - image_name: openhpc-extra-RL9 source_image_name_key: RL9 - inventory_groups: doca + inventory_groups: doca,cuda + volume_size: 30 # needed for cuda env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -95,6 +99,7 @@ jobs: -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \ -var "image_name=${{ matrix.build.image_name }}" \ -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ + -var "volume_size=${{ matrix.build.volume_size }}" \ openstack.pkr.hcl - name: Get created image names from manifest diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index 3f059d157..670a99b29 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -66,5 +66,4 @@ slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" - name: Show image summary - debug: - var: image_info + command: cat 
/var/lib/image/image.json diff --git a/ansible/extras.yml b/ansible/extras.yml index 107f85252..0a74541a5 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -24,8 +24,9 @@ gather_facts: yes tags: cuda tasks: - - import_role: + - include_role: name: cuda + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}" - name: Persist hostkeys across rebuilds # Must be after filesystems.yml (for storage) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 55e56e612..c35be5b64 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -29,6 +29,14 @@ - import_playbook: bootstrap.yml +- hosts: doca + become: yes + gather_facts: yes + tasks: + - name: Install NVIDIA DOCA + import_role: + name: doca + - name: Run post-bootstrap.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" @@ -220,8 +228,6 @@ import_role: name: doca -- import_playbook: disable-repos.yml - - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" @@ -229,6 +235,8 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- import_playbook: disable-repos.yml + - hosts: builder become: yes gather_facts: yes diff --git a/ansible/roles/cuda/README.md b/ansible/roles/cuda/README.md index 141e7b80d..be6439cd5 100644 --- a/ansible/roles/cuda/README.md +++ b/ansible/roles/cuda/README.md @@ -1,6 +1,6 @@ # cuda -Install NVIDIA CUDA. The CUDA binaries are added to the PATH for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. +Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. ## Prerequisites @@ -8,8 +8,8 @@ Requires OFED to be installed to provide required kernel-* packages. ## Role Variables -- `cuda_distro`: Optional. Default `rhel8`. -- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo` -- `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed. +- `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture. +- `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version. - `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`. +- `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA. - `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. 
Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`. diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index 33a25d9b4..05f1e093d 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,7 +1,6 @@ -cuda_distro: "rhel{{ ansible_distribution_major_version }}" -cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo" -cuda_driver_stream: default -cuda_package_version: 'latest' +cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" +cuda_nvidia_driver_stream: '560-open' # 565-open has problems with cuda packages +cuda_package_version: '12.6.3-1' cuda_packages: - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" - nvidia-gds diff --git a/ansible/roles/cuda/tasks/main.yml b/ansible/roles/cuda/tasks/install.yml similarity index 60% rename from ansible/roles/cuda/tasks/main.yml rename to ansible/roles/cuda/tasks/install.yml index 22f8e9e8e..51c92a0d3 100644 --- a/ansible/roles/cuda/tasks/main.yml +++ b/ansible/roles/cuda/tasks/install.yml @@ -1,7 +1,7 @@ # Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation -- name: Check for OFED +- name: Check for OFED/DOCA command: cmd: dnf list --installed rdma-core register: _dnf_rdma_core @@ -10,41 +10,53 @@ - name: Assert OFED installed assert: that: "'mlnx' in _dnf_rdma_core.stdout" - fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED installed?" + fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?" - name: Install cuda repo get_url: - dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo" - url: "{{ cuda_repo }}" + dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo" + url: "{{ cuda_repo_url }}" - name: Check if nvidia driver module is enabled - shell: - cmd: dnf module list --enabled nvidia-driver + ansible.builtin.command: dnf module list --enabled nvidia-driver changed_when: false failed_when: false register: _cuda_driver_module_enabled - name: Enable nvidia driver module - ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms" + ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_nvidia_driver_stream }}" register: _cuda_driver_module_enable when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout" +- name: Check if nvidia driver module is installed + ansible.builtin.command: dnf module list --installed nvidia-driver + changed_when: false + failed_when: false + register: _cuda_driver_module_installed + - name: Install nvidia drivers ansible.builtin.command: dnf module install -y nvidia-driver register: _cuda_driver_install - when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" + when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr" changed_when: "'Nothing to do' not in _cuda_driver_install.stdout" +- name: Check kernel has not been modified + assert: + that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. 
kernel-devel-matched + fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}" + - name: Install cuda packages ansible.builtin.dnf: name: "{{ cuda_packages }}" + when: cuda_package_version != 'none' register: cuda_package_install - name: Add cuda binaries to path lineinfile: path: /etc/profile.d/sh.local line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin' + when: cuda_package_version != 'none' - name: Enable NVIDIA Persistence Daemon systemd: @@ -60,3 +72,4 @@ - name: Wait for hosts to be reachable wait_for_connection: sleep: 15 + when: cuda_package_install.changed diff --git a/ansible/roles/cuda/tasks/runtime.yml b/ansible/roles/cuda/tasks/runtime.yml new file mode 100644 index 000000000..c16a48c6f --- /dev/null +++ b/ansible/roles/cuda/tasks/runtime.yml @@ -0,0 +1,5 @@ +- name: Ensure NVIDIA Persistence Daemon state + systemd: + name: nvidia-persistenced + enabled: true + state: "{{ cuda_persistenced_state }}" diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index db25176e2..be2f156a3 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241218-1011-5effb3fa", - "RL9": "openhpc-RL9-241218-1011-5effb3fa" + "RL8": "openhpc-RL8-241218-1705-09ac4268", + "RL9": "openhpc-RL9-241218-1705-09ac4268" } } From 722a0c1a0d41a7c8ebdd350dfe24d739526e2665 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Dec 2024 11:04:19 +0000 Subject: [PATCH 215/268] moved pulp subpaths into common structure --- ansible/filter_plugins/utils.py | 4 ++++ ansible/roles/dnf_repos/defaults/main.yml | 12 ++++------ ansible/roles/dnf_repos/tasks/set_repos.yml | 4 ++-- ansible/roles/pulp_site/defaults/main.yml | 23 ++++++++----------- .../inventory/group_vars/all/defaults.yml | 22 +++++++++++++----- 5 files changed, 37 insertions(+), 28 deletions(-) diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index f69d6f3f7..15ba14777 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -48,6 +48,9 @@ def to_ood_regex(items): r = ['(%s)' % v for v in r] return '|'.join(r) +def appliances_repo_to_subpath(repo_entry): +    return repo_entry['path'] + '/' + repo_entry['timestamp'] + class FilterModule(object): ''' Ansible core jinja2 filters ''' @@ -63,4 +66,5 @@ def filters(self): 'exists': exists, 'warn': self.warn, 'to_ood_regex': to_ood_regex, + 'appliances_repo_to_subpath': appliances_repo_to_subpath } diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 4a0c9fd2a..afc773b2f 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -1,6 +1,4 @@ dnf_repos_pulp_content_url: "{{ appliances_pulp_url }}/pulp/content" -dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" -dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" dnf_repos_username: "{{ omit }}" dnf_repos_password: "{{ omit }}" @@ -8,16 +6,16 @@ dnf_repos_password: "{{ omit }}" dnf_repos_repolist: - file: rocky name: baseos - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/BaseOS/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.baseos[ansible_distribution_version] }}" + subpath: "{{
appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky name: appstream - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.appstream[ansible_distribution_version] }}" + subpath: "{{ appliances_pulp_repos.appstream[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky name: crb - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.crb[ansible_distribution_version] }}" + subpath: "{{ appliances_pulp_repos.crb[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky-extras name: extras - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.extras[ansible_distribution_version] }}" + subpath: "{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" -dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ appliances_repo_timestamps.epel[ansible_distribution_major_version] }}" +dnf_repos_epel_subpath: "{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}" dnf_repos_epel_description: "epel" diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index fe5e2c02c..aab5b85e4 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -4,7 +4,7 @@ ansible.builtin.yum_repository: file: "{{ item.file }}" name: "{{ item.name }}" - baseurl: "{{ item.base_url }}" + baseurl: "{{ dnf_repos_pulp_content_url }}/{{ item.subpath }}" description: "{{ item.name }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" @@ -21,6 +21,6 @@ file: epel description: "{{ dnf_repos_epel_description }}" gpgcheck: false - baseurl: "{{ dnf_repos_epel_baseurl }}" + baseurl: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_epel_subpath }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index d343d4998..fc23a4489 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -3,28 +3,25 @@ pulp_site_port: 8080 pulp_site_username: admin # shouldn't be changed pulp_site_password: "{{ vault_pulp_admin_password }}" pulp_site_upstream_content_url: https://ark.stackhpc.com/pulp/content -_pulp_site_rocky_prefix: "{{ pulp_site_target_distribution }}/{{ pulp_site_target_distribution_version }}" pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" pulp_site_validate_certs: false pulp_site_install_dir: '/home/rocky/pulp' pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" pulp_site_target_facts: "{{ hostvars[groups['builder'][0]]['ansible_facts'] }}" -pulp_site_target_arch: "{{ pulp_site_target_facts['architecture'] }}" -pulp_site_target_distribution: "{{ pulp_site_target_facts['distribution'] | lower }}" pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" pulp_site_rpm_info: -- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ 
appliances_repo_timestamps.baseos[pulp_site_target_distribution_version] }}" - subpath: "{{ _pulp_site_rocky_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.baseos[pulp_site_target_distribution_version] }}" -- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.appstream[pulp_site_target_distribution_version] }}" - subpath: "{{ _pulp_site_rocky_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.appstream[pulp_site_target_distribution_version] }}" -- name: "crb-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.crb[pulp_site_target_distribution_version] }}" - subpath: "{{ _pulp_site_rocky_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.crb[pulp_site_target_distribution_version] }}" -- name: "extras-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.extras[pulp_site_target_distribution_version] }}" - subpath: "{{ _pulp_site_rocky_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.extras[pulp_site_target_distribution_version] }}" -- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_repo_timestamps.epel[pulp_site_target_distribution_version_major] }}" - subpath: "epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ appliances_repo_timestamps.epel[pulp_site_target_distribution_version_major] }}" +- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.appstream[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.appstream[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "crb-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "extras-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 29724fb6f..96db68667 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -82,14 +82,24 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us ########################################################################################### -appliances_repo_timestamps: +appliances_pulp_repositories: baseos: - 
'9.4': 20241115T011711 + '9.4': + timestamp: 20241115T011711 + path: rocky/9.4/BaseOS/x86_64/os appstream: - '9.4': 20241112T003151 + '9.4': + timestamp: 20241112T003151 + path: rocky/9.4/AppStream/x86_64/os crb: - '9.4': 20241115T003133 + '9.4': + timestamp: 20241115T003133 + path: rocky/9.4/CRB/x86_64/os extras: - '9.4': 20241118T002802 + '9.4': + timestamp: 20241118T002802 + path: rocky/9.4/extras/x86_64/os epel: - '9': 20241213T010218 + '9': + timestamp: 20241213T010218 + path: epel/9/Everything/x86_64 \ No newline at end of file From d1f3c69e8defaa25908ddd657e8b4ffb3ef3639d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Dec 2024 11:30:09 +0000 Subject: [PATCH 216/268] typos --- ansible/filter_plugins/utils.py | 4 ++-- environments/common/inventory/group_vars/all/defaults.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index 15ba14777..9559d0fee 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -49,7 +49,7 @@ def to_ood_regex(items): return '|'.join(r) def appliances_repo_to_subpath(repo_entry): - return repo_entry.path+'/'+repo_entry.timestamp + return repo_entry['path']+'/'+repo_entry['timestamp'] class FilterModule(object): ''' Ansible core jinja2 filters ''' @@ -66,5 +66,5 @@ def filters(self): 'exists': exists, 'warn': self.warn, 'to_ood_regex': to_ood_regex, - 'appliance_repo_to_subpath': appliances_repo_to_subpath + 'appliances_repo_to_subpath': appliances_repo_to_subpath } diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 96db68667..7a7d5c7c0 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -82,7 +82,7 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us ########################################################################################### -appliances_pulp_repositories: +appliances_pulp_repos: baseos: '9.4': timestamp: 20241115T011711 From 357f7e25e7ccdefb8748096d25cb8f9315c63ce9 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 19 Dec 2024 11:41:06 +0000 Subject: [PATCH 217/268] docs suggestions Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- docs/operations.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/operations.md b/docs/operations.md index 50eef9053..edf8881f0 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -74,8 +74,10 @@ By default, the following utility packages are installed during build: - postfix - git - latest python version for system (3.6 for for Rocky 8.9 and 3.12 for Rocky 9.4) -Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_other_extra_package` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example: +Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_other_extra_packages` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. 
For example: + +```yaml # environments/foo-base/inventory/group_vars/all/defaults.yml: appliances_other_extra_package: - somepackage @@ -84,7 +86,7 @@ Additional packages from any DNF repositories which are enabled during build (wh The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note "user-facing" OpenHPC packages such as compilers, mpi libraries etc. include corresponding `lmod` modules. -If you wish to install packages during runtime, the `site.yml` playbook should be run `appliances_packages_during_configure` overriden to `true` and `cluster` should be added as a child of the `dnf_repos` group in order to temporarily re-enabled DNF repositories during runtime (WARNING: this should only be done if using an unauthenticated local Pulp server. If using StackHPC Ark directly, doing this WILL leak credentials to users). +If you wish to install packages during runtime, the `site.yml` playbook should be run with `appliances_packages_during_configure` overriden to `true` and `cluster` should be added as a child of the `dnf_repos` group in order to temporarily re-enable DNF repositories during runtime (WARNING: this should only be done if using an unauthenticated local Pulp server. If using StackHPC Ark directly, doing this WILL leak credentials to users). If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a plat should NOT exclude the builder group, so that the repositories are also added to built images. There are various Ansible modules which might be useful for this: - `ansible.builtin.yum_repository`: Add a repo from an URL providing a 'repodata' directory. 
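As a sketch of such a hook (the play below is illustrative only; the repo id and URL are assumptions, not appliance defaults, but note it deliberately does not exclude the `builder` group so that built images also get the repository):

```yaml
# environments/$SITE_ENV/hooks/pre.yml - hypothetical example
- hosts: cluster:builder
  become: yes
  tasks:
    - name: Add a site-local RPM repository
      ansible.builtin.yum_repository:
        name: site-tools                 # illustrative repo id
        description: Site-local tools repository
        baseurl: https://repo.example.org/el$releasever/x86_64/   # assumed URL
        gpgcheck: false
```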
From 17499e7fa386825114049da04d86221da6a34aa5 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Dec 2024 12:14:32 +0000 Subject: [PATCH 218/268] dnf_repos urls fully overridable again --- ansible/roles/dnf_repos/defaults/main.yml | 10 +++++----- ansible/roles/dnf_repos/tasks/set_repos.yml | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index afc773b2f..bff89b4a9 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -6,16 +6,16 @@ dnf_repos_password: "{{ omit }}" dnf_repos_repolist: - file: rocky name: baseos - subpath: "{{ appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky name: appstream - subpath: "{{ appliances_pulp_repos.appstream[ansible_distribution_version] | appliances_repo_to_subpath }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.appstream[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky name: crb - subpath: "{{ appliances_pulp_repos.crb[ansible_distribution_version] | appliances_repo_to_subpath }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.crb[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky-extras name: extras - subpath: "{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" -dnf_repos_epel_subpath: "{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}" +dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}" dnf_repos_epel_description: "epel" diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index aab5b85e4..fe5e2c02c 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -4,7 +4,7 @@ ansible.builtin.yum_repository: file: "{{ item.file }}" name: "{{ item.name }}" - baseurl: "{{ dnf_repos_pulp_content_url }}/{{ item.subpath }}" + baseurl: "{{ item.base_url }}" description: "{{ item.name }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" @@ -21,6 +21,6 @@ file: epel description: "{{ dnf_repos_epel_description }}" gpgcheck: false - baseurl: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_epel_subpath }}" + baseurl: "{{ dnf_repos_epel_baseurl }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" From 1e2e6d8a2c319d4d9c1e5b5d83f5bea64aed7b77 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 19 Dec 2024 13:28:34 +0000 Subject: [PATCH 219/268] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index be9dfe5cb..4f21c6b99 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 
@@ { "cluster_image": { - "RL8": "openhpc-RL8-241217-1146-d77be652", - "RL9": "openhpc-RL9-241217-1145-d77be652" + "RL8": "openhpc-RL8-241219-1232-7f84fed4", + "RL9": "openhpc-RL9-241219-1145-7f84fed4" } } From 6a8ecda6ce5c2074ba5d37cc955947626a73b7b1 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 19 Dec 2024 14:52:11 +0000 Subject: [PATCH 220/268] variable renames from review Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- environments/common/inventory/group_vars/all/defaults.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 9ce228493..417eb9eed 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -82,7 +82,7 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us ################## bootstrap: extra package installs ###################################### -appliances_default_extra_packages: +appliances_extra_packages_default: - htop - nano - screen @@ -95,11 +95,11 @@ appliances_default_extra_packages: - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}" -appliances_packages_during_configure: false +appliances_extra_packages_during_configure: false -appliances_other_extra_packages: [] +appliances_extra_packages_other: [] -appliances_extra_packages: "{{ appliances_default_extra_packages + appliances_other_extra_packages }}" +appliances_extra_packages: "{{ appliances_extra_packages_default + appliances_extra_packages_other }}" ###################### ark repo timestamps ################################################### From ef33eefbef836f1dcff8d3d91be9e164d7fc9a84 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Dec 2024 14:55:35 +0000 Subject: [PATCH 221/268] updated docs --- docs/operations.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/operations.md b/docs/operations.md index edf8881f0..4bebe1b3f 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -75,11 +75,11 @@ By default, the following utility packages are installed during build: - git - latest python version for system (3.6 for for Rocky 8.9 and 3.12 for Rocky 9.4) -Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_other_extra_packages` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example: +Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_extra_packages_other` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. 
For example: ```yaml # environments/foo-base/inventory/group_vars/all/defaults.yml: - appliances_other_extra_package: + appliances_extra_packages_other: - somepackage - anotherpackage From a3be506598ff8f818c0d3c053bc0f3c2a56f8dc0 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 20 Dec 2024 09:35:22 +0000 Subject: [PATCH 222/268] missed variable rename --- ansible/extras.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/extras.yml b/ansible/extras.yml index e5ea63408..fd1aa4c1c 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -54,4 +54,4 @@ - name: Install additional packages dnf: name: "{{ appliances_extra_packages }}" - when: appliances_mode != 'configure' or appliances_packages_during_configure + when: appliances_mode != 'configure' or appliances_extra_packages_during_configure From a0ba5f17b37caba2941db4e2a1c4550883797af0 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 20 Dec 2024 13:16:53 +0000 Subject: [PATCH 223/268] bump fatimage --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 4f21c6b99..8a9e3b66a 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241219-1232-7f84fed4", - "RL9": "openhpc-RL9-241219-1145-7f84fed4" + "RL8": "openhpc-RL8-241220-1131-a2dde143", + "RL9": "openhpc-RL9-241220-1131-a2dde143" } } From ada3dc9a428ad723817d9144bab675282356a619 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 2 Jan 2025 09:55:27 +0000 Subject: [PATCH 224/268] Review linting changes Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/filter_plugins/utils.py | 4 +++- ansible/roles/pulp_site/defaults/main.yml | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index 9559d0fee..1187b3c4b 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -49,7 +49,9 @@ def to_ood_regex(items): return '|'.join(r) def appliances_repo_to_subpath(repo_entry): - return repo_entry['path']+'/'+repo_entry['timestamp'] + """ Take an element from appliances_pulp_repos and convert it to a pulp path. 
This assumes that the remote and local pulp structures are the same + """ + return repo_entry['path'] + '/' + repo_entry['timestamp'] class FilterModule(object): ''' Ansible core jinja2 filters ''' diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index fc23a4489..081307b6a 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -7,7 +7,7 @@ pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" pulp_site_validate_certs: false pulp_site_install_dir: '/home/rocky/pulp' pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" -pulp_site_target_facts: "{{ hostvars[groups['builder'][0]]['ansible_facts'] }}" +pulp_site_target_facts: "{{ hostvars[groups['pulp'][0]]['ansible_facts'] }}" pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" From a7690151f6fefe44ffe24aecd9a856b2862d5a5a Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:59:23 +0000 Subject: [PATCH 225/268] Add note about login node reboot (#510) --- ansible/roles/openondemand/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/openondemand/README.md b/ansible/roles/openondemand/README.md index c6a4f3f9f..365265df0 100644 --- a/ansible/roles/openondemand/README.md +++ b/ansible/roles/openondemand/README.md @@ -17,7 +17,7 @@ This uses the [osc.ood](https://github.com/OSC/ood-ansible) Ansible role to prov ### General - `openondemand_clusters`: Required. Synonym for [osc.ood: clusters](https://github.com/OSC/ood-ansible#clusters) role variable. -- `openondemand_servername`: Required. Synonym for [osc.ood: servername](https://github.com/OSC/ood-ansible/blob/master/defaults/main/ood_portal.yml#L27) role variable. This defines what the Open Ondemand portal's Apache server uses for the [name-based virtual host](https://httpd.apache.org/docs/current/mod/core.html#servername). It should be the IP or hostname(+domain) part of the URL used to access Open Ondemand in the browser, e.g. `ondemand.mysite.org`. **NB:** If a domain or external IP is not available, specify the host's internal IP here and use ssh with a `DynamicForward` option and a SOCKS proxy to access this address. Using ssh's `LocalForward` option is not recommended as the server name will have to be `localhost` which causes some issues. +- `openondemand_servername`: Required. Synonym for [osc.ood: servername](https://github.com/OSC/ood-ansible/blob/master/defaults/main/ood_portal.yml#L27) role variable. This defines what the Open Ondemand portal's Apache server uses for the [name-based virtual host](https://httpd.apache.org/docs/current/mod/core.html#servername). It should be the IP or hostname(+domain) part of the URL used to access Open Ondemand in the browser, e.g. `ondemand.mysite.org`. **NB:** If a domain or external IP is not available, specify the host's internal IP here and use ssh with a `DynamicForward` option and a SOCKS proxy to access this address. Using ssh's `LocalForward` option is not recommended as the server name will have to be `localhost` which causes some issues. Changing this value on an already deployed cluster requires a reboot of the login node for OOD app state to be correctly refreshed. 
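For the `DynamicForward`/SOCKS approach mentioned above, a minimal sketch (the hostname and port here are illustrative) is:

```bash
# Open a SOCKS5 proxy on local port 1080 via a host which can reach the cluster,
# then set the browser's SOCKS5 proxy to localhost:1080 and browse to the value
# used for openondemand_servername:
ssh -D 1080 rocky@bastion.example.org
```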
### Authentication See the Open Ondemand [Authentication docs](https://osc.github.io/ood-documentation/latest/authentication/overview.html) for an overview of the authentication process. @@ -77,7 +77,7 @@ The Open Ondemand portal can proxy other servers. Variables: to proxy: - All "compute" nodes, e.g. for Open Ondemand interactive apps such as remote desktop and Jupyter notebook server. - The Grafana server - note a link to Grafana is always added to the Open Ondemand dashboard. - + The exact pattern depends on inventory hostnames / partitions / addresses. - `openondemand_node_proxy_directives`: Optional, default ''. Multiline string to insert into Apache directives definition for `node_uri` ([docs](https://osc.github.io/ood-documentation/master/reference/files/ood-portal-yml.html#configure-reverse-proxy)). From 36ca0d5c0f1e21f74444aa69df6270362cc81885 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 2 Jan 2025 12:12:34 +0000 Subject: [PATCH 226/268] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 8a9e3b66a..8061cf356 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241220-1131-a2dde143", - "RL9": "openhpc-RL9-241220-1131-a2dde143" + "RL8": "openhpc-RL8-250102-1135-8c98e169", + "RL9": "openhpc-RL9-250102-1135-8c98e169" } } From 5fddb85ac8cc579e97f252af1caac4f160b9c265 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 2 Jan 2025 12:13:44 +0000 Subject: [PATCH 227/268] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 8a9e3b66a..f9e568c3f 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241220-1131-a2dde143", - "RL9": "openhpc-RL9-241220-1131-a2dde143" + "RL8": "openhpc-RL8-250102-1138-77cfc703", + "RL9": "openhpc-RL9-250102-1139-77cfc703" } } From 9a07ff4ddd516f7217404c79e60f7840b200a99f Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 3 Jan 2025 13:23:58 +0000 Subject: [PATCH 228/268] Stop Lustre deleting rdma packages + add to extrabuild test (#502) * move cuda tasks to install * pin nvidia driver to working version and autodetect os/arch * make install of cuda packages optional * don't run cuda install tasks unless during build * move doca install before cuda * update cuda docs * add cuda to extra build test CI * add cuda runtime tasks * fix typo in extras playbook * bump extra build size to 30GB for cuda * pin both cuda package version * make cuda idempotent/restartable * allow using computed tasks_from for cuda role * fix showing image summary * removed faulty cleanup and added lustre to extrabuild test * bumped lustre to supported version --------- Co-authored-by: Steve Brasier --- .github/workflows/extra.yml | 6 +++-- ansible/fatimage.yml | 8 ------- ansible/roles/lustre/README.md | 2 +- ansible/roles/lustre/defaults/main.yml | 2 +- ansible/roles/lustre/tasks/install.yml | 31 
+++++--------------------- 5 files changed, 11 insertions(+), 38 deletions(-) diff --git a/.github/workflows/extra.yml b/.github/workflows/extra.yml index dece242ce..bf438c336 100644 --- a/.github/workflows/extra.yml +++ b/.github/workflows/extra.yml @@ -8,12 +8,14 @@ on: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - 'ansible/roles/cuda/**' + - 'ansible/roles/lustre/**' - '.github/workflows/extra.yml' pull_request: paths: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - 'ansible/roles/cuda/**' + - 'ansible/roles/lustre/**' - '.github/workflows/extra.yml' jobs: @@ -29,11 +31,11 @@ jobs: build: - image_name: openhpc-extra-RL8 source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json - inventory_groups: doca,cuda + inventory_groups: doca,cuda,lustre volume_size: 30 # needed for cuda - image_name: openhpc-extra-RL9 source_image_name_key: RL9 - inventory_groups: doca,cuda + inventory_groups: doca,cuda,lustre volume_size: 30 # needed for cuda env: ANSIBLE_FORCE_COLOR: True diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 9f1e9107c..9a8828a35 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -230,14 +230,6 @@ name: cloudalchemy.grafana tasks_from: install.yml -- hosts: doca - become: yes - gather_facts: yes - tasks: - - name: Install NVIDIA DOCA - import_role: - name: doca - - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/roles/lustre/README.md b/ansible/roles/lustre/README.md index c0a25e037..3ba0dad56 100644 --- a/ansible/roles/lustre/README.md +++ b/ansible/roles/lustre/README.md @@ -8,7 +8,7 @@ Install and configure a Lustre client. This builds RPM packages from source. ## Role Variables -- `lustre_version`: Optional str. Version of lustre to build, default `2.15.5` which is the first version with EL9 support +- `lustre_version`: Optional str. Version of lustre to build, default `2.15.6` which is the first version with EL9.5 support - `lustre_lnet_label`: Optional str. The "lnet label" part of the host's NID, e.g. `tcp0`. Only the `tcp` protocol type is currently supported. Default `tcp`. - `lustre_mgs_nid`: Required str. The NID(s) for the MGS, e.g. `192.168.227.11@tcp1` (separate mutiple MGS NIDs using `:`). - `lustre_mounts`: Required list. 
Define Lustre filesystems and mountpoints as a list of dicts with keys: diff --git a/ansible/roles/lustre/defaults/main.yml b/ansible/roles/lustre/defaults/main.yml index be008ad55..40389970c 100644 --- a/ansible/roles/lustre/defaults/main.yml +++ b/ansible/roles/lustre/defaults/main.yml @@ -1,4 +1,4 @@ -lustre_version: '2.15.5' # https://www.lustre.org/lustre-2-15-5-released/ +lustre_version: '2.15.6' # https://www.lustre.org/lustre-2-15-6-released/ lustre_lnet_label: tcp #lustre_mgs_nid: lustre_mounts: [] diff --git a/ansible/roles/lustre/tasks/install.yml b/ansible/roles/lustre/tasks/install.yml index e0af857cf..852b4652f 100644 --- a/ansible/roles/lustre/tasks/install.yml +++ b/ansible/roles/lustre/tasks/install.yml @@ -41,30 +41,9 @@ ansible.builtin.dnf: name: "{{ _lustre_find_rpms.files | map(attribute='path')}}" disable_gpg_check: yes - -- block: - - name: Remove lustre build prerequisites - # NB Only remove ones this role installed which weren't upgrades - ansible.builtin.dnf: - name: "{{ _new_pkgs }}" - state: absent - vars: - _installed_pkgs: | - {{ - _lustre_dnf_build_packages.results | - select('match', 'Installed:') | - map('regex_replace', '^Installed: (.+?)-[0-9].*$', '\1') - }} - _removed_pkgs: | - {{ - _lustre_dnf_build_packages.results | - select('match', 'Removed:') | - map('regex_replace', '^Removed: (.+?)-[0-9].*$', '\1') - }} - _new_pkgs: "{{ _installed_pkgs | difference(_removed_pkgs) }}" - - - name: Delete lustre build dir - file: - path: "{{ lustre_build_dir }}" - state: absent + +- name: Delete lustre build dir + file: + path: "{{ lustre_build_dir }}" + state: absent when: lustre_build_cleanup | bool From 8c979cdf8c2ee0d80cc21e55a32f2cacef15c746 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 3 Jan 2025 16:43:59 +0000 Subject: [PATCH 229/268] Fix python/ansible/pulp squeezer versions for RL8 deploy hosts (#516) * fix python/ansible/pulp squeezer versions for RL8 deploy hosts * fixed broken pulp dependency --------- Co-authored-by: wtripp180901 --- ansible/filter_plugins/utils.py | 3 +++ dev/setup-env.sh | 1 + requirements.txt | 4 ++-- requirements.yml | 4 ++++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index 1187b3c4b..508f794cc 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -41,6 +41,9 @@ def to_ood_regex(items): eg {{ [compute-001, compute-002, control] | to_regex }} -> '(compute-\d+)|(control)' """ + # NB: for python3.12+ the \d in this function & docstring + # need to be raw strings. 
See https://docs.python.org/3/reference/lexical_analysis.html + # There's a python bug which means re.sub() can't use '\d' in the replacement so # have to do replacement in two stages: r = [re.sub(r"\d+", 'XBACKSLASHX', v) for v in items] diff --git a/dev/setup-env.sh b/dev/setup-env.sh index bfa0758e6..6d701f2b7 100755 --- a/dev/setup-env.sh +++ b/dev/setup-env.sh @@ -17,6 +17,7 @@ PYTHON_VERSION="" if [[ "$OS" == "ubuntu" && "$MAJOR_VERSION" == "22" ]]; then PYTHON_VERSION="/usr/bin/python3.10" elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "8" ]]; then + # python3.9+ doesn't have selinux bindings PYTHON_VERSION="/usr/bin/python3.8" # use `sudo yum install python38` on Rocky Linux 8 to install this elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "9" ]]; then PYTHON_VERSION="/usr/bin/python3.9" diff --git a/requirements.txt b/requirements.txt index 7d81f3285..872ee9516 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ansible==8.0.0 +ansible==6.7.0 # cloudalchemy.prometheus uses ansible.builtin.include, removed in ansible-core==2.16 => ansible==9 openstacksdk python-openstackclient==6.6.1 # v7.0.0 has a bug re. rebuild python-manilaclient @@ -9,4 +9,4 @@ cookiecutter selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3 netaddr matplotlib -pulp-cli==0.29.2 +pulp-cli==0.23.2 diff --git a/requirements.yml b/requirements.yml index 2ede96950..7e71bb904 100644 --- a/requirements.yml +++ b/requirements.yml @@ -49,6 +49,10 @@ collections: - name: https://github.com/azimuth-cloud/ansible-collection-image-utils type: git version: 0.4.0 + # stackhpc.pulp has pulp.squeezer as dependency, any version, but latest + # requires newer ansible than can install + - name: pulp.squeezer + version: 0.0.15 - name: stackhpc.pulp version: 0.5.5 ... 
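As a quick sanity check after running `dev/setup-env.sh` on a Rocky Linux 8 deploy host, something like the following should reflect the pins above (patch-level versions may differ):

```bash
. venv/bin/activate
python3 --version                    # expect Python 3.8.x per setup-env.sh
ansible --version | head -n 1        # ansible 6.7.0 provides ansible-core 2.13.x
pip show pulp-cli | grep '^Version'  # expect 0.23.2
```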
From 510cfd01ea27e9356a0cd07b000ca0363f3dbfad Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 6 Jan 2025 10:00:25 +0000 Subject: [PATCH 230/268] extend cookiecutter terraform config for compute init script --- .../terraform/compute.tf | 2 ++ .../terraform/compute/nodes.tf | 13 ++++++++++--- .../terraform/compute/variables.tf | 6 ++++++ .../terraform/variables.tf | 7 +++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index 14c728a5a..d52c3c42c 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -18,4 +18,6 @@ module "compute" { k3s_token = var.k3s_token control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] + + compute_init_enable = var.compute_init_enable } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index 7a2a706a6..ac34a443c 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -45,9 +45,16 @@ resource "openstack_compute_instance_v2" "compute" { } metadata = { - environment_root = var.environment_root - k3s_token = var.k3s_token - control_address = var.control_address + environment_root = var.environment_root + k3s_token = var.k3s_token + control_address = var.control_address + enable_compute = contains(var.compute_init_enable, "compute") + enable_resolv_conf = contains(var.compute_init_enable, "resolv_conf") + enable_etc_hosts = contains(var.compute_init_enable, "etc_hosts") + enable_nfs = contains(var.compute_init_enable, "nfs") + enable_manila = contains(var.compute_init_enable, "manila") + enable_basic_users = contains(var.compute_init_enable, "basic_users") + enable_eessi = contains(var.compute_init_enable, "eessi") } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index 3655c9e65..a0e90c61b 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -76,3 +76,9 @@ variable "control_address" { description = "Name/address of control node" type = string } + +variable "compute_init_enable" { + type = list(string) + description = "Groups to activate for ansible-init compute rebuilds" + default = [] +} \ No newline at end of file diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 0f5eefa18..19027dd19 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -52,6 +52,7 @@ variable "compute" { image_id: Overrides variable cluster_image_id vnic_type: Overrides variable vnic_type vnic_profile: Overrides variable vnic_profile + compute_init_enable: Toggles ansible-init rebuild EOF } @@ -136,3 +137,9 @@ variable 
"k3s_token" { description = "K3s cluster authentication token, set automatically by Ansible" type = string } + +variable "compute_init_enable" { + type = list(string) + description = "Groups to activate for ansible-init compute rebuilds" + default = [] +} \ No newline at end of file From 4def5bab5f4ddc01bda873df3e6eff5330afef96 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 7 Jan 2025 09:21:00 +0000 Subject: [PATCH 231/268] Add Release Train OpenHPC repos (#515) * Added OpenHPC release train repos * bump images * Comment update Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * refactored and defaulted toggling ohpc repos * bump images * Updated comment --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/dnf_repos/defaults/main.yml | 12 +++++++++++- ansible/roles/pulp_site/defaults/main.yml | 4 ++++ .../terraform/cluster_image.auto.tfvars.json | 4 ++-- .../common/inventory/group_vars/all/defaults.yml | 15 +++++++++++++++ .../common/inventory/group_vars/all/openhpc.yml | 11 ++++++++++- 5 files changed, 42 insertions(+), 4 deletions(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 17114b49d..841631890 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -17,7 +17,7 @@ dnf_repos_filenames: dnf_repos_version_filenames: "{{ dnf_repos_filenames[ansible_distribution_major_version] }}" # epel installed separately -dnf_repos_repolist: +dnf_repos_default_repolist: - file: "{{ dnf_repos_version_filenames.baseos }}" name: baseos base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" @@ -31,5 +31,15 @@ dnf_repos_repolist: name: extras base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" +dnf_repos_openhpc_repolist: +- name: OpenHPC + file: OpenHPC + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_base[ansible_distribution_major_version] | appliances_repo_to_subpath }}" +- name: OpenHPC-updates + file: OpenHPC + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_updates[ansible_distribution_major_version] | appliances_repo_to_subpath }}" + +dnf_repos_repolist: "{{ dnf_repos_default_repolist + (dnf_repos_openhpc_repolist if (openhpc_install_type | default('ohpc')) == 'ohpc' else []) }}" + dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}" dnf_repos_epel_description: "epel" diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index 081307b6a..c0b191336 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -22,6 +22,10 @@ pulp_site_rpm_info: subpath: "{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" - name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major].timestamp }}" subpath: "{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" +- name: "ohpc-{{ pulp_site_target_distribution_version_major }}-{{ 
appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" +- name: "ohpc-updates-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index f9e568c3f..943a2dfbd 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250102-1138-77cfc703", - "RL9": "openhpc-RL9-250102-1139-77cfc703" + "RL8": "openhpc-RL8-250106-0916-f8603056", + "RL9": "openhpc-RL9-250106-0916-f8603056" } } diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index f32d14c60..e052eb709 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -151,3 +151,18 @@ appliances_pulp_repos: '8': timestamp: 20241216T235733 path: epel/8/Everything/x86_64 + openhpc_base: + '8': + path: OpenHPC/2/EL_8 + timestamp: 20241218T154614 + '9': + path: OpenHPC/3/EL_9 + timestamp: 20241218T154614 + openhpc_updates: + '8': + path: OpenHPC/2/updates/EL_8 + timestamp: 20241218T154614 + '9': + path: OpenHPC/3/updates/EL_9 + timestamp: 20241218T154614 + diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index e3d20b9c3..3b3879de9 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -39,6 +39,15 @@ openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}" +openhpc_install_type: ohpc # 'ohpc' or 'generic', see https://github.com/stackhpc/ansible-slurm-appliance/pull/326 + +# Empty repo lists from stackhpc.openhpc role defaults, as these repofiles are +# now generated by dnf_repos to allow injecting Ark creds: +ohpc_openhpc_repos: + "9": [] + "8": [] + +# overriding to ensure doesn't overwrite Ark epel repo ohpc_default_extra_repos: - "9": [] #overriding to ensure doesn't overwrite ark epel repo + "9": [] "8": [] From 8290a313885dea62421bee125c9460acecf9570a Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 7 Jan 2025 11:04:21 +0000 Subject: [PATCH 232/268] define default compute init flags --- environments/.stackhpc/terraform/compute_init.auto.tfvars | 7 +++++++ environments/.stackhpc/terraform/main.tf | 5 +++++ 2 files changed, 12 insertions(+) create mode 100644 environments/.stackhpc/terraform/compute_init.auto.tfvars diff --git a/environments/.stackhpc/terraform/compute_init.auto.tfvars b/environments/.stackhpc/terraform/compute_init.auto.tfvars new file mode 100644 index 000000000..032ae5adb --- /dev/null +++ b/environments/.stackhpc/terraform/compute_init.auto.tfvars @@ -0,0 +1,7 @@ 
+compute_init_enable = [ + "compute", + "etc_hosts", + "nfs", + "basic_users", + "eessi" +] diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 4284ec132..d54903cc4 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -58,6 +58,10 @@ variable "k3s_token" { type = string } +variable "compute_init_enable" { + type = list(string) +} + data "openstack_images_image_v2" "cluster" { name = var.cluster_image[var.os_version] most_recent = true @@ -74,6 +78,7 @@ module "cluster" { cluster_image_id = data.openstack_images_image_v2.cluster.id control_node_flavor = var.control_node_flavor k3s_token = var.k3s_token + compute_init_enable = var.compute_init_enable login_nodes = { login-0: var.other_node_flavor From 354ce1e810f4be3836919ac250e3fbb9e1634f9e Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 7 Jan 2025 12:47:42 +0000 Subject: [PATCH 233/268] add CI tests for compute node rebuilds --- .github/workflows/stackhpc.yml | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b08854adb..a5267e508 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -170,33 +170,22 @@ jobs: env: TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - # - name: Build environment-specific compute image - # id: packer_build - # run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # cd packer/ - # packer init - # PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - # ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs - - # - name: Test reimage of compute nodes to new environment-specific image (via slurm) - # run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]" - # ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down - # ansible-playbook -v ansible/ci/check_slurm.yml - - name: Test reimage of login and control nodes (via rebuild adhoc) run: | . venv/bin/activate . environments/.stackhpc/activate ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml - ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml + - name: Test reimage of compute nodes and compute-init (via rebuild adhoc) + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml + ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down + ansible-playbook -v ansible/ci/check_slurm.yml + - name: Check sacct state survived reimage run: | . 
venv/bin/activate From b903cdd0a3350d15eee65a8f4835477e21ed15ca Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 7 Jan 2025 14:59:10 +0000 Subject: [PATCH 234/268] document metadata toggle flags and CI workflow --- ansible/roles/compute_init/README.md | 31 +++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 77a127245..40d9b7326 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -42,10 +42,35 @@ The following roles/groups are currently fully functional: node and all compute nodes. - `openhpc`: all functionality -# Development/debugging +All of the above are defined in the skeleton cookiecutter config, and are +toggleable via a terraform compute_init autovar file. In the .stackhpc +environment, the compute init roles are set by default to: +- `enable_compute`: This encompasses the openhpc role functionality while being + a global toggle for the entire compute-init script. +- `etc_hosts` +- `nfs` +- `basic_users` +- `eessi` + +# CI workflow + +The compute node rebuild is tested in CI after the tests for rebuilding the +login and control nodes. The process follows + +1. Compute nodes are reimaged: + + ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml -To develop/debug this without actually having to build an image: +2. Ansible-init runs against newly reimaged compute nodes + +3. Run sinfo and check nodes have expected slurm state + + ansible-playbook -v ansible/ci/check_slurm.yml + +# Development/debugging +To develop/debug changes to the compute script without actually having to build +a new image: 1. Deploy a cluster using tofu and ansible/site.yml as normal. This will additionally configure the control node to export compute hostvars over NFS. @@ -103,7 +128,7 @@ as in step 3. 
available v the current approach: ``` - [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml + [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", From 50fc320be89db6e5884323830eb5c548ddbb8199 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 8 Jan 2025 10:13:38 +0000 Subject: [PATCH 235/268] Update ceph to use ark packages and move RL9 to ceph reef (#519) * Release train support for ceph repos * bump images * Update requirements.yml * bumped rocky 9 ceph repos to reef * updated rl9 ceph version number * bump images * reverted to upstream ceph versions * Update requirements.yml * comment --- ansible/roles/dnf_repos/defaults/main.yml | 3 +++ ansible/roles/pulp_site/defaults/main.yml | 2 ++ .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- environments/common/inventory/group_vars/all/defaults.yml | 8 +++++++- .../common/inventory/group_vars/all/os-manila-mount.yml | 3 +++ requirements.yml | 2 +- 6 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 environments/common/inventory/group_vars/all/os-manila-mount.yml diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 841631890..6d41046ec 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -30,6 +30,9 @@ dnf_repos_default_repolist: - file: "{{ dnf_repos_version_filenames.extras }}" name: extras base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" +- file: ceph + name: Ceph + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.ceph[ansible_distribution_major_version] | appliances_repo_to_subpath }}" dnf_repos_openhpc_repolist: - name: OpenHPC diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index c0b191336..c549dac53 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -26,6 +26,8 @@ pulp_site_rpm_info: subpath: "{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" - name: "ohpc-updates-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major].timestamp }}" subpath: "{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" +- name: "ceph-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 943a2dfbd..9c72b07ce 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250106-0916-f8603056", - 
"RL9": "openhpc-RL9-250106-0916-f8603056" + "RL8": "openhpc-RL8-250107-1534-b03caaf3", + "RL9": "openhpc-RL9-250107-1535-b03caaf3" } } diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index e052eb709..e26bc3018 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -165,4 +165,10 @@ appliances_pulp_repos: '9': path: OpenHPC/3/updates/EL_9 timestamp: 20241218T154614 - + ceph: + '8': + timestamp: 20231104T015751 + path: centos/8-stream/storage/x86_64/ceph-quincy + '9': + timestamp: 20240923T233036 + path: centos/9-stream/storage/x86_64/ceph-reef diff --git a/environments/common/inventory/group_vars/all/os-manila-mount.yml b/environments/common/inventory/group_vars/all/os-manila-mount.yml new file mode 100644 index 000000000..6b25d62cb --- /dev/null +++ b/environments/common/inventory/group_vars/all/os-manila-mount.yml @@ -0,0 +1,3 @@ +# Empty repo lists from stackhpc.ansible-role-os-manila-mount role defaults, as these repofiles are +# now generated by dnf_repos to allow injecting Ark creds: +os_manila_mount_ceph_rpm_repos: [] diff --git a/requirements.yml b/requirements.yml index 7e71bb904..71adbc6e5 100644 --- a/requirements.yml +++ b/requirements.yml @@ -21,7 +21,7 @@ roles: version: v3.1.5 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount - version: v24.11.0 # Support ceph quincy for RL9 + version: v25.1.1 collections: - name: containers.podman From 781c2d474848309dbe42bb4ca83343b1aad3b621 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:45:03 +0000 Subject: [PATCH 236/268] Add more information re. configuring production sites (#508) * add lots of info to production docs * Production docs tweaks from review Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> * add prod docs comment re login FIPs --------- Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> --- docs/production.md | 150 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 145 insertions(+), 5 deletions(-) diff --git a/docs/production.md b/docs/production.md index 7219ee7fc..c1b139994 100644 --- a/docs/production.md +++ b/docs/production.md @@ -1,9 +1,149 @@ # Production Deployments -This page contains some brief notes about differences between the default/demo configuration, as described in the main [README.md](../README.md) and production-ready deployments. +This page contains some brief notes about differences between the default/demo +configuration (as described in the main [README.md](../README.md)) and +production-ready deployments. + +- Get it agreed up front what the cluster names will be. Changing this later + requires instance deletion/recreation. 
+
+- At least three environments should be created:
+  - `site`: site-specific base environment
+  - `production`: production environment
+  - `staging`: staging environment
+
+  A `dev` environment should also be created if considered required, or this
+  can be left until later.
+
+  These can all be produced using the cookiecutter instructions, but the
+  `production` and `staging` environments will need their
+  `environments/$ENV/ansible.cfg` file modifying so that they point to the
+  `site` environment:
+
+  ```ini
+  inventory = ../common/inventory,../site/inventory,inventory
+  ```
+
+- To avoid divergence of configuration all possible overrides for group/role
+vars should be placed in `environments/site/inventory/group_vars/all/*.yml`
+unless the value really is environment-specific (e.g. DNS names for
+`openondemand_servername`).
+
+- Where possible hooks should also be placed in `environments/site/hooks/`
+and referenced from the `site` and `production` environments, e.g.:
+
+  ```yaml
+  # environments/production/hooks/pre.yml:
+  - name: Import parent hook
+    import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml"
+  ```
+
+- OpenTofu configurations should be defined in the `site` environment and used
+  as a module from the other environments. This can be done with the
+  cookiecutter-generated configurations:
+  - Delete the *contents* of the cookiecutter-generated `terraform/` directories
+    from the `production` and `staging` environments.
+  - Create a `main.tf` in those directories which uses `site/terraform/` as a
+    [module](https://opentofu.org/docs/language/modules/), e.g.:
+
+    ```
+    ...
+    module "cluster" {
+      source = "../../site/terraform/"
+
+      cluster_name = "foo"
+      ...
+    }
+    ```
+
+    Note that:
+    - Environment-specific variables (`cluster_name`) should be hardcoded
+      into the module block.
+    - Environment-independent variables (e.g. maybe `cluster_net` if the
+      same is used for staging and production) should be set as *defaults*
+      in `environments/site/terraform/variables.tf`, and then don't need to
+      be passed in to the module.
+
+- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates
+  a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`.
+  To ensure staging environments are a good model for production this should
+  generally be moved into the `site` environment. It should be encrypted
+  using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html)
+  and then committed to the repository.
+
+- Ensure created instances have accurate/synchronised time. For VM instances
+  this is usually provided by the hypervisor, but if not (or for bare metal
+  instances) it may be necessary to configure or proxy `chronyd` via an
+  environment hook.
+
+- The cookiecutter-provided OpenTofu configurations define resources for home and
+  state volumes. The former may not be required if the cluster's `/home` is
+  provided from an external filesystem (or Manila). In any case, in at least
+  the production environment, and probably also in the staging environment,
+  the volumes should be manually created and the resources changed to [data
+  resources](https://opentofu.org/docs/language/data-sources/). This ensures that even if the cluster is deleted via tofu, the
+  volumes will persist.
+
+  For a development environment, having volumes under tofu control via volume
+  resources is usually appropriate as there may be many instantiations
+  of this environment.
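+
+  As an illustrative sketch of the data-resource conversion described above
+  (the volume name is a placeholder, and this assumes the OpenStack
+  provider's `openstack_blockstorage_volume_v3` data source):
+
+  ```
+  # look up a manually-created volume instead of managing it via tofu
+  data "openstack_blockstorage_volume_v3" "state" {
+    name = "mycluster-state"
+  }
+  ```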
+
+- Enable `etc_hosts` templating:
+
+  ```yaml
+  # environments/site/inventory/groups:
+  [etc_hosts:children]
+  cluster
+  ```
 
-- Create a site environment. Usually at least production, staging and possibly development environments are required. To avoid divergence of configuration these should all have an `inventory` path referencing a shared, site-specific base environment. Where possible hooks should also be placed in this site-specific environment.
-- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`. To ensure staging environments are a good model for production this should generally be moved into the site-specific environment. It can be be encrypted using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html) and then committed to the repository.
-- Ensure created instances have accurate/synchronised time. For VM instances this is usually provided by the hypervisor, but if not (or for bare metal instances) it may be necessary to configure or proxy `chronyd` via an environment hook.
-- Remove production volumes from OpenTofu control. In the default OpenTofu configuration, deleting the resources also deletes the volumes used for persistent state and home directories. This is usually undesirable for production, so these resources should be removed from the OpenTofu configurations and manually deployed once. However note that for development environments leaving them under OpenTofu control is usually best.
 - Configure Open OnDemand - see [specific documentation](openondemand.README.md).
+
+- Modify `environments/site/terraform/nodes.tf` to provide fixed IPs for at least
+  the control node, and (if not using FIPs) the login node(s):
+
+  ```
+  resource "openstack_networking_port_v2" "control" {
+    ...
+    fixed_ip {
+      subnet_id = data.openstack_networking_subnet_v2.cluster_subnet.id
+      ip_address = var.control_ip_address
+    }
+  }
+  ```
+
+  Note the variable `control_ip_address` is new.
+
+  Using fixed IPs will require either using admin credentials or policy changes.
+
+- If floating IPs are required for login nodes, modify the OpenTofu configurations
+  appropriately.
+
+- Enable persisting login node hostkeys so users do not get annoying ssh warning
+  messages on reimage:
+
+  ```yaml
+  # environments/site/inventory/groups:
+  [persist_hostkeys:children]
+  login
+  ```
+  And configure NFS to include exporting the state directory to these hosts:
+
+  ```yaml
+  # environments/common/inventory/group_vars/all/nfs.yml:
+  nfs_configurations:
+    # ... potentially, /home definition from common environment
+    - comment: Export state directory to login nodes
+      nfs_enable:
+        server: "{{ inventory_hostname in groups['control'] }}"
+        clients: "{{ inventory_hostname in groups['login'] }}"
+      nfs_server: "{{ nfs_server_default }}"
+      nfs_export: "/var/lib/state"
+      nfs_client_mnt_point: "/var/lib/state"
+  ```
+  See [issue 506](https://github.com/stackhpc/ansible-slurm-appliance/issues/506).
+
+- Consider whether mapping of baremetal nodes to ironic nodes is required. See
+  [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485).
+
+- Note [PR 473](https://github.com/stackhpc/ansible-slurm-appliance/pull/473)
+  may help identify any site-specific configuration.
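+
+- As a sketch, the new `control_ip_address` variable referenced in the
+  fixed-IP example above could be declared as follows (the type and
+  description shown are illustrative assumptions):
+
+  ```
+  variable "control_ip_address" {
+    type        = string
+    description = "Fixed IP address for the control node port"
+  }
+  ```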
From a1e5bd7173f60735cf70270b16b2f169e81692f4 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Wed, 8 Jan 2025 14:40:52 +0000
Subject: [PATCH 237/268] Reworked persist_hostkeys role to use common set of
 persistent keys from state directory

---
 ansible/roles/persist_hostkeys/README.md      |  7 +--
 .../roles/persist_hostkeys/defaults/main.yml  |  2 +
 ansible/roles/persist_hostkeys/tasks/main.yml | 63 +++++++++++--------
 environments/common/layouts/everything        |  6 +-
 4 files changed, 46 insertions(+), 32 deletions(-)
 create mode 100644 ansible/roles/persist_hostkeys/defaults/main.yml

diff --git a/ansible/roles/persist_hostkeys/README.md b/ansible/roles/persist_hostkeys/README.md
index 2d823dc36..6201a104b 100644
--- a/ansible/roles/persist_hostkeys/README.md
+++ b/ansible/roles/persist_hostkeys/README.md
@@ -1,8 +1,5 @@
 # persist_hostkeys
 
-Save hostkeys to persistent storage and restore them after a rebuild/reimage.
+Idempotently generates a persistent set of hostkeys and restores them after a rebuild/reimage.
 
-Add hosts to the `persist_hostkeys` group to enable.
-
-This role has no variables but hosts in this group must have `appliances_state_dir`
-defined as a directory they can write to on persistent storage.
+Add hosts to the `persist_hostkeys` group to enable. All hosts in the group will share the same set of hostkeys.
diff --git a/ansible/roles/persist_hostkeys/defaults/main.yml b/ansible/roles/persist_hostkeys/defaults/main.yml
new file mode 100644
index 000000000..3c0000466
--- /dev/null
+++ b/ansible/roles/persist_hostkeys/defaults/main.yml
@@ -0,0 +1,2 @@
+persist_hostkeys_state_server: "{{ groups['control'] | first }}"
+persist_hostkeys_state_dir: "{{ hostvars[persist_hostkeys_state_server]['appliances_state_dir'] }}/hostkeys"
diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml
index 47493220d..8bb2d6306 100644
--- a/ansible/roles/persist_hostkeys/tasks/main.yml
+++ b/ansible/roles/persist_hostkeys/tasks/main.yml
@@ -1,33 +1,46 @@
 ---
-- name: Ensure hostkeys directory exists on persistent storage
-  file:
-    path: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}"
-    state: directory
-    owner: root
-    group: root
-    mode: 0600
+- name: Generate persistent hostkeys in state directory
+  delegate_to: "{{ persist_hostkeys_state_server }}"
+  block:
+    - name: Ensure hostkeys directory exists on persistent storage
+      file:
+        path: "{{ persist_hostkeys_state_dir }}"
+        state: directory
+        owner: root
+        group: root
+        mode: 0600
 
-- name: Copy hostkeys from persistent storage
-  # won't fail if no keys are in persistent storage
-  copy:
-    src: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/"
-    dest: /etc/ssh/
-    remote_src: true
+    - name: Check for existing hostkeys
+      find:
+        paths: "{{ persist_hostkeys_state_dir }}/"
+      register: _files_found
+
+    - name: Generate hostkeys
+      when: _files_found.matched == 0
+      shell:
+        cmd: |
+          mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh
+          ssh-keygen -A -N \"\" -f {{ persist_hostkeys_state_dir }}
+          mv {{ persist_hostkeys_state_dir }}/etc/ssh/* {{ persist_hostkeys_state_dir }}
+          rm -rf {{ persist_hostkeys_state_dir }}/etc/ssh
+
+    - name: Get created key names
+      find:
+        path: "{{ persist_hostkeys_state_dir }}/"
+      register: _find_ssh_keys
 
-- name: Find hostkeys
-  find:
-    path: /etc/ssh/
-    patterns: ssh_host_*_key*
-  register: _find_ssh_keys
+    - name: Create in-memory copies of keys
+      ansible.builtin.slurp:
+        src: "{{ item.path }}"
+      loop: "{{ _find_ssh_keys.files }}"
+      register: _slurp_keys
 
-- name: 
Persist hostkeys +- name: Copy keys to hosts + no_log: true copy: - dest: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/" - src: "{{ item }}" - remote_src: true - mode: preserve - loop: "{{ _find_ssh_keys.files | map(attribute='path') }}" + content: "{{ item.content | b64decode }}" + dest: "/etc/ssh/{{ item.source | regex_search('[^/]+$') }}" + loop: "{{ _slurp_keys.results }}" - meta: reset_connection - diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 878bebbf3..0d3c57ad5 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -69,8 +69,10 @@ openhpc [manila] # Hosts to configure for manila fileshares -[persist_hostkeys] -# Hosts to persist hostkeys for across reimaging. NB: Requires appliances_state_dir on hosts. +[persist_hostkeys:children] +# Hosts to use common set of hostkeys which persist across reimaging. +login +openondemand [squid] # Hosts to run squid proxy From fa028f9acc986372e0dcd2b9f0d949fc0066c19b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 8 Jan 2025 14:44:23 +0000 Subject: [PATCH 238/268] removed unnescessary caas config --- environments/.caas/inventory/extra_groups | 3 --- environments/.caas/inventory/group_vars/all/nfs.yml | 11 +---------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/environments/.caas/inventory/extra_groups b/environments/.caas/inventory/extra_groups index d60ae7839..45a1dc7aa 100644 --- a/environments/.caas/inventory/extra_groups +++ b/environments/.caas/inventory/extra_groups @@ -14,6 +14,3 @@ compute [podman:children] zenith - -[persist_hostkeys:children] -openondemand diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml index 14fff6295..f42422601 100644 --- a/environments/.caas/inventory/group_vars/all/nfs.yml +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -1,14 +1,5 @@ nfs_server: "{{ nfs_server_default }}" -caas_nfs_ood_state: - - comment: Export /var/lib/state from Slurm control node to OOD - nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" - clients: "{{ inventory_hostname in groups['openondemand'] }}" - nfs_export: "{{ appliances_state_dir }}" - nfs_client_mnt_point: "{{ appliances_state_dir }}" - nfs_client_mnt_options: "x-systemd.required-by=zenith-ood.service,x-systemd.before=zenith-ood.service" - caas_nfs_home: - comment: Export /exports/home from Slurm control node as /home nfs_enable: @@ -17,4 +8,4 @@ caas_nfs_home: nfs_export: "/exports/home" # assumes skeleton TF is being used nfs_client_mnt_point: "/home" -nfs_configurations: "{{ caas_nfs_ood_state + (caas_nfs_home if not cluster_home_manila_share | bool else []) }}" +nfs_configurations: "{{ caas_nfs_home if not cluster_home_manila_share | bool else [] }}" From 001c459cf65f156763abe53e36588ee26121ef42 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 8 Jan 2025 14:51:57 +0000 Subject: [PATCH 239/268] updated docs --- docs/production.md | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/docs/production.md b/docs/production.md index c1b139994..e767c61b4 100644 --- a/docs/production.md +++ b/docs/production.md @@ -118,30 +118,6 @@ and referenced from the `site` and `production` environments, e.g.: - If floating IPs are required for login nodes, modify the OpenTofu configurations appropriately. 
-- Enable persisting login node hostkeys so users do not get annoying ssh warning
-  messages on reimage:
-
-  ```yaml
-  # environments/site/inventory/groups:
-  [persist_hostkeys:children]
-  login
-  ```
-  And configure NFS to include exporting the state directory to these hosts:
-
-  ```yaml
-  # environments/common/inventory/group_vars/all/nfs.yml:
-  nfs_configurations:
-    # ... potentially, /home definition from common environment
-    - comment: Export state directory to login nodes
-      nfs_enable:
-        server: "{{ inventory_hostname in groups['control'] }}"
-        clients: "{{ inventory_hostname in groups['login'] }}"
-      nfs_server: "{{ nfs_server_default }}"
-      nfs_export: "/var/lib/state"
-      nfs_client_mnt_point: "/var/lib/state"
-  ```
-  See [issue 506](https://github.com/stackhpc/ansible-slurm-appliance/issues/506).
-
 - Consider whether mapping of baremetal nodes to ironic nodes is required. See
   [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485).

From 2bea51cdb0ec0cb32471372e908238b82f581c16 Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Wed, 8 Jan 2025 16:03:13 +0000
Subject: [PATCH 240/268] review suggestions

---
 .github/workflows/stackhpc.yml           |   1 -
 ansible/roles/compute_init/README.md     |  26 +---
 docs/experimental/compute-init.md        | 111 ++----------------
 .../terraform/compute_init.auto.tfvars   |   7 --
 environments/.stackhpc/terraform/main.tf |   6 +-
 .../terraform/compute.tf                 |   2 +-
 .../terraform/compute/nodes.tf           |  20 ++--
 .../terraform/variables.tf               |   8 +-
 8 files changed, 21 insertions(+), 160 deletions(-)
 delete mode 100644 environments/.stackhpc/terraform/compute_init.auto.tfvars

diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index a5267e508..ea18a2274 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -183,7 +183,6 @@ jobs:
         . venv/bin/activate
         . environments/.stackhpc/activate
         ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
-        ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
         ansible-playbook -v ansible/ci/check_slurm.yml
 
     - name: Check sacct state survived reimage
diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md
index 40d9b7326..db18034aa 100644
--- a/ansible/roles/compute_init/README.md
+++ b/ansible/roles/compute_init/README.md
@@ -42,30 +42,8 @@ The following roles/groups are currently fully functional:
   node and all compute nodes.
 - `openhpc`: all functionality
 
-All of the above are defined in the skeleton cookiecutter config, and are
-toggleable via a terraform compute_init autovar file. In the .stackhpc
-environment, the compute init roles are set by default to:
-- `enable_compute`: This encompasses the openhpc role functionality while being
-  a global toggle for the entire compute-init script.
-- `etc_hosts`
-- `nfs`
-- `basic_users`
-- `eessi`
-
-# CI workflow
-
-The compute node rebuild is tested in CI after the tests for rebuilding the
-login and control nodes. The process follows
-
-1. Compute nodes are reimaged:
-
-   ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
-
-2. Ansible-init runs against newly reimaged compute nodes
-
-3. Run sinfo and check nodes have expected slurm state
-
-   ansible-playbook -v ansible/ci/check_slurm.yml
+The above may be enabled by setting the compute_init_enable property on the
+terraform compute variable.
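+
+For example (an illustrative sketch; the flavor name is a placeholder), in
+the OpenTofu `compute` variable:
+
+    compute = {
+      standard: {
+        nodes: ["compute-0", "compute-1"]
+        flavor: "general.v1.small"
+        compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"]
+      }
+    }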
# Development/debugging diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index dae840d95..c7c1d4d8c 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -2,112 +2,17 @@ See the role README.md -# Results/progress +# CI workflow -Without any metadata: +The compute node rebuild is tested in CI after the tests for rebuilding the +login and control nodes. The process follows - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: activating (start) since Fri 2024-12-13 20:41:16 UTC; 1min 45s ago - Main PID: 16089 (ansible-init) - Tasks: 8 (limit: 10912) - Memory: 99.5M - CPU: 11.687s - CGroup: /system.slice/ansible-init.service - ├─16089 /usr/lib/ansible-init/bin/python /usr/bin/ansible-init - ├─16273 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml - ├─16350 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml - ├─16361 /bin/sh -c "/usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py && sleep 0" - ├─16362 /usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py - ├─16363 /usr/bin/mount /mnt/cluster - └─16364 /sbin/mount.nfs 192.168.10.12:/exports/cluster /mnt/cluster -o ro,sync +1. Compute nodes are reimaged: - Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] - Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Report skipping initialization if not compute node] ********************** - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Ensure the mount directory exists] *************************************** - Dec 13 20:41:25 rl9-compute-0.rl9.invalid python3[16346]: ansible-file Invoked with path=/mnt/cluster state=directory owner=root group=root mode=u=rwX,go= recurse=False force=False follow=True modification_time_format=%Y%m%d%H%M.%S access> - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: changed: [127.0.0.1] - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Mount /mnt/cluster] ****************************************************** - Dec 13 20:41:26 rl9-compute-0.rl9.invalid python3[16362]: ansible-mount Invoked with path=/mnt/cluster src=192.168.10.12:/exports/cluster fstype=nfs opts=ro,sync state=mounted boot=True dump=0 passno=0 backup=False fstab=None - [root@rl9-compute-0 rocky]# systemctl status ansible-init + ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml -Added metadata via horizon: +2. Ansible-init runs against newly reimaged compute nodes - compute_groups ["compute"] +3. 
Run sinfo and check nodes have expected slurm state - -OK: - - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: active (exited) since Fri 2024-12-13 20:43:31 UTC; 33s ago - Process: 16089 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) - Main PID: 16089 (code=exited, status=0/SUCCESS) - CPU: 13.003s - - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] => { - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: "msg": "Skipping compute initialization as cannot mount exports/cluster share" - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: } - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: PLAY RECAP ********************************************************************* - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: 127.0.0.1 : ok=4 changed=1 unreachable=0 failed=0 skipped=1 rescued=0 ignored=1 - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] executing remote playbooks for stage - post - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] writing sentinel file /var/lib/ansible-init.done - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] ansible-init completed successfully - Dec 13 20:43:31 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. - -Now run site.yml, then restart ansible-init again: - - - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: active (exited) since Fri 2024-12-13 20:50:10 UTC; 11s ago - Process: 18921 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) - Main PID: 18921 (code=exited, status=0/SUCCESS) - CPU: 8.240s - - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [Report skipping initialization if cannot mount nfs] ********************** - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [meta] ******************************************************************** - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: PLAY RECAP ********************************************************************* - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: 127.0.0.1 : ok=3 changed=1 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] executing remote playbooks for stage - post - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] writing sentinel file /var/lib/ansible-init.done - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] ansible-init completed successfully - Dec 13 20:50:10 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. 
- [root@rl9-compute-0 rocky]# ls /mnt/cluster/host - hosts hostvars/ - [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- - rl9-compute-0/ rl9-compute-1/ - [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- - rl9-compute-0/ rl9-compute-1/ - [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-0/ - hostvars.yml - -This commit - shows that hostvars have loaded: - - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: active (exited) since Fri 2024-12-13 21:06:20 UTC; 5s ago - Process: 27585 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) - Main PID: 27585 (code=exited, status=0/SUCCESS) - CPU: 8.161s - - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: TASK [Demonstrate hostvars have loaded] **************************************** - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: ok: [127.0.0.1] => { - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: "prometheus_version": "2.27.0" - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: } - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: PLAY RECAP ********************************************************************* - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: 127.0.0.1 : ok=5 changed=0 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] executing remote playbooks for stage - post - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] writing sentinel file /var/lib/ansible-init.done - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully - Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. + ansible-playbook -v ansible/ci/check_slurm.yml \ No newline at end of file diff --git a/environments/.stackhpc/terraform/compute_init.auto.tfvars b/environments/.stackhpc/terraform/compute_init.auto.tfvars deleted file mode 100644 index 032ae5adb..000000000 --- a/environments/.stackhpc/terraform/compute_init.auto.tfvars +++ /dev/null @@ -1,7 +0,0 @@ -compute_init_enable = [ - "compute", - "etc_hosts", - "nfs", - "basic_users", - "eessi" -] diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index d54903cc4..872003db3 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -58,10 +58,6 @@ variable "k3s_token" { type = string } -variable "compute_init_enable" { - type = list(string) -} - data "openstack_images_image_v2" "cluster" { name = var.cluster_image[var.os_version] most_recent = true @@ -78,7 +74,6 @@ module "cluster" { cluster_image_id = data.openstack_images_image_v2.cluster.id control_node_flavor = var.control_node_flavor k3s_token = var.k3s_token - compute_init_enable = var.compute_init_enable login_nodes = { login-0: var.other_node_flavor @@ -87,6 +82,7 @@ module "cluster" { standard: { # NB: can't call this default! 
nodes: ["compute-0", "compute-1"] flavor: var.other_node_flavor + compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"] } # Example of how to add another partition: # extra: { diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index d52c3c42c..dcc692c1a 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -19,5 +19,5 @@ module "compute" { control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] - compute_init_enable = var.compute_init_enable + compute_init_enable = each.value.compute_init_enable } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index ac34a443c..d3a37bc5b 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -44,18 +44,14 @@ resource "openstack_compute_instance_v2" "compute" { access_network = true } - metadata = { - environment_root = var.environment_root - k3s_token = var.k3s_token - control_address = var.control_address - enable_compute = contains(var.compute_init_enable, "compute") - enable_resolv_conf = contains(var.compute_init_enable, "resolv_conf") - enable_etc_hosts = contains(var.compute_init_enable, "etc_hosts") - enable_nfs = contains(var.compute_init_enable, "nfs") - enable_manila = contains(var.compute_init_enable, "manila") - enable_basic_users = contains(var.compute_init_enable, "basic_users") - enable_eessi = contains(var.compute_init_enable, "eessi") - } + metadata = merge( + { + environment_root = var.environment_root + k3s_token = var.k3s_token + control_address = var.control_address + }, + {for e in var.compute_init_enable: e => true} + ) user_data = <<-EOF #cloud-config diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 19027dd19..b2e16c942 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -52,7 +52,7 @@ variable "compute" { image_id: Overrides variable cluster_image_id vnic_type: Overrides variable vnic_type vnic_profile: Overrides variable vnic_profile - compute_init_enable: Toggles ansible-init rebuild + compute_init_enable: Toggles compute-init rebuild (see compute-init role docs) EOF } @@ -136,10 +136,4 @@ variable "root_volume_size" { variable "k3s_token" { description = "K3s cluster authentication token, set automatically by Ansible" type = string -} - -variable "compute_init_enable" { - type = list(string) - description = "Groups to activate for ansible-init compute rebuilds" - default = [] } \ No newline at end of file From dc58a257499f4e6653664f78e2853034c15f3101 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:58:23 +0000 Subject: [PATCH 241/268] Change defaults so a cookiecutter environment is fully functional (#473) * cookiecutter environment now has working defaults * updated docs * refactored ood demo user 
into cookiecutter * updated docs * changed secret name * Doc changes Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * rename * replaced testuser with demo_user * selinux now defaults to disabled * bump images * updated readme * moved files and removed redundant ood config * environments now have grafana anonymous auth by default * docs update Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- .github/workflows/stackhpc.yml | 10 +++++----- README.md | 1 + ansible/roles/passwords/defaults/main.yml | 1 + ansible/roles/passwords/tasks/validate.yml | 2 +- docs/{openondemand.README.md => openondemand.md} | 12 +++++++----- docs/production.md | 4 ++++ .../.caas/inventory/group_vars/all/selinux.yml | 1 - .../inventory/group_vars/all/basic_users.yml | 6 +++--- .../.stackhpc/inventory/group_vars/all/freeipa.yml | 4 ++-- .../{grafana/overrides.yml => all/grafana.yml} | 0 .../{openhpc/overrides.yml => all/openhpc.yml} | 0 .../inventory/group_vars/all/openondemand.yml | 9 ++++++++- .../inventory/group_vars/openondemand/overrides.yml | 8 -------- .../inventory/group_vars/selinux/overrides.yml | 1 - .../common/inventory/group_vars/all/openondemand.yml | 7 ++++++- .../common/inventory/group_vars/all/selinux.yml | 2 +- environments/common/layouts/everything | 6 ++++-- .../inventory/group_vars/all/basic_users.yml | 4 ++++ .../inventory/group_vars/all/grafana.yml | 1 + .../terraform/variables.tf | 2 +- 20 files changed, 49 insertions(+), 32 deletions(-) rename docs/{openondemand.README.md => openondemand.md} (76%) delete mode 100644 environments/.caas/inventory/group_vars/all/selinux.yml rename environments/.stackhpc/inventory/group_vars/{grafana/overrides.yml => all/grafana.yml} (100%) rename environments/.stackhpc/inventory/group_vars/{openhpc/overrides.yml => all/openhpc.yml} (100%) delete mode 100644 environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml delete mode 100644 environments/.stackhpc/inventory/group_vars/selinux/overrides.yml create mode 100644 environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml create mode 100644 environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b08854adb..eaca3a3ae 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -99,9 +99,9 @@ jobs: . venv/bin/activate . 
environments/.stackhpc/activate ansible-playbook ansible/adhoc/generate-passwords.yml - echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml + echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml env: - TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} + DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - name: Provision nodes using fat image id: provision_servers @@ -163,12 +163,12 @@ jobs: --spider \ --server-response \ --no-check-certificate \ - --http-user=testuser \ - --http-password=${TESTUSER_PASSWORD} https://${openondemand_servername} \ + --http-user=demo_user \ + --http-password=${DEMO_USER_PASSWORD} https://${openondemand_servername} \ 2>&1) (echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1) env: - TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} + DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} # - name: Build environment-specific compute image # id: packer_build diff --git a/README.md b/README.md index f66441915..593837ccd 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,7 @@ To deploy this infrastructure, ensure the venv and the environment are [activate export OS_CLOUD=openstack cd environments/$ENV/terraform/ + tofu init tofu apply and follow the prompts. Note the OS_CLOUD environment variable assumes that OpenStack credentials are defined using a [clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) file in a default location with the default cloud name of `openstack`. diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index 2587e8499..929aac465 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -10,6 +10,7 @@ slurm_appliance_secrets: vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}" vault_k3s_token: "{{ vault_k3s_token | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}" vault_pulp_admin_password: "{{ vault_pulp_admin_password | default(lookup('password', '/dev/null', chars=['ascii_letters', 'digits'])) }}" + vault_demo_user_password: "{{ vault_demo_user_password | default(lookup('password', '/dev/null')) }}" secrets_openhpc_mungekey_default: content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}" diff --git a/ansible/roles/passwords/tasks/validate.yml b/ansible/roles/passwords/tasks/validate.yml index 9279ffdbf..b30b0696e 100644 --- a/ansible/roles/passwords/tasks/validate.yml +++ b/ansible/roles/passwords/tasks/validate.yml @@ -1,4 +1,4 @@ - name: Assert secrets created assert: - that: (hostvars[inventory_hostname].keys() | select('contains', 'vault_') | length) > 1 # 1 as may have vault_testuser_password defined in dev + that: (hostvars[inventory_hostname].keys() | select('contains', 'vault_') | length) > 1 # 1 as may have vault_demo_user_password defined in dev fail_msg: "No inventory variables 'vault_*' found: Has ansible/adhoc/generate-passwords.yml been run?" 
diff --git a/docs/openondemand.README.md b/docs/openondemand.md similarity index 76% rename from docs/openondemand.README.md rename to docs/openondemand.md index 5daba3408..3bd6c9e9f 100644 --- a/docs/openondemand.README.md +++ b/docs/openondemand.md @@ -30,11 +30,10 @@ The above functionality is configured by running the `ansible/portal.yml` playbo See the [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md) for more details on the variables described below. -At minimum the following must be defined: -- `openondemand_servername` - this must be defined for both `openondemand` and `grafana` hosts (when Grafana is enabled). It is suggested to place it groupvars for `all`. -- `openondemand_auth` and any corresponding options. -- `openondemand_desktop_partition` and `openondemand_jupyter_partition` if the corresponding inventory groups are defined. -- `openondemand_host_regex` if `openondemand_desktop` or `openondemand_jupyter` inventory groups are defined and/or proxying Grafana via Open Ondemand is required. +The following variables have been given default values to allow Open Ondemand to work in a newly created environment without additional configuration, but generally should be overridden in `environment/site/inventory/group_vars/all/` with site-specific values: +- `openondemand_servername` - this must be defined for both `openondemand` and `grafana` hosts (when Grafana is enabled). Default is `ansible_host` (i.e. the IP address) of the first host in the `openondemand` group. +- `openondemand_auth` and any corresponding options. Defaults to `basic_pam`. +- `openondemand_desktop_partition` and `openondemand_jupyter_partition` if the corresponding inventory groups are defined. Defaults to the first compute group defined in the `compute` Terraform variable in `environments/$ENV/terraform`. It is also recommended to set: - `openondemand_dashboard_support_url` @@ -45,3 +44,6 @@ If shared filesystems other than `$HOME` are available, add paths to `openondema The appliance automatically configures Open Ondemand to proxy Grafana and adds a link to it on the Open Ondemand dashboard. This means no external IP (or SSH proxying etc) is required to access Grafana (which by default is deployed on the control node). To allow users to authenticate to Grafana, the simplest option is to enable anonymous (View-only) login by setting `grafana_auth_anonymous` (see [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml)[^1]). [^1]: Note that if `openondemand_auth` is `basic_pam` and anonymous Grafana login is enabled, the appliance will (by default) configure Open Ondemand's Apache server to remove the Authorisation header from proxying of all `node/` addresses. This is done as otherwise Grafana tries to use this header to authenticate, which fails with the default configuration where only the admin Grafana user `grafana` is created. Note that the removal of this header in this configuration means it cannot be used to authenticate proxied interactive applications - however the appliance-deployed remote desktop and Jupyter Notebook server applications use other authentication methods. An alternative if using `basic_pam` is not to enable anonymous Grafana login and to create Grafana users matching the local users (e.g. in `environments//hooks/post.yml`). + +# Access +By default the appliance authenticates against OOD with basic auth through PAM. 
When creating a new environment, a new user with username `demo_user` will be created. Its password is found under `vault_openondemand_default_user` in the appliance secrets store in `environments/{ENV}/inventory/group_vars/all/secrets.yml`. Other users can be defined by overriding the `basic_users_users` variable in your environment (templated into `environments/{ENV}/inventory/group_vars/all/basic_users.yml` by default).
diff --git a/docs/production.md b/docs/production.md
index c1b139994..5190ecae6 100644
--- a/docs/production.md
+++ b/docs/production.md
@@ -98,6 +98,10 @@ and referenced from the `site` and `production` environments, e.g.:
 
 - Configure Open OnDemand - see [specific documentation](openondemand.README.md).
 
+- Remove the `demo_user` user from `environments/$ENV/inventory/group_vars/all/basic_users.yml`
+
+- Consider whether having (read-only) access to Grafana without login is OK. If not, remove `grafana_auth_anonymous` in `environments/$ENV/inventory/group_vars/all/grafana.yml`
+
 - Modify `environments/site/terraform/nodes.tf` to provide fixed IPs for at least
   the control node, and (if not using FIPs) the login node(s):
 
diff --git a/environments/.caas/inventory/group_vars/all/selinux.yml b/environments/.caas/inventory/group_vars/all/selinux.yml
deleted file mode 100644
index 1f1098126..000000000
--- a/environments/.caas/inventory/group_vars/all/selinux.yml
+++ /dev/null
@@ -1 +0,0 @@
-selinux_state: disabled
\ No newline at end of file
diff --git a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml
index ae416cf72..e2088ffd9 100644
--- a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml
+++ b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml
@@ -1,6 +1,6 @@
-test_user_password: "{{ lookup('env', 'TESTUSER_PASSWORD') | default(vault_testuser_password, true) }}" # CI uses env, debug can set vault_testuser_password
+test_demo_user_password: "{{ lookup('env', 'DEMO_USER_PASSWORD') | default(vault_demo_user_password, true) }}" # CI uses env, debug can set vault_demo_user_password
 
 basic_users_users:
-  - name: testuser # can't use rocky as $HOME isn't shared!
-    password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent
+  - name: demo_user # can't use rocky as $HOME isn't shared!
+    password: "{{ test_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent
     uid: 1005
diff --git a/environments/.stackhpc/inventory/group_vars/all/freeipa.yml b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml
index 4b3750650..9a979ab16 100644
--- a/environments/.stackhpc/inventory/group_vars/all/freeipa.yml
+++ b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml
@@ -2,8 +2,8 @@
 
 # NB: Users defined this way have expired passwords
 freeipa_users:
-  - name: testuser # can't use rocky as $HOME isn't shared!
-    password: "{{ test_user_password }}"
+  - name: demo_user # can't use rocky as $HOME isn't shared!
+ password: "{{ test_demo_user_password }}" givenname: test sn: test diff --git a/environments/.stackhpc/inventory/group_vars/grafana/overrides.yml b/environments/.stackhpc/inventory/group_vars/all/grafana.yml similarity index 100% rename from environments/.stackhpc/inventory/group_vars/grafana/overrides.yml rename to environments/.stackhpc/inventory/group_vars/all/grafana.yml diff --git a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml b/environments/.stackhpc/inventory/group_vars/all/openhpc.yml similarity index 100% rename from environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml rename to environments/.stackhpc/inventory/group_vars/all/openhpc.yml diff --git a/environments/.stackhpc/inventory/group_vars/all/openondemand.yml b/environments/.stackhpc/inventory/group_vars/all/openondemand.yml index 11d475664..72b6cf476 100644 --- a/environments/.stackhpc/inventory/group_vars/all/openondemand.yml +++ b/environments/.stackhpc/inventory/group_vars/all/openondemand.yml @@ -1 +1,8 @@ -openondemand_servername: "{{ hostvars[ groups['openondemand'] | first].ansible_host }}" # Use a SOCKS proxy to acccess +openondemand_auth: basic_pam +openondemand_jupyter_partition: standard +openondemand_desktop_partition: standard +#openondemand_dashboard_support_url: +#openondemand_dashboard_docs_url: +#openondemand_filesapp_paths: +ondemand_package: ondemand-"{{ ondemand_package_version }}" +ondemand_package_version: '3.1.10' diff --git a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml b/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml deleted file mode 100644 index 72b6cf476..000000000 --- a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml +++ /dev/null @@ -1,8 +0,0 @@ -openondemand_auth: basic_pam -openondemand_jupyter_partition: standard -openondemand_desktop_partition: standard -#openondemand_dashboard_support_url: -#openondemand_dashboard_docs_url: -#openondemand_filesapp_paths: -ondemand_package: ondemand-"{{ ondemand_package_version }}" -ondemand_package_version: '3.1.10' diff --git a/environments/.stackhpc/inventory/group_vars/selinux/overrides.yml b/environments/.stackhpc/inventory/group_vars/selinux/overrides.yml deleted file mode 100644 index c3b28b913..000000000 --- a/environments/.stackhpc/inventory/group_vars/selinux/overrides.yml +++ /dev/null @@ -1 +0,0 @@ -selinux_state: disabled diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 5e85392ca..cce923fcc 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -5,7 +5,12 @@ # NB: Variables prefixed ood_ are all from https://github.com/OSC/ood-ansible -# openondemand_servername: '' # Must be defined when using openondemand +openondemand_servername: "{{ hostvars[groups['openondemand'].0].ansible_host if groups['openondemand'] else '' }}" + +openondemand_auth: basic_pam + +openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" +openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" # Regex defining hosts which openondemand can proxy; the default regex is compute nodes (for apps) and grafana host, # e.g. if the group `compute` has hosts `compute-{0,1,2,..}` this will be '(compute-\d+)|(control)'. 
diff --git a/environments/common/inventory/group_vars/all/selinux.yml b/environments/common/inventory/group_vars/all/selinux.yml index 25fbbd68f..fef5c3f58 100644 --- a/environments/common/inventory/group_vars/all/selinux.yml +++ b/environments/common/inventory/group_vars/all/selinux.yml @@ -1,4 +1,4 @@ --- -selinux_state: permissive +selinux_state: disabled selinux_policy: targeted diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 878bebbf3..ad9fa536a 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -36,8 +36,9 @@ login [block_devices:children] # Environment-specific so not defined here -[basic_users] +[basic_users:children] # Add `openhpc` group to add Slurm users via creation of users on each node. +openhpc [openondemand:children] # Host to run Open Ondemand server on - subset of login @@ -51,8 +52,9 @@ compute # Subset of compute to run a Jupyter Notebook servers on via Open Ondemand compute -[etc_hosts] +[etc_hosts:children] # Hosts to manage /etc/hosts e.g. if no internal DNS. See ansible/roles/etc_hosts/README.md +cluster [cuda] # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml new file mode 100644 index 000000000..dc993c3b8 --- /dev/null +++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml @@ -0,0 +1,4 @@ +basic_users_users: + - name: demo_user + password: "{% raw %}{{ vault_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}{% endraw %}" # idempotent + uid: 1005 diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml new file mode 100644 index 000000000..521616a1b --- /dev/null +++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml @@ -0,0 +1 @@ +grafana_auth_anonymous: true \ No newline at end of file diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 0f5eefa18..0a5dde56b 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -6,7 +6,7 @@ variable "cluster_name" { variable "cluster_domain_suffix" { type = string description = "Domain suffix for cluster" - default = "invalid" + default = "internal" } variable "cluster_net" { From 038ddf744a0d4dc9e79b3d84620bff97fbf71b21 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 09:01:22 +0000 Subject: [PATCH 242/268] add delay for ansible-init to finish --- .github/workflows/stackhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index ea18a2274..4d0fbb9bb 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -183,6 +183,7 @@ jobs: . venv/bin/activate . 
environments/.stackhpc/activate ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml + ansible all -m wait_for_connection -a 'delay=60 timeout=600' ansible-playbook -v ansible/ci/check_slurm.yml - name: Check sacct state survived reimage From 69d9cd859fec84a9a1d337fc4d2c5ec8c47e9c85 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:41:44 +0000 Subject: [PATCH 243/268] Fix epel not using Ark repos for RL8 (#526) * cookiecutter environment now has working defaults * updated docs * refactored ood demo user into cookiecutter * updated docs * changed secret name * Doc changes Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * rename * replaced testuser with demo_user * selinux now defaults to disabled * bump images * updated readme * moved files and removed redundant ood config * environments now have grafana anonymous auth by default * fixed ohpc not using ark repos * bump images --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- .../.stackhpc/inventory/group_vars/all/openhpc.yml | 10 ---------- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/environments/.stackhpc/inventory/group_vars/all/openhpc.yml b/environments/.stackhpc/inventory/group_vars/all/openhpc.yml index 858dfd9d3..5aac5f8ad 100644 --- a/environments/.stackhpc/inventory/group_vars/all/openhpc.yml +++ b/environments/.stackhpc/inventory/group_vars/all/openhpc.yml @@ -1,13 +1,3 @@ openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug - -ohpc_default_extra_repos: - "9": [] #overriding to ensure doesn't overwrite ark epel repo - "8": - - name: epel - file: epel - description: "Extra Packages for Enterprise Linux 8 - $basearch" - metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir" - gpgcheck: true - gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 9c72b07ce..47681ea8a 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250107-1534-b03caaf3", - "RL9": "openhpc-RL9-250107-1535-b03caaf3" + "RL8": "openhpc-RL8-250108-1703-e515b902", + "RL9": "openhpc-RL9-250108-1703-e515b902" } } From 6929272292f0ed7675dfbe961eedf15b3042569d Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:37:26 +0000 Subject: [PATCH 244/268] fix volume_backed_instances not working for compute nodes (#527) --- .../{{cookiecutter.environment}}/terraform/compute.tf | 8 +++++++- .../{{cookiecutter.environment}}/terraform/variables.tf | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index 14c728a5a..37c9aad10 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -3,16 +3,22 @@ module "compute" { for_each = var.compute + # must be set for group: nodes = each.value.nodes + flavor = each.value.flavor + cluster_name = var.cluster_name 
cluster_domain_suffix = var.cluster_domain_suffix cluster_net_id = data.openstack_networking_network_v2.cluster_net.id cluster_subnet_id = data.openstack_networking_subnet_v2.cluster_subnet.id - flavor = each.value.flavor + # can be set for group, defaults to top-level value: image_id = lookup(each.value, "image_id", var.cluster_image_id) vnic_type = lookup(each.value, "vnic_type", var.vnic_type) vnic_profile = lookup(each.value, "vnic_profile", var.vnic_profile) + volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) + root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + key_pair = var.key_pair environment_root = var.environment_root k3s_token = var.k3s_token diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 0a5dde56b..f2cfe1215 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -52,6 +52,8 @@ variable "compute" { image_id: Overrides variable cluster_image_id vnic_type: Overrides variable vnic_type vnic_profile: Overrides variable vnic_profile + volume_backed_instances: Overrides variable volume_backed_instances + root_volume_size: Overrides variable root_volume_size EOF } From 4652c34c4fb365cc8a1d2628cf6299957017efc8 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:48:09 +0000 Subject: [PATCH 245/268] typo --- ansible/roles/persist_hostkeys/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml index 8bb2d6306..716b09146 100644 --- a/ansible/roles/persist_hostkeys/tasks/main.yml +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -21,7 +21,7 @@ shell: cmd: | mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh - ssh-keygen -A -N \"\" -f {{ persist_hostkeys_state_dir }} + ssh-keygen -A -N '' -f {{ persist_hostkeys_state_dir }} mv {{ persist_hostkeys_state_dir }}/etc/ssh/* {{ persist_hostkeys_state_dir }} rm -rf {{ persist_hostkeys_state_dir }}/etc/ssh From f021167e7970e23ceef94a0215b0c773a92edda7 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:49:11 +0000 Subject: [PATCH 246/268] comment update Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/persist_hostkeys/tasks/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml index 716b09146..deff112f7 100644 --- a/ansible/roles/persist_hostkeys/tasks/main.yml +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -19,6 +19,7 @@ - name: Generate hostkeys when: _files_found.matched == 0 shell: + # ssh-keygen -A needs a directory with an /etc/ssh suffix to write hostkeys into cmd: | mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh ssh-keygen -A -N '' -f {{ persist_hostkeys_state_dir }} From 7057c5090cd918c99d6339ba60a71eede8e5a004 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 12:04:29 +0000 Subject: [PATCH 247/268] remove delay in compute node rebuild ci --- .github/workflows/stackhpc.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b2651af2f..d5bd313ca 
100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -183,7 +183,6 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml - ansible all -m wait_for_connection -a 'delay=60 timeout=600' ansible-playbook -v ansible/ci/check_slurm.yml - name: Check sacct state survived reimage From 3faa81382941174657f0b2a8c9cf35f135c9debc Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 14:25:50 +0000 Subject: [PATCH 248/268] fix compute init metadata flags --- ansible/roles/compute_init/files/compute-init.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index c7a9048b4..430e2cf65 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -6,13 +6,13 @@ vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" server_node_ip: "{{ os_metadata.meta.control_address }}" - enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}" - enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" - enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" - enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}" - enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}" - enable_basic_users: "{{ os_metadata.meta.enable_basic_users | default(false) | bool }}" - enable_eessi: "{{ os_metadata.meta.enable_eessi | default(false) | bool }}" + enable_compute: "{{ os_metadata.meta.compute | default(false) | bool }}" + enable_resolv_conf: "{{ os_metadata.meta.resolv_conf | default(false) | bool }}" + enable_etc_hosts: "{{ os_metadata.meta.etc_hosts | default(false) | bool }}" + enable_nfs: "{{ os_metadata.meta.nfs | default(false) | bool }}" + enable_manila: "{{ os_metadata.meta.manila | default(false) | bool }}" + enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}" + enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects resolv_conf_nameservers: [] From a7876a665d2ac6ea5450eec3c4a971b59889a2b7 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:27:08 +0000 Subject: [PATCH 249/268] Support additional volumes on compute nodes (#528) --- .../terraform/compute.tf | 1 + .../terraform/compute/nodes.tf | 30 +++++++++++++++++++ .../terraform/compute/variables.tf | 12 ++++++++ .../terraform/variables.tf | 5 ++++ 4 files changed, 48 insertions(+) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index 37c9aad10..ba9da127c 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -18,6 +18,7 @@ module "compute" { vnic_profile = lookup(each.value, "vnic_profile", var.vnic_profile) volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + extra_volumes = lookup(each.value, "extra_volumes", {}) key_pair = var.key_pair 
environment_root = var.environment_root
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf
index 7a2a706a6..ab869e28e 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf
@@ -1,3 +1,33 @@
+locals {
+  all_compute_volumes = {for v in setproduct(var.nodes, keys(var.extra_volumes)): "${v[0]}-${v[1]}" => {"node" = v[0], "volume" = v[1]}}
+  # e.g. with
+  # var.nodes = ["compute-0", "compute-1"]
+  # var.extra_volumes = {
+  #     "vol-a" = {size = 10},
+  #     "vol-b" = {size = 20}
+  # }
+  # this is a mapping with
+  # keys "compute-0-vol-a", "compute-0-vol-b" ...
+  # values which are a mapping e.g. {"node"="compute-0", "volume"="vol-a"}
+}
+
+resource "openstack_blockstorage_volume_v3" "compute" {
+
+    for_each = local.all_compute_volumes
+
+    name = "${var.cluster_name}-${each.key}"
+    description = "Compute node ${each.value.node} volume ${each.value.volume}"
+    size = var.extra_volumes[each.value.volume].size
+}
+
+resource "openstack_compute_volume_attach_v2" "compute" {
+
+    for_each = local.all_compute_volumes
+
+    instance_id = openstack_compute_instance_v2.compute["${each.value.node}"].id
+    volume_id = openstack_blockstorage_volume_v3.compute["${each.key}"].id
+}
+
 resource "openstack_networking_port_v2" "compute" {
   for_each = toset(var.nodes)
 
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf
index 3655c9e65..72bcf08fd 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf
@@ -64,6 +64,18 @@ variable "root_volume_size" {
     default = 40
 }
 
+variable "extra_volumes" {
+    description = <<-EOF
+        Mapping defining additional volumes to create and attach.
+        Keys are unique volume names.
+        Values are a mapping with:
+            size: Size of volume in GB
+        **NB**: The order in /dev is not guaranteed to match the mapping
+        EOF
+    type = any
+    default = {}
+}
+
 variable "security_group_ids" {
     type = list
 }
diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf
index f2cfe1215..4d8058208 100644
--- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf
+++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf
@@ -54,6 +54,11 @@ variable "compute" {
             vnic_profile: Overrides variable vnic_profile
             volume_backed_instances: Overrides variable volume_backed_instances
             root_volume_size: Overrides variable root_volume_size
+            extra_volumes: Mapping defining additional volumes to create and attach
+                           Keys are unique volume names.
+ Values are a mapping with: + size: Size of volume in GB + **NB**: The order in /dev is not guaranteed to match the mapping EOF } From bc16dbaa25da04d4a350413d343a18fbcb0f7e68 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 15:14:02 +0000 Subject: [PATCH 250/268] bump image --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 47681ea8a..cb4b4e32e 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250108-1703-e515b902", - "RL9": "openhpc-RL9-250108-1703-e515b902" + "RL8": "openhpc-RL8-250109-1431-3faa8138", + "RL9": "openhpc-RL9-250109-1431-3faa8138" } } From 2903223f34394c0f0d58190206d21f7c6ca08e18 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 9 Jan 2025 20:26:21 +0000 Subject: [PATCH 251/268] Support SSSD and optionally LDAP (#438) * support sssd configuration * make sssd-ldap optional * SSSD PR review tweaks * enable installing sssd in fatimage * install sssd and sssd-ldap packages in stackhpc fatimage * fix sssd being enabled in fatimage * bump CI image * simplify sssd-ldap package installation in fatimage * bump CI image * enable mkhomedir * add sshd role * auto enable ssh passwords if using ldap * actually run sshd role * make sshd config more flexible * add basic_users_override_sssd flag * port PR comment re. basic_users docs * add sssd-ldap package during stackhpc build only * bump CI image * add missing empty sssd group * remove deprecated & empty block_devices group * regularise common groups & everything groups template a bit * bumb CI image * sssd review comments Co-authored-by: Will Szumski --------- Co-authored-by: Will Szumski --- ansible/.gitignore | 4 +++ ansible/bootstrap.yml | 9 ++++++ ansible/fatimage.yml | 5 ++++ ansible/iam.yml | 9 ++++++ ansible/roles/basic_users/README.md | 1 + ansible/roles/basic_users/defaults/main.yml | 1 + ansible/roles/basic_users/tasks/main.yml | 17 ++++++++++- ansible/roles/sshd/README.md | 9 ++++++ ansible/roles/sshd/defaults/main.yml | 3 ++ ansible/roles/sshd/handlers/main.yml | 4 +++ ansible/roles/sshd/tasks/configure.yml | 15 ++++++++++ ansible/roles/sshd/tasks/main.yml | 1 + ansible/roles/sshd/templates/sshd.conf.j2 | 2 ++ ansible/roles/sssd/README.md | 18 ++++++++++++ ansible/roles/sssd/defaults/main.yml | 12 ++++++++ ansible/roles/sssd/handlers/main.yml | 5 ++++ ansible/roles/sssd/tasks/configure.yml | 28 +++++++++++++++++++ ansible/roles/sssd/tasks/install.yml | 13 +++++++++ ansible/roles/sssd/tasks/main.yml | 2 ++ environments/.stackhpc/inventory/extra_groups | 4 +++ .../inventory/group_vars/builder.yml | 2 ++ .../terraform/cluster_image.auto.tfvars.json | 4 +-- .../common/inventory/group_vars/all/sshd.yaml | 1 + .../inventory/group_vars/builder/defaults.yml | 2 ++ environments/common/inventory/groups | 18 ++++++++---- environments/common/layouts/everything | 9 +++++- 26 files changed, 188 insertions(+), 10 deletions(-) create mode 100644 ansible/roles/sshd/README.md create mode 100644 ansible/roles/sshd/defaults/main.yml create mode 100644 ansible/roles/sshd/handlers/main.yml create mode 100644 ansible/roles/sshd/tasks/configure.yml create mode 100644 ansible/roles/sshd/tasks/main.yml create mode 100644 
ansible/roles/sshd/templates/sshd.conf.j2 create mode 100644 ansible/roles/sssd/README.md create mode 100644 ansible/roles/sssd/defaults/main.yml create mode 100644 ansible/roles/sssd/handlers/main.yml create mode 100644 ansible/roles/sssd/tasks/configure.yml create mode 100644 ansible/roles/sssd/tasks/install.yml create mode 100644 ansible/roles/sssd/tasks/main.yml create mode 100644 environments/common/inventory/group_vars/all/sshd.yaml diff --git a/ansible/.gitignore b/ansible/.gitignore index a7197ff4c..1cabb8ad8 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -58,6 +58,10 @@ roles/* !roles/squid/** !roles/tuned/ !roles/tuned/** +!roles/sssd/ +!roles/sssd/** +!roles/sshd/ +!roles/sshd/** !roles/compute_init/ !roles/compute_init/** !roles/k3s/ diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index e2497d9c6..88d9274b3 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -110,6 +110,15 @@ policy: "{{ selinux_policy }}" register: sestatus +- hosts: sshd + tags: sshd + gather_facts: no + become: yes + tasks: + - name: Configure sshd + import_role: + name: sshd + - hosts: dnf_repos become: yes tasks: diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 9a8828a35..e5de38edf 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -54,6 +54,11 @@ name: freeipa tasks_from: client-install.yml when: "'freeipa_client' in group_names" + - name: Install sssd + import_role: + name: sssd + tasks_from: install.yml + when: "'sssd' in group_names" # - import_playbook: filesystems.yml: - name: Install nfs packages diff --git a/ansible/iam.yml b/ansible/iam.yml index 0286b9df3..857b8f840 100644 --- a/ansible/iam.yml +++ b/ansible/iam.yml @@ -40,3 +40,12 @@ import_role: name: freeipa tasks_from: users.yml + +- hosts: sssd + become: yes + gather_facts: no + tags: sssd + tasks: + - name: Configure sssd + import_role: + name: sssd diff --git a/ansible/roles/basic_users/README.md b/ansible/roles/basic_users/README.md index 4b75100ca..65fdd2c4c 100644 --- a/ansible/roles/basic_users/README.md +++ b/ansible/roles/basic_users/README.md @@ -24,6 +24,7 @@ Role Variables - An additional key `sudo` may optionally be specified giving a string (possibly multiline) defining sudo rules to be templated. - Any other keys may present for other purposes (i.e. not used by this role). - `basic_users_groups`: Optional, default empty list. A list of mappings defining information for each group. Mapping keys/values are passed through as parameters to [ansible.builtin.group](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/group_module.html) and default values are as given there. +- `basic_users_override_sssd`: Optional bool, default false. Whether to disable `sssd` when ensuring users/groups exist with this role. Permits creating local users/groups even if they clash with users provided via sssd (e.g. from LDAP). Ignored if host is not in group `sssd` as well. Note with this option active `sssd` will be stopped and restarted each time this role is run. 
Dependencies
------------
diff --git a/ansible/roles/basic_users/defaults/main.yml b/ansible/roles/basic_users/defaults/main.yml
index 9f34bdf4c..e6c6eafaa 100644
--- a/ansible/roles/basic_users/defaults/main.yml
+++ b/ansible/roles/basic_users/defaults/main.yml
@@ -7,3 +7,4 @@ basic_users_userdefaults:
   shell: "{{'/sbin/nologin' if 'control' in group_names else omit }}"
 basic_users_users: []
 basic_users_groups: []
+basic_users_override_sssd: false
diff --git a/ansible/roles/basic_users/tasks/main.yml b/ansible/roles/basic_users/tasks/main.yml
index c27d024b4..c6733fb89 100644
--- a/ansible/roles/basic_users/tasks/main.yml
+++ b/ansible/roles/basic_users/tasks/main.yml
@@ -7,7 +7,16 @@
     label: "{{ item.name }}"
   when:
     - "item.state | default('present') == 'absent'"
-
+
+- name: Stop sssd if required
+  systemd:
+    name: sssd
+    state: stopped
+  register: _stop_sssd
+  when:
+    - "'sssd' in group_names"
+    - basic_users_override_sssd | bool
+
 - name: Create groups
   ansible.builtin.group: "{{ item }}"
   loop: "{{ basic_users_groups }}"
@@ -19,6 +28,12 @@
     label: "{{ item.name }} [{{ item.state | default('present') }}]"
   register: basic_users_info
 
+- name: Restart sssd if required
+  systemd:
+    name: sssd
+    state: started
+  when: _stop_sssd is changed
+
 - name: Write supplied public key as authorized for SSH access
   authorized_key:
     user: "{{ item.name }}"
diff --git a/ansible/roles/sshd/README.md b/ansible/roles/sshd/README.md
new file mode 100644
index 000000000..0fac1d189
--- /dev/null
+++ b/ansible/roles/sshd/README.md
@@ -0,0 +1,9 @@
+# sshd
+
+Configure sshd.
+
+## Role variables
+
+- `sshd_password_authentication`: Optional bool. Whether to enable password login. Default `false`.
+- `sshd_conf_src`: Optional string. Path to sshd configuration template. Default is in-role template.
+- `sshd_conf_dest`: Optional string. Path to destination for sshd configuration file. Default is `/etc/ssh/sshd_config.d/10-ansible.conf` which overrides `50-{cloud-init,redhat}` files, if present.
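As a sketch of how this role might be driven from a site environment (the path and values below are illustrative, not part of this patch), a host placed in the `sshd` group could have password logins enabled via group vars:

```yaml
# environments/$SITE_ENV/inventory/group_vars/sshd/overrides.yml -- hypothetical
sshd_password_authentication: true  # rendered into 10-ansible.conf as "PasswordAuthentication yes"
# sshd_conf_src: "{{ appliances_environment_root }}/files/sshd.conf.j2"  # optionally use a site template
```

Because the template task below validates with `sshd -t -f %s`, a broken render fails the play rather than restarting sshd with an invalid config.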
diff --git a/ansible/roles/sshd/defaults/main.yml b/ansible/roles/sshd/defaults/main.yml
new file mode 100644
index 000000000..672305799
--- /dev/null
+++ b/ansible/roles/sshd/defaults/main.yml
@@ -0,0 +1,3 @@
+sshd_password_authentication: false
+sshd_conf_src: sshd.conf.j2
+sshd_conf_dest: /etc/ssh/sshd_config.d/10-ansible.conf
diff --git a/ansible/roles/sshd/handlers/main.yml b/ansible/roles/sshd/handlers/main.yml
new file mode 100644
index 000000000..e11aa7801
--- /dev/null
+++ b/ansible/roles/sshd/handlers/main.yml
@@ -0,0 +1,4 @@
+- name: Restart sshd
+  systemd:
+    name: sshd
+    state: restarted
diff --git a/ansible/roles/sshd/tasks/configure.yml b/ansible/roles/sshd/tasks/configure.yml
new file mode 100644
index 000000000..8aafb5c19
--- /dev/null
+++ b/ansible/roles/sshd/tasks/configure.yml
@@ -0,0 +1,15 @@
+- name: Template sshd configuration
+  # NB: If parameters are defined multiple times the first value wins;
+  # The default /etc/ssh/sshd_config has
+  #   Include /etc/ssh/sshd_config.d/*.conf
+  # early on, which is generally held to be the correct approach, so adding
+  # values to the end of that file won't work
+  template:
+    src: "{{ sshd_conf_src }}"
+    dest: "{{ sshd_conf_dest }}"
+    owner: root
+    group: root
+    mode: u=rw,go=
+    validate: sshd -t -f %s
+  notify:
+    - Restart sshd
diff --git a/ansible/roles/sshd/tasks/main.yml b/ansible/roles/sshd/tasks/main.yml
new file mode 100644
index 000000000..84f493457
--- /dev/null
+++ b/ansible/roles/sshd/tasks/main.yml
@@ -0,0 +1 @@
+- import_tasks: configure.yml
diff --git a/ansible/roles/sshd/templates/sshd.conf.j2 b/ansible/roles/sshd/templates/sshd.conf.j2
new file mode 100644
index 000000000..2746f0642
--- /dev/null
+++ b/ansible/roles/sshd/templates/sshd.conf.j2
@@ -0,0 +1,2 @@
+# {{ ansible_managed }}
+PasswordAuthentication {{ 'yes' if sshd_password_authentication | bool else 'no' }}
diff --git a/ansible/roles/sssd/README.md b/ansible/roles/sssd/README.md
new file mode 100644
index 000000000..da4e63f31
--- /dev/null
+++ b/ansible/roles/sssd/README.md
@@ -0,0 +1,18 @@
+# sssd
+
+Install and configure [sssd](https://sssd.io/docs/introduction.html).
+
+
+## Role variables
+
+The only required configuration is to create a [sssd.conf](https://www.mankier.com/5/sssd.conf) template at the location specified by `sssd_conf_src`.
+
+- `sssd_packages`: Optional list. Packages to install.
+- `sssd_install_ldap`: Optional bool. Whether to install packages enabling SSSD to authenticate against LDAP. Default `false`.
+- `sssd_ldap_packages`: Optional list. Packages to install when using `sssd_install_ldap`.
+- `sssd_enable_mkhomedir`: Optional bool. Whether to enable creation of home directories on login. Default `false`.
+- `sssd_mkhomedir_packages`: Optional list. Packages to install when using `sssd_enable_mkhomedir`.
+- `sssd_conf_src`: Optional string. Path to `sssd.conf` template. Default (which must be created) is `{{ appliances_environment_root }}/files/sssd.conf.j2`.
+- `sssd_conf_dest`: Optional string. Path to destination for `sssd.conf`. Default `/etc/sssd/sssd.conf`.
+- `sssd_started`: Optional bool. Whether `sssd` service should be started.
+- `sssd_enabled`: Optional bool. Whether `sssd` service should be enabled.
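The role above deliberately ships no `sssd.conf`. A minimal LDAP-backed template a site might create at the default `sssd_conf_src` location could look like the following sketch (the domain name, URI and search base are placeholders, not values from this patch):

```ini
# environments/$SITE_ENV/files/sssd.conf.j2 -- illustrative only
[sssd]
services = nss, pam
domains = site_ldap

[domain/site_ldap]
id_provider = ldap
auth_provider = ldap
ldap_uri = ldaps://ldap.example.com
ldap_search_base = dc=example,dc=com
cache_credentials = true
```

Note the configure task below installs this with mode `u=rw,go=`, matching sssd's requirement that its configuration is owned by root and not world-readable.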
diff --git a/ansible/roles/sssd/defaults/main.yml b/ansible/roles/sssd/defaults/main.yml
new file mode 100644
index 000000000..5bc58c990
--- /dev/null
+++ b/ansible/roles/sssd/defaults/main.yml
@@ -0,0 +1,12 @@
+sssd_packages:
+  - sssd-common
+sssd_install_ldap: false
+sssd_ldap_packages:
+  - sssd-ldap
+sssd_enable_mkhomedir: false
+sssd_mkhomedir_packages:
+  - oddjob-mkhomedir
+sssd_conf_src: "{{ appliances_environment_root }}/files/sssd.conf.j2"
+sssd_conf_dest: /etc/sssd/sssd.conf
+sssd_started: true
+sssd_enabled: true
diff --git a/ansible/roles/sssd/handlers/main.yml b/ansible/roles/sssd/handlers/main.yml
new file mode 100644
index 000000000..72c36e736
--- /dev/null
+++ b/ansible/roles/sssd/handlers/main.yml
@@ -0,0 +1,5 @@
+- name: Restart sssd
+  systemd:
+    name: sssd
+    state: restarted
+  when: sssd_started | bool
diff --git a/ansible/roles/sssd/tasks/configure.yml b/ansible/roles/sssd/tasks/configure.yml
new file mode 100644
index 000000000..ae636e9dd
--- /dev/null
+++ b/ansible/roles/sssd/tasks/configure.yml
@@ -0,0 +1,28 @@
+- name: Manage sssd.conf configuration
+  template:
+    src: "{{ sssd_conf_src }}"
+    dest: "{{ sssd_conf_dest }}"
+    owner: root
+    group: root
+    mode: u=rw,go=
+  notify: "Restart sssd"
+
+- meta: flush_handlers
+
+- name: Ensure sssd service state
+  systemd:
+    name: sssd
+    state: "{{ 'started' if sssd_started | bool else 'stopped' }}"
+    enabled: "{{ sssd_enabled | bool }}"
+
+- name: Get current authselect configuration
+  command: authselect current --raw
+  changed_when: false
+  failed_when:
+    - _authselect_current.rc != 0
+    - "'No existing configuration detected' not in _authselect_current.stdout"
+  register: _authselect_current # stdout: sssd with-mkhomedir
+
+- name: Configure nsswitch and PAM for SSSD
+  command: "authselect select sssd --force{% if sssd_enable_mkhomedir | bool %} with-mkhomedir{% endif %}"
+  when: "'sssd' not in _authselect_current.stdout"
diff --git a/ansible/roles/sssd/tasks/install.yml b/ansible/roles/sssd/tasks/install.yml
new file mode 100644
index 000000000..97aa82a2f
--- /dev/null
+++ b/ansible/roles/sssd/tasks/install.yml
@@ -0,0 +1,13 @@
+- name: Ensure sssd packages are installed
+  dnf:
+    name: "{{ sssd_packages + (sssd_ldap_packages if (sssd_install_ldap | bool) else []) }}"
+
+- name: Control if sssd should start on boot
+  # Needs to be done here to prevent starting after image build, is enabled by default
+  systemd:
+    name: sssd
+    enabled: "{{ sssd_enabled | bool }}"
+
+- name: Ensure mkhomedir packages are installed if required
+  dnf:
+    name: "{{ sssd_mkhomedir_packages }}"
diff --git a/ansible/roles/sssd/tasks/main.yml b/ansible/roles/sssd/tasks/main.yml
new file mode 100644
index 000000000..2b65e84b4
--- /dev/null
+++ b/ansible/roles/sssd/tasks/main.yml
@@ -0,0 +1,2 @@
+- import_tasks: install.yml
+- import_tasks: configure.yml
diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups
index 7c9a7c774..2531b803e 100644
--- a/environments/.stackhpc/inventory/extra_groups
+++ b/environments/.stackhpc/inventory/extra_groups
@@ -31,3 +31,7 @@ compute
 
 [squid:children]
 # Install squid into fat image
 builder
+
+[sssd:children]
+# Install sssd into fat image
+builder
diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml
index 5130e9d84..10b15adac 100644
--- a/environments/.stackhpc/inventory/group_vars/builder.yml
+++ b/environments/.stackhpc/inventory/group_vars/builder.yml
@@ -1,3 +1,5 @@
+#update_enable: false # Can
uncomment for speed debugging non-update related build issues +sssd_install_ldap: true # include sssd-ldap package in fatimage # update_enable: false # Can uncomment for speed debugging non-update related build issues # Uncomment below to use CI pulp servers diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 47681ea8a..3c1e19058 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250108-1703-e515b902", - "RL9": "openhpc-RL9-250108-1703-e515b902" + "RL8": "openhpc-RL8-250109-1444-ecea8219", + "RL9": "openhpc-RL9-250109-1444-ecea8219" } } diff --git a/environments/common/inventory/group_vars/all/sshd.yaml b/environments/common/inventory/group_vars/all/sshd.yaml new file mode 100644 index 000000000..5d4ed228f --- /dev/null +++ b/environments/common/inventory/group_vars/all/sshd.yaml @@ -0,0 +1 @@ +sshd_password_authentication: "{{ sssd_install_ldap | default(false) | bool }}" diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml index b43d9f03c..dae4edd9a 100644 --- a/environments/common/inventory/group_vars/builder/defaults.yml +++ b/environments/common/inventory/group_vars/builder/defaults.yml @@ -22,4 +22,6 @@ squid_cache_disk: 0 # just needs to be defined squid_cache_mem: 0 tuned_started: false tuned_enabled: false +sssd_started: false +sssd_enabled: false appliances_mode: build diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index cbc69d800..1d756ed66 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -13,9 +13,6 @@ login control compute -[eessi:children] -# Hosts on which EESSI stack should be configured - [hpctests:children] # Login group to use for running mpi-based testing. login @@ -79,9 +76,6 @@ cluster # Hosts to install firewalld on - see ansible/roles/filewalld fail2ban -[block_devices] -# Superset of hosts to configure filesystems on - see ansible/roles/block_devices/README.md - [basic_users] # Add `openhpc` group to add slurm users via creation of users on each node. @@ -118,12 +112,18 @@ freeipa_client [cuda] # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md +[eessi] +# Hosts on which EESSI stack should be configured + [resolv_conf] # Allows defining nameservers in /etc/resolv.conf - see ansible/roles/resolv_conf/README.md [proxy] # Hosts to configure http/s proxies - see ansible/roles/proxy/README.md +[manila] +# Hosts to configure for manila fileshares + [persist_hostkeys] # Hosts to persist hostkeys for across reimaging. NB: Requires appliances_state_dir on hosts. 
@@ -136,6 +136,12 @@ freeipa_client
 [ansible_init]
 # Hosts to run linux-ansible-init
 
+[sssd]
+# Hosts to configure sssd on
+
+[sshd]
+# Hosts where the OpenSSH server daemon should be configured
+
 [compute_init]
 # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on
 
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index d3b8fe040..4293cbca0 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -60,6 +60,7 @@ cluster
 # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md
 
 [eessi:children]
+# Hosts on which EESSI stack should be configured
 openhpc
 
 [resolv_conf]
@@ -83,9 +84,15 @@ openondemand
 # Hosts to run TuneD configuration
 
 [ansible_init:children]
-# Hosts to run ansible-init
+# Hosts to run linux-ansible-init
 cluster
 
+[sssd]
+# Hosts to configure sssd on
+
+[sshd]
+# Hosts where the OpenSSH server daemon should be configured
+
 [compute_init:children]
 # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on
 compute

From d2e18d0c5346509abc7546bdd70fc74a5ca87e5e Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Fri, 10 Jan 2025 09:16:10 +0000
Subject: [PATCH 252/268] bump image

---
 .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 3c1e19058..3c43e02eb 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-250109-1444-ecea8219",
-        "RL9": "openhpc-RL9-250109-1444-ecea8219"
+        "RL8": "openhpc-RL8-250109-2102-5193ba2f",
+        "RL9": "openhpc-RL9-250110-0016-5193ba2f"
     }
 }

From 3b09bd144361dcd4243cb89dbebf48fc68e4ba68 Mon Sep 17 00:00:00 2001
From: Pierre Riteau
Date: Fri, 10 Jan 2025 16:50:47 +0100
Subject: [PATCH 253/268] Fix various typos in documentation

---
 README.md                      | 14 +++++++-------
 docs/image-build.md            |  2 +-
 docs/monitoring-and-logging.md |  2 +-
 docs/openondemand.md           | 24 ++++++++++++------------
 docs/production.md             |  2 +-
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 593837ccd..dd6451011 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,10 @@ This repository contains playbooks and configuration to define a Slurm-based HPC
 - [Rocky Linux](https://rockylinux.org/)-based hosts.
 - [OpenTofu](https://opentofu.org/) configurations to define the cluster's infrastructure-as-code.
 - Packages for Slurm and MPI software stacks from [OpenHPC](https://openhpc.community/).
-- Shared fileystem(s) using NFS (with in-cluster or external servers) or [CephFS](https://docs.ceph.com/en/latest/cephfs/) via [Openstack Manila](https://wiki.openstack.org/wiki/Manila).
+- Shared filesystem(s) using NFS (with in-cluster or external servers) or [CephFS](https://docs.ceph.com/en/latest/cephfs/) via [OpenStack Manila](https://wiki.openstack.org/wiki/Manila).
 - Slurm accounting using a MySQL database.
 - Monitoring integrated with Slurm jobs using Prometheus, ElasticSearch and Grafana.
-- A web-based portal from [OpenOndemand](https://openondemand.org/).
+- A web-based portal from [Open OnDemand](https://openondemand.org/).
 - Production-ready default Slurm configurations for access and memory limits.
 - [Packer](https://developer.hashicorp.com/packer)-based image build configurations for node images.
@@ -25,7 +25,7 @@ The default configuration in this repository may be used to create a cluster to - Persistent state backed by an OpenStack volume. - NFS-based shared file system backed by another OpenStack volume. -Note that the OpenOndemand portal and its remote apps are not usable with this default configuration. +Note that the Open OnDemand portal and its remote apps are not usable with this default configuration. It requires an OpenStack cloud, and an Ansible "deploy host" with access to that cloud. @@ -33,7 +33,7 @@ Before starting ensure that: - You have root access on the deploy host. - You can create instances using a Rocky 9 GenericCloud image (or an image based on that). - **NB**: In general it is recommended to use the [latest released image](https://github.com/stackhpc/ansible-slurm-appliance/releases) which already contains the required packages. This is built and tested in StackHPC's CI. -- You have a SSH keypair defined in OpenStack, with the private key available on the deploy host. +- You have an SSH keypair defined in OpenStack, with the private key available on the deploy host. - Created instances have access to internet (note proxies can be setup through the appliance if necessary). - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance). @@ -66,7 +66,7 @@ Use the `cookiecutter` template to create a new environment to hold your configu and follow the prompts to complete the environment name and description. -**NB:** In subsequent sections this new environment is refered to as `$ENV`. +**NB:** In subsequent sections this new environment is referred to as `$ENV`. Activate the new environment: @@ -124,8 +124,8 @@ where the IP of the login node is given in `environments/$ENV/inventory/hosts.ym ## Overview of directory structure - `environments/`: See [docs/environments.md](docs/environments.md). -- `ansible/`: Contains the ansible playbooks to configure the infrastruture. -- `packer/`: Contains automation to use Packer to build machine images for an enviromment - see the README in this directory for further information. +- `ansible/`: Contains the ansible playbooks to configure the infrastructure. +- `packer/`: Contains automation to use Packer to build machine images for an environment - see the README in this directory for further information. - `dev/`: Contains development tools. For further information see the [docs](docs/) directory. diff --git a/docs/image-build.md b/docs/image-build.md index db51265a3..dc968ebfd 100644 --- a/docs/image-build.md +++ b/docs/image-build.md @@ -51,7 +51,7 @@ To build either a site-specific fat image from scratch, or to extend an existing openstack image unset --property signature_verified $SOURCE_IMAGE - then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). + then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [OpenStack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). 6. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash. 
diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index 3e3de38c0..db228d410 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -96,7 +96,7 @@ The `grafana` group controls the placement of the grafana service. Load balancin ### Access -If Open Ondemand is enabled then by default this is used to proxy Grafana, otherwise Grafana is accessed through the first . See `grafana_url` in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). The port used (variable `grafana_port`) defaults to `3000`. +If Open OnDemand is enabled then by default this is used to proxy Grafana, otherwise Grafana is accessed through the first . See `grafana_url` in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). The port used (variable `grafana_port`) defaults to `3000`. The default credentials for the admin user are: diff --git a/docs/openondemand.md b/docs/openondemand.md index 3bd6c9e9f..6b501d20b 100644 --- a/docs/openondemand.md +++ b/docs/openondemand.md @@ -1,28 +1,28 @@ # Overview -The appliance can deploy the Open Ondemand portal. This page describes how to enable this and the default appliance configuration/behaviour. Note that detailed configuration documentation is provided by: +The appliance can deploy the Open OnDemand portal. This page describes how to enable this and the default appliance configuration/behaviour. Note that detailed configuration documentation is provided by: - The README for the included `openondemand` role in this repo - [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md). - The README and default variables for the underlying "official" role which the above wraps - [Open OnDemand Ansible Role](https://github.com/OSC/ood-ansible) -- The documentation for Open Ondemand [itself](https://osc.github.io/ood-documentation/latest/index.html) +- The documentation for Open OnDemand [itself](https://osc.github.io/ood-documentation/latest/index.html) This appliance can deploy and configure: -- The Open Ondemand server itself (usually on a single login node). +- The Open OnDemand server itself (usually on a single login node). - User authentication using one of: - An external OIDC provider. - HTTP basic authenication and PAM. - Virtual desktops on compute nodes. - Jupyter nodebook servers on compute nodes. -- Proxying of Grafana (usually deployed on the control node) via the Open Ondemand portal. -- Links to additional filesystems and pages from the Open Ondemand Dashboard. -- A Prometheus exporter for the Open Ondemand server and related Grafana dashboard +- Proxying of Grafana (usually deployed on the control node) via the Open OnDemand portal. +- Links to additional filesystems and pages from the Open OnDemand Dashboard. +- A Prometheus exporter for the Open OnDemand server and related Grafana dashboard For examples of all of the above see the `smslabs-example` environment in this repo. -# Enabling Open Ondemand -To enable the Open Ondemand server, add single host to the `openondemand` inventory group. Generally, this should be a node in the `login` group, as Open Ondemand must be able to access Slurm commands. +# Enabling Open OnDemand +To enable the Open OnDemand server, add single host to the `openondemand` inventory group. Generally, this should be a node in the `login` group, as Open OnDemand must be able to access Slurm commands. 
-To enable compute nodes for virtual desktops or Jupyter notebook servers (accessed through the Open Ondemand portal), add nodes/groups to the `openondemand_desktop` and `openondemand_jupyter` inventory groups respectively. These may be all or a subset of the `compute` group. +To enable compute nodes for virtual desktops or Jupyter notebook servers (accessed through the Open OnDemand portal), add nodes/groups to the `openondemand_desktop` and `openondemand_jupyter` inventory groups respectively. These may be all or a subset of the `compute` group. The above functionality is configured by running the `ansible/portal.yml` playbook. This is automatically run as part of `ansible/site.yml`. @@ -30,7 +30,7 @@ The above functionality is configured by running the `ansible/portal.yml` playbo See the [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md) for more details on the variables described below. -The following variables have been given default values to allow Open Ondemand to work in a newly created environment without additional configuration, but generally should be overridden in `environment/site/inventory/group_vars/all/` with site-specific values: +The following variables have been given default values to allow Open OnDemand to work in a newly created environment without additional configuration, but generally should be overridden in `environment/site/inventory/group_vars/all/` with site-specific values: - `openondemand_servername` - this must be defined for both `openondemand` and `grafana` hosts (when Grafana is enabled). Default is `ansible_host` (i.e. the IP address) of the first host in the `openondemand` group. - `openondemand_auth` and any corresponding options. Defaults to `basic_pam`. - `openondemand_desktop_partition` and `openondemand_jupyter_partition` if the corresponding inventory groups are defined. Defaults to the first compute group defined in the `compute` Terraform variable in `environments/$ENV/terraform`. @@ -41,9 +41,9 @@ It is also recommended to set: If shared filesystems other than `$HOME` are available, add paths to `openondemand_filesapp_paths`. -The appliance automatically configures Open Ondemand to proxy Grafana and adds a link to it on the Open Ondemand dashboard. This means no external IP (or SSH proxying etc) is required to access Grafana (which by default is deployed on the control node). To allow users to authenticate to Grafana, the simplest option is to enable anonymous (View-only) login by setting `grafana_auth_anonymous` (see [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml)[^1]). +The appliance automatically configures Open OnDemand to proxy Grafana and adds a link to it on the Open OnDemand dashboard. This means no external IP (or SSH proxying etc) is required to access Grafana (which by default is deployed on the control node). To allow users to authenticate to Grafana, the simplest option is to enable anonymous (View-only) login by setting `grafana_auth_anonymous` (see [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml)[^1]). -[^1]: Note that if `openondemand_auth` is `basic_pam` and anonymous Grafana login is enabled, the appliance will (by default) configure Open Ondemand's Apache server to remove the Authorisation header from proxying of all `node/` addresses. 
This is done as otherwise Grafana tries to use this header to authenticate, which fails with the default configuration where only the admin Grafana user `grafana` is created. Note that the removal of this header in this configuration means it cannot be used to authenticate proxied interactive applications - however the appliance-deployed remote desktop and Jupyter Notebook server applications use other authentication methods. An alternative if using `basic_pam` is not to enable anonymous Grafana login and to create Grafana users matching the local users (e.g. in `environments//hooks/post.yml`). +[^1]: Note that if `openondemand_auth` is `basic_pam` and anonymous Grafana login is enabled, the appliance will (by default) configure Open OnDemand's Apache server to remove the Authorisation header from proxying of all `node/` addresses. This is done as otherwise Grafana tries to use this header to authenticate, which fails with the default configuration where only the admin Grafana user `grafana` is created. Note that the removal of this header in this configuration means it cannot be used to authenticate proxied interactive applications - however the appliance-deployed remote desktop and Jupyter Notebook server applications use other authentication methods. An alternative if using `basic_pam` is not to enable anonymous Grafana login and to create Grafana users matching the local users (e.g. in `environments//hooks/post.yml`). # Access By default the appliance authenticates against OOD with basic auth through PAM. When creating a new environment, a new user with username `demo_user` will be created. Its password is found under `vault_openondemand_default_user` in the appliance secrets store in `environments/{ENV}/inventory/group_vars/all/secrets.yml`. Other users can be defined by overriding the `basic_users_users` variable in your environment (templated into `environments/{ENV}/inventory/group_vars/all/basic_users.yml` by default). diff --git a/docs/production.md b/docs/production.md index 9f1b3f7bf..59b9f3775 100644 --- a/docs/production.md +++ b/docs/production.md @@ -96,7 +96,7 @@ and referenced from the `site` and `production` environments, e.g.: cluster ``` -- Configure Open OpenOndemand - see [specific documentation](openondemand.README.md). +- Configure Open OnDemand - see [specific documentation](openondemand.README.md). 
- Remove the `demo_user` user from `environments/$ENV/inventory/group_vars/all/basic_users.yml` From 438ed3ad6f40916e4256070846724f298f8c274d Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 10 Jan 2025 17:37:44 +0000 Subject: [PATCH 254/268] adjust check_slurm logic to deal with idle* state --- ansible/ci/check_slurm.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index d95c5bb5c..6507caf08 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -6,9 +6,9 @@ shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name register: sinfo changed_when: false - until: "'boot' not in sinfo.stdout_lines" - retries: 5 - delay: 10 + until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout) + retries: 10 + delay: 5 - name: Check nodes have expected slurm state assert: that: sinfo.stdout_lines == expected_sinfo From 37c1dcebd4489f88d6e60de10ea89cac1caec26b Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 13 Jan 2025 09:29:47 +0000 Subject: [PATCH 255/268] Fix nightly cleanup to deal with duplicate server names --- .github/workflows/nightly-cleanup.yml | 40 +++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index f76bd51a9..577a20775 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -63,15 +63,43 @@ jobs: echo "No clusters to delete." exit 0 fi - + for cluster_prefix in ${ci_clusters} do echo "Processing cluster: $cluster_prefix" - TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value) - if [[ $TAGS =~ "keep" ]]; then - echo "Skipping ${cluster_prefix} - control instance is tagged as keep" - else - ./dev/delete-cluster.py ${cluster_prefix} --force + + # Retrieve all servers matching the cluster prefix + SERVERS=$(openstack server list --name "${cluster_prefix}-.*" -f value -c ID -c Name) + + if [[ -z "$SERVERS" ]]; then + echo "No servers found for cluster ${cluster_prefix}" + continue + fi + + KEEP_FLAG=false + while IFS= read -r line; do + SERVER_ID=$(echo "$line" | awk '{print $1}') + SERVER_NAME=$(echo "$line" | awk '{print $2}') + + # Check tags only on control nodes + if [[ "$SERVER_NAME" == "${cluster_prefix}-control" ]]; then + TAGS=$(openstack server show $SERVER_ID --column tags --format value) + if [[ $TAGS =~ "keep" ]]; then + echo "Skipping cluster ${cluster_prefix} - control instance is tagged as keep" + KEEP_FLAG=true + break + fi + fi + done <<< "$SERVERS" + + # Delete all servers if control node is not tagged with keep + if [[ "$KEEP_FLAG" == false ]]; then + echo "Deleting all servers in cluster ${cluster_prefix}" + while IFS= read -r line; do + SERVER_ID=$(echo "$line" | awk '{print $1}') + echo "Deleting server $SERVER_ID" + openstack server delete $SERVER_ID || true + done <<< "$SERVERS" fi done shell: bash From 9b1bf122847f8345ac70e764fd81300829de73d0 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 13 Jan 2025 10:03:32 +0000 Subject: [PATCH 256/268] Update nightly-cleanup.yml --- .github/workflows/nightly-cleanup.yml | 40 ++++----------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml 
b/.github/workflows/nightly-cleanup.yml index 577a20775..8ea3ca74d 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -63,43 +63,15 @@ jobs: echo "No clusters to delete." exit 0 fi - + for cluster_prefix in ${ci_clusters} do echo "Processing cluster: $cluster_prefix" - - # Retrieve all servers matching the cluster prefix - SERVERS=$(openstack server list --name "${cluster_prefix}-.*" -f value -c ID -c Name) - - if [[ -z "$SERVERS" ]]; then - echo "No servers found for cluster ${cluster_prefix}" - continue - fi - - KEEP_FLAG=false - while IFS= read -r line; do - SERVER_ID=$(echo "$line" | awk '{print $1}') - SERVER_NAME=$(echo "$line" | awk '{print $2}') - - # Check tags only on control nodes - if [[ "$SERVER_NAME" == "${cluster_prefix}-control" ]]; then - TAGS=$(openstack server show $SERVER_ID --column tags --format value) - if [[ $TAGS =~ "keep" ]]; then - echo "Skipping cluster ${cluster_prefix} - control instance is tagged as keep" - KEEP_FLAG=true - break - fi - fi - done <<< "$SERVERS" - - # Delete all servers if control node is not tagged with keep - if [[ "$KEEP_FLAG" == false ]]; then - echo "Deleting all servers in cluster ${cluster_prefix}" - while IFS= read -r line; do - SERVER_ID=$(echo "$line" | awk '{print $1}') - echo "Deleting server $SERVER_ID" - openstack server delete $SERVER_ID || true - done <<< "$SERVERS" + TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value) + if [[ $TAGS =~ "keep" ]]; then + echo "Skipping ${cluster_prefix} - control instance is tagged as keep" + else + ./dev/delete-cluster.py ${cluster_prefix} --force fi done shell: bash From f1fd75e772d4c9122bb7fcb79279c7a26b2f5f5b Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 13 Jan 2025 10:21:42 +0000 Subject: [PATCH 257/268] Update nightly-cleanup.yml --- .github/workflows/nightly-cleanup.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index 8ea3ca74d..e15049f08 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -63,11 +63,20 @@ jobs: echo "No clusters to delete." exit 0 fi - + for cluster_prefix in ${ci_clusters} do echo "Processing cluster: $cluster_prefix" - TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value) + # Get all servers with the matching name for control node + CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json) + SERVER_COUNT=$(echo "$CONTROL_SERVERS" | jq length) + + if [[ $SERVER_COUNT -gt 1 ]]; then + echo "Warning: More than one server found for control node '${cluster_prefix}-control'." 
+ continue + fi + TAGS=$(echo "$CONTROL_SERVERS" | jq -r '.[0].Tags' ) + if [[ $TAGS =~ "keep" ]]; then echo "Skipping ${cluster_prefix} - control instance is tagged as keep" else From edbcebc09b1321c86bcea1a7f3f181ae70b7ac14 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 13 Jan 2025 13:39:06 +0000 Subject: [PATCH 258/268] Fix tag determination --- .github/workflows/nightly-cleanup.yml | 29 ++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index e15049f08..0f7156fad 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -70,17 +70,28 @@ jobs: # Get all servers with the matching name for control node CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json) SERVER_COUNT=$(echo "$CONTROL_SERVERS" | jq length) - + if [[ $SERVER_COUNT -gt 1 ]]; then - echo "Warning: More than one server found for control node '${cluster_prefix}-control'." - continue - fi - TAGS=$(echo "$CONTROL_SERVERS" | jq -r '.[0].Tags' ) - - if [[ $TAGS =~ "keep" ]]; then - echo "Skipping ${cluster_prefix} - control instance is tagged as keep" + echo "Multiple servers found for control node '${cluster_prefix}-control'. Checking tags for each..." + + for server in $(echo "$CONTROL_SERVERS" | jq -r '.[].ID'); do + # Get tags for each control node + TAGS=$(openstack server show "$server" --column tags --format value) + + if [[ $TAGS =~ "keep" ]]; then + echo "Skipping ${cluster_prefix} (server ${server}) - control instance is tagged as keep" + else + ./dev/delete-cluster.py ${cluster_prefix} --force + fi + done else - ./dev/delete-cluster.py ${cluster_prefix} --force + # If only one server, extract its tags and proceed + TAGS=$(echo "$CONTROL_SERVERS" | jq -r '.[0].Tags') + if [[ $TAGS =~ "keep" ]]; then + echo "Skipping ${cluster_prefix} - control instance is tagged as keep" + else + ./dev/delete-cluster.py ${cluster_prefix} --force + fi fi done shell: bash From fd5cbf992bfa9aca2e018e2051172a8348e2ec70 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 09:25:07 +0000 Subject: [PATCH 259/268] pause in workflow to debug slurm state --- .github/workflows/stackhpc.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index d5bd313ca..35630d4dc 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -185,6 +185,9 @@ jobs: ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml ansible-playbook -v ansible/ci/check_slurm.yml + - name: Pause for debugging + run: sleep 1800 + - name: Check sacct state survived reimage run: | . 
venv/bin/activate From f661c7fef6a741fe715d24815f7350b66d2e64ea Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 10:49:01 +0000 Subject: [PATCH 260/268] debug wait on failure --- .github/workflows/stackhpc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 35630d4dc..f8b0167ae 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -186,7 +186,8 @@ jobs: ansible-playbook -v ansible/ci/check_slurm.yml - name: Pause for debugging - run: sleep 1800 + if: failure() + run: sleep 3600 - name: Check sacct state survived reimage run: | From 329e054742e1d8ddbf7670dfbfddbdf735fa0470 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Tue, 14 Jan 2025 12:45:29 +0100 Subject: [PATCH 261/268] Fix environment creation steps We need to be at the root of the repository to run the next commands. --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dd6451011..1a0acd630 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,9 @@ and follow the prompts to complete the environment name and description. **NB:** In subsequent sections this new environment is referred to as `$ENV`. -Activate the new environment: +Go back to the root folder and activate the new environment: + cd .. . environments/$ENV/activate And generate secrets for it: From 81c316a594aa3bc602350d80ea31e4731c11d001 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 15:40:33 +0000 Subject: [PATCH 262/268] allow empty compute_init_enable list --- .github/workflows/stackhpc.yml | 4 ---- ansible/extras.yml | 1 - .../{{cookiecutter.environment}}/terraform/compute.tf | 4 ++-- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index f8b0167ae..d5bd313ca 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -185,10 +185,6 @@ jobs: ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml ansible-playbook -v ansible/ci/check_slurm.yml - - name: Pause for debugging - if: failure() - run: sleep 3600 - - name: Check sacct state survived reimage run: | . 
venv/bin/activate diff --git a/ansible/extras.yml b/ansible/extras.yml index 6bb141109..13a887dd9 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -44,7 +44,6 @@ # NB: has to be after eeesi and os-manila-mount tags: compute_init become: yes - name: Export hostvars tasks: - include_role: name: compute_init diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index 20fcd5d89..a90108924 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -20,11 +20,11 @@ module "compute" { root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) extra_volumes = lookup(each.value, "extra_volumes", {}) + compute_init_enable = lookup(each.value, "compute_init_enable", []) + key_pair = var.key_pair environment_root = var.environment_root k3s_token = var.k3s_token control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] - - compute_init_enable = each.value.compute_init_enable } From 9897f29b7220a6f7bce6b06a2da41c6b2d068158 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 17:04:16 +0000 Subject: [PATCH 263/268] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 3c43e02eb..37bd8c3d6 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250109-2102-5193ba2f", - "RL9": "openhpc-RL9-250110-0016-5193ba2f" + "RL8": "openhpc-RL8-250114-1627-bccc88b5", + "RL9": "openhpc-RL9-250114-1626-bccc88b5" } } From 257e685aa151098c5da007a032e99424b49938ff Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Wed, 15 Jan 2025 11:00:37 +0100 Subject: [PATCH 264/268] Document required security groups (#534) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1a0acd630..54b74d799 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ Before starting ensure that: - You have an SSH keypair defined in OpenStack, with the private key available on the deploy host. - Created instances have access to internet (note proxies can be setup through the appliance if necessary). - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance). +- Three security groups are present: ``default`` allowing intra-cluster communication, ``SSH`` allowing external access via SSH and ``HTTPS`` allowing access for Open OnDemand. 
### Setup deploy host From e8f1cbe6237cb8a692ef426e8abcc97e2bcc4393 Mon Sep 17 00:00:00 2001 From: Matt Anson Date: Wed, 15 Jan 2025 10:01:53 +0000 Subject: [PATCH 265/268] Bump Zenith client to latest from azimuth-cloud namespace (#437) --- ansible/roles/zenith_proxy/defaults/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/roles/zenith_proxy/defaults/main.yml b/ansible/roles/zenith_proxy/defaults/main.yml index dbb920c58..6b1a43aaa 100644 --- a/ansible/roles/zenith_proxy/defaults/main.yml +++ b/ansible/roles/zenith_proxy/defaults/main.yml @@ -15,12 +15,12 @@ zenith_proxy_pod_name: "{{ zenith_proxy_service_name }}" zenith_proxy_client_container_name: "{{ zenith_proxy_client_service_name }}" zenith_proxy_mitm_container_name: "{{ zenith_proxy_mitm_service_name }}" -zenith_proxy_image_tag: '0.1.0' +zenith_proxy_image_tag: '0.12.0' -zenith_proxy_client_image_repository: ghcr.io/stackhpc/zenith-client +zenith_proxy_client_image_repository: ghcr.io/azimuth-cloud/zenith-client zenith_proxy_client_image: "{{ zenith_proxy_client_image_repository }}:{{ zenith_proxy_image_tag }}" -zenith_proxy_mitm_image_repository: ghcr.io/stackhpc/zenith-proxy +zenith_proxy_mitm_image_repository: ghcr.io/azimuth-cloud/zenith-proxy zenith_proxy_mitm_image: "{{ zenith_proxy_mitm_image_repository }}:{{ zenith_proxy_image_tag }}" zenith_proxy_upstream_scheme: http From 1e5e105da8b35ef74150c87ee118063750cf69bb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 15 Jan 2025 10:34:27 +0000 Subject: [PATCH 266/268] fix yaml formatting in operations docs --- docs/operations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/operations.md b/docs/operations.md index 4bebe1b3f..595ddcbf5 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -82,7 +82,7 @@ Additional packages from any DNF repositories which are enabled during build (wh appliances_extra_packages_other: - somepackage - anotherpackage - +``` The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note "user-facing" OpenHPC packages such as compilers, mpi libraries etc. include corresponding `lmod` modules. 
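For the security-group prerequisite documented in the README change above, one possible way to satisfy it on a fresh project is sketched below (`default` normally exists already; the CIDRs are placeholders to be narrowed to site policy):

```bash
# illustrative only -- restrict --remote-ip to trusted ranges for production
openstack security group create SSH
openstack security group rule create SSH --protocol tcp --dst-port 22 --remote-ip 0.0.0.0/0
openstack security group create HTTPS
openstack security group rule create HTTPS --protocol tcp --dst-port 443 --remote-ip 0.0.0.0/0
```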
From 5f7e48fbdb5f0c8843cefb7ef35cc4c23baf9f9e Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 15 Jan 2025 12:56:38 +0000 Subject: [PATCH 267/268] Enable image builds to install extra packages by default (#536) * Enable image builds to install extra packages by default * simplify adding additional packages * Fix docs typo Co-authored-by: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> --------- Co-authored-by: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> --- ansible/extras.yml | 1 - docs/operations.md | 42 +++++++++++++------ .../inventory/group_vars/all/defaults.yml | 3 -- environments/common/inventory/groups | 1 + environments/common/layouts/everything | 2 +- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/ansible/extras.yml b/ansible/extras.yml index 13a887dd9..72c76b3b1 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -65,4 +65,3 @@ - name: Install additional packages dnf: name: "{{ appliances_extra_packages }}" - when: appliances_mode != 'configure' or appliances_extra_packages_during_configure diff --git a/docs/operations.md b/docs/operations.md index 595ddcbf5..7a0a5b919 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -63,7 +63,7 @@ This is a usually a two-step process: Deploying the additional nodes and applying these changes requires rerunning both Terraform and the Ansible site.yml playbook - follow [Deploying a Cluster](#Deploying-a-Cluster). # Adding Additional Packages -By default, the following utility packages are installed during build: +By default, the following utility packages are installed during the StackHPC image build: - htop - nano - screen @@ -75,18 +75,34 @@ By default, the following utility packages are installed during build: - git - latest python version for system (3.6 for for Rocky 8.9 and 3.12 for Rocky 9.4) -Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_extra_packages_other` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example: - -```yaml - # environments/foo-base/inventory/group_vars/all/defaults.yml: - appliances_extra_packages_other: - - somepackage - - anotherpackage -``` - -The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note "user-facing" OpenHPC packages such as compilers, mpi libraries etc. include corresponding `lmod` modules. - -If you wish to install packages during runtime, the `site.yml` playbook should be run with `appliances_packages_during_configure` overriden to `true` and `cluster` should be added as a child of the `dnf_repos` group in order to temporarily re-enable DNF repositories during runtime (WARNING: this should only be done if using an unauthenticated local Pulp server. If using StackHPC Ark directly, doing this WILL leak credentials to users). +Additional packages can be added during image builds by: +- adding the `extra_packages` group to the build `inventory_groups` (see +[docs/image-build.md](./image-build.md)) +- defining a list of packages in `appliances_extra_packages_other` in e.g. +`environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. 
For example:
+
+  ```yaml
+  # environments/foo-base/inventory/group_vars/all/defaults.yml:
+  appliances_extra_packages_other:
+    - somepackage
+    - anotherpackage
+  ```
+
+For packages which come from repositories mirrored by StackHPC's "Ark" Pulp server
+(including rocky, EPEL and OpenHPC repositories), this will require either [Ark
+credentials](./image-build.md) or a [local Pulp mirror](./experimental/pulp.md)
+to be configured.
+
+The packages available from the OpenHPC repos are described in Appendix E of
+the OpenHPC installation guide (linked from the
+[OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note
+"user-facing" OpenHPC packages such as compilers, mpi libraries etc. include
+corresponding `lmod` modules.
+
+Packages *may* also be installed during `site.yml`, by adding the `cluster`
+group into the `extra_packages` group. An error will occur if Ark credentials
+are defined in this case, as they are readable by unprivileged users in the
+`.repo` files; a local Pulp mirror must be used instead.

 If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. There are various Ansible modules which might be useful for this:
 - `ansible.builtin.yum_repository`: Add a repo from a URL providing a 'repodata' directory.
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index e26bc3018..23aafd73e 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -94,9 +94,6 @@ appliances_extra_packages_default:
   - git
   - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}"
-
-appliances_extra_packages_during_configure: false
-
 appliances_extra_packages_other: []

 appliances_extra_packages: "{{ appliances_extra_packages_default + appliances_extra_packages_other }}"
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index 1d756ed66..cb49b92e2 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -161,6 +161,7 @@ freeipa_client
 # Hosts to replace system repos with Pulp repos
 # Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users
 builder
+extra_packages

 [pulp]
 # Add builder to this group to enable automatic syncing of Pulp during image build
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index 4293cbca0..8b5046bfc 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -110,4 +110,4 @@ control

 [extra_packages:children]
 # Hosts to install specified additional packages on
-cluster
+builder

From 4254e28cfae5f19dcd8802050844d040f5416395 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Thu, 30 Jan 2025 14:00:58 +0000
Subject: [PATCH 268/268] moved vtest pre-hook to nrel

---
 environments/nrel/hooks/pre.yml  |  36 ++++++++++
 environments/vtest/hooks/pre.yml | 115 ++-----------------------------
 2 files changed, 40 insertions(+), 111 deletions(-)

diff --git a/environments/nrel/hooks/pre.yml b/environments/nrel/hooks/pre.yml
index ebc472753..a7c26588e 100644
--- a/environments/nrel/hooks/pre.yml
+++ 
b/environments/nrel/hooks/pre.yml @@ -1,5 +1,41 @@ --- +- name: "Add a stack admin user" + hosts: all + # tags: pre_tasks_all + become: true + gather_facts: true + tasks: + - name: Does localhome/stack dir exist + ansible.builtin.stat: + path: /localhome/stack + register: localstack + + # - name: Does localhome/rocky dir exist + # ansible.builtin.stat: + # path: /localhome/rocky + # register: localrockyuserdir + + - name: Add stack user + ansible.builtin.include_role: + name: vs.core.stack_user + when: not localstack.stat.exists + + # - name: Add rocky user role + # ansible.builtin.include_role: + # name: vshpc.rocky_user + # when: not localrockyuserdir.stat.exists + # tags: + # - add_stack_user + tags: + - add_stack_user + +- name: A cp user home to /var/lib/USER + hosts: all + # tags: pre_tasks_all + become: true + gather_facts: false + - name: "Do the preliminary node setups" hosts: all # tags: pre_tasks_all diff --git a/environments/vtest/hooks/pre.yml b/environments/vtest/hooks/pre.yml index 3c551c9a5..f3a8a2220 100644 --- a/environments/vtest/hooks/pre.yml +++ b/environments/vtest/hooks/pre.yml @@ -1,111 +1,4 @@ -# - name: Import parent hook -# vars: -# appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" -# ansible.builtin.import_playbook: "{{ appliances_environment_root }}/../nrel/hooks/pre.yml" - -- name: "Add a stack admin user" - hosts: all - # tags: pre_tasks_all - become: true - gather_facts: true - tasks: - - name: Does localhome/stack dir exist - ansible.builtin.stat: - path: /localhome/stack - register: localstack - - # - name: Does localhome/rocky dir exist - # ansible.builtin.stat: - # path: /localhome/rocky - # register: localrockyuserdir - - - name: Add stack user - ansible.builtin.include_role: - name: vs.core.stack_user - when: not localstack.stat.exists - - # - name: Add rocky user role - # ansible.builtin.include_role: - # name: vshpc.rocky_user - # when: not localrockyuserdir.stat.exists - # tags: - # - add_stack_user - tags: - - add_stack_user - -- name: A cp user home to /var/lib/USER - hosts: all - # tags: pre_tasks_all - become: true - gather_facts: false - - # tasks: - # - name: Does home/USER dir exist - # ansible.builtin.stat: - # path: "/home/{{ appliances_local_users_ansible_user_name }}" - # register: homerocky - - # - name: Does var/lib/rocky dir exist - # ansible.builtin.stat: - # path: "/var/lib/{{ appliances_local_users_ansible_user_name }}" - # register: varlibrocky - - # - name: Cp homerocky to var/lib/ dir - # ansible.builtin.command: cp -a /home/{{ appliances_local_users_ansible_user_name }} /var/lib/{{ appliances_local_users_ansible_user_name }} - # when: - # - homerocky.stat.exists - # register: cph - # changed_when: cph.rc != 0 - - # # - name: cp auth_keys - # # shell: | - # # cp -a /home/rocky/.ssh /var/lib/rocky/.ssh - # # when: - # # - varlibrocky.stat.exists - # # - homerocky.stat.exists - - # - name: Hack passwd file for localhome/rocky - # ansible.builtin.lineinfile: - # path: /etc/passwd - # regexp: '^rocky.*1000.*' - # line: "rocky:x:1000:1000:vsRockyUser:/var/lib/{{ appliances_local_users_ansible_user_name }}:/bin/bash" - # tags: - # - rocky_localhome - -- name: "Do the preliminary node setups" - hosts: all - # tags: pre_tasks_all - become: true - tasks: - - name: Does vs_pre_complete.txt exist - ansible.builtin.stat: - path: /root/vs_pre_complete.txt - register: vs_pre_complete - - - name: Pre tasks now - ansible.builtin.include_role: - name: vshpc.prov.pre-tasks - when: not vs_pre_complete.stat.exists - 
-# TODO: KBENDL - check compatibility with new playbook -- name: "NREL pre - Mount cephfs volumes" - hosts: all - tags: - - vs_pre - - aco.core.cephfs - become: true - tasks: - - name: Does ceph exist? - ansible.builtin.stat: - path: /etc/ceph - register: vs_does_cephdir - - name: CephFS now - ansible.builtin.include_role: - name: aco.core.cephfs - tags: - - aco.core.cephfs - when: not vs_does_cephdir.stat.exists - - name: Tmp umount /home - ansible.builtin.shell: systemctl stop home.mount - when: not vs_does_cephdir.stat.exists - become: true +- name: Import parent hook + vars: + appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" + import_playbook: "{{ appliances_environment_root }}/../nrel/hooks/pre.yml"
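
With the vtest hook reduced to the import above, any genuinely vtest-specific steps can still be appended after the import in the same file. A minimal sketch of what that could look like; the second play is purely hypothetical and only illustrates the pattern:

```yaml
---
# First play: verbatim from the patch above; pulls in the shared nrel pre-hook.
- name: Import parent hook
  vars:
    appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
  import_playbook: "{{ appliances_environment_root }}/../nrel/hooks/pre.yml"

# Second play (hypothetical): site-specific steps run after the imported plays.
- name: vtest-specific pre tasks
  hosts: all
  become: true
  gather_facts: false
  tasks:
    - name: Placeholder for vtest-only setup
      ansible.builtin.debug:
        msg: "Runs after the shared nrel pre-hook"
```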