From 9eed717d5081d355a80e5c8bf64e319dea86004f Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 22 Aug 2025 10:20:02 +0100 Subject: [PATCH 1/3] Enable persistent journaling In the cloud images, /var/log/journal doesn't exist. When this directory does not exist, journald will not write any files to disk, and as such, logging data is lost across reboots. Persistent logging is useful for debugging issues that caused the nodes to reboot (provided that the logs are flushed to disk). --- ansible/.gitignore | 2 ++ ansible/bootstrap.yml | 9 +++++++ ansible/roles/journald/README.md | 4 +++ ansible/roles/journald/defaults/main.yml | 4 +++ ansible/roles/journald/tasks/main.yml | 32 ++++++++++++++++++++++++ docs/monitoring-and-logging.md | 5 +++- environments/common/inventory/groups | 4 +++ 7 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 ansible/roles/journald/README.md create mode 100644 ansible/roles/journald/defaults/main.yml create mode 100644 ansible/roles/journald/tasks/main.yml diff --git a/ansible/.gitignore b/ansible/.gitignore index 62c9a543c..18b8d7e52 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -98,3 +98,5 @@ roles/* !roles/eessi/** !roles/topology/ !roles/topology/** +!roles/journald/ +!roles/journald/** diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 50d024676..069d3f947 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -28,6 +28,15 @@ to update these variable names. 
** NB: The actual secrets will not be changed.** when: "'secrets_openhpc_' in (hostvars[inventory_hostname] | join)" +- hosts: journald + gather_facts: false + tags: + - logging + - journald + tasks: + - import_role: + name: journald + - hosts: resolv_conf become: yes gather_facts: false diff --git a/ansible/roles/journald/README.md b/ansible/roles/journald/README.md new file mode 100644 index 000000000..bc1b81004 --- /dev/null +++ b/ansible/roles/journald/README.md @@ -0,0 +1,4 @@ +# journald + +This role is used to configure journald. Please see the [role +defaults](defaults/main.yml) for a full list of configuration options. diff --git a/ansible/roles/journald/defaults/main.yml b/ansible/roles/journald/defaults/main.yml new file mode 100644 index 000000000..9dcd9d49a --- /dev/null +++ b/ansible/roles/journald/defaults/main.yml @@ -0,0 +1,4 @@ +--- +# Journald storage. One of: volatile, persistent, auto, or none. Defaults to +# `persistent`. +journald_storage: persistent diff --git a/ansible/roles/journald/tasks/main.yml b/ansible/roles/journald/tasks/main.yml new file mode 100644 index 000000000..f49355543 --- /dev/null +++ b/ansible/roles/journald/tasks/main.yml @@ -0,0 +1,32 @@ +--- +- name: Create /var/log/journal + become: true + file: + path: /var/log/journal + owner: "root" + group: "systemd-journal" + mode: 02755 + state: directory + notify: Flush journal to disk + when: journald_storage == "persistent" + +- name: Ensure journald drop in directory exists + file: + path: "/etc/systemd/journald.conf.d/" + owner: "root" + group: "root" + mode: 0770 + state: directory + become: true + +- name: Ensure journald.conf overrides are set + copy: + content: | + [Journal] + Storage={{ journald_storage }} + dest: /etc/systemd/journald.conf.d/ansible-slurm-appliance.conf + owner: root + group: root + mode: 0660 + become: true + notify: Restart journald diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index 46b405a3e..b8565c0fb 100644 --- 
a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -301,6 +301,9 @@ slurm-stats is configured `slurm-stats` role in the [slurm_openstack_tools colle The `slurm_stats` group controls the placement of the `slurm_stats` service. This should be configured to be a group with a single host. That host must be co-located on the same host as the `filebeat` service that scrapes its output. +## Logging configuration +### Journald - +The [journald](../ansible/roles/journald/README.md) role is used to customise +journald configuration. diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 57b644152..106f93a7c 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -26,6 +26,10 @@ login openhpc additional +[journald:childen] +# Hosts where journald should be configured. See ansible/roles/journald/README.md. +cluster + [builder] # Do not add hosts here manually - used as part of Packer image build pipeline. See packer/README.md. From dc1f47d5a32327b8ac90fc43eb7faf721c050bc5 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Mon, 1 Sep 2025 09:55:31 +0100 Subject: [PATCH 2/3] Address code review comments --- ansible/roles/compute_init/README.md | 1 + docs/experimental/isolated-clusters.md | 1 + environments/common/inventory/groups | 3 +-- environments/site/inventory/groups | 4 ++++ 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 7a95d2b74..9107cafc9 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -40,6 +40,7 @@ it also requires an image build with the role name added to the | hooks/pre.yml | ? 
| None at present | n/a | | validate.yml | n/a | Not relevant during boot | n/a | | bootstrap.yml | (wait for ansible-init) | Not relevant during boot | n/a | +| bootstrap.yml | journald | Fully supported | No | | bootstrap.yml | resolv_conf | Fully supported | No | | bootstrap.yml | etc_hosts | Fully supported | No | | bootstrap.yml | chrony | Fully supported | No | diff --git a/docs/experimental/isolated-clusters.md b/docs/experimental/isolated-clusters.md index c136e99ea..6895f4803 100644 --- a/docs/experimental/isolated-clusters.md +++ b/docs/experimental/isolated-clusters.md @@ -44,6 +44,7 @@ See above for definition of "Default" features. In the "Isolated?" column: | gateway | n/a | n/a - build only | | grafana | Y | Y | | hpctests | Y | Y | +| journald | Y | Y | | k3s_agent | - | ? | | k3s_server | - | ? | | k9s | - | ? | diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 106f93a7c..f666ef31f 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -26,9 +26,8 @@ login openhpc additional -[journald:childen] +[journald] # Hosts where journald should be configured. See ansible/roles/journald/README.md. -cluster [builder] # Do not add hosts here manually - used as part of Packer image build pipeline. See packer/README.md. diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index 9df61dc13..87f876525 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -81,6 +81,10 @@ cluster # Hosts to recompile Slurm for - allows supporting Slurm autodetection method 'nvml' cuda +[journald:childen] +# Hosts where journald should be configured. See ansible/roles/journald/README.md. 
+cluster + [eessi:children] # Hosts on which EESSI stack should be configured openhpc From 74c1d727b497cdc7e77d9034e6c00e11d95906ff Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Mon, 15 Sep 2025 15:58:17 +0100 Subject: [PATCH 3/3] Update environments/site/inventory/groups --- environments/site/inventory/groups | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index 7e1e323a7..fa6b992f8 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -81,7 +81,7 @@ cluster # Hosts to recompile Slurm for - allows supporting Slurm autodetection method 'nvml' cuda -[journald:childen] +[journald:children] # Hosts where journald should be configured. See ansible/roles/journald/README.md. cluster