diff --git a/README.md b/README.md index 34c6908..693556f 100644 --- a/README.md +++ b/README.md @@ -121,10 +121,12 @@ accounting data such as start and end times. By default no job accounting is con `openhpc_slurm_job_comp_loc`: Location to store the job accounting records. Depends on value of `openhpc_slurm_job_comp_type`, e.g for `jobcomp/filetxt` represents a path on disk. -### slurmdbd.conf +### slurmdbd -The following options affect `slurmdbd.conf`. Please see the slurm [documentation](https://slurm.schedmd.com/slurmdbd.conf.html) for more details. -You will need to configure these variables if you have set `openhpc_enable.database` to `true`. +When the slurm database daemon (`slurmdbd`) is enabled by setting +`openhpc_enable.database` to `true` the following options must be configured. +See documentation for [slurmdbd.conf](https://slurm.schedmd.com/slurmdbd.conf.html) +for more details. `openhpc_slurmdbd_port`: Port for slurmdb to listen on, defaults to `6819`. @@ -136,6 +138,30 @@ You will need to configure these variables if you have set `openhpc_enable.datab `openhpc_slurmdbd_mysql_username`: Username for authenticating with the database, defaults to `slurm`. +Before starting `slurmdbd`, the role will check if a database upgrade is +required due to a Slurm major version upgrade and carry it out if so. +Slurm versions before 24.11 do not support this check and so no upgrade will +occur. The following variables control behaviour during this upgrade: + +`openhpc_slurm_accounting_storage_client_package`: Optional. String giving the +name of the database client package to install, e.g. `mariadb`. Default `mysql`. + +`openhpc_slurm_accounting_storage_backup_cmd`: Optional. String (possibly +multi-line) giving a command for `ansible.builtin.shell` to run a backup of the +Slurm database before performing the database upgrade. Default is the empty +string which performs no backup. + +`openhpc_slurm_accounting_storage_backup_host`: Optional. 
Inventory hostname +of the host on which to run the backup command. Default is `openhpc_slurm_accounting_storage_host`. + +`openhpc_slurm_accounting_storage_backup_become`: Optional. Whether to run the +backup command as root. Default `true`. + +`openhpc_slurm_accounting_storage_service`: Optional. Name of systemd service +for the accounting storage database, e.g. `mysql`. If this is non-empty, the +service is stopped before the backup and restarted after it, to allow for physical +backups. Default is the empty string, which does not stop/restart any service. + ## Facts This role creates local facts from the live Slurm configuration, which can be diff --git a/defaults/main.yml b/defaults/main.yml index c806809..c465fa7 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -101,3 +101,10 @@ openhpc_module_system_install: true # Auto detection openhpc_ram_multiplier: 0.95 + +# Database upgrade +openhpc_slurm_accounting_storage_service: '' +openhpc_slurm_accounting_storage_backup_cmd: '' +openhpc_slurm_accounting_storage_backup_host: "{{ openhpc_slurm_accounting_storage_host }}" +openhpc_slurm_accounting_storage_backup_become: true +openhpc_slurm_accounting_storage_client_package: mysql diff --git a/handlers/main.yml b/handlers/main.yml index 9922cfb..d29a87d 100644 --- a/handlers/main.yml +++ b/handlers/main.yml @@ -1,10 +1,4 @@ --- -# NOTE: We need this running before slurmdbd -- name: Restart Munge service - service: - name: "munge" - state: restarted - when: openhpc_slurm_service_started | bool # NOTE: we need this running before slurmctld start - name: Issue slurmdbd restart command diff --git a/molecule/test4/converge.yml b/molecule/test4/converge.yml index ec83f10..47c73bf 100644 --- a/molecule/test4/converge.yml +++ b/molecule/test4/converge.yml @@ -15,6 +15,7 @@ openhpc_slurm_partitions: - name: "compute" openhpc_cluster_name: testohpc + openhpc_slurm_accounting_storage_client_package: mariadb tasks: - name: "Include ansible-role-openhpc" include_role: diff --git 
a/tasks/install.yml b/tasks/install.yml
index b7d950d..b3a9b91 100644
--- a/tasks/install.yml
+++ b/tasks/install.yml
@@ -49,8 +49,8 @@
     install_weak_deps: false # avoids getting recommended packages
   when: openhpc_slurm_pkglist | default(false, true)
 
-- name: Install packages from openhpc_packages variable
+- name: Install other packages
   yum:
-    name: "{{ openhpc_packages }}"
+    name: "{{ openhpc_packages + [openhpc_slurm_accounting_storage_client_package] }}"
 
 ...
diff --git a/tasks/runtime.yml b/tasks/runtime.yml
index 18d75f7..e1881d3 100644
--- a/tasks/runtime.yml
+++ b/tasks/runtime.yml
@@ -56,8 +56,7 @@
     owner: munge
     group: munge
     mode: 0400
-  notify:
-    - Restart Munge service
+  register: _openhpc_munge_key_copy
 
 - name: Ensure JobComp logfile exists
   file:
@@ -159,6 +158,24 @@
   changed_when: false # so molecule doesn't fail
   become: no
 
+- name: Ensure Munge service is running
+  service:
+    name: munge
+    state: "{{ 'restarted' if _openhpc_munge_key_copy.changed else 'started' }}"
+  when: openhpc_slurm_service_started | bool
+
+- name: Check slurmdbd state
+  command: systemctl is-active slurmdbd  # noqa: command-instead-of-module
+  changed_when: false
+  failed_when: false  # rc = 0 when active
+  register: _openhpc_slurmdbd_state
+
+- name: Ensure slurm database is upgraded if slurmdbd inactive
+  import_tasks: upgrade.yml  # need import for conditional support
+  when:
+    - "_openhpc_slurmdbd_state.stdout == 'inactive'"
+    - openhpc_enable.database | default(false)
+
 - name: Notify handler for slurmd restart
   debug:
     msg: "notifying handlers" # meta: noop doesn't support 'when'
diff --git a/tasks/upgrade.yml b/tasks/upgrade.yml
new file mode 100644
index 0000000..7e4e779
--- /dev/null
+++ b/tasks/upgrade.yml
@@ -0,0 +1,89 @@
+# Upgrade the Slurm accounting database if a Slurm major version change
+# requires it. Imported from runtime.yml when slurmdbd is enabled but inactive.
+
+- name: Check if slurm database has been initialised
+  # DB is initialised on the first slurmdbd startup (without -u option).
+  # If it is not initialised, `slurmdbd -u` errors with something like
+  # > Slurm Database is somehow higher than expected '4294967294' but I only
+  # > know as high as '16'. Conversion needed.
+  community.mysql.mysql_query:
+    login_db: "{{ openhpc_slurmdbd_mysql_database }}"
+    login_user: "{{ openhpc_slurmdbd_mysql_username }}"
+    login_password: "{{ openhpc_slurmdbd_mysql_password }}"
+    login_host: "{{ openhpc_slurmdbd_host }}"
+    query: SHOW TABLES
+    config_file: ''
+  register: _openhpc_slurmdb_tables
+
+- name: Check if slurm database requires an upgrade
+  ansible.builtin.command: slurmdbd -u
+  register: _openhpc_slurmdbd_check
+  changed_when: false
+  failed_when: >-
+    _openhpc_slurmdbd_check.rc > 1 or
+    'Slurm Database is somehow higher than expected' in _openhpc_slurmdbd_check.stdout
+  # from https://github.com/SchedMD/slurm/blob/master/src/plugins/accounting_storage/mysql/as_mysql_convert.c
+  when: _openhpc_slurmdb_tables.query_result | flatten | length > 0  # i.e. when db is initialised
+
+- name: Set fact for slurm database upgrade
+  # Explanation of ifs below:
+  # - `slurmdbd -u` rc == 0 then no conversion required (from manpage)
+  # - default of 0 on rc skips upgrade steps if check was skipped because
+  #   db is not initialised
+  # - Usage message (and rc == 1) if -u option doesn't exist, in which case
+  #   it can't be a major upgrade due to existing openhpc versions
+  set_fact:
+    _openhpc_slurmdb_upgrade: >-
+      {{ false
+      if (
+          ( _openhpc_slurmdbd_check.rc | default(0) == 0)
+          or
+          ( 'Usage: slurmdbd' in _openhpc_slurmdbd_check.stderr )
+        ) else
+        true
+      }}
+
+- name: Ensure Slurm database service stopped
+  ansible.builtin.systemd:
+    name: "{{ openhpc_slurm_accounting_storage_service }}"
+    state: stopped
+  register: _openhpc_slurmdb_state
+  when:
+    - _openhpc_slurmdb_upgrade
+    - openhpc_slurm_accounting_storage_service != ''
+
+- name: Backup Slurm database
+  ansible.builtin.shell:  # noqa: command-instead-of-shell
+    cmd: "{{ openhpc_slurm_accounting_storage_backup_cmd }}"
+  delegate_to: "{{ openhpc_slurm_accounting_storage_backup_host }}"
+  become: "{{ openhpc_slurm_accounting_storage_backup_become }}"
+  changed_when: true
+  run_once: true
+  when:
+    - _openhpc_slurmdb_upgrade
+    - openhpc_slurm_accounting_storage_backup_cmd != ''
+
+- name: Ensure Slurm database service started
+  ansible.builtin.systemd:
+    name: "{{ openhpc_slurm_accounting_storage_service }}"
+    state: started
+  when:
+    - openhpc_slurm_accounting_storage_service != ''
+    - _openhpc_slurmdb_state.changed | default(false)
+
+- name: Run slurmdbd in foreground for upgrade
+  ansible.builtin.expect:
+    command: /usr/sbin/slurmdbd -D -vvv
+    # The module's default timeout of 30 s is too short for converting a
+    # large database, so wait up to an hour for the expected message.
+    timeout: 3600
+    responses:
+      (?i)Everything rolled up:
+        # See https://wiki.fysik.dtu.dk/Niflheim_system/Slurm_installation/#upgrade-slurmdbd
+        # and
+        # https://github.com/SchedMD/slurm/blob/0ce058c5adcf63001ec2ad211c65e67b0e7682a8/src/plugins/accounting_storage/mysql/as_mysql_usage.c#L1042
+        # Rollup has finished: send ctrl-C to stop the foreground daemon.
+        - "\x03"
+  become: true
+  become_user: slurm
+  when: _openhpc_slurmdb_upgrade