Skip to content
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
e1bdb58
WIP: add lustre role
sjpb Jun 25, 2024
410e0ed
allow definition of multiple lustre_mounts
sjpb Jun 26, 2024
9f47321
fix lustre build for 2.15.5 release candidate
sjpb Jun 27, 2024
63840ee
simplify lustre defaults
sjpb Jun 27, 2024
02b9ae1
allow lustre install during build to get kernel version
sjpb Jun 27, 2024
8df4a61
allow extending fat images with site-specific groups
sjpb Jun 27, 2024
3840baa
fix packer build so only roles for defined groups run
sjpb Sep 13, 2024
ed8f79b
enable control of 'extra' build image name
sjpb Sep 13, 2024
9f04b48
bump to release lustre
sjpb Sep 13, 2024
28a8297
add lnet configuration
sjpb Sep 13, 2024
6d5da54
simplify lustre mount logic
sjpb Sep 13, 2024
22be72c
provide lnet config
sjpb Sep 17, 2024
e59b2be
autodetermine lustre interface
sjpb Sep 19, 2024
32e3bda
WIP: validation needs fixing for lustre_mounts removal
sjpb Sep 19, 2024
37d727a
add working lnet.conf template
sjpb Sep 19, 2024
5506876
refactor lustre role for multiple mounts, selectable lnet label
sjpb Sep 20, 2024
2becb3a
remove unneeded comments from lustre taskfiles
sjpb Oct 1, 2024
2819fc3
fix lustre net type
sjpb Oct 1, 2024
75b20fa
fixup opensearch install permissions
sjpb Oct 4, 2024
023c030
add docs for extra builds
sjpb Oct 4, 2024
325889b
Merge branch 'main' into upstream-lustre
sjpb Oct 4, 2024
98d6cab
fix packer volume size definition
sjpb Oct 4, 2024
6589cb4
Merge branch 'upstream-lustre' of github.com:stackhpc/ansible-slurm-a…
sjpb Oct 4, 2024
6df790b
fix missing image name for cuda build
sjpb Oct 4, 2024
0cb2113
Merge branch 'main' into upstream-lustre
sjpb Oct 11, 2024
a62d148
bump CI image
sjpb Oct 11, 2024
300fbfa
Merge branch 'main' into upstream-lustre
sjpb Oct 24, 2024
ed695f0
update packer README for modified image vars
sjpb Oct 24, 2024
965e24a
move packer docs into docs/
sjpb Oct 24, 2024
3934ecb
make packer extra build directly configurable
sjpb Oct 24, 2024
f54d37d
tidy packer docs
sjpb Oct 24, 2024
e24997e
fix build error 'Error: Unset variable extra_build_volume_size'
sjpb Oct 24, 2024
177083b
fix error with null default during volume size lookup
sjpb Oct 24, 2024
676d7e8
note lnet protocol limitation
sjpb Oct 24, 2024
d8e161b
bump CI image to test
sjpb Oct 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,5 @@ roles/*
!roles/squid/**
!roles/tuned/
!roles/tuned/**

!roles/lustre/
!roles/lustre/**
29 changes: 27 additions & 2 deletions ansible/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

- hosts: builder
become: yes
gather_facts: no
gather_facts: yes
tasks:
# - import_playbook: iam.yml
- name: Install FreeIPA client
Expand All @@ -44,6 +44,11 @@
name: stackhpc.os-manila-mount
tasks_from: install.yml
when: "'manila' in group_names"
- name: Install Lustre packages
include_role:
name: lustre
tasks_from: install.yml
when: "'lustre' in group_names"

- import_playbook: extras.yml

Expand All @@ -56,60 +61,76 @@
include_role:
name: mysql
tasks_from: install.yml
when: "'mysql' in group_names"
- name: OpenHPC
import_role:
name: stackhpc.openhpc
tasks_from: install.yml
when: "'openhpc' in group_names"

# - import_playbook: portal.yml
- name: Open Ondemand server (packages)
include_role:
name: osc.ood
tasks_from: install-package.yml
vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml"
when: "'openondemand' in group_names"
# # FUTURE: install-apps.yml - this is git clones

- name: Open Ondemand server (apps)
include_role:
name: osc.ood
tasks_from: install-apps.yml
vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml"
when: "'openondemand' in group_names"

- name: Open Ondemand remote desktop
import_role:
name: openondemand
tasks_from: vnc_compute.yml
when: "'openondemand_desktop' in group_names"

- name: Open Ondemand jupyter node
import_role:
name: openondemand
tasks_from: jupyter_compute.yml
when: "'openondemand_jupyter' in group_names"

# - import_playbook: monitoring.yml:
- import_role:
name: opensearch
tasks_from: install.yml
become: true
when: "'opensearch' in group_names"
# slurm_stats - nothing to do
- import_role:
name: filebeat
tasks_from: install.yml
when: "'filebeat' in group_names"

- import_role:
# can't only run cloudalchemy.node_exporter/tasks/install.yml as needs vars from preflight.yml and triggers service start
# however starting node exporter is ok
name: cloudalchemy.node_exporter
when: "'node_exporter' in group_names"

- name: openondemand exporter
dnf:
name: ondemand_exporter
name: ondemand_exporter
when: "'openondemand' in group_names"

- name: slurm exporter
import_role:
name: slurm_exporter
tasks_from: install
vars:
slurm_exporter_state: stopped
when: "'slurm_exporter' in group_names"

- hosts: prometheus
become: yes
gather_facts: yes
tasks:
- import_role:
name: cloudalchemy.prometheus
tasks_from: preflight.yml
Expand Down Expand Up @@ -162,6 +183,10 @@
- prometheus
- promtool

- hosts: grafana
become: yes
gather_facts: yes
tasks:
- name: Include distribution variables for cloudalchemy.grafana
include_vars: "{{ appliances_repository_root }}/ansible/roles/cloudalchemy.grafana/vars/redhat.yml"
- import_role:
Expand Down
10 changes: 10 additions & 0 deletions ansible/filesystems.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,13 @@
tasks:
- include_role:
name: stackhpc.os-manila-mount

# Configure the Lustre client on every host in the `lustre` group.
- name: Setup Lustre clients
  hosts: lustre
  tags: lustre
  become: true
  tasks:
    - include_role:
        name: lustre
        # NB install is ONLY run in builder
        tasks_from: configure.yml
27 changes: 27 additions & 0 deletions ansible/roles/lustre/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# lustre

Install and configure a Lustre client. This builds RPM packages from source.

**NB:** The `install.yml` task file in this role should only be run during image build and is not idempotent. It installs the `kernel-devel` package; if that is not already installed (e.g. from an `ofed` installation), this may require enabling update of DNF packages during build using `update_enable=true`, which will upgrade the kernel as well.

**NB:** Currently this only supports RockyLinux 9.

## Role Variables

- `lustre_version`: Optional str. Version of lustre to build, default `2.15.5` which is the first version with EL9 support
- `lustre_lnet_label`: Optional str. The "lnet label" part of the host's NID, e.g. `tcp0` or `o2ib1`. Default `tcp`.
- `lustre_mgs_nid`: Required str. The NID(s) for the MGS, e.g. `192.168.227.11@tcp1` (separate multiple MGS NIDs using `:`).
- `lustre_mounts`: Required list. Define Lustre filesystems and mountpoints as a list of dicts with keys:
- `fs_name`: Required str. The name of the filesystem to mount
- `mount_point`: Required str. Path to mount filesystem at.
- `mount_state`: Optional mount state, as for [ansible.posix.mount](https://docs.ansible.com/ansible/latest/collections/ansible/posix/mount_module.html#parameter-state). Default is `lustre_mount_state`.
- `mount_options`: Optional mount options. Default is `lustre_mount_options`.
- `lustre_mount_state`: Optional default mount state for all mounts, as for [ansible.posix.mount](https://docs.ansible.com/ansible/latest/collections/ansible/posix/mount_module.html#parameter-state). Default is `mounted`.
- `lustre_mount_options`: Optional default mount options. Default values are systemd defaults from [Lustre client docs](http://wiki.lustre.org/Mounting_a_Lustre_File_System_on_Client_Nodes).

The following variables control the package build and install and should not generally need to be changed:
- `lustre_build_packages`: Optional list. Prerequisite packages required to build Lustre. See `defaults/main.yml`.
- `lustre_build_dir`: Optional str. Path to build lustre at, default `/tmp/lustre-release`.
- `lustre_configure_opts`: Optional list. Options to `./configure` command. Default builds client rpms supporting Mellanox OFED, without support for GSS keys.
- `lustre_rpm_globs`: Optional list. Shell glob patterns for rpms to install. Note order is important as the built RPMs are not in a yum repo. Default is just the `kmod-lustre-client` and `lustre-client` packages.
- `lustre_build_cleanup`: Optional bool. Whether to uninstall prerequisite packages and delete the build directories etc. Default `true`.
36 changes: 36 additions & 0 deletions ansible/roles/lustre/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Default variables for the lustre role.
lustre_version: "2.15.5" # https://www.lustre.org/lustre-2-15-5-released/
lustre_lnet_label: tcp
# No default is provided for the MGS NID; validate.yml asserts it is defined.
#lustre_mgs_nid:
lustre_mounts: []
lustre_mount_state: mounted
lustre_mount_options: "defaults,_netdev,noauto,x-systemd.automount,x-systemd.requires=lnet.service"

# below variables are for build and should not generally require changes
lustre_build_packages:
  - "kernel-devel-{{ ansible_kernel }}"
  - git
  - gcc
  - libtool
  - python3
  - python3-devel
  - openmpi
  - elfutils-libelf-devel
  - libmount-devel
  - libnl3-devel
  - libyaml-devel
  - rpm-build
  - kernel-abi-stablelists
  - libaio
  - libaio-devel
lustre_build_dir: /tmp/lustre-release
# Joined with spaces and appended to `./configure` by tasks/install.yml.
lustre_configure_opts:
  - "--disable-server"
  - "--with-linux=/usr/src/kernels/*"
  - "--with-o2ib=/usr/src/ofa_kernel/default"
  - "--disable-maintainer-mode"
  - "--disable-gss-keyring"
  - "--enable-mpitests=no"
# NB: order is important here, as not installing from a repo
lustre_rpm_globs:
  # only take part of the version as -RC versions produce _RC rpms
  - "kmod-lustre-client-{{ lustre_version | split('.') | first }}*"
  - "lustre-client-{{ lustre_version | split('.') | first }}*"
lustre_build_cleanup: true
47 changes: 47 additions & 0 deletions ansible/roles/lustre/tasks/configure.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Ask the kernel which route it would use to reach the MGS; the output is
# registered so a later task can extract the outgoing interface and source IP.
# Uses `command` with an argv list rather than `shell`: no shell features are
# needed, and the templated address is passed as a single argument instead of
# being interpolated into a shell string.
- name: Gather Lustre interface info
  ansible.builtin.command:
    argv:
      - ip
      - route
      - get
      - "{{ _lustre_mgs_ip }}"
  changed_when: false # read-only query
  register: _lustre_ip_r_mgs
  vars:
    # Address part (before the first '@') of lustre_mgs_nid
    _lustre_mgs_ip: "{{ lustre_mgs_nid | split('@') | first }}"

# Extract the outgoing interface and local source address from the first line
# of the registered `ip r get` output.
#
# The tokens are located by keyword rather than by fixed position, because the
# position of the interface/source fields depends on whether the route goes via
# a gateway:
#   routed:  "10.167.128.1 via 10.179.0.2 dev eth0 src 10.179.3.149 uid 1000"
#   on-link: "10.179.0.5 dev eth0 src 10.179.3.149 uid 1000"
- name: Set facts for Lustre interface
  ansible.builtin.set_fact:
    # token following "dev" is the interface name
    _lustre_interface: "{{ _lustre_ip_r_mgs_info[_lustre_ip_r_mgs_info.index('dev') + 1] }}"
    # token following "src" is the local IP used to reach the MGS
    _lustre_ip: "{{ _lustre_ip_r_mgs_info[_lustre_ip_r_mgs_info.index('src') + 1] }}"
  vars:
    # whitespace-separated tokens of the first output line
    _lustre_ip_r_mgs_info: "{{ _lustre_ip_r_mgs.stdout_lines.0 | split }}"

# Render the LNet configuration and record whether it changed.
- name: Write LNet configuration file
  ansible.builtin.template:
    dest: /etc/lnet.conf # exists from package install, expected by lnet service
    src: lnet.conf.j2
    mode: u=rw,go=r # from package install
    owner: root
    group: root
  register: _lnet_conf

# Restart lnet only if its configuration file changed, otherwise just make
# sure it is running.
- name: Ensure lnet service state
  ansible.builtin.systemd:
    name: lnet
    state: "{{ _lnet_conf.changed | ternary('restarted', 'started') }}"

# Create a directory for each entry in lustre_mounts, except entries whose
# effective mount state is 'absent'.
- name: Ensure mount points exist
  ansible.builtin.file:
    state: directory
    path: "{{ item.mount_point }}"
  when: "(item.mount_state | default(lustre_mount_state)) != 'absent'"
  loop: "{{ lustre_mounts }}"

# Per-mount state/options fall back to the role-wide lustre_mount_state and
# lustre_mount_options when not set on the entry.
- name: Mount lustre filesystem
  ansible.posix.mount:
    path: "{{ item.mount_point }}"
    src: "{{ lustre_mgs_nid }}:/{{ item.fs_name }}"
    fstype: lustre
    opts: "{{ item.mount_options | default(lustre_mount_options) }}"
    state: "{{ item.mount_state | default(lustre_mount_state) }}"
  loop: "{{ lustre_mounts }}"

70 changes: 70 additions & 0 deletions ansible/roles/lustre/tasks/install.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# The dnf result is registered so that the cleanup tasks at the end of this
# file can work out which packages were newly installed here.
- name: Install lustre build prerequisites
  ansible.builtin.dnf:
    name: "{{ lustre_build_packages }}"
  register: _lustre_dnf_build_packages

# Source browser: https://git.whamcloud.com/?p=fs/lustre-release.git;a=summary
- name: Clone lustre git repo
  ansible.builtin.git:
    dest: "{{ lustre_build_dir }}"
    repo: git://git.whamcloud.com/fs/lustre-release.git
    version: "{{ lustre_version }}"

# The three build steps below are plain commands run inside the cloned source
# tree; they have no `creates` guard, so they run every time this file is used.
- name: Prepare for lustre configuration
  ansible.builtin.command:
    chdir: "{{ lustre_build_dir }}"
    argv:
      - sh
      - ./autogen.sh

- name: Configure lustre build
  ansible.builtin.command:
    chdir: "{{ lustre_build_dir }}"
    # options come from lustre_configure_opts, space-separated
    cmd: "./configure {{ lustre_configure_opts | join(' ') }}"

- name: Build lustre
  ansible.builtin.command:
    chdir: "{{ lustre_build_dir }}"
    argv:
      - make
      - rpms

# Locate the built rpms in the build directory using the shell-glob patterns
# in lustre_rpm_globs (use_regex is off, so patterns are globs).
- name: Find rpms
  ansible.builtin.find:
    use_regex: false
    patterns: "{{ lustre_rpm_globs }}"
    paths: "{{ lustre_build_dir }}"
  register: _lustre_find_rpms

# Fail early with a clear message if the build produced no matching rpms.
# The condition is an explicit boolean comparison rather than a bare integer
# length, and the message refers to rpms (not repos), which is what is searched.
- name: Check rpms found
  ansible.builtin.assert:
    that: _lustre_find_rpms.files | length > 0
    fail_msg: "No lustre rpms found with lustre_rpm_globs = {{ lustre_rpm_globs }}"

# Install the locally built rpm files found above; they are installed by path
# with the GPG check disabled.
- name: Install lustre rpms
  ansible.builtin.dnf:
    disable_gpg_check: true
    name: "{{ _lustre_find_rpms.files | map(attribute='path') }}"

# Post-build cleanup; the whole block is skipped unless lustre_build_cleanup
# is true.
- when: lustre_build_cleanup | bool
  block:
    - name: Remove lustre build prerequisites
      # NB Only remove ones this role installed which weren't upgrades
      ansible.builtin.dnf:
        state: absent
        name: "{{ _new_pkgs }}"
      vars:
        # Package names taken from "Installed: ..." lines of the earlier
        # prerequisites install result.
        _installed_pkgs: |
          {{
            _lustre_dnf_build_packages.results |
            select('match', 'Installed:') |
            map('regex_replace', '^Installed: (.+?)-[0-9].*$', '\1')
          }}
        # Package names taken from "Removed: ..." lines of the same result.
        _removed_pkgs: |
          {{
            _lustre_dnf_build_packages.results |
            select('match', 'Removed:') |
            map('regex_replace', '^Removed: (.+?)-[0-9].*$', '\1')
          }}
        # Installed names that do not also appear as removed.
        _new_pkgs: "{{ _installed_pkgs | difference(_removed_pkgs) }}"

    - name: Delete lustre build dir
      ansible.builtin.file:
        state: absent
        path: "{{ lustre_build_dir }}"
27 changes: 27 additions & 0 deletions ansible/roles/lustre/tasks/validate.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Pre-flight checks for the lustre role: platform, kernel headers, SELinux
# mode and required variables.
- name: Assert using RockyLinux 9
  ansible.builtin.assert:
    that:
      - ansible_distribution_major_version | int == 9
    fail_msg: The 'lustre' role requires RockyLinux 9

# Fails (non-zero exit) if kernel-devel for the running kernel is missing.
# NB: we don't check here the kernel will remain the same after reboot etc, see ofed/install.yml
- name: Check kernel-devel package is installed
  ansible.builtin.command:
    cmd: "dnf list --installed kernel-devel-{{ ansible_kernel }}"
  changed_when: false

- name: Ensure SELinux in permissive mode
  ansible.builtin.assert:
    that:
      - selinux_state in ['permissive', 'disabled']
    fail_msg: "SELinux must be permissive for Lustre not '{{ selinux_state }}'; see variable selinux_state"

- name: Ensure lustre_mgs_nid is defined
  ansible.builtin.assert:
    that:
      - lustre_mgs_nid is defined
    fail_msg: Variable lustre_mgs_nid must be defined

# Checked per entry of lustre_mounts.
- name: Ensure lustre_mounts entries define filesystem name and mount point
  ansible.builtin.assert:
    that:
      - item.fs_name is defined
      - item.mount_point is defined
    fail_msg: All lustre_mounts entries must specify fs_name and mount_point
  loop: "{{ lustre_mounts }}"
6 changes: 6 additions & 0 deletions ansible/roles/lustre/templates/lnet.conf.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{# LNet configuration: one network with a single local NI; values are set by tasks/configure.yml and role defaults. #}
net:
  - net type: {{ lustre_lnet_label }}
    local NI(s):
      - nid: {{ _lustre_ip }}@{{ lustre_lnet_label }}
        interfaces:
          0: {{ _lustre_interface }}
8 changes: 8 additions & 0 deletions ansible/validate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,11 @@
- import_role:
name: freeipa
tasks_from: validate.yml

# Run the lustre role's validation tasks against hosts in the `lustre` group.
- name: Validate lustre configuration
  tags: lustre
  hosts: lustre
  tasks:
    - import_role:
        tasks_from: validate.yml
        name: lustre
5 changes: 4 additions & 1 deletion environments/common/inventory/groups
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,7 @@ freeipa_client
# Hosts to run TuneD configuration

[ansible_init]
# Hosts to run linux-ansible-init
# Hosts to run linux-ansible-init

[lustre]
# Hosts to run lustre client
5 changes: 4 additions & 1 deletion environments/common/layouts/everything
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,7 @@ openhpc

[ansible_init:children]
# Hosts to run ansible-init
cluster
cluster

[lustre]
# Hosts to run lustre client
Loading
Loading