Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,17 @@
runcmd:
- /usr/local/bin/set-ssh.sh
- /usr/local/bin/configure_vast_installation.sh
{# Mount-specific runcmd entries #}
# DOCA prerequisites - mount /cert and prepare for DOCA installation
- mkdir -p {{ client_mount_path }}/slurm/ssh
- mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools
- echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab
- mount -av
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
# DOCA and IB configuration - now ready before vendor_data mounts
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #}
{%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %}
{% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %}
- {{ cmd }}
Expand Down Expand Up @@ -222,11 +232,8 @@
- echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab
- chmod {{ file_mode }} /etc/fstab
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf

{% if login_compiler_node_present %}
- /usr/local/bin/generate_install_uuid.sh
Expand Down Expand Up @@ -256,8 +263,7 @@
- echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log
- /root/ldms_sampler.sh
{% endif %}
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh

- /usr/local/bin/check_slurm_controller_status.sh
- chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
- chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,17 @@
runcmd:
- /usr/local/bin/set-ssh.sh
- /usr/local/bin/configure_vast_installation.sh
{# Mount-specific runcmd entries #}
# DOCA prerequisites - mount /cert and prepare for DOCA installation
- mkdir -p {{ client_mount_path }}/slurm/ssh
- mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools
- echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab
- mount -av
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
# DOCA and IB configuration - now ready before vendor_data mounts
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #}
{%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %}
{% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %}
- {{ cmd }}
Expand Down Expand Up @@ -221,12 +231,9 @@
- echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab
- chmod {{ file_mode }} /etc/fstab
- mount -a
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf

{% if login_compiler_node_present %}
- /usr/local/bin/generate_install_uuid.sh
Expand Down Expand Up @@ -257,8 +264,6 @@
- /root/ldms_sampler.sh
{% endif %}

- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
- /usr/local/bin/check_slurm_controller_status.sh
- chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
- chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,17 @@
runcmd:
- /usr/local/bin/set-ssh.sh
- /usr/local/bin/configure_vast_installation.sh
{# Mount-specific runcmd entries #}
# DOCA prerequisites - mount /cert and prepare for DOCA installation
- mkdir -p {{ client_mount_path }}/slurm/ssh
- mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools
- echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab
- mount -av
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
# DOCA and IB configuration - now ready before vendor_data mounts
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #}
{%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %}
{% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %}
- {{ cmd }}
Expand Down Expand Up @@ -172,21 +182,17 @@
# Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia)
- mkdir -p {{ client_mount_path }}/slurm/ssh
- mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools
# Additional Slurm login node mounts (excluding cert which was mounted earlier)
- echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab
- chmod {{ file_mode }} /etc/fstab
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf

- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
- mount -av
- /usr/local/bin/check_slurm_controller_status.sh
- chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
- chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }}
Expand Down Expand Up @@ -251,8 +257,6 @@
- systemctl start slurmd
- systemctl daemon-reexec
- systemctl restart sshd
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
- mkdir -p /etc/containers/registries.conf.d
- mv /tmp/apptainer_mirror.conf /etc/containers/registries.conf.d/apptainer_mirror.conf

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,17 @@
runcmd:
- /usr/local/bin/set-ssh.sh
- /usr/local/bin/configure_vast_installation.sh
{# Mount-specific runcmd entries #}
# DOCA prerequisites - mount /cert and prepare for DOCA installation
- mkdir -p {{ client_mount_path }}/slurm/ssh
- mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools
- echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab
- mount -av
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
# DOCA and IB configuration - now ready before vendor_data mounts
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #}
{%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %}
{% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %}
- {{ cmd }}
Expand Down Expand Up @@ -168,25 +178,17 @@
{% for pv_entry in cloud_init_groups_dict[functional_group_name].powervault_scripts | default([]) %}
- bash /usr/local/bin/setup_iscsi_storage_{{ pv_entry.name }}.sh
{% endfor %}
# Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia)
- mkdir -p {{ client_mount_path }}/slurm/ssh
- mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools
- echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab
# Additional Slurm login node mounts (excluding cert which was mounted earlier)
- echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab
- echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab

- chmod {{ file_mode }} /etc/fstab
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
- mount -av
- /usr/local/bin/check_slurm_controller_status.sh
- chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }}
- chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }}
Expand Down Expand Up @@ -252,8 +254,6 @@
- systemctl start slurmd
- systemctl daemon-reexec
- systemctl restart sshd
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
- mkdir -p /etc/containers/registries.conf.d
- mv /tmp/apptainer_mirror.conf /etc/containers/registries.conf.d/apptainer_mirror.conf

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,17 @@

runcmd:
- /usr/local/bin/set-ssh.sh
{# Mount-specific runcmd entries #}
# DOCA prerequisites - moved early to ensure RDMA is ready before vendor_data mounts
- mkdir -p {{ client_mount_path }}/slurm/ssh
- mkdir -p {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }} {{ slurm_ctld_pid_dir_effective }} {{ slurmdbd_pid_dir_effective }} {{ slurm_state_save_location_effective }} {% if slurm_sched_log_dir_effective %}{{ slurm_sched_log_dir_effective }} {% endif %}/etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts
- echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab
- mount -av
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
# DOCA and IB configuration - now ready before vendor_data mounts
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #}
{%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %}
{% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %}
- {{ cmd }}
Expand Down Expand Up @@ -327,11 +337,6 @@
- echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab
- chmod {{ file_mode }} /etc/fstab
- mount -av
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf

- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
- chown -R {{ slurm_user }}:{{ slurm_user }} {{ home_dir }}
- chmod {{ file_mode_755 }} {{ home_dir }}
- chown -R {{ slurm_user }}:{{ slurm_user }} /etc/slurm
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,14 @@
runcmd:
- /usr/local/bin/set-ssh.sh
- /usr/local/bin/configure_vast_installation.sh
{# Mount-specific runcmd entries #}
# DOCA prerequisites - moved early to ensure RDMA is ready before vendor_data mounts
- /usr/local/bin/configure_dirs_and_mounts.sh
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
# DOCA and IB configuration - now ready before vendor_data mounts
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #}
{%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %}
{% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %}
- {{ cmd }}
Expand Down Expand Up @@ -427,11 +434,7 @@
{% if dcgm_support %}
- /usr/local/bin/setup_dcgm.sh
{% endif %}
- /usr/local/bin/configure_dirs_and_mounts.sh
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh

{% if slurm_node_present %}
- |
set -e
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -395,10 +395,18 @@
{{ pv_entry.content | indent(12) }}
{% endfor %}


runcmd:
- /usr/local/bin/set-ssh.sh
- /usr/local/bin/configure_vast_installation.sh
{# Mount-specific runcmd entries #}
# DOCA prerequisites - moved early to ensure RDMA is ready before vendor_data mounts
Comment thread
jagadeeshnv marked this conversation as resolved.
- /usr/local/bin/configure_dirs_and_mounts.sh
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
# DOCA and IB configuration - now ready before vendor_data mounts
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #}
{%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %}
{% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %}
- {{ cmd }}
Expand Down Expand Up @@ -431,12 +439,6 @@
{% if dcgm_support %}
- /usr/local/bin/setup_dcgm.sh
{% endif %}
# slurm user and group created in the users module
- /usr/local/bin/configure_dirs_and_mounts.sh
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
- bash /usr/local/bin/doca-install.sh || true
- bash /usr/local/bin/configure-ib-network.sh
{% if slurm_node_present %}
- |
set -e
Expand Down Expand Up @@ -494,3 +496,4 @@

- systemctl restart slurmd
- echo "Cloud-Init has completed successfully."

Original file line number Diff line number Diff line change
Expand Up @@ -52,26 +52,21 @@ else
dnf install -y doca-ofed
fi

echo "Unloading RDMA kernel modules..."
rmmod bnxt_re || true
rmmod mlx5_ib || true
rmmod ib_uverbs || true
rmmod xpmem || true
rmmod ib_core || true
rmmod mlx5_core || true
# NOTE: Removed module unload/reload sequence - DKMS handles module installation automatically
# The modprobe commands were causing module dependency issues and are not needed
# DOCA DKMS installation automatically handles kernel module replacement

echo "Loading RDMA kernel modules..."
modprobe mlx5_core || true
modprobe mlx5_ib || true
modprobe ib_core || true
modprobe ib_uverbs || true
modprobe ib_umad || true
modprobe ib_cm || true
modprobe rdma_cm || true
modprobe rdma_ucm || true
modprobe xpmem || true
modprobe knem || true
modprobe ib_ipoib || true
# NOTE: Remove MLNX OFED kernel modules to preserve standard RDMA compatibility
# This ensures rpcrdma and other standard RDMA functionality works correctly
#
# Globals:
#   MLNX_OFA_DKMS_VERSION (read, optional) - version of the mlnx-ofa_kernel
#     DKMS module to remove; defaults to 25.10 when unset or empty.
# Outputs:
#   Progress messages on stdout.
remove_mlnx_ofa_dkms() {
  local dkms_module="mlnx-ofa_kernel/${MLNX_OFA_DKMS_VERSION:-25.10}"
  local running_kernel
  local dkms_state

  running_kernel="$(uname -r)"

  echo "Removing MLNX OFED kernel modules to ensure standard RDMA compatibility"

  # Decide on the *output* of `dkms status`, not its exit code: dkms can exit 0
  # while printing nothing when the module is not registered, which would make
  # the "not found" branch unreachable. A missing dkms binary (or any other
  # failure) is treated the same as "module not present".
  dkms_state="$(dkms status "${dkms_module}" -k "${running_kernel}" 2>/dev/null)" \
    || dkms_state=""

  if [[ -n "${dkms_state}" ]]; then
    # Best effort: a module that is only partially registered must not abort
    # the rest of the installation, hence the deliberate `|| true`.
    dkms uninstall "${dkms_module}" -k "${running_kernel}" || true
    dkms remove "${dkms_module}" --all || true
    # Rebuild module dependency data after the DKMS modules were removed.
    depmod -a
    echo "MLNX OFED kernel modules removed successfully, standard kernel modules restored"
  else
    echo "MLNX OFED kernel modules not found, skipping removal"
  fi
}

remove_mlnx_ofa_dkms

if command -v firewall-cmd &>/dev/null; then
echo "Adding firewall ports..."
Expand All @@ -86,4 +81,5 @@ else
fi

echo "DOCA-OFED installation completed successfully."


Loading