diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 02b1e0be8c..60b0a47616 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -182,7 +182,17 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/configure_vast_installation.sh -{# Mount-specific runcmd entries #} + # DOCA prerequisites - mount /cert and prepare for DOCA installation + - mkdir -p {{ client_mount_path }}/slurm/ssh + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - mount -av + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + # DOCA and IB configuration - now ready before vendor_data mounts + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh +{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #} {%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %} {% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %} - {{ cmd }} @@ -222,11 +232,8 @@ - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf {% if login_compiler_node_present %} - /usr/local/bin/generate_install_uuid.sh @@ -256,8 +263,7 @@ - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - /root/ldms_sampler.sh {% endif %} - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index c14e89a82a..a4b89e1efa 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -181,7 +181,17 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/configure_vast_installation.sh -{# Mount-specific runcmd entries #} + # DOCA prerequisites - mount /cert and prepare for DOCA installation + - mkdir -p {{ client_mount_path }}/slurm/ssh + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - mount -av + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + # DOCA and IB configuration - now ready before vendor_data mounts + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh +{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #} {%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %} {% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %} - {{ cmd }} @@ -221,12 +231,9 @@ - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf {% if login_compiler_node_present %} - /usr/local/bin/generate_install_uuid.sh @@ -257,8 +264,6 @@ - /root/ldms_sampler.sh {% endif %} - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 0fc7739e1a..ad767a2e59 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -140,7 +140,17 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/configure_vast_installation.sh -{# Mount-specific runcmd entries #} + # DOCA prerequisites - mount /cert and prepare for DOCA installation + - mkdir -p {{ client_mount_path }}/slurm/ssh + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - mount -av + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + # DOCA and IB configuration - now ready before vendor_data mounts + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh +{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #} {%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %} {% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %} - {{ cmd }} @@ -172,6 +182,7 @@ # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - mkdir -p {{ client_mount_path }}/slurm/ssh - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools + # Additional Slurm login node mounts (excluding cert which was mounted earlier) - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab @@ -179,14 +190,9 @@ - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + - mount -av - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} @@ -251,8 +257,6 @@ - systemctl start slurmd - systemctl daemon-reexec - systemctl restart sshd - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - mkdir -p /etc/containers/registries.conf.d - mv /tmp/apptainer_mirror.conf /etc/containers/registries.conf.d/apptainer_mirror.conf diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 7571acf7c7..faa5c234b6 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -139,7 +139,17 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/configure_vast_installation.sh -{# Mount-specific runcmd entries #} + # DOCA prerequisites - mount /cert and prepare for DOCA installation + - mkdir -p {{ client_mount_path }}/slurm/ssh + - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - mount -av + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + # DOCA and IB configuration - now ready before vendor_data mounts + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh +{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #} {%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %} {% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %} - {{ cmd }} @@ -168,10 +178,7 @@ {% for pv_entry in cloud_init_groups_dict[functional_group_name].powervault_scripts | default([]) %} - bash /usr/local/bin/setup_iscsi_storage_{{ pv_entry.name }}.sh {% endfor %} - # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - - mkdir -p {{ client_mount_path }}/slurm/ssh - - mkdir -p {{ slurm_slurmd_log_dir_effective }} {{ slurm_slurmd_pid_dir_effective }} {{ slurm_slurmd_spool_dir_effective }} {{ slurm_epilog_dirs_all | join(' ') }} /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools - - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + # Additional Slurm login node mounts (excluding cert which was mounted earlier) - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm {{ slurm_slurmd_log_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd {{ slurm_slurmd_spool_dir_effective }} nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab @@ -179,14 +186,9 @@ - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools /hpc_tools nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - - chmod {{ file_mode }} /etc/fstab - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + - mount -av - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} @@ -252,8 +254,6 @@ - systemctl start slurmd - systemctl daemon-reexec - systemctl restart sshd - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - mkdir -p /etc/containers/registries.conf.d - mv /tmp/apptainer_mirror.conf /etc/containers/registries.conf.d/apptainer_mirror.conf diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index bde54bd9e7..2ee561109c 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -276,7 +276,17 @@ runcmd: - /usr/local/bin/set-ssh.sh -{# Mount-specific runcmd entries #} + # DOCA prerequisites - moved early to ensure RDMA is ready before vendor_data mounts + - mkdir -p {{ client_mount_path }}/slurm/ssh + - mkdir -p {{ slurm_ctld_log_dir_effective }} {{ slurmdbd_log_dir_effective }} {{ slurm_ctld_pid_dir_effective }} {{ slurmdbd_pid_dir_effective }} {{ slurm_state_save_location_effective }} {% if slurm_sched_log_dir_effective %}{{ slurm_sched_log_dir_effective }} {% endif %}/etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - mount -av + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + # DOCA and IB configuration - now ready before vendor_data mounts + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh +{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #} {%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %} {% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %} - {{ cmd }} @@ -327,11 +337,6 @@ - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -av - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ home_dir }} - chmod {{ file_mode_755 }} {{ home_dir }} - chown -R {{ slurm_user }}:{{ slurm_user }} /etc/slurm diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 92687f38e5..2d4b7ad001 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -394,7 +394,14 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/configure_vast_installation.sh -{# Mount-specific runcmd entries #} + # DOCA prerequisites - moved early to ensure RDMA is ready before vendor_data mounts + - /usr/local/bin/configure_dirs_and_mounts.sh + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + # DOCA and IB configuration - now ready before vendor_data mounts + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh +{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #} {%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %} {% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %} - {{ cmd }} @@ -427,11 +434,7 @@ {% if dcgm_support %} - /usr/local/bin/setup_dcgm.sh {% endif %} - - /usr/local/bin/configure_dirs_and_mounts.sh - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh + {% if slurm_node_present %} - | set -e diff --git a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 5ada0be1b7..3cae337b69 100644 --- a/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/provision/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -395,10 +395,18 @@ {{ pv_entry.content | indent(12) }} {% endfor %} + runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/configure_vast_installation.sh -{# Mount-specific runcmd entries #} + # DOCA prerequisites - moved early to ensure RDMA is ready before vendor_data mounts + - /usr/local/bin/configure_dirs_and_mounts.sh + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + # DOCA and IB configuration - now ready before vendor_data mounts + - bash /usr/local/bin/doca-install.sh || true + - bash /usr/local/bin/configure-ib-network.sh +{# Mount-specific runcmd entries - moved after DOCA to ensure RDMA is available #} {%- if cloud_init_groups_dict[functional_group_name].runcmd is defined %} {% for cmd in cloud_init_groups_dict[functional_group_name].runcmd %} - {{ cmd }} @@ -431,12 +439,6 @@ {% if dcgm_support %} - /usr/local/bin/setup_dcgm.sh {% endif %} - # slurm user and group created in the users module - - /usr/local/bin/configure_dirs_and_mounts.sh - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh || true - - bash /usr/local/bin/configure-ib-network.sh {% if slurm_node_present %} - | set -e @@ -494,3 +496,4 @@ - systemctl restart slurmd - echo "Cloud-Init has completed successfully." + diff --git a/provision/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 b/provision/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 index db8a7cb9cc..f1ce4389ef 100644 --- a/provision/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 +++ b/provision/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 @@ -52,26 +52,21 @@ else dnf install -y doca-ofed fi -echo "Unloading RDMA kernel modules..." -rmmod bnxt_re || true -rmmod mlx5_ib || true -rmmod ib_uverbs || true -rmmod xpmem || true -rmmod ib_core || true -rmmod mlx5_core || true +# NOTE: Removed module unload/reload sequence - DKMS handles module installation automatically +# The modprobe commands were causing module dependency issues and are not needed +# DOCA DKMS installation automatically handles kernel module replacement -echo "Loading RDMA kernel modules..." -modprobe mlx5_core || true -modprobe mlx5_ib || true -modprobe ib_core || true -modprobe ib_uverbs || true -modprobe ib_umad || true -modprobe ib_cm || true -modprobe rdma_cm || true -modprobe rdma_ucm || true -modprobe xpmem || true -modprobe knem || true -modprobe ib_ipoib || true +# NOTE: Remove MLNX OFED kernel modules to preserve standard RDMA compatibility +# This ensures rpcrdma and other standard RDMA functionality works correctly +echo "Removing MLNX OFED kernel modules to ensure standard RDMA compatibility" +if dkms status mlnx-ofa_kernel/25.10 -k $(uname -r) >/dev/null 2>&1; then + dkms uninstall mlnx-ofa_kernel/25.10 -k $(uname -r) || true + dkms remove mlnx-ofa_kernel/25.10 --all || true + depmod -a + echo "MLNX OFED kernel modules removed successfully, standard kernel modules restored" +else + echo "MLNX OFED kernel modules not found, skipping removal" +fi if command -v firewall-cmd &>/dev/null; then echo "Adding firewall ports..." @@ -86,4 +81,5 @@ else fi echo "DOCA-OFED installation completed successfully." +