Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion common/library/module_utils/local_repo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
}
CLI_FILE_PATH = "/root/.config/pulp/cli.toml"
POST_TIMEOUT = 3600
TAR_POLL_VAL = 3
TAR_POLL_VAL = 25
FILE_POLL_VAL = 1
ISO_POLL_VAL = 15
FILE_URI = "/pulp/api/v3/content/file/files/"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,201 @@

echo "===== CUDA Toolkit installation completed ====="

- path: /usr/local/bin/install_nvhpc_sdk.sh
permissions: '0755'
content: |
#!/bin/bash
#
# install_nvhpc_sdk.sh
#
# Installs the NVIDIA HPC SDK onto the shared NFS export and mounts the
# installed tree at /opt/nvidia/nvhpc:
#   1. mount the parent export at /shared-nvhpc-sdk
#   2. extract the SDK tarball on the share (skipped when already extracted)
#   3. run the silent installer with its target on the share (skipped when
#      nvc is already present there)
#   4. register the installed tree in /etc/fstab and mount it locally
#
# All output (stdout and stderr) is mirrored to the log file below.
# Every failure path logs an [ERROR] line and exits 0, i.e. the installation
# is treated as a skippable step.
# NOTE(review): presumably exit 0 is used so that a failed SDK install does
# not fail the surrounding boot sequence - confirm before changing it.

LOGFILE="/var/log/nvhpc_sdk_install.log"
exec > >(tee -a "$LOGFILE") 2>&1

echo "===== Starting NVIDIA HPC SDK installation ====="

NVHPC_VERSION="25.11"
NVHPC_PKG_NAME="nvhpc_2025_2511_Linux_x86_64_cuda_13.0"
NVHPC_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk"
NVHPC_MOUNT="/shared-nvhpc-sdk"
NVHPC_TARBALL="${NVHPC_MOUNT}/${NVHPC_PKG_NAME}.tar.gz"
NVHPC_INSTALL_DIR_NFS="${NVHPC_MOUNT}/nvhpc"
NVHPC_INSTALL_EXPORT="${NVHPC_EXPORT}/nvhpc"
NVHPC_LOCAL_MOUNT="/opt/nvidia/nvhpc"
NVHPC_EXTRACT_DIR="${NVHPC_MOUNT}/${NVHPC_PKG_NAME}"
# An extracted tree smaller than this (as reported by `du -BG`) is
# extracted again.
MIN_EXTRACT_SIZE_GB=13

#######################################
# Tests whether /etc/fstab already has an active (non-comment) nfs entry.
# Fields are compared exactly instead of through a regular expression, so
# an entry that starts in column 0 is found and path characters such as '.'
# are not interpreted as regex metacharacters.
# Arguments: $1 - device/export field, $2 - mount point field
# Returns:   0 when a matching entry exists, non-zero otherwise
#######################################
fstab_has_nfs_entry() {
  local device=$1
  local target=$2
  awk -v dev="$device" -v mp="$target" '
    $1 !~ /^#/ && $1 == dev && $2 == mp && $3 == "nfs" { found = 1 }
    END { exit !found }
  ' /etc/fstab
}

# Nothing to do when the SDK is already mounted locally.
if mountpoint -q "${NVHPC_LOCAL_MOUNT}"; then
  echo "[INFO] ${NVHPC_LOCAL_MOUNT} already mounted. Skipping NVIDIA HPC SDK installation."
  exit 0
fi

# A populated directory is taken as an existing installation. An empty one
# is not: this script creates the mount point before mounting, so an empty
# directory can simply be the leftover of an earlier failed mount.
if [[ -d "${NVHPC_LOCAL_MOUNT}" && -n "$(ls -A "${NVHPC_LOCAL_MOUNT}" 2>/dev/null)" ]]; then
  echo "[INFO] ${NVHPC_LOCAL_MOUNT} directory already populated. Assuming NVIDIA HPC SDK is installed. Skipping."
  exit 0
fi

# Mount the parent export unless a previous run already did.
mkdir -p "${NVHPC_MOUNT}"
if mountpoint -q "${NVHPC_MOUNT}"; then
  echo "[INFO] ${NVHPC_MOUNT} already mounted."
elif ! mount -t nfs "${NVHPC_EXPORT}" "${NVHPC_MOUNT}"; then
  echo "[ERROR] Failed to mount ${NVHPC_EXPORT} on ${NVHPC_MOUNT}. Skipping NVIDIA HPC SDK installation."
  exit 0
fi

# Check tarball on NFS
echo "[INFO] Checking for NVIDIA HPC SDK tarball at ${NVHPC_TARBALL}..."
if [[ ! -f "${NVHPC_TARBALL}" ]]; then
  echo "[ERROR] NVIDIA HPC SDK tarball not found at ${NVHPC_TARBALL}. Skipping NVIDIA HPC SDK installation."
  exit 0
fi

# Extract on the NFS share itself. The tree counts as already extracted
# only when the installer file exists and the tree is large enough; the
# size is measured only when the installer file is there.
EXTRACT_SIZE_GB=0
if [[ -f "${NVHPC_EXTRACT_DIR}/install" ]]; then
  EXTRACT_SIZE_GB=$(du -sBG "${NVHPC_EXTRACT_DIR}" 2>/dev/null | cut -f1 | tr -d 'G')
fi
# Fall back to 0 when du produced nothing usable.
[[ "${EXTRACT_SIZE_GB}" =~ ^[0-9]+$ ]] || EXTRACT_SIZE_GB=0

if [ "${EXTRACT_SIZE_GB}" -ge "${MIN_EXTRACT_SIZE_GB}" ]; then
  echo "[INFO] NVHPC already extracted (size=${EXTRACT_SIZE_GB}G, install file exists). Skipping extraction."
else
  echo "[INFO] Extracting NVIDIA HPC SDK tarball under ${NVHPC_MOUNT}..."
  # Print a progress line every 2000 tar checkpoints.
  if ! tar -xzf "${NVHPC_TARBALL}" -C "${NVHPC_MOUNT}" \
    --checkpoint=2000 \
    --checkpoint-action=echo="[INFO] Extracting NVHPC... please wait"; then
    echo "[ERROR] Failed to extract NVIDIA HPC SDK tarball. Skipping installation."
    exit 0
  fi
fi

echo "[INFO] Ensuring NVHPC install directory exists on NFS: ${NVHPC_INSTALL_DIR_NFS}"
mkdir -p "${NVHPC_INSTALL_DIR_NFS}"

# Run installer with target on NFS
INSTALL_BIN_DIR="${NVHPC_INSTALL_DIR_NFS}/Linux_x86_64/${NVHPC_VERSION}/compilers/bin"

if [[ -x "${INSTALL_BIN_DIR}/nvc" ]]; then
  echo "[INFO] NVHPC already installed at ${NVHPC_INSTALL_DIR_NFS} (nvc found). Skipping installer."
else
  echo "[INFO] Running NVIDIA HPC SDK installer..."
  cd "${NVHPC_EXTRACT_DIR}" || {
    echo "[ERROR] Failed to cd to extracted NVHPC directory: ${NVHPC_EXTRACT_DIR}"
    exit 0
  }

  # stdout/stderr are already mirrored to ${LOGFILE} by the exec above, so
  # the installer is not piped through a second tee (that would write every
  # installer line to the log twice).
  NVHPC_SILENT=true \
  NVHPC_INSTALL_DIR="${NVHPC_INSTALL_DIR_NFS}" \
  NVHPC_INSTALL_TYPE=auto \
    ./install
  RC=$?
  echo "[INFO] NVHPC installer exited with return code: ${RC}"

  if [ "${RC}" -ne 0 ]; then
    echo "[ERROR] NVIDIA HPC SDK installer failed with status ${RC}. Skipping further configuration."
    exit 0
  fi
fi

echo "[SUCCESS] NVIDIA HPC SDK installation on NFS completed."

# Mount NVHPC from NFS into /opt/nvidia/nvhpc
echo "[INFO] Setting up local NVHPC mount at ${NVHPC_LOCAL_MOUNT}..."

mkdir -p "${NVHPC_LOCAL_MOUNT}"

FSTAB_ENTRY="${NVHPC_INSTALL_EXPORT} ${NVHPC_LOCAL_MOUNT} nfs defaults,_netdev 0 0"

# Add fstab entry only if it does not already exist
if fstab_has_nfs_entry "${NVHPC_INSTALL_EXPORT}" "${NVHPC_LOCAL_MOUNT}"; then
  echo "[INFO] NVHPC mount already present in /etc/fstab"
else
  echo "[INFO] Adding NVHPC mount to /etc/fstab"
  echo "${FSTAB_ENTRY}" >> /etc/fstab
fi

# Mount using fstab entry
echo "[INFO] Mounting ${NVHPC_LOCAL_MOUNT}..."
if ! mount "${NVHPC_LOCAL_MOUNT}"; then
  echo "[ERROR] Failed to mount ${NVHPC_LOCAL_MOUNT}. Check NFS export and /etc/fstab."
  exit 0
fi

echo "[INFO] NVHPC successfully mounted at ${NVHPC_LOCAL_MOUNT}"

- path: /usr/local/bin/configure_nvhpc_env.sh
permissions: '0755'
content: |
#!/bin/bash
#
# configure_nvhpc_env.sh
#
# Publishes the NVIDIA HPC SDK environment on this node and then signals
# readiness on the shared export:
#   1. verify the compilers exist under /opt/nvidia/nvhpc
#   2. write /etc/profile.d/nvhpc.sh and hook it into /etc/bashrc
#   3. verify the NVHPC NFS entry is present in /etc/fstab
#   4. create the .nvhpc_env_ready marker file on the share
#
# All output is appended to the log file below. Any failed check exits 1
# before the marker is created.

LOGFILE="/var/log/nvhpc_env_config.log"
exec >> "$LOGFILE" 2>&1

echo "===== Configuring NVIDIA HPC SDK environment ====="

# Cloud-init safe defaults
export HOME=/root

NVCOMPILERS="/opt/nvidia/nvhpc"
NVARCH="$(uname -s)_$(uname -m)"
NVHPC_VERSION="25.11"

NVHPC_BASE="$NVCOMPILERS/$NVARCH/$NVHPC_VERSION"
PROFILE_FILE="/etc/profile.d/nvhpc.sh"

# Marker location on the shared export and the fstab device it depends on.
MARKER_DIR="/shared-nvhpc-sdk/nvhpc"
MARKER_TARGET="${MARKER_DIR}/.nvhpc_env_ready"
NVHPC_NFS_EXPORT="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc"

if [[ ! -d "$NVHPC_BASE/compilers/bin" ]]; then
  echo "[ERROR] NVHPC compilers not found at $NVHPC_BASE"
  exit 1
fi

echo "[INFO] NVHPC detected at $NVHPC_BASE"
echo "[INFO] Writing persistent environment to $PROFILE_FILE"

# Unescaped variables are expanded now (fixed values are written into the
# profile); escaped ones are written literally and expand when the profile
# is sourced.
cat << EOF > "$PROFILE_FILE"
# NVIDIA HPC SDK environment
export NVCOMPILERS=$NVCOMPILERS
export NVARCH=$NVARCH
export NVHPC_VERSION=$NVHPC_VERSION

export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/bin:\$PATH
export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/man

# MPI (optional but recommended)
export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/bin:\$PATH
export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/man

# Modules support (optional)
export MODULEPATH=\$NVCOMPILERS/modulefiles:\${MODULEPATH:-}
EOF

chmod 644 "$PROFILE_FILE"

# Source the profile for this script and register it in /etc/bashrc.
if [[ -f "$PROFILE_FILE" ]]; then
  echo "[INFO] Sourcing NVHPC profile for current shell"
  # Source it for script session
  source "$PROFILE_FILE"
  # Also append to /etc/bashrc if not already present (for all future shells)
  grep -q "nvhpc.sh" /etc/bashrc || echo "source $PROFILE_FILE" >> /etc/bashrc
fi

# The export path is matched as a fixed string, not as a regular expression.
if ! grep -qF -- "$NVHPC_NFS_EXPORT" /etc/fstab; then
  echo "[ERROR] NVHPC NFS path not found in /etc/fstab"
  exit 1
fi

echo "[INFO] NVHPC NFS entry found in /etc/fstab"

# -------------------------------------------------------------------
# Create marker ONLY after everything succeeds
# -------------------------------------------------------------------
if [[ ! -d "$MARKER_DIR" ]]; then
  echo "[ERROR] Marker directory missing: $MARKER_DIR"
  exit 1
fi

# Report success only when the marker file was really created.
if ! touch "$MARKER_TARGET"; then
  echo "[ERROR] Failed to create NVHPC marker: $MARKER_TARGET"
  exit 1
fi
echo "[SUCCESS] NVHPC marker created: $MARKER_TARGET"

echo "===== NVHPC environment configuration completed successfully ====="





{% if hostvars['localhost']['openldap_support'] %}
Expand Down Expand Up @@ -218,6 +413,9 @@
- systemctl restart sshd
- cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust
- sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf
# Install the NVIDIA HPC SDK on the shared NFS export and mount it at /opt/nvidia/nvhpc
- /usr/local/bin/install_nvhpc_sdk.sh


{% if hostvars['localhost']['openldap_support'] %}
- /usr/local/bin/update_ldap_conf.sh
Expand Down Expand Up @@ -319,4 +517,5 @@

- /root/ldms_sampler.sh
{% endif %}
- /usr/local/bin/configure_nvhpc_env.sh
- echo "Cloud-Init has completed successfully."
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,139 @@

echo "[INFO] ===== Completed Munge key and PAM configuration ====="


- path: /usr/local/bin/export_nvhpc_env.sh
permissions: '0755'
content: |
#!/bin/bash
#
# export_nvhpc_env.sh
#
# Writes the NVIDIA HPC SDK profile to /etc/profile.d/nvhpc.sh and then
# confirms that nvc and nvfortran resolve and run from a login shell.
# Output is mirrored to the log file below; any failed check exits 1.

LOGFILE="/var/log/export_nvhpc_env.log"
exec > >(tee -a "$LOGFILE") 2>&1

echo "===== NVHPC environment export ====="

sdk_root="/opt/nvidia/nvhpc"
sdk_arch="$(uname -s)_$(uname -m)"
sdk_version="25.11"
# Cloud-init safe default for HOME
export HOME=/root

sdk_base="${sdk_root}/${sdk_arch}/${sdk_version}"
profile_path="/etc/profile.d/nvhpc.sh"

# The compilers directory must exist before anything is written.
[[ -d "${sdk_base}/compilers/bin" ]] || {
  echo "[ERROR] NVHPC compilers not found at ${sdk_base}"
  exit 1
}

echo "[INFO] Writing persistent NVHPC profile"

# Unescaped variables are expanded while writing; escaped ones are kept
# literal and expand when the profile is sourced.
cat << EOF > "${profile_path}"
# NVIDIA HPC SDK environment

export NVCOMPILERS=${sdk_root}
export NVARCH=${sdk_arch}
export NVHPC_VERSION=${sdk_version}

export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/bin:\$PATH
export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/compilers/man

# MPI support
export PATH=\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/bin:\$PATH
export MANPATH=\${MANPATH:-}:\$NVCOMPILERS/\$NVARCH/\$NVHPC_VERSION/comm_libs/mpi/man

# Modules
export MODULEPATH=\$NVCOMPILERS/modulefiles:\${MODULEPATH:-}
EOF

chmod 644 "${profile_path}"

# Check each compiler from a separate login shell, in the same order as
# before: nvc first, then nvfortran. The first failure aborts.
echo "[INFO] Verifying NVHPC compilers"

for compiler in nvc nvfortran; do
  if ! bash -lc "command -v ${compiler} && ${compiler} --version >/dev/null"; then
    echo "[ERROR] ${compiler} verification failed"
    exit 1
  fi
done

echo "[SUCCESS] NVHPC environment exported and verified"

- path: /usr/local/bin/setup_nvhpc_sdk.sh
permissions: '0755'
content: |
#!/bin/bash
#
# setup_nvhpc_sdk.sh
#
# Mounts the parent NVHPC export, waits until the readiness marker file
# appears on it, then registers and mounts the SDK at /opt/nvidia/nvhpc.
# Output is mirrored to the log file below; every failure exits 1.

LOGFILE="/var/log/setup_nvhpc_sdk.log"
exec > >(tee -a "$LOGFILE") 2>&1

# Prints an [ERROR] line with the given message and terminates with status 1.
fail() {
  echo "[ERROR] $1"
  exit 1
}

echo "===== NVHPC SDK setup (mount + wait) ====="

parent_export="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk"
parent_mount="/shared-nvhpc-sdk"

sdk_export="{{ cloud_init_nfs_path }}/hpc_tools/nvidia_sdk/nvhpc"
sdk_mount="/opt/nvidia/nvhpc"

ready_marker="${parent_mount}/nvhpc/.nvhpc_env_ready"

# Polling budget, in seconds.
max_wait=3600
poll_every=20
waited=0

# 1. Mount parent export (only when it is not mounted yet)
mkdir -p "${parent_mount}"
mountpoint -q "${parent_mount}" || mount -t nfs "${parent_export}" "${parent_mount}"
mountpoint -q "${parent_mount}" || fail "Failed to mount NVHPC parent export"

echo "[INFO] Parent NVHPC export mounted"

# 2. Wait for readiness marker, giving up once the budget is used
echo "[INFO] Waiting for NVHPC readiness marker..."

until [ -f "${ready_marker}" ]; do
  if (( waited >= max_wait )); then
    fail "Timeout waiting for NVHPC readiness marker"
  fi
  sleep "${poll_every}"
  waited=$(( waited + poll_every ))
done

echo "[SUCCESS] NVHPC readiness marker detected"

# 3. Ensure fstab entry exists
if grep -qF "${sdk_export} ${sdk_mount}" /etc/fstab; then
  echo "[INFO] NVHPC fstab entry already present"
else
  echo "${sdk_export} ${sdk_mount} nfs defaults,_netdev 0 0" >> /etc/fstab
  echo "[INFO] NVHPC fstab entry added"
fi

# 4. Mount NVHPC SDK through its fstab entry
mkdir -p "${sdk_mount}"
mountpoint -q "${sdk_mount}" || mount "${sdk_mount}"
mountpoint -q "${sdk_mount}" || fail "Failed to mount NVHPC SDK"

echo "[SUCCESS] NVHPC SDK mounted at ${sdk_mount}"
echo "===== NVHPC setup completed ====="

- path: /usr/local/bin/configure_firewall_and_services.sh
permissions: '{{ file_mode_755 }}'
content: |
Expand Down Expand Up @@ -425,4 +557,6 @@

- /root/ldms_sampler.sh
{% endif %}
- /usr/local/bin/setup_nvhpc_sdk.sh
- /usr/local/bin/export_nvhpc_env.sh
- echo "Cloud-Init has completed successfully."
Loading