diff --git a/easybuild/easyblocks/generic/nvidiabase.py b/easybuild/easyblocks/generic/nvidiabase.py index 7230a3b4d3c..7cc432fdc3c 100644 --- a/easybuild/easyblocks/generic/nvidiabase.py +++ b/easybuild/easyblocks/generic/nvidiabase.py @@ -48,7 +48,8 @@ from easybuild.tools import LooseVersion from easybuild.tools.build_log import EasyBuildError, print_warning from easybuild.tools.config import build_option -from easybuild.tools.filetools import adjust_permissions, remove, symlink, write_file +from easybuild.tools.filetools import adjust_permissions, remove, symlink +from easybuild.tools.filetools import write_file, apply_regex_substitutions, resolve_path from easybuild.tools.modules import MODULE_LOAD_ENV_HEADERS, get_software_root, get_software_version from easybuild.tools.run import run_shell_cmd from easybuild.tools.systemtools import AARCH64, X86_64, get_cpu_architecture, get_shared_lib_ext @@ -447,6 +448,15 @@ def install_step(self): 'NVHPC_STDPAR_CUDACC': self.default_compute_capability[0].replace('.', ''), }) + # Before installing, make sure that NVHPC chooses the CUDA version we desire + # By default, NVHPC calls 'nvc -printcudaversion', which completely ignores our set + # version, and only cares about the supported GPUs and found CUDA driver. + # On a system without GPUs, this may return an incompatible CUDA version to the one + # we define in active_cuda_version. + desired_cuda_var_regex = [(r'DESIREDCUDA=\$(.*)', f'DESIREDCUDA={str(self.active_cuda_version)}')] + apply_regex_substitutions('./install_components/install', desired_cuda_var_regex, + on_missing_match='error') + cmd_env = ' '.join([f'{name}={value}' for name, value in sorted(nvhpc_env_vars.items())]) run_shell_cmd(f"{cmd_env} ./install") @@ -543,7 +553,30 @@ def sanity_check_step(self): os.path.join(prefix, 'comm_libs', 'mpi', 'bin', 'mpicc'), os.path.join(prefix, 'comm_libs', 'mpi', 'bin', 'mpifort'), ]) + if self.cfg['module_add_nccl']: + # Ensure that NCCL path points to correct CUDA version + comm_lib_path = os.path.join(self.installdir, prefix, 'comm_libs') + expected_path = resolve_path(os.path.join(comm_lib_path, str(self.active_cuda_version), 'nccl')) + actual_path = resolve_path(os.path.join(comm_lib_path, 'nccl')) + if actual_path != expected_path: + self.sanity_check_fail_msgs.append( + f"CUDA symlink for NCCL libraries does not match: {expected_path} != {actual_path}") + if self.cfg['module_add_nvshmem']: + # Ensure that NVSHMEM path points to correct CUDA version + comm_lib_path = os.path.join(self.installdir, prefix, 'comm_libs') + expected_path = resolve_path(os.path.join(comm_lib_path, str(self.active_cuda_version), 'nvshmem')) + actual_path = resolve_path(os.path.join(comm_lib_path, 'nvshmem')) + if actual_path != expected_path: + self.sanity_check_fail_msgs.append( + f"CUDA symlink for NVSHMEM libraries does not match: {expected_path} != {actual_path}") if self.cfg['module_add_math_libs']: + # Ensure that math_libs path points to correct CUDA version + math_lib_path = os.path.join(self.installdir, prefix, 'math_libs') + expected_path = resolve_path(os.path.join(math_lib_path, str(self.active_cuda_version), 'include')) + actual_path = resolve_path(os.path.join(math_lib_path, 'include')) + if actual_path != expected_path: + self.sanity_check_fail_msgs.append( + f"CUDA symlink for math libraries does not match: {expected_path} != {actual_path}") nvhpc_files.extend([ os.path.join(prefix, 'math_libs', 'lib64', f'libcublas.{shlib_ext}'), os.path.join(prefix, 'math_libs', 'lib64', f'libcufftw.{shlib_ext}'),