From a2f5544c51531bfb5b404b28f6edc9b08aff4af9 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 6 Aug 2025 22:30:32 +0200 Subject: [PATCH 01/27] Try to change the subdir in which the CUDA toolkit is installed so that it also doesnt include the CPU microarchitecture --- ....1-rebuild-2023a-for-cuda-sanity-check.yml | 10 ++++++++ eb_hooks.py | 25 +++++++++++++++---- .../nvidia/install_cuda_and_libraries.sh | 11 ++++++-- 3 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250805-eb-5.1.1-rebuild-2023a-for-cuda-sanity-check.yml diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250805-eb-5.1.1-rebuild-2023a-for-cuda-sanity-check.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250805-eb-5.1.1-rebuild-2023a-for-cuda-sanity-check.yml new file mode 100644 index 00000000..ef794f3e --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250805-eb-5.1.1-rebuild-2023a-for-cuda-sanity-check.yml @@ -0,0 +1,10 @@ +# We'll build all CUDA software, for various reasons +# 1. We now have a proper CUDA sanity check, and if anything was 'wrong' with our current CUDA installs, we'd like +# to know about it +# 2. The PR implementing a CI to check for differences between officially supported CUDA Compute Capabilities shows +# that there are a lot of missing installations https://github.com/EESSI/software-layer/pull/1087 . 
A rebuild PR like +# this will have the convenient side effect of filling all those holes +easyconfigs: + - CUDA-12.1.1.eb: + options: + accept-eula-for: CUDA diff --git a/eb_hooks.py b/eb_hooks.py index bdf8f49b..afda460d 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -151,7 +151,7 @@ def parse_list_of_dicts_env(var_name): if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', var_name): raise ValueError(f"Invalid environment variable name: {var_name}") list_string = os.getenv(var_name, '[]') - + list_of_dicts = [] try: # Try JSON format first @@ -162,7 +162,7 @@ def parse_list_of_dicts_env(var_name): list_of_dicts = ast.literal_eval(list_string) except (ValueError, SyntaxError): raise ValueError(f"Environment variable '{var_name}' does not contain a valid list of dictionaries.") - + return list_of_dicts @@ -211,7 +211,7 @@ def post_ready_hook(self, *args, **kwargs): parallel = self.parallel else: parallel = self.cfg['parallel'] - + if parallel == 1: return # no need to limit if already using 1 core @@ -733,7 +733,7 @@ def pre_configure_hook_score_p(self, *args, **kwargs): def pre_configure_hook_vsearch(self, *args, **kwargs): """ Pre-configure hook for VSEARCH - - Workaround for a Zlib macro being renamed in Gentoo, see https://bugs.gentoo.org/383179 + - Workaround for a Zlib macro being renamed in Gentoo, see https://bugs.gentoo.org/383179 (solves "expected initializer before 'OF'" errors) """ if self.name == 'VSEARCH': @@ -1301,13 +1301,28 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, al log.debug("%s is not found in allowlist, so replacing it with symlink: %s", print_name, full_path) # the host_injections path is under a fixed repo/location for CUDA or cuDNN + # full_path is something similar to + # /cvmfs/software.eessi.io/version/.../x86_64/amd/zen4/accel/nvidia/cc90/.../CUDA/bin/nvcc + # host_inj_path will then be + # /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/accel/nvidia/cc90/.../CUDA/bin/nvcc host_inj_path = 
re.sub(EESSI_INSTALLATION_REGEX, HOST_INJECTIONS_LOCATION, full_path) # CUDA and cu* libraries themselves don't care about compute capability so remove this # duplication from under host_injections (symlink to a single CUDA or cu* library # installation for all compute capabilities) accel_subdir = get_eessi_envvar("EESSI_ACCELERATOR_TARGET") + software_subdir = get_eessi_envvar("EESSI_SOFTWARE_SUBDIR") + cpu_family = get_eessi_envvar("EESSI_CPU_FAMILY") + # If accel_subdir is defined, remove it from the full path + # After removal of accel_subdir, host_inj_path will be something like + # /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/.../CUDA/bin/nvcc if accel_subdir: - host_inj_path = host_inj_path.replace("/accel/%s" % accel_subdir, '') + host_inj_path = host_inj_path.replace(accel_subdir, '') + # /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/.../CUDA/bin/nvcc + # If software_subdir is defined (it should always be...), replace it by only the cpu_family + # After this substitution, host_inj_path will be something like + # /cvmfs/software.eessi.io/host_injections/.../x86_64/.../CUDA/bin/nvcc + if software_subdir and cpu_family: + host_inj_path.replace(software_subdir, cpu_family) # make sure source and target of symlink are not the same if full_path == host_inj_path: raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 5123a7c1..85c17c9b 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -132,8 +132,15 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do # If there is a GPU on the node, the installation path will by default have an # accelerator subdirectory. For CUDA and cu*, these are binary installations and - # don't care about the target compute capability. 
Our hooks are aware of this and - # therefore expect CUDA to be available under EESSI_SITE_SOFTWARE_PATH + # don't care about the target compute capability nor the CPU microarchitecture. + # Our hooks are aware of this and therefore expect CUDA to be available under + # something like EESSI_SITE_SOFTWARE_PATH, but then with the CPU micro-architecture + # stripped + # This sed command will capture everything from the EESSI_SITE_SOFTWARE_PATH up until + # the EESSI_SOFTWARE_SUBDIR in a capture group. It will the replace that with the content + # of the capture group and then have the EESSI_CPU_FAMILY appended + # Thus EESSI_SITE_CPU_FAMILY_PATH is something like /cvmfs/software.eessi.io/host_injections/.../x86_64 + EESSI_SITE_CPU_FAMILY_PATH=$(echo "$EESSI_SITE_SOFTWARE_PATH" | sed 's/\(.*\)'"$EESSI_SOFTWARE_SUBDIR"'/\1'"$EESSI_CPU_FAMILY"'/') export EASYBUILD_INSTALLPATH=$EESSI_SITE_SOFTWARE_PATH # Install modules in hidden .modules dir to keep track of what was installed before From 333e00971a25b55e3dd3b59d6230260644dfc071 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 6 Aug 2025 22:56:25 +0200 Subject: [PATCH 02/27] Fix sed command --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 85c17c9b..f3bfb8ad 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -140,7 +140,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do # the EESSI_SOFTWARE_SUBDIR in a capture group. 
It will the replace that with the content # of the capture group and then have the EESSI_CPU_FAMILY appended # Thus EESSI_SITE_CPU_FAMILY_PATH is something like /cvmfs/software.eessi.io/host_injections/.../x86_64 - EESSI_SITE_CPU_FAMILY_PATH=$(echo "$EESSI_SITE_SOFTWARE_PATH" | sed 's/\(.*\)'"$EESSI_SOFTWARE_SUBDIR"'/\1'"$EESSI_CPU_FAMILY"'/') + EESSI_SITE_CPU_FAMILY_PATH=$(echo "$EESSI_SITE_SOFTWARE_PATH" | sed 's|\(.*\)'"$EESSI_SOFTWARE_SUBDIR"'|\1'"$EESSI_CPU_FAMILY"'|') export EASYBUILD_INSTALLPATH=$EESSI_SITE_SOFTWARE_PATH # Install modules in hidden .modules dir to keep track of what was installed before From 1ac3748ddac80db62901e870268787e9e4cc2c4a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 6 Aug 2025 22:59:33 +0200 Subject: [PATCH 03/27] Ok, now actually overwrite the EASYBUILD_INSTALLPATH --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index f3bfb8ad..87207283 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -141,7 +141,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do # of the capture group and then have the EESSI_CPU_FAMILY appended # Thus EESSI_SITE_CPU_FAMILY_PATH is something like /cvmfs/software.eessi.io/host_injections/.../x86_64 EESSI_SITE_CPU_FAMILY_PATH=$(echo "$EESSI_SITE_SOFTWARE_PATH" | sed 's|\(.*\)'"$EESSI_SOFTWARE_SUBDIR"'|\1'"$EESSI_CPU_FAMILY"'|') - export EASYBUILD_INSTALLPATH=$EESSI_SITE_SOFTWARE_PATH + export EASYBUILD_INSTALLPATH=$EESSI_SITE_CPU_FAMILY_PATH # Install modules in hidden .modules dir to keep track of what was installed before # (this action is temporary, and we do not call Lmod again within the current shell context, but in EasyBuild From 2cfe3065ff06ed00a0f49a09986b6bf4cf716909 Mon Sep 17 00:00:00 2001 
From: Caspar van Leeuwen Date: Wed, 6 Aug 2025 23:05:46 +0200 Subject: [PATCH 04/27] Fix the installpath that is reported --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 87207283..b1650f65 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -265,7 +265,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do cp -a ${eb_last_log} . fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..." else - echo_green "all installations at ${EESSI_SITE_SOFTWARE_PATH}/software/... succeeded!" + echo_green "all installations at ${EASYBUILD_INSTALLPATH}/... succeeded!" fi # clean up tmpdir content From d49059cd60490876d7b39f7d2acb82714d548d58 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 6 Aug 2025 23:07:47 +0200 Subject: [PATCH 05/27] Add software to reported dir --- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index b1650f65..056743d1 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -265,7 +265,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do cp -a ${eb_last_log} . fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..." else - echo_green "all installations at ${EASYBUILD_INSTALLPATH}/... succeeded!" + echo_green "all installations at ${EASYBUILD_INSTALLPATH}/software/... succeeded!" 
fi # clean up tmpdir content From b63778e04bb6235e02534a99fda3611c0a3b6174 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 6 Aug 2025 23:17:29 +0200 Subject: [PATCH 06/27] Reassing host_inj_path --- eb_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index afda460d..b469f364 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -1322,7 +1322,7 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, al # After this substitution, host_inj_path will be something like # /cvmfs/software.eessi.io/host_injections/.../x86_64/.../CUDA/bin/nvcc if software_subdir and cpu_family: - host_inj_path.replace(software_subdir, cpu_family) + host_inj_path = host_inj_path.replace(software_subdir, cpu_family) # make sure source and target of symlink are not the same if full_path == host_inj_path: raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " From e3f746f722d5155b07a3fdae192b967c866bf45d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 13:17:39 +0200 Subject: [PATCH 07/27] Change host injections location for binary non-redistributable files to e.g. /cvmfs/software.eessi.io/host_injections/x86_64, i.e. only include the CPU family in the prefix, not microarchitecture or accelerator architecture. 
Since these are binary installs, we don't need multiple copies, and requiring site admins to run the install scripts once per micro-architecture is just annoying (and requires more storage) --- eb_hooks.py | 34 +++++++++++++------ .../nvidia/install_cuda_and_libraries.sh | 6 ++-- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index b469f364..befd4240 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -1199,7 +1199,7 @@ def post_postproc_cuda(self, *args, **kwargs): # replace files that are not distributable with symlinks into # host_injections - replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) + replace_binary_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) else: print_msg(f"EESSI hook to respect CUDA license not triggered for installation path {self.installdir}") else: @@ -1249,16 +1249,19 @@ def post_postproc_cudnn(self, *args, **kwargs): # replace files that are not distributable with symlinks into # host_injections - replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) + replace_binary_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) else: print_msg(f"EESSI hook to respect cuDDN license not triggered for installation path {self.installdir}") else: raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!") -def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist): +def replace_binary_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist): """ Replace files that cannot be distributed with symlinks into host_injections + Since these are binary files, only the CPU family will be included in the prefix, + no microarchitecture or accelerator architecture will be included. 
For example, + /cvmfs/software.eessi.io/host_injections/x86_64/suffix/to/actual/file """ # Different packages use different ways to specify which files or file # 'types' may be redistributed. For CUDA, the 'EULA.txt' lists full file @@ -1310,19 +1313,28 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, al # duplication from under host_injections (symlink to a single CUDA or cu* library # installation for all compute capabilities) accel_subdir = get_eessi_envvar("EESSI_ACCELERATOR_TARGET") - software_subdir = get_eessi_envvar("EESSI_SOFTWARE_SUBDIR") - cpu_family = get_eessi_envvar("EESSI_CPU_FAMILY") # If accel_subdir is defined, remove it from the full path # After removal of accel_subdir, host_inj_path will be something like # /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/.../CUDA/bin/nvcc if accel_subdir: host_inj_path = host_inj_path.replace(accel_subdir, '') - # /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/.../CUDA/bin/nvcc - # If software_subdir is defined (it should always be...), replace it by only the cpu_family - # After this substitution, host_inj_path will be something like - # /cvmfs/software.eessi.io/host_injections/.../x86_64/.../CUDA/bin/nvcc - if software_subdir and cpu_family: - host_inj_path = host_inj_path.replace(software_subdir, cpu_family) + software_subdir = get_eessi_envvar("EESSI_SOFTWARE_SUBDIR") + cpu_family = get_eessi_envvar("EESSI_CPU_FAMILY") + os_type = get_eessi_envvar("EESSI_OS_TYPE") + eessi_version = get_eessi_envvar("EESSI_VERSION") + if software_subdir and cpu_family and os_type and eessi_version: + # Compose the string to be removed: + partial_path = f"{eessi_version}/software/{os_type}/{software_subdir}" + # After this, host_inj_path will be e.g. + # /cvmfs/software.eessi.io/host_injections/x86_64/software/CUDA/bin/nvcc + host_inj_path = host_inj_path.replace(partial_path, cpu_family) + else: + msg = "Failed to construct path to symlink for file (%s). 
All of the following values " + msg += "have to be defined: EESSI_SOFTWARE_SUBDIR='%s', EESSI_CPU_FAMILY='%s', " + msg += "EESSI_OS_TYPE='%s', EESSI_VERSION='%s'. Failed to replace non-redistributable file " + msg += "with symlink, aborting..." + raise EasyBuildError(msg, full_path, software_subdir, cpu_family, os_type, eessi_version) + # make sure source and target of symlink are not the same if full_path == host_inj_path: raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 056743d1..4baf3005 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -137,10 +137,10 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do # something like EESSI_SITE_SOFTWARE_PATH, but then with the CPU micro-architecture # stripped # This sed command will capture everything from the EESSI_SITE_SOFTWARE_PATH up until - # the EESSI_SOFTWARE_SUBDIR in a capture group. It will the replace that with the content + # the EESSI_VERSION in a capture group. 
It will the replace that with the content # of the capture group and then have the EESSI_CPU_FAMILY appended - # Thus EESSI_SITE_CPU_FAMILY_PATH is something like /cvmfs/software.eessi.io/host_injections/.../x86_64 - EESSI_SITE_CPU_FAMILY_PATH=$(echo "$EESSI_SITE_SOFTWARE_PATH" | sed 's|\(.*\)'"$EESSI_SOFTWARE_SUBDIR"'|\1'"$EESSI_CPU_FAMILY"'|') + # Thus EESSI_SITE_CPU_FAMILY_PATH is then something like /cvmfs/software.eessi.io/host_injections/x86_64 + EESSI_SITE_CPU_FAMILY_PATH=$(echo "$EESSI_SITE_SOFTWARE_PATH" | sed 's|\(.*\)'"$EESSI_VERSION"/software/"$EESSI_OS_TYPE"/"$EESSI_SOFTWARE_SUBDIR"'|\1'"$EESSI_CPU_FAMILY"'|') export EASYBUILD_INSTALLPATH=$EESSI_SITE_CPU_FAMILY_PATH # Install modules in hidden .modules dir to keep track of what was installed before From 2e10d3d7aa44468fc391175179518dbbb1757b6b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 13:52:01 +0200 Subject: [PATCH 08/27] Update Lmod hook to print more specific warning in case the CUDA / cuDNN package was found in the old host-injections location (with micro-arch specific subdir). Also, adapt the path to search for the regular LmodError --- create_lmodsitepackage.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index f3d11aeb..dbeb95b7 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -123,13 +123,29 @@ local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/site_specific_config/gpu/.\\n" if packagesList[simpleName] then -- simpleName is a module in packagesList - -- get the full host_injections path - local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + -- first, check the old host_injections path. 
If that exists, print a more targetted, explainatory warning + local previousHostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + local previousPackageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" + local previousPackageDirExists = isDir(previousPackageEasyBuildDir) + + -- get the host_injections path, and add only the EESSI_CPU_FAMILY at the end + local strip_suffix = os.getenv('EESSI_VERSION') .. "/software/" .. os.getenv('EESSI_OS_TYPE') .. "/" + strip_suffix = strip_suffix .. "os.getenv('EESSI_SOFTWARE_SUBDIR') + local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", strip_suffix, os.getenv('EESSI_CPU_FAMILY')) -- build final path where the software should be installed local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" local packageDirExists = isDir(packageEasyBuildDir) - if not packageDirExists then + if previousPackageDirExists and not packageDirExists then + local targettedAdvice = "but while the module file exists, the actual software is not entirely shipped with EESSI " + advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI " + advice = advice .. "can find it.\\n" + advice = advice .. "Note that a full copy is installed at " .. previoushostInjections .. "/software/" .. t.modFullName ". " + advice = advice .. "However, EESSI now expects it in a different location, namely at " + advice = advice .. hostInjections .. "/software/" .. t.modFullName "." + advice = advice .. "Please re-install the package at the new location." + advice = advice .. refer_to_docs + elseif not packageDirExists then local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI " advice = advice .. 
"can find it.\\n" From 26cd405eb595bbfbbc5f5a6a1db72dba1aeb06a3 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 14:58:21 +0200 Subject: [PATCH 09/27] Make sure update SitePackage.lua is included in the tarball --- create_tarball.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/create_tarball.sh b/create_tarball.sh index d104e210..11cce2b3 100755 --- a/create_tarball.sh +++ b/create_tarball.sh @@ -69,6 +69,11 @@ if [ -n "${accel_subdir}" ]; then fi for subdir in ${sw_subdirs}; do + if [ -d ${eessi_version}/software/${os}/${subdir}/.lmod ]; then + # lmod SitePackage or lmodrc files + find ${eessi_version}/software/${os}/${subdir}/.lmod - type f \! -name '.wh.*' >> ${files_list} + fi + if [ -d ${eessi_version}/software/${os}/${subdir}/modules ]; then # module files find ${eessi_version}/software/${os}/${subdir}/modules -type f \! -name '.wh.*' >> ${files_list} From b124fcfd58d8164913de624de6a70093e1bda53c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 15:08:01 +0200 Subject: [PATCH 10/27] fix the replacement, since the already contains , so now it was trying to find /accel/accel/nvidia/ccXX. 
Fixed now to /accel/nvidia/ccXX --- create_lmodsitepackage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index dbeb95b7..28b5d029 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -309,7 +309,7 @@ def error(msg): # the install path (if it exists) accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET") if accel_subdir: - sitepackage_path = sitepackage_path.replace("/accel/%s" % accel_subdir, '') + sitepackage_path = sitepackage_path.replace("/%s" % accel_subdir, '') try: os.makedirs(os.path.dirname(sitepackage_path), exist_ok=True) with open(sitepackage_path, 'w') as fp: From ecfd373ab1e42904a7b4d1409b8d8630f61e61fa Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 15:09:05 +0200 Subject: [PATCH 11/27] Undo change on create_tarball.sh --- create_tarball.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/create_tarball.sh b/create_tarball.sh index 11cce2b3..d104e210 100755 --- a/create_tarball.sh +++ b/create_tarball.sh @@ -69,11 +69,6 @@ if [ -n "${accel_subdir}" ]; then fi for subdir in ${sw_subdirs}; do - if [ -d ${eessi_version}/software/${os}/${subdir}/.lmod ]; then - # lmod SitePackage or lmodrc files - find ${eessi_version}/software/${os}/${subdir}/.lmod - type f \! -name '.wh.*' >> ${files_list} - fi - if [ -d ${eessi_version}/software/${os}/${subdir}/modules ]; then # module files find ${eessi_version}/software/${os}/${subdir}/modules -type f \! 
-name '.wh.*' >> ${files_list} From 448d9e0235386169d19b0f00821e83902ca169c7 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 15:34:19 +0200 Subject: [PATCH 12/27] Fix typo --- create_lmodsitepackage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 28b5d029..9c9398e1 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -130,7 +130,7 @@ -- get the host_injections path, and add only the EESSI_CPU_FAMILY at the end local strip_suffix = os.getenv('EESSI_VERSION') .. "/software/" .. os.getenv('EESSI_OS_TYPE') .. "/" - strip_suffix = strip_suffix .. "os.getenv('EESSI_SOFTWARE_SUBDIR') + strip_suffix = strip_suffix .. os.getenv('EESSI_SOFTWARE_SUBDIR') local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", strip_suffix, os.getenv('EESSI_CPU_FAMILY')) -- build final path where the software should be installed From 64069f9a40ad0725b0fdd898cb9a42d64784cc33 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 16:12:14 +0200 Subject: [PATCH 13/27] Small fix, forgot to change name when copying --- create_lmodsitepackage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 9c9398e1..83e395d9 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -125,7 +125,7 @@ -- simpleName is a module in packagesList -- first, check the old host_injections path. If that exists, print a more targetted, explainatory warning local previousHostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') - local previousPackageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" + local previousPackageEasyBuildDir = previousHostInjections .. "/software/" .. t.modFullName .. 
"/easybuild" local previousPackageDirExists = isDir(previousPackageEasyBuildDir) -- get the host_injections path, and add only the EESSI_CPU_FAMILY at the end From 2b03950d6f0c6feef28f356441c779ae730555a1 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 16:27:02 +0200 Subject: [PATCH 14/27] Fixed more issues --- create_lmodsitepackage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 83e395d9..85b96e49 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -137,10 +137,10 @@ local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" local packageDirExists = isDir(packageEasyBuildDir) if previousPackageDirExists and not packageDirExists then - local targettedAdvice = "but while the module file exists, the actual software is not entirely shipped with EESSI " + local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI " advice = advice .. "can find it.\\n" - advice = advice .. "Note that a full copy is installed at " .. previoushostInjections .. "/software/" .. t.modFullName ". " + advice = advice .. "Note that a full copy is installed at " .. previousHostInjections .. "/software/" .. t.modFullName .. ". " advice = advice .. "However, EESSI now expects it in a different location, namely at " advice = advice .. hostInjections .. "/software/" .. t.modFullName "." advice = advice .. "Please re-install the package at the new location." 
From 56bfd453dec5d57a2331bfe8d5af7d11d65bfffc Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 16:27:35 +0200 Subject: [PATCH 15/27] Fixed more issues --- create_lmodsitepackage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 85b96e49..9a77171a 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -142,7 +142,7 @@ advice = advice .. "can find it.\\n" advice = advice .. "Note that a full copy is installed at " .. previousHostInjections .. "/software/" .. t.modFullName .. ". " advice = advice .. "However, EESSI now expects it in a different location, namely at " - advice = advice .. hostInjections .. "/software/" .. t.modFullName "." + advice = advice .. hostInjections .. "/software/" .. t.modFullName .. "." advice = advice .. "Please re-install the package at the new location." advice = advice .. refer_to_docs elseif not packageDirExists then From 5e92c55c2a0a803b2fc539bd08cce929e83b29f6 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 16:32:09 +0200 Subject: [PATCH 16/27] Make sure it actually raises an error --- create_lmodsitepackage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 9a77171a..f1f66170 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -145,6 +145,7 @@ advice = advice .. hostInjections .. "/software/" .. t.modFullName .. "." advice = advice .. "Please re-install the package at the new location." advice = advice .. refer_to_docs + LmodError("\\nYou requested to load ", simpleName, " ", advice) elseif not packageDirExists then local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. 
" package where EESSI " From 7119169f6ad9464c07fdab018ff700417c79a737 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 16:35:23 +0200 Subject: [PATCH 17/27] Insert two spaces --- create_lmodsitepackage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index f1f66170..3aa91401 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -142,8 +142,8 @@ advice = advice .. "can find it.\\n" advice = advice .. "Note that a full copy is installed at " .. previousHostInjections .. "/software/" .. t.modFullName .. ". " advice = advice .. "However, EESSI now expects it in a different location, namely at " - advice = advice .. hostInjections .. "/software/" .. t.modFullName .. "." - advice = advice .. "Please re-install the package at the new location." + advice = advice .. hostInjections .. "/software/" .. t.modFullName .. ". " + advice = advice .. "Please re-install the package at the new location. " advice = advice .. refer_to_docs LmodError("\\nYou requested to load ", simpleName, " ", advice) elseif not packageDirExists then From d4af942446940fd49a45a73f12cf50d95322066c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 16:39:25 +0200 Subject: [PATCH 18/27] Cleanout the old installation script for the CUDA toolkit, as it is replaced (and would install in an outdated prefix) --- .../nvidia/install_cuda_host_injections.sh | 209 +----------------- 1 file changed, 1 insertion(+), 208 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh index 954cf45c..9bbdd6a1 100755 --- a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh @@ -1,211 +1,4 @@ #!/usr/bin/env bash -# This script can be used to install CUDA under the `.../host_injections` directory. 
-# This provides the parts of the CUDA installation that cannot be redistributed as -# part of EESSI due to license limitations. While GPU-based software from EESSI will -# _run_ without these, installation of additional CUDA software requires the CUDA -# installation(s) under `host_injections` to be present. -# -# The `host_injections` directory is a variant symlink that by default points to -# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see -# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the -# installation to be successful, this directory needs to be writeable by the user -# executing this script. - -# Initialise our bash functions TOPDIR=$(dirname $(realpath $BASH_SOURCE)) -source "$TOPDIR"/../../utils.sh - -# Function to display help message -show_help() { - echo "Usage: $0 [OPTIONS]" - echo "Options:" - echo " --help Display this help message" - echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" - echo " CUDA, see the EULA at" - echo " https://docs.nvidia.com/cuda/eula/index.html" - echo " -c, --cuda-version CUDA_VERSION Specify a version o CUDA to install (must" - echo " have a corresponding easyconfig in the" - echo " EasyBuild release)" - echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" - echo " storage during the CUDA install" - echo " (must have >10GB available)" -} - -# Initialize variables -install_cuda_version="" -eula_accepted=0 - -# Parse command-line options -while [[ $# -gt 0 ]]; do - case "$1" in - --help) - show_help - exit 0 - ;; - -c|--cuda-version) - if [ -n "$2" ]; then - install_cuda_version="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi - ;; - --accept-cuda-eula) - eula_accepted=1 - shift 1 - ;; - -t|--temp-dir) - if [ -n "$2" ]; then - CUDA_TEMP_DIR="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi - ;; - *) - show_help - fatal_error "Error: Unknown 
option: $1" - ;; - esac -done - -# Make sure EESSI is initialised -check_eessi_initialised - -# Make sure the CUDA version supplied is a semantic version -is_semantic_version() { - local version=$1 - local regex='^[0-9]+\.[0-9]+\.[0-9]+$' - - if [[ $version =~ $regex ]]; then - return 0 # Return success (0) if it's a semantic version - else - return 1 # Return failure (1) if it's not a semantic version - fi -} -if ! is_semantic_version "$install_cuda_version"; then - show_help - error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n" - error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n" - error="${error}version to provide is probably one of those available under\n" - error="${error}$EESSI_SOFTWARE_PATH/software/CUDA\n" - fatal_error "${error}" -fi - -# Make sure they have accepted the CUDA EULA -if [ "$eula_accepted" -ne 1 ]; then - show_help - error="\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" - fatal_error "${error}" -fi - -# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` -# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) -cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} - -# Only install CUDA if specified version is not found. -# (existence of easybuild subdir implies a successful install) -if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then - echo_green "CUDA software found! No need to install CUDA again." -else - # We need to be able write to the installation space so let's make sure we can - if ! 
create_directory_structure "${cuda_install_parent}"/software/CUDA ; then - fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" - fi - - # we need a directory we can use for temporary storage - if [[ -z "${CUDA_TEMP_DIR}" ]]; then - tmpdir=$(mktemp -d) - else - tmpdir="${CUDA_TEMP_DIR}"/temp - if ! mkdir -p "$tmpdir" ; then - fatal_error "Could not create directory ${tmpdir}" - fi - fi - - required_space_in_tmpdir=50000 - # Let's see if we have sources and build locations defined if not, we use the temporary space - if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then - export EASYBUILD_BUILDPATH=${tmpdir}/build - required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) - fi - if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then - export EASYBUILD_SOURCEPATH=${tmpdir}/sources - required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) - fi - - # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), - # need to do a space check before we proceed - avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < 5000000 )); then - fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." - fi - avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < required_space_in_tmpdir )); then - error="Need at least ${required_space_in_tmpdir}GB disk space under ${tmpdir}.\n" - error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check.\n" - error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH\n" - error="${error}to reduce this requirement. Exiting now..." - fatal_error "${error}" - fi - - if ! command -v "eb" &>/dev/null; then - echo_yellow "Attempting to load an EasyBuild module to do actual install" - module load EasyBuild - # There are some scenarios where this may fail - if [ $? 
-ne 0 ]; then - error="'eb' command not found in your environment and\n" - error="${error} module load EasyBuild\n" - error="${error}failed for some reason.\n" - error="${error}Please re-run this script with the 'eb' command available." - fatal_error "${error}" - fi - fi - - cuda_easyconfig="CUDA-${install_cuda_version}.eb" - - # Check the easyconfig file is available in the release - # (eb search always returns 0, so we need a grep to ensure a usable exit code) - eb --search ^${cuda_easyconfig}|grep CUDA > /dev/null 2>&1 - # Check the exit code - if [ $? -ne 0 ]; then - eb_version=$(eb --version) - available_cuda_easyconfigs=$(eb --search "^CUDA-.*.eb"|grep CUDA) - - error="The easyconfig ${cuda_easyconfig} was not found in EasyBuild version:\n" - error="${error} ${eb_version}\n" - error="${error}You either need to give a different version of CUDA to install _or_ \n" - error="${error}use a different version of EasyBuild for the installation.\n" - error="${error}\nThe versions of CUDA available with the current eb command are:\n" - error="${error}${available_cuda_easyconfigs}" - fatal_error "${error}" - fi - - # We need the --rebuild option, as the CUDA module may or may not be on the - # `MODULEPATH` yet. Even if it is, we still want to redo this installation - # since it will provide the symlinked targets for the parts of the CUDA - # installation in the `.../versions/...` prefix - # We install the module in our `tmpdir` since we do not need the modulefile, - # we only care about providing the targets for the symlinks. - extra_args="--rebuild --installpath-modules=${tmpdir}" - - # We don't want hooks used in this install, we need a vanilla CUDA installation - touch "$tmpdir"/none.py - # shellcheck disable=SC2086 # Intended splitting of extra_args - eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" - ret=$? 
- if [ $ret -ne 0 ]; then - eb_last_log=$(unset EB_VERBOSE; eb --last-log) - cp -a ${eb_last_log} . - fatal_error "CUDA installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." - else - echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" - fi - # clean up tmpdir - rm -rf "${tmpdir}" -fi +echo "This script was replaced by the $TOPDIR/install_cuda_and_libraries.sh script. See https://www.eessi.io/docs/site_specific_config/gpu/ for more information" From 8c903fd7200bc63fda5991c30bcaec0707e3bc82 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 17:08:23 +0200 Subject: [PATCH 19/27] Add easystack file to build CUDA and cuDNN in the software layer --- ...07-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml new file mode 100644 index 00000000..3f40ced9 --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml @@ -0,0 +1,11 @@ +# In https://github.com/EESSI/software-layer-scripts/pull/59 we introduced a new location for +# installing the CUDA toolkit within the host_injections directory. 
This requires reinstallation +# of CUDA and cuDNN to make sure all symlinks point to these new locations +easyconfigs: + - CUDA-12.1.1.eb: + options: + accept-eula-for: CUDA + - CUDA-12.4.0.eb: + options: + accept-eula-for: CUDA + - cuDNN-8.9.2.26-CUDA-12.1.1.eb From 0ab005b5a80a14d89ed6051452551d43041376ac Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 20:50:03 +0200 Subject: [PATCH 20/27] Accept EULA for cuDNN --- .../20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml index 3f40ced9..839125de 100644 --- a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml @@ -8,4 +8,6 @@ easyconfigs: - CUDA-12.4.0.eb: options: accept-eula-for: CUDA - - cuDNN-8.9.2.26-CUDA-12.1.1.eb + - cuDNN-8.9.2.26-CUDA-12.1.1.eb: + options: + accept-eula-for: cuDNN From 3dfe565b23996c4fc62ba9de48e6f2ef6dea8539 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 11 Aug 2025 10:02:22 +0200 Subject: [PATCH 21/27] Fix chicken and egg problem where EESSI_ACCELERATOR_TARGET is not set by the EESSI module, unless the installation directory already exists. 
--- EESSI-install-software.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index bf5c59ca..83cdd471 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -150,6 +150,24 @@ else # make sure the the software and modules directory exist # (since it's expected by init/eessi_environment_variables when using archdetect and by the EESSI module) mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/{modules,software} + + # If EESSI_ACCELERATOR_TARGET_OVERRIDE is defined, we are building for an accelerator target + # In that case, make sure the modulepath for the accelerator subdir exists, otherwise the EESSI module will not + # set EESSI_ACCELERATOR_TARGET and the if-condition later in this script which checks if EESSI_ACCELERATOR_TARGET + # is equal to EESSI_ACCELERATOR_TARGET_OVERRIDE will fail + # See https://github.com/EESSI/software-layer-scripts/pull/59#issuecomment-3173593882 + if [ -z $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE ]; then + mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all + else + # At runtime, one might want to use a different CPU subdir for a given accelerator. E.g. one could use + # a zen2 CPU subdir on a zen4 node if the required GPU software isn't available in the zen4 tree. + # At build time, this doesn't make a lot of sense: we'd probably build in a CPU prefix that is different + # from what the code will be optimized for, and we wouldn't want that + msg="When building the software subdirectory for the CPU should almost certainly be that of the host." + msg="$msg If you think this is incorrect, please implement behaviour that makes sense in " + msg="$msg EESSI-software-installation.sh, essentially replacing this error." 
+ fatal_error "$msg" + fi ) fi From d36908d86af682463207565370cbfc2f73062d16 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 11 Aug 2025 10:20:24 +0200 Subject: [PATCH 22/27] Some more clear code commenting --- create_lmodsitepackage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 3aa91401..ae776d1d 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -123,7 +123,8 @@ local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/site_specific_config/gpu/.\\n" if packagesList[simpleName] then -- simpleName is a module in packagesList - -- first, check the old host_injections path. If that exists, print a more targetted, explainatory warning + -- first, check the old host_injections path prior to https://github.com/EESSI/software-layer-scripts/pull/59 + -- If that exists, print a more targetted, explainatory warning local previousHostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') local previousPackageEasyBuildDir = previousHostInjections .. "/software/" .. t.modFullName .. 
"/easybuild" local previousPackageDirExists = isDir(previousPackageEasyBuildDir) From 8ca152ee656868f3672837245247e41d2a16d27d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 12 Aug 2025 16:51:01 +0200 Subject: [PATCH 23/27] Make sure to actual check for EESSI_ACCELERATOR_TARGET_OVERRIDE to be defined --- EESSI-install-software.sh | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 83cdd471..59b9ea10 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -156,17 +156,22 @@ else # set EESSI_ACCELERATOR_TARGET and the if-condition later in this script which checks if EESSI_ACCELERATOR_TARGET # is equal to EESSI_ACCELERATOR_TARGET_OVERRIDE will fail # See https://github.com/EESSI/software-layer-scripts/pull/59#issuecomment-3173593882 - if [ -z $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE ]; then - mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all - else - # At runtime, one might want to use a different CPU subdir for a given accelerator. E.g. one could use - # a zen2 CPU subdir on a zen4 node if the required GPU software isn't available in the zen4 tree. - # At build time, this doesn't make a lot of sense: we'd probably build in a CPU prefix that is different - # from what the code will be optimized for, and we wouldn't want that - msg="When building the software subdirectory for the CPU should almost certainly be that of the host." - msg="$msg If you think this is incorrect, please implement behaviour that makes sense in " - msg="$msg EESSI-software-installation.sh, essentially replacing this error." 
- fatal_error "$msg" + if [ -n "$EESSI_ACCELERATOR_TARGET_OVERRIDE" ]; then + # Note that ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all + # is only the correct path if EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE is not set + if [ -z "$EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE" ]; then + mkdir -p "${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all" + else + # At runtime, one might want to use a different CPU subdir for a given accelerator. E.g. one could use + # a zen2 CPU subdir on a zen4 node if the required GPU software isn't available in the zen4 tree. + # At build time, this doesn't make a lot of sense: we'd probably build in a CPU prefix that is different + # from what the code will be optimized for, and we wouldn't want that + # So this message _should_ never be printed... + msg="When building the software subdirectory for the CPU should almost certainly be that of the host." + msg="$msg If you think this is incorrect, please implement behaviour that makes sense in " + msg="$msg EESSI-install-software.sh, essentially replacing this error." + fatal_error "$msg" + fi fi ) fi From f237c937237ce0736215a8d5c7a12947bf2bbe5e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 12 Aug 2025 16:59:35 +0200 Subject: [PATCH 24/27] Added readme to explain that there SHOULD normally not be any easystack file being added to software-layer-scripts --- easystacks/README.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 easystacks/README.md diff --git a/easystacks/README.md b/easystacks/README.md new file mode 100644 index 00000000..0fe72342 --- /dev/null +++ b/easystacks/README.md @@ -0,0 +1,5 @@ +WARNING: in principle _all_ easystack files should go into EESSI/software-layer, not in EESSI/software-layer-scripts.
Easystack files are only added in EESSI/software-layer-scripts by exception, for example when the (re)deployment of the software has to be done synchronously with a change in EESSI/software-layer-scripts. + +Here, we list past deployments for which this was the case (and why): + +[PR#59](https://github.com/EESSI/software-layer-scripts/pull/59): modified the prefix in which `install_cuda_and_libraries.sh` installs the CUDA toolkit within `host_injections`. Also, updated the Lmod SitePackage.lua to print an informative message in case the CUDA Toolkit is found in the old location. This requires synchronous deployment of new CUDA and cuDNN installations in the software layer, because the symlinks from these installations should be redirected to the new prefix in `host_injections`. From e97418321ae0484a09bc5af9e70313bce0a2ffab Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:20:36 +0200 Subject: [PATCH 25/27] Apply suggestions from code review Co-authored-by: Kenneth Hoste --- create_lmodsitepackage.py | 4 ++-- scripts/gpu_support/nvidia/install_cuda_and_libraries.sh | 2 +- scripts/gpu_support/nvidia/install_cuda_host_injections.sh | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index ae776d1d..996575f1 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -124,7 +124,7 @@ if packagesList[simpleName] then -- simpleName is a module in packagesList -- first, check the old host_injections path prior to https://github.com/EESSI/software-layer-scripts/pull/59 - -- If that exists, print a more targetted, explainatory warning + -- If that exists, print a more targeted, explanatory warning local previousHostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') local previousPackageEasyBuildDir = previousHostInjections .. "/software/" .. t.modFullName ..
"/easybuild" local previousPackageDirExists = isDir(previousPackageEasyBuildDir) @@ -142,7 +142,7 @@ advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI " advice = advice .. "can find it.\\n" advice = advice .. "Note that a full copy is installed at " .. previousHostInjections .. "/software/" .. t.modFullName .. ". " - advice = advice .. "However, EESSI now expects it in a different location, namely at " + advice = advice .. "However, EESSI expects it in a different location since Aug'25, namely at " advice = advice .. hostInjections .. "/software/" .. t.modFullName .. ". " advice = advice .. "Please re-install the package at the new location. " advice = advice .. refer_to_docs diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 4baf3005..491fee0f 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -132,7 +132,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do # If there is a GPU on the node, the installation path will by default have an # accelerator subdirectory. For CUDA and cu*, these are binary installations and - # don't care about the target compute capability nor the CPU microarchitecture. + # we don't care about the target compute capability nor the CPU microarchitecture. 
# Our hooks are aware of this and therefore expect CUDA to be available under # something like EESSI_SITE_SOFTWARE_PATH, but then with the CPU micro-architecture # stripped diff --git a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh index 9bbdd6a1..4d3cbd8a 100755 --- a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash TOPDIR=$(dirname $(realpath $BASH_SOURCE)) -echo "This script was replaced by the $TOPDIR/install_cuda_and_libraries.sh script. See https://www.eessi.io/docs/site_specific_config/gpu/ for more information" +echo "This script was replaced by the $TOPDIR/install_cuda_and_libraries.sh script. See https://www.eessi.io/docs/site_specific_config/gpu/ for more information" >&2 +exit 1 From 7a1e4c195bf22612656253dfd0584c4c138bbb3e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 12 Aug 2025 17:21:14 +0200 Subject: [PATCH 26/27] Remove easystack file. CUDA 12.1.1 was already covered in the other one as well --- ...05-eb-5.1.1-rebuild-2023a-for-cuda-sanity-check.yml | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250805-eb-5.1.1-rebuild-2023a-for-cuda-sanity-check.yml diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250805-eb-5.1.1-rebuild-2023a-for-cuda-sanity-check.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250805-eb-5.1.1-rebuild-2023a-for-cuda-sanity-check.yml deleted file mode 100644 index ef794f3e..00000000 --- a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250805-eb-5.1.1-rebuild-2023a-for-cuda-sanity-check.yml +++ /dev/null @@ -1,10 +0,0 @@ -# We'll build all CUDA software, for various reasons -# 1. 
We now have a proper CUDA sanity check, and if anything was 'wrong' with our current CUDA installs, we'd like -# to know about it -# 2. The PR implementing a CI to check for differences between officially supported CUDA Compute Capabilities shows -# that there are a lot of missing installations https://github.com/EESSI/software-layer/pull/1087 . A rebuild PR like -# this will have the convenient side effect of filling all those holes -easyconfigs: - - CUDA-12.1.1.eb: - options: - accept-eula-for: CUDA From 1dbfe8b248730a818280c612036e08d74c0c16d4 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 12:28:21 +0200 Subject: [PATCH 27/27] Removed EasyStacks so we can deploy the changes from #59 also for 2025.06 --- easystacks/README.md | 5 ----- ...-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml | 13 ------------- 2 files changed, 18 deletions(-) delete mode 100644 easystacks/README.md delete mode 100644 easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml diff --git a/easystacks/README.md b/easystacks/README.md deleted file mode 100644 index 0fe72342..00000000 --- a/easystacks/README.md +++ /dev/null @@ -1,5 +0,0 @@ -WARNING: in principle _all_ easystack files should go into EESSI/software-layer, not in EESSI/software-layer-scripts. Easystack files are only added in EESSI/software-layer-scripts by exception, for example when the (re)deployment of the software has to be done synchronously with a change in EESSI/software-layer-scripts. - -Here, we list past deployments for which this was the case (and why): - -[PR#59](https://github.com/EESSI/software-layer-scripts/pull/59): modified the prefix in which `install_cuda_and_libraries.sh` installs the CUDA toolkit within `host_injections`. Also, updated the Lmod SitePackage.lua to print an informative message in case the CUDA Toolkit is found in the old location. 
This requires synchronous deployment of new CUDA and cuDNN installations in the software layer, because the symlinks from these installations should be redirected to the new prefix in `host_injections`. diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml deleted file mode 100644 index 839125de..00000000 --- a/easystacks/software.eessi.io/2023.06/accel/nvidia/rebuilds/20250807-eb-5.1.1-CUDA-cuDNN-new-host-injections-dir.yml +++ /dev/null @@ -1,13 +0,0 @@ -# In https://github.com/EESSI/software-layer-scripts/pull/59 we introduced a new location for -# installing the CUDA toolkit within the host_injections directory. This requires reinstallation -# of CUDA and cuDNN to make sure all symlinks point to these new locations -easyconfigs: - - CUDA-12.1.1.eb: - options: - accept-eula-for: CUDA - - CUDA-12.4.0.eb: - options: - accept-eula-for: CUDA - - cuDNN-8.9.2.26-CUDA-12.1.1.eb: - options: - accept-eula-for: cuDNN