diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index bf5c59ca..59b9ea10 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -150,6 +150,29 @@ else # make sure the the software and modules directory exist # (since it's expected by init/eessi_environment_variables when using archdetect and by the EESSI module) mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/{modules,software} + + # If EESSI_ACCELERATOR_TARGET_OVERRIDE is defined, we are building for an accelerator target + # In that case, make sure the modulepath for the accelerator subdir exists, otherwise the EESSI module will not + # set EESSI_ACCELERATOR_TARGET and the if-condition later in this script which checks if EESSI_ACCELERATOR_TARGET + # is equal to EESSI_ACCELERATOR_TARGET_OVERRIDE will fail + # See https://github.com/EESSI/software-layer-scripts/pull/59#issuecomment-3173593882 + if [ -n $EESSI_ACCELERATOR_TARGET_OVERRIDE ]; then + # Note that ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all + # is only the correct path if EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE is not set + if [ -z $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE ]; then + mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all + else + # At runtime, one might want to use a different CPU subdir for a given accelerator. E.g. one could use + # a zen2 CPU subdir on a zen4 node if the required GPU software isn't available in the zen4 tree. + # At build time, this doesn't make a lot of sense: we'd probably build in a CPU prefix that is different + # from what the code will be optimized for, and we wouldn't want that + # So this message _should_ never be printed... + msg="When building the software subdirectory for the CPU should almost certainly be that of the host." 
+ msg="$msg If you think this is incorrect, please implement behaviour that makes sense in " + msg="$msg EESSI-software-installation.sh, essentially replacing this error." + fatal_error "$msg" + fi + fi ) fi diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index f3d11aeb..996575f1 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -123,13 +123,31 @@ local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/site_specific_config/gpu/.\\n" if packagesList[simpleName] then -- simpleName is a module in packagesList - -- get the full host_injections path - local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + -- first, check the old host_injections path prior to https://github.com/EESSI/software-layer-scripts/pull/59 + -- If that exists, print a more targetted, explanatory warning + local previousHostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + local previousPackageEasyBuildDir = previousHostInjections .. "/software/" .. t.modFullName .. "/easybuild" + local previousPackageDirExists = isDir(previousPackageEasyBuildDir) + + -- get the host_injections path, and add only the EESSI_CPU_FAMILY at the end + local strip_suffix = os.getenv('EESSI_VERSION') .. "/software/" .. os.getenv('EESSI_OS_TYPE') .. "/" + strip_suffix = strip_suffix .. os.getenv('EESSI_SOFTWARE_SUBDIR') + local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", strip_suffix, os.getenv('EESSI_CPU_FAMILY')) -- build final path where the software should be installed local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. 
"/easybuild" local packageDirExists = isDir(packageEasyBuildDir) - if not packageDirExists then + if previousPackageDirExists and not packageDirExists then + local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " + advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI " + advice = advice .. "can find it.\\n" + advice = advice .. "Note that a full copy is installed at " .. previousHostInjections .. "/software/" .. t.modFullName .. ". " + advice = advice .. "However, EESSI expects it in a different location since Aug'25, namely at " + advice = advice .. hostInjections .. "/software/" .. t.modFullName .. ". " + advice = advice .. "Please re-install the package at the new location. " + advice = advice .. refer_to_docs + LmodError("\\nYou requested to load ", simpleName, " ", advice) + elseif not packageDirExists then local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI " advice = advice .. 
"can find it.\\n" @@ -293,7 +311,7 @@ def error(msg): # the install path (if it exists) accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET") if accel_subdir: - sitepackage_path = sitepackage_path.replace("/accel/%s" % accel_subdir, '') + sitepackage_path = sitepackage_path.replace("/%s" % accel_subdir, '') try: os.makedirs(os.path.dirname(sitepackage_path), exist_ok=True) with open(sitepackage_path, 'w') as fp: diff --git a/eb_hooks.py b/eb_hooks.py index bdf8f49b..befd4240 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -151,7 +151,7 @@ def parse_list_of_dicts_env(var_name): if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', var_name): raise ValueError(f"Invalid environment variable name: {var_name}") list_string = os.getenv(var_name, '[]') - + list_of_dicts = [] try: # Try JSON format first @@ -162,7 +162,7 @@ def parse_list_of_dicts_env(var_name): list_of_dicts = ast.literal_eval(list_string) except (ValueError, SyntaxError): raise ValueError(f"Environment variable '{var_name}' does not contain a valid list of dictionaries.") - + return list_of_dicts @@ -211,7 +211,7 @@ def post_ready_hook(self, *args, **kwargs): parallel = self.parallel else: parallel = self.cfg['parallel'] - + if parallel == 1: return # no need to limit if already using 1 core @@ -733,7 +733,7 @@ def pre_configure_hook_score_p(self, *args, **kwargs): def pre_configure_hook_vsearch(self, *args, **kwargs): """ Pre-configure hook for VSEARCH - - Workaround for a Zlib macro being renamed in Gentoo, see https://bugs.gentoo.org/383179 + - Workaround for a Zlib macro being renamed in Gentoo, see https://bugs.gentoo.org/383179 (solves "expected initializer before 'OF'" errors) """ if self.name == 'VSEARCH': @@ -1199,7 +1199,7 @@ def post_postproc_cuda(self, *args, **kwargs): # replace files that are not distributable with symlinks into # host_injections - replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) + 
replace_binary_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) else: print_msg(f"EESSI hook to respect CUDA license not triggered for installation path {self.installdir}") else: @@ -1249,16 +1249,19 @@ def post_postproc_cudnn(self, *args, **kwargs): # replace files that are not distributable with symlinks into # host_injections - replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) + replace_binary_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) else: print_msg(f"EESSI hook to respect cuDDN license not triggered for installation path {self.installdir}") else: raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!") -def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist): +def replace_binary_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist): """ Replace files that cannot be distributed with symlinks into host_injections + Since these are binary files, only the CPU family will be included in the prefix, + no microarchitecture or accelerator architecture will be included. For example, + /cvmfs/software.eessi.io/host_injections/x86_64/suffix/to/actual/file """ # Different packages use different ways to specify which files or file # 'types' may be redistributed. 
For CUDA, the 'EULA.txt' lists full file @@ -1301,13 +1304,37 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, al log.debug("%s is not found in allowlist, so replacing it with symlink: %s", print_name, full_path) # the host_injections path is under a fixed repo/location for CUDA or cuDNN + # full_path is something similar to + # /cvmfs/software.eessi.io/versions/.../x86_64/amd/zen4/accel/nvidia/cc90/.../CUDA/bin/nvcc + # host_inj_path will then be + # /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/accel/nvidia/cc90/.../CUDA/bin/nvcc host_inj_path = re.sub(EESSI_INSTALLATION_REGEX, HOST_INJECTIONS_LOCATION, full_path) # CUDA and cu* libraries themselves don't care about compute capability so remove this # duplication from under host_injections (symlink to a single CUDA or cu* library # installation for all compute capabilities) accel_subdir = get_eessi_envvar("EESSI_ACCELERATOR_TARGET") + # If accel_subdir is defined, remove it from the full path + # After removal of accel_subdir, host_inj_path will be something like + # /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/.../CUDA/bin/nvcc if accel_subdir: - host_inj_path = host_inj_path.replace("/accel/%s" % accel_subdir, '') + host_inj_path = host_inj_path.replace(accel_subdir, '') + software_subdir = get_eessi_envvar("EESSI_SOFTWARE_SUBDIR") + cpu_family = get_eessi_envvar("EESSI_CPU_FAMILY") + os_type = get_eessi_envvar("EESSI_OS_TYPE") + eessi_version = get_eessi_envvar("EESSI_VERSION") + if software_subdir and cpu_family and os_type and eessi_version: + # Compose the string to be removed: + partial_path = f"{eessi_version}/software/{os_type}/{software_subdir}" + # After this, host_inj_path will be e.g. + # /cvmfs/software.eessi.io/host_injections/x86_64/software/CUDA/bin/nvcc + host_inj_path = host_inj_path.replace(partial_path, cpu_family) + else: + msg = "Failed to construct path to symlink for file (%s). 
All of the following values " + msg += "have to be defined: EESSI_SOFTWARE_SUBDIR='%s', EESSI_CPU_FAMILY='%s', " + msg += "EESSI_OS_TYPE='%s', EESSI_VERSION='%s'. Failed to replace non-redistributable file " + msg += "with symlink, aborting..." + raise EasyBuildError(msg, full_path, software_subdir, cpu_family, os_type, eessi_version) + # make sure source and target of symlink are not the same if full_path == host_inj_path: raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 5123a7c1..491fee0f 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -132,9 +132,16 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do # If there is a GPU on the node, the installation path will by default have an # accelerator subdirectory. For CUDA and cu*, these are binary installations and - # don't care about the target compute capability. Our hooks are aware of this and - # therefore expect CUDA to be available under EESSI_SITE_SOFTWARE_PATH - export EASYBUILD_INSTALLPATH=$EESSI_SITE_SOFTWARE_PATH + # we don't care about the target compute capability nor the CPU microarchitecture. + # Our hooks are aware of this and therefore expect CUDA to be available under + # something like EESSI_SITE_SOFTWARE_PATH, but then with the CPU micro-architecture + # stripped + # This sed command will capture everything from the EESSI_SITE_SOFTWARE_PATH up until + # the EESSI_VERSION in a capture group. 
It will then replace that with the content + # of the capture group and then have the EESSI_CPU_FAMILY appended + # Thus EESSI_SITE_CPU_FAMILY_PATH is then something like /cvmfs/software.eessi.io/host_injections/x86_64 + EESSI_SITE_CPU_FAMILY_PATH=$(echo "$EESSI_SITE_SOFTWARE_PATH" | sed 's|\(.*\)'"$EESSI_VERSION"/software/"$EESSI_OS_TYPE"/"$EESSI_SOFTWARE_SUBDIR"'|\1'"$EESSI_CPU_FAMILY"'|') + export EASYBUILD_INSTALLPATH=$EESSI_SITE_CPU_FAMILY_PATH # Install modules in hidden .modules dir to keep track of what was installed before # (this action is temporary, and we do not call Lmod again within the current shell context, but in EasyBuild @@ -258,7 +265,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do cp -a ${eb_last_log} . fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..." else - echo_green "all installations at ${EESSI_SITE_SOFTWARE_PATH}/software/... succeeded!" + echo_green "all installations at ${EASYBUILD_INSTALLPATH}/software/... succeeded!" fi # clean up tmpdir content diff --git a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh index 954cf45c..4d3cbd8a 100755 --- a/scripts/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh @@ -1,211 +1,5 @@ #!/usr/bin/env bash -# This script can be used to install CUDA under the `.../host_injections` directory. -# This provides the parts of the CUDA installation that cannot be redistributed as -# part of EESSI due to license limitations. While GPU-based software from EESSI will -# _run_ without these, installation of additional CUDA software requires the CUDA -# installation(s) under `host_injections` to be present. 
-# -# The `host_injections` directory is a variant symlink that by default points to -# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see -# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the -# installation to be successful, this directory needs to be writeable by the user -# executing this script. - -# Initialise our bash functions TOPDIR=$(dirname $(realpath $BASH_SOURCE)) -source "$TOPDIR"/../../utils.sh - -# Function to display help message -show_help() { - echo "Usage: $0 [OPTIONS]" - echo "Options:" - echo " --help Display this help message" - echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" - echo " CUDA, see the EULA at" - echo " https://docs.nvidia.com/cuda/eula/index.html" - echo " -c, --cuda-version CUDA_VERSION Specify a version o CUDA to install (must" - echo " have a corresponding easyconfig in the" - echo " EasyBuild release)" - echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" - echo " storage during the CUDA install" - echo " (must have >10GB available)" -} - -# Initialize variables -install_cuda_version="" -eula_accepted=0 - -# Parse command-line options -while [[ $# -gt 0 ]]; do - case "$1" in - --help) - show_help - exit 0 - ;; - -c|--cuda-version) - if [ -n "$2" ]; then - install_cuda_version="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi - ;; - --accept-cuda-eula) - eula_accepted=1 - shift 1 - ;; - -t|--temp-dir) - if [ -n "$2" ]; then - CUDA_TEMP_DIR="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi - ;; - *) - show_help - fatal_error "Error: Unknown option: $1" - ;; - esac -done - -# Make sure EESSI is initialised -check_eessi_initialised - -# Make sure the CUDA version supplied is a semantic version -is_semantic_version() { - local version=$1 - local regex='^[0-9]+\.[0-9]+\.[0-9]+$' - - if [[ $version =~ $regex ]]; then - return 0 # Return success (0) if 
it's a semantic version - else - return 1 # Return failure (1) if it's not a semantic version - fi -} -if ! is_semantic_version "$install_cuda_version"; then - show_help - error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n" - error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n" - error="${error}version to provide is probably one of those available under\n" - error="${error}$EESSI_SOFTWARE_PATH/software/CUDA\n" - fatal_error "${error}" -fi - -# Make sure they have accepted the CUDA EULA -if [ "$eula_accepted" -ne 1 ]; then - show_help - error="\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" - fatal_error "${error}" -fi - -# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` -# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) -cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} - -# Only install CUDA if specified version is not found. -# (existence of easybuild subdir implies a successful install) -if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then - echo_green "CUDA software found! No need to install CUDA again." -else - # We need to be able write to the installation space so let's make sure we can - if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then - fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" - fi - - # we need a directory we can use for temporary storage - if [[ -z "${CUDA_TEMP_DIR}" ]]; then - tmpdir=$(mktemp -d) - else - tmpdir="${CUDA_TEMP_DIR}"/temp - if ! 
mkdir -p "$tmpdir" ; then - fatal_error "Could not create directory ${tmpdir}" - fi - fi - - required_space_in_tmpdir=50000 - # Let's see if we have sources and build locations defined if not, we use the temporary space - if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then - export EASYBUILD_BUILDPATH=${tmpdir}/build - required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) - fi - if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then - export EASYBUILD_SOURCEPATH=${tmpdir}/sources - required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) - fi - - # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), - # need to do a space check before we proceed - avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < 5000000 )); then - fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." - fi - avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < required_space_in_tmpdir )); then - error="Need at least ${required_space_in_tmpdir}GB disk space under ${tmpdir}.\n" - error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check.\n" - error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH\n" - error="${error}to reduce this requirement. Exiting now..." - fatal_error "${error}" - fi - - if ! command -v "eb" &>/dev/null; then - echo_yellow "Attempting to load an EasyBuild module to do actual install" - module load EasyBuild - # There are some scenarios where this may fail - if [ $? -ne 0 ]; then - error="'eb' command not found in your environment and\n" - error="${error} module load EasyBuild\n" - error="${error}failed for some reason.\n" - error="${error}Please re-run this script with the 'eb' command available." 
- fatal_error "${error}" - fi - fi - - cuda_easyconfig="CUDA-${install_cuda_version}.eb" - - # Check the easyconfig file is available in the release - # (eb search always returns 0, so we need a grep to ensure a usable exit code) - eb --search ^${cuda_easyconfig}|grep CUDA > /dev/null 2>&1 - # Check the exit code - if [ $? -ne 0 ]; then - eb_version=$(eb --version) - available_cuda_easyconfigs=$(eb --search "^CUDA-.*.eb"|grep CUDA) - - error="The easyconfig ${cuda_easyconfig} was not found in EasyBuild version:\n" - error="${error} ${eb_version}\n" - error="${error}You either need to give a different version of CUDA to install _or_ \n" - error="${error}use a different version of EasyBuild for the installation.\n" - error="${error}\nThe versions of CUDA available with the current eb command are:\n" - error="${error}${available_cuda_easyconfigs}" - fatal_error "${error}" - fi - - # We need the --rebuild option, as the CUDA module may or may not be on the - # `MODULEPATH` yet. Even if it is, we still want to redo this installation - # since it will provide the symlinked targets for the parts of the CUDA - # installation in the `.../versions/...` prefix - # We install the module in our `tmpdir` since we do not need the modulefile, - # we only care about providing the targets for the symlinks. - extra_args="--rebuild --installpath-modules=${tmpdir}" - - # We don't want hooks used in this install, we need a vanilla CUDA installation - touch "$tmpdir"/none.py - # shellcheck disable=SC2086 # Intended splitting of extra_args - eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" - ret=$? - if [ $ret -ne 0 ]; then - eb_last_log=$(unset EB_VERBOSE; eb --last-log) - cp -a ${eb_last_log} . - fatal_error "CUDA installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." 
- else - echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" - fi - # clean up tmpdir - rm -rf "${tmpdir}" -fi +echo "This script was replaced by the $TOPDIR/install_cuda_and_libraries.sh script. See https://www.eessi.io/docs/site_specific_config/gpu/ for more information" >&2 +exit 1