Skip to content

Commit 1118cc0

Browse files
authored
Merge pull request #4692 from jfgrimm/cuda-device-code-sanity-check
add a CUDA device code sanity check
2 parents a5891e7 + 5cef2e0 commit 1118cc0

File tree

7 files changed

+1245
-9
lines changed

7 files changed

+1245
-9
lines changed

easybuild/framework/easyblock.py

Lines changed: 388 additions & 2 deletions
Large diffs are not rendered by default.

easybuild/framework/easyconfig/default.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,11 @@
125125
'after make (for e.g.,"test" for make test)'), BUILD],
126126
'bin_lib_subdirs': [[], "List of subdirectories for binaries and libraries, which is used during sanity check "
127127
"to check RPATH linking and banned/required libraries", BUILD],
128+
'cuda_sanity_ignore_files': [[], "List of files (relative to the installation prefix) for which failures in "
129+
"the CUDA sanity check step are ignored. Typically used for files where you "
130+
"know the CUDA architectures in those files don't match the "
131+
"--cuda-compute-capabilities configured for EasyBuild AND where you know "
132+
"that this is ok / reasonable (e.g. binary installations)", BUILD],
128133
'sanity_check_commands': [[], ("format: [(name, options)] e.g. [('gzip','-h')]. "
129134
"Using a non-tuple is equivalent to (name, '-h')"), BUILD],
130135
'sanity_check_paths': [{}, ("List of files and directories to check "

easybuild/tools/config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,10 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
305305
'backup_patched_files',
306306
'consider_archived_easyconfigs',
307307
'container_build_image',
308+
'cuda_sanity_check_accept_ptx_as_devcode',
309+
'cuda_sanity_check_accept_missing_ptx',
310+
'cuda_sanity_check_error_on_failed_checks',
311+
'cuda_sanity_check_strict',
308312
'debug',
309313
'debug_lmod',
310314
'dump_autopep8',

easybuild/tools/options.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,42 @@ def override_options(self):
398398
int, 'store_or_None', None),
399399
'cuda-compute-capabilities': ("List of CUDA compute capabilities to use when building GPU software; "
400400
"values should be specified as digits separated by a dot, "
401-
"for example: 3.5,5.0,7.2", 'strlist', 'extend', None),
401+
"for example: 3.5,5.0,7.2. EasyBuild will (where possible) compile fat "
402+
"binaries with support for (at least) all requested CUDA compute "
403+
"capabilities, and PTX code for the highest CUDA compute capability (for "
404+
"forwards compatibility). The check on this behavior may be relaxed using "
405+
"--cuda-sanity-check-accept-missing-ptx, "
406+
"--cuda-sanity-check-accept-ptx-as-devcode, "
407+
"or made more stringent using --cuda-sanity-check-strict.",
408+
'strlist', 'extend', None),
409+
'cuda-sanity-check-accept-missing-ptx': ("Relax CUDA sanity check to accept that PTX code for the highest "
410+
"requested CUDA compute capability is not present (but will "
411+
"print a warning)",
412+
None, 'store_true', False),
413+
'cuda-sanity-check-accept-ptx-as-devcode': ("Relax CUDA sanity check to accept that requested device code "
414+
"is not present, as long as PTX code is present that can be "
415+
"JIT-compiled for each target in --cuda-compute-capabilities. "
416+
"For example, if --cuda-compute-capabilities=8.0 and a binary "
417+
"is found in the installation that does not have device code "
418+
"for 8.0, but it does have PTX code for 7.0, the sanity check "
419+
"will pass if, and only if, this option is enabled. "
420+
"Note that JIT-compiling means the binary will work on the "
421+
"requested architecture, but it is not necessarily as well "
422+
"optimized as when actual device code is present for the "
423+
"requested architecture ",
424+
None, 'store_true', False),
425+
'cuda-sanity-check-error-on-failed-checks': ("If enabled, failures in the CUDA sanity check will produce "
426+
"an error. If disabled, the CUDA sanity check will be "
427+
"performed and failures will be reported through warnings, "
428+
"but they will not result in an error",
429+
None, 'store_true', False),
430+
'cuda-sanity-check-strict': ("Perform strict CUDA sanity check. Without this option, the CUDA sanity "
431+
"check will fail if the CUDA binaries don't contain code for (at least) "
432+
"all compute capabilities defined in --cuda-compute-capabilities, "
433+
"but will accept if code for additional compute capabilities is present. "
434+
"With this setting, the sanity check will also fail if code is present for "
435+
"more compute capabilities than defined in --cuda-compute-capabilities.",
436+
None, 'store_true', False),
402437
'debug-lmod': ("Run Lmod modules tool commands in debug module", None, 'store_true', False),
403438
'default-opt-level': ("Specify default optimisation level", 'choice', 'store', DEFAULT_OPT_LEVEL,
404439
Compiler.COMPILER_OPT_OPTIONS),
@@ -544,7 +579,7 @@ def override_options(self):
544579
"Git commit to use for the target software build (robot capabilities are automatically disabled)",
545580
None, 'store', None),
546581
'sticky-bit': ("Set sticky bit on newly created directories", None, 'store_true', False),
547-
'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involces unsetting "
582+
'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involves unsetting "
548583
"$LD_LIBRARY_PATH before checking whether all required libraries are found",
549584
None, 'store_true', False),
550585
'sysroot': ("Location root directory of system, prefix for standard paths like /usr/lib and /usr/include",
@@ -950,7 +985,7 @@ def validate(self):
950985
# values passed to --cuda-compute-capabilities must be of form X.Y (with both X and Y integers),
951986
# see https://developer.nvidia.com/cuda-gpus
952987
if self.options.cuda_compute_capabilities:
953-
cuda_cc_regex = re.compile(r'^[0-9]+\.[0-9]+$')
988+
cuda_cc_regex = re.compile(r'^[0-9]+\.[0-9]+a?$')
954989
faulty_cuda_ccs = [x for x in self.options.cuda_compute_capabilities if not cuda_cc_regex.match(x)]
955990
if faulty_cuda_ccs:
956991
error_msg = "Incorrect values in --cuda-compute-capabilities (expected pattern: '%s'): %s"

easybuild/tools/systemtools.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,12 @@
2727
2828
Authors:
2929
30+
* Kenneth Hoste (Ghent University)
3031
* Jens Timmerman (Ghent University)
3132
* Ward Poelmans (Ghent University)
33+
* Jasper Grimm (UoY)
3234
* Jan Andre Reuter (Forschungszentrum Juelich GmbH)
35+
* Caspar van Leeuwen (SURF)
3336
"""
3437
import csv
3538
import ctypes
@@ -41,6 +44,7 @@
4144
import platform
4245
import pwd
4346
import re
47+
import shutil
4448
import struct
4549
import sys
4650
import termios
@@ -64,6 +68,7 @@
6468
pass
6569

6670
from easybuild.base import fancylogger
71+
from easybuild.tools import LooseVersion
6772
from easybuild.tools.build_log import EasyBuildError, EasyBuildExit, print_warning
6873
from easybuild.tools.config import IGNORE
6974
from easybuild.tools.filetools import is_readable, read_file, which
@@ -998,6 +1003,106 @@ def get_glibc_version():
9981003
return glibc_ver
9991004

10001005

1006+
def get_cuda_object_dump_raw(path):
    """
    Get raw output from command which extracts information from CUDA binary files in a human-readable format,
    or None for files containing no CUDA device code.
    See https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#cuobjdump

    :param path: path to the file to inspect
    :return: output of 'cuobjdump <path>' (string) if that command succeeds; None if the 'file' command fails,
             if 'file' does not report an executable, object or archive, or if cuobjdump reports that
             the file does not contain device code
    :raises EasyBuildError: if the cuobjdump command is not found, or if cuobjdump fails with output that
                            does not indicate that the file simply contains no device code
    """
    # imported locally so that this function does not rely on changes to the module-level imports
    from shlex import quote

    # the path is interpolated into shell commands, so it must be quoted to survive spaces/shell metacharacters
    quoted_path = quote(path)

    # brief mode (-b) keeps the path itself out of the output of 'file'; without it, the keyword check below
    # would also match on paths that merely contain 'executable', 'object' or 'archive' (e.g. .../libarchive/...)
    file_cmd = f"file -b {quoted_path}"
    res = run_shell_cmd(file_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False)
    if res.exit_code != EasyBuildExit.SUCCESS:
        # file type is unknown: do not try to interpret the error output as a file type description
        _log.warning("Failed to run '%s': %s", file_cmd, res.output)
        return None

    # check that the file is an executable or object (shared library) or archive (static library)
    if not any(x in res.output for x in ('executable', 'object', 'archive')):
        return None

    # Make sure we have a cuobjdump command
    if not shutil.which('cuobjdump'):
        raise EasyBuildError("Failed to get object dump from CUDA file: cuobjdump command not found")

    cuda_cmd = f"cuobjdump {quoted_path}"
    res = run_shell_cmd(cuda_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False)
    if res.exit_code == EasyBuildExit.SUCCESS:
        return res.output

    # Check and report for the common case that this is simply not a CUDA binary, i.e. does not
    # contain CUDA device code
    if re.search(r'does not contain device code', res.output) is None:
        # This should not happen: there was no string saying this was NOT a CUDA file, yet no device code
        # was found at all
        msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'"
        raise EasyBuildError(msg, path, cuda_cmd, res.output)

    # File is a regular executable, object or library, but not a CUDA file
    msg = "'%s' does not appear to be a CUDA binary: cuobjdump failed to find device code in this file"
    _log.debug(msg, path)
    return None
1043+
1044+
1045+
def get_cuda_architectures(path, section_type):
    """
    Determine which CUDA architectures are present in the file at 'path', as a sorted list.

    :param path: full path to a CUDA file (a symlink to an existing file is resolved first)
    :param section_type: kind of section to look for in the cuobjdump output ('elf' or 'ptx')
    :return: sorted list of unique compute capabilities (for example ['8.6', '9.0']);
             None if no cuobjdump output is available for the file,
             or if no architecture could be extracted for the requested section type
    """

    # Sections in the cuobjdump output are expected to look as follows for device code:
    #
    # Fatbin elf code:
    # ================
    # arch = sm_90
    # code version = [1,7]
    # host = linux
    # compile_size = 64bit
    #
    # and as follows for PTX code:
    #
    # Fatbin ptx code:
    # ================
    # arch = sm_90
    # code version = [8,1]
    # host = linux
    # compile_size = 64bit

    # captures (major, minor) of the architecture, where the minor part is the last digit plus an optional 'a'
    arch_regex = re.compile(f'Fatbin {section_type} code:\n=+\narch = sm_([0-9]+)([0-9]a?)')

    # work on the actual file rather than on a symlink pointing to it
    real_path = path
    if os.path.islink(real_path) and os.path.exists(real_path):
        real_path = os.path.realpath(real_path)

    raw_dump = get_cuda_object_dump_raw(real_path)
    if raw_dump is None:
        return None

    arch_tuples = arch_regex.findall(raw_dump)
    if arch_tuples:
        # deduplicate the match tuples, then turn them into compute capabilities,
        # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0']
        return sorted(('.'.join(parts) for parts in set(arch_tuples)), key=LooseVersion)

    # No architecture was extracted: make clear in the warning whether the section itself was found or not
    if re.search(f'Fatbin {section_type} code', raw_dump):
        warning_msg = (
            f"Found Fatbin {section_type} code section(s) in cuobjdump output for {real_path}, "
            "but failed to extract CUDA architecture"
        )
    else:
        # The "Fatbin {section_type} code" section is absent from the binary altogether.
        # A CUDA binary may legitimately hold only device code or only PTX code, but since
        # --cuda-compute-capabilities is supposed to produce both (at least for the highest CC in the list),
        # this is unexpected in an EasyBuild context, hence the warning
        warning_msg = f"Failed to find Fatbin {section_type} code section(s) in cuobjdump output for {real_path}."
    _log.warning(warning_msg)

    return None
1104+
1105+
10011106
def get_linked_libs_raw(path):
10021107
"""
10031108
Get raw output from command that reports linked libraries for dynamically linked executables/libraries,

0 commit comments

Comments
 (0)