Skip to content

Commit aea60d7

Browse files
authored
Merge pull request #3569 from Flamefire/cuda_cache
create CUDA cache (for JIT compiled PTX code) in build dir instead of $HOME
2 parents aded444 + 01bcca0 commit aea60d7

File tree

7 files changed

+114
-2
lines changed

7 files changed

+114
-2
lines changed

easybuild/framework/easyblock.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,6 +1034,27 @@ def make_dir(self, dir_name, clean, dontcreateinstalldir=False):
10341034

10351035
mkdir(dir_name, parents=True)
10361036

1037+
def set_up_cuda_cache(self):
    """
    Configure the CUDA cache (used for JIT compiled PTX code) via environment variables.

    The maximum cache size (in MiB) is read from the 'cuda_cache_maxsize' build option;
    if that option is None, 1024 MiB is used. A size of zero disables the cache by setting
    $CUDA_CACHE_DISABLE to '1'. For any other size, $CUDA_CACHE_DISABLE is set to '0',
    $CUDA_CACHE_PATH is set to the 'cuda_cache_dir' build option (or, if that is not set,
    to the 'eb-cuda-cache' subdirectory of the build directory), and $CUDA_CACHE_MAXSIZE
    is set to the size converted to bytes.
    """
    size_in_mib = build_option('cuda_cache_maxsize')
    # no explicit size configured: fall back to 1 GiB, expressed in MiB
    size_in_mib = 1 * 1024 if size_in_mib is None else int(size_in_mib)

    if size_in_mib == 0:
        self.log.info("Disabling CUDA PTX cache since cache size was set to zero")
        env.setvar('CUDA_CACHE_DISABLE', '1')
        return

    # use the configured location if there is one, otherwise keep the cache inside the build directory
    cache_path = build_option('cuda_cache_dir') or os.path.join(self.builddir, 'eb-cuda-cache')
    self.log.info("Enabling CUDA PTX cache of size %s MiB at %s", size_in_mib, cache_path)

    size_in_bytes = size_in_mib * 1024 * 1024
    env.setvar('CUDA_CACHE_DISABLE', '0')
    env.setvar('CUDA_CACHE_PATH', cache_path)
    env.setvar('CUDA_CACHE_MAXSIZE', str(size_in_bytes))
1057+
10371058
#
10381059
# MODULE UTILITY FUNCTIONS
10391060
#
@@ -2163,6 +2184,10 @@ def prepare_step(self, start_dir=True, load_tc_deps_modules=True):
21632184
self.log.info("Loading extra modules: %s", extra_modules)
21642185
self.modules_tool.load(extra_modules)
21652186

2187+
# Set up the CUDA cache if required; if we don't do this, CUDA will use $HOME for its cache files
2188+
if get_software_root('CUDA') or get_software_root('CUDAcore'):
2189+
self.set_up_cuda_cache()
2190+
21662191
# guess directory to start configure/build/install process in, and move there
21672192
if start_dir:
21682193
self.guess_start_dir()

easybuild/tools/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
171171
'container_image_name',
172172
'container_template_recipe',
173173
'container_tmpdir',
174+
'cuda_cache_dir',
175+
'cuda_cache_maxsize',
174176
'cuda_compute_capabilities',
175177
'download_timeout',
176178
'dump_test_report',

easybuild/tools/options.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,11 @@ def override_options(self):
360360
'consider-archived-easyconfigs': ("Also consider archived easyconfigs", None, 'store_true', False),
361361
'containerize': ("Generate container recipe/image", None, 'store_true', False, 'C'),
362362
'copy-ec': ("Copy specified easyconfig(s) to specified location", None, 'store_true', False),
363+
'cuda-cache-dir': ("Path to CUDA cache dir to use if enabled. Defaults to a path inside the build dir.",
364+
str, 'store', None, {'metavar': "PATH"}),
365+
'cuda-cache-maxsize': ("Maximum size of the CUDA cache (in MiB) used for JIT compilation of PTX code. "
366+
"Leave value empty to let EasyBuild choose a value or '0' to disable the cache",
367+
int, 'store_or_None', None),
363368
'cuda-compute-capabilities': ("List of CUDA compute capabilities to use when building GPU software; "
364369
"values should be specified as digits separated by a dot, "
365370
"for example: 3.5,5.0,7.2", 'strlist', 'extend', None),

test/framework/easyblock.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1894,6 +1894,60 @@ def test_prepare_step_hmns(self):
18941894
self.assertEqual(len(loaded_modules), 1)
18951895
self.assertEqual(loaded_modules[0]['mod_name'], 'GCC/6.4.0-2.28')
18961896

1897+
def test_prepare_step_cuda_cache(self):
    """
    Check how prepare_step handles the cuda_cache_maxsize and cuda_cache_dir build options.

    Covers four situations: a toolchain without CUDA (cache is left alone), a CUDA toolchain
    with no size configured (cache enabled), size zero (cache disabled), and a custom size
    plus custom location (both reflected in the environment).
    """
    disabled_msg = 'Disabling CUDA PTX cache'
    enabled_msg = 'Enabling CUDA PTX cache'

    def new_easyblock(ec_path):
        """Return a silent EasyBlock for the given easyconfig file, with its build directory created."""
        easyblock = EasyBlock(process_easyconfig(ec_path)[0]['ec'])
        easyblock.silent = True
        easyblock.make_builddir()
        return easyblock

    def prepare_and_read_log(easyblock, truncate_log=True):
        """Run prepare_step (optionally after emptying the log file) and return the log contents."""
        if truncate_log:
            write_file(easyblock.logfile, '')
        easyblock.prepare_step(start_dir=False)
        return read_file(easyblock.logfile)

    # automatic mode: no cache size specified
    init_config(build_options={'cuda_cache_maxsize': None})

    testdir = os.path.abspath(os.path.dirname(__file__))
    toy_ec = os.path.join(testdir, 'easyconfigs', 'test_ecs', 't', 'toy', 'toy-0.0.eb')

    # toolchain without CUDA: the cache must neither be enabled nor disabled
    eb = new_easyblock(toy_ec)
    logtxt = prepare_and_read_log(eb, truncate_log=False)
    self.assertNotIn(disabled_msg, logtxt)
    self.assertNotIn(enabled_msg, logtxt)

    # same easyconfig, but now with a toolchain that includes CUDA
    test_ec = os.path.join(self.test_prefix, 'test.eb')
    cuda_toolchain_line = "toolchain = {'name': 'gcccuda', 'version': '2018a'}"
    write_file(test_ec, re.sub('^toolchain = .*', cuda_toolchain_line, read_file(toy_ec), flags=re.M))
    eb = new_easyblock(test_ec)

    logtxt = prepare_and_read_log(eb)
    self.assertNotIn(disabled_msg, logtxt)
    self.assertIn(enabled_msg, logtxt)
    self.assertEqual(os.environ['CUDA_CACHE_DISABLE'], '0')

    # a maximum size of zero disables the cache
    init_config(build_options={'cuda_cache_maxsize': 0})
    logtxt = prepare_and_read_log(eb)
    self.assertIn(disabled_msg, logtxt)
    self.assertNotIn(enabled_msg, logtxt)
    self.assertEqual(os.environ['CUDA_CACHE_DISABLE'], '1')

    # explicitly specified size and location
    cuda_cache_dir = os.path.join(self.test_prefix, 'custom-cuda-cache')
    init_config(build_options={'cuda_cache_maxsize': 1234, 'cuda_cache_dir': cuda_cache_dir})
    logtxt = prepare_and_read_log(eb)
    self.assertNotIn(disabled_msg, logtxt)
    self.assertIn(enabled_msg, logtxt)
    self.assertEqual(os.environ['CUDA_CACHE_DISABLE'], '0')
    self.assertEqual(os.environ['CUDA_CACHE_MAXSIZE'], str(1234 * 1024 * 1024))
    self.assertEqual(os.environ['CUDA_CACHE_PATH'], cuda_cache_dir)
1950+
18971951
def test_checksum_step(self):
18981952
"""Test checksum step"""
18991953
testdir = os.path.abspath(os.path.dirname(__file__))

test/framework/modules.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454

5555

5656
# number of modules included for testing purposes
57-
TEST_MODULES_COUNT = 81
57+
TEST_MODULES_COUNT = 82
5858

5959

6060
class ModulesTest(EnhancedTestCase):
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#%Module
2+
# Test modulefile for gcccuda/2018a: loads GCC/6.4.0-2.28 and CUDA/9.1.85 and sets the EB* variables.
# NOTE(review): the help text below mentions OpenMPI, OpenBLAS, FFTW and ScaLAPACK, but this
# modulefile only loads GCC and CUDA - presumably copied from another toolchain's module; confirm.
3+
proc ModulesHelp { } {
4+
puts stderr { GCC based compiler toolchain with CUDA support, and including
5+
OpenMPI for MPI support, OpenBLAS (BLAS and LAPACK support), FFTW and ScaLAPACK. - Homepage: (none)
6+
}
7+
}
8+
9+
module-whatis {GNU Compiler Collection (GCC) based compiler toolchain, along with CUDA toolkit. - Homepage: (none)}
10+
# Installation prefix, reused by the setenv commands at the end of this modulefile.
11+
set root /prefix/software/gcccuda/2018a
12+
# Declare a conflict with gcccuda.
13+
conflict gcccuda
14+
# Load GCC unless it is already loaded.
15+
if { ![is-loaded GCC/6.4.0-2.28] } {
16+
module load GCC/6.4.0-2.28
17+
}
18+
# Load CUDA unless it is already loaded.
19+
if { ![is-loaded CUDA/9.1.85] } {
20+
module load CUDA/9.1.85
21+
}
22+
23+
# Environment variables holding the install root, the version and the path of the devel module.
24+
setenv EBROOTGCCCUDA "$root"
25+
setenv EBVERSIONGCCCUDA "2018a"
26+
setenv EBDEVELGCCCUDA "$root/easybuild/gcccuda-2018a-easybuild-devel"

test/framework/options.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4666,7 +4666,7 @@ def test_modules_tool_vs_syntax_check(self):
46664666
regex = re.compile(pattern, re.M)
46674667
self.assertTrue(regex.search(stdout), "Pattern '%s' found in: %s" % (regex.pattern, stdout))
46684668

4669-
def test_prefix(self):
4669+
def test_prefix_option(self):
46704670
"""Test which configuration settings are affected by --prefix."""
46714671
txt, _ = self._run_mock_eb(['--show-full-config', '--prefix=%s' % self.test_prefix], raise_error=True)
46724672

0 commit comments

Comments
 (0)