Skip to content

Commit 1ba2f4f

Browse files
authored
Megatron cache (#488)
1 parent e9917fb commit 1ba2f4f

File tree

2 files changed

+6 -6 lines changed

2 files changed

+6 -6 lines changed

checks/apps/pytorch/pytorch_megatronlm.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ class PyTorchMegatronLM(rfm.RunOnlyRegressionTest):
4747
checkpoint_steps = variable(int, value=10)
4848

4949
hf_home = variable(
50-
str, value=str(pathlib.Path.home() / '.cache' / 'huggingface')
50+
str, value=str(pathlib.Path(os.environ['SCRATCH']) / '.cache' / 'huggingface')
5151
)
5252

5353
# The number of training steps
@@ -505,7 +505,7 @@ def throughput_per_gpu(self):
505505
class PyTorchMegatronLM_CE(PyTorchMegatronLM, ContainerEngineMixin):
506506
valid_systems = ['+nvgpu +ce']
507507
valid_prog_environs = ['builtin']
508-
maintainers = ['ml-team']
508+
maintainers = ['VCUE', 'SSA']
509509
container_image = 'docker://jfrog.svc.cscs.ch#reframe-oci/pytorch:25.01-py3_nvrtc-12.9'
510510

511511
@run_after('setup')
@@ -543,7 +543,7 @@ def set_container_mounts(self):
543543
class PyTorchMegatronLM_UENV(PyTorchMegatronLM):
544544
valid_systems = ['+nvgpu +uenv']
545545
valid_prog_environs = ['+pytorch']
546-
maintainers = ['ml-team']
546+
maintainers = ['VCUE', 'SSA']
547547

548548
@run_after('setup')
549549
def patch_numpy(self):

checks/apps/pytorch/pytorch_megatronlm_amd.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ class PyTorchMegatronLM_AMD(rfm.RunOnlyRegressionTest):
4646
batch_size_per_node = variable(int, value=256)
4747
checkpoint_steps = variable(int, value=10)
4848
hf_home = variable(
49-
str, value=str(pathlib.Path.home() / '.cache' / 'huggingface')
49+
str, value=str(pathlib.Path(os.environ['SCRATCH']) / '.cache' / 'huggingface')
5050
)
5151
training_steps = variable(int, value=10)
5252
wandb_logging = variable(bool, value=False)
@@ -88,7 +88,7 @@ class PyTorchMegatronLM_AMD(rfm.RunOnlyRegressionTest):
8888
sourcesdir = None
8989
executable = 'bash'
9090

91-
maintainers = ['VCUE']
91+
maintainers = ['VCUE', 'SSA']
9292
tags = {'ml', 'bencher'}
9393

9494
@run_after('setup')
@@ -383,7 +383,7 @@ def throughput_per_gpu(self):
383383
class PyTorchMegatronLM_AMD_CE(PyTorchMegatronLM_AMD, ContainerEngineMixin):
384384
valid_systems = ['+amdgpu +ce']
385385
valid_prog_environs = ['builtin']
386-
maintainers = ['ml-team']
386+
maintainers = ['VCUE', 'SSA']
387387
container_image = 'rocm/megatron-lm:v25.6_py312'
388388

389389
@run_after('setup')

0 commit comments

Comments
 (0)