Megatron cache: default hf_home to $SCRATCH and update maintainers (#488)

henrique · web-flow · commit 1ba2f4f66f70 · 2025-12-18T14:59:55.000+01:00
diff --git a/checks/apps/pytorch/pytorch_megatronlm.py b/checks/apps/pytorch/pytorch_megatronlm.py
@@ -47,7 +47,7 @@ class PyTorchMegatronLM(rfm.RunOnlyRegressionTest):
     checkpoint_steps = variable(int, value=10)
 
     hf_home = variable(
-        str, value=str(pathlib.Path.home() / '.cache' / 'huggingface')
+        str, value=str(pathlib.Path(os.environ['SCRATCH']) / '.cache' / 'huggingface')
     )
 
     # The number of training steps
@@ -505,7 +505,7 @@ def throughput_per_gpu(self):
 class PyTorchMegatronLM_CE(PyTorchMegatronLM, ContainerEngineMixin):
     valid_systems = ['+nvgpu +ce']
     valid_prog_environs = ['builtin']
-    maintainers = ['ml-team']
+    maintainers = ['VCUE', 'SSA']
     container_image = 'docker://jfrog.svc.cscs.ch#reframe-oci/pytorch:25.01-py3_nvrtc-12.9'
 
     @run_after('setup')
@@ -543,7 +543,7 @@ def set_container_mounts(self):
 class PyTorchMegatronLM_UENV(PyTorchMegatronLM):
     valid_systems = ['+nvgpu +uenv']
     valid_prog_environs = ['+pytorch']
-    maintainers = ['ml-team']
+    maintainers = ['VCUE', 'SSA']
 
     @run_after('setup')
     def patch_numpy(self):
diff --git a/checks/apps/pytorch/pytorch_megatronlm_amd.py b/checks/apps/pytorch/pytorch_megatronlm_amd.py
@@ -46,7 +46,7 @@ class PyTorchMegatronLM_AMD(rfm.RunOnlyRegressionTest):
     batch_size_per_node = variable(int, value=256)
     checkpoint_steps = variable(int, value=10)
     hf_home = variable(
-        str, value=str(pathlib.Path.home() / '.cache' / 'huggingface')
+        str, value=str(pathlib.Path(os.environ['SCRATCH']) / '.cache' / 'huggingface')
     )
     training_steps = variable(int, value=10)
     wandb_logging = variable(bool, value=False)
@@ -88,7 +88,7 @@ class PyTorchMegatronLM_AMD(rfm.RunOnlyRegressionTest):
     sourcesdir = None
     executable = 'bash'
 
-    maintainers = ['VCUE']
+    maintainers = ['VCUE', 'SSA']
     tags = {'ml', 'bencher'}
 
     @run_after('setup')
@@ -383,7 +383,7 @@ def throughput_per_gpu(self):
 class PyTorchMegatronLM_AMD_CE(PyTorchMegatronLM_AMD, ContainerEngineMixin):
     valid_systems = ['+amdgpu +ce']
     valid_prog_environs = ['builtin']
-    maintainers = ['ml-team']
+    maintainers = ['VCUE', 'SSA']
     container_image = 'rocm/megatron-lm:v25.6_py312'
 
     @run_after('setup')