1+ import re
12import sys
23import pathlib
34import reframe as rfm
4- from pytorch_test_base import PyTorchTestBase
5+ import reframe .utility .sanity as sn
6+
57
8+ from pytorch_test_base import PyTorchTestBase
69sys .path .append (str (pathlib .Path (__file__ ).parent .parent .parent / 'mixins' ))
710from container_engine import ContainerEngineMixin # noqa: E402
811
12+ sys .path .append (
13+ str (pathlib .Path (__file__ ).parent .parent .parent .parent / 'utility' )
14+ )
15+ from nvcr import nvidia_image_tags
16+
917
@rfm.simple_test
class test_image_tag_retrieval(rfm.RunOnlyRegressionTest):
    '''Echo the PyTorch image tags returned by ``nvidia_image_tags``.

    The sanity check requires the echoed line to carry at least one
    non-whitespace character after the ``pytorch tags:`` label.
    '''

    valid_systems = ['+nvgpu']
    valid_prog_environs = ['builtin']
    executable = 'echo'

    @run_before('run')
    def set_container_variables(self):
        '''Pass the comma-separated tag list to ``echo`` as its argument.'''
        joined_tags = ','.join(nvidia_image_tags('pytorch'))
        self.executable_opts = [f'pytorch tags: {joined_tags}']

    @sanity_function
    def assert_found_tags(self):
        '''Check that stdout holds a non-empty ``pytorch tags:`` entry.'''
        return sn.assert_found(r'pytorch tags: \S+', self.stdout)
2133
2234
@rfm.simple_test
class PyTorchDdpCeNv(PyTorchTestBase, ContainerEngineMixin):
    descr = ('Check the training throughput using the ContainerEngine and '
             'NVIDIA NGC')
    valid_systems = ['+ce +nvgpu']
    aws_ofi_nccl = parameter([True])

    # Images that are always part of the parameter space
    curated_images = ['nvcr.io#nvidia/pytorch:24.01-py3']

    # NOTE: only the "-py3" image is supported by the test
    supported_flavors = ["-py3"]

    pytorch_tags = nvidia_image_tags('pytorch')
    latest_tags = []

    # Plain loops are used on purpose: the body of a comprehension cannot
    # see names bound in the class body (such as `flavor`).
    for flavor in supported_flavors:
        versions = []
        for tag in pytorch_tags:
            # Keep only tags of the exact form "<digits>.<digits><flavor>";
            # the flavor is escaped so it is always matched literally.
            if re.match(rf'^\d+\.\d+{re.escape(flavor)}$', tag):
                versions.append(tag[:-len(flavor)])

        if versions:
            # Compare the versions numerically rather than as strings
            latest_version = max(
                versions, key=lambda v: tuple(map(int, v.split('.')))
            )
            # Re-attach the flavor verbatim so that the result is one of
            # the tags that was matched above (no extra separator).
            latest_tags += [f'{latest_version}{flavor}']

    latest_images = [f'nvcr.io#nvidia/pytorch:{tag}' for tag in latest_tags]

    # Drop duplicates (order preserved) in case the latest image is already
    # one of the curated ones.
    image = parameter(list(dict.fromkeys(curated_images + latest_images)))
    env_vars = {
        'NCCL_DEBUG': 'Info',
    }
@@ -40,7 +67,8 @@ class PyTorchDdpCeNv(PyTorchTestBase, ContainerEngineMixin):
4067 def set_image (self ):
4168 self .container_image = self .image
4269 if self .aws_ofi_nccl :
43- cuda_major = 'cuda12' if self .image > 'nvcr.io#nvidia/pytorch:23' else 'cuda11'
70+ # Only cuda12 is supported at the moment
71+ cuda_major = 'cuda12'
4472 self .container_env_table = {
4573 'annotations.com.hooks' : {
4674 'aws_ofi_nccl.enabled' : 'true' ,
@@ -50,33 +78,37 @@ def set_image(self):
5078
@rfm.simple_test
class PyTorchDdpCeNvlarge(PyTorchDdpCeNv):
    '''Variant of ``PyTorchDdpCeNv`` parameterized over ``num_nodes``.'''

    num_nodes = parameter([3, 8])
5882
5983
84+ # FIXME: libc compatibility issue on Clariden
85+ # + srun -l --gpus-per-task=1 python cnn_distr.py
86+ # srun: /lib64/libc.so.6: version `GLIBC_2.34' not found
87+ # (required by /opt/cscs/aws-ofi-ccl-plugin/cuda12/libnccl-net.so)
88+ # TODO: build libnccl-net.so plug-in in the test setup phase
@rfm.simple_test
class PyTorchDdpMambaNv(PyTorchTestBase):
    descr = 'Check the training throughput on bare-metal'
    # DISABLED TEST: change to ['+nvgpu'] to re-enable it
    valid_systems = []
    time_limit = '30m'
    torch_version = parameter([
        'nccl cuda=12.6',  # Latest cu12.6; incompatible driver
    ])
    tags = {'production'}

    @run_after('setup')
    def activate_venv(self):
        '''Set the pre-run and post-run shell commands of the test.

        The pre-run commands source ``setup_conda.sh``, create and activate
        a conda environment under ``$PWD/forge``, pip-install the PyTorch
        packages from the cu126 wheel index and source ``activate_ofi.sh``.
        The post-run commands remove the Miniforge installer script(s) and
        the ``forge`` directory.
        '''
        self.prerun_cmds = [
            'set -xe',
            '. setup_conda.sh $PWD/forge',
            # The commands run unattended in the job script, so every conda
            # call is told to answer its confirmation prompt itself (-y).
            'conda update -n base -c conda-forge conda -y',
            'conda clean --all -y',
            f'conda create -p $PWD/forge/envs/rfm {self.torch_version} '
            f'-c nvidia -y',
            'conda activate $PWD/forge/envs/rfm',
            'pip install torch torchvision torchaudio --index-url '
            'https://download.pytorch.org/whl/cu126',
            'pip install python-hostlist',
            '. activate_ofi.sh cuda12 ',
        ]

        self.postrun_cmds = ['rm Miniforge*.sh', 'rm -rf forge']
0 commit comments