Skip to content

Commit 4795f26

Browse files
author
Theofilos Manitaras
authored
Merge pull request #268 from theely/alps-latest-pytorch-image
New pytorch test that checks against the latest available nvidia image
2 parents b00c8ce + 53e2b30 commit 4795f26

File tree

5 files changed

+110
-28
lines changed

5 files changed

+110
-28
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ __pycache__/
66
*.log
77
*.out
88
*.swp
9+
.venv

README.md

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
The tests are checked daily against the tip of the `master` branch of [ReFrame](https://github.com/reframe-hpc/reframe/).
55
Several tests are built on top of the `hpctestlib` library that is provided with ReFrame.
66

7-
To run the test suite you need first to clone and bootstrap ReFrame and then this repo:
7+
## Running on a CSCS system
88

9+
To run the test suite, you first need to clone and bootstrap ReFrame and then clone this repo:
10+
11+
### Install ReFrame
912

10-
## Install ReFrame
1113
```
1214
git clone https://github.com/reframe-hpc/reframe.git
1315
pushd reframe
@@ -16,7 +18,7 @@ export PATH=$(pwd)/bin:$PATH
1618
popd
1719
```
1820

19-
## Clone the tests
21+
### Clone the tests
2022

2123
```
2224
git clone https://github.com/eth-cscs/cscs-reframe-tests
@@ -28,3 +30,32 @@ You can then list all the tests on any CSCS supported machine as follows:
2830
```
2931
reframe -C config/cscs.py -c checks/ -R -l
3032
```
33+
34+
35+
## Local development setup
36+
37+
### Install Python3 on your machine.
38+
39+
### Create a virtual environment:
40+
41+
```bash
42+
python3 -m venv --system-site-packages .venv
43+
source .venv/bin/activate
44+
python3 -m pip install --upgrade pip
45+
pip install -r requirements.txt
46+
```
47+
48+
### Run and Debug
49+
50+
```bash
51+
cd cscs-reframe-tests
52+
source .venv/bin/activate
53+
54+
reframe -V
55+
56+
reframe \
57+
-C config/cscs.py \
58+
-c checks/microbenchmarks/gpu/node_burn/baremetal-node-burn.py \
59+
-l \
60+
--skip-prgenv-check --skip-system-check
61+
```
Lines changed: 57 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,63 @@
1+
import re
12
import sys
23
import pathlib
34
import reframe as rfm
4-
from pytorch_test_base import PyTorchTestBase
5+
import reframe.utility.sanity as sn
6+
57

8+
from pytorch_test_base import PyTorchTestBase
69
sys.path.append(str(pathlib.Path(__file__).parent.parent.parent / 'mixins'))
710
from container_engine import ContainerEngineMixin # noqa: E402
811

12+
sys.path.append(
13+
str(pathlib.Path(__file__).parent.parent.parent.parent / 'utility')
14+
)
15+
from nvcr import nvidia_image_tags
16+
917

1018
@rfm.simple_test
11-
class PyTorchDdpSarus(PyTorchTestBase):
12-
valid_systems = ['+nvgpu +sarus']
13-
platform = 'Sarus'
19+
class test_image_tag_retrieval(rfm.RunOnlyRegressionTest):
20+
valid_systems = ['+nvgpu']
21+
valid_prog_environs = ['builtin']
22+
executable = 'echo'
1423

24+
@sanity_function
25+
def assert_found_tags(self):
26+
return sn.assert_found(r'pytorch tags: \S+', self.stdout)
27+
1528
@run_before('run')
1629
def set_container_variables(self):
17-
self.container_platform = self.platform
18-
self.container_platform.command = self.executable
19-
self.container_platform.image = 'nvcr.io/nvidia/pytorch:22.12-py3' # cuda11.8 pt1.14.0
20-
self.job.launcher.options.append('--mpi=pmi2')
30+
self.executable_opts = [
31+
f'pytorch tags: {",".join(nvidia_image_tags("pytorch"))}'
32+
]
2133

2234

2335
@rfm.simple_test
2436
class PyTorchDdpCeNv(PyTorchTestBase, ContainerEngineMixin):
25-
descr = 'Check the training throughput using the ContainerEngine and NVIDIA NGC'
37+
descr = ('Check the training throughput using the ContainerEngine and '
38+
'NVIDIA NGC')
2639
valid_systems = ['+ce +nvgpu']
2740
aws_ofi_nccl = parameter([True])
28-
image = parameter([
29-
'nvcr.io#nvidia/pytorch:22.08-py3', # same as AMD pt1.13.1
30-
'nvcr.io#nvidia/pytorch:22.12-py3', # same as Sarus pt1.14.0
31-
'nvcr.io#nvidia/pytorch:23.10-py3', # same as AMD pt2.1.0
32-
'nvcr.io#nvidia/pytorch:24.01-py3', # Latest pt2.2.0
33-
])
41+
curated_images = ['nvcr.io#nvidia/pytorch:24.01-py3']
42+
43+
# NOTE: only the "-py3" image is supported by the test
44+
supported_flavors = ["-py3"]
45+
46+
pytorch_tags = nvidia_image_tags('pytorch')
47+
latest_tags = []
48+
49+
for flavor in supported_flavors:
50+
versions = []
51+
for tag in pytorch_tags:
52+
if re.match(rf'^\d+\.\d+{flavor}$', tag):
53+
versions.append(tag[:-len(flavor)])
54+
55+
if versions:
56+
latest_version = max(versions)
57+
latest_tags += [f'{latest_version}+{flavor}']
58+
59+
latest_images = [f'nvcr.io#nvidia/pytorch:{tag}' for tag in latest_tags]
60+
image = parameter(curated_images + latest_images)
3461
env_vars = {
3562
'NCCL_DEBUG': 'Info',
3663
}
@@ -40,7 +67,8 @@ class PyTorchDdpCeNv(PyTorchTestBase, ContainerEngineMixin):
4067
def set_image(self):
4168
self.container_image = self.image
4269
if self.aws_ofi_nccl:
43-
cuda_major = 'cuda12' if self.image > 'nvcr.io#nvidia/pytorch:23' else 'cuda11'
70+
# Only cuda12 is supported at the moment
71+
cuda_major = 'cuda12'
4472
self.container_env_table = {
4573
'annotations.com.hooks': {
4674
'aws_ofi_nccl.enabled': 'true',
@@ -50,33 +78,37 @@ def set_image(self):
5078

5179
@rfm.simple_test
5280
class PyTorchDdpCeNvlarge(PyTorchDdpCeNv):
53-
aws_ofi_nccl = parameter([True, False])
5481
num_nodes = parameter([3, 8])
55-
image = parameter([
56-
'nvcr.io#nvidia/pytorch:24.01-py3', # Latest pt2.2.0
57-
])
5882

5983

84+
# FIXME: libc compatibility issue on Clariden
85+
# + srun -l --gpus-per-task=1 python cnn_distr.py
86+
# srun: /lib64/libc.so.6: version `GLIBC_2.34' not found
87+
# (required by /opt/cscs/aws-ofi-ccl-plugin/cuda12/libnccl-net.so)
88+
# TODO: build libnccl-net.so plug-in in the test setup phase
6089
@rfm.simple_test
6190
class PyTorchDdpMambaNv(PyTorchTestBase):
6291
descr = 'Check the training throughput on bare-metal'
63-
valid_systems = ['+nvgpu']
92+
valid_systems = [] # DISABLED TEST, change to ['+nvgpu'] to re-enable it
6493
time_limit = '30m'
6594
torch_version = parameter([
66-
'pytorch torchvision nccl pytorch-cuda=11.8', # Latest cu11.8
67-
# 'pytorch torchvision nccl pytorch-cuda=12.1', # Latest cu12.1; incompatible driver
95+
'nccl cuda=12.6', # Latest cu12.6; incompatible driver
6896
])
6997
tags = {'production'}
7098

7199
@run_after('setup')
72100
def activate_venv(self):
73101
self.prerun_cmds = [
74102
f'set -xe', f'. setup_conda.sh $PWD/forge',
103+
f'conda update -n base -c conda-forge conda',
104+
f'conda clean --all',
75105
f'conda create -p $PWD/forge/envs/rfm {self.torch_version} '
76-
f'-c pytorch -c nvidia -y',
106+
f'-c nvidia -y',
77107
f'conda activate $PWD/forge/envs/rfm',
108+
f'pip install torch torchvision torchaudio --index-url '
109+
f'https://download.pytorch.org/whl/cu126',
78110
f'pip install python-hostlist',
79-
f'. activate_ofi.sh cuda11',
111+
f'. activate_ofi.sh cuda12',
80112
]
81113

82114
self.postrun_cmds = ['rm Miniforge*.sh', 'rm -rf forge']

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Reframe-HPC
2+
pyfirecrest==2.5.0
3+
python-hostlist==1.23.0

utility/nvcr.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import requests
2+
from typing import List
3+
4+
5+
def nvidia_image_tags(image_name: str, timeout: float = 30.0) -> List[str]:
    """Return the tags listed by nvcr.io for the image ``nvidia/<image_name>``.

    A pull-scoped token is first requested from the registry's
    ``proxy_auth`` endpoint (no credentials are sent), and is then used as a
    bearer token to query the image's ``tags/list`` endpoint.

    :param image_name: name of the image under the ``nvidia`` namespace,
        e.g. ``'pytorch'``.
    :param timeout: seconds to wait for each of the two HTTP requests
        before ``requests`` raises a timeout error, so that an unresponsive
        registry cannot block the caller indefinitely.
    :returns: the value of the ``tags`` key of the JSON reply, or an empty
        list if the reply has no such key.
    """
    token_response = requests.get(
        f'https://nvcr.io/proxy_auth?scope=repository:nvidia/{image_name}:pull',
        timeout=timeout,
    )
    # A reply without a token is deliberately not treated as an error here:
    # the tags request is still sent and the caller decides what an empty
    # result means.
    token = token_response.json().get('token')

    image_tags_response = requests.get(
        f'https://nvcr.io/v2/nvidia/{image_name}/tags/list',
        headers={'Authorization': f'Bearer {token}'},
        timeout=timeout,
    )

    return image_tags_response.json().get('tags', [])

0 commit comments

Comments
 (0)