Merge pull request #268 from theely/alps-latest-pytorch-image

Theofilos Manitaras · web-flow · commit 4795f260b241 · 2025-02-14T15:42:42.000+01:00
New pytorch test that checks against the latest available nvidia image
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,4 @@ __pycache__/
 *.log
 *.out
 *.swp
+.venv
diff --git a/README.md b/README.md
@@ -4,10 +4,12 @@
 The tests are daily checked against the tip of the `master` branch of [ReFrame](https://github.com/reframe-hpc/reframe/). 
 Several tests are built on top of the `hpctestlib` library that is provided with ReFrame.
 
-To run the test suite you need first to clone and bootstrap ReFrame and then this repo:
+## Running on a CSCS system
 
+To run the test suite you need first to clone and bootstrap ReFrame and then this repo:
+ 
+### Install ReFrame
 
-## Install ReFrame
 ```
 git clone https://github.com/reframe-hpc/reframe.git
 pushd reframe
@@ -16,7 +18,7 @@ export PATH=$(pwd)/bin:$PATH
 popd
 ```
 
-## Clone the tests
+### Clone the tests
 
 ```
 git clone https://github.com/eth-cscs/cscs-reframe-tests
@@ -28,3 +30,32 @@ You can then list all the tests on any CSCS supported machine as follows:
 ```
 reframe -C config/cscs.py -c checks/ -R -l
 ```
+
+
+## Local development setup
+
+### Install Python3 on your machine.
+
+### Create a virtual environment:
+
+```bash
+python3 -m venv --system-site-packages .venv
+source .venv/bin/activate
+python3 -m pip install --upgrade pip
+pip install -r requirements.txt
+```
+
+### Run and Debug
+
+```bash
+cd cscs-reframe-tests
+source .venv/bin/activate
+
+reframe -V
+
+reframe \
+-C config/cscs.py \
+-c checks/microbenchmarks/gpu/node_burn/baremetal-node-burn.py \
+-l \
+--skip-prgenv-check --skip-system-check
+```
diff --git a/checks/apps/pytorch/pytorch_nvidia.py b/checks/apps/pytorch/pytorch_nvidia.py
@@ -1,36 +1,63 @@
+import re
 import sys
 import pathlib
 import reframe as rfm
-from pytorch_test_base import PyTorchTestBase
+import reframe.utility.sanity as sn
+
 
+from pytorch_test_base import PyTorchTestBase
 sys.path.append(str(pathlib.Path(__file__).parent.parent.parent / 'mixins'))
 from container_engine import ContainerEngineMixin  # noqa: E402
 
+sys.path.append(
+    str(pathlib.Path(__file__).parent.parent.parent.parent / 'utility')
+)
+from nvcr import nvidia_image_tags
+
 
 @rfm.simple_test
-class PyTorchDdpSarus(PyTorchTestBase):
-    valid_systems = ['+nvgpu +sarus']
-    platform = 'Sarus'
+class test_image_tag_retrieval(rfm.RunOnlyRegressionTest):
+    valid_systems = ['+nvgpu']
+    valid_prog_environs = ['builtin']
+    executable = 'echo'
 
+    @sanity_function
+    def assert_found_tags(self):
+        return sn.assert_found(r'pytorch tags: \S+', self.stdout)
+    
     @run_before('run')
     def set_container_variables(self):
-        self.container_platform = self.platform
-        self.container_platform.command = self.executable
-        self.container_platform.image = 'nvcr.io/nvidia/pytorch:22.12-py3'  # cuda11.8 pt1.14.0
-        self.job.launcher.options.append('--mpi=pmi2')
+        self.executable_opts = [
+            f'pytorch tags: {",".join(nvidia_image_tags("pytorch"))}'
+        ]
 
 
 @rfm.simple_test
 class PyTorchDdpCeNv(PyTorchTestBase, ContainerEngineMixin):
-    descr = 'Check the training throughput using the ContainerEngine and NVIDIA NGC'
+    descr = ('Check the training throughput using the ContainerEngine and '
+             'NVIDIA NGC')
     valid_systems = ['+ce +nvgpu']
     aws_ofi_nccl = parameter([True])
-    image = parameter([
-        'nvcr.io#nvidia/pytorch:22.08-py3', # same as AMD   pt1.13.1
-        'nvcr.io#nvidia/pytorch:22.12-py3', # same as Sarus pt1.14.0
-        'nvcr.io#nvidia/pytorch:23.10-py3', # same as AMD   pt2.1.0
-        'nvcr.io#nvidia/pytorch:24.01-py3', # Latest        pt2.2.0
-    ])
+    curated_images = ['nvcr.io#nvidia/pytorch:24.01-py3']
+
+    # NOTE: only the "-py3" image is supported by the test
+    supported_flavors = ["-py3"] 
+
+    pytorch_tags = nvidia_image_tags('pytorch')
+    latest_tags = []
+
+    for flavor in supported_flavors:
+        versions = []
+        for tag in pytorch_tags:
+            if re.match(rf'^\d+\.\d+{flavor}$', tag):
+                versions.append(tag[:-len(flavor)])
+        if versions:
+            latest_version = max(versions)
+            # A tag is '<version><flavor>' (e.g. '25.01-py3'), with no '+'
+            latest_tags += [f'{latest_version}{flavor}']
+
+    latest_images = [f'nvcr.io#nvidia/pytorch:{tag}' for tag in latest_tags]
+    image = parameter(curated_images + latest_images)
     env_vars = {
         'NCCL_DEBUG': 'Info',
     }
@@ -40,7 +67,8 @@ class PyTorchDdpCeNv(PyTorchTestBase, ContainerEngineMixin):
     def set_image(self):
         self.container_image = self.image
         if self.aws_ofi_nccl:
-            cuda_major = 'cuda12' if self.image > 'nvcr.io#nvidia/pytorch:23' else 'cuda11'
+            # Only cuda12 is supported at the moment
+            cuda_major = 'cuda12'
             self.container_env_table = {
                 'annotations.com.hooks': {
                     'aws_ofi_nccl.enabled': 'true',
@@ -50,33 +78,37 @@ def set_image(self):
 
 @rfm.simple_test
 class PyTorchDdpCeNvlarge(PyTorchDdpCeNv):
-    aws_ofi_nccl = parameter([True, False])
     num_nodes = parameter([3, 8])
-    image = parameter([
-        'nvcr.io#nvidia/pytorch:24.01-py3', # Latest        pt2.2.0
-    ])
 
 
+# FIXME: libc compatibility issue on Clariden
+# + srun -l --gpus-per-task=1 python cnn_distr.py
+# srun: /lib64/libc.so.6: version `GLIBC_2.34' not found
+# (required by /opt/cscs/aws-ofi-ccl-plugin/cuda12/libnccl-net.so)
+# TODO: build libnccl-net.so plug-in in the test setup phase
 @rfm.simple_test
 class PyTorchDdpMambaNv(PyTorchTestBase):
     descr = 'Check the training throughput on bare-metal'
-    valid_systems = ['+nvgpu']
+    valid_systems = []  # DISABLED TEST, change to ['+nvgpu'] to re-enable it
     time_limit = '30m'
     torch_version = parameter([
-        'pytorch torchvision nccl pytorch-cuda=11.8', # Latest cu11.8
-        # 'pytorch torchvision nccl pytorch-cuda=12.1', # Latest cu12.1; incompatible driver
+         'nccl cuda=12.6', # Latest cu12.6; incompatible driver
     ])
     tags = {'production'}
 
     @run_after('setup')
     def activate_venv(self):
         self.prerun_cmds = [
             f'set -xe', f'. setup_conda.sh $PWD/forge',
+            f'conda update -n base -c conda-forge conda',
+            f'conda clean --all',
             f'conda create -p $PWD/forge/envs/rfm {self.torch_version} '
-            f'-c pytorch -c nvidia -y',
+            f'-c nvidia -y',
             f'conda activate $PWD/forge/envs/rfm',
+            f'pip install torch torchvision torchaudio --index-url '
+            f'https://download.pytorch.org/whl/cu126',
             f'pip install python-hostlist',
-            f'. activate_ofi.sh cuda11',
+            f'. activate_ofi.sh cuda12',
         ]
 
         self.postrun_cmds = ['rm Miniforge*.sh', 'rm -rf forge']
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,3 @@
+Reframe-HPC
+pyfirecrest==2.5.0
+python-hostlist==1.23.0
diff --git a/utility/nvcr.py b/utility/nvcr.py
@@ -0,0 +1,34 @@
+import requests
+from typing import List
+
+
+def nvidia_image_tags(image_name: str, timeout: float = 30) -> List[str]:
+    """Return the tags that nvcr.io lists for ``nvidia/<image_name>``.
+
+    A pull-scoped token is requested first (no credentials are sent) and
+    then passed as a bearer token to the registry's tag-list endpoint.
+
+    :param image_name: repository name under ``nvidia/``, e.g. 'pytorch'.
+    :param timeout: seconds to wait for each of the two HTTP requests.
+    :returns: the ``tags`` entry of the registry response, or an empty
+        list when that entry is missing or null.
+    :raises requests.RequestException: if a request fails or times out.
+    """
+    token_url = (
+        'https://nvcr.io/proxy_auth'
+        f'?scope=repository:nvidia/{image_name}:pull'
+    )
+    tags_url = f'https://nvcr.io/v2/nvidia/{image_name}/tags/list'
+
+    # This is called from a class body in pytorch_nvidia.py, i.e. while the
+    # checks are being loaded, so a stalled connection must not hang forever.
+    token_response = requests.get(token_url, timeout=timeout)
+    headers = {
+        'Authorization': f'Bearer {token_response.json().get("token")}'
+    }
+    image_tags_response = requests.get(
+        tags_url, headers=headers, timeout=timeout
+    )
+
+    # Normalize an absent or null 'tags' entry to an empty list.
+    return image_tags_response.json().get('tags') or []

-Original file line number
+Diff line change
 *.log
 *.out
 *.swp
 +.venv
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Reframe-HPC`
	`2`	`+pyfirecrest==2.5.0`
	`3`	`+python-hostlist==1.23.0`