Skip to content

Commit e8d9389

Browse files
authored
Merge pull request #90 from ROCm/kithumma/test-workflows-2
enhance workflow
2 parents a6112e4 + 89c8ec8 commit e8d9389

File tree

2 files changed

+195
-16
lines changed

2 files changed

+195
-16
lines changed

.github/workflows/enroot-tests.yml

Lines changed: 188 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,26 @@
11
name: Enroot Tests
22

33
on:
4+
push:
5+
branches:
6+
- main
47
workflow_dispatch:
58
inputs:
6-
test_name:
7-
description: 'Select test to run'
8-
required: true
9-
type: choice
10-
options:
11-
- test_single_node_pytorch
12-
- test_multi_node_distributed_pytorch
9+
run_single_node_pytorch:
10+
description: 'Run single-node PyTorch test'
11+
required: false
12+
type: boolean
13+
default: true
14+
run_multi_node_pytorch:
15+
description: 'Run multi-node distributed PyTorch test'
16+
required: false
17+
type: boolean
18+
default: true
19+
run_multi_node_rccl:
20+
description: 'Run multi-node RCCL test'
21+
required: false
22+
type: boolean
23+
default: true
1324
no_install:
1425
description: 'Skip installation (--no-install)'
1526
required: false
@@ -20,20 +31,27 @@ on:
2031
required: false
2132
type: boolean
2233
default: false
23-
docker_image:
24-
description: 'Docker image to use (default: rocm/pytorch:latest for single-node, docker://rocm/pytorch:rocm7.0.2_ubuntu22.04_py3.10_pytorch_release_2.7.1 for multi-node)'
34+
docker_image_single_node:
35+
description: 'Docker image for single-node test (default: rocm/pytorch:latest)'
2536
required: false
2637
type: string
2738
default: ''
28-
testbed_file:
29-
description: 'Path to testbed file (e.g. tests/enroot/testbeds/mi325.yaml)'
39+
docker_image_multi_node:
40+
description: 'Docker image for multi-node PyTorch test (default: docker://rocm/pytorch:rocm6.2.4_ubuntu22.04_py3.10_pytorch_release_2.3.0)'
3041
required: false
3142
type: string
32-
default: 'testbed/enroot_tb.yml'
43+
default: ''
44+
docker_image_rccl:
45+
description: 'Docker image for RCCL test (default: docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56)'
46+
required: false
47+
type: string
48+
default: ''
3349

3450

3551
jobs:
36-
run-enroot-tests:
52+
# Single-node PyTorch test
53+
test-single-node-pytorch:
54+
if: github.event_name == 'push' || inputs.run_single_node_pytorch == true
3755
runs-on: enroot-runners
3856
timeout-minutes: 120
3957

@@ -51,16 +69,170 @@ jobs:
5169
python3 -m pip install --upgrade pip
5270
pip install -r tests/enroot/requirements.txt
5371
54-
- name: Run enroot tests
72+
- name: Create testbed files from secrets
73+
working-directory: tests/enroot
74+
env:
75+
SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }}
76+
run: |
77+
mkdir -p testbed
78+
if [ -n "$SINGLE_NODE_TESTBED" ]; then
79+
printf '%s\n' "$SINGLE_NODE_TESTBED" > testbed/single_node_tb.yml
80+
echo "Created testbed/single_node_tb.yml from secret"
81+
else
82+
echo "[ERROR] SINGLE_NODE_TESTBED_FILE secret is not set"
83+
exit 1
84+
fi
85+
86+
- name: Run test_single_node_pytorch
87+
working-directory: tests/enroot
88+
run: |
89+
DOCKER_IMAGE="${{ inputs.docker_image_single_node }}"
90+
if [ "${{ github.event_name }}" = "push" ]; then
91+
NO_INSTALL="false"
92+
NO_UNINSTALL="false"
93+
else
94+
NO_INSTALL="${{ inputs.no_install }}"
95+
NO_UNINSTALL="${{ inputs.no_uninstall }}"
96+
fi
97+
98+
echo "Running test: test_single_node_pytorch"
99+
echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"
100+
101+
python3 run_test.py "test_single_node_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/single_node_tb.yml"
102+
103+
- name: Upload test results
104+
if: always()
105+
uses: actions/upload-artifact@v4
106+
with:
107+
name: test-results-single-node-pytorch-${{ github.run_number }}
108+
path: tests/enroot/results/
109+
if-no-files-found: warn
110+
retention-days: 30
111+
112+
# Multi-node distributed PyTorch test
  #
  # Runs after the single-node job. The job executes on every push, or on a
  # manual dispatch when `run_multi_node_pytorch` is true, provided the
  # single-node job either succeeded or was skipped. `always()` is required so
  # the condition is still evaluated when the dependency was skipped; without
  # it GitHub would skip this job automatically.
  test-multi-node-pytorch:
    needs: test-single-node-pytorch
    if: |
      always() &&
      (github.event_name == 'push' || inputs.run_multi_node_pytorch == true) &&
      (needs.test-single-node-pytorch.result == 'success' || needs.test-single-node-pytorch.result == 'skipped')
    runs-on: enroot-runners
    timeout-minutes: 120

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.8'

      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
          pip install -r tests/enroot/requirements.txt

      # The testbed description is stored as a repository secret and written
      # to disk at run time; the step fails fast when the secret is missing.
      - name: Create testbed files from secrets
        working-directory: tests/enroot
        env:
          MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
        run: |
          mkdir -p testbed
          if [ -n "$MULTI_NODE_TESTBED" ]; then
            printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
            echo "Created testbed/multi_node_tb.yml from secret"
          else
            echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
            exit 1
          fi

      # Workflow inputs are handed to the script through `env:` instead of
      # being expanded inline with ${{ }}. Inline expansion pastes the raw
      # input text into the shell source, so a crafted value could run
      # arbitrary commands on the self-hosted runner. The variables use a
      # DISPATCH_ prefix so that no DOCKER_IMAGE variable is exported to
      # child processes; DOCKER_IMAGE stays a plain shell variable as before.
      - name: Run test_multi_node_distributed_pytorch
        working-directory: tests/enroot
        env:
          EVENT_NAME: ${{ github.event_name }}
          DISPATCH_DOCKER_IMAGE: ${{ inputs.docker_image_multi_node }}
          DISPATCH_NO_INSTALL: ${{ inputs.no_install }}
          DISPATCH_NO_UNINSTALL: ${{ inputs.no_uninstall }}
        run: |
          DOCKER_IMAGE="$DISPATCH_DOCKER_IMAGE"
          # Push events carry no inputs, so install/uninstall always run.
          if [ "$EVENT_NAME" = "push" ]; then
            NO_INSTALL="false"
            NO_UNINSTALL="false"
          else
            NO_INSTALL="$DISPATCH_NO_INSTALL"
            NO_UNINSTALL="$DISPATCH_NO_UNINSTALL"
          fi

          echo "Running test: test_multi_node_distributed_pytorch"
          echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"

          python3 run_test.py "test_multi_node_distributed_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"

      # Results are uploaded even when the test step fails.
      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results-multi-node-pytorch-${{ github.run_number }}
          path: tests/enroot/results/
          if-no-files-found: warn
          retention-days: 30
175+
176+
# Multi-node RCCL test
  #
  # Runs after the multi-node PyTorch job. The job executes on every push, or
  # on a manual dispatch when `run_multi_node_rccl` is true, provided the
  # multi-node PyTorch job either succeeded or was skipped. `always()` is
  # required so the condition is still evaluated when the dependency was
  # skipped; without it GitHub would skip this job automatically.
  test-multi-node-rccl:
    needs: test-multi-node-pytorch
    if: |
      always() &&
      (github.event_name == 'push' || inputs.run_multi_node_rccl == true) &&
      (needs.test-multi-node-pytorch.result == 'success' || needs.test-multi-node-pytorch.result == 'skipped')
    runs-on: enroot-runners
    timeout-minutes: 120

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.8'

      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
          pip install -r tests/enroot/requirements.txt

      # The testbed description is stored as a repository secret and written
      # to disk at run time; the step fails fast when the secret is missing.
      - name: Create testbed files from secrets
        working-directory: tests/enroot
        env:
          MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
        run: |
          mkdir -p testbed
          if [ -n "$MULTI_NODE_TESTBED" ]; then
            printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
            echo "Created testbed/multi_node_tb.yml from secret"
          else
            echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
            exit 1
          fi

      # Workflow inputs are handed to the script through `env:` instead of
      # being expanded inline with ${{ }}. Inline expansion pastes the raw
      # input text into the shell source, so a crafted value could run
      # arbitrary commands on the self-hosted runner. The variables use a
      # DISPATCH_ prefix so that no DOCKER_IMAGE variable is exported to
      # child processes; DOCKER_IMAGE stays a plain shell variable as before.
      - name: Run test_multi_node_rccl
        working-directory: tests/enroot
        env:
          EVENT_NAME: ${{ github.event_name }}
          DISPATCH_DOCKER_IMAGE: ${{ inputs.docker_image_rccl }}
          DISPATCH_NO_INSTALL: ${{ inputs.no_install }}
          DISPATCH_NO_UNINSTALL: ${{ inputs.no_uninstall }}
        run: |
          DOCKER_IMAGE="$DISPATCH_DOCKER_IMAGE"
          # Push events carry no inputs, so install/uninstall always run.
          if [ "$EVENT_NAME" = "push" ]; then
            NO_INSTALL="false"
            NO_UNINSTALL="false"
          else
            NO_INSTALL="$DISPATCH_NO_INSTALL"
            NO_UNINSTALL="$DISPATCH_NO_UNINSTALL"
          fi

          echo "Running test: test_multi_node_rccl"
          echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"

          python3 run_test.py "test_multi_node_rccl" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"

      # Results are uploaded even when the test step fails.
      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results-multi-node-rccl-${{ github.run_number }}
          path: tests/enroot/results/
          if-no-files-found: warn
          retention-days: 30

tests/enroot/run_test.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ def update_docker_image(test_name, docker_image):
1818
pattern = r'export DOCKER_IMAGE=.*'
1919
replacement = f'export DOCKER_IMAGE={docker_image}'
2020
print(f"Updating distributed_pytorch_sbatch.sh with image: {docker_image}")
21+
elif test_name == "test_multi_node_rccl":
22+
script_path = Path("batch_scripts/rccl_tests_sbatch.sh")
23+
# Extract version tag from docker image (e.g., docker://rocm/roce-workload:version -> version)
24+
version = docker_image.split(':')[-1] if ':' in docker_image else docker_image
25+
pattern = r'DOCKER_IMAGE_VERSION=\${DOCKER_IMAGE_VERSION:-"[^"]*"}'
26+
replacement = f'DOCKER_IMAGE_VERSION="${{DOCKER_IMAGE_VERSION:-"{version}"}}"'
27+
print(f"Updating rccl_tests_sbatch.sh with image version: {version}")
2128
else:
2229
print(f"Unknown test name: {test_name}")
2330
return

0 commit comments

Comments
 (0)