11name : Enroot Tests
22
33on :
4+ push :
5+ branches :
6+ - main
47 workflow_dispatch :
58 inputs :
6- test_name :
7- description : ' Select test to run'
8- required : true
9- type : choice
10- options :
11- - test_single_node_pytorch
12- - test_multi_node_distributed_pytorch
9+ run_single_node_pytorch :
10+ description : ' Run single-node PyTorch test'
11+ required : false
12+ type : boolean
13+ default : true
14+ run_multi_node_pytorch :
15+ description : ' Run multi-node distributed PyTorch test'
16+ required : false
17+ type : boolean
18+ default : true
19+ run_multi_node_rccl :
20+ description : ' Run multi-node RCCL test'
21+ required : false
22+ type : boolean
23+ default : true
1324 no_install :
1425 description : ' Skip installation (--no-install)'
1526 required : false
2031 required : false
2132 type : boolean
2233 default : false
23- docker_image :
24- description : ' Docker image to use (default: rocm/pytorch:latest for single-node, docker:// rocm/pytorch:rocm7.0.2_ubuntu22.04_py3.10_pytorch_release_2.7.1 for multi-node )'
34+ docker_image_single_node :
35+ description : ' Docker image for single-node test (default: rocm/pytorch:latest )'
2536 required : false
2637 type : string
2738 default : ' '
28- testbed_file :
29- description : ' Path to testbed file (e.g. tests/enroot/testbeds/mi325.yaml )'
39+ docker_image_multi_node :
40+ description : ' Docker image for multi-node PyTorch test (default: docker://rocm/pytorch:rocm6.2.4_ubuntu22.04_py3.10_pytorch_release_2.3.0 )'
3041 required : false
3142 type : string
32- default : ' testbed/enroot_tb.yml'
43+ default : ' '
44+ docker_image_rccl :
45+ description : ' Docker image for RCCL test (default: docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56)'
46+ required : false
47+ type : string
48+ default : ' '
3349
3450
3551jobs :
36- run-enroot-tests :
52+ # Single-node PyTorch test
53+ test-single-node-pytorch :
54+ if : github.event_name == 'push' || inputs.run_single_node_pytorch == true
3755 runs-on : enroot-runners
3856 timeout-minutes : 120
3957
@@ -51,16 +69,170 @@ jobs:
5169 python3 -m pip install --upgrade pip
5270 pip install -r tests/enroot/requirements.txt
5371
54- - name : Run enroot tests
# Write the single-node testbed description from the repository secret into
# tests/enroot/testbed/single_node_tb.yml; abort the job when the secret is
# empty or undefined.
- name: Create testbed files from secrets
  working-directory: tests/enroot
  env:
    SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }}
  run: |
    mkdir -p testbed
    # Guard clause: without the secret there is nothing to write.
    if [ -z "$SINGLE_NODE_TESTBED" ]; then
      echo "[ERROR] SINGLE_NODE_TESTBED_FILE secret is not set"
      exit 1
    fi
    printf '%s\n' "$SINGLE_NODE_TESTBED" > testbed/single_node_tb.yml
    echo "Created testbed/single_node_tb.yml from secret"
85+
# Run the single-node PyTorch test through run_test.py.
# Workflow inputs and the event name are handed to the script through `env`
# rather than being expanded inline with ${{ }}: inline expansion pastes the
# raw input text into the shell script, so a value containing quotes or
# $(...) would be executed as shell code.
- name: Run test_single_node_pytorch
  working-directory: tests/enroot
  env:
    EVENT_NAME: ${{ github.event_name }}
    DOCKER_IMAGE: ${{ inputs.docker_image_single_node }}
    DISPATCH_NO_INSTALL: ${{ inputs.no_install }}
    DISPATCH_NO_UNINSTALL: ${{ inputs.no_uninstall }}
  run: |
    # Push events carry no dispatch inputs, so both flags are forced to
    # "false"; manual runs use whatever the user selected.
    if [ "$EVENT_NAME" = "push" ]; then
      NO_INSTALL="false"
      NO_UNINSTALL="false"
    else
      NO_INSTALL="$DISPATCH_NO_INSTALL"
      NO_UNINSTALL="$DISPATCH_NO_UNINSTALL"
    fi

    echo "Running test: test_single_node_pytorch"
    # DOCKER_IMAGE may be empty; the message below is printed in that case.
    echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"

    # Positional arguments: test name, image, the two flags, testbed path.
    python3 run_test.py "test_single_node_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/single_node_tb.yml"
102+
# Publish whatever is under tests/enroot/results/ as a run artifact.
# `always()` keeps this step running after a failed test step; a missing
# results directory only produces a warning.
- name: Upload test results
  uses: actions/upload-artifact@v4
  if: always()
  with:
    path: tests/enroot/results/
    name: test-results-single-node-pytorch-${{ github.run_number }}
    retention-days: 30
    if-no-files-found: warn
111+
# Multi-node distributed PyTorch test.
# Scheduled after test-single-node-pytorch. It runs on every push, or on a
# manual dispatch when run_multi_node_pytorch is ticked, and only if the
# single-node job either succeeded or was skipped. `always()` is required so
# that a skipped upstream job does not implicitly skip this one.
test-multi-node-pytorch:
  needs: test-single-node-pytorch
  if: |
    always() &&
    (github.event_name == 'push' || inputs.run_multi_node_pytorch == true) &&
    (needs.test-single-node-pytorch.result == 'success' || needs.test-single-node-pytorch.result == 'skipped')
  runs-on: enroot-runners
  timeout-minutes: 120

  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        # Quoted so the version is read as a string, not a float.
        python-version: '3.8'

    - name: Install dependencies
      run: |
        python3 -m pip install --upgrade pip
        pip install -r tests/enroot/requirements.txt

    # Write the multi-node testbed description from the repository secret;
    # fail early when the secret is empty or undefined.
    - name: Create testbed files from secrets
      working-directory: tests/enroot
      env:
        MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
      run: |
        mkdir -p testbed
        if [ -n "$MULTI_NODE_TESTBED" ]; then
          printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
          echo "Created testbed/multi_node_tb.yml from secret"
        else
          echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
          exit 1
        fi

    # Inputs and the event name reach the script through `env` instead of
    # inline ${{ }} expansion, so user-supplied text (for example the image
    # name) is never parsed as shell code.
    - name: Run test_multi_node_distributed_pytorch
      working-directory: tests/enroot
      env:
        EVENT_NAME: ${{ github.event_name }}
        DOCKER_IMAGE: ${{ inputs.docker_image_multi_node }}
        DISPATCH_NO_INSTALL: ${{ inputs.no_install }}
        DISPATCH_NO_UNINSTALL: ${{ inputs.no_uninstall }}
      run: |
        # Push events carry no dispatch inputs, so both flags are forced to
        # "false"; manual runs use whatever the user selected.
        if [ "$EVENT_NAME" = "push" ]; then
          NO_INSTALL="false"
          NO_UNINSTALL="false"
        else
          NO_INSTALL="$DISPATCH_NO_INSTALL"
          NO_UNINSTALL="$DISPATCH_NO_UNINSTALL"
        fi

        echo "Running test: test_multi_node_distributed_pytorch"
        # DOCKER_IMAGE may be empty; the message below is printed in that case.
        echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"

        # Positional arguments: test name, image, the two flags, testbed path.
        python3 run_test.py "test_multi_node_distributed_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"

    # Uploaded even when the test step failed; an empty results directory
    # only produces a warning.
    - name: Upload test results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: test-results-multi-node-pytorch-${{ github.run_number }}
        path: tests/enroot/results/
        if-no-files-found: warn
        retention-days: 30
175+
# Multi-node RCCL test.
# Runs on every push, or on a manual dispatch when run_multi_node_rccl is
# ticked, after both earlier jobs have finished.
#
# Both upstream jobs are listed in `needs` and checked explicitly. Checking
# only test-multi-node-pytorch is not enough: when the single-node job fails,
# the multi-node PyTorch job is reported as 'skipped', which would let this
# job start after an upstream failure.
test-multi-node-rccl:
  needs:
    - test-single-node-pytorch
    - test-multi-node-pytorch
  if: |
    always() &&
    (github.event_name == 'push' || inputs.run_multi_node_rccl == true) &&
    (needs.test-single-node-pytorch.result == 'success' || needs.test-single-node-pytorch.result == 'skipped') &&
    (needs.test-multi-node-pytorch.result == 'success' || needs.test-multi-node-pytorch.result == 'skipped')
  runs-on: enroot-runners
  timeout-minutes: 120

  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        # Quoted so the version is read as a string, not a float.
        python-version: '3.8'

    - name: Install dependencies
      run: |
        python3 -m pip install --upgrade pip
        pip install -r tests/enroot/requirements.txt

    # Write the multi-node testbed description from the repository secret;
    # fail early when the secret is empty or undefined.
    - name: Create testbed files from secrets
      working-directory: tests/enroot
      env:
        MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
      run: |
        mkdir -p testbed
        if [ -n "$MULTI_NODE_TESTBED" ]; then
          printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
          echo "Created testbed/multi_node_tb.yml from secret"
        else
          echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
          exit 1
        fi

    # Inputs and the event name reach the script through `env` instead of
    # inline ${{ }} expansion, so user-supplied text (for example the image
    # name) is never parsed as shell code.
    - name: Run test_multi_node_rccl
      working-directory: tests/enroot
      env:
        EVENT_NAME: ${{ github.event_name }}
        DOCKER_IMAGE: ${{ inputs.docker_image_rccl }}
        DISPATCH_NO_INSTALL: ${{ inputs.no_install }}
        DISPATCH_NO_UNINSTALL: ${{ inputs.no_uninstall }}
      run: |
        # Push events carry no dispatch inputs, so both flags are forced to
        # "false"; manual runs use whatever the user selected.
        if [ "$EVENT_NAME" = "push" ]; then
          NO_INSTALL="false"
          NO_UNINSTALL="false"
        else
          NO_INSTALL="$DISPATCH_NO_INSTALL"
          NO_UNINSTALL="$DISPATCH_NO_UNINSTALL"
        fi

        echo "Running test: test_multi_node_rccl"
        # DOCKER_IMAGE may be empty; the message below is printed in that case.
        echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"

        # Positional arguments: test name, image, the two flags, testbed path.
        python3 run_test.py "test_multi_node_rccl" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"

    # Uploaded even when the test step failed; an empty results directory
    # only produces a warning.
    - name: Upload test results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: test-results-multi-node-rccl-${{ github.run_number }}
        path: tests/enroot/results/
        if-no-files-found: warn
        retention-days: 30
0 commit comments