Skip to content

Commit 696a3a3

Browse files
committed
Workflow format
1 parent f77a229 commit 696a3a3

File tree

1 file changed

+144
-144
lines changed

1 file changed

+144
-144
lines changed

.github/workflows/_test_te.yaml

Lines changed: 144 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -1,152 +1,152 @@
# Reusable workflow (invoked through `workflow_call`) that runs the
# TransformerEngine JAX tests through the Slurm/Pyxis reusable workflow and
# then condenses the pytest report logs into a sitrep and a badge JSON file.
name: ~test TransformerEngine

on:
  workflow_call:
    inputs:
      # Container image handed to the Slurm jobs as IMAGE.
      # NOTE(review): `required: true` forces every caller to pass a value, so
      # the `default` below looks unused — confirm which one is intended.
      TE_IMAGE:
        type: string
        description: 'JAX+TE+PAXML image'
        required: true
        default: 'ghcr.io/nvidia/upstream-pax:latest'
      # Prefix for the Slurm job/artifact names, the sitrep artifact name and
      # the badge file name.
      ARTIFACT_PREFIX:
        type: string
        description: 'Name of the artifact zip file'
        required: false
        default: 'te'

jobs:
  # Runs the TE JAX encoder example tests on one node with 2, 4 and 8 GPUs and
  # writes a pytest report log to /output/pytest-report.jsonl.
  te-multi-gpu:
    uses: ./.github/workflows/_test_slurm_pyxis.yaml
    strategy:
      matrix:
        N_GPU: [2, 4, 8]
      # Let the remaining GPU counts finish even if one of them fails.
      fail-fast: false
    secrets:
      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
      SLURM_LOGIN_USER: ${{ secrets.CLUSTER_LOGIN_USER }}
      CONTAINER_REGISTRY_TOKEN: ${{ secrets.github_token }}
    with:
      NAME: ${{ inputs.ARTIFACT_PREFIX }}-${{ matrix.N_GPU }}GPU
      SLURM_LOGIN_HOSTNAME: ${{ vars.HOSTNAME_SLURM_LOGIN }}
      OUTPUT_BASEDIR: /nfs/cluster
      OUTPUT_MOUNTPOINT: /output
      NODES: 1
      GPUS_PER_NODE: ${{ matrix.N_GPU }}
      NTASKS: 1
      NTASKS_PER_NODE: 1
      # Quoted so the value stays a string instead of a sexagesimal number.
      TIME_LIMIT: '00:10:00'
      EXTRA_EXPORTS: 'VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model'
      IMAGE: ${{ inputs.TE_IMAGE }}
      SRUN_PREAMBLE: |
        nvidia-smi
        pip install \
          pytest \
          pytest-reportlog \
          cuda-python \
          -r ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder/requirements.txt
      SRUN_SCRIPT: |
        set -ex
        cd ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder
        pytest --report-log=/output/pytest-report.jsonl \
          test_single_gpu_encoder.py \
          test_multigpu_encoder.py \
          test_model_parallel_encoder.py

  # Runs TransformerEngine's own QA script for the distributed JAX unit tests
  # on one node with 2, 4 and 8 GPUs.
  te-unittests:
    uses: ./.github/workflows/_test_slurm_pyxis.yaml
    strategy:
      matrix:
        N_GPU: [2, 4, 8]
      fail-fast: false
    secrets:
      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
      SLURM_LOGIN_USER: ${{ secrets.CLUSTER_LOGIN_USER }}
      CONTAINER_REGISTRY_TOKEN: ${{ secrets.github_token }}
    with:
      NAME: ${{ inputs.ARTIFACT_PREFIX }}-transformerengine-unittests-${{ matrix.N_GPU }}GPU
      SLURM_LOGIN_HOSTNAME: ${{ vars.HOSTNAME_SLURM_LOGIN }}
      OUTPUT_BASEDIR: /nfs/cluster
      OUTPUT_MOUNTPOINT: /output
      NODES: 1
      GPUS_PER_NODE: ${{ matrix.N_GPU }}
      NTASKS: 1
      NTASKS_PER_NODE: 1
      TIME_LIMIT: '00:10:00'
      EXTRA_EXPORTS: 'VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model'
      IMAGE: ${{ inputs.TE_IMAGE }}
      SRUN_PREAMBLE: |
        nvidia-smi
        pip install \
          pytest \
          pytest-reportlog \
          cuda-python \
          -r ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder/requirements.txt
      SRUN_SCRIPT: |
        set -ex

        cd /opt/transformer-engine
        # Rewrite the TE install path in the QA scripts. The script that is
        # actually executed below is the distributed one, so it is patched too;
        # previously only qa/L0_jax_unittest/test.sh was rewritten.
        sed s@/opt/transformerengine@/opt/transformer-engine@g -i \
          qa/L0_jax_unittest/test.sh \
          qa/L0_jax_distributed_unittest/test.sh
        bash qa/L0_jax_distributed_unittest/test.sh

  # Summarises the pytest report logs of the jobs above into sitrep.json and a
  # badge JSON file, and uploads both as one artifact.
  sitrep:
    needs: [te-multi-gpu, te-unittests]
    # Report on failed test jobs as well as successful ones.
    if: success() || failure()
    runs-on: ubuntu-latest
    env:
      ARTIFACT_NAME_FULL: ${{ inputs.ARTIFACT_PREFIX }}-multigpu-test
      BADGE_FILENAME_FULL: badge-${{ inputs.ARTIFACT_PREFIX }}-multigpu-test.json
    steps:
      - name: Check out the repository under ${GITHUB_WORKSPACE}
        uses: actions/checkout@v4

      # NOTE(review): merge-multiple extracts every matching artifact into the
      # same directory; if each GPU leg stores pytest-report.jsonl at the same
      # relative path the reports may overwrite each other — confirm against
      # _test_slurm_pyxis.yaml.
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: |
            ${{ inputs.ARTIFACT_PREFIX }}-*
          merge-multiple: true

      - name: Generate sitrep
        shell: bash -x -e {0}
        run: |
          # bring in utility functions
          source .github/workflows/scripts/to_json.sh
          test_outcome_files=$(find . -name pytest-report.jsonl)
          badge_label='TE Multi GPU tests'
          # pytest-reportlog emits one JSON object per line. Test results are
          # "TestReport" records (one per setup/call/teardown phase), whereas
          # "CollectReport" records only describe collection of modules, so a
          # passed test is a TestReport whose call phase passed. Failures are
          # counted from both kinds so collection errors are not lost.
          # /dev/null keeps cat from reading stdin when no report was found;
          # ${test_outcome_files} is left unquoted on purpose so that it splits
          # into one argument per file.
          passed_tests=$(cat ${test_outcome_files} /dev/null | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
          failed_tests=$(cat ${test_outcome_files} /dev/null | jq -r 'select((."$report_type" == "TestReport" or ."$report_type" == "CollectReport") and .outcome == "failed") | .outcome' | wc -l)
          total_tests=$((failed_tests + passed_tests))

          # No recorded outcome at all means the test jobs never got as far as
          # producing results.
          if [[ ${total_tests} == 0 ]]; then
            badge_message='error'
            badge_color=red
            summary='TE multi GPU tests did not complete due to errors.'
          else
            badge_message="${passed_tests}/${total_tests} passed"
            if [[ ${failed_tests} == 0 ]]; then
              badge_color=brightgreen
            else
              badge_color=yellow
            fi
            summary="TE multi GPU tests : $badge_message"
          fi
          run_id=${{ github.run_id }} \
          to_json \
            run_id \
            summary \
            total_tests passed_tests failed_tests \
            badge_label badge_color badge_message \
          > sitrep.json
          # The job-level env var is used (quoted) rather than an inline
          # expression so the file name is never re-parsed by the shell.
          schemaVersion=1 \
          label="${badge_label}" \
          message="${badge_message}" \
          color="${badge_color}" \
          to_json schemaVersion label message color \
          > "${BADGE_FILENAME_FULL}"

      - name: Upload training logs as artifacts
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.ARTIFACT_NAME_FULL }}
          path: |
            sitrep.json
            ${{ env.BADGE_FILENAME_FULL }}

0 commit comments

Comments
 (0)