Skip to content

Commit 8aed044

Browse files
committed
make a test with 5 files
1 parent cbee8bb commit 8aed044

File tree

3 files changed: 21 additions (+21) and 15 deletions (-15).

.github/container/test-axlearn.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -175,7 +175,7 @@ passed=0
175175
SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt"
176176

177177

178-
for test_file in "${final_test_files[@]}"; do
178+
for test_file in "${final_test_files[@]:0:10}"; do
179179
echo "Running: ${test_file}"
180180
log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log
181181
log_file="${LOG_DIRECTORY}/${log_file_name}"

.github/eks-workflow-files/axlearn/axlearn-job.yml

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ spec:
1515
image: PLACEHOLDER
1616
command:
1717
- bash
18-
- -exo
18+
- -xo
1919
- pipefail
2020
- -c
2121
- |
@@ -29,7 +29,8 @@ spec:
2929
--k8s
3030
3131
# Wait a moment to ensure logs are flushed
32-
sync
32+
sync
33+
3334
resources:
3435
limits:
3536
nvidia.com/gpu: 8

.github/workflows/nccl-k8s.yaml

Lines changed: 17 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -53,27 +53,32 @@ jobs:
5353
env:
5454
BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
5555
TEST_NAME: ${{ matrix.test }}
56-
JOB_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}"
57-
LAUNCHER_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}-launcher"
58-
TOKEN_NAME: "nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test }}-token"
5956

6057

6158
steps:
6259
- name: Check out the repository
6360
uses: actions/checkout@v4
6461

62+
- name: Modify variables
63+
id: var
64+
shell: bash
65+
run: |
66+
echo "JOB_NAME=${{ env.JOB_NAME}//_/-}" >> $GITHUB_OUTPUT
67+
echo "LAUNCHER_NAME=nccl-test-${{ env.JOB_NAME}//_/-}-launcher" >> $GITHUB_OUTPUT
68+
echo "TOKEN_NAME=nccl-test-${{ env.JOB_NAME}//_/-}-token" >> $GITHUB_OUTPUT
69+
6570
- name: GHCR login and store K8s secret
6671
uses: ./.github/actions/ghcr-login
6772
with:
6873
docker-username: ${{ github.repository_owner }}
6974
docker-password: ${{ secrets.GITHUB_TOKEN }}
70-
token-name: ${{ env.TOKEN_NAME }}
75+
token-name: ${{ steps.var.TOKEN_NAME }}
7176
- name: Configure Kubernetes job
7277
shell: bash
7378
run: |
74-
export JOB_NAME="${{ env.JOB_NAME }}"
75-
export LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}"
76-
export TOKEN_NAME="${{ env.TOKEN_NAME }}"
79+
export JOB_NAME="${{ steps.var.JOB_NAME }}"
80+
export LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}"
81+
export TOKEN_NAME="${{ steps.var.TOKEN_NAME }}"
7782
export TEST_NAME="${{ env.TEST_NAME }}"
7883
export WORKER_NAME="${JOB_NAME}-worker"
7984
@@ -95,11 +100,11 @@ jobs:
95100
uses: ./.github/actions/submit-k8s-job
96101
with:
97102
job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml
98-
job-name: ${{ env.LAUNCHER_NAME }}
103+
job-name: ${{ steps.var.LAUNCHER_NAME }}
99104
- name: Retrieve Kubernetes job status
100105
shell: bash -exo pipefail
101106
run: |
102-
LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}"
107+
LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}"
103108
while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
104109
failure=${status[0]:-0}
105110
success=${status[1]:-0}
@@ -118,7 +123,7 @@ jobs:
118123
if: failure()
119124
shell: bash
120125
run: |
121-
LAUNCHER_NAME="${{ env.LAUNCHER_NAME }}"
126+
LAUNCHER_NAME="${{ steps.var.LAUNCHER_NAME }}"
122127
pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
123128
if [[ -n "${pods}" ]]; then
124129
kubectl describe ${pods}
@@ -127,9 +132,9 @@ jobs:
127132
if: always()
128133
uses: ./.github/actions/delete-k8s-job
129134
with:
130-
job-name: ${{ env.LAUNCHER_NAME }}
135+
job-name: ${{ steps.var.LAUNCHER_NAME }}
131136
- name: Delete GitHub Container Registry token
132137
uses: ./.github/actions/delete-ghcr-token
133138
if: always()
134139
with:
135-
token-name: ${{ env.TOKEN_NAME }}
140+
token-name: ${{ steps.var.TOKEN_NAME }}

0 commit comments

Comments
 (0)