Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/actions/submit-delete-k8s-job/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ runs:
uses: ./.github/actions/with-post-step
with:
main: |
# Pre-flight guard: skip the rest of this script while the cluster is under
# maintenance. Every node's taints are printed in a single TAINTS column and
# searched for the substring "maintenance"; a match on any line takes the
# skip branch. grep runs without -q, so matching lines are also written to
# the step log.
# NOTE(review): this is a plain substring test over the printed taint column;
# confirm the exact taint key/value that cluster operators apply.
# NOTE(review): `exit 0` ends this `main` script successfully; whether the
# with-post-step action still runs its post hook afterwards is not visible
# here. Confirm the post hook tolerates a job that was never submitted.
echo "Checking for cluster maintenance taint..."
if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
echo "Cluster is under maintenance, skipping job submission."
exit 0 # Exit successfully without running the rest of the script
fi

set -x
TIMEOUT_JOB_CREATION=60s
TIMEOUT_JOB_WAIT=14400s
Expand Down
16 changes: 14 additions & 2 deletions .github/workflows/_test_nccl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,18 @@ jobs:
.github/eks-workflow-files/mpi-nccl-test.yml
git diff .github/eks-workflow-files/mpi-nccl-test.yml
# Submit the job described by mpi-nccl-test.yml unless the cluster is under
# maintenance. The step writes a `continue-run` output ("true" or "false")
# that the later steps in this job test in their `if:` conditions via
# steps.submit_job.outputs.continue-run.
- name: Submit Kubernetes job
# NOTE(review): the single-line `run:` below appears to be the pre-change
# line shown by the diff view; the multi-line `run:` further down looks like
# its replacement. A step cannot carry both keys. Confirm against the file.
run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
id: submit_job
run: |
# List every node's taints in one TAINTS column and look for the substring
# "maintenance"; grep has no -q, so matching lines are echoed to the log.
echo "Check whether the cluster is under maintenance"
if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
# Maintenance branch: nothing is applied, and the output is set to "false".
echo "Cluster is under maintenance, skipping job submission"
echo "continue-run=false" >> "$GITHUB_OUTPUT"
else
# Normal branch: apply the manifest, then record "true" as the output.
kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
echo "continue-run=true" >> "$GITHUB_OUTPUT"
fi
- name: Wait for Kubernetes job to start
if: steps.submit_job.outputs.continue-run == 'true'
# Note that this is *not* using JOB_NAME
run: |
# Launcher job is created eagerly, but suspended. Kueue un-suspends it when
Expand All @@ -100,6 +110,7 @@ jobs:
kubectl wait --for=create job/${LAUNCHER_NAME}
kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=14400s
- name: Stream Kubernetes job output
if: steps.submit_job.outputs.continue-run == 'true'
# Note that this is *not* JOB_NAME
run: |
# Streaming logs will fail if the container/pod is still pending
Expand All @@ -110,6 +121,7 @@ jobs:
# prefixes lines with a rather verbose tag
kubectl logs --follow job/${LAUNCHER_NAME}
- name: Retrieve Kubernetes job status
if: steps.submit_job.outputs.continue-run == 'true'
shell: bash -exo pipefail {0}
run: |
while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
Expand All @@ -130,7 +142,7 @@ jobs:
# Provide more debug output in case of failure; note that some kinds of launch
# failure do not produce any log output.
- name: Debug failed Kubernetes job
if: failure()
if: failure() && steps.submit_job.outputs.continue-run == 'true'
run: |
# Provide better debug in case of launch failures that will not produce log output
pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
Expand Down
Loading