Commit 5693808

add gpu checker
1 parent 440fac2 commit 5693808

File tree

1 file changed: 68 additions, 0 deletions


.github/workflows/e2e_tests.yaml

Lines changed: 68 additions & 0 deletions
@@ -70,6 +70,74 @@ jobs:
       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator
 
+      - name: Verify GPU availability in KinD
+        run: |
+          echo "Checking for available GPUs in the KinD cluster..."
+
+          # Wait for GPU operator pods to be ready (with timeout)
+          echo "Waiting for GPU operator pods to be ready..."
+          TIMEOUT=300 # 5 minutes timeout
+          END=$((SECONDS + TIMEOUT))
+
+          while [ $SECONDS -lt $END ]; do
+            # Get total number of pods in the namespace
+            TOTAL_PODS=$(kubectl get pods -n gpu-operator --no-headers | wc -l)
+
+            # Count pods that are either running and ready or completed successfully
+            # Exclude pods that are still initializing
+            READY_PODS=$(kubectl get pods -n gpu-operator --no-headers | grep -E 'Running|Completed' | grep -v 'PodInitializing' | wc -l)
+
+            if [ "$READY_PODS" -eq "$TOTAL_PODS" ] && [ "$TOTAL_PODS" -gt 0 ]; then
+              echo "All GPU operator pods are ready or completed successfully!"
+              break
+            fi
+
+            echo "Waiting for GPU operator pods to be ready... ($READY_PODS/$TOTAL_PODS)"
+            echo "Pod status:"
+            kubectl get pods -n gpu-operator
+            sleep 10
+          done
+
+          if [ $SECONDS -ge $END ]; then
+            echo "::error::Timeout waiting for GPU operator pods to be ready"
+            echo "GPU operator pod status:"
+            kubectl get pods -n gpu-operator -o wide
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator
+            exit 1
+          fi
+
+          echo "Node details:"
+          kubectl describe nodes | grep -E 'nvidia.com/gpu|Allocatable:|Capacity:|Name:'
+
+          # Check if GPU operator has labeled nodes
+          GPU_LABELS=$(kubectl describe nodes | grep -c "nvidia.com/gpu")
+          if [ "$GPU_LABELS" -eq 0 ]; then
+            echo "::error::No NVIDIA GPU labels found on nodes. GPU operator may not be running correctly."
+            echo "Full node descriptions for debugging:"
+            kubectl describe nodes
+            exit 1
+          fi
+
+          # Check if GPUs are actually allocatable
+          GPU_ALLOCATABLE=$(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' | tr ' ' '\n' | grep -v '^$' | wc -l)
+          if [ "$GPU_ALLOCATABLE" -eq 0 ]; then
+            echo "::error::GPU operator is running but no GPUs are allocatable. Check GPU operator logs."
+            echo "Checking GPU operator pods:"
+            kubectl get pods -n gpu-operator -o wide
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator
+            echo "GPU operator pod descriptions:"
+            kubectl describe pods -n gpu-operator
+            exit 1
+          fi
+
+          echo "Successfully found $GPU_ALLOCATABLE allocatable GPU(s) in the cluster."
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
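When debugging a KinD cluster by hand, the final allocatable-GPU check from this step can be reproduced outside of CI. The snippet below is a minimal sketch that simply repeats the jsonpath query from the workflow and assumes kubectl is already pointed at the cluster.

    # Minimal sketch (not part of this commit): count nodes that report a
    # non-empty nvidia.com/gpu allocatable value, assuming kubectl already
    # targets the KinD cluster.
    kubectl get nodes \
      -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' \
      | tr ' ' '\n' | grep -v '^$' | wc -l
    # The workflow step above fails (exit 1) when this count is 0.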
