@@ -19,6 +19,8 @@ concurrency:
1919
# Workflow-wide environment variables.
env:
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
  # Selects the Docker network that KIND attaches its node containers to
  # (here Docker's default "bridge" network). This does not set the MTU.
  # NOTE(review): the variable is marked experimental by its own name;
  # confirm KIND node-to-node name resolution works on the default bridge.
  KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"
2224
2325jobs :
2426 kubernetes :
@@ -60,13 +62,150 @@ jobs:
6062 python-version : ' 3.11'
6163 cache : ' pip' # caching pip dependencies
6264
# Best-effort dump of the runner's Docker, kernel, network, cgroup, disk and
# memory state. Every probe is guarded with `|| true` so that a missing tool
# can never fail the job: this step exists only to aid debugging.
- name: Diagnose Docker environment on GPU runner
  run: |
    echo "=== Docker Environment Diagnostics ==="
    echo "Docker version:"
    docker version || true
    echo ""
    echo "Docker info:"
    docker info || true
    echo ""
    echo "System info:"
    uname -a || true
    echo ""
    echo "Network interfaces:"
    ip addr show || true
    echo ""
    echo "Checking cgroup version:"
    # Prints the filesystem type mounted at /sys/fs/cgroup/
    stat -fc %T /sys/fs/cgroup/ || true
    echo ""
    echo "Checking if running in container:"
    if [ -f /.dockerenv ]; then
      echo "Running inside Docker"
    else
      echo "Not in Docker"
    fi
    echo ""
    echo "Available disk space:"
    df -h || true
    echo ""
    echo "Memory info:"
    free -h || true
    echo "=== End Diagnostics ==="
92+
# Delegates NVIDIA GPU preparation for KinD to the repository-local action
# located at ./common/github-actions/nvidia-gpu-setup.
- name: Setup NVidia GPU environment for KinD
  uses: ./common/github-actions/nvidia-gpu-setup
6595
# Writes the KIND cluster definition to /tmp/kind-config.yaml and echoes it
# to the job log so the effective configuration is visible in CI output.
- name: Create KIND config with explicit networking
  run: |
    # The heredoc delimiter is quoted so the shell copies the body verbatim
    # (no variable or command expansion inside the generated config).
    cat > /tmp/kind-config.yaml <<'EOF'
    kind: Cluster
    apiVersion: kind.x-k8s.io/v1alpha4
    networking:
      # Explicitly set pod and service subnets to avoid conflicts
      podSubnet: "10.244.0.0/16"
      serviceSubnet: "10.96.0.0/16"
      # Keep KIND's default CNI enabled (false means "do not disable it")
      disableDefaultCNI: false
      # Run kube-proxy in iptables mode (this setting does not affect MTU)
      kubeProxyMode: "iptables"
    nodes:
      - role: control-plane
        # Extra mounts that might be needed for GPU
        extraMounts:
          - containerPath: /dev/shm
            hostPath: /dev/shm
            propagation: HostToContainer
      - role: worker
        extraMounts:
          - containerPath: /dev/shm
            hostPath: /dev/shm
            propagation: HostToContainer
    containerdConfigPatches:
      - |-
        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
          runtime_type = "io.containerd.runc.v2"
        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
          SystemdCgroup = true
    EOF

    echo "KIND configuration:"
    cat /tmp/kind-config.yaml
131+
# Starts the KinD cluster through the repository-local action, requesting one
# worker node and passing the path /tmp/kind-config.yaml as `kind-config`.
# NOTE(review): confirm ./common/github-actions/kind declares a `kind-config`
# input; undeclared inputs are not visible from this file.
- name: Setup and start KinD cluster
  uses: ./common/github-actions/kind
  with:
    kind-config: /tmp/kind-config.yaml
    worker-nodes: 1
137+
# Blocks until the KIND cluster is usable:
#   1. the API server answers `kubectl cluster-info` (up to 60 x 5s),
#   2. no node reports a NetworkUnavailable condition other than "False"
#      (up to 120 x 5s),
#   3. every node reaches the Ready condition (300s timeout),
#   4. a throwaway busybox pod can be scheduled and run (warning only).
# Stages 1-3 fail the step with diagnostics on timeout.
- name: Wait for KIND cluster and CNI initialization
  run: |
    echo "Waiting for KIND cluster to initialize..."
    # First ensure cluster API is responsive
    api_ready=false
    for i in {1..60}; do
      if kubectl cluster-info &>/dev/null; then
        echo "✓ Cluster API is responsive"
        api_ready=true
        break
      fi
      echo "Waiting for cluster API... ($i/60)"
      sleep 5
    done

    # Fail fast instead of silently continuing against a dead API server
    if [ "$api_ready" != "true" ]; then
      echo "ERROR: Cluster API did not become responsive"
      docker ps -a || true
      exit 1
    fi

    # Check initial node status
    echo "Initial node status:"
    kubectl get nodes -o wide || true

    # Wait for CNI to initialize on all nodes
    echo "Waiting for CNI plugin to initialize..."
    cni_ready=false
    for i in {1..120}; do
      # Check if nodes exist (a failing kubectl yields zero lines)
      node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l)
      if [ "$node_count" -eq 0 ]; then
        echo "No nodes found yet... ($i/120)"
        sleep 5
        continue
      fi

      # CNI is considered initialized when no node has a NetworkUnavailable
      # condition whose status differs from "False". Nodes without that
      # condition at all also pass this check.
      # NOTE(review): if the CNI in use never publishes NetworkUnavailable,
      # this passes immediately and the `kubectl wait` below is the real gate.
      if kubectl get nodes -o json | jq -r '.items[].status.conditions[] | select(.type=="NetworkUnavailable") | .status' | grep -v "False" > /dev/null 2>&1; then
        echo "CNI still initializing... ($i/120)"
        if [ $((i % 10)) -eq 0 ]; then
          echo "Current node conditions:"
          kubectl describe nodes | grep -A10 "Conditions:" || true
        fi
      else
        echo "✓ CNI initialized on all nodes"
        cni_ready=true
        break
      fi

      sleep 5
    done

    # The timeout is checked after the loop so that every exit path
    # (including the "no nodes" branch) is covered.
    if [ "$cni_ready" != "true" ]; then
      echo "ERROR: CNI failed to initialize"
      echo "Node details:"
      kubectl describe nodes || true
      echo "KIND logs:"
      # grep exits non-zero when nothing matches; do not let that abort
      # the remaining diagnostics under `bash -e`.
      docker ps -a | grep kind || true
      docker logs kind-control-plane 2>&1 | tail -100 || true
      exit 1
    fi

    # Wait for nodes to be fully ready
    echo "Waiting for all nodes to be ready..."
    kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
      echo "ERROR: Nodes failed to become ready"
      kubectl describe nodes || true
      kubectl get pods -A -o wide || true
      exit 1
    }

    echo "✓ All nodes are ready:"
    kubectl get nodes -o wide

    # Verify CNI with a test pod. `--attach` streams the pod output without
    # requesting a TTY, which the CI runner does not provide.
    echo "Verifying CNI functionality..."
    kubectl run test-cni --image=busybox:latest --rm --attach --restart=Never --command -- sh -c "echo 'CNI test successful'" || {
      echo "WARNING: CNI test pod failed, checking kindnet pods..."
      kubectl get pods -n kube-system -l app=kindnet -o wide || true
      kubectl logs -n kube-system -l app=kindnet --tail=50 || true
    }
70209
71210 - name : Install NVidia GPU operator for KinD
72211 uses : ./common/github-actions/nvidia-gpu-operator
0 commit comments