# @@ -19,10 +19,17 @@ concurrency:

env:
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
  # Run KIND's node containers on the default "bridge" Docker network to work
  # around networking issues on this runner.
  # NOTE(review): this selects the Docker network; it does NOT set an MTU —
  # the original comment claiming "Explicitly set Docker MTU" was wrong.
  KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"

jobs:
  kubernetes:
    runs-on: gpu-t4-4-core
    strategy:
      # Run both matrix legs even if one fails, so results can be compared.
      fail-fast: false
      matrix:
        # Try with and without GPU setup to isolate the issue
        gpu-setup: [true, false]

    steps:
      - name: Checkout code
# @@ -60,15 +67,253 @@ jobs:
          python-version: '3.11'
          cache: 'pip'  # caching pip dependencies

      # Pure diagnostics: every probe is best-effort (|| true) so this step
      # never fails the job; it only records the runner's Docker/network state.
      - name: Diagnose Docker environment on GPU runner
        run: |
          echo "=== Docker Environment Diagnostics ==="
          echo "Docker version:"
          docker version || true
          echo ""
          echo "Docker info:"
          docker info || true
          echo ""
          echo "System info:"
          uname -a
          echo ""
          echo "Network interfaces:"
          ip addr show || true
          echo ""
          echo "Checking cgroup version:"
          stat -fc %T /sys/fs/cgroup/ || true
          echo ""
          echo "Checking if running in container:"
          if [ -f /.dockerenv ]; then echo "Running inside Docker"; else echo "Not in Docker"; fi
          echo ""
          echo "Available disk space:"
          df -h
          echo ""
          echo "Memory info:"
          free -h
          echo ""
          echo "DNS Configuration:"
          cat /etc/resolv.conf || true
          echo ""
          echo "Testing DNS resolution:"
          nslookup google.com || true
          echo "=== End Diagnostics ==="

      - name: Setup NVidia GPU environment for KinD
        if: matrix.gpu-setup == true
        uses: ./common/github-actions/nvidia-gpu-setup

      - name: Create KIND config with explicit networking
        run: |
          cat > /tmp/kind-config.yaml <<EOF
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          networking:
            # Explicitly set pod/service subnets to avoid conflicts
            podSubnet: "10.244.0.0/16"
            serviceSubnet: "10.96.0.0/16"
            # Keep kind's default CNI (kindnet); set to true only when a
            # custom CNI is installed separately. (The original comment said
            # "Disable default CNI", contradicting the value below.)
            disableDefaultCNI: false
            kubeProxyMode: "iptables"
          nodes:
            - role: control-plane
              # /dev/shm mount needed by GPU workloads (shared memory)
              extraMounts:
                - containerPath: /dev/shm
                  hostPath: /dev/shm
                  propagation: HostToContainer
            - role: worker
              extraMounts:
                - containerPath: /dev/shm
                  hostPath: /dev/shm
                  propagation: HostToContainer
          containerdConfigPatches:
            - |-
              [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
                runtime_type = "io.containerd.runc.v2"
              [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
                SystemdCgroup = true
          kubeadmConfigPatches:
            - |
              kind: ClusterConfiguration
              apiServer:
                extraArgs:
                  "enable-admission-plugins": "NodeRestriction,ResourceQuota"
            - |
              kind: KubeletConfiguration
              serverTLSBootstrap: true
              cgroupDriver: systemd
              containerRuntimeEndpoint: unix:///run/containerd/containerd.sock
            - |
              kind: InitConfiguration
              nodeRegistration:
                kubeletExtraArgs:
                  pod-infra-container-image: registry.k8s.io/pause:3.9
            # NOTE(review): kind is not documented to substitute
            # {{ .ControlPlaneEndpoint }} / {{ .Token }} in user-supplied
            # kubeadmConfigPatches — confirm this JoinConfiguration patch is
            # actually honored and not passed through literally.
            - |
              kind: JoinConfiguration
              discovery:
                bootstrapToken:
                  apiServerEndpoint: "{{ .ControlPlaneEndpoint }}"
                  token: "{{ .Token }}"
                  unsafeSkipCAVerification: true
              nodeRegistration:
                kubeletExtraArgs:
                  pod-infra-container-image: registry.k8s.io/pause:3.9
          EOF

          echo "KIND configuration:"
          cat /tmp/kind-config.yaml

      - name: Setup and start KinD cluster
        # FIX: `id` and `continue-on-error` are step-level keys, not action
        # inputs — they were previously placed under `with:`. `continue-on-error`
        # must be at step level for the fallback step's
        # `steps.kind-setup.outcome == 'failure'` check to ever run.
        id: kind-setup
        continue-on-error: true
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 1
          kind-config: /tmp/kind-config.yaml

      - name: Fallback KIND setup if custom config fails
        if: steps.kind-setup.outcome == 'failure'
        run: |
          echo "Custom KIND config failed, trying with default settings..."
          # Clean up any failed attempts
          kind delete cluster --name kind || true
          docker rm -f $(docker ps -aq --filter name=kind-) || true

          # Create cluster with simpler config
          cat > /tmp/kind-simple.yaml <<EOF
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          nodes:
            - role: control-plane
            - role: worker
          EOF

          kind create cluster --config /tmp/kind-simple.yaml --wait 5m || {
            echo "ERROR: KIND cluster creation failed"
            docker ps -a
            exit 1
          }

      - name: Fix KIND DNS and wait for cluster initialization
        run: |
          echo "=== KIND Cluster Setup Diagnostics ==="

          # Check KIND containers
          echo "KIND containers:"
          docker ps -a --filter name=kind-

          # Get control plane container name (fall back to kind's default)
          CONTROL_PLANE=$(docker ps --filter name=kind-control-plane --format "{{.Names}}" | head -1)
          if [ -z "$CONTROL_PLANE" ]; then
            CONTROL_PLANE="kind-control-plane"
          fi
          echo "Control plane container: $CONTROL_PLANE"

          # Get control plane IP
          CONTROL_PLANE_IP=$(docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' $CONTROL_PLANE 2>/dev/null || echo "")
          echo "Control plane IP: $CONTROL_PLANE_IP"

          # Check Docker network
          echo "Docker networks:"
          docker network ls
          KIND_NETWORK=$(docker network ls | grep kind | awk '{print $2}' | head -1)
          if [ -n "$KIND_NETWORK" ]; then
            echo "KIND network: $KIND_NETWORK"
            echo "Containers on KIND network:"
            docker network inspect $KIND_NETWORK | jq -r '.Containers | to_entries | .[] | "\(.value.Name): \(.value.IPv4Address)"' || true
          fi

          # Ensure all KIND containers are on the same network
          for container in $(docker ps -a --filter name=kind- --format "{{.Names}}"); do
            echo "Checking network for $container"
            docker inspect $container | jq -r '.[0].NetworkSettings.Networks | keys[]' || true
          done

          echo "=== Waiting for cluster initialization ==="

          # Wait up to 5 minutes (60 x 5s) for the API server to respond
          for i in {1..60}; do
            if kubectl cluster-info &>/dev/null; then
              echo "✓ Cluster API is responsive"
              break
            fi
            echo "Waiting for cluster API... ($i/60)"

            # Try to diagnose connection issues halfway through the wait
            if [ $i -eq 30 ]; then
              echo "Debugging cluster connection..."
              kubectl cluster-info dump --output-directory=/tmp/cluster-dump || true
              echo "Kubeconfig:"
              kubectl config view || true
            fi
            sleep 5
          done

          # Check initial node status
          echo "Initial node status:"
          kubectl get nodes -o wide || true

          # Wait up to 10 minutes (120 x 5s) for CNI to initialize on all nodes
          echo "Waiting for CNI plugin to initialize..."
          for i in {1..120}; do
            # Check if nodes exist
            node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0")
            if [ "$node_count" -eq "0" ]; then
              echo "No nodes found yet... ($i/120)"
              sleep 5
              continue
            fi

            # CNI is initialized once no node reports a non-False
            # NetworkUnavailable condition (absent condition counts as ready)
            if kubectl get nodes -o json | jq -r '.items[].status.conditions[] | select(.type=="NetworkUnavailable") | .status' | grep -v "False" > /dev/null 2>&1; then
              echo "CNI still initializing... ($i/120)"
              if [ $((i % 10)) -eq 0 ]; then
                echo "Current node conditions:"
                kubectl describe nodes | grep -A10 "Conditions:" || true
              fi
            else
              echo "✓ CNI initialized on all nodes"
              break
            fi

            if [ $i -eq 120 ]; then
              echo "ERROR: CNI failed to initialize"
              echo "Node details:"
              kubectl describe nodes
              echo "KIND logs:"
              docker ps -a | grep kind
              # use the detected container name instead of hard-coding it
              docker logs "$CONTROL_PLANE" 2>&1 | tail -100 || true
              exit 1
            fi
            sleep 5
          done

          # Wait for nodes to be fully ready
          echo "Waiting for all nodes to be ready..."
          kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
            echo "ERROR: Nodes failed to become ready"
            kubectl describe nodes
            kubectl get pods -A -o wide
            exit 1
          }

          echo "✓ All nodes are ready:"
          kubectl get nodes -o wide

          # Verify CNI with a test pod. FIX: dropped `-t` — CI runners have no
          # TTY, so requesting one makes `kubectl run` fail spuriously; `-i` is
          # kept because `--rm` requires an attached session.
          echo "Verifying CNI functionality..."
          kubectl run test-cni --image=busybox:latest --rm -i --restart=Never --command -- sh -c "echo 'CNI test successful'" || {
            echo "WARNING: CNI test pod failed, checking kindnet pods..."
            kubectl get pods -n kube-system -l app=kindnet -o wide
            kubectl logs -n kube-system -l app=kindnet --tail=50 || true
          }

      - name: Install NVidia GPU operator for KinD
        if: matrix.gpu-setup == true
        uses: ./common/github-actions/nvidia-gpu-operator
74319 - name : Deploy CodeFlare stack
# @@ -122,7 +367,13 @@ jobs:
          pip install poetry
          poetry install --with test,docs
          echo "Running e2e tests..."
          # Select the pytest marker expression according to the matrix leg
          if [ "${{ matrix.gpu-setup }}" == "true" ]; then
            echo "Running GPU tests..."
            poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
          else
            echo "Running non-GPU tests (GPU setup disabled for debugging)..."
            # NOTE(review): the trailing `|| echo` swallows ANY pytest failure
            # on this leg, not just "no tests collected" — confirm that masking
            # real non-GPU test failures is intended for this debugging run.
            poetry run pytest -v -s ./tests/e2e -m 'kind and not nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 || echo "No non-GPU tests found"
          fi
        env:
          GRPC_DNS_RESOLVER: "native"
# (scraper residue from the GitHub commit page — "0 commit comments" — not part of the workflow)