RHOAIENG-32532: Fix broken E2E tests

kryanbeane · kryanbeane · commit 344e935eaf71 · 2025-09-19T15:51:12.000+01:00
diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
@@ -19,6 +19,8 @@ concurrency:
 
 env:
   CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
+  # Select the Docker network that KIND attaches its nodes to (this does not configure the MTU)
+  KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"
 
 jobs:
   kubernetes:
@@ -60,13 +62,150 @@ jobs:
           python-version: '3.11'
           cache: 'pip' # caching pip dependencies
 
+      - name: Diagnose Docker environment on GPU runner
+        run: |
+          echo "=== Docker Environment Diagnostics ==="
+          echo "Docker version:"
+          docker version || true
+          echo ""
+          echo "Docker info:"
+          docker info || true
+          echo ""
+          echo "System info:"
+          uname -a
+          echo ""
+          echo "Network interfaces:"
+          ip addr show || true
+          echo ""
+          echo "Checking cgroup version:"
+          stat -fc %T /sys/fs/cgroup/ || true
+          echo ""
+          echo "Checking if running in container:"
+          if [ -f /.dockerenv ]; then echo "Running inside Docker"; else echo "Not in Docker"; fi
+          echo ""
+          echo "Available disk space:"
+          df -h
+          echo ""
+          echo "Memory info:"
+          free -h
+          echo "=== End Diagnostics ==="
+
       - name: Setup NVidia GPU environment for KinD
         uses: ./common/github-actions/nvidia-gpu-setup
 
+      - name: Create KIND config with explicit networking
+        run: |
+          cat > /tmp/kind-config.yaml <<EOF
+          kind: Cluster
+          apiVersion: kind.x-k8s.io/v1alpha4
+          networking:
+            # Explicitly set pod subnet to avoid conflicts
+            podSubnet: "10.244.0.0/16"
+            serviceSubnet: "10.96.0.0/16"
+            # Keep KIND's default CNI (kindnet) enabled; the wait step below verifies it initializes
+            disableDefaultCNI: false
+            # Run kube-proxy in iptables mode (no MTU is configured here)
+            kubeProxyMode: "iptables"
+          nodes:
+          - role: control-plane
+            # Extra mounts that might be needed for GPU
+            extraMounts:
+            - containerPath: /dev/shm
+              hostPath: /dev/shm
+              propagation: HostToContainer
+          - role: worker
+            extraMounts:
+            - containerPath: /dev/shm
+              hostPath: /dev/shm
+              propagation: HostToContainer
+          containerdConfigPatches:
+          - |-
+            [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
+              runtime_type = "io.containerd.runc.v2"
+            [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
+              SystemdCgroup = true
+          EOF
+
+          echo "KIND configuration:"
+          cat /tmp/kind-config.yaml
+
       - name: Setup and start KinD cluster
         uses: ./common/github-actions/kind
         with:
           worker-nodes: 1
+          kind-config: /tmp/kind-config.yaml
+
+      - name: Wait for KIND cluster and CNI initialization
+        run: |
+          echo "Waiting for KIND cluster to initialize..."
+          # First ensure cluster API is responsive
+          for i in {1..60}; do
+            if kubectl cluster-info &>/dev/null; then
+              echo "✓ Cluster API is responsive"
+              break
+            fi
+            echo "Waiting for cluster API... ($i/60)"
+            sleep 5
+          done
+
+          # Check initial node status
+          echo "Initial node status:"
+          kubectl get nodes -o wide || true
+
+          # Wait for CNI to initialize on all nodes
+          echo "Waiting for CNI plugin to initialize..."
+          for i in {1..120}; do
+            # Check if nodes exist
+            node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0")
+            if [ "$node_count" -eq "0" ]; then
+              echo "No nodes found yet... ($i/120)"
+              sleep 5
+              continue
+            fi
+
+            # CNI counts as initialized when no node reports a NetworkUnavailable status other than "False" (an absent condition also passes)
+            if kubectl get nodes -o json | jq -r '.items[].status.conditions[] | select(.type=="NetworkUnavailable") | .status' | grep -v "False" > /dev/null 2>&1; then
+              echo "CNI still initializing... ($i/120)"
+              if [ $((i % 10)) -eq 0 ]; then
+                echo "Current node conditions:"
+                kubectl describe nodes | grep -A10 "Conditions:" || true
+              fi
+            else
+              echo "✓ CNI initialized on all nodes"
+              break
+            fi
+
+            if [ $i -eq 120 ]; then
+              echo "ERROR: CNI failed to initialize"
+              echo "Node details:"
+              kubectl describe nodes
+              echo "KIND logs:"
+              docker ps -a | grep kind
+              docker logs kind-control-plane 2>&1 | tail -100 || true
+              exit 1
+            fi
+            sleep 5
+          done
+
+          # Wait for nodes to be fully ready
+          echo "Waiting for all nodes to be ready..."
+          kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
+            echo "ERROR: Nodes failed to become ready"
+            kubectl describe nodes
+            kubectl get pods -A -o wide
+            exit 1
+          }
+
+          echo "✓ All nodes are ready:"
+          kubectl get nodes -o wide
+
+          # Verify CNI with a test pod
+          echo "Verifying CNI functionality..."
+          kubectl run test-cni --image=busybox:latest --rm -it --restart=Never --command -- sh -c "echo 'CNI test successful'" || {
+            echo "WARNING: CNI test pod failed, checking kindnet pods..."
+            kubectl get pods -n kube-system -l app=kindnet -o wide
+            kubectl logs -n kube-system -l app=kindnet --tail=50 || true
+          }
 
       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator
diff --git a/.github/workflows/rayjob_e2e_tests.yaml b/.github/workflows/rayjob_e2e_tests.yaml
@@ -1,4 +1,4 @@
-name: rayjob-e2e-with-kueue
+name: rayjob-e2e
 
 on:
   pull_request:
diff --git a/src/codeflare_sdk/ray/cluster/cluster.py b/src/codeflare_sdk/ray/cluster/cluster.py
@@ -783,6 +783,12 @@ def remove_autogenerated_fields(resource):
                 del resource[key]
             else:
                 remove_autogenerated_fields(resource[key])
+
+        # After cleaning, remove empty metadata sections
+        if "metadata" in resource and isinstance(resource["metadata"], dict):
+            if len(resource["metadata"]) == 0:
+                del resource["metadata"]
+
     elif isinstance(resource, list):
         for item in resource:
             remove_autogenerated_fields(item)
diff --git a/tests/e2e/rayjob/ray_version_validation_oauth_test.py b/tests/e2e/rayjob/ray_version_validation_oauth_test.py
@@ -11,10 +11,7 @@
     ManagedClusterConfig,
 )
 
-# This test validates Ray version compatibility checking for RayJob with cluster lifecycling scenarios
 
-
-@pytest.mark.openshift
 class TestRayJobRayVersionValidationOauth:
     def setup_method(self):
         initialize_kubernetes_client(self)

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-name: rayjob-e2e-with-kueue`
	`1`	`+name: rayjob-e2e`
`2`	`2`
`3`	`3`	`on:`
`4`	`4`	`pull_request:`
Original file line number	Diff line number	Diff line change
`@@ -11,10 +11,7 @@`
`11`	`11`	`ManagedClusterConfig,`
`12`	`12`	`)`
`13`	`13`
`14`		`-# This test validates Ray version compatibility checking for RayJob with cluster lifecycling scenarios`
`15`	`14`
`16`		`-`
`17`		`-@pytest.mark.openshift`
`18`	`15`	`class TestRayJobRayVersionValidationOauth:`
`19`	`16`	`def setup_method(self):`
`20`	`17`	`initialize_kubernetes_client(self)`