@@ -19,6 +19,8 @@ concurrency:
1919
2020env :
2121 CODEFLARE_OPERATOR_IMG : "quay.io/project-codeflare/codeflare-operator:dev"
22+ # Run KIND on Docker's "bridge" network to avoid network issues (this selects the network; it does not set an MTU)
23+ KIND_EXPERIMENTAL_DOCKER_NETWORK : "bridge"
2224
2325jobs :
2426 kubernetes :
6062 python-version : ' 3.11'
6163 cache : ' pip' # caching pip dependencies
6264
# Installs the kind v0.22.0 binary, then installs kubectl and podman only when they are not already on PATH.
65+ - name : Install required tools
66+ run : |
67+ echo "Installing KIND..."
# -f makes curl exit non-zero on an HTTP error instead of saving the error page as the binary.
68+ curl -fLo ./kind https://kind.sigs.k8s.io/dl/v0.22.0/kind-linux-amd64
69+ chmod +x ./kind
70+ sudo mv ./kind /usr/local/bin/kind
71+ kind version
72+
73+ echo "Installing kubectl if not present..."
74+ if ! command -v kubectl &> /dev/null; then
# The inner curl resolves the current stable release tag; both requests fail on HTTP errors.
75+ curl -fLO "https://dl.k8s.io/release/$(curl -fLs https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
76+ chmod +x kubectl
77+ sudo mv kubectl /usr/local/bin/
78+ fi
79+ kubectl version --client
80+
81+ echo "Checking for podman..."
82+ if ! command -v podman &> /dev/null; then
83+ echo "Podman not found, installing..."
84+ sudo apt-get update
# Best-effort: a failed podman install is only logged and does not fail the step.
85+ sudo apt-get install -y podman || echo "Podman installation failed, might not be needed"
86+ fi
87+
# Runs the repository-local action at ./common/github-actions/nvidia-gpu-setup.
6388 - name : Setup NVidia GPU environment for KinD
6489 uses : ./common/github-actions/nvidia-gpu-setup
6590
6893 with :
6994 worker-nodes : 1
7095
# Diagnostic step: lists the kind clusters, prints the current kubectl context and the nodes,
# then waits up to 300s for every node to report the Ready condition.
96+ - name : Verify Kind cluster
97+ run : |
98+ echo "Checking Kind clusters..."
99+ kind get clusters
100+ echo "Current kubectl context:"
101+ kubectl config current-context
102+ echo "Checking nodes:"
103+ kubectl get nodes
104+
105+ echo "Waiting for nodes to be ready..."
106+ kubectl wait --for=condition=Ready nodes --all --timeout=300s
107+
108+
# Runs the repository-local action at ./common/github-actions/nvidia-gpu-operator.
71109 - name : Install NVidia GPU operator for KinD
72110 uses : ./common/github-actions/nvidia-gpu-operator
73111
# Applies the KubeRay operator kustomization at the ref pinned in KUBERAY_VERSION, then waits up to
# 300s for deployment/kuberay-operator in the ray-system namespace to become Available.
# If the wait fails, the kuberay pods across all namespaces are listed and the step exits non-zero.
112+ - name : Deploy KubeRay operator
113+ run : |
114+ KUBERAY_VERSION="v1.2.2"
115+ echo "Deploying KubeRay ${KUBERAY_VERSION}"
116+ kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}&timeout=180s"
117+
118+ echo "Waiting for KubeRay operator to become ready..."
119+ kubectl wait --for=condition=Available --timeout=300s deployment/kuberay-operator -n ray-system || {
120+ echo "KubeRay operator not found, checking pods..."
121+ kubectl get pods -A | grep kuberay
122+ exit 1
123+ }
124+
74125 - name : Deploy CodeFlare stack
75126 id : deploy
76127 run : |
@@ -113,11 +164,15 @@ jobs:
113164 kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
114165 kubectl config use-context sdk-user
115166
116- - name : Run e2e tests
# Creates the test log directory under the runner's temp directory and exports its path
# to later steps by appending CODEFLARE_TEST_OUTPUT_DIR to GITHUB_ENV.
167+ - name : Setup test output directory
117168 run : |
118- export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
169+ CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
# Quoted so a temp path containing spaces is not split into several arguments.
170+ mkdir -p "${CODEFLARE_TEST_OUTPUT_DIR}"
119171 echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
120172
173+ - name : Run e2e tests
174+ run : |
175+
121176 set -euo pipefail
122177 pip install poetry
123178 poetry install --with test,docs
@@ -152,7 +207,7 @@ jobs:
152207 uses : ./common/github-actions/kind-export-logs
# Runs even when earlier steps failed, provided the step with id "deploy" succeeded.
153208 if : always() && steps.deploy.outcome == 'success'
154209 with :
155- output-directory : ${CODEFLARE_TEST_OUTPUT_DIR}
# Reads the directory from the env context; the closing delimiter must be an unbroken "}}".
210+ output-directory : ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
156211
157212 - name : Upload logs
158213 uses : actions/upload-artifact@v4
@@ -162,3 +217,4 @@ jobs:
162217 retention-days : 10
163218 path : |
164219 ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
220+ if-no-files-found : warn
0 commit comments