55 pull_request :
66 branches :
77 - main
8- - "release-*"
8+ - 'release-*'
99 - ray-jobs-feature
10- - kueue-integration
1110 paths-ignore :
12- - "docs/**"
13- - "**.adoc"
14- - "**.md"
15- - "LICENSE"
11+ - 'docs/**'
12+ - '**.adoc'
13+ - '**.md'
14+ - 'LICENSE'
1615
1716concurrency :
1817 group : ${{ github.head_ref }}-${{ github.workflow }}
1918 cancel-in-progress : true
2019
2120env :
22- CODEFLARE_OPERATOR_IMG : "quay.io/project-codeflare/codeflare-operator:dev"
2321 KUEUE_VERSION : "v0.13.4"
22+ KUBERAY_VERSION : "v1.4.0"
2423
2524jobs :
2625 kubernetes :
@@ -35,32 +34,15 @@ jobs:
3534 - name : Checkout common repo code
3635 uses : actions/checkout@v4
3736 with :
38- repository : "project-codeflare/codeflare-common"
39- ref : "main"
40- path : "common"
41-
42- - name : Checkout CodeFlare operator repository
43- uses : actions/checkout@v4
44- with :
45- repository : project-codeflare/codeflare-operator
46- path : codeflare-operator
47-
48- - name : Set Go
49- uses : actions/setup-go@v5
50- with :
51- go-version-file : "./codeflare-operator/go.mod"
52- cache-dependency-path : "./codeflare-operator/go.sum"
53-
54- - name : Set up gotestfmt
55- uses : gotesttools/gotestfmt-action@v2
56- with :
57- token : ${{ secrets.GITHUB_TOKEN }}
37+ repository : 'project-codeflare/codeflare-common'
38+ ref : 'main'
39+ path : 'common'
5840
5941 - name : Set up specific Python version
6042 uses : actions/setup-python@v5
6143 with :
62- python-version : "3.12"
63- cache : "pip" # caching pip dependencies
44+ python-version : '3.11'
45+ cache : 'pip' # caching pip dependencies
6446
6547 - name : Setup NVidia GPU environment for KinD
6648 uses : ./common/github-actions/nvidia-gpu-setup
@@ -73,48 +55,12 @@ jobs:
7355 - name : Install NVidia GPU operator for KinD
7456 uses : ./common/github-actions/nvidia-gpu-operator
7557
76- - name : Wait for nodes to be ready
77- run : |
78- echo "Waiting for all nodes to be ready..."
79- kubectl wait --for=condition=Ready nodes --all --timeout=300s
80-
81- echo "Checking node status..."
82- kubectl get nodes -o wide
83-
84- echo "Checking for CNI readiness..."
85- for i in {1..30}; do
86- if kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -v "True"; then
87- echo "Waiting for CNI to initialize (attempt $i/30)..."
88- sleep 10
89- else
90- echo "All nodes are ready!"
91- break
92- fi
93- done
94-
95- # Final verification
96- kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"
97-
98- - name : Deploy CodeFlare stack
58+ - name : Deploy KubeRay and Kueue
9959 id : deploy
10060 run : |
101- cd codeflare-operator
102- echo Setting up CodeFlare stack
103- make setup-e2e
104- echo Deploying CodeFlare operator
105- make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
106- kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
107- cd ..
108-
109- - name : Verify CodeFlare deployment
110- run : |
111- # Wait for Kueue to be ready
112- echo "Waiting for Kueue controller to be ready..."
113- kubectl wait --for=condition=Available --timeout=300s deployment -n kueue-system kueue-controller-manager || {
114- echo "Kueue deployment status:"
115- kubectl get all -n kueue-system
116- exit 1
117- }
61+ # Deploy KubeRay operator
62+ echo "Deploying KubeRay ${KUBERAY_VERSION}..."
63+ kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}&timeout=180s"
11864
11965 # Wait for KubeRay to be ready
12066 echo "Waiting for KubeRay operator to be ready..."
@@ -124,44 +70,43 @@ jobs:
12470 exit 1
12571 }
12672
127- # Verify webhook certificates
128- echo "Checking CodeFlare operator webhook certificates..."
129- kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text || {
130- echo "Warning: Webhook certificate might be missing or invalid"
73+ # Deploy Kueue
74+ echo "Deploying Kueue ${KUEUE_VERSION}..."
75+ kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
76+
77+ # Wait for Kueue to be ready
78+ echo "Waiting for Kueue controller to be ready..."
79+ kubectl wait --for=condition=Available --timeout=300s deployment -n kueue-system kueue-controller-manager || {
80+ echo "Kueue deployment status:"
81+ kubectl get all -n kueue-system
82+ exit 1
13183 }
13284
85+ echo "✓ KubeRay and Kueue deployed successfully"
86+
13387 - name : Add user to KinD
13488 uses : ./common/github-actions/kind-add-user
13589 with :
13690 user-name : sdk-user
13791
13892 - name : Configure RBAC for sdk user with limited permissions
13993 run : |
140- # CRD permissions for discovering resource types
141- kubectl create clusterrole crd-reader --verb=get,list,watch --resource=customresourcedefinitions.apiextensions.k8s.io
142- kubectl create clusterrolebinding sdk-user-crd-reader --clusterrole=crd-reader --user=sdk-user
143-
144- # AppWrapper permissions for CodeFlare workloads
145- kubectl create clusterrole appwrapper-creator --verb=get,list,watch,create,update,patch,delete --resource=appwrappers.workload.codeflare.dev
146- kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
147-
148- # Existing permissions
14994 kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
15095 kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
15196 kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
15297 kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
15398 kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters.ray.io
15499 kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
155- kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs/status
100+ kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs.ray.io/status
156101 kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
157102 kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors.kueue.x-k8s.io
158103 kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
159104 kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues.kueue.x-k8s.io
160105 kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
161106 kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues.kueue.x-k8s.io
162107 kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
163- kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
164- kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
108+ kubectl create clusterrole workload-reader --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
109+ kubectl create clusterrolebinding sdk-user-workload-reader --clusterrole=workload-reader --user=sdk-user
165110 kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
166111 kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
167112 kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -174,74 +119,28 @@ jobs:
174119 kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
175120 kubectl config use-context sdk-user
176121
177- - name : Verify cluster readiness before tests
178- run : |
179- echo "=== Pre-test cluster verification ==="
180- echo "Current context:"
181- kubectl config current-context
182-
183- echo -e "\nNode status:"
184- kubectl get nodes -o wide
185-
186- echo -e "\nSystem pods status:"
187- kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true
188-
189- echo -e "\nChecking for any pods in error state:"
190- kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"
191-
192- echo -e "\nKueue resources:"
193- kubectl get resourceflavors,clusterqueues,localqueues -A || true
194-
195- echo -e "\nRay CRDs:"
196- kubectl get crd | grep ray || true
197-
198122 - name : Run e2e tests
199123 run : |
200- export CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
201- mkdir -p ${CODEFLARE_TEST_OUTPUT_DIR}
124+ export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
202125 echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
203126
204127 set -euo pipefail
205128 pip install poetry
206129 poetry install --with test,docs
207130 echo "Running e2e tests..."
208- poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
209- env :
210- GRPC_DNS_RESOLVER : "native"
211-
212- - name : Run RayJob e2e tests
213- run : |
214- set -euo pipefail
215- echo "Running RayJob e2e tests..."
216- # Set environment variable to prevent default queue assignment for non-Kueue tests
217- export DISABLE_DEFAULT_KUEUE_QUEUE=true
218-
219- # Run only the tests that are designed for Kueue integration
220- poetry run pytest -v -s ./tests/e2e/rayjob/ -x > ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log 2>&1
131+ poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
221132 env :
222133 GRPC_DNS_RESOLVER : "native"
223134
224135 - name : Switch to kind-cluster context to print logs
225136 if : always() && steps.deploy.outcome == 'success'
226137 run : kubectl config use-context kind-cluster
227138
228- - name : Print RayJob E2E Pytest output log
229- if : always() && steps.deploy.outcome == 'success'
230- run : |
231- echo "Printing RayJob Pytest output logs"
232- cat ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log || echo "No RayJob test output found"
233-
234- - name : Print E2E Pytest output log
235- if : always() && steps.deploy.outcome == 'success'
236- run : |
237- echo "Printing E2E Pytest output logs"
238- cat ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log || echo "No E2E test output found"
239-
240- - name : Print CodeFlare operator logs
139+ - name : Print Pytest output log
241140 if : always() && steps.deploy.outcome == 'success'
242141 run : |
243- echo "Printing CodeFlare operator logs"
244- kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
142+ echo "Printing Pytest output logs"
143+ cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
245144
246145 - name : Print KubeRay operator logs
247146 if : always() && steps.deploy.outcome == 'success'
@@ -253,7 +152,7 @@ jobs:
253152 uses : ./common/github-actions/kind-export-logs
254153 if : always() && steps.deploy.outcome == 'success'
255154 with :
256- output-directory : ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
155+ output-directory : ${CODEFLARE_TEST_OUTPUT_DIR}
257156
258157 - name : Upload logs
259158 uses : actions/upload-artifact@v4
@@ -263,4 +162,3 @@ jobs:
263162 retention-days : 10
264163 path : |
265164 ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
266- if-no-files-found : warn
0 commit comments