# Run on pull requests that target any of the listed branches.
pull_request:
  branches:
    - main
    - "release-*"
    - ray-jobs-feature
    - kueue-integration
  # Pull requests that only touch these paths do not trigger the workflow.
  paths-ignore:
    - "docs/**"
    - "**.adoc"
    - "**.md"
    - "LICENSE"
1516
# Runs are grouped by PR head ref plus workflow name; starting a new run in
# a group cancels the run that is still in progress.
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true
1920
# Workflow-level environment variables, visible to every job and step.
env:
  # Operator image reference.
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
  # Kueue release; presumably consumed by `make setup-e2e` -- TODO confirm.
  KUEUE_VERSION: "v0.13.4"
2224
2325jobs :
2426 kubernetes :
@@ -30,24 +32,17 @@ jobs:
3032 with :
3133 submodules : recursive
3234
33- - name : Checkout common repo code
# Clone ray-project/kuberay into ./kuberay. No `ref` is given, so the
# repository's default branch is checked out.
- name: Checkout Kuberay operator repository
  uses: actions/checkout@v4
  with:
    repository: ray-project/kuberay
    path: kuberay
4540
# Install the Go version declared by the checked-out kuberay module and key
# the dependency cache on its go.sum.
# NOTE(review): assumes go.mod/go.sum live at the root of ./kuberay -- confirm.
- name: Set Go
  uses: actions/setup-go@v5
  with:
    go-version-file: "./kuberay/go.mod"
    cache-dependency-path: "./kuberay/go.sum"
5146
5247 - name : Set up gotestfmt
5348 uses : gotesttools/gotestfmt-action@v2
@@ -57,33 +52,74 @@ jobs:
# The version is quoted so YAML keeps it a string rather than a float.
- name: Set up specific Python version
  uses: actions/setup-python@v5
  with:
    python-version: "3.12"
    cache: "pip" # caching pip dependencies
6257
# Local composite action from this repository's .github directory.
- name: Setup NVidia GPU environment for KinD
  uses: ./.github/nvidia/nvidia-gpu-setup
6560
# Local composite action; requests a single worker node.
- name: Setup and start KinD cluster
  uses: ./.github/kind
  with:
    worker-nodes: 1
7065
# Local composite action from this repository's .github directory.
- name: Install NVidia GPU operator for KinD
  uses: ./.github/nvidia/nvidia-gpu-operator
7368
74- - name : Deploy CodeFlare stack
# Gate the rest of the job on a healthy cluster: first wait for the Ready
# condition, then poll node status, and fail the step if the nodes never
# settle.
- name: Wait for nodes to be ready
  run: |
    # A timeout here returns non-zero and fails the step.
    echo "Waiting for all nodes to be ready..."
    kubectl wait --for=condition=Ready nodes --all --timeout=300s

    echo "Checking node status..."
    kubectl get nodes -o wide

    # Poll up to 30 times, 10s apart. The jsonpath prints one
    # "<node>\t<Ready status>" line per node; `grep -v "True"` prints and
    # succeeds on any node whose Ready status is not "True".
    echo "Checking for CNI readiness..."
    nodes_ready=false
    for i in {1..30}; do
      if kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -v "True"; then
        echo "Waiting for CNI to initialize (attempt $i/30)..."
        sleep 10
      else
        echo "All nodes are ready!"
        nodes_ready=true
        break
      fi
    done

    # Without this check an exhausted loop would fall through and the job
    # would continue against a cluster that is not ready.
    if [ "$nodes_ready" != "true" ]; then
      echo "Nodes did not become ready after 30 attempts"
      kubectl get nodes -o wide
      exit 1
    fi

    # Final verification
    kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"
90+
# Later steps gate on `steps.deploy.outcome`, so the `id` must stay `deploy`.
- name: Deploy Kuberay & Kueue
  id: deploy
  run: |
    # Wait up to 300s for a deployment to become Available; on failure dump
    # the namespace contents and abort the step.
    #   $1 = namespace, $2 = deployment name, $3 = label for the log message
    wait_for_deployment() {
      kubectl wait --for=condition=Available --timeout=300s deployment -n "$1" "$2" || {
        echo "$3 deployment status:"
        kubectl get all -n "$1"
        exit 1
      }
    }

    echo Setting up Kuberay and Kueue
    make setup-e2e

    echo "Waiting for Kueue controller to be ready..."
    wait_for_deployment kueue-system kueue-controller-manager Kueue

    echo "Waiting for KubeRay operator to be ready..."
    wait_for_deployment ray-system kuberay-operator KubeRay

    # The webhook certificate is inspected only when the CodeFlare operator
    # deployment exists; an unreadable certificate is a warning, not a failure.
    if kubectl get deployment -n openshift-operators codeflare-operator-manager 2>/dev/null; then
      echo "Checking CodeFlare operator webhook certificates..."
      kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text || {
        echo "Warning: Webhook certificate might be missing or invalid"
      }
    fi
84120
# Local composite action; registers the `sdk-user` identity that the RBAC
# commands and the tests later run as.
- name: Add user to KinD
  uses: ./.github/kind/kind-add-user
  with:
    user-name: sdk-user
89125
@@ -93,16 +129,18 @@ jobs:
# Each resource gets a ClusterRole plus a ClusterRoleBinding to sdk-user.

# Ingresses (the matching ClusterRole is created earlier in this script)
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user

# Namespaces
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user

# Ray: clusters, and jobs including their status subresource
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs,rayjobs/status
kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user

# Kueue: flavors and queues are writable, workloads are read-only
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads
kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user

# Core resources: read-only secrets and pods
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -111,8 +149,31 @@ jobs:
# Services (the matching ClusterRole is created earlier in this script)
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user

# Port-forwarding to pods
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user

# Read-only access to nodes
kubectl create clusterrole node-reader --verb=get,list --resource=nodes
kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user

# From here on kubectl runs as sdk-user with only the permissions granted above.
kubectl config use-context sdk-user
115155
# Diagnostic snapshot taken before the tests; the grep-based queries are
# best-effort (`|| true` / `|| echo`) so an empty result never fails the step.
# NOTE(review): the preceding RBAC script ends with
# `kubectl config use-context sdk-user`, so these queries run with that
# user's permissions; `kubectl get crd` may be forbidden -- confirm.
- name: Verify cluster readiness before tests
  run: |
    # Print a blank line followed by a section title.
    section() {
      echo -e "\n$1"
    }

    echo "=== Pre-test cluster verification ==="
    echo "Current context:"
    kubectl config current-context

    section "Node status:"
    kubectl get nodes -o wide

    section "System pods status:"
    kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true

    # Lists pods whose phase is neither Running nor Succeeded; the header
    # row is filtered out so an empty result falls through to the message.
    section "Checking for any pods in error state:"
    kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"

    section "Kueue resources:"
    kubectl get resourceflavors,clusterqueues,localqueues -A || true

    section "Ray CRDs:"
    kubectl get crd | grep ray || true
176+
116177 - name : Run e2e tests
117178 run : |
118179 export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
@@ -122,19 +183,37 @@ jobs:
122183 pip install poetry
123184 poetry install --with test,docs
124185 echo "Running e2e tests..."
125- poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
186+ poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
187+ env :
188+ GRPC_DNS_RESOLVER : " native"
189+
# Runs the test suite under tests/e2e/rayjob. This step has no `if:`, so it
# is skipped whenever an earlier step in the job has failed.
- name: Run RayJob e2e tests
  env:
    GRPC_DNS_RESOLVER: "native"
  run: |
    set -euo pipefail
    echo "Running RayJob e2e tests..."

    # Exported for the pytest process; the code that reads this variable is
    # not part of this workflow.
    export DISABLE_DEFAULT_KUEUE_QUEUE=true

    # -x stops at the first failing test. stdout and stderr go to the log
    # file, which a later step prints.
    poetry run pytest -v -s ./tests/e2e/rayjob/ -x > ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log 2>&1
128201
# Runs even after test failures, provided the deploy step succeeded, so the
# log-printing steps that follow use the kind-cluster context.
- name: Switch to kind-cluster context to print logs
  if: always() && steps.deploy.outcome == 'success'
  run: kubectl config use-context kind-cluster
132205
133- - name : Print Pytest output log
# Dump the RayJob pytest log. This step runs under `always()`, but the
# RayJob test step is skipped when an earlier step fails, so the log file
# may not exist; a bare `cat` would then fail this step with a misleading
# error.
- name: Print RayJob E2E Pytest output log
  if: always() && steps.deploy.outcome == 'success'
  run: |
    echo "Printing Pytest output logs"
    log_file="${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log"
    if [ -f "${log_file}" ]; then
      cat "${log_file}"
    else
      echo "No RayJob pytest output log found at ${log_file} (tests did not run)"
    fi
211+
# Dump the e2e pytest log. This step runs under `always()`, so it can run
# even when a step between deploy and the tests failed and the log was
# never written; guard the `cat` so that case is reported cleanly.
- name: Print E2E Pytest output log
  if: always() && steps.deploy.outcome == 'success'
  run: |
    echo "Printing Pytest output logs"
    log_file="${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log"
    if [ -f "${log_file}" ]; then
      cat "${log_file}"
    else
      echo "No e2e pytest output log found at ${log_file} (tests did not run)"
    fi
138217
139218 - name : Print CodeFlare operator logs
140219 if : always() && steps.deploy.outcome == 'success'
@@ -149,7 +228,7 @@ jobs:
149228 kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
150229
# Local composite action; runs whenever the deploy step succeeded.
# NOTE(review): `${CODEFLARE_TEST_OUTPUT_DIR}` is not a `${{ }}` expression,
# so it reaches the action as literal text; presumably the action expands it
# in a shell -- confirm.
- name: Export all KinD pod logs
  uses: ./.github/kind/kind-export-logs
  if: always() && steps.deploy.outcome == 'success'
  with:
    output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
@@ -162,3 +241,4 @@ jobs:
162241 retention-days : 10
163242 path : |
164243 ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
244+ if-no-files-found : warn
0 commit comments