55 pull_request :
66 branches :
77 - main
8- - ' release-*'
8+ - " release-*"
99 - ray-jobs-feature
10+ - kueue-integration
1011 paths-ignore :
11- - ' docs/**'
12- - ' **.adoc'
13- - ' **.md'
14- - ' LICENSE'
12+ - " docs/**"
13+ - " **.adoc"
14+ - " **.md"
15+ - " LICENSE"
1516
1617concurrency :
1718 group : ${{ github.head_ref }}-${{ github.workflow }}
1819 cancel-in-progress : true
1920
2021env :
2122 CODEFLARE_OPERATOR_IMG : " quay.io/project-codeflare/codeflare-operator:dev"
23+ KUEUE_VERSION : " v0.13.4"
2224
2325jobs :
2426 kubernetes :
3335 - name : Checkout common repo code
3436 uses : actions/checkout@v4
3537 with :
36- repository : ' project-codeflare/codeflare-common'
37- ref : ' main'
38- path : ' common'
38+ repository : " project-codeflare/codeflare-common"
39+ ref : " main"
40+ path : " common"
3941
4042 - name : Checkout CodeFlare operator repository
4143 uses : actions/checkout@v4
4648 - name : Set Go
4749 uses : actions/setup-go@v5
4850 with :
49- go-version-file : ' ./codeflare-operator/go.mod'
51+ go-version-file : " ./codeflare-operator/go.mod"
5052 cache-dependency-path : " ./codeflare-operator/go.sum"
5153
5254 - name : Set up gotestfmt
5759 - name : Set up specific Python version
5860 uses : actions/setup-python@v5
5961 with :
60- python-version : ' 3.11 '
61- cache : ' pip' # caching pip dependencies
62+ python-version : " 3.12 "
63+ cache : " pip" # caching pip dependencies
6264
6365 - name : Setup NVidia GPU environment for KinD
6466 uses : ./common/github-actions/nvidia-gpu-setup
7173 - name : Install NVidia GPU operator for KinD
7274 uses : ./common/github-actions/nvidia-gpu-operator
7375
- name: Wait for nodes to be ready
  run: |
    # Abort on the first failing command, unset variable, or failed pipeline stage.
    set -euo pipefail

    echo "Waiting for all nodes to be ready..."
    kubectl wait --for=condition=Ready nodes --all --timeout=300s

    echo "Checking node status..."
    kubectl get nodes -o wide

    echo "Checking for CNI readiness..."
    nodes_ready=false
    for i in {1..30}; do
      # One line per node: "<name>\t<status of the Ready condition>".
      # A failed kubectl call or an empty node list counts as "not ready yet"
      # instead of being mistaken for success.
      if node_status="$(kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}')" \
        && [ -n "${node_status}" ] \
        && ! grep -qv "True" <<<"${node_status}"; then
        echo "All nodes are ready!"
        nodes_ready=true
        break
      fi

      # Show which nodes are still not reporting Ready=True (may be empty).
      grep -v "True" <<<"${node_status:-}" || true
      echo "Waiting for CNI to initialize (attempt $i/30)..."
      sleep 10
    done

    # Without this check the loop would fall through after 30 attempts and
    # the step would pass even though nodes never became Ready.
    if [ "${nodes_ready}" != "true" ]; then
      echo "::error::Nodes did not become Ready after 30 attempts"
      kubectl get nodes -o wide || true
      exit 1
    fi

    # Final verification: purely informational, so it must not fail the step.
    kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR" || true
97+
7498 - name : Deploy CodeFlare stack
7599 id : deploy
76100 run : |
@@ -82,27 +106,62 @@ jobs:
82106 kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
83107 cd ..
84108
- name: Verify CodeFlare deployment
  run: |
    # Wait up to 300s for a deployment to report Available. On failure, dump
    # everything in its namespace for debugging and abort the step.
    #   $1 = component label used in log messages
    #   $2 = noun used in the "Waiting for ..." message
    #   $3 = namespace
    #   $4 = deployment name
    require_deployment() {
      local label="$1" noun="$2" namespace="$3" deployment="$4"
      echo "Waiting for ${label} ${noun} to be ready..."
      if ! kubectl wait --for=condition=Available --timeout=300s deployment -n "${namespace}" "${deployment}"; then
        echo "${label} deployment status:"
        kubectl get all -n "${namespace}"
        exit 1
      fi
    }

    require_deployment "Kueue" "controller" kueue-system kueue-controller-manager
    require_deployment "KubeRay" "operator" default kuberay-operator

    # Decode the webhook CA certificate and let openssl parse it. A failure
    # here only produces a warning; it does not fail the step.
    echo "Checking CodeFlare operator webhook certificates..."
    if ! kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text; then
      echo "Warning: Webhook certificate might be missing or invalid"
    fi
132+
85133 - name : Add user to KinD
86134 uses : ./common/github-actions/kind-add-user
87135 with :
88136 user-name : sdk-user
89137
90138 - name : Configure RBAC for sdk user with limited permissions
91139 run : |
140+ # CRD permissions for discovering resource types
141+ kubectl create clusterrole crd-reader --verb=get,list,watch --resource=customresourcedefinitions --apiGroup=apiextensions.k8s.io
142+ kubectl create clusterrolebinding sdk-user-crd-reader --clusterrole=crd-reader --user=sdk-user
143+
144+ # AppWrapper permissions for CodeFlare workloads
145+ kubectl create clusterrole appwrapper-creator --verb=get,list,watch,create,update,patch,delete --resource=appwrappers --apiGroup=workload.codeflare.dev
146+ kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
147+
148+ # Existing permissions
92149 kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
93150 kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
94151 kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
95152 kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
96- kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
153+ kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters
97154 kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
98- kubectl create clusterrole appwrapper -creator --verb=get,list,create,delete,patch --resource=appwrappers
99- kubectl create clusterrolebinding sdk-user-appwrapper -creator --clusterrole=appwrapper -creator --user=sdk-user
155+ kubectl create clusterrole rayjob -creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs,rayjobs/status
156+ kubectl create clusterrolebinding sdk-user-rayjob -creator --clusterrole=rayjob -creator --user=sdk-user
100157 kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
101158 kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
102159 kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
103160 kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
104161 kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
105162 kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
163+ kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads
164+ kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
106165 kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
107166 kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
108167 kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -111,30 +170,72 @@ jobs:
111170 kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
112171 kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
113172 kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
173+ kubectl create clusterrole node-reader --verb=get,list --resource=nodes
174+ kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
114175 kubectl config use-context sdk-user
115176
- name: Verify cluster readiness before tests
  run: |
    # Print a section heading preceded by one blank line.
    heading() {
      printf '\n%s\n' "$1"
    }

    # Diagnostic snapshot of the cluster taken right before the test steps.
    # Lookups that may legitimately return nothing are guarded so they
    # cannot fail the step.
    echo "=== Pre-test cluster verification ==="
    echo "Current context:"
    kubectl config current-context

    heading "Node status:"
    kubectl get nodes -o wide

    heading "System pods status:"
    kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true

    # Anything not Running or Succeeded is listed; the header row is dropped.
    heading "Checking for any pods in error state:"
    kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"

    heading "Kueue resources:"
    kubectl get resourceflavors,clusterqueues,localqueues -A || true

    heading "Ray CRDs:"
    kubectl get crd | grep ray || true
197+
- name: Run e2e tests
  run: |
    # Enable strict mode first so the setup commands below are covered too.
    set -euo pipefail

    # Collect test logs under the runner temp directory and publish the
    # location through GITHUB_ENV so later steps can read the same variable.
    export CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
    mkdir -p "${CODEFLARE_TEST_OUTPUT_DIR}"
    echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> "${GITHUB_ENV}"

    pip install poetry
    poetry install --with test,docs
    echo "Running e2e tests..."
    # Only tests carrying both the `kind` and `nvidia_gpu` markers are selected.
    # stdout and stderr go to the log file.
    poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > "${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log" 2>&1
  env:
    GRPC_DNS_RESOLVER: "native"
211+
- name: Run RayJob e2e tests
  run: |
    set -euo pipefail
    echo "Running RayJob e2e tests..."
    # Exported for the pytest process below.
    # NOTE(review): presumably disables default Kueue queue assignment in the
    # code under test; confirm against the test suite.
    export DISABLE_DEFAULT_KUEUE_QUEUE=true

    # Run the RayJob e2e suite; -x stops at the first failing test.
    # CODEFLARE_TEST_OUTPUT_DIR must already be set in the job environment;
    # with `set -u` the step aborts if it is not.
    poetry run pytest -v -s ./tests/e2e/rayjob/ -x > "${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log" 2>&1
  env:
    GRPC_DNS_RESOLVER: "native"
128223
129224 - name : Switch to kind-cluster context to print logs
130225 if : always() && steps.deploy.outcome == 'success'
131226 run : kubectl config use-context kind-cluster
132227
133- - name : Print Pytest output log
- name: Print RayJob E2E Pytest output log
  # Runs even after a failed test step, as long as the deploy step succeeded.
  if: always() && steps.deploy.outcome == 'success'
  run: |
    echo "Printing RayJob Pytest output logs"
    # Quoted so a path containing spaces is passed to cat as one argument.
    # A missing log prints a notice and does not fail the step.
    cat "${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log" || echo "No RayJob test output found"
233+
- name: Print E2E Pytest output log
  # Runs even after a failed test step, as long as the deploy step succeeded.
  if: always() && steps.deploy.outcome == 'success'
  run: |
    echo "Printing E2E Pytest output logs"
    # Quoted so a path containing spaces is passed to cat as one argument.
    # A missing log prints a notice and does not fail the step.
    cat "${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log" || echo "No E2E test output found"
138239
139240 - name : Print CodeFlare operator logs
140241 if : always() && steps.deploy.outcome == 'success'
@@ -152,7 +253,7 @@ jobs:
152253 uses : ./common/github-actions/kind-export-logs
153254 if : always() && steps.deploy.outcome == 'success'
154255 with :
155- output-directory : ${CODEFLARE_TEST_OUTPUT_DIR}
256+ output-directory : ${{ env. CODEFLARE_TEST_OUTPUT_DIR } }
156257
157258 - name : Upload logs
158259 uses : actions/upload-artifact@v4
@@ -162,3 +263,4 @@ jobs:
162263 retention-days : 10
163264 path : |
164265 ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
266+ if-no-files-found : warn
0 commit comments