55 pull_request :
66 branches :
77 - main
8- - " release-*"
8+ - ' release-*'
99 - ray-jobs-feature
10- - kueue-integration
1110 paths-ignore :
12- - " docs/**"
13- - " **.adoc"
14- - " **.md"
15- - " LICENSE"
11+ - ' docs/**'
12+ - ' **.adoc'
13+ - ' **.md'
14+ - ' LICENSE'
1615
1716concurrency :
1817 group : ${{ github.head_ref }}-${{ github.workflow }}
3534 - name : Checkout common repo code
3635 uses : actions/checkout@v4
3736 with :
38- repository : " project-codeflare/codeflare-common"
39- ref : " main"
40- path : " common"
37+ repository : ' project-codeflare/codeflare-common'
38+ ref : ' main'
39+ path : ' common'
4140
4241 - name : Checkout CodeFlare operator repository
4342 uses : actions/checkout@v4
4847 - name : Set Go
4948 uses : actions/setup-go@v5
5049 with :
51- go-version-file : " ./codeflare-operator/go.mod"
50+ go-version-file : ' ./codeflare-operator/go.mod'
5251 cache-dependency-path : " ./codeflare-operator/go.sum"
5352
5453 - name : Set up gotestfmt
5958 - name : Set up specific Python version
6059 uses : actions/setup-python@v5
6160 with :
62- python-version : " 3.12"
63- cache : " pip" # caching pip dependencies
61+ python-version : ' 3.12'
62+ cache : ' pip' # caching pip dependencies
6463
6564 - name : Setup NVidia GPU environment for KinD
6665 uses : ./common/github-actions/nvidia-gpu-setup
7372 - name : Install NVidia GPU operator for KinD
7473 uses : ./common/github-actions/nvidia-gpu-operator
7574
76- - name : Wait for nodes to be ready
77- run : |
78- echo "Waiting for all nodes to be ready..."
79- kubectl wait --for=condition=Ready nodes --all --timeout=300s
80-
81- echo "Checking node status..."
82- kubectl get nodes -o wide
83-
84- echo "Checking for CNI readiness..."
85- for i in {1..30}; do
86- if kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -v "True"; then
87- echo "Waiting for CNI to initialize (attempt $i/30)..."
88- sleep 10
89- else
90- echo "All nodes are ready!"
91- break
92- fi
93- done
94-
95- # Final verification
96- kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"
97-
9875 - name : Deploy CodeFlare stack
9976 id : deploy
10077 run : |
@@ -106,62 +83,27 @@ jobs:
10683 kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
10784 cd ..
10885
109- - name : Verify CodeFlare deployment
110- run : |
111- # Wait for Kueue to be ready
112- echo "Waiting for Kueue controller to be ready..."
113- kubectl wait --for=condition=Available --timeout=300s deployment -n kueue-system kueue-controller-manager || {
114- echo "Kueue deployment status:"
115- kubectl get all -n kueue-system
116- exit 1
117- }
118-
119- # Wait for KubeRay to be ready
120- echo "Waiting for KubeRay operator to be ready..."
121- kubectl wait --for=condition=Available --timeout=300s deployment -n default kuberay-operator || {
122- echo "KubeRay deployment status:"
123- kubectl get all -n default
124- exit 1
125- }
126-
127- # Verify webhook certificates
128- echo "Checking CodeFlare operator webhook certificates..."
129- kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text || {
130- echo "Warning: Webhook certificate might be missing or invalid"
131- }
132-
13386 - name : Add user to KinD
13487 uses : ./common/github-actions/kind-add-user
13588 with :
13689 user-name : sdk-user
13790
13891 - name : Configure RBAC for sdk user with limited permissions
13992 run : |
140- # CRD permissions for discovering resource types
141- kubectl create clusterrole crd-reader --verb=get,list,watch --resource=customresourcedefinitions.apiextensions.k8s.io
142- kubectl create clusterrolebinding sdk-user-crd-reader --clusterrole=crd-reader --user=sdk-user
143-
144- # AppWrapper permissions for CodeFlare workloads
145- kubectl create clusterrole appwrapper-creator --verb=get,list,watch,create,update,patch,delete --resource=appwrappers.workload.codeflare.dev
146- kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
147-
148- # Existing permissions
14993 kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
15094 kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
15195 kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
15296 kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
153- kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters.ray.io
97+ kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
15498 kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
155- kubectl create clusterrole rayjob -creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs/status
156- kubectl create clusterrolebinding sdk-user-rayjob -creator --clusterrole=rayjob -creator --user=sdk-user
157- kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors.kueue.x-k8s.io
99+ kubectl create clusterrole appwrapper -creator --verb=get,list,create,delete,patch --resource=appwrappers
100+ kubectl create clusterrolebinding sdk-user-appwrapper -creator --clusterrole=appwrapper -creator --user=sdk-user
101+ kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
158102 kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
159- kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues.kueue.x-k8s.io
103+ kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
160104 kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
161- kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues.kueue.x-k8s.io
105+ kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
162106 kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
163- kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
164- kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
165107 kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
166108 kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
167109 kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -170,72 +112,30 @@ jobs:
170112 kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
171113 kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
172114 kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
173- kubectl create clusterrole node-reader --verb=get,list --resource=nodes
174- kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
175115 kubectl config use-context sdk-user
176116
177- - name : Verify cluster readiness before tests
178- run : |
179- echo "=== Pre-test cluster verification ==="
180- echo "Current context:"
181- kubectl config current-context
182-
183- echo -e "\nNode status:"
184- kubectl get nodes -o wide
185-
186- echo -e "\nSystem pods status:"
187- kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true
188-
189- echo -e "\nChecking for any pods in error state:"
190- kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"
191-
192- echo -e "\nKueue resources:"
193- kubectl get resourceflavors,clusterqueues,localqueues -A || true
194-
195- echo -e "\nRay CRDs:"
196- kubectl get crd | grep ray || true
197-
198117 - name : Run e2e tests
199118 run : |
200- export CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
201- mkdir -p ${CODEFLARE_TEST_OUTPUT_DIR}
119+ export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
202120 echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
203121
204122 set -euo pipefail
205123 pip install poetry
206124 poetry install --with test,docs
207125 echo "Running e2e tests..."
208- poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
209- env :
210- GRPC_DNS_RESOLVER : " native"
211-
212- - name : Run RayJob e2e tests
213- run : |
214- set -euo pipefail
215- echo "Running RayJob e2e tests..."
216- # Set environment variable to prevent default queue assignment for non-Kueue tests
217- export DISABLE_DEFAULT_KUEUE_QUEUE=true
218-
219- # Run only the tests that are designed for Kueue integration
220- poetry run pytest -v -s ./tests/e2e/rayjob/ -x > ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log 2>&1
126+ poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
221127 env :
222128 GRPC_DNS_RESOLVER : " native"
223129
224130 - name : Switch to kind-cluster context to print logs
225131 if : always() && steps.deploy.outcome == 'success'
226132 run : kubectl config use-context kind-cluster
227133
228- - name : Print RayJob E2E Pytest output log
229- if : always() && steps.deploy.outcome == 'success'
230- run : |
231- echo "Printing RayJob Pytest output logs"
232- cat ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log || echo "No RayJob test output found"
233-
234- - name : Print E2E Pytest output log
134+ - name : Print Pytest output log
235135 if : always() && steps.deploy.outcome == 'success'
236136 run : |
237- echo "Printing E2E Pytest output logs"
238- cat ${CODEFLARE_TEST_OUTPUT_DIR}/e2e- pytest_output.log || echo "No E2E test output found"
137+ echo "Printing Pytest output logs"
138+ cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
239139
240140 - name : Print CodeFlare operator logs
241141 if : always() && steps.deploy.outcome == 'success'
@@ -253,7 +153,7 @@ jobs:
253153 uses : ./common/github-actions/kind-export-logs
254154 if : always() && steps.deploy.outcome == 'success'
255155 with :
256- output-directory : ${{ env. CODEFLARE_TEST_OUTPUT_DIR } }
156+ output-directory : ${CODEFLARE_TEST_OUTPUT_DIR}
257157
258158 - name : Upload logs
259159 uses : actions/upload-artifact@v4
@@ -263,4 +163,3 @@ jobs:
263163 retention-days : 10
264164 path : |
265165 ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
266- if-no-files-found : warn
0 commit comments