Skip to content

Commit 3a3252a

Browse files
committed
RHOAIENG-32532: Run RayJob tests in CI
1 parent d259ec1 commit 3a3252a

File tree

1 file changed

+248
-0
lines changed

1 file changed

+248
-0
lines changed
Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
# Pull-request workflow for the RayJob end-to-end tests (job and steps are defined under `jobs`).
name: rayjob-e2e-with-kueue

on:
  pull_request:
    branches:
      - 'main'
      - 'release-*'
      - 'ray-jobs-feature'
    # Changes limited to these paths do not trigger the workflow.
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'

# One active run per PR head branch and workflow; a newer run cancels the one in progress.
concurrency:
  cancel-in-progress: true
  group: ${{ github.head_ref }}-${{ github.workflow }}

# Workflow-wide environment, visible to every step.
env:
  CODEFLARE_OPERATOR_IMG: 'quay.io/project-codeflare/codeflare-operator:dev'
  KUEUE_VERSION: 'v0.13.3'
22+
23+
jobs:
  # The workflow's single job; it runs on a GitHub-hosted Ubuntu runner.
  kubernetes:
    runs-on: 'ubuntu-latest'

    steps:
28+
# The repository under test, including its git submodules.
- name: Checkout code
  uses: actions/checkout@v4
  with:
    submodules: 'recursive'

# Checked out into ./common; later steps use the composite actions under ./common/github-actions.
- name: Checkout common repo code
  uses: actions/checkout@v4
  with:
    path: common
    ref: main
    repository: project-codeflare/codeflare-common

# Checked out into ./codeflare-operator; the Go setup step reads go.mod and go.sum from there.
- name: Checkout CodeFlare operator repository
  uses: actions/checkout@v4
  with:
    path: 'codeflare-operator'
    repository: 'project-codeflare/codeflare-operator'
45+
46+
# Kueue sources, checked out into ./kueue for the "Build and install Kueue" step.
# The ref follows the workflow-level KUEUE_VERSION instead of the moving `main` branch,
# so the declared version is what actually gets built and CI runs are reproducible.
- name: Checkout Kueue repository
  uses: actions/checkout@v4
  with:
    repository: kubernetes-sigs/kueue
    path: kueue
    ref: ${{ env.KUEUE_VERSION }}
52+
53+
# Go version is taken from the CodeFlare operator's go.mod; both go.sum files key the cache.
- name: Set Go
  uses: actions/setup-go@v5
  with:
    cache-dependency-path: |
      ./codeflare-operator/go.sum
      ./kueue/go.sum
    go-version-file: ./codeflare-operator/go.mod

- name: Set up gotestfmt
  uses: gotesttools/gotestfmt-action@v2
  with:
    token: ${{ secrets.GITHUB_TOKEN }}

- name: Set up specific Python version
  uses: actions/setup-python@v5
  with:
    # caching pip dependencies
    cache: pip
    # Quoted so the version stays a string.
    python-version: '3.11'
71+
72+
# Starts the KinD cluster via the composite action from the ./common checkout.
- name: Setup and start KinD cluster
  uses: ./common/github-actions/kind
  with:
    # Multiple nodes for testing Kueue scheduling
    worker-nodes: 2

# Diagnostic output only: lists clusters, the active kubectl context and the nodes.
- name: Verify Kind cluster
  run: |
    echo "Checking Kind clusters..."
    kind get clusters

    echo "Current kubectl context:"
    kubectl config current-context

    echo "Checking nodes:"
    kubectl get nodes
85+
86+
# Builds Kueue from the ./kueue checkout, loads the resulting image into the KinD cluster,
# deploys config/default and pins the controller to the locally loaded image.
- name: Build and install Kueue
  run: |
    # Single definition of the image repository used by every command below.
    KUEUE_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue"

    cd kueue
    echo "Building Kueue..."
    make manifests
    make install
    make kind-image-build

    # Get the actual cluster name (the first one kind reports) and stop early if there is none,
    # rather than passing an empty --name to `kind load`.
    CLUSTER_NAME="$(kind get clusters | head -n 1)"
    if [ -z "${CLUSTER_NAME}" ]; then
      echo "No Kind cluster found; cannot load the Kueue image" >&2
      exit 1
    fi
    echo "Using Kind cluster: ${CLUSTER_NAME}"

    # Load Kueue image into Kind - this loads to all nodes
    # NOTE(review): assumes `make kind-image-build` tags the image with this same
    # `git describe` output - confirm against the Kueue Makefile.
    IMAGE_TAG="$(git describe --tags --dirty --always)"
    echo "Loading image: ${KUEUE_IMAGE}:${IMAGE_TAG}"
    kind load docker-image "${KUEUE_IMAGE}:${IMAGE_TAG}" --name "${CLUSTER_NAME}"

    # Also load with 'main' tag as fallback
    docker tag "${KUEUE_IMAGE}:${IMAGE_TAG}" "${KUEUE_IMAGE}:main"
    kind load docker-image "${KUEUE_IMAGE}:main" --name "${CLUSTER_NAME}"

    # Deploy Kueue
    kubectl apply --server-side --force-conflicts -k config/default

    # Patch to use the specific image tag and Never pull policy, so only the image
    # loaded above can be used. The tag is spliced in as a quoted expansion.
    kubectl patch deployment kueue-controller-manager -n kueue-system --type='json' -p='[
      {"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "'"${KUEUE_IMAGE}:${IMAGE_TAG}"'"},
      {"op": "replace", "path": "/spec/template/spec/containers/0/imagePullPolicy", "value": "Never"}
    ]'

    # Wait for rollout to complete
    kubectl rollout status deployment/kueue-controller-manager -n kueue-system --timeout=120s
123+
124+
# Installs KubeRay v1.2.2 from its upstream kustomization and waits for the operator
# deployment in ray-system to become Available.
- name: Deploy KubeRay operator
  run: |
    echo "Installing KubeRay operator..."
    # Install KubeRay CRDs and operator. Best-effort: a failure here is tolerated and
    # surfaces through the availability check at the end of this step.
    kubectl create -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v1.2.2&timeout=300s" || true

    # Ensure the namespace exists. This uses a plain existence check because namespaces
    # report no "Exists" condition for `kubectl wait --for=condition=...` to match; the
    # check is immediate and does not fail when the namespace is already there.
    kubectl get namespace ray-system > /dev/null 2>&1 || kubectl create namespace ray-system

    # Check if deployment exists and wait for it; dump the namespace before failing.
    kubectl wait --timeout=300s --for=condition=Available=true deployment -n ray-system kuberay-operator || {
      echo "KubeRay operator deployment not found, checking pods..."
      kubectl get all -n ray-system
      exit 1
    }
139+
140+
# Adds the `sdk-user` identity through the composite action from the ./common checkout.
- name: Add user to KinD
  uses: ./common/github-actions/kind-add-user
  with:
    user-name: 'sdk-user'

# Creates one ClusterRole plus one ClusterRoleBinding per resource for sdk-user,
# then switches the active kubectl context to that user.
- name: Configure RBAC for sdk user with limited permissions
  run: |
    # grant ROLE VERBS RESOURCE BINDING
    # Creates ClusterRole ROLE allowing VERBS on RESOURCE, then binds it to sdk-user as BINDING.
    grant() {
      kubectl create clusterrole "$1" --verb="$2" --resource="$3"
      kubectl create clusterrolebinding "$4" --clusterrole="$1" --user=sdk-user
    }

    # Basic permissions
    grant list-ingresses get,list ingresses sdk-user-list-ingresses
    grant namespace-creator get,list,create,delete,patch namespaces sdk-user-namespace-creator

    # Ray permissions
    grant raycluster-creator get,list,create,delete,patch,watch rayclusters sdk-user-raycluster-creator
    grant rayjob-creator get,list,create,delete,patch,watch,update rayjobs sdk-user-rayjob-creator

    # Kueue permissions
    grant resourceflavor-creator get,list,create,delete resourceflavors sdk-user-resourceflavor-creator
    grant clusterqueue-creator get,list,create,delete,patch clusterqueues sdk-user-clusterqueue-creator
    grant localqueue-creator get,list,create,delete,patch localqueues sdk-user-localqueue-creator
    grant workload-creator get,list,watch workloads sdk-user-workload-creator

    # Additional permissions
    grant list-secrets get,list secrets sdk-user-list-secrets
    grant pod-creator get,list,watch pods sdk-user-pod-creator
    grant service-reader get,list,watch services sdk-user-service-reader
    grant port-forward-pods create pods/portforward sdk-user-port-forward-pods-binding
    grant node-reader get,list nodes sdk-user-node-reader

    # Later kubectl calls run as sdk-user until a step switches the context back.
    kubectl config use-context sdk-user
181+
182+
# Creates the log directory under the runner's temp dir and exports its path to
# later steps as CODEFLARE_TEST_OUTPUT_DIR through GITHUB_ENV.
- name: Setup test output directory
  run: |
    log_dir="${{ runner.temp }}/test-logs"
    mkdir -p ${log_dir}
    echo "CODEFLARE_TEST_OUTPUT_DIR=${log_dir}" >> $GITHUB_ENV
187+
188+
# Installs the SDK and its test dependencies with Poetry, then runs the two RayJob e2e suites.
- name: Run RayJob e2e tests
  env:
    GRPC_DNS_RESOLVER: 'native'
  run: |
    set -euo pipefail
    pip install poetry
    poetry install --with test,docs

    # Install the SDK in editable mode
    pip install -e .

    echo "Running RayJob e2e tests..."
    # Set environment variable to prevent default queue assignment for non-Kueue tests
    export DISABLE_DEFAULT_KUEUE_QUEUE=true

    # Run only the tests that are designed for Kueue integration.
    # -x stops at the first failure; stdout and stderr both go to the log file,
    # which a later step prints.
    poetry run pytest -v -s \
      ./tests/e2e/rayjob/rayjob_existing_cluster_test.py \
      ./tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py \
      -x > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
205+
206+
# The steps below run even when earlier steps failed (`if: always()`).

# Switch from the sdk-user context back to the kind-cluster context before collecting logs.
- name: Switch to kind-cluster context to print logs
  if: always()
  run: |
    kubectl config use-context kind-cluster

# `|| true` keeps the step green when the log file was never written.
- name: Print Pytest output log
  if: always()
  run: |
    echo "Printing Pytest output logs"
    cat ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/pytest_output.log || true

# Prints the full Kueue controller logs (--tail -1) and also saves them to the log directory.
- name: Print Kueue operator logs
  if: always()
  run: |
    echo "Printing Kueue operator logs"
    kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager \
      | tee ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/kueue-operator.log || true
221+
222+
# Dumps the ray-system namespace and the KubeRay operator logs, trying two label
# selectors and then the deployment itself. Runs even when earlier steps failed.
- name: Print KubeRay operator logs
  if: always()
  run: |
    # The default `run` shell does not enable pipefail, so `kubectl logs ... | tee ...`
    # would report tee's (successful) status and the `||` fallbacks below could never run.
    # With pipefail a failing `kubectl logs` fails the pipeline and the next attempt is made.
    set -o pipefail

    echo "Printing KubeRay operator logs"
    echo "Checking ray-system namespace contents:"
    kubectl get all -n ray-system || true
    echo "Attempting to get KubeRay logs with different selectors:"
    # NOTE(review): a selector that matches no pods may still exit 0, in which case no
    # fallback is attempted - confirm whether empty output should also trigger one.
    kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay-operator | tee ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/kuberay.log || \
    kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/component=kuberay-operator | tee ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/kuberay.log || \
    kubectl logs -n ray-system --tail -1 deployment/kuberay-operator | tee ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/kuberay.log || \
    echo "Could not find KubeRay operator logs"
233+
234+
# Exports pod logs into the test output directory via the composite action from ./common.
# Runs even when earlier steps failed.
- name: Export all KinD pod logs
  if: always()
  uses: ./common/github-actions/kind-export-logs
  with:
    output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}

# Uploads every *.log file below the output directory as the `logs` artifact, kept for 10 days.
- name: Upload logs
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: logs
    path: |
      ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
    # Only warn, rather than fail, when no log files were produced.
    if-no-files-found: warn
    retention-days: 10

0 commit comments

Comments
 (0)