Skip to content

Commit e3aee31

Browse files
committed
RHOAIENG-32532: Fix broken E2E tests
1 parent 33fa535 commit e3aee31

File tree

15 files changed

+147
-282
lines changed

15 files changed

+147
-282
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 117 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,22 @@ on:
55
pull_request:
66
branches:
77
- main
8-
- 'release-*'
8+
- "release-*"
99
- ray-jobs-feature
10+
- kueue-integration
1011
paths-ignore:
11-
- 'docs/**'
12-
- '**.adoc'
13-
- '**.md'
14-
- 'LICENSE'
12+
- "docs/**"
13+
- "**.adoc"
14+
- "**.md"
15+
- "LICENSE"
1516

1617
# Runs are grouped by PR head ref plus workflow name; with cancel-in-progress
# enabled, a newer run in the same group cancels the one still in progress.
concurrency:
1718
group: ${{ github.head_ref }}-${{ github.workflow }}
1819
cancel-in-progress: true
1920

2021
# Workflow-level environment variables.
env:
2122
# NOTE(review): the only visible consumer of this image (`make deploy -e IMG=...`)
# is removed by this commit - confirm whether the variable is still needed.
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
23+
# NOTE(review): presumably read by `make setup-e2e` to pick the Kueue release;
# no consumer is visible in this diff - confirm.
KUEUE_VERSION: "v0.13.4"
2224

2325
jobs:
2426
kubernetes:
@@ -30,24 +32,17 @@ jobs:
3032
with:
3133
submodules: recursive
3234

33-
- name: Checkout common repo code
35+
- name: Checkout Kuberay operator repository
3436
uses: actions/checkout@v4
3537
with:
36-
repository: 'project-codeflare/codeflare-common'
37-
ref: 'main'
38-
path: 'common'
39-
40-
- name: Checkout CodeFlare operator repository
41-
uses: actions/checkout@v4
42-
with:
43-
repository: project-codeflare/codeflare-operator
44-
path: codeflare-operator
38+
repository: ray-project/kuberay
39+
path: kuberay
4540

4641
- name: Set Go
4742
uses: actions/setup-go@v5
4843
with:
49-
go-version-file: './codeflare-operator/go.mod'
50-
cache-dependency-path: "./codeflare-operator/go.sum"
44+
go-version-file: "./kuberay/go.mod"
45+
cache-dependency-path: "./kuberay/go.sum"
5146

5247
- name: Set up gotestfmt
5348
uses: gotesttools/gotestfmt-action@v2
@@ -57,33 +52,74 @@ jobs:
5752
- name: Set up specific Python version
5853
uses: actions/setup-python@v5
5954
with:
60-
python-version: '3.11'
61-
cache: 'pip' # caching pip dependencies
55+
python-version: "3.12"
56+
cache: "pip" # caching pip dependencies
6257

6358
- name: Setup NVidia GPU environment for KinD
64-
uses: ./common/github-actions/nvidia-gpu-setup
59+
uses: ./.github/nvidia/nvidia-gpu-setup
6560

6661
- name: Setup and start KinD cluster
67-
uses: ./common/github-actions/kind
62+
uses: ./.github/kind
6863
with:
6964
worker-nodes: 1
7065

7166
- name: Install NVidia GPU operator for KinD
72-
uses: ./common/github-actions/nvidia-gpu-operator
67+
uses: ./.github/nvidia/nvidia-gpu-operator
7368

74-
- name: Deploy CodeFlare stack
69+
- name: Wait for nodes to be ready
  run: |
    # Block until every node reports the Ready condition. kubectl wait exits
    # non-zero on timeout, which fails the step.
    echo "Waiting for all nodes to be ready..."
    kubectl wait --for=condition=Ready nodes --all --timeout=300s

    echo "Checking node status..."
    kubectl get nodes -o wide

    # Re-poll the Ready condition: each output line is "<node>\t<status>", and
    # grep -v "True" succeeds (and prints the offenders) while any node is not Ready.
    echo "Checking for CNI readiness..."
    nodes_ready=false
    for i in {1..30}; do
      if kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -v "True"; then
        echo "Waiting for CNI to initialize (attempt $i/30)..."
        sleep 10
      else
        echo "All nodes are ready!"
        nodes_ready=true
        break
      fi
    done

    # Without this guard an exhausted loop fell through to the final grep,
    # which also matches "Ready False", so the step passed on a broken cluster.
    if [ "${nodes_ready}" != "true" ]; then
      echo "Nodes did not become ready after 30 attempts"
      kubectl get nodes -o wide
      exit 1
    fi

    # Final verification
    kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"
90+
91+
- name: Deploy Kuberay & Kueue
  id: deploy
  run: |
    # Wait up to 300s for a Deployment to report Available; on failure dump
    # everything in its namespace and abort the step.
    #   $1 namespace   $2 deployment   $3 progress message   $4 status heading
    require_available() {
      local namespace="$1" deployment="$2" progress_msg="$3" status_heading="$4"
      echo "${progress_msg}"
      if ! kubectl wait --for=condition=Available --timeout=300s deployment -n "${namespace}" "${deployment}"; then
        echo "${status_heading}"
        kubectl get all -n "${namespace}"
        exit 1
      fi
    }

    echo "Setting up Kuberay and Kueue"
    make setup-e2e

    # Kueue controller must be up before tests create queues.
    require_available kueue-system kueue-controller-manager \
      "Waiting for Kueue controller to be ready..." \
      "Kueue deployment status:"

    # KubeRay operator must be up before tests create Ray resources.
    require_available ray-system kuberay-operator \
      "Waiting for KubeRay operator to be ready..." \
      "KubeRay deployment status:"

    # Only inspect the webhook certificate when the CodeFlare operator
    # Deployment exists; a bad certificate is reported but is not fatal.
    if kubectl get deployment -n openshift-operators codeflare-operator-manager 2>/dev/null; then
      echo "Checking CodeFlare operator webhook certificates..."
      if ! kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text; then
        echo "Warning: Webhook certificate might be missing or invalid"
      fi
    fi
84120
85121
- name: Add user to KinD
86-
uses: ./common/github-actions/kind-add-user
122+
uses: ./.github/kind/kind-add-user
87123
with:
88124
user-name: sdk-user
89125

@@ -93,16 +129,18 @@ jobs:
93129
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
94130
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
95131
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
96-
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
132+
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters
97133
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
98-
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
99-
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
134+
kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs,rayjobs/status
135+
kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
100136
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
101137
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
102138
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
103139
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
104140
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
105141
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
142+
kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads
143+
kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
106144
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
107145
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
108146
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -111,8 +149,31 @@ jobs:
111149
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
112150
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
113151
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
152+
kubectl create clusterrole node-reader --verb=get,list --resource=nodes
153+
kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
114154
kubectl config use-context sdk-user
115155
156+
- name: Verify cluster readiness before tests
  run: |
    # Diagnostic snapshot only: every query that may legitimately return
    # nothing is guarded so it cannot fail the step.
    # Print a heading preceded by a blank line.
    heading() {
      printf '\n%s\n' "$1"
    }

    echo "=== Pre-test cluster verification ==="
    echo "Current context:"
    kubectl config current-context

    heading "Node status:"
    kubectl get nodes -o wide

    heading "System pods status:"
    kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || :

    # Anything not Running/Succeeded is listed; the header row is filtered out
    # so an empty result triggers the fallback message.
    heading "Checking for any pods in error state:"
    kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"

    heading "Kueue resources:"
    kubectl get resourceflavors,clusterqueues,localqueues -A || :

    heading "Ray CRDs:"
    kubectl get crd | grep ray || :
176+
116177
- name: Run e2e tests
117178
run: |
118179
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
@@ -122,19 +183,37 @@ jobs:
122183
pip install poetry
123184
poetry install --with test,docs
124185
echo "Running e2e tests..."
125-
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
186+
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
187+
env:
188+
GRPC_DNS_RESOLVER: "native"
189+
190+
- name: Run RayJob e2e tests
  run: |
    set -euo pipefail
    echo "Running RayJob e2e tests..."
    # Exported so the pytest process below inherits it.
    # NOTE(review): presumably stops the SDK from assigning a default Kueue
    # queue - the consumer of this variable is not visible here; confirm.
    export DISABLE_DEFAULT_KUEUE_QUEUE=true

    # Run every test under tests/e2e/rayjob/ and stop at the first failure (-x).
    # stdout and stderr go to a log that a later step prints; the path is
    # quoted so a directory containing whitespace cannot break the redirect.
    poetry run pytest -v -s ./tests/e2e/rayjob/ -x > "${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log" 2>&1
  env:
    GRPC_DNS_RESOLVER: "native"
128201

129202
- name: Switch to kind-cluster context to print logs
130203
if: always() && steps.deploy.outcome == 'success'
131204
run: kubectl config use-context kind-cluster
132205

133-
- name: Print Pytest output log
206+
- name: Print RayJob E2E Pytest output log
  if: always() && steps.deploy.outcome == 'success'
  run: |
    # This step runs even when earlier steps failed, but the RayJob test step
    # is skipped in that case and never creates its log. Guard the cat so a
    # missing file does not add a second, misleading failure.
    echo "Printing Pytest output logs"
    log_file="${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log"
    if [ -f "${log_file}" ]; then
      cat "${log_file}"
    else
      echo "No RayJob pytest log found at ${log_file} (the RayJob test step may not have run)"
    fi
211+
212+
- name: Print E2E Pytest output log
  if: always() && steps.deploy.outcome == 'success'
  run: |
    # This step runs even when the test step failed. If that failure happened
    # before pytest's output redirect, the log does not exist, so guard the
    # cat instead of failing a second time.
    echo "Printing Pytest output logs"
    log_file="${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log"
    if [ -f "${log_file}" ]; then
      cat "${log_file}"
    else
      echo "No e2e pytest log found at ${log_file} (the e2e test step may have failed before running pytest)"
    fi
138217
139218
- name: Print CodeFlare operator logs
140219
if: always() && steps.deploy.outcome == 'success'
@@ -149,7 +228,7 @@ jobs:
149228
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
150229
151230
- name: Export all KinD pod logs
152-
uses: ./common/github-actions/kind-export-logs
231+
uses: ./.github/kind/kind-export-logs
153232
if: always() && steps.deploy.outcome == 'success'
154233
with:
155234
output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
@@ -162,3 +241,4 @@ jobs:
162241
retention-days: 10
163242
path: |
164243
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
244+
if-no-files-found: warn

0 commit comments

Comments
 (0)