
Commit 6bcdedd (1 parent: dfeeb39)

test: revert e2e workflow

8 files changed: +73 additions, −160 deletions


.github/workflows/e2e_tests.yaml
Lines changed: 37 additions & 139 deletions

@@ -5,22 +5,21 @@ on:
   pull_request:
     branches:
       - main
-      - "release-*"
+      - 'release-*'
       - ray-jobs-feature
-      - kueue-integration
     paths-ignore:
-      - "docs/**"
-      - "**.adoc"
-      - "**.md"
-      - "LICENSE"
+      - 'docs/**'
+      - '**.adoc'
+      - '**.md'
+      - 'LICENSE'
 
 concurrency:
   group: ${{ github.head_ref }}-${{ github.workflow }}
   cancel-in-progress: true
 
 env:
-  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
   KUEUE_VERSION: "v0.13.4"
+  KUBERAY_VERSION: "v1.4.0"
 
 jobs:
   kubernetes:
@@ -35,32 +34,15 @@ jobs:
       - name: Checkout common repo code
         uses: actions/checkout@v4
         with:
-          repository: "project-codeflare/codeflare-common"
-          ref: "main"
-          path: "common"
-
-      - name: Checkout CodeFlare operator repository
-        uses: actions/checkout@v4
-        with:
-          repository: project-codeflare/codeflare-operator
-          path: codeflare-operator
-
-      - name: Set Go
-        uses: actions/setup-go@v5
-        with:
-          go-version-file: "./codeflare-operator/go.mod"
-          cache-dependency-path: "./codeflare-operator/go.sum"
-
-      - name: Set up gotestfmt
-        uses: gotesttools/gotestfmt-action@v2
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
+          repository: 'project-codeflare/codeflare-common'
+          ref: 'main'
+          path: 'common'
 
       - name: Set up specific Python version
         uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
-          cache: "pip" # caching pip dependencies
+          python-version: '3.11'
+          cache: 'pip' # caching pip dependencies
 
       - name: Setup NVidia GPU environment for KinD
         uses: ./common/github-actions/nvidia-gpu-setup
@@ -73,48 +55,12 @@ jobs:
       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator
 
-      - name: Wait for nodes to be ready
-        run: |
-          echo "Waiting for all nodes to be ready..."
-          kubectl wait --for=condition=Ready nodes --all --timeout=300s
-
-          echo "Checking node status..."
-          kubectl get nodes -o wide
-
-          echo "Checking for CNI readiness..."
-          for i in {1..30}; do
-            if kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -v "True"; then
-              echo "Waiting for CNI to initialize (attempt $i/30)..."
-              sleep 10
-            else
-              echo "All nodes are ready!"
-              break
-            fi
-          done
-
-          # Final verification
-          kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"
-
-      - name: Deploy CodeFlare stack
+      - name: Deploy KubeRay and Kueue
         id: deploy
         run: |
-          cd codeflare-operator
-          echo Setting up CodeFlare stack
-          make setup-e2e
-          echo Deploying CodeFlare operator
-          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
-          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
-          cd ..
-
-      - name: Verify CodeFlare deployment
-        run: |
-          # Wait for Kueue to be ready
-          echo "Waiting for Kueue controller to be ready..."
-          kubectl wait --for=condition=Available --timeout=300s deployment -n kueue-system kueue-controller-manager || {
-            echo "Kueue deployment status:"
-            kubectl get all -n kueue-system
-            exit 1
-          }
+          # Deploy KubeRay operator
+          echo "Deploying KubeRay ${KUBERAY_VERSION}..."
+          kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}&timeout=180s"
 
           # Wait for KubeRay to be ready
           echo "Waiting for KubeRay operator to be ready..."
@@ -124,44 +70,43 @@ jobs:
             exit 1
           }
 
-          # Verify webhook certificates
-          echo "Checking CodeFlare operator webhook certificates..."
-          kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text || {
-            echo "Warning: Webhook certificate might be missing or invalid"
+          # Deploy Kueue
+          echo "Deploying Kueue ${KUEUE_VERSION}..."
+          kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
+
+          # Wait for Kueue to be ready
+          echo "Waiting for Kueue controller to be ready..."
+          kubectl wait --for=condition=Available --timeout=300s deployment -n kueue-system kueue-controller-manager || {
+            echo "Kueue deployment status:"
+            kubectl get all -n kueue-system
+            exit 1
           }
 
+          echo "✓ KubeRay and Kueue deployed successfully"
+
       - name: Add user to KinD
         uses: ./common/github-actions/kind-add-user
         with:
           user-name: sdk-user
 
       - name: Configure RBAC for sdk user with limited permissions
         run: |
-          # CRD permissions for discovering resource types
-          kubectl create clusterrole crd-reader --verb=get,list,watch --resource=customresourcedefinitions.apiextensions.k8s.io
-          kubectl create clusterrolebinding sdk-user-crd-reader --clusterrole=crd-reader --user=sdk-user
-
-          # AppWrapper permissions for CodeFlare workloads
-          kubectl create clusterrole appwrapper-creator --verb=get,list,watch,create,update,patch,delete --resource=appwrappers.workload.codeflare.dev
-          kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
-
-          # Existing permissions
           kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
           kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
           kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
           kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
           kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters.ray.io
           kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
-          kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs/status
+          kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs.ray.io/status
           kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
           kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors.kueue.x-k8s.io
           kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
           kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues.kueue.x-k8s.io
           kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
           kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues.kueue.x-k8s.io
           kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
-          kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
-          kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
+          kubectl create clusterrole workload-reader --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
+          kubectl create clusterrolebinding sdk-user-workload-reader --clusterrole=workload-reader --user=sdk-user
           kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
           kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
           kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -174,86 +119,40 @@ jobs:
           kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
           kubectl config use-context sdk-user
 
-      - name: Verify cluster readiness before tests
-        run: |
-          echo "=== Pre-test cluster verification ==="
-          echo "Current context:"
-          kubectl config current-context
-
-          echo -e "\nNode status:"
-          kubectl get nodes -o wide
-
-          echo -e "\nSystem pods status:"
-          kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true
-
-          echo -e "\nChecking for any pods in error state:"
-          kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"
-
-          echo -e "\nKueue resources:"
-          kubectl get resourceflavors,clusterqueues,localqueues -A || true
-
-          echo -e "\nRay CRDs:"
-          kubectl get crd | grep ray || true
-
       - name: Run e2e tests
         run: |
-          export CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
-          mkdir -p ${CODEFLARE_TEST_OUTPUT_DIR}
+          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
           echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
 
           set -euo pipefail
           pip install poetry
           poetry install --with test,docs
           echo "Running e2e tests..."
-          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
-        env:
-          GRPC_DNS_RESOLVER: "native"
-
-      - name: Run RayJob e2e tests
-        run: |
-          set -euo pipefail
-          echo "Running RayJob e2e tests..."
-          # Set environment variable to prevent default queue assignment for non-Kueue tests
-          export DISABLE_DEFAULT_KUEUE_QUEUE=true
-
-          # Run only the tests that are designed for Kueue integration
-          poetry run pytest -v -s ./tests/e2e/rayjob/ -x > ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log 2>&1
+          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
         env:
          GRPC_DNS_RESOLVER: "native"
 
       - name: Switch to kind-cluster context to print logs
         if: always() && steps.deploy.outcome == 'success'
         run: kubectl config use-context kind-cluster
 
-      - name: Print RayJob E2E Pytest output log
-        if: always() && steps.deploy.outcome == 'success'
-        run: |
-          echo "Printing RayJob Pytest output logs"
-          cat ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log || echo "No RayJob test output found"
-
-      - name: Print E2E Pytest output log
-        if: always() && steps.deploy.outcome == 'success'
-        run: |
-          echo "Printing E2E Pytest output logs"
-          cat ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log || echo "No E2E test output found"
-
-      - name: Print CodeFlare operator logs
+      - name: Print Pytest output log
         if: always() && steps.deploy.outcome == 'success'
         run: |
-          echo "Printing CodeFlare operator logs"
-          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
+          echo "Printing Pytest output logs"
+          cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
 
       - name: Print KubeRay operator logs
         if: always() && steps.deploy.outcome == 'success'
         run: |
           echo "Printing KubeRay operator logs"
-          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
+          kubectl logs -n default --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
 
       - name: Export all KinD pod logs
         uses: ./common/github-actions/kind-export-logs
         if: always() && steps.deploy.outcome == 'success'
         with:
-          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
+          output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
 
       - name: Upload logs
         uses: actions/upload-artifact@v4
@@ -263,4 +162,3 @@ jobs:
           retention-days: 10
           path: |
             ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
-          if-no-files-found: warn
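
Note on the reverted deploy step: rather than building the CodeFlare operator from source, the workflow now applies the upstream KubeRay kustomize base and the released Kueue manifests, then gates on `kubectl wait --for=condition=Available`. For debugging that same gate outside CI, a rough Python equivalent is sketched below. It assumes the `kubernetes` client package and a reachable kubeconfig; the KubeRay deployment name is an assumption, though the `default` namespace matches the log-collection step above.

    # Rough equivalent of the workflow's readiness gate (illustrative sketch):
    #   kubectl wait --for=condition=Available --timeout=300s \
    #     deployment -n kueue-system kueue-controller-manager
    import time
    from kubernetes import client, config

    def wait_for_available(name: str, namespace: str, timeout_s: int = 300) -> bool:
        """Poll a Deployment until its Available condition is True, or time out."""
        config.load_kube_config()  # e.g. the KinD cluster's kubeconfig
        apps = client.AppsV1Api()
        deadline = time.monotonic() + timeout_s
        while time.monotonic() < deadline:
            status = apps.read_namespaced_deployment(name, namespace).status
            if any(c.type == "Available" and c.status == "True"
                   for c in (status.conditions or [])):
                return True
            time.sleep(5)
        return False

    if __name__ == "__main__":
        assert wait_for_available("kueue-controller-manager", "kueue-system")
        # KubeRay deployment name assumed; namespace matches the workflow's log step.
        assert wait_for_available("kuberay-operator", "default")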

src/codeflare_sdk/ray/cluster/cluster.py
Lines changed: 7 additions & 0 deletions

@@ -395,6 +395,13 @@ def is_dashboard_ready(self) -> bool:
         if dashboard_uri is None:
             return False
 
+        # For ClusterIP services in local/test environments, we can't make HTTP requests
+        # Instead, check if the Ray pods are running as a proxy for dashboard readiness
+        if ".svc.cluster.local" in dashboard_uri:
+            # This is a ClusterIP service - check if Ray cluster is ready instead
+            status, ready = self.status(print_to_console=False)
+            return ready
+
         try:
             response = requests.get(
                 dashboard_uri,
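
The added guard short-circuits `is_dashboard_ready()` for in-cluster DNS names: a `*.svc.cluster.local` URI only resolves inside the cluster, so an HTTP probe from the test runner would always fail. A condensed sketch of the resulting control flow (simplified; the real method passes additional options to `requests.get`, and the timeout here is illustrative):

    import requests

    def is_dashboard_ready_sketch(dashboard_uri, cluster_ready):
        """Condensed view of the logic above; not the actual SDK method."""
        if dashboard_uri is None:
            return False
        if ".svc.cluster.local" in dashboard_uri:
            # ClusterIP service: unreachable over HTTP from outside the cluster,
            # so fall back to overall cluster readiness
            # (stands in for self.status(print_to_console=False)).
            return cluster_ready
        try:
            return requests.get(dashboard_uri, timeout=5).status_code == 200
        except requests.RequestException:
            return False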

tests/e2e/cluster_apply_kind_test.py
Lines changed: 4 additions & 3 deletions

@@ -5,6 +5,7 @@
 from codeflare_sdk.common.utils import constants
 
 from support import (
+    get_ray_image,
     initialize_kubernetes_client,
     create_namespace,
     delete_namespace,
@@ -40,7 +41,7 @@ def test_cluster_apply(self):
             worker_cpu_limits="1",
             worker_memory_requests="1Gi",
             worker_memory_limits="2Gi",
-            image=f"rayproject/ray:{constants.RAY_VERSION}",
+            image=get_ray_image(),
             write_to_file=True,
             verify_tls=False,
         )
@@ -50,7 +51,7 @@ def test_cluster_apply(self):
         cluster.apply()
 
         # Wait for the cluster to be ready
-        cluster.wait_ready(dashboard_check=False)
+        cluster.wait_ready()
         status, ready = cluster.status()
         assert ready, f"Cluster {cluster_name} is not ready: {status}"
 
@@ -74,7 +75,7 @@ def test_cluster_apply(self):
             worker_cpu_limits="1",
             worker_memory_requests="1Gi",
             worker_memory_limits="2Gi",
-            image=f"rayproject/ray:{constants.RAY_VERSION}",
+            image=get_ray_image(),
             write_to_file=True,
             verify_tls=False,
         )
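
Both `ClusterConfiguration` blocks now resolve their image through the shared `get_ray_image()` helper from `tests/e2e/support.py` rather than hard-coding `rayproject/ray:{constants.RAY_VERSION}`, so CI can substitute a different image in one place. The helper's body is not part of this diff; one plausible shape, purely as a hypothetical sketch:

    import os
    from codeflare_sdk.common.utils import constants

    def get_ray_image() -> str:
        # Hypothetical: prefer an environment override so CI can pin a custom
        # image, falling back to the previously hard-coded default.
        return os.getenv("RAY_IMAGE", f"rayproject/ray:{constants.RAY_VERSION}")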

tests/e2e/heterogeneous_clusters_kind_test.py
Lines changed: 6 additions & 5 deletions

@@ -48,16 +48,17 @@ def run_heterogeneous_clusters(
                     namespace=self.namespace,
                     num_workers=1,
                     head_cpu_requests="500m",
-                    head_cpu_limits="500m",
-                    head_memory_requests=2,
-                    head_memory_limits=2,
+                    head_cpu_limits="1",
+                    head_memory_requests="1Gi",
+                    head_memory_limits="2Gi",
                     worker_cpu_requests="500m",
                     worker_cpu_limits=1,
-                    worker_memory_requests=1,
-                    worker_memory_limits=4,
+                    worker_memory_requests="1Gi",
+                    worker_memory_limits="2Gi",
                     worker_extended_resource_requests={
                         gpu_resource_name: number_of_gpus
                     },
+                    image=get_ray_image(),
                     write_to_file=True,
                     verify_tls=False,
                     local_queue=queue_name,
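
Here and in the two test diffs below, bare integer resource values (`head_memory_requests=2`, `worker_memory_limits=4`) are replaced with explicit Kubernetes quantity strings, removing any ambiguity about units. A quick check of what those strings mean, using the quantity parser shipped with the `kubernetes` Python client (assumed available in the test environment):

    from kubernetes.utils import parse_quantity

    # "500m" is half a CPU; "Gi" suffixes are powers of 1024.
    assert parse_quantity("500m") == 0.5
    assert parse_quantity("1Gi") == 1024 ** 3
    assert parse_quantity("2Gi") == 2 * 1024 ** 3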

tests/e2e/local_interactive_sdk_kind_test.py
Lines changed: 6 additions & 3 deletions

@@ -55,12 +55,15 @@ def run_local_interactives(
                 namespace=self.namespace,
                 num_workers=1,
                 head_cpu_requests="500m",
-                head_cpu_limits="500m",
+                head_cpu_limits="1",
+                head_memory_requests="1Gi",
+                head_memory_limits="2Gi",
                 worker_cpu_requests="500m",
                 worker_cpu_limits=1,
-                worker_memory_requests=1,
-                worker_memory_limits=4,
+                worker_memory_requests="1Gi",
+                worker_memory_limits="2Gi",
                 worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
+                image=get_ray_image(),
                 verify_tls=False,
             )
         )

tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
Lines changed: 6 additions & 3 deletions

@@ -43,12 +43,15 @@ def run_mnist_raycluster_sdk_kind(
             namespace=self.namespace,
             num_workers=1,
             head_cpu_requests="500m",
-            head_cpu_limits="500m",
+            head_cpu_limits="1",
+            head_memory_requests="1Gi",
+            head_memory_limits="2Gi",
             worker_cpu_requests="500m",
             worker_cpu_limits=1,
-            worker_memory_requests=1,
-            worker_memory_limits=4,
+            worker_memory_requests="1Gi",
+            worker_memory_limits="2Gi",
             worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
+            image=get_ray_image(),
             write_to_file=True,
             verify_tls=False,
             appwrapper=True,
