Skip to content

Commit 6be1065

Browse files
committed
test: revert e2e workflow
1 parent 40c3449 commit 6be1065

File tree

9 files changed

+115
-92
lines changed

9 files changed

+115
-92
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 14 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -5,20 +5,22 @@ on:
55
pull_request:
66
branches:
77
- main
8-
- 'release-*'
8+
- "release-*"
99
- ray-jobs-feature
1010
paths-ignore:
11-
- 'docs/**'
12-
- '**.adoc'
13-
- '**.md'
14-
- 'LICENSE'
11+
- "docs/**"
12+
- "**.adoc"
13+
- "**.md"
14+
- "LICENSE"
1515

1616
concurrency:
1717
group: ${{ github.head_ref }}-${{ github.workflow }}
1818
cancel-in-progress: true
1919

2020
env:
2121
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
22+
KUEUE_VERSION: "v0.13.4"
23+
KUERAY_VERSION: "v1.4.0"
2224

2325
jobs:
2426
kubernetes:
@@ -33,9 +35,9 @@ jobs:
3335
- name: Checkout common repo code
3436
uses: actions/checkout@v4
3537
with:
36-
repository: 'project-codeflare/codeflare-common'
37-
ref: 'main'
38-
path: 'common'
38+
repository: "project-codeflare/codeflare-common"
39+
ref: "main"
40+
path: "common"
3941

4042
- name: Checkout CodeFlare operator repository
4143
uses: actions/checkout@v4
@@ -46,7 +48,7 @@ jobs:
4648
- name: Set Go
4749
uses: actions/setup-go@v5
4850
with:
49-
go-version-file: './codeflare-operator/go.mod'
51+
go-version-file: "./codeflare-operator/go.mod"
5052
cache-dependency-path: "./codeflare-operator/go.sum"
5153

5254
- name: Set up gotestfmt
@@ -57,8 +59,8 @@ jobs:
5759
- name: Set up specific Python version
5860
uses: actions/setup-python@v5
5961
with:
60-
python-version: '3.11'
61-
cache: 'pip' # caching pip dependencies
62+
python-version: "3.11"
63+
cache: "pip" # caching pip dependencies
6264

6365
- name: Setup NVidia GPU environment for KinD
6466
uses: ./common/github-actions/nvidia-gpu-setup
@@ -76,7 +78,7 @@ jobs:
7678
run: |
7779
cd codeflare-operator
7880
echo Setting up CodeFlare stack
79-
make setup-e2e
81+
make setup-e2e KUEUE_VERSION=${KUEUE_VERSION} KUERAY_VERSION=${KUERAY_VERSION}
8082
echo Deploying CodeFlare operator
8183
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
8284
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 5 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -209,7 +209,6 @@ def apply(self, force=False):
209209
namespace = self.config.namespace
210210
name = self.config.name
211211

212-
# Regenerate resource_yaml to reflect any configuration changes
213212
self.resource_yaml = self.create_resource()
214213

215214
try:
@@ -391,23 +390,17 @@ def is_dashboard_ready(self) -> bool:
391390
bool:
392391
True if the dashboard is ready, False otherwise.
393392
"""
394-
dashboard_uri = self.cluster_dashboard_uri()
395-
if dashboard_uri is None:
396-
return False
397393

398394
try:
399395
response = requests.get(
400-
dashboard_uri,
396+
self.cluster_dashboard_uri(),
401397
headers=self._client_headers,
402398
timeout=5,
403399
verify=self._client_verify_tls,
404400
)
405401
except requests.exceptions.SSLError: # pragma no cover
406402
# SSL exception occurs when oauth ingress has been created but cluster is not up
407403
return False
408-
except Exception: # pragma no cover
409-
# Any other exception (connection errors, timeouts, etc.)
410-
return False
411404

412405
if response.status_code == 200:
413406
return True
@@ -516,6 +509,8 @@ def cluster_dashboard_uri(self) -> str:
516509
):
517510
protocol = "https" if route["spec"].get("tls") else "http"
518511
return f"{protocol}://{route['spec']['host']}"
512+
# No route found for this cluster
513+
return "Dashboard not available yet, have you run cluster.up()?"
519514
else:
520515
try:
521516
api_instance = client.NetworkingV1Api(get_api_client())
@@ -534,25 +529,9 @@ def cluster_dashboard_uri(self) -> str:
534529
protocol = "http"
535530
elif "route.openshift.io/termination" in annotations:
536531
protocol = "https"
537-
return f"{protocol}://{ingress.spec.rules[0].host}"
532+
return f"{protocol}://{ingress.spec.rules[0].host}"
538533

539-
# For local/test environments without ingress controller (e.g., KIND)
540-
# Try to find the Ray head service
541-
try:
542-
api_instance = client.CoreV1Api(get_api_client())
543-
services = api_instance.list_namespaced_service(
544-
self.config.namespace,
545-
label_selector=f"ray.io/cluster={self.config.name},ray.io/node-type=head",
546-
)
547-
for service in services.items:
548-
if service.metadata.name == f"{self.config.name}-head-svc":
549-
# For ClusterIP services in local environments, return a placeholder
550-
# The actual connection would need port-forwarding or NodePort
551-
return f"http://{service.metadata.name}.{self.config.namespace}.svc.cluster.local:8265"
552-
except Exception: # pragma: no cover
553-
pass
554-
555-
return None
534+
return "Dashboard not available yet, have you run cluster.up()?"
556535

557536
def list_jobs(self) -> List:
558537
"""
@@ -813,11 +792,6 @@ def remove_autogenerated_fields(resource):
813792
else:
814793
remove_autogenerated_fields(resource[key])
815794

816-
# After cleaning, remove empty metadata sections
817-
if "metadata" in resource and isinstance(resource["metadata"], dict):
818-
if len(resource["metadata"]) == 0:
819-
del resource["metadata"]
820-
821795
elif isinstance(resource, list):
822796
for item in resource:
823797
remove_autogenerated_fields(item)

src/codeflare_sdk/ray/cluster/test_cluster.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -322,8 +322,10 @@ def test_cluster_uris(mocker):
322322
mocker.patch(
323323
"kubernetes.client.NetworkingV1Api.list_namespaced_ingress",
324324
)
325-
# When no ingress/route/service is found, the method should return None
326-
assert cluster.cluster_dashboard_uri() is None
325+
assert (
326+
cluster.cluster_dashboard_uri()
327+
== "Dashboard not available yet, have you run cluster.up()?"
328+
)
327329

328330
mocker.patch(
329331
"codeflare_sdk.ray.cluster.cluster._is_openshift_cluster", return_value=True

tests/e2e/cluster_apply_kind_test.py

Lines changed: 68 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
1+
from calendar import c
2+
from time import sleep
13
from codeflare_sdk import Cluster, ClusterConfiguration
24
import pytest
3-
import time
45
from kubernetes import client
5-
from codeflare_sdk.common.utils import constants
66

77
from support import (
88
initialize_kubernetes_client,
@@ -40,7 +40,6 @@ def test_cluster_apply(self):
4040
worker_cpu_limits="1",
4141
worker_memory_requests="1Gi",
4242
worker_memory_limits="2Gi",
43-
image=f"rayproject/ray:{constants.RAY_VERSION}",
4443
write_to_file=True,
4544
verify_tls=False,
4645
)
@@ -50,9 +49,9 @@ def test_cluster_apply(self):
5049
cluster.apply()
5150

5251
# Wait for the cluster to be ready
53-
cluster.wait_ready(dashboard_check=False)
52+
cluster.wait_ready()
5453
status, ready = cluster.status()
55-
assert ready, f"Cluster {cluster_name} is not ready: {status}"
54+
assert ready, f"Cluster {cluster_name} is not ready"
5655

5756
# Verify the cluster is created
5857
ray_cluster = get_ray_cluster(cluster_name, namespace)
@@ -61,7 +60,7 @@ def test_cluster_apply(self):
6160
ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] == 1
6261
), "Initial worker count does not match"
6362

64-
# Update configuration with 2 workers
63+
# Update configuration with 3 workers
6564
updated_config = ClusterConfiguration(
6665
name=cluster_name,
6766
namespace=namespace,
@@ -74,7 +73,6 @@ def test_cluster_apply(self):
7473
worker_cpu_limits="1",
7574
worker_memory_requests="1Gi",
7675
worker_memory_limits="2Gi",
77-
image=f"rayproject/ray:{constants.RAY_VERSION}",
7876
write_to_file=True,
7977
verify_tls=False,
8078
)
@@ -83,15 +81,10 @@ def test_cluster_apply(self):
8381
cluster.config = updated_config
8482
cluster.apply()
8583

86-
# Give Kubernetes a moment to process the update
87-
time.sleep(5)
88-
8984
# Wait for the updated cluster to be ready
90-
cluster.wait_ready(dashboard_check=False)
85+
cluster.wait_ready()
9186
updated_status, updated_ready = cluster.status()
92-
assert (
93-
updated_ready
94-
), f"Cluster {cluster_name} is not ready after update: {updated_status}"
87+
assert updated_ready, f"Cluster {cluster_name} is not ready after update"
9588

9689
# Verify the cluster is updated
9790
updated_ray_cluster = get_ray_cluster(cluster_name, namespace)
@@ -101,19 +94,67 @@ def test_cluster_apply(self):
10194

10295
# Clean up
10396
cluster.down()
97+
sleep(10)
98+
ray_cluster = get_ray_cluster(cluster_name, namespace)
99+
assert ray_cluster is None, "Cluster was not deleted successfully"
104100

105-
# Wait for deletion to complete (finalizers may delay deletion)
106-
max_wait = 30 # seconds
107-
wait_interval = 2
108-
elapsed = 0
101+
def test_apply_invalid_update(self):
102+
self.setup_method()
103+
create_namespace(self)
109104

110-
while elapsed < max_wait:
111-
ray_cluster = get_ray_cluster(cluster_name, namespace)
112-
if ray_cluster is None:
113-
break
114-
time.sleep(wait_interval)
115-
elapsed += wait_interval
105+
cluster_name = "test-cluster-apply-invalid"
106+
namespace = self.namespace
116107

117-
assert (
118-
ray_cluster is None
119-
), f"Cluster was not deleted successfully after {max_wait}s"
108+
# Initial configuration
109+
initial_config = ClusterConfiguration(
110+
name=cluster_name,
111+
namespace=namespace,
112+
num_workers=1,
113+
head_cpu_requests="500m",
114+
head_cpu_limits="1",
115+
head_memory_requests="1Gi",
116+
head_memory_limits="2Gi",
117+
worker_cpu_requests="500m",
118+
worker_cpu_limits="1",
119+
worker_memory_requests="1Gi",
120+
worker_memory_limits="2Gi",
121+
write_to_file=True,
122+
verify_tls=False,
123+
)
124+
125+
# Create the cluster
126+
cluster = Cluster(initial_config)
127+
cluster.apply()
128+
129+
# Wait for the cluster to be ready
130+
cluster.wait_ready()
131+
status, ready = cluster.status()
132+
assert ready, f"Cluster {cluster_name} is not ready"
133+
134+
# Update with an invalid configuration (e.g., immutable field change)
135+
invalid_config = ClusterConfiguration(
136+
name=cluster_name,
137+
namespace=namespace,
138+
num_workers=2,
139+
head_cpu_requests="1",
140+
head_cpu_limits="2", # Changing CPU limits (immutable)
141+
head_memory_requests="1Gi",
142+
head_memory_limits="2Gi",
143+
worker_cpu_requests="500m",
144+
worker_cpu_limits="1",
145+
worker_memory_requests="1Gi",
146+
worker_memory_limits="2Gi",
147+
write_to_file=True,
148+
verify_tls=False,
149+
)
150+
151+
# Apply the changed configuration; note that the assertions below expect the cluster to become ready again (no failure is asserted)
152+
cluster.config = invalid_config
153+
cluster.apply()
154+
155+
cluster.wait_ready()
156+
status, ready = cluster.status()
157+
assert ready, f"Cluster {cluster_name} is not ready"
158+
159+
# Clean up
160+
cluster.down()

tests/e2e/heterogeneous_clusters_kind_test.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -48,16 +48,17 @@ def run_heterogeneous_clusters(
4848
namespace=self.namespace,
4949
num_workers=1,
5050
head_cpu_requests="500m",
51-
head_cpu_limits="500m",
52-
head_memory_requests=2,
53-
head_memory_limits=2,
51+
head_cpu_limits="1",
52+
head_memory_requests="1Gi",
53+
head_memory_limits="2Gi",
5454
worker_cpu_requests="500m",
5555
worker_cpu_limits=1,
56-
worker_memory_requests=1,
57-
worker_memory_limits=4,
56+
worker_memory_requests="1Gi",
57+
worker_memory_limits="2Gi",
5858
worker_extended_resource_requests={
5959
gpu_resource_name: number_of_gpus
6060
},
61+
image=get_ray_image(),
6162
write_to_file=True,
6263
verify_tls=False,
6364
local_queue=queue_name,

tests/e2e/local_interactive_sdk_kind_test.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -55,17 +55,19 @@ def run_local_interactives(
5555
namespace=self.namespace,
5656
num_workers=1,
5757
head_cpu_requests="500m",
58-
head_cpu_limits="500m",
58+
head_cpu_limits="1",
59+
head_memory_requests="1Gi",
60+
head_memory_limits="2Gi",
5961
worker_cpu_requests="500m",
6062
worker_cpu_limits=1,
61-
worker_memory_requests=1,
62-
worker_memory_limits=4,
63+
worker_memory_requests="1Gi",
64+
worker_memory_limits="2Gi",
6365
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
6466
verify_tls=False,
6567
)
6668
)
6769

68-
cluster.up()
70+
cluster.apply()
6971

7072
cluster.wait_ready()
7173
cluster.status()

tests/e2e/mnist_raycluster_sdk_aw_kind_test.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -43,19 +43,21 @@ def run_mnist_raycluster_sdk_kind(
4343
namespace=self.namespace,
4444
num_workers=1,
4545
head_cpu_requests="500m",
46-
head_cpu_limits="500m",
46+
head_cpu_limits="1",
47+
head_memory_requests="1Gi",
48+
head_memory_limits="2Gi",
4749
worker_cpu_requests="500m",
4850
worker_cpu_limits=1,
49-
worker_memory_requests=1,
50-
worker_memory_limits=4,
51+
worker_memory_requests="1Gi",
52+
worker_memory_limits="2Gi",
5153
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
5254
write_to_file=True,
5355
verify_tls=False,
5456
appwrapper=True,
5557
)
5658
)
5759

58-
cluster.up()
60+
cluster.apply()
5961

6062
cluster.status()
6163

0 commit comments

Comments
 (0)