Skip to content

Commit 9dcb037

Browse files
committed
RHOAIENG-32532: Fix broken E2E tests
1 parent 33fa535 commit 9dcb037

File tree

15 files changed

+64
-250
lines changed

15 files changed

+64
-250
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 34 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ concurrency:
1919

2020
env:
2121
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
22+
KUEUE_VERSION: "v0.13.4"
2223

2324
jobs:
2425
kubernetes:
@@ -57,7 +58,7 @@ jobs:
5758
- name: Set up specific Python version
5859
uses: actions/setup-python@v5
5960
with:
60-
python-version: '3.11'
61+
python-version: '3.12'
6162
cache: 'pip' # caching pip dependencies
6263

6364
- name: Setup NVidia GPU environment for KinD
@@ -93,8 +94,10 @@ jobs:
9394
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
9495
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
9596
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
96-
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
97+
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters
9798
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
99+
kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs,rayjobs/status
100+
kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
98101
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
99102
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
100103
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
@@ -111,6 +114,10 @@ jobs:
111114
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
112115
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
113116
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
117+
kubectl create clusterrole node-reader --verb=get,list --resource=nodes
118+
kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
119+
kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads
120+
kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
114121
kubectl config use-context sdk-user
115122
116123
- name: Run e2e tests
@@ -122,19 +129,40 @@ jobs:
122129
pip install poetry
123130
poetry install --with test,docs
124131
echo "Running e2e tests..."
125-
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
132+
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
133+
env:
134+
GRPC_DNS_RESOLVER: "native"
135+
136+
- name: Run RayJob e2e tests
137+
run: |
138+
echo "Running RayJob e2e tests..."
139+
# Set environment variable to prevent default queue assignment for non-Kueue tests
140+
export DISABLE_DEFAULT_KUEUE_QUEUE=true
141+
142+
# Install SDK in editable mode if not already done
143+
pip install -e .
144+
145+
# Run only the tests that are designed for Kueue integration
146+
poetry run pytest -v -s ./tests/e2e/rayjob/ -x >> ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob-e2e-pytest_output.log 2>&1
126147
env:
127148
GRPC_DNS_RESOLVER: "native"
128149

129150
- name: Switch to kind-cluster context to print logs
130151
if: always() && steps.deploy.outcome == 'success'
131152
run: kubectl config use-context kind-cluster
132153

133-
- name: Print Pytest output log
154+
- name: Print E2E Pytest output log
155+
if: always() && steps.deploy.outcome == 'success'
156+
run: |
157+
echo "Printing Pytest output logs"
158+
cat ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log
159+
160+
161+
- name: Print RayJob E2E Pytest output log
134162
if: always() && steps.deploy.outcome == 'success'
135163
run: |
136164
echo "Printing Pytest output logs"
137-
cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
165+
cat ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob-e2e-pytest_output.log
138166
139167
- name: Print CodeFlare operator logs
140168
if: always() && steps.deploy.outcome == 'success'
@@ -162,3 +190,4 @@ jobs:
162190
retention-days: 10
163191
path: |
164192
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
193+
if-no-files-found: warn

.github/workflows/rayjob_e2e_tests.yaml

Lines changed: 0 additions & 210 deletions
This file was deleted.

src/codeflare_sdk/common/utils/k8s_utils.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,14 +7,10 @@
77
from ..kubernetes_cluster import config_check, _kube_api_error_handling
88

99

10-
def get_current_namespace():
10+
def get_current_namespace(): # pragma: no cover
1111
"""
1212
Retrieves the current Kubernetes namespace.
1313
14-
This function attempts to detect the current namespace by:
15-
1. First checking if running inside a pod (reading from service account namespace file)
16-
2. Falling back to reading from the current kubeconfig context
17-
1814
Returns:
1915
str:
2016
The current namespace or None if not found.

src/codeflare_sdk/ray/cluster/build_ray_cluster.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -136,7 +136,6 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
136136
"enableIngress": False,
137137
"rayStartParams": {
138138
"dashboard-host": "0.0.0.0",
139-
"dashboard-port": "8265",
140139
"block": "true",
141140
"num-gpus": str(head_gpu_count),
142141
"resources": head_resources,

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -783,6 +783,12 @@ def remove_autogenerated_fields(resource):
783783
del resource[key]
784784
else:
785785
remove_autogenerated_fields(resource[key])
786+
787+
# After cleaning, remove empty metadata sections
788+
if "metadata" in resource and isinstance(resource["metadata"], dict):
789+
if len(resource["metadata"]) == 0:
790+
del resource["metadata"]
791+
786792
elif isinstance(resource, list):
787793
for item in resource:
788794
remove_autogenerated_fields(item)

src/codeflare_sdk/ray/cluster/test_config.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2022-2025 IBM, Red Hat
1+
# Copyright 2024 IBM, Red Hat
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.

src/codeflare_sdk/ray/rayjobs/config.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -284,7 +284,6 @@ def _build_head_ray_params(self) -> Dict[str, str]:
284284
"""Build Ray start parameters for head node."""
285285
params = {
286286
"dashboard-host": "0.0.0.0",
287-
"dashboard-port": "8265",
288287
"block": "true",
289288
}
290289

0 commit comments

Comments
 (0)