Skip to content

Commit aeccfc5

Browse files
committed
RHOAIENG-32532: Fix broken E2E tests
1 parent 80d1307 commit aeccfc5

File tree

4 files changed

+66
-7
lines changed

4 files changed

+66
-7
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 59 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ concurrency:
1919

2020
env:
2121
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
22+
# Explicitly set the Docker network used by KIND (the default "bridge" network) to avoid network issues
23+
KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"
2224

2325
jobs:
2426
kubernetes:
@@ -60,6 +62,29 @@ jobs:
6062
python-version: '3.11'
6163
cache: 'pip' # caching pip dependencies
6264

65+
- name: Install required tools
66+
run: |
67+
echo "Installing KIND..."
68+
curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.22.0/kind-linux-amd64
69+
chmod +x ./kind
70+
sudo mv ./kind /usr/local/bin/kind
71+
kind version
72+
73+
echo "Installing kubectl if not present..."
74+
if ! command -v kubectl &> /dev/null; then
75+
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
76+
chmod +x kubectl
77+
sudo mv kubectl /usr/local/bin/
78+
fi
79+
kubectl version --client
80+
81+
echo "Checking for podman..."
82+
if ! command -v podman &> /dev/null; then
83+
echo "Podman not found, installing..."
84+
sudo apt-get update
85+
sudo apt-get install -y podman || echo "Podman installation failed, might not be needed"
86+
fi
87+
6388
- name: Setup NVidia GPU environment for KinD
6489
uses: ./common/github-actions/nvidia-gpu-setup
6590

@@ -68,9 +93,35 @@ jobs:
6893
with:
6994
worker-nodes: 1
7095

96+
- name: Verify Kind cluster
97+
run: |
98+
echo "Checking Kind clusters..."
99+
kind get clusters
100+
echo "Current kubectl context:"
101+
kubectl config current-context
102+
echo "Checking nodes:"
103+
kubectl get nodes
104+
105+
echo "Waiting for nodes to be ready..."
106+
kubectl wait --for=condition=Ready nodes --all --timeout=300s
107+
108+
71109
- name: Install NVidia GPU operator for KinD
72110
uses: ./common/github-actions/nvidia-gpu-operator
73111

112+
- name: Deploy KubeRay operator
113+
run: |
114+
KUBERAY_VERSION="v1.2.2"
115+
echo "Deploying KubeRay ${KUBERAY_VERSION}"
116+
kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}&timeout=180s"
117+
118+
echo "Waiting for KubeRay operator to become ready..."
119+
kubectl wait --for=condition=Available --timeout=300s deployment/kuberay-operator -n ray-system || {
120+
echo "KubeRay operator not found, checking pods..."
121+
kubectl get pods -A | grep kuberay
122+
exit 1
123+
}
124+
74125
- name: Deploy CodeFlare stack
75126
id: deploy
76127
run: |
@@ -113,11 +164,15 @@ jobs:
113164
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
114165
kubectl config use-context sdk-user
115166
116-
- name: Run e2e tests
167+
- name: Setup test output directory
117168
run: |
118-
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
169+
CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
170+
mkdir -p ${CODEFLARE_TEST_OUTPUT_DIR}
119171
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
120172
173+
- name: Run e2e tests
174+
run: |
175+
121176
set -euo pipefail
122177
pip install poetry
123178
poetry install --with test,docs
@@ -152,7 +207,7 @@ jobs:
152207
uses: ./common/github-actions/kind-export-logs
153208
if: always() && steps.deploy.outcome == 'success'
154209
with:
155-
output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
210+
output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
156211

157212
- name: Upload logs
158213
uses: actions/upload-artifact@v4
@@ -162,3 +217,4 @@ jobs:
162217
retention-days: 10
163218
path: |
164219
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
220+
if-no-files-found: warn

.github/workflows/rayjob_e2e_tests.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: rayjob-e2e-with-kueue
1+
name: rayjob-e2e
22

33
on:
44
pull_request:

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,12 @@ def remove_autogenerated_fields(resource):
783783
del resource[key]
784784
else:
785785
remove_autogenerated_fields(resource[key])
786+
787+
# After cleaning, remove empty metadata sections
788+
if "metadata" in resource and isinstance(resource["metadata"], dict):
789+
if len(resource["metadata"]) == 0:
790+
del resource["metadata"]
791+
786792
elif isinstance(resource, list):
787793
for item in resource:
788794
remove_autogenerated_fields(item)

tests/e2e/rayjob/ray_version_validation_oauth_test.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,7 @@
1111
ManagedClusterConfig,
1212
)
1313

14-
# This test validates Ray version compatibility checking for RayJob with cluster lifecycling scenarios
1514

16-
17-
@pytest.mark.openshift
1815
class TestRayJobRayVersionValidationOauth:
1916
def setup_method(self):
2017
initialize_kubernetes_client(self)

0 commit comments

Comments
 (0)