
Commit 3f611b6

test: revert e2e workflow
1 parent dfeeb39 commit 3f611b6

File tree

9 files changed (+136 −219 lines)


.github/workflows/e2e_tests.yaml

Lines changed: 37 additions & 139 deletions
@@ -5,22 +5,21 @@ on:
   pull_request:
     branches:
       - main
-      - "release-*"
+      - 'release-*'
       - ray-jobs-feature
-      - kueue-integration
     paths-ignore:
-      - "docs/**"
-      - "**.adoc"
-      - "**.md"
-      - "LICENSE"
+      - 'docs/**'
+      - '**.adoc'
+      - '**.md'
+      - 'LICENSE'
 
 concurrency:
   group: ${{ github.head_ref }}-${{ github.workflow }}
   cancel-in-progress: true
 
 env:
-  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
   KUEUE_VERSION: "v0.13.4"
+  KUBERAY_VERSION: "v1.4.0"
 
 jobs:
   kubernetes:
@@ -35,32 +34,15 @@ jobs:
       - name: Checkout common repo code
         uses: actions/checkout@v4
         with:
-          repository: "project-codeflare/codeflare-common"
-          ref: "main"
-          path: "common"
-
-      - name: Checkout CodeFlare operator repository
-        uses: actions/checkout@v4
-        with:
-          repository: project-codeflare/codeflare-operator
-          path: codeflare-operator
-
-      - name: Set Go
-        uses: actions/setup-go@v5
-        with:
-          go-version-file: "./codeflare-operator/go.mod"
-          cache-dependency-path: "./codeflare-operator/go.sum"
-
-      - name: Set up gotestfmt
-        uses: gotesttools/gotestfmt-action@v2
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
+          repository: 'project-codeflare/codeflare-common'
+          ref: 'main'
+          path: 'common'
 
       - name: Set up specific Python version
         uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
-          cache: "pip" # caching pip dependencies
+          python-version: '3.11'
+          cache: 'pip' # caching pip dependencies
 
       - name: Setup NVidia GPU environment for KinD
         uses: ./common/github-actions/nvidia-gpu-setup
@@ -73,48 +55,12 @@ jobs:
       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator
 
-      - name: Wait for nodes to be ready
-        run: |
-          echo "Waiting for all nodes to be ready..."
-          kubectl wait --for=condition=Ready nodes --all --timeout=300s
-
-          echo "Checking node status..."
-          kubectl get nodes -o wide
-
-          echo "Checking for CNI readiness..."
-          for i in {1..30}; do
-            if kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -v "True"; then
-              echo "Waiting for CNI to initialize (attempt $i/30)..."
-              sleep 10
-            else
-              echo "All nodes are ready!"
-              break
-            fi
-          done
-
-          # Final verification
-          kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"
-
-      - name: Deploy CodeFlare stack
+      - name: Deploy KubeRay and Kueue
         id: deploy
         run: |
-          cd codeflare-operator
-          echo Setting up CodeFlare stack
-          make setup-e2e
-          echo Deploying CodeFlare operator
-          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
-          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
-          cd ..
-
-      - name: Verify CodeFlare deployment
-        run: |
-          # Wait for Kueue to be ready
-          echo "Waiting for Kueue controller to be ready..."
-          kubectl wait --for=condition=Available --timeout=300s deployment -n kueue-system kueue-controller-manager || {
-            echo "Kueue deployment status:"
-            kubectl get all -n kueue-system
-            exit 1
-          }
+          # Deploy KubeRay operator
+          echo "Deploying KubeRay ${KUBERAY_VERSION}..."
+          kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}&timeout=180s"
 
           # Wait for KubeRay to be ready
           echo "Waiting for KubeRay operator to be ready..."
@@ -124,44 +70,43 @@ jobs:
             exit 1
           }
 
-          # Verify webhook certificates
-          echo "Checking CodeFlare operator webhook certificates..."
-          kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text || {
-            echo "Warning: Webhook certificate might be missing or invalid"
+          # Deploy Kueue
+          echo "Deploying Kueue ${KUEUE_VERSION}..."
+          kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
+
+          # Wait for Kueue to be ready
+          echo "Waiting for Kueue controller to be ready..."
+          kubectl wait --for=condition=Available --timeout=300s deployment -n kueue-system kueue-controller-manager || {
+            echo "Kueue deployment status:"
+            kubectl get all -n kueue-system
+            exit 1
           }
 
+          echo "✓ KubeRay and Kueue deployed successfully"
+
       - name: Add user to KinD
         uses: ./common/github-actions/kind-add-user
         with:
           user-name: sdk-user
 
       - name: Configure RBAC for sdk user with limited permissions
         run: |
-          # CRD permissions for discovering resource types
-          kubectl create clusterrole crd-reader --verb=get,list,watch --resource=customresourcedefinitions.apiextensions.k8s.io
-          kubectl create clusterrolebinding sdk-user-crd-reader --clusterrole=crd-reader --user=sdk-user
-
-          # AppWrapper permissions for CodeFlare workloads
-          kubectl create clusterrole appwrapper-creator --verb=get,list,watch,create,update,patch,delete --resource=appwrappers.workload.codeflare.dev
-          kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
-
-          # Existing permissions
           kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
           kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
           kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
           kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
           kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters.ray.io
           kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
-          kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs/status
+          kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs.ray.io/status
           kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
           kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors.kueue.x-k8s.io
           kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
           kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues.kueue.x-k8s.io
           kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
           kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues.kueue.x-k8s.io
           kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
-          kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
-          kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
+          kubectl create clusterrole workload-reader --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
+          kubectl create clusterrolebinding sdk-user-workload-reader --clusterrole=workload-reader --user=sdk-user
           kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
           kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
           kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -174,86 +119,40 @@ jobs:
           kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
           kubectl config use-context sdk-user
 
-      - name: Verify cluster readiness before tests
-        run: |
-          echo "=== Pre-test cluster verification ==="
-          echo "Current context:"
-          kubectl config current-context
-
-          echo -e "\nNode status:"
-          kubectl get nodes -o wide
-
-          echo -e "\nSystem pods status:"
-          kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true
-
-          echo -e "\nChecking for any pods in error state:"
-          kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"
-
-          echo -e "\nKueue resources:"
-          kubectl get resourceflavors,clusterqueues,localqueues -A || true
-
-          echo -e "\nRay CRDs:"
-          kubectl get crd | grep ray || true
-
       - name: Run e2e tests
         run: |
-          export CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
-          mkdir -p ${CODEFLARE_TEST_OUTPUT_DIR}
+          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
           echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
 
           set -euo pipefail
           pip install poetry
           poetry install --with test,docs
           echo "Running e2e tests..."
-          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
-        env:
-          GRPC_DNS_RESOLVER: "native"
-
-      - name: Run RayJob e2e tests
-        run: |
-          set -euo pipefail
-          echo "Running RayJob e2e tests..."
-          # Set environment variable to prevent default queue assignment for non-Kueue tests
-          export DISABLE_DEFAULT_KUEUE_QUEUE=true
-
-          # Run only the tests that are designed for Kueue integration
-          poetry run pytest -v -s ./tests/e2e/rayjob/ -x > ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log 2>&1
+          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
         env:
           GRPC_DNS_RESOLVER: "native"
 
       - name: Switch to kind-cluster context to print logs
         if: always() && steps.deploy.outcome == 'success'
         run: kubectl config use-context kind-cluster
 
-      - name: Print RayJob E2E Pytest output log
-        if: always() && steps.deploy.outcome == 'success'
-        run: |
-          echo "Printing RayJob Pytest output logs"
-          cat ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log || echo "No RayJob test output found"
-
-      - name: Print E2E Pytest output log
-        if: always() && steps.deploy.outcome == 'success'
-        run: |
-          echo "Printing E2E Pytest output logs"
-          cat ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log || echo "No E2E test output found"
-
-      - name: Print CodeFlare operator logs
+      - name: Print Pytest output log
         if: always() && steps.deploy.outcome == 'success'
         run: |
-          echo "Printing CodeFlare operator logs"
-          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
+          echo "Printing Pytest output logs"
+          cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
 
       - name: Print KubeRay operator logs
         if: always() && steps.deploy.outcome == 'success'
         run: |
           echo "Printing KubeRay operator logs"
-          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
+          kubectl logs -n default --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
 
       - name: Export all KinD pod logs
         uses: ./common/github-actions/kind-export-logs
         if: always() && steps.deploy.outcome == 'success'
         with:
-          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
+          output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
 
       - name: Upload logs
         uses: actions/upload-artifact@v4
@@ -263,4 +162,3 @@ jobs:
           retention-days: 10
           path: |
             ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
-          if-no-files-found: warn
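
Note on the restored test step: the -m 'kind and nvidia_gpu' flag is pytest's marker-expression syntax, so only tests carrying both markers run. A minimal sketch of a test this expression would select (the test name and body are hypothetical; only the marker names come from the command above):

import pytest

@pytest.mark.kind
@pytest.mark.nvidia_gpu
def test_gpu_smoke():
    # Selected by: pytest -m 'kind and nvidia_gpu'
    # A test carrying only one of the two markers would be deselected.
    assert True

Custom markers like these are typically registered in the project's pytest configuration (e.g. the markers list under [tool.pytest.ini_options]) so pytest does not warn about unknown marks.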

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 1 addition & 31 deletions
@@ -209,7 +209,6 @@ def apply(self, force=False):
         namespace = self.config.namespace
         name = self.config.name
 
-        # Regenerate resource_yaml to reflect any configuration changes
         self.resource_yaml = self.create_resource()
 
         try:
@@ -391,23 +390,17 @@ def is_dashboard_ready(self) -> bool:
             bool:
                 True if the dashboard is ready, False otherwise.
         """
-        dashboard_uri = self.cluster_dashboard_uri()
-        if dashboard_uri is None:
-            return False
 
         try:
             response = requests.get(
-                dashboard_uri,
+                self.cluster_dashboard_uri(),
                 headers=self._client_headers,
                 timeout=5,
                 verify=self._client_verify_tls,
             )
         except requests.exceptions.SSLError:  # pragma no cover
             # SSL exception occurs when oauth ingress has been created but cluster is not up
             return False
-        except Exception:  # pragma no cover
-            # Any other exception (connection errors, timeouts, etc.)
-            return False
 
         if response.status_code == 200:
             return True
@@ -536,24 +529,6 @@ def cluster_dashboard_uri(self) -> str:
                 protocol = "https"
             return f"{protocol}://{ingress.spec.rules[0].host}"
 
-        # For local/test environments without ingress controller (e.g., KIND)
-        # Try to find the Ray head service
-        try:
-            api_instance = client.CoreV1Api(get_api_client())
-            services = api_instance.list_namespaced_service(
-                self.config.namespace,
-                label_selector=f"ray.io/cluster={self.config.name},ray.io/node-type=head",
-            )
-            for service in services.items:
-                if service.metadata.name == f"{self.config.name}-head-svc":
-                    # For ClusterIP services in local environments, return a placeholder
-                    # The actual connection would need port-forwarding or NodePort
-                    return f"http://{service.metadata.name}.{self.config.namespace}.svc.cluster.local:8265"
-        except Exception:  # pragma: no cover
-            pass
-
-        return None
-
     def list_jobs(self) -> List:
         """
         This method accesses the head ray node in your cluster and lists the running jobs.
@@ -813,11 +788,6 @@ def remove_autogenerated_fields(resource):
             else:
                 remove_autogenerated_fields(resource[key])
 
-        # After cleaning, remove empty metadata sections
-        if "metadata" in resource and isinstance(resource["metadata"], dict):
-            if len(resource["metadata"]) == 0:
-                del resource["metadata"]
-
     elif isinstance(resource, list):
         for item in resource:
             remove_autogenerated_fields(item)
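
Note on the reverted behavior: is_dashboard_ready() once again swallows only SSLError, and cluster_dashboard_uri() no longer returns None when no ingress is found, so callers that poll an unfinished cluster must tolerate plain connection errors themselves. A minimal caller-side sketch under those assumptions (the cluster name and namespace are hypothetical; Cluster, ClusterConfiguration, apply(), and is_dashboard_ready() are the SDK entry points touched by this commit):

import time

import requests

from codeflare_sdk import Cluster, ClusterConfiguration

# Hypothetical configuration for illustration only.
cluster = Cluster(ClusterConfiguration(name="raytest", namespace="default"))
cluster.apply()

# After the revert, only SSLError is handled inside is_dashboard_ready(),
# so a ConnectionError raised before the dashboard is up must be caught here.
for _ in range(30):
    try:
        if cluster.is_dashboard_ready():
            break
    except requests.exceptions.ConnectionError:
        pass
    time.sleep(10)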

src/codeflare_sdk/ray/cluster/test_cluster.py

Lines changed: 4 additions & 2 deletions
@@ -322,8 +322,10 @@ def test_cluster_uris(mocker):
     mocker.patch(
         "kubernetes.client.NetworkingV1Api.list_namespaced_ingress",
     )
-    # When no ingress/route/service is found, the method should return None
-    assert cluster.cluster_dashboard_uri() is None
+    assert (
+        cluster.cluster_dashboard_uri()
+        == "Dashboard not available yet, have you run cluster.up()?"
+    )
 
     mocker.patch(
         "codeflare_sdk.ray.cluster.cluster._is_openshift_cluster", return_value=True
