34 changes: 22 additions & 12 deletions .github/workflows/e2e_tests.yaml
@@ -5,13 +5,13 @@ on:
pull_request:
branches:
- main
- 'release-*'
- "release-*"
- ray-jobs-feature
paths-ignore:
- 'docs/**'
- '**.adoc'
- '**.md'
- 'LICENSE'
- "docs/**"
- "**.adoc"
- "**.md"
- "LICENSE"

concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
@@ -33,9 +33,9 @@ jobs:
- name: Checkout common repo code
uses: actions/checkout@v4
with:
repository: 'project-codeflare/codeflare-common'
ref: 'main'
path: 'common'
repository: "project-codeflare/codeflare-common"
ref: "main"
path: "common"

- name: Checkout CodeFlare operator repository
uses: actions/checkout@v4
@@ -46,7 +46,7 @@
- name: Set Go
uses: actions/setup-go@v5
with:
go-version-file: './codeflare-operator/go.mod'
go-version-file: "./codeflare-operator/go.mod"
cache-dependency-path: "./codeflare-operator/go.sum"

- name: Set up gotestfmt
@@ -76,7 +76,7 @@ jobs:
run: |
cd codeflare-operator
echo Setting up CodeFlare stack
make setup-e2e
make setup-e2e KUEUE_VERSION=v0.13.4 KUBERAY_VERSION=v1.4.0
echo Deploying CodeFlare operator
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
@@ -95,6 +95,10 @@ jobs:
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs
kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
kubectl create clusterrole rayjob-status-reader --verb=get,list,patch,update --resource=rayjobs/status
kubectl create clusterrolebinding sdk-user-rayjob-status-reader --clusterrole=rayjob-status-reader --user=sdk-user
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
@@ -122,7 +126,7 @@ jobs:
pip install poetry
poetry install --with test,docs
echo "Running e2e tests..."
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
poetry run pytest -v -s ./tests/e2e/ -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
env:
GRPC_DNS_RESOLVER: "native"

@@ -146,7 +150,13 @@ jobs:
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing KubeRay operator logs"
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
kubectl logs -n default --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log

- name: Print Kueue controller logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing Kueue controller logs"
kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kueue.log

- name: Export all KinD pod logs
uses: ./common/github-actions/kind-export-logs
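The four new `kubectl create clusterrole`/`clusterrolebinding` lines above extend the limited `sdk-user` from RayClusters to RayJobs, including the `rayjobs/status` subresource. A minimal sketch of what those grants permit, using the official `kubernetes` Python client (the `sdk-user` context is created earlier in the workflow by `kind-add-user`; the `default` namespace is an assumption):

```python
# Sketch: exercise the new RayJob RBAC as the restricted sdk-user.
# Assumes a kubeconfig context named "sdk-user", as set up by the workflow.
from kubernetes import client, config

config.load_kube_config(context="sdk-user")
api = client.CustomObjectsApi()

# get/list on rayjobs is granted by the rayjob-creator clusterrole.
jobs = api.list_namespaced_custom_object(
    group="ray.io", version="v1", namespace="default", plural="rayjobs"
)

# Reading the status subresource is granted by rayjob-status-reader.
for job in jobs["items"]:
    name = job["metadata"]["name"]
    status = job.get("status", {}).get("jobDeploymentStatus", "<none>")
    print(f"{name}: {status}")
```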
172 changes: 172 additions & 0 deletions .github/workflows/rayjob_e2e_tests.yaml
@@ -0,0 +1,172 @@
# rayjob e2e tests workflow for CodeFlare-SDK
name: rayjob-e2e

on:
pull_request:
branches:
- main
- "release-*"
- ray-jobs-feature
paths-ignore:
- "docs/**"
- "**.adoc"
- "**.md"
- "LICENSE"

concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
cancel-in-progress: true

env:
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"

jobs:
kubernetes-rayjob:
runs-on: gpu-t4-4-core

steps:
- name: Checkout code
uses: actions/checkout@v4
with:
submodules: recursive

- name: Checkout common repo code
uses: actions/checkout@v4
with:
repository: "project-codeflare/codeflare-common"
ref: "main"
path: "common"

- name: Checkout CodeFlare operator repository
uses: actions/checkout@v4
with:
repository: project-codeflare/codeflare-operator
path: codeflare-operator

- name: Set Go
uses: actions/setup-go@v5
with:
go-version-file: "./codeflare-operator/go.mod"
cache-dependency-path: "./codeflare-operator/go.sum"

- name: Set up gotestfmt
uses: gotesttools/gotestfmt-action@v2
with:
token: ${{ secrets.GITHUB_TOKEN }}

- name: Set up specific Python version
uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: "pip" # caching pip dependencies

- name: Setup NVidia GPU environment for KinD
uses: ./common/github-actions/nvidia-gpu-setup

- name: Setup and start KinD cluster
uses: ./common/github-actions/kind
with:
worker-nodes: 1

- name: Install NVidia GPU operator for KinD
uses: ./common/github-actions/nvidia-gpu-operator

- name: Deploy CodeFlare stack
id: deploy
run: |
cd codeflare-operator
echo Setting up CodeFlare stack
make setup-e2e KUEUE_VERSION=v0.13.4 KUBERAY_VERSION=v1.4.0
echo Deploying CodeFlare operator
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
cd ..

- name: Add user to KinD
uses: ./common/github-actions/kind-add-user
with:
user-name: sdk-user

- name: Configure RBAC for sdk user with limited permissions
run: |
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs
kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
kubectl create clusterrole rayjob-status-reader --verb=get,list,patch,update --resource=rayjobs/status
kubectl create clusterrolebinding sdk-user-rayjob-status-reader --clusterrole=rayjob-status-reader --user=sdk-user
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
kubectl config use-context sdk-user

- name: Run RayJob E2E tests
run: |
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV

set -euo pipefail
pip install poetry
poetry install --with test,docs
echo "Running RayJob e2e tests..."
poetry run pytest -v -s ./tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log 2>&1

- name: Switch to kind-cluster context to print logs
if: always() && steps.deploy.outcome == 'success'
run: kubectl config use-context kind-cluster

- name: Print Pytest output log
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing Pytest output logs"
cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log

- name: Print CodeFlare operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing CodeFlare operator logs"
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log

- name: Print KubeRay operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing KubeRay operator logs"
kubectl logs -n default --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log

- name: Print Kueue controller logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing Kueue controller logs"
kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kueue.log

- name: Export all KinD pod logs
uses: ./common/github-actions/kind-export-logs
if: always() && steps.deploy.outcome == 'success'
with:
output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}

- name: Upload logs
uses: actions/upload-artifact@v4
if: always() && steps.deploy.outcome == 'success'
with:
name: logs
retention-days: 10
path: |
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
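The new workflow mirrors the existing e2e job but runs only `tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py`, i.e. a RayJob that creates and tears down its own cluster. The test file itself is not part of this diff; below is an illustrative sketch of the lifecycle it covers (the image, version strings, names, and the minimal cluster spec are assumptions):

```python
# Illustrative sketch (not the actual test) of a job-lifecycled RayJob:
# KubeRay spins the cluster up for the job and deletes it when the job ends.
import time
from kubernetes import client, config

config.load_kube_config()
api = client.CustomObjectsApi()

ray_cluster_spec = {  # minimal, illustrative cluster spec
    "rayVersion": "2.9.0",
    "headGroupSpec": {
        "rayStartParams": {"dashboard-host": "0.0.0.0"},
        "template": {
            "spec": {
                "containers": [{"name": "ray-head", "image": "rayproject/ray:2.9.0"}]
            }
        },
    },
}

rayjob = {
    "apiVersion": "ray.io/v1",
    "kind": "RayJob",
    "metadata": {"name": "demo-job", "namespace": "default"},
    "spec": {
        "entrypoint": "python -c \"print('hello from ray')\"",
        "shutdownAfterJobFinishes": True,  # the job owns the cluster lifecycle
        "rayClusterSpec": ray_cluster_spec,
    },
}
api.create_namespaced_custom_object("ray.io", "v1", "default", "rayjobs", rayjob)

# Poll until the job reaches a terminal state.
while True:
    job = api.get_namespaced_custom_object("ray.io", "v1", "default", "rayjobs", "demo-job")
    if job.get("status", {}).get("jobStatus") in ("SUCCEEDED", "FAILED"):
        break
    time.sleep(10)
```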
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default.

6 changes: 1 addition & 5 deletions src/codeflare_sdk/common/utils/k8s_utils.py
@@ -7,14 +7,10 @@
from ..kubernetes_cluster import config_check, _kube_api_error_handling


def get_current_namespace():
def get_current_namespace(): # pragma: no cover
"""
Retrieves the current Kubernetes namespace.
This function attempts to detect the current namespace by:
1. First checking if running inside a pod (reading from service account namespace file)
2. Falling back to reading from the current kubeconfig context
Returns:
str:
The current namespace or None if not found.
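The docstring is trimmed and the function excluded from coverage, but the detection order it described is unchanged: first the in-pod service account file, then the kubeconfig context. A sketch of that logic (the real implementation in `k8s_utils.py` differs in its error handling):

```python
import os
from kubernetes import config

NAMESPACE_FILE = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"

def get_current_namespace():
    # 1. Inside a pod, the service account mount names the namespace.
    if os.path.isfile(NAMESPACE_FILE):
        with open(NAMESPACE_FILE) as f:
            return f.read().strip()
    # 2. Otherwise fall back to the active kubeconfig context, if it sets one.
    try:
        _, active_context = config.list_kube_config_contexts()
        return active_context["context"].get("namespace")
    except Exception:
        return None
```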
1 change: 0 additions & 1 deletion src/codeflare_sdk/ray/cluster/build_ray_cluster.py
@@ -133,7 +133,6 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
"enableIngress": False,
"rayStartParams": {
"dashboard-host": "0.0.0.0",
"dashboard-port": "8265",
"block": "true",
"num-gpus": str(head_gpu_count),
"resources": head_resources,
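Dropping the hardcoded `dashboard-port` is behavior-neutral: Ray's dashboard listens on 8265 by default, so pinning it in the CR only got in the way of KubeRay managing the port. A sketch of the head group's start params after this change (the two variable values are illustrative; the builder derives them from the `ClusterConfiguration`):

```python
head_gpu_count = 0   # illustrative; computed by the surrounding builder code
head_resources = "{}"  # illustrative; computed by the surrounding builder code

# rayStartParams after this change; the dashboard port now falls back to
# Ray's built-in default (8265) rather than being pinned in the RayCluster CR.
ray_start_params = {
    "dashboard-host": "0.0.0.0",
    "block": "true",
    "num-gpus": str(head_gpu_count),
    "resources": head_resources,
}
```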
21 changes: 19 additions & 2 deletions src/codeflare_sdk/ray/cluster/cluster.py
@@ -208,6 +208,10 @@ def apply(self, force=False):
self._throw_for_no_raycluster()
namespace = self.config.namespace
name = self.config.name

# Regenerate resource_yaml to reflect any configuration changes
self.resource_yaml = self.create_resource()

try:
self.config_check()
api_instance = client.CustomObjectsApi(get_api_client())
@@ -387,16 +391,25 @@ def is_dashboard_ready(self) -> bool:
bool:
True if the dashboard is ready, False otherwise.
"""

dashboard_uri = self.cluster_dashboard_uri()
if dashboard_uri is None:
return False

try:
response = requests.get(
self.cluster_dashboard_uri(),
dashboard_uri,
headers=self._client_headers,
timeout=5,
verify=self._client_verify_tls,
)
except requests.exceptions.SSLError: # pragma no cover
# SSL exception occurs when oauth ingress has been created but cluster is not up
return False
except Exception: # pragma no cover
# Any other exception (connection errors, timeouts, etc.)
return False

if response.status_code == 200:
return True
else:
@@ -504,6 +517,8 @@ def cluster_dashboard_uri(self) -> str:
):
protocol = "https" if route["spec"].get("tls") else "http"
return f"{protocol}://{route['spec']['host']}"
# No route found for this cluster
return "Dashboard not available yet, have you run cluster.up()?"
else:
try:
api_instance = client.NetworkingV1Api(get_api_client())
@@ -522,7 +537,8 @@ def cluster_dashboard_uri(self) -> str:
protocol = "http"
elif "route.openshift.io/termination" in annotations:
protocol = "https"
return f"{protocol}://{ingress.spec.rules[0].host}"
return f"{protocol}://{ingress.spec.rules[0].host}"

return "Dashboard not available yet, have you run cluster.up()?"

def list_jobs(self) -> List:
@@ -783,6 +799,7 @@ def remove_autogenerated_fields(resource):
del resource[key]
else:
remove_autogenerated_fields(resource[key])

elif isinstance(resource, list):
for item in resource:
remove_autogenerated_fields(item)
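Taken together, the `cluster.py` changes make reconfigure-and-reapply flows predictable: `apply()` regenerates the resource YAML before patching, and the dashboard helpers return `False` or a friendly message instead of raising while the route or ingress is still coming up. A usage sketch (the cluster name, namespace, and worker count change are illustrative):

```python
import time
from codeflare_sdk import Cluster, ClusterConfiguration

cluster = Cluster(ClusterConfiguration(name="demo", namespace="default", num_workers=1))
cluster.apply()

# Reconfigure and re-apply: resource_yaml is regenerated, so the edit sticks.
cluster.config.num_workers = 2
cluster.apply()

# is_dashboard_ready() now returns False (instead of raising) while the
# ingress/route is absent or TLS is not yet serving.
while not cluster.is_dashboard_ready():
    time.sleep(5)
print(cluster.cluster_dashboard_uri())
```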
2 changes: 1 addition & 1 deletion src/codeflare_sdk/ray/cluster/test_config.py
@@ -1,4 +1,4 @@
# Copyright 2022-2025 IBM, Red Hat
# Copyright 2024 IBM, Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.