Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
bc7d226
feat(RHOAIENG-26480): Run RayJobs against existing RayClusters
kryanbeane Jul 29, 2025
44ed88e
Updated coverage.svg
github-actions[bot] Jul 30, 2025
b82621f
feat(RHOAIENG-26590): Report RayJob status via SDK
chipspeak Jul 29, 2025
cf0e7e2
feat(RHOAIENG-26487): Cluster lifecycling via RayJob
chipspeak Jul 29, 2025
93ac746
feat(RHOAIENG-26487): rayjob lifecycled cluster improvements and tests
kryanbeane Aug 12, 2025
78e8168
task(RHOAIENG-26481): Existing cluster RayJob demo notebook
chipspeak Aug 1, 2025
793ce4d
feat(RHOAIENG-26482): add gcs fault tolerance
kryanbeane Aug 12, 2025
49c7785
feat(RHOAIENG-26482): disable usage stats and rename RayJobClusterConfig
kryanbeane Aug 13, 2025
19ea57b
feat(RHOAIENG-29330): Deny RayCluster creation with Ray Version mismat…
LilyLinh Aug 20, 2025
7f8e3be
Delete unused code in config and test_config
LilyLinh Aug 25, 2025
54cf94f
feat(RHOAIENG-29391): Store entrypoint scripts in configMaps
chipspeak Aug 19, 2025
388342c
Changes as per review
chipspeak Aug 27, 2025
d31c925
Changes as per review again because I'm dumb
chipspeak Aug 27, 2025
248288c
added kubeconfig loads to test
chipspeak Aug 27, 2025
293e725
feat(RHOAIENG-26488): add lifecycled RayCluster demo notebook for Ray…
kryanbeane Aug 13, 2025
5adcf86
test: e2e rayjob
pawelpaszki Aug 29, 2025
5930ce6
RHOAIENG-30720: Remove GCS FT for Lifecycled RayClusters
kryanbeane Aug 27, 2025
783f474
fix: update auth methods in rayjob notebooks
kryanbeane Aug 29, 2025
090623b
RHOAIENG-27792: Add stop and resubmit functions to RayJob
kryanbeane Sep 1, 2025
5fdec3c
RHOAIENG-27792: Auto tear down training config map when job is deleted
kryanbeane Sep 1, 2025
665dcb2
RHOAIENG-27792: rayjob test improvements
kryanbeane Sep 5, 2025
8c9dd7d
added codecov permissions
laurafitzgerald Oct 6, 2025
ef75f78
feat(RHOAIENG-26480): Run RayJobs against existing RayClusters
kryanbeane Jul 29, 2025
219d1c5
RHOAIENG-32532: Add kueue integration and update tests
kryanbeane Oct 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/coverage-badge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ on:
jobs:
report:

permissions:
contents: write
pull-requests: write
runs-on: ubuntu-latest

steps:
Expand Down
34 changes: 22 additions & 12 deletions .github/workflows/e2e_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ on:
pull_request:
branches:
- main
- 'release-*'
- "release-*"
- ray-jobs-feature
paths-ignore:
- 'docs/**'
- '**.adoc'
- '**.md'
- 'LICENSE'
- "docs/**"
- "**.adoc"
- "**.md"
- "LICENSE"

concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
Expand All @@ -33,9 +33,9 @@ jobs:
- name: Checkout common repo code
uses: actions/checkout@v4
with:
repository: 'project-codeflare/codeflare-common'
ref: 'main'
path: 'common'
repository: "project-codeflare/codeflare-common"
ref: "main"
path: "common"

- name: Checkout CodeFlare operator repository
uses: actions/checkout@v4
Expand All @@ -46,7 +46,7 @@ jobs:
- name: Set Go
uses: actions/setup-go@v5
with:
go-version-file: './codeflare-operator/go.mod'
go-version-file: "./codeflare-operator/go.mod"
cache-dependency-path: "./codeflare-operator/go.sum"

- name: Set up gotestfmt
Expand Down Expand Up @@ -76,7 +76,7 @@ jobs:
run: |
cd codeflare-operator
echo Setting up CodeFlare stack
make setup-e2e
make setup-e2e KUEUE_VERSION=v0.13.4 KUBERAY_VERSION=v1.4.0
echo Deploying CodeFlare operator
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
Expand All @@ -95,6 +95,10 @@ jobs:
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs
kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
kubectl create clusterrole rayjob-status-reader --verb=get,list,patch,update --resource=rayjobs/status
kubectl create clusterrolebinding sdk-user-rayjob-status-reader --clusterrole=rayjob-status-reader --user=sdk-user
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
Expand Down Expand Up @@ -122,7 +126,7 @@ jobs:
pip install poetry
poetry install --with test,docs
echo "Running e2e tests..."
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
poetry run pytest -v -s ./tests/e2e/ -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
env:
GRPC_DNS_RESOLVER: "native"

Expand All @@ -146,7 +150,13 @@ jobs:
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing KubeRay operator logs"
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
kubectl logs -n default --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log

- name: Print Kueue controller logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing Kueue controller logs"
kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kueue.log

- name: Export all KinD pod logs
uses: ./common/github-actions/kind-export-logs
Expand Down
172 changes: 172 additions & 0 deletions .github/workflows/rayjob_e2e_tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# rayjob e2e tests workflow for CodeFlare-SDK
#
# Deploys the CodeFlare stack (CodeFlare operator, Kueue, KubeRay) into a KinD
# cluster on a GPU runner, then runs the RayJob lifecycled-cluster e2e test as
# a user with limited RBAC permissions.
name: rayjob-e2e

# Runs for pull requests targeting the listed branches; changes that only
# touch the ignored paths do not trigger it.
on:
  pull_request:
    branches:
      - main
      - "release-*"
      - ray-jobs-feature
    paths-ignore:
      - "docs/**"
      - "**.adoc"
      - "**.md"
      - "LICENSE"

# One active run per source branch and workflow; a newer run cancels the
# in-progress one.
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

env:
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"

jobs:
  kubernetes-rayjob:
    runs-on: gpu-t4-4-core

    steps:
      # --- Source checkouts -------------------------------------------------
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Provides the local composite actions referenced below as
      # ./common/github-actions/*.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: "project-codeflare/codeflare-common"
          ref: "main"
          path: "common"

      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator

      # --- Toolchains -------------------------------------------------------
      # The Go version is taken from the operator's go.mod.
      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: "./codeflare-operator/go.mod"
          cache-dependency-path: "./codeflare-operator/go.sum"

      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: "pip"  # caching pip dependencies

      # --- Cluster ----------------------------------------------------------
      - name: Setup NVidia GPU environment for KinD
        uses: ./common/github-actions/nvidia-gpu-setup

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 1

      - name: Install NVidia GPU operator for KinD
        uses: ./common/github-actions/nvidia-gpu-operator

      # The `deploy` id is referenced by the `if:` conditions of the
      # log-collection steps further down.
      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e KUEUE_VERSION=v0.13.4 KUBERAY_VERSION=v1.4.0
          echo Deploying CodeFlare operator
          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..

      # --- Test user and RBAC -----------------------------------------------
      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      # Creates one ClusterRole/ClusterRoleBinding pair per resource type for
      # sdk-user, then switches kubectl to the sdk-user context so the steps
      # that follow run with those permissions only.
      - name: Configure RBAC for sdk user with limited permissions
        run: |
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
          kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
          kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
          kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
          kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
          kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs
          kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
          kubectl create clusterrole rayjob-status-reader --verb=get,list,patch,update --resource=rayjobs/status
          kubectl create clusterrolebinding sdk-user-rayjob-status-reader --clusterrole=rayjob-status-reader --user=sdk-user
          kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
          kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
          kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
          kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
          kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
          kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
          kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
          kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
          kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
          kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
          kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
          kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
          kubectl config use-context sdk-user

      # --- Tests ------------------------------------------------------------
      # NOTE(review): env.TEMP_DIR is not defined in this workflow; presumably
      # it is exported by one of the composite actions above - confirm.
      # CODEFLARE_TEST_OUTPUT_DIR is written to GITHUB_ENV so that later steps
      # can read it. Pytest output goes to a log file, which the
      # "Print Pytest output log" step prints afterwards.
      - name: Run RayJob E2E tests
        run: |
          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV

          set -euo pipefail
          pip install poetry
          poetry install --with test,docs
          echo "Running RayJob e2e tests..."
          poetry run pytest -v -s ./tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log 2>&1

      # --- Diagnostics ------------------------------------------------------
      # Every step below uses `always() && steps.deploy.outcome == 'success'`:
      # it runs even when the tests failed, but only if the stack was deployed.
      # The kubectl context is switched from sdk-user to kind-cluster first.
      - name: Switch to kind-cluster context to print logs
        if: always() && steps.deploy.outcome == 'success'
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log

      # `--tail -1` requests the complete log rather than only the most
      # recent lines.
      - name: Print CodeFlare operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing CodeFlare operator logs"
          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n default --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log

      - name: Print Kueue controller logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Kueue controller logs"
          kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kueue.log

      # NOTE(review): the input below is a shell-style reference, not a
      # `${{ }}` expression, so the workflow passes it through as literal
      # text; presumably the action expands it in a shell step - confirm.
      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
14 changes: 14 additions & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Codecov configuration.

# Paths excluded from coverage reporting.
ignore:
  - "**/*.ipynb"
  - "demo-notebooks/**"
  - "**/__init__.py"

coverage:
  # Report coverage with two decimal places, rounding down.
  precision: 2
  round: down
  status:
    # Whole-project status check.
    project:
      default:
        target: auto
        threshold: 2.5%
    # Status check for the lines changed by the pull request.
    patch:
      default:
        target: 85%
        threshold: 2.5%
4 changes: 2 additions & 2 deletions coverage.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading