Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b731b50
feat(RHOAIENG-26480): Run RayJobs against existing RayClusters
kryanbeane Jul 29, 2025
496db20
Updated coverage.svg
github-actions[bot] Jul 30, 2025
3e13697
feat(RHOAIENG-26590): Report RayJob status via SDK
chipspeak Jul 29, 2025
eafca47
feat(RHOAIENG-26487): Cluster lifecycling via RayJob
chipspeak Jul 29, 2025
3335613
feat(RHOAIENG-26487): rayjob lifecycled cluster improvements and tests
kryanbeane Aug 12, 2025
8668693
task(RHOAIENG-26481): Existing cluster RayJob demo notebook
chipspeak Aug 1, 2025
c35c040
feat(RHOAIENG-26482): add gcs fault tolerance
kryanbeane Aug 12, 2025
6f5accb
feat(RHOAIENG-26482): disable usage stats and rename RayJobClusterConfig
kryanbeane Aug 13, 2025
a5fa957
feat(RHOAIENG-29330):Deny RayCluster creation with Ray Version mismat…
LilyLinh Aug 20, 2025
d3b5180
Delete unused code in config and test_config
LilyLinh Aug 25, 2025
36d24a7
feat(RHOAIENG-29391): Store entrypoint scripts in configMaps
chipspeak Aug 19, 2025
0bd26a0
Changes as per review
chipspeak Aug 27, 2025
23e368e
Changes as per review again because I'm dumb
chipspeak Aug 27, 2025
d6c9a55
added kubeconfig loads to test
chipspeak Aug 27, 2025
ca9fcb9
feat(RHOAIENG-26488): add lifecycled RayCluster demo notebook for Ray…
kryanbeane Aug 13, 2025
cb5589c
test: e2e rayjob
pawelpaszki Aug 29, 2025
416ba8d
RHOAIENG-30720: Remove GCS FT for Lifecycled RayClusters
kryanbeane Aug 27, 2025
47122ee
fix: update auth methods in rayjob notebooks
kryanbeane Aug 29, 2025
c549a0e
RHOAIENG-27792: Add stop and resubmit functions to RayJob
kryanbeane Sep 1, 2025
aea0cff
RHOAIENG-27792: Auto tear down training config map when job is deleted
kryanbeane Sep 1, 2025
538d345
RHOAIENG-27792: rayjob test improvements
kryanbeane Sep 5, 2025
d259ec1
RHOAIENG-32532: Update kueue integration
kryanbeane Sep 10, 2025
7398069
RHOAIENG-32532: Run RayJob tests in CI
kryanbeane Sep 18, 2025
80d1307
RHOAIENG-32532: Add additional test coverage
kryanbeane Sep 18, 2025
65ceb80
RHOAIENG-32532: Fix broken E2E tests
kryanbeane Sep 19, 2025
2f0e79c
Merge branch 'main' into kueue-integration
kryanbeane Sep 26, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 103 additions & 22 deletions .github/workflows/e2e_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,33 @@ jobs:
with:
worker-nodes: 1

# Blocks until the KinD cluster is usable: every node must report Ready and
# the kindnet CNI pods must be Ready before the following steps run.
- name: Wait for KIND nodes to be ready
  run: |
    echo "=== Waiting for KIND nodes to be ready ==="
    echo "Initial node status:"
    kubectl get nodes -o wide

    # Allow up to 300 seconds for all nodes to become Ready. On failure,
    # print node and kube-system diagnostics, then fail the step.
    echo "Waiting for all nodes to reach Ready state..."
    if ! kubectl wait --for=condition=Ready nodes --all --timeout=300s; then
      echo "ERROR: Nodes did not become ready!"
      echo "Node status:"
      kubectl get nodes -o wide
      echo "Node conditions:"
      kubectl describe nodes
      echo "System pods:"
      kubectl get pods -n kube-system -o wide
      exit 1
    fi

    # Pods labelled app=kindnet in kube-system get a further 60 seconds.
    echo "Verifying CNI (kindnet) pods are ready..."
    kubectl wait --for=condition=Ready pods -n kube-system -l app=kindnet --timeout=60s

    # Snapshot of the cluster for the job log.
    echo "Final cluster state:"
    kubectl get nodes -o wide
    kubectl get pods -n kube-system -o wide

# Runs the repository-local action at ./common/github-actions/nvidia-gpu-operator.
# NOTE(review): per the step name this installs the NVIDIA GPU operator into
# the KinD cluster; the action's contents are not visible here — confirm.
- name: Install NVidia GPU operator for KinD
  uses: ./common/github-actions/nvidia-gpu-operator

Expand All @@ -89,30 +116,84 @@ jobs:

# Grants the user "sdk-user" its cluster permissions through one ClusterRole
# (sdk-user-role) and one ClusterRoleBinding (sdk-user-role-binding), then
# switches the active kubectl context to sdk-user.
# NOTE(review): despite the step name, the role is broad (for example write
# access to secrets, nodes and namespaces) — confirm this is intended for CI.
- name: Configure RBAC for sdk user with limited permissions
  run: |
    echo "=== Configuring RBAC for sdk-user ==="

    # Create a comprehensive ClusterRole with all needed permissions.
    # `kubectl apply` keeps the step re-runnable, and the quoted heredoc
    # delimiter passes the manifest to kubectl without shell expansion.
    kubectl apply -f - <<'EOF'
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRole
    metadata:
      name: sdk-user-role
    rules:
      # Core resources
      - apiGroups: [""]
        resources:
          - pods
          - pods/log
          - pods/status
          - pods/portforward
          - services
          - endpoints
          - persistentvolumeclaims
          - events
          - configmaps
          - secrets
          - nodes
          - namespaces
          - serviceaccounts
          - replicationcontrollers
        verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
      # Apps resources
      - apiGroups: ["apps"]
        resources: ["deployments", "daemonsets", "replicasets", "statefulsets"]
        verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
      # Batch resources (apiGroups takes the bare group name, never group/version)
      - apiGroups: ["batch"]
        resources: ["jobs", "cronjobs"]
        verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
      # Autoscaling resources (read-only)
      - apiGroups: ["autoscaling"]
        resources: ["horizontalpodautoscalers"]
        verbs: ["get", "list", "watch"]
      # Networking resources
      - apiGroups: ["networking.k8s.io"]
        resources: ["ingresses", "networkpolicies"]
        verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
      # RBAC resources (read-only)
      - apiGroups: ["rbac.authorization.k8s.io"]
        resources: ["roles", "rolebindings", "clusterroles", "clusterrolebindings"]
        verbs: ["get", "list", "watch"]
      # CRD resources (read-only)
      - apiGroups: ["apiextensions.k8s.io"]
        resources: ["customresourcedefinitions"]
        verbs: ["get", "list", "watch"]
      # Ray resources
      - apiGroups: ["ray.io"]
        resources:
          - rayclusters
          - rayjobs
          - rayservices
          - rayclusters/status
          - rayjobs/status
          - rayservices/status
        verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
      # AppWrapper resources (MCAD)
      - apiGroups: ["workload.codeflare.dev"]
        resources: ["appwrappers", "appwrappers/status"]
        verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
      # Kueue resources
      - apiGroups: ["kueue.x-k8s.io"]
        resources:
          - clusterqueues
          - localqueues
          - resourceflavors
          - workloads
          - workloads/status
        verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
      # Metrics (read-only)
      - apiGroups: ["metrics.k8s.io"]
        resources: ["pods", "nodes"]
        verbs: ["get", "list"]
    EOF

    # Bind the role to the User subject "sdk-user".
    kubectl apply -f - <<'EOF'
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRoleBinding
    metadata:
      name: sdk-user-role-binding
    roleRef:
      apiGroup: rbac.authorization.k8s.io
      kind: ClusterRole
      name: sdk-user-role
    subjects:
      - kind: User
        name: sdk-user
        apiGroup: rbac.authorization.k8s.io
    EOF

    # Every kubectl call after this line runs under the sdk-user context.
    echo "RBAC configuration complete. Switching context to sdk-user..."
    kubectl config use-context sdk-user

    # Best-effort listing of what sdk-user may do; `|| true` keeps a failure
    # of this informational command from failing the step.
    echo "Verifying sdk-user permissions..."
    kubectl auth can-i --list || true

- name: Run e2e tests
run: |
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
Expand Down
Loading
Loading