Skip to content

Commit 2f0e79c

Browse files
committed
Merge branch 'main' into kueue-integration
2 parents 65ceb80 + 56696fd commit 2f0e79c

32 files changed

+2544
-1462
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 103 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,33 @@ jobs:
6868
with:
6969
worker-nodes: 1
7070

71+
- name: Wait for KIND nodes to be ready
72+
run: |
73+
echo "=== Waiting for KIND nodes to be ready ==="
74+
echo "Initial node status:"
75+
kubectl get nodes -o wide
76+
77+
# Wait for all nodes to be ready
78+
echo "Waiting for all nodes to reach Ready state..."
79+
kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
80+
echo "ERROR: Nodes did not become ready!"
81+
echo "Node status:"
82+
kubectl get nodes -o wide
83+
echo "Node conditions:"
84+
kubectl describe nodes
85+
echo "System pods:"
86+
kubectl get pods -n kube-system -o wide
87+
exit 1
88+
}
89+
90+
# Verify CNI is ready
91+
echo "Verifying CNI (kindnet) pods are ready..."
92+
kubectl wait --for=condition=Ready pods -n kube-system -l app=kindnet --timeout=60s || { echo "ERROR: kindnet pods did not become ready!"; kubectl get pods -n kube-system -l app=kindnet -o wide; exit 1; }  # on timeout, dump kindnet pod state before failing, like the node wait does
93+
94+
echo "Final cluster state:"
95+
kubectl get nodes -o wide
96+
kubectl get pods -n kube-system -o wide
97+
7198
- name: Install NVidia GPU operator for KinD
7299
uses: ./common/github-actions/nvidia-gpu-operator
73100

@@ -89,30 +116,84 @@ jobs:
89116

90117
- name: Configure RBAC for sdk user with limited permissions
91118
run: |
92-
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
93-
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
94-
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
95-
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
96-
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
97-
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
98-
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
99-
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
100-
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
101-
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
102-
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
103-
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
104-
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
105-
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
106-
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
107-
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
108-
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
109-
kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
110-
kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
111-
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
112-
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
113-
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
119+
echo "=== Configuring RBAC for sdk-user ==="
120+
121+
# Create a comprehensive ClusterRole with all needed permissions
122+
cat <<EOF | kubectl apply -f -
123+
apiVersion: rbac.authorization.k8s.io/v1
124+
kind: ClusterRole
125+
metadata:
126+
name: sdk-user-role
127+
rules:
128+
# Core resources
129+
- apiGroups: [""]
130+
resources: ["pods", "pods/log", "pods/status", "pods/portforward", "services", "endpoints", "persistentvolumeclaims", "events", "configmaps", "secrets", "nodes", "namespaces", "serviceaccounts", "replicationcontrollers"]
131+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
132+
# Apps resources
133+
- apiGroups: ["apps"]
134+
resources: ["deployments", "daemonsets", "replicasets", "statefulsets"]
135+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
136+
# Batch resources
137+
- apiGroups: ["batch"]  # group name only; RBAC apiGroups never include a version such as "/v1"
138+
resources: ["jobs", "cronjobs"]
139+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
140+
# Autoscaling resources
141+
- apiGroups: ["autoscaling"]
142+
resources: ["horizontalpodautoscalers"]
143+
verbs: ["get", "list", "watch"]
144+
# Networking resources
145+
- apiGroups: ["networking.k8s.io"]
146+
resources: ["ingresses", "networkpolicies"]
147+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
148+
# RBAC resources (read-only)
149+
- apiGroups: ["rbac.authorization.k8s.io"]
150+
resources: ["roles", "rolebindings", "clusterroles", "clusterrolebindings"]
151+
verbs: ["get", "list", "watch"]
152+
# CRD resources
153+
- apiGroups: ["apiextensions.k8s.io"]
154+
resources: ["customresourcedefinitions"]
155+
verbs: ["get", "list", "watch"]
156+
# Ray resources
157+
- apiGroups: ["ray.io"]
158+
resources: ["rayclusters", "rayjobs", "rayservices", "rayclusters/status", "rayjobs/status", "rayservices/status"]
159+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
160+
# AppWrapper resources (MCAD)
161+
- apiGroups: ["workload.codeflare.dev"]
162+
resources: ["appwrappers", "appwrappers/status"]
163+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
164+
# Kueue resources
165+
- apiGroups: ["kueue.x-k8s.io"]
166+
resources: ["clusterqueues", "localqueues", "resourceflavors", "workloads", "workloads/status"]
167+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
168+
# Metrics
169+
- apiGroups: ["metrics.k8s.io"]
170+
resources: ["pods", "nodes"]
171+
verbs: ["get", "list"]
172+
EOF
173+
174+
# Create ClusterRoleBinding
175+
cat <<EOF | kubectl apply -f -
176+
apiVersion: rbac.authorization.k8s.io/v1
177+
kind: ClusterRoleBinding
178+
metadata:
179+
name: sdk-user-role-binding
180+
roleRef:
181+
apiGroup: rbac.authorization.k8s.io
182+
kind: ClusterRole
183+
name: sdk-user-role
184+
subjects:
185+
- kind: User
186+
name: sdk-user
187+
apiGroup: rbac.authorization.k8s.io
188+
EOF
189+
190+
echo "RBAC configuration complete. Switching context to sdk-user..."
114191
kubectl config use-context sdk-user
115192
193+
# Verify permissions
194+
echo "Verifying sdk-user permissions..."
195+
kubectl auth can-i --list || true
196+
116197
- name: Run e2e tests
117198
run: |
118199
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}

0 commit comments

Comments
 (0)