6868 with :
6969 worker-nodes : 1
7070
71+ - name : Wait for KIND nodes to be ready
72+ run : |
73+ echo "=== Waiting for KIND nodes to be ready ==="
74+ echo "Initial node status:"
75+ kubectl get nodes -o wide
76+
77+ # Block for up to 300s until every node reports Ready; on timeout dump diagnostics and fail the step
78+ echo "Waiting for all nodes to reach Ready state..."
79+ if ! kubectl wait --for=condition=Ready nodes --all --timeout=300s; then
80+ echo "ERROR: Nodes did not become ready!"
81+ echo "Node status:"
82+ kubectl get nodes -o wide
83+ echo "Node conditions:"
84+ kubectl describe nodes
85+ echo "System pods:"
86+ kubectl get pods -n kube-system -o wide
87+ exit 1
88+ fi
89+
90+ # Give the pods labelled app=kindnet in kube-system up to 60s to become Ready
91+ echo "Verifying CNI (kindnet) pods are ready..."
92+ kubectl wait --for=condition=Ready pods -n kube-system -l app=kindnet --timeout=60s
93+
94+ echo "Final cluster state:"
95+ kubectl get nodes -o wide
96+ kubectl get pods -n kube-system -o wide
97+
7198 - name : Install NVidia GPU operator for KinD
7299 uses : ./common/github-actions/nvidia-gpu-operator  # "./" path: action is loaded from this repository, not a published action
73100
@@ -89,30 +116,84 @@ jobs:
89116
90117 - name : Configure RBAC for sdk user with limited permissions
91118 run : |
92- kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
93- kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
94- kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
95- kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
96- kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
97- kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
98- kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
99- kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
100- kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
101- kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
102- kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
103- kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
104- kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
105- kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
106- kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
107- kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
108- kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
109- kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
110- kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
111- kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
112- kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
113- kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
119+ echo "=== Configuring RBAC for sdk-user ==="
120+
121+ # Apply a single ClusterRole (sdk-user-role) holding every permission granted to sdk-user; "kubectl apply" makes the step re-runnable
122+ cat <<EOF | kubectl apply -f -
123+ apiVersion: rbac.authorization.k8s.io/v1
124+ kind: ClusterRole
125+ metadata:
126+ name: sdk-user-role
127+ rules:
128+ # Core ("" API group) resources: read and write
129+ - apiGroups: [""]
130+ resources: ["pods", "pods/log", "pods/status", "pods/portforward", "services", "endpoints", "persistentvolumeclaims", "events", "configmaps", "secrets", "nodes", "namespaces", "serviceaccounts", "replicationcontrollers"]
131+ verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
132+ # Workload controllers in the apps group: read and write
133+ - apiGroups: ["apps"]
134+ resources: ["deployments", "daemonsets", "replicasets", "statefulsets"]
135+ verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
136+ # Batch resources (apiGroups holds the bare group name only; a group/version string such as batch/v1 never matches)
137+ - apiGroups: ["batch"]
138+ resources: ["jobs", "cronjobs"]
139+ verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
140+ # Autoscaling resources (read-only)
141+ - apiGroups: ["autoscaling"]
142+ resources: ["horizontalpodautoscalers"]
143+ verbs: ["get", "list", "watch"]
144+ # Networking resources: read and write
145+ - apiGroups: ["networking.k8s.io"]
146+ resources: ["ingresses", "networkpolicies"]
147+ verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
148+ # RBAC resources (read-only)
149+ - apiGroups: ["rbac.authorization.k8s.io"]
150+ resources: ["roles", "rolebindings", "clusterroles", "clusterrolebindings"]
151+ verbs: ["get", "list", "watch"]
152+ # CRD definitions (read-only)
153+ - apiGroups: ["apiextensions.k8s.io"]
154+ resources: ["customresourcedefinitions"]
155+ verbs: ["get", "list", "watch"]
156+ # Ray resources and their status subresources: read and write
157+ - apiGroups: ["ray.io"]
158+ resources: ["rayclusters", "rayjobs", "rayservices", "rayclusters/status", "rayjobs/status", "rayservices/status"]
159+ verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
160+ # AppWrapper resources (MCAD)
161+ - apiGroups: ["workload.codeflare.dev"]
162+ resources: ["appwrappers", "appwrappers/status"]
163+ verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
164+ # Kueue resources: read and write
165+ - apiGroups: ["kueue.x-k8s.io"]
166+ resources: ["clusterqueues", "localqueues", "resourceflavors", "workloads", "workloads/status"]
167+ verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
168+ # Metrics API (get/list only)
169+ - apiGroups: ["metrics.k8s.io"]
170+ resources: ["pods", "nodes"]
171+ verbs: ["get", "list"]
172+ EOF
173+
174+ # Bind sdk-user-role to the User named sdk-user, cluster-wide
175+ cat <<EOF | kubectl apply -f -
176+ apiVersion: rbac.authorization.k8s.io/v1
177+ kind: ClusterRoleBinding
178+ metadata:
179+ name: sdk-user-role-binding
180+ roleRef:
181+ apiGroup: rbac.authorization.k8s.io
182+ kind: ClusterRole
183+ name: sdk-user-role
184+ subjects:
185+ - kind: User
186+ name: sdk-user
187+ apiGroup: rbac.authorization.k8s.io
188+ EOF
189+
190+ echo "RBAC configuration complete. Switching context to sdk-user..."
114191 kubectl config use-context sdk-user
115192
193+ # Print the permissions of the now-active sdk-user context for the job log; "|| true" keeps this check non-fatal
194+ echo "Verifying sdk-user permissions..."
195+ kubectl auth can-i --list || true
196+
116197 - name : Run e2e tests
117198 run : |
118199 export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
0 commit comments