@@ -28,6 +28,7 @@ import (
2828 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2929 "k8s.io/apimachinery/pkg/runtime"
3030 "k8s.io/apimachinery/pkg/types"
31+ "k8s.io/utils/ptr"
3132 "sigs.k8s.io/yaml"
3233)
3334
@@ -179,3 +180,196 @@ func deployment(replicaCount int, milliCPU int64) workloadv1beta2.AppWrapperComp
179180 Template : runtime.RawExtension {Raw : jsonBytes },
180181 }
181182}
183+
// rayClusterYAML is a fmt format string describing a ray.io/v1 RayCluster
// manifest with a head group and a single worker group
// (small-group-unit-test-cluster-ray). It contains five %v verbs, consumed in
// this order: the name under metadata, the worker group's maxReplicas,
// minReplicas and replicas, and the cpu request of the worker container.
// The head container asks for cpu 2 / memory 8G with nvidia.com/gpu 0; the
// worker container requests memory 5G and nvidia.com/gpu 7.
// Because the text is passed through a format function, any literal '%' added
// here later would have to be written as '%%'.
// NOTE(review): the YAML nesting appears flattened in this view; confirm the
// indentation against the checked-in file before relying on the structure.
184+ const rayClusterYAML = `
185+ apiVersion: ray.io/v1
186+ kind: RayCluster
187+ metadata:
188+ labels:
189+ controller-tools.k8s.io: '1.0'
190+ name: %v
191+ spec:
192+ autoscalerOptions:
193+ idleTimeoutSeconds: 60
194+ imagePullPolicy: Always
195+ resources:
196+ limits:
197+ cpu: 500m
198+ memory: 512Mi
199+ requests:
200+ cpu: 500m
201+ memory: 512Mi
202+ upscalingMode: Default
203+ enableInTreeAutoscaling: false
204+ headGroupSpec:
205+ rayStartParams:
206+ block: 'true'
207+ dashboard-host: 0.0.0.0
208+ num-gpus: '0'
209+ serviceType: ClusterIP
210+ template:
211+ spec:
212+ containers:
213+ - env:
214+ - name: MY_POD_IP
215+ valueFrom:
216+ fieldRef:
217+ fieldPath: status.podIP
218+ - name: RAY_USE_TLS
219+ value: '0'
220+ - name: RAY_TLS_SERVER_CERT
221+ value: /home/ray/workspace/tls/server.crt
222+ - name: RAY_TLS_SERVER_KEY
223+ value: /home/ray/workspace/tls/server.key
224+ - name: RAY_TLS_CA_CERT
225+ value: /home/ray/workspace/tls/ca.crt
226+ image: quay.io/project-codeflare/ray:latest-py39-cu118
227+ imagePullPolicy: Always
228+ lifecycle:
229+ preStop:
230+ exec:
231+ command:
232+ - /bin/sh
233+ - -c
234+ - ray stop
235+ name: ray-head
236+ ports:
237+ - containerPort: 6379
238+ name: gcs
239+ - containerPort: 8265
240+ name: dashboard
241+ - containerPort: 10001
242+ name: client
243+ resources:
244+ limits:
245+ cpu: 2
246+ memory: 8G
247+ nvidia.com/gpu: 0
248+ requests:
249+ cpu: 2
250+ memory: 8G
251+ nvidia.com/gpu: 0
252+ volumeMounts:
253+ - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
254+ name: odh-trusted-ca-cert
255+ subPath: odh-trusted-ca-bundle.crt
256+ - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
257+ name: odh-trusted-ca-cert
258+ subPath: odh-trusted-ca-bundle.crt
259+ - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
260+ name: odh-ca-cert
261+ subPath: odh-ca-bundle.crt
262+ - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
263+ name: odh-ca-cert
264+ subPath: odh-ca-bundle.crt
265+ imagePullSecrets:
266+ - name: unit-test-pull-secret
267+ volumes:
268+ - configMap:
269+ items:
270+ - key: ca-bundle.crt
271+ path: odh-trusted-ca-bundle.crt
272+ name: odh-trusted-ca-bundle
273+ optional: true
274+ name: odh-trusted-ca-cert
275+ - configMap:
276+ items:
277+ - key: odh-ca-bundle.crt
278+ path: odh-ca-bundle.crt
279+ name: odh-trusted-ca-bundle
280+ optional: true
281+ name: odh-ca-cert
282+ rayVersion: 2.7.0
283+ workerGroupSpecs:
284+ - groupName: small-group-unit-test-cluster-ray
285+ maxReplicas: %v
286+ minReplicas: %v
287+ rayStartParams:
288+ block: 'true'
289+ num-gpus: '7'
290+ replicas: %v
291+ template:
292+ metadata:
293+ annotations:
294+ key: value
295+ labels:
296+ key: value
297+ spec:
298+ containers:
299+ - env:
300+ - name: MY_POD_IP
301+ valueFrom:
302+ fieldRef:
303+ fieldPath: status.podIP
304+ - name: RAY_USE_TLS
305+ value: '0'
306+ - name: RAY_TLS_SERVER_CERT
307+ value: /home/ray/workspace/tls/server.crt
308+ - name: RAY_TLS_SERVER_KEY
309+ value: /home/ray/workspace/tls/server.key
310+ - name: RAY_TLS_CA_CERT
311+ value: /home/ray/workspace/tls/ca.crt
312+ image: quay.io/project-codeflare/ray:latest-py39-cu118
313+ lifecycle:
314+ preStop:
315+ exec:
316+ command:
317+ - /bin/sh
318+ - -c
319+ - ray stop
320+ name: machine-learning
321+ resources:
322+ requests:
323+ cpu: %v
324+ memory: 5G
325+ nvidia.com/gpu: 7
326+ volumeMounts:
327+ - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
328+ name: odh-trusted-ca-cert
329+ subPath: odh-trusted-ca-bundle.crt
330+ - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
331+ name: odh-trusted-ca-cert
332+ subPath: odh-trusted-ca-bundle.crt
333+ - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
334+ name: odh-ca-cert
335+ subPath: odh-ca-bundle.crt
336+ - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
337+ name: odh-ca-cert
338+ subPath: odh-ca-bundle.crt
339+ imagePullSecrets:
340+ - name: unit-test-pull-secret
341+ volumes:
342+ - configMap:
343+ items:
344+ - key: ca-bundle.crt
345+ path: odh-trusted-ca-bundle.crt
346+ name: odh-trusted-ca-bundle
347+ optional: true
348+ name: odh-trusted-ca-cert
349+ - configMap:
350+ items:
351+ - key: odh-ca-bundle.crt
352+ path: odh-ca-bundle.crt
353+ name: odh-trusted-ca-bundle
354+ optional: true
355+ name: odh-ca-cert
356+ `
357+
358+ func rayCluster (workerCount int , milliCPU int64 ) workloadv1beta2.AppWrapperComponent {
359+ workerCPU := resource .NewMilliQuantity (milliCPU , resource .DecimalSI )
360+ yamlString := fmt .Sprintf (rayClusterYAML ,
361+ randName ("raycluster" ),
362+ workerCount , workerCount , workerCount ,
363+ workerCPU )
364+
365+ jsonBytes , err := yaml .YAMLToJSON ([]byte (yamlString ))
366+ Expect (err ).NotTo (HaveOccurred ())
367+ replicas := int32 (workerCount )
368+ return workloadv1beta2.AppWrapperComponent {
369+ PodSets : []workloadv1beta2.AppWrapperPodSet {
370+ {Replicas : ptr .To (int32 (1 )), Path : "template.spec.headGroupSpec.template" },
371+ {Replicas : & replicas , Path : "template.spec.workerGroupSpecs[0].template" },
372+ },
373+ Template : runtime.RawExtension {Raw : jsonBytes },
374+ }
375+ }
0 commit comments