@@ -16,84 +16,6 @@ metadata:
1616---
1717apiVersion : v1
1818kind : ConfigMap
19- metadata :
20- name : mlbatch-codeflare
21- namespace : redhat-ods-operator
22- data :
23- manager.yaml : |
24- apiVersion: apps/v1
25- kind: Deployment
26- metadata:
27- name: manager
28- namespace: system
29- spec:
30- selector:
31- matchLabels:
32- app.kubernetes.io/name: codeflare-operator
33- app.kubernetes.io/part-of: codeflare
34- replicas: 1
35- template:
36- metadata:
37- annotations:
38- kubectl.kubernetes.io/default-container: manager
39- labels:
40- app.kubernetes.io/name: codeflare-operator
41- app.kubernetes.io/part-of: codeflare
42- spec:
43- priorityClassName: system-node-critical
44- securityContext:
45- runAsNonRoot: true
46- # TODO(user): For common cases that do not require escalating privileges
47- # it is recommended to ensure that all your Pods/Containers are restrictive.
48- # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
49- # Please uncomment the following code if your project does NOT have to work on old Kubernetes
50- # versions < 1.20 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ).
51- # seccompProfile:
52- # type: RuntimeDefault
53- containers:
54- - command:
55- - /manager
56- image: $(codeflare_operator_controller_image)
57- imagePullPolicy: Always
58- name: manager
59- securityContext:
60- allowPrivilegeEscalation: false
61- capabilities:
62- drop:
63- - "ALL"
64- env:
65- - name: NAMESPACE
66- valueFrom:
67- fieldRef:
68- fieldPath: metadata.namespace
69- ports:
70- - containerPort: 8080
71- protocol: TCP
72- name: metrics
73- livenessProbe:
74- httpGet:
75- path: /healthz
76- port: 8081
77- initialDelaySeconds: 15
78- periodSeconds: 20
79- readinessProbe:
80- httpGet:
81- path: /readyz
82- port: 8081
83- initialDelaySeconds: 5
84- periodSeconds: 10
85- resources:
86- limits:
87- cpu: "1"
88- memory: 1Gi
89- requests:
90- cpu: "1"
91- memory: 1Gi
92- serviceAccountName: controller-manager
93- terminationGracePeriodSeconds: 10
94- ---
95- apiVersion : v1
96- kind : ConfigMap
9719metadata :
9820 name : codeflare-operator-config
9921 namespace : redhat-ods-applications
@@ -129,25 +51,6 @@ data:
12951 ---
13052apiVersion : v1
13153kind : ConfigMap
132- metadata :
133- name : mlbatch-kuberay
134- namespace : redhat-ods-operator
135- data :
136- kuberay-operator-image-patch.yaml : |
137- apiVersion: apps/v1
138- kind: Deployment
139- metadata:
140- name: kuberay-operator
141- spec:
142- template:
143- spec:
144- priorityClassName: system-node-critical
145- containers:
146- - name: kuberay-operator
147- image: $(image)
148- ---
149- apiVersion : v1
150- kind : ConfigMap
15154metadata :
15255 name : mlbatch-kueue
15356 namespace : redhat-ods-operator
15861 health:
15962 healthProbeBindAddress: :8081
16063 metrics:
161- bindAddress: :8080
64+ bindAddress: :8443
16265 enableClusterQueueResources: true
16366 webhook:
16467 port: 9443
17174 Pod: 5
17275 Workload.kueue.x-k8s.io: 5
17376 LocalQueue.kueue.x-k8s.io: 1
77+ Cohort.kueue.x-k8s.io: 1
17478 ClusterQueue.kueue.x-k8s.io: 1
17579 ResourceFlavor.kueue.x-k8s.io: 1
17680 clientConnection:
18185 enable: false
18286 blockAdmission: false
18387 manageJobsWithoutQueueName: true
88+ #managedJobsNamespaceSelector:
89+ # matchLabels:
90+ # kueue-managed: "true"
18491 #internalCertManagement:
18592 # enable: false
18693 # webhookServiceName: ""
@@ -198,6 +105,8 @@ data:
198105 - "kubeflow.org/tfjob"
199106 - "kubeflow.org/xgboostjob"
200107 # - "pod"
108+ # - "deployment" # requires enabling pod integration
109+ # - "statefulset" # requires enabling pod integration
201110 externalFrameworks:
202111 - "AppWrapper.v1beta2.workload.codeflare.dev"
203112 # podOptions:
@@ -209,31 +118,14 @@ data:
209118 fairSharing:
210119 enable: true
211120 preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare]
212- manager_config_patch.yaml : |
213- apiVersion: apps/v1
214- kind: Deployment
215- metadata:
216- name: controller-manager
217- namespace: system
218- spec:
219- template:
220- spec:
221- priorityClassName: system-node-critical
222- containers:
223- - name: manager
224- image: $(image)
225- args:
226- - "--config=/controller_manager_config.yaml"
227- - "--zap-log-level=2"
228- - "--feature-gates=LendingLimit=true"
229- volumeMounts:
230- - name: manager-config
231- mountPath: /controller_manager_config.yaml
232- subPath: controller_manager_config.yaml
233- volumes:
234- - name: manager-config
235- configMap:
236- name: manager-config
121+ #resources:
122+ # excludeResourcePrefixes: []
123+ # transformations:
124+ # - input: nvidia.com/mig-4g.5gb
125+ # strategy: Replace | Retain
126+ # outputs:
127+ # example.com/accelerator-memory: 5Gi
128+ # example.com/accelerator-gpc: 4
237129 ---
238130apiVersion : v1
239131kind : ConfigMap
@@ -249,20 +141,23 @@ data:
249141 spec:
250142 template:
251143 spec:
252- priorityClassName: system-node-critical
253144 containers:
254145 - name: training-operator
255146 image: $(image)
256147 args:
257148 - "--zap-log-level=2"
149+ - --pytorch-init-container-image
150+ - $(image)
151+ - "--webhook-secret-name"
152+ - "kubeflow-training-operator-webhook-cert"
153+ - "--webhook-service-name"
154+ - "kubeflow-training-operator"
258155 - "--gang-scheduler-name=scheduler-plugins-scheduler"
259- resources:
260- requests:
261- cpu: 100m
262- memory: 100Mi
263- limits:
264- cpu: 500m
265- memory: 1000Mi
156+ volumes:
157+ - name: cert
158+ secret:
159+ defaultMode: 420
160+ secretName: kubeflow-training-operator-webhook-cert
266161 ---
267162apiVersion : operators.coreos.com/v1alpha1
268163kind : Subscription
@@ -283,25 +178,16 @@ spec:
283178 - name : mlbatch-codeflare
284179 mountPath : /opt/manifests/codeflare/manager/manager.yaml
285180 subPath : manager.yaml
286- - name : mlbatch-kuberay
287- mountPath : /opt/manifests/ray/openshift/kuberay-operator-image-patch.yaml
288- subPath : kuberay-operator-image-patch.yaml
289181 - name : mlbatch-kueue
290182 mountPath : /opt/manifests/kueue/components/manager/controller_manager_config.yaml
291183 subPath : controller_manager_config.yaml
292- - name : mlbatch-kueue
293- mountPath : /opt/manifests/kueue/rhoai/manager_config_patch.yaml
294- subPath : manager_config_patch.yaml
295184 - name : mlbatch-training-operator
296185 mountPath : /opt/manifests/trainingoperator/rhoai/manager_config_patch.yaml
297186 subPath : manager_config_patch.yaml
298187 volumes :
299188 - name : mlbatch-codeflare
300189 configMap :
301190 name : mlbatch-codeflare
302- - name : mlbatch-kuberay
303- configMap :
304- name : mlbatch-kuberay
305191 - name : mlbatch-kueue
306192 configMap :
307193 name : mlbatch-kueue
0 commit comments