21 changes: 12 additions & 9 deletions setup.RHOAI-v2.13/CLUSTER-SETUP.md
@@ -1,6 +1,6 @@
# Cluster Setup

The cluster setup installs Red Hat OpenShift AI and Coscheduler, configures Kueue,
The cluster setup installs Red Hat OpenShift AI and configures Scheduler Plugins, Kueue,
cluster roles, and priority classes.

## Priorities
@@ -10,23 +10,26 @@ Create `default-priority`, `high-priority`, and `low-priority` priority classes:
oc apply -f setup.RHOAI-v2.13/mlbatch-priorities.yaml
```
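As a rough sketch of what one of these classes looks like (the actual definitions live in `setup.RHOAI-v2.13/mlbatch-priorities.yaml`; the value below is illustrative, not taken from that file):
```yaml
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: default-priority
value: 10                  # illustrative value; see mlbatch-priorities.yaml for the real one
globalDefault: false
description: Default priority class for MLBatch workloads.
```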

## Scheduler Plugins
## Scheduler Configuration

MLBatch utilizes Kubernetes Scheduler Plugins to ensure gang scheduling of
multi-Pod workloads and to pack `Pods` onto `Nodes` to reduce GPU fragmentation.
MLBatch configures Kubernetes scheduling to accomplish two objectives:
+ Obtaining gang (all or nothing) scheduling for multi-Pod workloads.
+ Packing Pods whose GPU request is less than the number of GPUs on a Node to
maximize the number of Nodes available for Pods that request all the GPUs on a Node.

This is done by installing the Coscheduling out-of-tree scheduler plugin and configuring
the default NodeResourcesFit scheduler plugin to pack in the GPU dimension.

### Coscheduler

Install Coscheduler v0.28.9 as a secondary scheduler and configure packing:
```sh
helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \
scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \
--set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]'
```
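For readability, the `pluginConfig` value passed with `--set-json` above corresponds to the following YAML:
```yaml
pluginConfig:
- name: NodeResourcesFit
  args:
    scoringStrategy:
      type: RequestedToCapacityRatio
      resources:
      - name: nvidia.com/gpu
        weight: 1
      requestedToCapacityRatio:
        shape:                        # score grows with GPU utilization, i.e. pack in the GPU dimension
        - utilization: 0
          score: 0
        - utilization: 100
          score: 10
- name: Coscheduling
  args:
    permitWaitingTimeSeconds: 300     # how long a gang may wait for all of its Pods to be admitted
```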
Patch Coscheduler pod priorities:
Patch scheduler-plugins pod priorities:
```sh
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.13/coscheduler-priority-patch.yaml scheduler-plugins-controller
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.13/coscheduler-priority-patch.yaml scheduler-plugins-scheduler
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.13/scheduler-priority-patch.yaml scheduler-plugins-controller
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.13/scheduler-priority-patch.yaml scheduler-plugins-scheduler
```
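The patch file is a JSON patch (note `--type=json`). A minimal sketch of such a patch, assuming its purpose is to run the scheduler Pods at an elevated priority, is:
```yaml
# Sketch only; see setup.RHOAI-v2.13/scheduler-priority-patch.yaml for the actual content.
- op: add
  path: /spec/template/spec/priorityClassName
  value: system-node-critical   # assumed priority class
```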


21 changes: 12 additions & 9 deletions setup.RHOAI-v2.16/CLUSTER-SETUP.md
@@ -1,6 +1,6 @@
# Cluster Setup

The cluster setup installs Red Hat OpenShift AI and Coscheduler, configures Kueue,
The cluster setup installs Red Hat OpenShift AI and configures Scheduler Plugins, Kueue,
cluster roles, and priority classes.

## Priorities
@@ -10,23 +10,26 @@ Create `default-priority`, `high-priority`, and `low-priority` priority classes:
oc apply -f setup.RHOAI-v2.16/mlbatch-priorities.yaml
```

## Scheduler Plugins
## Scheduler Configuration

MLBatch utilizes Kubernetes Scheduler Plugins to ensure gang scheduling of
multi-Pod workloads and to pack `Pods` onto `Nodes` to reduce GPU fragmentation.
MLBatch configures Kubernetes scheduling to accomplish two objectives:
+ Obtaining gang (all or nothing) scheduling for multi-Pod workloads.
+ Packing Pods whose GPU request is less than the number of GPUs on a Node to
maximize the number of Nodes available for Pods that request all the GPUs on a Node.

This is done by installing the Coscheduling out-of-tree scheduler plugin and configuring
the default NodeResourcesFit scheduler plugin to pack in the GPU dimension.

### Coscheduler

Install Coscheduler v0.28.9 as a secondary scheduler and configure packing:
```sh
helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \
scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \
--set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]'
```
Patch Coscheduler pod priorities:
Patch scheduler-plugins pod priorities:
```sh
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.16/coscheduler-priority-patch.yaml scheduler-plugins-controller
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.16/coscheduler-priority-patch.yaml scheduler-plugins-scheduler
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.16/scheduler-priority-patch.yaml scheduler-plugins-controller
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.16/scheduler-priority-patch.yaml scheduler-plugins-scheduler
```
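To confirm the patched deployments rolled out cleanly, one option is:
```sh
oc get deployment -n scheduler-plugins scheduler-plugins-controller scheduler-plugins-scheduler
```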


21 changes: 12 additions & 9 deletions setup.RHOAI-v2.17/CLUSTER-SETUP.md
@@ -1,6 +1,6 @@
# Cluster Setup

The cluster setup installs Red Hat OpenShift AI and Coscheduler, configures Kueue,
The cluster setup installs Red Hat OpenShift AI and configures Scheduler Plugins, Kueue,
cluster roles, and priority classes.

## Priorities
@@ -10,23 +10,26 @@ Create `default-priority`, `high-priority`, and `low-priority` priority classes:
oc apply -f setup.RHOAI-v2.17/mlbatch-priorities.yaml
```

## Scheduler Plugins
## Scheduler Configuration

MLBatch utilizes Kubernetes Scheduler Plugins to ensure gang scheduling of
multi-Pod workloads and to pack `Pods` onto `Nodes` to reduce GPU fragmentation.
MLBatch configures Kubernetes scheduling to accomplish two objectives:
+ Obtaining gang (all or nothing) scheduling for multi-Pod workloads.
+ Packing Pods whose GPU request is less than the number of GPUs on a Node to
maximize the number of Nodes available for Pods that request all the GPUs on a Node.

This is done by installing the Coscheduling out-of-tree scheduler plugin and configuring
the default NodeResourcesFit scheduler plugin to pack in the GPU dimension.

### Coscheduler

Install Coscheduler v0.28.9 as a secondary scheduler and configure packing:
```sh
helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \
scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \
--set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]'
```
Patch Coscheduler pod priorities:
Patch scheduler-plugins pod priorities:
```sh
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.17/coscheduler-priority-patch.yaml scheduler-plugins-controller
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.17/coscheduler-priority-patch.yaml scheduler-plugins-scheduler
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.17/scheduler-priority-patch.yaml scheduler-plugins-controller
oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.17/scheduler-priority-patch.yaml scheduler-plugins-scheduler
```
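A quick way to check that the scheduler Pods picked up the intended priority class, for example:
```sh
oc get pods -n scheduler-plugins -o custom-columns=NAME:.metadata.name,PRIORITY_CLASS:.spec.priorityClassName
```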


32 changes: 18 additions & 14 deletions setup.k8s/CLUSTER-SETUP.md
@@ -16,24 +16,28 @@ Create `default-priority`, `high-priority`, and `low-priority` priority classes:
kubectl apply -f setup.k8s/mlbatch-priorities.yaml
```

## Scheduler Plugins
## Scheduler Configuration

MLBatch configures Kubernetes scheduling to accomplish two objectives:
+ Obtaining gang (all or nothing) scheduling for multi-Pod workloads.
+ Packing Pods whose GPU request is less than the number of GPUs on a Node to
maximize the number of Nodes available for Pods that request all the GPUs on a Node.

The currently recommended way to do this is by installing the Coscheduling out-of-tree scheduler
plugin and configuring the default NodeResourcesFit scheduler plugin to pack in the GPU dimension.
Alternatively, you can skip the helm install and patch commands shown below and instead install
the experimental Sakkara scheduler plugin (described next).

MLBatch utilizes Kubernetes Scheduler Plugins to ensure gang scheduling of
multi-Pod workloads and to pack `Pods` onto `Nodes` to reduce GPU fragmentation.
Two options are described below: Coscheduler and Sakkara. You should pick and install one of them
as a secondary scheduler for your cluster.
### Coscheduler

Install Coscheduler v0.28.9 as a secondary scheduler and configure packing:
```sh
helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \
scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \
--set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]'
```
Patch Coscheduler pod priorities:
Patch scheduler-plugins pod priorities:
```sh
kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s/coscheduler-priority-patch.yaml scheduler-plugins-controller
kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s/coscheduler-priority-patch.yaml scheduler-plugins-scheduler
kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s/scheduler-priority-patch.yaml scheduler-plugins-controller
kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s/scheduler-priority-patch.yaml scheduler-plugins-scheduler
```
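This is not a setup step, but as an illustration of how the Coscheduling plugin gangs Pods: a workload creates a `PodGroup` stating its minimum size and labels its Pods with the group name, and no Pod in the group is bound until all of them can be placed. The scheduler name below is the chart's default as a secondary scheduler and should be verified against your install; all other names are hypothetical.
```yaml
apiVersion: scheduling.x-k8s.io/v1alpha1
kind: PodGroup
metadata:
  name: demo-gang                  # hypothetical name
spec:
  minMember: 2                     # all-or-nothing threshold for the gang
---
apiVersion: v1
kind: Pod
metadata:
  name: demo-gang-0                # hypothetical; a second Pod would be labeled the same way
  labels:
    scheduling.x-k8s.io/pod-group: demo-gang
spec:
  schedulerName: scheduler-plugins-scheduler   # chart default for the secondary scheduler
  containers:
  - name: worker
    image: registry.k8s.io/pause:3.9
    resources:
      limits:
        nvidia.com/gpu: 1
```
In the MLBatch stack these objects are normally created for you by the workload operators; the sketch only shows the underlying mechanism.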

### Sakkara
@@ -56,9 +60,9 @@ kubectl create namespace mlbatch-system

Install the Kubeflow Training Operator

If you are using Coscheduler do:
If you are using Coscheduling do:
```sh
kubectl apply --server-side -k setup.k8s/training-operator/coscheduler
kubectl apply --server-side -k setup.k8s/training-operator/coscheduling
```
If you are using Sakkara do:
```sh
Expand All @@ -76,9 +80,9 @@ kubectl apply --server-side -k setup.k8s/kueue
```

Install the AppWrapper Operator
If you are using Coscheduler do:
If you are using Coscheduling do:
```sh
kubectl apply --server-side -k setup.k8s/appwrapper/coscheduler
kubectl apply --server-side -k setup.k8s/appwrapper/coscheduling
```
If you are using Sakkara do:
```sh
39 changes: 23 additions & 16 deletions setup.tmpl/CLUSTER-SETUP.md.tmpl
@@ -1,7 +1,7 @@
# Cluster Setup

{{ if .RHOAI -}}
The cluster setup installs Red Hat OpenShift AI and Coscheduler, configures Kueue,
The cluster setup installs Red Hat OpenShift AI and configures Scheduler Plugins, Kueue,
cluster roles, and priority classes.

{{- else -}}
@@ -23,26 +23,33 @@ Create `default-priority`, `high-priority`, and `low-priority` priority classes:
{{ .KUBECTL }} apply -f setup.{{ .VERSION }}/mlbatch-priorities.yaml
```

## Scheduler Plugins
## Scheduler Configuration

MLBatch utilizes Kubernetes Scheduler Plugins to ensure gang scheduling of
multi-Pod workloads and to pack `Pods` onto `Nodes` to reduce GPU fragmentation.
{{ if not .RHOAI -}}
Two options are described below: Coscheduler and Sakkara. You should pick and install one of them
as a secondary scheduler for your cluster.
MLBatch configures Kubernetes scheduling to accomplish two objectives:
+ Obtaining gang (all or nothing) scheduling for multi-Pod workloads.
+ Packing Pods whose GPU request is less than the number of GPUs on a Node to
maximize the number of Nodes available for Pods that request all the GPUs on a Node.

{{ if .RHOAI -}}
This is done by installing the Coscheduling out-of-tree scheduler plugin and configuring
the default NodeResourcesFit scheduler plugin to pack in the GPU dimension.
{{- else -}}
The currently recommended way to do this is by installing the Coscheduling out-of-tree scheduler
plugin and configuring the default NodeResourcesFit scheduler plugin to pack in the GPU dimension.
Alternatively, you can skip the helm install and patch commands shown below and instead install
the experimental Sakkara scheduler plugin (described next).
{{- end }}
### Coscheduler

Install Coscheduler v0.28.9 as a secondary scheduler and configure packing:

```sh
helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \
scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \
--set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]'
```
Patch Coscheduler pod priorities:
Patch scheduler-plugins pod priorities:
```sh
{{ .KUBECTL }} patch deployment -n scheduler-plugins --type=json --patch-file setup.{{ .VERSION }}/coscheduler-priority-patch.yaml scheduler-plugins-controller
{{ .KUBECTL }} patch deployment -n scheduler-plugins --type=json --patch-file setup.{{ .VERSION }}/coscheduler-priority-patch.yaml scheduler-plugins-scheduler
{{ .KUBECTL }} patch deployment -n scheduler-plugins --type=json --patch-file setup.{{ .VERSION }}/scheduler-priority-patch.yaml scheduler-plugins-controller
{{ .KUBECTL }} patch deployment -n scheduler-plugins --type=json --patch-file setup.{{ .VERSION }}/scheduler-priority-patch.yaml scheduler-plugins-scheduler
```

{{ if not .RHOAI -}}
@@ -137,9 +144,9 @@ Create the mlbatch-system namespace

Install the Kubeflow Training Operator

If you are using Coscheduler do:
If you are using Coscheduling do:
```sh
{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/training-operator/coscheduler
{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/training-operator/coscheduling
```
If you are using Sakkara do:
```sh
@@ -157,9 +164,9 @@ Install Kueue
```

Install the AppWrapper Operator
If you are using Coscheduler do:
If you are using Coscheduling do:
```sh
{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/appwrapper/coscheduler
{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/appwrapper/coscheduling
```
If you are using Sakkara do:
```sh