volcano-sh
diff --git a/‎charts/kthena/charts/workload/crds/workload.serving.volcano.sh_modelservings.yaml‎
Lines changed: 1 addition & 14 deletions b/‎charts/kthena/charts/workload/crds/workload.serving.volcano.sh_modelservings.yaml‎
Lines changed: 1 addition & 14 deletions
diff --git a/‎client-go/applyconfiguration/workload/v1alpha1/rollingupdateconfiguration.go‎
Lines changed: 0 additions & 9 deletions b/‎client-go/applyconfiguration/workload/v1alpha1/rollingupdateconfiguration.go‎
Lines changed: 0 additions & 9 deletions
diff --git a/‎docs/kthena/docs/reference/crd/workload.serving.volcano.sh.md‎
Lines changed: 1 addition & 2 deletions b/‎docs/kthena/docs/reference/crd/workload.serving.volcano.sh.md‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎examples/kthena-router/ModelRouteLora.yaml‎
Lines changed: 1 addition & 1 deletion b/‎examples/kthena-router/ModelRouteLora.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pkg/apis/workload/v1alpha1/model_serving_types.go‎
Lines changed: 3 additions & 12 deletions b/‎pkg/apis/workload/v1alpha1/model_serving_types.go‎
Lines changed: 3 additions & 12 deletions
diff --git a/‎pkg/apis/workload/v1alpha1/zz_generated.deepcopy.go‎
Lines changed: 6 additions & 2 deletions b/‎pkg/apis/workload/v1alpha1/zz_generated.deepcopy.go‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎pkg/kthena-router/backend/vllm/models.go‎
Lines changed: 4 additions & 0 deletions b/‎pkg/kthena-router/backend/vllm/models.go‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎pkg/model-serving-controller/controller/model_serving_controller.go‎
Lines changed: 35 additions & 18 deletions b/‎pkg/model-serving-controller/controller/model_serving_controller.go‎
Lines changed: 35 additions & 18 deletions
diff --git a/‎pkg/model-serving-controller/utils/utils.go‎
Lines changed: 13 additions & 0 deletions b/‎pkg/model-serving-controller/utils/utils.go‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎pkg/model-serving-controller/utils/utils_test.go‎
Lines changed: 144 additions & 0 deletions b/‎pkg/model-serving-controller/utils/utils_test.go‎
Lines changed: 144 additions & 0 deletions
@@ -65,19 +65,6 @@ spec:
                       RollingUpdateConfiguration defines the parameters to be used when type is RollingUpdateStrategyType.
                       optional
                     properties:
-                      maxSurge:
-                        anyOf:
-                        - type: integer
-                        - type: string
-                        default: 0
-                        description: |-
-                          The maximum number of replicas that can be scheduled above the original number of
-                          replicas.
-                          Value can be an absolute number (ex: 5) or a percentage of total replicas at
-                          the start of the update (ex: 10%).
-                          Absolute number is calculated from percentage by rounding up.
-                          By default, a value of 0 is used.
-                        x-kubernetes-int-or-string: true
                       maxUnavailable:
                         anyOf:
                         - type: integer
@@ -87,7 +74,7 @@ spec:
                           The maximum number of replicas that can be unavailable during the update.
                           Value can be an absolute number (ex: 5) or a percentage of total replicas at the start of update (ex: 10%).
                           Absolute number is calculated from percentage by rounding down.
-                          This can not be 0 if MaxSurge is 0.
+                          This can not be 0.
                           By default, a fixed value of 1 is used.
                         x-kubernetes-int-or-string: true
                       partition:
 
@@ -678,8 +678,7 @@ _Appears in:_
 
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `maxUnavailable` _[IntOrString](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.33/#intorstring-intstr-util)_ | The maximum number of replicas that can be unavailable during the update.<br />Value can be an absolute number (ex: 5) or a percentage of total replicas at the start of update (ex: 10%).<br />Absolute number is calculated from percentage by rounding down.<br />This can not be 0 if MaxSurge is 0.<br />By default, a fixed value of 1 is used. | 1 | XIntOrString: \{\} <br /> |
-| `maxSurge` _[IntOrString](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.33/#intorstring-intstr-util)_ | The maximum number of replicas that can be scheduled above the original number of<br />replicas.<br />Value can be an absolute number (ex: 5) or a percentage of total replicas at<br />the start of the update (ex: 10%).<br />Absolute number is calculated from percentage by rounding up.<br />By default, a value of 0 is used. | 0 | XIntOrString: \{\} <br /> |
+| `maxUnavailable` _[IntOrString](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.33/#intorstring-intstr-util)_ | The maximum number of replicas that can be unavailable during the update.<br />Value can be an absolute number (ex: 5) or a percentage of total replicas at the start of update (ex: 10%).<br />Absolute number is calculated from percentage by rounding down.<br />This can not be 0.<br />By default, a fixed value of 1 is used. | 1 | XIntOrString: \{\} <br /> |
 | `partition` _integer_ | Partition indicates the ordinal at which the ModelServing should be partitioned<br />for updates. During a rolling update, all ServingGroups from ordinal Replicas-1 to<br />Partition are updated. All ServingGroups from ordinal Partition-1 to 0 remain untouched.<br />The default value is 0. |  |  |
 
 
 
@@ -10,4 +10,4 @@ spec:
   rules:
   - name: "lora-route"
     targetModels:
-    - modelServerName: "deepseek-r1-1-5b"
+    - modelServerName: "deepseek-r1-7b"
@@ -104,21 +104,12 @@ type RollingUpdateConfiguration struct {
 	// The maximum number of replicas that can be unavailable during the update.
 	// Value can be an absolute number (ex: 5) or a percentage of total replicas at the start of update (ex: 10%).
 	// Absolute number is calculated from percentage by rounding down.
-	// This can not be 0 if MaxSurge is 0.
+	// This can not be 0.
 	// By default, a fixed value of 1 is used.
 	// +kubebuilder:validation:XIntOrString
 	// +kubebuilder:default=1
-	MaxUnavailable intstr.IntOrString `json:"maxUnavailable,omitempty"`
-
-	// The maximum number of replicas that can be scheduled above the original number of
-	// replicas.
-	// Value can be an absolute number (ex: 5) or a percentage of total replicas at
-	// the start of the update (ex: 10%).
-	// Absolute number is calculated from percentage by rounding up.
-	// By default, a value of 0 is used.
-	// +kubebuilder:validation:XIntOrString
-	// +kubebuilder:default=0
-	MaxSurge intstr.IntOrString `json:"maxSurge,omitempty"`
+	MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
+
 	// Partition indicates the ordinal at which the ModelServing should be partitioned
 	// for updates. During a rolling update, all ServingGroups from ordinal Replicas-1 to
 	// Partition are updated. All ServingGroups from ordinal Partition-1 to 0 remain untouched.
 
@@ -41,6 +41,10 @@ func (engine *vllmEngine) GetPodModels(pod *corev1.Pod) ([]string, error) {
 	}
 	defer resp.Body.Close()
 
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("failed to get models from pod %s/%s: HTTP %d", pod.GetNamespace(), pod.GetName(), resp.StatusCode)
+	}
+
 	body, err := io.ReadAll(resp.Body)
 	if err != nil {
 		return nil, err
 
@@ -840,16 +840,41 @@ func (c *ModelServingController) DeleteRole(ctx context.Context, ms *workloadv1a
 }
 
 func (c *ModelServingController) manageServingGroupRollingUpdate(ctx context.Context, ms *workloadv1alpha1.ModelServing, revision string) error {
+	maxUnavailable, err := utils.GetMaxUnavailable(ms)
+	if err != nil {
+		return fmt.Errorf("failed to calculate maxUnavailable: %v", err)
+	}
+
 	servingGroupList, err := c.store.GetServingGroupByModelServing(utils.GetNamespaceName(ms))
 	if err != nil {
 		return fmt.Errorf("cannot get ServingGroupList from store, err:%v", err)
 	}
 
+	// Count how many ServingGroups are currently not running (i.e. unavailable).
+	currentUnavailableCount := 0
+	for _, sg := range servingGroupList {
+		if sg.Status != datastore.ServingGroupRunning {
+			currentUnavailableCount++
+		}
+	}
+	// Check whether the maxUnavailable limit has already been reached.
+	if currentUnavailableCount >= maxUnavailable {
+		// Wait until some groups become available before continuing updates
+		klog.V(4).Infof("current unavailable ServingGroup count %d has reached the maxUnavailable limit %d, waiting for next reconcile", currentUnavailableCount, maxUnavailable)
+		return nil
+	}
+	// Calculate how many more groups we can delete in this reconcile.
+	groupToDelete := maxUnavailable - currentUnavailableCount
+
+	// Determine if partition is set
 	partition := c.getPartition(ms)
 
+	// Terminate outdated ServingGroups (those that do not match the update revision),
+	// starting from the largest ordinal and deleting at most groupToDelete of them so
+	// that the maxUnavailable constraint is respected.
+	updateCount := 0
 	if partition > 0 {
 		// When partition is set, delete ServingGroups with ordinal >= partition
-		for i := len(servingGroupList) - 1; i >= 0; i-- {
+		for i := len(servingGroupList) - 1; i >= 0 && updateCount < groupToDelete; i-- {
 			_, ordinal := utils.GetParentNameAndOrdinal(servingGroupList[i].Name)
 			if ordinal < partition {
 				// Skip partition-protected ServingGroups
@@ -859,32 +884,24 @@ func (c *ModelServingController) manageServingGroupRollingUpdate(ctx context.Con
 			if c.isServingGroupOutdated(servingGroupList[i], ms.Namespace, revision) {
 				// target ServingGroup is not the latest version, needs to be updated
 				klog.V(2).Infof("ServingGroup %s will be terminated for update (partition=%d)", servingGroupList[i].Name, partition)
-				return c.deleteServingGroup(ctx, ms, servingGroupList[i].Name)
-			}
-			if servingGroupList[i].Status != datastore.ServingGroupRunning {
-				// target ServingGroup is the latest version, but not running. We need to wait for the status to change to running.
-				klog.V(4).Infof("waiting for the ServingGroup %s status become running", servingGroupList[i].Name)
-				return nil
+				if err := c.deleteServingGroup(ctx, ms, servingGroupList[i].Name); err != nil {
+					return err
+				}
+				updateCount += 1
 			}
 		}
 		klog.V(2).Infof("all target groups of modelServing %s have been updated (partition=%d)", ms.Name, partition)
 	} else {
 		// Original behavior: terminate the ServingGroup with the largest ordinal that does not match the update revision
-		for i := len(servingGroupList) - 1; i >= 0; i-- {
+		for i := len(servingGroupList) - 1; i >= 0 && updateCount < groupToDelete; i-- {
 			if c.isServingGroupOutdated(servingGroupList[i], ms.Namespace, revision) {
 				// target ServingGroup is not the latest version, needs to be updated
 				klog.V(2).Infof("ServingGroup %s will be terminated for update", servingGroupList[i].Name)
-				return c.deleteServingGroup(ctx, ms, servingGroupList[i].Name)
-			}
-			if servingGroupList[i].Status != datastore.ServingGroupRunning {
-				// target ServingGroup is the latest version, but not running. We need to wait for the status to change to running.
-				// If the group fails after rolling, it will automatically be deleted and rebuilt when detecting the pod failure.
-				// If the group still pending due to reasons such as being unable to be scheduled, rolling update process will stop
-				// to avoid affecting other groups that are running normally.
-				klog.V(4).Infof("waiting for the ServingGroup %s status become running", servingGroupList[i].Name)
-				return nil
+				if err := c.deleteServingGroup(ctx, ms, servingGroupList[i].Name); err != nil {
+					return err
+				}
+				updateCount += 1
 			}
-			// target ServingGroup is already the latest version and running, processing the rolling update of the next group.
 		}
 		klog.V(2).Infof("all target groups of modelServing %s have been updated", ms.Name)
 	}
 
@@ -30,6 +30,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/klog/v2"
 	"k8s.io/utils/ptr"
@@ -544,3 +545,15 @@ func RoleIDIndexFunc(obj interface{}) ([]string, error) {
 	compositeKey := fmt.Sprintf("%s/%s/%s/%s", namespace, groupName, roleName, roleID)
 	return []string{compositeKey}, nil
 }
+
+// GetMaxUnavailable resolves the rolling-update maxUnavailable setting of mi
+// into an absolute number of replicas.
+//
+// When spec.rolloutStrategy, its rollingUpdateConfiguration, or the
+// maxUnavailable field is unset, a default of 1 is used. Percentages are scaled
+// against spec.replicas and rounded down, so the result may be 0 (for example
+// "10%" of 5 replicas, or an explicit 0); callers must handle that case.
+//
+// An error is returned when mi or spec.replicas is nil, or when
+// intstr.GetScaledValueFromIntOrPercent rejects the configured value.
+func GetMaxUnavailable(mi *workloadv1alpha1.ModelServing) (int, error) {
+	// Guard the pointer dereferences below instead of panicking in the reconcile loop.
+	if mi == nil {
+		return 0, fmt.Errorf("modelServing is nil")
+	}
+	if mi.Spec.Replicas == nil {
+		return 0, fmt.Errorf("modelServing %s/%s has no replicas set", mi.Namespace, mi.Name)
+	}
+
+	maxUnavailable := intstr.FromInt32(1) // default when nothing is configured
+	if strategy := mi.Spec.RolloutStrategy; strategy != nil && strategy.RollingUpdateConfiguration != nil {
+		if configured := strategy.RollingUpdateConfiguration.MaxUnavailable; configured != nil {
+			maxUnavailable = *configured
+		}
+	}
+
+	// roundUp=false: percentages are rounded down, as stated in the API documentation.
+	return intstr.GetScaledValueFromIntOrPercent(&maxUnavailable, int(*mi.Spec.Replicas), false)
+}
@@ -21,6 +21,8 @@ import (
 
 	"github.com/stretchr/testify/assert"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/utils/ptr"
 
 	workloadv1alpha1 "github.com/volcano-sh/kthena/pkg/apis/workload/v1alpha1"
 )
@@ -97,3 +99,145 @@ func TestSetCondition(t *testing.T) {
 		assert.Contains(t, cond.Message, SomeGroupsAreProgressing)
 	})
 }
+
+// TestGetMaxUnavailable checks that GetMaxUnavailable falls back to 1 when no
+// value is configured, and otherwise resolves integer and percentage settings
+// against spec.replicas, rounding percentages down.
+func TestGetMaxUnavailable(t *testing.T) {
+	// newServing builds a ModelServing with the given replica count and a
+	// (possibly nil) rollout strategy.
+	newServing := func(replicas int32, strategy *workloadv1alpha1.RolloutStrategy) *workloadv1alpha1.ModelServing {
+		return &workloadv1alpha1.ModelServing{
+			Spec: workloadv1alpha1.ModelServingSpec{
+				Replicas:        ptr.To[int32](replicas),
+				RolloutStrategy: strategy,
+			},
+		}
+	}
+	// rollingUpdate builds a rolling-update strategy carrying an explicit maxUnavailable.
+	rollingUpdate := func(limit intstr.IntOrString) *workloadv1alpha1.RolloutStrategy {
+		return &workloadv1alpha1.RolloutStrategy{
+			Type: "ServingGroupRollingUpdate",
+			RollingUpdateConfiguration: &workloadv1alpha1.RollingUpdateConfiguration{
+				MaxUnavailable: ptr.To(limit),
+			},
+		}
+	}
+
+	cases := []struct {
+		name    string
+		serving *workloadv1alpha1.ModelServing
+		want    int
+		wantErr bool
+	}{
+		{
+			// No strategy at all: the default of 1 applies.
+			name:    "Default case - no rollout strategy",
+			serving: newServing(5, nil),
+			want:    1,
+		},
+		{
+			// Strategy present but without a rolling update configuration: default of 1.
+			name: "Default case - rollout strategy but no rolling update config",
+			serving: newServing(10, &workloadv1alpha1.RolloutStrategy{
+				Type: "ServingGroupRollingUpdate",
+			}),
+			want: 1,
+		},
+		{
+			name:    "MaxUnavailable as integer - value 2",
+			serving: newServing(10, rollingUpdate(intstr.FromInt(2))),
+			want:    2,
+		},
+		{
+			name:    "MaxUnavailable as integer - value 0",
+			serving: newServing(5, rollingUpdate(intstr.FromInt(0))),
+			want:    0,
+		},
+		{
+			// 20% of 10 is 2.
+			name:    "MaxUnavailable as percentage - 20%",
+			serving: newServing(10, rollingUpdate(intstr.FromString("20%"))),
+			want:    2,
+		},
+		{
+			// 50% of 9 is 4.5, rounded down to 4.
+			name:    "MaxUnavailable as percentage - 50%",
+			serving: newServing(9, rollingUpdate(intstr.FromString("50%"))),
+			want:    4,
+		},
+		{
+			// 100% of 3 is 3.
+			name:    "MaxUnavailable as percentage - 100%",
+			serving: newServing(3, rollingUpdate(intstr.FromString("100%"))),
+			want:    3,
+		},
+		{
+			// 0% of 10 is 0.
+			name:    "MaxUnavailable as percentage - 0%",
+			serving: newServing(10, rollingUpdate(intstr.FromString("0%"))),
+			want:    0,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := GetMaxUnavailable(tc.serving)
+			if tc.wantErr {
+				assert.Error(t, err)
+				return
+			}
+			assert.NoError(t, err)
+			assert.Equal(t, tc.want, got)
+		})
+	}
+}