@@ -60,8 +60,8 @@ type WorkloadProfileSpec struct {
6060
6161 // +optional
6262 // +kubebuilder:default=soft
63- // How to isolate computing resources, could be `shared` or `soft` or `hard`
64- ComputeIsolation ComputingIsolationMode `json:"computeIsolation ,omitempty"`
63+ // How to isolate resources, could be `shared` or `soft` or `hard` or `partitioned`
64+ Isolation IsolationModeType `json:"isolation,omitempty"`
6565
6666 // +optional
6767 // GPUModel specifies the required GPU model (e.g., "A100", "H100")
@@ -91,13 +91,30 @@ type WorkloadProfileSpec struct {
9191 WorkerPodTemplate * runtime.RawExtension `json:"workerPodTemplate,omitempty"`
9292}
9393
94- // +kubebuilder:validation:Enum=shared;soft;hard
95- type ComputingIsolationMode string
94+ // +kubebuilder:validation:Enum=shared;soft;hard;partitioned
95+ type IsolationModeType string
9696
9797const (
98- ComputingIsolationModeShared = "shared"
99- ComputingIsolationModeSoft = "soft"
100- ComputingIsolationModeHard = "hard"
98+ // no limits, rely on GPU built-in time-slicing, each process gets equal share of GPU
99+ // Pros: simple and stable, no performance overhead, maximize GPU utilization when well-scheduled
100+ // Cons: can not auto-scale and differentiate QoS levels, TFLOPs limit does not take effect, may cause resource contention
101+ IsolationModeShared = "shared"
102+
103+ // default isolation mode, use Proportional-Integral-Derivative controller to isolate computing resources and assign time slices
104+ // Pros: can set QoS levels for different workloads, TFLOPs limit is relatively accurate
105+ // Cons: ~1% performance overhead, resource contention may occur when burst credits are consumed
106+ IsolationModeSoft = "soft"
107+
108+ // use dedicated SMs to isolate computing resources
109+ // Pros: better performance isolation, no performance overhead, oversubscription is possible
110+ // Cons: can not auto-scale dynamically, percent may not reach 1%/1TFLOPs accuracy, coupled with GPU vendor's SM partitioning implementation
111+ // NOTE: this can only be used in Remote or Local+SidecarWorker mode, not supported in LocalGPU mode (because no TensorFusion Worker)
112+ IsolationModeHard = "hard"
113+
114+ // use GPU driver level partitioning to isolate resources, need hardware support
115+ // Pros: no performance overhead, no resource contention, fully-isolated
116+ // Cons: not supported by all GPUs/XPUs, oversubscription is not possible
117+ IsolationModePartitioned = "partitioned"
101118)
102119
103120func (t WorkloadProfileSpec ) IsDynamicReplica () bool {
0 commit comments