NexusGPU
diff --git a/‎api/v1/schedulingconfigtemplate_types.go‎
Lines changed: 60 additions & 6 deletions b/‎api/v1/schedulingconfigtemplate_types.go‎
Lines changed: 60 additions & 6 deletions
diff --git a/‎api/v1/tensorfusionconnection_types.go‎
Lines changed: 24 additions & 0 deletions b/‎api/v1/tensorfusionconnection_types.go‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎api/v1/tensorfusionworkload_types.go‎
Lines changed: 12 additions & 0 deletions b/‎api/v1/tensorfusionworkload_types.go‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎api/v1/workloadprofile_types.go‎
Lines changed: 1 addition & 1 deletion b/‎api/v1/workloadprofile_types.go‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎api/v1/zz_generated.deepcopy.go‎
Lines changed: 54 additions & 2 deletions b/‎api/v1/zz_generated.deepcopy.go‎
Lines changed: 54 additions & 2 deletions
@@ -86,17 +86,71 @@ type GPUFilter struct {
 }
 
 type AutoScalingConfig struct {
-	// layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly
-	// VPA-like, aggregate metrics data <1m
-	AutoSetLimits AutoSetLimits `json:"autoSetLimits,omitempty"`
+	// Layer 1 adjusting, to match the actual usage in the long run; only for N:M remote vGPU mode.
+	// Adjusts baseline requests to match the actual usage over a longer period, such as 1 day to 2 weeks.
+	AutoSetResources AutoSetResources `json:"autoSetResources,omitempty"`
 
 	// layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit
 	// HPA-like, aggregate metrics data 1m-1h (when tf-worker scaled-up, should also trigger client pod's owner[Deployment etc.]'s replica increasing, check if KNative works)
 	AutoSetReplicas AutoSetReplicas `json:"autoSetReplicas,omitempty"`
 
-	// layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet
-	// Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks
-	AutoSetRequests AutoSetRequests `json:"autoSetRequests,omitempty"`
+	// CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions.
+	CronScalingRules []CronScalingRule `json:"cronScalingRules,omitempty"`
+}
+
+// CronScalingRule defines one rule for scaling resources based on a cron schedule.
+// A rule can be enabled or disabled, names the time window it applies to,
+// and carries the desired resources and replicas for that scheduled period.
+// Every field is tagged omitempty, so unset values are left out of the JSON form.
+type CronScalingRule struct {
+	// Enable specifies whether the cron scaler is enabled.
+	Enable bool `json:"enable,omitempty"`
+	// Name is the identifier for the cron scaler.
+	Name string `json:"name,omitempty"`
+
+	// NOTE(review): Start and End are plain strings here; which cron dialect is
+	// accepted and where it is validated is not visible in this type — confirm.
+
+	// Start is the start time for the scaling schedule, in cron format.
+	Start string `json:"start,omitempty"`
+	// End is the end time for the scaling schedule, in cron format.
+	End string `json:"end,omitempty"`
+	// DesiredResources specifies the target resources to scale to during the schedule.
+	DesiredResources Resources `json:"desiredResources,omitempty"`
+	// DesiredReplicas is the target number of replicas during the schedule.
+	// It is a pointer, so a nil value is omitted from the JSON form.
+	DesiredReplicas *int32 `json:"desiredReplicas,omitempty"`
+}
+
+// AutoSetResources holds the settings for usage-based resource recommendations.
+// Percentiles, fractions and durations are all carried as strings. The defaults
+// quoted in the field comments are documentation only as far as this declaration
+// shows; no default markers are attached to the fields.
+type AutoSetResources struct {
+	// Enable is the on/off switch for this configuration.
+	Enable bool `json:"enable,omitempty"`
+
+	// Target resource to scale, such as "tflops", "vram", or "all" by default
+	TargetResource string `json:"targetResource,omitempty"`
+
+	// NOTE(review): the six percentile fields below use all-lowercase JSON keys,
+	// unlike the camelCase keys of the other fields — confirm this is intended
+	// before relying on either spelling.
+
+	// Tflops usage percentile that will be used as a base for tflops target recommendation. Default: 0.9
+	TargetTflopsPercentile string `json:"targettflopspercentile,omitempty"`
+
+	// Tflops usage percentile that will be used for the lower bound on tflops recommendation. Default: 0.5
+	LowerBoundTflopsPercentile string `json:"lowerboundtflopspercentile,omitempty"`
+
+	// Tflops usage percentile that will be used for the upper bound on tflops recommendation. Default: 0.95
+	UpperBoundTflopsPercentile string `json:"upperboundtflopspercentile,omitempty"`
+
+	// Vram usage percentile that will be used as a base for vram target recommendation. Default: 0.9
+	TargetVramPercentile string `json:"targetvrampercentile,omitempty"`
+
+	// Vram usage percentile that will be used for the lower bound on vram recommendation. Default: 0.5
+	LowerBoundVramPercentile string `json:"lowerboundvrampercentile,omitempty"`
+
+	// Vram usage percentile that will be used for the upper bound on vram recommendation. Default: 0.95
+	UpperBoundVramPercentile string `json:"upperboundvrampercentile,omitempty"`
+
+	// Fraction of usage added as the safety margin to the recommended request. Default: 0.15
+	RequestMarginFraction string `json:"requestMarginFraction,omitempty"`
+
+	// The time interval used for computing the confidence multiplier for the lower and upper bound. Default: 24h
+	ConfidenceInterval string `json:"confidenceInterval,omitempty"`
+
+	// How far back the TSDB has to be queried to get historical metrics. Default: 1d
+	HistoryLength string `json:"historyLength,omitempty"`
+
+	// Resolution at which TSDB is queried for historical metrics. Default: 1m
+	HistoryResolution string `json:"historyResolution,omitempty"`
 }
 
 // A typical autoLimits algorithm could be checking every 5m, look back 1 day data,
 
@@ -21,6 +21,13 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
+// ResourceName is the string name of a resource kind.
+type ResourceName string
+
+// Known resource names; the values match the JSON keys of the Resource struct.
+const (
+	ResourceTflops ResourceName = "tflops"
+	ResourceVram   ResourceName = "vram"
+)
+
 type Resource struct {
 	Tflops resource.Quantity `json:"tflops"`
 	Vram   resource.Quantity `json:"vram"`
@@ -31,6 +38,23 @@ type Resources struct {
 	Limits   Resource `json:"limits"`
 }
 
+// Equal reports whether r and target hold the same requests and limits for
+// both tflops and vram. A nil target is never considered equal.
+func (r Resources) Equal(target *Resources) bool {
+	if target == nil {
+		return false
+	}
+	// Requests are compared first, then limits; the first mismatch ends the check.
+	requestsMatch := r.Requests.Tflops.Equal(target.Requests.Tflops) &&
+		r.Requests.Vram.Equal(target.Requests.Vram)
+	if !requestsMatch {
+		return false
+	}
+	limitsMatch := r.Limits.Tflops.Equal(target.Limits.Tflops) &&
+		r.Limits.Vram.Equal(target.Limits.Vram)
+	return limitsMatch
+}
+
+// IsZero reports whether every quantity in r, the tflops and vram of both
+// requests and limits, is zero.
+func (r Resources) IsZero() bool {
+	// Any non-zero request means the whole value is non-zero.
+	if !r.Requests.Tflops.IsZero() || !r.Requests.Vram.IsZero() {
+		return false
+	}
+	return r.Limits.Tflops.IsZero() && r.Limits.Vram.IsZero()
+}
+
 // TensorFusionConnectionSpec defines the desired state of TensorFusionConnection.
 type TensorFusionConnectionSpec struct {
 	WorkloadName string `json:"workloadName"`
 
@@ -65,6 +65,18 @@ type TensorFusionWorkloadStatus struct {
 
 	// Hash of the pod template used to create worker pods
 	PodTemplateHash string `json:"podTemplateHash,omitempty"`
+
+	// The GPU resources most recently recommended by the autoscaler
+	// +optional
+	Recommendation *Resources `json:"recommendation,omitempty"`
+
+	// The number of replicas currently applied based on the latest recommendation
+	// +optional
+	AppliedRecommendedReplicas int32 `json:"appliedRecommendedReplicas,omitempty"`
+
+	// The currently active cron scaling rule
+	// +optional
+	ActiveCronScalingRule *CronScalingRule `json:"activeCronScalingRule,omitempty"`
 }
 
 // +kubebuilder:object:root=true
 
@@ -62,7 +62,7 @@ type WorkloadProfileSpec struct {
 	// +optional
 	// AutoScalingConfig configured here will override Pool's schedulingConfig
 	// This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation,
-	// user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true'
+	// user can set tensor-fusion.ai/auto-resources|replicas: 'true'
 	AutoScalingConfig AutoScalingConfig `json:"autoScalingConfig,omitempty"`
 
 	// +optional