Skip to content

Commit c113d77

Browse files
authored
fix: solve preemption issue (#555)
1 parent c7d99dc commit c113d77

File tree

9 files changed

+47
-43
lines changed

9 files changed

+47
-43
lines changed

charts/tensor-fusion/values.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,6 @@ schedulerConfig:
201201
maxWorkerPerNode: 256
202202
vramWeight: 0.7
203203
tflopsWeight: 0.3
204-
# preemptClusterWide: true = preempt pods cluster-wide; false = only accept victims in preemptor's namespace
205-
preemptClusterWide: true
206204
- name: GPUNetworkTopologyAware
207205
args:
208206
# Avoid the remote TFWorker RX/TX to avoid single node consume too much bandwidth
@@ -238,7 +236,7 @@ dynamicConfig:
238236
# extra pod labels to be added to metrics,
239237
# you can map label keys to other measure tags
240238
metricsExtraPodLabels: {}
241-
239+
preemptClusterWideFromEnv: true
242240
# alert rules
243241
alertRules:
244242
# Worker TFlops throttled alert

cmd/main.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"crypto/tls"
2222
"flag"
2323
"os"
24+
"strconv"
2425
"time"
2526

2627
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
@@ -597,7 +598,14 @@ func setupTimeSeriesAndWatchGlobalConfigChanges(ctx context.Context, mgr manager
597598
os.Exit(1)
598599
}
599600
config.SetGlobalConfig(globalConfig)
600-
601+
// set env so scheduler default preemption reads effective preempt cluster-wide
602+
preemptVal := strconv.FormatBool(globalConfig.PreemptClusterWideFromEnv)
603+
if err := os.Setenv(constants.SchedulerPreemptClusterWideEnv, preemptVal); err != nil {
604+
setupLog.Error(err, "failed to set preempt cluster-wide env")
605+
}
606+
// get env PreemptClusterWideFromEnv
607+
preemptClusterWideFromEnv := os.Getenv(constants.SchedulerPreemptClusterWideEnv)
608+
setupLog.Info("preempt cluster-wide from env", "preemptClusterWideFromEnv", preemptClusterWideFromEnv)
601609
// only init TSDB and evaluator in leader
602610
needTSDB := enableAlert || enableAutoScale
603611

@@ -636,8 +644,16 @@ func watchAndHandleConfigChanges(ctx context.Context, mgr manager.Manager, needT
636644
"configPath", dynamicConfigPath)
637645
continue
638646
}
647+
// check if preemptClusterWideFromEnv is changed
648+
if globalConfig.PreemptClusterWideFromEnv != config.GetGlobalConfig().PreemptClusterWideFromEnv {
649+
preemptVal := strconv.FormatBool(globalConfig.PreemptClusterWideFromEnv)
650+
if err := os.Setenv(constants.SchedulerPreemptClusterWideEnv, preemptVal); err != nil {
651+
ctrl.Log.Error(err, "failed to set preempt cluster-wide env")
652+
}
653+
preemptClusterWideFromEnv := os.Getenv(constants.SchedulerPreemptClusterWideEnv)
654+
ctrl.Log.Info("preempt cluster-wide from env", "preemptClusterWideFromEnv", preemptClusterWideFromEnv)
655+
}
639656
config.SetGlobalConfig(globalConfig)
640-
641657
// handle alert rules update
642658
go func() {
643659
<-alertEvaluatorReady

config/samples/dynamic-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ metricsTTL: 30d
44
metricsFormat: influx
55

66
autoScalingInterval: 10s
7-
7+
preemptClusterWideFromEnv: true
88
alertRules:
99
# Worker TFlops throttled alert
1010
- name: WorkerTFlopsThrottled

config/samples/scheduler-config.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@ profiles:
3333
maxWorkerPerNode: 256
3434
vramWeight: 0.7
3535
tflopsWeight: 0.3
36-
# preemptClusterWide: true = preempt pods cluster-wide; false = only accept victims in preemptor's namespace
37-
preemptClusterWide: true
3836
- name: GPUNetworkTopologyAware
3937
args:
4038
# Avoid the remote TFWorker RX/TX to avoid single node consume too much bandwidth

internal/config/global_config.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@ import (
77
)
88

99
type GlobalConfig struct {
10-
MetricsTTL string `yaml:"metricsTTL"`
11-
MetricsFormat string `yaml:"metricsFormat"`
12-
MetricsExtraPodLabels map[string]string `yaml:"metricsExtraPodLabels"`
13-
14-
AlertRules []AlertRule `yaml:"alertRules"`
15-
AutoMigration *AutoMigrationConfig `yaml:"autoMigration"`
16-
17-
AutoScalingInterval string `yaml:"autoScalingInterval"`
18-
GPUOperatorNamespace string `yaml:"gpuOperatorNamespace"`
10+
MetricsTTL string `yaml:"metricsTTL"`
11+
MetricsFormat string `yaml:"metricsFormat"`
12+
MetricsExtraPodLabels map[string]string `yaml:"metricsExtraPodLabels"`
13+
AlertRules []AlertRule `yaml:"alertRules"`
14+
AutoMigration *AutoMigrationConfig `yaml:"autoMigration"`
15+
16+
AutoScalingInterval string `yaml:"autoScalingInterval"`
17+
GPUOperatorNamespace string `yaml:"gpuOperatorNamespace"`
18+
PreemptClusterWideFromEnv bool `yaml:"preemptClusterWideFromEnv"`
1919
}
2020

2121
type AutoMigrationConfig struct {

internal/config/scheduler_config.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,6 @@ type GPUFitConfig struct {
55

66
VramWeight float64 `json:"vramWeight"`
77
TflopsWeight float64 `json:"tflopsWeight"`
8-
9-
// PreemptClusterWide when true or unset allows preempting pods cluster-wide; when false only victims
10-
// in the preemptor pod's namespace are accepted (validated in Filter phase after DefaultPreemption selects victims).
11-
PreemptClusterWide *bool `json:"preemptClusterWide,omitempty"`
128
}
139

1410
type GPUNetworkTopologyAwareConfig struct {

internal/scheduler/gpuresources/gpuresources.go

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,30 +1062,13 @@ func (s *GPUFit) validatePreemption(state fwk.CycleState, pod *v1.Pod, nodeInfo
10621062
return fwk.NewStatus(fwk.Error, "invalid type for preempt pods")
10631063
}
10641064

1065-
// When PreemptClusterWide is explicitly false, only use victims in the preemptor's namespace for subsequent checks.
1066-
victimsToUse := victims
1067-
if s.cfg != nil && s.cfg.PreemptClusterWide != nil && !*s.cfg.PreemptClusterWide {
1068-
victimsToUse = sets.New[types.NamespacedName]()
1069-
for v := range victims {
1070-
if v.Namespace == pod.Namespace {
1071-
victimsToUse.Insert(v)
1072-
}
1073-
}
1074-
if victimsToUse.Len() == 0 {
1075-
s.logger.Info("GPU preemption: no same-namespace victims on node",
1076-
"pod", klog.KObj(pod),
1077-
"node", nodeName)
1078-
return fwk.NewStatus(fwk.Unschedulable, "no victims in preemptor namespace on this node")
1079-
}
1080-
}
1081-
10821065
victimList := victims.UnsortedList()
10831066
// Verify that preemption will release sufficient GPU resources
10841067
// This calls CheckQuotaAndFilterSingleNodePreempt which:
10851068
// - Simulates releasing victim GPU resources
10861069
// - Applies GPU filters (resource, model, vendor, affinity, same-node)
10871070
// - Checks quota constraints
1088-
err = s.allocator.CheckQuotaAndFilterSingleNodePreempt(nodeName, allocReq, victimsToUse)
1071+
err = s.allocator.CheckQuotaAndFilterSingleNodePreempt(nodeName, allocReq, victims)
10891072
if err != nil {
10901073
s.logger.Info("GPU preemption validation failed",
10911074
"pod", pod.Name,

patches/scheduler-pdb-1.patch

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,23 @@
11
--- ../vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go 2026-01-26 00:00:00
22
+++ ../vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go 2026-01-26 00:00:00
3-
@@ -20,6 +20,7 @@
4-
"context"
5-
"fmt"
3+
@@ -22,4 +22,7 @@
64
"math/rand"
75
"sort"
86

7+
+ "os"
8+
+ "strings"
99
+ "time"
1010
v1 "k8s.io/api/core/v1"
11-
@@ -108,3 +109,17 @@
11+
@@ -108,6 +111,26 @@
12+
// Default behavior: No additional filtering, beyond the internal requirement that the victim pod
13+
// have lower priority than the preemptor pod.
1214
pl.IsEligiblePod = func(nodeInfo fwk.NodeInfo, victim fwk.PodInfo, preemptor *v1.Pod) bool {
15+
+ // Cluster-wide when env is unset or "true"; "false" = same-tenant (same namespace) only.
16+
+ v := strings.ToLower(strings.TrimSpace(os.Getenv("TF_SCHEDULER_PREEMPT_CLUSTER_WIDE")))
17+
+ preemptClusterWide := v == "" || v == "true"
18+
+ if !preemptClusterWide && victim.GetPod().Namespace != preemptor.Namespace {
19+
+ return false
20+
+ }
1321
+ victimAnnotation := victim.GetPod().Annotations
1422
+ if victimAnnotation == nil {
1523
+ return true
@@ -26,3 +34,4 @@
2634
+ }
2735
return true
2836
}
37+

pkg/constants/env.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ package constants
33
// Controller itself envs
44
const NamespaceEnv = "OPERATOR_NAMESPACE"
55

6+
// Scheduler preemption: when PreemptClusterWideFromEnv is enabled, effective value is read from this env.
7+
// "true" = cluster-wide preemption; "false" = same-tenant (same namespace) only.
8+
const SchedulerPreemptClusterWideEnv = "TF_SCHEDULER_PREEMPT_CLUSTER_WIDE"
9+
610
// System feature toggles
711
const (
812
EnableWebhookEnv = "ENABLE_WEBHOOKS"

0 commit comments

Comments
 (0)