Skip to content

Commit 619fdf4

Browse files
committed
feat: auto-detect super-slicing via gcloud and omit standard node selectors
1 parent d791719 commit 619fdf4

File tree

2 files changed

+136
-3
lines changed

2 files changed

+136
-3
lines changed

pkg/orchestrator/gke/gke_orchestrator.go

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ type ManifestOptions struct {
148148
VolumesYAML string
149149
VolumeMountsYAML string
150150
GCSFuseEnabled bool
151+
IsSuperSlicing bool // True if the cluster supports and is configured for Super-slicing.
151152
Pathways orchestrator.PathwaysJobDefinition
152153
}
153154

@@ -1211,6 +1212,46 @@ func (g *GKEOrchestrator) FetchMachineCapacity(machineType, zone string) (int, e
12111212
return cap.Accelerators[0].Count, nil
12121213
}
12131214

1215+
func (g *GKEOrchestrator) verifySuperSlicingActive(opts ManifestOptions) (bool, error) {
1216+
// 1. TPU Focus: Return false immediately if not using TPUs.
1217+
if opts.AcceleratorType == "" || !strings.Contains(strings.ToLower(opts.AcceleratorType), "tpu") {
1218+
return false, nil
1219+
}
1220+
1221+
// 2. Machine Type/Profile Guard: Describe node pool to see if it uses PROVISION_ONLY!
1222+
// We check for placeholder variables for node pool name. In real usage, this should be resolved from the cluster.
1223+
poolName := os.Getenv("GKE_NODE_POOL_NAME")
1224+
if poolName == "" {
1225+
logging.Warn("GKE_NODE_POOL_NAME is not set. Assuming Super-slicing is not active for this node pool.")
1226+
return false, nil
1227+
}
1228+
1229+
result := g.executor.ExecuteCommand("gcloud", "container", "node-pools", "describe", poolName, "--cluster="+opts.ClusterName, "--zone="+opts.ClusterLocation, "--format=json(placementPolicy)")
1230+
if result.ExitCode != 0 {
1231+
logging.Warn("gcloud container node-pools describe failed: %s. Proceeding assuming no Super-slicing.", result.Stderr)
1232+
return false, nil
1233+
}
1234+
1235+
var policy map[string]interface{}
1236+
if err := json.Unmarshal([]byte(result.Stdout), &policy); err == nil {
1237+
if placement, ok := policy["placementPolicy"].(map[string]interface{}); ok {
1238+
if mode, ok := placement["acceleratorTopologyMode"].(string); ok && mode == "PROVISION_ONLY" {
1239+
logging.Info("Super-slicing PROVISION_ONLY mode detected for node pool %s.", poolName)
1240+
return true, nil
1241+
}
1242+
}
1243+
}
1244+
1245+
// 3. Kueue CRD Checks: Check for topologies.kueue.x-k8s.io and AdmissionChecks (simulated via shell commands)
1246+
crdResult := g.executor.ExecuteCommand("kubectl", "get", "crd", "topologies.kueue.x-k8s.io")
1247+
if crdResult.ExitCode != 0 {
1248+
logging.Warn("Topology CRD not found. Kueue Super-slicing not active.")
1249+
return false, nil
1250+
}
1251+
1252+
return true, nil
1253+
}
1254+
12141255
func (g *GKEOrchestrator) calculateResourceLimits(opts ManifestOptions) (cpu, mem, gpu, tpu string) {
12151256
mapped := g.GenerateGKENodeSelectorLabel(opts.AcceleratorType)
12161257

@@ -1323,7 +1364,14 @@ func (g *GKEOrchestrator) prepareManifestOptions(job orchestrator.JobDefinition,
13231364
}
13241365
schedOpts.Topology = topology
13251366

1326-
nodeSelectorStr, err := g.buildNodeSelector(schedOpts, job)
1367+
// Call verifySuperSlicingActive to determine if super-slicing is active!
1368+
isSuperSlicing, _ := g.verifySuperSlicingActive(ManifestOptions{
1369+
ClusterName: job.ClusterName,
1370+
ClusterLocation: job.ClusterLocation,
1371+
AcceleratorType: job.AcceleratorType,
1372+
})
1373+
1374+
nodeSelectorStr, err := g.buildNodeSelector(schedOpts, job, isSuperSlicing)
13271375
if err != nil {
13281376
return ManifestOptions{}, err
13291377
}
@@ -1694,7 +1742,7 @@ func (g *GKEOrchestrator) waitForJobCompletion(workloadName, clusterName, cluste
16941742
return nil
16951743
}
16961744

1697-
func (g *GKEOrchestrator) buildNodeSelector(schedOpts scheduling.SchedulingOptions, job orchestrator.JobDefinition) (string, error) {
1745+
func (g *GKEOrchestrator) buildNodeSelector(schedOpts scheduling.SchedulingOptions, job orchestrator.JobDefinition, isSuperSlicing bool) (string, error) {
16981746
nodeSelector := scheduling.GetNodeSelector(schedOpts)
16991747
accelLabel := g.GenerateGKENodeSelectorLabel(job.AcceleratorType)
17001748

@@ -1718,7 +1766,9 @@ func (g *GKEOrchestrator) buildNodeSelector(schedOpts scheduling.SchedulingOptio
17181766
if nodeSelector == nil {
17191767
nodeSelector = make(map[string]string)
17201768
}
1721-
nodeSelector["cloud.google.com/gke-tpu-topology"] = schedOpts.Topology
1769+
if !isSuperSlicing {
1770+
nodeSelector["cloud.google.com/gke-tpu-topology"] = schedOpts.Topology
1771+
}
17221772
}
17231773

17241774
if len(nodeSelector) > 0 {

pkg/orchestrator/gke/gke_orchestrator_test.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,4 +485,87 @@ func TestFetchMachineCapacity(t *testing.T) {
485485
})
486486
}
487487
}
488+
func TestVerifySuperSlicingActive(t *testing.T) {
489+
tests := []struct {
490+
name string
491+
opts ManifestOptions
492+
mockResponses map[string][]shell.CommandResult
493+
envVars map[string]string
494+
wantResult bool
495+
}{
496+
{
497+
name: "Success - Super-slicing active",
498+
opts: ManifestOptions{
499+
ClusterName: "test-cluster",
500+
ClusterLocation: "us-central1-a",
501+
AcceleratorType: "tpu-v6e-slice",
502+
},
503+
envVars: map[string]string{"GKE_NODE_POOL_NAME": "test-pool"},
504+
mockResponses: map[string][]shell.CommandResult{
505+
"gcloud container node-pools describe test-pool --cluster=test-cluster --zone=us-central1-a --format=json(placementPolicy)": {
506+
{ExitCode: 0, Stdout: `{"placementPolicy": {"acceleratorTopologyMode": "PROVISION_ONLY"}}`},
507+
},
508+
"kubectl get crd topologies.kueue.x-k8s.io": {
509+
{ExitCode: 0},
510+
},
511+
},
512+
wantResult: true,
513+
},
514+
{
515+
name: "Failure - No TPU",
516+
opts: ManifestOptions{
517+
ClusterName: "test-cluster",
518+
ClusterLocation: "us-central1-a",
519+
AcceleratorType: "nvidia-l4",
520+
},
521+
envVars: nil,
522+
mockResponses: nil,
523+
wantResult: false,
524+
},
525+
{
526+
name: "Failure - No Node Pool set",
527+
opts: ManifestOptions{
528+
ClusterName: "test-cluster",
529+
ClusterLocation: "us-central1-a",
530+
AcceleratorType: "tpu-v6e-slice",
531+
},
532+
envVars: nil,
533+
mockResponses: nil,
534+
wantResult: false,
535+
},
536+
{
537+
name: "Failure - gcloud fails",
538+
opts: ManifestOptions{
539+
ClusterName: "test-cluster",
540+
ClusterLocation: "us-central1-a",
541+
AcceleratorType: "tpu-v6e-slice",
542+
},
543+
envVars: map[string]string{"GKE_NODE_POOL_NAME": "test-pool"},
544+
mockResponses: map[string][]shell.CommandResult{
545+
"gcloud container node-pools describe test-pool --cluster=test-cluster --zone=us-central1-a --format=json(placementPolicy)": {
546+
{ExitCode: 1, Stderr: "slow network"},
547+
},
548+
},
549+
wantResult: false,
550+
},
551+
}
488552

553+
for _, tt := range tests {
554+
t.Run(tt.name, func(t *testing.T) {
555+
for k, v := range tt.envVars {
556+
t.Setenv(k, v)
557+
}
558+
mockExecutor := NewMockExecutor(tt.mockResponses)
559+
orc := &GKEOrchestrator{executor: mockExecutor}
560+
561+
got, err := orc.verifySuperSlicingActive(tt.opts)
562+
563+
if err != nil {
564+
t.Errorf("Unexpected error: %v", err)
565+
}
566+
if got != tt.wantResult {
567+
t.Errorf("Expected %t, got %t", tt.wantResult, got)
568+
}
569+
})
570+
}
571+
}

0 commit comments

Comments
 (0)