@@ -148,6 +148,7 @@ type ManifestOptions struct {
148148 VolumesYAML string
149149 VolumeMountsYAML string
150150 GCSFuseEnabled bool
151+ IsSuperSlicing bool // True if the cluster supports and is configured for Super-slicing.
151152 Pathways orchestrator.PathwaysJobDefinition
152153}
153154
@@ -1211,6 +1212,46 @@ func (g *GKEOrchestrator) FetchMachineCapacity(machineType, zone string) (int, e
12111212 return cap .Accelerators [0 ].Count , nil
12121213}
12131214
1215+ func (g * GKEOrchestrator ) verifySuperSlicingActive (opts ManifestOptions ) (bool , error ) {
1216+ // 1. TPU Focus: Return false immediately if not using TPUs.
1217+ if opts .AcceleratorType == "" || ! strings .Contains (strings .ToLower (opts .AcceleratorType ), "tpu" ) {
1218+ return false , nil
1219+ }
1220+
1221+ // 2. Machine Type/Profile Guard: Describe node pool to see if it uses PROVISION_ONLY!
1222+ // We check for placeholder variables for node pool name. In real usage, this should be resolved from the cluster.
1223+ poolName := os .Getenv ("GKE_NODE_POOL_NAME" )
1224+ if poolName == "" {
1225+ logging .Warn ("GKE_NODE_POOL_NAME is not set. Assuming Super-slicing is not active for this node pool." )
1226+ return false , nil
1227+ }
1228+
1229+ result := g .executor .ExecuteCommand ("gcloud" , "container" , "node-pools" , "describe" , poolName , "--cluster=" + opts .ClusterName , "--zone=" + opts .ClusterLocation , "--format=json(placementPolicy)" )
1230+ if result .ExitCode != 0 {
1231+ logging .Warn ("gcloud container node-pools describe failed: %s. Proceeding assuming no Super-slicing." , result .Stderr )
1232+ return false , nil
1233+ }
1234+
1235+ var policy map [string ]interface {}
1236+ if err := json .Unmarshal ([]byte (result .Stdout ), & policy ); err == nil {
1237+ if placement , ok := policy ["placementPolicy" ].(map [string ]interface {}); ok {
1238+ if mode , ok := placement ["acceleratorTopologyMode" ].(string ); ok && mode == "PROVISION_ONLY" {
1239+ logging .Info ("Super-slicing PROVISION_ONLY mode detected for node pool %s." , poolName )
1240+ return true , nil
1241+ }
1242+ }
1243+ }
1244+
1245+ // 3. Kueue CRD Checks: Check for topologies.kueue.x-k8s.io and AdmissionChecks (simulated via shell commands)
1246+ crdResult := g .executor .ExecuteCommand ("kubectl" , "get" , "crd" , "topologies.kueue.x-k8s.io" )
1247+ if crdResult .ExitCode != 0 {
1248+ logging .Warn ("Topology CRD not found. Kueue Super-slicing not active." )
1249+ return false , nil
1250+ }
1251+
1252+ return true , nil
1253+ }
1254+
12141255func (g * GKEOrchestrator ) calculateResourceLimits (opts ManifestOptions ) (cpu , mem , gpu , tpu string ) {
12151256 mapped := g .GenerateGKENodeSelectorLabel (opts .AcceleratorType )
12161257
@@ -1323,7 +1364,14 @@ func (g *GKEOrchestrator) prepareManifestOptions(job orchestrator.JobDefinition,
13231364 }
13241365 schedOpts .Topology = topology
13251366
1326- nodeSelectorStr , err := g .buildNodeSelector (schedOpts , job )
1367+ // Call verifySuperSlicingActive to determine if super-slicing is active!
1368+ isSuperSlicing , _ := g .verifySuperSlicingActive (ManifestOptions {
1369+ ClusterName : job .ClusterName ,
1370+ ClusterLocation : job .ClusterLocation ,
1371+ AcceleratorType : job .AcceleratorType ,
1372+ })
1373+
1374+ nodeSelectorStr , err := g .buildNodeSelector (schedOpts , job , isSuperSlicing )
13271375 if err != nil {
13281376 return ManifestOptions {}, err
13291377 }
@@ -1694,7 +1742,7 @@ func (g *GKEOrchestrator) waitForJobCompletion(workloadName, clusterName, cluste
16941742 return nil
16951743}
16961744
1697- func (g * GKEOrchestrator ) buildNodeSelector (schedOpts scheduling.SchedulingOptions , job orchestrator.JobDefinition ) (string , error ) {
1745+ func (g * GKEOrchestrator ) buildNodeSelector (schedOpts scheduling.SchedulingOptions , job orchestrator.JobDefinition , isSuperSlicing bool ) (string , error ) {
16981746 nodeSelector := scheduling .GetNodeSelector (schedOpts )
16991747 accelLabel := g .GenerateGKENodeSelectorLabel (job .AcceleratorType )
17001748
@@ -1718,7 +1766,9 @@ func (g *GKEOrchestrator) buildNodeSelector(schedOpts scheduling.SchedulingOptio
17181766 if nodeSelector == nil {
17191767 nodeSelector = make (map [string ]string )
17201768 }
1721- nodeSelector ["cloud.google.com/gke-tpu-topology" ] = schedOpts .Topology
1769+ if ! isSuperSlicing {
1770+ nodeSelector ["cloud.google.com/gke-tpu-topology" ] = schedOpts .Topology
1771+ }
17221772 }
17231773
17241774 if len (nodeSelector ) > 0 {
0 commit comments