@@ -21,9 +21,11 @@ package utils
2121import (
2222 "context"
2323 "fmt"
24+ "testing"
2425 "time"
2526
2627 kaischedulingv2alpha2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2alpha2"
28+ nameutils "github.com/ai-dynamo/grove/operator/api/common"
2729 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2830 "k8s.io/client-go/dynamic"
2931 "k8s.io/utils/ptr"
@@ -38,10 +40,30 @@ type ExpectedSubGroup struct {
3840 PreferredTopologyLevel string
3941}
4042
// PCSGCliqueConfig describes one PodClique that belongs to a PodCliqueScalingGroup (PCSG)
// and is expected to appear as a SubGroup when a scaled PCSG replica is verified.
type PCSGCliqueConfig struct {
	// Name is the clique name passed on when the expected SubGroup name is generated.
	Name string
	// PodCount is the MinMember value expected on the clique's SubGroup.
	PodCount int32
	// Constraint is the required topology level expected on the clique's SubGroup.
	Constraint string
}
49+
50+ // ScaledPCSGConfig defines configuration for verifying a scaled PCSG replica
51+ type ScaledPCSGConfig struct {
52+ Name string
53+ PCSGName string
54+ PCSGReplica int
55+ MinAvailable int
56+ CliqueConfigs []PCSGCliqueConfig
57+ Constraint string
58+ }
59+
4160// CreateExpectedStandalonePCLQSubGroup creates an ExpectedSubGroup for a standalone PodClique (not in PCSG)
4261// Name format: <pcs-name>-<pcs-replica>-<clique-name>
4362func CreateExpectedStandalonePCLQSubGroup (pcsName string , pcsReplica int , cliqueName string , minMember int32 , topologyLevel string ) ExpectedSubGroup {
44- name := GetStandalonePCLQSubGroupName (pcsName , pcsReplica , cliqueName )
63+ name := nameutils .GeneratePodCliqueName (
64+ nameutils.ResourceNameReplica {Name : pcsName , Replica : pcsReplica },
65+ cliqueName ,
66+ )
4567 return ExpectedSubGroup {
4668 Name : name ,
4769 MinMember : minMember ,
@@ -53,7 +75,11 @@ func CreateExpectedStandalonePCLQSubGroup(pcsName string, pcsReplica int, clique
5375// CreateExpectedPCSGParentSubGroup creates an ExpectedSubGroup for a PCSG parent (scaling group replica)
5476// Name format: <pcs-name>-<pcs-replica>-<sg-name>-<sg-replica>
5577func CreateExpectedPCSGParentSubGroup (pcsName string , pcsReplica int , sgName string , sgReplica int , topologyLevel string ) ExpectedSubGroup {
56- name := GetPCSGParentSubGroupName (pcsName , pcsReplica , sgName , sgReplica )
78+ pcsgFQN := nameutils .GeneratePodCliqueScalingGroupName (
79+ nameutils.ResourceNameReplica {Name : pcsName , Replica : pcsReplica },
80+ sgName ,
81+ )
82+ name := fmt .Sprintf ("%s-%d" , pcsgFQN , sgReplica )
5783 return ExpectedSubGroup {
5884 Name : name ,
5985 MinMember : 0 ,
@@ -62,19 +88,41 @@ func CreateExpectedPCSGParentSubGroup(pcsName string, pcsReplica int, sgName str
6288 }
6389}
6490
65- // CreateExpectedPCLQInPCSGSubGroup creates an ExpectedSubGroup for a PodClique within a PCSG
91+ // CreateExpectedPCLQInPCSGSubGroup creates an ExpectedSubGroup for a PodClique within a PCSG with parent
6692// Name format: <pcs-name>-<pcs-replica>-<sg-name>-<sg-replica>-<clique-name>
6793func CreateExpectedPCLQInPCSGSubGroup (pcsName string , pcsReplica int , sgName string , sgReplica int , cliqueName string , minMember int32 , topologyLevel string ) ExpectedSubGroup {
68- name := GetPCLQInPCSGSubGroupName (pcsName , pcsReplica , sgName , sgReplica , cliqueName )
69- parentName := GetPCSGParentSubGroupName (pcsName , pcsReplica , sgName , sgReplica )
94+ return createExpectedPCLQInPCSGSubGroup (pcsName , pcsReplica , sgName , sgReplica , cliqueName , minMember , topologyLevel , true )
95+ }
96+
97+ func createExpectedPCLQInPCSGSubGroup (pcsName string , pcsReplica int , sgName string , sgReplica int , cliqueName string ,
98+ minMember int32 , topologyLevel string , hasParent bool ) ExpectedSubGroup {
99+ pcsgFQN := nameutils .GeneratePodCliqueScalingGroupName (
100+ nameutils.ResourceNameReplica {Name : pcsName , Replica : pcsReplica },
101+ sgName ,
102+ )
103+ name := nameutils .GeneratePodCliqueName (
104+ nameutils.ResourceNameReplica {Name : pcsgFQN , Replica : sgReplica },
105+ cliqueName ,
106+ )
107+ var parentPtr * string
108+ if hasParent {
109+ parentPtr = ptr .To (fmt .Sprintf ("%s-%d" , pcsgFQN , sgReplica ))
110+ }
70111 return ExpectedSubGroup {
71112 Name : name ,
72113 MinMember : minMember ,
73- Parent : ptr . To ( parentName ) ,
114+ Parent : parentPtr ,
74115 RequiredTopologyLevel : topologyLevel ,
75116 }
76117}
77118
119+ // CreateExpectedPCLQInPCSGSubGroupNoParent creates an ExpectedSubGroup for a PodClique within a PCSG without parent
120+ // Used when PCSG has no topology constraint (no parent SubGroup created)
121+ // Name format: <pcs-name>-<pcs-replica>-<sg-name>-<sg-replica>-<clique-name>
122+ func CreateExpectedPCLQInPCSGSubGroupNoParent (pcsName string , pcsReplica int , sgName string , sgReplica int , cliqueName string , minMember int32 , topologyLevel string ) ExpectedSubGroup {
123+ return createExpectedPCLQInPCSGSubGroup (pcsName , pcsReplica , sgName , sgReplica , cliqueName , minMember , topologyLevel , false )
124+ }
125+
78126// GetKAIPodGroupsForPCS retrieves all KAI PodGroups for a given PodCliqueSet by label selector
79127// KAI scheduler creates PodGroups with label: app.kubernetes.io/part-of=<pcs-name>
80128// Returns a list of PodGroups that tests can filter by owner reference if needed
@@ -233,7 +281,7 @@ func GetPodGroupForBasePodGangReplica(
233281 return nil , fmt .Errorf ("failed to get KAI PodGroups: %w" , err )
234282 }
235283
236- basePodGangName := GetBasePodGangName ( workloadName , pgsReplica )
284+ basePodGangName := nameutils . GenerateBasePodGangName (nameutils. ResourceNameReplica { Name : workloadName , Replica : pgsReplica } )
237285 basePodGroup , err := FilterPodGroupByOwner (podGroups , basePodGangName )
238286 if err != nil {
239287 return nil , fmt .Errorf ("failed to find PodGroup for PodGang %s: %w" , basePodGangName , err )
@@ -259,3 +307,51 @@ func VerifyPodGroupTopology(
259307
260308 return nil
261309}
310+
311+ // VerifyScaledPCSGReplicaTopology verifies KAI PodGroup for ONE scaled PCSG replica.
312+ // Scaled PodGroup top-level constraint: uses pcsConstraint ONLY if PCSG has NO constraint.
313+ func VerifyScaledPCSGReplicaTopology (
314+ ctx context.Context ,
315+ t * testing.T ,
316+ dynamicClient dynamic.Interface ,
317+ namespace string ,
318+ pcsName string ,
319+ pcsReplica int ,
320+ pcsgConfig ScaledPCSGConfig ,
321+ pcsConstraint string ,
322+ logger * Logger ,
323+ ) {
324+ podGroups , err := GetKAIPodGroupsForPCS (ctx , dynamicClient , namespace , pcsName )
325+ if err != nil {
326+ t .Fatalf ("Failed to get KAI PodGroups: %v" , err )
327+ }
328+
329+ pcsgFQN := nameutils .GeneratePodCliqueScalingGroupName (
330+ nameutils.ResourceNameReplica {Name : pcsName , Replica : pcsReplica },
331+ pcsgConfig .PCSGName ,
332+ )
333+
334+ scaledPodGangName := nameutils .CreatePodGangNameFromPCSGFQN (pcsgFQN , pcsgConfig .PCSGReplica - pcsgConfig .MinAvailable )
335+
336+ scaledPodGroup , err := FilterPodGroupByOwner (podGroups , scaledPodGangName )
337+ if err != nil {
338+ t .Fatalf ("Failed to find scaled PodGroup for %s: %v" , scaledPodGangName , err )
339+ }
340+
341+ var expectedSubGroups []ExpectedSubGroup
342+
343+ for _ , cliqueConfig := range pcsgConfig .CliqueConfigs {
344+ expectedSubGroups = append (expectedSubGroups ,
345+ CreateExpectedPCLQInPCSGSubGroupNoParent (pcsName , pcsReplica , pcsgConfig .PCSGName , pcsgConfig .PCSGReplica , cliqueConfig .Name , cliqueConfig .PodCount , cliqueConfig .Constraint ))
346+ }
347+
348+ scaledTopConstraint := pcsConstraint
349+ if pcsgConfig .Constraint != "" {
350+ scaledTopConstraint = pcsgConfig .Constraint
351+ }
352+
353+ if err := VerifyPodGroupTopology (scaledPodGroup , scaledTopConstraint , "" , expectedSubGroups , logger ); err != nil {
354+ t .Fatalf ("Failed to verify scaled PodGroup %s (%s replica %d) topology: %v" ,
355+ scaledPodGangName , pcsgConfig .Name , pcsgConfig .PCSGReplica , err )
356+ }
357+ }
0 commit comments