
Commit 34d129e

test: remaining topology e2e tests (#383)
1 parent dc05397 · commit 34d129e

20 files changed: +1724 −96 lines

operator/e2e/dependencies.yaml

Lines changed: 4 additions & 1 deletion
@@ -31,7 +31,6 @@ images:
     version: v0.13.0-rc1
   - name: ghcr.io/nvidia/kai-scheduler/scheduler
     version: v0.13.0-rc1
-
   # Cert-manager
   - name: quay.io/jetstack/cert-manager-controller
     version: v1.14.4
@@ -42,6 +41,10 @@ images:
   - name: quay.io/jetstack/cert-manager-ctl
     version: v1.14.4
 
+  # Lightweight container for test pods
+  - name: busybox
+    version: latest
+
 # Helm charts used in E2E tests
 helmCharts:
   # Kai Scheduler - gang scheduling for Kubernetes

operator/e2e/tests/topology_test.go

Lines changed: 647 additions & 25 deletions
Large diffs are not rendered by default.

operator/e2e/utils/kai_topology.go

Lines changed: 103 additions & 7 deletions
@@ -21,9 +21,11 @@ package utils
 import (
 	"context"
 	"fmt"
+	"testing"
 	"time"
 
 	kaischedulingv2alpha2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2alpha2"
+	nameutils "github.com/ai-dynamo/grove/operator/api/common"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/dynamic"
 	"k8s.io/utils/ptr"
@@ -38,10 +40,30 @@ type ExpectedSubGroup struct {
 	PreferredTopologyLevel string
 }
 
+// PCSGCliqueConfig defines configuration for a single clique in a PCSG
+type PCSGCliqueConfig struct {
+	Name       string
+	PodCount   int32
+	Constraint string
+}
+
+// ScaledPCSGConfig defines configuration for verifying a scaled PCSG replica
+type ScaledPCSGConfig struct {
+	Name          string
+	PCSGName      string
+	PCSGReplica   int
+	MinAvailable  int
+	CliqueConfigs []PCSGCliqueConfig
+	Constraint    string
+}
+
 // CreateExpectedStandalonePCLQSubGroup creates an ExpectedSubGroup for a standalone PodClique (not in PCSG)
 // Name format: <pcs-name>-<pcs-replica>-<clique-name>
 func CreateExpectedStandalonePCLQSubGroup(pcsName string, pcsReplica int, cliqueName string, minMember int32, topologyLevel string) ExpectedSubGroup {
-	name := GetStandalonePCLQSubGroupName(pcsName, pcsReplica, cliqueName)
+	name := nameutils.GeneratePodCliqueName(
+		nameutils.ResourceNameReplica{Name: pcsName, Replica: pcsReplica},
+		cliqueName,
+	)
 	return ExpectedSubGroup{
 		Name:      name,
 		MinMember: minMember,
@@ -53,7 +75,11 @@ func CreateExpectedStandalonePCLQSubGroup(pcsName string, pcsReplica int, clique
 // CreateExpectedPCSGParentSubGroup creates an ExpectedSubGroup for a PCSG parent (scaling group replica)
 // Name format: <pcs-name>-<pcs-replica>-<sg-name>-<sg-replica>
 func CreateExpectedPCSGParentSubGroup(pcsName string, pcsReplica int, sgName string, sgReplica int, topologyLevel string) ExpectedSubGroup {
-	name := GetPCSGParentSubGroupName(pcsName, pcsReplica, sgName, sgReplica)
+	pcsgFQN := nameutils.GeneratePodCliqueScalingGroupName(
+		nameutils.ResourceNameReplica{Name: pcsName, Replica: pcsReplica},
+		sgName,
+	)
+	name := fmt.Sprintf("%s-%d", pcsgFQN, sgReplica)
 	return ExpectedSubGroup{
 		Name:      name,
 		MinMember: 0,
@@ -62,19 +88,41 @@ func CreateExpectedPCSGParentSubGroup(pcsName string, pcsReplica int, sgName str
 	}
 }
 
-// CreateExpectedPCLQInPCSGSubGroup creates an ExpectedSubGroup for a PodClique within a PCSG
+// CreateExpectedPCLQInPCSGSubGroup creates an ExpectedSubGroup for a PodClique within a PCSG with parent
 // Name format: <pcs-name>-<pcs-replica>-<sg-name>-<sg-replica>-<clique-name>
 func CreateExpectedPCLQInPCSGSubGroup(pcsName string, pcsReplica int, sgName string, sgReplica int, cliqueName string, minMember int32, topologyLevel string) ExpectedSubGroup {
-	name := GetPCLQInPCSGSubGroupName(pcsName, pcsReplica, sgName, sgReplica, cliqueName)
-	parentName := GetPCSGParentSubGroupName(pcsName, pcsReplica, sgName, sgReplica)
+	return createExpectedPCLQInPCSGSubGroup(pcsName, pcsReplica, sgName, sgReplica, cliqueName, minMember, topologyLevel, true)
+}
+
+func createExpectedPCLQInPCSGSubGroup(pcsName string, pcsReplica int, sgName string, sgReplica int, cliqueName string,
+	minMember int32, topologyLevel string, hasParent bool) ExpectedSubGroup {
+	pcsgFQN := nameutils.GeneratePodCliqueScalingGroupName(
+		nameutils.ResourceNameReplica{Name: pcsName, Replica: pcsReplica},
+		sgName,
+	)
+	name := nameutils.GeneratePodCliqueName(
+		nameutils.ResourceNameReplica{Name: pcsgFQN, Replica: sgReplica},
+		cliqueName,
+	)
+	var parentPtr *string
+	if hasParent {
+		parentPtr = ptr.To(fmt.Sprintf("%s-%d", pcsgFQN, sgReplica))
+	}
 	return ExpectedSubGroup{
 		Name:      name,
 		MinMember: minMember,
-		Parent:    ptr.To(parentName),
+		Parent:    parentPtr,
 		RequiredTopologyLevel: topologyLevel,
 	}
 }
 
+// CreateExpectedPCLQInPCSGSubGroupNoParent creates an ExpectedSubGroup for a PodClique within a PCSG without parent
+// Used when PCSG has no topology constraint (no parent SubGroup created)
+// Name format: <pcs-name>-<pcs-replica>-<sg-name>-<sg-replica>-<clique-name>
+func CreateExpectedPCLQInPCSGSubGroupNoParent(pcsName string, pcsReplica int, sgName string, sgReplica int, cliqueName string, minMember int32, topologyLevel string) ExpectedSubGroup {
+	return createExpectedPCLQInPCSGSubGroup(pcsName, pcsReplica, sgName, sgReplica, cliqueName, minMember, topologyLevel, false)
+}
+
 // GetKAIPodGroupsForPCS retrieves all KAI PodGroups for a given PodCliqueSet by label selector
 // KAI scheduler creates PodGroups with label: app.kubernetes.io/part-of=<pcs-name>
 // Returns a list of PodGroups that tests can filter by owner reference if needed
@@ -233,7 +281,7 @@ func GetPodGroupForBasePodGangReplica(
 		return nil, fmt.Errorf("failed to get KAI PodGroups: %w", err)
 	}
 
-	basePodGangName := GetBasePodGangName(workloadName, pgsReplica)
+	basePodGangName := nameutils.GenerateBasePodGangName(nameutils.ResourceNameReplica{Name: workloadName, Replica: pgsReplica})
 	basePodGroup, err := FilterPodGroupByOwner(podGroups, basePodGangName)
 	if err != nil {
 		return nil, fmt.Errorf("failed to find PodGroup for PodGang %s: %w", basePodGangName, err)
@@ -259,3 +307,51 @@ func VerifyPodGroupTopology(
 
 	return nil
 }
+
+// VerifyScaledPCSGReplicaTopology verifies KAI PodGroup for ONE scaled PCSG replica.
+// Scaled PodGroup top-level constraint: uses pcsConstraint ONLY if PCSG has NO constraint.
+func VerifyScaledPCSGReplicaTopology(
+	ctx context.Context,
+	t *testing.T,
+	dynamicClient dynamic.Interface,
+	namespace string,
+	pcsName string,
+	pcsReplica int,
+	pcsgConfig ScaledPCSGConfig,
+	pcsConstraint string,
+	logger *Logger,
+) {
+	podGroups, err := GetKAIPodGroupsForPCS(ctx, dynamicClient, namespace, pcsName)
+	if err != nil {
+		t.Fatalf("Failed to get KAI PodGroups: %v", err)
+	}
+
+	pcsgFQN := nameutils.GeneratePodCliqueScalingGroupName(
+		nameutils.ResourceNameReplica{Name: pcsName, Replica: pcsReplica},
+		pcsgConfig.PCSGName,
+	)
+
+	scaledPodGangName := nameutils.CreatePodGangNameFromPCSGFQN(pcsgFQN, pcsgConfig.PCSGReplica-pcsgConfig.MinAvailable)
+
+	scaledPodGroup, err := FilterPodGroupByOwner(podGroups, scaledPodGangName)
+	if err != nil {
+		t.Fatalf("Failed to find scaled PodGroup for %s: %v", scaledPodGangName, err)
+	}
+
+	var expectedSubGroups []ExpectedSubGroup
+
+	for _, cliqueConfig := range pcsgConfig.CliqueConfigs {
+		expectedSubGroups = append(expectedSubGroups,
+			CreateExpectedPCLQInPCSGSubGroupNoParent(pcsName, pcsReplica, pcsgConfig.PCSGName, pcsgConfig.PCSGReplica, cliqueConfig.Name, cliqueConfig.PodCount, cliqueConfig.Constraint))
+	}
+
+	scaledTopConstraint := pcsConstraint
+	if pcsgConfig.Constraint != "" {
+		scaledTopConstraint = pcsgConfig.Constraint
+	}
+
+	if err := VerifyPodGroupTopology(scaledPodGroup, scaledTopConstraint, "", expectedSubGroups, logger); err != nil {
+		t.Fatalf("Failed to verify scaled PodGroup %s (%s replica %d) topology: %v",
+			scaledPodGangName, pcsgConfig.Name, pcsgConfig.PCSGReplica, err)
+	}
+}
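
The "Name format" comments above all compose plain hyphenated strings, which is what lets the helpers build a PCSG parent name with fmt.Sprintf("%s-%d", pcsgFQN, sgReplica). A minimal sketch of that composition, with illustrative values (the pcs/sg/clique names here are hypothetical, not taken from this commit):

    // naming_sketch.go - illustrative only; mirrors the "Name format" comments above.
    package main

    import "fmt"

    func main() {
    	pcs, pcsReplica := "workload", 0
    	// Standalone PCLQ: <pcs-name>-<pcs-replica>-<clique-name>
    	fmt.Println(fmt.Sprintf("%s-%d-%s", pcs, pcsReplica, "frontend")) // workload-0-frontend
    	// PCSG parent: <pcs-name>-<pcs-replica>-<sg-name>-<sg-replica>
    	pcsgParent := fmt.Sprintf("%s-%d-%s-%d", pcs, pcsReplica, "inference", 1)
    	fmt.Println(pcsgParent) // workload-0-inference-1
    	// PCLQ in PCSG: <pcs-name>-<pcs-replica>-<sg-name>-<sg-replica>-<clique-name>
    	fmt.Println(fmt.Sprintf("%s-%s", pcsgParent, "prefill")) // workload-0-inference-1-prefill
    }

This appears to mirror what nameutils.GeneratePodCliqueName and nameutils.GeneratePodCliqueScalingGroupName produce, which is why the commit can replace the deleted naming.go helpers with the shared nameutils package.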

operator/e2e/utils/naming.go

Lines changed: 0 additions & 45 deletions
This file was deleted.

operator/e2e/utils/topology.go

Lines changed: 66 additions & 0 deletions
@@ -203,3 +203,69 @@ func VerifyLabeledPodsInTopologyDomain(
 
 	return VerifyPodsInSameTopologyDomain(ctx, clientset, filteredPods, topologyKey, logger)
 }
+
+// PCSGTypeConfig defines configuration for a PCSG type verification
+type PCSGTypeConfig struct {
+	Name string // Human-readable name (e.g., "decoder")
+	FQN  string // Fully-qualified PCSG name
+}
+
+// VerifyPCSGReplicasInTopologyDomain verifies that each PCSG replica's pods
+// are in the same topology domain (e.g., rack, host).
+func VerifyPCSGReplicasInTopologyDomain(
+	ctx context.Context,
+	clientset kubernetes.Interface,
+	allPods []v1.Pod,
+	pcsgLabel string,
+	replicaCount int,
+	podsPerReplica int,
+	topologyLabel string,
+	logger *Logger,
+) error {
+	for replica := 0; replica < replicaCount; replica++ {
+		replicaPods := FilterPodsByLabel(
+			FilterPodsByLabel(allPods, "grove.io/podcliquescalinggroup", pcsgLabel),
+			"grove.io/podcliquescalinggroup-replica-index",
+			fmt.Sprintf("%d", replica),
+		)
+		if len(replicaPods) != podsPerReplica {
+			return fmt.Errorf("expected %d PCSG replica %d pods, got %d", podsPerReplica, replica, len(replicaPods))
+		}
+		if err := VerifyPodsInSameTopologyDomain(ctx, clientset, replicaPods, topologyLabel, logger); err != nil {
+			return fmt.Errorf("failed to verify PCSG replica %d pods in same topology domain: %w", replica, err)
+		}
+	}
+	return nil
+}
+
+// VerifyMultiTypePCSGReplicas verifies multiple PCSG types across replicas.
+// Each PCSG type has multiple replicas, and each replica's pods should be in the same topology domain.
+func VerifyMultiTypePCSGReplicas(
+	ctx context.Context,
+	clientset kubernetes.Interface,
+	allPods []v1.Pod,
+	pcsgTypes []PCSGTypeConfig,
+	replicasPerType int,
+	podsPerReplica int,
+	topologyLabel string,
+	logger *Logger,
+) error {
+	for _, pcsgType := range pcsgTypes {
+		for replica := 0; replica < replicasPerType; replica++ {
+			replicaPods := FilterPodsByLabel(
+				FilterPodsByLabel(allPods, "grove.io/podcliquescalinggroup", pcsgType.FQN),
+				"grove.io/podcliquescalinggroup-replica-index",
+				fmt.Sprintf("%d", replica),
+			)
+			if len(replicaPods) != podsPerReplica {
+				return fmt.Errorf("expected %d %s replica-%d pods, got %d",
+					podsPerReplica, pcsgType.Name, replica, len(replicaPods))
+			}
+			if err := VerifyPodsInSameTopologyDomain(ctx, clientset, replicaPods, topologyLabel, logger); err != nil {
+				return fmt.Errorf("failed to verify %s replica-%d pods in same topology domain: %w",
+					pcsgType.Name, replica, err)
+			}
+		}
+	}
+	return nil
+}
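
As a usage sketch, a test that deploys two PCSG types could drive the multi-type helper as below. The type names, FQNs, counts, and topology label are hypothetical, not taken from topology_test.go; only the helper's signature comes from this diff:

    // Hypothetical invocation from a test in the e2e package (which imports
    // these helpers as utils): two PCSG types, two replicas each, two pods
    // per replica, each replica packed into a single rack.
    pcsgTypes := []utils.PCSGTypeConfig{
    	{Name: "prefill", FQN: "tas-multi-0-prefill"},
    	{Name: "decoder", FQN: "tas-multi-0-decoder"},
    }
    if err := utils.VerifyMultiTypePCSGReplicas(ctx, clientset, allPods, pcsgTypes,
    	2, 2, "topology.kubernetes.io/rack", logger); err != nil {
    	t.Fatalf("PCSG topology verification failed: %v", err)
    }

Note the nested FilterPodsByLabel calls in both helpers: each replica is matched on both the PCSG name label and the replica-index label, so pods from sibling replicas of the same PCSG cannot leak into another replica's domain check.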
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+# Workload 8: SP-1 - Full 3-Level Hierarchy with Cascading Constraints
+# Test scenario: PCS (block) → PCSG (rack) → PCLQ (host) - demonstrating constraint inheritance
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: tas-hierarchy
+  labels:
+    app: tas-hierarchy
+spec:
+  replicas: 1
+  template:
+    topologyConstraint:
+      packDomain: block # PCS level - broadest
+    podCliqueScalingGroups:
+      - name: inference-group
+        replicas: 2
+        minAvailable: 2
+        topologyConstraint:
+          packDomain: rack # PCSG level - stricter than parent
+        cliqueNames:
+          - prefill
+          - decode
+    cliques:
+      - name: prefill
+        labels:
+          kai.scheduler/queue: test
+        topologyConstraint:
+          packDomain: host # PCLQ level - strictest
+        spec:
+          roleName: prefill
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: kai-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: node_role.e2e.grove.nvidia.com
+                          operator: In
+                          values:
+                            - agent
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: prefill
+                image: registry:5001/busybox:latest
+                command: ["sleep", "infinity"]
+                resources:
+                  requests:
+                    memory: 40Mi
+      - name: decode
+        labels:
+          kai.scheduler/queue: test
+        topologyConstraint:
+          packDomain: host # PCLQ level - strictest
+        spec:
+          roleName: decode
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: kai-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: node_role.e2e.grove.nvidia.com
+                          operator: In
+                          values:
+                            - agent
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: decode
+                image: registry:5001/busybox:latest
+                command: ["sleep", "infinity"]
+                resources:
+                  requests:
+                    memory: 40Mi
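
Given the cascading constraints above, the expected KAI SubGroup layout can be sketched with the helpers from kai_topology.go. This is a sketch, not the assertions actually made in topology_test.go, and it assumes the base PodGang covers both inference-group replicas (minAvailable: 2):

    // Sketch: PCS packs at block, each inference-group replica packs at rack,
    // and each clique packs at host, mirroring the YAML above.
    var expected []utils.ExpectedSubGroup
    for sgReplica := 0; sgReplica < 2; sgReplica++ {
    	expected = append(expected,
    		utils.CreateExpectedPCSGParentSubGroup("tas-hierarchy", 0, "inference-group", sgReplica, "rack"),
    		utils.CreateExpectedPCLQInPCSGSubGroup("tas-hierarchy", 0, "inference-group", sgReplica, "prefill", 2, "host"),
    		utils.CreateExpectedPCLQInPCSGSubGroup("tas-hierarchy", 0, "inference-group", sgReplica, "decode", 2, "host"),
    	)
    }

Each clique SubGroup gets MinMember 2 (from minAvailable) and a parent pointing at its rack-level PCSG SubGroup, while the PodGroup's top-level constraint stays at the PCS-level block domain.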

operator/e2e/yaml/tas-host-level.yaml

Lines changed: 4 additions & 2 deletions
@@ -38,7 +38,9 @@ spec:
           effect: NoSchedule
         containers:
           - name: worker
-            image: registry:5001/nginx:alpine-slim
+            image: registry:5001/busybox:latest
+            command: ["sleep", "infinity"]
             resources:
               requests:
-                memory: 30Mi
+                memory: 40Mi
+