diff --git a/operator/e2e/dependencies.yaml b/operator/e2e/dependencies.yaml index 1bd01a55a..7d06eccea 100644 --- a/operator/e2e/dependencies.yaml +++ b/operator/e2e/dependencies.yaml @@ -31,7 +31,6 @@ images: version: v0.13.0-rc1 - name: ghcr.io/nvidia/kai-scheduler/scheduler version: v0.13.0-rc1 - # Cert-manager - name: quay.io/jetstack/cert-manager-controller version: v1.14.4 @@ -42,6 +41,10 @@ images: - name: quay.io/jetstack/cert-manager-ctl version: v1.14.4 + # Lightweight container for test pods + - name: busybox + version: latest + # Helm charts used in E2E tests helmCharts: # Kai Scheduler - gang scheduling for Kubernetes diff --git a/operator/e2e/tests/topology_test.go b/operator/e2e/tests/topology_test.go index 1e04148d0..cadbc8f7b 100644 --- a/operator/e2e/tests/topology_test.go +++ b/operator/e2e/tests/topology_test.go @@ -23,9 +23,12 @@ import ( "fmt" "testing" + kaischedulingv2alpha2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2alpha2" + nameutils "github.com/ai-dynamo/grove/operator/api/common" corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" "github.com/ai-dynamo/grove/operator/e2e/setup" "github.com/ai-dynamo/grove/operator/e2e/utils" + "github.com/samber/lo" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/dynamic" @@ -81,6 +84,18 @@ func createTopologyTestContext( } } +// GetPodGroupOrFail retrieves a PodGroup for the specified PCS replica or fails the test. +func GetPodGroupOrFail(t *testing.T, tc TestContext, pcsReplica int) *kaischedulingv2alpha2.PodGroup { + podGroup, err := utils.GetPodGroupForBasePodGangReplica( + tc.Ctx, tc.DynamicClient, tc.Namespace, tc.Workload.Name, + pcsReplica, tc.Timeout, tc.Interval, logger, + ) + if err != nil { + t.Fatalf("Failed to get PodGroup for replica %d: %v", pcsReplica, err) + } + return podGroup +} + // Test_TAS1_TopologyInfrastructure verifies that the operator creates ClusterTopology and KAI Topology CRs at startup // 1. Verify ClusterTopology CR exists with the correct 4-level hierarchy (zone, block, rack, host) // 2. Verify KAI Topology CR exists with matching levels @@ -184,10 +199,7 @@ func Test_TAS2_MultipleCliquesWithDifferentConstraints(t *testing.T) { } logger.Info("5. Verify KAI PodGroup has correct SubGroups with topology constraints") - podGroup, err := utils.GetPodGroupForBasePodGangReplica(tc.Ctx, tc.DynamicClient, tc.Namespace, tc.Workload.Name, 0, tc.Timeout, tc.Interval, logger) - if err != nil { - t.Fatalf("Failed to get PodGroup: %v", err) - } + podGroup := GetPodGroupOrFail(t, tc, 0) // Verify top-level TopologyConstraint is empty (no PCS constraint in this test) // Verify SubGroups (2 standalone PCLQs - no PCSG) @@ -234,10 +246,7 @@ func Test_TAS3_PCSOnlyConstraint(t *testing.T) { } logger.Info("4. Verify KAI PodGroup has correct SubGroups (PCS-only constraint)") - podGroup, err := utils.GetPodGroupForBasePodGangReplica(tc.Ctx, tc.DynamicClient, tc.Namespace, tc.Workload.Name, 0, tc.Timeout, tc.Interval, logger) - if err != nil { - t.Fatalf("Failed to get PodGroup: %v", err) - } + podGroup := GetPodGroupOrFail(t, tc, 0) // Verify top-level TopologyConstraint (PCS level: rack) // Verify SubGroups (2 PCLQ children + 1 router standalone = 3 total) @@ -283,11 +292,8 @@ func Test_TAS4_PCSGOnlyConstraint(t *testing.T) { t.Fatalf("Failed to verify worker pods in same rack: %v", err) } - logger.Info("5. 
Verify KAI PodGroup has correct SubGroups (PCSG-only constraint)") - podGroup, err := utils.GetPodGroupForBasePodGangReplica(tc.Ctx, tc.DynamicClient, tc.Namespace, tc.Workload.Name, 0, tc.Timeout, tc.Interval, logger) - if err != nil { - t.Fatalf("Failed to get PodGroup: %v", err) - } + logger.Info("4. Verify KAI PodGroup has correct SubGroups (PCSG-only constraint)") + podGroup := GetPodGroupOrFail(t, tc, 0) // Verify top-level TopologyConstraint (no PCS constraint) // Verify SubGroups (2 PCSG parents + 2 PCLQ children + 1 router standalone = 5 total) @@ -343,10 +349,7 @@ func Test_TAS5_HostLevelConstraint(t *testing.T) { } logger.Info("4. Verify KAI PodGroup has correct SubGroups (PCLQ-only host constraint)") - podGroup, err := utils.GetPodGroupForBasePodGangReplica(tc.Ctx, tc.DynamicClient, tc.Namespace, tc.Workload.Name, 0, tc.Timeout, tc.Interval, logger) - if err != nil { - t.Fatalf("Failed to get PodGroup: %v", err) - } + podGroup := GetPodGroupOrFail(t, tc, 0) // Verify top-level TopologyConstraint (no PCS constraint) // Verify SubGroups (1 standalone PCLQ with host constraint) @@ -393,10 +396,7 @@ func Test_TAS6_StandalonePCLQOnlyPCSZoneConstraint(t *testing.T) { } logger.Info("4. Verify KAI PodGroup has correct SubGroups (Standalone PCLQ with PCS zone constraint)") - podGroup, err := utils.GetPodGroupForBasePodGangReplica(tc.Ctx, tc.DynamicClient, tc.Namespace, tc.Workload.Name, 0, tc.Timeout, tc.Interval, logger) - if err != nil { - t.Fatalf("Failed to get PodGroup: %v", err) - } + podGroup := GetPodGroupOrFail(t, tc, 0) // Verify top-level TopologyConstraint (PCS level: zone) // Verify SubGroups (1 standalone PCLQ with NO constraint - zone is at PCS level) @@ -436,10 +436,7 @@ func Test_TAS7_NoTopologyConstraint(t *testing.T) { t.Fatalf("Expected 4 pods, got %d", len(allPods)) } logger.Info("4. Verify KAI PodGroup has correct SubGroups (no constraints)") - podGroup, err := utils.GetPodGroupForBasePodGangReplica(tc.Ctx, tc.DynamicClient, tc.Namespace, tc.Workload.Name, 0, tc.Timeout, tc.Interval, logger) - if err != nil { - t.Fatalf("Failed to get PodGroup: %v", err) - } + podGroup := GetPodGroupOrFail(t, tc, 0) // Verify top-level TopologyConstraint (no PCS constraint) // Verify SubGroups (2 PCLQ children, NO constraints) @@ -455,3 +452,628 @@ func Test_TAS7_NoTopologyConstraint(t *testing.T) { logger.Info("🎉 TAS7: No Topology Constraint test completed successfully!") } + +// Test_TAS8_FullHierarchyWithCascadingConstraints tests 3-level topology hierarchy with cascading constraints +// 1. Deploy workload with PCS (block) → PCSG (rack) → PCLQ (host) constraints +// 2. PCSG: 2 replicas with prefill (2 pods) + decode (2 pods) cliques +// 3. Verify each PCLQ's pods on same host (4 verifications: prefill0, decode0, prefill1, decode1) +// 4. Verify each PCSG replica in same rack (2 verifications: replica0, replica1) +// 5. Verify all pods in same block (PCS constraint) +// 6. Verify KAI PodGroup hierarchy with correct topology constraints +func Test_TAS8_FullHierarchyWithCascadingConstraints(t *testing.T) { + ctx := context.Background() + + logger.Info("1. Initialize an 8-node Grove cluster for topology testing") + clientset, restConfig, dynamicClient, cleanup := prepareTestCluster(ctx, t, 8) + defer cleanup() + + expectedPods := 8 // 2 PCSG replicas × (prefill: 2 pods + decode: 2 pods) + tc := createTopologyTestContext(t, ctx, clientset, restConfig, dynamicClient, + "tas-hierarchy", "../yaml/tas-hierarchy.yaml", expectedPods) + + logger.Info("2. 
Deploy workload (TAS8: full 3-level hierarchy with cascading constraints)") + allPods, err := DeployWorkloadAndGetPods(tc, expectedPods) + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + + logger.Info("3. Verify PCLQ constraints (2 replicas × 2 clique types) - all on same host") + cliqueTypes := []string{"prefill", "decode"} + for pcsgReplica := 0; pcsgReplica < 2; pcsgReplica++ { + for _, cliqueType := range cliqueTypes { + cliquePods := utils.FilterPodsByLabel(allPods, LabelPodClique, + fmt.Sprintf("tas-hierarchy-0-inference-group-%d-%s", pcsgReplica, cliqueType)) + if len(cliquePods) != 2 { + t.Fatalf("Expected 2 %s pods for PCSG replica %d, got %d", cliqueType, pcsgReplica, len(cliquePods)) + } + if err := utils.VerifyPodsInSameTopologyDomain(tc.Ctx, tc.Clientset, cliquePods, setup.TopologyLabelHostname, logger); err != nil { + t.Fatalf("Failed to verify %s pods on same host for PCSG replica %d: %v", cliqueType, pcsgReplica, err) + } + } + } + + logger.Info("4. Verify PCSG constraints (2 replicas) - all in same rack") + if err := utils.VerifyPCSGReplicasInTopologyDomain(tc.Ctx, tc.Clientset, allPods, + "tas-hierarchy-0-inference-group", 2, 4, setup.TopologyLabelRack, logger); err != nil { + t.Fatalf("Failed to verify PCSG replicas: %v", err) + } + + logger.Info("5. Verify all pods are in same block (PCS constraint)") + if len(allPods) != expectedPods { + t.Fatalf("Expected %d pods, got %d", expectedPods, len(allPods)) + } + if err := utils.VerifyPodsInSameTopologyDomain(tc.Ctx, tc.Clientset, allPods, setup.TopologyLabelBlock, logger); err != nil { + t.Fatalf("Failed to verify all pods in same block: %v", err) + } + + logger.Info("6. Verify KAI PodGroup has correct hierarchy with topology constraints") + podGroup := GetPodGroupOrFail(t, tc, 0) + + // Verify top-level TopologyConstraint (PCS level: block) + SubGroups hierarchy (2 PCSG parents + 4 PCLQ children) + expectedSubGroups := []utils.ExpectedSubGroup{ + utils.CreateExpectedPCSGParentSubGroup(tc.Workload.Name, 0, "inference-group", 0, setup.TopologyLabelRack), + utils.CreateExpectedPCSGParentSubGroup(tc.Workload.Name, 0, "inference-group", 1, setup.TopologyLabelRack), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, 0, "inference-group", 0, "prefill", 2, setup.TopologyLabelHostname), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, 0, "inference-group", 0, "decode", 2, setup.TopologyLabelHostname), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, 0, "inference-group", 1, "prefill", 2, setup.TopologyLabelHostname), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, 0, "inference-group", 1, "decode", 2, setup.TopologyLabelHostname), + } + if err := utils.VerifyPodGroupTopology(podGroup, setup.TopologyLabelBlock, "", expectedSubGroups, logger); err != nil { + t.Fatalf("Failed to verify KAI PodGroup topology: %v", err) + } + + logger.Info("🎉 TAS8: Full Hierarchy with Cascading Constraints test completed successfully!") +} + +// Test_TAS9_PCSPlusPCLQConstraint tests PCS block constraint combined with PCLQ host constraint +// 1. Deploy workload with PCS: block constraint, PCLQ: host constraint +// 2. 2 pods total +// 3. Verify pods on same host (PCLQ constraint - strictest) +// 4. Verify KAI PodGroup has block constraint at top level, host constraint at PCLQ level +func Test_TAS9_PCSPlusPCLQConstraint(t *testing.T) { + ctx := context.Background() + + logger.Info("1. 
Initialize a 28-node Grove cluster for topology testing") + clientset, restConfig, dynamicClient, cleanup := prepareTestCluster(ctx, t, 28) + defer cleanup() + + expectedPods := 2 + tc := createTopologyTestContext(t, ctx, clientset, restConfig, dynamicClient, + "tas-pcs-pclq", "../yaml/tas-pcs-pclq.yaml", expectedPods) + + logger.Info("2. Deploy workload (TAS9: PCS block + PCLQ host constraint)") + allPods, err := DeployWorkloadAndGetPods(tc, expectedPods) + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + + logger.Info("3. Verify 2 pods on same host (PCLQ host constraint)") + if err := utils.VerifyPodsInSameTopologyDomain(tc.Ctx, tc.Clientset, allPods, setup.TopologyLabelHostname, logger); err != nil { + t.Fatalf("Failed to verify pods on same host: %v", err) + } + + logger.Info("4. Verify KAI PodGroup has correct SubGroups (PCS block + PCLQ host)") + podGroup := GetPodGroupOrFail(t, tc, 0) + + // Verify top-level TopologyConstraint (PCS level: block) + SubGroups (1 standalone PCLQ with host constraint) + expectedSubGroups := []utils.ExpectedSubGroup{ + utils.CreateExpectedStandalonePCLQSubGroup(tc.Workload.Name, 0, "worker", 2, setup.TopologyLabelHostname), + } + if err := utils.VerifyPodGroupTopology(podGroup, setup.TopologyLabelBlock, "", expectedSubGroups, logger); err != nil { + t.Fatalf("Failed to verify KAI PodGroup topology: %v", err) + } + + logger.Info("🎉 TAS9: PCS+PCLQ Constraint test completed successfully!") +} + +// Test_TAS10_PCSGScalingWithTopologyConstraints tests PCSG scaling with rack constraints +// 1. Deploy workload with 3 PCSG replicas, each with rack constraint +// 2. 6 pods total (2 per PCSG replica) +// 3. Verify each PCSG replica's pods in same rack +// 4. Verify all pods respect PCS-level rack constraint (all in same rack) +// 5. Verify base PodGang KAI PodGroup topology constraints +// 6. Verify scaled PodGangs' KAI PodGroups (replicas 1-2) +func Test_TAS10_PCSGScalingWithTopologyConstraints(t *testing.T) { + ctx := context.Background() + + logger.Info("1. Initialize a 28-node Grove cluster for topology testing") + clientset, restConfig, dynamicClient, cleanup := prepareTestCluster(ctx, t, 28) + defer cleanup() + + expectedPods := 6 // 3 PCSG replicas × 2 pods each + tc := createTopologyTestContext(t, ctx, clientset, restConfig, dynamicClient, + "tas-pcsg-scale", "../yaml/tas-pcsg-scale.yaml", expectedPods) + + logger.Info("2. Deploy workload (TAS10: PCSG scaling with topology constraints)") + allPods, err := DeployWorkloadAndGetPods(tc, expectedPods) + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + + logger.Info("3. Verify each PCSG replica's worker pods (2) are in same rack") + if err := utils.VerifyPCSGReplicasInTopologyDomain(tc.Ctx, tc.Clientset, allPods, + "tas-pcsg-scale-0-inference-group", 3, 2, setup.TopologyLabelRack, logger); err != nil { + t.Fatalf("Failed to verify PCSG replicas: %v", err) + } + + logger.Info("4. Verify all pods respect PCS-level block constraint") + if len(allPods) != expectedPods { + t.Fatalf("Expected %d pods, got %d", expectedPods, len(allPods)) + } + if err := utils.VerifyPodsInSameTopologyDomain(tc.Ctx, tc.Clientset, allPods, setup.TopologyLabelBlock, logger); err != nil { + t.Fatalf("Failed to verify all pods in same block: %v", err) + } + + logger.Info("5. 
Verify KAI PodGroup has correct SubGroups with topology constraints") + podGroup := GetPodGroupOrFail(t, tc, 0) + + // Verify top-level TopologyConstraint (PCS level: block) + // Base PodGang contains only minAvailable=1 PCSG replica + // PCSG has replicas=3 and minAvailable=1, so base PodGang contains ONLY replica 0 + // Replicas 1 and 2 are in separate scaled PodGangs + expectedSubGroups := []utils.ExpectedSubGroup{ + utils.CreateExpectedPCSGParentSubGroup(tc.Workload.Name, 0, "inference-group", 0, setup.TopologyLabelRack), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, 0, "inference-group", 0, "worker", 2, ""), + } + if err := utils.VerifyPodGroupTopology(podGroup, setup.TopologyLabelBlock, "", expectedSubGroups, logger); err != nil { + t.Fatalf("Failed to verify KAI PodGroup topology: %v", err) + } + + logger.Info("6. Verify scaled PodGangs' KAI PodGroups (replicas 1-2)") + + // Verify PCSG replicas 1-2 (minAvailable=1, totalReplicas=3) + lo.ForEach([]int{1, 2}, func(pcsgReplica int, _ int) { + utils.VerifyScaledPCSGReplicaTopology(tc.Ctx, t, tc.DynamicClient, tc.Namespace, tc.Workload.Name, 0, + utils.ScaledPCSGConfig{ + Name: "inference-group", + PCSGName: "inference-group", + PCSGReplica: pcsgReplica, + MinAvailable: 1, + CliqueConfigs: []utils.PCSGCliqueConfig{ + {Name: "worker", PodCount: 2, Constraint: ""}, + }, + Constraint: setup.TopologyLabelRack, + }, setup.TopologyLabelBlock, logger) + }) + + logger.Info("🎉 TAS10: PCSG Scaling with Topology Constraints test completed successfully!") +} + +// Test_TAS11_PCSGPlusPCLQNoParentConstraint tests PCSG rack + PCLQ host constraints without PCS constraint +// 1. Deploy workload with PCSG: rack constraint, PCLQ: host constraint, NO PCS constraint +// 2. 4 pods (2 PCSG replicas × 2 pods) +// 3. Verify each PCSG replica's pods on same host +// 4. Verify KAI PodGroup has PCSG rack + PCLQ host constraints, NO top-level PCS constraint +func Test_TAS11_PCSGPlusPCLQNoParentConstraint(t *testing.T) { + ctx := context.Background() + + logger.Info("1. Initialize a 28-node Grove cluster for topology testing") + clientset, restConfig, dynamicClient, cleanup := prepareTestCluster(ctx, t, 28) + defer cleanup() + + expectedPods := 4 // 2 PCSG replicas × 2 pods each + tc := createTopologyTestContext(t, ctx, clientset, restConfig, dynamicClient, + "tas-pcsg-pclq", "../yaml/tas-pcsg-pclq.yaml", expectedPods) + + logger.Info("2. Deploy workload (TAS11: PCSG rack + PCLQ host, no PCS constraint)") + allPods, err := DeployWorkloadAndGetPods(tc, expectedPods) + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + + logger.Info("3. Verify each PCSG replica's pods on same host") + workersPCSG := nameutils.GeneratePodCliqueScalingGroupName( + nameutils.ResourceNameReplica{Name: tc.Workload.Name, Replica: 0}, + "workers", + ) + if err := utils.VerifyPCSGReplicasInTopologyDomain(tc.Ctx, tc.Clientset, allPods, + workersPCSG, 2, 2, setup.TopologyLabelHostname, logger); err != nil { + t.Fatalf("Failed to verify PCSG replicas: %v", err) + } + + logger.Info("4. 
Verify KAI PodGroup has correct SubGroups (PCSG rack + PCLQ host)") + podGroup := GetPodGroupOrFail(t, tc, 0) + + // Verify top-level TopologyConstraint (no PCS constraint) + // SubGroups (2 PCSG parents with rack + 2 PCLQ children with host) + expectedSubGroups := []utils.ExpectedSubGroup{ + utils.CreateExpectedPCSGParentSubGroup(tc.Workload.Name, 0, "workers", 0, setup.TopologyLabelRack), + utils.CreateExpectedPCSGParentSubGroup(tc.Workload.Name, 0, "workers", 1, setup.TopologyLabelRack), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, 0, "workers", 0, "worker", 2, setup.TopologyLabelHostname), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, 0, "workers", 1, "worker", 2, setup.TopologyLabelHostname), + } + if err := utils.VerifyPodGroupTopology(podGroup, "", "", expectedSubGroups, logger); err != nil { + t.Fatalf("Failed to verify KAI PodGroup topology: %v", err) + } + + logger.Info("🎉 TAS11: PCSG+PCLQ Constraint test completed successfully!") +} + +// Test_TAS12_LargeScalingRatio tests large PCSG scaling ratio with minAvailable +// 1. Deploy workload with replicas=10, minAvailable=3, PCSG host constraint, PCS block constraint +// 2. 20 pods expected (only minAvailable=3 replicas × 2 pods from base PodGang + 7 scaled PodGangs × 2 pods) +// 3. Verify each PCSG replica's pods on same host +// 4. Verify all pods in same block (PCS constraint) +// 5. Verify base PodGang KAI PodGroup contains minAvailable=3 replicas +// 6. Verify 7 scaled PodGangs' KAI PodGroups (replicas 3-9) +func Test_TAS12_LargeScalingRatio(t *testing.T) { + ctx := context.Background() + + logger.Info("1. Initialize a 28-node Grove cluster for topology testing") + clientset, restConfig, dynamicClient, cleanup := prepareTestCluster(ctx, t, 28) + defer cleanup() + + expectedPods := 20 // Base PodGang: 3 PCSG replicas × 2 pods (6) + Scaled: 7 PCSG replicas × 2 pods (14) + tc := createTopologyTestContext(t, ctx, clientset, restConfig, dynamicClient, + "tas-large-scale", "../yaml/tas-large-scale.yaml", expectedPods) + + logger.Info("2. Deploy workload (TAS12: Large scaling ratio, replicas=10/minAvailable=3)") + allPods, err := DeployWorkloadAndGetPods(tc, expectedPods) + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + + logger.Info("3. Verify each PCSG replica's pods on same host") + if err := utils.VerifyPCSGReplicasInTopologyDomain(tc.Ctx, tc.Clientset, allPods, + "tas-large-scale-0-workers", 10, 2, setup.TopologyLabelHostname, logger); err != nil { + t.Fatalf("Failed to verify PCSG replicas: %v", err) + } + + logger.Info("4. Verify all 20 pods in same block (PCS block constraint)") + if err := utils.VerifyPodsInSameTopologyDomain(tc.Ctx, tc.Clientset, allPods, setup.TopologyLabelBlock, logger); err != nil { + t.Fatalf("Failed to verify all pods in same block: %v", err) + } + + logger.Info("5. 
Verify base PodGang's KAI PodGroup (replicas 0-2)") + podGroup := GetPodGroupOrFail(t, tc, 0) + + // Verify top-level TopologyConstraint (PCS level: block) + // SubGroups (3 worker PCLQs with host constraint, no PCSG parent since no rack constraint) + pcsgFQN := nameutils.GeneratePodCliqueScalingGroupName( + nameutils.ResourceNameReplica{Name: tc.Workload.Name, Replica: 0}, + "workers", + ) + expectedSubGroups := []utils.ExpectedSubGroup{ + utils.CreateExpectedPCLQInPCSGSubGroupNoParent(tc.Workload.Name, 0, "workers", 0, "worker", 2, setup.TopologyLabelHostname), + utils.CreateExpectedPCLQInPCSGSubGroupNoParent(tc.Workload.Name, 0, "workers", 1, "worker", 2, setup.TopologyLabelHostname), + utils.CreateExpectedPCLQInPCSGSubGroupNoParent(tc.Workload.Name, 0, "workers", 2, "worker", 2, setup.TopologyLabelHostname), + } + if err := utils.VerifyPodGroupTopology(podGroup, setup.TopologyLabelBlock, "", expectedSubGroups, logger); err != nil { + t.Fatalf("Failed to verify KAI PodGroup topology: %v", err) + } + + logger.Info("6. Verify scaled PodGangs' KAI PodGroups (replicas 3-9)") + podGroups, err := utils.GetKAIPodGroupsForPCS(tc.Ctx, tc.DynamicClient, tc.Namespace, tc.Workload.Name) + if err != nil { + t.Fatalf("Failed to get KAI PodGroups: %v", err) + } + + // PCSG config: replicas=10, minAvailable=3 + // Base PodGang contains replicas 0-2, scaled PodGangs contain replicas 3-9 (reuse pcsgFQN from above) + pcsgMinAvailable := 3 + pcsgTotalReplicas := 10 + scaledPodGangCount := pcsgTotalReplicas - pcsgMinAvailable + + for scaledIndex := 0; scaledIndex < scaledPodGangCount; scaledIndex++ { + pcsgReplicaIndex := pcsgMinAvailable + scaledIndex + scaledPodGangName := nameutils.CreatePodGangNameFromPCSGFQN(pcsgFQN, scaledIndex) + + scaledPodGroup, err := utils.FilterPodGroupByOwner(podGroups, scaledPodGangName) + if err != nil { + t.Fatalf("Failed to find scaled PodGroup for %s: %v", scaledPodGangName, err) + } + + // Each scaled PodGang contains 1 PCSG replica with 1 PCLQ SubGroup (host constraint) + expectedSubGroups := []utils.ExpectedSubGroup{ + utils.CreateExpectedPCLQInPCSGSubGroupNoParent(tc.Workload.Name, 0, "workers", pcsgReplicaIndex, "worker", 2, setup.TopologyLabelHostname), + } + + if err := utils.VerifyPodGroupTopology(scaledPodGroup, setup.TopologyLabelBlock, "", expectedSubGroups, logger); err != nil { + t.Fatalf("Failed to verify scaled PodGroup %s (PCSG replica %d) topology: %v", + scaledPodGangName, pcsgReplicaIndex, err) + } + } + + logger.Info("🎉 TAS12: Large Scaling Ratio test completed successfully!") +} + +// Test_TAS13_InsufficientNodesForConstraint tests gang scheduling failure with unsatisfiable topology constraint +// 1. Deploy workload with rack constraint requesting 10 pods (exceeds rack capacity) +// 2. Verify all 10 pods remain in Pending state (no partial scheduling) +// 3. Verify NO pods are scheduled (all-or-nothing gang behavior) +// 4. Verify pod events show Unschedulable reason +// 5. Verify KAI PodGroup exists with correct constraints even though pods are pending +func Test_TAS13_InsufficientNodesForConstraint(t *testing.T) { + ctx := context.Background() + + logger.Info("1. Initialize a 28-node Grove cluster for topology testing") + clientset, restConfig, dynamicClient, cleanup := prepareTestCluster(ctx, t, 28) + defer cleanup() + + expectedPods := 10 + tc := createTopologyTestContext(t, ctx, clientset, restConfig, dynamicClient, + "tas-insuffic", "../yaml/tas-insuffic.yaml", expectedPods) + + logger.Info("2. 
Deploy workload (TAS13: insufficient nodes for rack constraint)") + _, err := deployAndVerifyWorkload(tc) + if err != nil { + t.Fatalf("Failed to deploy workload: %v", err) + } + + logger.Info("3. Verify all 10 pods remain in Pending state (no partial scheduling)") + if err := verifyPodsArePendingWithUnschedulableEvents(tc, true, expectedPods); err != nil { + t.Fatalf("Failed to verify pods are pending with unschedulable events: %v", err) + } + + logger.Info("4. Verify NO pods are scheduled (all-or-nothing gang behavior)") + pods, err := listPods(tc) + if err != nil { + t.Fatalf("Failed to list pods: %v", err) + } + + lo.ForEach(pods.Items, func(pod v1.Pod, _ int) { + if pod.Spec.NodeName != "" { + t.Fatalf("Expected pod %s to have no node assignment, but assigned to %s", pod.Name, pod.Spec.NodeName) + } + }) + logger.Info("5. Verify KAI PodGroup exists with correct topology constraints (even though pods are pending)") + podGroup := GetPodGroupOrFail(t, tc, 0) + + // Verify top-level TopologyConstraint (PCS level: rack) + // SubGroups (1 standalone PCLQ - no PCSG) + expectedSubGroups := []utils.ExpectedSubGroup{ + utils.CreateExpectedStandalonePCLQSubGroup(tc.Workload.Name, 0, "worker", 10, ""), + } + if err := utils.VerifyPodGroupTopology(podGroup, setup.TopologyLabelRack, "", expectedSubGroups, logger); err != nil { + t.Fatalf("Failed to verify KAI PodGroup topology: %v", err) + } + + logger.Info("🎉 TAS13: Insufficient Nodes for Constraint test completed successfully!") +} + +// Test_TAS14_MultiReplicaWithRackConstraint tests multiple PCS replicas with rack constraints +// 1. Deploy workload with 2 PCS replicas, each with rack constraint +// 2. 4 pods (2 per PCS replica) +// 3. Verify each PCS replica's pods in same rack +// 4. Verify KAI PodGroups for both PCS replicas have correct topology constraints +func Test_TAS14_MultiReplicaWithRackConstraint(t *testing.T) { + ctx := context.Background() + + logger.Info("1. Initialize a 28-node Grove cluster for topology testing") + clientset, restConfig, dynamicClient, cleanup := prepareTestCluster(ctx, t, 28) + defer cleanup() + + expectedPods := 4 + tc := createTopologyTestContext(t, ctx, clientset, restConfig, dynamicClient, + "tas-multirep", "../yaml/tas-multirep.yaml", expectedPods) + + logger.Info("2. Deploy workload (TAS14: multi-replica with rack constraint)") + allPods, err := DeployWorkloadAndGetPods(tc, expectedPods) + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + + logger.Info("3. Verify each PCS replica's pods (2) are in same rack") + for pcsReplica := 0; pcsReplica < 2; pcsReplica++ { + replicaPods := utils.FilterPodsByLabel(allPods, "grove.io/podcliqueset-replica-index", fmt.Sprintf("%d", pcsReplica)) + if len(replicaPods) != 2 { + t.Fatalf("Expected 2 replica-%d pods, got %d", pcsReplica, len(replicaPods)) + } + if err := utils.VerifyPodsInSameTopologyDomain(tc.Ctx, tc.Clientset, replicaPods, setup.TopologyLabelRack, logger); err != nil { + t.Fatalf("Failed to verify replica-%d pods in same rack: %v", pcsReplica, err) + } + } + + logger.Info("4. 
Verify KAI PodGroups for both replicas have correct topology constraints") + for pcsReplica := 0; pcsReplica < 2; pcsReplica++ { + podGroup := GetPodGroupOrFail(t, tc, pcsReplica) + + expectedSubGroups := []utils.ExpectedSubGroup{ + utils.CreateExpectedStandalonePCLQSubGroup(tc.Workload.Name, pcsReplica, "worker", 2, ""), + } + if err := utils.VerifyPodGroupTopology(podGroup, setup.TopologyLabelRack, "", expectedSubGroups, logger); err != nil { + t.Fatalf("Failed to verify PodGroup-%d topology: %v", pcsReplica, err) + } + } + + logger.Info("🎉 TAS14: Multi-Replica with Rack Constraint test completed successfully!") +} + +// Test_TAS15_DisaggregatedInferenceMultiplePCSGs tests disaggregated inference with multiple PCSGs +// 1. Deploy workload with 2 PCSGs (decoder, prefill) + standalone router +// 2. decoder PCSG (2 replicas, rack constraint) + prefill PCSG (2 replicas, rack constraint) + router standalone +// 3. PCS: block constraint +// 4. 10 pods total: decoder (2×2) + prefill (2×2) + router (2) +// 5. Verify all in same block, each PCSG replica in same rack +// 6. Verify base PodGang KAI PodGroup topology for complex multi-PCSG workload +// 7. Verify scaled PodGangs' KAI PodGroups (decoder replica 1, prefill replica 1) +func Test_TAS15_DisaggregatedInferenceMultiplePCSGs(t *testing.T) { + ctx := context.Background() + + logger.Info("1. Initialize a 28-node Grove cluster for topology testing") + clientset, restConfig, dynamicClient, cleanup := prepareTestCluster(ctx, t, 28) + defer cleanup() + + expectedPods := 10 // decoder (2×2) + prefill (2×2) + router (2) + tc := createTopologyTestContext(t, ctx, clientset, restConfig, dynamicClient, + "tas-pcs-multi-pcsg", "../yaml/tas-pcs-multi-pcsg.yaml", expectedPods) + + logger.Info("2. Deploy workload (TAS15: disaggregated inference with multiple PCSGs)") + allPods, err := DeployWorkloadAndGetPods(tc, expectedPods) + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + + logger.Info("3. Verify block-level constraint (all 10 pods in same block)") + if err := utils.VerifyPodsInSameTopologyDomain(tc.Ctx, tc.Clientset, allPods, setup.TopologyLabelBlock, logger); err != nil { + t.Fatalf("Failed to verify all pods in same block: %v", err) + } + + // Generate PCSG and PCLQ names + pcsReplica := nameutils.ResourceNameReplica{Name: tc.Workload.Name, Replica: 0} + decoderPCSG := nameutils.GeneratePodCliqueScalingGroupName(pcsReplica, "decoder") + prefillPCSG := nameutils.GeneratePodCliqueScalingGroupName(pcsReplica, "prefill") + routerPCLQ := nameutils.GeneratePodCliqueName(pcsReplica, "router") + + logger.Info("4. Verify PCSG replicas (2 types × 2 replicas) are in same rack") + pcsgTypes := []utils.PCSGTypeConfig{ + {Name: "decoder", FQN: decoderPCSG}, + {Name: "prefill", FQN: prefillPCSG}, + } + if err := utils.VerifyMultiTypePCSGReplicas(tc.Ctx, tc.Clientset, allPods, pcsgTypes, 2, 2, + setup.TopologyLabelRack, logger); err != nil { + t.Fatalf("Failed to verify PCSG replicas: %v", err) + } + + logger.Info("5. Verify router pods (2 standalone, no PCSG label)") + routerPods := utils.FilterPodsByLabel(allPods, LabelPodClique, routerPCLQ) + if len(routerPods) != 2 { + t.Fatalf("Expected 2 router pods, got %d", len(routerPods)) + } + + logger.Info("6. 
Verify KAI PodGroup has correct SubGroups for disaggregated inference") + podGroup := GetPodGroupOrFail(t, tc, 0) + + // Verify top-level TopologyConstraint (PCS level: block) + // SubGroups (Base PodGang contains only minAvailable=1 PCSG replicas) + expectedSubGroups := []utils.ExpectedSubGroup{ + utils.CreateExpectedPCSGParentSubGroup(tc.Workload.Name, 0, "decoder", 0, setup.TopologyLabelRack), + utils.CreateExpectedPCSGParentSubGroup(tc.Workload.Name, 0, "prefill", 0, setup.TopologyLabelRack), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, 0, "decoder", 0, "dworker", 1, ""), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, 0, "decoder", 0, "dleader", 1, ""), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, 0, "prefill", 0, "pworker", 1, ""), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, 0, "prefill", 0, "pleader", 1, ""), + utils.CreateExpectedStandalonePCLQSubGroup(tc.Workload.Name, 0, "router", 2, ""), + } + if err := utils.VerifyPodGroupTopology(podGroup, setup.TopologyLabelBlock, "", expectedSubGroups, logger); err != nil { + t.Fatalf("Failed to verify KAI PodGroup topology: %v", err) + } + + logger.Info("7. Verify scaled PodGangs' KAI PodGroups (decoder replica 1, prefill replica 1)") + + // Define PCSG configurations (minAvailable=1, totalReplicas=2 for each) + pcsgConfigs := []utils.ScaledPCSGConfig{ + { + Name: "decoder", + PCSGName: "decoder", + PCSGReplica: 1, + MinAvailable: 1, + CliqueConfigs: []utils.PCSGCliqueConfig{ + {Name: "dworker", PodCount: 1, Constraint: ""}, + {Name: "dleader", PodCount: 1, Constraint: ""}, + }, + Constraint: setup.TopologyLabelRack, + }, + { + Name: "prefill", + PCSGName: "prefill", + PCSGReplica: 1, + MinAvailable: 1, + CliqueConfigs: []utils.PCSGCliqueConfig{ + {Name: "pworker", PodCount: 1, Constraint: ""}, + {Name: "pleader", PodCount: 1, Constraint: ""}, + }, + Constraint: setup.TopologyLabelRack, + }, + } + + // Verify each PCSG's scaled replica + lo.ForEach(pcsgConfigs, func(pcsgConfig utils.ScaledPCSGConfig, _ int) { + utils.VerifyScaledPCSGReplicaTopology(tc.Ctx, t, tc.DynamicClient, tc.Namespace, tc.Workload.Name, 0, + pcsgConfig, setup.TopologyLabelBlock, logger) + }) + + logger.Info("🎉 TAS15: Disaggregated Inference with Multiple PCSGs test completed successfully!") +} + +// Test_TAS16_MultiReplicaPCSWithThreeLevelHierarchy tests multi-replica PCS with full 3-level topology hierarchy +// 1. Deploy workload with 2 PCS replicas, each with full 3-level hierarchy +// 2. 20 pods (10 per PCS replica): decoder (2×2) + prefill (2×2) + router (2) +// 3. PCS: block constraint, PCSG: rack constraint, PCLQ (pworker): host constraint +// 4. Verify block constraint at PCS level, rack at PCSG, for both PCS replicas +// 5. Similar to TAS15 but scaled across 2 PCS replicas +func Test_TAS16_MultiReplicaPCSWithThreeLevelHierarchy(t *testing.T) { + ctx := context.Background() + + logger.Info("1. Initialize a 28-node Grove cluster for multi-replica PCS testing") + clientset, restConfig, dynamicClient, cleanup := prepareTestCluster(ctx, t, 28) + defer cleanup() + + expectedPods := 20 // PCS replica 0: 10 pods + PCS replica 1: 10 pods + tc := createTopologyTestContext(t, ctx, clientset, restConfig, dynamicClient, + "tas-pcs-multi-pcsg", "../yaml/tas-pcs-multi-pcsg-multi-replica.yaml", expectedPods) + + logger.Info("2. 
Deploy workload (TAS16: 2 PCS replicas with 3-level topology hierarchy)") + allPods, err := DeployWorkloadAndGetPods(tc, expectedPods) + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + + // Verify for each PCS replica + for pcsReplica := 0; pcsReplica < 2; pcsReplica++ { + replicaLabel := fmt.Sprintf("%d", pcsReplica) + replicaPods := utils.FilterPodsByLabel(allPods, "grove.io/podcliqueset-replica-index", replicaLabel) + if len(replicaPods) != 10 { + t.Fatalf("Expected 10 pods for PCS replica %d, got %d", pcsReplica, len(replicaPods)) + } + + logger.Infof("3.%d. Verify PCS replica %d pods in same block (PCS block constraint)", pcsReplica+1, pcsReplica) + if err := utils.VerifyPodsInSameTopologyDomain(tc.Ctx, tc.Clientset, replicaPods, setup.TopologyLabelBlock, logger); err != nil { + t.Fatalf("Failed to verify PCS replica %d pods in same block: %v", pcsReplica, err) + } + + logger.Infof("4.%d. Verify PCS replica %d pods topology constraints", pcsReplica+1, pcsReplica) + + // Generate PCSG and PCLQ names for this PCS replica + decoderPCSG := nameutils.GeneratePodCliqueScalingGroupName( + nameutils.ResourceNameReplica{Name: tc.Workload.Name, Replica: pcsReplica}, + "decoder", + ) + prefillPCSG := nameutils.GeneratePodCliqueScalingGroupName( + nameutils.ResourceNameReplica{Name: tc.Workload.Name, Replica: pcsReplica}, + "prefill", + ) + routerPCLQ := nameutils.GeneratePodCliqueName( + nameutils.ResourceNameReplica{Name: tc.Workload.Name, Replica: pcsReplica}, + "router", + ) + + // Verify PCSG replicas (2 types × 2 replicas) are in same rack + pcsgTypes := []utils.PCSGTypeConfig{ + {Name: "decoder", FQN: decoderPCSG}, + {Name: "prefill", FQN: prefillPCSG}, + } + if err := utils.VerifyMultiTypePCSGReplicas(tc.Ctx, tc.Clientset, replicaPods, pcsgTypes, 2, 2, + setup.TopologyLabelRack, logger); err != nil { + t.Fatalf("Failed to verify PCSG replicas for PCS replica %d: %v", pcsReplica, err) + } + + // Verify router pods (2 standalone) + logger.Infof("4.%d. Verify router pods (2 standalone)", pcsReplica+1) + routerPods := utils.FilterPodsByLabel(replicaPods, LabelPodClique, routerPCLQ) + if len(routerPods) != 2 { + t.Fatalf("Expected 2 router pods for PCS replica %d, got %d", pcsReplica, len(routerPods)) + } + } + + logger.Info("5. 
Verify KAI PodGroups for both PCS replicas have correct topology constraints") + for pcsReplica := 0; pcsReplica < 2; pcsReplica++ { + podGroup := GetPodGroupOrFail(t, tc, pcsReplica) + + // Verify SubGroups for this PCS replica (hierarchy: PCS→PCSG→PCLQ) + expectedSubGroups := []utils.ExpectedSubGroup{ + utils.CreateExpectedPCSGParentSubGroup(tc.Workload.Name, pcsReplica, "decoder", 0, setup.TopologyLabelRack), + utils.CreateExpectedPCSGParentSubGroup(tc.Workload.Name, pcsReplica, "prefill", 0, setup.TopologyLabelRack), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, pcsReplica, "decoder", 0, "dworker", 1, ""), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, pcsReplica, "decoder", 0, "dleader", 1, ""), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, pcsReplica, "prefill", 0, "pworker", 1, setup.TopologyLabelHostname), + utils.CreateExpectedPCLQInPCSGSubGroup(tc.Workload.Name, pcsReplica, "prefill", 0, "pleader", 1, ""), + utils.CreateExpectedStandalonePCLQSubGroup(tc.Workload.Name, pcsReplica, "router", 2, ""), + } + if err := utils.VerifyPodGroupTopology(podGroup, setup.TopologyLabelBlock, "", expectedSubGroups, logger); err != nil { + t.Fatalf("Failed to verify KAI PodGroup-%d topology: %v", pcsReplica, err) + } + } + + logger.Info("🎉 TAS16: Multi-replica PCS with 3-level topology hierarchy test completed successfully!") +} diff --git a/operator/e2e/utils/kai_topology.go b/operator/e2e/utils/kai_topology.go index 4d79e127e..aee1ee710 100644 --- a/operator/e2e/utils/kai_topology.go +++ b/operator/e2e/utils/kai_topology.go @@ -21,9 +21,11 @@ package utils import ( "context" "fmt" + "testing" "time" kaischedulingv2alpha2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2alpha2" + nameutils "github.com/ai-dynamo/grove/operator/api/common" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/dynamic" "k8s.io/utils/ptr" @@ -38,10 +40,30 @@ type ExpectedSubGroup struct { PreferredTopologyLevel string } +// PCSGCliqueConfig defines configuration for a single clique in a PCSG +type PCSGCliqueConfig struct { + Name string + PodCount int32 + Constraint string +} + +// ScaledPCSGConfig defines configuration for verifying a scaled PCSG replica +type ScaledPCSGConfig struct { + Name string + PCSGName string + PCSGReplica int + MinAvailable int + CliqueConfigs []PCSGCliqueConfig + Constraint string +} + // CreateExpectedStandalonePCLQSubGroup creates an ExpectedSubGroup for a standalone PodClique (not in PCSG) // Name format: -- func CreateExpectedStandalonePCLQSubGroup(pcsName string, pcsReplica int, cliqueName string, minMember int32, topologyLevel string) ExpectedSubGroup { - name := GetStandalonePCLQSubGroupName(pcsName, pcsReplica, cliqueName) + name := nameutils.GeneratePodCliqueName( + nameutils.ResourceNameReplica{Name: pcsName, Replica: pcsReplica}, + cliqueName, + ) return ExpectedSubGroup{ Name: name, MinMember: minMember, @@ -53,7 +75,11 @@ func CreateExpectedStandalonePCLQSubGroup(pcsName string, pcsReplica int, clique // CreateExpectedPCSGParentSubGroup creates an ExpectedSubGroup for a PCSG parent (scaling group replica) // Name format: --- func CreateExpectedPCSGParentSubGroup(pcsName string, pcsReplica int, sgName string, sgReplica int, topologyLevel string) ExpectedSubGroup { - name := GetPCSGParentSubGroupName(pcsName, pcsReplica, sgName, sgReplica) + pcsgFQN := nameutils.GeneratePodCliqueScalingGroupName( + nameutils.ResourceNameReplica{Name: pcsName, Replica: pcsReplica}, + sgName, + ) + name := fmt.Sprintf("%s-%d", pcsgFQN, 
sgReplica) return ExpectedSubGroup{ Name: name, MinMember: 0, @@ -62,19 +88,41 @@ func CreateExpectedPCSGParentSubGroup(pcsName string, pcsReplica int, sgName str } } -// CreateExpectedPCLQInPCSGSubGroup creates an ExpectedSubGroup for a PodClique within a PCSG +// CreateExpectedPCLQInPCSGSubGroup creates an ExpectedSubGroup for a PodClique within a PCSG with parent // Name format: ---- func CreateExpectedPCLQInPCSGSubGroup(pcsName string, pcsReplica int, sgName string, sgReplica int, cliqueName string, minMember int32, topologyLevel string) ExpectedSubGroup { - name := GetPCLQInPCSGSubGroupName(pcsName, pcsReplica, sgName, sgReplica, cliqueName) - parentName := GetPCSGParentSubGroupName(pcsName, pcsReplica, sgName, sgReplica) + return createExpectedPCLQInPCSGSubGroup(pcsName, pcsReplica, sgName, sgReplica, cliqueName, minMember, topologyLevel, true) +} + +func createExpectedPCLQInPCSGSubGroup(pcsName string, pcsReplica int, sgName string, sgReplica int, cliqueName string, + minMember int32, topologyLevel string, hasParent bool) ExpectedSubGroup { + pcsgFQN := nameutils.GeneratePodCliqueScalingGroupName( + nameutils.ResourceNameReplica{Name: pcsName, Replica: pcsReplica}, + sgName, + ) + name := nameutils.GeneratePodCliqueName( + nameutils.ResourceNameReplica{Name: pcsgFQN, Replica: sgReplica}, + cliqueName, + ) + var parentPtr *string + if hasParent { + parentPtr = ptr.To(fmt.Sprintf("%s-%d", pcsgFQN, sgReplica)) + } return ExpectedSubGroup{ Name: name, MinMember: minMember, - Parent: ptr.To(parentName), + Parent: parentPtr, RequiredTopologyLevel: topologyLevel, } } +// CreateExpectedPCLQInPCSGSubGroupNoParent creates an ExpectedSubGroup for a PodClique within a PCSG without parent +// Used when PCSG has no topology constraint (no parent SubGroup created) +// Name format: ---- +func CreateExpectedPCLQInPCSGSubGroupNoParent(pcsName string, pcsReplica int, sgName string, sgReplica int, cliqueName string, minMember int32, topologyLevel string) ExpectedSubGroup { + return createExpectedPCLQInPCSGSubGroup(pcsName, pcsReplica, sgName, sgReplica, cliqueName, minMember, topologyLevel, false) +} + // GetKAIPodGroupsForPCS retrieves all KAI PodGroups for a given PodCliqueSet by label selector // KAI scheduler creates PodGroups with label: app.kubernetes.io/part-of= // Returns a list of PodGroups that tests can filter by owner reference if needed @@ -233,7 +281,7 @@ func GetPodGroupForBasePodGangReplica( return nil, fmt.Errorf("failed to get KAI PodGroups: %w", err) } - basePodGangName := GetBasePodGangName(workloadName, pgsReplica) + basePodGangName := nameutils.GenerateBasePodGangName(nameutils.ResourceNameReplica{Name: workloadName, Replica: pgsReplica}) basePodGroup, err := FilterPodGroupByOwner(podGroups, basePodGangName) if err != nil { return nil, fmt.Errorf("failed to find PodGroup for PodGang %s: %w", basePodGangName, err) @@ -259,3 +307,51 @@ func VerifyPodGroupTopology( return nil } + +// VerifyScaledPCSGReplicaTopology verifies KAI PodGroup for ONE scaled PCSG replica. +// Scaled PodGroup top-level constraint: uses pcsConstraint ONLY if PCSG has NO constraint. 
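+// The index passed to CreatePodGangNameFromPCSGFQN is zero-based over the scaled
+// replicas only (PCSGReplica - MinAvailable): the base PodGang already covers PCSG
+// replicas 0..MinAvailable-1, so PCSG replica MinAvailable maps to scaled PodGang 0.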
+func VerifyScaledPCSGReplicaTopology( + ctx context.Context, + t *testing.T, + dynamicClient dynamic.Interface, + namespace string, + pcsName string, + pcsReplica int, + pcsgConfig ScaledPCSGConfig, + pcsConstraint string, + logger *Logger, +) { + podGroups, err := GetKAIPodGroupsForPCS(ctx, dynamicClient, namespace, pcsName) + if err != nil { + t.Fatalf("Failed to get KAI PodGroups: %v", err) + } + + pcsgFQN := nameutils.GeneratePodCliqueScalingGroupName( + nameutils.ResourceNameReplica{Name: pcsName, Replica: pcsReplica}, + pcsgConfig.PCSGName, + ) + + scaledPodGangName := nameutils.CreatePodGangNameFromPCSGFQN(pcsgFQN, pcsgConfig.PCSGReplica-pcsgConfig.MinAvailable) + + scaledPodGroup, err := FilterPodGroupByOwner(podGroups, scaledPodGangName) + if err != nil { + t.Fatalf("Failed to find scaled PodGroup for %s: %v", scaledPodGangName, err) + } + + var expectedSubGroups []ExpectedSubGroup + + for _, cliqueConfig := range pcsgConfig.CliqueConfigs { + expectedSubGroups = append(expectedSubGroups, + CreateExpectedPCLQInPCSGSubGroupNoParent(pcsName, pcsReplica, pcsgConfig.PCSGName, pcsgConfig.PCSGReplica, cliqueConfig.Name, cliqueConfig.PodCount, cliqueConfig.Constraint)) + } + + scaledTopConstraint := pcsConstraint + if pcsgConfig.Constraint != "" { + scaledTopConstraint = pcsgConfig.Constraint + } + + if err := VerifyPodGroupTopology(scaledPodGroup, scaledTopConstraint, "", expectedSubGroups, logger); err != nil { + t.Fatalf("Failed to verify scaled PodGroup %s (%s replica %d) topology: %v", + scaledPodGangName, pcsgConfig.Name, pcsgConfig.PCSGReplica, err) + } +} diff --git a/operator/e2e/utils/naming.go b/operator/e2e/utils/naming.go deleted file mode 100644 index 9b0cbc41a..000000000 --- a/operator/e2e/utils/naming.go +++ /dev/null @@ -1,45 +0,0 @@ -//go:build e2e - -package utils - -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -import "fmt" - -// GetBasePodGangName constructs the base PodGang name for a specific PCS replica. -// Format: - -func GetBasePodGangName(workloadName string, pcsReplica int) string { - return fmt.Sprintf("%s-%d", workloadName, pcsReplica) -} - -// GetStandalonePCLQSubGroupName constructs the SubGroup name for a standalone PodClique. -// Format: -- -func GetStandalonePCLQSubGroupName(pcsName string, pcsReplica int, cliqueName string) string { - return fmt.Sprintf("%s-%d-%s", pcsName, pcsReplica, cliqueName) -} - -// GetPCSGParentSubGroupName constructs the SubGroup name for a PCSG parent (scaling group replica). -// Format: --- -func GetPCSGParentSubGroupName(pcsName string, pcsReplica int, sgName string, sgReplica int) string { - return fmt.Sprintf("%s-%d-%s-%d", pcsName, pcsReplica, sgName, sgReplica) -} - -// GetPCLQInPCSGSubGroupName constructs the SubGroup name for a PodClique within a PCSG. 
-// Format: ---- -func GetPCLQInPCSGSubGroupName(pcsName string, pcsReplica int, sgName string, sgReplica int, cliqueName string) string { - return fmt.Sprintf("%s-%d-%s-%d-%s", pcsName, pcsReplica, sgName, sgReplica, cliqueName) -} diff --git a/operator/e2e/utils/topology.go b/operator/e2e/utils/topology.go index 26315b48b..3831aa4bc 100644 --- a/operator/e2e/utils/topology.go +++ b/operator/e2e/utils/topology.go @@ -203,3 +203,69 @@ func VerifyLabeledPodsInTopologyDomain( return VerifyPodsInSameTopologyDomain(ctx, clientset, filteredPods, topologyKey, logger) } + +// PCSGTypeConfig defines configuration for a PCSG type verification +type PCSGTypeConfig struct { + Name string // Human-readable name (e.g., "decoder") + FQN string // Fully-qualified PCSG name +} + +// VerifyPCSGReplicasInTopologyDomain verifies that each PCSG replica's pods +// are in the same topology domain (e.g., rack, host). +func VerifyPCSGReplicasInTopologyDomain( + ctx context.Context, + clientset kubernetes.Interface, + allPods []v1.Pod, + pcsgLabel string, + replicaCount int, + podsPerReplica int, + topologyLabel string, + logger *Logger, +) error { + for replica := 0; replica < replicaCount; replica++ { + replicaPods := FilterPodsByLabel( + FilterPodsByLabel(allPods, "grove.io/podcliquescalinggroup", pcsgLabel), + "grove.io/podcliquescalinggroup-replica-index", + fmt.Sprintf("%d", replica), + ) + if len(replicaPods) != podsPerReplica { + return fmt.Errorf("expected %d PCSG replica %d pods, got %d", podsPerReplica, replica, len(replicaPods)) + } + if err := VerifyPodsInSameTopologyDomain(ctx, clientset, replicaPods, topologyLabel, logger); err != nil { + return fmt.Errorf("failed to verify PCSG replica %d pods in same topology domain: %w", replica, err) + } + } + return nil +} + +// VerifyMultiTypePCSGReplicas verifies multiple PCSG types across replicas. +// Each PCSG type has multiple replicas, and each replica's pods should be in the same topology domain. 
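+// Pods are matched by the grove.io/podcliquescalinggroup and
+// grove.io/podcliquescalinggroup-replica-index labels before each domain check.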
+func VerifyMultiTypePCSGReplicas( + ctx context.Context, + clientset kubernetes.Interface, + allPods []v1.Pod, + pcsgTypes []PCSGTypeConfig, + replicasPerType int, + podsPerReplica int, + topologyLabel string, + logger *Logger, +) error { + for _, pcsgType := range pcsgTypes { + for replica := 0; replica < replicasPerType; replica++ { + replicaPods := FilterPodsByLabel( + FilterPodsByLabel(allPods, "grove.io/podcliquescalinggroup", pcsgType.FQN), + "grove.io/podcliquescalinggroup-replica-index", + fmt.Sprintf("%d", replica), + ) + if len(replicaPods) != podsPerReplica { + return fmt.Errorf("expected %d %s replica-%d pods, got %d", + podsPerReplica, pcsgType.Name, replica, len(replicaPods)) + } + if err := VerifyPodsInSameTopologyDomain(ctx, clientset, replicaPods, topologyLabel, logger); err != nil { + return fmt.Errorf("failed to verify %s replica-%d pods in same topology domain: %w", + pcsgType.Name, replica, err) + } + } + } + return nil +} diff --git a/operator/e2e/yaml/tas-hierarchy.yaml b/operator/e2e/yaml/tas-hierarchy.yaml new file mode 100644 index 000000000..4e1c11bbb --- /dev/null +++ b/operator/e2e/yaml/tas-hierarchy.yaml @@ -0,0 +1,88 @@ +# Workload 8: SP-1 - Full 3-Level Hierarchy with Cascading Constraints +# Test scenario: PCS (block) → PCSG (rack) → PCLQ (host) - demonstrating constraint inheritance +--- +apiVersion: grove.io/v1alpha1 +kind: PodCliqueSet +metadata: + name: tas-hierarchy + labels: + app: tas-hierarchy +spec: + replicas: 1 + template: + topologyConstraint: + packDomain: block # PCS level - broadest + podCliqueScalingGroups: + - name: inference-group + replicas: 2 + minAvailable: 2 + topologyConstraint: + packDomain: rack # PCSG level - stricter than parent + cliqueNames: + - prefill + - decode + cliques: + - name: prefill + labels: + kai.scheduler/queue: test + topologyConstraint: + packDomain: host # PCLQ level - strictest + spec: + roleName: prefill + replicas: 2 + minAvailable: 2 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: prefill + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 40Mi + - name: decode + labels: + kai.scheduler/queue: test + topologyConstraint: + packDomain: host # PCLQ level - strictest + spec: + roleName: decode + replicas: 2 + minAvailable: 2 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: decode + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 40Mi diff --git a/operator/e2e/yaml/tas-host-level.yaml b/operator/e2e/yaml/tas-host-level.yaml index 05c8d01c0..c8c7d0be3 100644 --- a/operator/e2e/yaml/tas-host-level.yaml +++ b/operator/e2e/yaml/tas-host-level.yaml @@ -38,7 +38,9 @@ spec: effect: NoSchedule containers: - name: worker - image: registry:5001/nginx:alpine-slim + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] resources: requests: - memory: 30Mi + memory: 40Mi 
+ diff --git a/operator/e2e/yaml/tas-indep-clq.yaml b/operator/e2e/yaml/tas-indep-clq.yaml index 10e785127..925bd412c 100644 --- a/operator/e2e/yaml/tas-indep-clq.yaml +++ b/operator/e2e/yaml/tas-indep-clq.yaml @@ -32,6 +32,15 @@ spec: operator: In values: - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-indep-clq-0-worker-rack + topologyKey: kubernetes.io/hostname tolerations: - key: node_role.e2e.grove.nvidia.com operator: Equal @@ -39,10 +48,11 @@ spec: effect: NoSchedule containers: - name: worker-rack - image: registry:5001/nginx:alpine-slim + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] resources: requests: - memory: 80Mi + memory: 20Mi - name: worker-block labels: kai.scheduler/queue: test @@ -63,6 +73,15 @@ spec: operator: In values: - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-indep-clq-0-worker-block + topologyKey: kubernetes.io/hostname tolerations: - key: node_role.e2e.grove.nvidia.com operator: Equal @@ -70,7 +89,8 @@ spec: effect: NoSchedule containers: - name: worker-block - image: registry:5001/nginx:alpine-slim + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] resources: requests: - memory: 80Mi + memory: 20Mi diff --git a/operator/e2e/yaml/tas-insuffic.yaml b/operator/e2e/yaml/tas-insuffic.yaml new file mode 100644 index 000000000..05f117b18 --- /dev/null +++ b/operator/e2e/yaml/tas-insuffic.yaml @@ -0,0 +1,44 @@ +# Workload 10: EC-1 - Insufficient Nodes for Constraint +# Test scenario: PCS with rack constraint requesting more pods than can fit in any single rack +--- +apiVersion: grove.io/v1alpha1 +kind: PodCliqueSet +metadata: + name: tas-insuffic + labels: + app: tas-insuffic +spec: + replicas: 1 + template: + topologyConstraint: + packDomain: rack # All pods must be in same rack + cliques: + - name: worker + labels: + kai.scheduler/queue: test + spec: + roleName: worker + replicas: 10 # Exceeds capacity of any single rack + minAvailable: 10 # All-or-nothing gang scheduling + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: worker + image: registry:5001/nginx:alpine-slim + resources: + requests: + memory: 500Mi # Large enough to prevent 10 pods fitting in 1 rack (2 nodes) diff --git a/operator/e2e/yaml/tas-large-scale.yaml b/operator/e2e/yaml/tas-large-scale.yaml new file mode 100644 index 000000000..d3685abd7 --- /dev/null +++ b/operator/e2e/yaml/tas-large-scale.yaml @@ -0,0 +1,54 @@ +# Workload: SP8 - Large Scaling Ratio +# Test scenario: PCS with block constraint, PCSG with replicas=10/minAvailable=3, PCLQ with host constraint +--- +apiVersion: grove.io/v1alpha1 +kind: PodCliqueSet +metadata: + name: tas-large-scale + labels: + app: tas-large-scale +spec: + replicas: 1 + template: + topologyConstraint: + packDomain: block + podCliqueScalingGroups: + - name: workers + replicas: 10 + minAvailable: 3 + topologyConstraint: + cliqueNames: + - worker + cliques: + - name: worker + labels: + kai.scheduler/queue: test + topologyConstraint: + packDomain: host + 
spec: + roleName: worker + replicas: 2 + minAvailable: 2 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: worker + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 40Mi diff --git a/operator/e2e/yaml/tas-multirep.yaml b/operator/e2e/yaml/tas-multirep.yaml new file mode 100644 index 000000000..aea879e52 --- /dev/null +++ b/operator/e2e/yaml/tas-multirep.yaml @@ -0,0 +1,54 @@ +# Workload 11: MR-1 - Multi-Replica with Rack Constraint +# Test scenario: PCS with 2 replicas, each replica packs in separate rack +--- +apiVersion: grove.io/v1alpha1 +kind: PodCliqueSet +metadata: + name: tas-multirep + labels: + app: tas-multirep +spec: + replicas: 2 # Creates 2 separate PodGangs (one per replica) + template: + topologyConstraint: + packDomain: rack # Each replica gang packs within its own rack + cliques: + - name: worker + labels: + kai.scheduler/queue: test + spec: + roleName: worker + replicas: 2 # 2 pods per replica (total: 4 pods) + minAvailable: 2 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-multirep-0-worker + topologyKey: kubernetes.io/hostname + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: worker + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 20Mi # Small footprint to ensure resources not a constraint diff --git a/operator/e2e/yaml/tas-no-constraint.yaml b/operator/e2e/yaml/tas-no-constraint.yaml index 28204b0c2..d85485737 100644 --- a/operator/e2e/yaml/tas-no-constraint.yaml +++ b/operator/e2e/yaml/tas-no-constraint.yaml @@ -35,6 +35,15 @@ spec: operator: In values: - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-no-constraint-0-workers-0-worker + topologyKey: kubernetes.io/hostname tolerations: - key: node_role.e2e.grove.nvidia.com operator: Equal @@ -42,7 +51,8 @@ spec: effect: NoSchedule containers: - name: worker - image: registry:5001/nginx:alpine-slim + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] resources: requests: - memory: 30Mi + memory: 10Mi diff --git a/operator/e2e/yaml/tas-pcs-multi-pcsg-multi-replica.yaml b/operator/e2e/yaml/tas-pcs-multi-pcsg-multi-replica.yaml new file mode 100644 index 000000000..6e321888f --- /dev/null +++ b/operator/e2e/yaml/tas-pcs-multi-pcsg-multi-replica.yaml @@ -0,0 +1,184 @@ +# Workload: Disaggregated Inference - Multi-replica PCS with 3-level topology hierarchy +# Test scenario: PCS (block) with 2 replicas, 2 PCSGs (rack), and PCLQ-level constraint (host) +--- +apiVersion: grove.io/v1alpha1 +kind: PodCliqueSet +metadata: + name: tas-pcs-multi-pcsg + labels: + app: tas-pcs-multi-pcsg +spec: + replicas: 2 + template: + 
topologyConstraint: + packDomain: block + podCliqueScalingGroups: + - name: decoder + replicas: 2 + minAvailable: 1 + topologyConstraint: + packDomain: rack + cliqueNames: + - dworker + - dleader + - name: prefill + replicas: 2 + minAvailable: 1 + topologyConstraint: + packDomain: rack + cliqueNames: + - pworker + - pleader + cliques: + - name: dworker + labels: + kai.scheduler/queue: test + spec: + roleName: dworker + replicas: 1 + minAvailable: 1 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: worker + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 40Mi + - name: dleader + labels: + kai.scheduler/queue: test + spec: + roleName: dleader + replicas: 1 + minAvailable: 1 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: leader + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 40Mi + - name: pworker + topologyConstraint: + packDomain: host + labels: + kai.scheduler/queue: test + spec: + roleName: pworker + replicas: 1 + minAvailable: 1 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: worker + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 40Mi + - name: pleader + labels: + kai.scheduler/queue: test + spec: + roleName: pleader + replicas: 1 + minAvailable: 1 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: leader + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 40Mi + - name: router + labels: + kai.scheduler/queue: test + spec: + roleName: router + replicas: 2 + minAvailable: 2 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: router + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 40Mi diff --git a/operator/e2e/yaml/tas-pcs-multi-pcsg.yaml b/operator/e2e/yaml/tas-pcs-multi-pcsg.yaml new file mode 100644 index 000000000..e2e752664 --- 
/dev/null +++ b/operator/e2e/yaml/tas-pcs-multi-pcsg.yaml @@ -0,0 +1,227 @@ +# Workload: Disaggregated Inference - PCS with PCSG and multiple cliques +# Test scenario: PCS (block) with 2 PCSGs (rack) containing disaggregated inference components +--- +apiVersion: grove.io/v1alpha1 +kind: PodCliqueSet +metadata: + name: tas-pcs-multi-pcsg + labels: + app: tas-pcs-multi-pcsg +spec: + replicas: 1 + template: + topologyConstraint: + packDomain: block + podCliqueScalingGroups: + - name: decoder + replicas: 2 + minAvailable: 1 + topologyConstraint: + packDomain: rack + cliqueNames: + - dworker + - dleader + - name: prefill + replicas: 2 + minAvailable: 1 + topologyConstraint: + packDomain: rack + cliqueNames: + - pworker + - pleader + cliques: + - name: dworker + labels: + kai.scheduler/queue: test + spec: + roleName: dworker + replicas: 1 + minAvailable: 1 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-pcs-multi-pcsg-0-decoder-0-dworker + topologyKey: kubernetes.io/hostname + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: worker + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 10Mi + - name: dleader + labels: + kai.scheduler/queue: test + spec: + roleName: dleader + replicas: 1 + minAvailable: 1 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-pcs-multi-pcsg-0-decoder-0-dleader + topologyKey: kubernetes.io/hostname + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: leader + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 10Mi + - name: pworker + labels: + kai.scheduler/queue: test + spec: + roleName: pworker + replicas: 1 + minAvailable: 1 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-pcs-multi-pcsg-0-prefill-0-pworker + topologyKey: kubernetes.io/hostname + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: worker + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 10Mi + - name: pleader + labels: + kai.scheduler/queue: test + spec: + roleName: pleader + replicas: 1 + minAvailable: 1 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - 
matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-pcs-multi-pcsg-0-prefill-0-pleader + topologyKey: kubernetes.io/hostname + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: leader + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 10Mi + - name: router + labels: + kai.scheduler/queue: test + spec: + roleName: router + replicas: 2 + minAvailable: 2 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-pcs-multi-pcsg-0-router + topologyKey: kubernetes.io/hostname + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: router + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 10Mi diff --git a/operator/e2e/yaml/tas-pcs-pclq.yaml b/operator/e2e/yaml/tas-pcs-pclq.yaml new file mode 100644 index 000000000..4bf0e4a9c --- /dev/null +++ b/operator/e2e/yaml/tas-pcs-pclq.yaml @@ -0,0 +1,47 @@ +# Workload: SP2 - PCS+PCLQ Topology +# Test scenario: PCS with block constraint, standalone PCLQ with host constraint +--- +apiVersion: grove.io/v1alpha1 +kind: PodCliqueSet +metadata: + name: tas-pcs-pclq + labels: + app: tas-pcs-pclq +spec: + replicas: 1 + template: + topologyConstraint: + packDomain: block + cliques: + - name: worker + labels: + kai.scheduler/queue: test + topologyConstraint: + packDomain: host + spec: + roleName: worker + replicas: 2 + minAvailable: 2 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: worker + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 40Mi diff --git a/operator/e2e/yaml/tas-pcsg-pclq.yaml b/operator/e2e/yaml/tas-pcsg-pclq.yaml new file mode 100644 index 000000000..64408feb2 --- /dev/null +++ b/operator/e2e/yaml/tas-pcsg-pclq.yaml @@ -0,0 +1,53 @@ +# Workload: SP5 - PCSG+PCLQ Topology (no PCS constraint) +# Test scenario: No PCS constraint, PCSG with rack constraint, PCLQ with host constraint +--- +apiVersion: grove.io/v1alpha1 +kind: PodCliqueSet +metadata: + name: tas-pcsg-pclq + labels: + app: tas-pcsg-pclq +spec: + replicas: 1 + template: + podCliqueScalingGroups: + - name: workers + replicas: 2 + minAvailable: 2 + topologyConstraint: + packDomain: rack + cliqueNames: + - worker + cliques: + - name: worker + labels: + kai.scheduler/queue: test + topologyConstraint: + packDomain: host + spec: + roleName: worker + replicas: 2 + minAvailable: 2 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + 
nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: worker + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 40Mi diff --git a/operator/e2e/yaml/tas-pcsg-scale.yaml b/operator/e2e/yaml/tas-pcsg-scale.yaml new file mode 100644 index 000000000..9c6346767 --- /dev/null +++ b/operator/e2e/yaml/tas-pcsg-scale.yaml @@ -0,0 +1,53 @@ +# Workload 9: SP-3 - PCSG Scaling with Topology Constraints and MinAvailable +# Test scenario: 3 PCSG replicas with rack constraint demonstrating scaling coordination +--- +apiVersion: grove.io/v1alpha1 +kind: PodCliqueSet +metadata: + name: tas-pcsg-scale + labels: + app: tas-pcsg-scale +spec: + replicas: 1 + template: + topologyConstraint: + packDomain: block + podCliqueScalingGroups: + - name: inference-group + replicas: 3 + minAvailable: 1 + topologyConstraint: + packDomain: rack + cliqueNames: + - worker + cliques: + - name: worker + labels: + kai.scheduler/queue: test + spec: + roleName: worker + replicas: 2 + minAvailable: 2 + podSpec: + schedulerName: kai-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node_role.e2e.grove.nvidia.com + operator: In + values: + - agent + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: worker + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + resources: + requests: + memory: 40Mi diff --git a/operator/e2e/yaml/tas-sl-pcs-only.yaml b/operator/e2e/yaml/tas-sl-pcs-only.yaml index ccd560b86..d32c33a73 100644 --- a/operator/e2e/yaml/tas-sl-pcs-only.yaml +++ b/operator/e2e/yaml/tas-sl-pcs-only.yaml @@ -37,6 +37,15 @@ spec: operator: In values: - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-sl-pcs-only-0-workers-0-worker + topologyKey: kubernetes.io/hostname tolerations: - key: node_role.e2e.grove.nvidia.com operator: Equal @@ -44,10 +53,11 @@ spec: effect: NoSchedule containers: - name: worker - image: registry:5001/nginx:alpine-slim + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] resources: requests: - memory: 30Mi + memory: 10Mi - name: router labels: kai.scheduler/queue: test @@ -66,6 +76,15 @@ spec: operator: In values: - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-sl-pcs-only-0-router + topologyKey: kubernetes.io/hostname tolerations: - key: node_role.e2e.grove.nvidia.com operator: Equal @@ -73,7 +92,8 @@ spec: effect: NoSchedule containers: - name: router - image: registry:5001/nginx:alpine-slim + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] resources: requests: - memory: 30Mi + memory: 10Mi diff --git a/operator/e2e/yaml/tas-sl-pcsg-only.yaml b/operator/e2e/yaml/tas-sl-pcsg-only.yaml index eabf0ec58..b169d695e 100644 --- a/operator/e2e/yaml/tas-sl-pcsg-only.yaml +++ b/operator/e2e/yaml/tas-sl-pcsg-only.yaml @@ -37,6 +37,15 @@ spec: operator: In values: - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: 
grove.io/podclique + operator: In + values: + - tas-sl-pcsg-only-0-workers-0-worker + topologyKey: kubernetes.io/hostname tolerations: - key: node_role.e2e.grove.nvidia.com operator: Equal @@ -44,10 +53,11 @@ spec: effect: NoSchedule containers: - name: worker - image: registry:5001/nginx:alpine-slim + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] resources: requests: - memory: 30Mi + memory: 10Mi - name: router labels: kai.scheduler/queue: test @@ -66,6 +76,15 @@ spec: operator: In values: - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-sl-pcsg-only-0-router + topologyKey: kubernetes.io/hostname tolerations: - key: node_role.e2e.grove.nvidia.com operator: Equal @@ -73,7 +92,8 @@ spec: effect: NoSchedule containers: - name: router - image: registry:5001/nginx:alpine-slim + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] resources: requests: - memory: 30Mi + memory: 10Mi diff --git a/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml b/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml index 4f4d12879..70bd2cdd5 100644 --- a/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml +++ b/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml @@ -31,6 +31,15 @@ spec: operator: In values: - agent + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: grove.io/podclique + operator: In + values: + - tas-standalone-pclq-0-worker + topologyKey: kubernetes.io/hostname tolerations: - key: node_role.e2e.grove.nvidia.com operator: Equal @@ -38,7 +47,8 @@ spec: effect: NoSchedule containers: - name: worker - image: registry:5001/nginx:alpine-slim + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] resources: requests: - memory: 30Mi + memory: 40Mi
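
Note on the pattern above (reviewer illustration, not part of the diff): the podAntiAffinity stanzas keyed on the grove.io/podclique label force each PodClique's pods onto distinct hosts, so the packDomain constraints (block/rack/host) are genuinely exercised rather than trivially satisfied by co-locating every pod on one node, while the switch to busybox with "sleep infinity" and the reduced memory requests keep the test pods lightweight. A minimal Go sketch of how an e2e assertion for that host spread could look is below; the helper name, the raw clientset parameter, and the fail-fast behaviour are assumptions for illustration only.

package tests

import (
	"context"
	"fmt"
	"testing"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// verifyPodCliquePodsOnDistinctHosts is an illustrative sketch (assumed helper,
// not introduced by this diff). It lists a PodClique's pods by the
// grove.io/podclique label used in the anti-affinity rules above and fails the
// test if any two pods landed on the same node.
func verifyPodCliquePodsOnDistinctHosts(ctx context.Context, t *testing.T, clientset kubernetes.Interface, namespace, podCliqueName string) {
	pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
		LabelSelector: fmt.Sprintf("grove.io/podclique=%s", podCliqueName),
	})
	if err != nil {
		t.Fatalf("Failed to list pods for PodClique %s: %v", podCliqueName, err)
	}
	seen := map[string]string{} // node name -> pod already observed on that node
	for _, pod := range pods.Items {
		if pod.Spec.NodeName == "" {
			t.Fatalf("Pod %s is not scheduled yet", pod.Name)
		}
		if other, ok := seen[pod.Spec.NodeName]; ok {
			t.Fatalf("Pods %s and %s share node %s despite host anti-affinity", other, pod.Name, pod.Spec.NodeName)
		}
		seen[pod.Spec.NodeName] = pod.Name
	}
}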