Skip to content

Commit 1fd4e67

Browse files
ronenav authored and openshift-merge-bot[bot] committed
ECOPROJECT-4197 | feat: sizer - improve support single-node cluster sizing
Signed-off-by: Ronen Avraham <ravraham@redhat.com>
1 parent 7ef4cba commit 1fd4e67

File tree

3 files changed

+341
-18
lines changed

3 files changed

+341
-18
lines changed

internal/handlers/v1alpha1/sizer_test.go

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -569,17 +569,19 @@ var _ = Describe("sizer handler", func() {
569569

570570
It("successfully handles single node cluster (1 control plane)", func() {
571571
one := api.N1
572+
controlPlaneSchedulable := true
572573
request := &api.ClusterRequirementsRequest{
573-
ClusterId: clusterID,
574-
CpuOverCommitRatio: api.CpuOneToFour,
575-
MemoryOverCommitRatio: api.MemoryOneToTwo,
576-
WorkerNodeCPU: 8,
577-
WorkerNodeMemory: 16,
578-
ControlPlaneNodeCount: &one,
574+
ClusterId: clusterID,
575+
CpuOverCommitRatio: api.CpuOneToFour,
576+
MemoryOverCommitRatio: api.MemoryOneToTwo,
577+
WorkerNodeCPU: 8,
578+
WorkerNodeMemory: 16,
579+
ControlPlaneNodeCount: &one,
580+
ControlPlaneSchedulable: &controlPlaneSchedulable,
579581
}
580582

581583
assessment := createTestAssessment(assessmentID, user.Username, user.Organization, clusterID)
582-
sizerResponse := createTestSizerResponse(3, 2, 1, 40, 80)
584+
sizerResponse := createTestSizerResponse(1, 0, 1, 40, 80)
583585
handler, testServer = setupTestHandler(mockStore, sizerResponse, assessment)
584586

585587
resp, err := handler.CalculateAssessmentClusterRequirements(ctx, server.CalculateAssessmentClusterRequirementsRequestObject{
@@ -592,6 +594,9 @@ var _ = Describe("sizer handler", func() {
592594
successResp, ok := resp.(server.CalculateAssessmentClusterRequirements200JSONResponse)
593595
Expect(ok).To(BeTrue())
594596
Expect(successResp.ClusterSizing.ControlPlaneNodes).To(Equal(1))
597+
Expect(successResp.ClusterSizing.TotalNodes).To(Equal(1))
598+
Expect(successResp.ClusterSizing.WorkerNodes).To(Equal(0))
599+
Expect(successResp.ClusterSizing.FailoverNodes).To(Equal(0))
595600
})
596601

597602
It("successfully handles HA cluster (3 control plane) - explicit", func() {

internal/service/sizer.go

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"errors"
77
"fmt"
88
"math"
9+
"strings"
910
"time"
1011

1112
"github.com/google/uuid"
@@ -247,6 +248,15 @@ func (s *SizerService) CalculateClusterRequirements(
247248
WithInt("num_services", len(services)).
248249
Log()
249250

251+
singleNode := req.ControlPlaneNodeCount == 1
252+
// Single-node clusters must have schedulable control planes
253+
if singleNode && !controlPlaneSchedulable {
254+
return nil, NewErrInvalidRequest(
255+
"single-node clusters require schedulable control planes. " +
256+
"Set ControlPlaneSchedulable to true or use multiple control plane nodes",
257+
)
258+
}
259+
250260
// Build sizer API payload
251261
sizerPayload := s.buildSizerPayload(
252262
services,
@@ -258,12 +268,16 @@ func (s *SizerService) CalculateClusterRequirements(
258268
controlPlaneCPU,
259269
controlPlaneMemory,
260270
effectiveCPNodeCount,
271+
singleNode,
261272
)
262273

263274
// Call sizer service
264275
sizerResponse, err := s.sizerClient.CalculateSizing(ctx, sizerPayload)
265276
if err != nil {
266277
tracer.Error(err).Log()
278+
if singleNode && isSizerSchedulabilityError(err) {
279+
return nil, s.singleNodeFitError(totalCPU, totalMemory, smtMultiplier, req.ControlPlaneCPU, req.ControlPlaneMemory)
280+
}
267281
return nil, fmt.Errorf("failed to call sizer service: %w", err)
268282
}
269283
if sizerResponse == nil {
@@ -272,6 +286,10 @@ func (s *SizerService) CalculateClusterRequirements(
272286

273287
transformed := s.transformSizerResponse(sizerResponse, effectiveCPNodeCount)
274288

289+
if singleNode && sizerResponse.Data.NodeCount > 1 {
290+
return nil, s.singleNodeFitError(totalCPU, totalMemory, smtMultiplier, req.ControlPlaneCPU, req.ControlPlaneMemory)
291+
}
292+
275293
if transformed.ClusterSizing.TotalNodes > MaxNodeCount {
276294
minNodeCPU, minNodeMemory := s.calculateMinimumNodeSize(
277295
totalCPU,
@@ -526,7 +544,17 @@ func (s *SizerService) buildSizerPayload(
526544
controlPlaneCPU int,
527545
controlPlaneMemory int,
528546
controlPlaneNodeCount int,
547+
singleNode bool,
529548
) *client.SizerRequest {
549+
if singleNode {
550+
return s.buildSingleNodeSizerPayload(
551+
services,
552+
platform,
553+
controlPlaneCPU,
554+
controlPlaneMemory,
555+
)
556+
}
557+
530558
machineSets := []client.MachineSet{
531559
{
532560
Name: "worker",
@@ -616,6 +644,86 @@ func (s *SizerService) buildSizerPayload(
616644
}
617645
}
618646

647+
func (s *SizerService) buildSingleNodeSizerPayload(
648+
services []BatchedService,
649+
platform string,
650+
controlPlaneCPU int,
651+
controlPlaneMemory int,
652+
) *client.SizerRequest {
653+
machineSets := []client.MachineSet{
654+
{
655+
Name: "controlPlane",
656+
CPU: controlPlaneCPU,
657+
Memory: controlPlaneMemory,
658+
InstanceName: "control-plane",
659+
NumberOfDisks: MachineSetNumberOfDisks,
660+
OnlyFor: []string{},
661+
Label: "Control Plane",
662+
AllowWorkloadScheduling: util.BoolPtr(true),
663+
ControlPlaneReserved: &client.ControlPlaneReserved{
664+
CPU: ControlPlaneReservedCPU,
665+
Memory: ControlPlaneReservedMemory,
666+
},
667+
},
668+
}
669+
670+
allServiceNames := make([]string, len(services))
671+
for i := range services {
672+
allServiceNames[i] = services[i].Name
673+
}
674+
675+
vmServices := make([]client.ServiceDescriptor, len(services))
676+
for i, svc := range services {
677+
runsWith := make([]string, 0, len(services)-1)
678+
for j := range services {
679+
if j != i {
680+
runsWith = append(runsWith, allServiceNames[j])
681+
}
682+
}
683+
vmServices[i] = client.ServiceDescriptor{
684+
Name: svc.Name,
685+
RequiredCPU: svc.RequiredCPU,
686+
RequiredMemory: svc.RequiredMemory,
687+
LimitCPU: svc.LimitCPU,
688+
LimitMemory: svc.LimitMemory,
689+
Zones: 1,
690+
RunsWith: runsWith,
691+
Avoid: []string{},
692+
}
693+
}
694+
695+
workloads := []client.Workload{
696+
{
697+
Name: "control-plane-services",
698+
Count: 1,
699+
UsesMachines: []string{"controlPlane"},
700+
Services: []client.ServiceDescriptor{
701+
{
702+
Name: "ControlPlane",
703+
RequiredCPU: ControlPlaneReservedCPU,
704+
RequiredMemory: ControlPlaneReservedMemory,
705+
Zones: 1,
706+
RunsWith: []string{},
707+
Avoid: []string{},
708+
},
709+
},
710+
},
711+
{
712+
Name: "vm-workload",
713+
Count: 1,
714+
UsesMachines: []string{"controlPlane"},
715+
Services: vmServices,
716+
},
717+
}
718+
719+
return &client.SizerRequest{
720+
Platform: platform,
721+
MachineSets: machineSets,
722+
Workloads: workloads,
723+
Detailed: true,
724+
}
725+
}
726+
619727
// transformSizerResponse maps sizer service response to API response
620728
func (s *SizerService) transformSizerResponse(sizerResponse *client.SizerResponse, controlPlaneNodeCount int) TransformedSizerResponse {
621729
var workerNodes, controlPlaneNodes int
@@ -648,6 +756,13 @@ func (s *SizerService) transformSizerResponse(sizerResponse *client.SizerRespons
648756

649757
totalNodes := controlPlaneNodes + workerNodes
650758

759+
if controlPlaneNodeCount == 1 {
760+
totalNodes = 1
761+
controlPlaneNodes = 1
762+
workerNodes = 0
763+
failoverNodes = 0
764+
}
765+
651766
resourceConsumptionForm := mappers.ResourceConsumptionForm{
652767
CPU: sizerResponse.Data.ResourceConsumption.CPU,
653768
Memory: sizerResponse.Data.ResourceConsumption.Memory,
@@ -695,3 +810,54 @@ func calculateFailoverNodes(workerNodes int) int {
695810
percentageBased := int(math.Ceil(float64(workerNodes) * FailoverCapacityPercent / 100.0))
696811
return max(MinFailoverNodes, percentageBased)
697812
}
813+
814+
// isSizerSchedulabilityError detects schedulability failures by sizer error message substrings.
815+
func isSizerSchedulabilityError(err error) bool {
816+
if err == nil {
817+
return false
818+
}
819+
msg := err.Error()
820+
return strings.Contains(msg, "not schedulable") ||
821+
strings.Contains(msg, "Minimum required") ||
822+
strings.Contains(msg, "too small")
823+
}
824+
825+
func (s *SizerService) singleNodeFitError(totalCPU, totalMemory int, smtMultiplier float64, controlPlaneCPU, controlPlaneMemory int) error {
826+
// Calculate uncapped minimum to detect if workload truly exceeds max supported size
827+
denominator := 1.0 * CapacityMultiplier
828+
minEffectiveCPUPerNode := float64(totalCPU) / denominator
829+
uncappedMinNodeCPU := int(math.Ceil(minEffectiveCPUPerNode / smtMultiplier))
830+
uncappedMinNodeMemory := int(math.Ceil(float64(totalMemory) / denominator))
831+
832+
// Round up to nearest even number for CPU, nearest multiple of 4 for memory
833+
uncappedMinNodeCPU = int(math.Ceil(float64(uncappedMinNodeCPU)/2) * 2)
834+
uncappedMinNodeMemory = int(math.Ceil(float64(uncappedMinNodeMemory)/4) * 4)
835+
836+
// If workload truly exceeds max supported size, recommend multi-node only
837+
if uncappedMinNodeCPU > MaxRecommendedNodeCPU || uncappedMinNodeMemory > MaxRecommendedNodeMemory {
838+
return NewErrInvalidRequest("workload does not fit on a single node. Use a multi-node cluster.")
839+
}
840+
841+
// Otherwise use the calculated minimum (already below max)
842+
// Ensure minimum is at least control plane reserve so the suggested size can schedule the CP
843+
minReservedCPU := int(math.Ceil(ControlPlaneReservedCPU/2.0) * 2)
844+
minReservedMemory := int(math.Ceil(ControlPlaneReservedMemory/4.0) * 4)
845+
minNodeCPU := max(max(uncappedMinNodeCPU, MinFallbackNodeCPU), minReservedCPU)
846+
minNodeMemory := max(max(uncappedMinNodeMemory, MinFallbackNodeMemory), minReservedMemory)
847+
848+
return NewErrInvalidRequest(singleNodeFitErrorMessage(controlPlaneCPU, controlPlaneMemory, minNodeCPU, minNodeMemory))
849+
}
850+
851+
// singleNodeFitErrorMessage returns the error when single-node didn't fit. If the user
852+
// is already at/above our minimum or at max, we recommend multi-node only.
853+
func singleNodeFitErrorMessage(controlPlaneCPU, controlPlaneMemory, minNodeCPU, minNodeMemory int) string {
854+
alreadyAtOrAbove := controlPlaneCPU >= minNodeCPU && controlPlaneMemory >= minNodeMemory
855+
atMaxSupported := controlPlaneCPU >= MaxRecommendedNodeCPU && controlPlaneMemory >= MaxRecommendedNodeMemory
856+
if alreadyAtOrAbove || atMaxSupported {
857+
return "workload does not fit on a single node. Use a multi-node cluster."
858+
}
859+
return fmt.Sprintf(
860+
"workload does not fit on a single node with the specified resources. Use at least %d CPU / %d GB memory per node for a single-node cluster, or use a multi-node cluster",
861+
minNodeCPU, minNodeMemory,
862+
)
863+
}

0 commit comments

Comments
 (0)