From 1aaac7652c1a1c3f645b72288d6f50e2b3539cf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=83=A1=E7=8E=AE=E6=96=87?= Date: Tue, 23 Sep 2025 18:30:26 +0800 Subject: [PATCH 1/2] waitstatus: lower max interval to 2s to increase throughput on heavy load --- pkg/disk/disk.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/disk/disk.go b/pkg/disk/disk.go index c0d82351e..4b2ca9ce8 100644 --- a/pkg/disk/disk.go +++ b/pkg/disk/disk.go @@ -246,10 +246,12 @@ func newBatcher(fromNode bool) (waitstatus.StatusWaiter[ecs.Disk], batcher.Batch client := desc.Disk{Client: GlobalConfigVar.EcsClient} ctx := context.Background() interval := 1 * time.Second + max := 2 * time.Second if fromNode { interval = 2 * time.Second // We have many nodes, use longer interval to avoid throttling + max = 3 * time.Second } - waiter := waitstatus.NewBatched(client, clock.RealClock{}, interval, 3*time.Second) + waiter := waitstatus.NewBatched(client, clock.RealClock{}, interval, max) go waiter.Run(ctx) b := batcher.NewLowLatency(client, clock.RealClock{}, 1*time.Second, 8) From 65bcbbba1456ffbe4fed141c1637c35e67a86d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=83=A1=E7=8E=AE=E6=96=87?= Date: Tue, 23 Sep 2025 22:55:59 +0800 Subject: [PATCH 2/2] misc refine --- pkg/disk/batcher/low_latency.go | 2 +- pkg/disk/cloud.go | 2 +- pkg/disk/utils.go | 1 + pkg/disk/waitstatus/batched.go | 2 +- pkg/features/features.go | 6 ++++-- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pkg/disk/batcher/low_latency.go b/pkg/disk/batcher/low_latency.go index 698a3a3c3..76775febc 100644 --- a/pkg/disk/batcher/low_latency.go +++ b/pkg/disk/batcher/low_latency.go @@ -170,7 +170,7 @@ func (w *LowLatency[T]) descBatch(logger logr.Logger, t time.Time, requests map[ } // Not found sendToAll(requests, getResponse[*T]{}) - logger.V(3).Info("got batch", "n", len(thisBatch), + logger.V(2).Info("got batch", "n", len(thisBatch), "requestID", resp.RequestID, "duration", w.clk.Since(t), "wait", 
t.Sub(firstTime)) } diff --git a/pkg/disk/cloud.go b/pkg/disk/cloud.go index 0e90ea97d..bd38535c3 100644 --- a/pkg/disk/cloud.go +++ b/pkg/disk/cloud.go @@ -158,7 +158,7 @@ func (ad *DiskAttachDetach) findDevice(ctx context.Context, diskID, serial strin // Returns device path if fromNode, disk serial number otherwise. func (ad *DiskAttachDetach) attachDisk(ctx context.Context, diskID, nodeID string, fromNode bool) (string, error) { logger := klog.FromContext(ctx) - logger.V(2).Info("Starting Do AttachDisk", "instanceID", nodeID, "region", GlobalConfigVar.Region) + logger.V(2).Info("Starting Do AttachDisk") ecsClient := GlobalConfigVar.EcsClient // Step 1: check disk status diff --git a/pkg/disk/utils.go b/pkg/disk/utils.go index 070b927a0..a37f3c7dd 100644 --- a/pkg/disk/utils.go +++ b/pkg/disk/utils.go @@ -109,6 +109,7 @@ var ecsOpenAPITransport = http.Transport{ ForceAttemptHTTP2: true, MaxIdleConns: 100, MaxIdleConnsPerHost: 100, // Set this equal to MaxIdleConns as we should only talk to one endpoint with this Transport instance. + MaxConnsPerHost: 500, // Protect our backend. Should be large enough to handle any workload. IdleConnTimeout: 90 * time.Second, TLSHandshakeTimeout: 10 * time.Second, ExpectContinueTimeout: 1 * time.Second, diff --git a/pkg/disk/waitstatus/batched.go b/pkg/disk/waitstatus/batched.go index 842415402..bf743614a 100644 --- a/pkg/disk/waitstatus/batched.go +++ b/pkg/disk/waitstatus/batched.go @@ -117,7 +117,7 @@ func (w *Batched[T]) Run(ctx context.Context) { case r := <-w.feedback: next := w.processFeedback(r) w.idQueue = append(w.idQueue, next...) 
- logger.V(4).Info("poll response processed", "queueDepth", len(w.idQueue), "requeue", len(next)) + logger.V(2).Info("poll response processed", "queueDepth", len(w.idQueue), "requeue", len(next)) case t := <-pollChan: logger.V(4).Info("starting poll", "queueDepth", len(w.idQueue)) w.idQueue = w.poll(t, w.idQueue) diff --git a/pkg/features/features.go b/pkg/features/features.go index ed76dbc0e..ff61ba89a 100644 --- a/pkg/features/features.go +++ b/pkg/features/features.go @@ -16,7 +16,7 @@ const ( DiskADController featuregate.Feature = "DiskADController" // Attach multiple disks to the same node in parallel. - // ECS don't allow parallel attach/detach to a node by default. + // ECS doesn't allow parallel attach to a node by default. // Enable this if you need faster attach, and only if your UID is whitelisted (by open a ticket), // or you have the supportConcurrencyAttach=true tag on your ECS instance. // @@ -24,7 +24,9 @@ const ( DiskParallelAttach featuregate.Feature = "DiskParallelAttach" // Detach multiple disks from the same node in parallel. - // ECS does not allow parallel detach from a node currently. This feature gate is reserved for future use. + // ECS does not allow parallel detach from a node by default. + // Enable this if you need faster detach, and only if your UID is whitelisted (by opening a ticket), + // or you have the supportConcurrencyDetach=true tag on your ECS instance. // // Only effective when DiskADController is also enabled. DiskParallelDetach featuregate.Feature = "DiskParallelDetach"