From 4b882cb7c18b367161bf85388a3ab47d1eb94129 Mon Sep 17 00:00:00 2001 From: gimenes Date: Fri, 8 May 2026 16:03:34 -0300 Subject: [PATCH 1/2] fix(sandbox): fail-fast on stalled HTTP clones to escape NAT blackholes Set http.lowSpeedLimit=1000 / http.lowSpeedTime=30 in the system git config so libcurl aborts streams that drop below ~1KB/s for 30s, and extend TRANSIENT_ERRORS so the existing 3-attempt retry loop in clone.ts catches the resulting "Operation too slow" / "RPC failed" / "transfer closed with" / "the remote end hung up" outputs. Without this, an in-flight clone whose egress path is severed mid-stream (e.g. fck-nat ASG instance refresh, PMTUD blackhole) waits on the kernel TCP keepalive (~2h default) and visibly hangs at "Receiving objects". Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/sandbox/daemon/setup/clone.ts | 7 +++++++ packages/sandbox/image/Dockerfile | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/packages/sandbox/daemon/setup/clone.ts b/packages/sandbox/daemon/setup/clone.ts index c160bf56ad..18fca8bc79 100644 --- a/packages/sandbox/daemon/setup/clone.ts +++ b/packages/sandbox/daemon/setup/clone.ts @@ -47,6 +47,13 @@ const TRANSIENT_ERRORS = [ "unexpected disconnect", "Connection reset by peer", "Connection timed out", + // libcurl CURLE_OPERATION_TIMEDOUT triggered by http.lowSpeedLimit/Time — + // fires when the egress NAT silently drops in-flight packets (e.g. fck-nat + // ASG instance refresh) and the stream stalls below the threshold. 
+ "Operation too slow", + "transfer closed with", + "RPC failed", + "the remote end hung up", ]; const CLONE_MAX_RETRIES = 3; const CLONE_RETRY_DELAY_MS = 3000; diff --git a/packages/sandbox/image/Dockerfile b/packages/sandbox/image/Dockerfile index ae4f9bdebf..eda3712795 100644 --- a/packages/sandbox/image/Dockerfile +++ b/packages/sandbox/image/Dockerfile @@ -44,6 +44,13 @@ RUN pip3 install --break-system-packages --no-cache-dir \ ENV LANG=en_US.UTF-8 \ LC_ALL=en_US.UTF-8 +# Convert silent stalls (NAT instance replacement, PMTUD blackholes, mid-stream +# packet drops) into fast errors that the daemon's clone retry loop can catch. +# Without this, libcurl waits on TCP keepalive (~2h default) and the clone +# hangs at "Receiving objects" indefinitely. +RUN git config --system http.lowSpeedLimit 1000 \ + && git config --system http.lowSpeedTime 30 + # Non-root sandbox user. The bun image comes with a 'bun' user (UID 1000), # but we drop privileges further by replacing it with a 'sandbox' user. RUN userdel --remove bun \ From c2a77083e56ad89eacfd8c8419338e95bf6b86c1 Mon Sep 17 00:00:00 2001 From: gimenes Date: Fri, 8 May 2026 16:09:09 -0300 Subject: [PATCH 2/2] [release] bump @decocms/sandbox to 0.4.6 Carries the http.lowSpeedLimit/lowSpeedTime + clone.ts retry-string update so the next sandbox image build picks up the fail-fast fix. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/sandbox/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/sandbox/package.json b/packages/sandbox/package.json index 119dfb0ead..39270e8805 100644 --- a/packages/sandbox/package.json +++ b/packages/sandbox/package.json @@ -1,6 +1,6 @@ { "name": "@decocms/sandbox", - "version": "0.4.5", + "version": "0.4.6", "type": "module", "description": "Sandbox runner for isolated per-user containerised tool execution", "scripts": {