Skip to content

Commit cba0f2c

Browse files
authored
fix: fast-kill agent container on SIGTERM/SIGINT before full cleanup (#1623)
1 parent ac27bc7 commit cba0f2c

File tree

3 files changed

+167
-12
lines changed

3 files changed

+167
-12
lines changed

src/cli.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import {
1414
stopContainers,
1515
cleanup,
1616
preserveIptablesAudit,
17+
fastKillAgentContainer,
1718
} from './docker-manager';
1819
import {
1920
ensureFirewallNetwork,
@@ -1898,13 +1899,25 @@ program
18981899
};
18991900

19001901
// Register signal handlers
1902+
// Fast-kill the agent container immediately so it cannot outlive the awf
1903+
// process. GH Actions sends SIGTERM then SIGKILL ~10 s later; the full
1904+
// docker compose down in performCleanup() is too slow to finish in that
1905+
// window, leaving the container running as an orphan.
1906+
/* istanbul ignore next -- signal handlers cannot be unit-tested */
19011907
process.on('SIGINT', async () => {
1908+
if (containersStarted && !config.keepContainers) {
1909+
await fastKillAgentContainer();
1910+
}
19021911
await performCleanup('SIGINT');
19031912
console.error(`Process exiting with code: 130`);
19041913
process.exit(130); // Standard exit code for SIGINT
19051914
});
19061915

1916+
/* istanbul ignore next -- signal handlers cannot be unit-tested */
19071917
process.on('SIGTERM', async () => {
1918+
if (containersStarted && !config.keepContainers) {
1919+
await fastKillAgentContainer();
1920+
}
19081921
await performCleanup('SIGTERM');
19091922
console.error(`Process exiting with code: 143`);
19101923
process.exit(143); // Standard exit code for SIGTERM

src/docker-manager.test.ts

Lines changed: 80 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { generateDockerCompose, subnetsOverlap, writeConfigs, startContainers, stopContainers, cleanup, runAgentCommand, validateIdNotInSystemRange, getSafeHostUid, getSafeHostGid, getRealUserHome, extractGhHostFromServerUrl, readGitHubPathEntries, mergeGitHubPathEntries, readEnvFile, MIN_REGULAR_UID, ACT_PRESET_BASE_IMAGE } from './docker-manager';
1+
import { generateDockerCompose, subnetsOverlap, writeConfigs, startContainers, stopContainers, fastKillAgentContainer, isAgentExternallyKilled, resetAgentExternallyKilled, AGENT_CONTAINER_NAME, cleanup, runAgentCommand, validateIdNotInSystemRange, getSafeHostUid, getSafeHostGid, getRealUserHome, extractGhHostFromServerUrl, readGitHubPathEntries, mergeGitHubPathEntries, readEnvFile, MIN_REGULAR_UID, ACT_PRESET_BASE_IMAGE } from './docker-manager';
22
import { WrapperConfig } from './types';
33
import * as fs from 'fs';
44
import * as path from 'path';
@@ -1726,7 +1726,7 @@ describe('docker-manager', () => {
17261726
// Verify working_dir is set
17271727
expect(result.services.agent.working_dir).toBe('/custom/workdir');
17281728
// Verify other config is still present
1729-
expect(result.services.agent.container_name).toBe('awf-agent');
1729+
expect(result.services.agent.container_name).toBe(AGENT_CONTAINER_NAME);
17301730
expect(result.services.agent.cap_add).toContain('SYS_CHROOT');
17311731
});
17321732

@@ -2950,12 +2950,65 @@ describe('docker-manager', () => {
29502950
});
29512951
});
29522952

2953+
describe('fastKillAgentContainer', () => {
2954+
beforeEach(() => {
2955+
jest.clearAllMocks();
2956+
resetAgentExternallyKilled();
2957+
});
2958+
2959+
it('should call docker stop with default 3-second timeout', async () => {
2960+
mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any);
2961+
2962+
await fastKillAgentContainer();
2963+
2964+
expect(mockExecaFn).toHaveBeenCalledWith(
2965+
'docker',
2966+
['stop', '-t', '3', AGENT_CONTAINER_NAME],
2967+
{ reject: false, timeout: 8000 }
2968+
);
2969+
});
2970+
2971+
it('should accept a custom stop timeout', async () => {
2972+
mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any);
2973+
2974+
await fastKillAgentContainer(5);
2975+
2976+
expect(mockExecaFn).toHaveBeenCalledWith(
2977+
'docker',
2978+
['stop', '-t', '5', AGENT_CONTAINER_NAME],
2979+
{ reject: false, timeout: 10000 }
2980+
);
2981+
});
2982+
2983+
it('should not throw when docker stop fails', async () => {
2984+
mockExecaFn.mockRejectedValueOnce(new Error('docker not found'));
2985+
2986+
await expect(fastKillAgentContainer()).resolves.toBeUndefined();
2987+
});
2988+
2989+
it('should set the externally-killed flag', async () => {
2990+
mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any);
2991+
2992+
expect(isAgentExternallyKilled()).toBe(false);
2993+
await fastKillAgentContainer();
2994+
expect(isAgentExternallyKilled()).toBe(true);
2995+
});
2996+
2997+
it('should set the externally-killed flag even when docker stop fails', async () => {
2998+
mockExecaFn.mockRejectedValueOnce(new Error('docker not found'));
2999+
3000+
await fastKillAgentContainer();
3001+
expect(isAgentExternallyKilled()).toBe(true);
3002+
});
3003+
});
3004+
29533005
describe('runAgentCommand', () => {
29543006
let testDir: string;
29553007

29563008
beforeEach(() => {
29573009
testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'awf-test-'));
29583010
jest.clearAllMocks();
3011+
resetAgentExternallyKilled();
29593012
});
29603013

29613014
afterEach(() => {
@@ -3132,6 +3185,31 @@ describe('docker-manager', () => {
31323185

31333186
jest.useRealTimers();
31343187
});
3188+
3189+
it('should skip post-run analysis when agent was externally killed', async () => {
3190+
// Create access.log with denied entries — these should be ignored
3191+
const squidLogsDir = path.join(testDir, 'squid-logs');
3192+
fs.mkdirSync(squidLogsDir, { recursive: true });
3193+
fs.writeFileSync(
3194+
path.join(squidLogsDir, 'access.log'),
3195+
'1760994429.358 172.30.0.20:36274 blocked.com:443 -:- 1.1 CONNECT 403 TCP_DENIED:HIER_NONE blocked.com:443 "curl/7.81.0"\n'
3196+
);
3197+
3198+
// Simulate fastKillAgentContainer having been called
3199+
mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); // fastKill docker stop
3200+
await fastKillAgentContainer();
3201+
3202+
// Mock docker logs -f
3203+
mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any);
3204+
// Mock docker wait — container was stopped externally, returns 143
3205+
mockExecaFn.mockResolvedValueOnce({ stdout: '143', stderr: '', exitCode: 0 } as any);
3206+
3207+
const result = await runAgentCommand(testDir, ['github.com']);
3208+
3209+
// Should return 143 and skip log analysis (empty blockedDomains)
3210+
expect(result.exitCode).toBe(143);
3211+
expect(result.blockedDomains).toEqual([]);
3212+
});
31353213
});
31363214

31373215
describe('cleanup', () => {

src/docker-manager.ts

Lines changed: 74 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,24 @@ import { DEFAULT_DNS_SERVERS } from './dns-resolver';
1111

1212
const SQUID_PORT = 3128;
1313

14+
/**
15+
* Container names used in Docker Compose and referenced by docker CLI commands.
16+
* Extracted as constants so that generateDockerCompose() and helpers like
17+
* fastKillAgentContainer() stay in sync.
18+
*/
19+
export const AGENT_CONTAINER_NAME = 'awf-agent';
20+
const SQUID_CONTAINER_NAME = 'awf-squid';
21+
const IPTABLES_INIT_CONTAINER_NAME = 'awf-iptables-init';
22+
const API_PROXY_CONTAINER_NAME = 'awf-api-proxy';
23+
const DOH_PROXY_CONTAINER_NAME = 'awf-doh-proxy';
24+
25+
/**
26+
* Flag set by fastKillAgentContainer() to signal runAgentCommand() that
27+
* the container was externally stopped. When true, runAgentCommand() skips
28+
* its own docker wait / log collection to avoid racing with the signal handler.
29+
*/
30+
let agentExternallyKilled = false;
31+
1432
// When bundled with esbuild, this global is replaced at build time with the
1533
// JSON content of containers/agent/seccomp-profile.json. In normal (tsc)
1634
// builds the identifier remains undeclared, so the typeof check below is safe.
@@ -418,7 +436,7 @@ export function generateDockerCompose(
418436

419437
// Squid service configuration
420438
const squidService: any = {
421-
container_name: 'awf-squid',
439+
container_name: SQUID_CONTAINER_NAME,
422440
networks: {
423441
'awf-net': {
424442
ipv4_address: networkConfig.squidIp,
@@ -1143,7 +1161,7 @@ export function generateDockerCompose(
11431161

11441162
// Agent service configuration
11451163
const agentService: any = {
1146-
container_name: 'awf-agent',
1164+
container_name: AGENT_CONTAINER_NAME,
11471165
networks: {
11481166
'awf-net': {
11491167
ipv4_address: networkConfig.agentIp,
@@ -1316,7 +1334,7 @@ export function generateDockerCompose(
13161334
// that shares the agent's network namespace but NEVER gives NET_ADMIN to the agent.
13171335
// This eliminates the window where the agent holds NET_ADMIN during startup.
13181336
const iptablesInitService: any = {
1319-
container_name: 'awf-iptables-init',
1337+
container_name: IPTABLES_INIT_CONTAINER_NAME,
13201338
// Share agent's network namespace so iptables rules apply to agent's traffic
13211339
network_mode: 'service:agent',
13221340
// Only mount the init signal volume and the iptables setup script
@@ -1379,7 +1397,7 @@ export function generateDockerCompose(
13791397
// Add Node.js API proxy sidecar if enabled
13801398
if (config.enableApiProxy && networkConfig.proxyIp) {
13811399
const proxyService: any = {
1382-
container_name: 'awf-api-proxy',
1400+
container_name: API_PROXY_CONTAINER_NAME,
13831401
networks: {
13841402
'awf-net': {
13851403
ipv4_address: networkConfig.proxyIp,
@@ -1513,7 +1531,7 @@ export function generateDockerCompose(
15131531
// Add DNS-over-HTTPS proxy sidecar if enabled
15141532
if (config.dnsOverHttps && networkConfig.dohProxyIp) {
15151533
const dohService: any = {
1516-
container_name: 'awf-doh-proxy',
1534+
container_name: DOH_PROXY_CONTAINER_NAME,
15171535
image: 'cloudflare/cloudflared:latest',
15181536
networks: {
15191537
'awf-net': {
@@ -1918,7 +1936,7 @@ export async function startContainers(workDir: string, allowedDomains: string[],
19181936
// This handles orphaned containers from failed/interrupted previous runs
19191937
logger.debug('Removing any existing containers with conflicting names...');
19201938
try {
1921-
await execa('docker', ['rm', '-f', 'awf-squid', 'awf-agent', 'awf-iptables-init', 'awf-api-proxy'], {
1939+
await execa('docker', ['rm', '-f', SQUID_CONTAINER_NAME, AGENT_CONTAINER_NAME, IPTABLES_INIT_CONTAINER_NAME, API_PROXY_CONTAINER_NAME], {
19221940
reject: false,
19231941
});
19241942
} catch {
@@ -2011,7 +2029,7 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[],
20112029
try {
20122030
// Stream logs in real-time using docker logs -f (follow mode)
20132031
// Run this in the background and wait for the container to exit separately
2014-
const logsProcess = execa('docker', ['logs', '-f', 'awf-agent'], {
2032+
const logsProcess = execa('docker', ['logs', '-f', AGENT_CONTAINER_NAME], {
20152033
stdio: 'inherit',
20162034
reject: false,
20172035
});
@@ -2023,7 +2041,7 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[],
20232041
logger.info(`Agent timeout: ${agentTimeoutMinutes} minutes`);
20242042

20252043
// Race docker wait against a timeout
2026-
const waitPromise = execa('docker', ['wait', 'awf-agent']).then(result => ({
2044+
const waitPromise = execa('docker', ['wait', AGENT_CONTAINER_NAME]).then(result => ({
20272045
type: 'completed' as const,
20282046
exitCodeStr: result.stdout,
20292047
}));
@@ -2038,7 +2056,7 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[],
20382056
if (raceResult.type === 'timeout') {
20392057
logger.warn(`Agent command timed out after ${agentTimeoutMinutes} minutes, stopping container...`);
20402058
// Stop the container gracefully (10 second grace period before SIGKILL)
2041-
await execa('docker', ['stop', '-t', '10', 'awf-agent'], { reject: false });
2059+
await execa('docker', ['stop', '-t', '10', AGENT_CONTAINER_NAME], { reject: false });
20422060
exitCode = 124; // Standard timeout exit code (same as coreutils timeout)
20432061
} else {
20442062
// Clear the timeout timer so it doesn't keep the event loop alive
@@ -2047,13 +2065,21 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[],
20472065
}
20482066
} else {
20492067
// No timeout - wait indefinitely
2050-
const { stdout: exitCodeStr } = await execa('docker', ['wait', 'awf-agent']);
2068+
const { stdout: exitCodeStr } = await execa('docker', ['wait', AGENT_CONTAINER_NAME]);
20512069
exitCode = parseInt(exitCodeStr.trim(), 10);
20522070
}
20532071

20542072
// Wait for the logs process to finish (it should exit automatically when container stops)
20552073
await logsProcess;
20562074

2075+
// If the container was killed externally (e.g. by fastKillAgentContainer in a
2076+
// signal handler), skip the remaining log analysis — the container state is
2077+
// unreliable and the signal handler will drive the rest of the shutdown.
2078+
if (agentExternallyKilled) {
2079+
logger.debug('Agent was externally killed, skipping post-run analysis');
2080+
return { exitCode: exitCode || 143, blockedDomains: [] };
2081+
}
2082+
20572083
logger.debug(`Agent exit code: ${exitCode}`);
20582084

20592085
// Small delay to ensure Squid logs are flushed to disk
@@ -2108,6 +2134,44 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[],
21082134
}
21092135
}
21102136

2137+
/**
2138+
* Fast-kills the agent container with a short grace period.
2139+
* Used in signal handlers (SIGTERM/SIGINT) to ensure the agent cannot outlive
2140+
* the awf process — e.g. when GH Actions sends SIGTERM followed by SIGKILL
2141+
* after ~10 seconds. The full `docker compose down -v` in stopContainers() is
2142+
* too slow to reliably complete in that window.
2143+
*
2144+
* @param stopTimeoutSeconds - Grace period before SIGKILL (default: 3)
2145+
*/
2146+
export async function fastKillAgentContainer(stopTimeoutSeconds = 3): Promise<void> {
2147+
agentExternallyKilled = true;
2148+
try {
2149+
await execa('docker', ['stop', '-t', String(stopTimeoutSeconds), AGENT_CONTAINER_NAME], {
2150+
reject: false,
2151+
timeout: (stopTimeoutSeconds + 5) * 1000, // hard deadline on the stop command itself
2152+
});
2153+
} catch {
2154+
// Best-effort — if docker CLI is unavailable or hangs, we still proceed
2155+
// to performCleanup which will attempt docker compose down.
2156+
}
2157+
}
2158+
2159+
/**
2160+
* Returns whether the agent was externally killed via fastKillAgentContainer().
2161+
* @internal Exported for testing.
2162+
*/
2163+
export function isAgentExternallyKilled(): boolean {
2164+
return agentExternallyKilled;
2165+
}
2166+
2167+
/**
2168+
* Resets the externally-killed flag. Only used in tests.
2169+
* @internal Exported for testing.
2170+
*/
2171+
export function resetAgentExternallyKilled(): void {
2172+
agentExternallyKilled = false;
2173+
}
2174+
21112175
/**
21122176
* Stops and removes Docker Compose services
21132177
*/

0 commit comments

Comments
 (0)