Skip to content

Commit 57dcc00

Browse files
sss: kill entire process group during cleanup
When using SSS with multiple pins (e.g., tang) and threshold < n, after successful decryption with t pins, the remaining child processes and their grandchildren (like curl) were not being killed. The fix uses process groups: - Each child process calls setpgid(0, 0) to become a process group leader - Parent also calls setpgid() to eliminate race condition - Any grandchildren inherit the same process group - Cleanup uses kill(-pid, SIGTERM) to kill the entire process group - Falls back to direct kill if process group doesn't exist Signed-off-by: Sergio Correia <scorreia@redhat.com>
1 parent 2ad7027 commit 57dcc00

File tree

5 files changed

+213
-2
lines changed

5 files changed

+213
-2
lines changed

src/pins/sss/clevis-decrypt-sss.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,13 @@ main(int argc, char *argv[])
314314
fclose(pin->file);
315315

316316
if (pin->pid > 0) {
317-
kill(pin->pid, SIGTERM);
317+
/*
318+
* Kill entire process group (negative PID). This ensures
319+
* grandchildren (e.g., curl spawned by clevis-decrypt-tang)
320+
* are terminated. Fall back to direct kill if group doesn't exist.
321+
*/
322+
if (kill(-pin->pid, SIGTERM) < 0)
323+
(void) kill(pin->pid, SIGTERM);
318324
waitpid(pin->pid, NULL, 0);
319325
}
320326

src/pins/sss/meson.build

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ if jansson.found() and libcrypto.found()
2121
env.prepend('PATH',
2222
join_paths(meson.source_root(), 'src'),
2323
join_paths(meson.source_root(), 'src', 'pins', 'tang'),
24+
join_paths(meson.build_root(), 'src', 'pins', 'tang', 'tests'),
2425
meson.current_build_dir(),
2526
'/usr/libexec',
2627
libexecdir,
@@ -33,6 +34,8 @@ if jansson.found() and libcrypto.found()
3334

3435
test('pin-sss', find_program(join_paths(src, 'pin-sss')), env: env)
3536
test('pin-null', find_program(join_paths(src, 'pin-null')), env: env)
37+
38+
subdir('tests')
3639
else
3740
warning('Will not install sss pin due to missing dependencies!')
3841
endif

src/pins/sss/sss.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,10 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid)
360360
goto error;
361361

362362
if (*pid == 0) {
363+
/* Create a new process group so we can kill all descendants */
364+
if (setpgid(0, 0) < 0)
365+
exit(EXIT_FAILURE);
366+
363367
if (dup2(dump[PIPE_RD], STDIN_FILENO) < 0 ||
364368
dup2(load[PIPE_WR], STDOUT_FILENO) < 0)
365369
exit(EXIT_FAILURE);
@@ -368,6 +372,11 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid)
368372
exit(EXIT_FAILURE);
369373
}
370374

375+
/* Also set from parent to eliminate race with child's setpgid().
376+
* This may fail if the child has already called setpgid(0,0) or exec'd,
377+
* which is fine - the important thing is that one of them succeeds. */
378+
(void) setpgid(*pid, *pid);
379+
371380
for (const uint8_t *tmp = buf; len > 0; tmp += wr, len -= wr) {
372381
wr = write(dump[PIPE_WR], tmp, len);
373382
if (wr < 0)
@@ -390,7 +399,13 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid)
390399
close(load[PIPE_WR]);
391400

392401
if (*pid > 0) {
393-
kill(*pid, SIGTERM);
402+
/*
403+
* Kill entire process group (negative PID). This ensures
404+
* grandchildren (e.g., curl spawned by clevis-decrypt-tang)
405+
* are terminated. Fall back to direct kill if group doesn't exist.
406+
*/
407+
if (kill(-*pid, SIGTERM) < 0)
408+
(void) kill(*pid, SIGTERM);
394409
waitpid(*pid, NULL, 0);
395410
*pid = 0;
396411
}

src/pins/sss/tests/meson.build

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
lsof = find_program('lsof', required: false)
2+
pgrep = find_program('pgrep', required: false)
3+
4+
if lsof.found() and pgrep.found()
5+
test('sss-child-process-cleanup', find_program('sss-child-process-cleanup'), env: env, timeout: 30)
6+
else
7+
warning('Will not run sss-child-process-cleanup test due to missing dependencies (lsof, pgrep)!')
8+
endif
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#!/bin/bash -e
2+
# vim: set tabstop=8 shiftwidth=4 softtabstop=4 expandtab smarttab colorcolumn=80:
3+
#
4+
# Copyright (c) 2025 Red Hat, Inc.
5+
# Author: Sergio Correia <scorreia@redhat.com>
6+
#
7+
# This program is free software: you can redistribute it and/or modify
8+
# it under the terms of the GNU General Public License as published by
9+
# the Free Software Foundation, either version 3 of the License, or
10+
# (at your option) any later version.
11+
#
12+
# This program is distributed in the hope that it will be useful,
13+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
# GNU General Public License for more details.
16+
#
17+
# You should have received a copy of the GNU General Public License
18+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
#
20+
# Test to verify that clevis-decrypt-sss properly kills all child processes
21+
# and their descendants (grandchildren) after successful decryption.
22+
#
23+
# This test addresses issue #460:
24+
# https://github.com/latchset/clevis/issues/460
25+
#
26+
# The issue: When using SSS with multiple pins (e.g., tang, fido2) and t < n,
27+
# after successful decryption with t pins, the remaining child processes
28+
# (and their grandchildren like curl or fido2-assert) were not being killed.
29+
#
30+
# The fix uses process groups: each child becomes a process group leader
31+
# via setpgid(0,0), and cleanup uses kill(-pid, SIGTERM) to kill the
32+
# entire process group.
33+
#
34+
# Test strategy:
35+
# 1. Start a real tang server that responds quickly
36+
# 2. Start a "hanging" server that accepts connections but never responds
37+
# 3. Encrypt with SSS t=1, two tang pins (real + hanging)
38+
# 4. Decrypt - the real tang succeeds, the hanging one's child should be killed
39+
# 5. Verify no orphaned curl processes remain after decryption
40+
41+
. tang-common-test-functions
42+
43+
TMP="$(mktemp -d)"
44+
45+
SCRIPT_PID=$$
46+
47+
# Force exit - kills background jobs first, then exits
48+
force_exit() {
49+
local code="$1"
50+
trap - EXIT # Disable trap to prevent recursion
51+
52+
# Kill all child processes to prevent bash from waiting
53+
# Use both pkill and direct kill to be thorough
54+
pkill -9 -P $$ 2>/dev/null || true
55+
jobs -p 2>/dev/null | xargs -r kill -9 2>/dev/null || true
56+
57+
if [ "${code}" -eq 0 ]; then
58+
# Success - normal exit after killing children
59+
exit 0
60+
else
61+
# Failure - kill entire process group for immediate exit
62+
kill -9 -$$ 2>/dev/null || kill -9 $$ 2>/dev/null || exit "${code}"
63+
fi
64+
}
65+
66+
# No cleanup trap - we'll force kill on exit
67+
68+
# Start a real tang server
69+
tang_run "${TMP}" sig exc
70+
port_real=$(tang_get_port "${TMP}")
71+
url_real="http://localhost:${port_real}"
72+
73+
# Get the advertisement from the real tang server
74+
adv="${TMP}/adv.jws"
75+
tang_get_adv "${port_real}" "${adv}"
76+
77+
# Start a "hanging" server - accepts TCP connections but never sends HTTP response.
78+
# This simulates a tang server that is unreachable or very slow.
79+
# We use socat to listen and spawn a process that just sleeps forever.
80+
# Use disown to prevent bash from waiting for this background job on exit.
81+
"${SOCAT}" TCP4-LISTEN:0,fork EXEC:"sleep 3600" &
82+
hang_pid=$!
83+
disown "${hang_pid}"
84+
echo "${hang_pid}" > "${TMP}/hang.pid"
85+
86+
# Wait for the hanging server to start listening
87+
sleep 0.5
88+
port_hang=$(lsof -nP -a -p "${hang_pid}" -iTCP -sTCP:LISTEN -Fn 2>/dev/null \
89+
| grep '^n.*:' | head -1 | sed 's/.*://')
90+
91+
if [ -z "${port_hang}" ]; then
92+
echo "Failed to start hanging server" >&2
93+
force_exit 1
94+
fi
95+
96+
url_hang="http://localhost:${port_hang}"
97+
98+
echo "Real tang server on port ${port_real}"
99+
echo "Hanging server on port ${port_hang}"
100+
101+
# Encrypt with SSS: t=1, two tang pins
102+
# Both pins use the same advertisement (from the real server) so encryption works.
103+
# During decryption, one will contact the real server (succeeds) and one will
104+
# contact the hanging server (curl hangs waiting for response).
105+
cfg=$(cat <<EOF
106+
{
107+
"t": 1,
108+
"pins": {
109+
"tang": [
110+
{"url": "${url_real}", "adv": "${adv}"},
111+
{"url": "${url_hang}", "adv": "${adv}"}
112+
]
113+
}
114+
}
115+
EOF
116+
)
117+
118+
echo "Encrypting test data..."
119+
enc="$(echo -n "test-cleanup-data" | clevis encrypt sss "${cfg}" -y)"
120+
echo "Encryption successful."
121+
122+
# Record curl processes before decryption
123+
curl_before=$(pgrep -x curl 2>/dev/null | wc -l || echo 0)
124+
125+
echo "Starting decryption... ($(date +%T))"
126+
127+
# Decrypt with a timeout. The real tang should respond quickly, so decryption
128+
# should complete in a few seconds. The hanging server's child process
129+
# (and its curl grandchild) should be killed after decryption succeeds.
130+
#
131+
# Without the fix: curl connecting to the hanging server would remain running
132+
# after clevis-decrypt-sss exits (orphaned to init).
133+
#
134+
# With the fix: the entire process group is killed, including curl.
135+
136+
# Decrypt the data. Without the fix, decryption succeeds but leaves orphaned
137+
# grandchild processes (curl) which we detect below.
138+
dec=""
139+
if ! dec="$(echo -n "${enc}" | clevis decrypt)"; then
140+
echo "FAIL: Decryption failed" >&2
141+
force_exit 1
142+
fi
143+
144+
if [ "${dec}" != "test-cleanup-data" ]; then
145+
echo "FAIL: Decrypted data mismatch: got '${dec}'" >&2
146+
force_exit 1
147+
fi
148+
149+
echo "Decryption successful: '${dec}' ($(date +%T))"
150+
151+
# Give a moment for process cleanup
152+
sleep 0.5
153+
154+
# Check for orphaned curl processes connecting to the hanging server.
155+
# Use lsof to find any process with a connection to port_hang.
156+
# -n: no hostname resolution (faster)
157+
# -P: no port name resolution (faster)
158+
# -t: terse output (just PIDs)
159+
orphaned_connections=$(lsof -nP -i "TCP:${port_hang}" -t 2>/dev/null | grep -v "^${hang_pid}$" || true)
160+
161+
if [ -n "${orphaned_connections}" ]; then
162+
echo "FAIL: Found orphaned processes connecting to hanging server:" >&2
163+
echo "${orphaned_connections}" >&2
164+
force_exit 1
165+
fi
166+
167+
# Also check that we don't have more curl processes than before
168+
# (accounting for possible legitimate curl usage)
169+
curl_after=$(pgrep -x curl 2>/dev/null | wc -l || echo 0)
170+
171+
# We expect curl_after <= curl_before (the curl that was connecting to hang
172+
# server should have been killed)
173+
if [ "${curl_after}" -gt "${curl_before}" ]; then
174+
echo "WARNING: More curl processes after decryption (${curl_after}) than before (${curl_before})" >&2
175+
echo "This might indicate orphaned processes, but checking connections is more reliable." >&2
176+
fi
177+
178+
echo "SUCCESS: No orphaned processes detected after SSS decryption cleanup"
179+
force_exit 0

0 commit comments

Comments
 (0)