Skip to content

Commit fe5cc93

Browse files
sss: kill entire process group during cleanup
When using SSS with multiple pins (e.g., tang) and threshold < n, after successful decryption with t pins, the remaining child processes and their grandchildren (like curl) were not being killed. The fix uses process groups: - Each child process calls setpgid(0, 0) to become a process group leader - Parent also calls setpgid() to eliminate race condition - Any grandchildren inherit the same process group - Cleanup uses kill(-pid, SIGTERM) to kill the entire process group - Falls back to direct kill if process group doesn't exist Signed-off-by: Sergio Correia <scorreia@redhat.com>
1 parent 2ad7027 commit fe5cc93

File tree

5 files changed

+211
-2
lines changed

5 files changed

+211
-2
lines changed

src/pins/sss/clevis-decrypt-sss.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,13 @@ main(int argc, char *argv[])
314314
fclose(pin->file);
315315

316316
if (pin->pid > 0) {
317-
kill(pin->pid, SIGTERM);
317+
/*
318+
* Kill entire process group (negative PID). This ensures
319+
* grandchildren (e.g., curl spawned by clevis-decrypt-tang)
320+
* are terminated. Fall back to direct kill if group doesn't exist.
321+
*/
322+
if (kill(-pin->pid, SIGTERM) < 0)
323+
(void) kill(pin->pid, SIGTERM);
318324
waitpid(pin->pid, NULL, 0);
319325
}
320326

src/pins/sss/meson.build

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ if jansson.found() and libcrypto.found()
2121
env.prepend('PATH',
2222
join_paths(meson.source_root(), 'src'),
2323
join_paths(meson.source_root(), 'src', 'pins', 'tang'),
24+
join_paths(meson.build_root(), 'src', 'pins', 'tang', 'tests'),
2425
meson.current_build_dir(),
2526
'/usr/libexec',
2627
libexecdir,
@@ -33,6 +34,8 @@ if jansson.found() and libcrypto.found()
3334

3435
test('pin-sss', find_program(join_paths(src, 'pin-sss')), env: env)
3536
test('pin-null', find_program(join_paths(src, 'pin-null')), env: env)
37+
38+
subdir('tests')
3639
else
3740
warning('Will not install sss pin due to missing dependencies!')
3841
endif

src/pins/sss/sss.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,10 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid)
360360
goto error;
361361

362362
if (*pid == 0) {
363+
/* Create a new process group so we can kill all descendants */
364+
if (setpgid(0, 0) < 0)
365+
exit(EXIT_FAILURE);
366+
363367
if (dup2(dump[PIPE_RD], STDIN_FILENO) < 0 ||
364368
dup2(load[PIPE_WR], STDOUT_FILENO) < 0)
365369
exit(EXIT_FAILURE);
@@ -368,6 +372,11 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid)
368372
exit(EXIT_FAILURE);
369373
}
370374

375+
/* Also set from parent to eliminate race with child's setpgid().
376+
* This may fail if the child has already called setpgid(0,0) or exec'd,
377+
* which is fine - the important thing is that one of them succeeds. */
378+
(void) setpgid(*pid, *pid);
379+
371380
for (const uint8_t *tmp = buf; len > 0; tmp += wr, len -= wr) {
372381
wr = write(dump[PIPE_WR], tmp, len);
373382
if (wr < 0)
@@ -390,7 +399,13 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid)
390399
close(load[PIPE_WR]);
391400

392401
if (*pid > 0) {
393-
kill(*pid, SIGTERM);
402+
/*
403+
* Kill entire process group (negative PID). This ensures
404+
* grandchildren (e.g., curl spawned by clevis-decrypt-tang)
405+
* are terminated. Fall back to direct kill if group doesn't exist.
406+
*/
407+
if (kill(-*pid, SIGTERM) < 0)
408+
(void) kill(*pid, SIGTERM);
394409
waitpid(*pid, NULL, 0);
395410
*pid = 0;
396411
}

src/pins/sss/tests/meson.build

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
lsof = find_program('lsof', required: false)
2+
pgrep = find_program('pgrep', required: false)
3+
4+
if lsof.found() and pgrep.found()
5+
test('sss-child-process-cleanup', find_program('sss-child-process-cleanup'), env: env, timeout: 30)
6+
else
7+
warning('Will not run sss-child-process-cleanup test due to missing dependencies (lsof, pgrep)!')
8+
endif
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#!/bin/bash -e
2+
# vim: set tabstop=8 shiftwidth=4 softtabstop=4 expandtab smarttab colorcolumn=80:
3+
#
4+
# Copyright (c) 2025 Red Hat, Inc.
5+
# Author: Sergio Correia <scorreia@redhat.com>
6+
#
7+
# This program is free software: you can redistribute it and/or modify
8+
# it under the terms of the GNU General Public License as published by
9+
# the Free Software Foundation, either version 3 of the License, or
10+
# (at your option) any later version.
11+
#
12+
# This program is distributed in the hope that it will be useful,
13+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
# GNU General Public License for more details.
16+
#
17+
# You should have received a copy of the GNU General Public License
18+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
#
20+
# Test to verify that clevis-decrypt-sss properly kills all child processes
21+
# and their descendants (grandchildren) after successful decryption.
22+
#
23+
# This test addresses issue #460:
24+
# https://github.com/latchset/clevis/issues/460
25+
#
26+
# The issue: When using SSS with multiple pins (e.g., tang, fido2) and t < n,
27+
# after successful decryption with t pins, the remaining child processes
28+
# (and their grandchildren like curl or fido2-assert) were not being killed.
29+
#
30+
# The fix uses process groups: each child becomes a process group leader
31+
# via setpgid(0,0), and cleanup uses kill(-pid, SIGTERM) to kill the
32+
# entire process group.
33+
#
34+
# Test strategy:
35+
# 1. Start a real tang server that responds quickly
36+
# 2. Start a "hanging" server that accepts connections but never responds
37+
# 3. Encrypt with SSS t=1, two tang pins (real + hanging)
38+
# 4. Decrypt - the real tang succeeds, the hanging one's child should be killed
39+
# 5. Verify no orphaned curl processes remain after decryption
40+
41+
. tang-common-test-functions
42+
43+
TMP="$(mktemp -d)"
44+
45+
# Force exit - kills background jobs first, then exits
46+
force_exit() {
47+
local code="$1"
48+
trap - EXIT # Disable trap to prevent recursion
49+
50+
# Kill all child processes to prevent bash from waiting
51+
# Use both pkill and direct kill to be thorough
52+
pkill -9 -P $$ 2>/dev/null || true
53+
jobs -p 2>/dev/null | xargs -r kill -9 2>/dev/null || true
54+
55+
if [ "${code}" -eq 0 ]; then
56+
# Success - normal exit after killing children
57+
exit 0
58+
else
59+
# Failure - kill entire process group for immediate exit
60+
kill -9 -$$ 2>/dev/null || kill -9 $$ 2>/dev/null || exit "${code}"
61+
fi
62+
}
63+
64+
# No cleanup trap - we'll force kill on exit
65+
66+
# Start a real tang server
67+
tang_run "${TMP}" sig exc
68+
port_real=$(tang_get_port "${TMP}")
69+
url_real="http://localhost:${port_real}"
70+
71+
# Get the advertisement from the real tang server
72+
adv="${TMP}/adv.jws"
73+
tang_get_adv "${port_real}" "${adv}"
74+
75+
# Start a "hanging" server - accepts TCP connections but never sends HTTP response.
76+
# This simulates a tang server that is unreachable or very slow.
77+
# We use socat to listen and spawn a process that just sleeps forever.
78+
# Use disown to prevent bash from waiting for this background job on exit.
79+
"${SOCAT}" TCP4-LISTEN:0,fork EXEC:"sleep 3600" &
80+
hang_pid=$!
81+
disown "${hang_pid}"
82+
echo "${hang_pid}" > "${TMP}/hang.pid"
83+
84+
# Wait for the hanging server to start listening
85+
sleep 0.5
86+
port_hang=$(lsof -nP -a -p "${hang_pid}" -iTCP -sTCP:LISTEN -Fn 2>/dev/null \
87+
| grep '^n.*:' | head -1 | sed 's/.*://')
88+
89+
if [ -z "${port_hang}" ]; then
90+
echo "Failed to start hanging server" >&2
91+
force_exit 1
92+
fi
93+
94+
url_hang="http://localhost:${port_hang}"
95+
96+
echo "Real tang server on port ${port_real}"
97+
echo "Hanging server on port ${port_hang}"
98+
99+
# Encrypt with SSS: t=1, two tang pins
100+
# Both pins use the same advertisement (from the real server) so encryption works.
101+
# During decryption, one will contact the real server (succeeds) and one will
102+
# contact the hanging server (curl hangs waiting for response).
103+
cfg=$(cat <<EOF
104+
{
105+
"t": 1,
106+
"pins": {
107+
"tang": [
108+
{"url": "${url_real}", "adv": "${adv}"},
109+
{"url": "${url_hang}", "adv": "${adv}"}
110+
]
111+
}
112+
}
113+
EOF
114+
)
115+
116+
echo "Encrypting test data..."
117+
enc="$(echo -n "test-cleanup-data" | clevis encrypt sss "${cfg}" -y)"
118+
echo "Encryption successful."
119+
120+
# Record curl processes before decryption
121+
curl_before=$(pgrep -x curl 2>/dev/null | wc -l || echo 0)
122+
123+
echo "Starting decryption... ($(date +%T))"
124+
125+
# Decrypt with a timeout. The real tang should respond quickly, so decryption
126+
# should complete in a few seconds. The hanging server's child process
127+
# (and its curl grandchild) should be killed after decryption succeeds.
128+
#
129+
# Without the fix: curl connecting to the hanging server would remain running
130+
# after clevis-decrypt-sss exits (orphaned to init).
131+
#
132+
# With the fix: the entire process group is killed, including curl.
133+
134+
# Decrypt the data. Without the fix, decryption succeeds but leaves orphaned
135+
# grandchild processes (curl) which we detect below.
136+
dec=""
137+
if ! dec="$(echo -n "${enc}" | clevis decrypt)"; then
138+
echo "FAIL: Decryption failed" >&2
139+
force_exit 1
140+
fi
141+
142+
if [ "${dec}" != "test-cleanup-data" ]; then
143+
echo "FAIL: Decrypted data mismatch: got '${dec}'" >&2
144+
force_exit 1
145+
fi
146+
147+
echo "Decryption successful: '${dec}' ($(date +%T))"
148+
149+
# Give a moment for process cleanup
150+
sleep 0.5
151+
152+
# Check for orphaned curl processes connecting to the hanging server.
153+
# Use lsof to find any process with a connection to port_hang.
154+
# -n: no hostname resolution (faster)
155+
# -P: no port name resolution (faster)
156+
# -t: terse output (just PIDs)
157+
orphaned_connections=$(lsof -nP -i "TCP:${port_hang}" -t 2>/dev/null | grep -v "^${hang_pid}$" || true)
158+
159+
if [ -n "${orphaned_connections}" ]; then
160+
echo "FAIL: Found orphaned processes connecting to hanging server:" >&2
161+
echo "${orphaned_connections}" >&2
162+
force_exit 1
163+
fi
164+
165+
# Also check that we don't have more curl processes than before
166+
# (accounting for possible legitimate curl usage)
167+
curl_after=$(pgrep -x curl 2>/dev/null | wc -l || echo 0)
168+
169+
# We expect curl_after <= curl_before (the curl that was connecting to hang
170+
# server should have been killed)
171+
if [ "${curl_after}" -gt "${curl_before}" ]; then
172+
echo "WARNING: More curl processes after decryption (${curl_after}) than before (${curl_before})" >&2
173+
echo "This might indicate orphaned processes, but checking connections is more reliable." >&2
174+
fi
175+
176+
echo "SUCCESS: No orphaned processes detected after SSS decryption cleanup"
177+
force_exit 0

0 commit comments

Comments
 (0)