Skip to content

Commit c34dd2b

Browse files
sss: kill entire process group during decryption cleanup
When using SSS with multiple pins (e.g., tang) and threshold < n, after successful decryption with t pins, the remaining child processes and their grandchildren (like curl) were not being killed. The fix uses process groups for decryption only: - Add use_pgrp parameter to call() to control process group creation - For decryption: use_pgrp=true - child calls setpgid(0, 0) to become a process group leader, parent also calls setpgid() to eliminate race - For encryption: use_pgrp=false - no process groups needed since we wait for all children anyway - Cleanup uses kill(-pid, SIGTERM) to kill the entire process group - Falls back to direct kill if process group doesn't exist Signed-off-by: Sergio Correia <scorreia@redhat.com>
1 parent 2ad7027 commit c34dd2b

File tree

7 files changed

+230
-6
lines changed

7 files changed

+230
-6
lines changed

src/pins/sss/clevis-decrypt-sss.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ main(int argc, char *argv[])
191191
chldrn.next = pin;
192192

193193
pin->file = call(args, json_string_value(val),
194-
json_string_length(val), &pin->pid);
194+
json_string_length(val), &pin->pid, true);
195195
if (!pin->file)
196196
goto egress;
197197

@@ -314,7 +314,13 @@ main(int argc, char *argv[])
314314
fclose(pin->file);
315315

316316
if (pin->pid > 0) {
317-
kill(pin->pid, SIGTERM);
317+
/*
318+
* Kill entire process group (negative PID). This ensures
319+
* grandchildren (e.g., curl spawned by clevis-decrypt-tang)
320+
* are terminated. Fall back to direct kill if group doesn't exist.
321+
*/
322+
if (kill(-pin->pid, SIGTERM) < 0)
323+
(void) kill(pin->pid, SIGTERM);
318324
waitpid(pin->pid, NULL, 0);
319325
}
320326

src/pins/sss/clevis-encrypt-sss.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ encrypt_frag(json_t *sss, const char *pin, const json_t *cfg, int assume_yes)
109109
if (!pnt)
110110
return NULL;
111111

112-
pipe = call(args, pnt, pntl, &pid);
112+
pipe = call(args, pnt, pntl, &pid, false);
113113
OPENSSL_cleanse(pnt, pntl);
114114
free(pnt);
115115
if (!pipe)

src/pins/sss/meson.build

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ if jansson.found() and libcrypto.found()
2121
env.prepend('PATH',
2222
join_paths(meson.source_root(), 'src'),
2323
join_paths(meson.source_root(), 'src', 'pins', 'tang'),
24+
join_paths(meson.build_root(), 'src', 'pins', 'tang', 'tests'),
2425
meson.current_build_dir(),
2526
'/usr/libexec',
2627
libexecdir,
@@ -33,6 +34,8 @@ if jansson.found() and libcrypto.found()
3334

3435
test('pin-sss', find_program(join_paths(src, 'pin-sss')), env: env)
3536
test('pin-null', find_program(join_paths(src, 'pin-null')), env: env)
37+
38+
subdir('tests')
3639
else
3740
warning('Will not install sss pin due to missing dependencies!')
3841
endif

src/pins/sss/sss.c

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,8 @@ enum {
340340
};
341341

342342
FILE *
343-
call(char *const argv[], const void *buf, size_t len, pid_t *pid)
343+
call(char *const argv[], const void *buf, size_t len, pid_t *pid,
344+
bool use_pgrp)
344345
{
345346
int dump[2] = { -1, -1 };
346347
int load[2] = { -1, -1 };
@@ -360,6 +361,15 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid)
360361
goto error;
361362

362363
if (*pid == 0) {
364+
/*
365+
* Create a new process group so we can kill all descendants.
366+
* Only do this when use_pgrp is set (i.e., for decryption where
367+
* we may need to kill leftover children). For encryption, this
368+
* can cause issues with terminal job control in child scripts.
369+
*/
370+
if (use_pgrp && setpgid(0, 0) < 0)
371+
exit(EXIT_FAILURE);
372+
363373
if (dup2(dump[PIPE_RD], STDIN_FILENO) < 0 ||
364374
dup2(load[PIPE_WR], STDOUT_FILENO) < 0)
365375
exit(EXIT_FAILURE);
@@ -368,6 +378,13 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid)
368378
exit(EXIT_FAILURE);
369379
}
370380

381+
if (use_pgrp) {
382+
/* Also set from parent to eliminate race with child's setpgid().
383+
* This may fail if the child has already called setpgid(0,0) or
384+
* exec'd, which is fine - the important thing is one succeeds. */
385+
(void) setpgid(*pid, *pid);
386+
}
387+
371388
for (const uint8_t *tmp = buf; len > 0; tmp += wr, len -= wr) {
372389
wr = write(dump[PIPE_WR], tmp, len);
373390
if (wr < 0)
@@ -390,7 +407,18 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid)
390407
close(load[PIPE_WR]);
391408

392409
if (*pid > 0) {
393-
kill(*pid, SIGTERM);
410+
if (use_pgrp) {
411+
/*
412+
* Kill entire process group (negative PID). This ensures
413+
* grandchildren (e.g., curl spawned by clevis-decrypt-tang)
414+
* are terminated. Fall back to direct kill if group doesn't
415+
* exist.
416+
*/
417+
if (kill(-*pid, SIGTERM) < 0)
418+
(void) kill(*pid, SIGTERM);
419+
} else {
420+
(void) kill(*pid, SIGTERM);
421+
}
394422
waitpid(*pid, NULL, 0);
395423
*pid = 0;
396424
}

src/pins/sss/sss.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#pragma once
2121
#include <jansson.h>
22+
#include <stdbool.h>
2223
#include <stdint.h>
2324
#include <sys/types.h>
2425

@@ -32,4 +33,5 @@ json_t *
3233
sss_recover(const json_t *p, size_t npnts, const uint8_t *pnts[]);
3334

3435
FILE *
35-
call(char *const argv[], const void *buf, size_t len, pid_t *pid);
36+
call(char *const argv[], const void *buf, size_t len, pid_t *pid,
37+
bool use_pgrp);

src/pins/sss/tests/meson.build

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
lsof = find_program('lsof', required: false)
2+
pgrep = find_program('pgrep', required: false)
3+
4+
if lsof.found() and pgrep.found()
5+
test('sss-child-process-cleanup', find_program('sss-child-process-cleanup'), env: env, timeout: 30)
6+
else
7+
warning('Will not run sss-child-process-cleanup test due to missing dependencies (lsof, pgrep)!')
8+
endif
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#!/bin/bash -e
2+
# vim: set tabstop=8 shiftwidth=4 softtabstop=4 expandtab smarttab colorcolumn=80:
3+
#
4+
# Copyright (c) 2025 Red Hat, Inc.
5+
# Author: Sergio Correia <scorreia@redhat.com>
6+
#
7+
# This program is free software: you can redistribute it and/or modify
8+
# it under the terms of the GNU General Public License as published by
9+
# the Free Software Foundation, either version 3 of the License, or
10+
# (at your option) any later version.
11+
#
12+
# This program is distributed in the hope that it will be useful,
13+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
# GNU General Public License for more details.
16+
#
17+
# You should have received a copy of the GNU General Public License
18+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
#
20+
# Test to verify that clevis-decrypt-sss properly kills all child processes
21+
# and their descendants (grandchildren) after successful decryption.
22+
#
23+
# This test addresses issue #460:
24+
# https://github.com/latchset/clevis/issues/460
25+
#
26+
# The issue: When using SSS with multiple pins (e.g., tang, fido2) and t < n,
27+
# after successful decryption with t pins, the remaining child processes
28+
# (and their grandchildren like curl or fido2-assert) were not being killed.
29+
#
30+
# The fix uses process groups: each child becomes a process group leader
31+
# via setpgid(0,0), and cleanup uses kill(-pid, SIGTERM) to kill the
32+
# entire process group.
33+
#
34+
# Test strategy:
35+
# 1. Start a real tang server that responds quickly
36+
# 2. Start a "hanging" server that accepts connections but never responds
37+
# 3. Encrypt with SSS t=1, two tang pins (real + hanging)
38+
# 4. Decrypt - the real tang succeeds, the hanging one's child should be killed
39+
# 5. Verify no orphaned curl processes remain after decryption
40+
41+
. tang-common-test-functions
42+
43+
TMP="$(mktemp -d)"
44+
45+
# Force exit - kills background jobs first, then exits
46+
force_exit() {
47+
local code="$1"
48+
trap - EXIT # Disable trap to prevent recursion
49+
50+
# Kill all child processes to prevent bash from waiting
51+
# Use both pkill and direct kill to be thorough
52+
pkill -9 -P $$ 2>/dev/null || true
53+
jobs -p 2>/dev/null | xargs -r kill -9 2>/dev/null || true
54+
55+
if [ "${code}" -eq 0 ]; then
56+
# Success - normal exit after killing children
57+
exit 0
58+
else
59+
# Failure - kill entire process group for immediate exit
60+
kill -9 -$$ 2>/dev/null || kill -9 $$ 2>/dev/null || exit "${code}"
61+
fi
62+
}
63+
64+
# No cleanup trap - we'll force kill on exit
65+
66+
# Start a real tang server
67+
tang_run "${TMP}" sig exc
68+
port_real=$(tang_get_port "${TMP}")
69+
url_real="http://localhost:${port_real}"
70+
71+
# Get the advertisement from the real tang server
72+
adv="${TMP}/adv.jws"
73+
tang_get_adv "${port_real}" "${adv}"
74+
75+
# Start a "hanging" server - accepts TCP connections but never sends HTTP response.
76+
# This simulates a tang server that is unreachable or very slow.
77+
# We use socat to listen and spawn a process that just sleeps forever.
78+
# Use disown to prevent bash from waiting for this background job on exit.
79+
"${SOCAT}" TCP4-LISTEN:0,fork EXEC:"sleep 3600" &
80+
hang_pid=$!
81+
disown "${hang_pid}"
82+
echo "${hang_pid}" > "${TMP}/hang.pid"
83+
84+
# Wait for the hanging server to start listening
85+
sleep 0.5
86+
port_hang=$(lsof -nP -a -p "${hang_pid}" -iTCP -sTCP:LISTEN -Fn 2>/dev/null \
87+
| grep '^n.*:' | head -1 | sed 's/.*://')
88+
89+
if [ -z "${port_hang}" ]; then
90+
echo "Failed to start hanging server" >&2
91+
force_exit 1
92+
fi
93+
94+
url_hang="http://localhost:${port_hang}"
95+
96+
echo "Real tang server on port ${port_real}"
97+
echo "Hanging server on port ${port_hang}"
98+
99+
# Encrypt with SSS: t=1, two tang pins
100+
# Both pins use the same advertisement (from the real server) so encryption works.
101+
# During decryption, one will contact the real server (succeeds) and one will
102+
# contact the hanging server (curl hangs waiting for response).
103+
cfg=$(cat <<EOF
104+
{
105+
"t": 1,
106+
"pins": {
107+
"tang": [
108+
{"url": "${url_real}", "adv": "${adv}"},
109+
{"url": "${url_hang}", "adv": "${adv}"}
110+
]
111+
}
112+
}
113+
EOF
114+
)
115+
116+
echo "Encrypting test data..."
117+
enc="$(echo -n "test-cleanup-data" | clevis encrypt sss "${cfg}" -y)"
118+
echo "Encryption successful."
119+
120+
# Record curl processes before decryption
121+
curl_before=$(pgrep -x curl 2>/dev/null | wc -l || echo 0)
122+
123+
echo "Starting decryption... ($(date +%T))"
124+
125+
# Decrypt with a timeout. The real tang should respond quickly, so decryption
126+
# should complete in a few seconds. The hanging server's child process
127+
# (and its curl grandchild) should be killed after decryption succeeds.
128+
#
129+
# Without the fix: curl connecting to the hanging server would remain running
130+
# after clevis-decrypt-sss exits (orphaned to init).
131+
#
132+
# With the fix: the entire process group is killed, including curl.
133+
134+
# Decrypt the data. Without the fix, decryption succeeds but leaves orphaned
135+
# grandchild processes (curl) which we detect below.
136+
dec=""
137+
if ! dec="$(echo -n "${enc}" | clevis decrypt)"; then
138+
echo "FAIL: Decryption failed" >&2
139+
force_exit 1
140+
fi
141+
142+
if [ "${dec}" != "test-cleanup-data" ]; then
143+
echo "FAIL: Decrypted data mismatch: got '${dec}'" >&2
144+
force_exit 1
145+
fi
146+
147+
echo "Decryption successful: '${dec}' ($(date +%T))"
148+
149+
# Give a moment for process cleanup
150+
sleep 0.5
151+
152+
# Check for orphaned curl processes connecting to the hanging server.
153+
# Use lsof to find any process with a connection to port_hang.
154+
# -n: no hostname resolution (faster)
155+
# -P: no port name resolution (faster)
156+
# -t: terse output (just PIDs)
157+
orphaned_connections=$(lsof -nP -i "TCP:${port_hang}" -t 2>/dev/null | grep -v "^${hang_pid}$" || true)
158+
159+
if [ -n "${orphaned_connections}" ]; then
160+
echo "FAIL: Found orphaned processes connecting to hanging server:" >&2
161+
echo "${orphaned_connections}" >&2
162+
force_exit 1
163+
fi
164+
165+
# Also check that we don't have more curl processes than before
166+
# (accounting for possible legitimate curl usage)
167+
curl_after=$(pgrep -x curl 2>/dev/null | wc -l || echo 0)
168+
169+
# We expect curl_after <= curl_before (the curl that was connecting to hang
170+
# server should have been killed)
171+
if [ "${curl_after}" -gt "${curl_before}" ]; then
172+
echo "WARNING: More curl processes after decryption (${curl_after}) than before (${curl_before})" >&2
173+
echo "This might indicate orphaned processes, but checking connections is more reliable." >&2
174+
fi
175+
176+
echo "SUCCESS: No orphaned processes detected after SSS decryption cleanup"
177+
force_exit 0

0 commit comments

Comments
 (0)