Skip to content

Commit b471a2c

Browse files
authored
Add ProcessLifecycle enum to control child process group behavior (#1014)
Replace disableNewProcessID bool with Isolated/Linked lifecycle modes. Linked mode keeps children in the parent's process group for natural signal propagation. Agent runner defaults to Linked, and server now cancels active WebSocket runs on shutdown. This will allow to tether the lifecycle of long running processes to the lifetime of the agent. Signed-off-by: Sebastian (Tiedtke) Huckleberry <sebastiantiedtke@gmail.com>
1 parent 75e41e4 commit b471a2c

File tree

7 files changed

+331
-24
lines changed

7 files changed

+331
-24
lines changed
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
//go:build !windows
2+
3+
package command
4+
5+
import (
6+
"bytes"
7+
"context"
8+
"fmt"
9+
"os"
10+
"path/filepath"
11+
"strconv"
12+
"strings"
13+
"syscall"
14+
"testing"
15+
"time"
16+
17+
"github.com/stretchr/testify/assert"
18+
"github.com/stretchr/testify/require"
19+
"go.uber.org/zap/zaptest"
20+
21+
runnerv2 "github.com/runmedev/runme/v3/api/gen/proto/go/runme/runner/v2"
22+
)
23+
24+
func pidAlive(pid int) bool {
25+
// On Unix, sending signal 0 to a process checks for existence.
26+
return syscall.Kill(pid, 0) == nil
27+
}
28+
29+
// readPIDFile reads a PID from a file, retrying until available or timeout.
30+
func readPIDFile(t *testing.T, path string, timeout time.Duration) int {
31+
t.Helper()
32+
deadline := time.Now().Add(timeout)
33+
for time.Now().Before(deadline) {
34+
data, err := os.ReadFile(path)
35+
if err == nil && len(bytes.TrimSpace(data)) > 0 {
36+
pid, err := strconv.Atoi(strings.TrimSpace(string(data)))
37+
if err == nil && pid > 0 {
38+
return pid
39+
}
40+
}
41+
time.Sleep(50 * time.Millisecond)
42+
}
43+
t.Fatalf("timed out waiting for PID in %s", path)
44+
return 0
45+
}
46+
47+
func TestProcessLifecycle_NativeLinked(t *testing.T) {
48+
t.Parallel()
49+
50+
tmpDir := t.TempDir()
51+
pidFile1 := filepath.Join(tmpDir, "pid1")
52+
53+
factory := NewFactory(
54+
WithLogger(zaptest.NewLogger(t)),
55+
WithProcessLifecycle(ProcessLifecycleLinked),
56+
)
57+
58+
// Use NoShell to bypass inline shell wrapper and its FIFO-based env collection.
59+
cfg := &ProgramConfig{
60+
ProgramName: "bash",
61+
Arguments: []string{"-c", fmt.Sprintf(
62+
`sleep 300 & echo $! > %s; wait`,
63+
pidFile1,
64+
)},
65+
Mode: runnerv2.CommandMode_COMMAND_MODE_INLINE,
66+
}
67+
68+
cmd, err := factory.Build(cfg, CommandOptions{NoShell: true})
69+
require.NoError(t, err)
70+
71+
ctx, cancel := context.WithCancel(context.Background())
72+
defer cancel()
73+
74+
require.NoError(t, cmd.Start(ctx))
75+
76+
childPid := readPIDFile(t, pidFile1, 5*time.Second)
77+
time.Sleep(100 * time.Millisecond)
78+
79+
// With Linked lifecycle, the child should be in the same process
80+
// group as the parent (no Setpgid). OS signals propagate naturally.
81+
parentPgid, err := syscall.Getpgid(os.Getpid())
82+
require.NoError(t, err)
83+
cmdPgid, err := syscall.Getpgid(cmd.Pid())
84+
require.NoError(t, err)
85+
assert.Equal(t, parentPgid, cmdPgid, "command should share parent's process group")
86+
87+
childPgid, err := syscall.Getpgid(childPid)
88+
require.NoError(t, err)
89+
assert.Equal(t, parentPgid, childPgid, "grandchild should share parent's process group")
90+
91+
// Clean up.
92+
cancel()
93+
_ = cmd.Wait(context.Background())
94+
_ = syscall.Kill(childPid, syscall.SIGKILL)
95+
}
96+
97+
func TestProcessLifecycle_NativeIsolated(t *testing.T) {
98+
t.Parallel()
99+
100+
tmpDir := t.TempDir()
101+
pidFile1 := filepath.Join(tmpDir, "pid1")
102+
pidFile2 := filepath.Join(tmpDir, "pid2")
103+
104+
factory := NewFactory(
105+
WithLogger(zaptest.NewLogger(t)),
106+
WithProcessLifecycle(ProcessLifecycleIsolated),
107+
)
108+
109+
cfg := &ProgramConfig{
110+
ProgramName: "bash",
111+
Arguments: []string{"-c", fmt.Sprintf(
112+
`sleep 300 & echo $! > %s; sleep 300 & echo $! > %s; wait`,
113+
pidFile1, pidFile2,
114+
)},
115+
Mode: runnerv2.CommandMode_COMMAND_MODE_INLINE,
116+
}
117+
118+
cmd, err := factory.Build(cfg, CommandOptions{NoShell: true})
119+
require.NoError(t, err)
120+
121+
ctx, cancel := context.WithCancel(context.Background())
122+
defer cancel()
123+
124+
require.NoError(t, cmd.Start(ctx))
125+
126+
pid1 := readPIDFile(t, pidFile1, 5*time.Second)
127+
pid2 := readPIDFile(t, pidFile2, 5*time.Second)
128+
129+
time.Sleep(100 * time.Millisecond)
130+
131+
// Kill only the main process PID directly (not the process group)
132+
// to simulate the default CommandContext behavior.
133+
mainPid := cmd.Pid()
134+
require.Greater(t, mainPid, 0)
135+
err = syscall.Kill(mainPid, syscall.SIGKILL)
136+
require.NoError(t, err)
137+
138+
// Cancel the context so the watchCtx goroutine can finish.
139+
cancel()
140+
141+
// Wait with a timeout — with isolated mode, the I/O pipe goroutines
142+
// may block because isolated children keep stdout open.
143+
waitDone := make(chan error, 1)
144+
go func() {
145+
waitDone <- cmd.Wait(context.Background())
146+
}()
147+
select {
148+
case <-waitDone:
149+
case <-time.After(5 * time.Second):
150+
// This is expected: isolated children keep pipes open.
151+
}
152+
153+
// Give the OS time to reap the main process.
154+
time.Sleep(200 * time.Millisecond)
155+
156+
// With isolated lifecycle, children should survive.
157+
alive1 := pidAlive(pid1)
158+
alive2 := pidAlive(pid2)
159+
160+
// Clean up isolated children so they don't leak.
161+
for _, pid := range []int{pid1, pid2} {
162+
_ = syscall.Kill(pid, syscall.SIGKILL)
163+
}
164+
165+
assert.True(t, alive1, "child1 should still be alive with ISOLATED lifecycle")
166+
assert.True(t, alive2, "child2 should still be alive with ISOLATED lifecycle")
167+
}
168+
169+
func TestProcessLifecycle_VirtualLinked(t *testing.T) {
170+
t.Parallel()
171+
172+
tmpDir := t.TempDir()
173+
pidFile1 := filepath.Join(tmpDir, "pid1")
174+
pidFile2 := filepath.Join(tmpDir, "pid2")
175+
176+
factory := NewFactory(
177+
WithLogger(zaptest.NewLogger(t)),
178+
WithProcessLifecycle(ProcessLifecycleLinked),
179+
)
180+
181+
cfg := &ProgramConfig{
182+
ProgramName: "bash",
183+
Arguments: []string{"-c", fmt.Sprintf(
184+
`sleep 300 & echo $! > %s; sleep 300 & echo $! > %s; wait`,
185+
pidFile1, pidFile2,
186+
)},
187+
Interactive: true,
188+
Mode: runnerv2.CommandMode_COMMAND_MODE_INLINE,
189+
}
190+
191+
cmd, err := factory.Build(cfg, CommandOptions{NoShell: true})
192+
require.NoError(t, err)
193+
194+
ctx, cancel := context.WithCancel(context.Background())
195+
defer cancel()
196+
197+
require.NoError(t, cmd.Start(ctx))
198+
199+
pid1 := readPIDFile(t, pidFile1, 5*time.Second)
200+
pid2 := readPIDFile(t, pidFile2, 5*time.Second)
201+
202+
time.Sleep(100 * time.Millisecond)
203+
204+
require.True(t, pidAlive(pid1), "child1 should be alive before cancel")
205+
require.True(t, pidAlive(pid2), "child2 should be alive before cancel")
206+
207+
cancel()
208+
_ = cmd.Wait(context.Background())
209+
210+
time.Sleep(200 * time.Millisecond)
211+
212+
assert.False(t, pidAlive(pid1), "child1 should be dead after cancel with LINKED lifecycle")
213+
assert.False(t, pidAlive(pid2), "child2 should be dead after cancel with LINKED lifecycle")
214+
}
215+
216+
func TestProcessLifecycle_CLIModeLinked(t *testing.T) {
217+
t.Parallel()
218+
219+
// Factory defaults to Isolated, but CLI mode overrides the lifecycle
220+
// to Linked so the child stays in the parent's process group.
221+
factory := NewFactory(
222+
WithLogger(zaptest.NewLogger(t)),
223+
)
224+
225+
cfg := &ProgramConfig{
226+
ProgramName: "echo",
227+
Arguments: []string{"-n", "test"},
228+
Mode: runnerv2.CommandMode_COMMAND_MODE_CLI,
229+
}
230+
231+
stdout := bytes.NewBuffer(nil)
232+
cmd, err := factory.Build(cfg, CommandOptions{Stdout: stdout})
233+
require.NoError(t, err)
234+
235+
require.NoError(t, cmd.Start(context.Background()))
236+
require.NoError(t, cmd.Wait(context.Background()))
237+
assert.Equal(t, "test", stdout.String())
238+
}

command/command_native.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ import (
1212
type nativeCommand struct {
1313
*base
1414

15-
disableNewProcessID bool
16-
logger *zap.Logger
15+
logger *zap.Logger
16+
processLifecycle ProcessLifecycle
1717

1818
cmd *exec.Cmd
1919
}
@@ -60,10 +60,15 @@ func (c *nativeCommand) Start(ctx context.Context) (err error) {
6060
c.cmd.Stdout = c.Stdout()
6161
c.cmd.Stderr = c.Stderr()
6262

63-
if !c.disableNewProcessID {
63+
switch c.processLifecycle {
64+
case ProcessLifecycleIsolated:
6465
// Creating a new process group is required to properly replicate a behaviour
6566
// similar to CTRL-C in the terminal, which sends a SIGINT to the whole group.
6667
setSysProcAttrPgid(c.cmd)
68+
case ProcessLifecycleLinked:
69+
// Child stays in parent's process group; OS signals propagate naturally.
70+
// WaitDelay ensures Wait returns even if grandchildren hold pipes open.
71+
c.cmd.WaitDelay = cmdWaitDelay
6772
}
6873

6974
c.logger.Info("starting", zap.Any("config", redactConfig(c.ProgramConfig())))
@@ -78,7 +83,7 @@ func (c *nativeCommand) Start(ctx context.Context) (err error) {
7883
func (c *nativeCommand) Signal(sig os.Signal) error {
7984
c.logger.Info("stopping with signal", zap.Stringer("signal", sig))
8085

81-
if !c.disableNewProcessID {
86+
if c.processLifecycle == ProcessLifecycleIsolated {
8287
c.logger.Info("signaling to the process group", zap.Stringer("signal", sig))
8388
// Try to terminate the whole process group. If it fails, fall back to stdlib methods.
8489
err := signalPgid(c.cmd.Process.Pid, sig)

command/command_virtual.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@ import (
1717
type virtualCommand struct {
1818
*base
1919

20-
isEchoEnabled bool
21-
logger *zap.Logger
22-
stdin io.ReadCloser // stdin is [CommandOptions.Stdin] wrapped in [readCloser]
20+
isEchoEnabled bool
21+
logger *zap.Logger
22+
processLifecycle ProcessLifecycle
23+
stdin io.ReadCloser // stdin is [CommandOptions.Stdin] wrapped in [readCloser]
2324

2425
// cmd is populated when the command is started.
2526
cmd *exec.Cmd
@@ -90,6 +91,13 @@ func (c *virtualCommand) Start(ctx context.Context) (err error) {
9091
setSysProcAttrCtty(c.cmd, 3)
9192
c.cmd.ExtraFiles = []*os.File{c.tty}
9293

94+
if c.processLifecycle == ProcessLifecycleLinked {
95+
c.cmd.Cancel = func() error {
96+
return signalPgid(c.cmd.Process.Pid, syscall.SIGKILL)
97+
}
98+
c.cmd.WaitDelay = cmdWaitDelay
99+
}
100+
93101
c.logger.Info("starting", zap.Any("config", redactConfig(c.ProgramConfig())))
94102
if err := c.cmd.Start(); err != nil {
95103
return errors.WithStack(err)

0 commit comments

Comments
 (0)