From 0c52eb13c247d2a59b66f027a24bb473c6e6d3c6 Mon Sep 17 00:00:00 2001 From: reggie-k Date: Thu, 25 Sep 2025 18:09:25 +0300 Subject: [PATCH 1/2] print process group extra info Signed-off-by: reggie-k --- util/exec/exec.go | 268 +++++++++++++++++++++++++++++++++++++- util/exec/exec_test.go | 50 +++++++ util/exec/exec_unix.go | 13 ++ util/exec/exec_windows.go | 10 ++ 4 files changed, 338 insertions(+), 3 deletions(-) create mode 100644 util/exec/exec_unix.go create mode 100644 util/exec/exec_windows.go diff --git a/util/exec/exec.go b/util/exec/exec.go index 0df69aeba7f9b..5a5262949ab29 100644 --- a/util/exec/exec.go +++ b/util/exec/exec.go @@ -6,8 +6,10 @@ import ( "fmt" "os" "os/exec" + "path/filepath" "strconv" "strings" + "sync" "syscall" "time" "unicode" @@ -183,17 +185,97 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { args := strings.Join(cmd.Args, " ") logCtx.WithFields(logrus.Fields{"dir": cmd.Dir}).Info(redactor(args)) + // Capture process group information while processes are running + var capturedProcessInfo []string + var capturedProcessMutex sync.Mutex + + // Helper: debug whether HEAD.lock exists under the current working directory + logHeadLockStatus := func(where string) { + if cmd.Dir == "" { + return + } + lockPath := filepath.Join(cmd.Dir, ".git", "HEAD.lock") + fileInfo, statErr := os.Stat(lockPath) + exists := statErr == nil + fields := logrus.Fields{ + "headLockPath": lockPath, + "headLockExists": exists, + "where": where, + } + if exists { + fields["headLockSize"] = fileInfo.Size() + fields["headLockMode"] = fileInfo.Mode().String() + fields["headLockModTime"] = fileInfo.ModTime() + fields["headLockIsDir"] = fileInfo.IsDir() + } + + // Add process group information if the process has started + if cmd.Process != nil { + pgid := cmd.Process.Pid // Process group ID is the same as the main process PID when Setpgid=true + fields["processGroupId"] = pgid + + // Try to get current process group info + currentProcesses := getProcessGroupInfo(pgid) + if len(currentProcesses) > 0 && !strings.Contains(currentProcesses[0], "terminated or no processes found") { + fields["processGroupProcesses"] = currentProcesses + // Update captured info if we got fresh data + capturedProcessMutex.Lock() + capturedProcessInfo = currentProcesses + capturedProcessMutex.Unlock() + + // Check which processes might be related to the lock file + if exists { + lockDir := filepath.Dir(lockPath) + suspiciousProcesses := findProcessesInDirectory(currentProcesses, lockDir) + if len(suspiciousProcesses) > 0 { + fields["processesInLockDirectory"] = suspiciousProcesses + } + } + } else { + capturedProcessMutex.Lock() + if len(capturedProcessInfo) > 0 { + // Use previously captured info if current query failed + fields["processGroupProcesses"] = capturedProcessInfo + fields["processGroupProcessesNote"] = "captured during execution (process group has since terminated)" + + // Check captured processes for lock file relation + if exists { + lockDir := filepath.Dir(lockPath) + suspiciousProcesses := findProcessesInDirectory(capturedProcessInfo, lockDir) + if len(suspiciousProcesses) > 0 { + fields["processesInLockDirectory"] = suspiciousProcesses + fields["processesInLockDirectoryNote"] = "from captured process info" + } + } + } else { + fields["processGroupProcesses"] = currentProcesses + } + capturedProcessMutex.Unlock() + } + } + + logCtx.WithFields(fields).Info("HEAD.lock status") + } + var stdout bytes.Buffer var stderr bytes.Buffer cmd.Stdout = &stdout cmd.Stderr = &stderr + // Configure the child to run in its own process group so we can signal the whole group on timeout/cancel. + // On Unix this sets Setpgid; on Windows this is a no-op. + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = newSysProcAttr(true) + } + start := time.Now() err = cmd.Start() if err != nil { return "", err } + logHeadLockStatus("start-exec") + done := make(chan error) go func() { done <- cmd.Wait() }() @@ -228,15 +310,47 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { select { // noinspection ALL case <-timoutCh: + // Capture process group info RIGHT BEFORE sending timeout signal + if cmd.Process != nil { + pgid := cmd.Process.Pid + preTerminationProcesses := getProcessGroupInfo(pgid) + if len(preTerminationProcesses) > 0 && !strings.Contains(preTerminationProcesses[0], "terminated or no processes found") { + capturedProcessMutex.Lock() + capturedProcessInfo = preTerminationProcesses + capturedProcessMutex.Unlock() + logCtx.WithFields(logrus.Fields{ + "processGroupId": pgid, + "processGroupProcesses": preTerminationProcesses, + "capturePoint": "pre-timeout-signal", + }).Info("Process group info captured before timeout signal") + } + } // send timeout signal - _ = cmd.Process.Signal(timeoutBehavior.Signal) + // signal the process group (negative PID) so children are terminated as well + if cmd.Process != nil { + _ = sysCallSignal(-cmd.Process.Pid, timeoutBehavior.Signal) + } // wait on timeout signal and fallback to fatal timeout signal if timeoutBehavior.ShouldWait { select { case <-done: case <-fatalTimeoutCh: - // upgrades to SIGKILL if cmd does not respect SIGTERM - _ = cmd.Process.Signal(fatalTimeoutBehaviour) + // Capture process group info RIGHT BEFORE sending fatal signal + if cmd.Process != nil { + pgid := cmd.Process.Pid + preFatalProcesses := getProcessGroupInfo(pgid) + if len(preFatalProcesses) > 0 && !strings.Contains(preFatalProcesses[0], "terminated or no processes found") { + logCtx.WithFields(logrus.Fields{ + "processGroupId": pgid, + "processGroupProcesses": preFatalProcesses, + "capturePoint": "pre-fatal-signal", + }).Info("Process group info captured before fatal signal") + } + } + // upgrades to fatal signal (default SIGKILL) if cmd does not respect the initial signal + if cmd.Process != nil { + _ = sysCallSignal(-cmd.Process.Pid, fatalTimeoutBehaviour) + } // now original cmd should exit immediately after SIGKILL <-done // return error with a marker indicating that cmd exited only after fatal SIGKILL @@ -245,6 +359,7 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { output += stderr.String() } logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output)) + logHeadLockStatus("fatal-timeout") err = newCmdError(redactor(args), fmt.Errorf("fatal timeout after %v", timeout+fatalTimeout), "") logCtx.Error(err.Error()) return strings.TrimSuffix(output, "\n"), err @@ -256,10 +371,23 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { output += stderr.String() } logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output)) + logHeadLockStatus("timeout") err = newCmdError(redactor(args), fmt.Errorf("timeout after %v", timeout), "") logCtx.Error(err.Error()) return strings.TrimSuffix(output, "\n"), err case err := <-done: + // Capture process group info right when command finishes (might catch lingering processes) + if cmd.Process != nil { + pgid := cmd.Process.Pid + postExitProcesses := getProcessGroupInfo(pgid) + if len(postExitProcesses) > 0 && !strings.Contains(postExitProcesses[0], "terminated or no processes found") { + logCtx.WithFields(logrus.Fields{ + "processGroupId": pgid, + "processGroupProcesses": postExitProcesses, + "capturePoint": "post-command-exit", + }).Info("Process group info captured right after command exit") + } + } if err != nil { output := stdout.String() if opts.CaptureStderr { @@ -270,6 +398,7 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { if !opts.SkipErrorLogging { logCtx.Error(err.Error()) } + logHeadLockStatus("done-error") return strings.TrimSuffix(output, "\n"), err } } @@ -278,6 +407,7 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { output += stderr.String() } logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output)) + logHeadLockStatus("done-success") return strings.TrimSuffix(output, "\n"), nil } @@ -285,3 +415,135 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { func RunCommand(name string, opts CmdOpts, arg ...string) (string, error) { return RunCommandExt(exec.Command(name, arg...), opts) } + +// getProcessGroupInfo returns information about processes in the given process group +func getProcessGroupInfo(pgid int) []string { + if pgid <= 0 { + return nil + } + + // Use ps to get process group information with more details + psCmd := exec.Command("ps", "-o", "pid,ppid,pgid,etime,comm,args", "-g", strconv.Itoa(pgid)) + output, err := psCmd.Output() + + // ps returns exit status 1 when no processes are found in the process group + // This is normal behavior, not an error condition + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 1 { + // Exit code 1 typically means no processes found, check if we got header output + lines := strings.Split(strings.TrimSpace(string(output)), "\n") + if len(lines) <= 1 { + return []string{"Process group terminated or no processes found"} + } + // Continue processing the output even with exit code 1 + } else { + // Other types of errors (command not found, permission denied, etc.) + return []string{fmt.Sprintf("Failed to get process group info: %v", err)} + } + } + + lines := strings.Split(strings.TrimSpace(string(output)), "\n") + if len(lines) <= 1 { + return []string{"Process group terminated or no processes found"} + } + + // Skip header line and format the output + var processes []string + for i, line := range lines { + if i == 0 { + continue // Skip header + } + line = strings.TrimSpace(line) + if line == "" { + continue + } + + pid, ppid, pgidStr, elapsed, comm, args, ok := parsePsLineFivePlus(line) + if ok { + processInfo := fmt.Sprintf("PID=%s PPID=%s PGID=%s ELAPSED=%s COMM=%s ARGS=%s", + pid, ppid, pgidStr, elapsed, comm, args) + + // Add working directory information if available + if workDir := getProcessWorkingDir(pid); workDir != "" { + processInfo += fmt.Sprintf(" CWD=%s", workDir) + } + + processes = append(processes, processInfo) + } else { + processes = append(processes, fmt.Sprintf("Raw: %s", line)) + } + } + + if len(processes) == 0 { + return []string{"Process group terminated or no processes found"} + } + + return processes +} + +// getProcessWorkingDir returns the working directory of a process +func getProcessWorkingDir(pid string) string { + // Try to read the working directory from /proc/PID/cwd (Linux) or use lsof + if cwd, err := os.Readlink(fmt.Sprintf("/proc/%s/cwd", pid)); err == nil { + return cwd + } + + // Fallback to lsof on macOS/other systems + lsofCmd := exec.Command("lsof", "-p", pid, "-d", "cwd", "-Fn") + if output, err := lsofCmd.Output(); err == nil { + lines := strings.Split(string(output), "\n") + for _, line := range lines { + if strings.HasPrefix(line, "n") { + return strings.TrimPrefix(line, "n") + } + } + } + + return "" +} + +// findProcessesInDirectory finds processes that have their working directory in or under the specified directory +func findProcessesInDirectory(processes []string, targetDir string) []string { + var matches []string + for _, process := range processes { + if strings.Contains(process, fmt.Sprintf("CWD=%s", targetDir)) || + strings.Contains(process, fmt.Sprintf("CWD=%s/", targetDir)) { + matches = append(matches, process) + } + } + return matches +} + +// parsePsLineFivePlus splits a ps output line with at least 5 whitespace-separated fields, +// returning PID, PPID, PGID, ELAPSED, COMM, and the remaining ARGS (which may contain spaces). +func parsePsLineFivePlus(line string) (string, string, string, string, string, string, bool) { + // Extract first five fields, rest is ARGS + fields := make([]string, 0, 6) + start := -1 + inSpace := true + for i, r := range line { + if r == ' ' || r == '\t' { + if !inSpace { + fields = append(fields, line[start:i]) + if len(fields) == 5 { + rest := strings.TrimLeft(line[i+1:], " \t") + fields = append(fields, rest) + break + } + } + inSpace = true + } else { + if inSpace { + start = i + } + inSpace = false + } + } + if !inSpace && len(fields) < 5 && start >= 0 { + fields = append(fields, line[start:]) + } + if len(fields) < 6 { // need at least PID, PPID, PGID, ELAPSED, COMM, ARGS + return "", "", "", "", "", "", false + } + return fields[0], fields[1], fields[2], fields[3], fields[4], fields[5], true +} diff --git a/util/exec/exec_test.go b/util/exec/exec_test.go index d8113ce260762..ccfbc4b975578 100644 --- a/util/exec/exec_test.go +++ b/util/exec/exec_test.go @@ -1,7 +1,9 @@ package exec import ( + "os" "os/exec" + "path/filepath" "regexp" "syscall" "testing" @@ -217,3 +219,51 @@ func TestRunCaptureStderr(t *testing.T) { assert.Equal(t, "hello world\nmy-error", output) assert.NoError(t, err) } + +// This test demonstrates that when a process group is signaled, all child processes are also terminated and file locks released. +func TestProcessGroupSignalRemovesChildLock(t *testing.T) { + hook := test.NewGlobal() + log.SetLevel(log.DebugLevel) + defer log.SetLevel(log.InfoLevel) + + dir := t.TempDir() + lockFile := filepath.Join(dir, "lockfile") + childScript := filepath.Join(dir, "child.sh") + parentScript := filepath.Join(dir, "parent.sh") + + // Child: create lock file; on SIGTERM remove it and exit + child := "#!/bin/sh\n" + + "trap 'rm -f lockfile; exit 0' TERM\n" + + "touch lockfile\n" + + "sleep 100\n" + require.NoError(t, os.WriteFile(childScript, []byte(child), 0o755)) + + // Parent: start child in background and sleep + parent := "#!/bin/sh\n" + + "./child.sh &\n" + + "sleep 100\n" + require.NoError(t, os.WriteFile(parentScript, []byte(parent), 0o755)) + + // Run parent with a short timeout; our implementation signals the process group + opts := CmdOpts{ + Timeout: 500 * time.Millisecond, + FatalTimeout: 500 * time.Millisecond, + TimeoutBehavior: TimeoutBehavior{ + Signal: syscall.SIGTERM, + ShouldWait: true, + }, + } + _, err := RunCommand("sh", opts, "-c", "cd "+dir+" && ./parent.sh") + require.Error(t, err) + + // Give a bit of time for traps to run and for the process tree to settle + time.Sleep(200 * time.Millisecond) + + // Because the process group was signaled, the child should have removed the lock + _, statErr := os.Stat(lockFile) + require.Error(t, statErr, "expected lock file to be removed when process group is signaled") + assert.True(t, os.IsNotExist(statErr)) + + // basic sanity: logs produced + require.GreaterOrEqual(t, len(hook.Entries), 1) +} diff --git a/util/exec/exec_unix.go b/util/exec/exec_unix.go new file mode 100644 index 0000000000000..648a633e60e6e --- /dev/null +++ b/util/exec/exec_unix.go @@ -0,0 +1,13 @@ +//go:build !windows +// +build !windows + +package exec + +import "syscall" + +func newSysProcAttr(setpgid bool) *syscall.SysProcAttr { + return &syscall.SysProcAttr{Setpgid: setpgid} +} +func sysCallSignal(pid int, sig syscall.Signal) error { + return syscall.Kill(pid, sig) +} diff --git a/util/exec/exec_windows.go b/util/exec/exec_windows.go new file mode 100644 index 0000000000000..229ca37f31a5a --- /dev/null +++ b/util/exec/exec_windows.go @@ -0,0 +1,10 @@ +//go:build windows +// +build windows + +package exec + +import "syscall" + +func newSysProcAttr(_ bool) *syscall.SysProcAttr { return &syscall.SysProcAttr{} } + +func sysCallSignal(_ int, _ syscall.Signal) error { return nil } From c0b4453210687764c5a3c24eb03b516f6bcb48f5 Mon Sep 17 00:00:00 2001 From: reggie-k Date: Thu, 25 Sep 2025 18:15:29 +0300 Subject: [PATCH 2/2] without bin file --- util/exec/exec_test.go | 50 ------------------------------------------ 1 file changed, 50 deletions(-) diff --git a/util/exec/exec_test.go b/util/exec/exec_test.go index ccfbc4b975578..d8113ce260762 100644 --- a/util/exec/exec_test.go +++ b/util/exec/exec_test.go @@ -1,9 +1,7 @@ package exec import ( - "os" "os/exec" - "path/filepath" "regexp" "syscall" "testing" @@ -219,51 +217,3 @@ func TestRunCaptureStderr(t *testing.T) { assert.Equal(t, "hello world\nmy-error", output) assert.NoError(t, err) } - -// This test demonstrates that when a process group is signaled, all child processes are also terminated and file locks released. -func TestProcessGroupSignalRemovesChildLock(t *testing.T) { - hook := test.NewGlobal() - log.SetLevel(log.DebugLevel) - defer log.SetLevel(log.InfoLevel) - - dir := t.TempDir() - lockFile := filepath.Join(dir, "lockfile") - childScript := filepath.Join(dir, "child.sh") - parentScript := filepath.Join(dir, "parent.sh") - - // Child: create lock file; on SIGTERM remove it and exit - child := "#!/bin/sh\n" + - "trap 'rm -f lockfile; exit 0' TERM\n" + - "touch lockfile\n" + - "sleep 100\n" - require.NoError(t, os.WriteFile(childScript, []byte(child), 0o755)) - - // Parent: start child in background and sleep - parent := "#!/bin/sh\n" + - "./child.sh &\n" + - "sleep 100\n" - require.NoError(t, os.WriteFile(parentScript, []byte(parent), 0o755)) - - // Run parent with a short timeout; our implementation signals the process group - opts := CmdOpts{ - Timeout: 500 * time.Millisecond, - FatalTimeout: 500 * time.Millisecond, - TimeoutBehavior: TimeoutBehavior{ - Signal: syscall.SIGTERM, - ShouldWait: true, - }, - } - _, err := RunCommand("sh", opts, "-c", "cd "+dir+" && ./parent.sh") - require.Error(t, err) - - // Give a bit of time for traps to run and for the process tree to settle - time.Sleep(200 * time.Millisecond) - - // Because the process group was signaled, the child should have removed the lock - _, statErr := os.Stat(lockFile) - require.Error(t, statErr, "expected lock file to be removed when process group is signaled") - assert.True(t, os.IsNotExist(statErr)) - - // basic sanity: logs produced - require.GreaterOrEqual(t, len(hook.Entries), 1) -}