Skip to content

Commit 400235e

Browse files
authored
[RSDK-13495] Log orphaned processes (#210)
1 parent f25c4dc commit 400235e

File tree

4 files changed

+206
-0
lines changed

4 files changed

+206
-0
lines changed

manager.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,42 @@ func NewManager(ctx context.Context, logger logging.Logger, cfg utils.AgentConfi
8383
cache: NewVersionCache(logger),
8484
agentStartTime: time.Now(),
8585
}
86+
87+
preexistingProcesses := manager.findPreexistingViamServerProcesses(ctx)
88+
if len(preexistingProcesses) > 0 {
89+
logger.Warnw(
90+
"found process(es) from before agent startup still running; will log every minute while they remain",
91+
"processes", preexistingProcesses,
92+
)
93+
manager.activeBackgroundWorkers.Go(func() {
94+
defer utils.Recover(manager.logger, nil)
95+
96+
remaining := preexistingProcesses
97+
ticker := time.NewTicker(time.Minute)
98+
defer ticker.Stop()
99+
for {
100+
select {
101+
case <-ctx.Done():
102+
return
103+
case <-ticker.C:
104+
var stillRunning []utils.Process
105+
for _, proc := range remaining {
106+
if utils.IsProcessAlive(proc.PID) {
107+
stillRunning = append(stillRunning, proc)
108+
}
109+
}
110+
if len(stillRunning) == 0 {
111+
manager.logger.Info("all process(es) from before agent startup have exited")
112+
return
113+
}
114+
manager.logger.Warnw("process(es) from before agent startup are still running",
115+
"processes", stillRunning)
116+
remaining = stillRunning
117+
}
118+
}
119+
})
120+
}
121+
86122
manager.setDebug(cfg.AdvancedSettings.Debug.Get())
87123
manager.sysConfig = syscfg.New(
88124
ctx,
@@ -803,6 +839,27 @@ func (m *Manager) getVersions() *pb.VersionInfo {
803839
return vers
804840
}
805841

842+
// findPreexistingViamServerProcesses returns any viam-server processes already running when the
843+
// agent starts, along with their module children. Returns nil if none are found.
844+
func (m *Manager) findPreexistingViamServerProcesses(ctx context.Context) []utils.Process {
845+
pids, err := utils.FindProcessesByName(ctx, viamserver.SubsysName)
846+
if err != nil {
847+
m.logger.Warnw("error checking for preexisting viam-server processes", "err", err)
848+
return nil
849+
}
850+
var all []utils.Process
851+
for _, pid := range pids {
852+
all = append(all, utils.Process{PID: pid, Name: viamserver.SubsysName})
853+
children, err := utils.FindChildProcesses(ctx, pid)
854+
if err != nil {
855+
m.logger.Warnw("error checking for module processes under preexisting viam-server", "pid", pid, "err", err)
856+
continue
857+
}
858+
all = append(all, children...)
859+
}
860+
return all
861+
}
862+
806863
func (m *Manager) Exit(reason string) {
807864
m.logger.Infow(fmt.Sprintf("%s will now exit to be restarted by service manager", SubsystemName), "reason", reason)
808865
m.globalCancel()

utils/utils.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,12 @@ var (
6464
maxBytesForTesting int64
6565
)
6666

67+
// Process holds identifying info about a running process.
type Process struct {
	// PID is the operating-system process ID.
	PID int
	// Name is the process name as reported by the platform's process listing
	// (or the name that was searched for, when the process was found by name).
	Name string
}
72+
6773
type ViamDirsData struct {
6874
Viam string
6975
Bin string

utils/utils_unix.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
"os"
1010
"os/exec"
1111
"path/filepath"
12+
"strconv"
13+
"strings"
1214
"syscall"
1315

1416
errw "github.com/pkg/errors"
@@ -50,6 +52,60 @@ func KillTree(ctx context.Context, pid int) error {
5052
return nil
5153
}
5254

55+
// FindProcessesByName returns the PIDs of all running processes whose name is exactly name.
//
// The lookup is delegated to `pgrep -x`. When nothing matches, both the slice and the error
// are nil; any other failure to run pgrep is returned unchanged.
func FindProcessesByName(ctx context.Context, name string) ([]int, error) {
	output, runErr := exec.CommandContext(ctx, "pgrep", "-x", name).Output()
	if runErr != nil {
		// Exit status 1 is pgrep's "no process matched" result, not a failure.
		var exitErr *exec.ExitError
		if errors.As(runErr, &exitErr) && exitErr.ExitCode() == 1 {
			return nil, nil
		}
		return nil, runErr
	}

	var matches []int
	for _, row := range strings.Split(strings.TrimSpace(string(output)), "\n") {
		// Rows that are not a bare integer are ignored rather than treated as an error.
		if pid, convErr := strconv.Atoi(strings.TrimSpace(row)); convErr == nil {
			matches = append(matches, pid)
		}
	}
	return matches, nil
}
76+
77+
// FindChildProcesses returns the direct child processes of parentPID.
78+
func FindChildProcesses(ctx context.Context, parentPID int) ([]Process, error) {
79+
//nolint:gosec
80+
out, err := exec.CommandContext(ctx, "pgrep", "-l", "-P", strconv.Itoa(parentPID)).Output()
81+
if err != nil {
82+
// pgrep exits with code 1 when no processes are found — not an error for us.
83+
var exitErr *exec.ExitError
84+
if errors.As(err, &exitErr) && exitErr.ExitCode() == 1 {
85+
return nil, nil
86+
}
87+
return nil, err
88+
}
89+
var children []Process
90+
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
91+
parts := strings.SplitN(strings.TrimSpace(line), " ", 2)
92+
if len(parts) != 2 {
93+
continue
94+
}
95+
pid, err := strconv.Atoi(parts[0])
96+
if err != nil {
97+
continue
98+
}
99+
children = append(children, Process{PID: pid, Name: parts[1]})
100+
}
101+
return children, nil
102+
}
103+
104+
// IsProcessAlive returns true if the process with the given PID is still running.
//
// The process is probed with signal 0, which makes the kernel perform its existence and
// permission checks without delivering any signal. An EPERM result is treated as alive:
// it means the process exists but the caller is not allowed to signal it.
//
// Non-positive PIDs always yield false. kill(2) does not interpret them as a single process
// (0 addresses the caller's process group, negative values address a group or every process),
// so probing them would report "alive" for something that is not a PID.
func IsProcessAlive(pid int) bool {
	if pid <= 0 {
		return false
	}
	err := syscall.Kill(pid, 0)
	return err == nil || errors.Is(err, syscall.EPERM)
}
108+
53109
func SyncFS(syncPath string) (errRet error) {
54110
file, errRet := os.Open(filepath.Dir(syncPath))
55111
if errRet != nil {

utils/utils_windows.go

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,14 @@ package utils
22

33
import (
44
"context"
5+
"fmt"
56
"io/fs"
67
"os"
78
"os/exec"
89
"strconv"
10+
"strings"
911
"syscall"
12+
"unsafe"
1013

1114
errw "github.com/pkg/errors"
1215
goutils "go.viam.com/utils"
@@ -52,6 +55,90 @@ func SyncFS(syncPath string) error {
5255
return nil
5356
}
5457

58+
// FindProcessesByName returns the PIDs of all running processes whose image name is exactly
// name with ".exe" appended.
//
// The lookup is delegated to `tasklist` with an IMAGENAME filter and header-less CSV output.
// Output rows that do not carry a numeric PID in their second column are skipped, so a run
// that matches nothing yields a nil slice and a nil error.
func FindProcessesByName(ctx context.Context, name string) ([]int, error) {
	imageFilter := "IMAGENAME eq " + name + ".exe"
	//nolint:gosec
	raw, runErr := exec.CommandContext(ctx, "tasklist", "/FI", imageFilter, "/FO", "CSV", "/NH").Output()
	if runErr != nil {
		return nil, runErr
	}

	var pids []int
	for _, row := range strings.Split(string(raw), "\n") {
		// CSV format: "name.exe","1234","Console","1","10,000 K"
		// Blank rows have fewer than two columns and are dropped by the same check.
		columns := strings.SplitN(strings.TrimSpace(row), ",", 3)
		if len(columns) < 2 {
			continue
		}
		if pid, convErr := strconv.Atoi(strings.Trim(columns[1], `"`)); convErr == nil {
			pids = append(pids, pid)
		}
	}
	return pids, nil
}
85+
86+
// FindChildProcesses returns the direct child processes of parentPID using the Windows API.
87+
func FindChildProcesses(_ context.Context, parentPID int) ([]Process, error) {
88+
snapshot, err := windows.CreateToolhelp32Snapshot(windows.TH32CS_SNAPPROCESS, 0)
89+
if err != nil {
90+
return nil, err
91+
}
92+
defer func() {
93+
if err := windows.CloseHandle(snapshot); err != nil {
94+
fmt.Fprintf(os.Stderr, "utils: error closing snapshot handle: %v\n", err)
95+
}
96+
}()
97+
98+
var entry windows.ProcessEntry32
99+
entry.Size = uint32(unsafe.Sizeof(entry))
100+
if err := windows.Process32First(snapshot, &entry); err != nil {
101+
return nil, err
102+
}
103+
104+
var children []Process
105+
for {
106+
if int(entry.ParentProcessID) == parentPID {
107+
children = append(children, Process{
108+
PID: int(entry.ProcessID),
109+
Name: windows.UTF16ToString(entry.ExeFile[:]),
110+
})
111+
}
112+
if err := windows.Process32Next(snapshot, &entry); err != nil {
113+
break
114+
}
115+
}
116+
return children, nil
117+
}
118+
119+
// stillActive is the value returned by GetExitCodeProcess for a still-running process (STILL_ACTIVE / STATUS_PENDING).
120+
// See: https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getexitcodeprocess
121+
const stillActive = 259
122+
123+
// IsProcessAlive returns true if the process with the given PID is still running.
124+
func IsProcessAlive(pid int) bool {
125+
//nolint:gosec
126+
h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid))
127+
if err != nil {
128+
return false
129+
}
130+
defer func() {
131+
if err := windows.CloseHandle(h); err != nil {
132+
fmt.Fprintf(os.Stderr, "utils: error closing process handle for pid %d: %v\n", pid, err)
133+
}
134+
}()
135+
var exitCode uint32
136+
if err := windows.GetExitCodeProcess(h, &exitCode); err != nil {
137+
return false
138+
}
139+
return exitCode == stillActive
140+
}
141+
55142
func SignalForTermination(pid int) error {
56143
return windows.GenerateConsoleCtrlEvent(syscall.CTRL_BREAK_EVENT, uint32(pid)) //nolint:gosec
57144
}

0 commit comments

Comments
 (0)