Skip to content

Commit 25d2eb0

Browse files
authored
Merge pull request #69 from bburgin/users/bburgin/process-tree-fix
Fix issue with stopping orphan children
2 parents a4593c0 + 222ca08 commit 25d2eb0

File tree

3 files changed

+63
-311
lines changed

3 files changed

+63
-311
lines changed

process/process.go

Lines changed: 41 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -469,17 +469,6 @@ func canSendSignal(childProcessName string, noSignalExes []string) (canSendSigna
469469
return canSendSignal, err
470470
}
471471

472-
var processHandleStartTimes = make(map[int]uint64)
473-
474-
func getProcessStartTimeFromHandle(processHandle int) (startTime uint64, err error) {
475-
startTime, keyPresent := processHandleStartTimes[processHandle]
476-
if !keyPresent {
477-
startTime, err = _getProcessStartTime(processHandle)
478-
processHandleStartTimes[processHandle] = startTime
479-
}
480-
return
481-
}
482-
483472
type ProcessRecord struct {
484473
pid int
485474
processHandle int
@@ -490,7 +479,8 @@ type ProcessRecord struct {
490479

491480
var pidRecords = make(map[int]ProcessRecord)
492481

493-
func getProcessRecordFromPid(pid int) (processRecord ProcessRecord, err error) {
482+
func getProcessRecordFromPid(pidRecordsLock *sync.Mutex, pid int) (processRecord ProcessRecord, err error) {
483+
pidRecordsLock.Lock()
494484
processRecord, keyPresent := pidRecords[pid]
495485
if !keyPresent {
496486
processHandle, processExists, accessGranted, err := getProcess(pid)
@@ -504,67 +494,30 @@ func getProcessRecordFromPid(pid int) (processRecord ProcessRecord, err error) {
504494
pidRecords[pid] = processRecord
505495
}
506496
}
497+
pidRecordsLock.Unlock()
507498
return
508499
}
509500

510-
func getProcessStartTimeFromPid(pid int) (startTime uint64, err error) {
511-
startTime = 0
512-
processRecord, err := getProcessRecordFromPid(pid)
513-
if processRecord.processHandle != INVALID_HANDLE && err == nil {
514-
startTime, err = getProcessStartTimeFromHandle(processRecord.processHandle)
515-
}
516-
return
517-
}
518-
519-
type ChildCheckRecord struct {
520-
process *psutil.Process
521-
knownChild bool
522-
}
523-
524-
func checkChildProcess(childCheckRecord ChildCheckRecord, noStopExes []string, noKillExes []string) (
501+
func checkChildProcess(pidRecordsLock *sync.Mutex, childCheckProcess *psutil.Process, noStopExes []string, noKillExes []string) (
525502
processHandle int,
526503
considerChild bool,
527504
signalsToSend int,
528505
err error) {
529506
considerChild = false // first assume false
530507
signalsToSend = SEND_NO_SIGNAL // first assume no signal
531508
// use err2 for getProcessRecordFromPid(), since child may no longer exist
532-
processRecord, err2 := getProcessRecordFromPid(int(childCheckRecord.process.Pid))
509+
processRecord, err2 := getProcessRecordFromPid(pidRecordsLock, int(childCheckProcess.Pid))
533510
processHandle = processRecord.processHandle
534511
if err2 == nil {
535512
if processHandle != INVALID_HANDLE {
536-
doEnvCheck := true // first assume true
537-
if childCheckRecord.knownChild {
538-
// use err3 for the next operations, since
539-
// we may fallback to env check below
540-
processStartTime, err3 := getProcessStartTimeFromHandle(processHandle)
541-
if err3 == nil {
542-
parentPid, err3 := childCheckRecord.process.Ppid() // get parent pid
543-
if err3 == nil {
544-
// use err3 for getProcessStartTimeFromPid(), since parent may no longer exist
545-
parentStartTime, err3 := getProcessStartTimeFromPid(int(parentPid))
546-
if err3 == nil {
547-
if processStartTime >= parentStartTime {
548-
// this our child and
549-
// not from a previous parent with reused pid,
550-
// so we can skip the env check
551-
doEnvCheck = false
552-
considerChild = true
553-
}
554-
}
555-
}
556-
}
557-
}
558-
if doEnvCheck {
559-
// handle the orphaned child case
560-
// by looking at the env
561-
considerChild, err = doesChildHaveMarker(processHandle)
562-
}
513+
// Don't look at parent-child relationships, since children, grandchildren, etc.
514+
// could become orphaned at any time. Just look for the child marker to know.
515+
considerChild, err = doesChildHaveMarker(childCheckProcess, processHandle)
563516
if err == nil {
564517
if considerChild {
565518
var childProcessName string = ""
566519
if len(noStopExes) > 0 || len(noKillExes) > 0 {
567-
childProcessName, err = childCheckRecord.process.Name()
520+
childProcessName, err = childCheckProcess.Name()
568521
if err == nil {
569522
if len(childProcessName) == 0 {
570523
err = errors.New("childProcessName is empty")
@@ -608,16 +561,18 @@ func checkChildProcess(childCheckRecord ChildCheckRecord, noStopExes []string, n
608561

609562
// get process tree in bottom up order
610563
func getProcessTreeRecursive(
611-
childCheckRecord ChildCheckRecord,
564+
pidRecordsLock *sync.Mutex,
565+
childCheckProcess *psutil.Process,
612566
noStopExes []string,
613567
noKillExes []string,
614568
pidsVisited *IntSet,
615569
) (processRecords []ProcessRecord) {
616-
if considerPid(int(childCheckRecord.process.Pid)) {
570+
if considerPid(int(childCheckProcess.Pid)) {
617571
// avoid cycles in pid tree by looking at visited set
618-
if pidsVisited.Add(int(childCheckRecord.process.Pid)) {
572+
if pidsVisited.Add(int(childCheckProcess.Pid)) {
619573
processHandle, considerChild, signalsToSend, err := checkChildProcess(
620-
childCheckRecord,
574+
pidRecordsLock,
575+
childCheckProcess,
621576
noStopExes,
622577
noKillExes)
623578
if err != nil {
@@ -627,7 +582,7 @@ func getProcessTreeRecursive(
627582
}
628583
if processHandle != INVALID_HANDLE {
629584
if considerChild {
630-
grandChildren, err := childCheckRecord.process.Children()
585+
grandChildren, err := childCheckProcess.Children()
631586
if err != nil {
632587
if err == psutil.ErrorNoChildren {
633588
// ignore this error
@@ -640,10 +595,9 @@ func getProcessTreeRecursive(
640595
} else {
641596
if grandChildren != nil {
642597
for _, grandChildProcess := range grandChildren {
643-
var grandChildCheckRecord = ChildCheckRecord{
644-
process: grandChildProcess, knownChild: childCheckRecord.knownChild}
645598
subProcessRecords := getProcessTreeRecursive(
646-
grandChildCheckRecord,
599+
pidRecordsLock,
600+
grandChildProcess,
647601
noStopExes,
648602
noKillExes,
649603
pidsVisited)
@@ -654,66 +608,23 @@ func getProcessTreeRecursive(
654608
}
655609
}
656610
var processRecord = ProcessRecord{
657-
processHandle: processHandle, pid: int(childCheckRecord.process.Pid), signalsToSend: signalsToSend}
611+
processHandle: processHandle, pid: int(childCheckProcess.Pid), signalsToSend: signalsToSend}
658612
processRecords = append(processRecords, processRecord)
659613
} else {
660-
releaseProcessByPid(int(childCheckRecord.process.Pid))
614+
pidRecordsLock.Lock()
615+
releaseProcessByPid(int(childCheckProcess.Pid))
616+
pidRecordsLock.Unlock()
661617
}
662618
}
663619
}
664620
}
665621
return processRecords
666622
}
667623

668-
func getChildProcesses(noStopExes []string, noKillExes []string) (processRecords []ProcessRecord) {
669-
var childCheckRecords []ChildCheckRecord
670-
671-
// to handle non-orphaned children, get our immediate children
672-
thisProcess, err := psutil.NewProcess(int32(os.Getpid()))
673-
if err == nil {
674-
immediateChildren, err := thisProcess.Children()
675-
if err == nil {
676-
if immediateChildren != nil {
677-
for _, process := range immediateChildren {
678-
var childCheckRecord = ChildCheckRecord{
679-
process: process, knownChild: true} // process is known to be our child
680-
childCheckRecords = append(childCheckRecords, childCheckRecord)
681-
}
682-
}
683-
} else {
684-
if err == psutil.ErrorNoChildren {
685-
// ignore this error
686-
err = nil
687-
} else {
688-
if Verbose {
689-
Log.Error(err)
690-
}
691-
}
692-
}
693-
} else {
694-
if Verbose {
695-
Log.Error(err)
696-
}
697-
}
698-
699-
if err == nil {
700-
// to handle orphaned children, get all processes
701-
// we'll check for duplicates later
702-
allProcesses, err := psutil.Processes()
703-
if err == nil {
704-
if allProcesses != nil {
705-
for _, process := range allProcesses {
706-
var childCheckRecord = ChildCheckRecord{
707-
process: process, knownChild: false} // process may or may not be our child
708-
childCheckRecords = append(childCheckRecords, childCheckRecord)
709-
}
710-
}
711-
} else {
712-
if Verbose {
713-
Log.Error(err)
714-
}
715-
}
716-
}
624+
func getChildProcesses(pidRecordsLock *sync.Mutex, noStopExes []string, noKillExes []string) (processRecords []ProcessRecord) {
625+
// handle normal and orphaned children by getting all processes
626+
// we'll check for duplicates later
627+
allProcesses, err := psutil.Processes()
717628

718629
if err == nil {
719630
// pidsVisited := IntSet{set: make(map[int]bool)}
@@ -732,13 +643,14 @@ func getChildProcesses(noStopExes []string, noKillExes []string) (processRecords
732643
tokens := make(chan int, threads)
733644
var wg sync.WaitGroup
734645

735-
for _, childCheckRecord := range childCheckRecords {
646+
for _, childCheckProcess := range allProcesses {
736647

737648
wg.Add(1)
738649
tokens <- 1
739-
go func(childCheckRecord ChildCheckRecord) {
650+
go func(childCheckProcess *psutil.Process) {
740651
subProcessRecords := getProcessTreeRecursive(
741-
childCheckRecord,
652+
pidRecordsLock,
653+
childCheckProcess,
742654
noStopExes,
743655
noKillExes,
744656
&pidsVisited)
@@ -749,7 +661,7 @@ func getChildProcesses(noStopExes []string, noKillExes []string) (processRecords
749661

750662
wg.Done()
751663
<-tokens
752-
}(childCheckRecord)
664+
}(childCheckProcess)
753665
}
754666

755667
wg.Wait()
@@ -877,12 +789,12 @@ func pollKillProcess(processRecord ProcessRecord) (err error) {
877789
}
878790

879791
// ensure our child processes are stopped
880-
func stopChildProcesses(noStopExes []string, noKillExes []string, cleanupTime time.Duration) (err error) {
792+
func stopChildProcesses(pidRecordsLock *sync.Mutex, noStopExes []string, noKillExes []string, cleanupTime time.Duration) (err error) {
881793
err = nil // first assume no error
882794
anyRemaining := true // first assume some children
883795
totalNumSignaled := 0
884796
if canStopChildProcesses() {
885-
processRecords := getChildProcesses(noStopExes, noKillExes)
797+
processRecords := getChildProcesses(pidRecordsLock, noStopExes, noKillExes)
886798
// progress from most graceful to most invasive stop signal
887799
// if no matching children, then call is a noop
888800
numSignaled := signalChildProcesses(processRecords, CTRL_C_SIGNAL)
@@ -908,7 +820,7 @@ func stopChildProcesses(noStopExes []string, noKillExes []string, cleanupTime ti
908820
anyRemaining = pollRemainingChildren(processRecords, 0) // wait zero time, since already waited above
909821
// release process handles only after descending into all processes,
910822
// to ensure pids do not get reused while descending
911-
releaseProcesses()
823+
releaseProcesses(pidRecordsLock)
912824
}
913825
if anyRemaining && totalNumSignaled == 0 {
914826
msg := "No child processes stopped or killed\n"
@@ -920,17 +832,20 @@ func stopChildProcesses(noStopExes []string, noKillExes []string, cleanupTime ti
920832
}
921833

922834
func releaseProcessByPid(pid int) {
835+
// no pidRecordsLock here, rely on caller to do it
923836
processRecord, keyPresent := pidRecords[pid]
924837
if !keyPresent {
925838
delete(pidRecords, processRecord.pid)
926839
releaseProcessByHandle(processRecord.processHandle)
927840
}
928841
}
929842

930-
func releaseProcesses() {
843+
func releaseProcesses(pidRecordsLock *sync.Mutex) {
844+
pidRecordsLock.Lock()
931845
for _, processRecord := range pidRecords {
932846
releaseProcessByPid(processRecord.pid)
933847
}
848+
pidRecordsLock.Unlock()
934849
}
935850

936851
func getChildMarkerKey() string {
@@ -1037,7 +952,7 @@ func (c *Command) run(opts *Options, tryNumber int) error {
1037952
// ensure we only initiate the stop attempt once,
1038953
// from all our command threads
1039954
stopOnce.Do(func() {
1040-
err = stopChildProcesses(opts.NoStopExes, opts.NoKillExes, opts.CleanupTime)
955+
err = stopChildProcesses(&opts.PidRecordsLock, opts.NoStopExes, opts.NoKillExes, opts.CleanupTime)
1041956
if err != nil {
1042957
if Verbose {
1043958
Log.Error(err)
@@ -1203,6 +1118,7 @@ type Options struct {
12031118
PrintRetryOutput bool // print output from retries
12041119
Timeout time.Duration // timeout
12051120
StopOnErr bool // stop on any error
1121+
PidRecordsLock sync.Mutex // make stop on error thread-safe
12061122
NoStopExes []string // exe names to exclude from stop signal
12071123
NoKillExes []string // exe names to exclude from kill signal
12081124
CleanupTime time.Duration // time to allow children to clean up
@@ -1452,7 +1368,7 @@ func Run(opts *Options, cancel chan struct{}, chCmdStr chan string) (chan *Comma
14521368
if opts.StopOnErr {
14531369
Log.Error("stop on first error")
14541370
}
1455-
err = stopChildProcesses(opts.NoStopExes, opts.NoKillExes, opts.CleanupTime)
1371+
err = stopChildProcesses(&opts.PidRecordsLock, opts.NoStopExes, opts.NoKillExes, opts.CleanupTime)
14561372
if err != nil {
14571373
if Verbose {
14581374
Log.Error(err)

process/process_others.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ import (
3535
"strconv"
3636
"strings"
3737
"syscall"
38+
39+
psutil "github.com/shirou/gopsutil/process"
3840
)
3941

4042
func getProcess(pid int) (processHandle int, processExists bool, accessGranted bool, err error) {
@@ -194,7 +196,7 @@ func getSignalsToSend(childProcessName string, noStopExes []string, noKillExes [
194196
return signalsToSend, err
195197
}
196198

197-
func doesChildHaveMarker(processHandle int) (hasMarker bool, err error) {
199+
func doesChildHaveMarker(_ *psutil.Process, processHandle int) (hasMarker bool, err error) {
198200
// the process handle is the pid
199201
envCmd := fmt.Sprintf("xargs -0 -n 1 < %s/%d/environ", Procd, processHandle)
200202
env, err := exec.Command(getShell(), "-c", envCmd).Output()

0 commit comments

Comments
 (0)