Skip to content

Commit d40311b

Browse files
committed
Add Debug logs with process tree (thought-machine#339)
* Add logProcOnTimeout() * Bump version
1 parent c32cb2e commit d40311b

File tree

3 files changed

+66
-16
lines changed

3 files changed

+66
-16
lines changed

ChangeLog

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
Version 11.16.6
2+
---------------
3+
* Improve runCommand timeout diagnostics with logProcOnTimeout()
4+
15
Version 11.16.5
26
---------------
37
* Improve runCommand timeout diagnostics (process snapshot + output summary)

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
11.16.5
1+
11.16.6

mettle/worker/worker.go

Lines changed: 61 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"path"
1515
"path/filepath"
1616
"runtime"
17+
"strconv"
1718
"strings"
1819
"syscall"
1920
"time"
@@ -732,12 +733,23 @@ func (w *worker) execute(req *pb.ExecuteRequest, action *pb.Action, command *pb.
732733
duration, _ := ptypes.Duration(action.Timeout)
733734
ctx, cancel := context.WithTimeout(context.Background(), duration)
734735
defer cancel()
736+
735737
logr.WithFields(logrus.Fields{
736738
"hash": w.actionDigest.Hash,
737739
"stage": "exec_action",
738740
"timeout": duration.String(),
741+
"argv0": command.Arguments[0],
742+
"argc": len(command.Arguments),
739743
}).Debug("Executing action - started")
740744

745+
logr.WithFields(logrus.Fields{
746+
"hash": w.actionDigest.Hash,
747+
"sandbox": w.sandbox,
748+
"altSandbox": w.altSandbox,
749+
"shouldSandbox": w.shouldSandbox(command),
750+
"argv0": command.Arguments[0],
751+
}).Debug("Sandbox selection")
752+
741753
cmd := exec.CommandContext(ctx, commandPath(command), command.Arguments[1:]...)
742754
// Setting Pdeathsig should ideally make subprocesses get kill signals if we die.
743755

@@ -962,21 +974,8 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur
962974
if !errors.Is(ctx.Err(), context.DeadlineExceeded) {
963975
return
964976
}
965-
966-
pgid, _ := syscall.Getpgid(pid)
967-
var processTree string
968-
if pgid > 0 {
969-
args := []string{"-g", fmt.Sprintf("%d", pgid), "-o", "pid,ppid,pgid,state,etime,wchan,%cpu,%mem,command"}
970-
if psOut, psErr := exec.Command("ps", args...).Output(); psErr == nil {
971-
processTree = string(psOut)
972-
}
973-
}
974-
logr.WithFields(logrus.Fields{
975-
"hash": w.actionDigest.Hash,
976-
"pid": pid,
977-
"pgid": pgid,
978-
"hangingProcessTree": processTree,
979-
}).Debug("Deadline exceeded: Analyzing hanging group")
977+
// # TODO(INFRA-130229): Revert this temporary change mettle timeouts investigations are completed
978+
w.logProcOnTimeout(pid)
980979
}(pid)
981980

982981
err := cmd.Wait()
@@ -1030,6 +1029,53 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur
10301029
return err
10311030
}
10321031

1032+
// logProcOnTimeout logs /proc diagnostics for the timed-out pid and its process group.
1033+
// It does not rely on `ps` existing in the container.
1034+
func (w *worker) logProcOnTimeout(pid int) {
1035+
const maxFDs = 200
1036+
const maxTargetLen = 256
1037+
p := strconv.Itoa(pid)
1038+
base := filepath.Join("/proc", p)
1039+
1040+
cmdlineBytes, err := os.ReadFile(filepath.Join(base, "cmdline"))
1041+
cmdline := ""
1042+
if err == nil && len(cmdlineBytes) > 0 {
1043+
cmdline = strings.TrimSpace(strings.ReplaceAll(string(cmdlineBytes), "\x00", " "))
1044+
}
1045+
1046+
wchanBytes, err := os.ReadFile(filepath.Join(base, "wchan"))
1047+
wchan := ""
1048+
if err == nil && len(wchanBytes) > 0 {
1049+
wchan = strings.TrimSpace(string(wchanBytes))
1050+
}
1051+
1052+
var fds []string
1053+
if entries, err := os.ReadDir(filepath.Join(base, "fd")); err == nil {
1054+
for i, e := range entries {
1055+
if i >= maxFDs {
1056+
fds = append(fds, "...truncated")
1057+
break
1058+
}
1059+
target, err := os.Readlink(filepath.Join(base, "fd", e.Name()))
1060+
if err != nil {
1061+
continue
1062+
}
1063+
if len(target) > maxTargetLen {
1064+
target = target[:maxTargetLen] + "…"
1065+
}
1066+
fds = append(fds, e.Name()+" -> "+target)
1067+
}
1068+
}
1069+
1070+
logr.WithFields(logrus.Fields{
1071+
"hash": w.actionDigest.Hash,
1072+
"pid": pid,
1073+
"cmdline": cmdline,
1074+
"wchan": wchan,
1075+
"fds": fds,
1076+
}).Debug("Deadline exceeded: /proc snapshot")
1077+
}
1078+
10331079
// writeUncachedResult attempts to write an uncached action result proto.
10341080
// This is an extension for buildbarn-browser that lets it display results of a failed build.
10351081
// It returns a string that, on success, contains an appropriate message about it (they are

0 commit comments

Comments
 (0)