Skip to content
4 changes: 4 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
Version 11.16.7
---------------
* Improve runCommand timeout diagnostics with logProcChain() with child processes

Version 11.16.6
---------------
* Improve runCommand timeout diagnostics with logProcOnTimeout()
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
11.16.6
11.16.7
142 changes: 142 additions & 0 deletions mettle/worker/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -976,6 +976,7 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur
}
// # TODO(INFRA-130229): Revert this temporary change mettle timeouts investigations are completed
w.logProcOnTimeout(pid)
w.logProcChain(pid)
}(pid)

err := cmd.Wait()
Expand Down Expand Up @@ -1076,6 +1077,147 @@ func (w *worker) logProcOnTimeout(pid int) {
}).Debug("Deadline exceeded: /proc snapshot")
}

// logProcChain logs a compact parent→child execution chain for the given PID
// This helps quickly identify which subprocess is still running when the worker
// times out
// If multiple children exist, they are listed instead.
func (w *worker) logProcChain(pid int) {
const maxDepth = 10

cur := pid
parts := make([]string, 0, maxDepth+1)
parts = append(parts, fmt.Sprintf("%d:%s", cur, procName(cur)))

var childrenSummary string

for i := 0; i < maxDepth; i++ {
children := childPIDs(cur)

if len(children) == 0 {
break
}

if len(children) > 1 {
items := make([]string, 0, len(children))
for _, c := range children {
state := procState(c)
items = append(items,
fmt.Sprintf("%d:%s:%s(%s)",
c,
procName(c),
state,
stateComment(state),
),
)
}
childrenSummary = strings.Join(items, " | ")
break
}

cur = children[0]
parts = append(parts, fmt.Sprintf("%d:%s", cur, procName(cur)))
}

tailState := procState(cur)

fields := logrus.Fields{
"hash": w.actionDigest.Hash,
"pid": pid,
"chain": strings.Join(parts, " -> "),
"tail_pid": cur,
"tail_name": procName(cur),
"tail_state": tailState,
"tail_state_comment": stateComment(tailState),
"tail_wchan": readTrim(filepath.Join(procBase(cur), "wchan")),
}

if childrenSummary != "" {
fields["children"] = childrenSummary
} else {
fields["children"] = "none"
}

logr.WithFields(fields).Debug("Deadline exceeded: process chain")
}

// procBase returns the /proc path for a pid
func procBase(pid int) string {
return filepath.Join("/proc", strconv.Itoa(pid))
}

// readTrim reads a file and returns trimmed content, or "" on error/empty content
func readTrim(path string) string {
b, err := os.ReadFile(path)
if err != nil || len(b) == 0 {
return ""
}
return strings.TrimSpace(string(b))
}

// procName returns the process name from /proc/<pid>/comm, or "?" if unavailable
func procName(pid int) string {
name := readTrim(filepath.Join(procBase(pid), "comm"))
if name == "" {
return "?"
}
return name
}

// procState returns the Linux process state code from /proc/<pid>/status
func procState(pid int) string {
b, err := os.ReadFile(filepath.Join(procBase(pid), "status"))
if err != nil {
return ""
}
for _, ln := range strings.Split(string(b), "\n") {
k, v, ok := strings.Cut(ln, ":")
if !ok || strings.TrimSpace(k) != "State" {
continue
}
if f := strings.Fields(strings.TrimSpace(v)); len(f) > 0 {
return f[0]
}
return ""
}
return ""
}

// childPIDs returns direct children of pid from /proc/<pid>/task/<pid>/children
func childPIDs(pid int) []int {
b, err := os.ReadFile(filepath.Join(procBase(pid), "task", strconv.Itoa(pid), "children"))
if err != nil || len(b) == 0 {
return nil
}
out := make([]int, 0, 4)
for _, s := range strings.Fields(string(b)) {
n, err := strconv.Atoi(s)
if err == nil && n > 0 {
out = append(out, n)
}
}
return out
}

// stateComment is a helper to translate tail_state code into text
func stateComment(s string) string {
switch s {
case "R":
return "running (executing on CPU)"
case "S":
return "sleeping (waiting on event / lock / pipe)"
case "D":
return "uninterruptible sleep (kernel I/O or filesystem wait)"
case "I":
return "idle kernel thread (not doing work)"
case "T":
return "stopped (signal or debugger)"
case "Z":
return "zombie (process exited but parent has not reaped it)"
default:
return "unknown process state"
}
}

// writeUncachedResult attempts to write an uncached action result proto.
// This is an extension for buildbarn-browser that lets it display results of a failed build.
// It returns a string that, on success, contains an appropriate message about it (they are
Expand Down
Loading