Skip to content

Commit bf1938e

Browse files
authored
Investigate Mettle timeout on premerge and postmerge tests (#340)
* Add logProcOnTimeout()
1 parent 9880d7a commit bf1938e

File tree

3 files changed

+168
-9
lines changed

3 files changed

+168
-9
lines changed

ChangeLog

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
Version 11.16.7
2+
---------------
3+
* Improve runCommand timeout diagnostics with logProcChain() with child processes
4+
15
Version 11.16.6
26
---------------
37
* Improve runCommand timeout diagnostics with logProcOnTimeout()

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
11.16.6
1+
11.16.7

mettle/worker/worker.go

Lines changed: 163 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -976,6 +976,7 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur
976976
}
977977
// # TODO(INFRA-130229): Revert this temporary change mettle timeouts investigations are completed
978978
w.logProcOnTimeout(pid)
979+
w.logProcChain(pid)
979980
}(pid)
980981

981982
err := cmd.Wait()
@@ -1036,21 +1037,29 @@ func (w *worker) logProcOnTimeout(pid int) {
10361037
const maxTargetLen = 256
10371038
p := strconv.Itoa(pid)
10381039
base := filepath.Join("/proc", p)
1040+
var errs []error
10391041

10401042
cmdlineBytes, err := os.ReadFile(filepath.Join(base, "cmdline"))
10411043
cmdline := ""
1042-
if err == nil && len(cmdlineBytes) > 0 {
1044+
if err != nil {
1045+
errs = append(errs, fmt.Errorf("cmdline: %v", err))
1046+
} else if len(cmdlineBytes) > 0 {
10431047
cmdline = strings.TrimSpace(strings.ReplaceAll(string(cmdlineBytes), "\x00", " "))
10441048
}
10451049

10461050
wchanBytes, err := os.ReadFile(filepath.Join(base, "wchan"))
10471051
wchan := ""
1048-
if err == nil && len(wchanBytes) > 0 {
1052+
if err != nil {
1053+
errs = append(errs, fmt.Errorf("wchan: %v", err))
1054+
} else if len(wchanBytes) > 0 {
10491055
wchan = strings.TrimSpace(string(wchanBytes))
10501056
}
10511057

10521058
var fds []string
1053-
if entries, err := os.ReadDir(filepath.Join(base, "fd")); err == nil {
1059+
entries, err := os.ReadDir(filepath.Join(base, "fd"))
1060+
if err != nil {
1061+
errs = append(errs, fmt.Errorf("fd: %v", err))
1062+
} else {
10541063
for i, e := range entries {
10551064
if i >= maxFDs {
10561065
fds = append(fds, "...truncated")
@@ -1067,15 +1076,161 @@ func (w *worker) logProcOnTimeout(pid int) {
10671076
}
10681077
}
10691078

1079+
var errSummary string
1080+
if len(errs) > 0 {
1081+
s := make([]string, len(errs))
1082+
for i, e := range errs {
1083+
s[i] = e.Error()
1084+
}
1085+
errSummary = strings.Join(s, ", ")
1086+
}
1087+
10701088
logr.WithFields(logrus.Fields{
1071-
"hash": w.actionDigest.Hash,
1072-
"pid": pid,
1073-
"cmdline": cmdline,
1074-
"wchan": wchan,
1075-
"fds": fds,
1089+
"hash": w.actionDigest.Hash,
1090+
"pid": pid,
1091+
"cmdline": cmdline,
1092+
"wchan": wchan,
1093+
"fds": fds,
1094+
"proc_err": errSummary,
10761095
}).Debug("Deadline exceeded: /proc snapshot")
10771096
}
10781097

1098+
// logProcTree logs a bounded process subtree starting from pid.
1099+
// Example:
1100+
//
1101+
// 24877:bash[S - sleeping/waiting]{do_wait} -> 24890:test_binary[D - io_wait)]{tcp_recvmsg}
1102+
// 24877:bash[S - sleeping/waiting]{do_wait} -> 24891:tee[S - sleeping/waitin]{pipe_read}
1103+
func (w *worker) logProcChain(pid int) {
1104+
const (
1105+
maxDepth = 6
1106+
maxNodes = 40
1107+
)
1108+
lines := make([]string, 0, maxNodes)
1109+
nodes := 0
1110+
var walk func(pid, depth int, path []string)
1111+
walk = func(pid, depth int, path []string) {
1112+
if depth > maxDepth || nodes >= maxNodes {
1113+
return
1114+
}
1115+
1116+
current := formatProc(pid)
1117+
path = append(path, current)
1118+
1119+
children := childPIDs(pid)
1120+
if len(children) == 0 {
1121+
lines = append(lines, strings.Join(path, " -> "))
1122+
nodes++
1123+
return
1124+
}
1125+
for _, c := range children {
1126+
if nodes >= maxNodes {
1127+
return
1128+
}
1129+
walk(c, depth+1, path)
1130+
}
1131+
}
1132+
1133+
walk(pid, 0, nil)
1134+
1135+
logr.WithFields(logrus.Fields{
1136+
"hash": w.actionDigest.Hash,
1137+
"pid": pid,
1138+
"tree": strings.Join(lines, "\n"),
1139+
}).Debug("Deadline exceeded: /proc process tree")
1140+
}
1141+
1142+
// formatProc creates formatted log line
1143+
func formatProc(pid int) string {
1144+
state := procState(pid)
1145+
wchan := readTrim(filepath.Join(procBase(pid), "wchan"))
1146+
1147+
return fmt.Sprintf(
1148+
"%d:%s[%s - %s]{%s}",
1149+
pid,
1150+
procName(pid),
1151+
state,
1152+
stateComment(state),
1153+
wchan,
1154+
)
1155+
}
1156+
1157+
// procBase returns the /proc path for a pid
1158+
func procBase(pid int) string {
1159+
return filepath.Join("/proc", strconv.Itoa(pid))
1160+
}
1161+
1162+
// readTrim reads a file and returns trimmed content, or "" on error/empty content
1163+
func readTrim(path string) string {
1164+
b, err := os.ReadFile(path)
1165+
if err != nil || len(b) == 0 {
1166+
return ""
1167+
}
1168+
return strings.TrimSpace(string(b))
1169+
}
1170+
1171+
// procName returns the process name from /proc/<pid>/comm, or "?" if unavailable
1172+
func procName(pid int) string {
1173+
name := readTrim(filepath.Join(procBase(pid), "comm"))
1174+
if name == "" {
1175+
return "?"
1176+
}
1177+
return name
1178+
}
1179+
1180+
// procState returns the Linux process state code from /proc/<pid>/status
1181+
func procState(pid int) string {
1182+
b, err := os.ReadFile(filepath.Join(procBase(pid), "status"))
1183+
if err != nil {
1184+
return ""
1185+
}
1186+
for _, ln := range strings.Split(string(b), "\n") {
1187+
k, v, ok := strings.Cut(ln, ":")
1188+
if !ok || strings.TrimSpace(k) != "State" {
1189+
continue
1190+
}
1191+
if f := strings.Fields(strings.TrimSpace(v)); len(f) > 0 {
1192+
return f[0]
1193+
}
1194+
return ""
1195+
}
1196+
return ""
1197+
}
1198+
1199+
// childPIDs returns direct children of pid from /proc/<pid>/task/<pid>/children
1200+
func childPIDs(pid int) []int {
1201+
b, err := os.ReadFile(filepath.Join(procBase(pid), "task", strconv.Itoa(pid), "children"))
1202+
if err != nil || len(b) == 0 {
1203+
return nil
1204+
}
1205+
var out []int
1206+
for _, s := range strings.Fields(string(b)) {
1207+
if n, err := strconv.Atoi(s); err == nil && n > 0 {
1208+
out = append(out, n)
1209+
}
1210+
}
1211+
return out
1212+
}
1213+
1214+
// stateComment is a helper to translate tail_state code into text
1215+
func stateComment(s string) string {
1216+
switch s {
1217+
case "R":
1218+
return "running"
1219+
case "S":
1220+
return "sleeping/waiting"
1221+
case "D":
1222+
return "io_wait"
1223+
case "I":
1224+
return "idle"
1225+
case "T":
1226+
return "stopped"
1227+
case "Z":
1228+
return "zombie"
1229+
default:
1230+
return "unknown"
1231+
}
1232+
}
1233+
10791234
// writeUncachedResult attempts to write an uncached action result proto.
10801235
// This is an extension for buildbarn-browser that lets it display results of a failed build.
10811236
// It returns a string that, on success, contains an appropriate message about it (they are

0 commit comments

Comments
 (0)