@@ -976,6 +976,7 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur
976976 }
977977 // # TODO(INFRA-130229): Revert this temporary change mettle timeouts investigations are completed
978978 w .logProcOnTimeout (pid )
979+ w .logProcChain (pid )
979980 }(pid )
980981
981982 err := cmd .Wait ()
@@ -1036,21 +1037,29 @@ func (w *worker) logProcOnTimeout(pid int) {
10361037 const maxTargetLen = 256
10371038 p := strconv .Itoa (pid )
10381039 base := filepath .Join ("/proc" , p )
1040+ var errs []error
10391041
10401042 cmdlineBytes , err := os .ReadFile (filepath .Join (base , "cmdline" ))
10411043 cmdline := ""
1042- if err == nil && len (cmdlineBytes ) > 0 {
1044+ if err != nil {
1045+ errs = append (errs , fmt .Errorf ("cmdline: %v" , err ))
1046+ } else if len (cmdlineBytes ) > 0 {
10431047 cmdline = strings .TrimSpace (strings .ReplaceAll (string (cmdlineBytes ), "\x00 " , " " ))
10441048 }
10451049
10461050 wchanBytes , err := os .ReadFile (filepath .Join (base , "wchan" ))
10471051 wchan := ""
1048- if err == nil && len (wchanBytes ) > 0 {
1052+ if err != nil {
1053+ errs = append (errs , fmt .Errorf ("wchan: %v" , err ))
1054+ } else if len (wchanBytes ) > 0 {
10491055 wchan = strings .TrimSpace (string (wchanBytes ))
10501056 }
10511057
10521058 var fds []string
1053- if entries , err := os .ReadDir (filepath .Join (base , "fd" )); err == nil {
1059+ entries , err := os .ReadDir (filepath .Join (base , "fd" ))
1060+ if err != nil {
1061+ errs = append (errs , fmt .Errorf ("fd: %v" , err ))
1062+ } else {
10541063 for i , e := range entries {
10551064 if i >= maxFDs {
10561065 fds = append (fds , "...truncated" )
@@ -1067,15 +1076,161 @@ func (w *worker) logProcOnTimeout(pid int) {
10671076 }
10681077 }
10691078
1079+ var errSummary string
1080+ if len (errs ) > 0 {
1081+ s := make ([]string , len (errs ))
1082+ for i , e := range errs {
1083+ s [i ] = e .Error ()
1084+ }
1085+ errSummary = strings .Join (s , ", " )
1086+ }
1087+
10701088 logr .WithFields (logrus.Fields {
1071- "hash" : w .actionDigest .Hash ,
1072- "pid" : pid ,
1073- "cmdline" : cmdline ,
1074- "wchan" : wchan ,
1075- "fds" : fds ,
1089+ "hash" : w .actionDigest .Hash ,
1090+ "pid" : pid ,
1091+ "cmdline" : cmdline ,
1092+ "wchan" : wchan ,
1093+ "fds" : fds ,
1094+ "proc_err" : errSummary ,
10761095 }).Debug ("Deadline exceeded: /proc snapshot" )
10771096}
10781097
1098+ // logProcTree logs a bounded process subtree starting from pid.
1099+ // Example:
1100+ //
1101+ // 24877:bash[S - sleeping/waiting]{do_wait} -> 24890:test_binary[D - io_wait)]{tcp_recvmsg}
1102+ // 24877:bash[S - sleeping/waiting]{do_wait} -> 24891:tee[S - sleeping/waitin]{pipe_read}
1103+ func (w * worker ) logProcChain (pid int ) {
1104+ const (
1105+ maxDepth = 6
1106+ maxNodes = 40
1107+ )
1108+ lines := make ([]string , 0 , maxNodes )
1109+ nodes := 0
1110+ var walk func (pid , depth int , path []string )
1111+ walk = func (pid , depth int , path []string ) {
1112+ if depth > maxDepth || nodes >= maxNodes {
1113+ return
1114+ }
1115+
1116+ current := formatProc (pid )
1117+ path = append (path , current )
1118+
1119+ children := childPIDs (pid )
1120+ if len (children ) == 0 {
1121+ lines = append (lines , strings .Join (path , " -> " ))
1122+ nodes ++
1123+ return
1124+ }
1125+ for _ , c := range children {
1126+ if nodes >= maxNodes {
1127+ return
1128+ }
1129+ walk (c , depth + 1 , path )
1130+ }
1131+ }
1132+
1133+ walk (pid , 0 , nil )
1134+
1135+ logr .WithFields (logrus.Fields {
1136+ "hash" : w .actionDigest .Hash ,
1137+ "pid" : pid ,
1138+ "tree" : strings .Join (lines , "\n " ),
1139+ }).Debug ("Deadline exceeded: /proc process tree" )
1140+ }
1141+
1142+ // formatProc creates formatted log line
1143+ func formatProc (pid int ) string {
1144+ state := procState (pid )
1145+ wchan := readTrim (filepath .Join (procBase (pid ), "wchan" ))
1146+
1147+ return fmt .Sprintf (
1148+ "%d:%s[%s - %s]{%s}" ,
1149+ pid ,
1150+ procName (pid ),
1151+ state ,
1152+ stateComment (state ),
1153+ wchan ,
1154+ )
1155+ }
1156+
1157+ // procBase returns the /proc path for a pid
1158+ func procBase (pid int ) string {
1159+ return filepath .Join ("/proc" , strconv .Itoa (pid ))
1160+ }
1161+
1162+ // readTrim reads a file and returns trimmed content, or "" on error/empty content
1163+ func readTrim (path string ) string {
1164+ b , err := os .ReadFile (path )
1165+ if err != nil || len (b ) == 0 {
1166+ return ""
1167+ }
1168+ return strings .TrimSpace (string (b ))
1169+ }
1170+
1171+ // procName returns the process name from /proc/<pid>/comm, or "?" if unavailable
1172+ func procName (pid int ) string {
1173+ name := readTrim (filepath .Join (procBase (pid ), "comm" ))
1174+ if name == "" {
1175+ return "?"
1176+ }
1177+ return name
1178+ }
1179+
1180+ // procState returns the Linux process state code from /proc/<pid>/status
1181+ func procState (pid int ) string {
1182+ b , err := os .ReadFile (filepath .Join (procBase (pid ), "status" ))
1183+ if err != nil {
1184+ return ""
1185+ }
1186+ for _ , ln := range strings .Split (string (b ), "\n " ) {
1187+ k , v , ok := strings .Cut (ln , ":" )
1188+ if ! ok || strings .TrimSpace (k ) != "State" {
1189+ continue
1190+ }
1191+ if f := strings .Fields (strings .TrimSpace (v )); len (f ) > 0 {
1192+ return f [0 ]
1193+ }
1194+ return ""
1195+ }
1196+ return ""
1197+ }
1198+
1199+ // childPIDs returns direct children of pid from /proc/<pid>/task/<pid>/children
1200+ func childPIDs (pid int ) []int {
1201+ b , err := os .ReadFile (filepath .Join (procBase (pid ), "task" , strconv .Itoa (pid ), "children" ))
1202+ if err != nil || len (b ) == 0 {
1203+ return nil
1204+ }
1205+ var out []int
1206+ for _ , s := range strings .Fields (string (b )) {
1207+ if n , err := strconv .Atoi (s ); err == nil && n > 0 {
1208+ out = append (out , n )
1209+ }
1210+ }
1211+ return out
1212+ }
1213+
1214+ // stateComment is a helper to translate tail_state code into text
1215+ func stateComment (s string ) string {
1216+ switch s {
1217+ case "R" :
1218+ return "running"
1219+ case "S" :
1220+ return "sleeping/waiting"
1221+ case "D" :
1222+ return "io_wait"
1223+ case "I" :
1224+ return "idle"
1225+ case "T" :
1226+ return "stopped"
1227+ case "Z" :
1228+ return "zombie"
1229+ default :
1230+ return "unknown"
1231+ }
1232+ }
1233+
10791234// writeUncachedResult attempts to write an uncached action result proto.
10801235// This is an extension for buildbarn-browser that lets it display results of a failed build.
10811236// It returns a string that, on success, contains an appropriate message about it (they are
0 commit comments