Skip to content

Commit b092374

Browse files
authored
Add timeout metric and improve logging in worker.go (#334)
* Add timeout metric and improve logging in worker.go
1 parent 064d525 commit b092374

File tree

3 files changed

+15
-5
lines changed

3 files changed

+15
-5
lines changed

ChangeLog

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
Version 11.16.1
2+
---------------
3+
* Add timeout metric + timeout value in action timout logs
4+
15
Version 11.16.0
26
---------------
37
* Remove the whole digest of some elan logs and only add the hash using structured logs

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
11.16.0
1+
11.16.1

mettle/worker/worker.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ var nackedMessages = prometheus.NewCounter(prometheus.CounterOpts{
119119
Name: "nacked_messages_total",
120120
})
121121

122+
var actionTimeout = prometheus.NewCounter(prometheus.CounterOpts{
123+
Namespace: "mettle",
124+
Name: "action_timeout",
125+
})
126+
122127
// ErrTimeout is returned when the build action exceeds the action timeout
123128
var ErrTimeout = errors.New("action execution timed out")
124129

@@ -141,6 +146,7 @@ var metrics = []prometheus.Collector{
141146
packBytesRead,
142147
collectOutputErrors,
143148
nackedMessages,
149+
actionTimeout,
144150
}
145151

146152
func init() {
@@ -559,12 +565,12 @@ func (w *worker) runTaskRequest(req *pb.ExecuteRequest) *pb.ExecuteResponse {
559565
// forceShutdown sends any shutdown reports and calls log.Fatal() to shut down the worker
560566
func (w *worker) forceShutdown(shutdownMsg string) {
561567
w.Report(false, false, false, "%s", shutdownMsg)
562-
log.Info("Force shutting down worker")
568+
log.Debug("Force shutting down worker")
563569
if w.currentMsg != nil {
564570
if w.actionDigest != nil {
565571
logr.WithFields(logrus.Fields{
566572
"hash": w.actionDigest.Hash,
567-
}).Info("Nacking action")
573+
}).Debug("Nacking action")
568574
} else {
569575
log.Error("Nacking action but action digest is nil")
570576
}
@@ -942,9 +948,9 @@ func (w *worker) runCommand(cmd *exec.Cmd, timeout time.Duration) error {
942948
case <-time.After(timeout):
943949
logr.WithFields(logrus.Fields{
944950
"hash": w.actionDigest.Hash,
945-
}).Warn("Terminating process due to timeout")
951+
}).Warnf("Terminating process due to timeout: %s", timeout)
946952
cmd.Process.Signal(os.Signal(syscall.SIGTERM))
947-
953+
actionTimeout.Inc()
948954
select {
949955
case <-ch:
950956
return ErrTimeout

0 commit comments

Comments
 (0)