Commit 7ed96b3

Merge pull request #594 from prometheus/beorn7/promhttp
Add an error counter for internal errors in the HTTP handler
2 parents ea8c935 + 4d8144c commit 7ed96b3

2 files changed: +70 −8 lines changed

prometheus/promhttp/http.go

Lines changed: 43 additions & 4 deletions
```diff
@@ -84,10 +84,32 @@ func Handler() http.Handler {
 // instrumentation. Use the InstrumentMetricHandler function to apply the same
 // kind of instrumentation as it is used by the Handler function.
 func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler {
-	var inFlightSem chan struct{}
+	var (
+		inFlightSem chan struct{}
+		errCnt      = prometheus.NewCounterVec(
+			prometheus.CounterOpts{
+				Name: "promhttp_metric_handler_errors_total",
+				Help: "Total number of internal errors encountered by the promhttp metric handler.",
+			},
+			[]string{"cause"},
+		)
+	)
+
 	if opts.MaxRequestsInFlight > 0 {
 		inFlightSem = make(chan struct{}, opts.MaxRequestsInFlight)
 	}
+	if opts.Registry != nil {
+		// Initialize all possibilities that can occur below.
+		errCnt.WithLabelValues("gathering")
+		errCnt.WithLabelValues("encoding")
+		if err := opts.Registry.Register(errCnt); err != nil {
+			if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
+				errCnt = are.ExistingCollector.(*prometheus.CounterVec)
+			} else {
+				panic(err)
+			}
+		}
+	}
 
 	h := http.HandlerFunc(func(rsp http.ResponseWriter, req *http.Request) {
 		if inFlightSem != nil {
```
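The added block follows the register-or-reuse idiom: both label values are pre-initialized so the two series are exported as 0 from the first scrape on, and if a collector with the same ID is already registered (for example, when HandlerFor is called several times with the same Registry, as the test below does), the existing CounterVec is adopted instead of panicking. A minimal usage sketch for the new option; the registry, route, and address are illustrative, not part of this commit:

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	reg := prometheus.NewRegistry()
	// Passing reg as both the Gatherer and HandlerOpts.Registry exposes
	// promhttp_metric_handler_errors_total alongside the gathered metrics.
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{
		ErrorHandling: promhttp.ContinueOnError,
		Registry:      reg,
	}))
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```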
```diff
@@ -106,6 +128,7 @@ func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler {
 			if opts.ErrorLog != nil {
 				opts.ErrorLog.Println("error gathering metrics:", err)
 			}
+			errCnt.WithLabelValues("gathering").Inc()
 			switch opts.ErrorHandling {
 			case PanicOnError:
 				panic(err)
```
```diff
@@ -146,6 +169,7 @@ func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler {
 			if opts.ErrorLog != nil {
 				opts.ErrorLog.Println("error encoding and sending metric family:", err)
 			}
+			errCnt.WithLabelValues("encoding").Inc()
 			switch opts.ErrorHandling {
 			case PanicOnError:
 				panic(err)
```
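Both error paths use the same CounterVec idiom: WithLabelValues returns the child counter for the given "cause" (creating it on first access) and Inc increments it atomically, so concurrent scrapes are safe. Pulled out of the handler for readability, the pattern is:

```go
errCnt := prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "promhttp_metric_handler_errors_total",
		Help: "Total number of internal errors encountered by the promhttp metric handler.",
	},
	[]string{"cause"},
)
// Pre-creating the children exports both series as 0 before any error
// occurs, keeping "no errors yet" distinguishable from "metric missing".
errCnt.WithLabelValues("gathering")
errCnt.WithLabelValues("encoding")

// On an error path, increment the matching child:
errCnt.WithLabelValues("encoding").Inc()
```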
```diff
@@ -236,9 +260,12 @@ const (
 	// Ignore errors and try to serve as many metrics as possible. However,
 	// if no metrics can be served, serve an HTTP status code 500 and the
 	// last error message in the body. Only use this in deliberate "best
-	// effort" metrics collection scenarios. It is recommended to at least
-	// log errors (by providing an ErrorLog in HandlerOpts) to not mask
-	// errors completely.
+	// effort" metrics collection scenarios. In this case, it is highly
+	// recommended to provide other means of detecting errors: By setting an
+	// ErrorLog in HandlerOpts, the errors are logged. By providing a
+	// Registry in HandlerOpts, the exposed metrics include an error counter
+	// "promhttp_metric_handler_errors_total", which can be used for
+	// alerts.
 	ContinueOnError
 	// Panic upon the first error encountered (useful for "crash only" apps).
 	PanicOnError
```
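Since ContinueOnError can silently drop metrics, the new counter provides the alerting hook this comment refers to: an expression along the lines of `rate(promhttp_metric_handler_errors_total[5m]) > 0` would fire on any internal handler error. The exact PromQL is illustrative; the commit itself only exposes the counter.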
```diff
@@ -261,6 +288,18 @@ type HandlerOpts struct {
 	// logged regardless of the configured ErrorHandling provided ErrorLog
 	// is not nil.
 	ErrorHandling HandlerErrorHandling
+	// If Registry is not nil, it is used to register a metric
+	// "promhttp_metric_handler_errors_total", partitioned by "cause". A
+	// failed registration causes a panic. Note that this error counter is
+	// different from the instrumentation you get from the various
+	// InstrumentHandler... helpers. It counts errors that don't necessarily
+	// result in a non-2xx HTTP status code. There are two typical cases:
+	// (1) Encoding errors that only happen after streaming of the HTTP body
+	// has already started (and the status code 200 has been sent). This
+	// should only happen with custom collectors. (2) Collection errors with
+	// no effect on the HTTP status code because ErrorHandling is set to
+	// ContinueOnError.
+	Registry prometheus.Registerer
 	// If DisableCompression is true, the handler will never compress the
 	// response, even if requested by the client.
 	DisableCompression bool
```
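To make the distinction drawn in this comment concrete: InstrumentMetricHandler (referenced at the top of the file) instruments HTTP requests to the metrics endpoint, while HandlerOpts.Registry counts errors inside the handler that may never surface in a status code. A sketch combining both on one registry, reusing the imports from the sketch above; names are illustrative:

```go
reg := prometheus.NewRegistry()
handler := promhttp.InstrumentMetricHandler(
	reg, // request-level instrumentation: scrape counts, in-flight scrapes
	promhttp.HandlerFor(reg, promhttp.HandlerOpts{
		Registry: reg, // internal error counter added by this commit
	}),
)
http.Handle("/metrics", handler)
```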

prometheus/promhttp/http_test.go

Lines changed: 27 additions & 4 deletions
```diff
@@ -59,7 +59,8 @@ func (b blockingCollector) Collect(ch chan<- prometheus.Metric) {
 func TestHandlerErrorHandling(t *testing.T) {
 
 	// Create a registry that collects a MetricFamily with two elements,
-	// another with one, and reports an error.
+	// another with one, and reports an error. Further down, we'll use the
+	// same registry in the HandlerOpts.
 	reg := prometheus.NewRegistry()
 
 	cnt := prometheus.NewCounter(prometheus.CounterOpts{
```
```diff
@@ -92,25 +93,47 @@ func TestHandlerErrorHandling(t *testing.T) {
 	errorHandler := HandlerFor(reg, HandlerOpts{
 		ErrorLog:      logger,
 		ErrorHandling: HTTPErrorOnError,
+		Registry:      reg,
 	})
 	continueHandler := HandlerFor(reg, HandlerOpts{
 		ErrorLog:      logger,
 		ErrorHandling: ContinueOnError,
+		Registry:      reg,
 	})
 	panicHandler := HandlerFor(reg, HandlerOpts{
 		ErrorLog:      logger,
 		ErrorHandling: PanicOnError,
+		Registry:      reg,
 	})
 	wantMsg := `error gathering metrics: error collecting metric Desc{fqName: "invalid_metric", help: "not helpful", constLabels: {}, variableLabels: []}: collect error
 `
 	wantErrorBody := `An error has occurred while serving metrics:
 
 error collecting metric Desc{fqName: "invalid_metric", help: "not helpful", constLabels: {}, variableLabels: []}: collect error
 `
-	wantOKBody := `# HELP name docstring
+	wantOKBody1 := `# HELP name docstring
 # TYPE name counter
 name{constname="constvalue",labelname="val1"} 1
 name{constname="constvalue",labelname="val2"} 1
+# HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler.
+# TYPE promhttp_metric_handler_errors_total counter
+promhttp_metric_handler_errors_total{cause="encoding"} 0
+promhttp_metric_handler_errors_total{cause="gathering"} 1
+# HELP the_count Ah-ah-ah! Thunder and lightning!
+# TYPE the_count counter
+the_count 0
+`
+	// It might happen that counting the gathering error makes it to the
+	// promhttp_metric_handler_errors_total counter before it is gathered
+	// itself. Thus, we have two bodies that are acceptable for the test.
+	wantOKBody2 := `# HELP name docstring
+# TYPE name counter
+name{constname="constvalue",labelname="val1"} 1
+name{constname="constvalue",labelname="val2"} 1
+# HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler.
+# TYPE promhttp_metric_handler_errors_total counter
+promhttp_metric_handler_errors_total{cause="encoding"} 0
+promhttp_metric_handler_errors_total{cause="gathering"} 2
 # HELP the_count Ah-ah-ah! Thunder and lightning!
 # TYPE the_count counter
 the_count 0
```
```diff
@@ -137,8 +160,8 @@ the_count 0
 	if got := logBuf.String(); got != wantMsg {
 		t.Errorf("got log message %q, want %q", got, wantMsg)
 	}
-	if got := writer.Body.String(); got != wantOKBody {
-		t.Errorf("got body %q, want %q", got, wantOKBody)
+	if got := writer.Body.String(); got != wantOKBody1 && got != wantOKBody2 {
+		t.Errorf("got body %q, want either %q or %q", got, wantOKBody1, wantOKBody2)
 	}
 
 	defer func() {
```
