Skip to content

Commit 261b80b

Browse files
craig[bot]TheComputerM
andcommitted
Merge #150622
150622: pkg/util/log: add metrics to otlp sink r=TheComputerM a=TheComputerM adds metrics to monitor internal statistics of otlp sinks, similar to the metrics exposed for fluent sinks. `log.otlp.sink.write.attempts`: Number of write attempts experienced by otlp-server logging sinks `log.otlp.sink.write.errors`: Number of write errors experienced by otlp-server logging sinks `log.otlp.sink.grpc.transparent_retries`: Number of transparent retries done by otlp-server logging sinks when using GRPC Epic: none Release note: none --- **Happy case** <img width="1133" height="238" alt="image" src="https://github.com/user-attachments/assets/1c03c159-e88e-42e1-9e09-e4a66a5f2fe1" /> **When OTLP receiver is down** <img width="1133" height="238" alt="image" src="https://github.com/user-attachments/assets/3cc59391-f6be-4452-b4df-aff2606b34c8" /> Co-authored-by: Mudit Somani <[email protected]>
2 parents 8403878 + 2d709ac commit 261b80b

File tree

5 files changed

+100
-6
lines changed

5 files changed

+100
-6
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10127,6 +10127,30 @@ layers:
1012710127
unit: COUNT
1012810128
aggregation: AVG
1012910129
derivative: NON_NEGATIVE_DERIVATIVE
10130+
- name: log.otlp.sink.grpc.transparent_retries
10131+
exported_name: log_otlp_sink_grpc_transparent_retries
10132+
description: Number of transparent retries done by otlp-server logging sinks when using GRPC
10133+
y_axis_label: Retries
10134+
type: COUNTER
10135+
unit: COUNT
10136+
aggregation: AVG
10137+
derivative: NON_NEGATIVE_DERIVATIVE
10138+
- name: log.otlp.sink.write.attempts
10139+
exported_name: log_otlp_sink_write_attempts
10140+
description: Number of write attempts experienced by otlp-server logging sinks
10141+
y_axis_label: Attempts
10142+
type: COUNTER
10143+
unit: COUNT
10144+
aggregation: AVG
10145+
derivative: NON_NEGATIVE_DERIVATIVE
10146+
- name: log.otlp.sink.write.errors
10147+
exported_name: log_otlp_sink_write_errors
10148+
description: Number of write errors experienced by otlp-server logging sinks
10149+
y_axis_label: Errors
10150+
type: COUNTER
10151+
unit: COUNT
10152+
aggregation: AVG
10153+
derivative: NON_NEGATIVE_DERIVATIVE
1013010154
- name: sys.cgo.allocbytes
1013110155
exported_name: sys_cgo_allocbytes
1013210156
description: Current bytes of memory allocated by cgo

pkg/util/log/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ go_library(
9696
"@org_golang_google_grpc//connectivity",
9797
"@org_golang_google_grpc//credentials/insecure",
9898
"@org_golang_google_grpc//encoding/gzip",
99+
"@org_golang_google_grpc//stats",
99100
"@org_golang_google_grpc//status",
100101
] + select({
101102
"@io_bazel_rules_go//go/platform:aix": [

pkg/util/log/logmetrics/metrics.go

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,27 @@ var (
4343
Unit: metric.Unit_COUNT,
4444
MetricType: io_prometheus_client.MetricType_COUNTER,
4545
}
46+
otlpSinkWriteAttempts = metric.Metadata{
47+
Name: "log.otlp.sink.write.attempts",
48+
Help: "Number of write attempts experienced by otlp-server logging sinks",
49+
Measurement: "Attempts",
50+
Unit: metric.Unit_COUNT,
51+
MetricType: io_prometheus_client.MetricType_COUNTER,
52+
}
53+
otlpSinkWriteErrors = metric.Metadata{
54+
Name: "log.otlp.sink.write.errors",
55+
Help: "Number of write errors experienced by otlp-server logging sinks",
56+
Measurement: "Errors",
57+
Unit: metric.Unit_COUNT,
58+
MetricType: io_prometheus_client.MetricType_COUNTER,
59+
}
60+
otlpSinkGRPCTransparentRetries = metric.Metadata{
61+
Name: "log.otlp.sink.grpc.transparent_retries",
62+
Help: "Number of transparent retries done by otlp-server logging sinks when using GRPC",
63+
Measurement: "Retries",
64+
Unit: metric.Unit_COUNT,
65+
MetricType: io_prometheus_client.MetricType_COUNTER,
66+
}
4667
bufferedSinkMessagesDropped = metric.Metadata{
4768
Name: "log.buffered.messages.dropped",
4869
Help: "Count of log messages that are dropped by buffered log sinks. When CRDB attempts to buffer a log message in a buffered log sink whose buffer is already full, it drops the oldest buffered messages to make space for the new message",
@@ -87,12 +108,17 @@ var _ log.LogMetrics = (*logMetricsRegistry)(nil)
87108
func newLogMetricsRegistry() *logMetricsRegistry {
88109
return &logMetricsRegistry{
89110
counters: []*metric.Counter{
111+
log.BufferedSinkMessagesDropped: metric.NewCounter(bufferedSinkMessagesDropped),
112+
log.LogMessageCount: metric.NewCounter(logMessageCount),
113+
// fluent sink metrics
90114
log.FluentSinkConnectionAttempt: metric.NewCounter(fluentSinkConnAttempts),
91115
log.FluentSinkConnectionError: metric.NewCounter(fluentSinkConnErrors),
92116
log.FluentSinkWriteAttempt: metric.NewCounter(fluentSinkWriteAttempts),
93117
log.FluentSinkWriteError: metric.NewCounter(fluentSinkWriteErrors),
94-
log.BufferedSinkMessagesDropped: metric.NewCounter(bufferedSinkMessagesDropped),
95-
log.LogMessageCount: metric.NewCounter(logMessageCount),
118+
// otlp sink metrics
119+
log.OTLPSinkWriteAttempt: metric.NewCounter(otlpSinkWriteAttempts),
120+
log.OTLPSinkWriteError: metric.NewCounter(otlpSinkWriteErrors),
121+
log.OTLPSinkGRPCTransparentRetries: metric.NewCounter(otlpSinkGRPCTransparentRetries),
96122
},
97123
}
98124
}

pkg/util/log/metric.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,15 @@ type LogMetrics interface {
3333
type Metric int
3434

3535
const (
36-
FluentSinkConnectionAttempt Metric = iota
36+
BufferedSinkMessagesDropped Metric = iota
37+
LogMessageCount
38+
// fluent sink metrics
39+
FluentSinkConnectionAttempt
3740
FluentSinkConnectionError
3841
FluentSinkWriteAttempt
3942
FluentSinkWriteError
40-
BufferedSinkMessagesDropped
41-
LogMessageCount
43+
// otlp sink metrics
44+
OTLPSinkWriteAttempt
45+
OTLPSinkWriteError
46+
OTLPSinkGRPCTransparentRetries
4247
)

pkg/util/log/otlp_client.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"google.golang.org/grpc/connectivity"
2222
"google.golang.org/grpc/credentials/insecure"
2323
"google.golang.org/grpc/encoding/gzip"
24+
"google.golang.org/grpc/stats"
2425
"google.golang.org/grpc/status"
2526
)
2627

@@ -53,9 +54,12 @@ type otlpSink struct {
5354
requestObject *collpb.ExportLogsServiceRequest
5455
}
5556

57+
var statsHandlerOption = &otlpStatsHandler{}
58+
5659
func newOTLPSink(config logconfig.OTLPSinkConfig) (*otlpSink, error) {
5760
dialOpts := []grpc.DialOption{
5861
grpc.WithTransportCredentials(insecure.NewCredentials()),
62+
grpc.WithStatsHandler(statsHandlerOption),
5963
}
6064

6165
if *config.Compression == logconfig.GzipCompression {
@@ -143,6 +147,7 @@ func otlpExtractRecords(b []byte) []*lpb.LogRecord {
143147
}
144148

145149
func (sink *otlpSink) output(b []byte, opts sinkOutputOptions) error {
150+
logging.metrics.IncrementCounter(OTLPSinkWriteAttempt, 1)
146151
ctx := context.Background()
147152

148153
records := otlpExtractRecords(b)
@@ -161,5 +166,38 @@ func (sink *otlpSink) output(b []byte, opts sinkOutputOptions) error {
161166
return nil
162167
}
163168

164-
return err
169+
if err != nil {
170+
logging.metrics.IncrementCounter(OTLPSinkWriteError, 1)
171+
return err
172+
}
173+
174+
return nil
165175
}
176+
177+
// otlpStatsHandler implements the stats.Handler interface and is passed as
178+
// a dial option to the grpc client in the otlp log sink to get grpc metrics.
179+
type otlpStatsHandler struct{}
180+
181+
// TagRPC exists to satisfy the stats.Handler interface.
182+
func (h *otlpStatsHandler) TagRPC(ctx context.Context, info *stats.RPCTagInfo) context.Context {
183+
return ctx
184+
}
185+
186+
// TagConn exists to satisfy the stats.Handler interface.
187+
func (h *otlpStatsHandler) TagConn(ctx context.Context, info *stats.ConnTagInfo) context.Context {
188+
return ctx
189+
}
190+
191+
// HandleConn exists to satisfy the stats.Handler interface.
192+
func (h *otlpStatsHandler) HandleConn(ctx context.Context, connInfo stats.ConnStats) {}
193+
194+
func (h *otlpStatsHandler) HandleRPC(ctx context.Context, rpcInfo stats.RPCStats) {
195+
switch st := rpcInfo.(type) {
196+
case *stats.Begin:
197+
if st.IsTransparentRetryAttempt {
198+
logging.metrics.IncrementCounter(OTLPSinkGRPCTransparentRetries, 1)
199+
}
200+
}
201+
}
202+
203+
var _ stats.Handler = (*otlpStatsHandler)(nil)

0 commit comments

Comments
 (0)