|
| 1 | +/* |
| 2 | +Copyright AppsCode Inc. and Contributors |
| 3 | +
|
| 4 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +you may not use this file except in compliance with the License. |
| 6 | +You may obtain a copy of the License at |
| 7 | +
|
| 8 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +
|
| 10 | +Unless required by applicable law or agreed to in writing, software |
| 11 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +See the License for the specific language governing permissions and |
| 14 | +limitations under the License. |
| 15 | +*/ |
| 16 | + |
1 | 17 | package apis |
2 | 18 |
|
3 | 19 | import ( |
4 | | - "github.com/prometheus/client_golang/prometheus" |
5 | | - "github.com/prometheus/client_golang/prometheus/promauto" |
| 20 | + "context" |
| 21 | + |
| 22 | + "go.opentelemetry.io/otel" |
| 23 | + "go.opentelemetry.io/otel/attribute" |
| 24 | + "go.opentelemetry.io/otel/metric" |
| 25 | +) |
| 26 | + |
// meterName is the instrumentation scope name handed to otel.Meter when the
// pgoutbox instruments are created and when gauge callbacks are registered.
const meterName = "kubeops.dev/pgoutbox"

// Attribute keys attached to the measurements recorded by Metrics.
const (
	attrKeyTable   = "table"
	attrKeySubject = "subject"
	attrKeyKind    = "kind"
)
7 | 34 |
|
8 | | -// Metrics Prometheus metrics. |
9 | 35 | type Metrics struct { |
10 | | - filterSkippedEvents, publishedEvents, problematicEvents *prometheus.CounterVec |
| 36 | + // Counters |
| 37 | + eventsPublished metric.Int64Counter |
| 38 | + eventsFiltered metric.Int64Counter |
| 39 | + eventsFailed metric.Int64Counter |
| 40 | + |
| 41 | + // Histograms |
| 42 | + processingDuration metric.Float64Histogram |
| 43 | + publishDuration metric.Float64Histogram |
| 44 | + |
| 45 | + // Gauges (observable) |
| 46 | + replicationLagBytes metric.Int64ObservableGauge |
| 47 | + currentLSN metric.Int64ObservableGauge |
11 | 48 | } |
12 | 49 |
|
13 | | -const ( |
14 | | - labelApp = "app" |
15 | | - labelTable = "table" |
16 | | - labelSubject = "subject" |
17 | | - labelKind = "kind" |
18 | | -) |
| 50 | +func NewMetrics() (*Metrics, error) { |
| 51 | + meter := otel.Meter(meterName) |
| 52 | + |
| 53 | + eventsPublished, err := meter.Int64Counter( |
| 54 | + "pgoutbox.events.published", |
| 55 | + metric.WithDescription("Total number of successfully published events"), |
| 56 | + metric.WithUnit("{event}"), |
| 57 | + ) |
| 58 | + if err != nil { |
| 59 | + return nil, err |
| 60 | + } |
| 61 | + |
| 62 | + eventsFiltered, err := meter.Int64Counter( |
| 63 | + "pgoutbox.events.filtered", |
| 64 | + metric.WithDescription("Total number of events skipped by filter"), |
| 65 | + metric.WithUnit("{event}"), |
| 66 | + ) |
| 67 | + if err != nil { |
| 68 | + return nil, err |
| 69 | + } |
| 70 | + |
| 71 | + eventsFailed, err := meter.Int64Counter( |
| 72 | + "pgoutbox.events.failed", |
| 73 | + metric.WithDescription("Total number of events that failed processing"), |
| 74 | + metric.WithUnit("{event}"), |
| 75 | + ) |
| 76 | + if err != nil { |
| 77 | + return nil, err |
| 78 | + } |
| 79 | + |
| 80 | + processingDuration, err := meter.Float64Histogram( |
| 81 | + "pgoutbox.processing.duration", |
| 82 | + metric.WithDescription("Time to process a WAL message end-to-end"), |
| 83 | + metric.WithUnit("s"), |
| 84 | + metric.WithExplicitBucketBoundaries(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10), |
| 85 | + ) |
| 86 | + if err != nil { |
| 87 | + return nil, err |
| 88 | + } |
| 89 | + |
| 90 | + publishDuration, err := meter.Float64Histogram( |
| 91 | + "pgoutbox.publish.duration", |
| 92 | + metric.WithDescription("Time to publish an event to the message broker"), |
| 93 | + metric.WithUnit("s"), |
| 94 | + metric.WithExplicitBucketBoundaries(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5), |
| 95 | + ) |
| 96 | + if err != nil { |
| 97 | + return nil, err |
| 98 | + } |
| 99 | + |
| 100 | + replicationLagBytes, err := meter.Int64ObservableGauge( |
| 101 | + "pgoutbox.replication.lag.bytes", |
| 102 | + metric.WithDescription("Replication lag in bytes (difference between server WAL end and current LSN)"), |
| 103 | + metric.WithUnit("By"), |
| 104 | + ) |
| 105 | + if err != nil { |
| 106 | + return nil, err |
| 107 | + } |
| 108 | + |
| 109 | + currentLSN, err := meter.Int64ObservableGauge( |
| 110 | + "pgoutbox.replication.lsn", |
| 111 | + metric.WithDescription("Current LSN position in the replication stream"), |
| 112 | + metric.WithUnit("{position}"), |
| 113 | + ) |
| 114 | + if err != nil { |
| 115 | + return nil, err |
| 116 | + } |
19 | 117 |
|
20 | | -// NewMetrics create and initialize new Prometheus metrics. |
21 | | -func NewMetrics() *Metrics { |
22 | 118 | return &Metrics{ |
23 | | - publishedEvents: promauto.NewCounterVec(prometheus.CounterOpts{ |
24 | | - Name: "published_events_total", |
25 | | - Help: "The total number of published events", |
26 | | - }, |
27 | | - []string{labelApp, labelSubject, labelTable}, |
28 | | - ), |
29 | | - problematicEvents: promauto.NewCounterVec(prometheus.CounterOpts{ |
30 | | - Name: "problematic_events_total", |
31 | | - Help: "The total number of skipped problematic events", |
32 | | - }, |
33 | | - []string{labelApp, labelKind}, |
34 | | - ), |
35 | | - filterSkippedEvents: promauto.NewCounterVec(prometheus.CounterOpts{ |
36 | | - Name: "filter_skipped_events_total", |
37 | | - Help: "The total number of skipped events", |
38 | | - }, |
39 | | - []string{labelApp, labelTable}, |
| 119 | + eventsPublished: eventsPublished, |
| 120 | + eventsFiltered: eventsFiltered, |
| 121 | + eventsFailed: eventsFailed, |
| 122 | + processingDuration: processingDuration, |
| 123 | + publishDuration: publishDuration, |
| 124 | + replicationLagBytes: replicationLagBytes, |
| 125 | + currentLSN: currentLSN, |
| 126 | + }, nil |
| 127 | +} |
| 128 | + |
| 129 | +func (m *Metrics) IncPublishedEvents(subject, table string) { |
| 130 | + if m == nil || m.eventsPublished == nil { |
| 131 | + return |
| 132 | + } |
| 133 | + m.eventsPublished.Add(context.Background(), 1, |
| 134 | + metric.WithAttributes( |
| 135 | + attribute.String(attrKeySubject, subject), |
| 136 | + attribute.String(attrKeyTable, table), |
40 | 137 | ), |
| 138 | + ) |
| 139 | +} |
| 140 | + |
| 141 | +func (m *Metrics) IncFilterSkippedEvents(table string) { |
| 142 | + if m == nil || m.eventsFiltered == nil { |
| 143 | + return |
41 | 144 | } |
| 145 | + m.eventsFiltered.Add(context.Background(), 1, |
| 146 | + metric.WithAttributes(attribute.String(attrKeyTable, table)), |
| 147 | + ) |
42 | 148 | } |
43 | 149 |
|
44 | | -const appName = "pgoutbox" |
| 150 | +func (m *Metrics) IncProblematicEvents(kind string) { |
| 151 | + if m == nil || m.eventsFailed == nil { |
| 152 | + return |
| 153 | + } |
| 154 | + m.eventsFailed.Add(context.Background(), 1, |
| 155 | + metric.WithAttributes(attribute.String(attrKeyKind, kind)), |
| 156 | + ) |
| 157 | +} |
| 158 | + |
| 159 | +func (m *Metrics) RecordProcessingDuration(seconds float64) { |
| 160 | + if m == nil || m.processingDuration == nil { |
| 161 | + return |
| 162 | + } |
| 163 | + m.processingDuration.Record(context.Background(), seconds) |
| 164 | +} |
45 | 165 |
|
46 | | -// IncPublishedEvents increment published events counter. |
47 | | -func (m Metrics) IncPublishedEvents(subject, table string) { |
48 | | - m.publishedEvents.With(prometheus.Labels{labelApp: appName, labelSubject: subject, labelTable: table}).Inc() |
| 166 | +func (m *Metrics) RecordPublishDuration(seconds float64, subject string) { |
| 167 | + if m == nil || m.publishDuration == nil { |
| 168 | + return |
| 169 | + } |
| 170 | + m.publishDuration.Record(context.Background(), seconds, |
| 171 | + metric.WithAttributes(attribute.String(attrKeySubject, subject)), |
| 172 | + ) |
49 | 173 | } |
50 | 174 |
|
51 | | -// IncFilterSkippedEvents increment skipped by filter events counter. |
52 | | -func (m Metrics) IncFilterSkippedEvents(table string) { |
53 | | - m.filterSkippedEvents.With(prometheus.Labels{labelApp: appName, labelTable: table}).Inc() |
// GaugeCallbacks carries the value sources for the observable gauges wired up
// by (*Metrics).RegisterCallbacks.
//
// GetCurrentLSN supplies the value reported as the current LSN. GetServerLSN
// supplies the server-side position that the current LSN is subtracted from
// to obtain the replication lag. Either field may be nil; GetServerLSN is
// only consulted when GetCurrentLSN is set.
type GaugeCallbacks struct {
	GetCurrentLSN, GetServerLSN func() uint64
}
55 | 179 |
|
56 | | -// IncProblematicEvents increment skipped by filter events counter. |
57 | | -func (m Metrics) IncProblematicEvents(kind string) { |
58 | | - m.problematicEvents.With(prometheus.Labels{labelApp: appName, labelKind: kind}).Inc() |
| 180 | +func (m *Metrics) RegisterCallbacks(cb GaugeCallbacks) error { |
| 181 | + if m == nil { |
| 182 | + return nil |
| 183 | + } |
| 184 | + |
| 185 | + meter := otel.Meter(meterName) |
| 186 | + |
| 187 | + _, err := meter.RegisterCallback( |
| 188 | + func(ctx context.Context, o metric.Observer) error { |
| 189 | + if cb.GetCurrentLSN != nil { |
| 190 | + currentLSN := cb.GetCurrentLSN() |
| 191 | + o.ObserveInt64(m.currentLSN, int64(currentLSN)) |
| 192 | + if cb.GetServerLSN != nil { |
| 193 | + serverLSN := cb.GetServerLSN() |
| 194 | + if serverLSN > currentLSN { |
| 195 | + o.ObserveInt64(m.replicationLagBytes, int64(serverLSN-currentLSN)) |
| 196 | + } else { |
| 197 | + o.ObserveInt64(m.replicationLagBytes, 0) |
| 198 | + } |
| 199 | + } |
| 200 | + } |
| 201 | + |
| 202 | + return nil |
| 203 | + }, |
| 204 | + m.currentLSN, |
| 205 | + m.replicationLagBytes, |
| 206 | + ) |
| 207 | + |
| 208 | + return err |
59 | 209 | } |
0 commit comments