forked from ihippik/wal-listener
-
Notifications
You must be signed in to change notification settings - Fork 1
feat: add otel metrics and enable '/metrics' route #16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from 4 commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
7f640cd
otel metrics wip
samiulsami 80a397e
gracefully shutdown http server upon exit
samiulsami f0b0d63
Undo unnecessary changes
samiulsami 0e7afc6
Fix test
samiulsami c16582c
fix linter issues
samiulsami d8292ec
force shutdown upon receiving second os signal
samiulsami 703f90b
Add utility package for graceful shutdown context
samiulsami 9e17d1b
remove duplicate metrics file
samiulsami 8b3bdfe
fix tests by adding mock metrics object
samiulsami File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,59 +1,209 @@ | ||
| /* | ||
| Copyright AppsCode Inc. and Contributors | ||
|
|
||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||
| you may not use this file except in compliance with the License. | ||
| You may obtain a copy of the License at | ||
|
|
||
| http://www.apache.org/licenses/LICENSE-2.0 | ||
|
|
||
| Unless required by applicable law or agreed to in writing, software | ||
| distributed under the License is distributed on an "AS IS" BASIS, | ||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| See the License for the specific language governing permissions and | ||
| limitations under the License. | ||
| */ | ||
|
|
||
| package apis | ||
|
|
||
| import ( | ||
| "github.com/prometheus/client_golang/prometheus" | ||
| "github.com/prometheus/client_golang/prometheus/promauto" | ||
| "context" | ||
|
|
||
| "go.opentelemetry.io/otel" | ||
| "go.opentelemetry.io/otel/attribute" | ||
| "go.opentelemetry.io/otel/metric" | ||
| ) | ||
|
|
||
| const ( | ||
| meterName = "kubeops.dev/pgoutbox" | ||
|
|
||
| attrKeyTable = "table" | ||
| attrKeySubject = "subject" | ||
| attrKeyKind = "kind" | ||
| ) | ||
|
|
||
| // Metrics groups the OpenTelemetry instruments (counters, histograms and observable gauges) reported by pgoutbox. | ||
| type Metrics struct { | ||
| filterSkippedEvents, publishedEvents, problematicEvents *prometheus.CounterVec | ||
| // Counters | ||
| eventsPublished metric.Int64Counter | ||
| eventsFiltered metric.Int64Counter | ||
| eventsFailed metric.Int64Counter | ||
|
|
||
| // Histograms | ||
| processingDuration metric.Float64Histogram | ||
| publishDuration metric.Float64Histogram | ||
|
|
||
| // Gauges (observable) | ||
| replicationLagBytes metric.Int64ObservableGauge | ||
| currentLSN metric.Int64ObservableGauge | ||
| } | ||
|
|
||
| const ( | ||
| labelApp = "app" | ||
| labelTable = "table" | ||
| labelSubject = "subject" | ||
| labelKind = "kind" | ||
| ) | ||
| func NewMetrics() (*Metrics, error) { | ||
| meter := otel.Meter(meterName) | ||
|
|
||
| eventsPublished, err := meter.Int64Counter( | ||
| "pgoutbox.events.published", | ||
| metric.WithDescription("Total number of successfully published events"), | ||
| metric.WithUnit("{event}"), | ||
| ) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| eventsFiltered, err := meter.Int64Counter( | ||
| "pgoutbox.events.filtered", | ||
| metric.WithDescription("Total number of events skipped by filter"), | ||
| metric.WithUnit("{event}"), | ||
| ) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| eventsFailed, err := meter.Int64Counter( | ||
| "pgoutbox.events.failed", | ||
| metric.WithDescription("Total number of events that failed processing"), | ||
| metric.WithUnit("{event}"), | ||
| ) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| processingDuration, err := meter.Float64Histogram( | ||
| "pgoutbox.processing.duration", | ||
| metric.WithDescription("Time to process a WAL message end-to-end"), | ||
| metric.WithUnit("s"), | ||
| metric.WithExplicitBucketBoundaries(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10), | ||
| ) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| publishDuration, err := meter.Float64Histogram( | ||
| "pgoutbox.publish.duration", | ||
| metric.WithDescription("Time to publish an event to the message broker"), | ||
| metric.WithUnit("s"), | ||
| metric.WithExplicitBucketBoundaries(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5), | ||
| ) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| replicationLagBytes, err := meter.Int64ObservableGauge( | ||
| "pgoutbox.replication.lag.bytes", | ||
| metric.WithDescription("Replication lag in bytes (difference between server WAL end and current LSN)"), | ||
| metric.WithUnit("By"), | ||
| ) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| currentLSN, err := meter.Int64ObservableGauge( | ||
| "pgoutbox.replication.lsn", | ||
| metric.WithDescription("Current LSN position in the replication stream"), | ||
| metric.WithUnit("{position}"), | ||
| ) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| // NewMetrics create and initialize new Prometheus metrics. | ||
| func NewMetrics() *Metrics { | ||
| return &Metrics{ | ||
| publishedEvents: promauto.NewCounterVec(prometheus.CounterOpts{ | ||
| Name: "published_events_total", | ||
| Help: "The total number of published events", | ||
| }, | ||
| []string{labelApp, labelSubject, labelTable}, | ||
| ), | ||
| problematicEvents: promauto.NewCounterVec(prometheus.CounterOpts{ | ||
| Name: "problematic_events_total", | ||
| Help: "The total number of skipped problematic events", | ||
| }, | ||
| []string{labelApp, labelKind}, | ||
| ), | ||
| filterSkippedEvents: promauto.NewCounterVec(prometheus.CounterOpts{ | ||
| Name: "filter_skipped_events_total", | ||
| Help: "The total number of skipped events", | ||
| }, | ||
| []string{labelApp, labelTable}, | ||
| eventsPublished: eventsPublished, | ||
| eventsFiltered: eventsFiltered, | ||
| eventsFailed: eventsFailed, | ||
| processingDuration: processingDuration, | ||
| publishDuration: publishDuration, | ||
| replicationLagBytes: replicationLagBytes, | ||
| currentLSN: currentLSN, | ||
| }, nil | ||
| } | ||
|
|
||
| // IncPublishedEvents adds one to the published-events counter, tagging the | ||
| // data point with the given subject and table. It does nothing when the | ||
| // receiver or the counter instrument is nil. | ||
| func (m *Metrics) IncPublishedEvents(subject, table string) { | ||
| if m == nil { | ||
| return | ||
| } | ||
| counter := m.eventsPublished | ||
| if counter == nil { | ||
| return | ||
| } | ||
| attrs := metric.WithAttributes( | ||
| attribute.String(attrKeySubject, subject), | ||
| attribute.String(attrKeyTable, table), | ||
| ) | ||
| counter.Add(context.Background(), 1, attrs) | ||
| } | ||
|
|
||
| // IncFilterSkippedEvents adds one to the filtered-events counter for the | ||
| // given table. It does nothing when the receiver or the counter is nil. | ||
| func (m *Metrics) IncFilterSkippedEvents(table string) { | ||
| if m == nil || m.eventsFiltered == nil { | ||
| return | ||
| } | ||
| tableAttr := attribute.String(attrKeyTable, table) | ||
| m.eventsFiltered.Add(context.Background(), 1, metric.WithAttributes(tableAttr)) | ||
| } | ||
|
|
||
| const appName = "pgoutbox" | ||
| // IncProblematicEvents adds one to the failed-events counter, labelled with | ||
| // the supplied kind. It does nothing when the receiver or the counter is nil. | ||
| func (m *Metrics) IncProblematicEvents(kind string) { | ||
| if m != nil && m.eventsFailed != nil { | ||
| kindAttr := attribute.String(attrKeyKind, kind) | ||
| m.eventsFailed.Add(context.Background(), 1, metric.WithAttributes(kindAttr)) | ||
| } | ||
| } | ||
|
|
||
| // RecordProcessingDuration records one sample, in seconds, on the processing | ||
| // duration histogram. It does nothing when the receiver or the histogram is nil. | ||
| func (m *Metrics) RecordProcessingDuration(seconds float64) { | ||
| if m != nil && m.processingDuration != nil { | ||
| m.processingDuration.Record(context.Background(), seconds) | ||
| } | ||
| } | ||
|
|
||
| // IncPublishedEvents increment published events counter. | ||
| func (m Metrics) IncPublishedEvents(subject, table string) { | ||
| m.publishedEvents.With(prometheus.Labels{labelApp: appName, labelSubject: subject, labelTable: table}).Inc() | ||
| // RecordPublishDuration records one sample, in seconds, on the publish | ||
| // duration histogram, labelled with the subject. It does nothing when the | ||
| // receiver or the histogram is nil. | ||
| func (m *Metrics) RecordPublishDuration(seconds float64, subject string) { | ||
| if m == nil { | ||
| return | ||
| } | ||
| hist := m.publishDuration | ||
| if hist == nil { | ||
| return | ||
| } | ||
| subjectAttr := attribute.String(attrKeySubject, subject) | ||
| hist.Record(context.Background(), seconds, metric.WithAttributes(subjectAttr)) | ||
| } | ||
|
|
||
| // IncFilterSkippedEvents increment skipped by filter events counter. | ||
| func (m Metrics) IncFilterSkippedEvents(table string) { | ||
| m.filterSkippedEvents.With(prometheus.Labels{labelApp: appName, labelTable: table}).Inc() | ||
| // GaugeCallbacks supplies the value sources that RegisterCallbacks samples | ||
| // when the observable gauges are collected. | ||
| type GaugeCallbacks struct { | ||
| // GetCurrentLSN returns the value reported on the current-LSN gauge. | ||
| // When nil, neither gauge is observed. | ||
| GetCurrentLSN func() uint64 | ||
| // GetServerLSN returns the server LSN used to derive the lag gauge | ||
| // (server LSN minus current LSN). When nil, only the current LSN is observed. | ||
| GetServerLSN func() uint64 | ||
| } | ||
|
|
||
| // IncProblematicEvents increment skipped by filter events counter. | ||
| func (m Metrics) IncProblematicEvents(kind string) { | ||
| m.problematicEvents.With(prometheus.Labels{labelApp: appName, labelKind: kind}).Inc() | ||
| func (m *Metrics) RegisterCallbacks(cb GaugeCallbacks) error { | ||
| if m == nil { | ||
| return nil | ||
| } | ||
|
|
||
| meter := otel.Meter(meterName) | ||
|
|
||
| _, err := meter.RegisterCallback( | ||
| func(ctx context.Context, o metric.Observer) error { | ||
| if cb.GetCurrentLSN != nil { | ||
| currentLSN := cb.GetCurrentLSN() | ||
| o.ObserveInt64(m.currentLSN, int64(currentLSN)) | ||
| if cb.GetServerLSN != nil { | ||
| serverLSN := cb.GetServerLSN() | ||
| if serverLSN > currentLSN { | ||
| o.ObserveInt64(m.replicationLagBytes, int64(serverLSN-currentLSN)) | ||
| } else { | ||
| o.ObserveInt64(m.replicationLagBytes, 0) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return nil | ||
| }, | ||
| m.currentLSN, | ||
| m.replicationLagBytes, | ||
| ) | ||
|
|
||
| return err | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.