Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
8 changes: 8 additions & 0 deletions apis/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ type Config struct {
Database *DatabaseCfg `valid:"required" json:"database" mapstructure:"database"`
Publisher *PublisherCfg `valid:"required" json:"publisher" mapstructure:"publisher"`
Logger *Logger `valid:"required" json:"logger" mapstructure:"logger"`
Telemetry *TelemetryCfg `json:"telemetry" mapstructure:"telemetry"`
}

// TelemetryCfg holds telemetry and metrics configuration.
type TelemetryCfg struct {
// Enabled enables the Prometheus metrics endpoint at /metrics.
//
// NOTE: this is a plain bool, so its zero value is false. A telemetry
// section that is present but omits "enabled" therefore turns telemetry
// OFF. Telemetry is only on by default when the whole telemetry section is
// absent, because the caller checks `cfg.Telemetry == nil || cfg.Telemetry.Enabled`.
Enabled bool `json:"enabled" mapstructure:"enabled"`
}

// ListenerCfg path of the listener config.
Expand Down
3 changes: 2 additions & 1 deletion apis/event.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ package apis

import (
"fmt"
"github.com/google/uuid"
"time"

"github.com/google/uuid"
)

// Event structure for publishing to the NATS server.
Expand Down
228 changes: 189 additions & 39 deletions apis/metrics.go
Original file line number Diff line number Diff line change
@@ -1,59 +1,209 @@
/*
Copyright AppsCode Inc. and Contributors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package apis

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"context"

"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
)

const (
// meterName is the instrumentation scope name passed to otel.Meter when
// instruments are created and when gauge callbacks are registered.
meterName = "kubeops.dev/pgoutbox"

// Attribute keys attached to recorded measurements via attribute.String.
attrKeyTable = "table"
attrKeySubject = "subject"
attrKeyKind = "kind"
)

// Metrics holds the metric instruments used to report pgoutbox activity.
// Its methods tolerate a nil receiver and nil counter/histogram fields, so a
// caller may pass a nil *Metrics to disable recording.
type Metrics struct {
filterSkippedEvents, publishedEvents, problematicEvents *prometheus.CounterVec
// Counters
// eventsPublished is incremented by IncPublishedEvents.
eventsPublished metric.Int64Counter
// eventsFiltered is incremented by IncFilterSkippedEvents.
eventsFiltered metric.Int64Counter
// eventsFailed is incremented by IncProblematicEvents.
eventsFailed metric.Int64Counter

// Histograms
// processingDuration and publishDuration are recorded in seconds.
processingDuration metric.Float64Histogram
publishDuration metric.Float64Histogram

// Gauges (observable)
// Both gauges are reported from the callback installed by RegisterCallbacks.
replicationLagBytes metric.Int64ObservableGauge
currentLSN metric.Int64ObservableGauge
}

// Label names used when declaring and populating the Prometheus counter
// vectors.
const (
labelApp = "app"
labelTable = "table"
labelSubject = "subject"
labelKind = "kind"
)
// NewMetrics creates every instrument in Metrics on the global meter obtained
// from otel.Meter(meterName).
//
// It returns the first instrument-creation error unchanged, together with a
// nil *Metrics. On success all instrument fields are populated. The observable
// gauges report nothing until RegisterCallbacks installs a callback for them.
func NewMetrics() (*Metrics, error) {
meter := otel.Meter(meterName)

eventsPublished, err := meter.Int64Counter(
"pgoutbox.events.published",
metric.WithDescription("Total number of successfully published events"),
metric.WithUnit("{event}"),
)
if err != nil {
return nil, err
}

eventsFiltered, err := meter.Int64Counter(
"pgoutbox.events.filtered",
metric.WithDescription("Total number of events skipped by filter"),
metric.WithUnit("{event}"),
)
if err != nil {
return nil, err
}

eventsFailed, err := meter.Int64Counter(
"pgoutbox.events.failed",
metric.WithDescription("Total number of events that failed processing"),
metric.WithUnit("{event}"),
)
if err != nil {
return nil, err
}

// Histogram bucket boundaries are expressed in seconds (unit "s"):
// 1ms up to 10s for end-to-end processing.
processingDuration, err := meter.Float64Histogram(
"pgoutbox.processing.duration",
metric.WithDescription("Time to process a WAL message end-to-end"),
metric.WithUnit("s"),
metric.WithExplicitBucketBoundaries(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10),
)
if err != nil {
return nil, err
}

// Same scale as above but capped at 5s for broker publishes.
publishDuration, err := meter.Float64Histogram(
"pgoutbox.publish.duration",
metric.WithDescription("Time to publish an event to the message broker"),
metric.WithUnit("s"),
metric.WithExplicitBucketBoundaries(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5),
)
if err != nil {
return nil, err
}

replicationLagBytes, err := meter.Int64ObservableGauge(
"pgoutbox.replication.lag.bytes",
metric.WithDescription("Replication lag in bytes (difference between server WAL end and current LSN)"),
metric.WithUnit("By"),
)
if err != nil {
return nil, err
}

currentLSN, err := meter.Int64ObservableGauge(
"pgoutbox.replication.lsn",
metric.WithDescription("Current LSN position in the replication stream"),
metric.WithUnit("{position}"),
)
if err != nil {
return nil, err
}

// NewMetrics create and initialize new Prometheus metrics.
func NewMetrics() *Metrics {
return &Metrics{
publishedEvents: promauto.NewCounterVec(prometheus.CounterOpts{
Name: "published_events_total",
Help: "The total number of published events",
},
[]string{labelApp, labelSubject, labelTable},
),
problematicEvents: promauto.NewCounterVec(prometheus.CounterOpts{
Name: "problematic_events_total",
Help: "The total number of skipped problematic events",
},
[]string{labelApp, labelKind},
),
filterSkippedEvents: promauto.NewCounterVec(prometheus.CounterOpts{
Name: "filter_skipped_events_total",
Help: "The total number of skipped events",
},
[]string{labelApp, labelTable},
eventsPublished: eventsPublished,
eventsFiltered: eventsFiltered,
eventsFailed: eventsFailed,
processingDuration: processingDuration,
publishDuration: publishDuration,
replicationLagBytes: replicationLagBytes,
currentLSN: currentLSN,
}, nil
}

// IncPublishedEvents adds one to the published-events counter, tagging the
// measurement with the given subject and table. It does nothing when the
// receiver is nil or the counter has not been created.
func (m *Metrics) IncPublishedEvents(subject, table string) {
	if m == nil {
		return
	}
	counter := m.eventsPublished
	if counter == nil {
		return
	}

	subjectAttr := attribute.String(attrKeySubject, subject)
	tableAttr := attribute.String(attrKeyTable, table)
	counter.Add(context.Background(), 1, metric.WithAttributes(subjectAttr, tableAttr))
}

// IncFilterSkippedEvents adds one to the filtered-events counter, tagging the
// measurement with the given table name. It does nothing when the receiver is
// nil or the counter has not been created.
func (m *Metrics) IncFilterSkippedEvents(table string) {
	if m == nil {
		return
	}
	counter := m.eventsFiltered
	if counter == nil {
		return
	}

	tableAttr := attribute.String(attrKeyTable, table)
	counter.Add(context.Background(), 1, metric.WithAttributes(tableAttr))
}

const appName = "pgoutbox"
// IncProblematicEvents adds one to the failed-events counter, tagging the
// measurement with the given kind. It does nothing when the receiver is nil
// or the counter has not been created.
func (m *Metrics) IncProblematicEvents(kind string) {
	if m == nil {
		return
	}
	counter := m.eventsFailed
	if counter == nil {
		return
	}

	kindAttr := attribute.String(attrKeyKind, kind)
	counter.Add(context.Background(), 1, metric.WithAttributes(kindAttr))
}

// RecordProcessingDuration records one observation, in seconds, on the
// processing-duration histogram with no attributes. It does nothing when the
// receiver is nil or the histogram has not been created.
func (m *Metrics) RecordProcessingDuration(seconds float64) {
	if m == nil {
		return
	}
	hist := m.processingDuration
	if hist == nil {
		return
	}

	hist.Record(context.Background(), seconds)
}

// IncPublishedEvents increment published events counter.
func (m Metrics) IncPublishedEvents(subject, table string) {
m.publishedEvents.With(prometheus.Labels{labelApp: appName, labelSubject: subject, labelTable: table}).Inc()
// RecordPublishDuration records one observation, in seconds, on the
// publish-duration histogram, tagged with the given subject. It does nothing
// when the receiver is nil or the histogram has not been created.
func (m *Metrics) RecordPublishDuration(seconds float64, subject string) {
	if m == nil {
		return
	}
	hist := m.publishDuration
	if hist == nil {
		return
	}

	subjectAttr := attribute.String(attrKeySubject, subject)
	hist.Record(context.Background(), seconds, metric.WithAttributes(subjectAttr))
}

// IncFilterSkippedEvents increment skipped by filter events counter.
func (m Metrics) IncFilterSkippedEvents(table string) {
m.filterSkippedEvents.With(prometheus.Labels{labelApp: appName, labelTable: table}).Inc()
// GaugeCallbacks bundles the value sources that RegisterCallbacks reads when
// the observable gauges are collected. Either function may be left nil:
// a nil GetCurrentLSN suppresses both gauges, and a nil GetServerLSN
// suppresses only the replication-lag gauge.
type GaugeCallbacks struct {
	// GetCurrentLSN reports the current LSN; GetServerLSN reports the
	// server-side LSN the lag is measured against.
	GetCurrentLSN, GetServerLSN func() uint64
}

// IncProblematicEvents increment skipped by filter events counter.
func (m Metrics) IncProblematicEvents(kind string) {
m.problematicEvents.With(prometheus.Labels{labelApp: appName, labelKind: kind}).Inc()
// RegisterCallbacks installs the callback that feeds the observable gauges
// (current LSN and replication lag) from the functions in cb.
//
// On each collection the callback:
//   - reports nothing when cb.GetCurrentLSN is nil;
//   - otherwise observes the current LSN, and, when cb.GetServerLSN is also
//     set, observes the lag as serverLSN-currentLSN, clamped to 0 when the
//     server LSN is not ahead of the current one.
//
// It returns nil without registering anything when the receiver is nil or
// either gauge was never created, and otherwise returns the error from the
// meter's RegisterCallback unchanged.
func (m *Metrics) RegisterCallbacks(cb GaugeCallbacks) error {
	// Match the nil-tolerance of the other Metrics methods: a zero-value
	// Metrics must not hand nil instruments to the meter or the observer.
	if m == nil || m.currentLSN == nil || m.replicationLagBytes == nil {
		return nil
	}

	meter := otel.Meter(meterName)

	_, err := meter.RegisterCallback(
		func(_ context.Context, o metric.Observer) error {
			if cb.GetCurrentLSN == nil {
				return nil
			}
			currentLSN := cb.GetCurrentLSN()
			o.ObserveInt64(m.currentLSN, int64(currentLSN))

			if cb.GetServerLSN == nil {
				return nil
			}
			// Unsigned subtraction would wrap if the server LSN is behind,
			// so the lag stays 0 unless the server is strictly ahead.
			var lag uint64
			if serverLSN := cb.GetServerLSN(); serverLSN > currentLSN {
				lag = serverLSN - currentLSN
			}
			o.ObserveInt64(m.replicationLagBytes, int64(lag))

			return nil
		},
		m.currentLSN,
		m.replicationLagBytes,
	)

	return err
}
29 changes: 28 additions & 1 deletion cmd/pgoutbox/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package main

import (
"context"
"encoding/binary"
"fmt"
"log/slog"
Expand All @@ -29,6 +30,7 @@ import (
"kubeops.dev/pgoutbox/apis"
"kubeops.dev/pgoutbox/internal/listener"
"kubeops.dev/pgoutbox/internal/listener/transaction"
"kubeops.dev/pgoutbox/internal/telemetry"

"github.com/urfave/cli/v2"
)
Expand Down Expand Up @@ -85,6 +87,19 @@ func main() {

logger := apis.InitSlog(cfg.Logger, version, false)

if cfg.Telemetry == nil || cfg.Telemetry.Enabled {
if err = telemetry.InitMetrics(ctx, version); err != nil {
return fmt.Errorf("initialize telemetry: %w", err)
}
defer func() {
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second)
defer shutdownCancel()
if err := telemetry.Shutdown(shutdownCtx); err != nil {
slog.Error("telemetry shutdown failed", "err", err)
}
}()
}

pgxConn, pgConn, err := initPgxConnections(cfg.Database, logger, time.Minute*10)
if err != nil {
return fmt.Errorf("pgx connection: %w", err)
Expand All @@ -104,16 +119,28 @@ func main() {
}
}()

metrics, err := apis.NewMetrics()
if err != nil {
return fmt.Errorf("initialize metrics: %w", err)
}

svc := listener.NewWalListener(
cfg,
logger,
listener.NewRepository(pgxConn),
newReplicationConn(pgConn),
pub,
transaction.NewBinaryParser(logger, binary.BigEndian),
apis.NewMetrics(),
metrics,
)

if err := metrics.RegisterCallbacks(apis.GaugeCallbacks{
GetCurrentLSN: func() uint64 { return uint64(svc.ReadLSN()) },
GetServerLSN: func() uint64 { return uint64(svc.ReadServerLSN()) },
}); err != nil {
return fmt.Errorf("register metrics callbacks: %w", err)
}

go svc.InitHandlers(ctx)

if err = svc.Process(ctx); err != nil {
Expand Down
Loading