Skip to content

Commit 3fef0d1

Browse files
rohitnarayanRohit Jaiswalmarkphelpserka
authored
feat(metrics): add Git sync observability metrics to feature flag backend (#4673)
* This change enhances Flipt's Git-based feature flag backend observability by adding detailed synchronization metrics. Currently, failures during Git sync are only logged without metric visibility, limiting proactive monitoring and alerting capabilities. - Introduce new OpenTelemetry metrics for Git sync operations: - Last sync time as an observable gauge (timestamp). - Sync duration histogram. - Counters for number of flags fetched. - Success and failure counts with failure reason attributes. - Instrument the `SnapshotStore.update` method, the core sync loop, to record these metrics accurately on every sync attempt, including partial failures and cleanups. - Extend the `Snapshot` type with `TotalFlagsCount()` to count all flags across namespaces for metric reporting. - Integrate metrics initialization in app startup ensuring consistent telemetry setup. - Improve test coverage by suggesting strategies to verify metric emission and sync behavior. These metric additions enable operators to monitor Git sync health, detect failures promptly, and troubleshoot issues efficiently, significantly improving runtime observability and system reliability. Signed-off-by: Rohit Jaiswal <[email protected]> * fix(tests): close channel only once Signed-off-by: Rohit Jaiswal <[email protected]> * fix(tests): add check for missing features repoURL Signed-off-by: Rohit Jaiswal <[email protected]> * feat(metrics): move git metrics in git package Signed-off-by: Rohit Jaiswal <[email protected]> * feat(metrics): remove reason from sync metrics Signed-off-by: Rohit Jaiswal <[email protected]> * feat(metrics): remove url check Signed-off-by: Rohit Jaiswal <[email protected]> * fix(tests): metrics unit tests Signed-off-by: Rohit Jaiswal <[email protected]> * refactor: simplify git sync metrics Signed-off-by: Roman Dmytrenko <[email protected]> * feat(metrics): use consistent naming and code patterns Signed-off-by: Rohit Jaiswal <[email protected]> * feat(metrics): add unit tests for metrics Signed-off-by: Rohit Jaiswal <[email protected]> * feat(metrics): use correct meter func Signed-off-by: Rohit Jaiswal <[email protected]> * remove unused code Signed-off-by: Roman Dmytrenko <[email protected]> --------- Signed-off-by: Rohit Jaiswal <[email protected]> Signed-off-by: Roman Dmytrenko <[email protected]> Co-authored-by: Rohit Jaiswal <[email protected]> Co-authored-by: Mark Phelps <[email protected]> Co-authored-by: Roman Dmytrenko <[email protected]>
1 parent 94735fc commit 3fef0d1

File tree

8 files changed

+795
-31
lines changed

8 files changed

+795
-31
lines changed

DEVELOPMENT.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ If you run into errors such as:
2828
undefined: sqlite3.Error
2929
```
3030

31-
Then you need to enable CGO.
31+
Then you need to enable CGO.
3232

3333
### Windows
3434

@@ -43,10 +43,10 @@ Then you need to enable CGO.
4343
## Setup
4444

4545
1. Clone this repo: `git clone https://github.com/flipt-io/flipt`.
46-
1. Run `mage bootstrap` to install required development tools. See [#bootstrap](#bootstrap) below.
47-
1. Run `mage go:test` to execute the Go test suite. For more information on tests, see also [here](build/README.md)
48-
1. Run `mage` to build the binary with embedded assets.
49-
1. Run `mage -l` to see a full list of possible commands.
46+
2. Run `mage bootstrap` to install required development tools. See [#bootstrap](#bootstrap) below.
47+
3. Run `mage go:test` to execute the Go test suite. For more information on tests, see also [here](build/README.md)
48+
4. Run `mage` to build the binary with embedded assets.
49+
5. Run `mage -l` to see a full list of possible commands.
5050

5151
## Conventional Commits
5252

@@ -108,7 +108,7 @@ These ports will be forwarded to your local machine automatically if you are dev
108108

109109
## Docker Compose
110110

111-
If you want to develop Flipt using Docker Compose, you can use the `docker-compose.yml` file in the root of this repository.
111+
If you want to develop Flipt using Docker Compose, you can use the `docker-compose.yml` file in the root of this repository.
112112

113113
This will start two Docker containers:
114114

go.work.sum

Lines changed: 565 additions & 4 deletions
Large diffs are not rendered by default.

internal/metrics/metrics.go

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,15 @@ type MustInt64Meter interface {
4444
// with options. The instrument is used to synchronously record increasing
4545
// int64 measurements during a computational operation.
4646
Counter(name string, options ...metric.Int64CounterOption) metric.Int64Counter
47-
// UpDownCounter returns a new instrument identified by name and
48-
// configured with options. The instrument is used to synchronously record
49-
// int64 measurements during a computational operation.
50-
UpDownCounter(name string, options ...metric.Int64UpDownCounterOption) metric.Int64UpDownCounter
5147
// Histogram returns a new instrument identified by name and
5248
// configured with options. The instrument is used to synchronously record
5349
// the distribution of int64 measurements during a computational operation.
5450
Histogram(name string, options ...metric.Int64HistogramOption) metric.Int64Histogram
51+
// ObservableGauge returns a new instrument
52+
// identified by name and configured with options. The instrument is used
53+
// to asynchronously record instantaneous measurements once per a
54+
// measurement collection cycle.
55+
ObservableGauge(name string, v func() int64, options ...metric.Int64ObservableGaugeOption) metric.Int64ObservableGauge
5556
}
5657

5758
type mustInt64Meter struct{}
@@ -66,14 +67,23 @@ func (m mustInt64Meter) Counter(name string, opts ...metric.Int64CounterOption)
6667
return counter
6768
}
6869

69-
// UpDownCounter creates an instrument for recording changes of a value.
70-
func (m mustInt64Meter) UpDownCounter(name string, opts ...metric.Int64UpDownCounterOption) metric.Int64UpDownCounter {
71-
counter, err := meter().Int64UpDownCounter(name, opts...)
70+
// Gauge creates an instrument for recording changes of a value.
71+
func (m mustInt64Meter) ObservableGauge(name string, v func() int64, opts ...metric.Int64ObservableGaugeOption) metric.Int64ObservableGauge {
72+
gauge, err := meter().Int64ObservableGauge(name, opts...)
7273
if err != nil {
7374
panic(err)
7475
}
75-
76-
return counter
76+
_, err = meter().RegisterCallback(
77+
func(ctx context.Context, observer metric.Observer) error {
78+
observer.ObserveInt64(gauge, v())
79+
return nil
80+
},
81+
gauge,
82+
)
83+
if err != nil {
84+
panic(err)
85+
}
86+
return gauge
7787
}
7888

7989
// Histogram creates an instrument for recording a distribution of values.
@@ -100,10 +110,6 @@ type MustFloat64Meter interface {
100110
// with options. The instrument is used to synchronously record increasing
101111
// float64 measurements during a computational operation.
102112
Counter(name string, options ...metric.Float64CounterOption) metric.Float64Counter
103-
// UpDownCounter returns a new instrument identified by name and
104-
// configured with options. The instrument is used to synchronously record
105-
// float64 measurements during a computational operation.
106-
UpDownCounter(name string, options ...metric.Float64UpDownCounterOption) metric.Float64UpDownCounter
107113
// Histogram returns a new instrument identified by name and
108114
// configured with options. The instrument is used to synchronously record
109115
// the distribution of float64 measurements during a computational operation.

internal/storage/fs/git/metrics.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
package git
2+
3+
import (
4+
"context"
5+
"sync/atomic"
6+
"time"
7+
8+
"github.com/prometheus/client_golang/prometheus"
9+
"go.flipt.io/flipt/internal/metrics"
10+
"go.opentelemetry.io/otel/metric"
11+
)
12+
13+
const (
14+
namespace = "flipt"
15+
subsystem = "git_sync"
16+
)
17+
18+
var (
19+
// syncLatency is a histogram for git sync operation duration.
20+
syncLatency = metrics.MustFloat64().
21+
Histogram(
22+
prometheus.BuildFQName(namespace, subsystem, "latency"),
23+
metric.WithDescription("The duration of git sync operations in seconds"),
24+
metric.WithUnit("s"),
25+
)
26+
27+
// syncFlags is a counter for the number of flags during sync.
28+
syncFlags = metrics.MustInt64().
29+
Counter(
30+
prometheus.BuildFQName(namespace, subsystem, "flags"),
31+
metric.WithDescription("The number of flags fetched during git sync"),
32+
)
33+
34+
// syncErrors is a counter for failed git sync operations.
35+
syncErrors = metrics.MustInt64().
36+
Counter(
37+
prometheus.BuildFQName(namespace, subsystem, "errors"),
38+
metric.WithDescription("The number of errors git sync operations"),
39+
)
40+
41+
_ = metrics.MustInt64().
42+
ObservableGauge(
43+
prometheus.BuildFQName(namespace, subsystem, "last_time"),
44+
getLastSyncTime,
45+
metric.WithDescription("The unix timestamp of the last git sync operation"),
46+
)
47+
48+
// internal storage for last sync time value
49+
lastSyncTimeValue atomic.Int64
50+
)
51+
52+
// observeSync records a complete git sync operation with all relevant metrics.
53+
func observeSync(ctx context.Context, duration time.Duration, flagsFetched int64, success bool) {
54+
// Always record duration and update last sync time
55+
syncLatency.Record(ctx, duration.Seconds())
56+
setLastSyncTime(time.Now().UTC())
57+
58+
syncFlags.Add(ctx, flagsFetched)
59+
if !success {
60+
syncErrors.Add(ctx, 1)
61+
}
62+
}
63+
64+
// setLastSyncTime updates the last sync time value that will be reported by the observable gauge.
65+
func setLastSyncTime(ts time.Time) {
66+
lastSyncTimeValue.Store(ts.Unix())
67+
}
68+
69+
// getLastSyncTime returns the current last sync time value.
70+
func getLastSyncTime() int64 {
71+
return lastSyncTimeValue.Load()
72+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package git
2+
3+
import (
4+
"testing"
5+
"time"
6+
)
7+
8+
// TestObserveSyncSuccess ensures success path sets last sync time and updates counters.
9+
func TestObserveSyncSuccess(t *testing.T) {
10+
before := getLastSyncTime()
11+
observeSync(t.Context(), 2*time.Second, 3, true)
12+
13+
after := getLastSyncTime()
14+
if after <= before {
15+
t.Errorf("expected last sync time to update, before=%d after=%d", before, after)
16+
}
17+
}
18+
19+
// TestObserveSyncFailure ensures failure path sets last sync time and updates counters.
20+
func TestObserveSyncFailure(t *testing.T) {
21+
setLastSyncTime(time.Now().UTC().Add(-time.Minute))
22+
before := getLastSyncTime()
23+
observeSync(t.Context(), time.Second, 0, false)
24+
25+
after := getLastSyncTime()
26+
if after <= before {
27+
t.Errorf("expected last sync time to update, before=%d after=%d", before, after)
28+
}
29+
}
30+
31+
// TestSetAndGetLastSyncTime directly validates setter/getter.
32+
func TestSetAndGetLastSyncTime(t *testing.T) {
33+
now := time.Now()
34+
setLastSyncTime(now)
35+
got := getLastSyncTime()
36+
if got != now.Unix() {
37+
t.Errorf("expected last sync time %d, got %d", now.Unix(), got)
38+
}
39+
}

internal/storage/fs/git/store.go

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"os"
99
"slices"
1010
"sync"
11+
"time"
1112

1213
"github.com/go-git/go-billy/v5/osfs"
1314
"github.com/go-git/go-git/v5"
@@ -18,6 +19,7 @@ import (
1819
gitstorage "github.com/go-git/go-git/v5/storage"
1920
"github.com/go-git/go-git/v5/storage/filesystem"
2021
"github.com/go-git/go-git/v5/storage/memory"
22+
2123
"go.flipt.io/flipt/internal/containers"
2224
"go.flipt.io/flipt/internal/gitfs"
2325
"go.flipt.io/flipt/internal/storage"
@@ -139,6 +141,7 @@ func NewSnapshotStore(ctx context.Context, logger *zap.Logger, url string, opts
139141
baseRef: "main",
140142
referenceResolver: staticResolver(),
141143
}
144+
142145
containers.ApplyAll(store, opts...)
143146

144147
store.logger = store.logger.With(zap.String("ref", store.baseRef))
@@ -335,14 +338,20 @@ func (s *SnapshotStore) listRemoteRefs(ctx context.Context) (map[string]struct{}
335338
// HEAD updates to a new revision, it builds a snapshot and updates it
336339
// on the store.
337340
func (s *SnapshotStore) update(ctx context.Context) (bool, error) {
341+
syncStart := time.Now()
338342
updated, fetchErr := s.fetch(ctx, s.snaps.References())
339343

344+
flagsFetched := int64(0)
345+
var errs []error
346+
340347
if !updated && fetchErr == nil {
348+
// No update and no error: record metrics for a successful no-change sync
349+
duration := time.Since(syncStart)
350+
observeSync(ctx, duration, 0, true)
341351
return false, nil
342352
}
343353

344-
// If we can't fetch, we need to check if the remote refs have changed
345-
// and remove any references that are no longer present
354+
// If fetchErr exists, try cleaning up refs but do not declare full failure yet
346355
if fetchErr != nil {
347356
remoteRefs, listErr := s.listRemoteRefs(ctx)
348357
if listErr != nil {
@@ -361,23 +370,37 @@ func (s *SnapshotStore) update(ctx context.Context) (bool, error) {
361370
}
362371
}
363372
}
364-
}
365-
366-
var errs []error
367-
if fetchErr != nil {
368373
errs = append(errs, fetchErr)
369374
}
375+
376+
// Try to rebuild refs even if fetch failed
370377
for _, ref := range s.snaps.References() {
371378
hash, err := s.resolve(ref)
372379
if err != nil {
373380
errs = append(errs, err)
374381
continue
375382
}
376-
if _, err := s.snaps.AddOrBuild(ctx, ref, hash, s.buildSnapshot); err != nil {
383+
384+
snap, err := s.snaps.AddOrBuild(ctx, ref, hash, s.buildSnapshot)
385+
if err != nil {
377386
errs = append(errs, err)
387+
continue
388+
}
389+
390+
if snap != nil {
391+
flagsFetched += int64(snap.TotalFlagsCount())
378392
}
379393
}
380-
return true, errors.Join(errs...)
394+
395+
duration := time.Since(syncStart)
396+
observeSync(ctx, duration, flagsFetched, len(errs) == 0)
397+
398+
if len(errs) > 0 {
399+
s.logger.Error("git backend flag sync failed", zap.Errors("errors", errs))
400+
return false, errors.Join(errs...)
401+
}
402+
403+
return true, nil
381404
}
382405

383406
func (s *SnapshotStore) fetch(ctx context.Context, heads []string) (bool, error) {

internal/storage/fs/snapshot.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,14 @@ func (ss *Snapshot) CountFlags(ctx context.Context, p storage.NamespaceRequest)
789789
return uint64(len(ns.flags)), nil
790790
}
791791

792+
func (ss *Snapshot) TotalFlagsCount() int {
793+
total := 0
794+
for _, ns := range ss.ns {
795+
total += len(ns.flags)
796+
}
797+
return total
798+
}
799+
792800
func (ss *Snapshot) GetEvaluationRules(ctx context.Context, flag storage.ResourceRequest) ([]*storage.EvaluationRule, error) {
793801
ns, ok := ss.ns[flag.Namespace()]
794802
if !ok {

internal/storage/fs/snapshot_test.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1840,3 +1840,58 @@ func TestEtagWithFewDocs(t *testing.T) {
18401840
require.NoError(t, err)
18411841
assert.Equal(t, "dc26c96ddf6430ed603862ffe5d3ac9a", version)
18421842
}
1843+
1844+
func TestTotalFlagsCount(t *testing.T) {
1845+
testCases := []struct {
1846+
name string
1847+
setupSnapshot func() *Snapshot
1848+
expectedCount int
1849+
}{
1850+
{
1851+
name: "EmptySnapshot",
1852+
setupSnapshot: func() *Snapshot {
1853+
return &Snapshot{ns: map[string]*namespace{}}
1854+
},
1855+
expectedCount: 0,
1856+
},
1857+
{
1858+
name: "SingleNamespaceWithTwoFlags",
1859+
setupSnapshot: func() *Snapshot {
1860+
ns := &namespace{
1861+
flags: map[string]*flipt.Flag{
1862+
"flag1": {Key: "flag1", NamespaceKey: "default"},
1863+
"flag2": {Key: "flag2", NamespaceKey: "default"},
1864+
},
1865+
}
1866+
return &Snapshot{ns: map[string]*namespace{"default": ns}}
1867+
},
1868+
expectedCount: 2,
1869+
},
1870+
{
1871+
name: "MultipleNamespaces",
1872+
setupSnapshot: func() *Snapshot {
1873+
ns1 := &namespace{
1874+
flags: map[string]*flipt.Flag{
1875+
"flag1": {Key: "flag1", NamespaceKey: "default"},
1876+
},
1877+
}
1878+
ns2 := &namespace{
1879+
flags: map[string]*flipt.Flag{
1880+
"flagA": {Key: "flagA", NamespaceKey: "other"},
1881+
"flagB": {Key: "flagB", NamespaceKey: "other"},
1882+
},
1883+
}
1884+
return &Snapshot{ns: map[string]*namespace{"default": ns1, "other": ns2}}
1885+
},
1886+
expectedCount: 3,
1887+
},
1888+
}
1889+
1890+
for _, tc := range testCases {
1891+
t.Run(tc.name, func(t *testing.T) {
1892+
snapshot := tc.setupSnapshot()
1893+
got := snapshot.TotalFlagsCount()
1894+
assert.Equal(t, tc.expectedCount, got, "unexpected flag count")
1895+
})
1896+
}
1897+
}

0 commit comments

Comments
 (0)