diff --git a/internal/datastore/crdb/pool/balancer.go b/internal/datastore/crdb/pool/balancer.go
index 38f428bae..dbbbd23d2 100644
--- a/internal/datastore/crdb/pool/balancer.go
+++ b/internal/datastore/crdb/pool/balancer.go
@@ -23,12 +23,12 @@ import (
 var (
 	connectionsPerCRDBNodeCountGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 		Name: "crdb_connections_per_node",
-		Help: "the number of connections spicedb has to each crdb node",
+		Help: "The number of active connections SpiceDB holds to each CockroachDB node, by pool (read/write). Imbalanced values across nodes suggest the connection balancer is unable to redistribute connections evenly.",
 	}, []string{"pool", "node_id"})
 
 	pruningTimeHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
 		Name:    "crdb_pruning_duration",
-		Help:    "milliseconds spent on one iteration of pruning excess connections",
+		Help:    "Duration in milliseconds of one iteration of the CockroachDB connection balancer pruning excess connections from over-represented nodes. Elevated values indicate the balancer is struggling to rebalance connections.",
 		Buckets: []float64{.1, .2, .5, 1, 2, 5, 10, 20, 50, 100},
 	}, []string{"pool"})
 )
diff --git a/internal/datastore/crdb/pool/pool.go b/internal/datastore/crdb/pool/pool.go
index fd5f79440..4699b33bd 100644
--- a/internal/datastore/crdb/pool/pool.go
+++ b/internal/datastore/crdb/pool/pool.go
@@ -30,7 +30,7 @@ type pgxPool interface {
 var resetHistogram = prometheus.NewHistogram(prometheus.HistogramOpts{
 	Name:    "crdb_client_resets",
-	Help:    "cockroachdb client-side tx reset distribution",
+	Help:    "Distribution of the number of client-side transaction restarts per transaction attempt. Restarts occur when CockroachDB returns a serialization failure (40001) and the driver retries the transaction from scratch. Sustained high values indicate transaction contention.",
 	Buckets: []float64{0, 1, 2, 5, 10, 20, 50},
 })
diff --git a/internal/datastore/crdb/watch.go b/internal/datastore/crdb/watch.go
index 24c0a2a7c..e6e21e885 100644
--- a/internal/datastore/crdb/watch.go
+++ b/internal/datastore/crdb/watch.go
@@ -11,7 +11,6 @@ import (
 	"time"
 
 	"github.com/jackc/pgx/v5"
-	"github.com/prometheus/client_golang/prometheus"
 	"google.golang.org/protobuf/types/known/structpb"
 	"google.golang.org/protobuf/types/known/timestamppb"
 
@@ -37,18 +36,6 @@ const (
 	queryChangefeedPreV22 = "EXPERIMENTAL CHANGEFEED FOR %s WITH updated, cursor = '%s', resolved = '%s';"
 )
 
-var retryHistogram = prometheus.NewHistogram(prometheus.HistogramOpts{
-	Namespace: "spicedb",
-	Subsystem: "datastore",
-	Name:      "crdb_watch_retries",
-	Help:      "watch retry distribution",
-	Buckets:   []float64{0, 1, 2, 5, 10, 20, 50},
-})
-
-func init() {
-	prometheus.MustRegister(retryHistogram)
-}
-
 type changeDetails struct {
 	Resolved string
 	Updated  string
diff --git a/internal/datastore/proxy/observable.go b/internal/datastore/proxy/observable.go
index 39c575acb..aa87bfd60 100644
--- a/internal/datastore/proxy/observable.go
+++ b/internal/datastore/proxy/observable.go
@@ -27,7 +27,7 @@ var (
 		Subsystem: "datastore",
 		Name:      "loaded_relationships_count",
 		Buckets:   []float64{0, 1, 3, 10, 32, 100, 316, 1000, 3162, 10000},
-		Help:      "total number of relationships loaded for a query",
+		Help:      "Histogram of the number of relationships loaded per individual datastore query. High p99 values (>1000) may indicate broad permission checks or missing filters.",
 	})
 
 	queryLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{
diff --git a/internal/datastore/proxy/schemacaching/watchingcache.go b/internal/datastore/proxy/schemacaching/watchingcache.go
index 77c329705..d69b85e6b 100644
--- a/internal/datastore/proxy/schemacaching/watchingcache.go
+++ b/internal/datastore/proxy/schemacaching/watchingcache.go
@@ -23,21 +23,21 @@ var namespacesFallbackModeGauge = prometheus.NewGauge(prometheus.GaugeOpts{
 	Namespace: "spicedb",
 	Subsystem: "datastore",
 	Name:      "watching_schema_cache_namespaces_fallback_mode",
-	Help:      "value of 1 if the cache is in fallback mode and 0 otherwise",
+	Help:      "Whether the watching schema cache for namespace definitions is in fallback mode (1) or normal mode (0). Fallback is triggered when the CockroachDB changefeed used to track schema updates becomes unavailable; in this state every schema lookup hits the datastore directly.",
 })
 
 var caveatsFallbackModeGauge = prometheus.NewGauge(prometheus.GaugeOpts{
 	Namespace: "spicedb",
 	Subsystem: "datastore",
 	Name:      "watching_schema_cache_caveats_fallback_mode",
-	Help:      "value of 1 if the cache is in fallback mode and 0 otherwise",
+	Help:      "Whether the watching schema cache for caveat definitions is in fallback mode (1) or normal mode (0). Fallback is triggered when the CockroachDB changefeed used to track schema updates becomes unavailable; in this state every schema lookup hits the datastore directly.",
 })
 
 var schemaCacheRevisionGauge = prometheus.NewGauge(prometheus.GaugeOpts{
 	Namespace: "spicedb",
 	Subsystem: "datastore",
 	Name:      "watching_schema_cache_tracked_revision",
-	Help:      "the currently tracked max revision for the schema cache",
+	Help:      "The current maximum revision tracked by the CockroachDB changefeed-backed schema cache. A value that is not advancing over time indicates the changefeed has stalled.",
 })
 
 var definitionsReadCachedCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
diff --git a/internal/datastore/spanner/watch.go b/internal/datastore/spanner/watch.go
index 26d993767..ce8e537e3 100644
--- a/internal/datastore/spanner/watch.go
+++ b/internal/datastore/spanner/watch.go
@@ -12,7 +12,6 @@ import (
 	"cloud.google.com/go/spanner"
 	sppb "cloud.google.com/go/spanner/apiv1/spannerpb"
 	"github.com/cloudspannerecosystem/spanner-change-streams-tail/changestreams"
-	"github.com/prometheus/client_golang/prometheus"
 	"github.com/puzpuzpuz/xsync/v4"
 	"google.golang.org/api/option"
 
@@ -28,18 +27,6 @@ const (
 	CombinedChangeStreamName = "combined_change_stream"
 )
 
-var retryHistogram = prometheus.NewHistogram(prometheus.HistogramOpts{
-	Namespace: "spicedb",
-	Subsystem: "datastore",
-	Name:      "spanner_watch_retries",
-	Help:      "watch retry distribution",
-	Buckets:   []float64{0, 1, 2, 5, 10, 20, 50},
-})
-
-func init() {
-	prometheus.MustRegister(retryHistogram)
-}
-
 // Copied from the spanner library: https://github.com/googleapis/google-cloud-go/blob/f03779538f949fb4ad93d5247d3c6b3e5b21091a/spanner/client.go#L67
 // License: Apache License, Version 2.0, Copyright 2017 Google LLC
 var validDBPattern = regexp.MustCompile("^projects/(?P[^/]+)/instances/(?P[^/]+)/databases/(?P[^/]+)$")
diff --git a/internal/dispatch/caching/caching.go b/internal/dispatch/caching/caching.go
index e08360073..105c08221 100644
--- a/internal/dispatch/caching/caching.go
+++ b/internal/dispatch/caching/caching.go
@@ -62,33 +62,39 @@ func NewCachingDispatcher(cacheInst cache.Cache[keys.DispatchCacheKey, any], met
 		Namespace: prometheusNamespace,
 		Subsystem: prometheusSubsystem,
 		Name:      "check_total",
+		Help:      "Total number of CheckPermission dispatch requests processed.",
 	})
 
 	checkFromCacheCounter := prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: prometheusNamespace,
 		Subsystem: prometheusSubsystem,
 		Name:      "check_from_cache_total",
+		Help:      "Total number of CheckPermission dispatch requests served directly from the dispatch cache, avoiding re-computation.",
 	})
 
 	lookupResourcesTotalCounter := prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: prometheusNamespace,
 		Subsystem: prometheusSubsystem,
 		Name:      "lookup_resources_total",
+		Help:      "Total number of LookupResources dispatch requests processed.",
 	})
 
 	lookupResourcesFromCacheCounter := prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: prometheusNamespace,
 		Subsystem: prometheusSubsystem,
 		Name:      "lookup_resources_from_cache_total",
+		Help:      "Total number of LookupResources dispatch requests served directly from the dispatch cache.",
 	})
 
 	lookupSubjectsTotalCounter := prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: prometheusNamespace,
 		Subsystem: prometheusSubsystem,
 		Name:      "lookup_subjects_total",
+		Help:      "Total number of LookupSubjects dispatch requests processed.",
 	})
 
 	lookupSubjectsFromCacheCounter := prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: prometheusNamespace,
 		Subsystem: prometheusSubsystem,
 		Name:      "lookup_subjects_from_cache_total",
+		Help:      "Total number of LookupSubjects dispatch requests served directly from the dispatch cache.",
 	})
 
 	if metricsEnabled && prometheusSubsystem != "" {
diff --git a/internal/graph/check.go b/internal/graph/check.go
index 9d1d66b77..a1f3a8e0e 100644
--- a/internal/graph/check.go
+++ b/internal/graph/check.go
@@ -39,16 +39,9 @@ var dispatchChunkCountHistogram = prometheus.NewHistogram(prometheus.HistogramOp
 	Buckets: []float64{1, 2, 3, 5, 10, 25, 100, 250},
 })
 
-var directDispatchQueryHistogram = prometheus.NewHistogram(prometheus.HistogramOpts{
-	Name:    "spicedb_check_direct_dispatch_query_count",
-	Help:    "number of queries made per direct dispatch",
-	Buckets: []float64{1, 2},
-})
-
 const noOriginalRelation = ""
 
 func init() {
-	prometheus.MustRegister(directDispatchQueryHistogram)
 	prometheus.MustRegister(dispatchChunkCountHistogram)
 }
 
@@ -385,10 +378,6 @@ func (cc *ConcurrentChecker) checkDirect(ctx context.Context, crc currentRequest
 	// If the direct subject or a wildcard form can be found, issue a query for just that
 	// subject.
-	var queryCount float64
-	defer func() {
-		directDispatchQueryHistogram.Observe(queryCount)
-	}()
 
 	hasDirectSubject := totalDirectSubjects > 0
 	hasWildcardSubject := totalWildcardSubjects > 0
@@ -429,8 +418,6 @@ func (cc *ConcurrentChecker) checkDirect(ctx context.Context, crc currentRequest
 	if err != nil {
 		return checkResultError(NewCheckFailureErr(err), emptyMetadata)
 	}
-	queryCount += 1.0
-
 	// Find the matching subject(s).
 	for rel, err := range it {
 		if err != nil {
@@ -482,8 +469,6 @@ func (cc *ConcurrentChecker) checkDirect(ctx context.Context, crc currentRequest
 	if err != nil {
 		return checkResultError(NewCheckFailureErr(err), emptyMetadata)
 	}
-	queryCount += 1.0
-
 	// Build the set of subjects over which to dispatch, along with metadata for
 	// mapping over caveats (if any).
 	checksToDispatch := newCheckDispatchSet()