Skip to content

Commit 0f77225

Browse files
boxofradmmcallister
authored andcommitted
MWI: Add teleport_bot_instances metric (#59774)
* MWI: Add `teleport_bot_instances` metric changelog: MWI: Add `teleport_bot_instances` metric * Use `InEpsilon` * Fix import ordering
1 parent b97e73a commit 0f77225

File tree

5 files changed

+134
-0
lines changed

5 files changed

+134
-0
lines changed

docs/pages/includes/metrics.mdx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
| `teleport_registered_servers_by_install_methods` | gauge | Teleport Auth | The number of Teleport services that are connected to an Auth Service instance grouped by install methods. |
7171
| `teleport_roles_total` | gauge | Teleport Auth | The number of roles that exist in the cluster. |
7272
| `teleport_migrations` | gauge | Teleport Auth | Tracks for each migration if it is active (1) or not (0). |
73+
| `teleport_bot_instances` | gauge | Teleport Auth | The number of bot instances across the entire cluster grouped by version and by whether the bot uses Managed Updates. |
7374
| `user_login_total` | counter | Teleport Auth | Number of user logins. |
7475
| `watcher_event_sizes` | histogram | cache | Overall size of events emitted. |
7576
| `watcher_events` | histogram | cache | Per resource size of events emitted. |

lib/auth/auth.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1052,6 +1052,18 @@ var (
10521052
[]string{teleport.TagPrivateKeyPolicy},
10531053
)
10541054

1055+
botInstancesMetric = prometheus.NewGaugeVec(
1056+
prometheus.GaugeOpts{
1057+
Namespace: teleport.MetricNamespace,
1058+
Name: teleport.MetricBotInstances,
1059+
Help: "The number of bot instances across the entire cluster",
1060+
},
1061+
[]string{
1062+
teleport.TagVersion,
1063+
teleport.TagAutomaticUpdates,
1064+
},
1065+
)
1066+
10551067
prometheusCollectors = []prometheus.Collector{
10561068
generateRequestsCount, generateThrottledRequestsCount,
10571069
generateRequestsCurrent, generateRequestsLatencies, UserLoginCount, heartbeatsMissedByAuth,
@@ -1061,6 +1073,7 @@ var (
10611073
registeredAgentsInstallMethod,
10621074
userCertificatesGeneratedMetric,
10631075
roleCount,
1076+
botInstancesMetric,
10641077
}
10651078
)
10661079

@@ -1610,6 +1623,7 @@ const (
16101623
accessListReminderNotificationsKey
16111624
autoUpdateAgentReportKey
16121625
autoUpdateBotInstanceReportKey
1626+
autoUpdateBotInstanceMetricsKey
16131627
)
16141628

16151629
// runPeriodicOperations runs some periodic bookkeeping operations
@@ -1711,6 +1725,12 @@ func (a *Server) runPeriodicOperations() {
17111725
FirstDuration: retryutils.HalfJitter(10 * time.Second),
17121726
Jitter: retryutils.SeventhJitter,
17131727
})
1728+
ticker.Push(interval.SubInterval[periodicIntervalKey]{
1729+
Key: autoUpdateBotInstanceMetricsKey,
1730+
Duration: constants.AutoUpdateAgentReportPeriod / 2,
1731+
FirstDuration: retryutils.HalfJitter(10 * time.Second),
1732+
Jitter: retryutils.SeventhJitter,
1733+
})
17141734
}
17151735

17161736
if modules.GetModules().IsOSSBuild() {
@@ -1839,6 +1859,8 @@ func (a *Server) runPeriodicOperations() {
18391859
go a.reportAgentVersions(a.closeCtx)
18401860
case autoUpdateBotInstanceReportKey:
18411861
go a.botVersionReporter.Report(a.closeCtx)
1862+
case autoUpdateBotInstanceMetricsKey:
1863+
go a.updateBotInstanceMetrics()
18421864
}
18431865
}
18441866
}
@@ -2154,6 +2176,18 @@ func (a *Server) updateAgentMetrics() {
21542176
}
21552177
}
21562178

2179+
func (a *Server) updateBotInstanceMetrics() {
2180+
report, err := a.GetAutoUpdateBotInstanceReport(a.closeCtx)
2181+
switch {
2182+
case trace.IsNotFound(err):
2183+
// No report to emit.
2184+
case err != nil:
2185+
a.logger.ErrorContext(a.closeCtx, "Failed to get bot instance report", "error", err)
2186+
default:
2187+
machineidv1.EmitInstancesMetric(report, botInstancesMetric)
2188+
}
2189+
}
2190+
21572191
var (
21582192
// remoteClusterRefreshLimit is the maximum number of backend updates that will be performed
21592193
// during periodic remote cluster connection status refresh.

lib/auth/machineid/machineidv1/auto_update_version_reporter.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@ import (
2525

2626
"github.com/gravitational/trace"
2727
"github.com/jonboulle/clockwork"
28+
"github.com/prometheus/client_golang/prometheus"
2829
"google.golang.org/protobuf/types/known/timestamppb"
2930

31+
"github.com/gravitational/teleport"
3032
"github.com/gravitational/teleport/api/defaults"
3133
"github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
3234
machineidv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1"
@@ -318,3 +320,34 @@ func (r *AutoUpdateVersionReporter) IsLeader() bool {
318320
return false
319321
}
320322
}
323+
324+
// EmitInstancesMetric updates the given gauge metric based on the instance report.
325+
func EmitInstancesMetric(report *autoupdate.AutoUpdateBotInstanceReport, gauge *prometheus.GaugeVec) {
326+
gauge.Reset()
327+
328+
byVersion := make(map[string]int32)
329+
330+
for group, groupMetrics := range report.GetSpec().GetGroups() {
331+
// Empty group means the bot isn't using Managed Updates.
332+
if group == "" {
333+
for version, versionMetrics := range groupMetrics.GetVersions() {
334+
gauge.With(prometheus.Labels{
335+
teleport.TagVersion: version,
336+
teleport.TagAutomaticUpdates: "false",
337+
}).Set(float64(versionMetrics.Count))
338+
}
339+
continue
340+
}
341+
342+
for version, metrics := range groupMetrics.GetVersions() {
343+
byVersion[version] += metrics.Count
344+
}
345+
}
346+
347+
for version, count := range byVersion {
348+
gauge.With(prometheus.Labels{
349+
teleport.TagVersion: version,
350+
teleport.TagAutomaticUpdates: "true",
351+
}).Set(float64(count))
352+
}
353+
}

lib/auth/machineid/machineidv1/auto_update_version_reporter_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,21 @@ package machineidv1_test
1919

2020
import (
2121
"context"
22+
"fmt"
23+
"strconv"
2224
"testing"
2325
"time"
2426

2527
"github.com/google/go-cmp/cmp"
2628
"github.com/google/uuid"
2729
"github.com/jonboulle/clockwork"
30+
"github.com/prometheus/client_golang/prometheus"
31+
"github.com/prometheus/client_golang/prometheus/testutil"
2832
"github.com/stretchr/testify/require"
2933
"google.golang.org/protobuf/testing/protocmp"
3034
"google.golang.org/protobuf/types/known/timestamppb"
3135

36+
"github.com/gravitational/teleport"
3237
autoupdatev1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
3338
headerv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/header/v1"
3439
machineidv1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1"
@@ -146,6 +151,64 @@ func TestAutoUpdateVersionReporter(t *testing.T) {
146151
}
147152
}
148153

154+
func TestEmitInstancesMetric(t *testing.T) {
155+
gauge := prometheus.NewGaugeVec(
156+
prometheus.GaugeOpts{
157+
Namespace: teleport.MetricNamespace,
158+
Name: teleport.MetricBotInstances,
159+
},
160+
[]string{
161+
teleport.TagVersion,
162+
teleport.TagAutomaticUpdates,
163+
},
164+
)
165+
166+
machineidv1.EmitInstancesMetric(
167+
&autoupdatev1pb.AutoUpdateBotInstanceReport{
168+
Spec: &autoupdatev1pb.AutoUpdateBotInstanceReportSpec{
169+
Groups: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroup{
170+
"prod": {
171+
Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{
172+
"18.0.0": {Count: 1},
173+
"19.0.0": {Count: 1},
174+
},
175+
},
176+
"stage": {
177+
Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{
178+
"18.0.0": {Count: 1},
179+
"19.0.0": {Count: 1},
180+
},
181+
},
182+
"": {
183+
Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{
184+
"19.0.0": {Count: 123},
185+
"20.0.0": {Count: 321},
186+
},
187+
},
188+
},
189+
},
190+
},
191+
gauge,
192+
)
193+
194+
for _, tc := range []struct {
195+
version string
196+
automaticUpdates bool
197+
expectedValue float64
198+
}{
199+
{version: "18.0.0", automaticUpdates: true, expectedValue: 2},
200+
{version: "19.0.0", automaticUpdates: true, expectedValue: 2},
201+
{version: "19.0.0", automaticUpdates: false, expectedValue: 123},
202+
{version: "20.0.0", automaticUpdates: false, expectedValue: 321},
203+
} {
204+
t.Run(fmt.Sprintf("%s/%v", tc.version, tc.automaticUpdates), func(t *testing.T) {
205+
metric := gauge.WithLabelValues(tc.version, strconv.FormatBool(tc.automaticUpdates))
206+
require.InEpsilon(t, tc.expectedValue, testutil.ToFloat64(metric), 0)
207+
})
208+
}
209+
210+
}
211+
149212
type testSemaphores struct{ types.Semaphores }
150213

151214
func (s *testSemaphores) AcquireSemaphore(ctx context.Context, params types.AcquireSemaphoreRequest) (*types.SemaphoreLease, error) {

metrics.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,9 @@ const (
263263
// MetricRegisteredServers tracks the number of Teleport servers that have successfully registered with the Teleport cluster and have not reached the end of their ttl
264264
MetricRegisteredServers = "registered_servers"
265265

266+
// MetricBotInstances tracks the number of bot instances across the entire cluster, labeled by version and by whether the bot uses Managed Updates
267+
MetricBotInstances = "bot_instances"
268+
266269
// MetricRegisteredServersByInstallMethods tracks the number of Teleport servers, and their installation method,
267270
// that have successfully registered with the Teleport cluster and have not reached the end of their ttl
268271
MetricRegisteredServersByInstallMethods = "registered_servers_by_install_methods"

0 commit comments

Comments
 (0)