Skip to content

Commit 372fb22

Browse files
committed
MWI: Add teleport_bot_instances metric (#59774)
1 parent 005dff8 commit 372fb22

File tree

5 files changed

+134
-0
lines changed

5 files changed

+134
-0
lines changed

docs/pages/includes/metrics.mdx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
| `teleport_registered_servers_by_install_methods` | gauge | Teleport Auth | The number of Teleport services that are connected to an Auth Service instance grouped by install methods. |
7171
| `teleport_roles_total` | gauge | Teleport Auth | The number of roles that exist in the cluster. |
7272
| `teleport_migrations` | gauge | Teleport Auth | Tracks for each migration if it is active (1) or not (0). |
73+
| `teleport_bot_instances` | gauge | Teleport Auth | The number of bot instances across the entire cluster, grouped by version and by whether they use Managed Updates. |
7374
| `user_login_total` | counter | Teleport Auth | Number of user logins. |
7475
| `watcher_event_sizes` | histogram | cache | Overall size of events emitted. |
7576
| `watcher_events` | histogram | cache | Per resource size of events emitted. |

lib/auth/auth.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -958,6 +958,18 @@ var (
958958
[]string{teleport.TagPrivateKeyPolicy},
959959
)
960960

961+
botInstancesMetric = prometheus.NewGaugeVec(
962+
prometheus.GaugeOpts{
963+
Namespace: teleport.MetricNamespace,
964+
Name: teleport.MetricBotInstances,
965+
Help: "The number of bot instances across the entire cluster",
966+
},
967+
[]string{
968+
teleport.TagVersion,
969+
teleport.TagAutomaticUpdates,
970+
},
971+
)
972+
961973
prometheusCollectors = []prometheus.Collector{
962974
generateRequestsCount, generateThrottledRequestsCount,
963975
generateRequestsCurrent, generateRequestsLatencies, UserLoginCount, heartbeatsMissedByAuth,
@@ -967,6 +979,7 @@ var (
967979
registeredAgentsInstallMethod,
968980
userCertificatesGeneratedMetric,
969981
roleCount,
982+
botInstancesMetric,
970983
}
971984
)
972985

@@ -1463,6 +1476,7 @@ const (
14631476
accessListReminderNotificationsKey
14641477
autoUpdateAgentReportKey
14651478
autoUpdateBotInstanceReportKey
1479+
autoUpdateBotInstanceMetricsKey
14661480
)
14671481

14681482
// runPeriodicOperations runs some periodic bookkeeping operations
@@ -1564,6 +1578,12 @@ func (a *Server) runPeriodicOperations() {
15641578
FirstDuration: retryutils.HalfJitter(10 * time.Second),
15651579
Jitter: retryutils.SeventhJitter,
15661580
})
1581+
ticker.Push(interval.SubInterval[periodicIntervalKey]{
1582+
Key: autoUpdateBotInstanceMetricsKey,
1583+
Duration: constants.AutoUpdateAgentReportPeriod / 2,
1584+
FirstDuration: retryutils.HalfJitter(10 * time.Second),
1585+
Jitter: retryutils.SeventhJitter,
1586+
})
15671587
}
15681588

15691589
if modules.GetModules().IsOSSBuild() {
@@ -1692,6 +1712,8 @@ func (a *Server) runPeriodicOperations() {
16921712
go a.reportAgentVersions(a.closeCtx)
16931713
case autoUpdateBotInstanceReportKey:
16941714
go a.botVersionReporter.Report(a.closeCtx)
1715+
case autoUpdateBotInstanceMetricsKey:
1716+
go a.updateBotInstanceMetrics()
16951717
}
16961718
}
16971719
}
@@ -2007,6 +2029,18 @@ func (a *Server) updateAgentMetrics() {
20072029
}
20082030
}
20092031

2032+
func (a *Server) updateBotInstanceMetrics() {
2033+
report, err := a.GetAutoUpdateBotInstanceReport(a.closeCtx)
2034+
switch {
2035+
case trace.IsNotFound(err):
2036+
// No report to emit.
2037+
case err != nil:
2038+
a.logger.ErrorContext(a.closeCtx, "Failed to get bot instance report", "error", err)
2039+
default:
2040+
machineidv1.EmitInstancesMetric(report, botInstancesMetric)
2041+
}
2042+
}
2043+
20102044
var (
20112045
// remoteClusterRefreshLimit is the maximum number of backend updates that will be performed
20122046
// during periodic remote cluster connection status refresh.

lib/auth/machineid/machineidv1/auto_update_version_reporter.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@ import (
2525

2626
"github.com/gravitational/trace"
2727
"github.com/jonboulle/clockwork"
28+
"github.com/prometheus/client_golang/prometheus"
2829
"google.golang.org/protobuf/types/known/timestamppb"
2930

31+
"github.com/gravitational/teleport"
3032
"github.com/gravitational/teleport/api/defaults"
3133
"github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
3234
machineidv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1"
@@ -320,3 +322,34 @@ func (r *AutoUpdateVersionReporter) IsLeader() bool {
320322
return false
321323
}
322324
}
325+
326+
// EmitInstancesMetric updates the given gauge metric based on the instance report.
327+
func EmitInstancesMetric(report *autoupdate.AutoUpdateBotInstanceReport, gauge *prometheus.GaugeVec) {
328+
gauge.Reset()
329+
330+
byVersion := make(map[string]int32)
331+
332+
for group, groupMetrics := range report.GetSpec().GetGroups() {
333+
// Empty group means the bot isn't using Managed Updates.
334+
if group == "" {
335+
for version, versionMetrics := range groupMetrics.GetVersions() {
336+
gauge.With(prometheus.Labels{
337+
teleport.TagVersion: version,
338+
teleport.TagAutomaticUpdates: "false",
339+
}).Set(float64(versionMetrics.Count))
340+
}
341+
continue
342+
}
343+
344+
for version, metrics := range groupMetrics.GetVersions() {
345+
byVersion[version] += metrics.Count
346+
}
347+
}
348+
349+
for version, count := range byVersion {
350+
gauge.With(prometheus.Labels{
351+
teleport.TagVersion: version,
352+
teleport.TagAutomaticUpdates: "true",
353+
}).Set(float64(count))
354+
}
355+
}

lib/auth/machineid/machineidv1/auto_update_version_reporter_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,21 @@ package machineidv1_test
1919

2020
import (
2121
"context"
22+
"fmt"
23+
"strconv"
2224
"testing"
2325
"time"
2426

2527
"github.com/google/go-cmp/cmp"
2628
"github.com/google/uuid"
2729
"github.com/jonboulle/clockwork"
30+
"github.com/prometheus/client_golang/prometheus"
31+
"github.com/prometheus/client_golang/prometheus/testutil"
2832
"github.com/stretchr/testify/require"
2933
"google.golang.org/protobuf/testing/protocmp"
3034
"google.golang.org/protobuf/types/known/timestamppb"
3135

36+
"github.com/gravitational/teleport"
3237
autoupdatev1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
3338
headerv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/header/v1"
3439
machineidv1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1"
@@ -146,6 +151,64 @@ func TestAutoUpdateVersionReporter(t *testing.T) {
146151
}
147152
}
148153

154+
func TestEmitInstancesMetric(t *testing.T) {
155+
gauge := prometheus.NewGaugeVec(
156+
prometheus.GaugeOpts{
157+
Namespace: teleport.MetricNamespace,
158+
Name: teleport.MetricBotInstances,
159+
},
160+
[]string{
161+
teleport.TagVersion,
162+
teleport.TagAutomaticUpdates,
163+
},
164+
)
165+
166+
machineidv1.EmitInstancesMetric(
167+
&autoupdatev1pb.AutoUpdateBotInstanceReport{
168+
Spec: &autoupdatev1pb.AutoUpdateBotInstanceReportSpec{
169+
Groups: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroup{
170+
"prod": {
171+
Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{
172+
"18.0.0": {Count: 1},
173+
"19.0.0": {Count: 1},
174+
},
175+
},
176+
"stage": {
177+
Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{
178+
"18.0.0": {Count: 1},
179+
"19.0.0": {Count: 1},
180+
},
181+
},
182+
"": {
183+
Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{
184+
"19.0.0": {Count: 123},
185+
"20.0.0": {Count: 321},
186+
},
187+
},
188+
},
189+
},
190+
},
191+
gauge,
192+
)
193+
194+
for _, tc := range []struct {
195+
version string
196+
automaticUpdates bool
197+
expectedValue float64
198+
}{
199+
{version: "18.0.0", automaticUpdates: true, expectedValue: 2},
200+
{version: "19.0.0", automaticUpdates: true, expectedValue: 2},
201+
{version: "19.0.0", automaticUpdates: false, expectedValue: 123},
202+
{version: "20.0.0", automaticUpdates: false, expectedValue: 321},
203+
} {
204+
t.Run(fmt.Sprintf("%s/%v", tc.version, tc.automaticUpdates), func(t *testing.T) {
205+
metric := gauge.WithLabelValues(tc.version, strconv.FormatBool(tc.automaticUpdates))
206+
require.InEpsilon(t, tc.expectedValue, testutil.ToFloat64(metric), 0)
207+
})
208+
}
209+
210+
}
211+
149212
type testSemaphores struct{ types.Semaphores }
150213

151214
func (s *testSemaphores) AcquireSemaphore(ctx context.Context, params types.AcquireSemaphoreRequest) (*types.SemaphoreLease, error) {

metrics.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,9 @@ const (
263263
// MetricRegisteredServers tracks the number of Teleport servers that have successfully registered with the Teleport cluster and have not reached the end of their ttl
264264
MetricRegisteredServers = "registered_servers"
265265

266+
// MetricBotInstances tracks the number of bot instances across the entire cluster, labeled by version and by whether they use automatic updates
267+
MetricBotInstances = "bot_instances"
268+
266269
// MetricRegisteredServersByInstallMethods tracks the number of Teleport servers, and their installation method,
267270
// that have successfully registered with the Teleport cluster and have not reached the end of their ttl
268271
MetricRegisteredServersByInstallMethods = "registered_servers_by_install_methods"

0 commit comments

Comments
 (0)