Skip to content

Commit 5a22c83

Browse files
authored
MWI: Add certificate expiry check leeway for tbot (#64293)
* MWI: Add certificate expiry check leeway for app tunnel service This adds some leeway to the application-tunnel service's certificate expiration check. For context, the app tunnel service does not follow Machine ID's usual certificate renewal cycle and instead opts to renew certificates just-in-time before completing a request if the certificate has expired per the local clock. Unfortunately, the local clock is not always accurate, and if the certificate or underlying app session has already expired from the server's perspective, client requests can still fail until the local clock catches up and the certificate is refreshed. To mitigate this, this change adds a `leeway` parameter to the service, configurable via YAML, with a default value of 1m. This is added to the current time when certificate validity is checked. This means that, in the worst case, certificates will be refreshed to early rather than too late. See also: #64284 * Remove duration pointer and ignore leeway if greater than cert TTL * Make tbot's leeway parameter global This makes the leeway parameter global, and additionally uses it in the main renewal loop (for the expired bot internal identity detection) and in the database tunnel service. Also adds some test coverage for app tunnel cert renewals. * Tweak doc comment for clarity * Remove unused code * Ignore excessive leeway values in identity service * Honor effective lifetime in leeway check * Fix leeway var reference * Update golden tests * Codex reviewer appeasement Factor in actual cert TTL in the leeway cap. * Fix failing test * Fix imports
1 parent 7a07204 commit 5a22c83

File tree

18 files changed

+343
-20
lines changed

18 files changed

+343
-20
lines changed

lib/srv/alpnproxy/local_proxy.go

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"net/http/httputil"
3131
"strings"
3232
"sync"
33+
"time"
3334

3435
"github.com/gravitational/trace"
3536
"github.com/jackc/pgproto3/v2"
@@ -434,8 +435,11 @@ func (l *LocalProxy) getCert() tls.Certificate {
434435
return l.cfg.Cert
435436
}
436437

437-
// CheckDBCert checks the proxy certificates for expiration and that the cert subject matches a database route.
438-
func (l *LocalProxy) CheckDBCert(ctx context.Context, dbRoute tlsca.RouteToDatabase) error {
438+
// CheckDBCertWithLeeway checks the proxy certificates for expiration and that
439+
// the cert subject matches a database route. The provided leeway value is added
440+
// to the current time when checking certificate expiration and can be used to
441+
// account for potential client-side clock drift.
442+
func (l *LocalProxy) CheckDBCertWithLeeway(ctx context.Context, dbRoute tlsca.RouteToDatabase, leeway time.Duration) error {
439443
l.cfg.Log.DebugContext(ctx, "checking local proxy database certs")
440444
l.certMu.RLock()
441445
defer l.certMu.RUnlock()
@@ -450,15 +454,22 @@ func (l *LocalProxy) CheckDBCert(ctx context.Context, dbRoute tlsca.RouteToDatab
450454
}
451455

452456
// Check for cert expiration.
453-
if err := utils.VerifyCertificateExpiry(cert, l.cfg.Clock); err != nil {
457+
if err := utils.VerifyCertificateExpiryWithLeeway(cert, l.cfg.Clock, leeway); err != nil {
454458
return trace.Wrap(err)
455459
}
456460

457461
return trace.Wrap(CheckDBCertSubject(cert, dbRoute))
458462
}
459463

460-
// CheckCertExpiry checks the proxy certificates for expiration.
461-
func (l *LocalProxy) CheckCertExpiry(ctx context.Context) error {
464+
// CheckDBCert checks the proxy certificates for expiration and that the cert subject matches a database route.
465+
func (l *LocalProxy) CheckDBCert(ctx context.Context, dbRoute tlsca.RouteToDatabase) error {
466+
return l.CheckDBCertWithLeeway(ctx, dbRoute, 0)
467+
}
468+
469+
// CheckCertExpiryWithLeeway checks the proxy certificates for expiration. The
470+
// provided leeway value is added to the current time when compared to the cert
471+
// expiration and can be used to account for potential client-side clock drift.
472+
func (l *LocalProxy) CheckCertExpiryWithLeeway(ctx context.Context, leeway time.Duration) error {
462473
l.cfg.Log.DebugContext(ctx, "checking local proxy certs")
463474
l.certMu.RLock()
464475
defer l.certMu.RUnlock()
@@ -472,7 +483,12 @@ func (l *LocalProxy) CheckCertExpiry(ctx context.Context) error {
472483
return trace.Wrap(err)
473484
}
474485

475-
return trace.Wrap(utils.VerifyCertificateExpiry(cert, l.cfg.Clock))
486+
return trace.Wrap(utils.VerifyCertificateExpiryWithLeeway(cert, l.cfg.Clock, leeway))
487+
}
488+
489+
// CheckCertExpiry checks the proxy certificates for expiration.
490+
func (l *LocalProxy) CheckCertExpiry(ctx context.Context) error {
491+
return trace.Wrap(l.CheckCertExpiryWithLeeway(ctx, 0))
476492
}
477493

478494
// CheckDBCertSubject checks if the route to the database from the cert matches the provided route in

lib/tbot/bot/bot.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,7 @@ func (b *Bot) buildIdentityService(
355355
Destination: b.cfg.InternalStorage,
356356
TTL: b.cfg.CredentialLifetime.TTL,
357357
RenewalInterval: b.cfg.CredentialLifetime.RenewalInterval,
358+
Leeway: b.cfg.Leeway,
358359
FIPS: b.cfg.FIPS,
359360
Logger: b.cfg.Logger.With(
360361
teleport.ComponentKey,

lib/tbot/bot/config.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package bot
2020

2121
import (
2222
"log/slog"
23+
"time"
2324

2425
"github.com/gravitational/trace"
2526
"github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus"
@@ -52,6 +53,12 @@ type Config struct {
5253
// internal credentials.
5354
CredentialLifetime CredentialLifetime
5455

56+
// Leeway is a duration added to local system time when checking for expired
57+
// certificates in certain cases, particularly with app and database
58+
// tunnels. It can be useful to account for clock drift, or if a negative
59+
// duration is provided, to simulate clock drift.
60+
Leeway time.Duration `yaml:"leeway,omitempty"`
61+
5562
// FIPS controls whether the bot will run in a mode designed to comply with
5663
// Federal Information Processing Standards.
5764
FIPS bool

lib/tbot/config/config.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ import (
5656
const (
5757
DefaultCertificateTTL = 60 * time.Minute
5858
DefaultRenewInterval = 20 * time.Minute
59+
DefaultLeeway = 1 * time.Minute
5960
)
6061

6162
var log = logutils.NewPackageLogger(teleport.ComponentKey, teleport.ComponentTBot)
@@ -87,7 +88,15 @@ type BotConfig struct {
8788
JoinURI string `yaml:"join_uri,omitempty"`
8889

8990
CredentialLifetime bot.CredentialLifetime `yaml:",inline"`
90-
Oneshot bool `yaml:"oneshot"`
91+
92+
// Leeway is a duration added to local system time when checking for expired
93+
// certificates in certain cases, particularly with app and database
94+
// tunnels. It can be useful to account for clock drift, or if a negative
95+
// duration is provided, to simulate clock drift.
96+
Leeway time.Duration `yaml:"leeway,omitempty"`
97+
98+
Oneshot bool `yaml:"oneshot"`
99+
91100
// FIPS instructs `tbot` to run in a mode designed to comply with FIPS
92101
// regulations. This means the bot should:
93102
// - Refuse to run if not compiled with boringcrypto
@@ -259,6 +268,10 @@ func (conf *BotConfig) CheckAndSetDefaults() error {
259268
conf.CredentialLifetime.RenewalInterval = DefaultRenewInterval
260269
}
261270

271+
if conf.Leeway == 0 {
272+
conf.Leeway = DefaultLeeway
273+
}
274+
262275
// We require the join method for `configure` and `start` but not for `init`
263276
// Therefore, we need to check its valid here, but enforce its presence
264277
// elsewhere.

lib/tbot/internal/identity/service.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ type Config struct {
6363

6464
TTL time.Duration
6565
RenewalInterval time.Duration
66+
Leeway time.Duration
6667

6768
FIPS bool
6869

@@ -543,13 +544,29 @@ func renewIdentity(
543544
return newIdentity, nil
544545
}
545546

547+
// Note: This leeway cap check is simpler than app/db tunnel services as the
548+
// main renewal loop is, well, a loop, and will not attempt to renew more
549+
// often than the configured renewal interval.
550+
leeway := cfg.Leeway
551+
if leeway >= cfg.TTL {
552+
log.WarnContext(ctx, "leeway is greater than credential lifetime and "+
553+
"will be ignored, be aware of potential failures due to clock drift",
554+
"credential_ttl", cfg.TTL,
555+
"configured_leeway", cfg.Leeway,
556+
)
557+
leeway = 0
558+
}
559+
546560
// Note: This simple expiration check is probably not the best possible
547561
// solution to determine when to discard an existing identity: the client
548562
// could have severe clock drift, or there could be non-expiry related
549563
// reasons that an identity should be thrown out. We may improve this
550-
// discard logic in the future if we determine we're still creating excess
564+
// discard logic in the future if we determine we're still creating excess
551565
// bot instances.
552-
now := time.Now()
566+
// To allow users to manually compensate for clock drift if e.g. using very
567+
// tight renewal/TTL values, we expose a configurable leeway to trigger
568+
// modestly early renewal.
569+
now := time.Now().Add(leeway)
553570
if expiry, ok := facade.Expiry(); !ok || now.After(expiry) {
554571
slog.WarnContext(
555572
ctx,
@@ -564,6 +581,7 @@ func renewIdentity(
564581
"expiry", expiry,
565582
"ttl", cfg.TTL,
566583
"renewal_interval", cfg.RenewalInterval,
584+
"leeway", cfg.Leeway,
567585
)
568586

569587
newIdentity, err := botIdentityFromToken(ctx, log, cfg, nil)

lib/tbot/services/application/tunnel_config.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"net/url"
2424

2525
"github.com/gravitational/trace"
26+
"github.com/jonboulle/clockwork"
2627
"gopkg.in/yaml.v3"
2728

2829
"github.com/gravitational/teleport/lib/tbot/bot"
@@ -54,6 +55,14 @@ type TunnelConfig struct {
5455
// Listener overrides "listen" and directly provides an opened listener to
5556
// use.
5657
Listener net.Listener `yaml:"-"`
58+
59+
// Clock is a clock. If unset, the standard system clock is used. Used in
60+
// tests.
61+
clock clockwork.Clock `yaml:"-"`
62+
63+
// certIssuedHook is an optional hook called when the tunnel requests a new
64+
// certificate. Used in tests.
65+
certIssuedHook func() `yaml:"-"`
5766
}
5867

5968
// GetName returns the user-given name of the service, used for validation purposes.
@@ -94,6 +103,10 @@ func (s *TunnelConfig) CheckAndSetDefaults() error {
94103
if _, err := url.Parse(s.Listen); err != nil {
95104
return trace.Wrap(err, "parsing listen")
96105
}
106+
if s.clock == nil {
107+
s.clock = clockwork.NewRealClock()
108+
}
109+
97110
return nil
98111
}
99112

lib/tbot/services/application/tunnel_config_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ import (
2222
"testing"
2323
"time"
2424

25+
"github.com/jonboulle/clockwork"
26+
2527
"github.com/gravitational/teleport/lib/tbot/bot"
2628
)
2729

@@ -47,6 +49,8 @@ func TestApplicationTunnelService_YAML(t *testing.T) {
4749
func TestApplicationTunnelService_CheckAndSetDefaults(t *testing.T) {
4850
t.Parallel()
4951

52+
clock := clockwork.NewFakeClock()
53+
5054
tests := []testCheckAndSetDefaultsCase[*TunnelConfig]{
5155
{
5256
name: "valid",
@@ -55,8 +59,15 @@ func TestApplicationTunnelService_CheckAndSetDefaults(t *testing.T) {
5559
Listen: "tcp://0.0.0.0:3621",
5660
Roles: []string{"role1", "role2"},
5761
AppName: "my-app",
62+
clock: clock,
5863
}
5964
},
65+
want: &TunnelConfig{
66+
Listen: "tcp://0.0.0.0:3621",
67+
Roles: []string{"role1", "role2"},
68+
AppName: "my-app",
69+
clock: clock,
70+
},
6071
wantErr: "",
6172
},
6273
{

lib/tbot/services/application/tunnel_service.go

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"crypto/tls"
2525
"fmt"
2626
"log/slog"
27+
"time"
2728

2829
"github.com/gravitational/trace"
2930

@@ -44,6 +45,7 @@ func TunnelServiceBuilder(
4445
cfg *TunnelConfig,
4546
connCfg connection.Config,
4647
defaultCredentialLifetime bot.CredentialLifetime,
48+
leeway time.Duration,
4749
) bot.ServiceBuilder {
4850
buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) {
4951
if err := cfg.CheckAndSetDefaults(); err != nil {
@@ -52,6 +54,7 @@ func TunnelServiceBuilder(
5254
svc := &TunnelService{
5355
connCfg: connCfg,
5456
defaultCredentialLifetime: defaultCredentialLifetime,
57+
leeway: leeway,
5558
getBotIdentity: deps.BotIdentity,
5659
botIdentityReadyCh: deps.BotIdentityReadyCh,
5760
proxyPinger: deps.ProxyPinger,
@@ -74,6 +77,7 @@ func TunnelServiceBuilder(
7477
type TunnelService struct {
7578
connCfg connection.Config
7679
defaultCredentialLifetime bot.CredentialLifetime
80+
leeway time.Duration
7781
cfg *TunnelConfig
7882
proxyPinger connection.ProxyPinger
7983
log *slog.Logger
@@ -183,13 +187,40 @@ func (s *TunnelService) buildLocalProxyConfig(ctx context.Context) (lpCfg alpnpr
183187
}
184188
s.log.DebugContext(ctx, "Issued initial certificate for local proxy.")
185189

190+
// Attempt to provide a sensible cap for leeway values based on the
191+
// configured and actual cert TTL to guard against rapid renewals. This
192+
// doesn't attempt to be perfect, and it's still possible to force a new
193+
// cert to be issued on every connection with the right set of unreasonable
194+
// values. However, we want it to be possible to specify nearly-too-large
195+
// values for testing purposes (i.e. using negative leeway to simulate clock
196+
// drift) and do not want to be in the business of calculating leeway-leeway
197+
// so we'll keep the check somewhat simple.
198+
issuedCertLifetime := appCert.Leaf.NotAfter.Sub(appCert.Leaf.NotBefore)
199+
effectiveLifetime := cmp.Or(s.cfg.CredentialLifetime, s.defaultCredentialLifetime)
200+
effectiveTTL := min(issuedCertLifetime, effectiveLifetime.TTL)
201+
202+
leeway := s.leeway
203+
if leeway >= effectiveTTL {
204+
s.log.WarnContext(ctx,
205+
"leeway is greater than the credential lifetime and will be "+
206+
"ignored, be aware of potential failures due to clock drift",
207+
"configured_ttl", effectiveLifetime.TTL,
208+
"configured_leeway", leeway,
209+
"issued_cert_ttl", issuedCertLifetime,
210+
)
211+
leeway = 0
212+
}
213+
186214
middleware := internal.ALPNProxyMiddleware{
187215
OnNewConnectionFunc: func(ctx context.Context, lp *alpnproxy.LocalProxy) error {
188216
ctx, span := tracer.Start(ctx, "TunnelService/OnNewConnection")
189217
defer span.End()
190218

191-
if err := lp.CheckCertExpiry(ctx); err != nil {
192-
s.log.InfoContext(ctx, "Certificate for tunnel needs reissuing.", "reason", err.Error())
219+
if err := lp.CheckCertExpiryWithLeeway(ctx, leeway); err != nil {
220+
s.log.InfoContext(ctx, "Certificate for tunnel needs reissuing.",
221+
"reason", err.Error(),
222+
"leeway", leeway,
223+
)
193224
cert, _, err := s.issueCert(ctx)
194225
if err != nil {
195226
return trace.Wrap(err, "issuing cert")
@@ -208,6 +239,7 @@ func (s *TunnelService) buildLocalProxyConfig(ctx context.Context) (lpCfg alpnpr
208239
Protocols: []common.Protocol{alpnProtocolForApp(app)},
209240
Cert: *appCert,
210241
InsecureSkipVerify: s.connCfg.Insecure,
242+
Clock: s.cfg.clock,
211243
}
212244
if apiclient.IsALPNConnUpgradeRequired(
213245
ctx,
@@ -264,7 +296,17 @@ func (s *TunnelService) issueCert(
264296
}
265297
s.log.InfoContext(ctx, "Certificate issued for tunnel proxy.")
266298

267-
return routedIdent.TLSCert, app, nil
299+
// In tests, notify the test that a cert has been issued.
300+
if s.cfg.certIssuedHook != nil {
301+
s.cfg.certIssuedHook()
302+
}
303+
304+
// The leaf isn't appended by the stdlib, so add it here so we can inspect
305+
// the TTL downstream.
306+
cert := routedIdent.TLSCert
307+
cert.Leaf = routedIdent.X509Cert
308+
309+
return cert, app, nil
268310
}
269311

270312
// String returns a human-readable string that can uniquely identify the

0 commit comments

Comments
 (0)