@@ -64,6 +64,7 @@ import (
6464 "github.com/gravitational/teleport/lib/utils"
6565 "github.com/gravitational/teleport/lib/utils/interval"
6666 logutils "github.com/gravitational/teleport/lib/utils/log"
67+ "github.com/gravitational/teleport/lib/utils/set"
6768)
6869
6970const updateClientsJoinWarning = "This agent joined the cluster during the update_clients phase of a host CA rotation, so its services might not be usable by clients that haven't logged in recently."
@@ -226,6 +227,19 @@ func (process *TeleportProcess) connect(role types.SystemRole, opts ...certOptio
226227 return nil , trace .Wrap (err )
227228 }
228229
230+ if role == types .RoleInstance {
231+ // If necessary, heal the instance identity by rejoining to get a new
232+ // identity with all required system roles. This is best-effort, any
233+ // error will be logged and the current identity will continue to be
234+ // used.
235+ newIdentity , err := process .healInstanceIdentity (identity )
236+ if err != nil {
237+ process .logger .WarnContext (process .ExitContext (), "Failed to heal instance identity" , "error" , err )
238+ } else {
239+ identity = newIdentity
240+ }
241+ }
242+
229243 rotation := processState .Spec .Rotation
230244
231245 switch rotation .State {
@@ -309,6 +323,98 @@ func (process *TeleportProcess) connect(role types.SystemRole, opts ...certOptio
309323 }
310324}
311325
326+ func (process * TeleportProcess ) healInstanceIdentity (currentIdentity * state.Identity ) (* state.Identity , error ) {
327+ currentSystemRoles := set .New (currentIdentity .SystemRoles ... )
328+ wantSystemRoles := set.NewWithCapacity [string ](len (process .instanceRoles ))
329+ for role := range process .instanceRoles {
330+ wantSystemRoles .Add (string (role ))
331+ }
332+
333+ missingSystemRoles := wantSystemRoles .Clone ().Subtract (currentSystemRoles )
334+ if len (missingSystemRoles ) == 0 {
335+ // The current instance identity contains all required roles, nothing to do.
336+ return currentIdentity , nil
337+ }
338+
339+ process .logger .InfoContext (process .ExitContext (), "Instance identity is missing required system roles, will attempt to self-heal" , "missing_roles" , missingSystemRoles .Elements ())
340+ additionalPrincipals , dnsNames := process .instanceAdditionalPrincipals ()
341+ var (
342+ newIdentity * state.Identity
343+ err error
344+ )
345+ if server := process .getLocalAuth (); server != nil {
346+ process .logger .InfoContext (process .ExitContext (), "Generating new Instance identity with local auth service" )
347+ newIdentity , err = auth .GenerateIdentity (server , currentIdentity .ID , additionalPrincipals , dnsNames )
348+ if err != nil {
349+ return nil , trace .Wrap (err , "failed to generate new instance identity with local auth service" )
350+ }
351+ } else {
352+ process .logger .InfoContext (process .ExitContext (), "Must rejoin to get a new Instance identity" )
353+ if ! process .Config .HasToken () {
354+ return nil , trace .Errorf ("must rejoin to obtain missing system roles but no join token is configured" )
355+ }
356+
357+ // Make an auth client authenticated with the current instance identity to use for the rejoin.
358+ currentConnector , err := process .getConnector (currentIdentity , currentIdentity )
359+ if err != nil {
360+ return nil , trace .Wrap (err , "failed to make connector with current instance identity" )
361+ }
362+ currentAuthClient := currentConnector .Client
363+
364+ // Rejoin.
365+ joinParams , err := process .makeJoinParams (
366+ currentIdentity .ID ,
367+ additionalPrincipals ,
368+ dnsNames ,
369+ )
370+ if err != nil {
371+ return nil , trace .Wrap (err , "failed to make join params" )
372+ }
373+ joinParams .AuthClient = currentAuthClient
374+ rejoinResult , err := joinclient .Join (process .GracefulExitContext (), * joinParams )
375+ if err != nil {
376+ return nil , trace .Wrap (err , "failed to rejoin" )
377+ }
378+ privateKeyPEM , err := keys .MarshalPrivateKey (rejoinResult .PrivateKey )
379+ if err != nil {
380+ return nil , trace .Wrap (err , "failed to marshal private key" )
381+ }
382+ newIdentity , err = state .ReadIdentityFromKeyPair (privateKeyPEM , rejoinResult .Certs )
383+ if err != nil {
384+ return nil , trace .Wrap (err , "failed to parse new identity" )
385+ }
386+ }
387+
388+ newSystemRoles := set .New (newIdentity .SystemRoles ... )
389+
390+ // Sanity check we didn't lose any system roles.
391+ if lostRoles := currentSystemRoles .Clone ().Subtract (newSystemRoles ); len (lostRoles ) > 0 {
392+ return nil , trace .Errorf ("new Instance identity is missing the following system roles from the current identity (this is a bug): %v" , lostRoles .Elements ())
393+ }
394+
395+ gainedSystemRoles := newSystemRoles .Clone ().Subtract (currentSystemRoles )
396+ if len (gainedSystemRoles ) == 0 {
397+ process .logger .WarnContext (process .ExitContext (), "Did not gain any system roles" )
398+ // Don't bother saving or returning the new identity if no system roles were gained.
399+ return currentIdentity , nil
400+ }
401+
402+ if rolesStillMissing := missingSystemRoles .Clone ().Subtract (newSystemRoles ); len (rolesStillMissing ) > 0 {
403+ process .logger .WarnContext (process .ExitContext (), "Partially healed instance identity but some required system roles are still missing" ,
404+ "gained_roles" , gainedSystemRoles .Elements (),
405+ "missing_roles" , rolesStillMissing .Elements ())
406+ } else {
407+ process .logger .InfoContext (process .ExitContext (), "Obtained new instance identity with all required system roles" ,
408+ "gained_roles" , gainedSystemRoles .Elements (),
409+ )
410+ }
411+
412+ if err := process .storage .WriteIdentity (state .IdentityCurrent , * newIdentity ); err != nil {
413+ return nil , trace .Wrap (err , "failed to write new identity to storage" )
414+ }
415+ return newIdentity , nil
416+ }
417+
312418// newWatcher returns a new watcher,
313419// either using local auth server connection or remote client
314420func (process * TeleportProcess ) newWatcher (conn * Connector , watch types.Watch ) (types.Watcher , error ) {
@@ -495,6 +601,8 @@ func (process *TeleportProcess) firstTimeConnectIdentityRemote(role types.System
495601 // reach this point if the Instance identity couldn't get a certificate
496602 // with this requested role, which should only happen if the new join
497603 // service with auth-assigned host UUIDs is not available.
604+ //
605+ // TODO(nklaassen): DELETE IN 20
498606 process .Config .Logger .InfoContext (process .GracefulExitContext (), "Instance identity does not include required system role, must re-join with a provision token" , "role" , role )
499607 return process .legacyJoinWithHostUUID (role , instanceIdentity .ID .HostID ())
500608 }
@@ -967,6 +1075,12 @@ func (process *TeleportProcess) rotate(conn *Connector, localState state.StateV2
9671075 for _ , baseRole := range clientIdentity .SystemRoles {
9681076 baseSystemRoles = append (baseSystemRoles , types .SystemRole (baseRole ))
9691077 }
1078+ // Dangling system roles here should only be possible if
1079+ // healInstanceIdentity failed to get the necessary roles by rejoining
1080+ // via the new join service. Handling them here should only be necessary
1081+ // until the new join service supports all join methods.
1082+ //
1083+ // TODO(nklaassen): DELETE IN 20
9701084 var danglingSystemRoles []types.SystemRole
9711085 for _ , activeRole := range process .getInstanceRoles () {
9721086 if slices .Contains (baseSystemRoles , activeRole ) {
0 commit comments