Skip to content

Commit 3f3458c

Browse files
committed
[v18] heal instance identity by re-joining via new join service
Backport #59872 to branch/v18
1 parent 4ac5ef2 commit 3f3458c

File tree

3 files changed

+124
-8
lines changed

3 files changed

+124
-8
lines changed

lib/join/joinclient/join.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,10 @@ func Join(ctx context.Context, params JoinParams) (*JoinResult, error) {
5050
if err := params.CheckAndSetDefaults(); err != nil {
5151
return nil, trace.Wrap(err)
5252
}
53-
if params.ID.HostUUID != "" {
53+
if params.AuthClient == nil && params.ID.HostUUID != "" {
54+
// This check is skipped if AuthClient is provided because this is a
55+
// re-join with an existing identity and the HostUUID will be
56+
// maintained.
5457
return nil, trace.BadParameter("HostUUID must not be provided to Join, it will be assigned by the Auth server")
5558
}
5659
if params.ID.Role != types.RoleInstance && params.ID.Role != types.RoleBot {

lib/service/connect.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ import (
6464
"github.com/gravitational/teleport/lib/utils"
6565
"github.com/gravitational/teleport/lib/utils/interval"
6666
logutils "github.com/gravitational/teleport/lib/utils/log"
67+
"github.com/gravitational/teleport/lib/utils/set"
6768
)
6869

6970
const updateClientsJoinWarning = "This agent joined the cluster during the update_clients phase of a host CA rotation, so its services might not be usable by clients that haven't logged in recently."
@@ -226,6 +227,19 @@ func (process *TeleportProcess) connect(role types.SystemRole, opts ...certOptio
226227
return nil, trace.Wrap(err)
227228
}
228229

230+
if role == types.RoleInstance {
231+
// If necessary, heal the instance identity by rejoining to get a new
232+
// identity with all required system roles. This is best-effort, any
233+
// error will be logged and the current identity will continue to be
234+
// used.
235+
newIdentity, err := process.healInstanceIdentity(identity)
236+
if err != nil {
237+
process.logger.WarnContext(process.ExitContext(), "Failed to heal instance identity", "error", err)
238+
} else {
239+
identity = newIdentity
240+
}
241+
}
242+
229243
rotation := processState.Spec.Rotation
230244

231245
switch rotation.State {
@@ -309,6 +323,98 @@ func (process *TeleportProcess) connect(role types.SystemRole, opts ...certOptio
309323
}
310324
}
311325

326+
func (process *TeleportProcess) healInstanceIdentity(currentIdentity *state.Identity) (*state.Identity, error) {
327+
currentSystemRoles := set.New(currentIdentity.SystemRoles...)
328+
wantSystemRoles := set.NewWithCapacity[string](len(process.instanceRoles))
329+
for role := range process.instanceRoles {
330+
wantSystemRoles.Add(string(role))
331+
}
332+
333+
missingSystemRoles := wantSystemRoles.Clone().Subtract(currentSystemRoles)
334+
if len(missingSystemRoles) == 0 {
335+
// The current instance identity contains all required roles, nothing to do.
336+
return currentIdentity, nil
337+
}
338+
339+
process.logger.InfoContext(process.ExitContext(), "Instance identity is missing required system roles, will attempt to self-heal", "missing_roles", missingSystemRoles.Elements())
340+
additionalPrincipals, dnsNames := process.instanceAdditionalPrincipals()
341+
var (
342+
newIdentity *state.Identity
343+
err error
344+
)
345+
if server := process.getLocalAuth(); server != nil {
346+
process.logger.InfoContext(process.ExitContext(), "Generating new Instance identity with local auth service")
347+
newIdentity, err = auth.GenerateIdentity(server, currentIdentity.ID, additionalPrincipals, dnsNames)
348+
if err != nil {
349+
return nil, trace.Wrap(err, "failed to generate new instance identity with local auth service")
350+
}
351+
} else {
352+
process.logger.InfoContext(process.ExitContext(), "Must rejoin to get a new Instance identity")
353+
if !process.Config.HasToken() {
354+
return nil, trace.Errorf("must rejoin to obtain missing system roles but no join token is configured")
355+
}
356+
357+
// Make an auth client authenticated with the current instance identity to use for the rejoin.
358+
currentConnector, err := process.getConnector(currentIdentity, currentIdentity)
359+
if err != nil {
360+
return nil, trace.Wrap(err, "failed to make connector with current instance identity")
361+
}
362+
currentAuthClient := currentConnector.Client
363+
364+
// Rejoin.
365+
joinParams, err := process.makeJoinParams(
366+
currentIdentity.ID,
367+
additionalPrincipals,
368+
dnsNames,
369+
)
370+
if err != nil {
371+
return nil, trace.Wrap(err, "failed to make join params")
372+
}
373+
joinParams.AuthClient = currentAuthClient
374+
rejoinResult, err := joinclient.Join(process.GracefulExitContext(), *joinParams)
375+
if err != nil {
376+
return nil, trace.Wrap(err, "failed to rejoin")
377+
}
378+
privateKeyPEM, err := keys.MarshalPrivateKey(rejoinResult.PrivateKey)
379+
if err != nil {
380+
return nil, trace.Wrap(err, "failed to marshal private key")
381+
}
382+
newIdentity, err = state.ReadIdentityFromKeyPair(privateKeyPEM, rejoinResult.Certs)
383+
if err != nil {
384+
return nil, trace.Wrap(err, "failed to parse new identity")
385+
}
386+
}
387+
388+
newSystemRoles := set.New(newIdentity.SystemRoles...)
389+
390+
// Sanity check we didn't lose any system roles.
391+
if lostRoles := currentSystemRoles.Clone().Subtract(newSystemRoles); len(lostRoles) > 0 {
392+
return nil, trace.Errorf("new Instance identity is missing the following system roles from the current identity (this is a bug): %v", lostRoles.Elements())
393+
}
394+
395+
gainedSystemRoles := newSystemRoles.Clone().Subtract(currentSystemRoles)
396+
if len(gainedSystemRoles) == 0 {
397+
process.logger.WarnContext(process.ExitContext(), "Did not gain any system roles")
398+
// Don't bother saving or returning the new identity if no system roles were gained.
399+
return currentIdentity, nil
400+
}
401+
402+
if rolesStillMissing := missingSystemRoles.Clone().Subtract(newSystemRoles); len(rolesStillMissing) > 0 {
403+
process.logger.WarnContext(process.ExitContext(), "Partially healed instance identity but some required system roles are still missing",
404+
"gained_roles", gainedSystemRoles.Elements(),
405+
"missing_roles", rolesStillMissing.Elements())
406+
} else {
407+
process.logger.InfoContext(process.ExitContext(), "Obtained new instance identity with all required system roles",
408+
"gained_roles", gainedSystemRoles.Elements(),
409+
)
410+
}
411+
412+
if err := process.storage.WriteIdentity(state.IdentityCurrent, *newIdentity); err != nil {
413+
return nil, trace.Wrap(err, "failed to write new identity to storage")
414+
}
415+
return newIdentity, nil
416+
}
417+
312418
// newWatcher returns a new watcher,
313419
// either using local auth server connection or remote client
314420
func (process *TeleportProcess) newWatcher(conn *Connector, watch types.Watch) (types.Watcher, error) {
@@ -495,6 +601,8 @@ func (process *TeleportProcess) firstTimeConnectIdentityRemote(role types.System
495601
// reach this point if the Instance identity couldn't get a certificate
496602
// with this requested role, which should only happen if the new join
497603
// service with auth-assigned host UUIDs is not available.
604+
//
605+
// TODO(nklaassen): DELETE IN 20
498606
process.Config.Logger.InfoContext(process.GracefulExitContext(), "Instance identity does not include required system role, must re-join with a provision token", "role", role)
499607
return process.legacyJoinWithHostUUID(role, instanceIdentity.ID.HostID())
500608
}
@@ -967,6 +1075,12 @@ func (process *TeleportProcess) rotate(conn *Connector, localState state.StateV2
9671075
for _, baseRole := range clientIdentity.SystemRoles {
9681076
baseSystemRoles = append(baseSystemRoles, types.SystemRole(baseRole))
9691077
}
1078+
// Dangling system roles here should only be possible if
1079+
// healInstanceIdentity failed to get the necessary roles by rejoining
1080+
// via the new join service. This handling should only be necessary until the
1081+
// new join service supports all join methods.
1082+
//
1083+
// TODO(nklaassen): DELETE IN 20
9701084
var danglingSystemRoles []types.SystemRole
9711085
for _, activeRole := range process.getInstanceRoles() {
9721086
if slices.Contains(baseSystemRoles, activeRole) {

lib/service/service_test.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1057,13 +1057,6 @@ func TestInstanceSelfRepair(t *testing.T) {
10571057
// the previous Instance and Proxy certs, but enable the SSH service and
10581058
// provide the token that allows only role Node.
10591059
process = newStartedProcess(sshToken.GetName(), true)
1060-
// Wait for the TeleportCredentialsUpdatedEvent which will be emitted after
1061-
// the rotation logic detects the dangling system role and repairs the
1062-
// Instance identity.
1063-
ctx, cancel := context.WithTimeout(t.Context(), 30*time.Second)
1064-
defer cancel()
1065-
_, err = process.WaitForEvent(ctx, TeleportCredentialsUpdatedEvent)
1066-
require.NoError(t, err)
10671060
// Get the new Instance identity and make sure it includes both the Proxy
10681061
// and Node system roles.
10691062
instanceConnector, err := process.WaitForConnector(InstanceIdentityEvent, logger)
@@ -1072,6 +1065,12 @@ func TestInstanceSelfRepair(t *testing.T) {
10721065
instanceID := instanceConnector.clientState.Load().identity
10731066
require.Equal(t, types.RoleInstance, instanceID.ID.Role)
10741067
assert.ElementsMatch(t, []string{types.RoleProxy.String(), types.RoleNode.String()}, instanceID.SystemRoles)
1068+
// Make sure the SSH identity becomes available.
1069+
sshConnector, err := process.WaitForConnector(SSHIdentityEvent, logger)
1070+
require.NoError(t, err)
1071+
require.NotNil(t, sshConnector)
1072+
sshID := sshConnector.clientState.Load().identity
1073+
require.Equal(t, types.RoleNode, sshID.ID.Role)
10751074
// Close the process to clean up.
10761075
require.NoError(t, process.Close())
10771076
require.NoError(t, process.Wait())

0 commit comments

Comments
 (0)