diff --git a/client/internal/login.go b/client/internal/login.go index f528783ef0d..8dbd81972a6 100644 --- a/client/internal/login.go +++ b/client/internal/login.go @@ -2,7 +2,9 @@ package internal import ( "context" + "fmt" "net/url" + "time" "github.com/google/uuid" log "github.com/sirupsen/logrus" @@ -71,6 +73,11 @@ func Login(ctx context.Context, config *profilemanager.Config, setupKey string, serverKey, _, err := doMgmLogin(ctx, mgmClient, pubSSHKey, config) if serverKey != nil && isRegistrationNeeded(err) { + // Only attempt registration if we have credentials (setup key or JWT token) + if setupKey == "" && jwtToken == "" { + log.Debugf("peer registration required but no credentials provided") + return err + } log.Debugf("peer registration required") _, err = registerPeer(ctx, *serverKey, mgmClient, setupKey, jwtToken, pubSSHKey, config) if err != nil { @@ -163,6 +170,11 @@ func registerPeer(ctx context.Context, serverPublicKey wgtypes.Key, client *mgm. ) loginResp, err := client.Register(serverPublicKey, validSetupKey.String(), jwtToken, info, pubSSHKey, config.DNSLabels) if err != nil { + // Check if this is a timeout that might succeed on the server side + if s, ok := status.FromError(err); ok && s.Code() == codes.DeadlineExceeded { + log.Infof("registration request timed out, waiting for server to complete processing...") + return retryLoginAfterRegistrationTimeout(ctx, client, serverPublicKey, pubSSHKey, config) + } log.Errorf("failed registering peer %v", err) return nil, err } @@ -172,6 +184,89 @@ func registerPeer(ctx context.Context, serverPublicKey wgtypes.Key, client *mgm. return loginResp, nil } +// retryLoginAfterRegistrationTimeout handles the case where a registration request times out +// but may have succeeded on the server side. It waits for the server to complete processing +// and then retries with login requests to avoid showing errors to the user. +func retryLoginAfterRegistrationTimeout( + ctx context.Context, + client *mgm.GrpcClient, + serverPublicKey wgtypes.Key, + pubSSHKey []byte, + config *profilemanager.Config, +) (*mgmProto.LoginResponse, error) { + // Wait periods between login attempts: 60s, 40s, 40s (total ~180s with attempts) + waitPeriods := []time.Duration{60 * time.Second, 40 * time.Second, 40 * time.Second} + + for i, waitDuration := range waitPeriods { + attemptNum := i + 1 + + // Check if context is cancelled before sleeping + if ctx.Err() != nil { + return nil, ctx.Err() + } + + // Sleep to allow server to complete registration + log.Infof("waiting %v before login attempt %d/3...", waitDuration, attemptNum) + select { + case <-time.After(waitDuration): + // Continue to login attempt + case <-ctx.Done(): + return nil, ctx.Err() + } + + // Attempt login + log.Debugf("attempting login %d/3 after registration timeout", attemptNum) + sysInfo := system.GetInfo(ctx) + sysInfo.SetFlags( + config.RosenpassEnabled, + config.RosenpassPermissive, + config.ServerSSHAllowed, + config.DisableClientRoutes, + config.DisableServerRoutes, + config.DisableDNS, + config.DisableFirewall, + config.BlockLANAccess, + config.BlockInbound, + config.LazyConnectionEnabled, + config.EnableSSHRoot, + config.EnableSSHSFTP, + config.EnableSSHLocalPortForwarding, + config.EnableSSHRemotePortForwarding, + config.DisableSSHAuth, + ) + + loginResp, err := client.Login(serverPublicKey, sysInfo, pubSSHKey, config.DNSLabels) + if err == nil { + log.Infof("registration completed successfully on server (recovered from timeout)") + return loginResp, nil + } + + // Check error type + if s, ok := status.FromError(err); ok { + switch s.Code() { + case codes.PermissionDenied, codes.NotFound: + // Peer not ready yet, continue to next retry + log.Debugf("peer not ready yet (attempt %d/3): %v", attemptNum, s.Code()) + continue + default: + // Unexpected error, return it + log.Errorf("login attempt %d/3 failed with unexpected error: %v", attemptNum, err) + return nil, fmt.Errorf("registration timed out and login recovery failed: %w", err) + } + } + + // Last attempt - return descriptive error + if attemptNum == 3 { + return nil, status.Errorf(codes.DeadlineExceeded, + "peer registration is taking longer than expected, please try 'netbird up' again in a few minutes") + } + } + + // Should never reach here, but just in case + return nil, status.Errorf(codes.DeadlineExceeded, + "peer registration is taking longer than expected, please try 'netbird up' again in a few minutes") +} + func isLoginNeeded(err error) bool { if err == nil { return false diff --git a/shared/management/client/grpc.go b/shared/management/client/grpc.go index 520a83e3604..8305eb8fe9e 100644 --- a/shared/management/client/grpc.go +++ b/shared/management/client/grpc.go @@ -25,7 +25,17 @@ import ( "github.com/netbirdio/netbird/util/wsproxy" ) -const ConnectTimeout = 10 * time.Second +const ( + ConnectTimeout = 10 * time.Second + + RegistrationRetryTimeout = 180 * time.Second + RegistrationRetryInterval = 5 * time.Second + RegistrationRetryMaxDelay = 30 * time.Second + + LoginRetryTimeout = 45 * time.Second + LoginRetryInterval = 3 * time.Second + LoginRetryMaxDelay = 15 * time.Second +) const ( errMsgMgmtPublicKey = "failed getting Management Service public key: %s" @@ -312,6 +322,38 @@ func (c *GrpcClient) IsHealthy() bool { return true } +// newRegistrationBackoff creates a backoff strategy for peer registration operations +func newRegistrationBackoff(ctx context.Context) backoff.BackOff { + return backoff.WithContext(&backoff.ExponentialBackOff{ + InitialInterval: RegistrationRetryInterval, + RandomizationFactor: backoff.DefaultRandomizationFactor, + Multiplier: backoff.DefaultMultiplier, + MaxInterval: RegistrationRetryMaxDelay, + MaxElapsedTime: RegistrationRetryTimeout, + Stop: backoff.Stop, + Clock: backoff.SystemClock, + }, ctx) +} + +// newLoginBackoff creates a backoff strategy for peer login operations +func newLoginBackoff(ctx context.Context) backoff.BackOff { + return backoff.WithContext(&backoff.ExponentialBackOff{ + InitialInterval: LoginRetryInterval, + RandomizationFactor: backoff.DefaultRandomizationFactor, + Multiplier: backoff.DefaultMultiplier, + MaxInterval: LoginRetryMaxDelay, + MaxElapsedTime: LoginRetryTimeout, + Stop: backoff.Stop, + Clock: backoff.SystemClock, + }, ctx) +} + +// isRegistrationRequest determines if a login request is actually a registration. +// Returns true if either the setup key or the JWT token is present in the request. +func isRegistrationRequest(req *proto.LoginRequest) bool { + return req.SetupKey != "" || req.JwtToken != "" +} + func (c *GrpcClient) login(serverKey wgtypes.Key, req *proto.LoginRequest) (*proto.LoginResponse, error) { if !c.ready() { return nil, errors.New(errMsgNoMgmtConnection) @@ -324,6 +366,8 @@ func (c *GrpcClient) login(serverKey wgtypes.Key, req *proto.LoginRequest) (*pro } var resp *proto.EncryptedMessage + isRegistration := isRegistrationRequest(req) + operation := func() error { mgmCtx, cancel := context.WithTimeout(context.Background(), ConnectTimeout) defer cancel() @@ -334,9 +378,24 @@ func (c *GrpcClient) login(serverKey wgtypes.Key, req *proto.LoginRequest) (*pro Body: loginReq, }) if err != nil { - // retry only on context canceled - if s, ok := gstatus.FromError(err); ok && s.Code() == codes.Canceled { - return err + if s, ok := gstatus.FromError(err); ok { + if s.Code() == codes.Canceled { + log.Debugf("login canceled") + return backoff.Permanent(err) + } + if s.Code() == codes.DeadlineExceeded { + // For registration, deadline exceeded should not retry as it may cause + // duplicate device creation. The client timeout is short (10s) but server + // processing (DB writes, group calculations, IdP validation) can take longer. + // Retrying would send parallel requests that each create a new device. + if isRegistration { + log.Debugf("registration timeout - not retrying to avoid duplicates") + return backoff.Permanent(err) + } + // For regular login, we can safely retry since it's idempotent + log.Debugf("login timeout, retrying...") + return err + } } return backoff.Permanent(err) } @@ -344,7 +403,14 @@ func (c *GrpcClient) login(serverKey wgtypes.Key, req *proto.LoginRequest) (*pro return nil } - err = backoff.Retry(operation, nbgrpc.Backoff(c.ctx)) + var backoffStrategy backoff.BackOff + if isRegistration { + backoffStrategy = newRegistrationBackoff(c.ctx) + } else { + backoffStrategy = newLoginBackoff(c.ctx) + } + + err = backoff.Retry(operation, backoffStrategy) if err != nil { log.Errorf("failed to login to Management Service: %v", err) return nil, err