Skip to content

Commit 55ac0ba

Browse files
fix(auth): filesystem error during SSO token refresh expires session #5549
## Problem: When we get SSO cache errors, such as a failed write, we do not want to invalidate the connection. Instead we want to gracefully handle this similar to network errors, since the connection can still be recovered on a subsequent token refresh. A place that this happens is during SSO token refresh. Where if there is a filesystem error when writing the new token to disk, the connection will become invalid. ## Solution: Catch sso cache errors and wrap them in a high level `DiskCacheError` so that we can easily determine an sso cache error during token refresh. Then we can know not to invalidate the connection when there is a cache error since `isRecoverableError()` will catch the error. Additionally a message is shown to the user explaining how the sso token refresh failed and guides them to the logs in hopes that they can fix it themselves, or at least give it to us when the report the error.
1 parent 265f49e commit 55ac0ba

File tree

11 files changed

+279
-31
lines changed

11 files changed

+279
-31
lines changed

packages/amazonq/package.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,10 @@
110110
"amazonQSessionConfigurationMessage": {
111111
"type": "boolean",
112112
"default": false
113+
},
114+
"ssoCacheError": {
115+
"type": "boolean",
116+
"default": false
113117
}
114118
},
115119
"additionalProperties": false

packages/core/package.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,10 @@
192192
"codeCatalystConnectionExpired": {
193193
"type": "boolean",
194194
"default": false
195+
},
196+
"ssoCacheError": {
197+
"type": "boolean",
198+
"default": false
195199
}
196200
},
197201
"additionalProperties": false

packages/core/src/auth/auth.ts

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import * as localizedText from '../shared/localizedText'
1313
import { Credentials } from '@aws-sdk/types'
1414
import { SsoAccessTokenProvider } from './sso/ssoAccessTokenProvider'
1515
import { Timeout } from '../shared/utilities/timeoutUtils'
16-
import { errorCode, isAwsError, isNetworkError, ToolkitError, UnknownError } from '../shared/errors'
16+
import { DiskCacheError, errorCode, isAwsError, isNetworkError, ToolkitError, UnknownError } from '../shared/errors'
1717
import { getCache } from './sso/cache'
1818
import { isNonNullable, Mutable } from '../shared/utilities/tsUtils'
1919
import { builderIdStartUrl, SsoToken, truncateStartUrl } from './sso/model'
@@ -652,7 +652,7 @@ export class Auth implements AuthService, ConnectionManager {
652652
}
653653

654654
private async handleSsoTokenError(id: Connection['id'], err: unknown) {
655-
this.throwOnNetworkError(err)
655+
this.throwOnRecoverableError(err)
656656

657657
this.#validationErrors.set(id, UnknownError.cast(err))
658658
getLogger().info(`auth: Handling validation error of connection: ${id}`)
@@ -830,7 +830,7 @@ export class Auth implements AuthService, ConnectionManager {
830830
@withTelemetryContext({ name: '_getToken', class: authClassName })
831831
private async _getToken(id: Connection['id'], provider: SsoAccessTokenProvider): Promise<SsoToken> {
832832
const token = await provider.getToken().catch((err) => {
833-
this.throwOnNetworkError(err)
833+
this.throwOnRecoverableError(err)
834834

835835
this.#validationErrors.set(id, err)
836836
})
@@ -842,16 +842,25 @@ export class Auth implements AuthService, ConnectionManager {
842842
* Auth processes can fail due to reasons outside of authentication actually being expired.
843843
* We do not want to intepret these failures as invalid/expired auth tokens.
844844
*
845-
* We use this to check if the given error is unanticipated. If it is we throw,
846-
* expecting the caller to not change the state of the connection.
845+
* We use this to check if the given error is recoverable, meaning retrying getToken at
846+
* a later time could succeed. If it is recoverable, we throw, expecting the caller to not
847+
* change the state of the connection.
847848
*/
848-
private throwOnNetworkError(e: unknown) {
849+
private throwOnRecoverableError(e: unknown) {
849850
if (isNetworkError(e)) {
850851
throw new ToolkitError('Failed to update connection due to networking issues', {
851852
cause: e,
852853
code: e.code,
853854
})
854855
}
856+
857+
const possibleCacheError = DiskCacheError.instanceIf(e)
858+
if (possibleCacheError instanceof DiskCacheError) {
859+
throw new ToolkitError('Failed to update connection due to file system operation failures', {
860+
cause: possibleCacheError,
861+
code: possibleCacheError.code,
862+
})
863+
}
855864
}
856865

857866
private readonly getCredentials = keyedDebounce(this._getCredentials.bind(this))

packages/core/src/auth/sso/ssoAccessTokenProvider.ts

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ import { hasProps, hasStringProps, RequiredProps, selectFrom } from '../../share
2020
import { OidcClient } from './clients'
2121
import { loadOr } from '../../shared/utilities/cacheUtils'
2222
import {
23+
DiskCacheError,
2324
ToolkitError,
25+
getErrorMsg,
2426
getRequestId,
2527
getTelemetryReason,
2628
getTelemetryReasonDesc,
@@ -33,16 +35,18 @@ import { AwsLoginWithBrowser, AwsRefreshCredentials, telemetry } from '../../sha
3335
import { indent, toBase64URL } from '../../shared/utilities/textUtilities'
3436
import { AuthSSOServer } from './server'
3537
import { CancellationError, sleep } from '../../shared/utilities/timeoutUtils'
36-
import { getIdeProperties, isCloud9 } from '../../shared/extensionUtilities'
38+
import { getIdeProperties, isAmazonQ, isCloud9 } from '../../shared/extensionUtilities'
3739
import { randomBytes, createHash } from 'crypto'
3840
import { localize } from '../../shared/utilities/vsCodeUtils'
3941
import { randomUUID } from '../../shared/crypto'
4042
import { isRemoteWorkspace, isWebWorkspace } from '../../shared/vscode/env'
4143
import { showInputBox } from '../../shared/ui/inputPrompter'
42-
import { DevSettings } from '../../shared/settings'
44+
import { AmazonQPromptSettings, DevSettings, PromptSettings, ToolkitPromptSettings } from '../../shared/settings'
4345
import { onceChanged } from '../../shared/utilities/functionUtils'
4446
import { NestedMap } from '../../shared/utilities/map'
4547
import { asStringifiedStack } from '../../shared/telemetry/spans'
48+
import { showViewLogsMessage } from '../../shared/utilities/messages'
49+
import _ from 'lodash'
4650

4751
export const authenticationPath = 'sso/authenticated'
4852

@@ -188,6 +192,25 @@ export abstract class SsoAccessTokenProvider {
188192

189193
return refreshed
190194
} catch (err) {
195+
const possibleCacheError = DiskCacheError.instanceIf(err)
196+
if (possibleCacheError instanceof DiskCacheError) {
197+
/**
198+
* Background:
199+
* - During token refresh when saving to our filesystem/cache there are sometimes file system
200+
* related errors.
201+
* - When these errors ocurr it will cause the token refresh process to fail, and the users SSO
202+
* connection to become invalid.
203+
* - Because these filesystem errors do not indicate the SSO session is actually stale,
204+
* we want to catch these filesystem errors and not invalidate the users SSO connection since a
205+
* subsequent attempt to refresh may succeed.
206+
* - To give the user a chance to resolve their filesystem related issue, we want to point them
207+
* to the logs where the error was logged. Hopefully they can use this information to fix the issue,
208+
* or at least hint for them to provide the logs in a bug report.
209+
*/
210+
void DiskCacheErrorMessage.instance.showMessageThrottled(possibleCacheError)
211+
throw possibleCacheError
212+
}
213+
191214
if (!isNetworkError(err)) {
192215
const reason = getTelemetryReason(err)
193216
telemetry.aws_refreshCredentials.emit({
@@ -761,3 +784,53 @@ type ReAuthStateValue = {
761784
// the latest reason for why the connection was moved in to a "needs reauth" state
762785
reAuthReason?: string
763786
}
787+
788+
/**
789+
* Singleton class that manages showing the user a message during
790+
* failures that ocurr during SSO disk cache operations.
791+
*
792+
* Background:
793+
* - We need this {@link DiskCacheErrorMessage} specifically as a singleton since we want to ensure
794+
* that only 1 instance of this message appears at a time. There are cases where it shows multiple messages
795+
* if not used as a singleton.
796+
*/
797+
class DiskCacheErrorMessage {
798+
static #instance: DiskCacheErrorMessage
799+
static get instance() {
800+
return (this.#instance ??= new DiskCacheErrorMessage())
801+
}
802+
803+
/**
804+
* Show a `"don't show again"`-able message which tells the user about a file system related error
805+
* with the sso cache.
806+
*/
807+
public showMessageThrottled(error: Error) {
808+
return this._showMessageThrottled(error)
809+
}
810+
private _showMessageThrottled = _.throttle(async (error: Error) => this._showMessage(error), 60_000, {
811+
leading: true,
812+
})
813+
private async _showMessage(error: Error) {
814+
const dontShow = 'Never warn again'
815+
816+
const promptSettings: PromptSettings = isAmazonQ()
817+
? AmazonQPromptSettings.instance
818+
: ToolkitPromptSettings.instance
819+
820+
// We know 'ssoCacheError' is in all extension prompt settings
821+
if (await promptSettings.isPromptEnabled('ssoCacheError')) {
822+
const result = await showMessage()
823+
if (result === dontShow) {
824+
await promptSettings.disablePrompt('ssoCacheError')
825+
}
826+
}
827+
828+
function showMessage() {
829+
return showViewLogsMessage(
830+
`File system related error during SSO cache operation:\n"${getErrorMsg(error, true)}"`,
831+
'error',
832+
[dontShow]
833+
)
834+
}
835+
}
836+
}

packages/core/src/shared/errors.ts

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -880,7 +880,7 @@ function isEbusyError(err: Error) {
880880
}
881881

882882
/** Helper function to assert given error has the expected properties */
883-
function isError(err: Error, id: string, messageIncludes: string = '') {
883+
export function isError(err: Error, id: string, messageIncludes: string = '') {
884884
// It is not always clear if the error has the expected value in the `name` or `code` field
885885
// so this checks both.
886886
return (err.name === id || (err as any).code === id) && err.message.includes(messageIncludes)
@@ -984,6 +984,65 @@ export class AwsClientResponseError extends Error {
984984
}
985985
}
986986

987+
/**
988+
* Represents a generalized error that happened during a disk cache operation.
989+
*
990+
* For example, when SSO refreshes a token a disk cache error can occur when it
991+
* attempts to read/write the disk cache. These errors can be recoverable and do not
992+
* imply that the SSO session is stale. So by wrapping those errors in an instance of
993+
* this class it will help to distinguish them.
994+
*/
995+
export class DiskCacheError extends Error {
996+
#code: string | undefined
997+
998+
/** Use {@link DiskCacheError.instanceIf()} to create an instance */
999+
protected constructor(message: string, options?: { code?: string }) {
1000+
super(message)
1001+
this.#code = options?.code
1002+
}
1003+
1004+
/**
1005+
* We are seeing these errors in telemetry, but they have no error message attached.
1006+
* The best we can do is assume these are filesystem errors in the context that we see them.
1007+
*/
1008+
private static fileSystemErrorsWithoutMessage = ['EACCES', 'EBADF']
1009+
1010+
/**
1011+
* Return an instance of {@link DiskCacheError} that consists of the properties from the
1012+
* given error IF certain conditions are met. Otherwise, the original error is returned.
1013+
*
1014+
* - Also returns the original error if it is already a {@link DiskCacheError}.
1015+
*/
1016+
public static instanceIf<T>(err: T): DiskCacheError | T {
1017+
if (!(err instanceof Error) || err instanceof DiskCacheError) {
1018+
return err
1019+
}
1020+
1021+
const errorId = (err as any).code ?? err.name
1022+
1023+
if (DiskCacheError.fileSystemErrorsWithoutMessage.includes(errorId)) {
1024+
// The error message will probably be blank
1025+
const message = err.message ? err.message : 'No msg'
1026+
return new DiskCacheError(message, { code: errorId })
1027+
}
1028+
1029+
// These are errors we were seeing in telemetry
1030+
if (
1031+
isError(err, 'ENOSPC', 'no space left on device') ||
1032+
isError(err, 'EPERM', 'operation not permitted') ||
1033+
isError(err, 'EBUSY', 'resource busy or locked')
1034+
) {
1035+
return new DiskCacheError(err.message, { code: errorId })
1036+
}
1037+
1038+
return err // the error does no meet the conditions to be a DiskCacheError
1039+
}
1040+
1041+
public get code() {
1042+
return this.#code
1043+
}
1044+
}
1045+
9871046
/**
9881047
* Run a function and swallow any errors that are not specified by `shouldThrow`
9891048
*/

packages/core/src/shared/settings-amazonq.gen.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ export const amazonqSettings = {
1616
"codeWhispererNewWelcomeMessage": {},
1717
"codeWhispererConnectionExpired": {},
1818
"amazonQWelcomePage": {},
19-
"amazonQSessionConfigurationMessage": {}
19+
"amazonQSessionConfigurationMessage": {},
20+
"ssoCacheError": {}
2021
},
2122
"amazonQ.showInlineCodeSuggestionsWithCodeReferences": {},
2223
"amazonQ.importRecommendationForInlineCodeSuggestions": {},

packages/core/src/shared/settings-toolkit.gen.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ export const toolkitSettings = {
3636
"createCredentialsProfile": {},
3737
"samcliConfirmDevStack": {},
3838
"remoteConnected": {},
39-
"codeCatalystConnectionExpired": {}
39+
"codeCatalystConnectionExpired": {},
40+
"ssoCacheError": {}
4041
},
4142
"aws.experiments": {
4243
"jsonResourceModification": {}

packages/core/src/shared/settings.ts

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -620,10 +620,13 @@ export function fromExtensionManifest<T extends TypeDescriptor & Partial<Section
620620
*/
621621
export const toolkitPrompts = settingsProps['aws.suppressPrompts']
622622
type toolkitPromptName = keyof typeof toolkitPrompts
623-
export class ToolkitPromptSettings extends Settings.define(
624-
'aws.suppressPrompts',
625-
toRecord(keys(toolkitPrompts), () => Boolean)
626-
) {
623+
export class ToolkitPromptSettings
624+
extends Settings.define(
625+
'aws.suppressPrompts',
626+
toRecord(keys(toolkitPrompts), () => Boolean)
627+
)
628+
implements PromptSettings
629+
{
627630
public async isPromptEnabled(promptName: toolkitPromptName): Promise<boolean> {
628631
try {
629632
return !this._getOrThrow(promptName, false)
@@ -650,10 +653,13 @@ export class ToolkitPromptSettings extends Settings.define(
650653

651654
export const amazonQPrompts = settingsProps['amazonQ.suppressPrompts']
652655
type amazonQPromptName = keyof typeof amazonQPrompts
653-
export class AmazonQPromptSettings extends Settings.define(
654-
'amazonQ.suppressPrompts',
655-
toRecord(keys(amazonQPrompts), () => Boolean)
656-
) {
656+
export class AmazonQPromptSettings
657+
extends Settings.define(
658+
'amazonQ.suppressPrompts',
659+
toRecord(keys(amazonQPrompts), () => Boolean)
660+
)
661+
implements PromptSettings
662+
{
657663
public async isPromptEnabled(promptName: amazonQPromptName): Promise<boolean> {
658664
try {
659665
return !this._getOrThrow(promptName, false)
@@ -678,6 +684,18 @@ export class AmazonQPromptSettings extends Settings.define(
678684
}
679685
}
680686

687+
/**
688+
* Use cautiously as this is misleading. Ideally we create a type
689+
* which is the intersection of the types (only the values that occur
690+
* in each are selected), but idk how to do that.
691+
*/
692+
type AllPromptNames = amazonQPromptName | toolkitPromptName
693+
694+
export interface PromptSettings {
695+
isPromptEnabled(promptName: AllPromptNames): Promise<boolean>
696+
disablePrompt(promptName: AllPromptNames): Promise<void>
697+
}
698+
681699
const experiments = settingsProps['aws.experiments']
682700
type ExperimentName = keyof typeof experiments
683701

0 commit comments

Comments
 (0)