Skip to content

Commit cc8686f

Browse files
refactor: make an atomic write (#5361)
Problem: When writing to our SSO cache we noticed malformed json files that had our token. As a solution we did an "atomic" write where we wrote a temp file then renamed. This prevented the content from being malformed due to a separate write. The issue is that the combination of writing a temp file, then renaming it fails sometimes as indicated by our metric `aws_refreshCredentials` where we were seeing a 'FileNotFound' error. Solution: Move the atomic write logic in to `FileSystem.writeFile()` itself, where it is triggered as part of an argument to the function. Additionally when the `rename()` method is called under the hood of `writeFile()`, we validate that the file exists by retrying a few times. We suspect this to be the culprit of the `FileNotFound` error, see the comments in code for more info. Signed-off-by: Nikolas Komonen <[email protected]>
1 parent b5b0992 commit cc8686f

File tree

5 files changed

+166
-26
lines changed

5 files changed

+166
-26
lines changed

packages/core/src/shared/fs/fs.ts

Lines changed: 101 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,20 @@ import os from 'os'
77
import { promises as nodefs, constants as nodeConstants, WriteFileOptions } from 'fs'
88
import { isCloud9 } from '../extensionUtilities'
99
import _path from 'path'
10-
import { PermissionsError, PermissionsTriplet, ToolkitError, isFileNotFoundError, isPermissionsError } from '../errors'
10+
import {
11+
PermissionsError,
12+
PermissionsTriplet,
13+
ToolkitError,
14+
isFileNotFoundError,
15+
isPermissionsError,
16+
scrubNames,
17+
} from '../errors'
1118
import globals from '../extensionGlobals'
1219
import { isWin } from '../vscode/env'
1320
import { resolvePath } from '../utilities/pathUtils'
21+
import crypto from 'crypto'
22+
import { waitUntil } from '../utilities/timeoutUtils'
23+
import { telemetry } from '../telemetry/telemetry'
1424

1525
const vfs = vscode.workspace.fs
1626
type Uri = vscode.Uri
@@ -182,24 +192,57 @@ export class FileSystem {
182192
*
183193
* @param path File location
184194
* @param data File content
185-
* @param opt File permissions/flags. Only works in a non-web (nodejs) context. If provided,
195+
* @param opts File permissions/flags. Only works in a non-web (nodejs) context. If provided,
186196
* nodejs filesystem interface is used instead of routing through vscode VFS.
197+
* @param opts.atomic If true, ensures content is not corrupted from a concurrent write.
198+
* This is not truly atomic as an async write can override the final result,
199+
* but if the content is structured like JSON it will still be parseable.
200+
*
201+
* Eg: we were seeing a JSON file with an unexpected extra '}' and when parsing it
202+
* failed. We suspected a race condition from separate write, and all we wanted was
203+
* a parseable JSON.
204+
*
205+
* This optional has an uncalcuated performance impact as there is an additional
206+
* rename() operation.
187207
*/
188-
async writeFile(path: Uri | string, data: string | Uint8Array, opt?: WriteFileOptions): Promise<void> {
208+
async writeFile(
209+
path: Uri | string,
210+
data: string | Uint8Array,
211+
opts?: WriteFileOptions & { atomic?: boolean }
212+
): Promise<void> {
189213
const uri = this.#toUri(path)
190214
const errHandler = createPermissionsErrorHandler(this.isWeb, uri, '*w*')
191215
const content = this.#toBytes(data)
192-
// - Special case: if `opt` is given, use nodejs directly. This isn't ideal, but is the only
193-
// way (unless you know better) we can let callers specify permissions.
194-
// - Cloud9 vscode.workspace.writeFile has limited functionality, e.g. cannot write outside
195-
// of open workspace.
196-
const useNodejs = (opt && !this.isWeb) || isCloud9()
197216

198-
if (useNodejs) {
199-
return nodefs.writeFile(uri.fsPath, content, opt).catch(errHandler)
217+
if (this.isWeb) {
218+
return vfs.writeFile(uri, content).then(undefined, errHandler)
200219
}
201220

202-
return vfs.writeFile(uri, content).then(undefined, errHandler)
221+
// Node writeFile is the only way to set `writeOpts`, such as the `mode`, on a file .
222+
// When not in web we will use Node's writeFile() for all other scenarios.
223+
// It also has better error messages than VS Code's writeFile().
224+
let write = (u: Uri) => nodefs.writeFile(u.fsPath, content, opts).then(undefined, errHandler)
225+
226+
if (isCloud9()) {
227+
// In Cloud9 vscode.workspace.writeFile has limited functionality, e.g. cannot write outside
228+
// of open workspace.
229+
//
230+
// This is future proofing in the scenario we switch the initial implementation of `write()`
231+
// to something else, C9 will still use node fs.
232+
write = (u: Uri) => nodefs.writeFile(u.fsPath, content, opts).then(undefined, errHandler)
233+
}
234+
235+
// Node writeFile does NOT create parent folders by default, unlike VS Code FS writeFile()
236+
await fs.mkdir(_path.dirname(uri.fsPath))
237+
238+
if (opts?.atomic) {
239+
const tempFile = this.#toUri(`${uri.fsPath}.${crypto.randomBytes(8).toString('hex')}.tmp`)
240+
await write(tempFile)
241+
await fs.rename(tempFile, uri)
242+
return
243+
} else {
244+
await write(uri)
245+
}
203246
}
204247

205248
async rename(oldPath: vscode.Uri | string, newPath: vscode.Uri | string) {
@@ -211,9 +254,56 @@ export class FileSystem {
211254
return nodefs.rename(oldUri.fsPath, newUri.fsPath).catch(errHandler)
212255
}
213256

257+
/**
258+
* We were seeing 'FileNotFound' errors during renames, even though we did a `writeFile()` right before the rename.
259+
* The error looks to be from here: https://github.com/microsoft/vscode/blob/09d5f4efc5089ce2fc5c8f6aeb51d728d7f4e758/src/vs/platform/files/node/diskFileSystemProvider.ts#L747
260+
* So a guess is that the exists()(stat() under the hood) call needs to be retried since there may be a race condition.
261+
*/
262+
let attempts = 0
263+
const isExists = await waitUntil(async () => {
264+
const result = await fs.exists(oldUri)
265+
attempts += 1
266+
return result
267+
}, FileSystem.renameTimeoutOpts)
268+
// TODO: Move the `ide_fileSystem` or some variation of this metric in to common telemetry
269+
// TODO: Deduplicate the `ide_fileSystem` call. Maybe have a method that all operations pass through which wraps the call in telemetry.
270+
// Then look to report telemetry failure events.
271+
const scrubbedPath = scrubNames(oldUri.fsPath)
272+
if (!isExists) {
273+
// source path never existed after multiple attempts.
274+
// Either the file never existed, or had we waited longer it would have. We won't know.
275+
telemetry.ide_fileSystem.emit({
276+
result: 'Failed',
277+
action: 'rename',
278+
reason: 'SourceNotExists',
279+
reasonDesc: `After ${FileSystem.renameTimeoutOpts.timeout}ms the source path did not exist: ${scrubbedPath}`,
280+
})
281+
} else if (attempts > 1) {
282+
// Indicates that rename() would have failed if we had not waited for it to exist.
283+
telemetry.ide_fileSystem.emit({
284+
result: 'Succeeded',
285+
action: 'rename',
286+
reason: 'RenameRaceCondition',
287+
reasonDesc: `After multiple attempts the source path existed: ${scrubbedPath}`,
288+
attempts: attempts,
289+
})
290+
}
291+
214292
return vfs.rename(oldUri, newUri, { overwrite: true }).then(undefined, errHandler)
215293
}
216294

295+
/**
296+
* It looks like scenario of a failed rename is rare,
297+
* so we can afford to have a longer timeout and interval.
298+
*
299+
* These values are an arbitrary guess to how long it takes
300+
* for a newly created file to be visible on the filesystem.
301+
*/
302+
static readonly renameTimeoutOpts = {
303+
timeout: 10_000,
304+
interval: 300,
305+
} as const
306+
217307
/**
218308
* The stat of the file, throws if the file does not exist or on any other error.
219309
*/

packages/core/src/shared/telemetry/vscodeTelemetry.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,20 @@
273273
}
274274
],
275275
"metrics": [
276+
{
277+
"name": "ide_fileSystem",
278+
"description": "File System event on execution",
279+
"metadata": [
280+
{
281+
"type": "action",
282+
"required": true
283+
},
284+
{
285+
"type": "attempts",
286+
"required": false
287+
}
288+
]
289+
},
276290
{
277291
"name": "vscode_executeCommand",
278292
"description": "Emitted whenever a registered Toolkit command is executed",

packages/core/src/shared/utilities/cacheUtils.ts

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import * as vscode from 'vscode'
77
import { dirname } from 'path'
88
import { ToolkitError, isFileNotFoundError } from '../errors'
99
import fs from '../../shared/fs/fs'
10-
import crypto from 'crypto'
1110
import { isWeb } from '../extensionGlobals'
1211
import type { MapSync } from './map'
1312

@@ -131,14 +130,12 @@ export function createDiskCache<T, K>(
131130
await fs.mkdir(dirname(target))
132131
if (isWeb()) {
133132
// There is no web-compatible rename() method. So do a regular write.
134-
await fs.writeFile(target, JSON.stringify(data), { mode: 0o600 })
133+
await fs.writeFile(target, JSON.stringify(data))
135134
} else {
136135
// With SSO cache we noticed malformed JSON on read. A guess is that multiple writes
137136
// are occuring at the same time. The following is a bandaid that ensures an all-or-nothing
138137
// write, though there can still be race conditions with which version remains after overwrites.
139-
const tempFile = `${target}.tmp-${crypto.randomBytes(4).toString('hex')}`
140-
await fs.writeFile(tempFile, JSON.stringify(data), { mode: 0o600 })
141-
await fs.rename(tempFile, target)
138+
await fs.writeFile(target, JSON.stringify(data), { mode: 0o600, atomic: true })
142139
}
143140
} catch (error) {
144141
throw ToolkitError.chain(error, `Failed to save "${target}"`, {

packages/core/src/test/shared/fs/fs.test.ts

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@ import * as path from 'path'
99
import * as utils from 'util'
1010
import { existsSync, mkdirSync, promises as nodefs, readFileSync, rmSync } from 'fs'
1111
import { FakeExtensionContext } from '../../fakeExtensionContext'
12-
import fs from '../../../shared/fs/fs'
12+
import fs, { FileSystem } from '../../../shared/fs/fs'
1313
import * as os from 'os'
1414
import { isMinVscode, isWin } from '../../../shared/vscode/env'
1515
import Sinon from 'sinon'
1616
import * as extensionUtilities from '../../../shared/extensionUtilities'
17-
import { PermissionsError, formatError, isFileNotFoundError } from '../../../shared/errors'
17+
import { PermissionsError, formatError, isFileNotFoundError, scrubNames } from '../../../shared/errors'
1818
import { EnvironmentVariables } from '../../../shared/environmentVariables'
1919
import * as testutil from '../../testUtil'
2020
import globals from '../../../shared/extensionGlobals'
@@ -67,10 +67,14 @@ describe('FileSystem', function () {
6767
})
6868

6969
describe('writeFile()', function () {
70-
it('writes a file', async function () {
71-
const filePath = createTestPath('myFileName')
72-
await fs.writeFile(filePath, 'MyContent')
73-
assert.strictEqual(readFileSync(filePath, 'utf-8'), 'MyContent')
70+
const opts: { atomic: boolean }[] = [{ atomic: false }, { atomic: true }]
71+
72+
opts.forEach((opt) => {
73+
it(`writes a file (atomic: ${opt.atomic})`, async function () {
74+
const filePath = createTestPath('myFileName')
75+
await fs.writeFile(filePath, 'MyContent', opt)
76+
assert.strictEqual(readFileSync(filePath, 'utf-8'), 'MyContent')
77+
})
7478
})
7579

7680
it('writes a file with encoded text', async function () {
@@ -340,6 +344,7 @@ describe('FileSystem', function () {
340344

341345
assert.strictEqual(await fs.readFileAsString(newPath), 'hello world')
342346
assert(!existsSync(oldPath))
347+
assert.deepStrictEqual(testutil.getMetrics('ide_fileSystem').length, 0)
343348
})
344349

345350
it('renames a folder', async () => {
@@ -363,6 +368,43 @@ describe('FileSystem', function () {
363368
assert.strictEqual(await fs.readFileAsString(newPath), 'hello world')
364369
assert(!existsSync(oldPath))
365370
})
371+
372+
it('throws if source does not exist', async () => {
373+
const clock = testutil.installFakeClock()
374+
try {
375+
const oldPath = createTestPath('oldFile.txt')
376+
const newPath = createTestPath('newFile.txt')
377+
378+
const result = fs.rename(oldPath, newPath)
379+
await clock.tickAsync(FileSystem.renameTimeoutOpts.timeout)
380+
await assert.rejects(result)
381+
382+
testutil.assertTelemetry('ide_fileSystem', {
383+
action: 'rename',
384+
result: 'Failed',
385+
reason: 'SourceNotExists',
386+
reasonDesc: `After ${FileSystem.renameTimeoutOpts.timeout}ms the source path did not exist: ${scrubNames(oldPath)}`,
387+
})
388+
} finally {
389+
clock.uninstall()
390+
}
391+
})
392+
393+
it('source file does not exist at first, but eventually appears', async () => {
394+
const oldPath = createTestPath('oldFile.txt')
395+
const newPath = createTestPath('newFile.txt')
396+
397+
const result = fs.rename(oldPath, newPath)
398+
// this file is created after the first "exists" check fails, the following check should pass
399+
void testutil.toFile('hello world', oldPath)
400+
await result
401+
402+
testutil.assertTelemetry('ide_fileSystem', {
403+
action: 'rename',
404+
result: 'Succeeded',
405+
reason: 'RenameRaceCondition',
406+
})
407+
})
366408
})
367409

368410
describe('getUserHomeDir()', function () {

packages/core/src/test/shared/vscode/runCommand.test.ts

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,10 @@ describe('runCommand', function () {
110110

111111
const pat = (() => {
112112
switch (os.platform()) {
113-
case 'linux':
114-
// vscode error not raised on linux? 💩
115-
return /EISDIR: illegal operation on a directory/
116113
case 'win32':
117114
return /EPERM: operation not permitted/
118115
default:
119-
return /EEXIST: file already exists/
116+
return /EISDIR: illegal operation on a directory/
120117
}
121118
})()
122119
await runAndWaitForMessage(pat, async () => {

0 commit comments

Comments
 (0)