Commit fee4a43
fix(crash): fsync heartbeat file writes (#5942)
## Problem

In our telemetry data we saw the following scenario:

- A session (`7ebb2966-7e94-4177-80d9-c5a485511c13`) was sending heartbeats as normal, with the latest one at `Nov 4, 2024 @ 03:35:45` (1730691345628). A heartbeat is simply a file containing a timestamp value that is constantly updated.
- About 2 minutes later, at `Nov 4, 2024 @ 03:37:32`, this session was reported as crashed, but the timestamp read from its heartbeat file was `Nov 4, 2024 @ 03:05:45` (1730689545627).
- This does not make sense: the latest heartbeat was at `3:35`, yet the crash check saw `3:05`. Based on telemetry, at least 2 more heartbeats happened after `3:05`, but somehow they were not visible when the crash check ran.

We already handle this edge case as follows:

- After the heartbeat file is written, we immediately read it back to verify it contains the content we just wrote.
- If a heartbeat write fails for any reason, we terminate all crash monitoring for that session and clean it up, so there is no chance of it being falsely reported as a crash.
- Every error emits a telemetry event, and we aggregate these in a graph to surface significant ones.

## Solution

Even with the handling above, we were still seeing an unexpectedly high number of sessions reported as crashed despite heartbeats being sent correctly. Our best guess is that even though a new heartbeat file write succeeds, it does not truly propagate to all readers: some operating systems cache a write and only flush it to disk later. `fsync` is the standard way to "finalize" such a change to disk.

The fix is to call `fsync` after writing the heartbeat file. We will then monitor our telemetry dashboard to see whether these false crash reports drop.

### Additional

- On heartbeat file deletion we now clear the file's content first, then delete it. This guards against something reading stale text from the file even after it was deleted, e.g. through a file handle opened before the delete.
- Even if an empty heartbeat file is left on disk, [we handle that case gracefully](https://github.com/aws/aws-toolkit-vscode/blob/55e0b83aa13a09b49af5fe4db5b0d8879fd6f1dd/packages/core/src/shared/crashMonitoring.ts#L590).

---

License: I confirm that my contribution is made under the terms of the Apache 2.0 license.

Signed-off-by: nkomonen-amazon <[email protected]>
1 parent 5450d47 commit fee4a43

File tree: 2 files changed, +93 -21 lines

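Before the diff, here is a minimal standalone sketch of the two patterns this change applies: write the heartbeat file, `fsync` it, then read it back to verify the write is visible, and truncate a heartbeat file before deleting it so stale readers see no data. It is illustrative only and assumes Node's `fs/promises` API; the file path, payload shape, and function names are hypothetical, not the toolkit's actual helpers.

```ts
import * as nodeFs from 'fs/promises'
import type { FileHandle } from 'fs/promises'

// Hypothetical heartbeat payload; the real extension state stores more fields.
interface Heartbeat {
    sessionId: string
    lastHeartbeat: number
}

/** Write a heartbeat, fsync it, and read it back to confirm the new content is visible. */
async function writeHeartbeat(filePath: string, heartbeat: Heartbeat): Promise<void> {
    let fileHandle: FileHandle | undefined
    try {
        // 'w' truncates any existing content before the new payload is written.
        fileHandle = await nodeFs.open(filePath, 'w')
        await fileHandle.writeFile(JSON.stringify(heartbeat, undefined, 4))
        // Flush the write to disk so it propagates to other readers.
        await fileHandle.sync()
    } finally {
        await fileHandle?.close()
    }

    // Sanity check: the timestamp we just wrote must be immediately readable.
    const onDisk = JSON.parse(await nodeFs.readFile(filePath, 'utf-8')) as Heartbeat
    if (onDisk.lastHeartbeat !== heartbeat.lastHeartbeat) {
        throw new Error('Heartbeat write validation failed')
    }
}

/** Empty the heartbeat file before removing it, so a pre-existing handle reads no stale data. */
async function deleteHeartbeat(filePath: string): Promise<void> {
    let fileHandle: FileHandle | undefined
    try {
        fileHandle = await nodeFs.open(filePath, 'w') // truncates the file to zero bytes
        await fileHandle.sync()
    } finally {
        await fileHandle?.close()
    }
    await nodeFs.rm(filePath, { force: true })
}
```

The read-back check is what surfaced the stale-read behavior in the first place; `fsync` is the fix, and the check stays in place so telemetry keeps verifying it.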

packages/core/src/shared/crashMonitoring.ts

Lines changed: 41 additions & 21 deletions
@@ -179,7 +179,7 @@ class Heartbeat {
 
     public async start() {
         // Send an initial heartbeat immediately
-        await withFailCtx('initialSendHeartbeat', () => this.state.sendHeartbeat())
+        await withFailCtx('sendHeartbeatInitial', () => this.state.sendHeartbeat())
 
         // Send a heartbeat every interval
         this.intervalRef = globals.clock.setInterval(async () => {
@@ -200,6 +200,10 @@ class Heartbeat {
                 this._onFailure.fire()
             }
         }, this.heartbeatInterval)
+
+        // We will know the first heartbeat, and can infer the next ones starting from this timestamp.
+        // In case of heartbeat failure we have a separate failure metric.
+        telemetry.ide_heartbeat.emit({ timestamp: globals.clock.Date.now(), id: className, result: 'Succeeded' })
     }
 
     /** Stops everything, signifying a graceful shutdown */
@@ -455,38 +459,40 @@ export class FileSystemState {
             })
         }
 
-        await withFailCtx('init', () => fs.mkdir(this.stateDirPath))
+        await withFailCtx('init', () => nodeFs.mkdir(this.stateDirPath, { recursive: true }))
     }
 
     // ------------------ Heartbeat methods ------------------
     public async sendHeartbeat() {
         const extId = this.extId
+        const filePath = this.makeStateFilePath(extId)
+        const now = this.deps.now()
+
+        let fileHandle: nodeFs.FileHandle | undefined
         try {
-            const now = this.deps.now()
             const func = async () => {
-                const filePath = this.makeStateFilePath(extId)
-
-                // We were seeing weird behavior where we possibly read an old file, even though we overwrote it.
-                // So this is a sanity check.
-                await fs.delete(filePath, { force: true })
-
-                await fs.writeFile(filePath, JSON.stringify({ ...this.ext, lastHeartbeat: now }, undefined, 4))
-
-                // Sanity check to verify the write is accessible immediately after
-                const heartbeatData = JSON.parse(await fs.readFileText(filePath)) as ExtInstanceHeartbeat
+                fileHandle = await nodeFs.open(filePath, 'w')
+                await fileHandle.writeFile(JSON.stringify({ ...this.ext, lastHeartbeat: now }, undefined, 4))
+                // Noticing that some file reads are not immediately available after write. `fsync` is known to address this.
+                await fileHandle.sync()
+                await fileHandle.close()
+                fileHandle = undefined
+
+                // Sanity check to verify the latest write is accessible immediately
+                const heartbeatData = JSON.parse(await nodeFs.readFile(filePath, 'utf-8')) as ExtInstanceHeartbeat
                 if (heartbeatData.lastHeartbeat !== now) {
                     throw new CrashMonitoringError('Heartbeat write validation failed', { code: className })
                 }
+
+                this.deps.devLogger?.debug(`crashMonitoring: HEARTBEAT sent for ${truncateUuid(this.ext.sessionId)}`)
             }
             const funcWithCtx = () => withFailCtx('sendHeartbeatState', func)
             const funcWithRetries = withRetries(funcWithCtx, { maxRetries: 6, delay: 100, backoff: 2 })
-            const funcWithTelemetryRun = await telemetry.ide_heartbeat.run((span) => {
-                span.record({ id: className, timestamp: now })
-                return funcWithRetries
-            })
 
-            return funcWithTelemetryRun
+            return funcWithRetries
         } catch (e) {
+            await fileHandle?.close()
+
             // delete this ext from the state to avoid an incorrectly reported crash since we could not send a new heartbeat
             await this.deleteHeartbeatFile(extId, 'sendHeartbeatFailureCleanup')
             throw e
@@ -518,7 +524,21 @@ export class FileSystemState {
 
     private async deleteHeartbeatFile(ext: ExtInstanceId | ExtInstance, ctx: string) {
         // IMPORTANT: Must use NodeFs here since this is used during shutdown
-        const func = () => nodeFs.rm(this.makeStateFilePath(ext), { force: true })
+        const func = async () => {
+            const filePath = this.makeStateFilePath(ext)
+
+            // Even when deleting a file, if there is an open file handle it may still exist. This empties the
+            // contents, so that any following reads will get no data.
+            let fileHandle: nodeFs.FileHandle | undefined
+            try {
+                fileHandle = await nodeFs.open(filePath, 'w')
+                await fileHandle.sync()
+            } finally {
+                await fileHandle?.close()
+            }
+
+            await nodeFs.rm(filePath, { force: true })
+        }
         const funcWithCtx = () => withFailCtx(ctx, func)
         const funcWithRetries = withRetries(funcWithCtx, { maxRetries: 6, delay: 100, backoff: 2 })
         await funcWithRetries
@@ -553,7 +573,7 @@ export class FileSystemState {
     public async clearState(): Promise<void> {
         this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Started')
         await withFailCtx('clearState', async () => {
-            await fs.delete(this.stateDirPath, { force: true, recursive: true })
+            await nodeFs.rm(this.stateDirPath, { force: true, recursive: true })
             this.deps.devLogger?.debug('crashMonitoring: CLEAR_STATE: Succeeded')
         })
     }
@@ -576,7 +596,7 @@ export class FileSystemState {
         // we will assume that other instance handled its termination appropriately.
        // NOTE: On Windows we were failing on EBUSY, so we retry on failure.
         const loadExtFromDisk = async () => {
-            const text = await fs.readFileText(this.makeStateFilePath(extId))
+            const text = await nodeFs.readFile(this.makeStateFilePath(extId), 'utf-8')
 
             if (!text) {
                 return undefined
Lines changed: 52 additions & 0 deletions (new file)
@@ -0,0 +1,52 @@
+/*!
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+import nodeFs from 'fs/promises'
+import { TestFolder } from '../../../testUtil'
+import assert from 'assert'
+import { fs } from '../../../../shared'
+
+describe('Node FS', () => {
+    let testFolder: TestFolder
+
+    beforeEach(async function () {
+        testFolder = await TestFolder.create()
+    })
+
+    describe('open()', () => {
+        it('"w" flag clears file content', async () => {
+            const filePath = testFolder.pathFrom('file.txt')
+
+            // Make initial file with text
+            await nodeFs.writeFile(filePath, 'test')
+            assert.strictEqual(await fs.readFileText(filePath), 'test')
+
+            // Open file with "w"
+            const fileHandle = await nodeFs.open(filePath, 'w')
+            await fileHandle.close()
+
+            // file content was cleared
+            assert.strictEqual(await fs.readFileText(filePath), '')
+        })
+    })
+
+    describe('sync()', () => {
+        // we cannot accurately test if sync() works, so just assert nothing breaks when using it
+        it('runs without error', async () => {
+            const filePath = testFolder.pathFrom('file.txt')
+
+            // Make initial file with text
+            await nodeFs.writeFile(filePath, 'test')
+            assert.strictEqual(await fs.readFileText(filePath), 'test')
+
+            const fileHandle = await nodeFs.open(filePath, 'w')
+            await fileHandle.writeFile('updatedText')
+            await fileHandle.sync() // method under test
+            await fileHandle.close()
+
+            // file content was updated
+            assert.strictEqual(await fs.readFileText(filePath), 'updatedText')
+        })
+    })
+})
