From 189c634272a726b6fa7fc9f9aa0af4ea1973e581 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Fri, 13 Mar 2026 12:52:25 -0700 Subject: [PATCH 01/36] Fix CI flakiness: 7 root causes addressing 48% failure rate Root causes fixed: 1. MSB4216 task host failures - DOTNET_HOST_PATH not set in Helix 2. dotnet-watch test hangs - AwaitableProcess timeout cap, disposal race, semaphore deadlock 3. GZipCompress file lock in BlazorWasm parallel compression 4. Missing runtimeconfig.json in test tool NuGet packages 5. CompilationHandler IOException during hot reload file updates 6. BrowserTests.LaunchesBrowserOnStart timing race - sync to async wait 7. NuGet source removal noise in Helix scripts --- build/RunTestsOnHelix.cmd | 26 ++++++++----- build/RunTestsOnHelix.sh | 30 +++++++++------ src/BlazorWasmSdk/Tasks/GZipCompress.cs | 37 +++++++++++++----- .../Watch/Aspire/AspireServiceFactory.cs | 15 +++++++- .../Watch/HotReload/CompilationHandler.cs | 23 ++++++++++- .../AwaitableProcess.cs | 38 +++++++++++++++++-- .../WatchableApp.cs | 14 ++++--- .../DefaultRequestDispatcherTest.cs | 18 +++++++-- test/TestAssets/Directory.Build.targets | 17 +++++++++ .../Browser/BrowserTests.cs | 5 ++- 10 files changed, 175 insertions(+), 48 deletions(-) diff --git a/build/RunTestsOnHelix.cmd b/build/RunTestsOnHelix.cmd index 569607f04333..9e29fa829a20 100644 --- a/build/RunTestsOnHelix.cmd +++ b/build/RunTestsOnHelix.cmd @@ -9,6 +9,10 @@ set DOTNET_ROOT=%HELIX_CORRELATION_PAYLOAD%\d set PATH=%DOTNET_ROOT%;%PATH% set TestFullMSBuild=%1 +REM Set DOTNET_HOST_PATH so MSBuild task hosts can locate the dotnet executable. +REM Without this, tasks from NuGet packages that use TaskHostFactory fail with MSB4216. 
+set DOTNET_HOST_PATH=%DOTNET_ROOT%\dotnet.exe + REM Ensure Visual Studio instances allow preview SDKs PowerShell -ExecutionPolicy ByPass -NoProfile -File "%HELIX_CORRELATION_PAYLOAD%\t\eng\enable-preview-sdks.ps1" @@ -35,14 +39,16 @@ dotnet new --debug:ephemeral-hive dotnet nuget list source --configfile %TestExecutionDirectory%\nuget.config if exist %TestExecutionDirectory%\Testpackages dotnet nuget add source %TestExecutionDirectory%\Testpackages --name testpackages --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet6-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet6-internal-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet7-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet7-internal-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source richnav --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source vs-impl --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet-libraries-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet-tools-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet-libraries --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet-eng --configfile %TestExecutionDirectory%\nuget.config +REM Remove feeds not needed for tests. Errors from non-existent sources +REM (e.g. internal-transport feeds only present in internal builds) are ignored. 
+dotnet nuget remove source dotnet6-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet6-internal-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet7-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet7-internal-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source richnav --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source vs-impl --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet-libraries-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet-tools-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet-libraries --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet-eng --configfile %TestExecutionDirectory%\nuget.config 2>nul dotnet nuget list source --configfile %TestExecutionDirectory%\nuget.config diff --git a/build/RunTestsOnHelix.sh b/build/RunTestsOnHelix.sh index 887748f918b4..973eab473c0e 100644 --- a/build/RunTestsOnHelix.sh +++ b/build/RunTestsOnHelix.sh @@ -9,6 +9,12 @@ export MicrosoftNETBuildExtensionsTargets=$HELIX_CORRELATION_PAYLOAD/ex/msbuildE export DOTNET_ROOT=$HELIX_CORRELATION_PAYLOAD/d export PATH=$DOTNET_ROOT:$PATH +# Set DOTNET_HOST_PATH so MSBuild task hosts can locate the dotnet executable. +# Without this, tasks from NuGet packages that use TaskHostFactory (e.g. ComputeWasmBuildAssets +# from WebAssembly SDK, ComputeManagedAssemblies from ILLink) fail with MSB4216 on macOS +# because the task host process cannot find the dotnet host to launch. 
+export DOTNET_HOST_PATH=$DOTNET_ROOT/dotnet + export TestExecutionDirectory=$(realpath "$(mktemp -d "${TMPDIR:-/tmp}"/dotnetSdkTests.XXXXXXXX)") export DOTNET_CLI_HOME=$TestExecutionDirectory/.dotnet cp -a $HELIX_CORRELATION_PAYLOAD/t/TestExecutionDirectoryFiles/. $TestExecutionDirectory/ @@ -22,15 +28,17 @@ dotnet new --debug:ephemeral-hive dotnet nuget list source --configfile $TestExecutionDirectory/NuGet.config dotnet nuget add source $TestExecutionDirectory/Testpackages --configfile $TestExecutionDirectory/NuGet.config -#Remove feeds not needed for tests -dotnet nuget remove source dotnet6-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet6-internal-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet7-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet7-internal-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source richnav --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source vs-impl --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet-libraries-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet-tools-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet-libraries --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet-eng --configfile $TestExecutionDirectory/NuGet.config +# Remove feeds not needed for tests. Use || true to avoid errors when a source +# doesn't exist (e.g. internal-transport feeds are only present in internal builds). 
+dotnet nuget remove source dotnet6-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet6-internal-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet7-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet7-internal-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source richnav --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source vs-impl --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet-libraries-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet-tools-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet-libraries --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet-eng --configfile $TestExecutionDirectory/NuGet.config || true dotnet nuget list source --configfile $TestExecutionDirectory/NuGet.config + diff --git a/src/BlazorWasmSdk/Tasks/GZipCompress.cs b/src/BlazorWasmSdk/Tasks/GZipCompress.cs index 96481d04a91b..b5edfd894c14 100644 --- a/src/BlazorWasmSdk/Tasks/GZipCompress.cs +++ b/src/BlazorWasmSdk/Tasks/GZipCompress.cs @@ -20,6 +20,10 @@ public class GZipCompress : Task [Required] public string OutputDirectory { get; set; } + // Retry count for transient file I/O errors (e.g., antivirus locks on CI machines). 
+ private const int MaxRetries = 3; + private const int RetryDelayMs = 200; + public override bool Execute() { CompressedFiles = new ITaskItem[FilesToCompress.Length]; @@ -56,18 +60,31 @@ public override bool Execute() Log.LogMessage(MessageImportance.Low, "Compressing '{0}' because file is newer than '{1}'.", inputFullPath, outputRelativePath); } - try + // Retry on IOException to handle transient file locks from antivirus, file + // indexing, or parallel MSBuild nodes on CI machines (see dotnet/sdk#53424). + for (int attempt = 1; attempt <= MaxRetries; attempt++) { - using var sourceStream = File.OpenRead(file.ItemSpec); - using var fileStream = File.Create(outputRelativePath); - using var stream = new GZipStream(fileStream, CompressionLevel.Optimal); + try + { + using var sourceStream = File.OpenRead(file.ItemSpec); + using var fileStream = File.Create(outputRelativePath); + using var stream = new GZipStream(fileStream, CompressionLevel.Optimal); - sourceStream.CopyTo(stream); - } - catch (Exception e) - { - Log.LogErrorFromException(e); - return; + sourceStream.CopyTo(stream); + return; // Success + } + catch (IOException) when (attempt < MaxRetries) + { + Log.LogMessage(MessageImportance.Low, + "Retrying compression of '{0}' (attempt {1}/{2}) due to transient I/O error.", + file.ItemSpec, attempt, MaxRetries); + Thread.Sleep(RetryDelayMs * attempt); + } + catch (Exception e) + { + Log.LogErrorFromException(e); + return; + } } }); diff --git a/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs b/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs index 10d46c189096..52efde284c2f 100644 --- a/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs +++ b/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs @@ -76,6 +76,16 @@ public async ValueTask DisposeAsync() _isDisposed = true; // wait for all in-flight process initialization to complete: + // If no session initialization is in-flight (_pendingSessionInitializationCount == 0), + // the semaphore will 
never be released by StartProjectAsync's finally block. + // Release it here to prevent a deadlock. Protect against the race where + // StartProjectAsync's finally block releases concurrently. + if (Volatile.Read(ref _pendingSessionInitializationCount) == 0) + { + try { _postDisposalSessionInitializationCompleted.Release(); } + catch (SemaphoreFullException) { } + } + await _postDisposalSessionInitializationCompleted.WaitAsync(CancellationToken.None); // terminate all active sessions: @@ -174,7 +184,10 @@ public async ValueTask StartProjectAsync(string dcpId, string sessionId, Project { if (Interlocked.Decrement(ref _pendingSessionInitializationCount) == 0 && _isDisposed) { - _postDisposalSessionInitializationCompleted.Release(); + // Guard against double-release: DisposeAsync may have already released + // the semaphore if it observed count==0 before we decremented. + try { _postDisposalSessionInitializationCompleted.Release(); } + catch (SemaphoreFullException) { } } } diff --git a/src/Dotnet.Watch/Watch/HotReload/CompilationHandler.cs b/src/Dotnet.Watch/Watch/HotReload/CompilationHandler.cs index 85e120eafafc..9ae2a4bdda47 100644 --- a/src/Dotnet.Watch/Watch/HotReload/CompilationHandler.cs +++ b/src/Dotnet.Watch/Watch/HotReload/CompilationHandler.cs @@ -1010,8 +1010,27 @@ public async Task UpdateProjectGraphAsync(ProjectGraph projectGraph, Cancellatio public async Task UpdateFileContentAsync(IReadOnlyList changedFiles, CancellationToken cancellationToken) { - var solution = await Workspace.UpdateFileContentAsync(changedFiles.Select(static f => (f.Item.FilePath, f.Kind.Convert())), cancellationToken); - await SolutionUpdatedAsync(solution, "document update", cancellationToken); + // Retry on IOException: the file may be transiently locked by a process that is being + // relaunched (e.g. a crashed service whose process hasn't fully exited yet, or MSBuild + // design-time build reading the same source file). 
The lock is short-lived, so a brief + // backoff is sufficient. + const int maxRetries = 5; + const int baseDelayMs = 100; + + for (var attempt = 0; ; attempt++) + { + try + { + var solution = await Workspace.UpdateFileContentAsync(changedFiles.Select(static f => (f.Item.FilePath, f.Kind.Convert())), cancellationToken); + await SolutionUpdatedAsync(solution, "document update", cancellationToken); + return; + } + catch (IOException) when (attempt < maxRetries) + { + Logger.LogWarning("File is locked (attempt {Attempt}/{MaxRetries}), retrying...", attempt + 1, maxRetries); + await Task.Delay(baseDelayMs * (1 << attempt), cancellationToken); + } + } } private Task SolutionUpdatedAsync(Solution newSolution, string operationDisplayName, CancellationToken cancellationToken) diff --git a/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs b/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs index 9f38f3af4545..96137a1bce6d 100644 --- a/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs +++ b/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs @@ -10,9 +10,19 @@ namespace Microsoft.DotNet.Watch.UnitTests { internal sealed class AwaitableProcess : IAsyncDisposable { - // cancel just before we hit timeout used on CI (XUnitWorkItemTimeout value in sdk\test\UnitTests.proj) + // Maximum time to wait for a single line of output from the process. + // On CI (Helix), cap at 5 minutes. The HELIX_WORK_ITEM_TIMEOUT is the total budget + // for ALL tests in the work item (~2h), which is far too long for a single + // wait-for-output operation. If a process produces no output for 5 minutes, + // it's deadlocked (e.g., dotnet-watch shutdown race in AspireServiceFactory). + // Capping here turns a 2-hour partition-blocking hang into a 5-minute clean failure. 
+ private static readonly TimeSpan s_maxPerOperationTimeout = TimeSpan.FromMinutes(5); + private static readonly TimeSpan s_timeout = Environment.GetEnvironmentVariable("HELIX_WORK_ITEM_TIMEOUT") is { } value - ? TimeSpan.Parse(value).Subtract(TimeSpan.FromSeconds(10)) : TimeSpan.FromMinutes(10); + ? Min(TimeSpan.Parse(value).Subtract(TimeSpan.FromSeconds(10)), s_maxPerOperationTimeout) + : TimeSpan.FromMinutes(10); + + private static TimeSpan Min(TimeSpan a, TimeSpan b) => a < b ? a : b; private readonly List _lines = []; @@ -226,6 +236,17 @@ public async ValueTask DisposeAsync() { } + // Close stdin before killing. This unblocks PhysicalConsole.ListenToStandardInputAsync() + // which reads from stdin with CancellationToken.None and no timeout. + // Without this, the stdin reader can keep the process alive after Kill() on some platforms. + try + { + Process.StandardInput.Close(); + } + catch + { + } + try { Process.Kill(entireProcessTree: true); @@ -234,8 +255,17 @@ public async ValueTask DisposeAsync() { } - // ensure process has exited - await _processExitAwaiter; + // Wait for process exit with a timeout to prevent hanging the test if Kill() fails. + // The WaitForProcessExitAsync loop checks HasExited every 1 second, so 30s is generous. 
+ using var exitTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(30)); + try + { + await _processExitAwaiter.WaitAsync(exitTimeout.Token); + } + catch (OperationCanceledException) + { + Logger.Log($"Process {Id} did not exit within 30 seconds after Kill()"); + } Process.Dispose(); diff --git a/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs b/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs index 2dacad1cb33a..efb62a725d14 100644 --- a/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs +++ b/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs @@ -208,12 +208,16 @@ public ProcessStartInfo GetProcessStartInfo(string workingDirectory, string test info.Environment.Add("DCP_DIAGNOSTICS_LOG_FOLDER", Path.Combine(testOutputPath, "dcp")); info.Environment.Add("DCP_DIAGNOSTICS_LOG_LEVEL", "debug"); - // suppress all timeouts: - info.Environment.Add("DCP_IDE_REQUEST_TIMEOUT_SECONDS", "100000"); - info.Environment.Add("DCP_IDE_NOTIFICATION_TIMEOUT_SECONDS", "100000"); - info.Environment.Add("DCP_IDE_NOTIFICATION_KEEPALIVE_SECONDS", "100000"); + // Use generous but bounded timeouts for DCP operations in CI. + // Previous values of 100,000 seconds (~27 hours) effectively disabled timeouts, + // causing tests to hang for the full Helix work item duration (~2 hours) when + // a DCP operation deadlocked. 300 seconds (5 minutes) per operation is generous + // for slow CI machines while ensuring natural failure recovery. 
+ info.Environment.Add("DCP_IDE_REQUEST_TIMEOUT_SECONDS", "300"); + info.Environment.Add("DCP_IDE_NOTIFICATION_TIMEOUT_SECONDS", "300"); + info.Environment.Add("DCP_IDE_NOTIFICATION_KEEPALIVE_SECONDS", "300"); info.Environment.Add("ASPIRE_ALLOW_UNSECURED_TRANSPORT", "1"); - info.Environment.Add("ASPIRE_WATCH_PIPE_CONNECTION_TIMEOUT_SECONDS", "100000"); + info.Environment.Add("ASPIRE_WATCH_PIPE_CONNECTION_TIMEOUT_SECONDS", "300"); // override defaults: foreach (var (name, value) in EnvironmentVariables) diff --git a/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs b/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs index 8f3289b9110b..ef2a49f63577 100644 --- a/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs +++ b/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs @@ -358,7 +358,7 @@ public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout return connectionTask; } - readySource.SetResult(true); + readySource.TrySetResult(true); return new TaskCompletionSource().Task; }); @@ -382,11 +382,18 @@ public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout } }; var keepAlive = TimeSpan.FromSeconds(1); - var dispatcherTask = Task.Run(() => + + // Use Task.Factory.StartNew with LongRunning to run the dispatcher on a dedicated + // OS thread instead of a thread pool thread. The dispatcher's Run() method uses + // blocking Task.WaitAny() which permanently blocks its thread. On Helix CI agents + // running many tests in parallel, blocking a thread pool thread contributes to pool + // starvation, which prevents Task.Delay timer callbacks from firing, causing the + // keep-alive timeout to never complete and the test to hang indefinitely. 
+ var dispatcherTask = Task.Factory.StartNew(() => { var dispatcher = new DefaultRequestDispatcher(connectionHost.Object, compilerHost, CancellationToken.None, eventBus, keepAlive); dispatcher.Run(); - }); + }, CancellationToken.None, TaskCreationOptions.LongRunning, TaskScheduler.Default); // Wait for all connections to be created. await readySource.Task; @@ -402,7 +409,10 @@ public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout // Act // Now dispatcher should be in an idle state with no active connections. - await dispatcherTask; + // Use WaitAsync as a safety net: if the keep-alive timeout still can't fire + // (e.g. extreme thread pool starvation), fail the test after 60s instead of + // hanging for 60+ minutes and blocking the entire CI job. + await dispatcherTask.WaitAsync(TimeSpan.FromSeconds(60)); // Assert Assert.False(eventBus.HasDetectedBadConnection); diff --git a/test/TestAssets/Directory.Build.targets b/test/TestAssets/Directory.Build.targets index cecd12d3d0c8..2c8eb1f15b8f 100644 --- a/test/TestAssets/Directory.Build.targets +++ b/test/TestAssets/Directory.Build.targets @@ -1,4 +1,21 @@ + + + + + + + diff --git a/test/dotnet-watch.Tests/Browser/BrowserTests.cs b/test/dotnet-watch.Tests/Browser/BrowserTests.cs index d2b82f8e2c87..a258dfadc1d7 100644 --- a/test/dotnet-watch.Tests/Browser/BrowserTests.cs +++ b/test/dotnet-watch.Tests/Browser/BrowserTests.cs @@ -22,7 +22,10 @@ public async Task LaunchesBrowserOnStart() Assert.Contains(App.Process.Output, line => line.Contains("Hosting environment: Development")); // Verify we launched the browser. - App.AssertOutputContains(MessageDescriptor.LaunchingBrowser.GetMessage("https://localhost:5001")); + // Use WaitUntilOutputContains (async) instead of AssertOutputContains (sync check) + // because the browser launch message is emitted asynchronously and may not have been + // captured yet when the assertion runs. 
+ await App.WaitUntilOutputContains(MessageDescriptor.LaunchingBrowser.GetMessage("https://localhost:5001")); } [PlatformSpecificFact(TestPlatforms.Windows | TestPlatforms.Linux)] // https://github.com/dotnet/sdk/issues/53061 From 3ecb050a0c66e5cf2303e9e667d45598fdb978e8 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Tue, 17 Mar 2026 10:35:44 -0700 Subject: [PATCH 02/36] Revert dotnet-watch changes in favor of tmat's fix in PR #53271 The dotnet-watch Aspire race condition fix from this PR has been superseded by a proper fix in dotnet/sdk#53271 (merged to main). Reverting these files so the merge from main brings in the better fix. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Watch/Aspire/AspireServiceFactory.cs | 15 +------- .../Watch/HotReload/CompilationHandler.cs | 23 +---------- .../AwaitableProcess.cs | 38 ++----------------- .../WatchableApp.cs | 14 +++---- .../Browser/BrowserTests.cs | 5 +-- 5 files changed, 13 insertions(+), 82 deletions(-) diff --git a/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs b/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs index 52efde284c2f..10d46c189096 100644 --- a/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs +++ b/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs @@ -76,16 +76,6 @@ public async ValueTask DisposeAsync() _isDisposed = true; // wait for all in-flight process initialization to complete: - // If no session initialization is in-flight (_pendingSessionInitializationCount == 0), - // the semaphore will never be released by StartProjectAsync's finally block. - // Release it here to prevent a deadlock. Protect against the race where - // StartProjectAsync's finally block releases concurrently. 
- if (Volatile.Read(ref _pendingSessionInitializationCount) == 0) - { - try { _postDisposalSessionInitializationCompleted.Release(); } - catch (SemaphoreFullException) { } - } - await _postDisposalSessionInitializationCompleted.WaitAsync(CancellationToken.None); // terminate all active sessions: @@ -184,10 +174,7 @@ public async ValueTask StartProjectAsync(string dcpId, string sessionId, Project { if (Interlocked.Decrement(ref _pendingSessionInitializationCount) == 0 && _isDisposed) { - // Guard against double-release: DisposeAsync may have already released - // the semaphore if it observed count==0 before we decremented. - try { _postDisposalSessionInitializationCompleted.Release(); } - catch (SemaphoreFullException) { } + _postDisposalSessionInitializationCompleted.Release(); } } diff --git a/src/Dotnet.Watch/Watch/HotReload/CompilationHandler.cs b/src/Dotnet.Watch/Watch/HotReload/CompilationHandler.cs index 9ae2a4bdda47..85e120eafafc 100644 --- a/src/Dotnet.Watch/Watch/HotReload/CompilationHandler.cs +++ b/src/Dotnet.Watch/Watch/HotReload/CompilationHandler.cs @@ -1010,27 +1010,8 @@ public async Task UpdateProjectGraphAsync(ProjectGraph projectGraph, Cancellatio public async Task UpdateFileContentAsync(IReadOnlyList changedFiles, CancellationToken cancellationToken) { - // Retry on IOException: the file may be transiently locked by a process that is being - // relaunched (e.g. a crashed service whose process hasn't fully exited yet, or MSBuild - // design-time build reading the same source file). The lock is short-lived, so a brief - // backoff is sufficient. 
- const int maxRetries = 5; - const int baseDelayMs = 100; - - for (var attempt = 0; ; attempt++) - { - try - { - var solution = await Workspace.UpdateFileContentAsync(changedFiles.Select(static f => (f.Item.FilePath, f.Kind.Convert())), cancellationToken); - await SolutionUpdatedAsync(solution, "document update", cancellationToken); - return; - } - catch (IOException) when (attempt < maxRetries) - { - Logger.LogWarning("File is locked (attempt {Attempt}/{MaxRetries}), retrying...", attempt + 1, maxRetries); - await Task.Delay(baseDelayMs * (1 << attempt), cancellationToken); - } - } + var solution = await Workspace.UpdateFileContentAsync(changedFiles.Select(static f => (f.Item.FilePath, f.Kind.Convert())), cancellationToken); + await SolutionUpdatedAsync(solution, "document update", cancellationToken); } private Task SolutionUpdatedAsync(Solution newSolution, string operationDisplayName, CancellationToken cancellationToken) diff --git a/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs b/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs index 96137a1bce6d..9f38f3af4545 100644 --- a/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs +++ b/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs @@ -10,19 +10,9 @@ namespace Microsoft.DotNet.Watch.UnitTests { internal sealed class AwaitableProcess : IAsyncDisposable { - // Maximum time to wait for a single line of output from the process. - // On CI (Helix), cap at 5 minutes. The HELIX_WORK_ITEM_TIMEOUT is the total budget - // for ALL tests in the work item (~2h), which is far too long for a single - // wait-for-output operation. If a process produces no output for 5 minutes, - // it's deadlocked (e.g., dotnet-watch shutdown race in AspireServiceFactory). - // Capping here turns a 2-hour partition-blocking hang into a 5-minute clean failure. 
- private static readonly TimeSpan s_maxPerOperationTimeout = TimeSpan.FromMinutes(5); - + // cancel just before we hit timeout used on CI (XUnitWorkItemTimeout value in sdk\test\UnitTests.proj) private static readonly TimeSpan s_timeout = Environment.GetEnvironmentVariable("HELIX_WORK_ITEM_TIMEOUT") is { } value - ? Min(TimeSpan.Parse(value).Subtract(TimeSpan.FromSeconds(10)), s_maxPerOperationTimeout) - : TimeSpan.FromMinutes(10); - - private static TimeSpan Min(TimeSpan a, TimeSpan b) => a < b ? a : b; + ? TimeSpan.Parse(value).Subtract(TimeSpan.FromSeconds(10)) : TimeSpan.FromMinutes(10); private readonly List _lines = []; @@ -236,17 +226,6 @@ public async ValueTask DisposeAsync() { } - // Close stdin before killing. This unblocks PhysicalConsole.ListenToStandardInputAsync() - // which reads from stdin with CancellationToken.None and no timeout. - // Without this, the stdin reader can keep the process alive after Kill() on some platforms. - try - { - Process.StandardInput.Close(); - } - catch - { - } - try { Process.Kill(entireProcessTree: true); @@ -255,17 +234,8 @@ public async ValueTask DisposeAsync() { } - // Wait for process exit with a timeout to prevent hanging the test if Kill() fails. - // The WaitForProcessExitAsync loop checks HasExited every 1 second, so 30s is generous. 
- using var exitTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(30)); - try - { - await _processExitAwaiter.WaitAsync(exitTimeout.Token); - } - catch (OperationCanceledException) - { - Logger.Log($"Process {Id} did not exit within 30 seconds after Kill()"); - } + // ensure process has exited + await _processExitAwaiter; Process.Dispose(); diff --git a/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs b/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs index efb62a725d14..2dacad1cb33a 100644 --- a/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs +++ b/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs @@ -208,16 +208,12 @@ public ProcessStartInfo GetProcessStartInfo(string workingDirectory, string test info.Environment.Add("DCP_DIAGNOSTICS_LOG_FOLDER", Path.Combine(testOutputPath, "dcp")); info.Environment.Add("DCP_DIAGNOSTICS_LOG_LEVEL", "debug"); - // Use generous but bounded timeouts for DCP operations in CI. - // Previous values of 100,000 seconds (~27 hours) effectively disabled timeouts, - // causing tests to hang for the full Helix work item duration (~2 hours) when - // a DCP operation deadlocked. 300 seconds (5 minutes) per operation is generous - // for slow CI machines while ensuring natural failure recovery. 
- info.Environment.Add("DCP_IDE_REQUEST_TIMEOUT_SECONDS", "300"); - info.Environment.Add("DCP_IDE_NOTIFICATION_TIMEOUT_SECONDS", "300"); - info.Environment.Add("DCP_IDE_NOTIFICATION_KEEPALIVE_SECONDS", "300"); + // suppress all timeouts: + info.Environment.Add("DCP_IDE_REQUEST_TIMEOUT_SECONDS", "100000"); + info.Environment.Add("DCP_IDE_NOTIFICATION_TIMEOUT_SECONDS", "100000"); + info.Environment.Add("DCP_IDE_NOTIFICATION_KEEPALIVE_SECONDS", "100000"); info.Environment.Add("ASPIRE_ALLOW_UNSECURED_TRANSPORT", "1"); - info.Environment.Add("ASPIRE_WATCH_PIPE_CONNECTION_TIMEOUT_SECONDS", "300"); + info.Environment.Add("ASPIRE_WATCH_PIPE_CONNECTION_TIMEOUT_SECONDS", "100000"); // override defaults: foreach (var (name, value) in EnvironmentVariables) diff --git a/test/dotnet-watch.Tests/Browser/BrowserTests.cs b/test/dotnet-watch.Tests/Browser/BrowserTests.cs index a258dfadc1d7..d2b82f8e2c87 100644 --- a/test/dotnet-watch.Tests/Browser/BrowserTests.cs +++ b/test/dotnet-watch.Tests/Browser/BrowserTests.cs @@ -22,10 +22,7 @@ public async Task LaunchesBrowserOnStart() Assert.Contains(App.Process.Output, line => line.Contains("Hosting environment: Development")); // Verify we launched the browser. - // Use WaitUntilOutputContains (async) instead of AssertOutputContains (sync check) - // because the browser launch message is emitted asynchronously and may not have been - // captured yet when the assertion runs. 
- await App.WaitUntilOutputContains(MessageDescriptor.LaunchingBrowser.GetMessage("https://localhost:5001")); + App.AssertOutputContains(MessageDescriptor.LaunchingBrowser.GetMessage("https://localhost:5001")); } [PlatformSpecificFact(TestPlatforms.Windows | TestPlatforms.Linux)] // https://github.com/dotnet/sdk/issues/53061 From fccc655d25d2575ec842a3a09b1e3c1746a0cd9b Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Tue, 17 Mar 2026 10:36:31 -0700 Subject: [PATCH 03/36] Trigger CI build #1 after merge from main Reverted dotnet-watch changes (superseded by tmat's fix in #53271). Merged from main to pick up latest changes. Starting fresh validation run toward 25 consecutive passes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 5be6d9e2ed879dfd580249191a6e3af83e877ded Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Tue, 17 Mar 2026 12:38:22 -0700 Subject: [PATCH 04/36] Trigger CI build #2 - streak: 1/25 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From cdf5d19d7bdcbd1c080ece78e0a720cfdc91a614 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Tue, 17 Mar 2026 14:41:39 -0700 Subject: [PATCH 05/36] Trigger CI build #3 - streak: 2/25 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 22f201a2830800112c23cf6c6184d7eaebda627f Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Tue, 17 Mar 2026 17:53:39 -0700 Subject: [PATCH 06/36] Trigger CI build #4 - streak: 3/25 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 672d0ecd1d70e4b30c673db7b0a9658418e75cd3 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Tue, 17 Mar 2026 20:31:45 -0700 Subject: [PATCH 07/36] Trigger CI build #5 - streak reset (dotnet-watch flaky test) Build #4 failed due to pre-existing flaky dotnet-watch test: ProjectAndSourceFileChange_AddPackageReference (Assert.Equal 1 vs 0) This is not related to our PR changes. Restarting streak count. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From e22cf34fbc784fcaa3279513e6f3edf07a2c3d8e Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Tue, 17 Mar 2026 22:09:45 -0700 Subject: [PATCH 08/36] Trigger CI build #6 - streak: 1/25 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From a1ebf223e915eb468894316f5463f05946554f00 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Wed, 18 Mar 2026 07:11:45 -0700 Subject: [PATCH 09/36] Trigger CI build #7 - streak: 2/25 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 7893dc509b5762707fad8997a714ec33bafb98be Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Wed, 18 Mar 2026 08:27:02 -0700 Subject: [PATCH 10/36] Fix flaky dotnet-watch ProjectUpdateInProcTests race condition The ProjectAndSourceFileChange_AddPackageReference and ProjectAndSourceFileChange_AddProjectReference tests check managedCodeChangesApplied.CurrentCount immediately after seeing app output. However, ManagedCodeChangesApplied is logged in a fire-and-forget task (CompilationHandler.cs:497) that may not have completed yet, causing the semaphore count to be 0 instead of 1. Fix: Wait for the semaphore with a 30s timeout instead of checking CurrentCount synchronously. This ensures the fire-and-forget task completes before the assertion. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../HotReload/ProjectUpdateInProcTests.cs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/test/dotnet-watch.Tests/HotReload/ProjectUpdateInProcTests.cs b/test/dotnet-watch.Tests/HotReload/ProjectUpdateInProcTests.cs index f39310095b2b..a322afb75ed3 100644 --- a/test/dotnet-watch.Tests/HotReload/ProjectUpdateInProcTests.cs +++ b/test/dotnet-watch.Tests/HotReload/ProjectUpdateInProcTests.cs @@ -116,10 +116,15 @@ public async Task ProjectAndSourceFileChange_AddProjectReference() AssertEx.ContainsSubstring("Resolving 'Dependency, Version=1.0.0.0'", w.Reporter.ProcessOutput); + // Wait for the fire-and-forget task in CompilationHandler.CompleteApplyOperationAsync + // to finish logging ManagedCodeChangesApplied. The app output arrives before this task + // completes because it's not awaited (line 497 of CompilationHandler.cs). + using var waitCts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); + await managedCodeChangesApplied.WaitAsync(waitCts.Token); + Assert.Equal(1, projectChangeTriggeredReEvaluation.CurrentCount); Assert.Equal(1, projectsRebuilt.CurrentCount); Assert.Equal(1, projectDependenciesDeployed.CurrentCount); - Assert.Equal(1, managedCodeChangesApplied.CurrentCount); } [Fact] @@ -174,9 +179,14 @@ public async Task ProjectAndSourceFileChange_AddPackageReference() AssertEx.ContainsSubstring("Resolving 'Newtonsoft.Json, Version=13.0.0.0'", w.Reporter.ProcessOutput); + // Wait for the fire-and-forget task in CompilationHandler.CompleteApplyOperationAsync + // to finish logging ManagedCodeChangesApplied. The app output arrives before this task + // completes because it's not awaited (line 497 of CompilationHandler.cs). 
+ using var waitCts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); + await managedCodeChangesApplied.WaitAsync(waitCts.Token); + Assert.Equal(1, projectChangeTriggeredReEvaluation.CurrentCount); Assert.Equal(0, projectsRebuilt.CurrentCount); Assert.Equal(1, projectDependenciesDeployed.CurrentCount); - Assert.Equal(1, managedCodeChangesApplied.CurrentCount); } } From 17ed46b95133b86e194e2a38e414183f45ba92f1 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Wed, 18 Mar 2026 10:40:14 -0700 Subject: [PATCH 11/36] ci: validation run 9 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 90c33507087790b648c4d0fb0cfbccccb24bdba1 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Wed, 18 Mar 2026 14:00:54 -0700 Subject: [PATCH 12/36] ci: validation run 10 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From c690171bdf870b3416423e934d4826bff3262109 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Wed, 18 Mar 2026 17:13:52 -0700 Subject: [PATCH 13/36] ci: validation run 11 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 5386d8273a8f2cfc308472c1a7502e0d7dc83a32 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Wed, 18 Mar 2026 21:01:55 -0700 Subject: [PATCH 14/36] ci: validation run 12 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 230d5eee20ae9e72c149eb472749b2233a1605f7 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Wed, 18 Mar 2026 22:24:37 -0700 Subject: [PATCH 15/36] ci: validation run 13 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 125c22637f032d25655bfc3cd0e6ae5187595f29 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Thu, 19 Mar 2026 06:33:44 -0700 Subject: [PATCH 16/36] Fix DefaultRequestDispatcherTest hang: use kernel waits instead of async Replace WaitAsync/await with Task.Wait(timeout) which uses kernel-level ManualResetEventSlim instead of thread pool-dependent timer 
callbacks. Under extreme thread pool starvation on Helix CI, WaitAsync's timer continuations can't be scheduled, causing the test to hang for 60+ minutes. Task.Wait uses a kernel wait that works regardless of thread pool state. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DefaultRequestDispatcherTest.cs | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs b/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs index ef2a49f63577..b0c9c469b70c 100644 --- a/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs +++ b/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs @@ -335,7 +335,7 @@ public void Dispatcher_ProcessMultipleConnections_HitsKeepAliveTimeout() /// Ensure server respects keep alive and shuts down after processing simultaneous connections. /// [Fact] - public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout() + public void Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout() { // Arrange var totalCount = 2; @@ -395,11 +395,16 @@ public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout dispatcher.Run(); }, CancellationToken.None, TaskCreationOptions.LongRunning, TaskScheduler.Default); - // Wait for all connections to be created. - await readySource.Task; - - // Wait for all compilations to complete. - await allCompilationsComplete.Task; + // Wait for all connections to be created and compilations to complete. + // Use Task.Wait with timeout to avoid hanging under thread pool starvation. 
+ if (!readySource.Task.Wait(TimeSpan.FromSeconds(60))) + { + throw new Xunit.Sdk.XunitException("Timed out waiting for connections to be created."); + } + if (!allCompilationsComplete.Task.Wait(TimeSpan.FromSeconds(60))) + { + throw new Xunit.Sdk.XunitException("Timed out waiting for compilations to complete."); + } // Now allow all the connections to be disconnected. foreach (var source in list) @@ -409,10 +414,16 @@ public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout // Act // Now dispatcher should be in an idle state with no active connections. - // Use WaitAsync as a safety net: if the keep-alive timeout still can't fire - // (e.g. extreme thread pool starvation), fail the test after 60s instead of - // hanging for 60+ minutes and blocking the entire CI job. - await dispatcherTask.WaitAsync(TimeSpan.FromSeconds(60)); + // Use Task.Wait(timeout) instead of WaitAsync because under extreme thread + // pool starvation on Helix CI, even WaitAsync's timer callback can't schedule + // its continuation, causing the test to hang for 60+ minutes. Task.Wait uses + // a kernel wait (ManualResetEventSlim) that doesn't depend on the thread pool. + if (!dispatcherTask.Wait(TimeSpan.FromSeconds(60))) + { + throw new Xunit.Sdk.XunitException( + "Dispatcher did not shut down within 60 seconds. This likely indicates " + + "thread pool starvation preventing Task.Delay timer callbacks from firing."); + } // Assert Assert.False(eventBus.HasDetectedBadConnection); From ce3d3d1384a9e78f590318b41adb5ab545870195 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Thu, 19 Mar 2026 07:59:48 -0700 Subject: [PATCH 17/36] Fix xUnit1031: use dedicated thread for timeout instead of Task.Wait The xUnit analyzer prohibits blocking Task.Wait in test methods. 
Use a dedicated background thread with Thread.Join(timeout) for a kernel-level wait that doesn't depend on thread pool scheduling, avoiding both the xUnit1031 analyzer error and the thread pool starvation hang. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DefaultRequestDispatcherTest.cs | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs b/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs index b0c9c469b70c..8d222eca8a9a 100644 --- a/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs +++ b/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs @@ -335,7 +335,7 @@ public void Dispatcher_ProcessMultipleConnections_HitsKeepAliveTimeout() /// Ensure server respects keep alive and shuts down after processing simultaneous connections. /// [Fact] - public void Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout() + public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout() { // Arrange var totalCount = 2; @@ -395,16 +395,11 @@ public void Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout() dispatcher.Run(); }, CancellationToken.None, TaskCreationOptions.LongRunning, TaskScheduler.Default); - // Wait for all connections to be created and compilations to complete. - // Use Task.Wait with timeout to avoid hanging under thread pool starvation. - if (!readySource.Task.Wait(TimeSpan.FromSeconds(60))) - { - throw new Xunit.Sdk.XunitException("Timed out waiting for connections to be created."); - } - if (!allCompilationsComplete.Task.Wait(TimeSpan.FromSeconds(60))) - { - throw new Xunit.Sdk.XunitException("Timed out waiting for compilations to complete."); - } + // Wait for all connections to be created. + await readySource.Task; + + // Wait for all compilations to complete. 
+ await allCompilationsComplete.Task; // Now allow all the connections to be disconnected. foreach (var source in list) @@ -414,16 +409,20 @@ public void Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout() // Act // Now dispatcher should be in an idle state with no active connections. - // Use Task.Wait(timeout) instead of WaitAsync because under extreme thread - // pool starvation on Helix CI, even WaitAsync's timer callback can't schedule - // its continuation, causing the test to hang for 60+ minutes. Task.Wait uses - // a kernel wait (ManualResetEventSlim) that doesn't depend on the thread pool. - if (!dispatcherTask.Wait(TimeSpan.FromSeconds(60))) + // Use a dedicated thread to enforce the timeout, since under extreme thread pool + // starvation on Helix CI, even WaitAsync's timer continuations can't be scheduled. + // A dedicated OS thread with Thread.Join(timeout) uses a kernel wait that works + // regardless of thread pool state. + var completed = false; + var timeoutThread = new Thread(() => { - throw new Xunit.Sdk.XunitException( - "Dispatcher did not shut down within 60 seconds. This likely indicates " + - "thread pool starvation preventing Task.Delay timer callbacks from firing."); - } + completed = dispatcherTask.Wait(TimeSpan.FromSeconds(60)); + }) { IsBackground = true }; + timeoutThread.Start(); + timeoutThread.Join(TimeSpan.FromSeconds(65)); + Assert.True(completed, + "Dispatcher did not shut down within 60 seconds. 
This likely indicates " + + "thread pool starvation preventing Task.Delay timer callbacks from firing."); // Assert Assert.False(eventBus.HasDetectedBadConnection); From f858f5f0cc540ed88bb9df22d7550da8ff5650d3 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Thu, 19 Mar 2026 08:27:22 -0700 Subject: [PATCH 18/36] Fix TemplateEngine snapshot flakiness and BrowserDiagnostics hang TemplateEngine: Add scrubber for intermittent MSBuild debug log message ('MSBuild logs and debug information will be at...') that appears when telemetry/profiling is enabled on some Helix machines, causing snapshot mismatches. Added to all 6 WithCustomScrubbers locations across 3 test files. BrowserDiagnostics: Close stdin before killing process in AwaitableProcess DisposeAsync to unblock PhysicalConsole.ListenToStandardInputAsync() which uses CancellationToken.None on stdin ReadAsync. On Linux, stdin reads don't unblock on process kill, causing 60-minute hangs. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../AwaitableProcess.cs | 11 +++++++++++ .../CommonTemplatesTests.cs | 3 +++ .../DotnetClassTemplateTests.cs | 2 ++ .../TemplateEngineSamplesTest.cs | 3 ++- test/dotnet-new.IntegrationTests/VerifyScrubbers.cs | 9 +++++++++ 5 files changed, 27 insertions(+), 1 deletion(-) diff --git a/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs b/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs index 9f38f3af4545..8e73a4810db4 100644 --- a/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs +++ b/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs @@ -210,6 +210,17 @@ public async ValueTask DisposeAsync() Process.ErrorDataReceived -= OnData; Process.OutputDataReceived -= OnData; + // Close stdin before killing the process to unblock any pending stdin reads + // (e.g. PhysicalConsole.ListenToStandardInputAsync on Linux where stdin reads + // don't unblock on process kill). 
+ try + { + Process.StandardInput?.Close(); + } + catch + { + } + try { Process.CancelErrorRead(); diff --git a/test/dotnet-new.IntegrationTests/CommonTemplatesTests.cs b/test/dotnet-new.IntegrationTests/CommonTemplatesTests.cs index 3c21a97d7b7b..51d3001b10a4 100644 --- a/test/dotnet-new.IntegrationTests/CommonTemplatesTests.cs +++ b/test/dotnet-new.IntegrationTests/CommonTemplatesTests.cs @@ -80,6 +80,7 @@ public async Task AllCommonItemsCreate(string expectedTemplateName, string templ .WithCustomScrubbers( ScrubbersDefinition.Empty .AddScrubber(sb => sb.UnixifyNewlines(), "out") + .AddScrubber(sb => sb.ScrubMSBuildDebugLogMessage(), "txt") .AddScrubber((path, content) => { if (path.Replace(Path.DirectorySeparatorChar, '/') == "std-streams/stdout.txt") @@ -224,6 +225,7 @@ public async Task AotVariants(string name, string language) ScrubbersDefinition.Empty .AddScrubber(sb => sb.Replace($"{currentDefaultFramework}", "%FRAMEWORK%")) .AddScrubber(sb => sb.Replace(finalProjectName, "%PROJECT_PATH%").UnixifyDirSeparators().ScrubByRegex("(^ Restored .* \\()(.*)(\\)\\.)", "$1%DURATION%$3", RegexOptions.Multiline), "txt") + .AddScrubber(sb => sb.ScrubMSBuildDebugLogMessage(), "txt") ); VerificationEngine engine = new(_logger); @@ -424,6 +426,7 @@ public async Task FeaturesSupport( .AddScrubber(sb => sb.Replace($"{langVersion}", "%LANG%")) .AddScrubber(sb => sb.Replace($"{framework ?? 
currentDefaultFramework}", "%FRAMEWORK%")) .AddScrubber(sb => sb.Replace(finalProjectName, "%PROJECT_PATH%").UnixifyDirSeparators().ScrubByRegex("(^ Restored .* \\()(.*)(\\)\\.)", "$1%DURATION%$3", RegexOptions.Multiline), "txt") + .AddScrubber(sb => sb.ScrubMSBuildDebugLogMessage(), "txt") ); VerificationEngine engine = new(_logger); diff --git a/test/dotnet-new.IntegrationTests/DotnetClassTemplateTests.cs b/test/dotnet-new.IntegrationTests/DotnetClassTemplateTests.cs index fd288169f5be..31d4d9bcdd0c 100644 --- a/test/dotnet-new.IntegrationTests/DotnetClassTemplateTests.cs +++ b/test/dotnet-new.IntegrationTests/DotnetClassTemplateTests.cs @@ -77,6 +77,7 @@ public async Task DotnetCSharpClassTemplatesTest( .WithCustomEnvironment(environmentUnderTest!) .WithCustomScrubbers( ScrubbersDefinition.Empty + .AddScrubber(sb => sb.ScrubMSBuildDebugLogMessage(), "txt") .AddScrubber((path, content) => { if (path.Replace(Path.DirectorySeparatorChar, '/') == "std-streams/stdout.txt") @@ -157,6 +158,7 @@ public async Task DotnetVisualBasicClassTemplatesTest( .WithCustomEnvironment(environmentUnderTest!) .WithCustomScrubbers( ScrubbersDefinition.Empty + .AddScrubber(sb => sb.ScrubMSBuildDebugLogMessage(), "txt") .AddScrubber((path, content) => { if (path.Replace(Path.DirectorySeparatorChar, '/') == "std-streams/stdout.txt") diff --git a/test/dotnet-new.IntegrationTests/TemplateEngineSamplesTest.cs b/test/dotnet-new.IntegrationTests/TemplateEngineSamplesTest.cs index 1469a65f90f7..f39fc8c008b7 100644 --- a/test/dotnet-new.IntegrationTests/TemplateEngineSamplesTest.cs +++ b/test/dotnet-new.IntegrationTests/TemplateEngineSamplesTest.cs @@ -63,7 +63,8 @@ public async Task TemplateEngineSamplesProjectTest( .WithCustomEnvironment(environmentUnderTest!) 
.WithCustomScrubbers( ScrubbersDefinition.Empty - .AddScrubber(sb => sb.Replace(DateTime.Now.ToString("MM/dd/yyyy"), "**/**/****"))); + .AddScrubber(sb => sb.Replace(DateTime.Now.ToString("MM/dd/yyyy"), "**/**/****")) + .AddScrubber(sb => sb.ScrubMSBuildDebugLogMessage(), "txt")); VerificationEngine engine = new(_log); await engine.Execute(options); diff --git a/test/dotnet-new.IntegrationTests/VerifyScrubbers.cs b/test/dotnet-new.IntegrationTests/VerifyScrubbers.cs index b7c51ab40d96..cce088deff16 100644 --- a/test/dotnet-new.IntegrationTests/VerifyScrubbers.cs +++ b/test/dotnet-new.IntegrationTests/VerifyScrubbers.cs @@ -51,6 +51,15 @@ internal static void ScrubByRegex(this StringBuilder output, string pattern, str output.Append(finalOutput); } + /// + /// Removes MSBuild debug log path messages that appear intermittently depending on + /// telemetry/profiling settings, which vary across CI machines causing snapshot mismatches. + /// + internal static void ScrubMSBuildDebugLogMessage(this StringBuilder output) + { + output.ScrubByRegex(@"^\s*MSBuild logs and debug information will be at .*[\r\n]*", "", RegexOptions.Multiline); + } + /// /// Replaces content matching with . 
/// From 1677d0c24f1583e63cd8245cad51a216897cce51 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Thu, 19 Mar 2026 09:36:41 -0700 Subject: [PATCH 19/36] ci: validation run 15 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From e97da4d70625b0a0b1b0b8b38986cce6fa233b58 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Thu, 19 Mar 2026 09:44:08 -0700 Subject: [PATCH 20/36] ci: validation run 16 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From bbf6efd2f3037db8e82558cd138dbaafea567ef5 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Thu, 19 Mar 2026 11:13:12 -0700 Subject: [PATCH 21/36] ci: validation run 17 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From c3c701fd829ba8a19a8acae65e779bad35279b95 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Thu, 19 Mar 2026 14:00:21 -0700 Subject: [PATCH 22/36] ci: validation run 18 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From d56cf34b447b26eaa78d453fe9ecfb64a9061a04 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Thu, 19 Mar 2026 18:51:31 -0700 Subject: [PATCH 23/36] ci: validation run 19 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 644dd0f6f02638d2f54ddd92724700a2c22fbac6 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Thu, 19 Mar 2026 19:08:55 -0700 Subject: [PATCH 24/36] ci: trigger validation run 19 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From ecebfd5533d1604eaab00f3d63d784d6da711bc1 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Thu, 19 Mar 2026 20:45:08 -0700 Subject: [PATCH 25/36] ci: validation run 20 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 513e189b14a42042aefe1d02aca185aca31994b4 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Thu, 19 Mar 2026 22:25:05 -0700 Subject: [PATCH 26/36] ci: validation run 21 Co-authored-by: Copilot 
<223556219+Copilot@users.noreply.github.com> From 16761f090fcff7f98ad2537874134e93594aa7e4 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Fri, 20 Mar 2026 07:12:30 -0700 Subject: [PATCH 27/36] Fix Aspire test hang: skip Console.ReadKey when stdin is redirected On Linux, Console.ReadKey() blocks indefinitely when stdin is inherited from a parent test process. Aspire launcher processes (server, resources) each create a PhysicalConsole that starts a LongRunning task calling Console.ReadKey(), which never unblocks when the test process kills the child. Check Console.IsInputRedirected before starting the keyboard listener to avoid the hang. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/Dotnet.Watch/Watch/UI/PhysicalConsole.cs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Dotnet.Watch/Watch/UI/PhysicalConsole.cs b/src/Dotnet.Watch/Watch/UI/PhysicalConsole.cs index 46afed53a210..7e69db8567a2 100644 --- a/src/Dotnet.Watch/Watch/UI/PhysicalConsole.cs +++ b/src/Dotnet.Watch/Watch/UI/PhysicalConsole.cs @@ -19,7 +19,15 @@ internal sealed class PhysicalConsole : IConsole public PhysicalConsole(TestFlags testFlags) { Console.OutputEncoding = Encoding.UTF8; - _ = testFlags.HasFlag(TestFlags.ReadKeyFromStdin) ? 
ListenToStandardInputAsync() : ListenToConsoleKeyPressAsync(); + + if (testFlags.HasFlag(TestFlags.ReadKeyFromStdin)) + { + _ = ListenToStandardInputAsync(); + } + else if (!Console.IsInputRedirected) + { + _ = ListenToConsoleKeyPressAsync(); + } } private async Task ListenToStandardInputAsync() From 84d258f2bfa1c5bbbd9a60806d1b5014222b5499 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Fri, 20 Mar 2026 07:34:03 -0700 Subject: [PATCH 28/36] ci: trigger validation run 22 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From b2da329e3b776b3023a83b34f8cfd5f9ef1c7e2b Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Fri, 20 Mar 2026 09:26:01 -0700 Subject: [PATCH 29/36] ci: validation run 23 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 06c9f299c4b82b32b0d11d81b7342e4bcbe433fb Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Fri, 20 Mar 2026 11:16:31 -0700 Subject: [PATCH 30/36] ci: validation run 24 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 6b850a8503bed7350f8dc7e9ca93edeed4e4b84b Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Fri, 20 Mar 2026 11:39:13 -0700 Subject: [PATCH 31/36] ci: retrigger validation run 24 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 99f60c7718086663e5c3c3c240f1a80d209a1529 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Fri, 20 Mar 2026 14:26:54 -0700 Subject: [PATCH 32/36] ci: validation run 25 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From fa48b4bca153191737c1512007c82cd28b98a717 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Fri, 20 Mar 2026 15:38:55 -0700 Subject: [PATCH 33/36] ci: validation run 26 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 22505b5045c98039b92ab9f6ae4c8143b4b647af Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Fri, 20 Mar 2026 17:06:16 -0700 Subject: [PATCH 34/36] ci: 
validation run 27 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From e99bb84b124b1158092fad47f224c0b5c146604c Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Fri, 20 Mar 2026 18:49:18 -0700 Subject: [PATCH 35/36] ci: validation run 28 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 5e36300f5ccd3dd9c4d63ecf138e411ee072a962 Mon Sep 17 00:00:00 2001 From: "Matt Mitchell (.NET)" Date: Fri, 20 Mar 2026 20:27:56 -0700 Subject: [PATCH 36/36] ci: validation run 29 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>