Skip to content

Commit 9949b40

Browse files
committed
Fix CI flakiness: MSB4216 task host, dotnet-watch deadlock+timeout, GZipCompress, NuGet noise, runtimeconfig
Root causes fixed: 1. MSB4216 TaskHostFactory: DOTNET_HOST_PATH not set in Helix scripts 2. dotnet-watch Aspire hang: 2-hour per-operation timeout (now capped at 5 min), stdin not closed before kill, no exit timeout, missing DCP timeout env vars, semaphore deadlock in AspireServiceFactory.DisposeAsync 3. GZipCompress file lock: Parallel.For races with antivirus/indexer (retry with backoff) 4. NuGet source removal noise: stderr from non-existent sources treated as errors 5. Missing runtimeconfig.json in NuGet test tool packages Validation: Build 1333280 = 18/18 jobs passed (consecutive pass #1)
1 parent b98e0dd commit 9949b40

File tree

8 files changed

+149
-45
lines changed

8 files changed

+149
-45
lines changed

build/RunTestsOnHelix.cmd

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ set DOTNET_ROOT=%HELIX_CORRELATION_PAYLOAD%\d
99
set PATH=%DOTNET_ROOT%;%PATH%
1010
set TestFullMSBuild=%1
1111

12+
REM Set DOTNET_HOST_PATH so MSBuild task hosts can locate the dotnet executable.
13+
REM Without this, tasks from NuGet packages that use TaskHostFactory fail with MSB4216.
14+
set DOTNET_HOST_PATH=%DOTNET_ROOT%\dotnet.exe
15+
1216
REM Ensure Visual Studio instances allow preview SDKs
1317
PowerShell -ExecutionPolicy ByPass -NoProfile -File "%HELIX_CORRELATION_PAYLOAD%\t\eng\enable-preview-sdks.ps1"
1418

@@ -35,14 +39,16 @@ dotnet new --debug:ephemeral-hive
3539
dotnet nuget list source --configfile %TestExecutionDirectory%\nuget.config
3640
if exist %TestExecutionDirectory%\Testpackages dotnet nuget add source %TestExecutionDirectory%\Testpackages --name testpackages --configfile %TestExecutionDirectory%\nuget.config
3741

38-
dotnet nuget remove source dotnet6-transport --configfile %TestExecutionDirectory%\nuget.config
39-
dotnet nuget remove source dotnet6-internal-transport --configfile %TestExecutionDirectory%\nuget.config
40-
dotnet nuget remove source dotnet7-transport --configfile %TestExecutionDirectory%\nuget.config
41-
dotnet nuget remove source dotnet7-internal-transport --configfile %TestExecutionDirectory%\nuget.config
42-
dotnet nuget remove source richnav --configfile %TestExecutionDirectory%\nuget.config
43-
dotnet nuget remove source vs-impl --configfile %TestExecutionDirectory%\nuget.config
44-
dotnet nuget remove source dotnet-libraries-transport --configfile %TestExecutionDirectory%\nuget.config
45-
dotnet nuget remove source dotnet-tools-transport --configfile %TestExecutionDirectory%\nuget.config
46-
dotnet nuget remove source dotnet-libraries --configfile %TestExecutionDirectory%\nuget.config
47-
dotnet nuget remove source dotnet-eng --configfile %TestExecutionDirectory%\nuget.config
42+
REM Remove feeds not needed for tests. Errors from non-existent sources
43+
REM (e.g. internal-transport feeds only present in internal builds) are ignored.
44+
dotnet nuget remove source dotnet6-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul
45+
dotnet nuget remove source dotnet6-internal-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul
46+
dotnet nuget remove source dotnet7-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul
47+
dotnet nuget remove source dotnet7-internal-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul
48+
dotnet nuget remove source richnav --configfile %TestExecutionDirectory%\nuget.config 2>nul
49+
dotnet nuget remove source vs-impl --configfile %TestExecutionDirectory%\nuget.config 2>nul
50+
dotnet nuget remove source dotnet-libraries-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul
51+
dotnet nuget remove source dotnet-tools-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul
52+
dotnet nuget remove source dotnet-libraries --configfile %TestExecutionDirectory%\nuget.config 2>nul
53+
dotnet nuget remove source dotnet-eng --configfile %TestExecutionDirectory%\nuget.config 2>nul
4854
dotnet nuget list source --configfile %TestExecutionDirectory%\nuget.config

build/RunTestsOnHelix.sh

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ export MicrosoftNETBuildExtensionsTargets=$HELIX_CORRELATION_PAYLOAD/ex/msbuildE
99
export DOTNET_ROOT=$HELIX_CORRELATION_PAYLOAD/d
1010
export PATH=$DOTNET_ROOT:$PATH
1111

12+
# Set DOTNET_HOST_PATH so MSBuild task hosts can locate the dotnet executable.
13+
# Without this, tasks from NuGet packages that use TaskHostFactory (e.g. ComputeWasmBuildAssets
14+
# from WebAssembly SDK, ComputeManagedAssemblies from ILLink) fail with MSB4216 on macOS
15+
# because the task host process cannot find the dotnet host to launch.
16+
export DOTNET_HOST_PATH=$DOTNET_ROOT/dotnet
17+
1218
export TestExecutionDirectory=$(realpath "$(mktemp -d "${TMPDIR:-/tmp}"/dotnetSdkTests.XXXXXXXX)")
1319
export DOTNET_CLI_HOME=$TestExecutionDirectory/.dotnet
1420
cp -a $HELIX_CORRELATION_PAYLOAD/t/TestExecutionDirectoryFiles/. $TestExecutionDirectory/
@@ -22,15 +28,16 @@ dotnet new --debug:ephemeral-hive
2228

2329
dotnet nuget list source --configfile $TestExecutionDirectory/NuGet.config
2430
dotnet nuget add source $TestExecutionDirectory/Testpackages --configfile $TestExecutionDirectory/NuGet.config
25-
#Remove feeds not needed for tests
26-
dotnet nuget remove source dotnet6-transport --configfile $TestExecutionDirectory/NuGet.config
27-
dotnet nuget remove source dotnet6-internal-transport --configfile $TestExecutionDirectory/NuGet.config
28-
dotnet nuget remove source dotnet7-transport --configfile $TestExecutionDirectory/NuGet.config
29-
dotnet nuget remove source dotnet7-internal-transport --configfile $TestExecutionDirectory/NuGet.config
30-
dotnet nuget remove source richnav --configfile $TestExecutionDirectory/NuGet.config
31-
dotnet nuget remove source vs-impl --configfile $TestExecutionDirectory/NuGet.config
32-
dotnet nuget remove source dotnet-libraries-transport --configfile $TestExecutionDirectory/NuGet.config
33-
dotnet nuget remove source dotnet-tools-transport --configfile $TestExecutionDirectory/NuGet.config
34-
dotnet nuget remove source dotnet-libraries --configfile $TestExecutionDirectory/NuGet.config
35-
dotnet nuget remove source dotnet-eng --configfile $TestExecutionDirectory/NuGet.config
31+
# Remove feeds not needed for tests. Use || true to avoid errors when a source
32+
# doesn't exist (e.g. internal-transport feeds are only present in internal builds).
33+
dotnet nuget remove source dotnet6-transport --configfile $TestExecutionDirectory/NuGet.config || true
34+
dotnet nuget remove source dotnet6-internal-transport --configfile $TestExecutionDirectory/NuGet.config || true
35+
dotnet nuget remove source dotnet7-transport --configfile $TestExecutionDirectory/NuGet.config || true
36+
dotnet nuget remove source dotnet7-internal-transport --configfile $TestExecutionDirectory/NuGet.config || true
37+
dotnet nuget remove source richnav --configfile $TestExecutionDirectory/NuGet.config || true
38+
dotnet nuget remove source vs-impl --configfile $TestExecutionDirectory/NuGet.config || true
39+
dotnet nuget remove source dotnet-libraries-transport --configfile $TestExecutionDirectory/NuGet.config || true
40+
dotnet nuget remove source dotnet-tools-transport --configfile $TestExecutionDirectory/NuGet.config || true
41+
dotnet nuget remove source dotnet-libraries --configfile $TestExecutionDirectory/NuGet.config || true
42+
dotnet nuget remove source dotnet-eng --configfile $TestExecutionDirectory/NuGet.config || true
3643
dotnet nuget list source --configfile $TestExecutionDirectory/NuGet.config

src/BlazorWasmSdk/Tasks/GZipCompress.cs

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ public class GZipCompress : Task
2020
[Required]
2121
public string OutputDirectory { get; set; }
2222

23+
// Retry count for transient file I/O errors (e.g., antivirus locks on CI machines).
24+
private const int MaxRetries = 3;
25+
private const int RetryDelayMs = 200;
26+
2327
public override bool Execute()
2428
{
2529
CompressedFiles = new ITaskItem[FilesToCompress.Length];
@@ -56,18 +60,31 @@ public override bool Execute()
5660
Log.LogMessage(MessageImportance.Low, "Compressing '{0}' because file is newer than '{1}'.", inputFullPath, outputRelativePath);
5761
}
5862

59-
try
63+
// Retry on IOException to handle transient file locks from antivirus, file
64+
// indexing, or parallel MSBuild nodes on CI machines (see dotnet/sdk#53424).
65+
for (int attempt = 1; attempt <= MaxRetries; attempt++)
6066
{
61-
using var sourceStream = File.OpenRead(file.ItemSpec);
62-
using var fileStream = File.Create(outputRelativePath);
63-
using var stream = new GZipStream(fileStream, CompressionLevel.Optimal);
67+
try
68+
{
69+
using var sourceStream = File.OpenRead(file.ItemSpec);
70+
using var fileStream = File.Create(outputRelativePath);
71+
using var stream = new GZipStream(fileStream, CompressionLevel.Optimal);
6472

65-
sourceStream.CopyTo(stream);
66-
}
67-
catch (Exception e)
68-
{
69-
Log.LogErrorFromException(e);
70-
return;
73+
sourceStream.CopyTo(stream);
74+
return; // Success
75+
}
76+
catch (IOException) when (attempt < MaxRetries)
77+
{
78+
Log.LogMessage(MessageImportance.Low,
79+
"Retrying compression of '{0}' (attempt {1}/{2}) due to transient I/O error.",
80+
file.ItemSpec, attempt, MaxRetries);
81+
Thread.Sleep(RetryDelayMs * attempt);
82+
}
83+
catch (Exception e)
84+
{
85+
Log.LogErrorFromException(e);
86+
return;
87+
}
7188
}
7289
});
7390

src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,16 @@ public async ValueTask DisposeAsync()
7676
_isDisposed = true;
7777

7878
// wait for all in-flight process initialization to complete:
79+
// If no session initialization is in-flight (_pendingSessionInitializationCount == 0),
80+
// the semaphore will never be released by StartProjectAsync's finally block.
81+
// Release it here to prevent a deadlock. Protect against the race where
82+
// StartProjectAsync's finally block releases concurrently.
83+
if (Volatile.Read(ref _pendingSessionInitializationCount) == 0)
84+
{
85+
try { _postDisposalSessionInitializationCompleted.Release(); }
86+
catch (SemaphoreFullException) { }
87+
}
88+
7989
await _postDisposalSessionInitializationCompleted.WaitAsync(CancellationToken.None);
8090

8191
// terminate all active sessions:
@@ -174,7 +184,10 @@ public async ValueTask StartProjectAsync(string dcpId, string sessionId, Project
174184
{
175185
if (Interlocked.Decrement(ref _pendingSessionInitializationCount) == 0 && _isDisposed)
176186
{
177-
_postDisposalSessionInitializationCompleted.Release();
187+
// Guard against double-release: DisposeAsync may have already released
188+
// the semaphore if it observed count==0 before we decremented.
189+
try { _postDisposalSessionInitializationCompleted.Release(); }
190+
catch (SemaphoreFullException) { }
178191
}
179192
}
180193

test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,19 @@ namespace Microsoft.DotNet.Watch.UnitTests
1010
{
1111
internal sealed class AwaitableProcess : IAsyncDisposable
1212
{
13-
// cancel just before we hit timeout used on CI (XUnitWorkItemTimeout value in sdk\test\UnitTests.proj)
13+
// Maximum time to wait for a single line of output from the process.
14+
// On CI (Helix), cap at 5 minutes. The HELIX_WORK_ITEM_TIMEOUT is the total budget
15+
// for ALL tests in the work item (~2h), which is far too long for a single
16+
// wait-for-output operation. If a process produces no output for 5 minutes,
17+
// it's deadlocked (e.g., dotnet-watch shutdown race in AspireServiceFactory).
18+
// Capping here turns a 2-hour partition-blocking hang into a 5-minute clean failure.
19+
private static readonly TimeSpan s_maxPerOperationTimeout = TimeSpan.FromMinutes(5);
20+
1421
private static readonly TimeSpan s_timeout = Environment.GetEnvironmentVariable("HELIX_WORK_ITEM_TIMEOUT") is { } value
15-
? TimeSpan.Parse(value).Subtract(TimeSpan.FromSeconds(10)) : TimeSpan.FromMinutes(10);
22+
? Min(TimeSpan.Parse(value).Subtract(TimeSpan.FromSeconds(10)), s_maxPerOperationTimeout)
23+
: TimeSpan.FromMinutes(10);
24+
25+
private static TimeSpan Min(TimeSpan a, TimeSpan b) => a < b ? a : b;
1626

1727
private readonly List<string> _lines = [];
1828

@@ -226,6 +236,17 @@ public async ValueTask DisposeAsync()
226236
{
227237
}
228238

239+
// Close stdin before killing. This unblocks PhysicalConsole.ListenToStandardInputAsync()
240+
// which reads from stdin with CancellationToken.None and no timeout.
241+
// Without this, the stdin reader can keep the process alive after Kill() on some platforms.
242+
try
243+
{
244+
Process.StandardInput.Close();
245+
}
246+
catch
247+
{
248+
}
249+
229250
try
230251
{
231252
Process.Kill(entireProcessTree: true);
@@ -234,8 +255,17 @@ public async ValueTask DisposeAsync()
234255
{
235256
}
236257

237-
// ensure process has exited
238-
await _processExitAwaiter;
258+
// Wait for process exit with a timeout to prevent hanging the test if Kill() fails.
259+
// The WaitForProcessExitAsync loop checks HasExited every 1 second, so 30s is generous.
260+
using var exitTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(30));
261+
try
262+
{
263+
await _processExitAwaiter.WaitAsync(exitTimeout.Token);
264+
}
265+
catch (OperationCanceledException)
266+
{
267+
Logger.Log($"Process {Id} did not exit within 30 seconds after Kill()");
268+
}
239269

240270
Process.Dispose();
241271

test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -204,12 +204,16 @@ public ProcessStartInfo GetProcessStartInfo(string workingDirectory, string test
204204
info.Environment.Add("Microsoft_CodeAnalysis_EditAndContinue_LogDir", testOutputPath);
205205
info.Environment.Add("DOTNET_CLI_CONTEXT_VERBOSE", "trace");
206206

207-
// suppress all timeouts:
208-
info.Environment.Add("DCP_IDE_REQUEST_TIMEOUT_SECONDS", "100000");
209-
info.Environment.Add("DCP_IDE_NOTIFICATION_TIMEOUT_SECONDS", "100000");
210-
info.Environment.Add("DCP_IDE_NOTIFICATION_KEEPALIVE_SECONDS", "100000");
207+
// Use generous but bounded timeouts for DCP operations in CI.
208+
// Previous values of 100,000 seconds (~27 hours) effectively disabled timeouts,
209+
// causing tests to hang for the full Helix work item duration (~2 hours) when
210+
// a DCP operation deadlocked. 300 seconds (5 minutes) per operation is generous
211+
// for slow CI machines while ensuring natural failure recovery.
212+
info.Environment.Add("DCP_IDE_REQUEST_TIMEOUT_SECONDS", "300");
213+
info.Environment.Add("DCP_IDE_NOTIFICATION_TIMEOUT_SECONDS", "300");
214+
info.Environment.Add("DCP_IDE_NOTIFICATION_KEEPALIVE_SECONDS", "300");
211215
info.Environment.Add("ASPIRE_ALLOW_UNSECURED_TRANSPORT", "1");
212-
info.Environment.Add("ASPIRE_WATCH_PIPE_CONNECTION_TIMEOUT_SECONDS", "100000");
216+
info.Environment.Add("ASPIRE_WATCH_PIPE_CONNECTION_TIMEOUT_SECONDS", "300");
213217

214218
// override defaults:
215219
foreach (var (name, value) in EnvironmentVariables)

test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout
358358
return connectionTask;
359359
}
360360

361-
readySource.SetResult(true);
361+
readySource.TrySetResult(true);
362362
return new TaskCompletionSource<Connection>().Task;
363363
});
364364

@@ -382,11 +382,18 @@ public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout
382382
}
383383
};
384384
var keepAlive = TimeSpan.FromSeconds(1);
385-
var dispatcherTask = Task.Run(() =>
385+
386+
// Use Task.Factory.StartNew with LongRunning to run the dispatcher on a dedicated
387+
// OS thread instead of a thread pool thread. The dispatcher's Run() method uses
388+
// blocking Task.WaitAny() which permanently blocks its thread. On Helix CI agents
389+
// running many tests in parallel, blocking a thread pool thread contributes to pool
390+
// starvation, which prevents Task.Delay timer callbacks from firing, causing the
391+
// keep-alive timeout to never complete and the test to hang indefinitely.
392+
var dispatcherTask = Task.Factory.StartNew(() =>
386393
{
387394
var dispatcher = new DefaultRequestDispatcher(connectionHost.Object, compilerHost, CancellationToken.None, eventBus, keepAlive);
388395
dispatcher.Run();
389-
});
396+
}, CancellationToken.None, TaskCreationOptions.LongRunning, TaskScheduler.Default);
390397

391398
// Wait for all connections to be created.
392399
await readySource.Task;
@@ -402,7 +409,10 @@ public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout
402409

403410
// Act
404411
// Now dispatcher should be in an idle state with no active connections.
405-
await dispatcherTask;
412+
// Use WaitAsync as a safety net: if the keep-alive timeout still can't fire
413+
// (e.g. extreme thread pool starvation), fail the test after 60s instead of
414+
// hanging for 60+ minutes and blocking the entire CI job.
415+
await dispatcherTask.WaitAsync(TimeSpan.FromSeconds(60));
406416

407417
// Assert
408418
Assert.False(eventBus.HasDetectedBadConnection);
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,21 @@
11
<!-- Prevent test asset projects from picking up the repo's root Directory.Build.targets. -->
22
<Project>
33

4+
<!-- For packable Exe projects (DotNetCliToolReference tools targeting netcoreapp2.2),
5+
include the auto-generated runtimeconfig.json in the NuGet package so the dotnet
6+
host can find it adjacent to the DLL. This enables RollForward=LatestMajor to
7+
work correctly, allowing tools to run on machines that only have .NET 6.0+
8+
installed (common on Helix CI agents that lack .NET Core 2.2). Without this,
9+
tools fail with FrameworkMissingFailure (exit code 0x80008096) because the host
10+
cannot roll forward from 2.2.0 without a runtimeconfig.json specifying the
11+
rollForward policy. -->
12+
<Target Name="IncludeRuntimeConfigInPackage"
13+
AfterTargets="Build"
14+
Condition="'$(OutputType)' == 'Exe' AND '$(IsPackable)' == 'true' AND '$(GenerateRuntimeConfigurationFiles)' != 'false'">
15+
<ItemGroup>
16+
<BuildOutputInPackage Include="$(ProjectRuntimeConfigFilePath)"
17+
Condition="'$(ProjectRuntimeConfigFilePath)' != '' AND Exists('$(ProjectRuntimeConfigFilePath)')" />
18+
</ItemGroup>
19+
</Target>
20+
421
</Project>

0 commit comments

Comments
 (0)