Skip to content

Commit a338add

Browse files
[17.14] Avoid hang when similar build requests are serviced (#12197)
Backports 370bb9d to 17.14. ### Summary The MSBuild scheduler could hang when two similar-but-not-identical requests were received and serviced in short proximity. VS calling patterns trigger this sometimes. This change handles the problematic case and additionally provides breadcrumbs in the memory dump that this situation occurred to improve future diagnosability of related issues. ### Customer Impact VS hang observed from telemetry; VS Perf team wants backport. ### Regression? No ### Testing Has been in main (.NET SDK 10 builds) and internal VS preview for ~1 month, with good telemetry from VS int preview. ### Risk Low due to bake time in main/int preview. --------- Co-authored-by: YuliiaKovalova <[email protected]>
1 parent a30e8b6 commit a338add

File tree

4 files changed

+134
-31
lines changed

4 files changed

+134
-31
lines changed

eng/Versions.props

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22
<!-- Copyright (c) .NET Foundation and contributors. All rights reserved. Licensed under the MIT license. See License.txt in the project root for full license information. -->
33
<Project>
44
<PropertyGroup>
5-
<VersionPrefix>17.14.17</VersionPrefix>
6-
<DotNetFinalVersionKind>release</DotNetFinalVersionKind>
5+
<VersionPrefix>17.14.18</VersionPrefix><DotNetFinalVersionKind>release</DotNetFinalVersionKind>
76
<PackageValidationBaselineVersion>17.13.9</PackageValidationBaselineVersion>
87
<AssemblyVersion>15.1.0.0</AssemblyVersion>
98
<PreReleaseVersionLabel>servicing</PreReleaseVersionLabel>

src/Build.UnitTests/BackEnd/Scheduler_Tests.cs

Lines changed: 95 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@
1919

2020
namespace Microsoft.Build.UnitTests.BackEnd
2121
{
22+
using System.Linq;
23+
using FluentAssertions;
2224
using Microsoft.Build.Unittest;
2325

2426
/// <summary>
2527
/// Tests of the scheduler.
2628
/// </summary>
27-
// Ignore: Causing issues with other tests
28-
// NOTE: marked as "internal" to disable the entire test class, as was done for MSTest.
2929
public class Scheduler_Tests : IDisposable
3030
{
3131
/// <summary>
@@ -58,6 +58,11 @@ public class Scheduler_Tests : IDisposable
5858
/// </summary>
5959
private BuildParameters _parameters;
6060

61+
/// <summary>
62+
/// Configuration ID.
63+
/// </summary>
64+
private const int DefaultConfigId = 99;
65+
6166
/// <summary>
6267
/// Set up
6368
/// </summary>
@@ -71,8 +76,8 @@ public Scheduler_Tests()
7176
_host = new MockHost();
7277
_scheduler = new Scheduler();
7378
_scheduler.InitializeComponent(_host);
74-
CreateConfiguration(99, "parent.proj");
75-
_defaultParentRequest = CreateBuildRequest(99, 99, Array.Empty<string>(), null);
79+
CreateConfiguration(DefaultConfigId, "parent.proj");
80+
_defaultParentRequest = CreateBuildRequest(99, DefaultConfigId, Array.Empty<string>(), null);
7681

7782
// Set up the scheduler with one node to start with.
7883
_scheduler.ReportNodesCreated(new NodeInfo[] { new NodeInfo(1, NodeProviderType.InProc) });
@@ -387,8 +392,8 @@ public void VerifyRequestOrderingDoesNotAffectNodeCreationCountWithInProcAndAnyR
387392
_parameters.ShutdownInProcNodeOnBuildFinish = true;
388393
_buildManager = new BuildManager();
389394

390-
CreateConfiguration(99, "parent.proj");
391-
_defaultParentRequest = CreateBuildRequest(99, 99, Array.Empty<string>(), null);
395+
CreateConfiguration(DefaultConfigId, "parent.proj");
396+
_defaultParentRequest = CreateBuildRequest(99, DefaultConfigId, Array.Empty<string>(), null);
392397

393398
CreateConfiguration(1, "foo.proj");
394399
BuildRequest request1 = CreateBuildRequest(1, 1, new string[] { "foo" }, NodeAffinity.Any, _defaultParentRequest);
@@ -579,8 +584,8 @@ public void VerifyNoOverCreationOfNodesWithBuildLoop()
579584
_parameters.ShutdownInProcNodeOnBuildFinish = true;
580585
_buildManager = new BuildManager();
581586

582-
CreateConfiguration(99, "parent.proj");
583-
_defaultParentRequest = CreateBuildRequest(99, 99, Array.Empty<string>(), null);
587+
CreateConfiguration(DefaultConfigId, "parent.proj");
588+
_defaultParentRequest = CreateBuildRequest(99, DefaultConfigId, Array.Empty<string>(), null);
584589

585590
CreateConfiguration(1, "foo.proj");
586591
BuildRequest request1 = CreateBuildRequest(1, 1, new string[] { "foo" }, NodeAffinity.OutOfProc, _defaultParentRequest);
@@ -769,12 +774,13 @@ private BuildResult CacheBuildResult(BuildRequest request, string target, WorkUn
769774
}
770775

771776
/// <summary>
772-
/// Creates a build result for a request
777+
/// Creates a build result for a request.
773778
/// </summary>
774779
private BuildResult CreateBuildResult(BuildRequest request, string target, WorkUnitResult workUnitResult)
775780
{
776781
BuildResult result = new BuildResult(request);
777782
result.AddResultsForTarget(target, new TargetResult(Array.Empty<TaskItem>(), workUnitResult));
783+
778784
return result;
779785
}
780786

@@ -789,23 +795,30 @@ private BuildRequest CreateBuildRequest(int nodeRequestId, int configId)
789795
/// <summary>
790796
/// Creates a build request.
791797
/// </summary>
792-
private BuildRequest CreateBuildRequest(int nodeRequestId, int configId, string[] targets)
798+
private BuildRequest CreateBuildRequest(int nodeRequestId, int configId, string[] targets, BuildRequestDataFlags buildRequestDataFlags = BuildRequestDataFlags.None)
793799
{
794-
return CreateBuildRequest(nodeRequestId, configId, targets, _defaultParentRequest);
800+
return CreateBuildRequest(nodeRequestId, configId, targets, _defaultParentRequest, buildRequestDataFlags);
795801
}
796802

797803
/// <summary>
798804
/// Creates a build request.
799805
/// </summary>
800-
private BuildRequest CreateBuildRequest(int nodeRequestId, int configId, string[] targets, BuildRequest parentRequest)
806+
private BuildRequest CreateBuildRequest(int nodeRequestId, int configId, string[] targets, BuildRequest parentRequest, BuildRequestDataFlags buildRequestDataFlags = BuildRequestDataFlags.None)
801807
{
802-
return CreateBuildRequest(nodeRequestId, configId, targets, NodeAffinity.Any, parentRequest);
808+
return CreateBuildRequest(nodeRequestId, configId, targets, NodeAffinity.Any, parentRequest, buildRequestDataFlags: buildRequestDataFlags);
803809
}
804810

805811
/// <summary>
806812
/// Creates a build request.
807813
/// </summary>
808-
private BuildRequest CreateBuildRequest(int nodeRequestId, int configId, string[] targets, NodeAffinity nodeAffinity, BuildRequest parentRequest, ProxyTargets proxyTargets = null)
814+
private BuildRequest CreateBuildRequest(
815+
int nodeRequestId,
816+
int configId,
817+
string[] targets,
818+
NodeAffinity nodeAffinity,
819+
BuildRequest parentRequest,
820+
ProxyTargets proxyTargets = null,
821+
BuildRequestDataFlags buildRequestDataFlags = BuildRequestDataFlags.None)
809822
{
810823
(targets == null ^ proxyTargets == null).ShouldBeTrue();
811824

@@ -826,16 +839,19 @@ private BuildRequest CreateBuildRequest(int nodeRequestId, int configId, string[
826839
targets,
827840
hostServices,
828841
BuildEventContext.Invalid,
829-
parentRequest);
842+
parentRequest,
843+
buildRequestDataFlags: buildRequestDataFlags);
830844
}
831845

832846
parentRequest.ShouldBeNull();
847+
833848
return new BuildRequest(
834849
submissionId: 1,
835850
nodeRequestId,
836851
configId,
837852
proxyTargets,
838-
hostServices);
853+
hostServices,
854+
buildRequestDataFlags: buildRequestDataFlags);
839855
}
840856

841857
private BuildRequest CreateProxyBuildRequest(int nodeRequestId, int configId, ProxyTargets proxyTargets, BuildRequest parentRequest)
@@ -849,6 +865,69 @@ private BuildRequest CreateProxyBuildRequest(int nodeRequestId, int configId, Pr
849865
proxyTargets);
850866
}
851867

868+
/// <summary>
869+
/// The test checks how scheduler handles the duplicated requests and cache MISS for this case.
870+
/// It's expected to have the duplicated request rescheduled for the execution.
871+
/// </summary>
872+
[Fact]
873+
public void ReportResultTest_NoCacheHitForDupes()
874+
{
875+
// Create a duplicate of the existing _defaultParentRequest, but with a different build request flag, so we can't get the result from the cache.
876+
BuildRequest duplicateRequest = CreateBuildRequest(2, configId: DefaultConfigId, Array.Empty<string>(), parentRequest: null, BuildRequestDataFlags.ProvideSubsetOfStateAfterBuild);
877+
878+
// Schedule the duplicate request -> it goes to unscheduled request due to duplicated configId
879+
_scheduler.ReportRequestBlocked(2, new BuildRequestBlocker(-1, Array.Empty<string>(), [duplicateRequest]));
880+
881+
// try to get a result for the parent request and see if we get a result for the duplicate request
882+
var results = _scheduler.ReportResult(1, CreateBuildResult(_defaultParentRequest, "", BuildResultUtilities.GetSuccessResult()))
883+
.ToList();
884+
885+
results.ShouldNotBeNull();
886+
results.Count.ShouldBe(2);
887+
888+
// Completed _defaultParentRequest
889+
results[0].BuildResult.ShouldNotBeNull();
890+
results[0].BuildResult.BuildRequestDataFlags.ShouldBe(BuildRequestDataFlags.None);
891+
results[0].Action.ShouldBe(ScheduleActionType.SubmissionComplete);
892+
893+
// The automatically scheduled duplicated request.
894+
results[1].BuildResult.ShouldBeNull();
895+
results[1].NodeId.Should().Be(1);
896+
results[1].Action.ShouldBe(ScheduleActionType.Schedule);
897+
results[1].BuildRequest.BuildRequestDataFlags.ShouldBe(BuildRequestDataFlags.ProvideSubsetOfStateAfterBuild);
898+
}
899+
900+
/// <summary>
901+
/// The test checks how scheduler handles the duplicated requests and cache HIT for this case.
902+
/// It's expected to have an immediate result for the duplicated request.
903+
/// </summary>
904+
[Fact]
905+
public void ReportResultTest_CacheHitForDupes()
906+
{
907+
// Create a duplicate of the existing _defaultParentRequest.
908+
BuildRequest duplicateRequest = CreateBuildRequest(2, configId: DefaultConfigId, Array.Empty<string>(), parentRequest: null, BuildRequestDataFlags.None);
909+
910+
// Schedule the duplicate request -> it goes to unscheduled request due to duplicated configId
911+
_scheduler.ReportRequestBlocked(1, new BuildRequestBlocker(-1, Array.Empty<string>(), [duplicateRequest]));
912+
913+
// try to get a result for the parent request and see if we get a result for the duplicate request.
914+
var results = _scheduler.ReportResult(1, CreateBuildResult(duplicateRequest, "", BuildResultUtilities.GetSuccessResult()))
915+
.ToList();
916+
917+
results.ShouldNotBeNull();
918+
results.Count.ShouldBe(2);
919+
920+
// Completed _defaultParentRequest
921+
results[0].BuildResult.ShouldNotBeNull();
922+
results[0].BuildResult.BuildRequestDataFlags.ShouldBe(BuildRequestDataFlags.None);
923+
results[0].Action.ShouldBe(ScheduleActionType.SubmissionComplete);
924+
925+
// We hit cache and completed the duplicate request.
926+
results[1].BuildResult.ShouldNotBeNull();
927+
results[1].BuildResult.BuildRequestDataFlags.ShouldBe(BuildRequestDataFlags.None);
928+
results[1].Action.ShouldBe(ScheduleActionType.SubmissionComplete);
929+
}
930+
852931
/// <summary>
853932
/// Method that fakes the actions done by BuildManager.PerformSchedulingActions
854933
/// </summary>

src/Build/BackEnd/BuildManager/BuildManager.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2390,8 +2390,8 @@ private void HandleNewRequest(int node, BuildRequestBlocker blocker)
23902390
}
23912391
}
23922392

2393-
IEnumerable<ScheduleResponse> response = _scheduler!.ReportRequestBlocked(node, blocker);
2394-
PerformSchedulingActions(response);
2393+
IEnumerable<ScheduleResponse> responses = _scheduler!.ReportRequestBlocked(node, blocker);
2394+
PerformSchedulingActions(responses);
23952395
}
23962396

23972397
/// <summary>

src/Build/BackEnd/Components/Scheduler/Scheduler.cs

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,21 @@ internal bool ForceAffinityOutOfProc
174174

175175
private int _loggedWarningsForProxyBuildsOnOutOfProcNodes = 0;
176176

177+
/// <summary>
178+
/// If we hit the path that prevents from completing build submission this flag is set.
179+
/// </summary>
180+
#pragma warning disable IDE0052 // Remove unread private members because we need them for traceability only
181+
#pragma warning disable CS0414 // The field is assigned but its value is never used
182+
private bool _hitNoLoggingCompleted;
183+
184+
/// <summary>
185+
/// Information about the missed submission.
186+
/// </summary>
187+
private string _noLoggingCompletedSubmissionDetails;
188+
189+
#pragma warning restore IDE0052 // Remove unread private members
190+
#pragma warning restore CS0414 // The field is assigned but its value is never used
191+
177192
/// <summary>
178193
/// Constructor.
179194
/// </summary>
@@ -445,30 +460,40 @@ public IEnumerable<ScheduleResponse> ReportResult(int nodeId, BuildResult result
445460
// There are other requests which we can satisfy based on this result, lets pull the result out of the cache
446461
// and satisfy those requests. Normally a skipped result would lead to the cache refusing to satisfy the
447462
// request, because the correct response in that case would be to attempt to rebuild the target in case there
448-
// are state changes that would cause it to now excute. At this point, however, we already know that the parent
463+
// are state changes that would cause it to now execute. At this point, however, we already know that the parent
449464
// request has completed, and we already know that this request has the same global request ID, which means that
450465
// its configuration and set of targets are identical -- from MSBuild's perspective, it's the same. So since
451466
// we're not going to attempt to re-execute it, if there are skipped targets in the result, that's fine. We just
452467
// need to know what the target results are so that we can log them.
453468
ScheduleResponse response = TrySatisfyRequestFromCache(parentNode, unscheduledRequest.BuildRequest, skippedResultsDoNotCauseCacheMiss: true);
454469

455-
// If we have a response we need to tell the loggers that we satisified that request from the cache.
470+
// If we have a response we need to tell the loggers that we satisfied that request from the cache.
456471
if (response != null)
457472
{
458473
LogRequestHandledFromCache(unscheduledRequest.BuildRequest, response.BuildResult);
474+
475+
// Mark the request as complete (and the parent is no longer blocked by this request.)
476+
unscheduledRequest.Complete(newResult);
477+
responses.Add(response);
459478
}
460479
else
461480
{
462-
// Response may be null if the result was never added to the cache. This can happen if the result has
463-
// an exception in it. If that is the case, we should report the result directly so that the
464-
// build manager knows that it needs to shut down logging manually.
465-
response = GetResponseForResult(parentNode, unscheduledRequest.BuildRequest, newResult.Clone());
481+
// This is a critical error case where a result should be in the cache but isn't.
482+
// The result might be missing from the cache if:
483+
// 1. The result contained an exception that prevented it from being cached properly
484+
// 2. The result was for a skipped target that couldn't satisfy all dependencies
485+
486+
// Now scheduler will handle this situation automatically - the unscheduled request remains
487+
// in the unscheduled queue (_schedulingData.UnscheduledRequests) and will be picked up
488+
// in the next ScheduleUnassignedRequests execution to be properly rebuilt.
489+
490+
// IMPORTANT: In earlier versions, we would hit this code path and did not handle it properly,
491+
// which caused deadlocks/hangs in Visual Studio. Without completing the request's
492+
// logging lifecycle, VS would never receive the completion callback and would wait
493+
// indefinitely, freezing the UI.
494+
_hitNoLoggingCompleted = true;
495+
_noLoggingCompletedSubmissionDetails = $"SubmissionId: {unscheduledRequest.BuildRequest.SubmissionId}; BuildRequestDataFlags: {unscheduledRequest.BuildRequest.BuildRequestDataFlags}";
466496
}
467-
468-
responses.Add(response);
469-
470-
// Mark the request as complete (and the parent is no longer blocked by this request.)
471-
unscheduledRequest.Complete(newResult);
472497
}
473498
}
474499
}

0 commit comments

Comments
 (0)