diff --git a/src/Agent.Sdk/Knob/AgentKnobs.cs b/src/Agent.Sdk/Knob/AgentKnobs.cs
index bc8e863b96..a555dcad02 100644
--- a/src/Agent.Sdk/Knob/AgentKnobs.cs
+++ b/src/Agent.Sdk/Knob/AgentKnobs.cs
@@ -798,7 +798,7 @@ public class AgentKnobs
             new RuntimeKnobSource("ADD_FORCE_CREDENTIALS_TO_GIT_CHECKOUT"),
             new PipelineFeatureSource(nameof(AddForceCredentialsToGitCheckout)),
             new BuiltInDefaultKnobSource("false"));
-        
+
         public static readonly Knob InstallLegacyTfExe = new Knob(
             nameof(InstallLegacyTfExe),
             "If true, the agent will install the legacy versions of TF, vstsom and vstshost",
@@ -827,5 +827,12 @@ public class AgentKnobs
             new EnvironmentKnobSource("AGENT_CDN_CONNECTIVITY_FAIL_WARNING"),
             new PipelineFeatureSource("AgentCDNConnectivityFailWarning"),
             new BuiltInDefaultKnobSource("false"));
+
+        public static readonly Knob CheckBeforeRetryDockerStart = new Knob(
+            nameof(CheckBeforeRetryDockerStart),
+            "If true, the agent will check if container is running before retrying a Docker start command.",
+            new PipelineFeatureSource("CheckBeforeRetryDockerStart"),
+            new EnvironmentKnobSource("AGENT_CHECK_BEFORE_RETRY_DOCKER_START"),
+            new BuiltInDefaultKnobSource("false"));
     }
 }
diff --git a/src/Agent.Worker/Container/DockerCommandManager.cs b/src/Agent.Worker/Container/DockerCommandManager.cs
index 875ece569a..2095702832 100644
--- a/src/Agent.Worker/Container/DockerCommandManager.cs
+++ b/src/Agent.Worker/Container/DockerCommandManager.cs
@@ -209,9 +209,14 @@ public async Task<int> DockerStart(IExecutionContext context, string containerId
             ArgUtil.NotNull(context, nameof(context));
             ArgUtil.NotNull(containerId, nameof(containerId));
 
-            var action = new Func<Task<int>>(async () => await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken));
-            const string command = "Docker start";
-            return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
+            if (!AgentKnobs.CheckBeforeRetryDockerStart.GetValue(context).AsBoolean())
+            {
+                var action = new Func<Task<int>>(async () => await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken));
+                const string command = "Docker start";
+                return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
+            }
+            // Use the new helper for start with retries and running-state checks
+            return await ExecuteDockerStartWithRetriesAndCheck(context, containerId);
         }
 
         public async Task DockerRemove(IExecutionContext context, string containerId)
@@ -374,7 +379,7 @@
         /// <returns
         /// <c>true</c>, if specified container is running, <c>false</c> otherwise.
         /// </returns>
-        public async Task<bool> IsContainerRunning(IExecutionContext context, string containerId)
+        public virtual async Task<bool> IsContainerRunning(IExecutionContext context, string containerId)
         {
             List filteredItems = await DockerPS(context, $"--filter id={containerId}");
 
@@ -386,8 +391,9 @@ public async Task<bool> IsContainerRunning(IExecutionContext context, string con
 
             return isContainerRunning;
         }
 
-        private Task<int> ExecuteDockerCommandAsync(IExecutionContext context, string command, string options, CancellationToken cancellationToken = default(CancellationToken))
+        // Protected and virtual so that unit tests can substitute the docker invocation.
+        protected virtual Task<int> ExecuteDockerCommandAsync(IExecutionContext context, string command, string options, CancellationToken cancellationToken = default(CancellationToken))
         {
             return ExecuteDockerCommandAsync(context, command, options, null, cancellationToken);
         }
@@ -533,5 +539,72 @@
 
             return output;
         }
+
+        /// <summary>
+        /// Executes 'docker start' with retries, checking whether the container is already running before each attempt.
+        /// </summary>
+        /// <param name="context">Execution context used for knob lookup, logging and cancellation.</param>
+        /// <param name="containerId">Id of the container to start.</param>
+        /// <returns>0 if the container is running or started successfully, otherwise the last 'docker start' exit code.</returns>
+        protected virtual async Task<int> ExecuteDockerStartWithRetriesAndCheck(IExecutionContext context, string containerId)
+        {
+            bool dockerActionRetries = AgentKnobs.DockerActionRetries.GetValue(context).AsBoolean();
+            context.Output($"DockerActionRetries variable value: {dockerActionRetries}");
+
+            int retryCount = 0;
+            const int maxRetries = 3;
+            TimeSpan retryDelay = TimeSpan.FromSeconds(10);
+            int exitCode = 0;
+
+            while (retryCount < maxRetries)
+            {
+                // Check if container is already running before attempting to start
+                if (await IsContainerRunning(context, containerId))
+                {
+                    context.Output($"Container {containerId} is running before attempt {retryCount + 1}.");
+                    break;
+                }
+
+                exitCode = await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken);
+                if (exitCode == 0 || !dockerActionRetries)
+                {
+                    break;
+                }
+
+                context.Warning($"Docker start failed with exit code {exitCode}, back off {retryDelay.TotalSeconds} seconds before retry.");
+                retryCount++;
+                // Pass the job's token so that a cancellation interrupts the back-off instead of sleeping through it.
+                await DelayBeforeRetry(retryDelay, context.CancellationToken);
+            }
+
+            // 'docker start' can report a failure even though the container came up; trust the running state.
+            if (exitCode != 0 && await IsContainerRunning(context, containerId))
+            {
+                context.Output($"Container {containerId} is already running after {retryCount} retries. but exit code was {exitCode}.");
+                exitCode = 0; // Indicate success
+            }
+
+            if (exitCode != 0)
+            {
+                context.Warning($"Container {containerId} is not running after {retryCount} retries. Last exit code: {exitCode}");
+            }
+            else
+            {
+                context.Output($"Container {containerId} started successfully after {retryCount} retries.");
+            }
+
+            context.Debug($"Docker start completed with exit code {exitCode}.");
+            return exitCode;
+        }
+
+        /// <summary>
+        /// Waits between 'docker start' attempts; virtual so that unit tests can skip the real back-off.
+        /// </summary>
+        /// <param name="delay">How long to wait before the next attempt.</param>
+        /// <param name="cancellationToken">Token that aborts the wait when cancellation is requested.</param>
+        protected virtual Task DelayBeforeRetry(TimeSpan delay, CancellationToken cancellationToken)
+        {
+            return Task.Delay(delay, cancellationToken);
+        }
     }
 }
diff --git a/src/Test/L0/Worker/Container/DockerCommandManagerL0Tests.cs b/src/Test/L0/Worker/Container/DockerCommandManagerL0Tests.cs
new file mode 100644
index 0000000000..5a3b5353e5
--- /dev/null
+++ b/src/Test/L0/Worker/Container/DockerCommandManagerL0Tests.cs
@@ -0,0 +1,135 @@
+using System;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.VisualStudio.Services.Agent.Worker;
+using Xunit;
+using Moq;
+using Agent.Sdk;
+using Microsoft.VisualStudio.Services.Agent.Worker.Container;
+
+namespace Microsoft.VisualStudio.Services.Agent.Tests.Worker.Container
+{
+    public class DockerCommandManagerL0Tests
+    {
+        private Mock<IExecutionContext> _ec;
+
+        [Fact]
+        [Trait("Level", "L0")]
+        [Trait("Category", "worker.container")]
+        public async Task ReturnsZeroIfContainerAlreadyRunningBeforeStart()
+        {
+            _ec = new Mock<IExecutionContext>();
+            SetupEnvironmentVariables("true");
+            var manager = new TestableDockerCommandManager(isRunningOnFirstCheck: true);
+            int result = await manager.ExecuteDockerStartWithRetriesAndCheckPublic(_ec.Object, "cid");
+            Assert.Equal(0, result);
+            Assert.Equal(1, manager.IsContainerRunningCallCount);
+        }
+
+        [Fact]
+        [Trait("Level", "L0")]
+        [Trait("Category", "worker.container")]
+        public async Task ReturnsZeroIfStartSucceedsFirstTry()
+        {
+            _ec = new Mock<IExecutionContext>();
+            SetupEnvironmentVariables("true");
+            var manager = new TestableDockerCommandManager(exitCodes: new[] { 0 }, runningOnRetry: new[] { false, true });
+            int result = await manager.ExecuteDockerStartWithRetriesAndCheckPublic(_ec.Object, "cid");
+            Assert.Equal(0, result);
+            Assert.Equal(1, manager.IsContainerRunningCallCount);
+        }
+
+        [Fact]
+        [Trait("Level", "L0")]
+        [Trait("Category", "worker.container")]
+        public async Task ReturnsZeroIfContainerStartsOnThirdRetry()
+        {
+            _ec = new Mock<IExecutionContext>();
+            SetupEnvironmentVariables("true");
+            var manager = new TestableDockerCommandManager(exitCodes: new[] { 1, 1, 0 }, runningOnRetry: new[] { false, false, false, true });
+            int result = await manager.ExecuteDockerStartWithRetriesAndCheckPublic(_ec.Object, "cid");
+            Assert.Equal(0, result);
+            Assert.Equal(3, manager.IsContainerRunningCallCount);
+        }
+
+        [Fact]
+        [Trait("Level", "L0")]
+        [Trait("Category", "worker.container")]
+        public async Task ReturnsExitCodeIfContainerNeverStarts()
+        {
+            _ec = new Mock<IExecutionContext>();
+            SetupEnvironmentVariables("true");
+            var manager = new TestableDockerCommandManager(exitCodes: new[] { 1, 2, 3 }, runningOnRetry: new[] { false, false, false, false });
+            int result = await manager.ExecuteDockerStartWithRetriesAndCheckPublic(_ec.Object, "cid");
+            Assert.Equal(3, result);
+            Assert.Equal(4, manager.IsContainerRunningCallCount);
+        }
+
+        [Fact]
+        [Trait("Level", "L0")]
+        [Trait("Category", "worker.container")]
+        public async Task ReturnsZeroIfContainerStartsButExitCodeNotZero()
+        {
+            _ec = new Mock<IExecutionContext>();
+            SetupEnvironmentVariables("true");
+            // exitCode is 1, but container is running after
+            var manager = new TestableDockerCommandManager(exitCodes: new[] { 1 }, runningOnRetry: new[] { false, true });
+            int result = await manager.ExecuteDockerStartWithRetriesAndCheckPublic(_ec.Object, "cid");
+            Assert.Equal(0, result);
+            Assert.Equal(3, manager.IsContainerRunningCallCount);
+        }
+
+        private class TestableDockerCommandManager : DockerCommandManager
+        {
+            private readonly int[] _exitCodes;
+            private readonly bool[] _runningOnRetry;
+            private int _startCallCount = 0;
+            private int _runningCallCount = 0;
+            public int IsContainerRunningCallCount => _runningCallCount;
+
+            public TestableDockerCommandManager(bool isRunningOnFirstCheck = false)
+            {
+                _exitCodes = new[] { 1 };
+                _runningOnRetry = new[] { isRunningOnFirstCheck };
+            }
+
+            public TestableDockerCommandManager(int[] exitCodes, bool[] runningOnRetry)
+            {
+                _exitCodes = exitCodes;
+                _runningOnRetry = runningOnRetry;
+            }
+
+            public Task<int> ExecuteDockerStartWithRetriesAndCheckPublic(IExecutionContext context, string containerId)
+            {
+                return base.ExecuteDockerStartWithRetriesAndCheck(context, containerId);
+            }
+
+            protected override Task<int> ExecuteDockerCommandAsync(IExecutionContext context, string command, string options, CancellationToken cancellationToken = default)
+            {
+                int code = _exitCodes[Math.Min(_startCallCount, _exitCodes.Length - 1)];
+                _startCallCount++;
+                return Task.FromResult(code);
+            }
+
+            public override Task<bool> IsContainerRunning(IExecutionContext context, string containerId)
+            {
+                bool running = _runningOnRetry[Math.Min(_runningCallCount, _runningOnRetry.Length - 1)];
+                _runningCallCount++;
+                return Task.FromResult(running);
+            }
+
+            // Skip the real back-off so the retry scenarios do not sleep for tens of seconds.
+            protected override Task DelayBeforeRetry(TimeSpan delay, CancellationToken cancellationToken)
+            {
+                return Task.CompletedTask;
+            }
+        }
+
+        private void SetupEnvironmentVariables(string allowDockerActionRetries)
+        {
+            var environment = new SystemEnvironment();
+            environment.SetEnvironmentVariable("VSTSAGENT_DOCKER_ACTION_RETRIES", allowDockerActionRetries);
+            _ec.Setup(x => x.GetScopedEnvironment()).Returns(environment);
+        }
+    }
+}