Skip to content

Impoving container initialize logic #5280

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/Agent.Sdk/Knob/AgentKnobs.cs
Original file line number Diff line number Diff line change
Expand Up @@ -798,7 +798,7 @@ public class AgentKnobs
new RuntimeKnobSource("ADD_FORCE_CREDENTIALS_TO_GIT_CHECKOUT"),
new PipelineFeatureSource(nameof(AddForceCredentialsToGitCheckout)),
new BuiltInDefaultKnobSource("false"));

public static readonly Knob InstallLegacyTfExe = new Knob(
nameof(InstallLegacyTfExe),
"If true, the agent will install the legacy versions of TF, vstsom and vstshost",
Expand Down Expand Up @@ -827,5 +827,12 @@ public class AgentKnobs
new EnvironmentKnobSource("AGENT_CDN_CONNECTIVITY_FAIL_WARNING"),
new PipelineFeatureSource("AgentCDNConnectivityFailWarning"),
new BuiltInDefaultKnobSource("false"));

public static readonly Knob CheckBeforeRetryDockerStart = new Knob(
nameof(CheckBeforeRetryDockerStart),
"If true, the agent will check if container is running before retrying a Docker start command.",
new PipelineFeatureSource("CheckBeforeRetryDockerStart"),
new EnvironmentKnobSource("AGENT_CHECK_BEFORE_RETRY_DOCKER_START"),
new BuiltInDefaultKnobSource("false"));
}
}
72 changes: 66 additions & 6 deletions src/Agent.Worker/Container/DockerCommandManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -209,9 +209,14 @@ public async Task<int> DockerStart(IExecutionContext context, string containerId
ArgUtil.NotNull(context, nameof(context));
ArgUtil.NotNull(containerId, nameof(containerId));

var action = new Func<Task<int>>(async () => await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken));
const string command = "Docker start";
return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
if (!AgentKnobs.CheckBeforeRetryDockerStart.GetValue(context).AsBoolean())
{
var action = new Func<Task<int>>(async () => await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken));
const string command = "Docker start";
return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
}
// Use the new helper for start with retries and running-state checks
return await ExecuteDockerStartWithRetriesAndCheck(context, containerId);
}

public async Task<int> DockerRemove(IExecutionContext context, string containerId)
Expand Down Expand Up @@ -374,7 +379,7 @@ public async Task<List<PortMapping>> DockerPort(IExecutionContext context, strin
/// <returns
/// <c>true</c>, if specified container is running, <c>false</c> otherwise.
/// </returns>
public async Task<bool> IsContainerRunning(IExecutionContext context, string containerId)
public virtual async Task<bool> IsContainerRunning(IExecutionContext context, string containerId)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any specific reason why we have made this function virtual?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Made the function virtual to override it in the unit test class DockerCommandManagerL0Tests.cs

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of overriding a function in unit tests, a better practice is to mock the function in test with given input/output.

{
List<string> filteredItems = await DockerPS(context, $"--filter id={containerId}");

Expand All @@ -386,8 +391,8 @@ public async Task<bool> IsContainerRunning(IExecutionContext context, string con

return isContainerRunning;
}

private Task<int> ExecuteDockerCommandAsync(IExecutionContext context, string command, string options, CancellationToken cancellationToken = default(CancellationToken))
// making it protected for unit testing
protected virtual Task<int> ExecuteDockerCommandAsync(IExecutionContext context, string command, string options, CancellationToken cancellationToken = default(CancellationToken))
{
return ExecuteDockerCommandAsync(context, command, options, null, cancellationToken);
}
Expand Down Expand Up @@ -533,5 +538,60 @@ private static async Task<List<string>> ExecuteDockerCommandAsyncWithRetries(IEx

return output;
}

/// <summary>
/// Executes 'docker start' with retries, checking if the container is already running before each retry.
/// Returns 0 if the container is running or started successfully, otherwise returns the last exit code.
/// </summary>
protected virtual async Task<int> ExecuteDockerStartWithRetriesAndCheck(IExecutionContext context, string containerId)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any specific reason why we have made this function virtual?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same reason as above

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of overriding a function in unit tests, a better practice is to mock the function in test with given input/output.

{
bool dockerActionRetries = AgentKnobs.DockerActionRetries.GetValue(context).AsBoolean();
context.Output($"DockerActionRetries variable value: {dockerActionRetries}");
Copy link
Contributor

@sanjuyadav24 sanjuyadav24 Aug 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need this log?


int retryCount = 0;
const int maxRetries = 3;
TimeSpan delayInSeconds = TimeSpan.FromSeconds(10);
int exitCode = 0;

while (retryCount < maxRetries)
{
// Check if container is already running before attempting to start
if (await IsContainerRunning(context, containerId))
{
context.Output($"Container {containerId} is running before attempt {retryCount + 1}.");
break;
}

exitCode = await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken);
if (exitCode == 0 || !dockerActionRetries)
{
break;
}

context.Warning($"Docker start failed with exit code {exitCode}, back off {delayInSeconds} seconds before retry.");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this logging required? anyhow logs are below when docker is already running vs started

retryCount++;
await Task.Delay(delayInSeconds);

}

// handle the case where container is already running after retries but exit code is not 0
if (exitCode != 0 && await IsContainerRunning(context, containerId))
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you please check simplifying this if/else as if container is already running/started do we need to check exitcode?

context.Output($"Container {containerId} is already running after {retryCount} retries. but exit code was {exitCode}.");
exitCode = 0; // Indicate success
}
// If the container is still not running after retries, log a warning
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Warning inside this block already shows container not running after retires exhausted. No need for comment here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should log a warning that the container didn't start after all the retries, and the last exit code.
I agree that we are logging a warning for every failed attempt, but doesn't tell the overall outcome rather step-wise outcome.

if (exitCode != 0)
{
context.Warning($"Container {containerId} is not running after {retryCount} retries. Last exit code: {exitCode}");
}
else
{
context.Output($"Container {containerId} started successfully after {retryCount} retries.");
}
//return the exit code
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this comment necessary. This seems redundant so can be removed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It marks the end of the function Docker Start, with the exit code.

context.Debug($"Docker start completed with exit code {exitCode}.");
return exitCode;
}
}
}
125 changes: 125 additions & 0 deletions src/Test/L0/Worker/Container/DockerCommandManagerL0Tests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.VisualStudio.Services.Agent.Worker;
using Xunit;
using Moq;
using Agent.Sdk;
using Microsoft.VisualStudio.Services.Agent.Worker.Container;

namespace Microsoft.VisualStudio.Services.Agent.Tests.Worker.Container
{
public class DockerCommandManagerL0Tests
{
private Mock<IExecutionContext> _ec;

[Fact]
[Trait("Level", "L0")]
[Trait("Category", "worker.container")]
public async Task ReturnsZeroIfContainerAlreadyRunningBeforeStart()
{
_ec = new Mock<IExecutionContext>();
SetupEnvironmentVariables("true");
var manager = new TestableDockerCommandManager(isRunningOnFirstCheck: true);
int result = await manager.ExecuteDockerStartWithRetriesAndCheckPublic(_ec.Object, "cid");
Assert.Equal(0, result);
Assert.Equal(1, manager.IsContainerRunningCallCount);
}

[Fact]
[Trait("Level", "L0")]
[Trait("Category", "worker.container")]
public async Task ReturnsZeroIfStartSucceedsFirstTry()
{
_ec = new Mock<IExecutionContext>();
SetupEnvironmentVariables("true");
var manager = new TestableDockerCommandManager(exitCodes: new[] { 0 }, runningOnRetry: new[] { false, true });
int result = await manager.ExecuteDockerStartWithRetriesAndCheckPublic(_ec.Object, "cid");
Assert.Equal(0, result);
Assert.Equal(1, manager.IsContainerRunningCallCount);
}

[Fact]
[Trait("Level", "L0")]
[Trait("Category", "worker.container")]
public async Task ReturnsZeroIfContainerStartsOnThirdRetry()
{
_ec = new Mock<IExecutionContext>();
SetupEnvironmentVariables("true");
var manager = new TestableDockerCommandManager(exitCodes: new[] { 1, 1, 0 }, runningOnRetry: new[] { false, false, false, true });
int result = await manager.ExecuteDockerStartWithRetriesAndCheckPublic(_ec.Object, "cid");
Assert.Equal(0, result);
Assert.Equal(3, manager.IsContainerRunningCallCount);
}

[Fact]
[Trait("Level", "L0")]
[Trait("Category", "worker.container")]
public async Task ReturnsExitCodeIfContainerNeverStarts()
{
_ec = new Mock<IExecutionContext>();
SetupEnvironmentVariables("true");
var manager = new TestableDockerCommandManager(exitCodes: new[] { 1, 2, 3 }, runningOnRetry: new[] { false, false, false, false });
int result = await manager.ExecuteDockerStartWithRetriesAndCheckPublic(_ec.Object, "cid");
Assert.Equal(3, result);
Assert.Equal(4, manager.IsContainerRunningCallCount);
}

[Fact]
[Trait("Level", "L0")]
[Trait("Category", "worker.container")]
public async Task ReturnsZeroIfContainerStartsButExitCodeNotZero()
{
_ec = new Mock<IExecutionContext>();
SetupEnvironmentVariables("true");
// exitCode is 1, but container is running after
var manager = new TestableDockerCommandManager(exitCodes: new[] { 1 }, runningOnRetry: new[] { false, true });
int result = await manager.ExecuteDockerStartWithRetriesAndCheckPublic(_ec.Object, "cid");
Assert.Equal(0, result);
Assert.Equal(3, manager.IsContainerRunningCallCount);
}

private class TestableDockerCommandManager : DockerCommandManager
{
private readonly int[] _exitCodes;
private readonly bool[] _runningOnRetry;
private int _startCallCount = 0;
private int _runningCallCount = 0;
public int IsContainerRunningCallCount => _runningCallCount;

public TestableDockerCommandManager(bool isRunningOnFirstCheck = false)
{
_exitCodes = new[] { 1 };
_runningOnRetry = new[] { isRunningOnFirstCheck };
}
public TestableDockerCommandManager(int[] exitCodes, bool[] runningOnRetry)
{
_exitCodes = exitCodes;
_runningOnRetry = runningOnRetry;
}
public Task<int> ExecuteDockerStartWithRetriesAndCheckPublic(IExecutionContext context, string containerId)
{
return base.ExecuteDockerStartWithRetriesAndCheck(context, containerId);
}
protected override Task<int> ExecuteDockerCommandAsync(IExecutionContext context, string command, string options, CancellationToken cancellationToken = default)
{
int code = _exitCodes[Math.Min(_startCallCount, _exitCodes.Length - 1)];
_startCallCount++;
return Task.FromResult(code);
}
public override Task<bool> IsContainerRunning(IExecutionContext context, string containerId)
{
bool running = _runningOnRetry[Math.Min(_runningCallCount, _runningOnRetry.Length - 1)];
_runningCallCount++;
return Task.FromResult(running);
}
}

private void SetupEnvironmentVariables(string allowDockerActionRetries)
{
var environment = new SystemEnvironment();
environment.SetEnvironmentVariable("VSTSAGENT_DOCKER_ACTION_RETRIES", allowDockerActionRetries);
_ec.Setup(x => x.GetScopedEnvironment()).Returns(environment);
}
}
}
Loading