Skip to content

Commit cdc67e3

Browse files
authored
Improving container initialize logic (#5280)
* Improving container initialise logic and adding unit tests * Adding a FF * Removing the unit tests for a later PR * Reverting functions to private * Removing comment
1 parent 567d6f9 commit cdc67e3

File tree

2 files changed

+71
-4
lines changed

2 files changed

+71
-4
lines changed

src/Agent.Sdk/Knob/AgentKnobs.cs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -799,7 +799,7 @@ public class AgentKnobs
799799
new RuntimeKnobSource("ADD_FORCE_CREDENTIALS_TO_GIT_CHECKOUT"),
800800
new PipelineFeatureSource(nameof(AddForceCredentialsToGitCheckout)),
801801
new BuiltInDefaultKnobSource("false"));
802-
802+
803803
public static readonly Knob InstallLegacyTfExe = new Knob(
804804
nameof(InstallLegacyTfExe),
805805
"If true, the agent will install the legacy versions of TF, vstsom and vstshost",
@@ -828,5 +828,12 @@ public class AgentKnobs
828828
new EnvironmentKnobSource("AGENT_CDN_CONNECTIVITY_FAIL_WARNING"),
829829
new PipelineFeatureSource("AgentCDNConnectivityFailWarning"),
830830
new BuiltInDefaultKnobSource("false"));
831+
832+
// Feature flag consulted by DockerCommandManager.DockerStart: when it evaluates to true,
// 'docker start' goes through the retry helper that first checks whether the container is
// already running; when false, the previous retry path is used.
// Sources listed: pipeline feature "CheckBeforeRetryDockerStart", environment variable
// AGENT_CHECK_BEFORE_RETRY_DOCKER_START, built-in default "false".
// NOTE(review): presumably sources are evaluated in the order listed, so the pipeline feature
// would take precedence over the environment variable - confirm against Knob resolution.
public static readonly Knob CheckBeforeRetryDockerStart = new Knob(
833+
nameof(CheckBeforeRetryDockerStart),
834+
"If true, the agent will check if container is running before retrying a Docker start command.",
835+
new PipelineFeatureSource("CheckBeforeRetryDockerStart"),
836+
new EnvironmentKnobSource("AGENT_CHECK_BEFORE_RETRY_DOCKER_START"),
837+
new BuiltInDefaultKnobSource("false"));
831838
}
832839
}

src/Agent.Worker/Container/DockerCommandManager.cs

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,9 +209,14 @@ public async Task<int> DockerStart(IExecutionContext context, string containerId
209209
ArgUtil.NotNull(context, nameof(context));
210210
ArgUtil.NotNull(containerId, nameof(containerId));
211211

212-
var action = new Func<Task<int>>(async () => await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken));
213-
const string command = "Docker start";
214-
return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
212+
if (!AgentKnobs.CheckBeforeRetryDockerStart.GetValue(context).AsBoolean())
213+
{
214+
var action = new Func<Task<int>>(async () => await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken));
215+
const string command = "Docker start";
216+
return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
217+
}
218+
// Use the new helper for start with retries and running-state checks
219+
return await ExecuteDockerStartWithRetriesAndCheck(context, containerId);
215220
}
216221

217222
public async Task<int> DockerRemove(IExecutionContext context, string containerId)
@@ -533,5 +538,60 @@ private static async Task<List<string>> ExecuteDockerCommandAsyncWithRetries(IEx
533538

534539
return output;
535540
}
541+
542+
/// <summary>
/// Executes 'docker start' with retries, checking whether the container is already running
/// before each attempt and once more after the final failed attempt.
/// </summary>
/// <param name="context">Execution context used for knob lookup, logging and cancellation.</param>
/// <param name="containerId">Id of the container to start.</param>
/// <returns>
/// 0 if the container is running or was started successfully; otherwise the exit code of the
/// last 'docker start' attempt.
/// </returns>
private async Task<int> ExecuteDockerStartWithRetriesAndCheck(IExecutionContext context, string containerId)
{
    const int maxRetries = 3;
    TimeSpan retryDelay = TimeSpan.FromSeconds(10);

    bool dockerActionRetries = AgentKnobs.DockerActionRetries.GetValue(context).AsBoolean();
    context.Output($"DockerActionRetries variable value: {dockerActionRetries}");

    int retryCount = 0;
    int exitCode = 0;
    bool alreadyRunning = false;

    while (retryCount < maxRetries)
    {
        // A previous 'docker start' may have reported failure even though the container came up,
        // so check the running state before issuing another start.
        if (await IsContainerRunning(context, containerId))
        {
            context.Output($"Container {containerId} is running before attempt {retryCount + 1}.");
            alreadyRunning = true;
            // The container is up: any non-zero code from an earlier attempt is no longer relevant.
            exitCode = 0;
            break;
        }

        exitCode = await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken);
        if (exitCode == 0 || !dockerActionRetries)
        {
            break;
        }

        context.Warning($"Docker start failed with exit code {exitCode}, back off {retryDelay.TotalSeconds} seconds before retry.");
        retryCount++;

        // Pass the job's cancellation token so a cancelled job does not sit out the full back-off.
        await Task.Delay(retryDelay, context.CancellationToken);
    }

    // The last attempt may have returned a non-zero exit code while the container is in fact running.
    if (exitCode != 0 && await IsContainerRunning(context, containerId))
    {
        context.Output($"Container {containerId} is already running after {retryCount} retries. but exit code was {exitCode}.");
        alreadyRunning = true;
        exitCode = 0; // Indicate success
    }

    if (exitCode != 0)
    {
        context.Warning($"Container {containerId} is not running after {retryCount} retries. Last exit code: {exitCode}");
    }
    else if (alreadyRunning)
    {
        // No successful 'docker start' was issued by this call; the container was found running.
        context.Output($"Container {containerId} is running after {retryCount} retries.");
    }
    else
    {
        context.Output($"Container {containerId} started successfully after {retryCount} retries.");
    }

    context.Debug($"Docker start completed with exit code {exitCode}.");
    return exitCode;
}
536596
}
537597
}

0 commit comments

Comments
 (0)