Skip to content

Commit 1514129

Browse files
authored
Add timeout when Read&Write Partition Table (#1200)
* initial commit * make this timeout configurable
1 parent 068d988 commit 1514129

File tree

4 files changed

+29
-4
lines changed

4 files changed

+29
-4
lines changed

src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
<!-- Version Info -->
1212
<PropertyGroup>
1313
<MajorVersion>0</MajorVersion>
14-
<MinorVersion>2</MinorVersion>
14+
<MinorVersion>3</MinorVersion>
1515
<PatchVersion>0</PatchVersion>
1616
<VersionPrefix>$(MajorVersion).$(MinorVersion).$(PatchVersion)</VersionPrefix>
1717
<FileVersion>$(VersionPrefix).0</FileVersion>

src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,14 @@ public class AzureStorageOrchestrationServiceSettings
170170
/// </remarks>
171171
public bool AllowReplayingTerminalInstances { get; set; } = false;
172172

173+
/// <summary>
174+
/// Specifies the timeout for read and write operations on the partition table in partition manager V3 (table partition manager).
175+
/// This helps detect and recover from potential silent hangs caused by the Azure Storage client's internal retries.
176+
/// If the operation exceeds the timeout, a PartitionManagerWarning is logged and the operation is retried.
177+
/// The default timeout is 2 seconds.
178+
/// </summary>
179+
public TimeSpan PartitionTableOperationTimeout { get; set; } = TimeSpan.FromSeconds(2);
180+
173181
/// <summary>
174182
/// If UseAppLease is true, gets or sets the AppLeaseOptions used for acquiring the lease to start the application.
175183
/// </summary>

src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
<!-- Version Info -->
2222
<PropertyGroup>
2323
<MajorVersion>2</MajorVersion>
24-
<MinorVersion>0</MinorVersion>
25-
<PatchVersion>2</PatchVersion>
24+
<MinorVersion>1</MinorVersion>
25+
<PatchVersion>0</PatchVersion>
2626
<VersionPrefix>$(MajorVersion).$(MinorVersion).$(PatchVersion)</VersionPrefix>
2727
<FileVersion>$(VersionPrefix).0</FileVersion>
2828
<!-- FileVersionRevision is expected to be set by the CI. This is useful for distinguishing between multiple builds of the same version. -->

src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,10 @@ async Task PartitionManagerLoop(CancellationToken gracefulShutdownToken, Cancell
118118

119119
try
120120
{
121-
ReadTableReponse response = await this.tableLeaseManager.ReadAndWriteTableAsync(isShuttingDown, forcefulShutdownToken);
121+
using var timeoutCts = new CancellationTokenSource(this.settings.PartitionTableOperationTimeout);
122+
using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(forcefulShutdownToken, timeoutCts.Token);
123+
124+
ReadTableReponse response = await this.tableLeaseManager.ReadAndWriteTableAsync(isShuttingDown, linkedCts.Token);
122125

123126
// If shutdown is requested and already released all ownership leases, then break the loop.
124127
if (isShuttingDown && response.ReleasedAllLeases)
@@ -147,6 +150,20 @@ async Task PartitionManagerLoop(CancellationToken gracefulShutdownToken, Cancell
147150
{
148151
consecutiveFailureCount++;
149152
}
153+
// ReadAndWriteTableAsync exceeded the configured timeout (PartitionTableOperationTimeout).
154+
// This may indicate a transient storage or network issue.
155+
// The operation will be retried immediately unless it fails more than 10 consecutive times.
156+
catch (OperationCanceledException) when (!forcefulShutdownToken.IsCancellationRequested)
157+
{
158+
this.settings.Logger.PartitionManagerWarning(
159+
this.storageAccountName,
160+
this.settings.TaskHubName,
161+
this.settings.WorkerId,
162+
partitionId: NotApplicable,
163+
details: "Operation to read and write the partition table exceeded the 2-second timeout.");
164+
165+
consecutiveFailureCount++;
166+
}
150167
// Eat any unexpected exceptions.
151168
catch (Exception exception)
152169
{

0 commit comments

Comments
 (0)