Skip to content

Commit 874ee95

Browse files
authored
.Net Agents - Fix polling cycle to properly evaluate failure mode on exception (#9581)
### Motivation and Context <!-- Thank you for your contribution to the semantic-kernel repo! Please help reviewers and future users, providing the following information: 1. Why is this change required? 2. What problem does it solve? 3. What scenario does it contribute to? 4. If it fixes an open issue, please link to the issue here. --> Assistant polling was not properly responding to terminal error conditions. Fixes: #9579 ### Description <!-- Describe your changes, the overall approach, the underlying design. These notes will help understanding how your code works. Thanks! --> - Added explicit check for cancelling state: Invoke, InvokeStreaming, & Polling loops - Discriminate on different exception modes when polling - Ensure stale run state isn't evaluated in polling loop - Remove initial polling delay ### Contribution Checklist <!-- Before submitting this PR, please make sure: --> - [X] The code builds clean without any errors or warnings - [X] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [X] All unit tests pass, and I have added new tests where possible - [X] I didn't break anyone 😄
1 parent 6b20b98 commit 874ee95

File tree

2 files changed

+56
-7
lines changed

2 files changed

+56
-7
lines changed

dotnet/src/Agents/OpenAI/Internal/AssistantThreadActions.cs

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// Copyright (c) Microsoft. All rights reserved.
22
using System;
3+
using System.ClientModel;
34
using System.Collections.Generic;
45
using System.Linq;
56
using System.Net;
@@ -179,9 +180,11 @@ public static async IAsyncEnumerable<ChatMessageContent> GetMessagesAsync(Assist
179180
// Evaluate status and process steps and messages, as encountered.
180181
HashSet<string> processedStepIds = [];
181182
Dictionary<string, FunctionResultContent> functionSteps = [];
182-
183183
do
184184
{
185+
// Check for cancellation
186+
cancellationToken.ThrowIfCancellationRequested();
187+
185188
// Poll run and steps until actionable
186189
await PollRunStatusAsync().ConfigureAwait(false);
187190

@@ -301,20 +304,49 @@ async Task PollRunStatusAsync()
301304

302305
do
303306
{
304-
// Reduce polling frequency after a couple attempts
305-
await Task.Delay(agent.PollingOptions.GetPollingInterval(count), cancellationToken).ConfigureAwait(false);
307+
cancellationToken.ThrowIfCancellationRequested();
308+
309+
if (count > 0)
310+
{
311+
// Reduce polling frequency after a couple attempts
312+
await Task.Delay(agent.PollingOptions.GetPollingInterval(count), cancellationToken).ConfigureAwait(false);
313+
}
314+
306315
++count;
307316

308-
#pragma warning disable CA1031 // Do not catch general exception types
309317
try
310318
{
311319
run = await client.GetRunAsync(threadId, run.Id, cancellationToken).ConfigureAwait(false);
312320
}
313-
catch
321+
// A positive `Status` code means the server responded with an error: that case is not caught here and always fails. Only exceptions without a status code (`Status <= 0`) are considered for retry.
322+
catch (ClientResultException clientException) when (clientException.Status <= 0)
323+
{
324+
// Check maximum retry count
325+
if (count >= agent.PollingOptions.MaximumRetryCount)
326+
{
327+
throw;
328+
}
329+
330+
// Retry for potential transient failure
331+
continue;
332+
}
333+
catch (AggregateException aggregateException) when (aggregateException.InnerException is ClientResultException innerClientException)
314334
{
315-
// Retry anyway..
335+
// The presence of a `Status` code means the server responded with error
336+
if (innerClientException.Status > 0)
337+
{
338+
throw;
339+
}
340+
341+
// Check maximum retry count
342+
if (count >= agent.PollingOptions.MaximumRetryCount)
343+
{
344+
throw;
345+
}
346+
347+
// Retry for potential transient failure
348+
continue;
316349
}
317-
#pragma warning restore CA1031 // Do not catch general exception types
318350
}
319351
while (s_pollingStatuses.Contains(run.Status));
320352

@@ -373,6 +405,9 @@ public static async IAsyncEnumerable<StreamingChatMessageContent> InvokeStreamin
373405
IAsyncEnumerable<StreamingUpdate> asyncUpdates = client.CreateRunStreamingAsync(threadId, agent.Id, options, cancellationToken);
374406
do
375407
{
408+
// Check for cancellation
409+
cancellationToken.ThrowIfCancellationRequested();
410+
376411
stepsToProcess.Clear();
377412

378413
await foreach (StreamingUpdate update in asyncUpdates.ConfigureAwait(false))

dotnet/src/Agents/OpenAI/RunPollingOptions.cs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ namespace Microsoft.SemanticKernel.Agents.OpenAI;
88
/// </summary>
99
public sealed class RunPollingOptions
1010
{
11+
/// <summary>
12+
/// The default maximum number of retries when monitoring thread-run status.
13+
/// </summary>
14+
public static int DefaultMaximumRetryCount { get; } = 3;
15+
1116
/// <summary>
1217
/// The default polling interval when monitoring thread-run status.
1318
/// </summary>
@@ -28,6 +33,15 @@ public sealed class RunPollingOptions
2833
/// </summary>
2934
public static TimeSpan DefaultMessageSynchronizationDelay { get; } = TimeSpan.FromMilliseconds(500);
3035

36+
/// <summary>
37+
/// The maximum retry count when polling thread-run status.
38+
/// </summary>
39+
/// <remarks>
40+
/// Only affects failures that have the potential to be transient. Explicit server error responses
41+
/// will result in immediate failure.
42+
/// </remarks>
43+
public int MaximumRetryCount { get; set; } = DefaultMaximumRetryCount;
44+
3145
/// <summary>
3246
/// The polling interval when monitoring thread-run status.
3347
/// </summary>

0 commit comments

Comments
 (0)