Commit 496f13f

Add retry logic for external link timeout exceptions (#80)

External links that time out due to slow response times are now retried using the existing retry configuration (--max-external-retries and --retry-delay-seconds). This prevents false positives for legitimate sites that are simply responding slowly.

- Catch TaskCanceledException and HttpRequestException with timeout indicators
- Schedule retries with jitter for timeout failures
- Maintain existing behavior for SSL errors and DNS failures (no retry)

1 parent f09f048 commit 496f13f
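
The diff below calls an AddJitter helper whose implementation is not part of this change. As a rough sketch only (the real helper in CrawlerActor.cs may differ, and this assumes .NET 6+ for Random.Shared), jitter is usually applied by randomizing the configured delay so that many links timing out together do not all retry at the same instant:

// Hypothetical sketch - AddJitter's actual body is not shown in this commit.
// Spreads each retry across roughly +/-20% of the configured base delay so
// that links which time out together do not all retry at the same instant.
private static TimeSpan AddJitter(TimeSpan baseDelay)
{
    var factor = 0.8 + Random.Shared.NextDouble() * 0.4; // uniform in [0.8, 1.2)
    return TimeSpan.FromMilliseconds(baseDelay.TotalMilliseconds * factor);
}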

File tree

1 file changed: +38 -1 lines changed

src/LinkValidator/Actors/CrawlerActor.cs

Lines changed: 38 additions & 1 deletion
@@ -186,9 +186,10 @@ private async Task<ICrawlResult> CrawlExternalPage(AbsoluteUri url, int retryCount)
         var scheduler = Context.System.Scheduler;
         var self = Self;
 
+        using var cts = new CancellationTokenSource(_crawlConfiguration.RequestTimeout);
+
         try
         {
-            using var cts = new CancellationTokenSource(_crawlConfiguration.RequestTimeout);
             var response = await _httpClient.GetAsync(url.Value, cts.Token);
 
             if (response.StatusCode == HttpStatusCode.TooManyRequests)
@@ -212,6 +213,42 @@ private async Task<ICrawlResult> CrawlExternalPage(AbsoluteUri url, int retryCount)
 
             return new ExternalLinkCrawled(url, response.StatusCode);
         }
+        catch (TaskCanceledException tcEx) when (tcEx.InnerException is TimeoutException || cts.Token.IsCancellationRequested)
+        {
+            // This is a timeout - retry if we haven't exceeded max retries
+            if (retryCount < _crawlConfiguration.MaxExternalRetries)
+            {
+                var jitteredDelay = AddJitter(_crawlConfiguration.DefaultExternalRetryDelay);
+                _log.Warning("Request timeout for {0} (retry {1}), scheduling retry in {2}",
+                    url, retryCount, jitteredDelay);
+
+                scheduler.ScheduleTellOnce(jitteredDelay, self, new RetryExternalRequest(url, retryCount + 1), self);
+
+                return new ExternalLinkRetryScheduled(url, HttpStatusCode.RequestTimeout);
+            }
+
+            _log.Warning("Max retries ({0}) exceeded for external URL {1} after timeout",
+                _crawlConfiguration.MaxExternalRetries, url);
+            return new ExternalLinkCrawled(url, HttpStatusCode.RequestTimeout);
+        }
+        catch (HttpRequestException httpEx) when (httpEx.InnerException is TaskCanceledException || httpEx.Message.Contains("timeout", StringComparison.OrdinalIgnoreCase))
+        {
+            // This is also a timeout - retry if we haven't exceeded max retries
+            if (retryCount < _crawlConfiguration.MaxExternalRetries)
+            {
+                var jitteredDelay = AddJitter(_crawlConfiguration.DefaultExternalRetryDelay);
+                _log.Warning("Request timeout for {0} (retry {1}), scheduling retry in {2}",
+                    url, retryCount, jitteredDelay);
+
+                scheduler.ScheduleTellOnce(jitteredDelay, self, new RetryExternalRequest(url, retryCount + 1), self);
+
+                return new ExternalLinkRetryScheduled(url, HttpStatusCode.RequestTimeout);
+            }
+
+            _log.Warning("Max retries ({0}) exceeded for external URL {1} after timeout",
+                _crawlConfiguration.MaxExternalRetries, url);
+            return new ExternalLinkCrawled(url, HttpStatusCode.RequestTimeout);
+        }
         catch (Exception ex)
         {
             _log.Warning(ex, "Failed to crawl {0}", url);
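
Note on the first hunk: the using var cts declaration moves above the try block because the new TaskCanceledException filter reads cts.Token; declared inside the try, the token would be out of scope in the catch clause. The filter's other arm, tcEx.InnerException is TimeoutException, matches how HttpClient reports its own built-in timeout on .NET 5 and later. A minimal standalone sketch of the catch/when pattern (illustrative URL and timeout, not from this repository):

using System;
using System.Net.Http;
using System.Threading;

// Declared before the try block so the exception filter below can read it.
using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(2));
using var http = new HttpClient();

try
{
    var response = await http.GetAsync("https://example.com/", cts.Token);
    Console.WriteLine($"Status: {(int)response.StatusCode}");
}
catch (TaskCanceledException) when (cts.Token.IsCancellationRequested)
{
    // Matches only when our own timeout token fired, not other cancellations;
    // this is the branch where CrawlerActor schedules a jittered retry.
    Console.WriteLine("Timed out - a retry would be scheduled here.");
}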

0 commit comments