Commit f4ac92c

HTTP 429 TooManyRequests handling (#63)
* Added HTTP 429 `TooManyRequests` handling
* forgot to add `CrawlConfiguration` changes
* added comprehensive fix for request retries
* fixed regression in `UriHelpers`
1 parent: 70cfea0

9 files changed (+425, -24 lines)
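The `CrawlerActor` changes that implement the retry behavior are among this commit's 9 files but are not reproduced in the excerpts below. Judging from the commit message and the tests, the flow is: on a 429 `TooManyRequests` response from an external link, wait for the `Retry-After` interval (or `DefaultExternalRetryDelay` when the header is absent), then retry, up to `MaxExternalRetries` times. A minimal sketch of that loop, assuming a plain `HttpClient`; the names `RetrySketch` and `GetWithRetriesAsync` are illustrative, and the real implementation is actor-based rather than a simple async loop:

```csharp
using System.Net;
using System.Net.Http;
using System.Threading.Tasks;

public static class RetrySketch
{
    // Sketch only — the actual logic lives inside CrawlerActor, not shown in this view.
    public static async Task<HttpResponseMessage> GetWithRetriesAsync(
        HttpClient client, Uri url, CrawlConfiguration config)
    {
        var response = await client.GetAsync(url);
        for (var attempt = 0;
             attempt < config.MaxExternalRetries &&
             response.StatusCode == HttpStatusCode.TooManyRequests;
             attempt++)
        {
            // Prefer the server's Retry-After delta; fall back to the configured default.
            var delay = response.Headers.RetryAfter?.Delta ?? config.DefaultExternalRetryDelay;
            response.Dispose();
            await Task.Delay(delay);
            response = await client.GetAsync(url);
        }

        return response; // still 429 after all retries => reported as a broken link
    }
}
```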

src/LinkValidator.Tests/End2EndSpecs.ShouldCrawlWebsiteCorrectly.verified.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
| URL | StatusCode | Linked From |
66
| --------------------- | ---------- | ------------------------------------------------------- |
7-
| `/` | OK | \- |
7+
| `/` | OK | - |
88
| `/about/contact.html` | OK | `/`, `/about/contact.html`, `/about/index.html` +2 more |
99
| `/about/index.html` | OK | `/`, `/about/contact.html`, `/about/index.html` +2 more |
1010
| `/index.html` | OK | `/`, `/about/contact.html`, `/about/index.html` +2 more |

src/LinkValidator.Tests/TestWebServerFixture.cs

Lines changed: 3 additions & 3 deletions
```diff
@@ -21,7 +21,7 @@ public class TestWebServerFixture : IAsyncDisposable
     public string? BaseUrl { get; private set; }
     public Action<string>? Logger { get; set; }
 
-    public TestWebServerFixture StartServer(string contentDirectory)
+    public TestWebServerFixture StartServer(string contentDirectory, int port = TestPort)
     {
         lock (_lock)
         {
@@ -36,7 +36,7 @@ public TestWebServerFixture StartServer(string contentDirectory)
             _webHost = new WebHostBuilder()
                 .UseKestrel(options =>
                 {
-                    options.Listen(IPAddress.Loopback, TestPort);
+                    options.Listen(IPAddress.Loopback, port);
                     options.Limits.MaxConcurrentConnections = 100;
                     options.Limits.MaxConcurrentUpgradedConnections = 100;
                 })
@@ -68,7 +68,7 @@ public TestWebServerFixture StartServer(string contentDirectory)
                 .Build();
 
             _webHost.Start();
-            BaseUrl = $"http://localhost:{TestPort}";
+            BaseUrl = $"http://localhost:{port}";
 
             return this;
         }
```
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Sitemap for `http://localhost:8081/`
2+
3+
## Internal Pages
4+
5+
| URL | StatusCode | Linked From |
6+
| ------------- | ---------- | ----------- |
7+
| `/` | OK | - |
8+
| `/about.html` | OK | `/` |
9+
10+
## 🔴 Pages with Broken Links
11+
12+
### `/` has broken links:
13+
14+
- `http://127.0.0.1:8082/always-429` (TooManyRequests)
15+
16+
### `/about.html` has broken links:
17+
18+
- `http://127.0.0.1:8082/another-rate-limited` (TooManyRequests)
Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
// -----------------------------------------------------------------------
2+
// <copyright file="TooManyRequestsRetrySpecs.cs">
3+
// Copyright (C) 2025 - 2025 Aaron Stannard <https://aaronstannard.com/>
4+
// </copyright>
5+
// -----------------------------------------------------------------------
6+
7+
using System.Collections.Concurrent;
8+
using System.Net;
9+
using Akka.Actor;
10+
using Akka.TestKit.Xunit2;
11+
using LinkValidator.Actors;
12+
using LinkValidator.Util;
13+
using Microsoft.AspNetCore.Builder;
14+
using Microsoft.AspNetCore.Hosting;
15+
using Microsoft.AspNetCore.Http;
16+
using Microsoft.Extensions.Hosting;
17+
using Xunit.Abstractions;
18+
using static LinkValidator.Util.CrawlerHelper;
19+
using static LinkValidator.Util.MarkdownHelper;
20+
21+
namespace LinkValidator.Tests;
22+
23+
public class TooManyRequestsRetrySpecs : TestKit, IClassFixture<TestWebServerFixture>
24+
{
25+
private readonly TestWebServerFixture _webServerFixture;
26+
private readonly ITestOutputHelper _output;
27+
28+
public TooManyRequestsRetrySpecs(ITestOutputHelper output, TestWebServerFixture webServerFixture) : base(output: output)
29+
{
30+
_webServerFixture = webServerFixture;
31+
_output = output;
32+
_webServerFixture.Logger = _output.WriteLine;
33+
}
34+
35+
[Fact]
36+
public async Task ShouldRetryExternalLinksAndGenerateCorrectReport()
37+
{
38+
// arrange - create test pages with external links that will return 429
39+
var testPagesDir = Path.Join(Directory.GetCurrentDirectory(), "test-pages-retry");
40+
Directory.CreateDirectory(testPagesDir);
41+
42+
// Track retry attempts for our mock server
43+
var retryTracker = new RetryAttemptTracker();
44+
45+
try
46+
{
47+
// Create test pages
48+
var indexPageContent = """
49+
<html>
50+
<body>
51+
<h1>Test Page with Rate Limited Links</h1>
52+
<p>This page contains links that will initially return 429.</p>
53+
<a href="/about.html">About Page</a>
54+
<a href="http://127.0.0.1:8082/always-429">Always Rate Limited</a>
55+
<a href="http://127.0.0.1:8082/retry-then-succeed">Eventually Succeeds</a>
56+
<a href="http://127.0.0.1:8082/with-retry-after">With Retry-After Header</a>
57+
</body>
58+
</html>
59+
""";
60+
61+
var aboutPageContent = """
62+
<html>
63+
<body>
64+
<h1>About Page</h1>
65+
<p>Internal page that links to more external resources.</p>
66+
<a href="http://127.0.0.1:8082/another-rate-limited">Another Rate Limited Link</a>
67+
</body>
68+
</html>
69+
""";
70+
71+
File.WriteAllText(Path.Join(testPagesDir, "index.html"), indexPageContent);
72+
File.WriteAllText(Path.Join(testPagesDir, "about.html"), aboutPageContent);
73+
74+
// Start main test server
75+
_webServerFixture.StartServer(testPagesDir, 8081);
76+
var baseUrl = new AbsoluteUri(new Uri(_webServerFixture.BaseUrl!));
77+
78+
// Start mock server for external links that simulates 429 responses
79+
using var mockServer = CreateMockRateLimitedServer(retryTracker, 8082);
80+
await mockServer.StartAsync();
81+
82+
// Configure for multiple retries with short delays for testing
83+
var crawlSettings = new CrawlConfiguration(
84+
baseUrl,
85+
5,
86+
TimeSpan.FromSeconds(5), // Reasonable timeout
87+
3, // 3 retry attempts
88+
TimeSpan.FromMilliseconds(200) // Very short retry delay for testing
89+
);
90+
91+
_output.WriteLine("=== CRAWL CONFIGURATION ===");
92+
_output.WriteLine($"Max External Retries: {crawlSettings.MaxExternalRetries}");
93+
_output.WriteLine($"Default Retry Delay: {crawlSettings.DefaultExternalRetryDelay.TotalMilliseconds}ms");
94+
_output.WriteLine($"Request Timeout: {crawlSettings.RequestTimeout.TotalSeconds}s");
95+
96+
// act
97+
var startTime = DateTime.UtcNow;
98+
var crawlResult = await CrawlWebsite(Sys, baseUrl, crawlSettings);
99+
var elapsed = DateTime.UtcNow - startTime;
100+
101+
// Generate markdown report like End2EndSpecs
102+
var markdown = GenerateMarkdown(crawlResult);
103+
104+
_output.WriteLine("=== CRAWL RESULTS ===");
105+
_output.WriteLine($"Crawl completed in {elapsed.TotalSeconds:F1} seconds");
106+
_output.WriteLine($"Found {crawlResult.InternalLinks.Count} internal links and {crawlResult.ExternalLinks.Count} external links");
107+
108+
_output.WriteLine("=== RETRY ATTEMPT TRACKING ===");
109+
foreach (var (url, attempts) in retryTracker.GetAttemptCounts())
110+
{
111+
_output.WriteLine($"{url}: {attempts} attempts");
112+
}
113+
114+
_output.WriteLine("=== RAW MARKDOWN OUTPUT ===");
115+
_output.WriteLine(markdown);
116+
_output.WriteLine("=== END RAW MARKDOWN ===");
117+
118+
// assert
119+
Assert.NotNull(crawlResult);
120+
121+
// Verify crawl completed in reasonable time (should be much faster than timeout due to retries)
122+
Assert.True(elapsed < TimeSpan.FromSeconds(30), $"Crawl took {elapsed.TotalSeconds:F1}s, should be much faster");
123+
124+
// Should have found external links
125+
Assert.True(crawlResult.ExternalLinks.Count >= 3, $"Should have found at least 3 external links, found {crawlResult.ExternalLinks.Count}");
126+
127+
// Verify retry attempts were made
128+
var attemptCounts = retryTracker.GetAttemptCounts();
129+
Assert.True(attemptCounts.Values.Any(count => count > 1), "Should have made retry attempts for some URLs");
130+
131+
// Verify that retry attempts were made and crawl completed without hanging
132+
var externalLinkStatuses = crawlResult.ExternalLinks.Values.Select(r => r.StatusCode).ToList();
133+
_output.WriteLine($"External link statuses: {string.Join(", ", externalLinkStatuses)}");
134+
135+
// Some should succeed after retries, some should fail after exhausting retries
136+
Assert.Contains(System.Net.HttpStatusCode.OK, externalLinkStatuses); // Should have some successful retries
137+
Assert.Contains(System.Net.HttpStatusCode.TooManyRequests, externalLinkStatuses); // Should have some failed after retries
138+
139+
// Verify markdown report includes the retry information
140+
Assert.Contains("🔴 Pages with Broken Links", markdown); // Should have broken links section
141+
Assert.Contains("429", markdown); // Should mention 429 status codes
142+
143+
// Verify with snapshot testing like End2EndSpecs
144+
await Verify(markdown);
145+
146+
await mockServer.StopAsync();
147+
}
148+
finally
149+
{
150+
if (Directory.Exists(testPagesDir))
151+
Directory.Delete(testPagesDir, true);
152+
}
153+
}
154+
155+
[Fact]
156+
public void ShouldParseRetryAfterHeaderCorrectly()
157+
{
158+
// arrange
159+
var crawlConfig = new CrawlConfiguration(
160+
new AbsoluteUri(new Uri("https://example.com")),
161+
5, TimeSpan.FromSeconds(5));
162+
163+
var crawler = Sys.ActorOf(Props.Create(() => new CrawlerActor(crawlConfig, ActorRefs.Nobody)), "crawler");
164+
165+
// Create a mock response message (this is more of a unit test for the logic)
166+
using var response = new HttpResponseMessage(HttpStatusCode.TooManyRequests);
167+
168+
// Test seconds format
169+
response.Headers.Add("Retry-After", "30");
170+
// We can't directly test the private method, but we can verify the logic works through integration
171+
172+
// Test date format
173+
var futureTime = DateTimeOffset.UtcNow.AddMinutes(5);
174+
response.Headers.Clear();
175+
response.Headers.Add("Retry-After", futureTime.ToString("R"));
176+
177+
// This test mainly verifies that the CrawlerActor can be instantiated with the new retry logic
178+
Assert.NotNull(crawler);
179+
_output.WriteLine("CrawlerActor with retry logic created successfully");
180+
}
181+
182+
private IWebHost CreateMockRateLimitedServer(RetryAttemptTracker tracker, int port)
183+
{
184+
return new WebHostBuilder()
185+
.UseKestrel()
186+
.UseUrls($"http://127.0.0.1:{port}")
187+
.Configure(app =>
188+
{
189+
app.Run(async context =>
190+
{
191+
var path = context.Request.Path.Value ?? "/";
192+
var url = $"http://127.0.0.1:{port}{path}";
193+
194+
var attemptCount = tracker.IncrementAttempt(url);
195+
_output.WriteLine($"Request to {url} - Attempt #{attemptCount}");
196+
197+
if (path == "/always-429")
198+
{
199+
// Always return 429
200+
context.Response.StatusCode = 429;
201+
await context.Response.WriteAsync("Too Many Requests - Always");
202+
}
203+
else if (path == "/retry-then-succeed")
204+
{
205+
// Return 429 for first 2 attempts, then succeed
206+
if (attemptCount <= 2)
207+
{
208+
context.Response.StatusCode = 429;
209+
await context.Response.WriteAsync("Too Many Requests - Retry");
210+
}
211+
else
212+
{
213+
context.Response.StatusCode = 200;
214+
await context.Response.WriteAsync("Success after retries!");
215+
}
216+
}
217+
else if (path == "/with-retry-after")
218+
{
219+
// Return 429 with Retry-After header for first attempt, then succeed
220+
if (attemptCount <= 1)
221+
{
222+
context.Response.StatusCode = 429;
223+
context.Response.Headers["Retry-After"] = "1"; // 1 second
224+
await context.Response.WriteAsync("Too Many Requests - With Header");
225+
}
226+
else
227+
{
228+
context.Response.StatusCode = 200;
229+
await context.Response.WriteAsync("Success after Retry-After!");
230+
}
231+
}
232+
else if (path == "/another-rate-limited")
233+
{
234+
// Always return 429 to test multiple failing links
235+
context.Response.StatusCode = 429;
236+
await context.Response.WriteAsync("Too Many Requests - Another");
237+
}
238+
else
239+
{
240+
context.Response.StatusCode = 404;
241+
await context.Response.WriteAsync("Not Found");
242+
}
243+
});
244+
})
245+
.Build();
246+
}
247+
}
248+
249+
public class RetryAttemptTracker
250+
{
251+
private readonly ConcurrentDictionary<string, int> _attemptCounts = new();
252+
253+
public int IncrementAttempt(string url)
254+
{
255+
return _attemptCounts.AddOrUpdate(url, 1, (_, count) => count + 1);
256+
}
257+
258+
public Dictionary<string, int> GetAttemptCounts()
259+
{
260+
return _attemptCounts.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
261+
}
262+
}
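`ShouldParseRetryAfterHeaderCorrectly` sets up both `Retry-After` formats (delta-seconds and HTTP-date) but, as its comments note, cannot reach the private parsing method directly. For reference, a standalone sketch of the parsing such a method needs to do, using .NET's typed `Headers.RetryAfter` property; `RetryAfterSketch.ParseRetryAfter` is an illustrative name, not the actual private method:

```csharp
using System.Net.Http;

public static class RetryAfterSketch
{
    // Handles both RFC 9110 forms: "Retry-After: 30" (delta-seconds)
    // and "Retry-After: Wed, 21 Oct 2025 07:28:00 GMT" (HTTP-date).
    public static TimeSpan? ParseRetryAfter(HttpResponseMessage response)
    {
        var retryAfter = response.Headers.RetryAfter;

        if (retryAfter?.Delta is { } delta)
            return delta; // seconds format, e.g. "30"

        if (retryAfter?.Date is { } date)
        {
            var wait = date - DateTimeOffset.UtcNow; // absolute date format
            return wait > TimeSpan.Zero ? wait : TimeSpan.Zero;
        }

        return null; // no usable header => caller falls back to DefaultExternalRetryDelay
    }
}
```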

src/LinkValidator/Actors/CrawlConfiguration.cs

Lines changed: 11 additions & 1 deletion
```diff
@@ -9,7 +9,7 @@ namespace LinkValidator.Actors;
 /// <summary>
 /// Configuration for the crawler
 /// </summary>
-public sealed record CrawlConfiguration(AbsoluteUri BaseUrl, int MaxInflightRequests, TimeSpan RequestTimeout)
+public sealed record CrawlConfiguration(AbsoluteUri BaseUrl, int MaxInflightRequests, TimeSpan RequestTimeout, int MaxExternalRetries = 3, TimeSpan DefaultExternalRetryDelay = default)
 {
     /// <summary>
     /// The absolute base url - we are only interested in urls stemming from it.
@@ -25,4 +25,14 @@ public sealed record CrawlConfiguration(AbsoluteUri BaseUrl, int MaxInflightRequ
     /// The amount of time we'll allot for any individual HTTP request
     /// </summary>
     public TimeSpan RequestTimeout { get; } = RequestTimeout;
+
+    /// <summary>
+    /// Maximum number of retries for external requests that return 429 TooManyRequests
+    /// </summary>
+    public int MaxExternalRetries { get; } = MaxExternalRetries;
+
+    /// <summary>
+    /// Default delay for retrying external requests when no Retry-After header is present
+    /// </summary>
+    public TimeSpan DefaultExternalRetryDelay { get; } = DefaultExternalRetryDelay == default ? TimeSpan.FromSeconds(10) : DefaultExternalRetryDelay;
 }
```
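Both new parameters are optional, so existing `CrawlConfiguration(baseUrl, maxInflight, timeout)` call sites compile unchanged and get 3 retries with a 10-second fallback delay. Note the `default` guard: because `default(TimeSpan)` is zero, explicitly passing `TimeSpan.Zero` also yields the 10-second fallback, so a literal zero retry delay cannot be requested. A usage sketch with illustrative values:

```csharp
// Existing call sites: retry defaults kick in
// (MaxExternalRetries == 3, DefaultExternalRetryDelay == 10s via the default-guard above).
var config = new CrawlConfiguration(
    new AbsoluteUri(new Uri("https://example.com")),
    MaxInflightRequests: 10,
    RequestTimeout: TimeSpan.FromSeconds(5));

// Explicit overrides, mirroring TooManyRequestsRetrySpecs above.
var testConfig = new CrawlConfiguration(
    new AbsoluteUri(new Uri("http://localhost:8081")),
    MaxInflightRequests: 5,
    RequestTimeout: TimeSpan.FromSeconds(5),
    MaxExternalRetries: 3,
    DefaultExternalRetryDelay: TimeSpan.FromMilliseconds(200));
```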
