1+ // -----------------------------------------------------------------------
2+ // <copyright file="TooManyRequestsRetrySpecs.cs">
3+ // Copyright (C) 2025 - 2025 Aaron Stannard <https://aaronstannard.com/>
4+ // </copyright>
5+ // -----------------------------------------------------------------------
6+
7+ using System . Collections . Concurrent ;
8+ using System . Net ;
9+ using Akka . Actor ;
10+ using Akka . TestKit . Xunit2 ;
11+ using LinkValidator . Actors ;
12+ using LinkValidator . Util ;
13+ using Microsoft . AspNetCore . Builder ;
14+ using Microsoft . AspNetCore . Hosting ;
15+ using Microsoft . AspNetCore . Http ;
16+ using Microsoft . Extensions . Hosting ;
17+ using Xunit . Abstractions ;
18+ using static LinkValidator . Util . CrawlerHelper ;
19+ using static LinkValidator . Util . MarkdownHelper ;
20+
21+ namespace LinkValidator . Tests ;
22+
/// <summary>
/// Integration specs verifying that the crawler retries external links which
/// respond with HTTP 429 (Too Many Requests) and that the generated markdown
/// report reflects both recovered and permanently rate-limited links.
/// </summary>
public class TooManyRequestsRetrySpecs : TestKit, IClassFixture<TestWebServerFixture>
{
    private readonly TestWebServerFixture _webServerFixture;
    private readonly ITestOutputHelper _output;

    public TooManyRequestsRetrySpecs(ITestOutputHelper output, TestWebServerFixture webServerFixture)
        : base(output: output)
    {
        _webServerFixture = webServerFixture;
        _output = output;
        // Route fixture diagnostics into the xUnit output stream.
        _webServerFixture.Logger = _output.WriteLine;
    }

    [Fact]
    public async Task ShouldRetryExternalLinksAndGenerateCorrectReport()
    {
        // arrange - create test pages with external links that will return 429
        var testPagesDir = Path.Join(Directory.GetCurrentDirectory(), "test-pages-retry");
        Directory.CreateDirectory(testPagesDir);

        // Track retry attempts for our mock server
        var retryTracker = new RetryAttemptTracker();

        try
        {
            // Create test pages; the 8082 links point at the mock rate-limited server below.
            var indexPageContent = """
                <html>
                <body>
                <h1>Test Page with Rate Limited Links</h1>
                <p>This page contains links that will initially return 429.</p>
                <a href="/about.html">About Page</a>
                <a href="http://127.0.0.1:8082/always-429">Always Rate Limited</a>
                <a href="http://127.0.0.1:8082/retry-then-succeed">Eventually Succeeds</a>
                <a href="http://127.0.0.1:8082/with-retry-after">With Retry-After Header</a>
                </body>
                </html>
                """;

            var aboutPageContent = """
                <html>
                <body>
                <h1>About Page</h1>
                <p>Internal page that links to more external resources.</p>
                <a href="http://127.0.0.1:8082/another-rate-limited">Another Rate Limited Link</a>
                </body>
                </html>
                """;

            File.WriteAllText(Path.Join(testPagesDir, "index.html"), indexPageContent);
            File.WriteAllText(Path.Join(testPagesDir, "about.html"), aboutPageContent);

            // Start main test server serving the pages written above.
            _webServerFixture.StartServer(testPagesDir, 8081);
            var baseUrl = new AbsoluteUri(new Uri(_webServerFixture.BaseUrl!));

            // Start mock server for external links that simulates 429 responses.
            using var mockServer = CreateMockRateLimitedServer(retryTracker, 8082);
            await mockServer.StartAsync();

            // Configure for multiple retries with short delays so the test stays fast.
            var crawlSettings = new CrawlConfiguration(
                baseUrl,
                5,
                TimeSpan.FromSeconds(5),          // Reasonable timeout
                3,                                // 3 retry attempts
                TimeSpan.FromMilliseconds(200));  // Very short retry delay for testing

            _output.WriteLine("=== CRAWL CONFIGURATION ===");
            _output.WriteLine($"Max External Retries: {crawlSettings.MaxExternalRetries}");
            _output.WriteLine($"Default Retry Delay: {crawlSettings.DefaultExternalRetryDelay.TotalMilliseconds}ms");
            _output.WriteLine($"Request Timeout: {crawlSettings.RequestTimeout.TotalSeconds}s");

            // act
            var startTime = DateTime.UtcNow;
            var crawlResult = await CrawlWebsite(Sys, baseUrl, crawlSettings);
            var elapsed = DateTime.UtcNow - startTime;

            // Generate markdown report like End2EndSpecs
            var markdown = GenerateMarkdown(crawlResult);

            _output.WriteLine("=== CRAWL RESULTS ===");
            _output.WriteLine($"Crawl completed in {elapsed.TotalSeconds:F1} seconds");
            _output.WriteLine($"Found {crawlResult.InternalLinks.Count} internal links and {crawlResult.ExternalLinks.Count} external links");

            _output.WriteLine("=== RETRY ATTEMPT TRACKING ===");
            foreach (var (url, attempts) in retryTracker.GetAttemptCounts())
            {
                _output.WriteLine($"{url}: {attempts} attempts");
            }

            _output.WriteLine("=== RAW MARKDOWN OUTPUT ===");
            _output.WriteLine(markdown);
            _output.WriteLine("=== END RAW MARKDOWN ===");

            // assert
            Assert.NotNull(crawlResult);

            // Verify crawl completed in reasonable time (should be much faster than timeout due to retries)
            Assert.True(elapsed < TimeSpan.FromSeconds(30), $"Crawl took {elapsed.TotalSeconds:F1}s, should be much faster");

            // Should have found external links
            Assert.True(crawlResult.ExternalLinks.Count >= 3, $"Should have found at least 3 external links, found {crawlResult.ExternalLinks.Count}");

            // Verify retry attempts were made
            var attemptCounts = retryTracker.GetAttemptCounts();
            Assert.True(attemptCounts.Values.Any(count => count > 1), "Should have made retry attempts for some URLs");

            // Verify that retry attempts were made and crawl completed without hanging
            var externalLinkStatuses = crawlResult.ExternalLinks.Values.Select(r => r.StatusCode).ToList();
            _output.WriteLine($"External link statuses: {string.Join(", ", externalLinkStatuses)}");

            // Some should succeed after retries, some should fail after exhausting retries
            Assert.Contains(HttpStatusCode.OK, externalLinkStatuses);              // Should have some successful retries
            Assert.Contains(HttpStatusCode.TooManyRequests, externalLinkStatuses); // Should have some failed after retries

            // Verify markdown report includes the retry information
            Assert.Contains("🔴 Pages with Broken Links", markdown); // Should have broken links section
            Assert.Contains("429", markdown);                        // Should mention 429 status codes

            // Verify with snapshot testing like End2EndSpecs
            await Verify(markdown);

            await mockServer.StopAsync();
        }
        finally
        {
            if (Directory.Exists(testPagesDir))
                Directory.Delete(testPagesDir, true);
        }
    }

    [Fact]
    public void ShouldParseRetryAfterHeaderCorrectly()
    {
        // arrange
        var crawlConfig = new CrawlConfiguration(
            new AbsoluteUri(new Uri("https://example.com")),
            5, TimeSpan.FromSeconds(5));

        var crawler = Sys.ActorOf(Props.Create(() => new CrawlerActor(crawlConfig, ActorRefs.Nobody)), "crawler");

        // Create a mock response message (this is more of a unit test for the logic)
        using var response = new HttpResponseMessage(HttpStatusCode.TooManyRequests);

        // Test seconds format
        response.Headers.Add("Retry-After", "30");
        // We can't directly test the private method, but we can verify the logic works through integration

        // Test date format
        var futureTime = DateTimeOffset.UtcNow.AddMinutes(5);
        response.Headers.Clear();
        response.Headers.Add("Retry-After", futureTime.ToString("R"));

        // This test mainly verifies that the CrawlerActor can be instantiated with the new retry logic
        Assert.NotNull(crawler);
        _output.WriteLine("CrawlerActor with retry logic created successfully");
    }

    /// <summary>
    /// Builds (but does not start) a Kestrel host that simulates a rate-limited
    /// external site: some paths always return 429, others recover after a
    /// fixed number of attempts, one advertises a Retry-After header.
    /// Every request is recorded in <paramref name="tracker"/>.
    /// </summary>
    private IWebHost CreateMockRateLimitedServer(RetryAttemptTracker tracker, int port)
    {
        return new WebHostBuilder()
            .UseKestrel()
            .UseUrls($"http://127.0.0.1:{port}")
            .Configure(app =>
            {
                app.Run(async context =>
                {
                    var path = context.Request.Path.Value ?? "/";
                    var url = $"http://127.0.0.1:{port}{path}";

                    var attemptCount = tracker.IncrementAttempt(url);
                    _output.WriteLine($"Request to {url} - Attempt #{attemptCount}");

                    switch (path)
                    {
                        case "/always-429":
                            // Always return 429
                            context.Response.StatusCode = 429;
                            await context.Response.WriteAsync("Too Many Requests - Always");
                            break;

                        case "/retry-then-succeed" when attemptCount <= 2:
                            // Return 429 for first 2 attempts, then succeed
                            context.Response.StatusCode = 429;
                            await context.Response.WriteAsync("Too Many Requests - Retry");
                            break;

                        case "/retry-then-succeed":
                            context.Response.StatusCode = 200;
                            await context.Response.WriteAsync("Success after retries!");
                            break;

                        case "/with-retry-after" when attemptCount <= 1:
                            // Return 429 with Retry-After header for first attempt, then succeed
                            context.Response.StatusCode = 429;
                            context.Response.Headers["Retry-After"] = "1"; // 1 second
                            await context.Response.WriteAsync("Too Many Requests - With Header");
                            break;

                        case "/with-retry-after":
                            context.Response.StatusCode = 200;
                            await context.Response.WriteAsync("Success after Retry-After!");
                            break;

                        case "/another-rate-limited":
                            // Always return 429 to test multiple failing links
                            context.Response.StatusCode = 429;
                            await context.Response.WriteAsync("Too Many Requests - Another");
                            break;

                        default:
                            context.Response.StatusCode = 404;
                            await context.Response.WriteAsync("Not Found");
                            break;
                    }
                });
            })
            .Build();
    }
}
248+
/// <summary>
/// Thread-safe per-URL request counter used by the mock rate-limited server
/// to record how many attempts the crawler made against each endpoint.
/// </summary>
public class RetryAttemptTracker
{
    // URL -> number of requests observed so far; concurrent because Kestrel
    // may service requests on multiple threads.
    private readonly ConcurrentDictionary<string, int> _attemptCounts = new();

    /// <summary>
    /// Records one more attempt for <paramref name="url"/> and returns the updated total.
    /// </summary>
    public int IncrementAttempt(string url) =>
        _attemptCounts.AddOrUpdate(url, 1, (_, previous) => previous + 1);

    /// <summary>
    /// Returns a point-in-time snapshot of all recorded attempt counts.
    /// </summary>
    public Dictionary<string, int> GetAttemptCounts() => new(_attemptCounts);
}