1414
1515namespace LinkValidator . Actors ;
1616
17- public record CrawlUrl ( AbsoluteUri Url ) ;
/// <summary>
/// Common shape of the results that <see cref="CrawlerActor"/> receives when a
/// crawl finishes and then forwards to its coordinator.
/// </summary>
public interface ICrawlResult
{
    /// <summary>The URL that was crawled.</summary>
    AbsoluteUri Url { get; }

    /// <summary>The HTTP status code recorded for <see cref="Url"/>.</summary>
    HttpStatusCode StatusCode { get; }
}
22+
/// <summary>
/// Asks the crawler to request a single URL.
/// </summary>
/// <param name="Url">The URL to request.</param>
/// <param name="LinkType">
/// Selects how the URL is handled: crawled as an internal page or checked as an
/// external link.
/// </param>
public record CrawlUrl(
    AbsoluteUri Url,
    LinkType LinkType);
24+
/// <summary>
/// Result of crawling an internal page, including the links discovered on it.
/// </summary>
/// <param name="Url">The page that was requested.</param>
/// <param name="StatusCode">
/// The response status code; the crawler reports
/// <see cref="HttpStatusCode.RequestTimeout"/> when the request throws.
/// </param>
/// <param name="InternalLinks">
/// Links found on the page that were classified as internal; empty when no links
/// were parsed or the request failed.
/// </param>
/// <param name="ExternalLinks">
/// Links found on the page that were classified as external; empty when no links
/// were parsed or the request failed.
/// </param>
public record PageCrawled(
    AbsoluteUri Url,
    HttpStatusCode StatusCode,
    IReadOnlyList<AbsoluteUri> InternalLinks,
    IReadOnlyList<AbsoluteUri> ExternalLinks)
    : ICrawlResult;
1830
19- public record PageCrawled ( AbsoluteUri Url , HttpStatusCode StatusCode , IReadOnlyList < AbsoluteUri > Links ) ;
/// <summary>
/// Result of requesting an external link. Only the outcome of the request is
/// recorded; no links are extracted from the response.
/// </summary>
/// <param name="Url">The external link that was requested.</param>
/// <param name="StatusCode">
/// The response status code; the crawler reports
/// <see cref="HttpStatusCode.RequestTimeout"/> when the request throws.
/// </param>
public record ExternalLinkCrawled(
    AbsoluteUri Url,
    HttpStatusCode StatusCode)
    : ICrawlResult;
2037
2138public sealed class CrawlerActor : UntypedActor , IWithStash
2239{
@@ -51,8 +68,8 @@ protected override void OnReceive(object message)
5168 if ( _inflightRequests == _crawlConfiguration . MaxInflightRequests )
5269 Become ( TooBusy ) ;
5370 break ;
54- case PageCrawled pageCrawled :
55- HandlePageCrawled ( pageCrawled ) ;
71+ case ICrawlResult pageCrawled :
72+ HandleCrawlResult ( pageCrawled ) ;
5673 break ;
5774 }
5875 }
@@ -65,17 +82,16 @@ private void TooBusy(object message)
6582 // too many in-flight requests right now
6683 Stash . Stash ( ) ;
6784 break ;
68- case PageCrawled pageCrawled :
69- HandlePageCrawled ( pageCrawled ) ;
70-
85+ case ICrawlResult pageCrawled :
86+ HandleCrawlResult ( pageCrawled ) ;
7187 // switch behaviors back and unstash one message
7288 Stash . Unstash ( ) ;
7389 Become ( OnReceive ) ;
7490 break ;
7591 }
7692 }
7793
78- private void HandlePageCrawled ( PageCrawled pageCrawled )
94+ private void HandleCrawlResult ( ICrawlResult pageCrawled )
7995 {
8096 _inflightRequests -- ;
8197 _coordinator . Tell ( pageCrawled ) ;
@@ -88,11 +104,36 @@ private void HandleCrawlUrl(CrawlUrl msg)
88104 * already seen this page before.
89105 */
90106 _inflightRequests ++ ;
91-
92- DoWork ( ) . PipeTo ( Self , Self , result => result ) ;
107+ switch ( msg . LinkType )
108+ {
109+ case LinkType . Internal :
110+ CrawlInternalPage ( ) . PipeTo ( Self , Self , result => result ) ;
111+ break ;
112+ case LinkType . External :
113+ CrawlExternalPage ( ) . PipeTo ( Self , Self , result => result ) ;
114+ break ;
115+ default :
116+ throw new ArgumentOutOfRangeException ( ) ;
117+ }
118+
93119 return ;
94120
95- async Task < PageCrawled > DoWork ( )
// Requests the external link and reports only its status code.
// Never throws: any failure (timeout, cancellation, connection error, ...) is
// logged and reported as HttpStatusCode.RequestTimeout, so the task piped back
// to the actor always completes with an ICrawlResult.
// NOTE(review): assumes _httpClient is a System.Net.Http.HttpClient - confirm.
async Task<ICrawlResult> CrawlExternalPage()
{
    try
    {
        using var cts = new CancellationTokenSource(_crawlConfiguration.RequestTimeout);

        // Only the status code is used, so complete as soon as the response
        // headers arrive instead of buffering the whole body, and dispose the
        // response so its connection and content stream are released promptly.
        using var response = await _httpClient.GetAsync(
            msg.Url.Value,
            HttpCompletionOption.ResponseHeadersRead,
            cts.Token);

        return new ExternalLinkCrawled(msg.Url, response.StatusCode);
    }
    catch (Exception ex)
    {
        _log.Warning(ex, "Failed to crawl {0}", msg.Url);
        return new ExternalLinkCrawled(msg.Url, HttpStatusCode.RequestTimeout);
    }
}
135+
136+ async Task < ICrawlResult > CrawlInternalPage ( )
96137 {
97138 try
98139 {
@@ -113,16 +154,19 @@ async Task<PageCrawled> DoWork()
113154 */
114155 var processingUri = UriHelpers . GetDirectoryPath ( msg . Url ) ;
115156 var links = ParseLinks ( html , processingUri ) ;
157+
158+ var internalLinks = links . Where ( c => c . type == LinkType . Internal ) . Select ( c => c . uri ) . ToImmutableArray ( ) ;
159+ var externalLinks = links . Where ( c => c . type == LinkType . External ) . Select ( c => c . uri ) . ToImmutableArray ( ) ;
116160
117- return new PageCrawled ( msg . Url , response . StatusCode , links ) ;
161+ return new PageCrawled ( msg . Url , response . StatusCode , internalLinks , externalLinks ) ;
118162 }
119163
120- return new PageCrawled ( msg . Url , response . StatusCode , Array . Empty < AbsoluteUri > ( ) ) ;
164+ return new PageCrawled ( msg . Url , response . StatusCode , [ ] , [ ] ) ;
121165 }
122166 catch ( Exception ex )
123167 {
124168 _log . Warning ( ex , "Failed to crawl {0}" , msg . Url ) ;
125- return new PageCrawled ( msg . Url , HttpStatusCode . RequestTimeout , Array . Empty < AbsoluteUri > ( ) ) ;
169+ return new PageCrawled ( msg . Url , HttpStatusCode . RequestTimeout , [ ] , [ ] ) ;
126170 }
127171 }
128172 }
0 commit comments