Commit 70cfea0
Adding support for tracking broken external Uris (#62)
* Adding support for tracking broken external Uris
* Add Crawl handling for external documents too
* creating combined `CrawlReport` type
* added external link visiting to the indexer actor
* added some external links to test sample
* added broken external links to output report
1 parent 56c7d08 commit 70cfea0
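The combined `CrawlReport` type this commit introduces is not itself shown in the diff. Based on how `ManualMarkdownTest` constructs one below, a minimal sketch might look like the following; the property names and the exact type of the second parameter are assumptions:

```csharp
using System.Collections.Immutable;

// Hypothetical sketch -- not the actual definition from this commit.
// Shape inferred from the ManualMarkdownTest call below:
//   new CrawlReport(baseUri, results, ImmutableSortedDictionary<string, CrawlRecord>.Empty)
public sealed record CrawlReport(
    AbsoluteUri BaseUrl,
    ImmutableSortedDictionary<string, CrawlRecord> InternalPages,
    ImmutableSortedDictionary<string, CrawlRecord> ExternalLinks);
```

Carrying the base URL inside the report is what lets `GenerateMarkdown` drop its separate `baseUrl` parameter in the test changes below.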

File tree

14 files changed: +256 -89 lines changed

src/LinkValidator.Tests/End2EndSpecs.ShouldCrawlWebsiteCorrectly.verified.txt

Lines changed: 8 additions & 0 deletions
```diff
@@ -1,5 +1,7 @@
 # Sitemap for `http://localhost:8080/`
 
+## Internal Pages
+
 | URL | StatusCode | Linked From |
 | --------------------- | ---------- | ------------------------------------------------------- |
 | `/` | OK | \- |
@@ -14,7 +16,13 @@
 ### `/` has broken links:
 
 - `/page2.html` (NotFound)
+- `http://getakka.net/broken-link` (NotFound)
+
+### `/about/index.html` has broken links:
+
+- `http://getakka.net/broken-link` (NotFound)
 
 ### `/index.html` has broken links:
 
 - `/page2.html` (NotFound)
+- `http://getakka.net/broken-link` (NotFound)
```
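The snapshot above groups broken links per source page. A hypothetical helper that renders one such section could look like this; it is an illustration only, not the project's actual `MarkdownHelper` code:

```csharp
using System.Collections.Generic;
using System.Net;
using System.Text;

// Illustration only -- not the project's actual MarkdownHelper. Renders one
// "### `<page>` has broken links:" section in the shape shown above.
public static class BrokenLinkSectionSketch
{
    public static string Render(string page, IEnumerable<(string Url, HttpStatusCode Status)> broken)
    {
        var sb = new StringBuilder();
        sb.AppendLine($"### `{page}` has broken links:");
        sb.AppendLine();
        foreach (var (url, status) in broken)
            sb.AppendLine($"- `{url}` ({status})"); // enum prints as e.g. "NotFound"
        return sb.ToString();
    }
}
```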

src/LinkValidator.Tests/End2EndSpecs.cs

Lines changed: 1 addition & 1 deletion
```diff
@@ -44,7 +44,7 @@ public async Task ShouldCrawlWebsiteCorrectly()
 
         // act
         var crawlResult = await CrawlWebsite(Sys, baseUrl);
-        var markdown = GenerateMarkdown(baseUrl, crawlResult);
+        var markdown = GenerateMarkdown(crawlResult);
 
         _output.WriteLine("=== RAW MARKDOWN OUTPUT ===");
         _output.WriteLine(markdown);
```
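The only change here is the call signature: since the report now carries the base URI, `GenerateMarkdown` no longer needs it as a separate argument. Assuming the `CrawlReport` sketch above, the new shape would be roughly:

```csharp
// Assumed shape of the helper after this commit -- illustration only,
// not the actual MarkdownHelper implementation.
public static class MarkdownHelperSketch
{
    public static string GenerateMarkdown(CrawlReport report)
    {
        // the sitemap header can now be derived from the report itself
        // (report.BaseUrl) instead of a separate baseUrl parameter
        var header = $"# Sitemap for `{report.BaseUrl.Value}`";
        // ... table and broken-link sections would be appended here ...
        return header;
    }
}
```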

src/LinkValidator.Tests/ManualMarkdownTest.cs

Lines changed: 14 additions & 4 deletions
```diff
@@ -2,11 +2,19 @@
 using System.Net;
 using LinkValidator.Actors;
 using LinkValidator.Util;
+using Xunit.Abstractions;
 
 namespace LinkValidator.Tests;
 
 public class ManualMarkdownTest
 {
+    private readonly ITestOutputHelper _testOutputHelper;
+
+    public ManualMarkdownTest(ITestOutputHelper testOutputHelper)
+    {
+        _testOutputHelper = testOutputHelper;
+    }
+
     [Fact]
     public void TestRawMarkdownOutput()
     {
@@ -20,12 +28,14 @@ public void TestRawMarkdownOutput()
         ImmutableList<AbsoluteUri>.Empty
             .Add(baseUri)
             .Add(new AbsoluteUri(new Uri("http://localhost:8080/index.html")))));
+
+        var crawlResults = new CrawlReport(baseUri, results, ImmutableSortedDictionary<string, CrawlRecord>.Empty);
 
-        var markdown = MarkdownHelper.GenerateMarkdown(baseUri, results);
+        var markdown = MarkdownHelper.GenerateMarkdown(crawlResults);
 
-        Console.WriteLine("RAW MARKDOWN:");
-        Console.WriteLine(markdown);
-        Console.WriteLine("END RAW MARKDOWN");
+        _testOutputHelper.WriteLine("RAW MARKDOWN:");
+        _testOutputHelper.WriteLine(markdown);
+        _testOutputHelper.WriteLine("END RAW MARKDOWN");
 
         // Check for escaping
         Assert.DoesNotContain("\\/", markdown);
```

src/LinkValidator.Tests/ParseHelperSpecs.cs

Lines changed: 14 additions & 12 deletions
```diff
@@ -38,9 +38,9 @@ public void ParseHelper_should_return_absolute_uris()
 
         // Assert
         uris.Should().HaveCount(3);
-        uris.Should().Contain(new AbsoluteUri(new Uri("http://example.com/about")));
-        uris.Should().Contain(new AbsoluteUri(new Uri("http://example.com/contact")));
-        uris.Should().Contain(new AbsoluteUri(new Uri("http://example.com/faq")));
+        uris.Should().Contain((new AbsoluteUri(new Uri("http://example.com/about")), LinkType.Internal));
+        uris.Should().Contain((new AbsoluteUri(new Uri("http://example.com/contact")), LinkType.Internal));
+        uris.Should().Contain((new AbsoluteUri(new Uri("http://example.com/faq")), LinkType.Internal));
     }
 
     // create a string that contains HTML linking to a few different URLs using absolute links
@@ -71,9 +71,9 @@ public void ParseHelper_should_return_absolute_uris_when_given_absolute_links()
         uris.Should().HaveCount(3);
 
         // notice that we convert the scheme to https
-        uris.Should().Contain(new AbsoluteUri(new Uri("https://example.com/about")));
-        uris.Should().Contain(new AbsoluteUri(new Uri("https://example.com/contact")));
-        uris.Should().Contain(new AbsoluteUri(new Uri("https://example.com/faq")));
+        uris.Should().Contain((new AbsoluteUri(new Uri("https://example.com/about")), LinkType.Internal));
+        uris.Should().Contain((new AbsoluteUri(new Uri("https://example.com/contact")), LinkType.Internal));
+        uris.Should().Contain((new AbsoluteUri(new Uri("https://example.com/faq")), LinkType.Internal));
     }
 
     private const string MixedHtml = """
@@ -100,9 +100,10 @@ public void ParseHelper_should_return_absolute_uris_when_given_mixed_links()
         var uris = ParseHelpers.ParseLinks(MixedHtml, uri);
 
         // Assert
-        uris.Should().HaveCount(2); // don't count the FAKEURL one
-        uris.Should().Contain(new AbsoluteUri(new Uri("http://example.com/about")));
-        uris.Should().Contain(new AbsoluteUri(new Uri("http://example.com/contact")));
+        uris.Where(c => c.type == LinkType.Internal).Should().HaveCount(2); // don't count the FAKEURL one
+        uris.Where(c => c.type == LinkType.External).Should().HaveCount(1); // do count the FAKEURL one
+        uris.Should().Contain((new AbsoluteUri(new Uri("http://example.com/about")), LinkType.Internal));
+        uris.Should().Contain((new AbsoluteUri(new Uri("http://example.com/contact")), LinkType.Internal));
     }
 
     public const string TweetShareLink = """
@@ -794,7 +795,8 @@ public void ParseHelper_should_not_include_absoluteUris_that_appear_in_querystri
         var uris = ParseHelpers.ParseLinks(TweetShareLink, uri);
 
         // Assert
-        uris.Should().HaveCount(22);
+        uris.Where(c => c.type == LinkType.Internal).Should().HaveCount(22);
+        uris.Where(c => c.type == LinkType.External).Should().HaveCount(15);
     }
 
     private const string LinkFragmentsHtml = """
@@ -822,7 +824,7 @@ public void ParseHelper_should_not_count_LinkFragments_separately()
 
         // Assert
         uris.Should().HaveCount(2);
-        uris.Should().Contain(new AbsoluteUri(new Uri("http://example.com/about")));
-        uris.Should().Contain(new AbsoluteUri(new Uri("http://example.com/contact")));
+        uris.Should().Contain((new AbsoluteUri(new Uri("http://example.com/about")), LinkType.Internal));
+        uris.Should().Contain((new AbsoluteUri(new Uri("http://example.com/contact")), LinkType.Internal));
     }
 }
```
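These assertions imply that `ParseLinks` now returns `(AbsoluteUri uri, LinkType type)` tuples rather than bare `AbsoluteUri` values. The `LinkType` definition and the classification rule are not shown in this commit; a plausible sketch, assuming `AbsoluteUri.Value` exposes the underlying `System.Uri` (as the CrawlerActor diff below suggests), is:

```csharp
using System;

// LinkType as used throughout this commit's tests and actor code.
public enum LinkType
{
    Internal,
    External
}

// Hypothetical classification rule matching what the tests imply -- not the
// actual ParseHelpers implementation.
public static class LinkClassifierSketch
{
    // a resolved link is Internal when it lives on the crawl root's host,
    // External otherwise (e.g. the FAKEURL case above)
    public static LinkType Classify(AbsoluteUri link, AbsoluteUri crawlRoot) =>
        string.Equals(link.Value.Host, crawlRoot.Value.Host, StringComparison.OrdinalIgnoreCase)
            ? LinkType.Internal
            : LinkType.External;
}
```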

src/LinkValidator.Tests/pages/about/index.html

Lines changed: 7 additions & 0 deletions
```diff
@@ -13,5 +13,12 @@
     <li><a href="/about/contact.html">Contact</a></li>
 </ul>
 
+<b>Other:</b>
+<ul>
+    <li><a href="https://phobos.petabridge.com/">Phobos</a></li>
+    <li><a href="https://getakka.net/">Akka.NET Documentation</a></li>
+    <li><a href="https://getakka.net/broken-link">Akka.NET Documentation (Broken Link)</a></li>
+</ul>
+
 </body>
 </html>
```

src/LinkValidator.Tests/pages/index.html

Lines changed: 7 additions & 0 deletions
```diff
@@ -19,5 +19,12 @@
     <li><a href="/about/contact.html">Contact</a></li>
 </ul>
 
+<b>Other:</b>
+<ul>
+    <li><a href="https://phobos.petabridge.com/">Phobos</a></li>
+    <li><a href="https://getakka.net/">Akka.NET Documentation</a></li>
+    <li><a href="https://getakka.net/broken-link">Akka.NET Documentation (Broken Link)</a></li>
+</ul>
+
 </body>
 </html>
```

src/LinkValidator.Tests/pages/page1.html

Lines changed: 6 additions & 0 deletions
```diff
@@ -12,5 +12,11 @@
     <li><a href="/about/contact.html">Contact</a></li>
 </ul>
 
+<b>Other:</b>
+<ul>
+    <li><a href="https://phobos.petabridge.com/">Phobos</a></li>
+    <li><a href="https://getakka.net/">Akka.NET Documentation</a></li>
+</ul>
+
 </body>
 </html>
```

src/LinkValidator/Actors/CrawlerActor.cs

Lines changed: 58 additions & 14 deletions
```diff
@@ -14,9 +14,26 @@
 
 namespace LinkValidator.Actors;
 
-public record CrawlUrl(AbsoluteUri Url);
+public interface ICrawlResult
+{
+    AbsoluteUri Url { get; }
+    HttpStatusCode StatusCode { get; }
+}
+
+public record CrawlUrl(AbsoluteUri Url, LinkType LinkType);
+
+public record PageCrawled(
+    AbsoluteUri Url,
+    HttpStatusCode StatusCode,
+    IReadOnlyList<AbsoluteUri> InternalLinks,
+    IReadOnlyList<AbsoluteUri> ExternalLinks) : ICrawlResult;
 
-public record PageCrawled(AbsoluteUri Url, HttpStatusCode StatusCode, IReadOnlyList<AbsoluteUri> Links);
+/// <summary>
+/// Indicates whether an external link was found to be valid.
+/// </summary>
+/// <param name="Url">The link Uri</param>
+/// <param name="StatusCode">The crawler status code</param>
+public record ExternalLinkCrawled(AbsoluteUri Url, HttpStatusCode StatusCode) : ICrawlResult;
 
 public sealed class CrawlerActor : UntypedActor, IWithStash
 {
@@ -51,8 +68,8 @@ protected override void OnReceive(object message)
                 if (_inflightRequests == _crawlConfiguration.MaxInflightRequests)
                     Become(TooBusy);
                 break;
-            case PageCrawled pageCrawled:
-                HandlePageCrawled(pageCrawled);
+            case ICrawlResult pageCrawled:
+                HandleCrawlResult(pageCrawled);
                 break;
         }
     }
@@ -65,17 +82,16 @@ private void TooBusy(object message)
                 // too many in-flight requests right now
                 Stash.Stash();
                 break;
-            case PageCrawled pageCrawled:
-                HandlePageCrawled(pageCrawled);
-
+            case ICrawlResult pageCrawled:
+                HandleCrawlResult(pageCrawled);
                 // switch behaviors back and unstash one message
                 Stash.Unstash();
                 Become(OnReceive);
                 break;
         }
     }
 
-    private void HandlePageCrawled(PageCrawled pageCrawled)
+    private void HandleCrawlResult(ICrawlResult pageCrawled)
     {
         _inflightRequests--;
         _coordinator.Tell(pageCrawled);
@@ -88,11 +104,36 @@ private void HandleCrawlUrl(CrawlUrl msg)
          * already seen this page before.
          */
         _inflightRequests++;
-
-        DoWork().PipeTo(Self, Self, result => result);
+        switch (msg.LinkType)
+        {
+            case LinkType.Internal:
+                CrawlInternalPage().PipeTo(Self, Self, result => result);
+                break;
+            case LinkType.External:
+                CrawlExternalPage().PipeTo(Self, Self, result => result);
+                break;
+            default:
+                throw new ArgumentOutOfRangeException();
+        }
+
         return;
 
-        async Task<PageCrawled> DoWork()
+        async Task<ICrawlResult> CrawlExternalPage()
+        {
+            try{
+                using var cts = new CancellationTokenSource(_crawlConfiguration.RequestTimeout);
+                var response = await _httpClient.GetAsync(msg.Url.Value, cts.Token);
+
+                return new ExternalLinkCrawled(msg.Url, response.StatusCode);
+            }
+            catch (Exception ex)
+            {
+                _log.Warning(ex, "Failed to crawl {0}", msg.Url);
+                return new ExternalLinkCrawled(msg.Url, HttpStatusCode.RequestTimeout);
+            }
+        }
+
+        async Task<ICrawlResult> CrawlInternalPage()
         {
             try
             {
@@ -113,16 +154,19 @@ async Task<PageCrawled> DoWork()
                     */
                    var processingUri = UriHelpers.GetDirectoryPath(msg.Url);
                    var links = ParseLinks(html, processingUri);
+
+                    var internalLinks = links.Where(c => c.type == LinkType.Internal).Select(c => c.uri).ToImmutableArray();
+                    var externalLinks = links.Where(c => c.type == LinkType.External).Select(c => c.uri).ToImmutableArray();
 
-                    return new PageCrawled(msg.Url, response.StatusCode, links);
+                    return new PageCrawled(msg.Url, response.StatusCode, internalLinks, externalLinks);
                }
 
-                return new PageCrawled(msg.Url, response.StatusCode, Array.Empty<AbsoluteUri>());
+                return new PageCrawled(msg.Url, response.StatusCode, [], []);
             }
             catch (Exception ex)
             {
                 _log.Warning(ex, "Failed to crawl {0}", msg.Url);
-                return new PageCrawled(msg.Url, HttpStatusCode.RequestTimeout, Array.Empty<AbsoluteUri>());
+                return new PageCrawled(msg.Url, HttpStatusCode.RequestTimeout, [], []);
             }
         }
     }
```
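Both `PageCrawled` and `ExternalLinkCrawled` flow to the coordinator through the shared `ICrawlResult` interface via `_coordinator.Tell(pageCrawled)`. The coordinator side is not part of this diff; a sketch of how it might distinguish the two, as a hypothetical handler rather than the actual indexer actor code:

```csharp
using Akka.Actor;

// Hypothetical coordinator-side handling -- illustrates why both result
// records implement ICrawlResult. Not the actual indexer actor from this commit.
public sealed class CoordinatorSketch : UntypedActor
{
    protected override void OnReceive(object message)
    {
        switch (message)
        {
            case PageCrawled page:
                // record the page's status and queue any newly discovered
                // internal and external links for crawling
                break;
            case ExternalLinkCrawled external:
                // record the external link's status only; external documents
                // are status-checked but never parsed for further links
                break;
        }
    }
}
```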
