-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathHtmlScraper.cs
More file actions
55 lines (52 loc) · 1.96 KB
/
HtmlScraper.cs
File metadata and controls
55 lines (52 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
namespace SmartCrawler;
/// <summary>
/// Wrapper around <see cref="HttpClient"/> that fetches HTML from URLs.
/// Logs critical and non-critical errors instead of throwing them.
/// </summary>
public class HtmlScraper
{
    /// <summary>
    /// Built-in http client that makes http requests to the online resources
    /// </summary>
    /// <remarks>
    /// We create a single instance for the whole program, so we avoid port exhaustion,
    /// which might occur when creating too many HttpClient instances
    /// </remarks>
    private static readonly HttpClient Client = new HttpClient();

    static HtmlScraper()
    {
        // This is a quick win to decrease loading times. Most of the content loads much faster.
        // In case a user wants to change this property I might add it to the CrawlerOptions class.
        Client.Timeout = TimeSpan.FromSeconds(20);
    }

    /// <summary>
    /// Wraps GetStringAsync and logs all errors instead of throwing them
    /// </summary>
    /// <param name="url">url of the resource that should be scraped</param>
    /// <returns>
    /// A <see cref="ScraperResponse"/> object which contains the html content and
    /// metadata about the status of the request
    /// </returns>
    /// <remarks>
    /// TaskCanceledException AKA timeout is considered as a non-critical error
    /// </remarks>
    public static async Task<ScraperResponse> ScrapeUrl(string url)
    {
        try
        {
            string html = await Client.GetStringAsync(url);
            return new ScraperResponse(true, false, html);
        }
        catch (TaskCanceledException)
        {
            // HttpClient surfaces its Timeout as TaskCanceledException; non-critical.
            Console.WriteLine($"[HtmlScraper] Warning: {url} was timed out!");
            return new ScraperResponse(false, false, "");
        }
        catch (InvalidOperationException)
        {
            // Thrown when the URI is relative and no BaseAddress is configured.
            Console.WriteLine($"[HtmlScraper] Warning: {url} is not a valid URL!");
        }
        catch (UriFormatException)
        {
            // Bug fix: a malformed URL string (e.g. "ht!tp://x") makes GetStringAsync
            // throw UriFormatException, not InvalidOperationException. Previously this
            // escaped the method despite its "log, don't throw" contract.
            Console.WriteLine($"[HtmlScraper] Warning: {url} is not a valid URL!");
        }
        catch (HttpRequestException)
        {
            // DNS failure, refused connection, non-success status code, etc.
            Console.WriteLine($"[HtmlScraper] Warning: Connectivity issue! {url} skipped");
        }
        return new ScraperResponse(false, true, "");
    }
}