Skip to content

Commit 50edf35

Browse files
committed
重構爬蟲並加上測試案例。
1 parent 9eda350 commit 50edf35

File tree

12 files changed

+293
-165
lines changed

12 files changed

+293
-165
lines changed
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
namespace Processors;
2+
3+
/// <summary>
/// Reads the country table from the zh-cn variant of the ISO 3166-1 Wikipedia page
/// and produces <see cref="Country"/> entries whose
/// <see cref="Country.SimplifiedChineseName"/> is filled in.
/// </summary>
public class CNProcessor : ProcessorBase
{
    /// <summary>Address of the page that is downloaded by <see cref="ListCountryAsync"/>.</summary>
    public override string WikiUrl => "https://zh.wikipedia.org/zh-cn/ISO_3166-1";

    /// <summary>
    /// Fetches the table rows of <see cref="WikiUrl"/> and converts every row into a country.
    /// </summary>
    /// <returns>One <see cref="Country"/> per table row, in page order.</returns>
    public override async Task<Country[]> ListCountryAsync()
    {
        var rows = await ListTableRowsAsync(WikiUrl);
        return rows.Select(ParseRow).ToArray();
    }

    /// <summary>Turns the raw markup of one table row into a <see cref="Country"/>.</summary>
    private static Country ParseRow(string row)
    {
        // Cut the row into cells and drop the two opening-tag variants handled here.
        var cells = row.Split("</td>")
            .Select(cell => cell.Replace("<td>", "").Replace("<td align=\"center\">", ""))
            .ToArray();

        var englishName = cells[0].Replace("\n", "");
        if (englishName.Contains("</span>"))
        {
            // Take the second-to-last "</span>"-delimited segment and keep the text
            // that follows its first '>' (presumably the link text after an icon span).
            englishName = englishName.Split("</span>")[^2].Split(">")[1];
        }

        // "Name (Qualifier" is rewritten as "Name, Qualifier"; only the first two pieces are used.
        var nameParts = englishName.Split(" (");
        if (nameParts.Length > 1)
        {
            englishName = $"{nameParts[0]}, {nameParts[1]}";
        }

        return new Country
        {
            Name = englishName,
            TwoLetterCode = ReadCode(cells[1]),
            ThreeLetterCode = ReadCode(cells[2]),
            NumericCode = ReadCode(cells[3]),
            // Text of the last link found in the sixth cell.
            SimplifiedChineseName = cells[5].Split("<a href=").Last().Replace("</a>", "").Split(">")[1].Replace("\n", ""),
            // The seventh cell carries the "table-yes" marker for independent entries.
            Independent = cells[6].Contains("table-yes"),
        };
    }

    /// <summary>Returns the text wrapped in the first <c>&lt;tt&gt;</c> element of a cell, without line breaks.</summary>
    private static string ReadCode(string cell) =>
        cell.Split("<tt>")[1].Split("</tt>")[0].Replace("\n", "");
}

WebCrawler/Processors/Country.cs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
namespace Processors;
2+
3+
/// <summary>
/// One entry of the crawled ISO 3166-1 country table.
/// All text properties default to an empty string.
/// </summary>
public class Country
{
    /// <summary>Name taken from the first table column.</summary>
    public string Name { get; set; } = "";

    /// <summary>Code taken from the second table column.</summary>
    public string TwoLetterCode { get; set; } = "";

    /// <summary>Code taken from the third table column.</summary>
    public string ThreeLetterCode { get; set; } = "";

    /// <summary>Code taken from the fourth table column; kept as text.</summary>
    public string NumericCode { get; set; } = "";

    /// <summary>Chinese name as read from the zh-tw page; empty when not crawled.</summary>
    public string TraditionalChineseName { get; set; } = "";

    /// <summary>Chinese name as read from the zh-cn page; empty when not crawled.</summary>
    public string SimplifiedChineseName { get; set; } = "";

    /// <summary>True when the row's seventh column carries the "table-yes" marker.</summary>
    public bool Independent { get; set; }
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
namespace Processors;
2+
3+
/// <summary>
/// Contract for a crawler that produces the list of countries.
/// </summary>
public interface IProcessor
{
    /// <summary>Asynchronously builds the country list.</summary>
    /// <returns>A task whose result is the array of countries.</returns>
    Task<Country[]> ListCountryAsync();
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
namespace Processors;
2+
3+
/// <summary>
/// Base class for the Wikipedia country-table crawlers. It downloads a page and
/// cuts the sortable wiki table into raw row strings; subclasses turn those rows
/// into <see cref="Country"/> objects.
/// </summary>
public abstract class ProcessorBase : IProcessor
{
    // One client for the lifetime of the process: creating and disposing an
    // HttpClient per request can exhaust sockets, so the instance is shared.
    private static readonly HttpClient s_httpClient = new();

    /// <summary>Address of the page the concrete processor crawls.</summary>
    public abstract string WikiUrl { get; }

    /// <summary>Builds the country list from the processor's page.</summary>
    public abstract Task<Country[]> ListCountryAsync();

    /// <summary>
    /// Downloads <paramref name="url"/> and returns the raw markup of each data row of
    /// the last <c>&lt;table class="wikitable sortable"&gt;</c> found on the page.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// A lazily evaluated sequence of row fragments with <c>&lt;tr&gt;</c>/<c>&lt;/tr&gt;</c>
    /// removed; the blank leading fragment and the column-header row are skipped.
    /// </returns>
    public virtual async Task<IEnumerable<string>> ListTableRowsAsync(string url)
    {
        var webData = await s_httpClient.GetStringAsync(url);

        // Keep only what lies between the last sortable-table opening tag and the
        // first closing tag after it, then drop the tbody wrapper.
        var tbody = webData.Split("<table class=\"wikitable sortable\">").Last()
            .Split("</table>").First()
            .Replace("<tbody>", "").Replace("</tbody>", "");

        // Markup is matched ordinally: culture-sensitive comparison has no meaning for
        // HTML tags and may treat control characters such as '\n' unexpectedly.
        return tbody.Split("<tr>")
            .Select(o => o.Replace("</tr>", ""))
            .Where(o => o != "\n\n" && !o.StartsWith("\n<th scope=\"col\">", StringComparison.Ordinal));
    }
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<TargetFramework>net6.0</TargetFramework>
5+
<ImplicitUsings>enable</ImplicitUsings>
6+
<Nullable>enable</Nullable>
7+
</PropertyGroup>
8+
9+
</Project>
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
using System.Xml.Linq;
2+
3+
namespace Processors;
4+
5+
/// <summary>
/// Reads the country table from the zh-tw variant of the ISO 3166-1 Wikipedia page
/// and produces <see cref="Country"/> entries whose
/// <see cref="Country.TraditionalChineseName"/> is filled in.
/// </summary>
public class TWProcessor : ProcessorBase
{
    /// <summary>Address of the page that is downloaded by <see cref="ListCountryAsync"/>.</summary>
    public override string WikiUrl => "https://zh.wikipedia.org/zh-tw/ISO_3166-1";

    /// <summary>
    /// Fetches the table rows of <see cref="WikiUrl"/> and converts every row into a country.
    /// </summary>
    /// <returns>One <see cref="Country"/> per table row, in page order.</returns>
    public override async Task<Country[]> ListCountryAsync()
    {
        var rows = await ListTableRowsAsync(WikiUrl);
        return rows.Select(ParseRow).ToArray();
    }

    /// <summary>Turns the raw markup of one table row into a <see cref="Country"/>.</summary>
    private static Country ParseRow(string row)
    {
        // Cut the row into cells and drop the two opening-tag variants handled here.
        var cells = row.Split("</td>")
            .Select(cell => cell.Replace("<td>", "").Replace("<td align=\"center\">", ""))
            .ToArray();

        var englishName = cells[0].Replace("\n", "");
        if (englishName.Contains("</span>"))
        {
            // Take the second-to-last "</span>"-delimited segment and keep the text
            // that follows its first '>' (presumably the link text after an icon span).
            englishName = englishName.Split("</span>")[^2].Split(">")[1];
        }

        // "Name (Qualifier" is rewritten as "Name, Qualifier"; only the first two pieces are used.
        var nameParts = englishName.Split(" (");
        if (nameParts.Length > 1)
        {
            englishName = $"{nameParts[0]}, {nameParts[1]}";
        }

        if (englishName.StartsWith("Taiwan"))
        {
            // Drop the qualifier that was appended after the comma.
            englishName = englishName.Split(",")[0];
            // Replace this with ROC or remove this line to meet your need.
            cells[5] = cells[5].Replace("中國台灣省", "台灣");
        }

        return new Country
        {
            Name = englishName,
            TwoLetterCode = ReadCode(cells[1]),
            ThreeLetterCode = ReadCode(cells[2]),
            NumericCode = ReadCode(cells[3]),
            // Text of the last link found in the sixth cell.
            TraditionalChineseName = cells[5].Split("<a href=").Last().Replace("</a>", "").Split(">")[1].Replace("\n", ""),
            // The seventh cell carries the "table-yes" marker for independent entries.
            Independent = cells[6].Contains("table-yes"),
        };
    }

    /// <summary>Returns the text wrapped in the first <c>&lt;tt&gt;</c> element of a cell, without line breaks.</summary>
    private static string ReadCode(string cell) =>
        cell.Split("<tt>")[1].Split("</tt>")[0].Replace("\n", "");
}

WebCrawler/Tests/ProcessorTest.cs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
using Processors;
2+
3+
namespace Tests
{
    /// <summary>
    /// Tests for the crawler processors. Each test calls the processor against its
    /// real <c>WikiUrl</c>, so network access is required, and only checks that
    /// something was returned.
    /// </summary>
    [TestClass]
    public class ProcessorTest
    {
        /// <summary>The row splitter yields at least one row for the zh-tw page.</summary>
        [TestMethod]
        public async Task ListTableRows()
        {
            var crawler = new TWProcessor();

            var tableRows = await crawler.ListTableRowsAsync(crawler.WikiUrl);

            var hasRows = tableRows != null && tableRows.Any();
            Assert.IsTrue(hasRows);
        }

        /// <summary>The zh-tw processor returns a non-empty country list.</summary>
        [TestMethod]
        public async Task ListTWCountries()
        {
            var crawler = new TWProcessor();

            var result = await crawler.ListCountryAsync();

            var hasCountries = result != null && result.Any();
            Assert.IsTrue(hasCountries);
        }

        /// <summary>The zh-cn processor returns a non-empty country list.</summary>
        [TestMethod]
        public async Task ListCNCountries()
        {
            var crawler = new CNProcessor();

            var result = await crawler.ListCountryAsync();

            var hasCountries = result != null && result.Any();
            Assert.IsTrue(hasCountries);
        }
    }
}

WebCrawler/Tests/Tests.csproj

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<TargetFramework>net6.0</TargetFramework>
5+
<ImplicitUsings>enable</ImplicitUsings>
6+
<Nullable>enable</Nullable>
7+
8+
<IsPackable>false</IsPackable>
9+
</PropertyGroup>
10+
11+
<ItemGroup>
12+
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.3.2" />
13+
<PackageReference Include="MSTest.TestAdapter" Version="2.2.10" />
14+
<PackageReference Include="MSTest.TestFramework" Version="2.2.10" />
15+
<PackageReference Include="coverlet.collector" Version="3.1.2" />
16+
</ItemGroup>
17+
18+
<ItemGroup>
19+
<ProjectReference Include="..\Processors\Processors.csproj" />
20+
</ItemGroup>
21+
22+
</Project>

WebCrawler/Tests/Usings.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
global using Microsoft.VisualStudio.TestTools.UnitTesting;

WebCrawler/WebCrawler.sln

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,32 @@
11

22
Microsoft Visual Studio Solution File, Format Version 12.00
3-
# Visual Studio 15
4-
VisualStudioVersion = 15.0.28010.2026
3+
# Visual Studio Version 17
4+
VisualStudioVersion = 17.4.33213.308
55
MinimumVisualStudioVersion = 10.0.40219.1
6-
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "WebCrawler", "WebCrawler\WebCrawler.csproj", "{866C6A6E-4C00-4AD2-85A1-8D3FA4BA1253}"
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Processors", "Processors\Processors.csproj", "{F3B5A318-9E0F-4A45-AC77-A56DB9EA6526}"
7+
EndProject
8+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Tests", "Tests\Tests.csproj", "{38D34094-6FFF-4165-BC44-72C0F015EC67}"
9+
EndProject
10+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "WebCrawler", "WebCrawler\WebCrawler.csproj", "{2FD5DE62-75E5-4A6C-9BD7-F8151E4508EE}"
711
EndProject
812
Global
913
GlobalSection(SolutionConfigurationPlatforms) = preSolution
1014
Debug|Any CPU = Debug|Any CPU
1115
Release|Any CPU = Release|Any CPU
1216
EndGlobalSection
1317
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14-
{866C6A6E-4C00-4AD2-85A1-8D3FA4BA1253}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15-
{866C6A6E-4C00-4AD2-85A1-8D3FA4BA1253}.Debug|Any CPU.Build.0 = Debug|Any CPU
16-
{866C6A6E-4C00-4AD2-85A1-8D3FA4BA1253}.Release|Any CPU.ActiveCfg = Release|Any CPU
17-
{866C6A6E-4C00-4AD2-85A1-8D3FA4BA1253}.Release|Any CPU.Build.0 = Release|Any CPU
18+
{F3B5A318-9E0F-4A45-AC77-A56DB9EA6526}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
19+
{F3B5A318-9E0F-4A45-AC77-A56DB9EA6526}.Debug|Any CPU.Build.0 = Debug|Any CPU
20+
{F3B5A318-9E0F-4A45-AC77-A56DB9EA6526}.Release|Any CPU.ActiveCfg = Release|Any CPU
21+
{F3B5A318-9E0F-4A45-AC77-A56DB9EA6526}.Release|Any CPU.Build.0 = Release|Any CPU
22+
{38D34094-6FFF-4165-BC44-72C0F015EC67}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
23+
{38D34094-6FFF-4165-BC44-72C0F015EC67}.Debug|Any CPU.Build.0 = Debug|Any CPU
24+
{38D34094-6FFF-4165-BC44-72C0F015EC67}.Release|Any CPU.ActiveCfg = Release|Any CPU
25+
{38D34094-6FFF-4165-BC44-72C0F015EC67}.Release|Any CPU.Build.0 = Release|Any CPU
26+
{2FD5DE62-75E5-4A6C-9BD7-F8151E4508EE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
27+
{2FD5DE62-75E5-4A6C-9BD7-F8151E4508EE}.Debug|Any CPU.Build.0 = Debug|Any CPU
28+
{2FD5DE62-75E5-4A6C-9BD7-F8151E4508EE}.Release|Any CPU.ActiveCfg = Release|Any CPU
29+
{2FD5DE62-75E5-4A6C-9BD7-F8151E4508EE}.Release|Any CPU.Build.0 = Release|Any CPU
1830
EndGlobalSection
1931
GlobalSection(SolutionProperties) = preSolution
2032
HideSolutionNode = FALSE

0 commit comments

Comments
 (0)