Skip to content

Commit f0b9320

Browse files
committed
Dynamic web scraping done
1 parent 67178ac commit f0b9320

File tree

11 files changed

+669
-0
lines changed

11 files changed

+669
-0
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,10 @@
55
*.ear
66
/CSharp/IWAAuthWebServer/bin/Debug
77
/CSharp/IWAAuthWebServer/obj/Debug
8+
/CSharp/DynamicWebScraping/obj
9+
/CSharp/DynamicWebScraping/bin
10+
/CSharp/DynamicWebScraping/.vs
11+
/CSharp/WebScraping/bin
12+
/CSharp/WebScraping/obj
13+
/CSharp/WebScraping/.vscode
14+
/CSharp/WebScraping/.vs
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>netcoreapp3.1</TargetFramework>
6+
</PropertyGroup>
7+
8+
<ItemGroup>
9+
<PackageReference Include="DotNetSeleniumExtras.WaitHelpers" Version="3.11.0" />
10+
<PackageReference Include="Selenium.WebDriver" Version="3.141.0" />
11+
<PackageReference Include="Selenium.WebDriver.ChromeDriver" Version="85.0.4183.8700" />
12+
</ItemGroup>
13+
14+
<ItemGroup>
15+
<None Update="page.html">
16+
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
17+
</None>
18+
</ItemGroup>
19+
20+
</Project>
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
using OpenQA.Selenium;
2+
using OpenQA.Selenium.Chrome;
3+
using OpenQA.Selenium.Support.UI;
4+
using SeleniumExtras.WaitHelpers;
5+
using System;
6+
using System.IO;
7+
using System.Reflection;
8+
9+
namespace DynamicWebScraping
10+
{
11+
/// <summary>
/// Console entry point that demonstrates scraping a page whose content is
/// created by JavaScript, using Selenium with the Chrome driver.
/// </summary>
class Program
{
    /// <summary>
    /// Runs the scrape once and then waits for a line of input so the
    /// console window stays open.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Scrape();
        Console.ReadLine();
    }

    /// <summary>
    /// Opens the local <c>page.html</c> in Chrome, clicks the element with id
    /// <c>heading1</c>, waits up to ten seconds for an element with id
    /// <c>heading2</c> to exist, and writes its text content to the console.
    /// </summary>
    public static void Scrape()
    {
        // page.html is copied to the output directory by the project file, so
        // resolve it against the application's base directory. Building the
        // address through Uri (instead of concatenating "file://" with a raw
        // path) yields a well-formed, escaped file URI on every platform,
        // including Windows drive paths and directories containing spaces.
        string pagePath = Path.Combine(AppContext.BaseDirectory, "page.html");
        string pageUrl = new Uri(pagePath).AbsoluteUri;

        ChromeOptions options = new ChromeOptions();
        using (IWebDriver driver = new ChromeDriver(options))
        {
            WebDriverWait wait = new WebDriverWait(driver, TimeSpan.FromSeconds(10));
            driver.Navigate().GoToUrl(pageUrl);

            // Clicking the first heading triggers the page script that adds
            // the second heading to the document.
            driver.FindElement(By.Id("heading1")).Click();

            IWebElement firstResult = wait.Until(ExpectedConditions.ElementExists(By.Id("heading2")));
            Console.WriteLine(firstResult.GetAttribute("textContent"));
        }
    }
}
32+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Dynamic Web Scraping
2+
3+
Follow my tutorial [here](TBD) :-)
4+
5+
## Instructions to Run the Project
6+
7+
1. Navigate in the console to the project directory.
8+
1. Execute `dotnet restore`.
9+
1. Execute `dotnet run`.
10+
1. Enjoy life.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<!DOCTYPE html>
2+
3+
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
4+
<head>
5+
<meta charset="utf-8" />
6+
<title>Sample</title>
7+
<script>
8+
/**
 * Creates an <h2 id="heading2"> element containing the text "World" and
 * appends it to the element whose id is "body".
 */
function addMoreContent() {
    var newElement = document.createElement("h2");
    newElement.id = "heading2";
    newElement.appendChild(document.createTextNode("World"));
    document.getElementById("body").appendChild(newElement);
}
15+
</script>
16+
</head>
17+
<body id="body">
18+
<h1 id="heading1" onclick="addMoreContent()" style="cursor:pointer">Hello</h1>
19+
</body>
20+
</html>

CSharp/WebScraping/Method1.cs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
using HtmlAgilityPack;
2+
using System;
3+
using System.Linq;
4+
5+
namespace Vainolo.WebScraping
6+
{
7+
/// <summary>
/// Scrapes headlines by walking the HTML node tree by hand.
/// </summary>
class Method1
{
    /// <summary>
    /// Loads <c>WebScraping.html</c>, finds the element with id
    /// <c>Techniques</c>, and then visits the nodes that follow its parent in
    /// document order until an <c>h2</c> node is reached, writing the inner
    /// text of every node carrying the <c>mw-headline</c> class to the console.
    /// Writes nothing when the <c>Techniques</c> element is not present, and
    /// stops at the end of the document when no later <c>h2</c> exists.
    /// </summary>
    public static void Scrape()
    {
        var page = new HtmlDocument();
        page.Load("WebScraping.html");

        var techniquesTitle = page.GetElementbyId("Techniques");
        if (techniquesTitle == null || techniquesTitle.ParentNode == null)
        {
            // Nothing to anchor the walk on.
            return;
        }

        var currNode = techniquesTitle.ParentNode.NextSibling;
        while (currNode != null && currNode.Name != "h2")
        {
            if (currNode.GetClasses().Contains("mw-headline"))
            {
                Console.WriteLine(currNode.InnerText);
            }

            if (currNode.HasChildNodes)
            {
                // Pre-order walk: descend before moving sideways.
                currNode = currNode.FirstChild;
                continue;
            }

            // Leaf node: climb until some ancestor (or the node itself) has a
            // following sibling, then continue from that sibling. Running out
            // of ancestors means the document ended without another h2.
            while (currNode != null && currNode.NextSibling == null)
            {
                currNode = currNode.ParentNode;
            }

            if (currNode != null)
            {
                currNode = currNode.NextSibling;
            }
        }
    }
}
41+
}

CSharp/WebScraping/Method2.cs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
using HtmlAgilityPack;
2+
using System;
3+
using System.Linq;
4+
5+
namespace Vainolo.WebScraping
6+
{
7+
/// <summary>
/// Scrapes headlines with a LINQ query over the flattened node sequence.
/// </summary>
class Method2
{
    /// <summary>
    /// Loads <c>WebScraping.html</c> and writes to the console the inner text
    /// of every <c>mw-headline</c> node found after the node with id
    /// <c>Techniques</c> and before the next <c>h2</c> node.
    /// </summary>
    public static void Scrape()
    {
        var document = new HtmlDocument();
        document.Load("WebScraping.html");

        // Descendants are enumerated in document order: drop everything up to
        // and including the "Techniques" node, then keep nodes until an h2.
        var sectionNodes = document.DocumentNode
            .Descendants()
            .SkipWhile(node => node.Id != "Techniques")
            .Skip(1)
            .TakeWhile(node => node.Name != "h2");

        var headlines = sectionNodes
            .Where(node => node.GetClasses().Contains("mw-headline"))
            .Select(node => node.InnerText);

        foreach (var headline in headlines)
        {
            Console.WriteLine(headline);
        }
    }
}
25+
}

CSharp/WebScraping/Program.cs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
using System;
2+
using HtmlAgilityPack;
3+
using System.Linq;
4+
5+
namespace Vainolo.WebScraping
6+
{
7+
/// <summary>
/// Console entry point that runs both scraping approaches one after the other.
/// </summary>
class Program
{
    /// <summary>
    /// Runs <c>Method1.Scrape</c> and then <c>Method2.Scrape</c>, each under
    /// its own banner, and waits for a line of input before exiting.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        RunWithBanner("Scraping using method 1", Method1.Scrape);
        Console.WriteLine("");
        RunWithBanner("Scraping using method 2", Method2.Scrape);
        Console.ReadLine();
    }

    /// <summary>
    /// Writes a title and an underline to the console, then invokes the scraper.
    /// </summary>
    private static void RunWithBanner(string title, Action scrape)
    {
        Console.WriteLine(title);
        Console.WriteLine("-----------------------");
        scrape();
    }
}
21+
}

CSharp/WebScraping/Readme.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Web Scraping
2+
3+
Follow my tutorial [here](https://www.vainolo.com/2020/05/06/scraping-web-pages-with-c-and-htmlagilitypack/) :-)
4+
5+
## Instructions to Run the Project
6+
7+
1. Navigate in the console to the project directory.
8+
1. Execute `dotnet restore`.
9+
1. Execute `dotnet run`.
10+
1. Enjoy life.
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>netcoreapp3.1</TargetFramework>
6+
</PropertyGroup>
7+
8+
<ItemGroup>
9+
<PackageReference Include="HtmlAgilityPack" Version="1.11.23" />
10+
</ItemGroup>
11+
12+
<ItemGroup>
13+
<None Update="WebScraping.html">
14+
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
15+
</None>
16+
</ItemGroup>
17+
18+
</Project>

0 commit comments

Comments
 (0)