Skip to content

Commit 802db24

Browse files
authored
Merge pull request #10 from olangness/drew
Drew
2 parents: fb1eb0b + 8474bac; commit: 802db24

File tree

5 files changed

+9
-26
lines changed

5 files changed

+9
-26
lines changed

WebCrawler/Model/Crawler.cs

Lines changed: 5 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -10,8 +10,6 @@
1010
using WebCrawler.Interfaces;
1111
using System.Text.RegularExpressions;
1212
using WebCrawler.Repositories;
13-
using InfinityCrawler.Processing.Content;
14-
using TurnerSoftware.RobotsExclusionTools;
1513

1614
namespace WebCrawler.Model
1715
{
@@ -24,8 +22,9 @@ class Crawler
2422
private static List<Page> _pages = new List<Page>();
2523
private static List<string> _exceptions = new List<string>();
2624
private bool isCurrentPage = true;
27-
//public List<String> Disallows = new List<String>();
2825
private static List<string> urlsWithTopics = new List<string>();
26+
private static List<Link> links = new List<Link>();
27+
private static List<Log> log = new List<Log>();
2928

3029
//Constructor
3130
public Crawler(IRepos externalUrlRepository, IRepos otherUrlRepository, IRepos failedUrlRepository, IRepos currentPageUrlRepository)
@@ -70,6 +69,9 @@ private void CrawlPage(string url)
7069
page.Url = url;
7170

7271
_pages.Add(page);
72+
links.Add(url);
73+
log.Add("New Log Entry: "+url);
74+
7375

7476
//AddUrlToList(topic);
7577

@@ -90,12 +92,6 @@ private void CrawlPage(string url)
9092

9193
isCurrentPage = false;
9294

93-
/*//For each extracted URL
94-
//• Obey robots.txt (freshness caveat)
95-
//c. Check that not already in frontier
96-
var paths = page.OutLinks.Where(x => page.RobotsAreObeyed(x)
97-
&& b.Contains(x) == false);*/
98-
9995
//Crawl all the links found on the page.
10096
foreach (string link in _externalUrlRepository.List)
10197
{
@@ -129,19 +125,6 @@ public static bool PageHasBeenCrawled(string url)
129125
return false;
130126
}
131127

132-
// Adds url containing keyword(s) to a list
133-
private static void AddUrlToList(string topic)
134-
{
135-
foreach (Page page in _pages)
136-
{
137-
if (page.ToString().Contains(topic))
138-
{
139-
urlsWithTopics.Add(page.Url);
140-
}
141-
}
142-
}
143-
144-
145128
// Fixes a path. Makes sure it is a fully functional absolute url
146129
public static string FixPath(string originatingUrl, string link)
147130
{

WebCrawler/Model/Data Storage/Data.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
using System.Text;
55
using System.Threading.Tasks;
66

7-
namespace WindowsFormsApp1
7+
namespace WebCrawler.Model
88
{
99
class Data
1010
{

WebCrawler/Model/Data Storage/Link.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
using System.Text;
55
using System.Threading.Tasks;
66

7-
namespace WindowsFormsApp1
7+
namespace WebCrawler.Model
88
{
99
class Link
1010
{

WebCrawler/Model/Data Storage/Log.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
using System.Text;
55
using System.Threading.Tasks;
66

7-
namespace WindowsFormsApp1
7+
namespace WebCrawler.Model
88
{
99
class Log
1010
{

WebCrawler/Model/Data Storage/Topic.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
using System.Text;
55
using System.Threading.Tasks;
66

7-
namespace WindowsFormsApp1
7+
namespace WebCrawler.Model
88
{
99
class Topic
1010
{

0 commit comments

Comments
 (0)