Commit 9239a64

Merge pull request #14 from olangness/WhateverYouWant
Whatever you want
2 parents 08160f6 + 7e1576a

File tree

1 file changed (+60 -6 lines)

WebCrawler/Model/Crawler.cs

Lines changed: 60 additions & 6 deletions
@@ -79,11 +79,11 @@ public void InitializeCrawl(string key, string url)
         {
             //var crawlUrl = ConfigurationManager.AppSettings["url"];
             Configuration configuration = ConfigurationManager.OpenExeConfiguration(ConfigurationUserLevel.None);
-            configuration.AppSettings.Settings[key].Value = url;
+
+            configuration.AppSettings.Settings["url"].Value = url;
             configuration.Save();
-
             ConfigurationManager.RefreshSection("appSettings");
-            //CrawlPage(ConfigurationManager.AppSettings["url"]);
+            CrawlPage(ConfigurationManager.AppSettings["url"]);
         }
 
         /*//Initialisting the reporting
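Note on this hunk: the commit hard-codes the "url" app-setting key in place of the `key` parameter and un-comments the `CrawlPage` call, so the crawl now starts as soon as the new value is saved and the cached section is refreshed. For reference, here is a minimal standalone sketch of the same save-and-refresh pattern; the `AppSettingDemo` class and the example value are illustrative, not part of the commit, and it assumes an App.config that already contains an appSettings entry named "url".

using System;
using System.Configuration; // requires a reference to System.Configuration

class AppSettingDemo
{
    static void Main()
    {
        // Open the executable's own config file for writing.
        Configuration configuration =
            ConfigurationManager.OpenExeConfiguration(ConfigurationUserLevel.None);

        // Overwrite the stored value and persist it to disk.
        // (Settings["url"] is null if App.config has no such entry.)
        configuration.AppSettings.Settings["url"].Value = "https://example.com";
        configuration.Save();

        // ConfigurationManager caches appSettings at first read;
        // without this refresh it would keep serving the old value.
        ConfigurationManager.RefreshSection("appSettings");

        Console.WriteLine(ConfigurationManager.AppSettings["url"]);
    }
}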
@@ -97,11 +97,17 @@ public void InitilizeCreateReport()
 
             //Environment.Exit(0);
         }*/
-
+        int depthLimit = 0;
         private void CrawlPage(string url)
         {
+
             if (!PageHasBeenCrawled(url))
             {
+                if(depthLimit++ >= 5)
+                {
+                    return;
+                }
+
                 var htmlText = GetWebText(url);
 
                 var linkParser = new LinkParser();
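Note on this hunk: `depthLimit` is an instance field that is incremented on every call and never reset, so `depthLimit++ >= 5` appears to cap the crawl at five entries into `CrawlPage` per crawler instance in total, rather than at five levels of recursion. A hypothetical alternative, not in the commit, is to pass the depth down the recursion so the limit bounds actual nesting; the `DepthDemo` class and its dummy child links below are illustrative only.

using System;
using System.Collections.Generic;

class DepthDemo
{
    // depth counts how many levels deep this call is; each recursive
    // call passes depth + 1, so siblings do not consume the budget.
    static void Crawl(string url, int depth)
    {
        if (depth >= 5)
        {
            return; // stop descending past five levels
        }

        Console.WriteLine(new string(' ', depth * 2) + url);

        // Stand-in for "links found on the page".
        foreach (string child in new List<string> { url + "/a", url + "/b" })
        {
            Crawl(child, depth + 1);
        }
    }

    static void Main()
    {
        Crawl("https://example.com", 0);
    }
}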
@@ -119,35 +119,83 @@ private void CrawlPage(string url)
                 //Add data to main data lists
                 if (isCurrentPage)
                 {
-                    AddRangeButNoDuplicates(_currentPageUrlRepository.List, linkParser.ExternalUrls);
+                    AddRangeButNoDuplicates(_currentPageUrlRepository.List, linkParser.GoodUrls);
                 }
 
                 AddRangeButNoDuplicates(_externalUrlRepository.List, linkParser.ExternalUrls);
                 AddRangeButNoDuplicates(_otherUrlRepository.List, linkParser.OtherUrls);
                 AddRangeButNoDuplicates(_failedUrlRepository.List, linkParser.BadUrls);
 
+                foreach(var urlToParse in linkParser.GoodUrls)
+                {
+                    links.Add(new Link("Page Title", urlToParse));
+                    log.Add(new Log($"New Entry: {url}", DateTime.Now));
+
+                }
+                if (linkParser.GoodUrls.Count == 0)
+                {
+                    links.Add(new Link("Page Title", "==============No New URLs From Page=============="));
+                }
+
                 foreach (string exception in linkParser.Exceptions)
                     _exceptions.Add(exception);
 
                 isCurrentPage = false;
-
+                Console.WriteLine(linkParser.GoodUrls);
+                Console.WriteLine(_externalUrlRepository.List);
                 //Crawl all the links found on the page.
+                int loopBreak1 = 0;
                 foreach (string link in _externalUrlRepository.List)
                 {
                     string formattedLink = link;
+                    loopBreak1++;
                     try
                     {
                         formattedLink = FixPath(url, formattedLink);
 
                         if (formattedLink != String.Empty)
                         {
+                            links.Add(new Link(" ", "==============Crawling to new external page " + link + "...=============="));
                             CrawlPage(formattedLink);
                         }
                     }
                     catch (Exception exc)
                     {
                         _failedUrlRepository.List.Add(formattedLink + " (on page at url " + url + ") - " + exc.Message);
                     }
+                    if (loopBreak1 >= 100)
+                    {
+                        break;
+                    }
+                }
+                int loopBreak = 0;
+                foreach (string iLink in linkParser.GoodUrls)
+                {
+                    string formattediLink = iLink;
+                    loopBreak++;
+                    try
+                    {
+                        formattediLink = FixPath(url, formattediLink);
+
+                        if (formattediLink != String.Empty)
+                        {
+
+                            links.Add(new Link(" ", "==============Crawling to new internal page " + iLink + "...=============="));
+                            CrawlPage(formattediLink);
+                        }
+                    }
+                    catch (Exception exc)
+                    {
+                        _failedUrlRepository.List.Add(formattediLink + " (on page at url " + url + ") - " + exc.Message);
+                    }
+                    if (loopBreak >= 5)
+                    {
+                        break;
+                    }
+                    /*if (formattediLink == "/covid19")
+                    {
+                        break;
+                    }*/
                 }
             }
         }
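Note on this hunk: the two new `Console.WriteLine` diagnostics are passed whole collections, and since `List<T>` does not override `ToString()`, they print the type name rather than the URLs. The new counters then cap each page at 100 external and 5 internal links before breaking out of the loops. A minimal sketch of the `WriteLine` behaviour and a common fix, assuming the link lists are `List<string>` (which the diff suggests but does not show):

using System;
using System.Collections.Generic;

class WriteLineDemo
{
    static void Main()
    {
        var goodUrls = new List<string> { "https://a.example", "https://b.example" };

        // Prints "System.Collections.Generic.List`1[System.String]",
        // because List<T> inherits Object.ToString().
        Console.WriteLine(goodUrls);

        // Prints the actual URLs, one per line.
        Console.WriteLine(string.Join(Environment.NewLine, goodUrls));
    }
}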
