@@ -79,11 +79,11 @@ public void InitializeCrawl(string key, string url)
 {
     //var crawlUrl = ConfigurationManager.AppSettings["url"];
     Configuration configuration = ConfigurationManager.OpenExeConfiguration(ConfigurationUserLevel.None);
-    configuration.AppSettings.Settings[key].Value = url;
+
+    configuration.AppSettings.Settings["url"].Value = url;
     configuration.Save();
-
     ConfigurationManager.RefreshSection("appSettings");
-    // CrawlPage(ConfigurationManager.AppSettings["url"]);
+    CrawlPage(ConfigurationManager.AppSettings["url"]);
 }

 /*//Initialising the reporting
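
The sequence in this first hunk (open the exe configuration, mutate Settings, Save, RefreshSection) is the standard System.Configuration pattern for rewriting an appSettings value at runtime; note that the replacement line hard-codes the "url" key, leaving InitializeCrawl's key parameter unused. A minimal sketch of the same pattern as a reusable helper, assuming the target key may be absent (the Settings indexer returns null for a missing key):

using System.Configuration; // assembly reference: System.Configuration

// Hypothetical helper, not part of the committed code.
static void SetAppSetting(string key, string value)
{
    Configuration config =
        ConfigurationManager.OpenExeConfiguration(ConfigurationUserLevel.None);

    KeyValueConfigurationElement setting = config.AppSettings.Settings[key];
    if (setting == null)
        config.AppSettings.Settings.Add(key, value); // key absent: create it
    else
        setting.Value = value;

    config.Save(ConfigurationSaveMode.Modified);
    // Force ConfigurationManager.AppSettings to re-read the saved file.
    ConfigurationManager.RefreshSection("appSettings");
}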
@@ -97,11 +97,17 @@ public void InitilizeCreateReport()

     //Environment.Exit(0);
 }*/
-
+int depthLimit = 0;
 private void CrawlPage(string url)
 {
+
     if (!PageHasBeenCrawled(url))
     {
+        if (depthLimit++ >= 5)
+        {
+            return;
+        }
+
         var htmlText = GetWebText(url);

         var linkParser = new LinkParser();
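
A note on the depthLimit added in this hunk: it is an instance field incremented on every call and never reset, so it acts as a global budget (at most five pages crawled per crawler instance) rather than a recursion-depth limit. If per-branch depth is the intent, a common alternative is to thread the depth through the recursion; a sketch under that assumption, reusing the helpers visible in this diff:

// Sketch only: depth passed down the recursion instead of a shared field.
private const int MaxDepth = 5; // same cutoff as depthLimit above

private void CrawlPage(string url, int depth)
{
    // Each branch of the crawl stops MaxDepth levels below the start page.
    if (depth >= MaxDepth || PageHasBeenCrawled(url))
        return;

    var htmlText = GetWebText(url);
    // ... parse htmlText and collect links as in the hunk below, then
    // recurse with CrawlPage(childUrl, depth + 1);
}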
@@ -119,35 +125,83 @@ private void CrawlPage(string url)
         //Add data to main data lists
         if (isCurrentPage)
         {
-            AddRangeButNoDuplicates(_currentPageUrlRepository.List, linkParser.ExternalUrls);
+            AddRangeButNoDuplicates(_currentPageUrlRepository.List, linkParser.GoodUrls);
         }

         AddRangeButNoDuplicates(_externalUrlRepository.List, linkParser.ExternalUrls);
         AddRangeButNoDuplicates(_otherUrlRepository.List, linkParser.OtherUrls);
         AddRangeButNoDuplicates(_failedUrlRepository.List, linkParser.BadUrls);

+        foreach (var urlToParse in linkParser.GoodUrls)
+        {
+            links.Add(new Link("Page Title", urlToParse));
+            log.Add(new Log($"New Entry: {url}", DateTime.Now));
+
+        }
+        if (linkParser.GoodUrls.Count == 0)
+        {
+            links.Add(new Link("Page Title", "==============No New URLs From Page=============="));
+        }
+
         foreach (string exception in linkParser.Exceptions)
             _exceptions.Add(exception);

         isCurrentPage = false;
-
+        Console.WriteLine(linkParser.GoodUrls);
+        Console.WriteLine(_externalUrlRepository.List);
         //Crawl all the links found on the page.
+        int loopBreak1 = 0;
         foreach (string link in _externalUrlRepository.List)
         {
             string formattedLink = link;
+            loopBreak1++;
             try
             {
                 formattedLink = FixPath(url, formattedLink);

                 if (formattedLink != String.Empty)
                 {
+                    links.Add(new Link(" ", "==============Crawling to new external page " + link + "...=============="));
                     CrawlPage(formattedLink);
                 }
             }
             catch (Exception exc)
             {
                 _failedUrlRepository.List.Add(formattedLink + " (on page at url " + url + ") - " + exc.Message);
             }
+            if (loopBreak1 >= 100)
+            {
+                break;
+            }
+        }
+        int loopBreak = 0;
+        foreach (string iLink in linkParser.GoodUrls)
+        {
+            string formattediLink = iLink;
+            loopBreak++;
+            try
+            {
+                formattediLink = FixPath(url, formattediLink);
+
+                if (formattediLink != String.Empty)
+                {
+
+                    links.Add(new Link(" ", "==============Crawling to new internal page " + iLink + "...=============="));
+                    CrawlPage(formattediLink);
+                }
+            }
+            catch (Exception exc)
+            {
+                _failedUrlRepository.List.Add(formattediLink + " (on page at url " + url + ") - " + exc.Message);
+            }
+            if (loopBreak >= 5)
+            {
+                break;
+            }
+            /*if (formattediLink == "/covid19")
+            {
+                break;
+            }*/
         }
     }
 }
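
Taken together, depthLimit, loopBreak1, and loopBreak cap the crawl at five pages overall, at most 100 external links per page, and at most five internal links per page. An explicit work queue is a common way to express the same limits without recursion or per-loop counters. The sketch below is illustrative only: it reuses GetWebText, FixPath, and LinkParser.GoodUrls from the diff, but the Parse call is hypothetical, since LinkParser's parsing entry point is not shown in these hunks.

// Illustrative alternative, not the committed code: a breadth-first crawl
// with an explicit frontier, a visited set, and a single page budget.
// Needs: using System; using System.Collections.Generic;
private void CrawlIteratively(string startUrl, int maxPages, int maxDepth)
{
    var visited = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var frontier = new Queue<(string Url, int Depth)>();
    frontier.Enqueue((startUrl, 0));

    while (frontier.Count > 0 && visited.Count < maxPages)
    {
        var (url, depth) = frontier.Dequeue();
        if (depth > maxDepth || !visited.Add(url))
            continue; // too deep on this branch, or already crawled

        var htmlText = GetWebText(url);
        var linkParser = new LinkParser();
        linkParser.Parse(htmlText); // hypothetical: the real API is not shown

        foreach (string link in linkParser.GoodUrls)
        {
            string formatted = FixPath(url, link);
            if (formatted != String.Empty)
                frontier.Enqueue((formatted, depth + 1));
        }
    }
}

Because every URL passes through the visited set exactly once, the maxPages and maxDepth parameters subsume the three separate counters, and duplicate-page suppression (the job PageHasBeenCrawled does above) comes for free.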