22
33namespace App \Crawler ;
44
5- use App \Url ;
65use App \Search ;
7- use App \Resource ;
86use Hedii \Extractors \Extractor ;
97
108class Crawler
@@ -16,27 +14,13 @@ class Crawler
1614 */
1715 protected $ search ;
1816
19- /**
20- * The Eloquent model for the search entry point url.
21- *
22- * @var \App\Url
23- */
24- protected $ entryPoint ;
25-
2617 /**
2718 * The search's domain name.
2819 *
2920 * @var string
3021 */
3122 protected $ domainName ;
3223
33- /**
34- * Whether or not the search has to be limited to a domain name.
35- *
36- * @var bool
37- */
38- protected $ domainLimit ;
39-
4024 /**
4125 * @var \Hedii\Extractors\Extractor
4226 */
@@ -62,52 +46,51 @@ public function __construct(Extractor $extractor)
public function run(Search $search)
{
    // Entry point of the crawler: remember the search, crawl its entrypoint
    // address, then work through every url stored for the search that has
    // not been crawled yet. Always returns false.
    $this->search = $search;

    $entrypoint = $this->search->entrypoint;
    $this->domainName = $this->getDomainName($entrypoint);

    // The entrypoint is a plain address rather than a stored url record,
    // hence the explicit flag.
    $this->crawl($entrypoint, true);

    // Keep fetching the next pending url until none is left.
    for (
        $pending = $this->getNextNotCrawledUrl();
        $pending;
        $pending = $this->getNextNotCrawledUrl()
    ) {
        $this->crawl($pending);

        // Stop early when the search was deleted or marked as finished
        // while this url was being crawled.
        if ($this->searchIsDeletedOrFinished()) {
            return false;
        }
    }

    // Nothing left to crawl: flag the search as finished.
    $this->search->update(['finished' => true]);

    return false;
}
7569
7670 /**
77- * All the logic for this class .
71+ * Crawl an url and extract resources .
7872 *
79- * @return bool
73+ * @param mixed $url
74+ * @param bool $entrypoint
8075 */
81- protected function crawl ()
protected function crawl($url, $entrypoint = false)
{
    // Extract urls and emails from one page and store them.
    //
    // $url        the address itself when $entrypoint is true, otherwise
    //             an object exposing ->name and ->update()
    // $entrypoint whether $url is the search's entrypoint address
    $address = $entrypoint ? $url : $url->name;

    $found = $this->extractor
        ->searchFor(['urls', 'emails'])
        ->at($address)
        ->get();

    $this->storeUrls($found['urls']);
    $this->storeEmails($found['emails']);

    // Only stored url records carry a "crawled" flag; the entrypoint is
    // passed as a bare address and has nothing to update.
    if (!$entrypoint) {
        $url->update(['crawled' => true]);
    }
}
11295
11396 /**
@@ -126,21 +109,17 @@ protected function storeUrls($urls)
126109 // if url is not a valid url, continue
127110 (!$ this ->isValidUrl ($ url )) ||
128111 // or, if domainLimit, get only the same domain urls
129- ($ this ->domainLimit && ($ this ->getDomainName ($ url ) !== $ this ->domainName )) ||
112+ ($ this ->search -> domain_limit && ($ this ->getDomainName ($ url ) !== $ this ->domainName )) ||
130113 // we don't want media files like images
131114 $ this ->isMediaFile ($ url )
132115 ) {
133116 continue ;
134117 }
135118
136- $ theUrl = new Url ();
137- $ theUrl ->name = $ url ;
138- $ theUrl ->crawled = false ;
139- $ theUrl ->user_id = $ this ->search ->user_id ;
140-
141- if (!$ this ->searchHasUrl ($ url )) {
142- $ this ->search ->urls ()->save ($ theUrl );
143- }
119+ $ this ->search ->urls ()->firstOrCreate ([
120+ 'name ' => $ url ,
121+ 'user_id ' => $ this ->search ->user_id
122+ ]);
144123 }
145124 }
146125
@@ -161,67 +140,31 @@ protected function storeEmails($emails)
161140 continue ;
162141 }
163142
164- $ resource = new Resource ();
165- $ resource ->type = $ this ->search ->type ;
166- $ resource ->name = $ email ;
167- $ resource ->user_id = $ this ->search ->user_id ;
168- $ resource ->search_id = $ this ->search ->id ;
169-
170- if (!$ this ->searchHasEmail ($ email )) {
171- $ this ->search ->resources ()->save ($ resource );
172- }
143+ $ this ->search ->resources ()->firstOrCreate ([
144+ 'type ' => $ this ->search ->type ,
145+ 'name ' => $ email ,
146+ 'user_id ' => $ this ->search ->user_id
147+ ]);
173148 }
174149 }
175150
176151 return $ this ;
177152 }
178153
179154 /**
180- * Check if the search already has this url.
181- *
182- * @param string $url
183- * @return bool
184- */
185- protected function searchHasUrl ($ url )
186- {
187- return $ this ->search ->urls ()
188- ->where (['name ' => $ url ])
189- ->first ();
190- }
191-
192- /**
193- * Check if the search already has this email.
155+ * Check if the search has been deleted or marked as finished.
194156 *
195- * @param string $email
196157 * @return bool
197158 */
198- protected function searchHasEmail ( $ email )
protected function searchIsDeletedOrFinished()
{
    // Re-read the search from the database and report whether crawling
    // should stop: true when the row no longer exists or when its
    // "finished" attribute is truthy, false otherwise.
    //
    // find() already yields a single model or null. Chaining ->first()
    // onto it fails on null (a deleted search) and, on a model, starts a
    // new unconstrained query instead of inspecting this search.
    $fresh = Search::find($this->search->id);

    if ($fresh === null) {
        return true;
    }

    return (bool) $fresh->finished;
}
226169
227170 /**
@@ -231,35 +174,9 @@ protected function searchHasNotCrawledUrl()
231174 */
protected function getNextNotCrawledUrl()
{
    // Return the first url of the current search whose "crawled" flag is
    // false, or whatever first() yields when no such url is left.
    $notCrawled = $this->search->urls()->where(['crawled' => false]);

    return $notCrawled->first();
}
264181
265182 /**
@@ -296,14 +213,21 @@ protected function isValidEmail($email)
296213 }
297214
298215 /**
299- * Remove unwanted character at the end of the url string.
216+ * Remove unwanted character at the end of the url string,
217+ * and remove anchors in the url.
300218 *
301219 * @param string $url
302220 * @return string
303221 */
protected function cleanUrl($url)
{
    // Normalise a url so that variants of the same page compare equal:
    // drop the anchor (everything from the first "#"), then remove any
    // "/" or "?" characters left dangling at the end.
    //
    // The anchor has to go first: trimming "/" beforehand leaves the slash
    // in place for input such as "http://host/page/#top".
    $anchorAt = strpos($url, '#');

    // Compare against false explicitly: position 0 is a valid match, and
    // an anchor such as "#0" must be removed like any other.
    if ($anchorAt !== false) {
        $url = substr($url, 0, $anchorAt);
    }

    return rtrim($url, '/?');
}
308232
309233 /**
0 commit comments