Skip to content
This repository was archived by the owner on May 21, 2021. It is now read-only.

Commit 6d285be

Browse files
committed
Merge pull request #5 from hedii/performance
Performance
2 parents 095ab87 + e62bff3 commit 6d285be

19 files changed

+246
-172
lines changed

app/Crawler/Crawler.php

Lines changed: 55 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@
22

33
namespace App\Crawler;
44

5-
use App\Url;
65
use App\Search;
7-
use App\Resource;
86
use Hedii\Extractors\Extractor;
97

108
class Crawler
@@ -16,27 +14,13 @@ class Crawler
1614
*/
1715
protected $search;
1816

19-
/**
20-
* The Eloquent model for the search entry point url.
21-
*
22-
* @var \App\Url
23-
*/
24-
protected $entryPoint;
25-
2617
/**
2718
* The search's domain name.
2819
*
2920
* @var string
3021
*/
3122
protected $domainName;
3223

33-
/**
34-
* Whether or not the search has to be limited to a domain name.
35-
*
36-
* @var bool
37-
*/
38-
protected $domainLimit;
39-
4024
/**
4125
* @var \Hedii\Extractors\Extractor
4226
*/
@@ -62,52 +46,51 @@ public function __construct(Extractor $extractor)
6246
public function run(Search $search)
6347
{
6448
$this->search = $search;
49+
$this->domainName = $this->getDomainName($this->search->entrypoint);
6550

66-
if (!$this->searchHasBeenDeleted()) {
67-
$this->domainName = $this->getDomainName($this->search->entrypoint);
68-
$this->domainLimit = (bool) $this->search->domain_limit;
51+
// crawl search's entrypoint url
52+
$this->crawl($this->search->entrypoint, true);
6953

70-
return $this->crawl();
54+
// crawl all search's urls
55+
while ($url = $this->getNextNotCrawledUrl()) {
56+
$this->crawl($url);
57+
58+
// stop if the search has been deleted or marked as finished during the crawl process
59+
if ($this->searchIsDeletedOrFinished()) {
60+
return false;
61+
}
7162
}
7263

64+
// this search is finished!
65+
$this->search->update(['finished' => true]);
66+
7367
return false;
7468
}
7569

7670
/**
77-
* All the logic for this class.
71+
* Crawl a URL and extract resources.
7872
*
79-
* @return bool
73+
* @param mixed $url
74+
* @param bool $entrypoint
8075
*/
81-
protected function crawl()
76+
protected function crawl($url, $entrypoint = false)
8277
{
83-
$resources = $this->extractor->searchFor(['urls', 'emails'])
84-
->at($this->search->entrypoint)
85-
->get();
86-
87-
$this->storeUrls($resources['urls']);
88-
$this->storeEmails($resources['emails']);
78+
if ($entrypoint) {
79+
$resources = $this->extractor->searchFor(['urls', 'emails'])
80+
->at($url)
81+
->get();
8982

90-
// crawl all search's url
91-
while ($this->searchHasNotCrawledUrl()) {
92-
$url = $this->getNextNotCrawledUrl();
83+
$this->storeUrls($resources['urls']);
84+
$this->storeEmails($resources['emails']);
85+
} else {
9386
$resources = $this->extractor->searchFor(['urls', 'emails'])
9487
->at($url->name)
9588
->get();
9689

9790
$this->storeUrls($resources['urls']);
9891
$this->storeEmails($resources['emails']);
99-
$this->markAsCrawled($url);
100-
101-
// check if search has been deleted
102-
if ($this->searchHasBeenDeleted()) {
103-
return false;
104-
}
92+
$url->update(['crawled' => true]);
10593
}
106-
107-
// this search is finished!
108-
$this->markAsFinished($this->search);
109-
110-
return false;
11194
}
11295

11396
/**
@@ -126,21 +109,17 @@ protected function storeUrls($urls)
126109
// if url is not a valid url, continue
127110
(!$this->isValidUrl($url)) ||
128111
// or, if domainLimit, get only the same domain urls
129-
($this->domainLimit && ($this->getDomainName($url) !== $this->domainName)) ||
112+
($this->search->domain_limit && ($this->getDomainName($url) !== $this->domainName)) ||
130113
// we don't want media files like images
131114
$this->isMediaFile($url)
132115
) {
133116
continue;
134117
}
135118

136-
$theUrl = new Url();
137-
$theUrl->name = $url;
138-
$theUrl->crawled = false;
139-
$theUrl->user_id = $this->search->user_id;
140-
141-
if (!$this->searchHasUrl($url)) {
142-
$this->search->urls()->save($theUrl);
143-
}
119+
$this->search->urls()->firstOrCreate([
120+
'name' => $url,
121+
'user_id' => $this->search->user_id
122+
]);
144123
}
145124
}
146125

@@ -161,67 +140,31 @@ protected function storeEmails($emails)
161140
continue;
162141
}
163142

164-
$resource = new Resource();
165-
$resource->type = $this->search->type;
166-
$resource->name = $email;
167-
$resource->user_id = $this->search->user_id;
168-
$resource->search_id = $this->search->id;
169-
170-
if (!$this->searchHasEmail($email)) {
171-
$this->search->resources()->save($resource);
172-
}
143+
$this->search->resources()->firstOrCreate([
144+
'type' => $this->search->type,
145+
'name' => $email,
146+
'user_id' => $this->search->user_id
147+
]);
173148
}
174149
}
175150

176151
return $this;
177152
}
178153

179154
/**
180-
* Check if the search already has this url.
181-
*
182-
* @param string $url
183-
* @return bool
184-
*/
185-
protected function searchHasUrl($url)
186-
{
187-
return $this->search->urls()
188-
->where(['name' => $url])
189-
->first();
190-
}
191-
192-
/**
193-
* Check if the search already has this email.
155+
* Check if the search has been deleted or marked as finished.
194156
*
195-
* @param string $email
196157
* @return bool
197158
*/
198-
protected function searchHasEmail($email)
159+
protected function searchIsDeletedOrFinished()
199160
{
200-
return $this->search->resources()->where([
201-
'type' => $this->search->type,
202-
'name' => $email
203-
])->first();
204-
}
161+
$search = Search::find($this->search->id)->first();
205162

206-
/**
207-
* Check if the search has been deleted in the database.
208-
*
209-
* @return bool
210-
*/
211-
protected function searchHasBeenDeleted()
212-
{
213-
return Search::find($this->search->id)->count() == 0;
214-
}
163+
if ($search && $search->finished != true) {
164+
return false;
165+
}
215166

216-
/**
217-
* Check if the search still has urls that have not been
218-
* crawled yet.
219-
*
220-
* @return bool
221-
*/
222-
protected function searchHasNotCrawledUrl()
223-
{
224-
return $this->search->urls()->where(['crawled' => false])->first();
167+
return true;
225168
}
226169

227170
/**
@@ -231,35 +174,9 @@ protected function searchHasNotCrawledUrl()
231174
*/
232175
protected function getNextNotCrawledUrl()
233176
{
234-
return $this->search->urls()->where(['crawled' => false])->first();
235-
}
236-
237-
/**
238-
* Update the url in the database: mark it as crawled for url.
239-
*
240-
* @param \App\Url $url
241-
* @return $this
242-
*/
243-
protected function markAsCrawled(Url $url)
244-
{
245-
$url->crawled = true;
246-
$url->save();
247-
248-
return $this;
249-
}
250-
251-
/**
252-
* Update the search in the database: mark it as finished.
253-
*
254-
* @param \App\Search $search
255-
* @return $this
256-
*/
257-
protected function markAsFinished(Search $search)
258-
{
259-
$search->finished = true;
260-
$search->save();
261-
262-
return $this;
177+
return $this->search->urls()
178+
->where(['crawled' => false])
179+
->first();
263180
}
264181

265182
/**
@@ -296,14 +213,21 @@ protected function isValidEmail($email)
296213
}
297214

298215
/**
299-
* Remove unwanted character at the end of the url string.
216+
* Remove unwanted characters at the end of the url string,
217+
* and remove anchors in the url.
300218
*
301219
* @param string $url
302220
* @return string
303221
*/
304222
protected function cleanUrl($url)
305223
{
306-
return rtrim(rtrim($url, '#'), '/');
224+
$url = rtrim(rtrim($url, '#'), '/');
225+
226+
if (!empty(parse_url($url, PHP_URL_FRAGMENT))) {
227+
$url = str_replace('#' . parse_url($url, PHP_URL_FRAGMENT), '', $url);
228+
}
229+
230+
return rtrim($url, '?');
307231
}
308232

309233
/**

0 commit comments

Comments
 (0)