Skip to content

Commit 344e1b9

Browse files
Add execution time limit (#480)
1 parent 3383a96 commit 344e1b9

File tree

3 files changed

+111
-2
lines changed

3 files changed

+111
-2
lines changed

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,16 +224,19 @@ Crawler::create()
224224
->setConcurrency(1) // now all urls will be crawled one by one
225225
```
226226

227-
## Defining Crawl Limits
227+
## Defining Crawl and Time Limits
228228

229229
By default, the crawler continues until it has crawled every page it can find. This behavior might cause issues if you are working in an environment with limitations such as a serverless environment.
230230

231231
The crawl behavior can be controlled with the following options:
232232

233233
- **Total Crawl Limit** (`setTotalCrawlLimit`): This limit defines the maximum number of URLs to crawl.
234234
- **Current Crawl Limit** (`setCurrentCrawlLimit`): This defines how many URLs are processed during the current crawl.
235+
- **Total Execution Time Limit** (`setTotalExecutionTimeLimit`): This limit defines the maximum execution time of the crawl.
236+
- **Current Execution Time Limit** (`setCurrentExecutionTimeLimit`): This limits the execution time of the current crawl.
235237

236-
Let's take a look at some examples to clarify the difference between these two methods.
238+
Let's take a look at some examples to clarify the difference between `setTotalCrawlLimit` and `setCurrentCrawlLimit`.
239+
The difference between `setTotalExecutionTimeLimit` and `setCurrentExecutionTimeLimit` is analogous.
237240

238241
### Example 1: Using the total crawl limit
239242

src/Crawler.php

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,14 @@ class Crawler
4343

4444
protected ?int $currentCrawlLimit = null;
4545

46+
protected ?int $startedAt = null;
47+
48+
protected int $executionTime = 0;
49+
50+
protected ?int $totalExecutionTimeLimit = null;
51+
52+
protected ?int $currentExecutionTimeLimit = null;
53+
4654
protected int $maximumResponseSize = 1024 * 1024 * 2;
4755

4856
protected ?int $maximumDepth = null;
@@ -174,6 +182,44 @@ public function getCurrentCrawlCount(): int
174182
return $this->currentUrlCount;
175183
}
176184

185+
public function setTotalExecutionTimeLimit(int $totalExecutionTimeLimitInSecond): self
186+
{
187+
$this->totalExecutionTimeLimit = $totalExecutionTimeLimitInSecond;
188+
189+
return $this;
190+
}
191+
192+
public function getTotalExecutionTimeLimit(): ?int
193+
{
194+
return $this->totalExecutionTimeLimit;
195+
}
196+
197+
public function getTotalExecutionTime(): int
198+
{
199+
return $this->executionTime + $this->getCurrentExecutionTime();
200+
}
201+
202+
public function setCurrentExecutionTimeLimit(int $currentExecutionTimeLimitInSecond): self
203+
{
204+
$this->currentExecutionTimeLimit = $currentExecutionTimeLimitInSecond;
205+
206+
return $this;
207+
}
208+
209+
public function getCurrentExecutionTimeLimit(): ?int
210+
{
211+
return $this->currentExecutionTimeLimit;
212+
}
213+
214+
public function getCurrentExecutionTime(): int
215+
{
216+
if (is_null($this->startedAt)) {
217+
return 0;
218+
}
219+
220+
return time() - $this->startedAt;
221+
}
222+
177223
public function setMaximumDepth(int $maximumDepth): self
178224
{
179225
$this->maximumDepth = $maximumDepth;
@@ -412,6 +458,8 @@ public function getBaseUrl(): UriInterface
412458

413459
public function startCrawling(UriInterface|string $baseUrl)
414460
{
461+
$this->startedAt = time();
462+
415463
if (! $baseUrl instanceof UriInterface) {
416464
$baseUrl = new Uri($baseUrl);
417465
}
@@ -445,6 +493,9 @@ public function startCrawling(UriInterface|string $baseUrl)
445493
foreach ($this->crawlObservers as $crawlObserver) {
446494
$crawlObserver->finishedCrawling();
447495
}
496+
497+
$this->executionTime += time() - $this->startedAt;
498+
$this->startedAt = null; // To reset currentExecutionTime
448499
}
449500

450501
public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null): ?Node
@@ -480,6 +531,7 @@ protected function startCrawlingQueue(): void
480531
{
481532
while (
482533
$this->reachedCrawlLimits() === false &&
534+
$this->reachedTimeLimits() === false &&
483535
$this->crawlQueue->hasPendingUrls()
484536
) {
485537
$pool = new Pool($this->client, $this->getCrawlRequests(), [
@@ -504,6 +556,7 @@ protected function getCrawlRequests(): Generator
504556
{
505557
while (
506558
$this->reachedCrawlLimits() === false &&
559+
$this->reachedTimeLimits() === false &&
507560
$crawlUrl = $this->crawlQueue->getPendingUrl()
508561
) {
509562
if (
@@ -556,4 +609,19 @@ public function reachedCrawlLimits(): bool
556609

557610
return false;
558611
}
612+
613+
public function reachedTimeLimits(): bool
614+
{
615+
$totalExecutionTimeLimit = $this->getTotalExecutionTimeLimit();
616+
if (! is_null($totalExecutionTimeLimit) && $this->getTotalExecutionTime() >= $totalExecutionTimeLimit) {
617+
return true;
618+
}
619+
620+
$currentExecutionTimeLimit = $this->getCurrentExecutionTimeLimit();
621+
if (! is_null($currentExecutionTimeLimit) && $this->getCurrentExecutionTime() >= $currentExecutionTimeLimit) {
622+
return true;
623+
}
624+
625+
return false;
626+
}
559627
}

tests/CrawlerTest.php

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -517,6 +517,44 @@ function ($url) {
517517
assertCrawledUrlCount(3);
518518
});
519519

520+
it('respects the total execution time limit', function () {
521+
$baseUrl = 'http://localhost:8080';
522+
523+
$crawler = createCrawler()
524+
->setMaximumDepth(2)
525+
->setDelayBetweenRequests(500) // 500ms
526+
->setTotalExecutionTimeLimit(2)
527+
->setCrawlProfile(new CrawlSubdomains($baseUrl));
528+
529+
$crawler->startCrawling($baseUrl);
530+
531+
// At 500ms delay per URL, only four URLs can be crawled in 2 seconds.
532+
assertCrawledUrlCount(4);
533+
534+
$crawler->startCrawling($baseUrl);
535+
536+
assertCrawledUrlCount(4);
537+
});
538+
539+
it('respects the current execution time limit', function () {
540+
$baseUrl = 'http://localhost:8080';
541+
542+
$crawler = createCrawler()
543+
->setMaximumDepth(2)
544+
->setDelayBetweenRequests(500) // 500ms
545+
->setCurrentExecutionTimeLimit(2)
546+
->setCrawlProfile(new CrawlSubdomains($baseUrl));
547+
548+
$crawler->startCrawling($baseUrl);
549+
550+
// At 500ms delay per URL, only four URLs can be crawled in 2 seconds.
551+
assertCrawledUrlCount(4);
552+
553+
$crawler->startCrawling($baseUrl);
554+
555+
assertCrawledUrlCount(11);
556+
});
557+
520558
function javascriptInjectedUrls(): array
521559
{
522560
return [[

0 commit comments

Comments
 (0)