Skip to content

Commit 344e1b9

Browse files
Add execution time limit (#480)
1 parent 3383a96 commit 344e1b9

File tree

3 files changed

+111
-2
lines changed

3 files changed

+111
-2
lines changed

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,16 +224,19 @@ Crawler::create()
224224
->setConcurrency(1) // now all urls will be crawled one by one
225225
```
226226

227-
## Defining Crawl Limits
227+
## Defining Crawl and Time Limits
228228

229229
By default, the crawler continues until it has crawled every page it can find. This behavior might cause issues if you are working in an environment with limitations such as a serverless environment.
230230

231231
The crawl behavior can be controlled with the following options:
232232

233233
- **Total Crawl Limit** (`setTotalCrawlLimit`): This limit defines the maximum number of URLs to crawl.
234234
- **Current Crawl Limit** (`setCurrentCrawlLimit`): This defines how many URLs are processed during the current crawl.
235+
- **Total Execution Time Limit** (`setTotalExecutionTimeLimit`): This limit defines the maximum execution time of the crawl.
236+
- **Current Execution Time Limit** (`setCurrentExecutionTimeLimit`): This limits the execution time of the current crawl.
235237

236-
Let's take a look at some examples to clarify the difference between these two methods.
238+
Let's take a look at some examples to clarify the difference between `setTotalCrawlLimit` and `setCurrentCrawlLimit`.
239+
The difference between `setTotalExecutionTimeLimit` and `setCurrentExecutionTimeLimit` is analogous.
237240

238241
### Example 1: Using the total crawl limit
239242

src/Crawler.php

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,14 @@ class Crawler
4343

4444
protected ?int $currentCrawlLimit = null;
4545

46+
protected ?int $startedAt = null;
47+
48+
protected int $executionTime = 0;
49+
50+
protected ?int $totalExecutionTimeLimit = null;
51+
52+
protected ?int $currentExecutionTimeLimit = null;
53+
4654
protected int $maximumResponseSize = 1024 * 1024 * 2;
4755

4856
protected ?int $maximumDepth = null;
@@ -174,6 +182,44 @@ public function getCurrentCrawlCount(): int
174182
return $this->currentUrlCount;
175183
}
176184

185+
public function setTotalExecutionTimeLimit(int $totalExecutionTimeLimitInSecond): self
186+
{
187+
$this->totalExecutionTimeLimit = $totalExecutionTimeLimitInSecond;
188+
189+
return $this;
190+
}
191+
192+
public function getTotalExecutionTimeLimit(): ?int
193+
{
194+
return $this->totalExecutionTimeLimit;
195+
}
196+
197+
public function getTotalExecutionTime(): int
198+
{
199+
return $this->executionTime + $this->getCurrentExecutionTime();
200+
}
201+
202+
public function setCurrentExecutionTimeLimit(int $currentExecutionTimeLimitInSecond): self
203+
{
204+
$this->currentExecutionTimeLimit = $currentExecutionTimeLimitInSecond;
205+
206+
return $this;
207+
}
208+
209+
public function getCurrentExecutionTimeLimit(): ?int
210+
{
211+
return $this->currentExecutionTimeLimit;
212+
}
213+
214+
public function getCurrentExecutionTime(): int
215+
{
216+
if (is_null($this->startedAt)) {
217+
return 0;
218+
}
219+
220+
return time() - $this->startedAt;
221+
}
222+
177223
public function setMaximumDepth(int $maximumDepth): self
178224
{
179225
$this->maximumDepth = $maximumDepth;
@@ -412,6 +458,8 @@ public function getBaseUrl(): UriInterface
412458

413459
public function startCrawling(UriInterface|string $baseUrl)
414460
{
461+
$this->startedAt = time();
462+
415463
if (! $baseUrl instanceof UriInterface) {
416464
$baseUrl = new Uri($baseUrl);
417465
}
@@ -445,6 +493,9 @@ public function startCrawling(UriInterface|string $baseUrl)
445493
foreach ($this->crawlObservers as $crawlObserver) {
446494
$crawlObserver->finishedCrawling();
447495
}
496+
497+
$this->executionTime += time() - $this->startedAt;
498+
$this->startedAt = null; // To reset currentExecutionTime
448499
}
449500

450501
public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null): ?Node
@@ -480,6 +531,7 @@ protected function startCrawlingQueue(): void
480531
{
481532
while (
482533
$this->reachedCrawlLimits() === false &&
534+
$this->reachedTimeLimits() === false &&
483535
$this->crawlQueue->hasPendingUrls()
484536
) {
485537
$pool = new Pool($this->client, $this->getCrawlRequests(), [
@@ -504,6 +556,7 @@ protected function getCrawlRequests(): Generator
504556
{
505557
while (
506558
$this->reachedCrawlLimits() === false &&
559+
$this->reachedTimeLimits() === false &&
507560
$crawlUrl = $this->crawlQueue->getPendingUrl()
508561
) {
509562
if (
@@ -556,4 +609,19 @@ public function reachedCrawlLimits(): bool
556609

557610
return false;
558611
}
612+
613+
public function reachedTimeLimits(): bool
614+
{
615+
$totalExecutionTimeLimit = $this->getTotalExecutionTimeLimit();
616+
if (! is_null($totalExecutionTimeLimit) && $this->getTotalExecutionTime() >= $totalExecutionTimeLimit) {
617+
return true;
618+
}
619+
620+
$currentExecutionTimeLimit = $this->getCurrentExecutionTimeLimit();
621+
if (! is_null($currentExecutionTimeLimit) && $this->getCurrentExecutionTime() >= $currentExecutionTimeLimit) {
622+
return true;
623+
}
624+
625+
return false;
626+
}
559627
}

tests/CrawlerTest.php

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -517,6 +517,44 @@ function ($url) {
517517
assertCrawledUrlCount(3);
518518
});
519519

520+
it('respects the total execution time limit', function () {
521+
$baseUrl = 'http://localhost:8080';
522+
523+
$crawler = createCrawler()
524+
->setMaximumDepth(2)
525+
->setDelayBetweenRequests(500) // 500ms
526+
->setTotalExecutionTimeLimit(2)
527+
->setCrawlProfile(new CrawlSubdomains($baseUrl));
528+
529+
$crawler->startCrawling($baseUrl);
530+
531+
// At 500ms delay per URL, only four URLs can be crawled in 2 seconds.
532+
assertCrawledUrlCount(4);
533+
534+
$crawler->startCrawling($baseUrl);
535+
536+
assertCrawledUrlCount(4);
537+
});
538+
539+
it('respects the current execution time limit', function () {
540+
$baseUrl = 'http://localhost:8080';
541+
542+
$crawler = createCrawler()
543+
->setMaximumDepth(2)
544+
->setDelayBetweenRequests(500) // 500ms
545+
->setCurrentExecutionTimeLimit(2)
546+
->setCrawlProfile(new CrawlSubdomains($baseUrl));
547+
548+
$crawler->startCrawling($baseUrl);
549+
550+
// At 500ms delay per URL, only four URLs can be crawled in 2 seconds.
551+
assertCrawledUrlCount(4);
552+
553+
$crawler->startCrawling($baseUrl);
554+
555+
assertCrawledUrlCount(11);
556+
});
557+
520558
function javascriptInjectedUrls(): array
521559
{
522560
return [[

0 commit comments

Comments
 (0)