Skip to content

Commit a9c432c

Browse files
When fetching robots.txt, use the same User-Agent as defined by the user (#491)
* Bugfix: prevent infinite loops — when a CrawlProfile prevents crawling of a particular URL, that URL is now removed from the queue
* When fetching robots.txt, use the same User-Agent as defined by the user
1 parent 7927158 commit a9c432c

File tree

2 files changed

+49
-1
lines changed

2 files changed

+49
-1
lines changed

src/Crawler.php

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,15 @@ protected function startCrawlingQueue(): void
566566

567567
/**
 * Fetch and parse the robots.txt file for the given URI's host.
 *
 * The file is retrieved through the crawler's own HTTP client, so the
 * request carries the same configuration (notably the User-Agent header)
 * that the user set up for crawling — the reason for this change.
 *
 * @param UriInterface $uri any URI on the target host; its path is
 *                          replaced with /robots.txt
 *
 * @return RobotsTxt parsed rules, or an instance built from an empty
 *                   string when robots.txt cannot be fetched
 */
protected function createRobotsTxt(UriInterface $uri): RobotsTxt
{
    try {
        $robotsUrl = (string) $uri->withPath('/robots.txt');

        // Use the configured client so custom headers (User-Agent, auth, …)
        // are applied to the robots.txt request as well.
        $response = $this->client->get($robotsUrl);
        $content = (string) $response->getBody();

        return new RobotsTxt($content);
    } catch (\Exception $exception) {
        // Any transport/HTTP failure is treated as "no robots.txt":
        // an empty rule set disallows nothing.
        // NOTE(review): \Error/\Throwable are deliberately not caught here —
        // confirm that is intended.
        return new RobotsTxt('');
    }
}
571579

572580
protected function getCrawlRequests(): Generator
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
<?php

use GuzzleHttp\Client;
use GuzzleHttp\Handler\MockHandler;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Middleware;
use GuzzleHttp\Psr7\Response;
use Spatie\Crawler\Crawler;
9+
// Shared fixture: a Guzzle client wired to a mock handler so no real HTTP
// traffic occurs. The first queued response plays the role of robots.txt,
// the second the home page. A history middleware records every outgoing
// request so the tests can inspect the headers that were actually sent.
beforeEach(function () {
    // Import added at the top of the file; previously this used the inline
    // FQCN \GuzzleHttp\Handler\MockHandler, inconsistent with the other
    // imported Guzzle classes.
    $this->mockHandler = new MockHandler([
        new Response(200, [], "User-agent: *\nDisallow: /admin"),
        new Response(200, [], '<html><body>Home</body></html>'),
    ]);

    // Each history entry is an array with 'request' and 'response' keys.
    $this->crawledUrls = [];
    $this->history = Middleware::history($this->crawledUrls);

    $this->handlerStack = HandlerStack::create($this->mockHandler);
    $this->handlerStack->push($this->history);
});
21+
22+
// Without an explicitly configured user agent, the robots.txt request is
// expected to carry Guzzle's default User-Agent header.
it('should send the correct user agent header when fetching robots.txt', function () {
    $client = new Client(['handler' => $this->handlerStack]);

    (new Crawler($client))
        ->respectRobots()
        ->startCrawling('http://example.com');

    // Two requests total: robots.txt first, then the page itself.
    expect($this->crawledUrls)->toHaveCount(2);

    $robotsRequest = $this->crawledUrls[0]['request'];
    expect((string) $robotsRequest->getUri())->toBe('http://example.com/robots.txt');
    expect($robotsRequest->getHeader('User-Agent'))->toBe(['GuzzleHttp/7']);
});
31+
32+
// When the user configures a custom user agent on the crawler, the very
// same header value must be sent on the robots.txt request.
it('should send the custom user agent header when fetching robots.txt', function () {
    $client = new Client(['handler' => $this->handlerStack]);

    (new Crawler($client))
        ->respectRobots()
        ->setUserAgent('CustomBot/2.0')
        ->startCrawling('http://example.com');

    // Two requests total: robots.txt first, then the page itself.
    expect($this->crawledUrls)->toHaveCount(2);

    $robotsRequest = $this->crawledUrls[0]['request'];
    expect((string) $robotsRequest->getUri())->toBe('http://example.com/robots.txt');
    expect($robotsRequest->getHeader('User-Agent'))->toBe(['CustomBot/2.0']);
});

0 commit comments

Comments (0)