Skip to content

Commit a9c432c

Browse files
When fetching robots.txt, use the same User-Agent as defined by the user (#491)
* Bugfix: prevent infinite loops — when a CrawlProfile prevents crawling of a particular URL, that URL is now removed from the queue
* When fetching robots.txt, use the same User-Agent as defined by the user
1 parent 7927158 commit a9c432c

File tree

2 files changed

+49
-1
lines changed

2 files changed

+49
-1
lines changed

src/Crawler.php

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,15 @@ protected function startCrawlingQueue(): void
566566

567567
/**
 * Fetch and parse the robots.txt file for the given URI's host.
 *
 * The file is retrieved through the crawler's own HTTP client, so the
 * request carries the same configuration (notably the User-Agent header)
 * that the user set up for crawling — the reason for this change.
 *
 * @param UriInterface $uri any URI on the target host; its path is
 *                          replaced with /robots.txt
 *
 * @return RobotsTxt parsed rules, or an instance built from an empty
 *                   string when robots.txt cannot be fetched
 */
protected function createRobotsTxt(UriInterface $uri): RobotsTxt
{
    try {
        $robotsUrl = (string) $uri->withPath('/robots.txt');

        // Use the configured client so custom headers (User-Agent, auth, …)
        // are applied to the robots.txt request as well.
        $response = $this->client->get($robotsUrl);
        $content = (string) $response->getBody();

        return new RobotsTxt($content);
    } catch (\Exception $exception) {
        // Any transport/HTTP failure is treated as "no robots.txt":
        // an empty rule set disallows nothing.
        // NOTE(review): \Error/\Throwable are deliberately not caught here —
        // confirm that is intended.
        return new RobotsTxt('');
    }
}
571579

572580
protected function getCrawlRequests(): Generator
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
<?php

use GuzzleHttp\Client;
use GuzzleHttp\Handler\MockHandler;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Middleware;
use GuzzleHttp\Psr7\Response;
use Spatie\Crawler\Crawler;
9+
// Shared fixture: a Guzzle client wired to a mock handler so no real HTTP
// traffic occurs. The first queued response plays the role of robots.txt,
// the second the home page. A history middleware records every outgoing
// request so the tests can inspect the headers that were actually sent.
beforeEach(function () {
    // Import added at the top of the file; previously this used the inline
    // FQCN \GuzzleHttp\Handler\MockHandler, inconsistent with the other
    // imported Guzzle classes.
    $this->mockHandler = new MockHandler([
        new Response(200, [], "User-agent: *\nDisallow: /admin"),
        new Response(200, [], '<html><body>Home</body></html>'),
    ]);

    // Each history entry is an array with 'request' and 'response' keys.
    $this->crawledUrls = [];
    $this->history = Middleware::history($this->crawledUrls);

    $this->handlerStack = HandlerStack::create($this->mockHandler);
    $this->handlerStack->push($this->history);
});
21+
22+
// Without an explicitly configured user agent, the robots.txt request is
// expected to carry Guzzle's default User-Agent header.
it('should send the correct user agent header when fetching robots.txt', function () {
    $client = new Client(['handler' => $this->handlerStack]);

    (new Crawler($client))
        ->respectRobots()
        ->startCrawling('http://example.com');

    // Two requests total: robots.txt first, then the page itself.
    expect($this->crawledUrls)->toHaveCount(2);

    $robotsRequest = $this->crawledUrls[0]['request'];
    expect((string) $robotsRequest->getUri())->toBe('http://example.com/robots.txt');
    expect($robotsRequest->getHeader('User-Agent'))->toBe(['GuzzleHttp/7']);
});
31+
32+
// When the user configures a custom user agent on the crawler, the very
// same header value must be sent on the robots.txt request.
it('should send the custom user agent header when fetching robots.txt', function () {
    $client = new Client(['handler' => $this->handlerStack]);

    (new Crawler($client))
        ->respectRobots()
        ->setUserAgent('CustomBot/2.0')
        ->startCrawling('http://example.com');

    // Two requests total: robots.txt first, then the page itself.
    expect($this->crawledUrls)->toHaveCount(2);

    $robotsRequest = $this->crawledUrls[0]['request'];
    expect((string) $robotsRequest->getUri())->toBe('http://example.com/robots.txt');
    expect($robotsRequest->getHeader('User-Agent'))->toBe(['CustomBot/2.0']);
});

0 commit comments

Comments (0)