Skip to content

Commit a3065d6

Browse files
author
Tom
committed
Merge pull request #4 from tomverran/issue-3
Fix user agents not being lowercased before being checked
2 parents b00811a + 9199385 commit a3065d6

File tree

5 files changed

+121
-41
lines changed

5 files changed

+121
-41
lines changed

src/Robot/Leaf.php

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,11 @@ public function getNode($values)
4242
* @param string[] $userAgents The user agents this rule applies to
4343
* @param bool $allowed Are they allowed
4444
*/
public function addRule($userAgents, $allowed)
{
    // Accept a single user agent string as well as a list of them, so
    // callers written against the earlier one-agent signature keep working
    // instead of triggering a foreach-on-non-array warning and silently
    // recording nothing.
    foreach ((array) $userAgents as $userAgent) {
        $this->rules[$userAgent] = $allowed;
    }
}
4951

5052
/**
@@ -55,10 +57,13 @@ public function addRule($userAgent, $allowed)
5557
*/
5658
public function allowed($userAgent, $urlParts)
5759
{
58-
$ourRule = isset($this->rules[$userAgent]) ? $this->rules[$userAgent] : null;
59-
$currentUrlPart = array_shift($urlParts);
60+
$wildcardRule = isset($this->rules['*']) ? $this->rules['*'] : null;
61+
$ourRuleForUserAgent = isset($this->rules[$userAgent]) ? $this->rules[$userAgent] : null;
62+
$ourRule = $ourRuleForUserAgent === null ? $wildcardRule : $ourRuleForUserAgent;
6063

64+
$currentUrlPart = array_shift($urlParts);
6165
$theirRule = null;
66+
6267
foreach ($this->children as $part => $leaf) {
6368

6469
//convert our leaf into a regular expression, replacing * with regex non greedy wildcards

src/Robot/RobotsFile.php

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
<?php
2+
/**
3+
* Created by PhpStorm.
4+
* User: Tom
5+
* Date: 11/10/2015
6+
* Time: 9:15 PM
7+
*/
8+
9+
namespace tomverran\Robot;
10+
11+
12+
class RobotsFile
{
    /**
     * The remaining [directive, argument] pairs, in file order.
     * Initialised to an empty list so an empty or fully-commented
     * file never leaves this as null.
     * @var String[][]
     */
    private $lines = [];

    /**
     * Construct this Robots file
     *
     * The content is lowercased, comments (everything from a # to the end
     * of the line) are removed, and every remaining line that consists of
     * exactly a directive and an argument is kept.
     *
     * NOTE(review): a line whose argument itself contains a colon splits
     * into more than two parts and is therefore dropped. Splitting on the
     * first colon only would keep such lines, but should only be done once
     * the consumer of this class skips directives it does not recognise.
     *
     * @param String $content
     */
    public function __construct($content)
    {
        $withoutComments = preg_replace('/#.*/', '', strtolower($content));
        foreach (explode("\n", $withoutComments) as $line) {
            $lineParts = array_filter(array_map('trim', explode(':', $line)));

            // array_filter preserves keys, so a blank segment (e.g. "a::b")
            // can leave two parts at indexes other than 0 and 1. Only keep
            // lines whose directive really is at 0 and argument at 1, as
            // firstDirective() and shiftArgument() read exactly those.
            if (count($lineParts) == 2 && isset($lineParts[0], $lineParts[1])) {
                $this->lines[] = $lineParts;
            }
        }
    }

    /**
     * Get the first directive in the file
     * @return String|null The directive name, or null when no lines remain
     */
    public function firstDirective()
    {
        return isset($this->lines[0][0]) ? $this->lines[0][0] : null;
    }

    /**
     * Is the first directive in the file one of the given names?
     * Accepts any number of directive names as arguments.
     * @param String $args A directive name; further names may follow
     * @return bool False when no lines remain or none of the names match
     */
    public function firstDirectiveIs($args)
    {
        if (!$this->hasLines()) {
            return false;
        }
        return in_array($this->firstDirective(), func_get_args(), true);
    }

    /**
     * Get the argument of the first directive,
     * and shift the file to remove it
     * @return String|null The argument, or null when no lines remain
     */
    public function shiftArgument()
    {
        if (!$this->hasLines()) {
            return null;
        }
        $line = array_shift($this->lines);
        return $line[1];
    }

    /**
     * Does this file have any remaining lines
     * @return bool
     */
    public function hasLines()
    {
        return !empty($this->lines);
    }
}

src/Robot/RobotsTxt.php

Lines changed: 11 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -19,43 +19,25 @@ class RobotsTxt
1919
public function __construct($contents)
2020
{
2121
$this->tree = new Leaf();
22-
$this->parseFile($contents);
22+
$this->parseFile(new RobotsFile($contents));
2323
}
2424

2525
/**
 * Parse a robot file
 *
 * The file is consumed group by group: a run of consecutive user-agent
 * lines, followed by the rules that apply to every agent in that run.
 * Directives other than user-agent, allow and disallow are consumed and
 * ignored, so an unrecognised line can never stall the parser. Rules
 * that appear before any user-agent line are added for no agents.
 *
 * @param RobotsFile $robotFile The tokenised robots file; it is emptied as it is parsed
 */
private function parseFile(RobotsFile $robotFile)
{
    while ($robotFile->hasLines()) {

        //collect every consecutive user agent that starts this group
        $currentUserAgents = [];
        while ($robotFile->firstDirectiveIs('user-agent')) {
            $currentUserAgents[] = $robotFile->shiftArgument();
        }

        //consume everything up to the next user-agent line. Each pass removes
        //one line, which guarantees the outer loop always makes progress
        while ($robotFile->hasLines() && !$robotFile->firstDirectiveIs('user-agent')) {
            $directive = $robotFile->firstDirective();
            $argument = $robotFile->shiftArgument();

            //unknown directives are dropped without ending the group
            if ($directive !== 'allow' && $directive !== 'disallow') {
                continue;
            }

            $urlParts = array_filter(explode('/', $argument));
            $this->tree->getNode($urlParts)->addRule($currentUserAgents, $directive === 'allow');
        }
    }
}
@@ -69,14 +51,6 @@ private function parseFile($robotFile)
6951
public function isAllowed($userAgent, $path)
{
    //break the path into its non-empty segments
    $segments = array_filter(explode('/', $path));

    //user agents are looked up in lowercase
    $verdict = $this->tree->allowed(strtolower($userAgent), $segments);

    //anything other than an explicit false counts as allowed
    return !($verdict === false);
}
8256
}

tests/RobotTest.php

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,29 @@ public function testInlineComments()
8080
$this->assertFalse(self::getRobotsTxt('comment')->isAllowed('robot', '/comment'), 'lines with inline comments');
8181
$this->assertFalse(self::getRobotsTxt('comment')->isAllowed('robot', '/test'), 'lines following those with inline comments');
8282
}
83+
84+
/**
 * User agent lookups should be case insensitive: a capitalised
 * agent name is still denied a disallowed path
 */
public function testCapitalisedUserAgent()
{
    $robots = self::getRobotsTxt('multiUserAgent');
    $isAllowed = $robots->isAllowed('Googlebot', '/private/page.html');

    $this->assertFalse($isAllowed, 'Capitalised UA');
}
91+
92+
/**
 * Every agent listed in a run of consecutive User-agent lines
 * should be subject to the rules that follow that run
 */
public function testMultipleConsecutiveUserAgents()
{
    $agents = ['UA', 'Googlebot', '*'];

    foreach ($agents as $agent) {
        $privateAllowed = self::getRobotsTxt('multiUserAgent')->isAllowed($agent, '/private/page.html');
        $this->assertFalse($privateAllowed, $agent);

        $rootAllowed = self::getRobotsTxt('multiUserAgent')->isAllowed($agent, '/');
        $this->assertTrue($rootAllowed, $agent);
    }
}
102+
103+
public function testMultipleNonConsecutiveUserAgents()
{
    //the rules of a later group must only apply to the agent of that group
    $robot2Allowed = self::getRobotsTxt('multiUserAgent')->isAllowed('robot2', '/some/other');
    $this->assertFalse($robot2Allowed);

    $googlebotAllowed = self::getRobotsTxt('multiUserAgent')->isAllowed('Googlebot', '/some/other');
    $this->assertTrue($googlebotAllowed);
}
83108
}

tests/files/multiUserAgent.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
User-agent: *
2+
User-agent: UA
3+
User-agent: Googlebot
4+
Allow: /
5+
Disallow: /private/
6+
Disallow: /secret/page.html
7+
User-agent: robot2
8+
Allow: /
9+
Disallow: /some/other/

0 commit comments

Comments
 (0)