Skip to content

Commit 8b5f9ae

Browse files
author
Tom
committed
Merge pull request #9 from tomverran/issue-8
Issue 8
2 parents d6897d7 + 3548473 commit 8b5f9ae

16 files changed

+606
-135
lines changed

README.md

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,7 @@ Robots.txt checker
66
Given a robots.txt file this library will give you a straightforward yes/no as to whether you're allowed to access
77
a given resource with a given user agent. Internally it organises the file into a tree.
88

9-
Wildcards are supported in a basic way:
10-
11-
`Disallow: /hello/*/world` will disallow `/hello/whatever/world` but won't disallow `/hello/what/who/world`
9+
Wildcards are supported.
1210

1311
If you find any bugs with this library, please don't hesitate to let me know: either create an issue on GitHub or submit a pull request.
1412

Vagrantfile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Vagrant.configure(2) do |config|
2+
config.vm.box = "landregistry/centos"
3+
config.vm.network "public_network"
4+
config.vm.provision "shell", inline: <<-SHELL
5+
yum install -y epel-release curl php
6+
curl -sS https://getcomposer.org/installer | php -- --install-dir=/usr/bin --filename=composer
7+
SHELL
8+
end

composer.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,10 @@
2222
"psr-4": {
2323
"tomverran\\Robot\\": "src/Robot/"
2424
}
25+
},
26+
"autoload-dev": {
27+
"psr-4": {
28+
"tomverran\\Robot\\": ["src/Robot/", "tests/"]
29+
}
2530
}
2631
}

src/Robot/AccessRules.php

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
<?php
2+
namespace tomverran\Robot;
3+
4+
5+
class AccessRules
{
    /**
     * Map of rule path => allowed flag, ordered longest path first.
     *
     * @var array
     */
    private $rules;

    /**
     * @param Boolean[] $rules A map of path => allowed
     */
    public function __construct(array $rules)
    {
        // Longest path first, so the first rule that matches in isAllowed()
        // is also the longest one. Keys may arrive as integers when a path
        // is purely numeric, hence the string casts.
        uksort($rules, function ($r1, $r2) {
            return strlen((string) $r2) - strlen((string) $r1);
        });
        $this->rules = $rules;
    }

    /**
     * Turn a rule path into an anchored regular expression.
     *
     * Every `*` becomes a lazy "match anything" and trailing `$` characters
     * anchor the pattern at the end of the URL; everything else is matched
     * literally from the start of the URL.
     *
     * @param string $path Rule path, already passed through urlDecodeNonSlashes()
     * @return string A PCRE pattern delimited by `#`
     */
    private function convertPathToRegex($path)
    {
        $pathWithoutTrailingDollar = rtrim($path, '$');

        // Quote each literal segment on its own so that only the `*`
        // separators end up as regex syntax.
        $quotedSegments = array_map(function ($segment) {
            return preg_quote($segment, '#');
        }, explode('*', $pathWithoutTrailingDollar));

        // Separator first: the legacy (array, separator) order is rejected by PHP 8.
        $quotedWithWildcards = implode('.*?', $quotedSegments);

        $trailingDollar = $path === $pathWithoutTrailingDollar ? '' : '$';

        return "#^{$quotedWithWildcards}{$trailingDollar}#";
    }

    /**
     * URL-decode a string while leaving encoded slashes (%2F) encoded.
     *
     * Preserved escapes are normalised to upper case so that `%2f` and `%2F`
     * compare equal once a rule and a URL have both been through this method.
     *
     * @param string $str
     * @return string
     */
    private function urlDecodeNonSlashes($str)
    {
        // The capture group keeps the %2F delimiters in the resulting list.
        $parts = preg_split('/(%2F)/i', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

        $decoded = array_map(function ($part) {
            return strtolower($part) === '%2f' ? '%2F' : urldecode($part);
        }, $parts);

        return implode('', $decoded);
    }

    /**
     * Decide whether the given URL may be accessed under these rules.
     *
     * The longest rule path that matches the URL decides the outcome. A URL
     * that no rule matches is allowed.
     *
     * @param string $url
     * @return bool
     */
    public function isAllowed($url)
    {
        // Decode once; the result does not depend on the rule being tested.
        $decodedUrl = $this->urlDecodeNonSlashes((string) $url);

        foreach ($this->rules as $ruleUrl => $allowed) {
            // Skip empty rule paths: their pattern would match every URL.
            if (!$ruleUrl) {
                continue;
            }

            $pattern = $this->convertPathToRegex($this->urlDecodeNonSlashes((string) $ruleUrl));

            // Rules are ordered longest first, so the first hit is decisive.
            if (preg_match($pattern, $decodedUrl) === 1) {
                return (bool) $allowed;
            }
        }

        return true;
    }
}

src/Robot/Leaf.php

Lines changed: 0 additions & 82 deletions
This file was deleted.

src/Robot/Record.php

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
<?php
2+
namespace tomverran\Robot;
3+
use tomverran\Robot\UserAgent;
4+
5+
class Record
{
    /**
     * The user agent part of this record; queried through getMatches().
     *
     * @var UserAgent
     */
    private $ua;

    /**
     * The access rules consulted once the user agent part matches.
     *
     * @var AccessRules
     */
    private $ar;

    /**
     * Pair a user agent matcher with the access rules that belong to it.
     *
     * @param UserAgent $ua
     * @param AccessRules $ar
     */
    public function __construct(UserAgent $ua, AccessRules $ar)
    {
        $this->ar = $ar;
        $this->ua = $ua;
    }

    /**
     * Does this record apply to the given user agent?
     *
     * @param mixed $userAgent Passed straight through to UserAgent::getMatches()
     * @return bool True when getMatches() returns a non-empty result
     */
    public function matches($userAgent)
    {
        return (bool) $this->ua->getMatches($userAgent);
    }

    /**
     * How strongly does this record match the given user agent?
     *
     * @param mixed $userAgent
     * @return int 0 when the record does not match, otherwise the length of
     *             the longest entry returned by getMatches() (presumably the
     *             matched agent names - confirm against UserAgent)
     */
    public function getMatchStrength($userAgent)
    {
        if ($this->matches($userAgent)) {
            $lengths = array_map('strlen', $this->ua->getMatches($userAgent));

            return max($lengths);
        }

        return 0;
    }

    /**
     * May the given user agent access the given URL according to this record?
     *
     * A record that does not match the user agent never blocks anything.
     *
     * @param mixed $userAgent
     * @param string $url
     * @return bool
     */
    public function isAllowed($userAgent, $url)
    {
        if (!$this->ua->getMatches($userAgent)) {
            return true;
        }

        return (bool) $this->ar->isAllowed($url);
    }
}

src/Robot/RobotsFile.php

Lines changed: 46 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,12 @@
99
namespace tomverran\Robot;
1010

1111

12-
class RobotsFile
12+
class RobotsFile implements \Iterator
1313
{
1414
/**
15-
* @var String[]
15+
* @var \ArrayIterator
1616
*/
17-
private $lines;
18-
19-
const USER_AGENT = 'user-agent';
20-
21-
const DISALLOW = 'disallow';
22-
23-
const ALLOW = 'allow';
17+
private $lineIterator;
2418

2519
/**
2620
* Construct this Robots file
@@ -29,51 +23,73 @@ class RobotsFile
2923
public function __construct($content)
3024
{
3125
$withoutComments = preg_replace( '/#.*/', '', strtolower($content));
26+
$lines = [];
3227

3328
foreach(explode("\n", $withoutComments) as $line) {
3429
$lineParts = array_filter(array_map('trim', explode(':', $line)));
35-
if ($this->lineIsValid($lineParts)) {
36-
$this->lines[] = $lineParts;
30+
if (!empty($lineParts)) {
31+
$lines[] = $lineParts;
3732
}
3833
}
34+
35+
$this->lineIterator = new \ArrayIterator($lines);
3936
}
4037

41-
private function lineIsValid($line) {
42-
$validDirectives = [self::USER_AGENT, self::DISALLOW, self::ALLOW];
43-
return count($line) == 2 && in_array($line[0], $validDirectives);
38+
/**
39+
* Return the current element
40+
* @link http://php.net/manual/en/iterator.current.php
41+
* @return mixed Can return any type.
42+
* @since 5.0.0
43+
*/
44+
public function current()
45+
{
46+
$cur = $this->lineIterator->current();
47+
return count($cur) > 1 ? $cur[1] : '';
4448
}
4549

4650
/**
47-
* Get the first directive in the file
51+
* Move forward to next element
52+
* @link http://php.net/manual/en/iterator.next.php
53+
* @return void Any returned value is ignored.
54+
* @since 5.0.0
4855
*/
49-
public function firstDirective()
56+
public function next()
5057
{
51-
return $this->lines[0][0];
58+
$this->lineIterator->next();
5259
}
5360

54-
public function firstDirectiveIs($args)
61+
/**
62+
* Return the key of the current element
63+
* @link http://php.net/manual/en/iterator.key.php
64+
* @return mixed scalar on success, or null on failure.
65+
* @since 5.0.0
66+
*/
67+
public function key()
5568
{
56-
if (!$this->hasLines()) {
57-
return false;
58-
}
59-
return in_array($this->firstDirective(), func_get_args());
69+
$cur = $this->lineIterator->current();
70+
return $cur[0];
6071
}
6172

6273
/**
63-
* Get the argument of the first directive,
64-
* and shift the file to remove it
74+
* Checks if current position is valid
75+
* @link http://php.net/manual/en/iterator.valid.php
76+
* @return boolean The return value will be casted to boolean and then evaluated.
77+
* Returns true on success or false on failure.
78+
* @since 5.0.0
6579
*/
66-
public function shiftArgument()
80+
public function valid()
6781
{
68-
return array_shift($this->lines)[1];
82+
return $this->lineIterator->valid();
6983
}
7084

7185
/**
72-
* Does this file have any remaining lines
73-
* @return bool
86+
* Rewind the Iterator to the first element
87+
* @link http://php.net/manual/en/iterator.rewind.php
88+
* @return void Any returned value is ignored.
89+
* @since 5.0.0
7490
*/
75-
public function hasLines()
91+
public function rewind()
7692
{
77-
return !empty($this->lines);
93+
$this->lineIterator->rewind();
7894
}
7995
}

0 commit comments

Comments
 (0)