Skip to content

Commit 2c0f0ee

Browse files
author
Tom
committed
Match UAs more generously, as per the Google spec
Previously the UA "googlebot" would not match "googlebot-news". The RFC says: "The robot must obey the first record in /robots.txt that contains a User-Agent line whose value contains the name token of the robot as a substring." "googlebot-news" is not a substring of "googlebot", but the reverse is true, so we now try the substring match in both directions.
1 parent aeef897 commit 2c0f0ee

File tree

2 files changed

+15
-2
lines changed

2 files changed

+15
-2
lines changed

src/Robot/UserAgent.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ public function getMatches($userAgent)
2121
}
2222

2323
$matches = array_filter($this->userAgents, function($elem) use($ua) {
24-
return strpos($elem, $ua) !== false || $elem == '*';
24+
return strpos($elem, $ua) !== false || strpos($ua, $elem) !== false || $elem == '*';
2525
});
26-
return $matches;
26+
return array_values($matches);
2727
}
2828
}

tests/UserAgentTest.php

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,4 +93,17 @@ public function givenWildcardAgent_alwaysMatch()
9393
$this->assertTrue($wildcard->getMatches($ua) == ['*'], 'wilcard matches all');
9494
}
9595
}
96+
97+
/**
98+
* @test
99+
* https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#order-of-precedence-for-user-agents
100+
*/
101+
public function givenGoogleExamples_behaveAsExpected()
102+
{
103+
$googleUAs = new UserAgent(['googlebot-news', '*', 'googlebot']);
104+
$this->assertContains('googlebot-news', $googleUAs->getMatches('Googlebot-News'));
105+
$this->assertContains('googlebot', $googleUAs->getMatches('Googlebot-Images'));
106+
$this->assertNotContains('googlebot-news', $googleUAs->getMatches('Googlebot-Images'));
107+
$this->assertEquals(['*'], $googleUAs->getMatches('otherbot'));
108+
}
96109
}

0 commit comments

Comments
 (0)