Skip to content

Commit e4a0b2b

Browse files
committed
Improve IDNA conversion support
- add IDNAConverterTrait to internally normalize IDN conversion as well as IDN error reporting
1 parent ad3eef8 commit e4a0b2b

File tree

8 files changed

+128
-63
lines changed

8 files changed

+128
-63
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ All Notable changes to `PHP Domain Parser` **5.x** series will be documented in
2020

2121
- `Pdp\Domain::getDomain` returns the normalized form of the domain name
2222
- `Pdp\PublicSuffix` is no longer internal.
23+
- normalizes IDN conversion using an internal `IDNAConverterTrait`
2324

2425
### Deprecated
2526

src/Converter.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
*/
2424
final class Converter
2525
{
26+
use IDNAConverterTrait;
27+
2628
/**
2729
* Convert the Public Suffix List into
2830
* an associative, multidimensional array
@@ -102,7 +104,7 @@ private function addRule(array $list, array $rule_parts): array
102104
// "The domain and all rules must be canonicalized in the normal way
103105
// for hostnames - lower-case, Punycode (RFC 3492)."
104106

105-
$part = idn_to_ascii($part, 0, INTL_IDNA_VARIANT_UTS46);
107+
$part = $this->idnToAscii($part);
106108
$isDomain = true;
107109
if (0 === strpos($part, '!')) {
108110
$part = substr($part, 1);

src/Domain.php

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
*/
3030
final class Domain implements JsonSerializable
3131
{
32+
use IDNAConverterTrait;
33+
3234
/**
3335
* @var string|null
3436
*/
@@ -241,16 +243,6 @@ public function isPrivate(): bool
241243
return $this->publicSuffix->isPrivate();
242244
}
243245

244-
/**
245-
* Returns the public suffix section name used to determine the public suffix.
246-
*
247-
* @return string
248-
*/
249-
public function getSection(): string
250-
{
251-
return $this->publicSuffix->getSection();
252-
}
253-
254246
/**
255247
* Converts the domain to its IDNA ASCII form.
256248
*
@@ -267,12 +259,7 @@ public function toAscii(): self
267259
return $this;
268260
}
269261

270-
$domain = idn_to_ascii($this->domain, 0, INTL_IDNA_VARIANT_UTS46, $arr);
271-
if (!$arr['errors']) {
272-
return new self($domain, $this->publicSuffix->toAscii());
273-
}
274-
275-
throw new Exception(sprintf('The following domain `%s` can not be converted to ascii', $this->domain));
262+
return new self($this->idnToAscii($this->domain), $this->publicSuffix->toAscii());
276263
}
277264

278265
/**
@@ -291,11 +278,6 @@ public function toUnicode(): self
291278
return $this;
292279
}
293280

294-
$domain = idn_to_utf8($this->domain, 0, INTL_IDNA_VARIANT_UTS46, $arr);
295-
if (!$arr['errors']) {
296-
return new self($domain, $this->publicSuffix->toUnicode());
297-
}
298-
299-
throw new Exception(sprintf('The following domain `%s` can not be converted to unicode', $this->domain));
281+
return new self($this->idnToUnicode($this->domain), $this->publicSuffix->toUnicode());
300282
}
301283
}

src/IDNAConverterTrait.php

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
<?php
2+
/**
3+
* PHP Domain Parser: Public Suffix List based URL parsing.
4+
*
5+
* @see http://github.com/jeremykendall/php-domain-parser for the canonical source repository
6+
*
7+
* @copyright Copyright (c) 2017 Jeremy Kendall (http://jeremykendall.net)
8+
* @license http://github.com/jeremykendall/php-domain-parser/blob/master/LICENSE MIT License
9+
*/
10+
declare(strict_types=1);
11+
12+
namespace Pdp;
13+
14+
/**
15+
* A wrapper around the INTL IDNA functions
16+
*
17+
* @author Ignace Nyamagana Butera <[email protected]>
18+
*/
19+
trait IDNAConverterTrait
20+
{
21+
/**
22+
* IDNA errors
23+
*
24+
* @see http://icu-project.org/apiref/icu4j/com/ibm/icu/text/IDNA.Error.html
25+
* @var array
26+
*/
27+
private static $idn_errors = [
28+
IDNA_ERROR_EMPTY_LABEL => 'a non-final domain name label (or the whole domain name) is empty',
29+
IDNA_ERROR_LABEL_TOO_LONG => 'a domain name label is longer than 63 bytes',
30+
IDNA_ERROR_DOMAIN_NAME_TOO_LONG => 'a domain name is longer than 255 bytes in its storage form',
31+
IDNA_ERROR_LEADING_HYPHEN => 'a label starts with a hyphen-minus ("-")',
32+
IDNA_ERROR_TRAILING_HYPHEN => 'a label ends with a hyphen-minus ("-")',
33+
IDNA_ERROR_HYPHEN_3_4 => 'a label contains hyphen-minus ("-") in the third and fourth positions',
34+
IDNA_ERROR_LEADING_COMBINING_MARK => 'a label starts with a combining mark',
35+
IDNA_ERROR_DISALLOWED => 'a label or domain name contains disallowed characters',
36+
IDNA_ERROR_PUNYCODE => 'a label starts with "xn--" but does not contain valid Punycode',
37+
IDNA_ERROR_LABEL_HAS_DOT => 'a label contains a dot=full stop',
38+
IDNA_ERROR_INVALID_ACE_LABEL => 'An ACE label does not contain a valid label string',
39+
IDNA_ERROR_BIDI => 'a label does not meet the IDNA BiDi requirements (for right-to-left characters)',
40+
IDNA_ERROR_CONTEXTJ => 'a label does not meet the IDNA CONTEXTJ requirements',
41+
];
42+
43+
/**
44+
* Get and format IDN conversion error message
45+
*
46+
* @param int $error_bit
47+
*
48+
* @return string
49+
*/
50+
private static function getIdnErrors(int $error_bit): string
51+
{
52+
$res = [];
53+
foreach (self::$idn_errors as $error => $reason) {
54+
if ($error_bit & $error) {
55+
$res[] = $reason;
56+
}
57+
}
58+
59+
return empty($res) ? 'Unknown IDNA conversion error.' : implode(', ', $res).'.';
60+
}
61+
62+
/**
63+
* Converts the input to its IDNA ASCII form.
64+
*
65+
* This method returns the string converted to IDN ASCII form
66+
*
67+
* @param string $host
68+
* @throws Exception if the string can not be converted to ASCII using IDN UTS46 algorithm
69+
*
70+
* @return string
71+
*/
72+
private function idnToAscii(string $host): string
73+
{
74+
if (false !== strpos($host, 'xn--')) {
75+
return $host;
76+
}
77+
78+
$output = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46, $arr);
79+
if (!$arr['errors']) {
80+
return $output;
81+
}
82+
83+
throw new Exception(sprintf('The host `%s` is invalid : %s', $host, self::getIdnErrors($arr['errors'])));
84+
}
85+
86+
/**
87+
* Converts the input to its IDNA UNICODE form.
88+
*
89+
* This method returns the string converted to IDN UNICODE form
90+
*
91+
* @param string $host
92+
* @throws Exception if the string can not be converted to UNICODE using IDN UTS46 algorithm
93+
*
94+
* @return string
95+
*/
96+
private function idnToUnicode(string $host): string
97+
{
98+
if (false === strpos($host, 'xn--')) {
99+
return $host;
100+
}
101+
102+
$output = idn_to_utf8($host, 0, INTL_IDNA_VARIANT_UTS46, $arr);
103+
if (!$arr['errors']) {
104+
return $output;
105+
}
106+
107+
throw new Exception(sprintf('The host `%s` is invalid : %s', $host, self::getIdnErrors($arr['errors'])));
108+
}
109+
}

src/PublicSuffix.php

Lines changed: 4 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
*/
3030
final class PublicSuffix implements Countable, JsonSerializable
3131
{
32+
use IDNAConverterTrait;
33+
3234
/**
3335
* @var string|null
3436
*/
@@ -90,16 +92,6 @@ public function getContent()
9092
return $this->publicSuffix;
9193
}
9294

93-
/**
94-
* Returns the public suffix section name used to determine the public suffix.
95-
*
96-
* @return string
97-
*/
98-
public function getSection(): string
99-
{
100-
return $this->section;
101-
}
102-
10395
/**
10496
* {@inheritdoc}
10597
*/
@@ -158,15 +150,7 @@ public function toUnicode(): self
158150
return $this;
159151
}
160152

161-
$publicSuffix = idn_to_utf8($this->publicSuffix, 0, INTL_IDNA_VARIANT_UTS46, $arr);
162-
if (!$arr['errors']) {
163-
$clone = clone $this;
164-
$clone->publicSuffix = $publicSuffix;
165-
166-
return $clone;
167-
}
168-
169-
throw new Exception(sprintf('The following public suffix `%s` can not be converted to unicode', $this->publicSuffix));
153+
return new self($this->idnToUnicode($this->publicSuffix), $this->section);
170154
}
171155

172156
/**
@@ -185,14 +169,6 @@ public function toAscii(): self
185169
return $this;
186170
}
187171

188-
$publicSuffix = idn_to_ascii($this->publicSuffix, 0, INTL_IDNA_VARIANT_UTS46, $arr);
189-
if (!$arr['errors']) {
190-
$clone = clone $this;
191-
$clone->publicSuffix = $publicSuffix;
192-
193-
return $clone;
194-
}
195-
196-
throw new Exception(sprintf('The following public suffix `%s` can not be converted to ascii', $this->publicSuffix));
172+
return new self($this->idnToAscii($this->publicSuffix), $this->section);
197173
}
198174
}

src/Rules.php

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
*/
2020
final class Rules
2121
{
22+
use IDNAConverterTrait;
23+
2224
const ALL_DOMAINS = 'ALL_DOMAINS';
2325
const ICANN_DOMAINS = 'ICANN_DOMAINS';
2426
const PRIVATE_DOMAINS = 'PRIVATE_DOMAINS';
@@ -217,12 +219,11 @@ private function normalizeDomain(string $domain): string
217219
$domain = rawurldecode($domain);
218220
}
219221

220-
$normalize = idn_to_ascii($domain, 0, INTL_IDNA_VARIANT_UTS46, $arr);
221-
if ($arr['errors'] > 0) {
222+
try {
223+
return strtolower($this->idnToAscii($domain));
224+
} catch (Exception $e) {
222225
return '';
223226
}
224-
225-
return strtolower($normalize);
226227
}
227228

228229
/**
@@ -278,7 +279,7 @@ private function normalizePublicSuffix(PublicSuffix $publicSuffix, string $domai
278279
{
279280
if (null === $publicSuffix->getContent()) {
280281
$labels = explode('.', $domain);
281-
$publicSuffix = new PublicSuffix(array_pop($labels));
282+
$publicSuffix = new PublicSuffix($this->idnToAscii(array_pop($labels)));
282283
}
283284

284285
if (false === strpos($domain, 'xn--')) {

tests/DomainTest.php

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ public function testRegistrableDomainIsNullWithFoundDomain(string $domain, $publ
2323
$domain = new Domain($domain, new PublicSuffix($publicSuffix));
2424
$this->assertNull($domain->getRegistrableDomain());
2525
$this->assertNull($domain->getSubDomain());
26-
$this->assertEmpty($domain->getSection());
2726
}
2827

2928
public function testToAsciiThrowsException()

tests/RulesTest.php

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@ public function testWithPrivateDomain()
117117
$this->assertTrue($domain->isKnown());
118118
$this->assertFalse($domain->isICANN());
119119
$this->assertTrue($domain->isPrivate());
120-
$this->assertSame(Rules::PRIVATE_DOMAINS, $domain->getSection());
121120
$this->assertSame('github.io', $domain->getPublicSuffix());
122121
$this->assertSame('thephpleague.github.io', $domain->getRegistrableDomain());
123122
$this->assertNull($domain->getSubDomain());
@@ -130,7 +129,6 @@ public function testWithPrivateDomainInvalid()
130129
$this->assertFalse($domain->isKnown());
131130
$this->assertFalse($domain->isICANN());
132131
$this->assertFalse($domain->isPrivate());
133-
$this->assertSame('', $domain->getSection());
134132
$this->assertSame('be', $domain->getPublicSuffix());
135133
$this->assertSame('ac.be', $domain->getRegistrableDomain());
136134
$this->assertSame('private.ulb', $domain->getSubDomain());
@@ -143,7 +141,6 @@ public function testWithPrivateDomainValid()
143141
$this->assertTrue($domain->isKnown());
144142
$this->assertFalse($domain->isICANN());
145143
$this->assertTrue($domain->isPrivate());
146-
$this->assertSame(Rules::PRIVATE_DOMAINS, $domain->getSection());
147144
$this->assertSame('github.io', $domain->getPublicSuffix());
148145
$this->assertSame('thephpleague.github.io', $domain->getRegistrableDomain());
149146
$this->assertNull($domain->getSubDomain());
@@ -156,7 +153,6 @@ public function testWithICANNDomainInvalid()
156153
$this->assertTrue($domain->isKnown());
157154
$this->assertTrue($domain->isICANN());
158155
$this->assertFalse($domain->isPrivate());
159-
$this->assertSame(Rules::ICANN_DOMAINS, $domain->getSection());
160156
$this->assertSame('ac.be', $domain->getPublicSuffix());
161157
$this->assertSame('ulb.ac.be', $domain->getRegistrableDomain());
162158
$this->assertSame('private', $domain->getSubDomain());
@@ -248,7 +244,6 @@ public function testPublicSuffixSection()
248244
$expected = 'рф';
249245
$domain = 'Яндекс.РФ';
250246
$publicSuffix = $this->rules->getPublicSuffix($domain);
251-
$this->assertSame(Rules::ICANN_DOMAINS, $publicSuffix->getSection());
252247
$this->assertSame($expected, $publicSuffix->getContent());
253248
}
254249

@@ -267,7 +262,7 @@ public function testPublicSuffixSection()
267262
*/
268263
public function checkPublicSuffix($input, $expected)
269264
{
270-
$this->assertSame($expected, $this->rules->resolve($input, Rules::ICANN_DOMAINS)->getRegistrableDomain());
265+
$this->assertSame($expected, $this->rules->resolve($input)->getRegistrableDomain());
271266
}
272267

273268
/**

0 commit comments

Comments
 (0)