|
15 | 15 | use LayerShifter\TLDSupport\Helpers\Arr; |
16 | 16 | use LayerShifter\TLDSupport\Helpers\IP; |
17 | 17 | use LayerShifter\TLDSupport\Helpers\Str; |
| 18 | +use TrueBV\Exception\OutOfBoundsException; |
18 | 19 |
|
19 | 20 | /** |
20 | 21 | * Extract class accurately extracts subdomain, domain and TLD components from URLs. |
@@ -48,6 +49,13 @@ class Extract |
48 | 49 | * @see https://tools.ietf.org/html/rfc3986#section-3.1 |
49 | 50 | */ |
50 | 51 | const SCHEMA_PATTERN = '#^([a-zA-Z][a-zA-Z0-9+\-.]*:)?//#'; |
/**
 * @const string Pattern an ASCII (already IDNA-encoded), lower-case hostname must match in full.
 *
 * Structure of the pattern:
 *  - one or more dot-terminated labels of 1 to 63 characters drawn from [a-z0-9-] that neither start nor end
 *    with a hyphen;
 *  - a top-level label that is either purely alphabetic (2 to 63 letters) or an ACE label: the literal prefix
 *    "xn--" followed by letters, digits and hyphens, 6 to 63 characters in total, not ending with a hyphen;
 *  - an optional trailing dot, so a fully-qualified name such as "example.com." is still accepted.
 *
 * Both TLD alternatives are grouped so the start and end anchors apply to the whole hostname; "\z" is used
 * instead of "$" so a trailing newline is not silently accepted.
 *
 * The label and length rules are based upon RFC 1034 and RFC 2181.
 *
 * @see https://tools.ietf.org/html/rfc1034
 * @see https://tools.ietf.org/html/rfc2181
 */
const HOSTNAME_PATTERN = '#^((?!-)[a-z0-9-]{0,62}[a-z0-9]\.)+([a-z]{2,63}|xn--[a-z0-9-]{1,58}[a-z0-9])\.?\z#';
51 | 59 |
|
52 | 60 | /** |
53 | 61 | * @var int Value of extraction options. |
@@ -262,6 +270,33 @@ private function extractSuffix($hostname) |
262 | 270 | $hostname = $this->idn->toUTF8($hostname); |
263 | 271 | } |
264 | 272 |
|
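| 273 | + // URI producers should use names that conform to the DNS syntax, even when use of DNS is not immediately
| 274 | + // apparent, and should limit these names to no more than 255 characters in length (RFC 3986). The check below
| 275 | + // uses 253 because a 255-octet DNS name is at most 253 characters long when written in dotted text form.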
| 276 | + // @see https://tools.ietf.org/html/rfc3986 |
| 277 | + // @see http://blogs.msdn.com/b/oldnewthing/archive/2012/04/12/10292868.aspx |
| 278 | + |
| 279 | + if (Str::length($hostname) > 253) { |
| 280 | + return null; |
| 281 | + } |
| 282 | + |
| 283 | + // The DNS itself places only one restriction on the particular labels that can be used to identify resource |
| 284 | + // records. That one restriction relates to the length of the label and the full name. The length of any one |
| 285 | + // label is limited to between 1 and 63 octets. A full domain name is limited to 255 octets (including the |
| 286 | + // separators). |
| 287 | + // |
| 288 | + // @see http://tools.ietf.org/html/rfc2181 |
| 289 | + |
| 290 | + try { |
| 291 | + $asciiHostname = $this->idn->toASCII($hostname); |
| 292 | + } catch (OutOfBoundsException $e) { |
| 293 | + return null; |
| 294 | + } |
| 295 | + |
| 296 | + if (0 === preg_match(self::HOSTNAME_PATTERN, $asciiHostname)) { |
| 297 | + return null; |
| 298 | + } |
| 299 | + |
265 | 300 | $suffix = $this->parseSuffix($hostname); |
266 | 301 |
|
267 | 302 | if (null === $suffix) { |
|
0 commit comments