Skip to content
This repository was archived by the owner on Sep 20, 2021. It is now read-only.

Commit 05740f1

Browse files
committed
Improve Lexer's error handling of invalid UTF-8 strings.
1 parent c620f44 commit 05740f1

File tree

3 files changed

+70
-3
lines changed

3 files changed

+70
-3
lines changed

Exception/InternalError.php

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<?php
2+
3+
namespace Hoa\Compiler\Exception;
4+
5+
use LogicException;
6+
7+
/**
 * Signals a failure that most likely points to an internal issue of the
 * Hoa Compiler library.
 *
 * Regardless of the source of the bug, please report this exception to the
 * library maintainers: even if the bug is yours, this exception must not
 * happen.
 */
final class InternalError extends LogicException
{
    /**
     * @param string          $message  Description of the internal failure.
     * @param \Exception|null $previous Exception that caused this one, if any.
     */
    public function __construct($message, \Exception $previous = null)
    {
        // `\Exception` is fully qualified on purpose: an unqualified
        // `Exception` would be resolved relative to the current namespace
        // instead of the global exception class, so arbitrary causes could
        // not be chained.
        // The exception code is always 0; only the message and the cause
        // are forwarded to the parent constructor.
        parent::__construct($message, 0, $previous);
    }
}

Llk/Lexer.php

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
namespace Hoa\Compiler\Llk;
3838

3939
use Hoa\Compiler;
40+
use Hoa\Compiler\Exception\InternalError;
4041

4142
/**
4243
* Class \Hoa\Compiler\Llk\Lexer.
@@ -110,6 +111,8 @@ public function __construct(array $pragmas = [])
110111
*/
111112
public function lexMe($text, array $tokens)
112113
{
114+
$this->validateInputInUnicodeMode($text);
115+
113116
$this->_text = $text;
114117
$this->_tokens = $tokens;
115118
$this->_nsStack = null;
@@ -272,9 +275,9 @@ protected function nextToken($offset)
272275
*/
273276
protected function matchLexeme($lexeme, $regex, $offset)
274277
{
275-
$_regex = str_replace('#', '\#', $regex);
276-
$preg = preg_match(
277-
'#\G(?|' . $_regex . ')#' . $this->_pcreOptions,
278+
$_regex = '#\G(?|' . str_replace('#', '\#', $regex) . ')#' . $this->_pcreOptions;
279+
$preg = @preg_match(
280+
$_regex,
278281
$this->_text,
279282
$matches,
280283
0,
@@ -285,6 +288,16 @@ protected function matchLexeme($lexeme, $regex, $offset)
285288
return null;
286289
}
287290

291+
if (false === $preg) {
292+
throw new Compiler\Exception\InternalError(
293+
sprintf(
294+
'Lexer encountered a PCRE error (code: %d), full regex: "%s".',
295+
preg_last_error(),
296+
$_regex
297+
)
298+
);
299+
}
300+
288301
if ('' === $matches[0]) {
289302
throw new Compiler\Exception\Lexer(
290303
'A lexeme must not match an empty value, which is the ' .
@@ -300,4 +313,17 @@ protected function matchLexeme($lexeme, $regex, $offset)
300313
'length' => mb_strlen($matches[0])
301314
];
302315
}
316+
317+
/**
 * Reject text that is not valid UTF-8 when the PCRE options of this lexer
 * contain the `u` modifier.
 *
 * Nothing is checked when the `u` modifier is absent. The method returns
 * no value: it either completes silently or throws.
 *
 * @param string $text Text to validate.
 * @return void
 * @throws \Hoa\Compiler\Exception\Lexer When the `u` modifier is set and
 *                                       $text is not valid UTF-8.
 */
private function validateInputInUnicodeMode($text)
{
    $unicodeMode = false !== strpos($this->_pcreOptions, 'u');

    if (!$unicodeMode) {
        return;
    }

    if (mb_check_encoding($text, 'utf-8')) {
        return;
    }

    throw new Compiler\Exception\Lexer(
        'Text is not valid utf-8 string, you probably need to switch "lexer.unicode" setting off.'
    );
}
303329
}

Test/Unit/Llk/Lexer.php

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,4 +496,27 @@ public function case_unicode_disabled()
496496
''
497497
);
498498
}
499+
500+
/**
 * With `lexer.unicode` enabled, feeding a byte sequence that is not valid
 * UTF-8 must end in a Lexer exception carrying the "not valid utf-8"
 * message.
 *
 * The exception is asserted on the `next()` call made on the value
 * returned by `lexMe()`.
 */
public function case_invalid_utf8_with_unicode_mode()
{
    // A lone 0xFF byte can never appear in well-formed UTF-8.
    $invalidByte = "\xFF";

    $this
        ->given(
            $sut     = new SUT(['lexer.unicode' => true]),
            $input   = $invalidByte,
            $lexemes = ['default' => ['foo' => $invalidByte]]
        )
        ->when($lexed = $sut->lexMe($input, $lexemes))
        ->then
            ->exception(function () use ($lexed) {
                $lexed->next();
            })
                ->isInstanceOf(LUT\Exception\Lexer::class)
                ->hasMessage(
                    'Text is not valid utf-8 string, you probably need to switch "lexer.unicode" setting off.'
                );
}
499522
}

0 commit comments

Comments
 (0)