Skip to content

Commit 10cff4e

Browse files
authored
implement PHP-like escape sequences in strings (#232)
This also fixes a problem with \n appearing in strings and not being rendered correctly, leading to problems in strings like `"2 \\neq 3"`.
1 parent 3c12eaa commit 10cff4e

File tree

3 files changed

+251
-11
lines changed

3 files changed

+251
-11
lines changed

classes/local/lexer.php

Lines changed: 135 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,136 @@ private function read_number(): token {
236236
return new token(token::NUMBER, floatval($result), $startingposition['row'], $startingposition['column'], $metadata);
237237
}
238238

239+
/**
 * Read an escape sequence inside a string, following the rules PHP applies to its own strings.
 *
 * When this method is called, the next character in the input stream must be the backslash
 * that starts the (possible) escape sequence. The backslash is always consumed. If a known
 * escape sequence follows, its characters are consumed as well and the character it stands
 * for is returned. Otherwise a single backslash is returned and the following characters are
 * left in the stream, so the caller reads them as ordinary characters.
 *
 * An invalid \u{...} sequence is reported via the input stream's die() method
 * (NOTE(review): presumably this throws, as the original code relies on it to abort).
 *
 * @param bool $doublequote whether the string is delimited by double quotes
 * @return string the character(s) the escape sequence stands for
 */
private function read_escape_sequence(bool $doublequote = true): string {
    // Consume the backslash and look at the character immediately following.
    $this->inputstream->read();
    $afterbackslash = $this->inputstream->peek();

    // If the backslash is followed by another backslash, also consume the second and
    // return a backslash.
    if ($afterbackslash === '\\') {
        $this->inputstream->read();
        return '\\';
    }

    // If the string is delimited by single quotes, we simply return the backslash, because
    // all other escape sequences are treated literally. Note that this function
    // is not called if the backslash was used to escape the string's opening delimiter.
    if (!$doublequote) {
        return '\\';
    }

    // In strings delimited by double quotes, some escape sequences have a special meaning.
    // We return them here. The character following the backslash has to be consumed.
    switch ($afterbackslash) {
        case 'n':
            $this->inputstream->read();
            return "\n";
        case 'r':
            $this->inputstream->read();
            return "\r";
        case 't':
            $this->inputstream->read();
            return "\t";
        case 'v':
            $this->inputstream->read();
            return "\v";
        case 'e':
            $this->inputstream->read();
            return "\e";
        case 'f':
            $this->inputstream->read();
            return "\f";
        case '$':
            $this->inputstream->read();
            return "\$";
    }

    // The backslash can be followed by an octal number, i.e. one, two or three digits from 0
    // up to and including 7. In this case, we return the corresponding character. If there are
    // more than 3 digits, the remaining ones are not part of the escape sequence and will be
    // read as ordinary characters afterwards.
    if (preg_match('/[0-7]/', $afterbackslash)) {
        $octal = 0;
        $digits = 0;
        $possiblenextdigit = $afterbackslash;
        while (preg_match('/[0-7]/', $possiblenextdigit) && $digits < 3) {
            $digits++;
            $octal = 8 * $octal + intval($this->inputstream->read());
            $possiblenextdigit = $this->inputstream->peek();
        }
        // Values above 255 wrap around, because chr() works modulo 256.
        return chr($octal);
    }

    // The backslash can be followed by x in order to have a hexadecimal escape sequence.
    // In this case, there must be one or two hexadecimal digits after the x; if there are more,
    // that is not an error, but the extra digits will simply not be part of the escape sequence.
    if ($afterbackslash === 'x') {
        $hex = 0;
        $digits = 0;
        // The x itself has not been consumed yet, so the first candidate digit is one
        // position further ahead.
        $afterx = $this->inputstream->peek(1);
        while (preg_match('/[0-9A-F]/i', $afterx) && $digits < 2) {
            $digits++;
            $hex = 16 * $hex + hexdec($afterx);
            // Consume the character *before* the digit just evaluated (the x in the first
            // round, the first digit in the second round), then look ahead again.
            $this->inputstream->read();
            $afterx = $this->inputstream->peek(1);
        }
        // If there was no hexadecimal digit after the x, we must simply return \x verbatim.
        // Note that the x character must be consumed.
        if ($digits === 0) {
            $this->inputstream->read();
            return '\x';
        }
        // Consume the last digit.
        $this->inputstream->read();
        return chr($hex);
    }

    // Finally, the backslash can be used to reference a unicode codepoint. The codepoint must be
    // wrapped in curly braces and must be given as a hexadecimal number, not larger than 0x10FFFF.
    // A missing or an invalid codepoint shall trigger an error message, mimicking PHP's behaviour.
    if ($afterbackslash === 'u') {
        $afteru = $this->inputstream->peek(1);
        // If the u is not followed by an opening brace, we just return the backslash. The u
        // and all the rest will be read separately.
        if ($afteru !== '{') {
            return '\\';
        }
        // So there was an opening brace, let's consume the u character.
        $this->inputstream->read();

        // Read all digits and calculate the codepoint's value. As above, we always look one
        // position ahead and consume the character before the digit just evaluated.
        $possibledigit = $this->inputstream->peek(1);
        $codepoint = 0;
        $digits = 0;
        while (preg_match('/[0-9A-F]/i', $possibledigit)) {
            $digits++;
            $codepoint = 16 * $codepoint + hexdec($possibledigit);
            $this->inputstream->read();
            $possibledigit = $this->inputstream->peek(1);
        }
        // If the character following the last digit is not a closing curly brace or if there
        // was no digit at all, that is a syntax error.
        if ($possibledigit !== '}' || $digits === 0) {
            $this->inputstream->die(get_string('error_invalidcodepoint', 'qtype_formulas'));
        }
        // Make sure the codepoint is not too large.
        if ($codepoint > 0x10FFFF) {
            $this->inputstream->die(get_string('error_invalidcodepoint_toolarge', 'qtype_formulas'));
        }
        // Some codepoints inside the valid range (e.g. surrogates like D800) cannot be encoded.
        // For those, mb_chr() returns false, which must not be passed on as the return value.
        $character = mb_chr($codepoint, 'UTF-8');
        if ($character === false) {
            $this->inputstream->die(get_string('error_invalidcodepoint', 'qtype_formulas'));
        }
        // Consume the last digit and the curly brace and return the (probably multi-byte) character.
        $this->inputstream->read();
        $this->inputstream->read();
        return $character;
    }

    // No escape sequence found? Then just return the backslash.
    return '\\';
}
368+
239369
/**
240370
* Read a string token from the input stream.
241371
*
@@ -253,15 +383,16 @@ private function read_string(): token {
253383
while ($currentchar !== input_stream::EOF) {
254384
$nextchar = $this->inputstream->peek();
255385
// A backslash could be used to escape the opening/closing delimiter inside the string.
386+
// Also, we can have \n for newline or \t for tabulator. Furthermore, it is possible
387+
// to write \\ for the backslash. However, escaping is not mandatory, so it is
388+
// perfectly valid to have 2 \ 3 which would mean two-backslash-three.
256389
if ($nextchar == '\\') {
257390
$followedby = $this->inputstream->peek(1);
258391
if ($followedby === $opener) {
259392
// Consume the backslash. The quote will be appended later.
260393
$this->inputstream->read();
261-
} else if ($followedby === 't' || $followedby === 'n') {
262-
$this->inputstream->read();
263-
$currentchar = $this->inputstream->read();
264-
$result .= ($followedby === 't' ? "\t" : "\n");
394+
} else {
395+
$result .= $this->read_escape_sequence($opener === '"');
265396
continue;
266397
}
267398
} else if ($nextchar === $opener) {

lang/en/qtype_formulas.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,8 @@
175175
$string['error_inv_smallest'] = 'When using inv(), the smallest number in the list must be 0 or 1.';
176176
$string['error_invalidalgebraic'] = '\'{$a}\' is not a valid algebraic expression.';
177177
$string['error_invalidargsep'] = 'Syntax error: invalid use of separator token \',\'.';
178+
$string['error_invalidcodepoint'] = 'Invalid UTF-8 codepoint escape sequence.';
179+
$string['error_invalidcodepoint_toolarge'] = 'Invalid UTF-8 codepoint escape sequence: Codepoint larger than 0x10FFFF.';
178180
$string['error_invalidcontext'] = 'Invalid variable context given, aborting import.';
179181
$string['error_invalidrandvardef'] = 'Invalid definition of a random variable - you must provide a list of possible values.';
180182
$string['error_invalidrangesep'] = 'Syntax error: invalid use of range separator \':\'.';

tests/lexer_test.php

Lines changed: 114 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -616,22 +616,30 @@ public static function provide_valid_strings(): array {
616616
['foo"bar', '"foo\"bar"'],
617617
// Test usage of a single quote in a double quote string.
618618
["foo'bar", '"foo\'bar"'],
619-
// Test usage of the escape sequence \n in a single quote string.
620-
["foo\nbar", "'foo\\nbar'"],
619+
// Test usage of the escape sequence \n in a single quote string. It should be taken literally.
620+
['foo\nbar', "'foo\\nbar'"],
621621
// Test usage of the escape sequence \n in a double quote string.
622622
["foo\nbar", '"foo\nbar"'],
623-
// Test usage of the escape sequence \t in a single quote string.
624-
["foo\tbar", "'foo\\tbar'"],
623+
// Test usage of the escape sequence \t in a single quote string. It should be taken literally.
624+
['foo\tbar', "'foo\\tbar'"],
625625
// Test usage of the escape sequence \t in a double quote string.
626626
["foo\tbar", '"foo\tbar"'],
627627
// Test usage of an unescaped backslash in a double quote string.
628-
['foo\bar', '"foo\bar"'],
628+
['foo\bar', '"foo\\bar"'],
629629
// Test usage of an unescaped backslash in single quote string.
630630
['foo\bar', "'foo\\bar'"],
631631
// Test usage of an escaped backslash in a double quote string.
632-
['foo\\\\bar', '"foo\\\\bar"'],
632+
['foo\bar', '"foo\\\\bar"'],
633633
// Test usage of an escaped backslash in single quote string.
634-
['foo\\\\bar', "'foo\\\\bar'"],
634+
['foo\bar', "'foo\\\\bar'"],
635+
// Test usage of a verbatim \n (not a \n newline code) in a string.
636+
['2\neq3', '"2\\\\neq3"'],
637+
// Test of backslash in various situations.
638+
['\\', '"\\\\"'],
639+
[' \ ', '" \ "'],
640+
[' \ ', '" \\ "'],
641+
['2 \ 3', '"2 \\ 3"'],
642+
['2 \ 3', '"2 \ 3"'],
635643
];
636644
}
637645

@@ -693,6 +701,105 @@ public function test_various_inputs($expected, $input): void {
693701
self::assertEquals($expected, $tokens[0]->value);
694702
}
695703

704+
/**
 * Data provider for test_escape_sequences().
 *
 * Each dataset is a pair: the expected value of the first token (or, if it starts with
 * an exclamation mark, the expected end of the error message) followed by the simulated
 * input. Inputs are written with single-quoted PHP strings wherever the backslash sequence
 * must reach the lexer untouched.
 *
 * @return array
 */
public static function provide_escape_sequences(): array {
    return [
        // The lexer translates \n into a line feed on every platform, so the expectation
        // must not depend on PHP_EOL (which is CR+LF on Windows).
        ["a\nx", '"a\nx"'],
        ['"', '"\""'],
        ['\\', '"\\\\"'],
        ['"', '"\\""'],
        ["\n", '"\n"'],
        ["\t", '"\t"'],
        ["\r", '"\r"'],
        ["\v", '"\v"'],
        ["\e", '"\e"'],
        ["\f", '"\f"'],
        ['$', '"\$"'],
        ['\m', '"\m"'],
        ['A1', '"\1011"'],
        ['8', '"\70"'],
        ['A', '"\101"'],
        [chr(7), '"\7"'],
        [chr(7) . '9', '"\79"'],
        ['8', '"\70"'],
        ['\90', '"\90"'],
        ['S4', '"\1234"'],
        ['A', '"\x41"'],
        ["\n", '"\xA"'],
        ['A3', '"\x413"'],
        ['\xG13', '"\xG13"'],
        ['\u', '"\u"'],
        ['\u1234', '"\u1234"'],
        ['A', '"\u{41}"'],
        ["\u{10FFFF}", '"\u{10FFFF}"'],
        ['🐘', '"\u{1F418}"'],
        ['!Unterminated string, started at row 1, column 1.', '"\\"'],
        ['!Invalid UTF-8 codepoint escape sequence: Codepoint larger than 0x10FFFF.', '"\u{110000}"'],
        ['!Invalid UTF-8 codepoint escape sequence.', '"\u{}"'],
        // For strings that are delimited by single quotes, only \\ and \' are special.
        ['a\nx', "'a\\nx'"],
        ["'", "'\\''"],
        ['\\', "'\\\\'"],
        [' \\ ', "' \\ '"],
        ["'", "'\\''"],
        ['\n', "'\\n'"],
        ['\t', "'\\t'"],
        ['\r', "'\\r'"],
        ['\v', "'\\v'"],
        ['\e', "'\\e'"],
        ['\f', "'\\f'"],
        ['\$', "'\\$'"],
        ['\1011', "'\\1011'"],
        ['\70', "'\\70'"],
        ['\101', "'\\101'"],
        ['\7', "'\\7'"],
        ['\70', "'\\70'"],
        ['\90', "'\\90'"],
        ['\1234', "'\\1234'"],
        ['\xA', "'\\xA'"],
        ['\x41', "'\\x41'"],
        ['\x413', "'\\x413'"],
        ['\xG13', "'\\xG13'"],
        ['\u', "'\\u'"],
        ['\u123', "'\\u123'"],
        ['\u{41}', "'\\u{41}'"],
        ['\u{10FFFF}', "'\\u{10FFFF}'"],
        ['\u{1F418}', "'\\u{1F418}'"],
        ['!Unterminated string, started at row 1, column 1.', "'\\'"],
        ['\u{110000}', "'\\u{110000}'"],
        ['\u{}', "'\\u{}'"],
    ];
}
777+
778+
/**
 * Check that the lexer interprets escape sequences in strings as expected.
 *
 * The input is tokenised; any exception message is recorded instead of being propagated.
 * If the expectation starts with an exclamation mark, the rest of it must match the end
 * of the recorded error message. Otherwise, no error may have occurred and the value of
 * the first token must be equal to the expectation.
 *
 * @param string $expected expected interpretation or error message (marked with ! at start)
 * @param string $input simulated input
 * @dataProvider provide_escape_sequences
 */
public function test_escape_sequences($expected, $input): void {
    $errormessage = '';
    try {
        $tokens = (new lexer($input))->get_tokens();
    } catch (Exception $e) {
        $errormessage = $e->getMessage();
    }

    // Regular case: the input must be lexed without error and yield the expected value.
    $expectserror = ($expected[0] === '!');
    if (!$expectserror) {
        self::assertEmpty($errormessage);
        self::assertEquals($expected, reset($tokens)->value);
        return;
    }

    // Error case: strip the leading marker and compare with the end of the message.
    self::assertStringEndsWith(substr($expected, 1), $errormessage);
}
802+
696803
/**
697804
* Data provider.
698805
*

0 commit comments

Comments
 (0)