@@ -236,6 +236,136 @@ private function read_number(): token {
236236 return new token (token::NUMBER , floatval ($ result ), $ startingposition ['row ' ], $ startingposition ['column ' ], $ metadata );
237237 }
238238
/**
 * Read one escape sequence from the input stream and return the text it stands for.
 *
 * The stream must be positioned right before the backslash. The backslash is always
 * consumed; what else is consumed depends on the sequence that follows:
 * - \\ yields a single backslash, regardless of the delimiter,
 * - in single-quoted strings every other sequence is literal, so only the backslash is returned,
 * - in double-quoted strings \n, \r, \t, \v, \e, \f and \$ are translated,
 * - \NNN (one to three octal digits) and \xHH (one or two hexadecimal digits) yield one byte,
 * - \u{H...} yields the character for the given hexadecimal Unicode codepoint,
 * - anything else yields the backslash only; the following character is left in the stream.
 *
 * This function is not meant to be called when the backslash escapes the string's
 * opening delimiter; the caller deals with that case itself.
 *
 * @param bool $doublequote whether the string is delimited by double quotes
 * @return string the replacement text for the escape sequence
 */
private function read_escape_sequence(bool $doublequote = true): string {
    // Consume the backslash and look at the character immediately following.
    $this->inputstream->read();
    $afterbackslash = $this->inputstream->peek();

    // An escaped backslash is valid for both kinds of delimiters. Consume the second
    // backslash as well and return just one.
    if ($afterbackslash === '\\') {
        $this->inputstream->read();
        return '\\';
    }

    // In strings delimited by single quotes, all other escape sequences are taken
    // literally, so we simply hand back the backslash and leave the rest in the stream.
    if (!$doublequote) {
        return '\\';
    }

    // In strings delimited by double quotes, some single-character escape sequences have a
    // special meaning. The character following the backslash has to be consumed.
    $simpleescapes = [
        'n' => "\n",
        'r' => "\r",
        't' => "\t",
        'v' => "\v",
        'e' => "\e",
        'f' => "\f",
        '$' => '$',
    ];
    if (is_string($afterbackslash) && array_key_exists($afterbackslash, $simpleescapes)) {
        $this->inputstream->read();
        return $simpleescapes[$afterbackslash];
    }

    // The backslash can be followed by an octal number, i. e. one, two or three digits from 0
    // up to and including 7. In this case, we return the corresponding byte. If there are more
    // than 3 digits, the remaining ones are not part of the escape sequence and stay in the stream.
    if (preg_match('/[0-7]/', $afterbackslash)) {
        $octal = 0;
        for ($digits = 0; $digits < 3 && preg_match('/[0-7]/', $this->inputstream->peek()); $digits++) {
            $octal = 8 * $octal + intval($this->inputstream->read());
        }
        // Three octal digits can exceed 255 (e. g. \777). Like PHP, we silently wrap around,
        // but we do so explicitly rather than relying on chr() to do it for us.
        return chr($octal % 256);
    }

    // The backslash can be followed by x in order to have a hexadecimal escape sequence.
    // In this case, there must be one or two hexadecimal digits after the x; if there are more,
    // that is not an error, but the extra digits will simply not be part of the escape sequence.
    if ($afterbackslash === 'x') {
        // The x is consumed in any case, whether digits follow or not.
        $this->inputstream->read();
        $hex = 0;
        $digits = 0;
        while ($digits < 2 && preg_match('/[0-9A-F]/i', $this->inputstream->peek())) {
            $hex = 16 * $hex + hexdec($this->inputstream->read());
            $digits++;
        }
        // If there was no hexadecimal digit after the x, we must simply return \x verbatim.
        if ($digits === 0) {
            return '\x';
        }
        return chr($hex);
    }

    // Finally, the backslash can be used to reference a Unicode codepoint. The codepoint must be
    // wrapped in curly braces and must be given as a hexadecimal number, not larger than 0x10FFFF.
    // A missing or an invalid codepoint shall trigger an error message.
    if ($afterbackslash === 'u') {
        // If the u is not followed by an opening brace, we just return the backslash. The u
        // and all the rest will be read separately.
        if ($this->inputstream->peek(1) != '{') {
            return '\\';
        }
        // So there was an opening brace, let's consume the u character.
        $this->inputstream->read();

        // Read all digits and calculate the codepoint's value. We always look one character
        // ahead and consume with a delay of one, so the stream stays on the last character
        // read so far in case we have to report an error.
        $codepoint = 0;
        $digits = 0;
        $possibledigit = $this->inputstream->peek(1);
        while (preg_match('/[0-9A-F]/i', $possibledigit)) {
            $codepoint = 16 * $codepoint + hexdec($possibledigit);
            $digits++;
            $this->inputstream->read();
            $possibledigit = $this->inputstream->peek(1);
        }
        // If there was no digit at all or if the character following the last digit is not a
        // closing curly brace, that is a syntax error.
        if ($possibledigit != '}' || $digits === 0) {
            $this->inputstream->die(get_string('error_invalidcodepoint', 'qtype_formulas'));
        }
        // Make sure the codepoint is not too large.
        if ($codepoint > 0x10FFFF) {
            $this->inputstream->die(get_string('error_invalidcodepoint_toolarge', 'qtype_formulas'));
        }
        // Some codepoints within the valid range cannot be encoded, e. g. the surrogates from
        // U+D800 to U+DFFF. For those, mb_chr() returns false, which must not leave this function,
        // because it is declared to return a string. We treat them as invalid codepoints.
        $character = mb_chr($codepoint);
        if ($character === false) {
            $this->inputstream->die(get_string('error_invalidcodepoint', 'qtype_formulas'));
        }
        // Consume the last digit and the curly brace and return the (probably multi-byte) character.
        $this->inputstream->read();
        $this->inputstream->read();
        return $character;
    }

    // No escape sequence found? Then just return the backslash.
    return '\\';
}
368+
239369 /**
240370 * Read a string token from the input stream.
241371 *
@@ -253,15 +383,16 @@ private function read_string(): token {
253383 while ($ currentchar !== input_stream::EOF ) {
254384 $ nextchar = $ this ->inputstream ->peek ();
255385 // A backslash could be used to escape the opening/closing delimiter inside the string.
386+ // Also, we can have \n for newline or \t for tabulator. Furthermore, it is possible
387+ // to write \\ for the backslash. However, escaping is not mandatory, so it is
388+ // perfectly valid to have 2 \ 3 which would mean two-backslash-three.
256389 if ($ nextchar == '\\' ) {
257390 $ followedby = $ this ->inputstream ->peek (1 );
258391 if ($ followedby === $ opener ) {
259392 // Consume the backslash. The quote will be appended later.
260393 $ this ->inputstream ->read ();
261- } else if ($ followedby === 't ' || $ followedby === 'n ' ) {
262- $ this ->inputstream ->read ();
263- $ currentchar = $ this ->inputstream ->read ();
264- $ result .= ($ followedby === 't ' ? "\t" : "\n" );
394+ } else {
395+ $ result .= $ this ->read_escape_sequence ($ opener === '" ' );
265396 continue ;
266397 }
267398 } else if ($ nextchar === $ opener ) {
0 commit comments