|
249 | 249 | \indextext{translation!phases|)} |
250 | 250 | \end{enumerate} |
251 | 251 |
|
252 | | -\rSec1[lex.charset]{Character sets} |
| 252 | +\rSec1[lex.char]{Characters} |
| 253 | + |
| 254 | +\rSec2[lex.charset]{Character sets} |
253 | 255 |
|
254 | 256 | \pnum |
255 | 257 | \indextext{character set|(}% |
|
326 | 328 | \end{floattable} |
327 | 329 |
|
328 | 330 | \pnum |
329 | | -The \grammarterm{universal-character-name} construct provides a way to name |
330 | | -other characters. |
| 331 | +The \defnadj{basic literal}{character set} consists of |
| 332 | +all characters of the basic character set, |
| 333 | +plus the control characters specified in \tref{lex.charset.literal}. |
| 334 | + |
| 335 | +\begin{floattable}{Additional control characters in the basic literal character set}{lex.charset.literal}{ll} |
| 336 | +\topline |
| 337 | +\ohdrx{2}{character} \\ \capsep |
| 338 | +\ucode{0000} & \uname{null} \\ |
| 339 | +\ucode{0007} & \uname{alert} \\ |
| 340 | +\ucode{0008} & \uname{backspace} \\ |
| 341 | +\ucode{000d} & \uname{carriage return} \\ |
| 342 | +\end{floattable} |
| 343 | + |
| 344 | +\pnum |
| 345 | +A \defn{code unit} is an integer value |
| 346 | +of character type\iref{basic.fundamental}. |
| 347 | +Characters in a \grammarterm{character-literal} |
| 348 | +other than a multicharacter or non-encodable character literal or |
| 349 | +in a \grammarterm{string-literal} are encoded as |
| 350 | +a sequence of one or more code units, as determined |
| 351 | +by the \grammarterm{encoding-prefix}\iref{lex.ccon,lex.string}; |
| 352 | +this is termed the respective \defnadj{literal}{encoding}. |
| 353 | +The \defnadj{ordinary literal}{encoding} is |
| 354 | +the encoding applied to an ordinary character or string literal. |
| 355 | +The \defnadj{wide literal}{encoding} is the encoding applied |
| 356 | +to a wide character or string literal. |
| 357 | + |
| 358 | +\pnum |
| 359 | +A literal encoding or a locale-specific encoding of one of |
| 360 | +the execution character sets\iref{character.seq} |
| 361 | +encodes each element of the basic literal character set as |
| 362 | +a single code unit with non-negative value, |
| 363 | +distinct from the code unit for any other such element. |
| 364 | +\begin{note} |
| 365 | +A character not in the basic literal character set |
| 366 | +can be encoded with more than one code unit; |
| 367 | +the value of such a code unit can be the same as |
| 368 | +that of a code unit for an element of the basic literal character set. |
| 369 | +\end{note} |
| 370 | +\indextext{character!null}% |
| 371 | +\indextext{wide-character!null}% |
| 372 | +The \unicode{0000}{null} character is encoded as the value \tcode{0}. |
| 373 | +No other element of the translation character set |
| 374 | +is encoded with a code unit of value \tcode{0}. |
| 375 | +The code unit value of each decimal digit character after the digit \tcode{0} (\ucode{0030}) |
| 376 | +shall be one greater than the value of the previous. |
| 377 | +The ordinary and wide literal encodings are otherwise |
| 378 | +\impldef{ordinary and wide literal encodings}. |
| 379 | +\indextext{UTF-8}% |
| 380 | +\indextext{UTF-16}% |
| 381 | +\indextext{UTF-32}% |
| 382 | +For a UTF-8, UTF-16, or UTF-32 literal, |
| 383 | +the implementation shall encode |
| 384 | +the Unicode scalar value |
| 385 | +corresponding to each character of the translation character set |
| 386 | +as specified in the Unicode Standard |
| 387 | +for the respective Unicode encoding form. |
| 388 | +\indextext{character set|)} |
| 389 | + |
| 390 | +\rSec2[lex.universal.char]{Universal character names} |
331 | 391 |
|
332 | 392 | \begin{bnf} |
333 | | -\nontermdef{n-char} \textnormal{one of}\br |
| 393 | +\nontermdef{n-char}\br |
334 | 394 | \textnormal{any member of the translation character set except the \unicode{007d}{right curly bracket} or new-line character} |
335 | 395 | \end{bnf} |
336 | 396 |
|
|
364 | 424 | named-universal-character |
365 | 425 | \end{bnf} |
366 | 426 |
|
| 427 | +\pnum |
| 428 | +The \grammarterm{universal-character-name} construct provides a way to name any |
| 429 | +element in the translation character set using just the basic character set. |
| 430 | +If a \grammarterm{universal-character-name} outside |
| 431 | +the \grammarterm{c-char-sequence}, \grammarterm{s-char-sequence}, or |
| 432 | +\grammarterm{r-char-sequence} of a \grammarterm{character-literal} or |
| 433 | +\grammarterm{string-literal} |
| 434 | +(in either case, including within a \grammarterm{user-defined-literal}) |
| 435 | +corresponds to a control character or to a character in the basic character set, |
| 436 | +the program is ill-formed. |
| 437 | +\begin{note} |
| 438 | +A sequence of characters resembling a \grammarterm{universal-character-name} in an |
| 439 | +\grammarterm{r-char-sequence}\iref{lex.string} does not form a |
| 440 | +\grammarterm{universal-character-name}. |
| 441 | +\end{note} |
| 442 | + |
367 | 443 | \pnum |
368 | 444 | A \grammarterm{universal-character-name} |
369 | 445 | of the form \tcode{\textbackslash u} \grammarterm{hex-quad}, |
|
391 | 467 | None of these names or aliases have leading or trailing spaces. |
392 | 468 | \end{note} |
393 | 469 |
|
394 | | -\pnum |
395 | | -If a \grammarterm{universal-character-name} outside |
396 | | -the \grammarterm{c-char-sequence}, \grammarterm{s-char-sequence}, or |
397 | | -\grammarterm{r-char-sequence} of |
398 | | -a \grammarterm{character-literal} or \grammarterm{string-literal} |
399 | | -(in either case, including within a \grammarterm{user-defined-literal}) |
400 | | -corresponds to a control character or |
401 | | -to a character in the basic character set, the program is ill-formed. |
402 | | -\begin{note} |
403 | | -A sequence of characters resembling a \grammarterm{universal-character-name} in an |
404 | | -\grammarterm{r-char-sequence}\iref{lex.string} does not form a |
405 | | -\grammarterm{universal-character-name}. |
406 | | -\end{note} |
407 | | - |
408 | | -\pnum |
409 | | -The \defnadj{basic literal}{character set} consists of |
410 | | -all characters of the basic character set, |
411 | | -plus the control characters specified in \tref{lex.charset.literal}. |
412 | | - |
413 | | -\begin{floattable}{Additional control characters}{lex.charset.literal}{ll} |
414 | | -\topline |
415 | | -\ohdrx{2}{character} \\ \capsep |
416 | | -\ucode{0000} & \uname{null} \\ |
417 | | -\ucode{0007} & \uname{alert} \\ |
418 | | -\ucode{0008} & \uname{backspace} \\ |
419 | | -\ucode{000d} & \uname{carriage return} \\ |
420 | | -\end{floattable} |
421 | | - |
422 | | -\pnum |
423 | | -A \defn{code unit} is an integer value |
424 | | -of character type\iref{basic.fundamental}. |
425 | | -Characters in a \grammarterm{character-literal} |
426 | | -other than a multicharacter or non-encodable character literal or |
427 | | -in a \grammarterm{string-literal} are encoded as |
428 | | -a sequence of one or more code units, as determined |
429 | | -by the \grammarterm{encoding-prefix}\iref{lex.ccon,lex.string}; |
430 | | -this is termed the respective \defnadj{literal}{encoding}. |
431 | | -The \defnadj{ordinary literal}{encoding} is |
432 | | -the encoding applied to an ordinary character or string literal. |
433 | | -The \defnadj{wide literal}{encoding} is the encoding applied |
434 | | -to a wide character or string literal. |
435 | | - |
436 | | -\pnum |
437 | | -A literal encoding or a locale-specific encoding of one of |
438 | | -the execution character sets\iref{character.seq} |
439 | | -encodes each element of the basic literal character set as |
440 | | -a single code unit with non-negative value, |
441 | | -distinct from the code unit for any other such element. |
442 | | -\begin{note} |
443 | | -A character not in the basic literal character set |
444 | | -can be encoded with more than one code unit; |
445 | | -the value of such a code unit can be the same as |
446 | | -that of a code unit for an element of the basic literal character set. |
447 | | -\end{note} |
448 | | -\indextext{character!null}% |
449 | | -\indextext{wide-character!null}% |
450 | | -The \unicode{0000}{null} character is encoded as the value \tcode{0}. |
451 | | -No other element of the translation character set |
452 | | -is encoded with a code unit of value \tcode{0}. |
453 | | -The code unit value of each decimal digit character after the digit \tcode{0} (\ucode{0030}) |
454 | | -shall be one greater than the value of the previous. |
455 | | -The ordinary and wide literal encodings are otherwise |
456 | | -\impldef{ordinary and wide literal encodings}. |
457 | | -\indextext{UTF-8}% |
458 | | -\indextext{UTF-16}% |
459 | | -\indextext{UTF-32}% |
460 | | -For a UTF-8, UTF-16, or UTF-32 literal, |
461 | | -the implementation shall encode |
462 | | -the Unicode scalar value |
463 | | -corresponding to each character of the translation character set |
464 | | -as specified in the Unicode Standard |
465 | | -for the respective Unicode encoding form. |
466 | | -\indextext{character set|)} |
467 | | - |
468 | 470 | \rSec1[lex.pptoken]{Preprocessing tokens} |
469 | 471 |
|
470 | 472 | \indextext{token!preprocessing|(}% |
|
0 commit comments