44
55use crate :: cell:: { Cell , Ref , RefCell , RefMut , SyncUnsafeCell , UnsafeCell } ;
66use crate :: char:: { EscapeDebugExtArgs , MAX_LEN_UTF8 } ;
7+ use crate :: hint:: assert_unchecked;
78use crate :: marker:: { PhantomData , PointeeSized } ;
89use crate :: num:: fmt as numfmt;
910use crate :: ops:: Deref ;
@@ -640,22 +641,29 @@ impl<'a> Formatter<'a> {
640641//
641642// The template byte sequence is the concatenation of parts of the following types:
642643//
643- // - Literal string piece (1-127 bytes) :
644+ // - Literal string piece:
644645// Pieces that must be formatted verbatim (e.g. "hello " and "\n" in "hello {name}\n")
645- // are represented as a single byte containing their length followed directly by the bytes
646- // of the string:
646+ // appear literally in the template byte sequence, prefixed by their length.
647+ //
648+ // For pieces of up to 127 bytes, these are represented as a single byte containing the
649+ // length followed directly by the bytes of the string:
647650// ┌───┬────────────────────────────┐
648651// │len│ `len` bytes (utf-8) │ (e.g. b"\x06hello ")
649652// └───┴────────────────────────────┘
650653//
651- // These strings can be 127 bytes at most, such that the `len` byte always has the highest
652- // bit cleared. Longer pieces are split into multiple pieces (at utf-8 boundaries).
654+ // Larger pieces of up to u16::MAX bytes are represented as a 0x80 byte followed by their
655+ // length as a 16-bit little-endian integer, followed by the bytes of the string:
656+ // ┌────┬─────────┬───────────────────────────┐
657+ // │0x80│ len │ `len` bytes (utf-8) │ (e.g. b"\x80\x00\x01hello … ")
658+ // └────┴─────────┴───────────────────────────┘
659+ //
660+ // Longer pieces are split into multiple pieces of max u16::MAX bytes (at utf-8 boundaries).
653661//
654662// - Placeholder:
655663// Placeholders (e.g. `{name}` in "hello {name}") are represented as a byte with the highest
656- // bit set, followed by zero or more fields depending on the flags set in the first byte:
664+ // two bits set, followed by zero or more fields depending on the flags in the first byte:
657665// ┌──────────┬┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┬┄┄┄┄┄┄┄┄┄┄┄┬┄┄┄┄┄┄┄┄┄┄┄┬┄┄┄┄┄┄┄┄┄┄┄┐
658- // │0b10______ │ flags ┊ width ┊ precision ┊ arg_index ┊ (e.g. b"\x82 \x05\0")
666+ // │0b11______ │ flags ┊ width ┊ precision ┊ arg_index ┊ (e.g. b"\xC2 \x05\0")
659667// └────││││││┴┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┴┄┄┄┄┄┄┄┄┄┄┄┴┄┄┄┄┄┄┄┄┄┄┄┴┄┄┄┄┄┄┄┄┄┄┄┘
660668// ││││││ 32 bit 16 bit 16 bit 16 bit
661669// │││││└─ flags present
@@ -665,12 +673,12 @@ impl<'a> Formatter<'a> {
665673// │└─ width indirect
666674// └─ precision indirect
667675//
668- // All fields other than the first byte are optional and only present when
669- // their corresponding flag is set in the first byte.
676+ // All fields other than the first byte are optional and only present when their
677+ // corresponding flag is set in the first byte.
670678//
671679// So, a fully default placeholder without any options is just a single byte:
672680// ┌──────────┐
673- // │0b10000000 │ (b"\x80 ")
681+ // │0b11000000 │ (b"\xC0 ")
674682// └──────────┘
675683//
676684// The fields are stored as little endian.
@@ -766,14 +774,21 @@ impl<'a> Arguments<'a> {
766774 // SAFETY: We can assume the template is valid.
767775 unsafe {
768776 let n = template. read ( ) ;
777+ template = template. add ( 1 ) ;
769778 if n == 0 {
770779 // End of template.
771780 break ;
772781 } else if n < 128 {
773- // Literal string piece.
782+ // Short literal string piece.
774783 length += n as usize ;
775- template = template. add ( 1 + n as usize ) ;
784+ template = template. add ( n as usize ) ;
785+ } else if n == 128 {
786+ // Long literal string piece.
787+ let len = usize:: from ( u16:: from_le_bytes ( template. cast_array ( ) . read ( ) ) ) ;
788+ length += len;
789+ template = template. add ( 2 + len) ;
776790 } else {
791+ assert_unchecked ( n >= 0xC0 ) ;
777792 // Placeholder piece.
778793 if length == 0 {
779794 starts_with_placeholder = true ;
@@ -783,7 +798,7 @@ impl<'a> Arguments<'a> {
783798 + ( n & 2 != 0 ) as usize * 2 // width (16 bit)
784799 + ( n & 4 != 0 ) as usize * 2 // precision (16 bit)
785800 + ( n & 8 != 0 ) as usize * 2 ; // arg_index (16 bit)
786- template = template. add ( 1 + skip as usize ) ;
801+ template = template. add ( skip as usize ) ;
787802 }
788803 }
789804 }
@@ -1633,7 +1648,7 @@ pub fn write(output: &mut dyn Write, fmt: Arguments<'_>) -> Result {
16331648 if n == 0 {
16341649 // End of template.
16351650 return Ok ( ( ) ) ;
1636- } else if n < 128 {
1651+ } else if n < 0x80 {
16371652 // Literal string piece of length `n`.
16381653
16391654 // SAFETY: We can assume the strings in the template are valid.
@@ -1643,7 +1658,19 @@ pub fn write(output: &mut dyn Write, fmt: Arguments<'_>) -> Result {
16431658 s
16441659 } ;
16451660 output. write_str ( s) ?;
1646- } else if n == 128 {
1661+ } else if n == 0x80 {
1662+ // Literal string piece with a 16-bit length.
1663+
1664+ // SAFETY: We can assume the strings in the template are valid.
1665+ let s = unsafe {
1666+ let len = usize:: from ( u16:: from_le_bytes ( template. cast_array ( ) . read ( ) ) ) ;
1667+ template = template. add ( 2 ) ;
1668+ let s = crate :: str:: from_raw_parts ( template. as_ptr ( ) , len) ;
1669+ template = template. add ( len) ;
1670+ s
1671+ } ;
1672+ output. write_str ( s) ?;
1673+ } else if n == 0xC0 {
16471674 // Placeholder for next argument with default options.
16481675 //
16491676 // Having this as a separate case improves performance for the common case.
@@ -1656,6 +1683,9 @@ pub fn write(output: &mut dyn Write, fmt: Arguments<'_>) -> Result {
16561683 }
16571684 arg_index += 1 ;
16581685 } else {
1686+ // SAFETY: We can assume the template is valid.
1687+ unsafe { assert_unchecked ( n > 0xC0 ) } ;
1688+
16591689 // Placeholder with custom options.
16601690
16611691 let mut opt = FormattingOptions :: new ( ) ;
0 commit comments