@@ -1008,36 +1008,30 @@ impl VerifiedInvoiceRequest {
10081008 }
10091009}
10101010
/// `String::truncate(new_len)` panics if `new_len` falls inside a UTF-8 code
/// point, since cutting there would leave the `String` containing invalid
/// UTF-8. This function instead truncates the string at the nearest code point
/// boundary at or below `new_len`, so the result always remains valid UTF-8.
///
/// This can still split a grapheme cluster, but that's probably fine.
/// We'd otherwise have to pull in the `unicode-segmentation` crate and its big
/// unicode tables to find the next smaller grapheme cluster boundary.
10181019fn string_truncate_safe ( mut s : String , new_len : usize ) -> String {
/// Finds the largest byte index `x` not exceeding byte index `index` where
/// `s.is_char_boundary(x)` is true.
///
/// Returns `s.len()` when `index` is at or past the end of `s`.
// TODO(phlip9): remove when `std::str::floor_char_boundary` stabilizes.
#[inline]
fn str_floor_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        return s.len();
    }

    // UTF-8 code points are 1-4 bytes long, so the first byte of the code
    // point containing byte `index` must lie in the range [index - 3, index].
    let lower_bound_index = index.saturating_sub(3);
    (lower_bound_index..=index)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        // Not expected to trigger for a valid `&str`; fall back to the lowest
        // index that was searched.
        .unwrap_or(lower_bound_index)
}
10431037
@@ -1465,6 +1459,7 @@ mod tests {
14651459 use crate :: ln:: inbound_payment:: ExpandedKey ;
14661460 use crate :: ln:: msgs:: { DecodeError , MAX_VALUE_MSAT } ;
14671461 use crate :: offers:: invoice:: { Bolt12Invoice , SIGNATURE_TAG as INVOICE_SIGNATURE_TAG } ;
1462+ use crate :: offers:: invoice_request:: string_truncate_safe;
14681463 use crate :: offers:: merkle:: { self , SignatureTlvStreamRef , TaggedHash , TlvStream } ;
14691464 use crate :: offers:: nonce:: Nonce ;
14701465 #[ cfg( not( c_bindings) ) ]
@@ -3026,4 +3021,31 @@ mod tests {
30263021 Err ( _) => panic ! ( "unexpected error" ) ,
30273022 }
30283023 }
3024+
/// Checks that `string_truncate_safe` always cuts on a UTF-8 code point
/// boundary at or below the requested byte length, for 1-, 2-, 3- and 4-byte
/// code points as well as for lengths past the end of the string.
#[test]
fn test_string_truncate_safe() {
    // We'll correctly truncate to the nearest UTF-8 code point boundary:
    // ❤      variation-selector
    // e29da4 efb88f
    let s = String::from("\u{2764}\u{fe0f}");
    for idx in 0..(s.len() + 5) {
        let expected = match idx {
            0..=2 => "",
            3..=5 => "\u{2764}",
            _ => s.as_str(),
        };
        assert_eq!(expected, string_truncate_safe(s.clone(), idx));
    }

    // A 4-byte code point is the widest UTF-8 encoding, so it exercises the
    // far end of the backwards search for a boundary:
    // a  🦀
    // 61 f09fa680
    let s = String::from("a\u{1f980}");
    for idx in 0..(s.len() + 5) {
        let expected = match idx {
            0 => "",
            1..=4 => "a",
            _ => s.as_str(),
        };
        assert_eq!(expected, string_truncate_safe(s.clone(), idx));
    }

    // A 2-byte code point at the very start of the string:
    // é    a
    // c3a9 61
    let s = String::from("\u{e9}a");
    for idx in 0..(s.len() + 5) {
        let expected = match idx {
            0..=1 => "",
            2 => "\u{e9}",
            _ => s.as_str(),
        };
        assert_eq!(expected, string_truncate_safe(s.clone(), idx));
    }

    // Every byte in an ASCII string is also a full UTF-8 code point.
    let s = String::from("my ASCII string!");
    for idx in 0..(s.len() + 5) {
        // `get` yields `None` once `idx` runs past the end, where the whole
        // string is expected back unchanged.
        let expected = s.get(..idx).unwrap_or(s.as_str());
        assert_eq!(expected, string_truncate_safe(s.clone(), idx));
    }

    // The empty string stays empty for any requested length.
    for idx in 0..5 {
        assert_eq!("", string_truncate_safe(String::new(), idx));
    }
}
30293051}
0 commit comments