fixup! offers: avoid panic when truncating payer_note in UTF-8 code point

phlip9 · phlip9 · commit fd123d31d66b · 2025-04-24T15:18:08.000-07:00
diff --git a/lightning/src/offers/invoice_request.rs b/lightning/src/offers/invoice_request.rs
@@ -1008,36 +1008,30 @@ impl VerifiedInvoiceRequest {
 	}
 }
 
-/// `String.truncate(new_len)` panics if you split on a UTF-8 code point. This
-/// function will instead truncate the string to the next smaller code point
-/// boundary.
+/// `String::truncate(new_len)` panics if you split inside a UTF-8 code point,
+/// which would leave the `String` containing invalid UTF-8. This function will
+/// instead truncate the string to the next smaller code point boundary so the
+/// truncated string always remains valid UTF-8.
 ///
 /// This can still split a grapheme cluster, but that's probably fine.
 /// We'd otherwise have to pull in the `unicode-segmentation` crate and its big
 /// unicode tables to find the next smaller grapheme cluster boundary.
 fn string_truncate_safe(mut s: String, new_len: usize) -> String {
-	/// Returns true if a byte is the first byte of a UTF-8 code point sequence.
-	// TODO(phlip9): remove when std stabilizes `str::floor_char_boundary`.
-	#[inline]
-	const fn u8_is_utf8_char_boundary(b: u8) -> bool {
-		// This is bit magic equivalent to: b < 128 || b >= 192
-		(b as i8) >= -0x40
-	}
-
-	/// Finds the closest `x` not exceeding `index` where `s.is_char_boundary(x)`
-	/// is true.
-	// TODO(phlip9): remove when std stabilizes `str::floor_char_boundary`.
+	/// Finds the largest byte index `x` not exceeding byte index `index` where
+	/// `s.is_char_boundary(x)` is true.
+	// TODO(phlip9): remove when `std::str::floor_char_boundary` stabilizes.
 	#[inline]
 	fn str_floor_char_boundary(s: &str, index: usize) -> usize {
 		if index >= s.len() {
 			s.len()
 		} else {
-			let lower_bound = index.saturating_sub(3);
-			let new_index = s.as_bytes()[lower_bound..=index]
-				.iter()
-				.rposition(|b| u8_is_utf8_char_boundary(*b))
-				.unwrap_or(0);
-			lower_bound + new_index
+			// A code point is encoded as 1-4 bytes in UTF-8, so a char boundary
+			// must lie in this range: [index.saturating_sub(3), index]
+			let lower_bound_index = index.saturating_sub(3);
+			(lower_bound_index..=index)
+				.rev()
+				.find(|idx| s.is_char_boundary(*idx))
+				.unwrap_or(lower_bound_index)
 		}
 	}
 
@@ -1465,6 +1459,7 @@ mod tests {
 	use crate::ln::inbound_payment::ExpandedKey;
 	use crate::ln::msgs::{DecodeError, MAX_VALUE_MSAT};
 	use crate::offers::invoice::{Bolt12Invoice, SIGNATURE_TAG as INVOICE_SIGNATURE_TAG};
+	use crate::offers::invoice_request::string_truncate_safe;
 	use crate::offers::merkle::{self, SignatureTlvStreamRef, TaggedHash, TlvStream};
 	use crate::offers::nonce::Nonce;
 	#[cfg(not(c_bindings))]
@@ -3026,4 +3021,31 @@ mod tests {
 			Err(_) => panic!("unexpected error"),
 		}
 	}
+
+	#[test]
+	fn test_string_truncate_safe() {
+		// We'll correctly truncate to the next smaller UTF-8 code point boundary:
+		// ❤ (U+2764)  variation-selector-16 (U+FE0F)
+		// e29da4      efb88f
+		let s = String::from("❤️");
+		for idx in 0..(s.len() + 5) {
+			if idx >= s.len() {
+				assert_eq!(s, string_truncate_safe(s.clone(), idx));
+			} else if (3..s.len()).contains(&idx) {
+				assert_eq!("❤", string_truncate_safe(s.clone(), idx));
+			} else {
+				assert_eq!("", string_truncate_safe(s.clone(), idx));
+			}
+		}
+
+		// Every byte in an ASCII string is also a full UTF-8 code point.
+		let s = String::from("my ASCII string!");
+		for idx in 0..(s.len() + 5) {
+			if idx >= s.len() {
+				assert_eq!(s, string_truncate_safe(s.clone(), idx));
+			} else {
+				assert_eq!(s[..idx], string_truncate_safe(s.clone(), idx));
+			}
+		}
+	}
 }