Skip to content

Commit d0674a4

Browse files
committed
Diff-massaging commit
1 parent ddb7318 commit d0674a4

File tree

2 files changed

+70
-87
lines changed

2 files changed

+70
-87
lines changed

library/alloc/src/wtf8/mod.rs

Lines changed: 34 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -451,53 +451,46 @@ impl Extend<CodePoint> for Wtf8Buf {
451451
}
452452
}
453453

454-
// helps diff
455-
mod wtf8 {
456-
use super::*;
457-
458-
/// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
459-
pub(super) fn to_owned(slice: &Wtf8) -> Wtf8Buf {
460-
Wtf8Buf { bytes: slice.as_bytes().to_vec(), is_known_utf8: false }
461-
}
454+
/// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
455+
pub(super) fn to_owned(slice: &Wtf8) -> Wtf8Buf {
456+
Wtf8Buf { bytes: slice.as_bytes().to_vec(), is_known_utf8: false }
457+
}
462458

463-
/// Lossily converts the string to UTF-8.
464-
/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
465-
///
466-
/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
467-
///
468-
/// This only copies the data if necessary (if it contains any surrogate).
469-
pub(super) fn to_string_lossy(slice: &Wtf8) -> Cow<'_, str> {
470-
let Some((surrogate_pos, _)) = slice.next_surrogate(0) else {
471-
return Cow::Borrowed(unsafe { str::from_utf8_unchecked(slice.as_bytes()) });
472-
};
473-
let wtf8_bytes = slice.as_bytes();
474-
let mut utf8_bytes = Vec::with_capacity(slice.len());
475-
utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
476-
utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes());
477-
let mut pos = surrogate_pos + 3;
478-
loop {
479-
match slice.next_surrogate(pos) {
480-
Some((surrogate_pos, _)) => {
481-
utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
482-
utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes());
483-
pos = surrogate_pos + 3;
484-
}
485-
None => {
486-
utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
487-
return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
488-
}
459+
/// Lossily converts the string to UTF-8.
460+
/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
461+
///
462+
/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
463+
///
464+
/// This only copies the data if necessary (if it contains any surrogate).
465+
pub(super) fn to_string_lossy(slice: &Wtf8) -> Cow<'_, str> {
466+
let Some((surrogate_pos, _)) = slice.next_surrogate(0) else {
467+
return Cow::Borrowed(unsafe { str::from_utf8_unchecked(slice.as_bytes()) });
468+
};
469+
let wtf8_bytes = slice.as_bytes();
470+
let mut utf8_bytes = Vec::with_capacity(slice.len());
471+
utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
472+
utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes());
473+
let mut pos = surrogate_pos + 3;
474+
loop {
475+
match slice.next_surrogate(pos) {
476+
Some((surrogate_pos, _)) => {
477+
utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
478+
utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes());
479+
pos = surrogate_pos + 3;
480+
}
481+
None => {
482+
utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
483+
return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
489484
}
490485
}
491486
}
492-
493-
#[inline]
494-
pub(super) fn clone_into(slice: &Wtf8, buf: &mut Wtf8Buf) {
495-
buf.is_known_utf8 = false;
496-
slice.as_bytes().clone_into(&mut buf.bytes);
497-
}
498487
}
499488

500-
use self::wtf8::{to_owned, to_string_lossy, clone_into};
489+
#[inline]
490+
pub(super) fn clone_into(slice: &Wtf8, buf: &mut Wtf8Buf) {
491+
buf.is_known_utf8 = false;
492+
slice.as_bytes().clone_into(&mut buf.bytes);
493+
}
501494

502495
#[cfg(not(test))]
503496
impl Wtf8 {

library/core/src/wtf8.rs

Lines changed: 36 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -345,16 +345,6 @@ impl Wtf8 {
345345
pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
346346
self.bytes.eq_ignore_ascii_case(&other.bytes)
347347
}
348-
349-
#[inline]
350-
pub fn is_code_point_boundary(&self, index: usize) -> bool {
351-
is_code_point_boundary(self, index)
352-
}
353-
354-
#[inline]
355-
pub fn check_utf8_boundary(&self, index: usize) {
356-
check_utf8_boundary(self, index)
357-
}
358348
}
359349

360350
/// Returns a slice of the given string for the byte range \[`begin`..`end`).
@@ -435,44 +425,44 @@ fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
435425
0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F
436426
}
437427

438-
// helps diff to be unindented
439-
440-
/// Copied from str::is_char_boundary
441-
#[inline]
442-
pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
443-
if index == 0 {
444-
return true;
445-
}
446-
match slice.bytes.get(index) {
447-
None => index == slice.len(),
448-
Some(&b) => (b as i8) >= -0x40,
428+
impl Wtf8 {
429+
/// Copied from str::is_char_boundary
430+
#[inline]
431+
pub fn is_code_point_boundary(&self, index: usize) -> bool {
432+
if index == 0 {
433+
return true;
434+
}
435+
match self.bytes.get(index) {
436+
None => index == self.len(),
437+
Some(&b) => (b as i8) >= -0x40,
438+
}
449439
}
450-
}
451440

452-
/// Verify that `index` is at the edge of either a valid UTF-8 codepoint
453-
/// (i.e. a codepoint that's not a surrogate) or of the whole string.
454-
///
455-
/// These are the cases currently permitted by `OsStr::slice_encoded_bytes`.
456-
/// Splitting between surrogates is valid as far as WTF-8 is concerned, but
457-
/// we do not permit it in the public API because WTF-8 is considered an
458-
/// implementation detail.
459-
#[track_caller]
460-
#[inline]
461-
pub fn check_utf8_boundary(slice: &Wtf8, index: usize) {
462-
if index == 0 {
463-
return;
464-
}
465-
match slice.bytes.get(index) {
466-
Some(0xED) => (), // Might be a surrogate
467-
Some(&b) if (b as i8) >= -0x40 => return,
468-
Some(_) => panic!("byte index {index} is not a codepoint boundary"),
469-
None if index == slice.len() => return,
470-
None => panic!("byte index {index} is out of bounds"),
471-
}
472-
if slice.bytes[index + 1] >= 0xA0 {
473-
// There's a surrogate after index. Now check before index.
474-
if index >= 3 && slice.bytes[index - 3] == 0xED && slice.bytes[index - 2] >= 0xA0 {
475-
panic!("byte index {index} lies between surrogate codepoints");
441+
/// Verify that `index` is at the edge of either a valid UTF-8 codepoint
442+
/// (i.e. a codepoint that's not a surrogate) or of the whole string.
443+
///
444+
/// These are the cases currently permitted by `OsStr::slice_encoded_bytes`.
445+
/// Splitting between surrogates is valid as far as WTF-8 is concerned, but
446+
/// we do not permit it in the public API because WTF-8 is considered an
447+
/// implementation detail.
448+
#[track_caller]
449+
#[inline]
450+
pub fn check_utf8_boundary(&self, index: usize) {
451+
if index == 0 {
452+
return;
453+
}
454+
match self.bytes.get(index) {
455+
Some(0xED) => (), // Might be a surrogate
456+
Some(&b) if (b as i8) >= -0x40 => return,
457+
Some(_) => panic!("byte index {index} is not a codepoint boundary"),
458+
None if index == self.len() => return,
459+
None => panic!("byte index {index} is out of bounds"),
460+
}
461+
if self.bytes[index + 1] >= 0xA0 {
462+
// There's a surrogate after index. Now check before index.
463+
if index >= 3 && self.bytes[index - 3] == 0xED && self.bytes[index - 2] >= 0xA0 {
464+
panic!("byte index {index} lies between surrogate codepoints");
465+
}
476466
}
477467
}
478468
}

0 commit comments

Comments
 (0)