Skip to content

Commit 34603b0

Browse files
committed
rollup merge of #24310: alexcrichton/stabilize-utf8-error
The meaning of each variant of this enum was somewhat ambiguous, and it's unclear whether we would want to add more enumeration values in the future. As a result, this error has been altered to instead become an opaque structure. Learning about the "first invalid byte index" is still an unstable feature, but the type itself is now stable.
2 parents 88a145e + f329030 commit 34603b0

File tree

6 files changed

+26
-38
lines changed

6 files changed

+26
-38
lines changed

src/libcollections/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
#![feature(str_char)]
4141
#![feature(slice_patterns)]
4242
#![feature(debug_builders)]
43+
#![feature(utf8_error)]
4344
#![cfg_attr(test, feature(rand, rustc_private, test, hash, collections))]
4445
#![cfg_attr(test, allow(deprecated))] // rand
4546

src/libcollections/string.rs

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ impl String {
132132
///
133133
/// let invalid_vec = vec![240, 144, 128];
134134
/// let s = String::from_utf8(invalid_vec).err().unwrap();
135-
/// assert_eq!(s.utf8_error(), Utf8Error::TooShort);
135+
/// let err = s.utf8_error();
136136
/// assert_eq!(s.into_bytes(), [240, 144, 128]);
137137
/// ```
138138
#[inline]
@@ -156,14 +156,10 @@ impl String {
156156
/// ```
157157
#[stable(feature = "rust1", since = "1.0.0")]
158158
pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> Cow<'a, str> {
159-
let mut i = 0;
159+
let mut i;
160160
match str::from_utf8(v) {
161161
Ok(s) => return Cow::Borrowed(s),
162-
Err(e) => {
163-
if let Utf8Error::InvalidByte(firstbad) = e {
164-
i = firstbad;
165-
}
166-
}
162+
Err(e) => i = e.valid_up_to(),
167163
}
168164

169165
const TAG_CONT_U8: u8 = 128;
@@ -188,9 +184,9 @@ impl String {
188184
};
189185
}
190186

191-
// subseqidx is the index of the first byte of the subsequence we're looking at.
192-
// It's used to copy a bunch of contiguous good codepoints at once instead of copying
193-
// them one by one.
187+
// subseqidx is the index of the first byte of the subsequence we're
188+
// looking at. It's used to copy a bunch of contiguous good codepoints
189+
// at once instead of copying them one by one.
194190
let mut subseqidx = i;
195191

196192
while i < total {

src/libcollectionstest/str.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1502,7 +1502,7 @@ fn test_str_from_utf8() {
15021502
assert_eq!(from_utf8(xs), Ok("ศไทย中华Việt Nam"));
15031503

15041504
let xs = b"hello\xFF";
1505-
assert_eq!(from_utf8(xs), Err(Utf8Error::TooShort));
1505+
assert!(from_utf8(xs).is_err());
15061506
}
15071507

15081508
#[test]

src/libcollectionstest/string.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ fn test_from_utf8() {
4545

4646
let xs = b"hello\xFF".to_vec();
4747
let err = String::from_utf8(xs).err().unwrap();
48-
assert_eq!(err.utf8_error(), Utf8Error::TooShort);
4948
assert_eq!(err.into_bytes(), b"hello\xff".to_vec());
5049
}
5150

src/libcore/str/mod.rs

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -106,19 +106,19 @@ Section: Creating a string
106106

107107
/// Errors which can occur when attempting to interpret a byte slice as a `str`.
108108
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
109-
#[unstable(feature = "core",
110-
reason = "error enumeration recently added and definitions may be refined")]
111-
pub enum Utf8Error {
112-
/// An invalid byte was detected at the byte offset given.
113-
///
114-
/// The offset is guaranteed to be in bounds of the slice in question, and
115-
/// the byte at the specified offset was the first invalid byte in the
116-
/// sequence detected.
117-
InvalidByte(usize),
109+
#[stable(feature = "rust1", since = "1.0.0")]
110+
pub struct Utf8Error {
111+
valid_up_to: usize,
112+
}
118113

119-
/// The byte slice was invalid because more bytes were needed but no more
120-
/// bytes were available.
121-
TooShort,
114+
impl Utf8Error {
115+
/// Returns the index in the given string up to which valid UTF-8 was
116+
/// verified.
117+
///
118+
/// Starting at the index provided, but not necessarily at it precisely, an
119+
/// invalid UTF-8 encoding sequence was found.
120+
#[unstable(feature = "utf8_error", reason = "method just added")]
121+
pub fn valid_up_to(&self) -> usize { self.valid_up_to }
122122
}
123123

124124
/// Converts a slice of bytes to a string slice without performing any
@@ -147,14 +147,7 @@ pub unsafe fn from_utf8_unchecked<'a>(v: &'a [u8]) -> &'a str {
147147
#[stable(feature = "rust1", since = "1.0.0")]
148148
impl fmt::Display for Utf8Error {
149149
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
150-
match *self {
151-
Utf8Error::InvalidByte(n) => {
152-
write!(f, "invalid utf-8: invalid byte at index {}", n)
153-
}
154-
Utf8Error::TooShort => {
155-
write!(f, "invalid utf-8: byte slice too short")
156-
}
157-
}
150+
write!(f, "invalid utf-8: invalid byte near index {}", self.valid_up_to)
158151
}
159152
}
160153

@@ -1218,14 +1211,16 @@ fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
12181211
// restore the iterator we had at the start of this codepoint.
12191212
macro_rules! err { () => {{
12201213
*iter = old.clone();
1221-
return Err(Utf8Error::InvalidByte(whole.len() - iter.as_slice().len()))
1214+
return Err(Utf8Error {
1215+
valid_up_to: whole.len() - iter.as_slice().len()
1216+
})
12221217
}}}
12231218

12241219
macro_rules! next { () => {
12251220
match iter.next() {
12261221
Some(a) => *a,
12271222
// we needed data, but there was none: error!
1228-
None => return Err(Utf8Error::TooShort),
1223+
None => err!(),
12291224
}
12301225
}}
12311226

src/libstd/error.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,7 @@ impl Error for str::ParseBoolError {
122122
#[stable(feature = "rust1", since = "1.0.0")]
123123
impl Error for str::Utf8Error {
124124
fn description(&self) -> &str {
125-
match *self {
126-
str::Utf8Error::TooShort => "invalid utf-8: not enough bytes",
127-
str::Utf8Error::InvalidByte(..) => "invalid utf-8: corrupt contents",
128-
}
125+
"invalid utf-8: corrupt contents"
129126
}
130127
}
131128

0 commit comments

Comments
 (0)