Skip to content

Commit 750ab5d

Browse files
authored
[ISSUE #72]✨Add UTF-8 validation methods and deprecate unsafe conversions in CheetahString (#73)
1 parent cece62f commit 750ab5d

File tree

4 files changed

+430
-4
lines changed

4 files changed

+430
-4
lines changed

src/cheetah_string.rs

Lines changed: 86 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use core::fmt;
2+
use core::str::Utf8Error;
23
use std::borrow::{Borrow, Cow};
34
use std::cmp::Ordering;
45
use std::fmt::Display;
@@ -44,9 +45,18 @@ impl<'a> From<&'a str> for CheetahString {
4445
}
4546
}
4647

48+
/// # Safety Warning
49+
///
50+
/// This implementation uses `unsafe` code and may cause undefined behavior
51+
/// if the bytes are not valid UTF-8. Consider using `CheetahString::try_from_bytes()`
52+
/// for safe UTF-8 validation.
53+
///
54+
/// This implementation will be deprecated in a future version.
4755
impl From<&[u8]> for CheetahString {
4856
#[inline]
4957
fn from(b: &[u8]) -> Self {
58+
// SAFETY: This is unsafe and may cause UB if bytes are not valid UTF-8.
59+
// This will be deprecated in favor of try_from_bytes in the next version.
5060
CheetahString::from_slice(unsafe { std::str::from_utf8_unchecked(b) })
5161
}
5262
}
@@ -59,9 +69,18 @@ impl FromStr for CheetahString {
5969
}
6070
}
6171

72+
/// # Safety Warning
73+
///
74+
/// This implementation uses `unsafe` code and may cause undefined behavior
75+
/// if the bytes are not valid UTF-8. Consider using `CheetahString::try_from_vec()`
76+
/// for safe UTF-8 validation.
77+
///
78+
/// This implementation will be deprecated in a future version.
6279
impl From<Vec<u8>> for CheetahString {
6380
#[inline]
6481
fn from(v: Vec<u8>) -> Self {
82+
// SAFETY: This is unsafe and may cause UB if bytes are not valid UTF-8.
83+
// This will be deprecated in favor of try_from_vec in the next version.
6584
CheetahString::from_slice(unsafe { std::str::from_utf8_unchecked(&v) })
6685
}
6786
}
@@ -164,11 +183,17 @@ impl From<CheetahString> for String {
164183
} => s.to_string(),
165184
CheetahString {
166185
inner: InnerString::ArcVecString(s),
167-
} => unsafe { String::from_utf8_unchecked(s.to_vec()) },
186+
} => {
187+
// SAFETY: ArcVecString should only be created from valid UTF-8 sources
188+
unsafe { String::from_utf8_unchecked(s.to_vec()) }
189+
}
168190
#[cfg(feature = "bytes")]
169191
CheetahString {
170192
inner: InnerString::Bytes(b),
171-
} => unsafe { String::from_utf8_unchecked(b.to_vec()) },
193+
} => {
194+
// SAFETY: Bytes variant should only be created from valid UTF-8 sources
195+
unsafe { String::from_utf8_unchecked(b.to_vec()) }
196+
}
172197
CheetahString {
173198
inner: InnerString::Empty,
174199
} => String::new(),
@@ -240,6 +265,55 @@ impl CheetahString {
240265
}
241266
}
242267

268+
/// Creates a `CheetahString` from a byte vector with UTF-8 validation.
269+
///
270+
/// # Errors
271+
///
272+
/// Returns an error if the bytes are not valid UTF-8.
273+
///
274+
/// # Examples
275+
///
276+
/// ```
277+
/// use cheetah_string::CheetahString;
278+
///
279+
/// let bytes = vec![104, 101, 108, 108, 111]; // "hello"
280+
/// let s = CheetahString::try_from_vec(bytes).unwrap();
281+
/// assert_eq!(s, "hello");
282+
///
283+
/// let invalid = vec![0xFF, 0xFE];
284+
/// assert!(CheetahString::try_from_vec(invalid).is_err());
285+
/// ```
286+
pub fn try_from_vec(v: Vec<u8>) -> Result<Self, Utf8Error> {
287+
// Validate UTF-8
288+
std::str::from_utf8(&v)?;
289+
Ok(CheetahString {
290+
inner: InnerString::ArcVecString(Arc::new(v)),
291+
})
292+
}
293+
294+
/// Creates a `CheetahString` from a byte slice with UTF-8 validation.
295+
///
296+
/// # Errors
297+
///
298+
/// Returns an error if the bytes are not valid UTF-8.
299+
///
300+
/// # Examples
301+
///
302+
/// ```
303+
/// use cheetah_string::CheetahString;
304+
///
305+
/// let bytes = b"hello";
306+
/// let s = CheetahString::try_from_bytes(bytes).unwrap();
307+
/// assert_eq!(s, "hello");
308+
///
309+
/// let invalid = &[0xFF, 0xFE];
310+
/// assert!(CheetahString::try_from_bytes(invalid).is_err());
311+
/// ```
312+
pub fn try_from_bytes(b: &[u8]) -> Result<Self, Utf8Error> {
313+
let s = std::str::from_utf8(b)?;
314+
Ok(CheetahString::from_slice(s))
315+
}
316+
243317
#[inline]
244318
pub fn from_arc_vec(s: Arc<Vec<u8>>) -> Self {
245319
CheetahString {
@@ -280,9 +354,17 @@ impl CheetahString {
280354
match &self.inner {
281355
InnerString::ArcString(s) => s.as_str(),
282356
InnerString::StaticStr(s) => s,
283-
InnerString::ArcVecString(s) => std::str::from_utf8(s.as_ref()).unwrap(),
357+
InnerString::ArcVecString(s) => {
358+
// SAFETY: sound only while every ArcVecString holds valid UTF-8.
359+
// NOTE(review): from_arc_vec accepts raw bytes; confirm it validates them, otherwise this is UB.
360+
unsafe { std::str::from_utf8_unchecked(s.as_ref()) }
361+
}
284362
#[cfg(feature = "bytes")]
285-
InnerString::Bytes(b) => std::str::from_utf8(b.as_ref()).unwrap(),
363+
InnerString::Bytes(b) => {
364+
// SAFETY: Bytes variant is only created from validated UTF-8 sources.
365+
// The from_bytes constructor ensures this invariant.
366+
unsafe { std::str::from_utf8_unchecked(b.as_ref()) }
367+
}
286368
InnerString::Empty => EMPTY_STRING,
287369
}
288370
}

src/error.rs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
use core::fmt;
2+
use core::str::Utf8Error;
3+
4+
/// Failure modes reported by `CheetahString` operations.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Error {
    /// The supplied bytes were not valid UTF-8; wraps the underlying error.
    Utf8Error(Utf8Error),
    /// `index` was outside the valid range for a value of length `len`.
    IndexOutOfBounds { index: usize, len: usize },
    /// `index` did not fall on a character boundary.
    InvalidCharBoundary { index: usize },
}

impl fmt::Display for Error {
    /// Writes a short human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            Self::Utf8Error(ref cause) => write!(f, "UTF-8 error: {}", cause),
            Self::IndexOutOfBounds { index, len } => {
                write!(f, "index {} out of bounds (len: {})", index, len)
            }
            Self::InvalidCharBoundary { index } => {
                write!(f, "index {} is not a char boundary", index)
            }
        }
    }
}

// NOTE(review): this impl is only compiled when a `std` cargo feature is
// enabled; confirm the crate manifest actually declares that feature.
#[cfg(feature = "std")]
impl std::error::Error for Error {
    /// Exposes the wrapped `Utf8Error` as the cause; the other variants have none.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::Utf8Error(cause) => Some(cause),
            Self::IndexOutOfBounds { .. } | Self::InvalidCharBoundary { .. } => None,
        }
    }
}

impl From<Utf8Error> for Error {
    /// Wraps a UTF-8 validation failure so it can be propagated with `?`.
    fn from(cause: Utf8Error) -> Self {
        Self::Utf8Error(cause)
    }
}

/// Shorthand for a `Result` whose error type is this module's [`Error`].
pub type Result<T> = core::result::Result<T, Error>;

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@
1919
//! ```
2020
//!
2121
mod cheetah_string;
22+
mod error;
2223

2324
#[cfg(feature = "serde")]
2425
mod serde;
2526

2627
pub use cheetah_string::CheetahString;
28+
pub use error::{Error, Result};

0 commit comments

Comments
 (0)