diff --git a/src/cheetah_string.rs b/src/cheetah_string.rs index 4430f77..dcebf16 100644 --- a/src/cheetah_string.rs +++ b/src/cheetah_string.rs @@ -8,8 +8,6 @@ use std::ops::Deref; use std::str::FromStr; use std::sync::Arc; -pub const EMPTY_STRING: &str = ""; - #[derive(Clone)] #[repr(transparent)] pub struct CheetahString { @@ -19,7 +17,10 @@ pub struct CheetahString { impl Default for CheetahString { fn default() -> Self { CheetahString { - inner: InnerString::Empty, + inner: InnerString::Inline { + len: 0, + data: [0; INLINE_CAPACITY], + }, } } } @@ -176,11 +177,17 @@ impl From for String { fn from(s: CheetahString) -> Self { match s { CheetahString { - inner: InnerString::ArcString(s), - } => s.as_ref().clone(), + inner: InnerString::Inline { len, data }, + } => { + // SAFETY: Inline strings are always valid UTF-8 + unsafe { String::from_utf8_unchecked(data[..len as usize].to_vec()) } + } CheetahString { inner: InnerString::StaticStr(s), } => s.to_string(), + CheetahString { + inner: InnerString::ArcString(s), + } => s.as_ref().clone(), CheetahString { inner: InnerString::ArcVecString(s), } => { @@ -194,9 +201,6 @@ impl From for String { // SAFETY: Bytes variant should only be created from valid UTF-8 sources unsafe { String::from_utf8_unchecked(b.to_vec()) } } - CheetahString { - inner: InnerString::Empty, - } => String::new(), } } } @@ -242,7 +246,10 @@ impl CheetahString { #[inline] pub const fn empty() -> Self { CheetahString { - inner: InnerString::Empty, + inner: InnerString::Inline { + len: 0, + data: [0; INLINE_CAPACITY], + }, } } @@ -323,15 +330,41 @@ impl CheetahString { #[inline] pub fn from_slice(s: &str) -> Self { - CheetahString { - inner: InnerString::ArcString(Arc::new(s.to_owned())), + if s.len() <= INLINE_CAPACITY { + // Use inline storage for short strings + let mut data = [0u8; INLINE_CAPACITY]; + data[..s.len()].copy_from_slice(s.as_bytes()); + CheetahString { + inner: InnerString::Inline { + len: s.len() as u8, + data, + }, + } + } else { 
+ // Use Arc for long strings + CheetahString { + inner: InnerString::ArcString(Arc::new(s.to_owned())), + } } } #[inline] pub fn from_string(s: String) -> Self { - CheetahString { - inner: InnerString::ArcString(Arc::new(s)), + if s.len() <= INLINE_CAPACITY { + // Use inline storage for short strings + let mut data = [0u8; INLINE_CAPACITY]; + data[..s.len()].copy_from_slice(s.as_bytes()); + CheetahString { + inner: InnerString::Inline { + len: s.len() as u8, + data, + }, + } + } else { + // Use Arc for long strings + CheetahString { + inner: InnerString::ArcString(Arc::new(s)), + } } } #[inline] @@ -352,8 +385,13 @@ impl CheetahString { #[inline] pub fn as_str(&self) -> &str { match &self.inner { - InnerString::ArcString(s) => s.as_str(), + InnerString::Inline { len, data } => { + // SAFETY: Inline strings are only created from valid UTF-8 sources. + // The data is always valid UTF-8 up to len bytes. + unsafe { std::str::from_utf8_unchecked(&data[..*len as usize]) } + } InnerString::StaticStr(s) => s, + InnerString::ArcString(s) => s.as_str(), InnerString::ArcVecString(s) => { // SAFETY: ArcVecString is only created from validated UTF-8 sources. // All constructors ensure this invariant is maintained. @@ -365,43 +403,42 @@ impl CheetahString { // The from_bytes constructor ensures this invariant. unsafe { std::str::from_utf8_unchecked(b.as_ref()) } } - InnerString::Empty => EMPTY_STRING, } } #[inline] pub fn as_bytes(&self) -> &[u8] { match &self.inner { - InnerString::ArcString(s) => s.as_bytes(), + InnerString::Inline { len, data } => &data[..*len as usize], InnerString::StaticStr(s) => s.as_bytes(), + InnerString::ArcString(s) => s.as_bytes(), InnerString::ArcVecString(s) => s.as_ref(), #[cfg(feature = "bytes")] InnerString::Bytes(b) => b.as_ref(), - InnerString::Empty => &[], } } #[inline] pub fn len(&self) -> usize { match &self.inner { - InnerString::ArcString(s) => s.len(), + InnerString::Inline { len, .. 
} => *len as usize, InnerString::StaticStr(s) => s.len(), + InnerString::ArcString(s) => s.len(), InnerString::ArcVecString(s) => s.len(), #[cfg(feature = "bytes")] InnerString::Bytes(b) => b.len(), - InnerString::Empty => 0, } } #[inline] pub fn is_empty(&self) -> bool { match &self.inner { - InnerString::ArcString(s) => s.is_empty(), + InnerString::Inline { len, .. } => *len == 0, InnerString::StaticStr(s) => s.is_empty(), + InnerString::ArcString(s) => s.is_empty(), InnerString::ArcVecString(s) => s.is_empty(), #[cfg(feature = "bytes")] InnerString::Bytes(b) => b.is_empty(), - InnerString::Empty => true, } } } @@ -506,20 +543,35 @@ impl Borrow for CheetahString { } } +/// Maximum capacity for inline string storage (23 data bytes + 1 length byte = 24 bytes of inline payload) +const INLINE_CAPACITY: usize = 23; + /// The `InnerString` enum represents different types of string storage. /// +/// This enum uses Small String Optimization (SSO) to avoid heap allocations for short strings. +/// /// Variants: /// -/// * `ArcString(Arc<String>)` - A reference-counted string. -/// * `StaticStr(&'static str)` - A static string slice. +/// * `Inline` - Inline storage for strings <= 23 bytes (zero heap allocations). +/// * `StaticStr(&'static str)` - A static string slice (zero heap allocations). +/// * `ArcString(Arc<String>)` - A reference-counted string (one heap allocation). +/// * `ArcVecString(Arc<Vec<u8>>)` - A reference-counted byte vector. /// * `Bytes(bytes::Bytes)` - A byte buffer (available when the "bytes" feature is enabled). -/// * `Empty` - An empty string. #[derive(Clone)] pub(super) enum InnerString { - ArcString(Arc<String>), + /// Inline storage for short strings (up to 23 bytes). + /// Stores the length and data directly without heap allocation. + Inline { + len: u8, + data: [u8; INLINE_CAPACITY], + }, + /// Static string slice with 'static lifetime. StaticStr(&'static str), + /// Reference-counted heap-allocated string. + ArcString(Arc<String>), + /// Reference-counted heap-allocated byte vector. 
ArcVecString(Arc<Vec<u8>>), + /// Bytes type integration (requires "bytes" feature). #[cfg(feature = "bytes")] Bytes(bytes::Bytes), - Empty, } diff --git a/src/serde.rs b/src/serde.rs index 010a221..821e567 100644 --- a/src/serde.rs +++ b/src/serde.rs @@ -1,4 +1,4 @@ -use crate::cheetah_string::{InnerString, EMPTY_STRING}; +use crate::cheetah_string::InnerString; use crate::CheetahString; use serde::de::{Error, Unexpected, Visitor}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; @@ -9,12 +9,16 @@ impl Serialize for CheetahString { S: Serializer, { match &self.inner { + InnerString::Inline { len, data } => { + // SAFETY: InnerString::Inline guarantees that data[0..len] is valid UTF-8 + let s = unsafe { std::str::from_utf8_unchecked(&data[..*len as usize]) }; + serializer.serialize_str(s) + } InnerString::ArcString(s) => serializer.serialize_str(s.as_str()), InnerString::StaticStr(s) => serializer.serialize_str(s), InnerString::ArcVecString(s) => serializer.serialize_bytes(s), #[cfg(feature = "bytes")] InnerString::Bytes(bytes) => serializer.serialize_bytes(bytes.as_ref()), - InnerString::Empty => serializer.serialize_str(EMPTY_STRING), } } } diff --git a/tests/sso.rs b/tests/sso.rs new file mode 100644 index 0000000..1444998 --- /dev/null +++ b/tests/sso.rs @@ -0,0 +1,208 @@ +use cheetah_string::CheetahString; + +#[test] +fn test_sso_empty_string() { + let s = CheetahString::new(); + assert!(s.is_empty()); + assert_eq!(s.len(), 0); + assert_eq!(s.as_str(), ""); +} + +#[test] +fn test_sso_short_string() { + // Test strings at various lengths up to 23 bytes + let test_cases = vec![ + ("a", 1), + ("hello", 5), + ("hello world", 11), + ("12345678901234567890123", 23), // Exactly 23 bytes + ]; + + for (text, expected_len) in test_cases { + let s = CheetahString::from(text); + assert_eq!(s.len(), expected_len); + assert_eq!(s.as_str(), text); + assert!(!s.is_empty()); + } +} + +#[test] +fn test_sso_boundary_23_bytes() { + // Test the exact boundary case: 23 bytes 
(should use inline) + let s23 = "a".repeat(23); + let cs = CheetahString::from(s23.as_str()); + assert_eq!(cs.len(), 23); + assert_eq!(cs.as_str(), s23); +} + +#[test] +fn test_sso_boundary_24_bytes() { + // Test 24 bytes (should use Arc) + let s24 = "a".repeat(24); + let cs = CheetahString::from(s24.as_str()); + assert_eq!(cs.len(), 24); + assert_eq!(cs.as_str(), s24); +} + +#[test] +fn test_sso_long_string() { + // Test long strings that should use Arc storage + let long = "a".repeat(100); + let s = CheetahString::from(long.as_str()); + assert_eq!(s.len(), 100); + assert_eq!(s.as_str(), long); +} + +#[test] +fn test_sso_clone_short_string() { + let s1 = CheetahString::from("hello"); + let s2 = s1.clone(); + assert_eq!(s1, s2); + assert_eq!(s1.as_str(), "hello"); + assert_eq!(s2.as_str(), "hello"); +} + +#[test] +fn test_sso_unicode_short() { + // Test short unicode strings + let s = CheetahString::from("你好"); + assert_eq!(s.len(), 6); // 2 chars * 3 bytes each + assert_eq!(s.as_str(), "你好"); +} + +#[test] +fn test_sso_unicode_boundary() { + // Test multi-byte unicode just below the 23-byte inline capacity + // "你好世界啊啊啊" = 21 bytes (7 chars * 3 bytes each) + let s = CheetahString::from("你好世界啊啊啊"); // 21 bytes + assert_eq!(s.len(), 21); + assert_eq!(s.as_str(), "你好世界啊啊啊"); +} + +#[test] +fn test_sso_from_string() { + let owned = String::from("short"); + let cs = CheetahString::from(owned); + assert_eq!(cs.as_str(), "short"); + assert_eq!(cs.len(), 5); +} + +#[test] +fn test_sso_to_string() { + let cs = CheetahString::from("hello"); + let s: String = cs.into(); + assert_eq!(s, "hello"); +} + +#[test] +fn test_sso_equality() { + let s1 = CheetahString::from("test"); + let s2 = CheetahString::from("test"); + let s3 = CheetahString::from("different"); + + assert_eq!(s1, s2); + assert_ne!(s1, s3); + assert_eq!(s1, "test"); + assert_eq!("test", s1); +} + +#[test] +fn test_sso_hash() { + use std::collections::HashMap; + + let mut map = HashMap::new(); + let key1 = CheetahString::from("key"); + let key2 = 
CheetahString::from("key"); + + map.insert(key1, 42); + assert_eq!(map.get(&key2), Some(&42)); +} + +#[test] +fn test_sso_ordering() { + let s1 = CheetahString::from("apple"); + let s2 = CheetahString::from("banana"); + let s3 = CheetahString::from("apple"); + + assert!(s1 < s2); + assert!(s2 > s1); + assert!(s1 <= s3); + assert!(s1 >= s3); +} + +#[test] +fn test_sso_as_bytes() { + let s = CheetahString::from("hello"); + assert_eq!(s.as_bytes(), b"hello"); +} + +#[test] +fn test_sso_deref() { + let s = CheetahString::from("hello"); + assert!(s.starts_with("hel")); + assert!(s.ends_with("llo")); +} + +#[test] +fn test_sso_display_debug() { + let s = CheetahString::from("test"); + assert_eq!(format!("{}", s), "test"); + assert_eq!(format!("{:?}", s), "\"test\""); +} + +#[test] +fn test_sso_mixed_lengths() { + // Test that we can handle mixed inline and arc strings properly + let short = CheetahString::from("short"); + let long = CheetahString::from("a".repeat(100)); + + assert_eq!(short.len(), 5); + assert_eq!(long.len(), 100); + + assert_eq!(short.as_str(), "short"); + assert_eq!(long.as_str(), &"a".repeat(100)); +} + +#[test] +fn test_sso_empty() { + let s = CheetahString::empty(); + assert!(s.is_empty()); + assert_eq!(s.len(), 0); + assert_eq!(s, ""); +} + +#[test] +fn test_sso_from_char() { + let s = CheetahString::from('x'); + assert_eq!(s.len(), 1); + assert_eq!(s.as_str(), "x"); +} + +#[test] +fn test_sso_special_chars() { + // Test special characters + let test_cases = vec!["\n", "\t", "\\", "\"", "hello\nworld", "tab\there"]; + + for text in test_cases { + let s = CheetahString::from(text); + assert_eq!(s.as_str(), text); + } +} + +#[test] +fn test_sso_try_from_bytes() { + // Test that try_from_bytes works with SSO + let bytes = b"hello"; + let s = CheetahString::try_from_bytes(bytes).unwrap(); + assert_eq!(s.as_str(), "hello"); + assert_eq!(s.len(), 5); +} + +#[test] +fn test_sso_try_from_vec() { + // Test that try_from_vec works with SSO + let bytes = 
vec![104, 101, 108, 108, 111]; // "hello" + let s = CheetahString::try_from_vec(bytes).unwrap(); + assert_eq!(s.as_str(), "hello"); + assert_eq!(s.len(), 5); +}