Skip to content

Commit dff947c

Browse files
authored
[ISSUE #74]🚀Implement Small String Optimization (SSO) for CheetahString to reduce heap allocations for short strings (#75)
* [ISSUE #74]🚀Implement Small String Optimization (SSO) for CheetahString to reduce heap allocations for short strings * [ISSUE #74]🚀Implement Small String Optimization (SSO) for CheetahString to reduce heap allocations for short strings * [ISSUE #74]🚀Implement Small String Optimization (SSO) for CheetahString to reduce heap allocations for short strings
1 parent 750ab5d commit dff947c

File tree

3 files changed

+292
-28
lines changed

3 files changed

+292
-28
lines changed

src/cheetah_string.rs

Lines changed: 78 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@ use std::ops::Deref;
88
use std::str::FromStr;
99
use std::sync::Arc;
1010

11-
pub const EMPTY_STRING: &str = "";
12-
1311
#[derive(Clone)]
1412
#[repr(transparent)]
1513
pub struct CheetahString {
@@ -19,7 +17,10 @@ pub struct CheetahString {
1917
impl Default for CheetahString {
2018
fn default() -> Self {
2119
CheetahString {
22-
inner: InnerString::Empty,
20+
inner: InnerString::Inline {
21+
len: 0,
22+
data: [0; INLINE_CAPACITY],
23+
},
2324
}
2425
}
2526
}
@@ -176,11 +177,17 @@ impl From<CheetahString> for String {
176177
fn from(s: CheetahString) -> Self {
177178
match s {
178179
CheetahString {
179-
inner: InnerString::ArcString(s),
180-
} => s.as_ref().clone(),
180+
inner: InnerString::Inline { len, data },
181+
} => {
182+
// SAFETY: Inline strings are always valid UTF-8
183+
unsafe { String::from_utf8_unchecked(data[..len as usize].to_vec()) }
184+
}
181185
CheetahString {
182186
inner: InnerString::StaticStr(s),
183187
} => s.to_string(),
188+
CheetahString {
189+
inner: InnerString::ArcString(s),
190+
} => s.as_ref().clone(),
184191
CheetahString {
185192
inner: InnerString::ArcVecString(s),
186193
} => {
@@ -194,9 +201,6 @@ impl From<CheetahString> for String {
194201
// SAFETY: Bytes variant should only be created from valid UTF-8 sources
195202
unsafe { String::from_utf8_unchecked(b.to_vec()) }
196203
}
197-
CheetahString {
198-
inner: InnerString::Empty,
199-
} => String::new(),
200204
}
201205
}
202206
}
@@ -242,7 +246,10 @@ impl CheetahString {
242246
#[inline]
243247
pub const fn empty() -> Self {
244248
CheetahString {
245-
inner: InnerString::Empty,
249+
inner: InnerString::Inline {
250+
len: 0,
251+
data: [0; INLINE_CAPACITY],
252+
},
246253
}
247254
}
248255

@@ -323,15 +330,41 @@ impl CheetahString {
323330

324331
#[inline]
325332
pub fn from_slice(s: &str) -> Self {
326-
CheetahString {
327-
inner: InnerString::ArcString(Arc::new(s.to_owned())),
333+
if s.len() <= INLINE_CAPACITY {
334+
// Use inline storage for short strings
335+
let mut data = [0u8; INLINE_CAPACITY];
336+
data[..s.len()].copy_from_slice(s.as_bytes());
337+
CheetahString {
338+
inner: InnerString::Inline {
339+
len: s.len() as u8,
340+
data,
341+
},
342+
}
343+
} else {
344+
// Use Arc for long strings
345+
CheetahString {
346+
inner: InnerString::ArcString(Arc::new(s.to_owned())),
347+
}
328348
}
329349
}
330350

331351
#[inline]
332352
pub fn from_string(s: String) -> Self {
333-
CheetahString {
334-
inner: InnerString::ArcString(Arc::new(s)),
353+
if s.len() <= INLINE_CAPACITY {
354+
// Use inline storage for short strings
355+
let mut data = [0u8; INLINE_CAPACITY];
356+
data[..s.len()].copy_from_slice(s.as_bytes());
357+
CheetahString {
358+
inner: InnerString::Inline {
359+
len: s.len() as u8,
360+
data,
361+
},
362+
}
363+
} else {
364+
// Use Arc for long strings
365+
CheetahString {
366+
inner: InnerString::ArcString(Arc::new(s)),
367+
}
335368
}
336369
}
337370
#[inline]
@@ -352,8 +385,13 @@ impl CheetahString {
352385
#[inline]
353386
pub fn as_str(&self) -> &str {
354387
match &self.inner {
355-
InnerString::ArcString(s) => s.as_str(),
388+
InnerString::Inline { len, data } => {
389+
// SAFETY: Inline strings are only created from valid UTF-8 sources.
390+
// The data is always valid UTF-8 up to len bytes.
391+
unsafe { std::str::from_utf8_unchecked(&data[..*len as usize]) }
392+
}
356393
InnerString::StaticStr(s) => s,
394+
InnerString::ArcString(s) => s.as_str(),
357395
InnerString::ArcVecString(s) => {
358396
// SAFETY: ArcVecString is only created from validated UTF-8 sources.
359397
// All constructors ensure this invariant is maintained.
@@ -365,43 +403,42 @@ impl CheetahString {
365403
// The from_bytes constructor ensures this invariant.
366404
unsafe { std::str::from_utf8_unchecked(b.as_ref()) }
367405
}
368-
InnerString::Empty => EMPTY_STRING,
369406
}
370407
}
371408

372409
#[inline]
373410
pub fn as_bytes(&self) -> &[u8] {
374411
match &self.inner {
375-
InnerString::ArcString(s) => s.as_bytes(),
412+
InnerString::Inline { len, data } => &data[..*len as usize],
376413
InnerString::StaticStr(s) => s.as_bytes(),
414+
InnerString::ArcString(s) => s.as_bytes(),
377415
InnerString::ArcVecString(s) => s.as_ref(),
378416
#[cfg(feature = "bytes")]
379417
InnerString::Bytes(b) => b.as_ref(),
380-
InnerString::Empty => &[],
381418
}
382419
}
383420

384421
#[inline]
385422
pub fn len(&self) -> usize {
386423
match &self.inner {
387-
InnerString::ArcString(s) => s.len(),
424+
InnerString::Inline { len, .. } => *len as usize,
388425
InnerString::StaticStr(s) => s.len(),
426+
InnerString::ArcString(s) => s.len(),
389427
InnerString::ArcVecString(s) => s.len(),
390428
#[cfg(feature = "bytes")]
391429
InnerString::Bytes(b) => b.len(),
392-
InnerString::Empty => 0,
393430
}
394431
}
395432

396433
#[inline]
397434
pub fn is_empty(&self) -> bool {
398435
match &self.inner {
399-
InnerString::ArcString(s) => s.is_empty(),
436+
InnerString::Inline { len, .. } => *len == 0,
400437
InnerString::StaticStr(s) => s.is_empty(),
438+
InnerString::ArcString(s) => s.is_empty(),
401439
InnerString::ArcVecString(s) => s.is_empty(),
402440
#[cfg(feature = "bytes")]
403441
InnerString::Bytes(b) => b.is_empty(),
404-
InnerString::Empty => true,
405442
}
406443
}
407444
}
@@ -506,20 +543,35 @@ impl Borrow<str> for CheetahString {
506543
}
507544
}
508545

546+
/// Maximum capacity for inline string storage (23 bytes + 1 byte for length = 24 bytes total)
547+
const INLINE_CAPACITY: usize = 23;
548+
509549
/// The `InnerString` enum represents different types of string storage.
510550
///
551+
/// This enum uses Small String Optimization (SSO) to avoid heap allocations for short strings.
552+
///
511553
/// Variants:
512554
///
513-
/// * `ArcString(Arc<String>)` - A reference-counted string.
514-
/// * `StaticStr(&'static str)` - A static string slice.
555+
/// * `Inline` - Inline storage for strings <= 23 bytes (zero heap allocations).
556+
/// * `StaticStr(&'static str)` - A static string slice (zero heap allocations).
557+
/// * `ArcString(Arc<String>)` - A reference-counted string (one heap allocation).
558+
/// * `ArcVecString(Arc<Vec<u8>>)` - A reference-counted byte vector.
515559
/// * `Bytes(bytes::Bytes)` - A byte buffer (available when the "bytes" feature is enabled).
516-
/// * `Empty` - An empty string.
517560
#[derive(Clone)]
518561
pub(super) enum InnerString {
519-
ArcString(Arc<String>),
562+
/// Inline storage for short strings (up to 23 bytes).
563+
/// Stores the length and data directly without heap allocation.
564+
Inline {
565+
len: u8,
566+
data: [u8; INLINE_CAPACITY],
567+
},
568+
/// Static string slice with 'static lifetime.
520569
StaticStr(&'static str),
570+
/// Reference-counted heap-allocated string.
571+
ArcString(Arc<String>),
572+
/// Reference-counted heap-allocated byte vector.
521573
ArcVecString(Arc<Vec<u8>>),
574+
/// Bytes type integration (requires "bytes" feature).
522575
#[cfg(feature = "bytes")]
523576
Bytes(bytes::Bytes),
524-
Empty,
525577
}

src/serde.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::cheetah_string::{InnerString, EMPTY_STRING};
1+
use crate::cheetah_string::InnerString;
22
use crate::CheetahString;
33
use serde::de::{Error, Unexpected, Visitor};
44
use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -9,12 +9,16 @@ impl Serialize for CheetahString {
99
S: Serializer,
1010
{
1111
match &self.inner {
12+
InnerString::Inline { len, data } => {
13+
// Safety: InnerString::Inline guarantees that data[0..len] is valid UTF-8
14+
let s = unsafe { std::str::from_utf8_unchecked(&data[..*len as usize]) };
15+
serializer.serialize_str(s)
16+
}
1217
InnerString::ArcString(s) => serializer.serialize_str(s.as_str()),
1318
InnerString::StaticStr(s) => serializer.serialize_str(s),
1419
InnerString::ArcVecString(s) => serializer.serialize_bytes(s),
1520
#[cfg(feature = "bytes")]
1621
InnerString::Bytes(bytes) => serializer.serialize_bytes(bytes.as_ref()),
17-
InnerString::Empty => serializer.serialize_str(EMPTY_STRING),
1822
}
1923
}
2024
}

0 commit comments

Comments (0)