Skip to content

Commit 587bdab

Browse files
authored
perf: use simdutf8 to validate UTF-8 when reading files (#237)
1 parent 6c4297b commit 587bdab

File tree

3 files changed

+26
-4
lines changed

3 files changed

+26
-4
lines changed

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,13 +86,14 @@ thiserror = "1.0.61"
8686
json-strip-comments = "1.0.2"
8787
indexmap = { version = "2.2.6", features = ["serde"] }
8888
cfg-if = "1.0"
89+
simdutf8 = { version = "0.1.4", features = ["aarch64_neon"] }
8990

9091
pnp = { version = "0.9.0", optional = true }
9192

9293
document-features = { version = "0.2.8", optional = true }
9394

9495
[dev-dependencies]
95-
vfs = "0.12.0" # for testing with in memory file system
96+
vfs = "0.12.0" # for testing with in memory file system
9697
rayon = { version = "1.10.0" }
9798
criterion2 = { version = "1.0.0", default-features = false }
9899
normalize-path = { version = "0.2.1" }

src/file_system.rs

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,20 @@ impl Default for FileSystemOs {
105105
}
106106
}
107107

108+
/// Reads the whole file at `path` into a `String`, validating the bytes as UTF-8 with `simdutf8`.
///
/// # Errors
///
/// Propagates any error returned by `std::fs::read`. Returns an `io::ErrorKind::InvalidData`
/// error with the message "stream did not contain valid UTF-8" when the bytes fail validation.
fn read_to_string(path: &Path) -> io::Result<String> {
109+
// `simdutf8` is faster than `std::str::from_utf8`, which `fs::read_to_string` uses internally,
// so read the raw bytes and run the UTF-8 validation ourselves.
110+
let bytes = std::fs::read(path)?;
111+
if simdutf8::basic::from_utf8(&bytes).is_err() {
112+
// Same error as `fs::read_to_string` produces (`io::Error::INVALID_UTF8`),
// so callers observe the same error kind and message as before.
113+
return Err(io::Error::new(
114+
io::ErrorKind::InvalidData,
115+
"stream did not contain valid UTF-8",
116+
));
117+
}
118+
// SAFETY: the `simdutf8::basic::from_utf8` check above succeeded, so `bytes` is valid UTF-8.
119+
Ok(unsafe { String::from_utf8_unchecked(bytes) })
120+
}
121+
108122
impl FileSystem for FileSystemOs {
109123
fn read_to_string(&self, path: &Path) -> io::Result<String> {
110124
cfg_if! {
@@ -113,11 +127,11 @@ impl FileSystem for FileSystemOs {
113127
VPath::Zip(info) => {
114128
self.pnp_lru.read_to_string(info.physical_base_path(), info.zip_path)
115129
}
116-
VPath::Virtual(info) => fs::read_to_string(info.physical_base_path()),
117-
VPath::Native(path) => fs::read_to_string(path),
130+
VPath::Virtual(info) => read_to_string(&info.physical_base_path()),
131+
VPath::Native(path) => read_to_string(&path),
118132
}
119133
} else {
120-
fs::read_to_string(path)
134+
read_to_string(path)
121135
}
122136
}
123137
}

0 commit comments

Comments
 (0)