Skip to content

Commit 38c5a16

Browse files
ref(sourcebundle): Check UTF-8 validity memory efficiently (#890)
The current check to ensure a sourcebundle is valid UTF-8 reads the entire sourcebundle file into memory. This is inefficient for large files. This PR introduces a UTF8Reader which wraps any reader. The UTF8Reader ensures that the stream is valid UTF8 as it is being read, while only requiring a small amount of memory (currently 8 KiB) to be allocated as a buffer. Co-authored-by: Sebastian Zivota <loewenheim@users.noreply.github.com>
1 parent 2865146 commit 38c5a16

File tree

6 files changed

+455
-13
lines changed

6 files changed

+455
-13
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## Unreleased
4+
5+
**Improvements**
6+
7+
- Check UTF-8 validity memory efficiently ([#890](https://github.com/getsentry/symbolic/pull/890))
8+
39
## 12.13.2
410

511
**Fixes**

Cargo.lock

Lines changed: 115 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ once_cell = "1.17.1"
4141
parking_lot = "0.12.1"
4242
pdb-addr2line = "0.10.4"
4343
proguard = { version = "5.4.0", features = ["uuid"] }
44+
proptest = "1.6.0"
4445
regex = "1.7.1"
4546
rustc-demangle = "0.1.21"
4647
# keep this in sync with whatever version `goblin` uses

symbolic-debuginfo/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ zstd = { workspace = true, optional = true }
115115
[dev-dependencies]
116116
criterion = { workspace = true }
117117
insta = { workspace = true }
118+
proptest = {workspace = true }
118119
tempfile = { workspace = true }
119120
similar-asserts = { workspace = true }
120121
symbolic-testutils = { path = "../symbolic-testutils" }

symbolic-debuginfo/src/sourcebundle.rs renamed to symbolic-debuginfo/src/sourcebundle/mod.rs

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,17 @@
4242
//! bundle a file entry has a `url` and might carry `headers` or individual debug IDs
4343
//! per source file.
4444
45+
mod utf8_reader;
46+
4547
use std::borrow::Cow;
4648
use std::collections::{BTreeMap, BTreeSet, HashMap};
4749
use std::error::Error;
48-
use std::fmt;
4950
use std::fmt::{Display, Formatter};
5051
use std::fs::{File, OpenOptions};
51-
use std::io::{BufReader, BufWriter, Read, Seek, Write};
52+
use std::io::{BufReader, BufWriter, ErrorKind, Read, Seek, Write};
5253
use std::path::Path;
5354
use std::sync::Arc;
55+
use std::{fmt, io};
5456

5557
use parking_lot::Mutex;
5658
use regex::Regex;
@@ -60,6 +62,7 @@ use zip::{write::SimpleFileOptions, ZipWriter};
6062

6163
use symbolic_common::{Arch, AsSelf, CodeId, DebugId, SourceLinkMappings};
6264

65+
use self::utf8_reader::Utf8Reader;
6366
use crate::base::*;
6467
use crate::js::{
6568
discover_debug_id, discover_sourcemap_embedded_debug_id, discover_sourcemaps_location,
@@ -1217,31 +1220,41 @@ where
12171220
pub fn add_file<S, R>(
12181221
&mut self,
12191222
path: S,
1220-
mut file: R,
1223+
file: R,
12211224
info: SourceFileInfo,
12221225
) -> Result<(), SourceBundleError>
12231226
where
12241227
S: AsRef<str>,
12251228
R: Read,
12261229
{
1227-
let mut buf = String::new();
1228-
1229-
if let Err(e) = file.read_to_string(&mut buf) {
1230-
return Err(SourceBundleError::new(SourceBundleErrorKind::ReadFailed, e));
1231-
}
1230+
let mut file_reader = Utf8Reader::new(file);
12321231

12331232
let full_path = self.file_path(path.as_ref());
12341233
let unique_path = self.unique_path(full_path);
12351234

12361235
self.writer
12371236
.start_file(unique_path.clone(), default_file_options())
12381237
.map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;
1239-
self.writer
1240-
.write_all(buf.as_bytes())
1241-
.map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;
12421238

1243-
self.manifest.files.insert(unique_path, info);
1244-
Ok(())
1239+
match io::copy(&mut file_reader, &mut self.writer) {
1240+
Err(e) => {
1241+
self.writer
1242+
.abort_file()
1243+
.map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;
1244+
1245+
// ErrorKind::InvalidData is returned by Utf8Reader when the file is not valid UTF-8.
1246+
let error_kind = match e.kind() {
1247+
ErrorKind::InvalidData => SourceBundleErrorKind::ReadFailed,
1248+
_ => SourceBundleErrorKind::WriteFailed,
1249+
};
1250+
1251+
Err(SourceBundleError::new(error_kind, e))
1252+
}
1253+
Ok(_) => {
1254+
self.manifest.files.insert(unique_path, info);
1255+
Ok(())
1256+
}
1257+
}
12451258
}
12461259

12471260
/// Calls add_file, and handles any ReadFailed errors by calling the skipped_file_callback.

0 commit comments

Comments
 (0)