pipelined extraction #236
base: master
@@ -0,0 +1,144 @@
use bencher::{benchmark_group, benchmark_main};

use bencher::Bencher;
use tempdir::TempDir;
use tempfile::tempfile;

use std::fs;
use std::path::Path;
use std::sync::{LazyLock, Mutex};

use zip::result::ZipResult;
use zip::write::ZipWriter;
use zip::ZipArchive;

#[cfg(all(feature = "parallelism", unix))]
use zip::read::{split_extract, ExtractionParameters};

/* This archive has a set of entries repeated 20x:
 * - 200K random data, stored uncompressed (CompressionMethod::Stored)
 * - 246K text data (the project gutenberg html version of king lear)
 *   (CompressionMethod::Bzip2, compression level 1) (project gutenberg ebooks are public domain)
 *
 * The full archive file is 5.3MB.
 */
fn static_test_archive() -> ZipResult<ZipArchive<fs::File>> {
    assert!(
Review comment: Use a
        cfg!(feature = "bzip2"),
        "this test archive requires bzip2 support"
    );
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/data/stored-and-compressed-text.zip");
    let file = fs::File::open(path)?;
    ZipArchive::new(file)
}

static STATIC_TEST_ARCHIVE: LazyLock<Mutex<ZipArchive<fs::File>>> = LazyLock::new(|| {
    let archive = static_test_archive().unwrap();
    Mutex::new(archive)
});

/* This archive is generated dynamically, in order to scale with the number of reported CPUs.
 * - We want at least 768 files (4 per VCPU on EC2 *.48xlarge instances) to run in CI.
 * - We want to retain the interspersed random/text entries from static_test_archive().
 *
 * We will copy over entries from the static archive repeatedly until we reach the desired file
 * count.
 */
fn dynamic_test_archive(src_archive: &mut ZipArchive<fs::File>) -> ZipResult<ZipArchive<fs::File>> {
    let desired_num_entries: usize = num_cpus::get() * 4;
    let mut output_archive = ZipWriter::new(tempfile()?);

    for (src_index, output_index) in (0..src_archive.len()).cycle().zip(0..desired_num_entries) {
        let src_file = src_archive.by_index_raw(src_index)?;
        let output_name = if src_file.name().starts_with("random-") {
            format!("random-{output_index}.dat")
        } else {
            assert!(src_file.name().starts_with("text-"));
            format!("text-{output_index}.dat")
        };
        output_archive.raw_copy_file_rename(src_file, output_name)?;
    }

    output_archive.finish_into_readable()
}

static DYNAMIC_TEST_ARCHIVE: LazyLock<Mutex<ZipArchive<fs::File>>> = LazyLock::new(|| {
    let mut src = STATIC_TEST_ARCHIVE.lock().unwrap();
    let archive = dynamic_test_archive(&mut src).unwrap();
    Mutex::new(archive)
});

fn do_extract_basic(bench: &mut Bencher, archive: &mut ZipArchive<fs::File>) {
    let total_size: u64 = archive.decompressed_size().unwrap().try_into().unwrap();

    let parent = TempDir::new("zip-extract").unwrap();

    bench.bytes = total_size;
    bench.bench_n(1, |bench| {
        bench.iter(move || {
            let outdir = TempDir::new_in(parent.path(), "bench-subdir")
                .unwrap()
                .into_path();
            archive.extract(outdir).unwrap();
        });
    });
}

fn extract_basic_static(bench: &mut Bencher) {
    let mut archive = STATIC_TEST_ARCHIVE.lock().unwrap();
    do_extract_basic(bench, &mut archive);
}

fn extract_basic_dynamic(bench: &mut Bencher) {
    let mut archive = DYNAMIC_TEST_ARCHIVE.lock().unwrap();
    do_extract_basic(bench, &mut archive);
}

#[cfg(all(feature = "parallelism", unix))]
fn do_extract_split(bench: &mut Bencher, archive: &ZipArchive<fs::File>) {
    let total_size: u64 = archive.decompressed_size().unwrap().try_into().unwrap();

    let params = ExtractionParameters {
        decompression_threads: num_cpus::get() / 3,
Review comment: What will the other 2/3 of the CPUs be doing? Also, does this need to be clamped to at least 1?
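        // Sketch of one possible answer to the review question above (an
        // assumption, not part of this PR): the remaining CPUs would be left
        // for the pipeline's I/O threads, and the division should be floored
        // at one so small machines still get a decompression thread, e.g.:
        //     decompression_threads: (num_cpus::get() / 3).max(1),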
        ..Default::default()
    };

    let parent = TempDir::new("zip-extract").unwrap();

    bench.bytes = total_size;
    bench.bench_n(1, |bench| {
        bench.iter(move || {
            let outdir = TempDir::new_in(parent.path(), "bench-subdir")
                .unwrap()
                .into_path();
            split_extract(archive, &outdir, params.clone()).unwrap();
        });
    });
}

#[cfg(all(feature = "parallelism", unix))]
fn extract_split_static(bench: &mut Bencher) {
    let archive = STATIC_TEST_ARCHIVE.lock().unwrap();
    do_extract_split(bench, &archive);
}

#[cfg(all(feature = "parallelism", unix))]
fn extract_split_dynamic(bench: &mut Bencher) {
    let archive = DYNAMIC_TEST_ARCHIVE.lock().unwrap();
    do_extract_split(bench, &archive);
}

#[cfg(not(all(feature = "parallelism", unix)))]
benchmark_group!(benches, extract_basic_static, extract_basic_dynamic);

#[cfg(all(feature = "parallelism", unix))]
benchmark_group!(
    benches,
    extract_basic_static,
    extract_basic_dynamic,
    extract_split_static,
    extract_split_dynamic
);

benchmark_main!(benches);
Review comment: Can we include some compressed files that contain both text and random bytes, to reflect the fact that real files tend to have sections with different entropy rates (e.g. image content vs metadata)?
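A minimal sketch of what the suggested mixed-entropy entries could look like, assuming the rand crate is available; the helper name write_mixed_entropy_entry and the section sizes are made up for illustration, and only ZipWriter APIs already used in this benchmark's ecosystem (start_file, write_all) are relied on:

use std::io::{Seek, Write};

use rand::RngCore;
use zip::result::ZipResult;
use zip::write::{SimpleFileOptions, ZipWriter};

/// Hypothetical helper: interleave compressible text with incompressible
/// random bytes, so a single compressed entry contains sections with
/// different entropy rates.
fn write_mixed_entropy_entry<W: Write + Seek>(
    zip: &mut ZipWriter<W>,
    name: &str,
    sections: usize,
) -> ZipResult<()> {
    // Bzip2 to match the existing text entries (requires the "bzip2" feature).
    let options =
        SimpleFileOptions::default().compression_method(zip::CompressionMethod::Bzip2);
    zip.start_file(name, options)?;

    let text = b"To be, or not to be, that is the question.\n".repeat(512);
    let mut noise = vec![0u8; text.len()];
    let mut rng = rand::thread_rng();
    for _ in 0..sections {
        zip.write_all(&text)?; // low-entropy section: compresses well
        rng.fill_bytes(&mut noise);
        zip.write_all(&noise)?; // high-entropy section: barely compresses
    }
    Ok(())
}

Entries generated this way could then be interleaved into dynamic_test_archive() alongside the existing random-*/text-* pairs.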