Skip to content

Commit cfe9120

Browse files
pipelined extraction
- initial sketch of lexicographic trie for pipelining - move path splitting into a submodule - lex trie can now propagate entry data - outline handle allocation - mostly handle files - mostly handle dirs - clarify symlink FIXMEs - do symlink validation - extract writable dir setting to helper method - modify args to handle allocation method - handle allocation test passes - simplify perms a lot - outline evaluation - handle symlinks - BIGGER CHANGE! add EntryReader/etc - make initial pipelined extract work - fix file perms by writing them after finishing the file write - support directory entries by unix mode as well - impl split extraction - remove dependency on reader refactoring - add dead_code to methods we don't use yet
1 parent 6d39456 commit cfe9120

File tree

7 files changed

+2510
-0
lines changed

7 files changed

+2510
-0
lines changed

Cargo.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ lzma-rs = { version = "0.3.0", default-features = false, optional = true }
5252
[target.'cfg(any(all(target_arch = "arm", target_pointer_width = "32"), target_arch = "mips", target_arch = "powerpc"))'.dependencies]
5353
crossbeam-utils = "0.8.20"
5454

55+
[target.'cfg(unix)'.dependencies]
56+
libc = { version = "0.2.155", optional = true }
57+
5558
[target.'cfg(fuzzing)'.dependencies]
5659
arbitrary = { version = "1.3.2", features = ["derive"] }
5760

@@ -63,6 +66,7 @@ time = { workspace = true, features = ["formatting", "macros"] }
6366
anyhow = "1"
6467
clap = { version = "=4.4.18", features = ["derive"] }
6568
tempdir = "0.3.7"
69+
tempfile = "3.10.1"
6670

6771
[features]
6872
aes-crypto = ["aes", "constant_time_eq", "hmac", "pbkdf2", "sha1", "rand", "zeroize"]
@@ -79,6 +83,7 @@ deflate-zopfli = ["zopfli", "_deflate-any"]
7983
lzma = ["lzma-rs/stream"]
8084
unreserved = []
8185
xz = ["lzma-rs/raw_decoder"]
86+
parallelism = ["libc"]
8287
default = [
8388
"aes-crypto",
8489
"bzip2",
@@ -101,3 +106,7 @@ harness = false
101106
[[bench]]
102107
name = "merge_archive"
103108
harness = false
109+
110+
[[bench]]
111+
name = "extract"
112+
harness = false

benches/extract.rs

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
use bencher::{benchmark_group, benchmark_main};
2+
3+
use bencher::Bencher;
4+
use tempdir::TempDir;
5+
6+
use std::fs;
7+
use std::path::Path;
8+
9+
use zip::result::ZipResult;
10+
use zip::ZipArchive;
11+
12+
#[cfg(all(feature = "parallelism", unix))]
13+
use zip::read::{split_extract, ExtractionParameters};
14+
15+
/* This archive has a set of entries repeated 20x:
16+
* - 200K random data, stored uncompressed (CompressionMethod::Stored)
17+
* - 246K text data (the project gutenberg html version of king lear)
18+
* (CompressionMethod::Bzip2, compression level 1) (project gutenberg ebooks are public domain)
19+
*
20+
* The full archive file is 5.3MB.
21+
*/
22+
fn get_test_archive() -> ZipResult<ZipArchive<fs::File>> {
23+
let path =
24+
Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/data/stored-and-compressed-text.zip");
25+
let file = fs::File::open(path)?;
26+
ZipArchive::new(file)
27+
}
28+
29+
fn extract_basic(bench: &mut Bencher) {
30+
let mut readable_archive = get_test_archive().unwrap();
31+
let total_size: u64 = readable_archive
32+
.decompressed_size()
33+
.unwrap()
34+
.try_into()
35+
.unwrap();
36+
37+
let parent = TempDir::new("zip-extract").unwrap();
38+
39+
bench.bytes = total_size;
40+
bench.bench_n(1, |bench| {
41+
bench.iter(move || {
42+
let outdir = TempDir::new_in(parent.path(), "bench-subdir")
43+
.unwrap()
44+
.into_path();
45+
readable_archive.extract(outdir).unwrap();
46+
});
47+
});
48+
}
49+
50+
/// Worker-thread count handed to `ExtractionParameters` by the pipelined
/// (`split_extract`) benchmark below.
#[cfg(all(feature = "parallelism", unix))]
const DECOMPRESSION_THREADS: usize = 8;
52+
53+
/// Benchmark pipelined `split_extract()` over the same fixture archive,
/// using `DECOMPRESSION_THREADS` decompression workers, so its throughput
/// can be compared directly against `extract_basic`.
#[cfg(all(feature = "parallelism", unix))]
fn extract_split(bench: &mut Bencher) {
    let archive = get_test_archive().unwrap();

    // Throughput is measured against the total decompressed payload size.
    let decompressed: u64 = archive.decompressed_size().unwrap().try_into().unwrap();
    bench.bytes = decompressed;

    // All parameters other than the thread count keep their defaults.
    let params = ExtractionParameters {
        decompression_threads: DECOMPRESSION_THREADS,
        ..Default::default()
    };

    // Parent temp dir collects the per-iteration output dirs and removes
    // them all when the closure that owns it is dropped.
    let scratch = TempDir::new("zip-extract").unwrap();

    bench.bench_n(1, |bench| {
        bench.iter(move || {
            // Detach the subdir from its guard so it lives until `scratch`
            // is cleaned up, not just until the end of this iteration.
            let target = TempDir::new_in(scratch.path(), "bench-subdir")
                .unwrap()
                .into_path();
            split_extract(&archive, &target, params.clone()).unwrap();
        });
    });
}
79+
80+
// Register only the single-threaded benchmark when pipelined extraction is
// unavailable (non-unix target, or the "parallelism" feature disabled)...
#[cfg(not(all(feature = "parallelism", unix)))]
benchmark_group!(benches, extract_basic);
// ...and both strategies when it is, so they can be compared side by side.
#[cfg(all(feature = "parallelism", unix))]
benchmark_group!(benches, extract_basic, extract_split);

benchmark_main!(benches);

src/read.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,13 @@ pub(crate) mod lzma;
4242
#[cfg(feature = "xz")]
4343
pub(crate) mod xz;
4444

45+
#[cfg(feature = "parallelism")]
46+
pub(crate) mod pipelining;
47+
#[cfg(all(unix, feature = "parallelism"))]
48+
pub use pipelining::split_extraction::{split_extract, ExtractionParameters, SplitExtractionError};
49+
#[cfg(feature = "parallelism")]
50+
pub(crate) mod split;
51+
4552
// Put the struct declaration in a private module to convince rustdoc to display ZipArchive nicely
4653
pub(crate) mod zip_archive {
4754
use indexmap::IndexMap;
@@ -1011,6 +1018,9 @@ impl<R: Read + Seek> ZipArchive<R> {
10111018

10121019
fn make_writable_dir_all<T: AsRef<Path>>(outpath: T) -> Result<(), ZipError> {
10131020
create_dir_all(outpath.as_ref())?;
1021+
/* TODO: do we want to automatically make the directory writable? Wouldn't we prefer to
1022+
* respect the write permissions of the extraction dir? Pipelined extraction does not
1023+
* mutate permissions like this. */
10141024
#[cfg(unix)]
10151025
{
10161026
// Dirs must be writable until all normal files are extracted

0 commit comments

Comments
 (0)