From 0ab2c550c1309d5a8e8677907fa54c8d11c6a610 Mon Sep 17 00:00:00 2001 From: 0xCCF4 <0xccf4-git-commit@lenzentech.de> Date: Fri, 21 Jun 2024 09:26:56 +0200 Subject: [PATCH 1/7] Added the method fn read_zipfile_from_seekablestream(reader: &mut S) that reads the next zipfile entry from a stream by potential parsing the data descriptor This is an alternative method to read a zip file. If possible, use the ZipArchive functions as some information will be missing when reading this manner. This method extends the existing read_zipfile_from_stream method when the stream is seekable. This method is superior to ZipArchive in the special case that the underlying stream implementation must buffer all seen/read data to provide seek back support and the memory consumption must be kept small. This could be the case when reading a ZipFile B.zip nested within a zip file A.zip, that is stored on disk. Since A is seekable when stored as file, A.zip can be read using ZipArchive. Be B a zip file stored in A, then we can read B's content using a decompressing reader. The problem is that the decompressing reader will not be seekable (due to decompression). When one would like to still read contents of B.zip without extracting A to disk, the file B.zip must be buffered in RAM. When using ZipArchive to read B.zip from RAM, the whole B.zip file must be buffered to RAM because the central directory of B.zip is located at the end of the file B.zip. This method will read B.zip from the start of the file and returning the first file entry found in B.zip. After the execution of this function and the ZipFile return value is dropped, the reader will be positioned at the end of the file entry. Since this function will never seek back to before the initial position of the stream when the function was called, the underlying stream implementation may discard, after dropping ZipFile, all buffered data before the current position of the stream. Summarizing: In given scenario, this method must not buffer the whole B.zip file to RAM, but only the first file entry. # Conflicts: # src/read.rs --- src/read.rs | 348 +++++++++++++++++---- src/types.rs | 80 +++-- tests/data/data_descriptor_two_entries.zip | Bin 0 -> 392 bytes tests/deflate64.rs | 102 ++++++ 4 files changed, 449 insertions(+), 81 deletions(-) create mode 100644 tests/data/data_descriptor_two_entries.zip diff --git a/src/read.rs b/src/read.rs index 4643ca04f..c98038439 100644 --- a/src/read.rs +++ b/src/read.rs @@ -9,10 +9,11 @@ use crate::extra_fields::{ExtendedTimestamp, ExtraField, Ntfs}; use crate::read::zip_archive::{Shared, SharedBuilder}; use crate::result::invalid; use crate::result::{ZipError, ZipResult}; -use crate::spec::{self, CentralDirectoryEndInfo, DataAndPosition, FixedSizeBlock, Pod}; +use crate::spec::{self, DataAndPosition, + FixedSizeBlock, Magic, Zip32CentralDirectoryEnd}; use crate::types::{ - AesMode, AesVendorVersion, DateTime, System, ZipCentralEntryBlock, ZipFileData, - ZipLocalEntryBlock, + AesMode, AesVendorVersion, DateTime, System, ZipCentralEntryBlock, ZipDataDescriptor, + ZipFileData, ZipLocalEntryBlock, ZipLocalEntryBlockAndFields, }; use crate::write::SimpleFileOptions; use crate::zipcrypto::{ZipCryptoReader, ZipCryptoReaderValid, ZipCryptoValidator}; @@ -1908,23 +1909,60 @@ impl Drop for ZipFile<'_, R> { } } -/// Read ZipFile structures from a non-seekable reader. +/// A filter that determines whether an entry should be ignored when searching +/// for the root directory of a Zip archive. /// -/// This is an alternative method to read a zip file. If possible, use the ZipArchive functions -/// as some information will be missing when reading this manner. +/// Returns `true` if the entry should be considered, and `false` if it should +/// be ignored. /// -/// Reads a file header from the start of the stream. Will return `Ok(Some(..))` if a file is -/// present at the start of the stream. Returns `Ok(None)` if the start of the central directory -/// is encountered. No more files should be read after this. +/// See [`root_dir_common_filter`] for a sensible default filter. +pub trait RootDirFilter: Fn(&Path) -> bool {} +impl bool> RootDirFilter for F {} + +/// Common filters when finding the root directory of a Zip archive. /// -/// The Drop implementation of ZipFile ensures that the reader will be correctly positioned after -/// the structure is done. +/// This filter is a sensible default for most use cases and filters out common +/// system files that are usually irrelevant to the contents of the archive. /// -/// Missing fields are: -/// * `comment`: set to an empty string -/// * `data_start`: set to 0 -/// * `external_attributes`: `unix_mode()`: will return None -pub fn read_zipfile_from_stream(reader: &mut R) -> ZipResult>> { +/// Currently, the filter ignores: +/// - `/__MACOSX/` +/// - `/.DS_Store` +/// - `/Thumbs.db` +/// +/// **This function is not guaranteed to be stable and may change in future versions.** +/// +/// # Example +/// +/// ```rust +/// # use std::path::Path; +/// assert!(zip::read::root_dir_common_filter(Path::new("foo.txt"))); +/// assert!(!zip::read::root_dir_common_filter(Path::new(".DS_Store"))); +/// assert!(!zip::read::root_dir_common_filter(Path::new("Thumbs.db"))); +/// assert!(!zip::read::root_dir_common_filter(Path::new("__MACOSX"))); +/// assert!(!zip::read::root_dir_common_filter(Path::new("__MACOSX/foo.txt"))); +/// ``` +pub fn root_dir_common_filter(path: &Path) -> bool { + const COMMON_FILTER_ROOT_FILES: &[&str] = &[".DS_Store", "Thumbs.db"]; + + if path.starts_with("__MACOSX") { + return false; + } + + if path.components().count() == 1 + && path.file_name().is_some_and(|file_name| { + COMMON_FILTER_ROOT_FILES + .iter() + .map(OsStr::new) + .any(|cmp| cmp == file_name) + }) + { + return false; + } + + true +} + +fn read_local_fileblock(reader: &mut R) -> ZipResult> { // We can't use the typical ::parse() method, as we follow separate code paths depending on the // "magic" value (since the magic value will be from the central directory header if we've // finished iterating over all the actual files). @@ -1941,9 +1979,29 @@ pub fn read_zipfile_from_stream(reader: &mut R) -> ZipResult( + block: ZipLocalEntryBlockAndFields, + reader: &'a mut R, + data_descriptor: Option, +) -> ZipResult>> { + let mut result = ZipFileData::from_local_block(block, data_descriptor)?; + + match crate::read::parse_extra_field(&mut result) { Ok(..) | Err(ZipError::Io(..)) => {} Err(e) => return Err(e), } @@ -1951,7 +2009,7 @@ pub fn read_zipfile_from_stream(reader: &mut R) -> ZipResult(reader: &mut R) -> ZipResult(reader: &mut R) -> ZipResult bool {} -impl bool> RootDirFilter for F {} +/// Reads a file header from the start of the stream. Will return `Ok(Some(..))` if a file is +/// present at the start of the stream. Returns `Ok(None)` if the start of the central directory +/// is encountered. No more files should be read after this. +/// +/// The Drop implementation of ZipFile ensures that the reader will be correctly positioned after +/// the structure is done. +/// +/// This method will not find a ZipFile entry if the zip file uses data descriptors. +/// In that case you could use [read_zipfile_from_seekable_stream] +/// +/// Missing fields are: +/// * `comment`: set to an empty string +/// * `data_start`: set to 0 +/// * `external_attributes`: `unix_mode()`: will return None +pub fn read_zipfile_from_stream<'a, R: Read>(reader: &'a mut R) -> ZipResult>> { + let block = read_local_fileblock(reader)?; + let block = match block { + Some(block) => block, + None => return Ok(None), + }; -/// Common filters when finding the root directory of a Zip archive. + // we can't look for a data descriptor since we can't seek back + // TODO: provide a method that buffers the read data and allows seeking back + read_zipfile_from_fileblock(block, reader, None) +} + +/// Read ZipFile structures from a seekable reader. /// -/// This filter is a sensible default for most use cases and filters out common -/// system files that are usually irrelevant to the contents of the archive. +/// This is an alternative method to read a zip file. If possible, use the ZipArchive functions +/// as some information will be missing when reading this manner. /// -/// Currently, the filter ignores: -/// - `/__MACOSX/` -/// - `/.DS_Store` -/// - `/Thumbs.db` +/// Reads a file header from the start of the stream. Will return `Ok(Some(..))` if a file is +/// present at the start of the stream. Returns `Ok(None)` if the start of the central directory +/// is encountered. No more files should be read after this. /// -/// **This function is not guaranteed to be stable and may change in future versions.** +/// The Drop implementation of ZipFile ensures that the reader will be correctly positioned after +/// the structure is done. /// -/// # Example +/// This method will not find a ZipFile entry if the zip file uses central directory encryption. +/// In that case you should use ZipArchive functions. /// -/// ```rust -/// # use std::path::Path; -/// assert!(zip::read::root_dir_common_filter(Path::new("foo.txt"))); -/// assert!(!zip::read::root_dir_common_filter(Path::new(".DS_Store"))); -/// assert!(!zip::read::root_dir_common_filter(Path::new("Thumbs.db"))); -/// assert!(!zip::read::root_dir_common_filter(Path::new("__MACOSX"))); -/// assert!(!zip::read::root_dir_common_filter(Path::new("__MACOSX/foo.txt"))); -/// ``` -pub fn root_dir_common_filter(path: &Path) -> bool { - const COMMON_FILTER_ROOT_FILES: &[&str] = &[".DS_Store", "Thumbs.db"]; +/// This method is superior to ZipArchive in the special case that the underlying stream implementation must +/// buffer all seen/read data to provide seek back support and the memory consumption must be kept small. +/// This could be the case when reading a ZipFile B.zip nested within a zip file A.zip, that is stored on disk. Since A is seekable when stored +/// as file, A.zip can be read using ZipArchive. Be B a zip file stored in A, then we can read B's content +/// using a decompressing reader. The problem is that the decompressing reader will not be seekable (due to decompression). +/// When one would like to still read contents of B.zip without extracting A to disk, the file B.zip must be buffered in RAM. +/// When using ZipArchive to read B.zip from RAM, the whole B.zip file must be buffered to RAM because the central directory of B.zip is located +/// at the end of the file B.zip. +/// This method will read B.zip from the start of the file and returning the first file entry found in B.zip. +/// After the execution of this function and the ZipFile return value is dropped, the reader will be positioned at the end of the file entry. +/// Since this function will never seek back to before the initial position of the stream when the function was called, +/// the underlying stream implementation may discard, after dropping ZipFile, all buffered data before the current position of the stream. +/// Summarizing: In given scenario, this method must not buffer the whole B.zip file to RAM, but only the first file entry. +/// +/// Missing fields are: +/// * `comment`: set to an empty string +/// * `data_start`: set to 0 +/// * `external_attributes`: `unix_mode()`: will return None +pub fn read_zipfile_from_seekablestream( + reader: &mut S, +) -> ZipResult> { + let local_entry_block = read_local_fileblock(reader)?; + let local_entry_block = match local_entry_block { + Some(local_entry_block) => local_entry_block, + None => return Ok(None), + }; - if path.starts_with("__MACOSX") { - return false; + let using_data_descriptor = local_entry_block.block.flags & (1 << 3) == 1 << 3; + let mut shift_register: u32 = 0; + let mut read_buffer = [0u8; 1]; + if using_data_descriptor { + // we must find the data descriptor to get the compressed size + + let mut read_count_total: usize = 0; + loop { + let read_count = reader.read(&mut read_buffer)?; + if read_count == 0 { + return Ok(None); // EOF + } + + read_count_total += read_count; + + if read_count_total > u32::MAX as usize { + return Ok(None); // file too large + } + + shift_register = shift_register >> 8 | (read_buffer[0] as u32) << 24; + + if read_count_total < 4 { + continue; + } + + if spec::Magic::from_le_bytes(shift_register.to_le_bytes()) + == spec::Magic::DATA_DESCRIPTOR_SIGNATURE + { + // found a potential data descriptor + reader.seek(SeekFrom::Current(-4))?; // go back to the start of the data descriptor + read_count_total -= 4; + + let mut data_descriptor_block = [0u8; mem::size_of::()]; + reader.read_exact(&mut data_descriptor_block)?; + let data_descriptor_block: Box<[u8]> = data_descriptor_block.into(); + // seek back to data end + reader.seek(SeekFrom::Current( + -(mem::size_of::() as i64), + ))?; + + let data_descriptor = ZipDataDescriptor::interpret(&data_descriptor_block)?; + + // check if the data descriptor is indeed valid for the read data + if data_descriptor.compressed_size == read_count_total as u32 { + // todo also check crc + // valid data descriptor + + // seek back to data start + reader.seek(SeekFrom::Current(-(read_count_total as i64)))?; + + return read_zipfile_from_fileblock( + local_entry_block, + reader, + Some(data_descriptor), + ); + } else { + // data descriptor is invalid + + reader.seek(SeekFrom::Current(4))?; // skip the magic number + read_count_total += 4; + + // continue data descriptor searching + } + } + } + } else { + // dont using a data descriptor + read_zipfile_from_fileblock(local_entry_block, reader, None) } +} - if path.components().count() == 1 - && path.file_name().is_some_and(|file_name| { - COMMON_FILTER_ROOT_FILES - .iter() - .map(OsStr::new) - .any(|cmp| cmp == file_name) - }) - { - return false; +/// Advance the stream to the next zip file magic number (local file header, central directory header, data descriptor) +/// and return the magic number and the number of bytes read since the call to this function. +/// +/// This function is useful in combination with [read_zipfile_from_seekablestream] to read multiple zip files from a single stream. +/// This function will ever seek back 4 bytes and not before the initial position of the stream when the function was called. +/// +/// The stream will be positioned at the start of the magic number. +/// +/// Returns the found magic number and the number of bytes read since the call to this function. +fn advance_stream_to_next_magic( + reader: &mut S, +) -> ZipResult> { + let mut shift_register: u32 = 0; + let mut read_buffer = [0u8; 1]; + + let mut read_count_total: usize = 0; + loop { + let read_count = reader.read(&mut read_buffer)?; + + if read_count == 0 { + return Ok(None); // EOF + } + + read_count_total += read_count; + + shift_register = shift_register >> 8 | (read_buffer[0] as u32) << 24; + let magic = spec::Magic::from_le_bytes(shift_register.to_le_bytes()); + + if read_count_total < 4 { + continue; + } + + if magic == spec::Magic::LOCAL_FILE_HEADER_SIGNATURE + || magic == spec::Magic::CENTRAL_DIRECTORY_HEADER_SIGNATURE + || magic == spec::Magic::DATA_DESCRIPTOR_SIGNATURE + { + // found a potential magic number + reader.seek(SeekFrom::Current(-4))?; // go back to the start of the magic number + return Ok(Some((magic, read_count_total - 4))); + } } +} - true +/// Like advance_stream_to_next_magic but advances until given Magic is encountered +fn advance_stream_to_next_magic_start( + reader: &mut S, + target_magic: Magic, +) -> ZipResult> { + let mut bytes_read_total = 0; + + loop { + match advance_stream_to_next_magic(reader)? { + Some((magic, bytes_read)) => { + bytes_read_total += bytes_read; + if magic == target_magic { + return Ok(Some(bytes_read_total)); + } else { + reader.seek(SeekFrom::Current(1))?; + bytes_read_total += 1; + } + } + None => return Ok(None), + } + } +} + +/// Advance the stream to the next zip file start (local file header start) +/// and return the magic number and the number of bytes read since the call to this function. +/// +/// This function is useful in combination with [read_zipfile_from_seekablestream] to read multiple zip files from a single stream. +/// This function will ever seek back 4 bytes and not before the initial position of the stream when the function was called. +/// +/// The stream will be positioned at the start of the zip file start (local file header start). +/// +/// Returns the number of bytes read since the call to this function. +/// +/// Will return None if the end of the stream is reached. +pub fn advance_stream_to_next_zipfile_start( + reader: &mut S, +) -> ZipResult> { + advance_stream_to_next_magic_start(reader, spec::Magic::LOCAL_FILE_HEADER_SIGNATURE) } #[cfg(test)] diff --git a/src/types.rs b/src/types.rs index a076d1f4f..96e042ec0 100644 --- a/src/types.rs +++ b/src/types.rs @@ -545,6 +545,13 @@ pub struct ZipFileData { pub extra_fields: Vec, } +// contains the block and the following fields as read from the file +pub(crate) struct ZipLocalEntryBlockAndFields { + pub(crate) block: ZipLocalEntryBlock, + pub(crate) file_name_raw: Vec, + pub(crate) extra_field: Vec, +} + impl ZipFileData { /// Get the starting offset of the data of the compressed file pub fn data_start(&self, reader: &mut (impl Read + Seek + Sized)) -> ZipResult { @@ -750,9 +757,9 @@ impl ZipFileData { local_block } - pub(crate) fn from_local_block( - block: ZipLocalEntryBlock, - reader: &mut R, + pub(crate) fn from_local_block( + block: ZipLocalEntryBlockAndFields, + data_descriptor: Option, ) -> ZipResult { let ZipLocalEntryBlock { // magic, @@ -761,13 +768,11 @@ impl ZipFileData { compression_method, last_mod_time, last_mod_date, - crc32, - compressed_size, - uncompressed_size, - file_name_length, - extra_field_length, + mut crc32, + mut compressed_size, + mut uncompressed_size, .. - } = block; + } = block.block; let encrypted: bool = flags & 1 == 1; if encrypted { @@ -780,25 +785,27 @@ impl ZipFileData { /* flags & (1 << 3) != 0 */ let using_data_descriptor: bool = flags & (1 << 3) == 1 << 3; if using_data_descriptor { - return Err(ZipError::UnsupportedArchive( - "The file length is not available in the local header", - )); + match data_descriptor { + None => { + return Err(ZipError::UnsupportedArchive( + "The file uses a data descriptor, but the provided input stream is not seekable or the data descriptor is missing.", + )); + } + Some(data_descriptor) => { + uncompressed_size = data_descriptor.uncompressed_size; + compressed_size = data_descriptor.compressed_size; + crc32 = data_descriptor.crc32; + } + } } /* flags & (1 << 1) != 0 */ let is_utf8: bool = flags & (1 << 11) != 0; let compression_method = crate::CompressionMethod::parse_from_u16(compression_method); - let file_name_length: usize = file_name_length.into(); - let extra_field_length: usize = extra_field_length.into(); - - let mut file_name_raw = vec![0u8; file_name_length]; - reader.read_exact(&mut file_name_raw)?; - let mut extra_field = vec![0u8; extra_field_length]; - reader.read_exact(&mut extra_field)?; let file_name: Box = match is_utf8 { - true => String::from_utf8_lossy(&file_name_raw).into(), - false => file_name_raw.clone().from_cp437().into(), + true => String::from_utf8_lossy(&block.file_name_raw).into(), + false => block.file_name_raw.clone().from_cp437().into(), }; let system: u8 = (version_made_by >> 8).try_into().unwrap(); @@ -817,8 +824,8 @@ impl ZipFileData { compressed_size: compressed_size.into(), uncompressed_size: uncompressed_size.into(), file_name, - file_name_raw: file_name_raw.into(), - extra_field: Some(Arc::new(extra_field)), + file_name_raw: block.file_name_raw.into(), + extra_field: Some(Arc::new(block.extra_field)), central_extra_field: None, file_comment: String::with_capacity(0).into_boxed_str(), // file comment is only available in the central directory // header_start and data start are not available, but also don't matter, since seeking is @@ -1077,6 +1084,33 @@ impl FixedSizeBlock for ZipLocalEntryBlock { ]; } +#[derive(Copy, Clone, Debug)] +#[repr(packed)] +pub(crate) struct ZipDataDescriptor { + magic: spec::Magic, + pub crc32: u32, + pub compressed_size: u32, + pub uncompressed_size: u32, +} + +impl FixedSizeBlock for crate::types::ZipDataDescriptor { + const MAGIC: spec::Magic = spec::Magic::DATA_DESCRIPTOR_SIGNATURE; + + #[inline(always)] + fn magic(self) -> spec::Magic { + self.magic + } + + const WRONG_MAGIC_ERROR: ZipError = ZipError::InvalidArchive("Invalid data descriptor"); + + to_and_from_le![ + (magic, spec::Magic), + (crc32, u32), + (compressed_size, u32), + (uncompressed_size, u32), + ]; +} + #[derive(Copy, Clone, Debug)] pub(crate) struct Zip64ExtraFieldBlock { magic: spec::ExtraFieldMagic, diff --git a/tests/data/data_descriptor_two_entries.zip b/tests/data/data_descriptor_two_entries.zip new file mode 100644 index 0000000000000000000000000000000000000000..f487cccb9dd07ef67a7371ae549dce8a2a35eb45 GIT binary patch literal 392 zcmWIWW@Zs#-~d9_dD9~ppnwNRb22C}WTfWgi12Kq3vxJ+Ev$Xgz#Q46wl@=U813-oZcr!AIFe7|{Y&FOiFtDT%#3I^H i$i{-)jcg^Vv8g~~iS|W+H!H}|3`{_{4oDvXaTox;c2n*E literal 0 HcmV?d00001 diff --git a/tests/deflate64.rs b/tests/deflate64.rs index b0cd95a95..e48dcf0d6 100644 --- a/tests/deflate64.rs +++ b/tests/deflate64.rs @@ -19,3 +19,105 @@ fn decompress_deflate64() { .expect("couldn't read encrypted and compressed file"); assert_eq!(include_bytes!("data/folder/binary.wmv"), &content[..]); } + +#[test] +fn decompress_deflate64_stream_no_data_descriptor() { + let mut v = Vec::new(); + v.extend_from_slice(include_bytes!("data/deflate64.zip")); + let mut stream = io::Cursor::new(v); + + let mut entry = zip::read::read_zipfile_from_seekablestream(&mut stream) + .expect("couldn't open test zip file") + .expect("did not find file entry in zip file"); + assert_eq!("binary.wmv", entry.name()); + + let mut content = Vec::new(); + entry + .read_to_end(&mut content) + .expect("couldn't read encrypted and compressed file"); + assert_eq!(include_bytes!("data/folder/binary.wmv"), &content[..]); +} + +#[test] +fn decompress_deflate64_stream_with_data_descriptor_sanity_check() { + let mut v = Vec::new(); + v.extend_from_slice(include_bytes!("data/data_descriptor.zip")); + let mut archive = ZipArchive::new(io::Cursor::new(v)).expect("couldn't open test zip file"); + + let mut file = archive + .by_name("hello.txt") + .expect("couldn't find file in archive"); + assert_eq!("hello.txt", file.name()); + + let mut content = Vec::new(); + file.read_to_end(&mut content) + .expect("couldn't read encrypted and compressed file"); + assert_eq!(b"Hello World\n", &content[..]); +} + +#[test] +fn decompress_deflate64_stream_with_data_descriptor() { + let mut v = Vec::new(); + v.extend_from_slice(include_bytes!("data/data_descriptor.zip")); + let mut stream = io::Cursor::new(v); + + let mut entry = zip::read::read_zipfile_from_seekablestream(&mut stream) + .expect("couldn't open test zip file") + .expect("did not find file entry in zip file"); + assert_eq!("hello.txt", entry.name()); + + let mut content = Vec::new(); + entry + .read_to_end(&mut content) + .expect("couldn't read encrypted and compressed file"); + assert_eq!(b"Hello World\n", &content[..]); +} + +#[test] +fn decompress_deflate64_stream_with_data_descriptor_continue() { + let mut v = Vec::new(); + v.extend_from_slice(include_bytes!("data/data_descriptor_two_entries.zip")); + let mut stream = io::Cursor::new(v); + + // First entry + + let mut entry = zip::read::read_zipfile_from_seekablestream(&mut stream) + .expect("couldn't open test zip file") + .expect("did not find file entry in zip file"); + assert_eq!("hello.txt", entry.name()); + + let mut content = Vec::new(); + entry + .read_to_end(&mut content) + .expect("couldn't read encrypted and compressed file"); + assert_eq!(b"Hello World\n", &content[..]); + + drop(entry); + + // Second entry + + zip::read::advance_stream_to_next_zipfile_start(&mut stream) + .expect("couldn't advance to next entry in zip file"); + + let mut entry = zip::read::read_zipfile_from_seekablestream(&mut stream) + .expect("couldn't open test zip file") + .expect("did not find file entry in zip file"); + assert_eq!("world.txt", entry.name()); + + let mut content = Vec::new(); + entry + .read_to_end(&mut content) + .expect("couldn't read encrypted and compressed file"); + assert_eq!(b"STUFF\n", &content[..]); + + drop(entry); + + // No more entries + + let entry = zip::read::advance_stream_to_next_zipfile_start(&mut stream) + .expect("couldn't advance to next entry in zip file"); + match entry { + None => (), + _ => assert!(false, "expected no more entries"), + }; +} From 5fe2ea400bb9e642226a183f8ca8d5201f97c13e Mon Sep 17 00:00:00 2001 From: hennickc Date: Wed, 10 Sep 2025 16:45:55 -0700 Subject: [PATCH 2/7] Fix merge --- src/read.rs | 89 +++++++++++++++++++++++++--------------------------- src/types.rs | 18 +++++++++-- 2 files changed, 57 insertions(+), 50 deletions(-) diff --git a/src/read.rs b/src/read.rs index c98038439..18f26164c 100644 --- a/src/read.rs +++ b/src/read.rs @@ -9,8 +9,7 @@ use crate::extra_fields::{ExtendedTimestamp, ExtraField, Ntfs}; use crate::read::zip_archive::{Shared, SharedBuilder}; use crate::result::invalid; use crate::result::{ZipError, ZipResult}; -use crate::spec::{self, DataAndPosition, - FixedSizeBlock, Magic, Zip32CentralDirectoryEnd}; +use crate::spec::{self, CentralDirectoryEndInfo, DataAndPosition, FixedSizeBlock, Magic, Pod}; use crate::types::{ AesMode, AesVendorVersion, DateTime, System, ZipCentralEntryBlock, ZipDataDescriptor, ZipFileData, ZipLocalEntryBlock, ZipLocalEntryBlockAndFields, @@ -1909,6 +1908,34 @@ impl Drop for ZipFile<'_, R> { } } +/// Read ZipFile structures from a non-seekable reader. +/// +/// This is an alternative method to read a zip file. If possible, use the ZipArchive functions +/// as some information will be missing when reading this manner. +/// +/// Reads a file header from the start of the stream. Will return `Ok(Some(..))` if a file is +/// present at the start of the stream. Returns `Ok(None)` if the start of the central directory +/// is encountered. No more files should be read after this. +/// +/// The Drop implementation of ZipFile ensures that the reader will be correctly positioned after +/// the structure is done. +/// +/// Missing fields are: +/// * `comment`: set to an empty string +/// * `data_start`: set to 0 +/// * `external_attributes`: `unix_mode()`: will return None +pub fn read_zipfile_from_stream(reader: &mut R) -> ZipResult>> { + let block = read_local_fileblock(reader)?; + let block = match block { + Some(block) => block, + None => return Ok(None), + }; + + // we can't look for a data descriptor since we can't seek back + // TODO: provide a method that buffers the read data and allows seeking back + read_zipfile_from_fileblock(block, reader, None) +} + /// A filter that determines whether an entry should be ignored when searching /// for the root directory of a Zip archive. /// @@ -1950,11 +1977,11 @@ pub fn root_dir_common_filter(path: &Path) -> bool { if path.components().count() == 1 && path.file_name().is_some_and(|file_name| { - COMMON_FILTER_ROOT_FILES - .iter() - .map(OsStr::new) - .any(|cmp| cmp == file_name) - }) + COMMON_FILTER_ROOT_FILES + .iter() + .map(OsStr::new) + .any(|cmp| cmp == file_name) + }) { return false; } @@ -1994,12 +2021,12 @@ fn read_local_fileblock(reader: &mut R) -> ZipResult( +fn read_zipfile_from_fileblock( block: ZipLocalEntryBlockAndFields, - reader: &'a mut R, + reader: &mut R, data_descriptor: Option, -) -> ZipResult>> { - let mut result = ZipFileData::from_local_block(block, data_descriptor)?; +) -> ZipResult>> { + let mut result = ZipFileData::from_local_block(block, data_descriptor, reader)?; match crate::read::parse_extra_field(&mut result) { Ok(..) | Err(ZipError::Io(..)) => {} @@ -2029,37 +2056,6 @@ fn read_zipfile_from_fileblock<'a, R: Read>( })) } -/// Read ZipFile structures from a non-seekable reader. -/// -/// This is an alternative method to read a zip file. If possible, use the ZipArchive functions -/// as some information will be missing when reading this manner. -/// -/// Reads a file header from the start of the stream. Will return `Ok(Some(..))` if a file is -/// present at the start of the stream. Returns `Ok(None)` if the start of the central directory -/// is encountered. No more files should be read after this. -/// -/// The Drop implementation of ZipFile ensures that the reader will be correctly positioned after -/// the structure is done. -/// -/// This method will not find a ZipFile entry if the zip file uses data descriptors. -/// In that case you could use [read_zipfile_from_seekable_stream] -/// -/// Missing fields are: -/// * `comment`: set to an empty string -/// * `data_start`: set to 0 -/// * `external_attributes`: `unix_mode()`: will return None -pub fn read_zipfile_from_stream<'a, R: Read>(reader: &'a mut R) -> ZipResult>> { - let block = read_local_fileblock(reader)?; - let block = match block { - Some(block) => block, - None => return Ok(None), - }; - - // we can't look for a data descriptor since we can't seek back - // TODO: provide a method that buffers the read data and allows seeking back - read_zipfile_from_fileblock(block, reader, None) -} - /// Read ZipFile structures from a seekable reader. /// /// This is an alternative method to read a zip file. If possible, use the ZipArchive functions @@ -2095,7 +2091,7 @@ pub fn read_zipfile_from_stream<'a, R: Read>(reader: &'a mut R) -> ZipResult( reader: &mut S, -) -> ZipResult> { +) -> ZipResult>> { let local_entry_block = read_local_fileblock(reader)?; let local_entry_block = match local_entry_block { Some(local_entry_block) => local_entry_block, @@ -2134,15 +2130,14 @@ pub fn read_zipfile_from_seekablestream( reader.seek(SeekFrom::Current(-4))?; // go back to the start of the data descriptor read_count_total -= 4; - let mut data_descriptor_block = [0u8; mem::size_of::()]; - reader.read_exact(&mut data_descriptor_block)?; - let data_descriptor_block: Box<[u8]> = data_descriptor_block.into(); + let mut data_descriptor = ZipDataDescriptor::zeroed(); + reader.read_exact(data_descriptor.as_bytes_mut())?; // seek back to data end reader.seek(SeekFrom::Current( -(mem::size_of::() as i64), ))?; - let data_descriptor = ZipDataDescriptor::interpret(&data_descriptor_block)?; + let data_descriptor = ZipDataDescriptor::from_le(data_descriptor); // check if the data descriptor is indeed valid for the read data if data_descriptor.compressed_size == read_count_total as u32 { diff --git a/src/types.rs b/src/types.rs index 96e042ec0..2eb7fbf69 100644 --- a/src/types.rs +++ b/src/types.rs @@ -2,6 +2,7 @@ use crate::cp437::FromCp437; use crate::write::{FileOptionExtension, FileOptions}; use path::{Component, Path, PathBuf}; +use std::borrow::Cow; use std::cmp::Ordering; use std::ffi::OsStr; use std::fmt; @@ -757,9 +758,10 @@ impl ZipFileData { local_block } - pub(crate) fn from_local_block( + pub(crate) fn from_local_block( block: ZipLocalEntryBlockAndFields, data_descriptor: Option, + reader: &mut R, ) -> ZipResult { let ZipLocalEntryBlock { // magic, @@ -771,6 +773,8 @@ impl ZipFileData { mut crc32, mut compressed_size, mut uncompressed_size, + file_name_length, + extra_field_length, .. } = block.block; @@ -802,6 +806,13 @@ impl ZipFileData { /* flags & (1 << 1) != 0 */ let is_utf8: bool = flags & (1 << 11) != 0; let compression_method = crate::CompressionMethod::parse_from_u16(compression_method); + let file_name_length: usize = file_name_length.into(); + let extra_field_length: usize = extra_field_length.into(); + + let mut file_name_raw = vec![0u8; file_name_length]; + reader.read_exact(&mut file_name_raw)?; + let mut extra_field = vec![0u8; extra_field_length]; + reader.read_exact(&mut extra_field)?; let file_name: Box = match is_utf8 { true => String::from_utf8_lossy(&block.file_name_raw).into(), @@ -1085,13 +1096,14 @@ impl FixedSizeBlock for ZipLocalEntryBlock { } #[derive(Copy, Clone, Debug)] -#[repr(packed)] +#[repr(packed, C)] pub(crate) struct ZipDataDescriptor { magic: spec::Magic, pub crc32: u32, pub compressed_size: u32, pub uncompressed_size: u32, } +unsafe impl Pod for ZipDataDescriptor {} impl FixedSizeBlock for crate::types::ZipDataDescriptor { const MAGIC: spec::Magic = spec::Magic::DATA_DESCRIPTOR_SIGNATURE; @@ -1101,7 +1113,7 @@ impl FixedSizeBlock for crate::types::ZipDataDescriptor { self.magic } - const WRONG_MAGIC_ERROR: ZipError = ZipError::InvalidArchive("Invalid data descriptor"); + const WRONG_MAGIC_ERROR: ZipError = ZipError::InvalidArchive(Cow::Borrowed("Invalid data descriptor")); to_and_from_le![ (magic, spec::Magic), From 4b88b98366df55989151052401c6e0c96d2f2a62 Mon Sep 17 00:00:00 2001 From: hennickc Date: Wed, 10 Sep 2025 16:46:46 -0700 Subject: [PATCH 3/7] Fix build warnings --- src/read.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/read.rs b/src/read.rs index 18f26164c..42f549890 100644 --- a/src/read.rs +++ b/src/read.rs @@ -2025,7 +2025,7 @@ fn read_zipfile_from_fileblock( block: ZipLocalEntryBlockAndFields, reader: &mut R, data_descriptor: Option, -) -> ZipResult>> { +) -> ZipResult>> { let mut result = ZipFileData::from_local_block(block, data_descriptor, reader)?; match crate::read::parse_extra_field(&mut result) { @@ -2091,7 +2091,7 @@ fn read_zipfile_from_fileblock( /// * `external_attributes`: `unix_mode()`: will return None pub fn read_zipfile_from_seekablestream( reader: &mut S, -) -> ZipResult>> { +) -> ZipResult>> { let local_entry_block = read_local_fileblock(reader)?; let local_entry_block = match local_entry_block { Some(local_entry_block) => local_entry_block, From a0ced1939a9f9a6e22e1b95e2aa89cc601debaa6 Mon Sep 17 00:00:00 2001 From: hennickc Date: Wed, 10 Sep 2025 16:47:12 -0700 Subject: [PATCH 4/7] style: cargo fmt --all --- src/read.rs | 2 +- src/types.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/read.rs b/src/read.rs index 42f549890..f83a83ffa 100644 --- a/src/read.rs +++ b/src/read.rs @@ -1925,7 +1925,7 @@ impl Drop for ZipFile<'_, R> { /// * `data_start`: set to 0 /// * `external_attributes`: `unix_mode()`: will return None pub fn read_zipfile_from_stream(reader: &mut R) -> ZipResult>> { - let block = read_local_fileblock(reader)?; + let block = read_local_fileblock(reader)?; let block = match block { Some(block) => block, None => return Ok(None), diff --git a/src/types.rs b/src/types.rs index 2eb7fbf69..8c9aa2a5e 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1113,7 +1113,8 @@ impl FixedSizeBlock for crate::types::ZipDataDescriptor { self.magic } - const WRONG_MAGIC_ERROR: ZipError = ZipError::InvalidArchive(Cow::Borrowed("Invalid data descriptor")); + const WRONG_MAGIC_ERROR: ZipError = + ZipError::InvalidArchive(Cow::Borrowed("Invalid data descriptor")); to_and_from_le![ (magic, spec::Magic), From 2baf4cbb949dd73259562566da72cb67275aab0c Mon Sep 17 00:00:00 2001 From: hennickc Date: Wed, 10 Sep 2025 16:54:30 -0700 Subject: [PATCH 5/7] style: fix a Clippy lint in integration test --- tests/deflate64.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/deflate64.rs b/tests/deflate64.rs index e48dcf0d6..e9e14480b 100644 --- a/tests/deflate64.rs +++ b/tests/deflate64.rs @@ -118,6 +118,6 @@ fn decompress_deflate64_stream_with_data_descriptor_continue() { .expect("couldn't advance to next entry in zip file"); match entry { None => (), - _ => assert!(false, "expected no more entries"), + _ => unreachable!("expected no more entries"), }; } From 90178d7c0a3b6c7f546a10eddbcce5fa465fafd9 Mon Sep 17 00:00:00 2001 From: 0xCCF4 <0xCCF4-git-commit@lenzentech.de> Date: Thu, 11 Sep 2025 17:26:56 +0200 Subject: [PATCH 6/7] fix: offset problem when decoding a zip file with stream reading Signed-off-by: 0xCCF4 <0xCCF4-git-commit@lenzentech.de> --- src/read.rs | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/src/read.rs b/src/read.rs index f83a83ffa..7f1dc7e0d 100644 --- a/src/read.rs +++ b/src/read.rs @@ -1924,7 +1924,16 @@ impl Drop for ZipFile<'_, R> { /// * `comment`: set to an empty string /// * `data_start`: set to 0 /// * `external_attributes`: `unix_mode()`: will return None -pub fn read_zipfile_from_stream(reader: &mut R) -> ZipResult>> { +/// +/// Security considerations: +/// An attacker may craft an ZIP file that decodes normally using the `ZipArchive` struct +/// or any other ZIP file reader but decode to a different set of files when using this function +/// to read the stream. This is due to the design choice of the ZIP format, placing the file +/// index at the end of a ZIP file. When reading it incrementally different files may be seen, +/// which are not listed in the file index. +pub fn read_zipfile_from_stream<'a, R: Read>(reader: &'a mut R) -> ZipResult>> { + + let block = read_local_fileblock(reader)?; let block = match block { Some(block) => block, @@ -2089,9 +2098,17 @@ fn read_zipfile_from_fileblock( /// * `comment`: set to an empty string /// * `data_start`: set to 0 /// * `external_attributes`: `unix_mode()`: will return None +/// +/// Security considerations: +/// An attacker may craft an ZIP file that decodes normally using the `ZipArchive` struct +/// or any other ZIP file reader but decode to a different set of files when using this function +/// to read the stream. This is due to the design choice of the ZIP format, placing the file +/// index at the end of a ZIP file. When reading it incrementally different files may be seen, +/// which are not listed in the file index. pub fn read_zipfile_from_seekablestream( reader: &mut S, ) -> ZipResult>> { + let initial_stream_position = reader.stream_position()?; let local_entry_block = read_local_fileblock(reader)?; let local_entry_block = match local_entry_block { Some(local_entry_block) => local_entry_block, @@ -2104,22 +2121,22 @@ pub fn read_zipfile_from_seekablestream( if using_data_descriptor { // we must find the data descriptor to get the compressed size - let mut read_count_total: usize = 0; + let stream_position_compressed_file_start = reader.stream_position()?; loop { let read_count = reader.read(&mut read_buffer)?; if read_count == 0 { return Ok(None); // EOF } - read_count_total += read_count; + let stream_position = reader.stream_position()?; - if read_count_total > u32::MAX as usize { + if stream_position.saturating_sub(initial_stream_position) > u32::MAX as u64 { return Ok(None); // file too large } shift_register = shift_register >> 8 | (read_buffer[0] as u32) << 24; - if read_count_total < 4 { + if stream_position < stream_position_compressed_file_start+4 { continue; } @@ -2128,7 +2145,6 @@ pub fn read_zipfile_from_seekablestream( { // found a potential data descriptor reader.seek(SeekFrom::Current(-4))?; // go back to the start of the data descriptor - read_count_total -= 4; let mut data_descriptor = ZipDataDescriptor::zeroed(); reader.read_exact(data_descriptor.as_bytes_mut())?; @@ -2140,12 +2156,16 @@ pub fn read_zipfile_from_seekablestream( let data_descriptor = ZipDataDescriptor::from_le(data_descriptor); // check if the data descriptor is indeed valid for the read data - if data_descriptor.compressed_size == read_count_total as u32 { - // todo also check crc - // valid data descriptor + let compressed_byte_count_read_sofar = (stream_position-4)-stream_position_compressed_file_start; + if data_descriptor.compressed_size == compressed_byte_count_read_sofar as u32 { + // todo also check crc, current behavior when a the data descriptor magic number is encountered to to bail out + // and just try decompressing a file - if the found magic number was just a coincidence (not the actual data descriptor) + // then checking the crc will yield an error + // + // it would be best to check the crc already here to prevent this in the future // seek back to data start - reader.seek(SeekFrom::Current(-(read_count_total as i64)))?; + reader.seek(SeekFrom::Start(initial_stream_position + mem::size_of::() as u64))?; return read_zipfile_from_fileblock( local_entry_block, @@ -2156,7 +2176,6 @@ pub fn read_zipfile_from_seekablestream( // data descriptor is invalid reader.seek(SeekFrom::Current(4))?; // skip the magic number - read_count_total += 4; // continue data descriptor searching } @@ -2164,6 +2183,7 @@ pub fn read_zipfile_from_seekablestream( } } else { // dont using a data descriptor + reader.seek(SeekFrom::Start(initial_stream_position + mem::size_of::() as u64))?; read_zipfile_from_fileblock(local_entry_block, reader, None) } } From 86c69195cd76a54e2d74c8d9e884c44aeec2452d Mon Sep 17 00:00:00 2001 From: 0xCCF4 <0xCCF4-git-commit@lenzentech.de> Date: Thu, 11 Sep 2025 17:45:19 +0200 Subject: [PATCH 7/7] style: fix cargo fmt ci errors Signed-off-by: 0xCCF4 <0xCCF4-git-commit@lenzentech.de> --- src/read.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/read.rs b/src/read.rs index 48bfe2e55..fc16ca03a 100644 --- a/src/read.rs +++ b/src/read.rs @@ -1942,8 +1942,6 @@ impl Drop for ZipFile<'_, R> { /// index at the end of a ZIP file. When reading it incrementally different files may be seen, /// which are not listed in the file index. pub fn read_zipfile_from_stream(reader: &mut R) -> ZipResult>> { - - let block = read_local_fileblock(reader)?; let block = match block { Some(block) => block, @@ -2146,7 +2144,7 @@ pub fn read_zipfile_from_seekablestream( shift_register = shift_register >> 8 | (read_buffer[0] as u32) << 24; - if stream_position < stream_position_compressed_file_start+4 { + if stream_position < stream_position_compressed_file_start + 4 { continue; } @@ -2166,7 +2164,8 @@ pub fn read_zipfile_from_seekablestream( let data_descriptor = ZipDataDescriptor::from_le(data_descriptor); // check if the data descriptor is indeed valid for the read data - let compressed_byte_count_read_sofar = (stream_position-4)-stream_position_compressed_file_start; + let compressed_byte_count_read_sofar = + (stream_position - 4) - stream_position_compressed_file_start; if data_descriptor.compressed_size == compressed_byte_count_read_sofar as u32 { // todo also check crc, current behavior when a the data descriptor magic number is encountered to to bail out // and just try decompressing a file - if the found magic number was just a coincidence (not the actual data descriptor) @@ -2175,7 +2174,9 @@ pub fn read_zipfile_from_seekablestream( // it would be best to check the crc already here to prevent this in the future // seek back to data start - reader.seek(SeekFrom::Start(initial_stream_position + mem::size_of::() as u64))?; + reader.seek(SeekFrom::Start( + initial_stream_position + mem::size_of::() as u64, + ))?; return read_zipfile_from_fileblock( local_entry_block, @@ -2193,7 +2194,9 @@ pub fn read_zipfile_from_seekablestream( } } else { // dont using a data descriptor - reader.seek(SeekFrom::Start(initial_stream_position + mem::size_of::() as u64))?; + reader.seek(SeekFrom::Start( + initial_stream_position + mem::size_of::() as u64, + ))?; read_zipfile_from_fileblock(local_entry_block, reader, None) } }