diff --git a/src/read.rs b/src/read.rs
index 68f6331f7..fc16ca03a 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -9,10 +9,10 @@ use crate::extra_fields::{ExtendedTimestamp, ExtraField, Ntfs};
 use crate::read::zip_archive::{Shared, SharedBuilder};
 use crate::result::invalid;
 use crate::result::{ZipError, ZipResult};
-use crate::spec::{self, CentralDirectoryEndInfo, DataAndPosition, FixedSizeBlock, Pod};
+use crate::spec::{self, CentralDirectoryEndInfo, DataAndPosition, FixedSizeBlock, Magic, Pod};
 use crate::types::{
-    AesMode, AesVendorVersion, DateTime, System, ZipCentralEntryBlock, ZipFileData,
-    ZipLocalEntryBlock,
+    AesMode, AesVendorVersion, DateTime, System, ZipCentralEntryBlock, ZipDataDescriptor,
+    ZipFileData, ZipLocalEntryBlock, ZipLocalEntryBlockAndFields,
 };
 use crate::write::SimpleFileOptions;
 use crate::zipcrypto::{ZipCryptoReader, ZipCryptoReaderValid, ZipCryptoValidator};
@@ -1934,51 +1934,23 @@ impl<R: Read> Drop for ZipFile<'_, R> {
 /// * `comment`: set to an empty string
 /// * `data_start`: set to 0
 /// * `external_attributes`: `unix_mode()`: will return None
+///
+/// Security considerations:
+/// An attacker may craft a ZIP file that decodes normally using the `ZipArchive` struct
+/// or any other ZIP file reader, but decodes to a different set of files when this function is
+/// used to read the stream. This is a consequence of the ZIP format's design choice of placing
+/// the file index at the end of the archive. When reading incrementally, entries may be
+/// encountered that are not listed in that index.
 pub fn read_zipfile_from_stream<R: Read>(reader: &mut R) -> ZipResult<Option<ZipFile<'_, R>>> {
-    // We can't use the typical ::parse() method, as we follow separate code paths depending on the
-    // "magic" value (since the magic value will be from the central directory header if we've
-    // finished iterating over all the actual files).
-    /* TODO: smallvec? */
-
-    let mut block = ZipLocalEntryBlock::zeroed();
-    reader.read_exact(block.as_bytes_mut())?;
-
-    match block.magic().from_le() {
-        spec::Magic::LOCAL_FILE_HEADER_SIGNATURE => (),
-        spec::Magic::CENTRAL_DIRECTORY_HEADER_SIGNATURE => return Ok(None),
-        _ => return Err(ZipLocalEntryBlock::WRONG_MAGIC_ERROR),
-    }
-
-    let block = block.from_le();
-
-    let mut result = ZipFileData::from_local_block(block, reader)?;
-
-    match parse_extra_field(&mut result) {
-        Ok(..) | Err(ZipError::Io(..)) => {}
-        Err(e) => return Err(e),
-    }
-
-    let limit_reader = reader.take(result.compressed_size);
-
-    let result_flags = result.flags;
-    let crypto_reader = make_crypto_reader(&result, limit_reader, None, None)?;
-    let ZipFileData {
-        crc32,
-        uncompressed_size,
-        compression_method,
-        ..
-    } = result;
+    let block = read_local_fileblock(reader)?;
+    let block = match block {
+        Some(block) => block,
+        None => return Ok(None),
+    };
 
-    Ok(Some(ZipFile {
-        data: Cow::Owned(result),
-        reader: make_reader(
-            compression_method,
-            uncompressed_size,
-            crc32,
-            crypto_reader,
-            result_flags,
-        )?,
-    }))
+    // we can't look for a data descriptor since we can't seek back
+    // TODO: provide a method that buffers the read data and allows seeking back
+    read_zipfile_from_fileblock(block, reader, None)
 }
 
 /// A filter that determines whether an entry should be ignored when searching
@@ -2034,6 +2006,284 @@ pub fn root_dir_common_filter(path: &Path) -> bool {
     true
 }
 
+fn read_local_fileblock<R: Read>(reader: &mut R) -> ZipResult<Option<ZipLocalEntryBlockAndFields>> {
+    // We can't use the typical ::parse() method, as we follow separate code paths depending on the
+    // "magic" value (since the magic value will be from the central directory header if we've
+    // finished iterating over all the actual files).
+    /* TODO: smallvec? */
+
+    let mut block = ZipLocalEntryBlock::zeroed();
+    reader.read_exact(block.as_bytes_mut())?;
+
+    match block.magic().from_le() {
+        spec::Magic::LOCAL_FILE_HEADER_SIGNATURE => (),
+        spec::Magic::CENTRAL_DIRECTORY_HEADER_SIGNATURE => return Ok(None),
+        _ => return Err(ZipLocalEntryBlock::WRONG_MAGIC_ERROR),
+    }
+
+    let block = block.from_le();
+
+    let file_name_length: usize = block.file_name_length.into();
+    let extra_field_length: usize = block.extra_field_length.into();
+
+    let mut file_name_raw = vec![0u8; file_name_length];
+    reader.read_exact(&mut file_name_raw)?;
+    let mut extra_field = vec![0u8; extra_field_length];
+    reader.read_exact(&mut extra_field)?;
+
+    Ok(Some(ZipLocalEntryBlockAndFields {
+        block,
+        file_name_raw,
+        extra_field,
+    }))
+}
+
+fn read_zipfile_from_fileblock<R: Read>(
+    block: ZipLocalEntryBlockAndFields,
+    reader: &mut R,
+    data_descriptor: Option<ZipDataDescriptor>,
+) -> ZipResult<Option<ZipFile<'_, R>>> {
+    let mut result = ZipFileData::from_local_block(block, data_descriptor, reader)?;
+
+    match crate::read::parse_extra_field(&mut result) {
+        Ok(..) | Err(ZipError::Io(..)) => {}
+        Err(e) => return Err(e),
+    }
+
+    let limit_reader = reader.take(result.compressed_size);
+
+    let result_flags = result.flags;
+    let crypto_reader = crate::read::make_crypto_reader(&result, limit_reader, None, None)?;
+    let ZipFileData {
+        crc32,
+        uncompressed_size,
+        compression_method,
+        ..
+    } = result;
+
+    Ok(Some(ZipFile {
+        data: Cow::Owned(result),
+        reader: crate::read::make_reader(
+            compression_method,
+            uncompressed_size,
+            crc32,
+            crypto_reader,
+            result_flags,
+        )?,
+    }))
+}
+
+/// Read ZipFile structures from a seekable reader.
+///
+/// This is an alternative method to read a zip file. If possible, use the ZipArchive functions
+/// instead, as some information will be missing when reading in this manner.
+///
+/// Reads a file header from the start of the stream. Will return `Ok(Some(..))` if a file is
+/// present at the start of the stream. Returns `Ok(None)` if the start of the central directory
+/// is encountered. No more files should be read after this.
+///
+/// The Drop implementation of ZipFile ensures that the reader will be correctly positioned after
+/// the structure is done.
+///
+/// This method will not find a ZipFile entry if the zip file uses central directory encryption.
+/// In that case, use the ZipArchive functions instead.
+///
+/// This method is preferable to ZipArchive in the special case where the underlying stream
+/// implementation would have to buffer all data read so far in order to provide seek-back
+/// support, and memory consumption must be kept small.
+/// This can be the case when reading a ZIP file B.zip nested within a ZIP file A.zip that is
+/// stored on disk. Since A.zip is seekable when stored as a file, it can be read using
+/// ZipArchive, and the content of B.zip can then be read through a decompressing reader.
+/// The problem is that the decompressing reader is not seekable (due to the decompression).
+/// To read the contents of B.zip without extracting A.zip to disk, B.zip has to be buffered in
+/// RAM, and when ZipArchive is used to read B.zip from RAM, the whole of B.zip must be buffered,
+/// because the central directory of B.zip is located at the end of the file.
+/// This method instead reads B.zip from the start of the file and returns the first file entry
+/// found in B.zip.
+/// After this function has executed and the returned ZipFile has been dropped, the reader is
+/// positioned at the end of that file entry. Since this function never seeks back to before the
+/// position the stream had when the function was called, the underlying stream implementation
+/// may discard, once the ZipFile is dropped, all buffered data before the current position of
+/// the stream.
+/// Summarizing: in the given scenario, this method does not have to buffer the whole of B.zip
+/// in RAM, but only the current file entry.
+///
+/// Missing fields are:
+/// * `comment`: set to an empty string
+/// * `data_start`: set to 0
+/// * `external_attributes`: `unix_mode()`: will return None
+///
+/// Security considerations:
+/// An attacker may craft a ZIP file that decodes normally using the `ZipArchive` struct
+/// or any other ZIP file reader, but decodes to a different set of files when this function is
+/// used to read the stream. This is a consequence of the ZIP format's design choice of placing
+/// the file index at the end of the archive. When reading incrementally, entries may be
+/// encountered that are not listed in that index.
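+///
+/// # Example
+///
+/// A minimal sketch (not a complete program) of reading every entry from a seekable stream with
+/// this function together with [advance_stream_to_next_zipfile_start], which skips over the data
+/// descriptor, if any, that follows an entry; `nested.zip` is just a placeholder path:
+///
+/// ```no_run
+/// use std::fs::File;
+/// use std::io::{BufReader, Read};
+///
+/// # fn main() -> zip::result::ZipResult<()> {
+/// let mut reader = BufReader::new(File::open("nested.zip")?);
+/// loop {
+///     match zip::read::read_zipfile_from_seekablestream(&mut reader)? {
+///         Some(mut entry) => {
+///             let mut contents = Vec::new();
+///             entry.read_to_end(&mut contents)?;
+///             println!("{}: {} bytes", entry.name(), contents.len());
+///         }
+///         // `None` means the start of the central directory was reached.
+///         None => break,
+///     }
+///     // Position the stream at the next local file header (or stop at the end of the archive).
+///     if zip::read::advance_stream_to_next_zipfile_start(&mut reader)?.is_none() {
+///         break;
+///     }
+/// }
+/// # Ok(())
+/// # }
+/// ```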
+pub fn read_zipfile_from_seekablestream<S: Read + Seek>(
+    reader: &mut S,
+) -> ZipResult<Option<ZipFile<'_, S>>> {
+    let initial_stream_position = reader.stream_position()?;
+    let local_entry_block = read_local_fileblock(reader)?;
+    let local_entry_block = match local_entry_block {
+        Some(local_entry_block) => local_entry_block,
+        None => return Ok(None),
+    };
+
+    let using_data_descriptor = local_entry_block.block.flags & (1 << 3) == 1 << 3;
+    let mut shift_register: u32 = 0;
+    let mut read_buffer = [0u8; 1];
+    if using_data_descriptor {
+        // we must find the data descriptor to get the compressed size
+
+        let stream_position_compressed_file_start = reader.stream_position()?;
+        loop {
+            let read_count = reader.read(&mut read_buffer)?;
+            if read_count == 0 {
+                return Ok(None); // EOF
+            }
+
+            let stream_position = reader.stream_position()?;
+
+            if stream_position.saturating_sub(initial_stream_position) > u32::MAX as u64 {
+                return Ok(None); // file too large
+            }
+
+            shift_register = shift_register >> 8 | (read_buffer[0] as u32) << 24;
+
+            if stream_position < stream_position_compressed_file_start + 4 {
+                continue;
+            }
+
+            if spec::Magic::from_le_bytes(shift_register.to_le_bytes())
+                == spec::Magic::DATA_DESCRIPTOR_SIGNATURE
+            {
+                // found a potential data descriptor
+                reader.seek(SeekFrom::Current(-4))?; // go back to the start of the data descriptor
+
+                let mut data_descriptor = ZipDataDescriptor::zeroed();
+                reader.read_exact(data_descriptor.as_bytes_mut())?;
+                // seek back to data end
+                reader.seek(SeekFrom::Current(
+                    -(mem::size_of::<ZipDataDescriptor>() as i64),
+                ))?;
+
+                let data_descriptor = ZipDataDescriptor::from_le(data_descriptor);
+
+                // check whether the data descriptor is indeed valid for the data read so far
+                let compressed_byte_count_read_sofar =
+                    (stream_position - 4) - stream_position_compressed_file_start;
+                if data_descriptor.compressed_size == compressed_byte_count_read_sofar as u32 {
+                    // TODO: also check the CRC. The current behavior, once a data descriptor
+                    // magic number is encountered, is to bail out and just try decompressing the
+                    // file - if the magic number found was just a coincidence (not the actual
+                    // data descriptor), then checking the CRC will yield an error.
+                    //
+                    // It would be best to already check the CRC here to prevent this in the future.
+
+                    // seek back to data start
+                    reader.seek(SeekFrom::Start(
+                        initial_stream_position + mem::size_of::<ZipLocalEntryBlock>() as u64,
+                    ))?;
+
+                    return read_zipfile_from_fileblock(
+                        local_entry_block,
+                        reader,
+                        Some(data_descriptor),
+                    );
+                } else {
+                    // data descriptor is invalid
+
+                    reader.seek(SeekFrom::Current(4))?; // skip the magic number
+
+                    // continue searching for the data descriptor
+                }
+            }
+        }
+    } else {
+        // not using a data descriptor
+        reader.seek(SeekFrom::Start(
+            initial_stream_position + mem::size_of::<ZipLocalEntryBlock>() as u64,
+        ))?;
+        read_zipfile_from_fileblock(local_entry_block, reader, None)
+    }
+}
+
+/// Advance the stream to the next ZIP magic number (local file header, central directory header,
+/// or data descriptor) and return that magic number together with the number of bytes read since
+/// the call to this function.
+///
+/// This function is useful in combination with [read_zipfile_from_seekablestream] to read multiple
+/// zip files from a single stream.
+/// This function will only ever seek back 4 bytes, and never to before the position the stream
+/// had when the function was called.
+///
+/// The stream will be positioned at the start of the magic number.
+///
+/// Returns the found magic number and the number of bytes read since the call to this function.
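+///
+/// As a standalone illustration of the sliding four-byte signature scan used here (a sketch with
+/// made-up sample bytes, not code from this crate):
+///
+/// ```
+/// // Scan `data` for the little-endian data descriptor signature 0x08074b50 ("PK\x07\x08").
+/// let data: &[u8] = b"....PK\x07\x08rest";
+/// let target: u32 = 0x08074b50;
+/// let mut shift_register: u32 = 0;
+/// let mut found_at = None;
+/// for (i, &byte) in data.iter().enumerate() {
+///     // The oldest byte ends up lowest, so the register equals the last four bytes read,
+///     // interpreted as a little-endian u32.
+///     shift_register = shift_register >> 8 | (byte as u32) << 24;
+///     if i >= 3 && shift_register == target {
+///         found_at = Some(i - 3); // offset of the first signature byte
+///         break;
+///     }
+/// }
+/// assert_eq!(found_at, Some(4));
+/// ```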
+fn advance_stream_to_next_magic<S: Read + Seek>(
+    reader: &mut S,
+) -> ZipResult<Option<(Magic, usize)>> {
+    let mut shift_register: u32 = 0;
+    let mut read_buffer = [0u8; 1];
+
+    let mut read_count_total: usize = 0;
+    loop {
+        let read_count = reader.read(&mut read_buffer)?;
+
+        if read_count == 0 {
+            return Ok(None); // EOF
+        }
+
+        read_count_total += read_count;
+
+        shift_register = shift_register >> 8 | (read_buffer[0] as u32) << 24;
+        let magic = spec::Magic::from_le_bytes(shift_register.to_le_bytes());
+
+        if read_count_total < 4 {
+            continue;
+        }
+
+        if magic == spec::Magic::LOCAL_FILE_HEADER_SIGNATURE
+            || magic == spec::Magic::CENTRAL_DIRECTORY_HEADER_SIGNATURE
+            || magic == spec::Magic::DATA_DESCRIPTOR_SIGNATURE
+        {
+            // found a potential magic number
+            reader.seek(SeekFrom::Current(-4))?; // go back to the start of the magic number
+            return Ok(Some((magic, read_count_total - 4)));
+        }
+    }
+}
+
+/// Like [advance_stream_to_next_magic], but advances until the given Magic is encountered.
+fn advance_stream_to_next_magic_start<S: Read + Seek>(
+    reader: &mut S,
+    target_magic: Magic,
+) -> ZipResult<Option<usize>> {
+    let mut bytes_read_total = 0;
+
+    loop {
+        match advance_stream_to_next_magic(reader)? {
+            Some((magic, bytes_read)) => {
+                bytes_read_total += bytes_read;
+                if magic == target_magic {
+                    return Ok(Some(bytes_read_total));
+                } else {
+                    reader.seek(SeekFrom::Current(1))?;
+                    bytes_read_total += 1;
+                }
+            }
+            None => return Ok(None),
+        }
+    }
+}
+
+/// Advance the stream to the next zip file start (local file header start) and return the number
+/// of bytes read since the call to this function.
+///
+/// This function is useful in combination with [read_zipfile_from_seekablestream] to read multiple
+/// zip files from a single stream.
+/// This function will only ever seek back 4 bytes, and never to before the position the stream
+/// had when the function was called.
+///
+/// The stream will be positioned at the zip file start (local file header start).
+///
+/// Returns the number of bytes read since the call to this function.
+///
+/// Will return None if the end of the stream is reached.
+pub fn advance_stream_to_next_zipfile_start<S: Read + Seek>(
+    reader: &mut S,
+) -> ZipResult<Option<usize>> {
+    advance_stream_to_next_magic_start(reader, spec::Magic::LOCAL_FILE_HEADER_SIGNATURE)
+}
+
 #[cfg(feature = "chrono")]
 /// Generate a `SystemTime` from a `DateTime`.
 fn datetime_to_systemtime(datetime: &DateTime) -> Option<SystemTime> {
diff --git a/src/types.rs b/src/types.rs
index a076d1f4f..8c9aa2a5e 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -2,6 +2,7 @@
 use crate::cp437::FromCp437;
 use crate::write::{FileOptionExtension, FileOptions};
 use path::{Component, Path, PathBuf};
+use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::ffi::OsStr;
 use std::fmt;
@@ -545,6 +546,13 @@ pub struct ZipFileData {
     pub extra_fields: Vec<ExtraField>,
 }
 
+// contains the block and the following fields as read from the file
+pub(crate) struct ZipLocalEntryBlockAndFields {
+    pub(crate) block: ZipLocalEntryBlock,
+    pub(crate) file_name_raw: Vec<u8>,
+    pub(crate) extra_field: Vec<u8>,
+}
+
 impl ZipFileData {
     /// Get the starting offset of the data of the compressed file
     pub fn data_start(&self, reader: &mut (impl Read + Seek + Sized)) -> ZipResult<u64> {
@@ -751,7 +759,8 @@ impl ZipFileData {
     }
 
     pub(crate) fn from_local_block<R: Read>(
-        block: ZipLocalEntryBlock,
+        block: ZipLocalEntryBlockAndFields,
+        data_descriptor: Option<ZipDataDescriptor>,
         reader: &mut R,
     ) -> ZipResult<Self> {
         let ZipLocalEntryBlock {
@@ -761,13 +770,13 @@
             compression_method,
             last_mod_time,
             last_mod_date,
-            crc32,
-            compressed_size,
-            uncompressed_size,
+            mut crc32,
+            mut compressed_size,
+            mut uncompressed_size,
            file_name_length,
             extra_field_length,
             ..
-        } = block;
+        } = block.block;
 
         let encrypted: bool = flags & 1 == 1;
         if encrypted {
@@ -780,9 +789,18 @@
         /* flags & (1 << 3) != 0 */
         let using_data_descriptor: bool = flags & (1 << 3) == 1 << 3;
         if using_data_descriptor {
-            return Err(ZipError::UnsupportedArchive(
-                "The file length is not available in the local header",
-            ));
+            match data_descriptor {
+                None => {
+                    return Err(ZipError::UnsupportedArchive(
+                        "The file uses a data descriptor, but the provided input stream is not seekable or the data descriptor is missing.",
+                    ));
+                }
+                Some(data_descriptor) => {
+                    uncompressed_size = data_descriptor.uncompressed_size;
+                    compressed_size = data_descriptor.compressed_size;
+                    crc32 = data_descriptor.crc32;
+                }
+            }
         }
 
         /* flags & (1 << 1) != 0 */
@@ -797,8 +815,8 @@
         reader.read_exact(&mut extra_field)?;
 
         let file_name: Box<str> = match is_utf8 {
-            true => String::from_utf8_lossy(&file_name_raw).into(),
-            false => file_name_raw.clone().from_cp437().into(),
+            true => String::from_utf8_lossy(&block.file_name_raw).into(),
+            false => block.file_name_raw.clone().from_cp437().into(),
         };
 
         let system: u8 = (version_made_by >> 8).try_into().unwrap();
@@ -817,8 +835,8 @@
             compressed_size: compressed_size.into(),
             uncompressed_size: uncompressed_size.into(),
             file_name,
-            file_name_raw: file_name_raw.into(),
-            extra_field: Some(Arc::new(extra_field)),
+            file_name_raw: block.file_name_raw.into(),
+            extra_field: Some(Arc::new(block.extra_field)),
             central_extra_field: None,
             file_comment: String::with_capacity(0).into_boxed_str(), // file comment is only available in the central directory
             // header_start and data start are not available, but also don't matter, since seeking is
@@ -1077,6 +1095,35 @@ impl FixedSizeBlock for ZipLocalEntryBlock {
     ];
 }
 
+#[derive(Copy, Clone, Debug)]
+#[repr(packed, C)]
+pub(crate) struct ZipDataDescriptor {
+    magic: spec::Magic,
+    pub crc32: u32,
+    pub compressed_size: u32,
+    pub uncompressed_size: u32,
+}
+unsafe impl Pod for ZipDataDescriptor {}
+
+impl FixedSizeBlock for crate::types::ZipDataDescriptor {
+    const MAGIC: spec::Magic = spec::Magic::DATA_DESCRIPTOR_SIGNATURE;
+
+    #[inline(always)]
+    fn magic(self) -> spec::Magic {
+        self.magic
+    }
+
+    const WRONG_MAGIC_ERROR: ZipError =
+        ZipError::InvalidArchive(Cow::Borrowed("Invalid data descriptor"));
+
+    to_and_from_le![
+        (magic, spec::Magic),
+        (crc32, u32),
+        (compressed_size, u32),
+        (uncompressed_size, u32),
+    ];
+}
+
 #[derive(Copy, Clone, Debug)]
 pub(crate) struct Zip64ExtraFieldBlock {
diff --git a/tests/data/data_descriptor_two_entries.zip b/tests/data/data_descriptor_two_entries.zip
new file mode 100644
index 000000000..f487cccb9
Binary files /dev/null and b/tests/data/data_descriptor_two_entries.zip differ
diff --git a/tests/deflate64.rs b/tests/deflate64.rs
index b0cd95a95..e9e14480b 100644
--- a/tests/deflate64.rs
+++ b/tests/deflate64.rs
@@ -19,3 +19,105 @@ fn decompress_deflate64() {
         .expect("couldn't read encrypted and compressed file");
     assert_eq!(include_bytes!("data/folder/binary.wmv"), &content[..]);
 }
+
+#[test]
+fn decompress_deflate64_stream_no_data_descriptor() {
+    let mut v = Vec::new();
+    v.extend_from_slice(include_bytes!("data/deflate64.zip"));
+    let mut stream = io::Cursor::new(v);
+
+    let mut entry = zip::read::read_zipfile_from_seekablestream(&mut stream)
+        .expect("couldn't open test zip file")
+        .expect("did not find file entry in zip file");
+    assert_eq!("binary.wmv", entry.name());
+
+    let mut content = Vec::new();
+    entry
+        .read_to_end(&mut content)
+        .expect("couldn't read encrypted and compressed file");
+    assert_eq!(include_bytes!("data/folder/binary.wmv"), &content[..]);
+}
+
+#[test]
+fn decompress_deflate64_stream_with_data_descriptor_sanity_check() {
+    let mut v = Vec::new();
+    v.extend_from_slice(include_bytes!("data/data_descriptor.zip"));
+    let mut archive = ZipArchive::new(io::Cursor::new(v)).expect("couldn't open test zip file");
+
+    let mut file = archive
+        .by_name("hello.txt")
+        .expect("couldn't find file in archive");
+    assert_eq!("hello.txt", file.name());
+
+    let mut content = Vec::new();
+    file.read_to_end(&mut content)
+        .expect("couldn't read encrypted and compressed file");
+    assert_eq!(b"Hello World\n", &content[..]);
+}
+
+#[test]
+fn decompress_deflate64_stream_with_data_descriptor() {
+    let mut v = Vec::new();
+    v.extend_from_slice(include_bytes!("data/data_descriptor.zip"));
+    let mut stream = io::Cursor::new(v);
+
+    let mut entry = zip::read::read_zipfile_from_seekablestream(&mut stream)
+        .expect("couldn't open test zip file")
+        .expect("did not find file entry in zip file");
+    assert_eq!("hello.txt", entry.name());
+
+    let mut content = Vec::new();
+    entry
+        .read_to_end(&mut content)
+        .expect("couldn't read encrypted and compressed file");
+    assert_eq!(b"Hello World\n", &content[..]);
+}
+
+#[test]
+fn decompress_deflate64_stream_with_data_descriptor_continue() {
+    let mut v = Vec::new();
+    v.extend_from_slice(include_bytes!("data/data_descriptor_two_entries.zip"));
+    let mut stream = io::Cursor::new(v);
+
+    // First entry
+
+    let mut entry = zip::read::read_zipfile_from_seekablestream(&mut stream)
+        .expect("couldn't open test zip file")
+        .expect("did not find file entry in zip file");
+    assert_eq!("hello.txt", entry.name());
+
+    let mut content = Vec::new();
+    entry
+        .read_to_end(&mut content)
+        .expect("couldn't read encrypted and compressed file");
+    assert_eq!(b"Hello World\n", &content[..]);
+
+    drop(entry);
+
+    // Second entry
+
+    zip::read::advance_stream_to_next_zipfile_start(&mut stream)
+        .expect("couldn't advance to next entry in zip file");
+
+    let mut entry = zip::read::read_zipfile_from_seekablestream(&mut stream)
+        .expect("couldn't open test zip file")
+        .expect("did not find file entry in zip file");
+    assert_eq!("world.txt", entry.name());
+
+    let mut content = Vec::new();
+    entry
+        .read_to_end(&mut content)
+        .expect("couldn't read encrypted and compressed file");
+    assert_eq!(b"STUFF\n", &content[..]);
+
+    drop(entry);
+
+    // No more entries
+
+    let entry = zip::read::advance_stream_to_next_zipfile_start(&mut stream)
+        .expect("couldn't advance to next entry in zip file");
+    match entry {
+        None => (),
+        _ => unreachable!("expected no more entries"),
+    };
+}