diff --git a/dissect/hypervisor/disk/c_vmdk.py b/dissect/hypervisor/disk/c_vmdk.py index 07f3f77..1356153 100644 --- a/dissect/hypervisor/disk/c_vmdk.py +++ b/dissect/hypervisor/disk/c_vmdk.py @@ -1,95 +1,50 @@ from __future__ import annotations -import struct - from dissect.cstruct import cstruct +# https://github.com/vmware/open-vmdk/blob/master/vmdk/vmware_vmdk.h vmdk_def = """ -typedef struct { - char magic[4]; // Magic "KDMV" LE - uint32 version; // Version - uint32 flags; // Flags - uint64 capacity; // The maximum data number of sectors (capacity) - uint64 grain_size; // The grain number of sectors - uint64 descriptor_offset; // The descriptor sector number - uint64 descriptor_size; // The descriptor number of sectors - uint32 num_grain_table_entries; // The number of grain table entries - uint64 secondary_grain_directory_offset; // The secondary grain directory sector number - uint64 primary_grain_directory_offset; // The primary grain directory sector number - uint64 overhead; // The metadata (overhead) number of sectors - uint8 is_dirty; // Value to indicate the VMDK was cleanly closed - char single_end_line_char; // The single end of line character - char non_end_line_char; // A non end of line character - char double_end_line_chars[2]; // The double end of line characters - uint16 compress_algorithm; // The compression method - char pad[433]; // Padding -} VMDKSparseExtentHeader; +typedef struct SparseExtentHeader { + uint32 magicNumber; + uint32 version; + uint32 flags; + uint64 capacity; + uint64 grainSize; + uint64 descriptorOffset; + uint64 descriptorSize; + uint32 numGTEsPerGT; + uint64 rgdOffset; + uint64 gdOffset; + uint64 overHead; + uint8 uncleanShutdown; + char singleEndLineChar; + char nonEndLineChar; + char doubleEndLineChar1; + char doubleEndLineChar2; + uint16 compressAlgorithm; + char pad[433]; +} SparseExtentHeader; typedef struct { - char magic[4]; // Magic "COWD" LE - uint32 version; // Version - uint32 flags; // Flags - uint32 capacity; // The maximum data number of sectors (capacity) - uint32 grain_size; // The grain number of sectors - uint32 primary_grain_directory_offset; // The primary grain directory sector number - uint32 num_grain_directory_entries; // The number of grain table entries - uint32 next_free_grain; // The next free grain - - //uint32 num_cylinders; // The number of cylinders - //uint32 num_heads; // The number of heads - //uint32 num_sectors; // The number of sectors - - //char parent_filename[1024]; // The parent filename - //uint32 parent_generation; // The parent generation - - //uint32 generation; // The generation - //char name[60]; // The name - //char description[512]; // The description - //uint32 saved_generation; // The saved generation - //uint64 reserved; // Reserved - //uint8 is_dirty; // Value to indicate the COWD was cleanly closed - //char padding[396]; // Padding -} COWDSparseExtentHeader; + uint64 lba; + uint32 cmpSize; +} SparseGrainLBAHeader; typedef struct { - uint64 magic; - uint64 version; - uint64 capacity; - uint64 grain_size; - uint64 grain_table_size; - uint64 flags; - uint64 reserved1; - uint64 reserved2; - uint64 reserved3; - uint64 reserved4; - uint64 volatile_header_offset; - uint64 volatile_header_size; - uint64 journal_header_offset; - uint64 journal_header_size; - uint64 journal_offset; - uint64 journal_size; - uint64 grain_directory_offset; - uint64 grain_directory_size; - uint64 grain_tables_offset; - uint64 grain_tables_size; - uint64 free_bitmap_offset; - uint64 free_bitmap_size; - uint64 backmap_offset; - uint64 backmap_size; - uint64 grains_offset; - uint64 grains_size; - uint8 pad[304]; -} VMDKSESparseConstHeader; + uint64 lba; + uint32 cmpSize; + uint32 type; +} SparseSpecialLBAHeader; typedef struct { - uint64 magic; - uint64 free_gt_number; - uint64 next_txn_seq_number; - uint64 replay_journal; - uint8 pad[480]; -} VMDKSESparseVolatileHeader; - -#define SPARSE_MAGICNUMBER 0x564D444B + uint64 numSectors; + uint32 size; + uint32 type; + char pad[496]; + char metadata[0]; +} SparseMetaDataMarker; + +#define SPARSE_MAGICNUMBER 0x564d444b /* VMDK */ #define SPARSE_VERSION_INCOMPAT_FLAGS 3 #define SPARSE_GTE_EMPTY 0x00000000 #define SPARSE_GD_AT_END 0xFFFFFFFFFFFFFFFF @@ -107,6 +62,86 @@ #define SPARSE_COMPRESSALGORITHM_NONE 0x0000 #define SPARSE_COMPRESSALGORITHM_DEFLATE 0x0001 +#define GRAIN_MARKER_EOS 0 +#define GRAIN_MARKER_GRAIN_TABLE 1 +#define GRAIN_MARKER_GRAIN_DIRECTORY 2 +#define GRAIN_MARKER_FOOTER 3 +#define GRAIN_MARKER_PROGRESS 4 + +#define COWDISK_MAX_PARENT_FILELEN 1024 +#define COWDISK_MAX_NAME_LEN 60 +#define COWDISK_MAX_DESC_LEN 512 + +typedef struct COWDisk_Header { + uint32 magicNumber; + uint32 version; + uint32 flags; + uint32 numSectors; + uint32 grainSize; + uint32 gdOffset; + uint32 numGDEntries; + uint32 freeSector; + union { + struct { + uint32 cylinders; + uint32 heads; + uint32 sectors; + } root; + struct { + char parentFileName[COWDISK_MAX_PARENT_FILELEN]; + uint32 parentGeneration; + } child; + } u; + uint32 generation; + char name[COWDISK_MAX_NAME_LEN]; + char description[COWDISK_MAX_DESC_LEN]; + uint32 savedGeneration; + char reserved[8]; + uint32 uncleanShutdown; + char padding[396]; +} COWDisk_Header; + +#define COWDISK_MAGIC 0x44574f43 /* COWD */ +#define COWDISK_ROOT 0x01 +#define COWDISK_CHECKCAPABLE 0x02 +#define COWDISK_INCONSISTENT 0x04 + +// Confusingly, these seem to be called extents too +typedef struct SESparseExtent { + uint64 offset; + uint64 size; +} SESparseExtent; + +typedef struct { + uint64 constMagic; + uint64 version; + uint64 capacity; + uint64 grainSize; + uint64 grainTableSize; + uint64 flags; + uint64 reserved1; + uint64 reserved2; + uint64 reserved3; + uint64 reserved4; + SESparseExtent volatileHeader; + SESparseExtent journalHeader; + SESparseExtent journal; + SESparseExtent grainDirectory; + SESparseExtent grainTables; + SESparseExtent freeBitmap; + SESparseExtent backMap; + SESparseExtent grain; + char pad[304]; +} SESparseConstHeader; + +typedef struct { + uint64 volatileMagic; + uint64 freeGTNumber; + uint64 nextTxnSeqNumber; + uint64 replayJournal; + char pad[480]; +} SESparseVolatileHeader; + #define SESPARSE_CONST_HEADER_MAGIC 0x00000000CAFEBABE #define SESPARSE_VOLATILE_HEADER_MAGIC 0x00000000CAFEBABE @@ -115,30 +150,11 @@ #define SESPARSE_GRAIN_TYPE_FALLTHROUGH 0x1000000000000000 #define SESPARSE_GRAIN_TYPE_ZERO 0x2000000000000000 #define SESPARSE_GRAIN_TYPE_ALLOCATED 0x3000000000000000 - -typedef struct { - uint64 lba; - uint32 cmp_size; -} SparseGrainLBAHeaderOnDisk; - -typedef struct { - uint64 lba; - uint32 cmp_size; - uint32 type; -} SparseSpecialLBAHeaderOnDisk; - -#define GRAIN_MARKER_EOS 0 -#define GRAIN_MARKER_GRAIN_TABLE 1 -#define GRAIN_MARKER_GRAIN_DIRECTORY 2 -#define GRAIN_MARKER_FOOTER 3 -#define GRAIN_MARKER_PROGRESS 4 """ c_vmdk = cstruct().load(vmdk_def) -SECTOR_SIZE = 512 - -COWD_MAGIC = b"COWD" -VMDK_MAGIC = b"KDMV" +SPARSE_MAGIC = c_vmdk.uint32(c_vmdk.SPARSE_MAGICNUMBER).dumps() +COWD_MAGIC = c_vmdk.uint32(c_vmdk.COWDISK_MAGIC).dumps() # Technically a 8 byte header, but it's little endian so everything after the first 4 bytes is 0 -SESPARSE_MAGIC = struct.pack(" bytes: - log.debug("VMDK::read_sectors(0x%x, 0x%x)", sector, count) + def __enter__(self) -> Self: + return self - sectors_read = [] + def __exit__( + self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: TracebackType | None + ) -> None: + self.close() - disk_idx = bisect_right(self._disk_offsets, sector) + def open(self) -> ExtentStream: + """Open a stream to read the VMDK file.""" + return ExtentStream(self) - while count > 0: - disk = self.disks[disk_idx] + def close(self) -> None: + """Close the VMDK file and any associated resources we opened.""" + for extent in self.extents: + extent.close() - disk_remaining_sectors = disk.sector_count - (sector - disk.sector_offset) - disk_sectors = min(disk_remaining_sectors, count) - sectors_read.append(disk.read_sectors(sector, disk_sectors)) +class ExtentStream(AlignedStream): + def __init__(self, vmdk: VMDK): + self.vmdk = vmdk + self.parent = vmdk.parent - sector += disk_sectors - count -= disk_sectors - disk_idx += 1 + self.extents = vmdk.extents + self._offsets = vmdk._extents_offsets - return b"".join(sectors_read) + # Try to determine optimal alignment from the grain size of the first sparse extent + # This should reduce the amount of slicing we need to do when reading + align = SECTOR_SIZE + for extent in self.extents: + if isinstance(extent, SparseExtent): + align = extent._grain_size + break + + super().__init__(vmdk.size, align=align) def _read(self, offset: int, length: int) -> bytes: - log.debug("VMDK::_read(0x%x, 0x%x)", offset, length) + result = [] + + while length > 0: + idx = bisect_right(self._offsets, offset) + if idx > len(self._offsets) - 1: + break + + extent = self.extents[idx] + extent_offset = 0 if idx == 0 else self._offsets[idx - 1] + offset_in_extent = offset - extent_offset + read_size = min(length, extent.size - offset_in_extent) + + if isinstance(extent, RawExtent): + extent.fh.seek(extent.offset + offset_in_extent) + result.append(extent.fh.read(read_size)) + elif isinstance(extent, SparseExtent): + grain_idx, offset_in_grain = divmod(offset_in_extent, extent._grain_size) + grain_size = extent._last_grain_size if grain_idx == extent._last_grain_index else extent._grain_size + + if offset_in_grain >= grain_size: + break + + read_size = min(read_size, grain_size - offset_in_grain) + + grain = extent._grain(grain_idx) + # Unallocated grain + if grain == 0: + if self.parent is not None: + self.parent.seek(offset) + buf = self.parent.read(read_size) + else: + buf = b"\x00" * read_size + + # Sparse grain + elif grain == 1: + buf = b"\x00" * read_size + + # Allocated grain + else: + buf = extent._read_grain(grain)[offset_in_grain : offset_in_grain + read_size] + + result.append(buf) + + offset += read_size + length -= read_size - sector = offset // SECTOR_SIZE - count = (length + SECTOR_SIZE - 1) // SECTOR_SIZE + return b"".join(result) - return self.read_sectors(sector, count) +class Extent: + """Base class for VMDK extents. -class RawDisk: - def __init__(self, fh: BinaryIO, size: int | None = None, offset: int = 0, sector_offset: int = 0): + Args: + fh: File-like object for the extent. + path: Optional path for the extent. + size: Size of the extent in bytes. + """ + + def __init__(self, fh: BinaryIO, path: Path | None, size: int): self.fh = fh - self.offset = offset - self.sector_offset = sector_offset + self.path = path + self.size = size - if not size: - fh.seek(0, io.SEEK_END) - self.size = fh.tell() - fh.seek(0) - else: - self.size = size + def __enter__(self) -> Self: + return self - self.sector_count = self.size // SECTOR_SIZE + def __exit__( + self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: TracebackType | None + ) -> None: + self.close() - self.read = fh.read - self.seek = fh.seek - self.tell = fh.tell + @cached_property + def descriptor(self) -> DiskDescriptor | None: + """The disk descriptor if available.""" + return None - def read_sectors(self, sector: int, count: int) -> bytes: - log.debug("RawDisk::read_sectors(0x%x)", sector) + @classmethod + def from_fh( + cls, + fh: BinaryIO, + path: Path | None, + size: int | None = None, + offset: int | None = None, + ) -> RawExtent | HostedSparseExtent | SESparseExtent | COWDisk: + """Create an extent from a file-like object. + + Args: + fh: File-like object for the extent. + path: Optional path for the extent. + size: Optional size hint of the extent in bytes. + offset: Optional offset of the extent in bytes. + """ + fh.seek(0) + magic = fh.read(4) + fh.seek(0) - self.fh.seek((sector - self.sector_offset) * SECTOR_SIZE) - return self.fh.read(count * SECTOR_SIZE) + if magic == SPARSE_MAGIC: + return HostedSparseExtent(fh, path) + if magic == SESPARSE_MAGIC: + return SESparseExtent(fh, path) + if magic == COWD_MAGIC: + return COWDisk(fh, path) + return RawExtent(fh, path, size, offset) -class SparseDisk: - def __init__( - self, fh: BinaryIO, parent: VMDK | RawDisk | SparseDisk | None = None, offset: int = 0, sector_offset: int = 0 - ): - self.fh = fh - self.parent = parent - self.offset = offset - self.sector_offset = sector_offset + def close(self) -> None: + """Close the extent and any associated resources we opened.""" + if self.path is not None: + self.fh.close() - fh.seek(0, io.SEEK_END) - self.filesize = fh.tell() - fh.seek(0, io.SEEK_SET) - self.descriptor = None - self.header = SparseExtentHeader(fh) - if self.header.magic in (VMDK_MAGIC, COWD_MAGIC): - self.is_sesparse = False +class RawExtent(Extent): + """Raw extent implementation. - if ctypes.c_int64(self.header.primary_grain_directory_offset).value == -1: - # Footer starts -1024 from the end - fh.seek(-1024, io.SEEK_END) - self.header = SparseExtentHeader(fh) + Args: + fh: File-like object for the extent. + path: Optional path for the extent. + size: Optional size of the extent in bytes. If not provided, it will be determined from the file size. + offset: Optional offset of the extent in bytes in the source file. + """ - if self.header.magic == VMDK_MAGIC: - grain_table_coverage = self.header.num_grain_table_entries * self.header.grain_size - self._grain_directory_size = (self.header.capacity + grain_table_coverage - 1) // grain_table_coverage - self._grain_table_size = self.header.num_grain_table_entries + def __init__(self, fh: BinaryIO, path: Path | None, size: int | None = None, offset: int | None = None): + self.offset = offset or 0 - if self.header.descriptor_size > 0: - fh.seek(self.header.descriptor_offset * SECTOR_SIZE) - descriptor_buf = fh.read(self.header.descriptor_size * SECTOR_SIZE) - self.descriptor = DiskDescriptor.parse(descriptor_buf.split(b"\x00", 1)[0].decode()) + if size is None: + fh.seek(0, io.SEEK_END) + size = fh.tell() - self.offset + fh.seek(0) - elif self.header.magic == COWD_MAGIC: - self._grain_directory_size = self.header.num_grain_directory_entries - self._grain_table_size = 4096 + super().__init__(fh, path, size) - grain_directory_offset = self.header.primary_grain_directory_offset - self._grain_entry_type = c_vmdk.uint32 - self._grain_directory = c_vmdk.uint32[self._grain_directory_size](fh) +class SparseExtent(Extent): + """Base class for sparse extents. - elif self.header.magic == c_vmdk.SESPARSE_CONST_HEADER_MAGIC: - self.is_sesparse = True + Args: + fh: File-like object for the extent. + path: Optional path for the extent. + """ - self._grain_directory_size = self.header.grain_directory_size * SECTOR_SIZE // 8 - self._grain_table_size = self.header.grain_table_size * SECTOR_SIZE // 8 + def __init__(self, fh: BinaryIO, path: Path | None): + super().__init__(fh, path, self._capacity) + self._last_grain_index, self._last_grain_size = divmod(self._capacity, self._grain_size) - grain_directory_offset = self.header.grain_directory_offset - self._grain_entry_type = c_vmdk.uint64 + self._gt = lru_cache(128)(self._gt) - self.fh.seek(grain_directory_offset * SECTOR_SIZE) - self._grain_directory = self._grain_entry_type[self._grain_directory_size](fh) + @cached_property + def _capacity(self) -> int: + """The extent capacity in bytes.""" + raise NotImplementedError - self.size = self.header.capacity * SECTOR_SIZE - self.sector_count = self.header.capacity + @cached_property + def _grain_size(self) -> int: + """The grain size in bytes.""" + raise NotImplementedError - self._lookup_grain_table = lru_cache(128)(self._lookup_grain_table) + @cached_property + def _num_gte(self) -> int: + """The total number of grain table entries.""" + return self._last_grain_index + (1 if self._last_grain_size > 0 else 0) - def _lookup_grain_table(self, directory: int) -> list[int]: - gtbl_offset = self._grain_directory[directory] + @cached_property + def _num_gte_per_gt(self) -> int: + """The number of grain table entries per grain table.""" + raise NotImplementedError - if self.is_sesparse: - # qemu/block/vmdk.c: - # Top most nibble is 0x1 if grain table is allocated. - # strict check - top most 4 bytes must be 0x10000000 since max - # supported size is 64TB for disk - so no more than 64TB / 16MB - # grain directories which is smaller than uint32, - # where 16MB is the only supported default grain table coverage. - if not gtbl_offset or gtbl_offset & 0xFFFFFFFF00000000 != 0x1000000000000000: - table = None - else: - gtbl_offset &= 0x00000000FFFFFFFF - gtbl_offset = ( - self.header.grain_tables_offset + gtbl_offset * (self._grain_table_size * 8) // SECTOR_SIZE - ) - self.fh.seek(gtbl_offset * SECTOR_SIZE) - table = self._grain_entry_type[self._grain_table_size](self.fh) - else: - if gtbl_offset: - self.fh.seek(gtbl_offset * SECTOR_SIZE) - table = self._grain_entry_type[self._grain_table_size](self.fh) - else: - table = None - - return table - - def _lookup_grain(self, grain: int) -> int: - gdir_entry, gtbl_entry = divmod(grain, self._grain_table_size) - table = self._lookup_grain_table(gdir_entry) - - if table: - grain_entry = table[gtbl_entry] - if self.is_sesparse: - # SESparse uses a different method of specifying unallocated/sparse/allocated grains - # However, we can re-use the normal sparse logic of returning 0 for unallocated, 1 for - # sparse and >1 for allocated grains, since a grain of 0 or 1 isn't possible in SESparse. - grain_type = grain_entry & c_vmdk.SESPARSE_GRAIN_TYPE_MASK - if grain_type in (c_vmdk.SESPARSE_GRAIN_TYPE_UNALLOCATED, c_vmdk.SESPARSE_GRAIN_TYPE_FALLTHROUGH): - # Unallocated or scsi unmapped, fallthrough - return 0 - if grain_type == c_vmdk.SESPARSE_GRAIN_TYPE_ZERO: - # Sparse, zero grain - return 1 - if grain_type == c_vmdk.SESPARSE_GRAIN_TYPE_ALLOCATED: - # Allocated - cluster_sector_hi = (grain_entry & 0x0FFF000000000000) >> 48 - cluster_sector_lo = (grain_entry & 0x0000FFFFFFFFFFFF) << 12 - cluster_sector = cluster_sector_hi | cluster_sector_lo - return self.header.grains_offset + cluster_sector * self.header.grain_size - - raise ValueError("Unknown grain type") - - return grain_entry - - return 0 - - def get_runs(self, sector: int, count: int) -> list[tuple[int, int, int, int | None]]: - disk_sector = sector - self.sector_offset - - run_type = None - run_offset = 0 - run_count = 0 - run_parent = None - next_grain_sector = 0 - - read_sector = disk_sector - read_count = count - - runs = [] - - if read_count == 0: - return runs - - while read_count > 0: - grain, grain_offset = divmod(read_sector, self.header.grain_size) - grain_sector = self._lookup_grain(grain) - read_sector_count = min(read_count, self.header.grain_size - grain_offset) - - if (run_type == 0 and grain_sector == 0) or (run_type == 1 and grain_sector == 1): - run_count += read_sector_count - elif run_type and run_type > 1 and grain_sector == next_grain_sector: - next_grain_sector += self.header.grain_size - run_count += read_sector_count - else: - if run_type is not None: - runs.append((run_type, run_offset, run_count, run_parent)) - run_type = None - run_count = 0 - run_parent = None - if grain_sector == 0: - run_type = 0 - run_count += read_sector_count - run_parent = self.sector_offset + read_sector - elif grain_sector == 1: - run_type = 1 - run_count += read_sector_count - else: - run_type = grain_sector - run_offset = grain_offset - run_count += read_sector_count - next_grain_sector = grain_sector + self.header.grain_size + @cached_property + def _gd(self) -> list[int]: + """The grain directory.""" + raise NotImplementedError - read_count -= read_sector_count - read_sector += read_sector_count + def _gt(self, idx: int) -> list[int] | None: + """Get the grain table at the specified index. - assert run_type is not None - runs.append((run_type, run_offset, run_count, run_parent)) + Args: + idx: The grain table index. + """ + raise NotImplementedError - return runs + def _grain(self, idx: int) -> int: + """Get the grain number (sector) for the specified grain index. - def read_sectors(self, sector: int, count: int) -> bytes: - log.debug("SparseDisk::read_sectors(0x%x, 0x%x)", sector, count) + Args: + idx: The grain index. + """ + table, entry = divmod(idx, self._num_gte_per_gt) + if (gt := self._gt(table)) is None: + return 0 + return gt[entry] - runs = self.get_runs(sector, count) - sectors_read = [] + def _read_grain(self, grain: int) -> bytes: + """Read the specified grain. - for run_type, run_offset, run_count, run_parent in runs: - # Grain not present - if run_type == 0: - if self.parent: - sector_data = self.parent.read_sectors(run_parent, run_count) - else: - sector_data = b"\x00" * (run_count * SECTOR_SIZE) - sectors_read.append(sector_data) - continue + Args: + grain: The grain number. + """ + self.fh.seek(grain * SECTOR_SIZE) + return self.fh.read(self._grain_size) + + +class HostedSparseExtent(SparseExtent): + """Hosted sparse extent implementation. + + Args: + fh: File-like object for the extent. + path: Optional path for the extent. + """ + + def __init__(self, fh: BinaryIO, path: Path | None): + fh.seek(0) + self.header = c_vmdk.SparseExtentHeader(fh) + if self.header.gdOffset == c_vmdk.SPARSE_GD_AT_END: + # Sparse extents can have a footer at the end of the file + # TODO: find test data for this + fh.seek(-3 * SECTOR_SIZE, io.SEEK_END) + if (marker := c_vmdk.SparseMetaDataMarker(fh)).size == 0 and marker.type == c_vmdk.GRAIN_MARKER_FOOTER: + self.header = c_vmdk.SparseExtentHeader(fh) + + super().__init__(fh, path) + + @cached_property + def _capacity(self) -> int: + return self.header.capacity * SECTOR_SIZE + + @cached_property + def _grain_size(self) -> int: + return self.header.grainSize * SECTOR_SIZE + + @cached_property + def _num_gte_per_gt(self) -> int: + return self.header.numGTEsPerGT + + @cached_property + def _gd(self) -> list[int]: + num_gt = (self._num_gte + self._num_gte_per_gt - 1) // self._num_gte_per_gt + self.fh.seek(self.header.gdOffset * SECTOR_SIZE) + return c_vmdk.uint32[num_gt](self.fh) + + def _gt(self, idx: int) -> list[int] | None: + if (offset := self._gd[idx]) == 0: + return None + + self.fh.seek(offset * SECTOR_SIZE) + return c_vmdk.uint32[self._num_gte_per_gt](self.fh) + + @cached_property + def descriptor(self) -> DiskDescriptor | None: + if self.header.descriptorSize > 0: + self.fh.seek(self.header.descriptorOffset * SECTOR_SIZE) + buf = self.fh.read(self.header.descriptorSize * SECTOR_SIZE) + return DiskDescriptor(buf.split(b"\x00", 1)[0].decode()) + return None + + def _read_grain(self, grain: int) -> bytes: + buf = super()._read_grain(grain) + if self.header.flags & c_vmdk.SPARSEFLAG_COMPRESSED: + if self.header.flags & c_vmdk.SPARSEFLAG_EMBEDDED_LBA: + header_size = 12 + header = c_vmdk.SparseGrainLBAHeader(buf) + compressed_size = header.cmpSize + else: + header_size = 4 + compressed_size = c_vmdk.uint32(buf) + + buf = zlib.decompress(buf[header_size : header_size + compressed_size]) + + return buf + + +class SESparseExtent(SparseExtent): + """SESparse extent implementation. + + Args: + fh: File-like object for the extent. + path: Optional path for the extent. + """ + + def __init__(self, fh: BinaryIO, path: Path | None): + fh.seek(0) + self.header = c_vmdk.SESparseConstHeader(fh) + + super().__init__(fh, path) + + @cached_property + def _capacity(self) -> int: + return self.header.capacity * SECTOR_SIZE - # Sparse grain - if run_type == 1: - sectors_read.append(b"\x00" * (run_count * SECTOR_SIZE)) - continue + @cached_property + def _grain_size(self) -> int: + return self.header.grainSize * SECTOR_SIZE - # Uncompressed grain - if self.header.flags & c_vmdk.SPARSEFLAG_COMPRESSED == 0: - self.fh.seek((run_type + run_offset) * SECTOR_SIZE) - sector_data = self.fh.read(run_count * SECTOR_SIZE) - sectors_read.append(sector_data) - continue + @cached_property + def _num_gte_per_gt(self) -> int: + return (self.header.grainTableSize * SECTOR_SIZE) // 8 - # Compressed grain - while run_count > 0: - # We consolidate grain runs in get_runs, but we can't read a contiguous stream of compressed grains - # So loop over the consolidated grains - offset = run_offset * SECTOR_SIZE - grain_remaining = self.header.grain_size - run_offset - read_count = min(run_count, grain_remaining) + @cached_property + def _gd(self) -> list[int]: + num_gt = (self.header.grainDirectory.size * SECTOR_SIZE) // 8 + self.fh.seek(self.header.grainDirectory.offset * SECTOR_SIZE) + return c_vmdk.uint64[num_gt](self.fh) - buf = self._read_compressed_grain(run_type) - sectors_read.append(buf[offset : offset + read_count * SECTOR_SIZE]) + def _gt(self, idx: int) -> list[int] | None: + offset = self._gd[idx] - # If we loop, we're going to the next run, which means we'll start at offset 0 - run_offset = 0 - run_type += self.header.grain_size - run_count -= read_count + # qemu/block/vmdk.c: + # Top most nibble is 0x1 if grain table is allocated. + # strict check - top most 4 bytes must be 0x10000000 since max + # supported size is 64TB for disk - so no more than 64TB / 16MB + # grain directories which is smaller than uint32, + # where 16MB is the only supported default grain table coverage. + if offset == 0 or offset & 0xFFFFFFFF00000000 != 0x1000000000000000: + return None - return b"".join(sectors_read) + offset &= 0x00000000FFFFFFFF + self.fh.seek((self.header.grainTables.offset * SECTOR_SIZE) + (offset * (self._num_gte_per_gt * 8))) + return c_vmdk.uint64[self._num_gte_per_gt](self.fh) - def _read_compressed_grain(self, sector: int) -> bytes: - self.fh.seek(sector * SECTOR_SIZE) - buf = self.fh.read(SECTOR_SIZE) + def _grain(self, idx: int) -> int: + # SESparse uses a different method of specifying unallocated/sparse/allocated grains + # However, we can re-use the normal sparse logic of returning 0 for unallocated, 1 for + # sparse and >1 for allocated grains, since a grain of 0 or 1 isn't possible in SESparse. + table, entry = divmod(idx, self._num_gte_per_gt) + if (gt := self._gt(table)) is None: + return 0 + grain = gt[entry] - if self.header.flags & c_vmdk.SPARSEFLAG_EMBEDDED_LBA: - header_len = 12 - lba_header = c_vmdk.SparseGrainLBAHeaderOnDisk(buf) - compressed_len = lba_header.cmp_size - else: - header_len = 4 - compressed_len = c_vmdk.uint32(buf) + grain_type = grain & c_vmdk.SESPARSE_GRAIN_TYPE_MASK + if grain_type in (c_vmdk.SESPARSE_GRAIN_TYPE_UNALLOCATED, c_vmdk.SESPARSE_GRAIN_TYPE_FALLTHROUGH): + # Unallocated or scsi unmapped, fallthrough + return 0 + if grain_type == c_vmdk.SESPARSE_GRAIN_TYPE_ZERO: + # Sparse, zero grain + return 1 + if grain_type == c_vmdk.SESPARSE_GRAIN_TYPE_ALLOCATED: + # Allocated + cluster_sector_hi = (grain & 0x0FFF000000000000) >> 48 + cluster_sector_lo = (grain & 0x0000FFFFFFFFFFFF) << 12 + cluster_sector = cluster_sector_hi | cluster_sector_lo + # We need to return the sector + return self.header.grain.offset + cluster_sector * self.header.grainSize - if compressed_len + header_len > SECTOR_SIZE: - # Officially this is padded to SECTOR_SIZE, but we don't really care - remaining_len = header_len + compressed_len - SECTOR_SIZE - self.fh.seek((sector + 1) * SECTOR_SIZE) - buf += self.fh.read(remaining_len) + raise ValueError("Unknown grain type") - return zlib.decompress(buf[header_len : header_len + compressed_len]) +class COWDisk(SparseExtent): + """COW disk extent implementation. -class SparseExtentHeader: - def __init__(self, fh: BinaryIO): - magic = fh.read(4) - fh.seek(-4, io.SEEK_CUR) - - if magic == VMDK_MAGIC: - self.hdr = c_vmdk.VMDKSparseExtentHeader(fh) - elif magic == SESPARSE_MAGIC: - self.hdr = c_vmdk.VMDKSESparseConstHeader(fh) - elif magic == COWD_MAGIC: - self.hdr = c_vmdk.COWDSparseExtentHeader(fh) - else: - raise NotImplementedError("Unsupported sparse extent") + TODO: Regenerate test data and fix implementation. + + Args: + fh: File-like object for the extent. + path: Optional path for the extent. + """ - def __getattr__(self, attr: str) -> Any: - return getattr(self.hdr, attr) + def __init__(self, fh: BinaryIO, path: Path | None): + fh.seek(0) + self.header = c_vmdk.COWDisk_Header(fh) + super().__init__(fh, path) + + @cached_property + def _capacity(self) -> int: + return self.header.numSectors * SECTOR_SIZE + + @cached_property + def _grain_size(self) -> int: + return self.header.grainSize * SECTOR_SIZE + + @cached_property + def _gte_type(self) -> c_vmdk.uint32 | c_vmdk.uint64: + return c_vmdk.uint32 + + @cached_property + def _num_gte_per_gt(self) -> int: + return 4096 + + @cached_property + def _gd_size(self) -> int: + return self.header.numGDEntries + + @cached_property + def _gd_offset(self) -> int: + return self.header.gdOffset * SECTOR_SIZE RE_EXTENT_DESCRIPTOR = re.compile( r""" ^ - (?PRW|RDONLY|NOACCESS)\s - (?P\d+)\s - (?PSESPARSE|SPARSE|ZERO|FLAT|VMFS|VMFSSPARSE|VMFSRDM|VMFSRAW) + (?PRW|RDONLY|NOACCESS)\s + (?P\d+)\s + (?P[^\s]+) (\s(?P\".+\"))? - (\s(?P\d+))? - (\s(?P\S+))? - (\s(?P\S+))? - $ + (\s(?P\d+))? """, re.VERBOSE, ) -@dataclass -class ExtentDescriptor: - raw: str - access_mode: str - sectors: int +class ExtentDescriptor(NamedTuple): + access: str + """The access mode of the extent (RW, RDONLY, NOACCESS).""" + size: int + """The size of the extent in sectors.""" type: str + """The type of the extent (e.g., SPARSE, FLAT, ZERO).""" filename: str | None = None - start_sector: int | None = None - partition_uuid: str | None = None - device_identifier: str | None = None - - def __post_init__(self) -> None: - self.sectors = int(self.sectors) - - if self.filename: - self.filename = self.filename.strip('"') - - if self.start_sector: - self.start_sector = int(self.start_sector) - - def __repr__(self) -> str: - return f"" - - def __str__(self) -> str: - return self.raw + """The filename of the extent.""" + offset: int | None = None + """Optional offset of the extent data in the extent file.""" class DiskDescriptor: - def __init__( - self, attr: dict, extents: list[ExtentDescriptor], disk_db: dict, sectors: int, raw_config: str | None = None - ): - self.attr = attr - self.extents = extents - self.ddb = disk_db - self.sectors = sectors - self.raw = raw_config + """VMDK disk descriptor. - @classmethod - def parse(cls, vmdk_config: str) -> DiskDescriptor: - """Return :class:`DiskDescriptor` based on the provided ``vmdk_config``. + Args: + raw: The raw descriptor data as a string. + """ - Resources: - - https://github.com/libyal/libvmdk/blob/main/documentation/VMWare%20Virtual%20Disk%20Format%20(VMDK).asciidoc - """ + def __init__(self, raw: str): + self.raw = raw + self.attributes = {} + self.extents: list[ExtentDescriptor] = [] - descriptor_settings = {} - extents: list[ExtentDescriptor] = [] - disk_db = {} - sectors = 0 - - for line in vmdk_config.split("\n"): - line = line.strip() - - if not line or line.startswith("#"): + for line in raw.splitlines(): + if not (line := line.strip()) or line.startswith("#"): continue if line.startswith(("RW ", "RDONLY ", "NOACCESS ")): - match = RE_EXTENT_DESCRIPTOR.search(line) - - if not match: + if not (match := RE_EXTENT_DESCRIPTOR.search(line)): log.warning("Unexpected ExtentDescriptor format in vmdk config: %s, ignoring", line) continue - extent = ExtentDescriptor(raw=line, **match.groupdict()) - sectors += extent.sectors - extents.append(extent) - continue - - setting, _, value = line.partition("=") - setting = setting.strip() - value = value.strip(' "') - - if setting.startswith("ddb."): - disk_db[setting] = value + self.extents.append( + ExtentDescriptor( + access=match.group("access"), + size=int(match.group("size")), + type=match.group("type"), + filename=match.group("filename").strip('"') if match.group("filename") else None, + offset=int(match.group("offset")) if match.group("offset") else None, + ) + ) else: - descriptor_settings[setting] = value - - return cls(descriptor_settings, extents, disk_db, sectors, vmdk_config) - - def __str__(self) -> str: - str_template = textwrap.dedent( - """\ - # Disk DescriptorFile - version=1 - {} + key, _, value = line.partition("=") + self.attributes[key.strip()] = value.strip(' "') - # Extent Description - {} - # The Disk Data Base - #DDB +def open_parent(path: Path, hint: str) -> VMDK: + """Open the parent VMDK disk based on the filename hint. - {}""" - ) - - descriptor_settings = [] - for setting, value in self.attr.items(): - if setting != "version": - descriptor_settings.append(f"{setting}={value}") - descriptor_settings = "\n".join(descriptor_settings) - - extents = "\n".join(map(str, self.extents)) - - disk_db = [] - for setting, value in self.ddb.items(): - disk_db.append(f'{setting} = "{value}"') - disk_db = "\n".join(disk_db) - - return str_template.format(descriptor_settings, extents, disk_db) - - -def open_parent(path: Path, filename_hint: str) -> VMDK: + Args: + path: The directory path to look for the parent disk. + hint: The filename hint for the parent disk. + """ try: - filename_hint = filename_hint.replace("\\", "/") - hint_path, _, filename = filename_hint.rpartition("/") - filepath = path.joinpath(filename) - if not filepath.exists(): + hint = hint.replace("\\", "/") + hint_path, _, filename = hint.rpartition("/") + + if not (file_path := path.joinpath(filename)).exists(): _, _, hint_path_name = hint_path.rpartition("/") - filepath = path.parent.joinpath(hint_path_name).joinpath(filename) - vmdk = VMDK(filepath) - except Exception as err: - raise IOError(f"Failed to open parent disk with hint {filename_hint} from path {path}: {err}") + file_path = path.parent.joinpath(hint_path_name).joinpath(filename) - return vmdk + return VMDK(file_path) + except Exception as err: + raise IOError(f"Failed to open parent disk with hint {hint} from path {path}: {err}") diff --git a/tests/_data/disk/vmdk/flat-flat.vmdk.gz b/tests/_data/disk/vmdk/flat-flat.vmdk.gz new file mode 100644 index 0000000..31f823c --- /dev/null +++ b/tests/_data/disk/vmdk/flat-flat.vmdk.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0d837bd6d52414174e267b009a213897b06b06859f031fc50bea607038632d5 +size 15948 diff --git a/tests/_data/disk/vmdk/flat.vmdk.gz b/tests/_data/disk/vmdk/flat.vmdk.gz new file mode 100644 index 0000000..cdd6869 --- /dev/null +++ b/tests/_data/disk/vmdk/flat.vmdk.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353456c5bc38865cda759bf6070a2c46d9e2cb900311f100b6133774271df8ef +size 253 diff --git a/tests/_data/disk/vmdk/sparse.vmdk.gz b/tests/_data/disk/vmdk/sparse.vmdk.gz new file mode 100644 index 0000000..d9e6431 --- /dev/null +++ b/tests/_data/disk/vmdk/sparse.vmdk.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d41b3dda5064cbc5e753a6bc4152c24f6b0c3d1110126325acc0bc00c2c95aa +size 15493 diff --git a/tests/_data/disk/vmdk/split-flat-f001.vmdk.gz b/tests/_data/disk/vmdk/split-flat-f001.vmdk.gz new file mode 100644 index 0000000..31f823c --- /dev/null +++ b/tests/_data/disk/vmdk/split-flat-f001.vmdk.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0d837bd6d52414174e267b009a213897b06b06859f031fc50bea607038632d5 +size 15948 diff --git a/tests/_data/disk/vmdk/split-flat.vmdk.gz b/tests/_data/disk/vmdk/split-flat.vmdk.gz new file mode 100644 index 0000000..296c10b --- /dev/null +++ b/tests/_data/disk/vmdk/split-flat.vmdk.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cca9fef32058c8bc36fd05ab3d721ecc929d91f7099fb9a77204d3d3f73083b6 +size 259 diff --git a/tests/_data/disk/vmdk/split-sparse-s001.vmdk.gz b/tests/_data/disk/vmdk/split-sparse-s001.vmdk.gz new file mode 100644 index 0000000..636834e --- /dev/null +++ b/tests/_data/disk/vmdk/split-sparse-s001.vmdk.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1a36cdb20639163f634e0fd83e7678d362b89081a0012acdf9df0c9baa58fac +size 15203 diff --git a/tests/_data/disk/vmdk/split-sparse.vmdk.gz b/tests/_data/disk/vmdk/split-sparse.vmdk.gz new file mode 100644 index 0000000..0f89799 --- /dev/null +++ b/tests/_data/disk/vmdk/split-sparse.vmdk.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f1bf79422c8d569b7d4fcd9262826a32fab4afe3e319ebf8000ab79104c120e +size 260 diff --git a/tests/_data/disk/vmdk/stream.vmdk.gz b/tests/_data/disk/vmdk/stream.vmdk.gz new file mode 100644 index 0000000..56a940c --- /dev/null +++ b/tests/_data/disk/vmdk/stream.vmdk.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c292a321f8840a5d1848fb13c02520ebc7f8d80e16440764fe9576f0c507bd3e +size 2236 diff --git a/tests/_tools/disk/vmdk/generate.sh b/tests/_tools/disk/vmdk/generate.sh new file mode 100755 index 0000000..99e9864 --- /dev/null +++ b/tests/_tools/disk/vmdk/generate.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +set -euo pipefail + +readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +readonly TESTS_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" +readonly OUT_DIR="${TESTS_ROOT}/_data/disk/vmdk" + +log() { printf '[INFO] %s\n' "$*" >&2; } +warn() { printf '[WARN] %s\n' "$*" >&2; } +error() { printf '[ERROR] %s\n' "$*" >&2; } + +have() { command -v "$1" >/dev/null 2>&1; } + +require_tools() { + local -a tools=(qemu-img pigz dd) + local missing=0 + + for t in "${tools[@]}"; do + if ! have "$t"; then + error "Missing required tool: $t" + missing=1 + fi + done + + if (( missing != 0 )); then + error "One or more required tools are missing. Aborting." + exit 1 + fi +} + +pattern() { + local size="$1" + + stream() { + while true; do + for i in $(seq 0 255); do + printf "`printf '%02x' "${i}"`%.0s" {0..4095} + done + done + } + + stream | xxd -r -ps | head -c "${size}" || true +} + +generate() { + local name="$1" + local size="$2" + local options="${3:-}" + + local raw="$(mktemp -t raw.XXXXXX)" + + pattern "${size}" > "${raw}" + # Create a hole at the start for testing sparse files + dd if=/dev/zero bs=1M count=1 seek=0 of="${raw}" conv=notrunc + + local outpath="${OUT_DIR}/${name}.vmdk" + + log "Converting RAW -> VMDK (${name})" + qemu-img convert -f raw -O vmdk -o "${options}" "${raw}" "${outpath}" + + # log "Compressing ${outpath} -> ${outpath}.gz" + # for file in "${OUT_DIR}/${name}"*; do + # cat "${file}" | pigz -c > "${file}.gz" + # done + + log "Generated: ${outpath}.gz" +} + +main() { + require_tools + + mkdir -p "${OUT_DIR}" + + generate "sparse" "$((10 * 1024 * 1024))" subformat=monolithicSparse + generate "flat" "$((10 * 1024 * 1024))" subformat=monolithicFlat + generate "stream" "$((10 * 1024 * 1024))" subformat=streamOptimized + generate "split-sparse" "$((10 * 1024 * 1024))" subformat=twoGbMaxExtentSparse + generate "split-flat" "$((10 * 1024 * 1024))" subformat=twoGbMaxExtentFlat + + # TODO: Generate some test data on ESXi + + log "All test cases generated under: ${OUT_DIR}" +} + +main "$@" diff --git a/tests/conftest.py b/tests/conftest.py index 3673181..e028dc0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -64,11 +64,6 @@ def differencing_vhdx() -> Iterator[BinaryIO]: yield from open_file_gz("_data/disk/vhdx/differencing.avhdx.gz") -@pytest.fixture -def sesparse_vmdk() -> Iterator[BinaryIO]: - yield from open_file_gz("_data/disk/vmdk/sesparse.vmdk.gz") - - @pytest.fixture def plain_hdd() -> Path: return absolute_path("_data/disk/hdd/plain.hdd") diff --git a/tests/disk/test_vmdk.py b/tests/disk/test_vmdk.py index 467a872..d411e50 100644 --- a/tests/disk/test_vmdk.py +++ b/tests/disk/test_vmdk.py @@ -1,203 +1,219 @@ from __future__ import annotations +import gzip +from pathlib import Path from typing import BinaryIO +from unittest.mock import patch import pytest from dissect.hypervisor.disk.c_vmdk import c_vmdk -from dissect.hypervisor.disk.vmdk import VMDK, DiskDescriptor, ExtentDescriptor +from dissect.hypervisor.disk.vmdk import VMDK, DiskDescriptor, ExtentDescriptor, SESparseExtent +from tests.conftest import absolute_path -def test_vmdk_sesparse(sesparse_vmdk: BinaryIO) -> None: - vmdk = VMDK(sesparse_vmdk) +def mock_open_gz(self: Path, *args, **kwargs) -> BinaryIO: + return gzip.open(self if self.suffix.lower() == ".gz" else self.with_suffix(self.suffix + ".gz")) - disk = vmdk.disks[0] - assert disk.is_sesparse - assert disk._grain_directory_size == 0x20000 - assert disk._grain_table_size == 0x1000 - assert disk._grain_entry_type == c_vmdk.uint64 - assert disk._grain_directory[0] == 0x1000000000000000 +@pytest.mark.parametrize( + ("path"), + [ + pytest.param("_data/disk/vmdk/flat.vmdk.gz", id="flat"), + pytest.param("_data/disk/vmdk/sparse.vmdk.gz", id="sparse"), + pytest.param("_data/disk/vmdk/split-flat.vmdk.gz", id="split-flat"), + pytest.param("_data/disk/vmdk/split-sparse.vmdk.gz", id="split-sparse"), + pytest.param("_data/disk/vmdk/stream.vmdk.gz", id="stream"), + ], +) +def test_vmdk(path: str) -> None: + """Test basic VMDK reading.""" + with patch.object(Path, "open", mock_open_gz): + vmdk = VMDK(absolute_path(path)) + + assert vmdk.size == 10 * 1024 * 1024 + + stream = vmdk.open() + assert stream.read(1 * 1024 * 1024) == bytes([0] * (1 * 1024 * 1024)) + + for i in range((1 * 1024 * 1024) // 4096, stream.size // 4096): + expected = bytes([i % 256] * 4096) + assert stream.read(4096) == expected, f"Mismatch at offset {i * 4096:#x}" + + assert stream.read() == b"" - header = disk.header - assert header.magic == c_vmdk.SESPARSE_CONST_HEADER_MAGIC - assert header.version == 0x200000001 - assert vmdk.read(0x1000000) == b"a" * 0x1000000 +def test_vmdk_sesparse() -> None: + # TODO: Recreate test data with new test pattern + with gzip.open(absolute_path("_data/disk/vmdk/sesparse.vmdk.gz"), "rb") as fh: + vmdk = VMDK(fh) + + extent = vmdk.extents[0] + assert isinstance(extent, SESparseExtent) + + assert extent.header.constMagic == c_vmdk.SESPARSE_CONST_HEADER_MAGIC + assert extent.header.version == 0x200000001 + + assert extent._num_gte_per_gt == 0x1000 + assert len(extent._gd) == 0x20000 + assert extent._gd[0] == 0x1000000000000000 + + stream = vmdk.open() + assert stream.read(0x1000000) == b"a" * 0x1000000 @pytest.mark.parametrize( - ("extent_description", "expected_extents"), + ("raw", "expected_extents"), [ - ( + pytest.param( 'RW 123456789 SPARSE "disk.vmdk"', [ ExtentDescriptor( - raw='RW 123456789 SPARSE "disk.vmdk"', - access_mode="RW", - sectors=123456789, + access="RW", + size=123456789, type="SPARSE", - filename='"disk.vmdk"', - start_sector=None, - partition_uuid=None, - device_identifier=None, + filename="disk.vmdk", ), ], + id="sparse", ), - ( + pytest.param( 'RW 123456789 FLAT "disk-flat.vmdk" 0', [ ExtentDescriptor( - raw='RW 123456789 FLAT "disk-flat.vmdk" 0', - access_mode="RW", - sectors=123456789, + access="RW", + size=123456789, type="FLAT", - filename='"disk-flat.vmdk"', - start_sector=0, - partition_uuid=None, - device_identifier=None, + filename="disk-flat.vmdk", + offset=0, ) ], + id="flat", ), - ( + pytest.param( "RDONLY 0 ZERO", [ ExtentDescriptor( - raw="RDONLY 0 ZERO", - access_mode="RDONLY", - sectors=0, + access="RDONLY", + size=0, type="ZERO", ), ], + id="zero", ), - ( + pytest.param( 'NOACCESS 123456789 SPARSE "disk-sparse.vmdk" 123 partition-uuid device-id', [ ExtentDescriptor( - raw='NOACCESS 123456789 SPARSE "disk-sparse.vmdk" 123 partition-uuid device-id', - access_mode="NOACCESS", - sectors=123456789, + access="NOACCESS", + size=123456789, type="SPARSE", - filename='"disk-sparse.vmdk"', - start_sector=123, - partition_uuid="partition-uuid", - device_identifier="device-id", + filename="disk-sparse.vmdk", + offset=123, ), ], + id="sparse-ids", + ), + pytest.param( + "RW 1234567890", + [], + id="bad-1", + ), + pytest.param( + 'RDONLY "file.vmdk"', + [], + id="bad-2", + ), + pytest.param( + "NOACCESS", + [], + id="bad-3", ), - ("RW 1234567890", []), - ('RDONLY "file.vmdk"', []), - ("NOACCESS", []), - ( + pytest.param( 'RW 1234567890 SPARSE "disk with spaces.vmdk"', [ ExtentDescriptor( - raw='RW 1234567890 SPARSE "disk with spaces.vmdk"', - access_mode="RW", - sectors=1234567890, + access="RW", + size=1234567890, type="SPARSE", - filename='"disk with spaces.vmdk"', - start_sector=None, - partition_uuid=None, - device_identifier=None, + filename="disk with spaces.vmdk", ) ], + id="spaces-four-parts", ), - ( + pytest.param( 'RW 1234567890 SPARSE "disk with spaces.vmdk" 123', [ ExtentDescriptor( - raw='RW 1234567890 SPARSE "disk with spaces.vmdk" 123', - access_mode="RW", - sectors=1234567890, + access="RW", + size=1234567890, type="SPARSE", - filename='"disk with spaces.vmdk"', - start_sector=123, - partition_uuid=None, - device_identifier=None, + filename="disk with spaces.vmdk", + offset=123, ) ], + id="spaces-five-parts", ), - ( + pytest.param( 'RW 1234567890 SPARSE "disk with spaces.vmdk" 123 part-uuid', [ ExtentDescriptor( - raw='RW 1234567890 SPARSE "disk with spaces.vmdk" 123 part-uuid', - access_mode="RW", - sectors=1234567890, + access="RW", + size=1234567890, type="SPARSE", - filename='"disk with spaces.vmdk"', - start_sector=123, - partition_uuid="part-uuid", - device_identifier=None, + filename="disk with spaces.vmdk", + offset=123, ) ], + id="spaces-six-parts", ), - ( + pytest.param( 'RW 1234567890 SPARSE "disk with spaces.vmdk" 123 part-uuid device-id', [ ExtentDescriptor( - raw='RW 1234567890 SPARSE "disk with spaces.vmdk" 123 part-uuid device-id', - access_mode="RW", - sectors=1234567890, + access="RW", + size=1234567890, type="SPARSE", - filename='"disk with spaces.vmdk"', - start_sector=123, - partition_uuid="part-uuid", - device_identifier="device-id", + filename="disk with spaces.vmdk", + offset=123, ) ], + id="spaces-seven-parts", ), - ( + pytest.param( r'RW 16777216 SPARSE "this is an example "\' diskëäô:)\\\'`\foo.vmdk" 123', [ ExtentDescriptor( - raw=r'RW 16777216 SPARSE "this is an example "\' diskëäô:)\\\'`\foo.vmdk" 123', - access_mode="RW", - sectors=16777216, + access="RW", + size=16777216, type="SPARSE", - filename=r'"this is an example "\' diskëäô:)\\\'`\foo.vmdk"', - start_sector=123, - partition_uuid=None, - device_identifier=None, + filename=r'this is an example "\' diskëäô:)\\\'`\foo.vmdk', + offset=123, ) ], + id="specials-five-parts", ), - ( + pytest.param( r'RW 13371337 SPARSE "🦊 🦊 🦊.vmdk"', [ ExtentDescriptor( - raw=r'RW 13371337 SPARSE "🦊 🦊 🦊.vmdk"', - access_mode="RW", - sectors=13371337, + access="RW", + size=13371337, type="SPARSE", - filename='"🦊 🦊 🦊.vmdk"', + filename="🦊 🦊 🦊.vmdk", ) ], + id="emoji-four-parts", ), ], - ids=( - "sparse", - "flat", - "zero", - "sparse-ids", - "bad-1", - "bad-2", - "bad-3", - "spaces-four-parts", - "spaces-five-parts", - "spaces-six-parts", - "spaces-seven-parts", - "specials-five-parts", - "emoji-four-parts", - ), ) -def test_vmdk_extent_description(extent_description: str, expected_extents: list[ExtentDescriptor]) -> None: +def test_vmdk_extent_description(raw: str, expected_extents: list[ExtentDescriptor]) -> None: """test if we correctly parse VMDK sparse and flat extent descriptions. Resources: - https://github.com/libyal/libvmdk/blob/main/documentation/VMWare%20Virtual%20Disk%20Format%20(VMDK).asciidoc#22-extent-descriptions + - https://web.archive.org/web/20120302211605/http://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf """ - descriptor = DiskDescriptor.parse(extent_description) + descriptor = DiskDescriptor(raw) assert descriptor.extents == expected_extents