Skip to content

Commit ec71e31

Browse files
authored
Merge pull request #1244 from jcrussell/msi
add support for MSI
2 parents 7a65bd9 + 38dbe19 commit ec71e31

File tree

353 files changed

+1335
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

353 files changed

+1335
-1
lines changed

docs/handlers.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
| [`LZIP`](#lzip) | COMPRESSION | :octicons-check-16: |
4141
| [`LZMA`](#lzma) | COMPRESSION | :octicons-check-16: |
4242
| [`LZO`](#lzo) | COMPRESSION | :octicons-check-16: |
43+
| [`MSI`](#msi) | ARCHIVE | :octicons-alert-fill-12: |
4344
| [`MULTI-SEVENZIP`](#multi-sevenzip) | ARCHIVE | :octicons-check-16: |
4445
| [`NETGEAR CHK`](#netgear-chk) | ARCHIVE | :octicons-check-16: |
4546
| [`NETGEAR TRX V1`](#netgear-trx-v1) | ARCHIVE | :octicons-check-16: |
@@ -718,6 +719,28 @@
718719

719720
- [LZO File Format Documentation](http://www.lzop.org/){ target="_blank" }
720721
- [LZO Wikipedia](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer){ target="_blank" }
722+
## MSI
723+
724+
!!! warning "Partially supported"
725+
726+
=== "Description"
727+
728+
Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.
729+
730+
---
731+
732+
- **Handler type:** Archive
733+
- **Vendor:** Microsoft
734+
735+
=== "References"
736+
737+
- [MSI File Format Documentation](https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer){ target="_blank" }
738+
- [Compound File Binary Format](https://en.wikipedia.org/wiki/Compound_File_Binary_Format){ target="_blank" }
739+
740+
=== "Limitations"
741+
742+
- Limited to CFB based extraction, not full-on MSI extraction
743+
- Extracted files have names coming from CFB internal representation, and may not correspond to the one they would have on disk after running the installer
721744
## multi-sevenzip
722745

723746
!!! success "Fully supported"

python/unblob/handlers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
cab,
77
cpio,
88
dmg,
9+
msi,
910
par2,
1011
partclone,
1112
rar,
@@ -89,6 +90,7 @@
8990
arc.ARCHandler,
9091
arj.ARJHandler,
9192
cab.CABHandler,
93+
msi.MsiHandler,
9294
tar.TarUstarHandler,
9395
tar.TarUnixHandler,
9496
cpio.PortableASCIIHandler,
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
import io
2+
import struct
3+
from typing import Optional
4+
5+
from structlog import get_logger
6+
7+
from unblob.extractors import Command
8+
9+
from ...file_utils import InvalidInputFormat
10+
from ...models import (
11+
File,
12+
HandlerDoc,
13+
HandlerType,
14+
HexString,
15+
Reference,
16+
StructHandler,
17+
ValidChunk,
18+
)
19+
20+
# Sector id marking an unused FAT/DIFAT entry; such entries are skipped when
# collecting FAT sectors and when looking for the last used sector.
FREE_SECTOR = 0xFFFFFFFF
21+
# Sector id that terminates a sector chain; it stops the DIFAT walk.
END_OF_CHAIN = 0xFFFFFFFE
22+
# Size in bytes of the CFB header; sector sizes smaller than this are rejected.
HEADER_SIZE = 512
23+
24+
logger = get_logger()
25+
26+
27+
class MsiHandler(StructHandler):
    """Carve Microsoft Installer (MSI) files stored in Compound File Binary containers.

    The chunk size is derived from the File Allocation Table (FAT): the FAT
    sector ids are gathered from the header and, when needed, from the DIFAT
    chain; the highest FAT entry that is not free then gives the last sector
    belonging to the file. Extraction is delegated to ``7z``.
    """

    NAME = "msi"

    PATTERNS = [HexString("D0 CF 11 E0 A1 B1 1A E1")]
    C_DEFINITIONS = r"""
        typedef struct cfbf_header
        {
            // [offset from start (bytes), length (bytes)]
            uint8 signature[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
                                // 0x1a, 0xe1} for current version
            uint8 clsid[16]; // [08H,16] reserved must be zero (WriteClassStg/
                             // GetClassFile uses root directory class id)
            uint16 minorVersion; // [18H,02] minor version of the format: 33 is
                                 // written by reference implementation
            uint16 dllVersion; // [1AH,02] major version of the dll/format: 3 for
                               // 512-byte sectors, 4 for 4 KB sectors
            uint16 byteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
            uint16 sectorShift; // [1EH,02] size of sectors in power-of-two;
                                // typically 9 indicating 512-byte sectors
            uint16 miniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
                                    // typically 6 indicating 64-byte mini-sectors
            uint16 reserved; // [22H,02] reserved, must be zero
            uint32 reserved1; // [24H,04] reserved, must be zero
            uint32 csectDir; // [28H,04] must be zero for 512-byte sectors,
                             // number of SECTs in directory chain for 4 KB
                             // sectors
            uint32 csectFat; // [2CH,04] number of SECTs in the FAT chain
            uint32 sectDirStart; // [30H,04] first SECT in the directory chain
            uint32 txSignature; // [34H,04] signature used for transactions; must
                                // be zero. The reference implementation
                                // does not support transactions
            uint32 miniSectorCutoff; // [38H,04] maximum size for a mini stream;
                                     // typically 4096 bytes
            uint32 sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
            uint32 csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
            uint32 sectDifStart; // [44H,04] first SECT in the DIFAT chain
            uint32 csectDif; // [48H,04] number of SECTs in the DIFAT chain
            uint32 sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
        } cfbf_header_t;
    """
    HEADER_STRUCT = "cfbf_header_t"

    EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

    DOC = HandlerDoc(
        name="MSI",
        description="Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.",
        handler_type=HandlerType.ARCHIVE,
        vendor="Microsoft",
        references=[
            Reference(
                title="MSI File Format Documentation",
                url="https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer",
            ),
            Reference(
                title="Compound File Binary Format",
                url="https://en.wikipedia.org/wiki/Compound_File_Binary_Format",
            ),
        ],
        limitations=[
            "Limited to CFB based extraction, not full-on MSI extraction",
            "Extracted files have names coming from CFB internal representation, and may not correspond to the one they would have on disk after running the installer",
        ],
    )

    def _read_sector(
        self, file: File, start_offset: int, sector_size: int, sector_id: int
    ) -> bytes:
        """Return the raw bytes of sector ``sector_id``.

        Raises:
            InvalidInputFormat: if the sector starts past the end of the file
                or fewer than ``sector_size`` bytes could be read.
        """
        # The header occupies one full sector-sized slot, so sector 0 begins
        # exactly one sector after the start of the file.
        position = start_offset + (sector_id + 1) * sector_size
        if position > file.size():
            raise InvalidInputFormat("Invalid MSI file, sector offset too large")

        file.seek(position, io.SEEK_SET)
        payload = file.read(sector_size)
        if len(payload) == sector_size:
            return payload

        raise InvalidInputFormat("Invalid MSI file, sector shorter than expected")

    def _append_fat_sector(
        self, fat_sectors: list[int], sector_id: int, required_count: int
    ) -> bool:
        """Record ``sector_id`` in ``fat_sectors`` unless it is a free entry.

        Returns:
            True once ``fat_sectors`` holds at least ``required_count`` ids,
            False otherwise (including when the entry was free and skipped).
        """
        if sector_id != FREE_SECTOR:
            fat_sectors.append(sector_id)
            return len(fat_sectors) >= required_count

        return False

    def _extend_fat_from_difat(
        self,
        file: File,
        header,
        start_offset: int,
        sector_size: int,
        entries_per_sector: int,
        fat_sectors: list[int],
    ) -> None:
        """Append FAT sector ids found in the DIFAT chain to ``fat_sectors``.

        The walk visits at most ``header.csectDif`` DIFAT sectors and stops
        early on a chain terminator or as soon as ``header.csectFat`` ids have
        been collected.
        """
        chain_terminators = (FREE_SECTOR, END_OF_CHAIN)
        unpack_format = f"<{entries_per_sector}I"

        next_difat = header.sectDifStart
        remaining = header.csectDif
        while remaining > 0 and next_difat not in chain_terminators:
            remaining -= 1

            difat_entries = struct.unpack(
                unpack_format,
                self._read_sector(file, start_offset, sector_size, next_difat),
            )

            # The last entry links to the next DIFAT sector; every other entry
            # is a FAT sector id.
            next_difat = difat_entries[-1]
            for fat_id in difat_entries[:-1]:
                if self._append_fat_sector(
                    fat_sectors, fat_id, required_count=header.csectFat
                ):
                    return

    def _collect_fat_sectors(
        self,
        file: File,
        header,
        start_offset: int,
        sector_size: int,
        entries_per_sector: int,
    ) -> list[int]:
        """Return the ids of all FAT sectors announced by the header.

        Ids come first from the header's ``sectFat`` array, then from the
        DIFAT chain when the header alone does not provide enough of them.

        Raises:
            InvalidInputFormat: if the number of ids found differs from
                ``header.csectFat``.
        """
        collected: list[int] = []

        # any() stops at the first call reporting that enough ids were found.
        if any(
            self._append_fat_sector(collected, sector_id, header.csectFat)
            for sector_id in header.sectFat
        ):
            return collected

        if len(collected) < header.csectFat:
            self._extend_fat_from_difat(
                file, header, start_offset, sector_size, entries_per_sector, collected
            )

        if len(collected) != header.csectFat:
            raise InvalidInputFormat("Invalid MSI file, incomplete FAT chain")

        return collected

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        """Compute the extent of the compound file starting at ``start_offset``.

        Returns:
            A chunk covering the header sector plus every sector up to the
            highest one whose FAT entry is not free.

        Raises:
            InvalidInputFormat: if the sector size is smaller than the header,
                the FAT chain is empty or incomplete, or a sector cannot be
                read in full.
        """
        file.seek(start_offset, io.SEEK_SET)
        header = self.parse_header(file)

        sector_size = 1 << header.sectorShift
        if sector_size < HEADER_SIZE:
            raise InvalidInputFormat("Invalid MSI file, sector smaller than header")

        if header.csectFat == 0:
            raise InvalidInputFormat("Invalid MSI file, FAT chain is empty")

        # Each FAT/DIFAT entry is a 32-bit little-endian sector id.
        entries_per_sector = sector_size // 4
        unpack_format = f"<{entries_per_sector}I"

        fat_sectors = self._collect_fat_sectors(
            file, header, start_offset, sector_size, entries_per_sector
        )

        highest_used = 0
        for fat_index, fat_sector_id in enumerate(fat_sectors):
            fat_entries = struct.unpack(
                unpack_format,
                self._read_sector(file, start_offset, sector_size, fat_sector_id),
            )

            # Scan backwards for the last entry of this FAT sector that is in use.
            last_used = next(
                (
                    position
                    for position in reversed(range(len(fat_entries)))
                    if fat_entries[position] != FREE_SECTOR
                ),
                None,
            )
            if last_used is not None:
                highest_used = max(
                    highest_used, fat_index * entries_per_sector + last_used
                )

        # One sector-sized slot for the header, then sectors 0..highest_used.
        end_offset = start_offset + (highest_used + 2) * sector_size

        return ValidChunk(start_offset=start_offset, end_offset=end_offset)

python/unblob/processing.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@
5454
DEFAULT_PROCESS_NUM = multiprocessing.cpu_count()
5555
DEFAULT_SKIP_MAGIC = (
5656
"BFLT",
57-
"Composite Document File V2 Document",
5857
"Erlang BEAM file",
5958
"GIF",
6059
"GNU message catalog",

tests/handlers/archive/test_msi.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import struct
2+
3+
import pytest
4+
5+
from unblob.file_utils import File
6+
from unblob.handlers.archive.msi import (
7+
END_OF_CHAIN,
8+
FREE_SECTOR,
9+
MsiHandler,
10+
)
11+
12+
13+
def _build_msi_with_sector_shift(sector_shift: int) -> bytes:
    """Build a minimal compound file: one header sector followed by one FAT sector.

    The header announces a single FAT sector (sector 0) and no DIFAT or
    MiniFAT; the FAT marks sector 0 as end-of-chain and every other entry as
    free.
    """
    sector_size = 1 << sector_shift
    entries_per_sector = sector_size // 4

    header = bytearray(sector_size)
    header[:8] = bytes.fromhex("D0 CF 11 E0 A1 B1 1A E1")

    # Offsets and values taken from the CFBF header specification
    struct.pack_into(
        "<HHHHHH",
        header,
        0x18,
        0x0033,
        4 if sector_shift >= 12 else 3,  # dllVersion
        0xFFFE,
        sector_shift,
        6,
        0,
    )
    for offset, value in (
        (0x2C, 1),  # csectFat
        (0x38, 4096),  # miniSectorCutoff
        (0x3C, FREE_SECTOR),  # sectMiniFatStart
        (0x44, FREE_SECTOR),  # sectDifStart
    ):
        struct.pack_into("<I", header, offset, value)

    # sectFat[109]: the first FAT sector is sector 0, the other slots are free.
    struct.pack_into("<109I", header, 0x4C, 0, *([FREE_SECTOR] * 108))

    fat_sector = struct.pack(
        f"<{entries_per_sector}I",
        END_OF_CHAIN,
        *([FREE_SECTOR] * (entries_per_sector - 1)),
    )

    return bytes(header) + fat_sector
49+
50+
51+
@pytest.mark.parametrize("sector_shift", [9, 12])
def test_calculate_chunk_respects_sector_size(sector_shift: int):
    """The computed chunk must span exactly the header sector plus the FAT sector."""
    prefix = b"prefix"
    msi_content = _build_msi_with_sector_shift(sector_shift)
    expected_start = len(prefix)
    expected_end = expected_start + len(msi_content)

    # The prefix ensures that offsets are computed relative to the chunk start.
    carved_from = File.from_bytes(prefix + msi_content)
    chunk = MsiHandler().calculate_chunk(carved_from, expected_start)

    assert chunk is not None
    assert chunk.start_offset == expected_start
    assert chunk.end_offset == expected_end
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:dce9e456ace76b969fe0fe4d228bf096662c11d2376d99a9210f6364428a94c4
3+
size 1563648
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:da8f4120ab4ffacb19067a26f6a8b2695e00ec19bcc48ff694349c62df1b330b
3+
size 1563680
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:aa8e5036d973688f1e8622fbe9ab22e037346e0def0197bf5e7cdf37da4e223d
3+
size 3831808
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:12c87c542e1d4a39b47f176ffa5fd1691c98e5f9d502e6e46573962fb77c4510
3+
size 3831840
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:374708fff7719dd5979ec875d56cd2286f6d3cf7ec317a3b25632aab28ec37bb
3+
size 16

0 commit comments

Comments
 (0)