Skip to content

Commit 439dc3e

Browse files
jcrussell authored and qkaiser committed
feat(handler): add support for MSI files
Extracts MSIs using 7z, with custom CFBF header parsing to compute the full archive size. Works on both vanilla and padded MSI files.

This could be migrated to a fully Python-based implementation in the future using:

* https://github.com/nightlark/pymsi
* https://github.com/decalage2/olefile

As of v0.47, olefile does not handle padded MSIs properly, so we re-implement CFBF header parsing and compute the archive size ourselves.
1 parent 9e13e9d commit 439dc3e

File tree

352 files changed

+1192
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

352 files changed

+1192
-1
lines changed

docs/handlers.md

Lines changed: 18 additions & 0 deletions

python/unblob/handlers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
cab,
77
cpio,
88
dmg,
9+
msi,
910
par2,
1011
partclone,
1112
rar,
@@ -89,6 +90,7 @@
8990
arc.ARCHandler,
9091
arj.ARJHandler,
9192
cab.CABHandler,
93+
msi.MsiHandler,
9294
tar.TarUstarHandler,
9395
tar.TarUnixHandler,
9496
cpio.PortableASCIIHandler,
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import io
2+
import struct
3+
from typing import Optional
4+
5+
from structlog import get_logger
6+
7+
from unblob.extractors import Command
8+
from ...file_utils import InvalidInputFormat
9+
from ...models import (
10+
File,
11+
HandlerDoc,
12+
HandlerType,
13+
HexString,
14+
Reference,
15+
StructHandler,
16+
ValidChunk,
17+
)
18+
19+
logger = get_logger()
20+
21+
22+
class MsiHandler(StructHandler):
    """Detect MSI (Compound File Binary Format) containers and extract them with 7z.

    The chunk size is not stored in the header, so it is derived from the
    highest sector that the File Allocation Table (FAT) marks as in use.
    """

    NAME = "msi"

    PATTERNS = [HexString("D0 CF 11 E0 A1 B1 1A E1")]
    C_DEFINITIONS = r"""
        typedef struct cfbf_header
        {
            // [offset from start (bytes), length (bytes)]
            uint8 signature[8];      // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
                                     // 0x1a, 0xe1} for current version
            uint8 clsid[16];         // [08H,16] reserved must be zero (WriteClassStg/
                                     // GetClassFile uses root directory class id)
            uint16 minorVersion;     // [18H,02] minor version of the format: 33 is
                                     // written by reference implementation
            uint16 dllVersion;       // [1AH,02] major version of the dll/format: 3 for
                                     // 512-byte sectors, 4 for 4 KB sectors
            uint16 byteOrder;        // [1CH,02] 0xFFFE: indicates Intel byte-ordering
            uint16 sectorShift;      // [1EH,02] size of sectors in power-of-two;
                                     // typically 9 indicating 512-byte sectors
            uint16 miniSectorShift;  // [20H,02] size of mini-sectors in power-of-two;
                                     // typically 6 indicating 64-byte mini-sectors
            uint16 reserved;         // [22H,02] reserved, must be zero
            uint32 reserved1;        // [24H,04] reserved, must be zero
            uint32 csectDir;         // [28H,04] must be zero for 512-byte sectors,
                                     // number of SECTs in directory chain for 4 KB
                                     // sectors
            uint32 csectFat;         // [2CH,04] number of SECTs in the FAT chain
            uint32 sectDirStart;     // [30H,04] first SECT in the directory chain
            uint32 txSignature;      // [34H,04] signature used for transactions; must
                                     // be zero. The reference implementation
                                     // does not support transactions
            uint32 miniSectorCutoff; // [38H,04] maximum size for a mini stream;
                                     // typically 4096 bytes
            uint32 sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
            uint32 csectMiniFat;     // [40H,04] number of SECTs in the MiniFAT chain
            uint32 sectDifStart;     // [44H,04] first SECT in the DIFAT chain
            uint32 csectDif;         // [48H,04] number of SECTs in the DIFAT chain
            uint32 sectFat[109];     // [4CH,436] the SECTs of first 109 FAT sectors
        } cfbf_header_t;
    """
    HEADER_STRUCT = "cfbf_header_t"

    EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

    DOC = HandlerDoc(
        name="MSI",
        description="Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.",
        handler_type=HandlerType.ARCHIVE,
        vendor="Microsoft",
        references=[
            Reference(
                title="MSI File Format Documentation",
                url="https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer",
            ),
            Reference(
                title="Compound File Binary Format",
                url="https://en.wikipedia.org/wiki/Compound_File_Binary_Format",
            ),
        ],
        limitations=[],
    )

    # Special sector ids defined by the compound file format.
    _FREESECT = 0xFFFFFFFF  # slot / sector is unused
    _MAXREGSECT = 0xFFFFFFFA  # largest id that refers to a real sector
    # Sector shifts allowed by the format: 512-byte (v3) and 4096-byte (v4) sectors.
    _VALID_SECTOR_SHIFTS = (9, 12)

    def _read_sector_entries(
        self, file: File, start_offset: int, sector_id: int, sector_size: int
    ) -> tuple:
        """Read sector ``sector_id`` and return it as a tuple of little-endian uint32.

        Raises:
            InvalidInputFormat: if the sector does not lie entirely within the file.
        """
        # The header always occupies one full sector (it is zero-padded when
        # sectors are larger than 512 bytes), so sector N starts N + 1 sectors
        # after the beginning of the compound file.
        sector_offset = start_offset + (sector_id + 1) * sector_size
        # Require the whole sector to be present so the unpack below can never
        # see a short buffer.
        if sector_offset + sector_size > file.size():
            raise InvalidInputFormat("Invalid MSI file, sector offset too large")
        file.seek(sector_offset, io.SEEK_SET)
        raw_sector = file.read(sector_size)
        return struct.unpack(f"<{sector_size // 4}I", raw_sector)

    def _iter_fat_sectors(self, file: File, start_offset: int, header, sector_size: int):
        """Yield ``(slot_index, sector_id)`` for every DIFAT slot, in order.

        The first 109 slots live in the header; any further slots are stored
        in the chain of DIFAT sectors starting at ``header.sectDifStart``.
        Unused slots are yielded too (as ``_FREESECT``) so that ``slot_index``
        keeps matching the position of the FAT sector in the table.

        Raises:
            InvalidInputFormat: if the DIFAT chain loops or leaves the file.
        """
        slot_index = 0
        for sect in header.sectFat:
            yield slot_index, sect
            slot_index += 1

        difat_sector = header.sectDifStart
        visited = set()
        # Trust neither field alone: stop on the declared count or on the
        # first id that is not a regular sector (ENDOFCHAIN / FREESECT).
        while len(visited) < header.csectDif and difat_sector <= self._MAXREGSECT:
            if difat_sector in visited:
                raise InvalidInputFormat("Invalid MSI file, DIFAT chain loop")
            visited.add(difat_sector)

            entries = self._read_sector_entries(
                file, start_offset, difat_sector, sector_size
            )
            # The last entry of a DIFAT sector links to the next DIFAT sector;
            # all preceding entries are FAT sector ids.
            for sect in entries[:-1]:
                yield slot_index, sect
                slot_index += 1
            difat_sector = entries[-1]

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        """Compute the extent of the compound file starting at ``start_offset``.

        Every FAT sector referenced from the header and the DIFAT chain is
        scanned for its highest non-free entry; the chunk ends right after
        the highest sector found that way.

        Raises:
            InvalidInputFormat: if the sector size is not one the format
                allows, a referenced sector lies outside the file, or no
                sector is marked as used.
        """
        file.seek(start_offset, io.SEEK_SET)
        header = self.parse_header(file)

        if header.sectorShift not in self._VALID_SECTOR_SHIFTS:
            raise InvalidInputFormat("Invalid MSI file, unsupported sector size")

        sector_size = 1 << header.sectorShift
        entries_per_sector = sector_size // 4

        max_used_sector = -1

        for slot_index, fat_sector in self._iter_fat_sectors(
            file, start_offset, header, sector_size
        ):
            # skip empty
            if fat_sector == self._FREESECT:
                continue

            entries = self._read_sector_entries(
                file, start_offset, fat_sector, sector_size
            )

            # FAT sector number `slot_index` describes this range of sector ids.
            base_sector_id = slot_index * entries_per_sector
            # Scan backwards: the first non-free entry is the highest used
            # sector id covered by this FAT sector.
            for entry_id in range(entries_per_sector - 1, -1, -1):
                if entries[entry_id] != self._FREESECT:
                    max_used_sector = max(max_used_sector, base_sector_id + entry_id)
                    break

        if max_used_sector < 0:
            raise InvalidInputFormat("Invalid MSI file, no allocated sectors")

        # One sector for the header plus sectors 0..max_used_sector.
        total_size = (max_used_sector + 2) * sector_size

        return ValidChunk(
            start_offset=start_offset,
            end_offset=start_offset + total_size,
        )

python/unblob/processing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@
5454
DEFAULT_PROCESS_NUM = multiprocessing.cpu_count()
5555
DEFAULT_SKIP_MAGIC = (
5656
"BFLT",
57-
"Composite Document File V2 Document",
57+
# Disabled for MSI files
58+
# "Composite Document File V2 Document",
5859
"Erlang BEAM file",
5960
"GIF",
6061
"GNU message catalog",
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:dce9e456ace76b969fe0fe4d228bf096662c11d2376d99a9210f6364428a94c4
3+
size 1563648
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:da8f4120ab4ffacb19067a26f6a8b2695e00ec19bcc48ff694349c62df1b330b
3+
size 1563680
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:aa8e5036d973688f1e8622fbe9ab22e037346e0def0197bf5e7cdf37da4e223d
3+
size 3831808
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:12c87c542e1d4a39b47f176ffa5fd1691c98e5f9d502e6e46573962fb77c4510
3+
size 3831840
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:374708fff7719dd5979ec875d56cd2286f6d3cf7ec317a3b25632aab28ec37bb
3+
size 16
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:374708fff7719dd5979ec875d56cd2286f6d3cf7ec317a3b25632aab28ec37bb
3+
size 16

0 commit comments

Comments
 (0)