Skip to content

Commit 83f1eab

Browse files
committed
feat(handler): add multi-volume sevenzip directory handler
1 parent 04b69a5 commit 83f1eab

File tree

7 files changed

+97
-23
lines changed

7 files changed

+97
-23
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:c17f34d380545b1f139b52486f48b1852a9e74c2079a8e0338b0b7600a720fd6
3+
size 10240
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:a89ae76a6a3624af5eef2bbebe3ac0ac9916d66f3a65b055a4931f28065fd55e
3+
size 100
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:f3af8f79806dc5059ce0c746906f6d4c1f4e0206abd5e1742f8a8215bf6ebae0
3+
size 81
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:47d741b6059c6d7e99be25ce46fb9ba099cfd6515de1ef7681f93479d25996a4
3+
size 9
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:e0763097d2327a89fb7fc6a1fad40f87d2261dcdd6c09e65ee00b200a0128e1c
3+
size 9

unblob/handlers/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,4 +100,4 @@
100100
engenius.EngeniusHandler,
101101
)
102102

103-
BUILTIN_DIR_HANDLERS: DirectoryHandlers = ()
103+
BUILTIN_DIR_HANDLERS: DirectoryHandlers = (sevenzip.MultiVolumeSevenZipHandler,)

unblob/handlers/archive/sevenzip.py

Lines changed: 81 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,62 @@
1818
https://py7zr.readthedocs.io/en/latest/archive_format.html
1919
"""
2020
import binascii
21+
from pathlib import Path
2122
from typing import Optional
2223

2324
from structlog import get_logger
2425

2526
from unblob.extractors import Command
2627

27-
from ...models import File, HexString, StructHandler, ValidChunk
28+
from ...extractors.command import MultiFileCommand
29+
from ...file_utils import Endian, InvalidInputFormat, StructParser
30+
from ...models import (
31+
DirectoryHandler,
32+
File,
33+
Glob,
34+
HexString,
35+
MultiFile,
36+
StructHandler,
37+
ValidChunk,
38+
)
2839

2940
logger = get_logger()
3041

42+
# Layout of the fixed-size 7-Zip signature header found at the very start of an
# archive (or of the first volume of a multi-volume archive).
C_DEFINITIONS = r"""
    typedef struct sevenzip_header {
        char magic[6];
        uint8 version_maj;
        uint8 version_min;
        uint32 crc;
        uint64 next_header_offset;
        uint64 next_header_size;
        uint32 next_header_crc;
    } sevenzip_header_t;
"""
HEADER_STRUCT = "sevenzip_header_t"
# Byte size of sevenzip_header_t: the field widths above, added in declaration order.
HEADER_SIZE = 6 + 1 + 1 + 4 + 8 + 8 + 4

# Module-level parser so the header can be parsed outside of a StructHandler.
HEADER_PARSER = StructParser(C_DEFINITIONS)

# StartHeader (next_header_offset, next_header_size, next_header_crc)
START_HEADER_SIZE = 8 + 8 + 4


# Expected value of the header's 6-byte `magic` field.
SEVENZIP_MAGIC = b"7z\xbc\xaf\x27\x1c"
63+
64+
65+
def check_header_crc(header):
    """Verify the CRC stored in a parsed 7-Zip signature header.

    The checksum is computed over the trailing ``START_HEADER_SIZE`` bytes of
    the serialized header and compared with ``header.crc``.

    Raises:
        InvalidInputFormat: if the stored and computed CRC values differ.
    """
    # CRC includes the StartHeader (next_header_offset, next_header_size, next_header_crc)
    # CPP/7zip/Archive/7z/7zOut.cpp COutArchive::WriteStartHeader
    start_header = header.dumps()[-START_HEADER_SIZE:]
    if binascii.crc32(start_header) != header.crc:
        raise InvalidInputFormat("Invalid sevenzip header CRC")
71+
72+
73+
def calculate_sevenzip_size(header) -> int:
    """Return the total archive size in bytes described by the signature header.

    The size is the length of the signature header itself, plus the offset to
    the next header, plus the size of that next header.
    """
    signature_header_len = len(header)
    next_header_end = header.next_header_offset + header.next_header_size
    return signature_header_len + next_header_end
75+
76+
3577
class SevenZipHandler(StructHandler):
3678
NAME = "sevenzip"
3779

@@ -43,31 +85,48 @@ class SevenZipHandler(StructHandler):
4385
"""
4486
)
4587
]
46-
C_DEFINITIONS = r"""
47-
typedef struct sevenzip_header {
48-
char magic[6];
49-
uint8 version_maj;
50-
uint8 version_min;
51-
uint32 crc;
52-
uint64 next_header_offset;
53-
uint64 next_header_size;
54-
uint32 next_header_crc;
55-
} sevenzip_header_t;
56-
"""
57-
HEADER_STRUCT = "sevenzip_header_t"
88+
C_DEFINITIONS = C_DEFINITIONS
89+
HEADER_STRUCT = HEADER_STRUCT
5890
EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")
5991

6092
def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
    """Compute the chunk covered by the 7-Zip archive starting at ``start_offset``.

    The signature header is parsed and CRC-checked, and the end offset is
    derived from the archive size the header describes.

    Raises:
        InvalidInputFormat: propagated from ``check_header_crc`` when the
            header CRC does not match.
    """
    header = self.parse_header(file)

    # Reject candidates whose signature header is corrupt before trusting
    # any of the offsets stored in it.
    check_header_crc(header)

    end_offset = start_offset + calculate_sevenzip_size(header)
    return ValidChunk(start_offset=start_offset, end_offset=end_offset)
100+
101+
102+
class MultiVolumeSevenZipHandler(DirectoryHandler):
    """Directory handler that groups the volumes of a multi-volume 7-Zip archive.

    A candidate is any file matching ``*.7z.001``. It is accepted only when its
    signature header is valid and the combined size of all sibling volumes
    equals the archive size described by that header.
    """

    NAME = "multi-sevenzip"
    EXTRACTOR = MultiFileCommand("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

    PATTERN = Glob("*.7z.001")

    def calculate_multifile(self, file: Path) -> Optional[MultiFile]:
        """Return the set of volumes that ``file`` is the first part of.

        Args:
            file: path of the first volume (matched by ``PATTERN``).

        Returns:
            A ``MultiFile`` listing every volume in sorted order, or ``None``
            when ``file`` is too short to hold a signature header, does not
            start with the 7-Zip magic, or the volumes' combined size does not
            match the size computed from the header.

        Raises:
            InvalidInputFormat: propagated from ``check_header_crc`` when the
                header CRC does not match.
        """
        with file.open("rb") as f:
            header_data = f.read(HEADER_SIZE)

        # A truncated first volume cannot contain a complete signature header;
        # reject it here instead of handing a short buffer to the struct parser.
        if len(header_data) < HEADER_SIZE:
            return None

        header = HEADER_PARSER.parse(HEADER_STRUCT, header_data, Endian.LITTLE)
        if header.magic != SEVENZIP_MAGIC:
            return None

        check_header_crc(header)
        size = calculate_sevenzip_size(header)
        logger.debug("Sevenzip header", header=header, size=size, _verbosity=3)

        # Collect siblings by literal name prefix rather than a glob built from
        # the file name, so names containing glob metacharacters ("[", "*",
        # "?") still match their own volumes. Only existing regular files are
        # kept: dangling symlinks would make stat() raise, and directories
        # would distort the size comparison below.
        volume_prefix = f"{file.stem}."
        paths = sorted(
            path
            for path in file.parent.iterdir()
            if path.name.startswith(volume_prefix) and path.is_file()
        )

        files_size = sum(path.stat().st_size for path in paths)
        logger.debug(
            "Multi-volume files", paths=paths, files_size=files_size, _verbosity=2
        )
        # The volumes are expected to be the archive split into pieces, so
        # their sizes must add up exactly to the size stated by the header.
        if files_size != size:
            return None

        return MultiFile(
            name=file.stem,
            paths=paths,
        )

0 commit comments

Comments
 (0)