1818https://py7zr.readthedocs.io/en/latest/archive_format.html
1919"""
2020import binascii
21+ from pathlib import Path
2122from typing import Optional
2223
2324from structlog import get_logger
2425
2526from unblob .extractors import Command
2627
27- from ...models import File , HexString , StructHandler , ValidChunk
28+ from ...extractors .command import MultiFileCommand
29+ from ...file_utils import Endian , InvalidInputFormat , StructParser
30+ from ...models import (
31+ DirectoryHandler ,
32+ File ,
33+ Glob ,
34+ HexString ,
35+ MultiFile ,
36+ StructHandler ,
37+ ValidChunk ,
38+ )
2839
2940logger = get_logger ()
3041
# C-style layout of the 7z signature header, handed to StructParser below.
C_DEFINITIONS = r"""
    typedef struct sevenzip_header {
        char magic[6];
        uint8 version_maj;
        uint8 version_min;
        uint32 crc;
        uint64 next_header_offset;
        uint64 next_header_size;
        uint32 next_header_crc;
    } sevenzip_header_t;
"""
HEADER_STRUCT = "sevenzip_header_t"

# StartHeader (next_header_offset, next_header_size, next_header_crc)
START_HEADER_SIZE = 8 + 8 + 4

# magic + version_maj + version_min + crc, followed by the StartHeader (32 bytes)
HEADER_SIZE = 6 + 1 + 1 + 4 + START_HEADER_SIZE

# Module-level parser so code outside StructHandler can decode the header too
HEADER_PARSER = StructParser(C_DEFINITIONS)
3360
3461
# 7z signature bytes: '7', 'z', 0xBC, 0xAF, 0x27, 0x1C.
# Must be exactly 6 bytes so it can compare equal to the header's ``magic[6]`` field.
SEVENZIP_MAGIC = b"7z\xbc\xaf\x27\x1c"
63+
64+
def check_header_crc(header):
    """Verify the CRC stored in a parsed 7z signature header.

    The checksum is computed over the last ``START_HEADER_SIZE`` bytes of the
    serialized header and compared with ``header.crc``.

    Raises:
        InvalidInputFormat: if the stored and calculated CRC values differ.
    """
    # CRC includes the StartHeader (next_header_offset, next_header_size, next_header_crc)
    # CPP/7zip/Archive/7z/7zOut.cpp COutArchive::WriteStartHeader
    start_header = header.dumps()[-START_HEADER_SIZE:]
    if binascii.crc32(start_header) != header.crc:
        raise InvalidInputFormat("Invalid sevenzip header CRC")
71+
72+
def calculate_sevenzip_size(header) -> int:
    """Return the archive size in bytes implied by a parsed signature header.

    The result is the length of the signature header itself plus the offset
    and the size of the next header recorded in it.
    """
    signature_header_len = len(header)
    next_header_end = header.next_header_offset + header.next_header_size
    return signature_header_len + next_header_end
75+
76+
3577class SevenZipHandler (StructHandler ):
3678 NAME = "sevenzip"
3779
@@ -43,31 +85,48 @@ class SevenZipHandler(StructHandler):
4385 """
4486 )
4587 ]
46- C_DEFINITIONS = r"""
47- typedef struct sevenzip_header {
48- char magic[6];
49- uint8 version_maj;
50- uint8 version_min;
51- uint32 crc;
52- uint64 next_header_offset;
53- uint64 next_header_size;
54- uint32 next_header_crc;
55- } sevenzip_header_t;
56- """
57- HEADER_STRUCT = "sevenzip_header_t"
88+ C_DEFINITIONS = C_DEFINITIONS
89+ HEADER_STRUCT = HEADER_STRUCT
5890 EXTRACTOR = Command ("7z" , "x" , "-p" , "-y" , "{inpath}" , "-o{outdir}" )
5991
6092 def calculate_chunk (self , file : File , start_offset : int ) -> Optional [ValidChunk ]:
6193 header = self .parse_header (file )
6294
63- # CRC includes the StartHeader (next_header_offset, next_header_size, next_header_crc)
64- # CPP/7zip/Archive/7z/7zOut.cpp COutArchive::WriteStartHeader
65- calculated_crc = binascii .crc32 (header .dumps ()[- START_HEADER_SIZE :])
66- if header .crc != calculated_crc :
67- logger .debug ("Invalid header CRC" , _verbosity = 2 )
95+ check_header_crc (header )
96+
97+ size = calculate_sevenzip_size (header )
98+
99+ return ValidChunk (start_offset = start_offset , end_offset = start_offset + size )
100+
101+
class MultiVolumeSevenZipHandler(DirectoryHandler):
    """Handler for 7z archives split into numbered volumes.

    Candidate first volumes are selected with the ``*.7z.001`` glob; the
    remaining volumes are the sibling files sharing the same stem.
    """

    NAME = "multi-sevenzip"
    EXTRACTOR = MultiFileCommand("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

    PATTERN = Glob("*.7z.001")

    def calculate_multifile(self, file: Path) -> Optional[MultiFile]:
        """Validate ``file`` as the first volume and collect all of its volumes.

        Args:
            file: path of the candidate first volume (``<name>.7z.001``).

        Returns:
            A ``MultiFile`` listing every volume in sorted order, or ``None``
            when the file is too short, does not start with the 7z signature,
            or the combined size of the volumes differs from the size
            announced by the header.

        Raises:
            InvalidInputFormat: propagated from ``check_header_crc`` when the
                header CRC does not match.
        """
        # Local import: only the pattern escaping below needs it.
        import glob

        with file.open("rb") as f:
            header_data = f.read(HEADER_SIZE)

        # A short read means the file cannot contain a complete signature header.
        if len(header_data) < HEADER_SIZE:
            return None

        header = HEADER_PARSER.parse(HEADER_STRUCT, header_data, Endian.LITTLE)
        if header.magic != SEVENZIP_MAGIC:
            return None

        check_header_crc(header)
        size = calculate_sevenzip_size(header)
        logger.debug("Sevenzip header", header=header, size=size, _verbosity=3)

        # Escape the stem so glob metacharacters in the file name ("[", "?", "*")
        # are matched literally, and keep only real files: is_file() is False for
        # directories and dangling symlinks, which would otherwise break stat().
        volume_pattern = f"{glob.escape(file.stem)}.*"
        paths = sorted(
            path for path in file.parent.glob(volume_pattern) if path.is_file()
        )

        files_size = sum(path.stat().st_size for path in paths)
        logger.debug(
            "Multi-volume files", paths=paths, files_size=files_size, _verbosity=2
        )
        # The volumes concatenated must be exactly as long as the header says.
        if files_size != size:
            return None

        return MultiFile(
            name=file.stem,
            paths=paths,
        )
0 commit comments