Skip to content

Commit 04b69a5

Browse files
committed
feat(processing): add multi-file directory handler
There are certain formats where the content is split between multiple files. Currently unblob operates under the assumption that all content resides in a single file. A few examples where this might be relevant: - multi-volume archives, such as 7zip, rar, etc. - VM snapshots - content + index type formats. This change introduces a DirectoryHandler which can operate on multiple files residing in one directory, or at least under one subtree. For most formats there is a "main" file which can be identified by a directory file name pattern. Using this first file, the handler can identify the other files and return a MultiFile object, similar to ValidChunks. We do not support cases where a single file is part of multiple MultiFiles. Also, a file processed & extracted in the context of a MultiFile is not processed by traditional handlers. There is also no carving step; rather, the files are extracted directly into an extraction directory. The original files are kept and never deleted, as these are normal files, unlike carved-out temporary chunks. Files extracted from a MultiFile have a MultiFile as their parent. This required extending the current File -> Chunk reporting concept by introducing an abstract Blob type which is the parent of both Chunk and MultiFile. MultiFileReports are reported under the directory Task, but contain all included file paths as well.
1 parent 11a88ab commit 04b69a5

File tree

17 files changed

+915
-139
lines changed

17 files changed

+915
-139
lines changed

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,5 @@
1111

1212
@pytest.fixture
1313
def task_result():
14-
task = Task(path=Path("/nonexistent"), depth=0, chunk_id="")
14+
task = Task(path=Path("/nonexistent"), depth=0, blob_id="")
1515
return TaskResult(task)

tests/test_cli.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
from pathlib import Path
2-
from typing import List
2+
from typing import List, Optional
33
from unittest import mock
44

55
import pytest
66
from click.testing import CliRunner
77

88
import unblob.cli
99
from unblob.extractors import Command
10+
from unblob.extractors.command import MultiFileCommand
1011
from unblob.handlers import BUILTIN_HANDLERS
11-
from unblob.models import Handler, HexString
12+
from unblob.models import DirectoryHandler, Glob, Handler, HexString, MultiFile
1213
from unblob.processing import DEFAULT_DEPTH, DEFAULT_PROCESS_NUM, ExtractionConfig
1314

1415

@@ -25,33 +26,54 @@ class ExistingCommandHandler(TestHandler):
2526
EXTRACTOR = Command("sh", "something")
2627

2728

28-
def test_show_external_dependencies_exists():
29+
class TestDirHandler(DirectoryHandler):
30+
NAME = "test_dir_handler"
31+
PATTERN = Glob("*.test")
32+
EXTRACTOR = MultiFileCommand("test-multi", "for", "test", "handler")
33+
34+
def calculate_multifile(self, file: Path) -> Optional[MultiFile]:
35+
pass
36+
37+
38+
class ExistingCommandDirHandler(TestDirHandler):
39+
EXTRACTOR = MultiFileCommand("true")
40+
41+
42+
def test_show_external_dependencies_missing():
2943
handlers = (ExistingCommandHandler, TestHandler)
3044
runner = CliRunner()
3145
result = runner.invoke(
32-
unblob.cli.cli, ["--show-external-dependencies"], handlers=handlers
46+
unblob.cli.cli,
47+
["--show-external-dependencies"],
48+
handlers=handlers,
49+
dir_handlers=(TestDirHandler,),
3350
)
3451
assert result.exit_code == 1
3552
assert (
3653
result.output
3754
== """The following executables found installed, which are needed by unblob:
3855
sh ✓
56+
test-multi ✗
3957
testcommand ✗
4058
"""
4159
)
4260

4361

44-
def test_show_external_dependencies_not_exists():
62+
def test_show_external_dependencies_exists():
4563
handlers = (ExistingCommandHandler, ExistingCommandHandler)
4664
runner = CliRunner()
4765
result = runner.invoke(
48-
unblob.cli.cli, ["--show-external-dependencies"], handlers=handlers
66+
unblob.cli.cli,
67+
["--show-external-dependencies"],
68+
handlers=handlers,
69+
dir_handlers=(ExistingCommandDirHandler,),
4970
)
5071
assert result.exit_code == 0
5172
assert (
5273
result.output
5374
== """The following executables found installed, which are needed by unblob:
54-
sh ✓
75+
sh ✓
76+
true ✓
5577
"""
5678
)
5779

tests/test_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
def test_carve_unknown_chunk(tmp_path: Path):
1515
content = b"test file"
1616
test_file = File.from_bytes(content)
17-
chunk = UnknownChunk(1, 8)
17+
chunk = UnknownChunk(start_offset=1, end_offset=8)
1818
carve_unknown_chunk(tmp_path, test_file, chunk)
1919
written_path = tmp_path / "1-8.unknown"
2020
assert list(tmp_path.iterdir()) == [written_path]

tests/test_finder.py

Lines changed: 61 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -112,64 +112,105 @@ def calculate_chunk(self, file, start_offset: int):
112112
@pytest.mark.parametrize(
113113
"content, expected_chunks",
114114
[
115-
pytest.param(b"00A23450", [ValidChunk(2, 7)], id="single-chunk"),
116115
pytest.param(
117-
b"0BB34A678900", [ValidChunk(0, 10)], id="chunk-with-relative-match-offset"
116+
b"00A23450", [ValidChunk(start_offset=2, end_offset=7)], id="single-chunk"
117+
),
118+
pytest.param(
119+
b"0BB34A678900",
120+
[ValidChunk(start_offset=0, end_offset=10)],
121+
id="chunk-with-relative-match-offset",
118122
),
119123
pytest.param(
120124
b"A23450BB3456789",
121-
[ValidChunk(0, 5), ValidChunk(5, 15)],
125+
[
126+
ValidChunk(start_offset=0, end_offset=5),
127+
ValidChunk(start_offset=5, end_offset=15),
128+
],
122129
id="multiple-chunk",
123130
),
124-
pytest.param(b"0BC34A67890", [ValidChunk(0, 10)], id="inner-chunk-ignored"),
131+
pytest.param(
132+
b"0BC34A67890",
133+
[ValidChunk(start_offset=0, end_offset=10)],
134+
id="inner-chunk-ignored",
135+
),
125136
pytest.param(
126137
b"0BC34A67890A2345",
127-
[ValidChunk(0, 10), ValidChunk(11, 16)],
138+
[
139+
ValidChunk(start_offset=0, end_offset=10),
140+
ValidChunk(start_offset=11, end_offset=16),
141+
],
128142
id="inner-chunk-ignored-scan-continues",
129143
),
130-
pytest.param(b"A23450BB34", [ValidChunk(0, 5)], id="overflowing-chunk-ignored"),
144+
pytest.param(
145+
b"A23450BB34",
146+
[ValidChunk(start_offset=0, end_offset=5)],
147+
id="overflowing-chunk-ignored",
148+
),
131149
pytest.param(
132150
b"0BBA2345",
133-
[ValidChunk(3, 8)],
151+
[ValidChunk(start_offset=3, end_offset=8)],
134152
id="overflowing-chunk-ignored-scan-continues",
135153
),
136-
pytest.param(b"A2345", [ValidChunk(0, 5)], id="whole-file-chunk"),
137-
pytest.param(b"00000A2345", [ValidChunk(5, 10)], id="chunk-till-end-of-file"),
154+
pytest.param(
155+
b"A2345", [ValidChunk(start_offset=0, end_offset=5)], id="whole-file-chunk"
156+
),
157+
pytest.param(
158+
b"00000A2345",
159+
[ValidChunk(start_offset=5, end_offset=10)],
160+
id="chunk-till-end-of-file",
161+
),
138162
pytest.param(
139163
b"BB34A678900",
140-
[ValidChunk(4, 9)],
164+
[ValidChunk(start_offset=4, end_offset=9)],
141165
id="chunk-with-invalid-relative-match-offset-ignored",
142166
),
143-
pytest.param(b"00D00A2345", [ValidChunk(5, 10)], id="invalid-chunk-ignored"),
144-
pytest.param(b"EOFA2345", [ValidChunk(3, 8)], id="eof-ignored-scan-continues"),
145167
pytest.param(
146-
b"IA2345", [ValidChunk(1, 6)], id="invalid-chunk-ignored-scan-continues"
168+
b"00D00A2345",
169+
[ValidChunk(start_offset=5, end_offset=10)],
170+
id="invalid-chunk-ignored",
171+
),
172+
pytest.param(
173+
b"EOFA2345",
174+
[ValidChunk(start_offset=3, end_offset=8)],
175+
id="eof-ignored-scan-continues",
147176
),
148177
pytest.param(
149-
b"EXCA2345", [ValidChunk(3, 8)], id="exception-ignored-scan-continues"
178+
b"IA2345",
179+
[ValidChunk(start_offset=1, end_offset=6)],
180+
id="invalid-chunk-ignored-scan-continues",
181+
),
182+
pytest.param(
183+
b"EXCA2345",
184+
[ValidChunk(start_offset=3, end_offset=8)],
185+
id="exception-ignored-scan-continues",
150186
),
151187
pytest.param(b"0", [], id="1-byte"),
152188
pytest.param(b"1234567890", [], id="no-chunk"),
153189
pytest.param(
154190
b"A2345L1" + b"1" * DEFAULT_BUFSIZE * 2,
155-
[ValidChunk(0, 5), ValidChunk(5, 5 + DEFAULT_BUFSIZE * 2)],
191+
[
192+
ValidChunk(start_offset=0, end_offset=5),
193+
ValidChunk(start_offset=5, end_offset=5 + DEFAULT_BUFSIZE * 2),
194+
],
156195
id="multi-large-chunk",
157196
),
158197
pytest.param(
159198
b"L" + b"1" * DEFAULT_BUFSIZE + b"A2345" + b"1" * DEFAULT_BUFSIZE,
160-
[ValidChunk(0, DEFAULT_BUFSIZE * 2)],
199+
[ValidChunk(start_offset=0, end_offset=DEFAULT_BUFSIZE * 2)],
161200
id="large-small-inside-ignored",
162201
),
163202
pytest.param(
164203
b"0123456789L" + b"1" * DEFAULT_BUFSIZE + b"A2345" + b"1" * DEFAULT_BUFSIZE,
165-
[ValidChunk(10, 10 + DEFAULT_BUFSIZE * 2)],
204+
[ValidChunk(start_offset=10, end_offset=10 + DEFAULT_BUFSIZE * 2)],
166205
id="padding-large-small-inside-ignored",
167206
),
168207
pytest.param(
169208
b"L" + b"1" * (DEFAULT_BUFSIZE * 2 - 1) + b"A2345" + b"1" * DEFAULT_BUFSIZE,
170209
[
171-
ValidChunk(0, DEFAULT_BUFSIZE * 2),
172-
ValidChunk(DEFAULT_BUFSIZE * 2, DEFAULT_BUFSIZE * 2 + 5),
210+
ValidChunk(start_offset=0, end_offset=DEFAULT_BUFSIZE * 2),
211+
ValidChunk(
212+
start_offset=DEFAULT_BUFSIZE * 2, end_offset=DEFAULT_BUFSIZE * 2 + 5
213+
),
173214
],
174215
id="large-small",
175216
),
@@ -192,6 +233,4 @@ def test_search_chunks(content, expected_chunks, task_result):
192233

193234
assert len(chunks) == len(expected_chunks)
194235
for expected_chunk, chunk in zip(expected_chunks, chunks):
195-
assert attr.evolve(chunk, chunk_id="") == attr.evolve(
196-
expected_chunk, chunk_id=""
197-
)
236+
assert attr.evolve(chunk, id="") == attr.evolve(expected_chunk, id="")

tests/test_models.py

Lines changed: 55 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,31 @@ class TestChunk:
1818
@pytest.mark.parametrize(
1919
"chunk1, chunk2, result",
2020
[
21-
(Chunk(0, 10), Chunk(1, 2), True),
22-
(Chunk(0, 10), Chunk(11, 12), False),
23-
(Chunk(0, 10), Chunk(15, 20), False),
24-
(Chunk(1, 2), Chunk(3, 5), False),
25-
(Chunk(0, 10), Chunk(1, 10), True),
21+
(
22+
Chunk(start_offset=0, end_offset=10),
23+
Chunk(start_offset=1, end_offset=2),
24+
True,
25+
),
26+
(
27+
Chunk(start_offset=0, end_offset=10),
28+
Chunk(start_offset=11, end_offset=12),
29+
False,
30+
),
31+
(
32+
Chunk(start_offset=0, end_offset=10),
33+
Chunk(start_offset=15, end_offset=20),
34+
False,
35+
),
36+
(
37+
Chunk(start_offset=1, end_offset=2),
38+
Chunk(start_offset=3, end_offset=5),
39+
False,
40+
),
41+
(
42+
Chunk(start_offset=0, end_offset=10),
43+
Chunk(start_offset=1, end_offset=10),
44+
True,
45+
),
2646
],
2747
)
2848
def test_contains(self, chunk1, chunk2, result):
@@ -35,10 +55,27 @@ def test_range_hex(self):
3555
@pytest.mark.parametrize(
3656
"chunk, offset, expected",
3757
[
38-
pytest.param(Chunk(0x1, 0x2), 0x0, False, id="offset_before_chunk"),
39-
pytest.param(Chunk(0x0, 0x2), 0x0, True, id="offset_start_of_chunk"),
40-
pytest.param(Chunk(0x0, 0x2), 0x1, True, id="offset_inside_chunk"),
41-
pytest.param(Chunk(0x0, 0x2), 0x2, False, id="offset_after"),
58+
pytest.param(
59+
Chunk(start_offset=0x1, end_offset=0x2),
60+
0x0,
61+
False,
62+
id="offset_before_chunk",
63+
),
64+
pytest.param(
65+
Chunk(start_offset=0x0, end_offset=0x2),
66+
0x0,
67+
True,
68+
id="offset_start_of_chunk",
69+
),
70+
pytest.param(
71+
Chunk(start_offset=0x0, end_offset=0x2),
72+
0x1,
73+
True,
74+
id="offset_inside_chunk",
75+
),
76+
pytest.param(
77+
Chunk(start_offset=0x0, end_offset=0x2), 0x2, False, id="offset_after"
78+
),
4279
],
4380
)
4481
def test_contains_offset(self, chunk, offset, expected):
@@ -56,12 +93,12 @@ def test_contains_offset(self, chunk, offset, expected):
5693
)
5794
def test_validation(self, start_offset, end_offset):
5895
with pytest.raises(InvalidInputFormat):
59-
Chunk(start_offset, end_offset)
96+
Chunk(start_offset=start_offset, end_offset=end_offset)
6097

6198

6299
class Test_to_json: # noqa: N801
63100
def test_process_result_conversion(self):
64-
task = Task(path=Path("/nonexistent"), depth=0, chunk_id="")
101+
task = Task(path=Path("/nonexistent"), depth=0, blob_id="")
65102
task_result = TaskResult(task)
66103
chunk_id = "test_basic_conversion:id"
67104

@@ -90,7 +127,7 @@ def test_process_result_conversion(self):
90127
)
91128
task_result.add_report(
92129
ChunkReport(
93-
chunk_id=chunk_id,
130+
id=chunk_id,
94131
handler_name="zip",
95132
start_offset=0,
96133
end_offset=384,
@@ -103,7 +140,7 @@ def test_process_result_conversion(self):
103140
Task(
104141
path=Path("/extractions/nonexistent_extract"),
105142
depth=314,
106-
chunk_id=chunk_id,
143+
blob_id=chunk_id,
107144
)
108145
)
109146

@@ -143,7 +180,7 @@ def test_process_result_conversion(self):
143180
"end_offset": 384,
144181
"extraction_reports": [],
145182
"handler_name": "zip",
146-
"chunk_id": "test_basic_conversion:id",
183+
"id": "test_basic_conversion:id",
147184
"is_encrypted": False,
148185
"size": 384,
149186
"start_offset": 0,
@@ -152,15 +189,17 @@ def test_process_result_conversion(self):
152189
"subtasks": [
153190
{
154191
"__typename__": "Task",
155-
"chunk_id": "test_basic_conversion:id",
192+
"blob_id": "test_basic_conversion:id",
156193
"depth": 314,
194+
"is_multi_file": False,
157195
"path": "/extractions/nonexistent_extract",
158196
}
159197
],
160198
"task": {
161199
"__typename__": "Task",
162-
"chunk_id": "",
200+
"blob_id": "",
163201
"depth": 0,
202+
"is_multi_file": False,
164203
"path": "/nonexistent",
165204
},
166205
},

0 commit comments

Comments
 (0)