Skip to content

Commit 6cff605

Browse files
committed
feat(metadata): report unknown chunks
1 parent 1601c99 commit 6cff605

File tree

5 files changed

+50
-59
lines changed

5 files changed

+50
-59
lines changed

tests/test_extractor.py

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,39 +3,22 @@
33
import pytest
44

55
from unblob.extractor import (
6-
carve_unknown_chunks,
6+
carve_unknown_chunk,
77
fix_extracted_directory,
88
fix_permission,
99
fix_symlink,
1010
)
1111
from unblob.models import File, TaskResult, UnknownChunk
1212

1313

14-
class TestCarveUnknownChunks:
15-
def test_no_chunks(self, tmp_path: Path):
16-
test_file = File.from_bytes(b"some file")
17-
carve_unknown_chunks(tmp_path, test_file, [])
18-
assert list(tmp_path.iterdir()) == []
19-
20-
def test_one_chunk(self, tmp_path: Path):
21-
content = b"test file"
22-
test_file = File.from_bytes(content)
23-
chunk = UnknownChunk(0, len(content))
24-
carve_unknown_chunks(tmp_path, test_file, [chunk])
25-
written_path = tmp_path / "0-9.unknown"
26-
assert list(tmp_path.iterdir()) == [written_path]
27-
assert written_path.read_bytes() == content
28-
29-
def test_multiple_chunks(self, tmp_path: Path):
30-
content = b"test file"
31-
test_file = File.from_bytes(content)
32-
chunks = [UnknownChunk(0, 4), UnknownChunk(4, 9)]
33-
carve_unknown_chunks(tmp_path, test_file, chunks)
34-
written_path1 = tmp_path / "0-4.unknown"
35-
written_path2 = tmp_path / "4-9.unknown"
36-
assert sorted(tmp_path.iterdir()) == [written_path1, written_path2]
37-
assert written_path1.read_bytes() == content[:4]
38-
assert written_path2.read_bytes() == content[4:]
14+
def test_carve_unknown_chunk(tmp_path: Path):
    """Carving one unknown chunk writes exactly one `<start>-<end>.unknown` file."""
    payload = b"test file"
    start, end = 1, 8

    carve_unknown_chunk(tmp_path, File.from_bytes(payload), UnknownChunk(start, end))

    expected_file = tmp_path / "1-8.unknown"
    # The carve directory must contain the carved file and nothing else.
    assert list(tmp_path.iterdir()) == [expected_file]
    # Only the bytes inside the chunk's offsets end up in the carved file.
    assert expected_file.read_bytes() == payload[start:end]
3922

4023

4124
def test_fix_permission(tmpdir: Path):

unblob/extractor.py

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
"""
44
import os
55
from pathlib import Path
6-
from typing import List
76

87
from structlog import get_logger
98

@@ -95,23 +94,12 @@ def fix_extracted_directory(outdir: Path, task_result: TaskResult):
9594
fix_permission(path)
9695

9796

98-
def carve_unknown_chunks(
99-
extract_dir: Path, file: File, unknown_chunks: List[UnknownChunk]
100-
) -> List[Path]:
101-
if not unknown_chunks:
102-
return []
103-
104-
carved_paths = []
105-
logger.warning("Found unknown Chunks", chunks=unknown_chunks)
106-
107-
for chunk in unknown_chunks:
108-
filename = f"{chunk.start_offset}-{chunk.end_offset}.unknown"
109-
carve_path = extract_dir / filename
110-
logger.info("Extracting unknown chunk", path=carve_path, chunk=chunk)
111-
carve_chunk_to_file(carve_path, file, chunk)
112-
carved_paths.append(carve_path)
113-
114-
return carved_paths
97+
def carve_unknown_chunk(extract_dir: Path, file: File, chunk: UnknownChunk) -> Path:
    """Carve a single unknown chunk of `file` into its own file under `extract_dir`.

    The output file is named ``<start_offset>-<end_offset>.unknown`` after the
    chunk's offsets; the actual writing is delegated to `carve_chunk_to_file`.

    Returns the path of the carved file.
    """
    carve_path = extract_dir / f"{chunk.start_offset}-{chunk.end_offset}.unknown"
    logger.info("Extracting unknown chunk", path=carve_path, chunk=chunk)
    carve_chunk_to_file(carve_path, file, chunk)
    return carve_path
115103

116104

117105
def carve_valid_chunk(extract_dir: Path, file: File, chunk: ValidChunk) -> Path:

unblob/models.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from .file_utils import Endian, File, InvalidInputFormat, StructParser
1212
from .identifiers import new_id
1313
from .parser import hexstring2regex
14-
from .report import ChunkReport, ErrorReport, Report
14+
from .report import ChunkReport, ErrorReport, Report, UnknownChunkReport
1515

1616
logger = get_logger()
1717

@@ -114,9 +114,17 @@ class UnknownChunk(Chunk):
114114
entropy, other chunks inside it, metadata, etc.
115115
116116
These are not extracted, just logged for information purposes and further analysis,
117-
like most common bytest (like \x00 and \xFF), ASCII strings, high entropy, etc.
117+
like most common bytes (like \x00 and \xFF), ASCII strings, high entropy, etc.
118118
"""
119119

120+
def as_report(self) -> UnknownChunkReport:
    """Build an `UnknownChunkReport` carrying this chunk's id, offsets and size."""
    report = UnknownChunkReport(
        id=self.id,
        start_offset=self.start_offset,
        end_offset=self.end_offset,
        size=self.size,
    )
    return report
127+
120128

121129
@attr.define
122130
class TaskResult:
@@ -137,16 +145,18 @@ class ProcessResult:
137145

138146
@property
def errors(self) -> List[ErrorReport]:
    """Collect every `ErrorReport` found in the registered results.

    Error reports attached directly to a result are included as-is; for a
    `ChunkReport`, the error reports among its `extraction_reports` are
    included instead. Reports of any other type are ignored. The order
    follows the order of results and of the reports within each result.
    """
    collected: List[ErrorReport] = []
    for task_result in self.results:
        for report in task_result.reports:
            if isinstance(report, ErrorReport):
                collected.append(report)
            elif isinstance(report, ChunkReport):
                # Chunk reports are not errors themselves, but they can
                # carry errors raised while extracting the chunk.
                collected.extend(
                    nested
                    for nested in report.extraction_reports
                    if isinstance(nested, ErrorReport)
                )
    return collected
151161

152162
def register(self, result: TaskResult):

unblob/processing.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from unblob.handlers import BUILTIN_HANDLERS, Handlers
1313

14-
from .extractor import carve_unknown_chunks, carve_valid_chunk, fix_extracted_directory
14+
from .extractor import carve_unknown_chunk, carve_valid_chunk, fix_extracted_directory
1515
from .file_utils import iterate_file, valid_path
1616
from .finder import search_chunks
1717
from .iter_utils import pairwise
@@ -297,7 +297,7 @@ def process(self):
297297
else:
298298
# we don't consider whole files as unknown chunks, but we still want to
299299
# calculate entropy for whole files which produced no valid chunks
300-
self._calculate_entropies([self.task.path])
300+
self._calculate_entropy(self.task.path)
301301

302302
self._ensure_root_extract_dir()
303303

@@ -307,10 +307,13 @@ def _process_chunks(
307307
outer_chunks: List[ValidChunk],
308308
unknown_chunks: List[UnknownChunk],
309309
):
310-
carved_unknown_paths = carve_unknown_chunks(
311-
self.carve_dir, file, unknown_chunks
312-
)
313-
self._calculate_entropies(carved_unknown_paths)
310+
if unknown_chunks:
311+
logger.warning("Found unknown Chunks", chunks=unknown_chunks)
312+
313+
for chunk in unknown_chunks:
314+
carved_unknown_path = carve_unknown_chunk(self.carve_dir, file, chunk)
315+
self._calculate_entropy(carved_unknown_path)
316+
self.result.add_report(chunk.as_report())
314317

315318
for chunk in outer_chunks:
316319
self._extract_chunk(file, chunk)
@@ -320,10 +323,9 @@ def _ensure_root_extract_dir(self):
320323
if self.task.depth == 0:
321324
self.carve_dir.mkdir(parents=True, exist_ok=True)
322325

323-
def _calculate_entropies(self, paths: List[Path]):
326+
def _calculate_entropy(self, path: Path):
    """Run `calculate_entropy` on `path` while the task depth is below `entropy_depth`.

    Does nothing once the task's depth has reached the configured
    `entropy_depth`. Plot drawing is controlled by `config.entropy_plot`.
    """
    if self.task.depth >= self.config.entropy_depth:
        return
    calculate_entropy(path, draw_plot=self.config.entropy_plot)
327329

328330
def _extract_chunk(self, file, chunk: ValidChunk):
329331
is_whole_file_chunk = chunk.start_offset == 0 and chunk.end_offset == self.size

unblob/report.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,3 +160,11 @@ class ChunkReport(Report):
160160
size: int
161161
is_encrypted: bool
162162
extraction_reports: List[Report]
163+
164+
165+
@attr.define(kw_only=True)
class UnknownChunkReport(Report):
    """Report entry describing an unknown chunk by its id, offsets and size.

    All fields are keyword-only (`kw_only=True`).
    """

    id: str
    start_offset: int
    end_offset: int
    size: int

0 commit comments

Comments
 (0)