Skip to content

Commit 3acae4f

Browse files
vlaci, qkaiser
authored and committed
feat(extraction): allow returning reports from Extractor.extract calls
1 parent 76668f3 commit 3acae4f

File tree

6 files changed

+60
-26
lines changed

6 files changed

+60
-26
lines changed

docs/development.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -515,7 +515,7 @@ class Extractor(abc.ABC):
515515
return []
516516

517517
@abc.abstractmethod
518-
def extract(self, inpath: Path, outdir: Path):
518+
def extract(self, inpath: Path, outdir: Path) -> Optional[ExtractResult]:
519519
"""Extract the carved out chunk. Raises ExtractError on failure."""
520520
```
521521

@@ -538,7 +538,7 @@ class DirectoryExtractor(abc.ABC):
538538
return []
539539

540540
@abc.abstractmethod
541-
def extract(self, paths: List[Path], outdir: Path):
541+
def extract(self, paths: List[Path], outdir: Path) -> Optional[ExtractResult]:
542542
"""Extract from a multi file path list.
543543
544544
Raises ExtractError on failure.

unblob/handlers/archive/_safe_tarfile.py

Lines changed: 9 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -2,28 +2,21 @@
22
import tarfile
33
from pathlib import Path
44

5-
import attrs
65
from structlog import get_logger
76

87
from unblob.extractor import is_safe_path
8+
from unblob.report import ExtractionProblem
99

1010
logger = get_logger()
1111

1212
RUNNING_AS_ROOT = os.getuid() == 0
1313
MAX_PATH_LEN = 255
1414

1515

16-
@attrs.define
17-
class ProblematicTarMember:
18-
tarinfo: tarfile.TarInfo
19-
problem: str
20-
resolution: str
21-
22-
2316
class SafeTarFile:
2417
def __init__(self, inpath: Path):
2518
self.inpath = inpath
26-
self.problems = []
19+
self.reports = []
2720
self.tarfile = tarfile.open(inpath)
2821
self.directories = {}
2922

@@ -143,4 +136,10 @@ def fix_directories(self, extract_root):
143136

144137
def record_problem(self, tarinfo, problem, resolution):
145138
logger.warning(f"{problem} {resolution}", path=tarinfo.name) # noqa: G004
146-
self.problems.append(ProblematicTarMember(tarinfo, problem, resolution))
139+
self.reports.append(
140+
ExtractionProblem(
141+
path=tarinfo.name,
142+
problem=problem,
143+
resolution=resolution,
144+
)
145+
)

unblob/handlers/archive/tar.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,14 @@
77
from structlog import get_logger
88

99
from ...file_utils import OffsetFile, SeekError, decode_int, round_up, snull
10-
from ...models import Extractor, File, HexString, StructHandler, ValidChunk
10+
from ...models import (
11+
Extractor,
12+
ExtractResult,
13+
File,
14+
HexString,
15+
StructHandler,
16+
ValidChunk,
17+
)
1118
from ._safe_tarfile import SafeTarFile
1219

1320
logger = get_logger()
@@ -88,6 +95,7 @@ class TarExtractor(Extractor):
8895
def extract(self, inpath: Path, outdir: Path):
8996
with contextlib.closing(SafeTarFile(inpath)) as tarfile:
9097
tarfile.extractall(outdir)
98+
return ExtractResult(reports=tarfile.reports)
9199

92100

93101
class TarHandler(StructHandler):

unblob/models.py

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -103,7 +103,7 @@ class ValidChunk(Chunk):
103103
handler: "Handler" = attr.ib(init=False, eq=False)
104104
is_encrypted: bool = attr.ib(default=False)
105105

106-
def extract(self, inpath: Path, outdir: Path):
106+
def extract(self, inpath: Path, outdir: Path) -> Optional["ExtractResult"]:
107107
if self.is_encrypted:
108108
logger.warning(
109109
"Encrypted file is not extracted",
@@ -112,7 +112,7 @@ def extract(self, inpath: Path, outdir: Path):
112112
)
113113
raise ExtractError
114114

115-
self.handler.extract(inpath, outdir)
115+
return self.handler.extract(inpath, outdir)
116116

117117
def as_report(self, extraction_reports: List[Report]) -> ChunkReport:
118118
return ChunkReport(
@@ -154,8 +154,8 @@ class MultiFile(Blob):
154154

155155
handler: "DirectoryHandler" = attr.ib(init=False, eq=False)
156156

157-
def extract(self, outdir: Path):
158-
self.handler.extract(self.paths, outdir)
157+
def extract(self, outdir: Path) -> Optional["ExtractResult"]:
158+
return self.handler.extract(self.paths, outdir)
159159

160160
def as_report(self, extraction_reports: List[Report]) -> MultiFileReport:
161161
return MultiFileReport(
@@ -253,13 +253,18 @@ def __init__(self, *reports: Report):
253253
self.reports: Tuple[Report, ...] = reports
254254

255255

256+
@attr.define(kw_only=True)
257+
class ExtractResult:
258+
reports: List[Report]
259+
260+
256261
class Extractor(abc.ABC):
257262
def get_dependencies(self) -> List[str]:
258263
"""Return the external command dependencies."""
259264
return []
260265

261266
@abc.abstractmethod
262-
def extract(self, inpath: Path, outdir: Path):
267+
def extract(self, inpath: Path, outdir: Path) -> Optional[ExtractResult]:
263268
"""Extract the carved out chunk.
264269
265270
Raises ExtractError on failure.
@@ -272,7 +277,7 @@ def get_dependencies(self) -> List[str]:
272277
return []
273278

274279
@abc.abstractmethod
275-
def extract(self, paths: List[Path], outdir: Path):
280+
def extract(self, paths: List[Path], outdir: Path) -> Optional[ExtractResult]:
276281
"""Extract from a multi file path list.
277282
278283
Raises ExtractError on failure.
@@ -381,15 +386,15 @@ def get_dependencies(cls):
381386
def calculate_multifile(self, file: Path) -> Optional[MultiFile]:
382387
"""Calculate the MultiFile in a directory, using a file matched by the pattern as a starting point."""
383388

384-
def extract(self, paths: List[Path], outdir: Path):
389+
def extract(self, paths: List[Path], outdir: Path) -> Optional[ExtractResult]:
385390
if self.EXTRACTOR is None:
386391
logger.debug("Skipping file: no extractor.", paths=paths)
387392
raise ExtractError
388393

389394
# We only extract every blob once, it's a mistake to extract the same blob again
390395
outdir.mkdir(parents=True, exist_ok=False)
391396

392-
self.EXTRACTOR.extract(paths, outdir)
397+
return self.EXTRACTOR.extract(paths, outdir)
393398

394399

395400
class Handler(abc.ABC):
@@ -414,15 +419,15 @@ def get_dependencies(cls):
414419
def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
415420
"""Calculate the Chunk offsets from the File and the file type headers."""
416421

417-
def extract(self, inpath: Path, outdir: Path):
422+
def extract(self, inpath: Path, outdir: Path) -> Optional[ExtractResult]:
418423
if self.EXTRACTOR is None:
419424
logger.debug("Skipping file: no extractor.", path=inpath)
420425
raise ExtractError
421426

422427
# We only extract every blob once, it's a mistake to extract the same blob again
423428
outdir.mkdir(parents=True, exist_ok=False)
424429

425-
self.EXTRACTOR.extract(inpath, outdir)
430+
return self.EXTRACTOR.extract(inpath, outdir)
426431

427432

428433
class StructHandler(Handler):

unblob/processing.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -403,7 +403,8 @@ def _extract_multi_file(self, multi_file: MultiFile) -> Path:
403403

404404
extraction_reports = []
405405
try:
406-
multi_file.extract(extract_dir)
406+
if result := multi_file.extract(extract_dir):
407+
extraction_reports.extend(result.reports)
407408
except ExtractError as e:
408409
extraction_reports.extend(e.reports)
409410
except Exception as exc:
@@ -522,7 +523,7 @@ def _calculate_entropy(self, path: Path) -> Optional[EntropyReport]:
522523
return report
523524
return None
524525

525-
def _extract_chunk(self, file, chunk: ValidChunk):
526+
def _extract_chunk(self, file, chunk: ValidChunk): # noqa: C901
526527
skip_carving = chunk.is_whole_file
527528
if skip_carving:
528529
inpath = self.task.path
@@ -554,7 +555,8 @@ def _extract_chunk(self, file, chunk: ValidChunk):
554555

555556
extraction_reports = []
556557
try:
557-
chunk.extract(inpath, extract_dir)
558+
if result := chunk.extract(inpath, extract_dir):
559+
extraction_reports.extend(result.reports)
558560

559561
if carved_path and not self.config.keep_extracted_chunks:
560562
logger.debug("Removing extracted chunk", path=carved_path)

unblob/report.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -244,3 +244,23 @@ class MultiFileReport(Report):
244244
name: str
245245
paths: List[Path]
246246
extraction_reports: List[Report]
247+
248+
249+
@attr.define(kw_only=True, frozen=True)
250+
class ExtractionProblem(Report):
251+
"""A non-fatal problem discovered during extraction.
252+
253+
A report like this still means, that the extraction was successful,
254+
but there were problems that got resolved.
255+
The output is expected to be complete, with the exception of
256+
the reported path.
257+
258+
Examples
259+
--------
260+
- duplicate entries for certain archive formats (tar, zip)
261+
- unsafe symlinks pointing outside of extraction directory
262+
"""
263+
264+
problem: str
265+
resolution: str
266+
path: Optional[str] = None

0 commit comments

Comments
 (0)