Skip to content

Commit 6db4508

Browse files
committed
chore: eliminate _FileTask.carve_dir
`_FileTask.carve_dir` was initially used for both extraction and carving. Since the two directories can now be named differently, the attribute is no longer used except for an existence check, which would terminate this branch of the extraction. That output-directory existence check is now present in both the carving and extraction paths, and the output report is renamed accordingly to cover both types of output directories: `ExtractDirectoryExistsReport` was generalized to `OutputDirectoryExistsReport` rather than introducing yet another `Report` type (`CarveDirectoryExistsReport`).
1 parent a9cdb06 commit 6db4508

File tree

3 files changed

+20
-22
lines changed

3 files changed

+20
-22
lines changed

tests/test_processing.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,11 @@
3434
)
3535
from unblob.report import (
3636
ChunkReport,
37-
ExtractDirectoryExistsReport,
3837
FileMagicReport,
3938
HashReport,
4039
MultiFileCollisionReport,
4140
MultiFileReport,
41+
OutputDirectoryExistsReport,
4242
RandomnessMeasurements,
4343
RandomnessReport,
4444
StatReport,
@@ -350,7 +350,7 @@ def test_process_file_prevents_double_extracts(tmp_path: Path, fw: Path):
350350

351351
# we expect exactly 1 problem reported, related to the extraction of "internal.zip"
352352
[report] = process_result.errors
353-
assert isinstance(report, ExtractDirectoryExistsReport)
353+
assert isinstance(report, OutputDirectoryExistsReport)
354354
assert report.path.name == "internal.zip_extract"
355355

356356
# the rest should be the same, except that the extraction is shifted with one extra directory
@@ -819,7 +819,7 @@ def test_multi_file_extract_dir(
819819
multi_file_reports = task_result_by_path[directory].filter_reports(MultiFileReport)
820820
assert multi_file_reports
821821
assert any(
822-
isinstance(report, ExtractDirectoryExistsReport)
822+
isinstance(report, OutputDirectoryExistsReport)
823823
for report in multi_file_reports[0].extraction_reports
824824
)
825825

unblob/processing.py

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@
3535
from .report import (
3636
CalculateMultiFileExceptionReport,
3737
CarveDirectoryReport,
38-
ExtractDirectoryExistsReport,
3938
FileMagicReport,
4039
HashReport,
4140
MultiFileCollisionReport,
41+
OutputDirectoryExistsReport,
4242
RandomnessMeasurements,
4343
RandomnessReport,
4444
Report,
@@ -426,7 +426,7 @@ def _extract_multi_file(self, multi_file: MultiFile) -> Path:
426426
raise DirectoryProcessingError(
427427
"Skipped: extraction directory exists",
428428
report=multi_file.as_report(
429-
[ExtractDirectoryExistsReport(path=extract_dir)]
429+
[OutputDirectoryExistsReport(path=extract_dir)]
430430
),
431431
)
432432

@@ -507,24 +507,9 @@ def __init__(
507507
self.size = size
508508
self.result = result
509509

510-
self.carve_dir = config.get_extract_dir_for(self.task.path)
511-
512510
def process(self):
513511
logger.debug("Processing file", path=self.task.path, size=self.size)
514512

515-
if self.carve_dir.exists() and not self.config.skip_extraction:
516-
# Extraction directory is not supposed to exist, it is usually a simple mistake of running
517-
# unblob again without cleaning up or using --force.
518-
# It would cause problems continuing, as it would mix up original and extracted files,
519-
# and it would just introduce weird, non-deterministic problems due to interference on paths
520-
# by multiple workers (parallel processing, modifying content (fix_symlink),
521-
# and `mmap` + open for write with O_TRUNC).
522-
logger.error(
523-
"Skipped: extraction directory exists", extract_dir=self.carve_dir
524-
)
525-
self.result.add_report(ExtractDirectoryExistsReport(path=self.carve_dir))
526-
return
527-
528513
with File.from_path(self.task.path) as file:
529514
all_chunks = search_chunks(
530515
file, self.size, self.config.handlers, self.result
@@ -576,11 +561,24 @@ def _process_chunks(
576561
self._carve_then_extract_chunks(file, outer_chunks, unknown_chunks)
577562

578563
def _carve_then_extract_chunks(self, file, outer_chunks, unknown_chunks):
564+
assert not self.config.skip_extraction
565+
579566
carve_dir = self.config.get_carve_dir_for(self.task.path)
580567

581568
# report the technical carve directory explicitly
582569
self.result.add_report(CarveDirectoryReport(carve_dir=carve_dir))
583570

571+
if carve_dir.exists():
572+
# Carve directory is not supposed to exist, it is usually a simple mistake of running
573+
# unblob again without cleaning up or using --force.
574+
# It would cause problems continuing, as it would mix up original and extracted files,
575+
# and it would just introduce weird, non-deterministic problems due to interference on paths
576+
# by multiple workers (parallel processing, modifying content (fix_symlink),
577+
# and `mmap` + open for write with O_TRUNC).
578+
logger.error("Skipped: carve directory exists", carve_dir=carve_dir)
579+
self.result.add_report(OutputDirectoryExistsReport(path=carve_dir))
580+
return
581+
584582
for chunk in unknown_chunks:
585583
carved_unknown_path = carve_unknown_chunk(carve_dir, file, chunk)
586584
randomness = self._calculate_randomness(carved_unknown_path)
@@ -633,7 +631,7 @@ def _extract_chunk(
633631
chunk=chunk,
634632
)
635633
self.result.add_report(
636-
chunk.as_report([ExtractDirectoryExistsReport(path=extract_dir)])
634+
chunk.as_report([OutputDirectoryExistsReport(path=extract_dir)])
637635
)
638636
return
639637

unblob/report.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ class ExtractCommandFailedReport(ErrorReport):
9090

9191

9292
@attr.define(kw_only=True, frozen=True)
93-
class ExtractDirectoryExistsReport(ErrorReport):
93+
class OutputDirectoryExistsReport(ErrorReport):
9494
severity: Severity = Severity.ERROR
9595
path: Path
9696

0 commit comments

Comments
 (0)