Skip to content

Commit 6db4508

Browse files
committed
chore: eliminate _FileTask.carve_dir
`_FileTask.carve_dir` was initially used for both extraction and carving. Since the two directories can now be named differently, the attribute is no longer used except for an existence check, which would terminate this branch of the extraction. That output-directory existence check is now present in both the carving and extraction paths, and the output report is renamed accordingly to cover both types of output directories: `ExtractDirectoryExistsReport` was generalized to `OutputDirectoryExistsReport` rather than introducing yet another `Report` type (`CarveDirectoryExistsReport`).
1 parent a9cdb06 commit 6db4508

File tree

3 files changed

+20
-22
lines changed

3 files changed

+20
-22
lines changed

tests/test_processing.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,11 @@
3434
)
3535
from unblob.report import (
3636
ChunkReport,
37-
ExtractDirectoryExistsReport,
3837
FileMagicReport,
3938
HashReport,
4039
MultiFileCollisionReport,
4140
MultiFileReport,
41+
OutputDirectoryExistsReport,
4242
RandomnessMeasurements,
4343
RandomnessReport,
4444
StatReport,
@@ -350,7 +350,7 @@ def test_process_file_prevents_double_extracts(tmp_path: Path, fw: Path):
350350

351351
# we expect exactly 1 problem reported, related to the extraction of "internal.zip"
352352
[report] = process_result.errors
353-
assert isinstance(report, ExtractDirectoryExistsReport)
353+
assert isinstance(report, OutputDirectoryExistsReport)
354354
assert report.path.name == "internal.zip_extract"
355355

356356
# the rest should be the same, except that the extraction is shifted with one extra directory
@@ -819,7 +819,7 @@ def test_multi_file_extract_dir(
819819
multi_file_reports = task_result_by_path[directory].filter_reports(MultiFileReport)
820820
assert multi_file_reports
821821
assert any(
822-
isinstance(report, ExtractDirectoryExistsReport)
822+
isinstance(report, OutputDirectoryExistsReport)
823823
for report in multi_file_reports[0].extraction_reports
824824
)
825825

unblob/processing.py

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@
3535
from .report import (
3636
CalculateMultiFileExceptionReport,
3737
CarveDirectoryReport,
38-
ExtractDirectoryExistsReport,
3938
FileMagicReport,
4039
HashReport,
4140
MultiFileCollisionReport,
41+
OutputDirectoryExistsReport,
4242
RandomnessMeasurements,
4343
RandomnessReport,
4444
Report,
@@ -426,7 +426,7 @@ def _extract_multi_file(self, multi_file: MultiFile) -> Path:
426426
raise DirectoryProcessingError(
427427
"Skipped: extraction directory exists",
428428
report=multi_file.as_report(
429-
[ExtractDirectoryExistsReport(path=extract_dir)]
429+
[OutputDirectoryExistsReport(path=extract_dir)]
430430
),
431431
)
432432

@@ -507,24 +507,9 @@ def __init__(
507507
self.size = size
508508
self.result = result
509509

510-
self.carve_dir = config.get_extract_dir_for(self.task.path)
511-
512510
def process(self):
513511
logger.debug("Processing file", path=self.task.path, size=self.size)
514512

515-
if self.carve_dir.exists() and not self.config.skip_extraction:
516-
# Extraction directory is not supposed to exist, it is usually a simple mistake of running
517-
# unblob again without cleaning up or using --force.
518-
# It would cause problems continuing, as it would mix up original and extracted files,
519-
# and it would just introduce weird, non-deterministic problems due to interference on paths
520-
# by multiple workers (parallel processing, modifying content (fix_symlink),
521-
# and `mmap` + open for write with O_TRUNC).
522-
logger.error(
523-
"Skipped: extraction directory exists", extract_dir=self.carve_dir
524-
)
525-
self.result.add_report(ExtractDirectoryExistsReport(path=self.carve_dir))
526-
return
527-
528513
with File.from_path(self.task.path) as file:
529514
all_chunks = search_chunks(
530515
file, self.size, self.config.handlers, self.result
@@ -576,11 +561,24 @@ def _process_chunks(
576561
self._carve_then_extract_chunks(file, outer_chunks, unknown_chunks)
577562

578563
def _carve_then_extract_chunks(self, file, outer_chunks, unknown_chunks):
564+
assert not self.config.skip_extraction
565+
579566
carve_dir = self.config.get_carve_dir_for(self.task.path)
580567

581568
# report the technical carve directory explicitly
582569
self.result.add_report(CarveDirectoryReport(carve_dir=carve_dir))
583570

571+
if carve_dir.exists():
572+
# Carve directory is not supposed to exist, it is usually a simple mistake of running
573+
# unblob again without cleaning up or using --force.
574+
# It would cause problems continuing, as it would mix up original and extracted files,
575+
# and it would just introduce weird, non-deterministic problems due to interference on paths
576+
# by multiple workers (parallel processing, modifying content (fix_symlink),
577+
# and `mmap` + open for write with O_TRUNC).
578+
logger.error("Skipped: carve directory exists", carve_dir=carve_dir)
579+
self.result.add_report(OutputDirectoryExistsReport(path=carve_dir))
580+
return
581+
584582
for chunk in unknown_chunks:
585583
carved_unknown_path = carve_unknown_chunk(carve_dir, file, chunk)
586584
randomness = self._calculate_randomness(carved_unknown_path)
@@ -633,7 +631,7 @@ def _extract_chunk(
633631
chunk=chunk,
634632
)
635633
self.result.add_report(
636-
chunk.as_report([ExtractDirectoryExistsReport(path=extract_dir)])
634+
chunk.as_report([OutputDirectoryExistsReport(path=extract_dir)])
637635
)
638636
return
639637

unblob/report.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ class ExtractCommandFailedReport(ErrorReport):
9090

9191

9292
@attr.define(kw_only=True, frozen=True)
93-
class ExtractDirectoryExistsReport(ErrorReport):
93+
class OutputDirectoryExistsReport(ErrorReport):
9494
severity: Severity = Severity.ERROR
9595
path: Path
9696

0 commit comments

Comments
 (0)