processing: introduce ProcessResult

e3krisztian · e3krisztian · commit 0b831c9c04d0 · 2022-05-09T15:31:55.000+02:00
This object holds the association between `Task`s and their respective
`TaskResult`s (and, of course, their `Report`s). With this change we are
able to reconstruct which report belongs to which task, and also which
subtasks come from which task, after the processing is finished.

This is a useful basis for metadata support, as any additional
information can later be added as reports.
diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py
@@ -10,8 +10,7 @@
 
 from unblob.models import File, Handler, Regex, ValidChunk
 from unblob.processing import ExtractionConfig, process_files
-from unblob.report import ErrorReport
-from unblob.testing import check_reports
+from unblob.testing import check_result
 
 _ZIP_CONTENT = b"good file"
 # replacing _ZIP_CONTENT with _DAMAGED_ZIP_CONTENT will result in CRC error at unpacking time
@@ -57,7 +56,7 @@ def test_remove_extracted_chunks(input_dir: Path, output_dir: Path):
 
     all_reports = process_files(config, input_dir)
     assert list(output_dir.glob("**/*.zip")) == []
-    check_reports(all_reports)
+    check_result(all_reports)
 
 
 def test_keep_all_problematic_chunks(input_dir: Path, output_dir: Path):
@@ -69,9 +68,7 @@ def test_keep_all_problematic_chunks(input_dir: Path, output_dir: Path):
 
     all_reports = process_files(config, input_dir)
     # damaged zip file should not be removed
-    assert [
-        r for r in all_reports if isinstance(r, ErrorReport)
-    ] != [], "Unexpectedly no errors found!"
+    assert all_reports.errors != [], "Unexpectedly no errors found!"
     assert list(output_dir.glob("**/*.zip"))
 
 
@@ -84,7 +81,7 @@ def test_keep_all_unknown_chunks(input_dir: Path, output_dir: Path):
 
     all_reports = process_files(config, input_dir)
     assert list(output_dir.glob("**/*.unknown"))
-    check_reports(all_reports)
+    check_result(all_reports)
 
 
 class _HandlerWithNullExtractor(Handler):
@@ -105,4 +102,4 @@ def test_keep_chunks_with_null_extractor(input_dir: Path, output_dir: Path):
     )
     all_reports = process_files(config, input_dir)
     assert list(output_dir.glob("**/*.null"))
-    check_reports(all_reports)
+    check_result(all_reports)
diff --git a/tests/test_handlers.py b/tests/test_handlers.py
@@ -18,7 +18,7 @@
 from unblob.processing import ExtractionConfig, process_files
 from unblob.testing import (
     check_output_is_the_same,
-    check_reports,
+    check_result,
     gather_integration_tests,
 )
 
@@ -35,7 +35,7 @@ def test_all_handlers(
     all_reports = process_files(extraction_config, input_dir)
 
     check_output_is_the_same(output_dir, extraction_config.extract_root)
-    check_reports(all_reports)
+    check_result(all_reports)
 
 
 @pytest.mark.parametrize(
diff --git a/unblob/cli.py b/unblob/cli.py
@@ -1,18 +1,19 @@
 #!/usr/bin/env python3
 import sys
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Iterable, Optional
 
 import click
 from structlog import get_logger
 
+from unblob.models import ProcessResult
 from unblob.plugins import UnblobPluginManager
-from unblob.report import Report, Severity
+from unblob.report import Severity
 
 from .cli_options import verbosity_option
 from .dependencies import get_dependencies, pretty_format_dependencies
 from .handlers import BUILTIN_HANDLERS, Handlers
-from .logging import configure_logger, noformat
+from .logging import configure_logger
 from .processing import (
     DEFAULT_DEPTH,
     DEFAULT_PROCESS_NUM,
@@ -171,7 +172,7 @@ def cli(
     plugins_path: Optional[Path],
     handlers: Handlers,
     plugin_manager: UnblobPluginManager,
-) -> List[Report]:
+) -> ProcessResult:
     configure_logger(verbose, extract_root)
 
     plugin_manager.import_plugins(plugins_path)
@@ -198,14 +199,12 @@ def cli(
 cli.context_class = UnblobContext
 
 
-def get_exit_code_from_reports(reports: List[Report]) -> int:
+def get_exit_code_from_reports(reports: ProcessResult) -> int:
     severity_to_exit_code = [
         (Severity.ERROR, 1),
         (Severity.WARNING, 0),
     ]
-    severities = {
-        report.severity for report in reports if isinstance(report, ErrorReport)
-    }
+    severities = {error.severity for error in reports.errors}
 
     for severity, exit_code in severity_to_exit_code:
         if severity in severities:
diff --git a/unblob/models.py b/unblob/models.py
@@ -1,4 +1,5 @@
 import abc
+import itertools
 from pathlib import Path
 from typing import List, Optional, Tuple, Type
 
@@ -7,7 +8,7 @@
 
 from .file_utils import Endian, File, InvalidInputFormat, StructParser
 from .parser import hexstring2regex
-from .report import Report
+from .report import ErrorReport, Report
 
 logger = get_logger()
 
@@ -17,7 +18,7 @@
 #
 
 
-@attr.define
+@attr.define(frozen=True)
 class Task:
     path: Path
     depth: int
@@ -113,6 +114,21 @@ def add_subtask(self, task: Task):
         self.subtasks.append(task)
 
 
+@attr.define
+class ProcessResult:
+    results: List[TaskResult] = attr.field(factory=list)
+
+    @property
+    def errors(self) -> List[ErrorReport]:
+        reports = itertools.chain.from_iterable(
+            r.reports for r in self.results
+        )
+        return [r for r in reports if isinstance(r, ErrorReport)]
+
+    def register(self, result: TaskResult):
+        self.results.append(result)
+
+
 class ExtractError(Exception):
     """There was an error during extraction"""
 
diff --git a/unblob/processing.py b/unblob/processing.py
@@ -17,12 +17,19 @@
 from .iter_utils import pairwise
 from .logging import noformat
 from .math import shannon_entropy
-from .models import ExtractError, File, Task, TaskResult, UnknownChunk, ValidChunk
+from .models import (
+    ExtractError,
+    File,
+    ProcessResult,
+    Task,
+    TaskResult,
+    UnknownChunk,
+    ValidChunk,
+)
 from .pool import make_pool
 from .report import (
     ExtractDirectoryExistsReport,
     FileMagicReport,
-    Report,
     StatReport,
     UnknownError,
 )
@@ -60,15 +67,15 @@ class ExtractionConfig:
 
 
 @terminate_gracefully
-def process_files(config: ExtractionConfig, path: Path) -> List[Report]:
+def process_files(config: ExtractionConfig, path: Path) -> ProcessResult:
     task = Task(
         path=path,
         depth=0,
     )
 
     errors = check_extract_directory(task, config)
     if errors:
-        return errors
+        return ProcessResult(errors)
 
     result = _process_one_file(config, task)
 
@@ -85,7 +92,7 @@ def check_extract_directory(task: Task, config: ExtractionConfig):
         else:
             report = ExtractDirectoryExistsReport(path=extract_dir)
             logger.error("Extraction directory already exist", **report.asdict())
-            errors.append(report)
+            errors.append(TaskResult(task, [report]))
 
     return errors
 
@@ -106,14 +113,14 @@ def get_existing_extract_dirs(
     return extract_dirs
 
 
-def _process_one_file(config: ExtractionConfig, root_task: Task) -> List[Report]:
+def _process_one_file(config: ExtractionConfig, root_task: Task) -> ProcessResult:
     processor = Processor(config)
-    all_reports = []
+    aggregated_result = ProcessResult()
 
     def process_result(pool, result):
         for new_task in result.subtasks:
             pool.submit(new_task)
-        all_reports.extend(result.reports)
+        aggregated_result.register(result)
 
     pool = make_pool(
         process_num=config.process_num,
@@ -124,7 +131,8 @@ def process_result(pool, result):
     with pool:
         pool.submit(root_task)
         pool.process_until_done()
-    return all_reports
+
+    return aggregated_result
 
 
 class Processor:
diff --git a/unblob/testing.py b/unblob/testing.py
@@ -1,15 +1,14 @@
 import shlex
 import subprocess
 from pathlib import Path
-from typing import List
 
 import pytest
 from pytest_cov.embed import cleanup_on_sigterm
 
 from unblob.finder import build_hyperscan_database
 from unblob.logging import configure_logger
+from unblob.models import ProcessResult
 from unblob.processing import ExtractionConfig
-from unblob.report import ErrorReport, Report
 
 
 @pytest.fixture(scope="session", autouse=True)
@@ -79,9 +78,7 @@ def check_output_is_the_same(reference_dir: Path, extract_dir: Path):
         pytest.fail(f"\nDiff command: {runnable_diff_command}\n{exc.stdout}\n")
 
 
-def check_reports(reports: List[Report]):
+def check_result(reports: ProcessResult):
     __tracebackhide__ = True
 
-    assert [
-        r for r in reports if isinstance(r, ErrorReport)
-    ] == [], "Unexpected error reports"
+    assert reports.errors == [], "Unexpected error reports"