Skip to content

Commit 395efd0

Browse files
committed
refact: Introduce _carve_then_extract_chunks
This removes the burden of carving from the already complex function _extract_chunk and also allows for some better variable names.
1 parent 4e9ee57 commit 395efd0

File tree

1 file changed

+41
-15
lines changed

1 file changed

+41
-15
lines changed

unblob/processing.py

Lines changed: 41 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -559,13 +559,42 @@ def _process_chunks(
559559
self.result.add_report(chunk.as_report(extraction_reports=[]))
560560
return
561561

562+
is_whole_file_chunk = len(outer_chunks) + len(unknown_chunks) == 1
563+
if is_whole_file_chunk:
564+
# skip carving, extract the whole file (chunk) directly
565+
carved_path = self.task.path
566+
for chunk in outer_chunks:
567+
self._extract_chunk(
568+
carved_path,
569+
chunk,
570+
self.config.get_extract_dir_for(carved_path),
571+
# since we do not carve, we want to keep the input around
572+
remove_extracted_input=False,
573+
)
574+
else:
575+
self._carve_then_extract_chunks(file, outer_chunks, unknown_chunks)
576+
577+
def _carve_then_extract_chunks(self, file, outer_chunks, unknown_chunks):
578+
carve_dir = self.config.get_carve_dir_for(self.task.path)
579+
562580
for chunk in unknown_chunks:
563-
carved_unknown_path = carve_unknown_chunk(self.carve_dir, file, chunk)
581+
carved_unknown_path = carve_unknown_chunk(carve_dir, file, chunk)
564582
randomness = self._calculate_randomness(carved_unknown_path)
565583
self.result.add_report(chunk.as_report(randomness=randomness))
566584

567585
for chunk in outer_chunks:
568-
self._extract_chunk(file, chunk)
586+
carved_path = carve_valid_chunk(carve_dir, file, chunk)
587+
588+
self._extract_chunk(
589+
carved_path,
590+
chunk,
591+
self.config.get_extract_dir_for(carved_path),
592+
# when a carved chunk is successfully extracted, usually
593+
# we want to get rid of it, as its data is available in
594+
# extracted format, and the raw data is still part of
595+
# the file the chunk belongs to
596+
remove_extracted_input=not self.config.keep_extracted_chunks,
597+
)
569598

570599
def _calculate_randomness(self, path: Path) -> Optional[RandomnessReport]:
571600
if self.task.depth < self.config.randomness_depth:
@@ -581,17 +610,14 @@ def _calculate_randomness(self, path: Path) -> Optional[RandomnessReport]:
581610
return report
582611
return None
583612

584-
def _extract_chunk(self, file, chunk: ValidChunk): # noqa: C901
585-
skip_carving = chunk.is_whole_file
586-
if skip_carving:
587-
inpath = self.task.path
588-
extract_dir = self.carve_dir
589-
carved_path = None
590-
else:
591-
inpath = carve_valid_chunk(self.carve_dir, file, chunk)
592-
extract_dir = self.carve_dir / (inpath.name + self.config.carve_suffix)
593-
carved_path = inpath
594-
613+
def _extract_chunk(
614+
self,
615+
carved_path: Path,
616+
chunk: ValidChunk,
617+
extract_dir: Path,
618+
*,
619+
remove_extracted_input: bool,
620+
):
595621
if extract_dir.exists():
596622
# Extraction directory is not supposed to exist, it mixes up original and extracted files,
597623
# and it would just introduce weird, non-deterministic problems due to interference on paths
@@ -613,10 +639,10 @@ def _extract_chunk(self, file, chunk: ValidChunk): # noqa: C901
613639

614640
extraction_reports = []
615641
try:
616-
if result := chunk.extract(inpath, extract_dir):
642+
if result := chunk.extract(carved_path, extract_dir):
617643
extraction_reports.extend(result.reports)
618644

619-
if carved_path and not self.config.keep_extracted_chunks:
645+
if remove_extracted_input:
620646
logger.debug("Removing extracted chunk", path=carved_path)
621647
carved_path.unlink()
622648

0 commit comments

Comments
 (0)