[skip actions] [hsqs] 2025-10-05T10:19:30+03:00

babenek · babenek · commit b1ece68b9832 · 2025-10-05T10:19:30.000+03:00
diff --git a/credsweeper/deep_scanner/deep_scanner.py b/credsweeper/deep_scanner/deep_scanner.py
@@ -23,6 +23,7 @@
 from .pptx_scanner import PptxScanner
 from .rpm_scanner import RpmScanner
 from .sqlite3_scanner import Sqlite3Scanner
+from .squashfs_scanner import SquashfsScanner
 from .strings_scanner import StringsScanner
 from .tar_scanner import TarScanner
 from .tmx_scanner import TmxScanner
@@ -49,6 +50,7 @@ class DeepScanner(
     PdfScanner,  #
     PkcsScanner,  #
     PptxScanner,  #
+    SquashfsScanner,  #
     RpmScanner,  #
     Sqlite3Scanner,  #
     StringsScanner,  #
@@ -132,6 +134,9 @@ def get_deep_scanners(data: bytes, descriptor: Descriptor, depth: int) -> Tuple[
         elif Util.is_sqlite3(data):
             if 0 < depth:
                 deep_scanners.append(Sqlite3Scanner)
+        elif Util.is_squashfs(data):
+            if 0 < depth:
+                deep_scanners.append(SquashfsScanner)
         elif Util.is_asn1(data):
             deep_scanners.append(PkcsScanner)
         elif Util.is_xml(data):
diff --git a/credsweeper/deep_scanner/squashfs_scanner.py b/credsweeper/deep_scanner/squashfs_scanner.py
@@ -0,0 +1,52 @@
+import logging
+from abc import ABC
+from typing import List, Optional
+
+from PySquashfsImage import SquashFsImage
+
+from credsweeper.credentials.candidate import Candidate
+from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
+from credsweeper.file_handler.data_content_provider import DataContentProvider
+from credsweeper.file_handler.file_path_extractor import FilePathExtractor
+from credsweeper.utils.util import Util
+
+logger = logging.getLogger(__name__)
+
+
+class SquashfsScanner(AbstractScanner, ABC):
+    """Implements SquashFS image scanning"""
+
+    def data_scan(
+            self,  #
+            data_provider: DataContentProvider,  #
+            depth: int,  #
+            recursive_limit_size: int) -> Optional[List[Candidate]]:
+        """Extracts files one by one from SquashFS image and launches recursive_scan; None is returned on error"""
+        try:
+            candidates = []
+            with SquashFsImage.from_bytes(data_provider.data) as image:
+                for i in image:
+                    # only regular files are scanned: directories, symlinks and other non-file entries are skipped
+                    if not i.is_file or i.is_symlink:
+                        continue
+                    logger.debug(f"{i.path}")
+                    if FilePathExtractor.check_exclude_file(self.config, i.path):
+                        continue
+                    if 0 > recursive_limit_size - i.size:
+                        logger.error(f"{i.name}: size {i.size}"
+                                     f" is over limit {recursive_limit_size} depth:{depth}")
+                        continue
+                    logger.debug(f"{i.path} {i.name}")
+                    hsqs_content_provider = DataContentProvider(data=image.read_file(i.inode),
+                                                                file_path=i.path,
+                                                                file_type=Util.get_extension(i.path),
+                                                                info=f"{data_provider.info}|HSQS:{i.path}")
+                    # Nevertheless, use extracted data size
+                    new_limit = recursive_limit_size - len(hsqs_content_provider.data)
+                    logger.info(f"{i.name}: size {len(hsqs_content_provider.data)}")
+                    hsqs_candidates = self.recursive_scan(hsqs_content_provider, depth, new_limit)
+                    candidates.extend(hsqs_candidates)
+            return candidates
+        except Exception as hsqs_exc:
+            logger.error(f"{data_provider.file_path}:{hsqs_exc}")
+        return None
diff --git a/credsweeper/secret/config.json b/credsweeper/secret/config.json
@@ -2,12 +2,14 @@
     "exclude": {
         "pattern": [],
         "containers": [
+            ".pak",
             ".aar",
             ".apk",
             ".bz2",
             ".class",
             ".gz",
             ".jar",
+            ".img",
             ".lzma",
             ".rpm",
             ".tar",
@@ -41,7 +43,6 @@
             ".gif",
             ".gmo",
             ".ico",
-            ".img",
             ".info",
             ".jpeg",
             ".jpg",
@@ -62,7 +63,6 @@
             ".ogg",
             ".ogv",
             ".ops",
-            ".pak",
             ".png",
             ".psd",
             ".pyc",
diff --git a/credsweeper/utils/util.py b/credsweeper/utils/util.py
@@ -350,13 +350,23 @@ def is_lzma(data: Union[bytes, bytearray]) -> bool:
             return True
         return False
 
-    @classmethod
-    def is_sqlite3(cls, data):
+    @staticmethod
+    def is_sqlite3(data):
         """According https://en.wikipedia.org/wiki/List_of_file_signatures - SQLite Database"""
         if isinstance(data, (bytes, bytearray)) and data.startswith(b"SQLite format 3\0"):
             return True
         return False
 
+    @staticmethod
+    def is_squashfs(data):
+        """According https://en.wikipedia.org/wiki/List_of_file_signatures - SquashFS file system"""
+        if isinstance(data, (bytes, bytearray)) and data.startswith(b"hsqs") and b"\x04\x00\x00\x00" == data[28:32]:
+            # "Must be a power of two between 4096 (4k) and 1048576 (1 MiB)"
+            block_size = int.from_bytes(data[12:16], byteorder="little", signed=False)
+            if 0 == block_size & (block_size - 1) and 4096 <= block_size <= 1048576:
+                return True
+        return False
+
     @staticmethod
     def is_asn1(data: Union[bytes, bytearray]) -> int:
         """Only sequence type 0x30 and size correctness are checked
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
     "python-pptx",
     "PyYAML",
     "rpmfile",
+    "PySquashfsImage",
     "whatthepatch",
     "xlrd",
 ]
diff --git a/requirements.txt b/requirements.txt
@@ -21,6 +21,7 @@ numpy==2.2.6; python_version == '3.10'
 numpy==2.3.3; python_version > '3.10'
 odfpy==1.4.1
 xlrd==2.0.2
+PySquashfsImage==0.9.0
 
 # onnxruntime - ML engine
 onnxruntime==1.23.0
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 
 # total number of files in test samples
-SAMPLES_FILES_COUNT = 167
+SAMPLES_FILES_COUNT = 168
 
 # the lowest value of ML threshold is used to display possible lowest values
 NEGLIGIBLE_ML_THRESHOLD = 0.0001
@@ -19,7 +19,7 @@
 SAMPLES_POST_CRED_COUNT = 439
 
 # archived credentials that are not found without --depth
-SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 120
+SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 125
 SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 3
 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 4
 
diff --git a/tests/samples/sample.hsqs b/tests/samples/sample.hsqs
diff --git a/tests/test_app.py b/tests/test_app.py
@@ -493,7 +493,7 @@ def test_depth_p(self) -> None:
                     cvs_checksum = hashlib.md5(f.read()).digest()
                 checksum = bytes(a ^ b for a, b in zip(checksum, cvs_checksum))
         # update the checksum manually and keep line endings in the samples as is (git config core.autocrlf false)
-        self.assertEqual("0399a96ebab6339cac1c986dde578a27", binascii.hexlify(checksum).decode())
+        self.assertEqual("418534e183a0820bc3d6830fc29ef46a", binascii.hexlify(checksum).decode())
         normal_report = []
         sorted_report = []
         with tempfile.TemporaryDirectory() as tmp_dir:

Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@ dependencies = [`
`26`	`26`	`"python-pptx",`
`27`	`27`	`"PyYAML",`
`28`	`28`	`"rpmfile",`
	`29`	`+ "PySquashfsImage",`
`29`	`30`	`"whatthepatch",`
`30`	`31`	`"xlrd",`
`31`	`32`	`]`