test: resolve bugs in sourcecode analyzer to allow unit tests to run offline (#1136)

art1f1c3R · web-flow · commit 2bc60e97b3ac · 2025-08-15T14:37:17.000+10:00
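Disable Semgrep's avoidable network connections using the additional arguments --metrics off and --disable-version-check, and log a warning instead of raising a ConfigurationError when custom ruleset validation fails, so that unit tests can run offline.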

Signed-off-by: Carl Flottmann <carl.flottmann@oracle.com>
diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py
@@ -118,21 +118,44 @@ def _load_defaults(self, resources_path: str) -> tuple[str, str | None, set[str]
                 logger.debug(error_msg)
                 raise ConfigurationError(error_msg)
 
-            semgrep_commands: list[str] = ["semgrep", "scan", "--validate", "--oss-only", "--config", custom_rule_path]
+            # Extra argument explanation:
+            # --metrics off is used to disable metric collection. Metric collection makes a network
+            # connection, which can impact running Semgrep offline.
+            # --disable-version-check is used to disable the network connection made to semgrep.dev to check
+            # if this is the latest version. This network connection can also impact running Semgrep offline.
+            # --oss-only is used to ensure only the open-source offering of Semgrep is run.
+            # Note, validation with --validate still currently makes a network connection to download linting
+            # rules, which cannot be turned off.
+            semgrep_commands: list[str] = [
+                "semgrep",
+                "scan",
+                "--metrics",
+                "off",
+                "--disable-version-check",
+                "--validate",
+                "--oss-only",
+                "--config",
+                custom_rule_path,
+            ]
             try:
                 process = subprocess.run(semgrep_commands, check=True, capture_output=True)  # nosec B603
+                if process.returncode != 0:
+                    # Only a warning is used here, so that if running offline, the analysis can continue. Erroneous Semgrep files
+                    # will be picked up at analysis time in this case.
+                    warning_msg = (
+                        f"Running semgrep validation on {custom_rule_path} with argument(s)"
+                        f" {process.args} "
+                        f" was not successful: {process.returncode}."
+                        " These custom rule(s) may not run successfully."
+                    )
+                    logger.warning(warning_msg)
+
             except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as semgrep_error:
-                error_msg = (
-                    f"Unable to run semgrep validation on {custom_rule_path} with arguments "
+                warning_msg = (
+                    f"Unable to run semgrep validation on {custom_rule_path} with argument(s) "
                     f"{semgrep_commands}: {semgrep_error}."
                 )
-                logger.debug(error_msg)
-                raise ConfigurationError(error_msg) from semgrep_error
-
-            if process.returncode != 0:
-                error_msg = f"Error running semgrep validation on {custom_rule_path} with arguments" f" {process.args}."
-                logger.debug(error_msg)
-                raise ConfigurationError(error_msg)
+                logger.warning(warning_msg)
 
             logger.debug("Including custom ruleset from %s.", custom_rule_path)
 
@@ -245,10 +268,24 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
         analysis_result: dict = {}
         # Since we have to run them anyway, return disabled rule findings for debug information.
         disabled_results: dict = {}
-        # Here, we disable 'nosemgrep' ignoring so that this is not an evasion method of our scan (i.e. malware includes
-        # 'nosemgrep' comments to prevent our scan detecting those code lines). Read more about the 'nosemgrep' feature
-        # here: https://semgrep.dev/docs/ignoring-files-folders-code
-        semgrep_commands: list[str] = ["semgrep", "scan", "--oss-only", "--disable-nosem"]
+        # Extra argument explanation:
+        # --metrics off is used to disable metric collection. Metric collection makes a network
+        # connection, which can impact running Semgrep offline.
+        # --disable-version-check is used to disable the network connection made to semgrep.dev to check
+        # if this is the latest version. This network connection can also impact running Semgrep offline.
+        # --oss-only is used to ensure only the open-source offering of Semgrep is run.
+        # --disable-nosem is used to disable 'nosemgrep' ignoring so that this is not an evasion method of
+        # our scan (i.e. malware includes 'nosemgrep' comments to prevent our scan detecting those code lines).
+        # Read more about the 'nosemgrep' feature here: https://semgrep.dev/docs/ignoring-files-folders-code
+        semgrep_commands: list[str] = [
+            "semgrep",
+            "scan",
+            "--metrics",
+            "off",
+            "--disable-version-check",
+            "--oss-only",
+            "--disable-nosem",
+        ]
         result: HeuristicResult = HeuristicResult.PASS
 
         source_code_path = pypi_package_json.package_sourcecode_path
@@ -269,13 +306,13 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
                 process = subprocess.run(semgrep_commands, check=True, capture_output=True)  # nosec B603
             except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as semgrep_error:
                 error_msg = (
-                    f"Unable to run semgrep on {source_code_path} with arguments {semgrep_commands}: {semgrep_error}"
+                    f"Unable to run semgrep on {source_code_path} with argument(s) {semgrep_commands}: {semgrep_error}"
                 )
                 logger.debug(error_msg)
                 raise HeuristicAnalyzerValueError(error_msg) from semgrep_error
 
             if process.returncode != 0:
-                error_msg = f"Error running semgrep on {source_code_path} with arguments" f" {process.args}"
+                error_msg = f"Error running semgrep on {source_code_path} with argument(s)" f" {process.args}"
                 logger.debug(error_msg)
                 raise HeuristicAnalyzerValueError(error_msg)
 
diff --git a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py
@@ -91,7 +91,7 @@ def test_nonexistent_rule_path(mock_defaults: MagicMock) -> None:
 
 
 @patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults")
-def test_invalid_custom_rules(mock_defaults: MagicMock) -> None:
+def test_invalid_custom_rules(mock_defaults: MagicMock, pypi_package_json: MagicMock) -> None:
     """Test for when the provided file is not a valid semgrep rule, so error,"""
     # Use this file as an invalid semgrep rule as it is most definitely not a semgrep rule, and does exist.
     defaults = {
@@ -103,8 +103,14 @@ def test_invalid_custom_rules(mock_defaults: MagicMock) -> None:
     mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi"
     mock_defaults.__getitem__.side_effect = lambda section: sub_section if section == "heuristic.pypi" else None
 
-    with pytest.raises(ConfigurationError):
-        _ = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH)
+    analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH)
+    pypi_package_json.package_sourcecode_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "resources", "sourcecode_samples"
+    )
+
+    # Semgrep should fail to run when we launch analysis
+    with pytest.raises(HeuristicAnalyzerValueError):
+        _ = analyzer.analyze(pypi_package_json)
 
 
 @patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults")