Skip to content

Commit 2bc60e9

Browse files
authored
test: resolve bugs in sourcecode analyzer to allow unit tests to run offline (#1136)
Disable network connections using additional Semgrep arguments, and downgrade invalid-ruleset validation failures to warnings, so that unit tests can run offline. Signed-off-by: Carl Flottmann <[email protected]>
1 parent 5f8a652 commit 2bc60e9

File tree

2 files changed

+62
-19
lines changed

2 files changed

+62
-19
lines changed

src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py

Lines changed: 53 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -118,21 +118,44 @@ def _load_defaults(self, resources_path: str) -> tuple[str, str | None, set[str]
118118
logger.debug(error_msg)
119119
raise ConfigurationError(error_msg)
120120

121-
semgrep_commands: list[str] = ["semgrep", "scan", "--validate", "--oss-only", "--config", custom_rule_path]
121+
# Extra argument explanation:
122+
# --metrics off is used to disable metric collection. Metric collection makes a network connection, which can
123+
# impact running Semgrep offline.
124+
# --disable-version-check is used to disable the network connection made to semgrep.dev to check
125+
# if this is the latest version. This network connection can also impact running Semgrep offline.
126+
# --oss-only is used to ensure only the open-source offering of Semgrep is run.
127+
# Note, validation with --validate still currently makes a network connection to download linting
128+
# rules, which cannot be turned off.
129+
semgrep_commands: list[str] = [
130+
"semgrep",
131+
"scan",
132+
"--metrics",
133+
"off",
134+
"--disable-version-check",
135+
"--validate",
136+
"--oss-only",
137+
"--config",
138+
custom_rule_path,
139+
]
122140
try:
123141
process = subprocess.run(semgrep_commands, check=True, capture_output=True) # nosec B603
142+
if process.returncode != 0:
143+
# Only a warning is used here, so that if running offline, the analysis can continue. Erroneous Semgrep files
144+
# will be picked up at analysis time in this case.
145+
warning_msg = (
146+
f"Running semgrep validation on {custom_rule_path} with argument(s)"
147+
f" {process.args} "
148+
f" was not successful: {process.returncode}."
149+
" These custom rule(s) may not run successfully."
150+
)
151+
logger.warning(warning_msg)
152+
124153
except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as semgrep_error:
125-
error_msg = (
126-
f"Unable to run semgrep validation on {custom_rule_path} with arguments "
154+
warning_msg = (
155+
f"Unable to run semgrep validation on {custom_rule_path} with argument(s) "
127156
f"{semgrep_commands}: {semgrep_error}."
128157
)
129-
logger.debug(error_msg)
130-
raise ConfigurationError(error_msg) from semgrep_error
131-
132-
if process.returncode != 0:
133-
error_msg = f"Error running semgrep validation on {custom_rule_path} with arguments" f" {process.args}."
134-
logger.debug(error_msg)
135-
raise ConfigurationError(error_msg)
158+
logger.warning(warning_msg)
136159

137160
logger.debug("Including custom ruleset from %s.", custom_rule_path)
138161

@@ -245,10 +268,24 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
245268
analysis_result: dict = {}
246269
# Since we have to run them anyway, return disabled rule findings for debug information.
247270
disabled_results: dict = {}
248-
# Here, we disable 'nosemgrep' ignoring so that this is not an evasion method of our scan (i.e. malware includes
249-
# 'nosemgrep' comments to prevent our scan detecting those code lines). Read more about the 'nosemgrep' feature
250-
# here: https://semgrep.dev/docs/ignoring-files-folders-code
251-
semgrep_commands: list[str] = ["semgrep", "scan", "--oss-only", "--disable-nosem"]
271+
# Extra argument explanation:
272+
# --metrics off is used to disable metric collection. Metric collection makes a network connection, which can
273+
# impact running Semgrep offline.
274+
# --disable-version-check is used to disable the network connection made to semgrep.dev to check
275+
# if this is the latest version. This network connection can also impact running Semgrep offline.
276+
# --oss-only is used to ensure only the open-source offering of Semgrep is run.
277+
# --disable-nosem is used to disable 'nosemgrep' ignoring so that this is not an evasion method of
278+
# our scan (i.e. malware includes 'nosemgrep' comments to prevent our scan detecting those code lines).
279+
# Read more about the 'nosemgrep' feature here: https://semgrep.dev/docs/ignoring-files-folders-code
280+
semgrep_commands: list[str] = [
281+
"semgrep",
282+
"scan",
283+
"--metrics",
284+
"off",
285+
"--disable-version-check",
286+
"--oss-only",
287+
"--disable-nosem",
288+
]
252289
result: HeuristicResult = HeuristicResult.PASS
253290

254291
source_code_path = pypi_package_json.package_sourcecode_path
@@ -269,13 +306,13 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
269306
process = subprocess.run(semgrep_commands, check=True, capture_output=True) # nosec B603
270307
except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as semgrep_error:
271308
error_msg = (
272-
f"Unable to run semgrep on {source_code_path} with arguments {semgrep_commands}: {semgrep_error}"
309+
f"Unable to run semgrep on {source_code_path} with argument(s) {semgrep_commands}: {semgrep_error}"
273310
)
274311
logger.debug(error_msg)
275312
raise HeuristicAnalyzerValueError(error_msg) from semgrep_error
276313

277314
if process.returncode != 0:
278-
error_msg = f"Error running semgrep on {source_code_path} with arguments" f" {process.args}"
315+
error_msg = f"Error running semgrep on {source_code_path} with argument(s)" f" {process.args}"
279316
logger.debug(error_msg)
280317
raise HeuristicAnalyzerValueError(error_msg)
281318

tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def test_nonexistent_rule_path(mock_defaults: MagicMock) -> None:
9191

9292

9393
@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults")
94-
def test_invalid_custom_rules(mock_defaults: MagicMock) -> None:
94+
def test_invalid_custom_rules(mock_defaults: MagicMock, pypi_package_json: MagicMock) -> None:
9595
"""Test for when the provided file is not a valid semgrep rule, so analysis raises an error."""
9696
# Use this file as an invalid semgrep rule as it is most definitely not a semgrep rule, and does exist.
9797
defaults = {
@@ -103,8 +103,14 @@ def test_invalid_custom_rules(mock_defaults: MagicMock) -> None:
103103
mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi"
104104
mock_defaults.__getitem__.side_effect = lambda section: sub_section if section == "heuristic.pypi" else None
105105

106-
with pytest.raises(ConfigurationError):
107-
_ = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH)
106+
analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH)
107+
pypi_package_json.package_sourcecode_path = os.path.join(
108+
os.path.dirname(os.path.abspath(__file__)), "resources", "sourcecode_samples"
109+
)
110+
111+
# Semgrep should fail to run when we launch analysis
112+
with pytest.raises(HeuristicAnalyzerValueError):
113+
_ = analyzer.analyze(pypi_package_json)
108114

109115

110116
@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults")

0 commit comments

Comments
 (0)