Skip to content

Commit c898a36

Browse files
art1f1c3R and Trong Nhan Mai authored
test: refactor fake email heuristic to be more optimistic and allow offline tests (#1154)
Signed-off-by: Carl Flottmann <[email protected]> Signed-off-by: Trong Nhan Mai <[email protected]> Co-authored-by: Trong Nhan Mai <[email protected]>
1 parent d2f3e39 commit c898a36

File tree

5 files changed

+196
-101
lines changed

5 files changed

+196
-101
lines changed

src/macaron/malware_analyzer/pypi_heuristics/metadata/fake_email.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -101,28 +101,40 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
101101
"""
102102
package_json = pypi_package_json.package_json
103103
if not package_json.get("info", {}):
104+
# Malformed JSON, the "info" field must be present
104105
raise HeuristicAnalyzerValueError("No package info available.")
105106

106107
author_email = json_extract(package_json, ["info", "author_email"], str)
107108
maintainer_email = json_extract(package_json, ["info", "maintainer_email"], str)
108109

110+
# If there is no email information, this heuristic does not apply (note, this is common, so
111+
# not a case of malformed JSON)
109112
if not author_email and not maintainer_email:
110113
return HeuristicResult.SKIP, {"message": "No author or maintainer email available."}
111114

112-
validated_emails: list[JsonType] = []
113-
details = ["normalized", "local_part", "domain"]
115+
# non_emails are raw email field values from which no email address could be extracted
116+
# invalid_emails are ones that are invalid as per validate_email()
117+
# valid_emails are ones that are valid as per validate_email()
118+
detail_info: dict = {"non_emails": [], "invalid_emails": [], "valid_emails": []}
119+
result = HeuristicResult.FAIL
114120

115121
for email_field in [author_email, maintainer_email]:
116-
if email_field:
117-
emails = self.get_emails(email_field)
118-
if not emails:
119-
return HeuristicResult.FAIL, {"message": "no emails found in the email field"}
122+
if not email_field:
123+
continue
120124

121-
for email in emails:
122-
email_info = self.is_valid_email(email)
123-
if not email_info:
124-
return HeuristicResult.FAIL, {"invalid_email": email}
125+
emails = self.get_emails(email_field)
126+
if not emails:
127+
detail_info["non_emails"].append(email_field)
125128

126-
validated_emails.append({key: getattr(email_info, key) for key in details})
129+
for email in emails:
130+
email_info = self.is_valid_email(email)
131+
if email_info is None:
132+
detail_info["invalid_emails"].append(email)
133+
continue
127134

128-
return HeuristicResult.PASS, {"validated_emails": validated_emails}
135+
logger.debug("Email %s normalized to %s", email, email_info.normalized)
136+
detail_info["valid_emails"].append(email_info.normalized)
137+
# Optimistic, so if there exists a valid email, we will pass this heuristic
138+
result = HeuristicResult.PASS
139+
140+
return result, detail_info
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
2+
# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
3+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
4+
result=$(sqlite3 --json output/macaron.db "SELECT detect_malicious_metadata_check.detail_information
5+
FROM detect_malicious_metadata_check JOIN check_facts on detect_malicious_metadata_check.id = check_facts.id
6+
JOIN check_result on check_facts.check_result_id = check_result.id JOIN component
7+
ON component.id = check_result.component_id WHERE check_result.check_id = 'mcn_detect_malicious_metadata_1'
8+
AND component.name = 'smooth-operator'" | jq -r ".[0].detail_information | fromjson | .invalid_emails | length > 0")
9+
if [[ "$result" == "false" ]]; then
10+
echo "ERROR: the invalid_emails report for smooth-operator is empty" >&2
11+
exit 1
12+
fi
13+
14+
result=$(sqlite3 --json output/macaron.db "SELECT detect_malicious_metadata_check.detail_information
15+
FROM detect_malicious_metadata_check JOIN check_facts on detect_malicious_metadata_check.id = check_facts.id
16+
JOIN check_result on check_facts.check_result_id = check_result.id JOIN component
17+
ON component.id = check_result.component_id WHERE check_result.check_id = 'mcn_detect_malicious_metadata_1'
18+
AND component.name = 'email-validator'" | jq -r ".[0].detail_information | fromjson | .valid_emails | length > 0")
19+
if [[ "$result" == "false" ]]; then
20+
echo "ERROR: the valid_emails report for email-validator is empty" >&2
21+
exit 2
22+
fi
23+
exit 0
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */
2+
/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */
3+
4+
#include "prelude.dl"
5+
6+
Policy("check-smooth-operator", component_id, "Check smooth-operator artifacts") :-
7+
check_passed(component_id, "mcn_detect_malicious_metadata_1").
8+
9+
apply_policy_to("check-smooth-operator", component_id) :-
10+
is_component(component_id, purl),
11+
match("pkg:pypi/smooth-operator", purl).
12+
13+
Policy("check-email-validator", component_id, "Check email-validator artifacts") :-
14+
check_passed(component_id, "mcn_detect_malicious_metadata_1").
15+
16+
apply_policy_to("check-email-validator", component_id) :-
17+
is_component(component_id, purl),
18+
match("pkg:pypi/email-validator", purl).
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
description: |
5+
Analyzing two python packages to check if the email address is deliverable and valid. The package smooth-operator is known
6+
to use an invalid email address with an example.com domain. email-validator is known to have a valid, deliverable, email address.
7+
8+
tags:
9+
- macaron-python-package
10+
11+
steps:
12+
- name: Run macaron analyze against smooth-operator
13+
kind: analyze
14+
options:
15+
command_args:
16+
- -purl
17+
- pkg:pypi/smooth-operator
18+
- name: Run macaron analyze against email-validator
19+
kind: analyze
20+
options:
21+
command_args:
22+
- -purl
23+
- pkg:pypi/email-validator
24+
- name: Query the output database to verify the emails were detected as invalid and valid respectively.
25+
kind: shell
26+
options:
27+
cmd: ./check_emails.sh
28+
- name: Run macaron verify-policy to check the results of the packages still passed.
29+
kind: verify
30+
options:
31+
policy: policy.dl

tests/malware_analyzer/pypi/test_fake_email.py

Lines changed: 100 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -4,126 +4,137 @@
44
"""Tests for the FakeEmailAnalyzer heuristic."""
55

66

7+
import os
8+
from pathlib import Path
79
from unittest.mock import MagicMock
810

911
import pytest
1012

13+
from macaron.config.defaults import load_defaults
1114
from macaron.errors import HeuristicAnalyzerValueError
1215
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
1316
from macaron.malware_analyzer.pypi_heuristics.metadata.fake_email import FakeEmailAnalyzer
14-
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
17+
18+
# If check_deliverability is True, the DNS-based check is enabled for email_validator.
19+
# If check_deliverability is True and no network is available, email_validator will still perform the DNS-based check
20+
# but the DNS queries will time out.
21+
# email_validator doesn't treat timeout as a validation failure:
22+
# https://github.com/JoshData/python-email-validator/blob/98800bac023b8713351393a5043034065f1ea6cb/email_validator/deliverability.py#L144
23+
# Therefore, FakeEmailAnalyzer.is_valid_email doesn't return None, and we will treat this as a "valid email".
24+
# This has a risk of treating an invalid email as valid when an unexpected timeout occurs.
25+
# We have ensured that check_deliverability is always False for all unit test cases in this module, so that behavior
26+
# is not expected to happen.
27+
28+
29+
@pytest.fixture(name="fake_email_defaults_override")
30+
def set_defaults_(tmp_path: Path) -> None:
31+
"""Disable check_deliverability in defaults.ini so we do not make network connections.
32+
33+
Parameters
34+
----------
35+
tmp_path: Path
36+
Pytest temporary path fixture.
37+
"""
38+
defaults_file = Path(os.path.join(tmp_path, "config.ini"))
39+
content = """
40+
[heuristic.pypi]
41+
check_deliverability = False
42+
"""
43+
defaults_file.write_text(content, encoding="utf-8")
44+
assert load_defaults(str(defaults_file)) is True
1545

1646

1747
@pytest.fixture(name="analyzer")
18-
def analyzer_() -> FakeEmailAnalyzer:
48+
def analyzer_(fake_email_defaults_override: None) -> FakeEmailAnalyzer: # pylint: disable=unused-argument
1949
"""Pytest fixture to create a FakeEmailAnalyzer instance."""
2050
return FakeEmailAnalyzer()
2151

2252

23-
@pytest.fixture(name="pypi_package_json_asset_mock")
24-
def pypi_package_json_asset_mock_() -> MagicMock:
25-
"""Pytest fixture for a mock PyPIPackageJsonAsset."""
26-
mock_asset = MagicMock(spec=PyPIPackageJsonAsset)
27-
mock_asset.package_json = {}
28-
return mock_asset
29-
30-
31-
def test_analyze_skip_no_emails_present(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
32-
"""Test the analyzer skips if no author_email or maintainer_email is present."""
33-
pypi_package_json_asset_mock.package_json = {"info": {"author_email": None, "maintainer_email": None}}
34-
result, info = analyzer.analyze(pypi_package_json_asset_mock)
53+
def test_missing_info(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
54+
"""Test when JSON 'info' key is missing in the PyPI data (should error).
55+
56+
Parameters
57+
----------
58+
pypi_package_json: MagicMock
59+
The PyPIPackageJsonAsset MagicMock fixture.
60+
analyzer: FakeEmailAnalyzer
61+
An initialized FakeEmailAnalyzer instance.
62+
"""
63+
pypi_package_json.package_json = {} # No 'info' key
64+
with pytest.raises(HeuristicAnalyzerValueError):
65+
analyzer.analyze(pypi_package_json)
66+
67+
68+
def test_no_emails_present(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
69+
"""Test when no author_email or maintainer_email is present (should skip).
70+
71+
Parameters
72+
----------
73+
pypi_package_json: MagicMock
74+
The PyPIPackageJsonAsset MagicMock fixture.
75+
analyzer: FakeEmailAnalyzer
76+
An initialized FakeEmailAnalyzer instance.
77+
"""
78+
pypi_package_json.package_json = {"info": {"author_email": None, "maintainer_email": None}}
79+
result, _ = analyzer.analyze(pypi_package_json)
3580
assert result == HeuristicResult.SKIP
36-
assert info["message"] == "No author or maintainer email available."
37-
38-
39-
def test_analyze_raises_error_for_missing_info_key(
40-
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
41-
) -> None:
42-
"""Test the analyzer raises an error if the 'info' key is missing in the PyPI data."""
43-
pypi_package_json_asset_mock.package_json = {} # No 'info' key
44-
with pytest.raises(HeuristicAnalyzerValueError) as exc_info:
45-
analyzer.analyze(pypi_package_json_asset_mock)
46-
assert "No package info available." in str(exc_info.value)
47-
48-
49-
def test_analyze_fail_no_email_found_in_field(
50-
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
51-
) -> None:
52-
"""Test the analyzer fails if an email field does not contain a parsable email address."""
53-
pypi_package_json_asset_mock.package_json = {"info": {"author_email": "not an email", "maintainer_email": None}}
54-
result, info = analyzer.analyze(pypi_package_json_asset_mock)
55-
assert result == HeuristicResult.FAIL
56-
assert info == {"message": "no emails found in the email field"}
5781

5882

59-
def test_analyze_fail_invalid_email(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
60-
"""Test analyzer fails if the email field contains an invalid email format."""
61-
invalid_email = "user@example"
62-
pypi_package_json_asset_mock.package_json = {"info": {"author_email": invalid_email, "maintainer_email": None}}
83+
def test_non_email(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
84+
"""Test with a non-parsable email address (should fail).
6385
64-
result, info = analyzer.analyze(pypi_package_json_asset_mock)
86+
Parameters
87+
----------
88+
pypi_package_json: MagicMock
89+
The PyPIPackageJsonAsset MagicMock fixture.
90+
analyzer: FakeEmailAnalyzer
91+
An initialized FakeEmailAnalyzer instance.
92+
"""
93+
pypi_package_json.package_json = {"info": {"author_email": "not an email", "maintainer_email": "also not an email"}}
94+
result, info = analyzer.analyze(pypi_package_json)
6595
assert result == HeuristicResult.FAIL
66-
assert info == {"message": "no emails found in the email field"}
67-
6896

69-
def test_analyze_pass_only_maintainer_email_valid(
70-
analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
71-
) -> None:
72-
"""Test the analyzer passes if only a valid maintainer_email is present and deliverability is not checked."""
73-
74-
pypi_package_json_asset_mock.package_json = {"info": {"author_email": None, "maintainer_email": email}}
75-
result, info = analyzer.analyze(pypi_package_json_asset_mock)
97+
# assert types (for mypy)
98+
assert isinstance(info["non_emails"], list)
7699

77-
if analyzer.check_deliverability:
78-
assert result == HeuristicResult.FAIL
79-
assert info == {"invalid_email": email}
80-
return
81-
82-
assert result == HeuristicResult.PASS
83-
assert info["validated_emails"] == [
84-
{"normalized": "[email protected]", "local_part": "maintainer", "domain": "example.net"}
85-
]
100+
assert "not an email" in info["non_emails"]
101+
assert "also not an email" in info["non_emails"]
86102

87103

88-
def test_analyze_pass_both_emails_valid(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
89-
"""Test the analyzer passes if both emails are valid and deliverability is not checked."""
90-
author_email = "[email protected]"
91-
author_local_part, author_domain = author_email.split("@")
92-
maintainer_email = "[email protected]"
93-
maintainer_local_part, maintainer_domain = maintainer_email.split("@")
104+
def test_valid_email(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
105+
"""Test with valid email address format (should pass).
94106
95-
pypi_package_json_asset_mock.package_json = {
96-
"info": {"author_email": author_email, "maintainer_email": maintainer_email}
107+
Parameters
108+
----------
109+
pypi_package_json: MagicMock
110+
The PyPIPackageJsonAsset MagicMock fixture.
111+
analyzer: FakeEmailAnalyzer
112+
An initialized FakeEmailAnalyzer instance.
113+
"""
114+
pypi_package_json.package_json = {
115+
"info": {
116+
"author_email": "[email protected]",
117+
"maintainer_email": "[email protected]",
118+
}
97119
}
98-
result, info = analyzer.analyze(pypi_package_json_asset_mock)
99-
if analyzer.check_deliverability:
100-
assert result == HeuristicResult.FAIL
101-
assert info == {"invalid_email": maintainer_email}
102-
return
103-
120+
result, info = analyzer.analyze(pypi_package_json)
104121
assert result == HeuristicResult.PASS
105122

106-
validated_emails = info.get("validated_emails")
107-
assert isinstance(validated_emails, list)
108-
assert len(validated_emails) == 2
109-
assert {"normalized": author_email, "local_part": author_local_part, "domain": author_domain} in validated_emails
110-
assert {
111-
"normalized": maintainer_email,
112-
"local_part": maintainer_local_part,
113-
"domain": maintainer_domain,
114-
} in validated_emails
115-
123+
# assert types (for mypy)
124+
assert isinstance(info["valid_emails"], list)
116125

117-
def test_is_valid_email_failure(analyzer: FakeEmailAnalyzer) -> None:
118-
"""Test is_valid_email returns None on failure."""
119-
result = analyzer.is_valid_email("invalid-email")
120-
assert result is None
126+
assert "[email protected]" in info["valid_emails"]
127+
assert "[email protected]" in info["valid_emails"]
121128

122129

123130
def test_get_emails(analyzer: FakeEmailAnalyzer) -> None:
124-
"""Test the get_emails method."""
125-
email_field = "[email protected], another test <[email protected]>"
126-
131+
"""Test the get_emails method extracts emails from text correctly.
132+
133+
analyzer: FakeEmailAnalyzer
134+
An initialized FakeEmailAnalyzer instance.
135+
"""
136+
email_field = "[email protected], Another User <[email protected]>, please also email [email protected] thanks!"
137+
127138
assert analyzer.get_emails(email_field) == expected
128139

129140
email_field_no_email = "this is not an email"

0 commit comments

Comments
 (0)