test: refactor fake email heuristic to be more optimistic and allow offline tests (#1154)

art1f1c3R · Trong Nhan Mai · web-flow · commit c898a36f1e6d · 2025-08-19T11:11:16.000+10:00
Signed-off-by: Carl Flottmann <carl.flottmann@oracle.com>
Signed-off-by: Trong Nhan Mai <trong.nhan.mai@oracle.com>
Co-authored-by: Trong Nhan Mai <trong.nhan.mai@oracle.com>
diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/fake_email.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/fake_email.py
@@ -101,28 +101,40 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
         """
         package_json = pypi_package_json.package_json
         if not package_json.get("info", {}):
+            # Malformed JSON, the "info" field must be present
             raise HeuristicAnalyzerValueError("No package info available.")
 
         author_email = json_extract(package_json, ["info", "author_email"], str)
         maintainer_email = json_extract(package_json, ["info", "maintainer_email"], str)
 
+        # If there is no email information, this heuristic does not apply (note, this is common, so
+        # not a case of malformed JSON)
         if not author_email and not maintainer_email:
             return HeuristicResult.SKIP, {"message": "No author or maintainer email available."}
 
-        validated_emails: list[JsonType] = []
-        details = ["normalized", "local_part", "domain"]
+        # non_emails are ones where an email format cannot be extracted
+        # invalid_emails are ones that are invalid as per validate_email()
+        # valid_emails are ones that are valid as per validate_email()
+        detail_info: dict = {"non_emails": [], "invalid_emails": [], "valid_emails": []}
+        result = HeuristicResult.FAIL
 
         for email_field in [author_email, maintainer_email]:
-            if email_field:
-                emails = self.get_emails(email_field)
-                if not emails:
-                    return HeuristicResult.FAIL, {"message": "no emails found in the email field"}
+            if not email_field:
+                continue
 
-                for email in emails:
-                    email_info = self.is_valid_email(email)
-                    if not email_info:
-                        return HeuristicResult.FAIL, {"invalid_email": email}
+            emails = self.get_emails(email_field)
+            if not emails:
+                detail_info["non_emails"].append(email_field)
 
-                    validated_emails.append({key: getattr(email_info, key) for key in details})
+            for email in emails:
+                email_info = self.is_valid_email(email)
+                if email_info is None:
+                    detail_info["invalid_emails"].append(email)
+                    continue
 
-        return HeuristicResult.PASS, {"validated_emails": validated_emails}
+                logger.debug("Email %s normalized to %s", email, email_info.normalized)
+                detail_info["valid_emails"].append(email_info.normalized)
+                # Optimistic, so if there exists a valid email, we will pass this heuristic
+                result = HeuristicResult.PASS
+
+        return result, detail_info
diff --git a/tests/integration/cases/email_checking_pypi_malware_analyzer/check_emails.sh b/tests/integration/cases/email_checking_pypi_malware_analyzer/check_emails.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+result=$(sqlite3 --json output/macaron.db "SELECT detect_malicious_metadata_check.detail_information
+    FROM detect_malicious_metadata_check JOIN check_facts on detect_malicious_metadata_check.id = check_facts.id
+    JOIN check_result on check_facts.check_result_id = check_result.id JOIN component
+    ON component.id = check_result.component_id WHERE check_result.check_id = 'mcn_detect_malicious_metadata_1'
+    AND component.name = 'smooth-operator'" | jq -r ".[0].detail_information | fromjson | .invalid_emails | length > 0")
+if [[ "$result" == "false" ]]; then
+    echo "ERROR: the invalid_emails report for smooth-operator is empty" >&2
+    exit 1
+fi
+
+result=$(sqlite3 --json output/macaron.db "SELECT detect_malicious_metadata_check.detail_information
+    FROM detect_malicious_metadata_check JOIN check_facts on detect_malicious_metadata_check.id = check_facts.id
+    JOIN check_result on check_facts.check_result_id = check_result.id JOIN component
+    ON component.id = check_result.component_id WHERE check_result.check_id = 'mcn_detect_malicious_metadata_1'
+    AND component.name = 'email-validator'" | jq -r ".[0].detail_information | fromjson | .valid_emails | length > 0")
+if [[ "$result" == "false" ]]; then
+    echo "ERROR: the valid_emails report for email-validator is empty" >&2
+    exit 2
+fi
+exit 0
diff --git a/tests/integration/cases/email_checking_pypi_malware_analyzer/policy.dl b/tests/integration/cases/email_checking_pypi_malware_analyzer/policy.dl
@@ -0,0 +1,18 @@
+/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */
+/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */
+
+#include "prelude.dl"
+
+Policy("check-smooth-operator", component_id, "Check smooth-operator artifacts") :-
+    check_passed(component_id, "mcn_detect_malicious_metadata_1").
+
+apply_policy_to("check-smooth-operator", component_id) :-
+    is_component(component_id, purl),
+    match("pkg:pypi/smooth-operator", purl).
+
+Policy("check-email-validator", component_id, "Check email-validator artifacts") :-
+    check_passed(component_id, "mcn_detect_malicious_metadata_1").
+
+apply_policy_to("check-email-validator", component_id) :-
+    is_component(component_id, purl),
+    match("pkg:pypi/email-validator", purl).
diff --git a/tests/integration/cases/email_checking_pypi_malware_analyzer/test.yaml b/tests/integration/cases/email_checking_pypi_malware_analyzer/test.yaml
@@ -0,0 +1,31 @@
+# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+description: |
+  Analyzing two Python packages to check if the email address is deliverable and valid. The package smooth-operator is known
+  to use an invalid email address with an example.com domain. email-validator is known to have a valid, deliverable email address.
+
+tags:
+- macaron-python-package
+
+steps:
+- name: Run macaron analyze against smooth-operator
+  kind: analyze
+  options:
+    command_args:
+    - -purl
+    - pkg:pypi/smooth-operator
+- name: Run macaron analyze against email-validator
+  kind: analyze
+  options:
+    command_args:
+    - -purl
+    - pkg:pypi/email-validator
+- name: Query the output database to verify the emails were detected as invalid and valid respectively.
+  kind: shell
+  options:
+    cmd: ./check_emails.sh
+- name: Run macaron verify-policy to check the results of the packages still passed.
+  kind: verify
+  options:
+    policy: policy.dl
diff --git a/tests/malware_analyzer/pypi/test_fake_email.py b/tests/malware_analyzer/pypi/test_fake_email.py
@@ -4,126 +4,137 @@
 """Tests for the FakeEmailAnalyzer heuristic."""
 
 
+import os
+from pathlib import Path
 from unittest.mock import MagicMock
 
 import pytest
 
+from macaron.config.defaults import load_defaults
 from macaron.errors import HeuristicAnalyzerValueError
 from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
 from macaron.malware_analyzer.pypi_heuristics.metadata.fake_email import FakeEmailAnalyzer
-from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
+
+# If check_deliverability is True, DNS-based check is enabled for email_validator.
+# If check_deliverability is True and no network is available, email_validator will perform DNS-based check
+# but the DNS queries will time out.
+# email_validator doesn't treat timeout as a validation failure:
+# https://github.com/JoshData/python-email-validator/blob/98800bac023b8713351393a5043034065f1ea6cb/email_validator/deliverability.py#L144
+# Therefore, FakeEmailAnalyzer.is_valid_email doesn't return None, and we will treat this as a "valid email".
+# This has a risk of treating an invalid email as valid when an unexpected timeout occurs.
+# We have ensured that check_deliverability is always False for all unit test cases in this module, so that behavior
+# is not expected to happen.
+
+
+@pytest.fixture(name="fake_email_defaults_override")
+def set_defaults_(tmp_path: Path) -> None:
+    """Disable check_deliverability in defaults.ini so we do not make network connections.
+
+    Parameters
+    ----------
+    tmp_path: Path
+        Pytest temporary path fixture.
+    """
+    defaults_file = Path(os.path.join(tmp_path, "config.ini"))
+    content = """
+    [heuristic.pypi]
+    check_deliverability = False
+    """
+    defaults_file.write_text(content, encoding="utf-8")
+    assert load_defaults(str(defaults_file)) is True
 
 
 @pytest.fixture(name="analyzer")
-def analyzer_() -> FakeEmailAnalyzer:
+def analyzer_(fake_email_defaults_override: None) -> FakeEmailAnalyzer:  # pylint: disable=unused-argument
     """Pytest fixture to create a FakeEmailAnalyzer instance."""
     return FakeEmailAnalyzer()
 
 
-@pytest.fixture(name="pypi_package_json_asset_mock")
-def pypi_package_json_asset_mock_() -> MagicMock:
-    """Pytest fixture for a mock PyPIPackageJsonAsset."""
-    mock_asset = MagicMock(spec=PyPIPackageJsonAsset)
-    mock_asset.package_json = {}
-    return mock_asset
-
-
-def test_analyze_skip_no_emails_present(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
-    """Test the analyzer skips if no author_email or maintainer_email is present."""
-    pypi_package_json_asset_mock.package_json = {"info": {"author_email": None, "maintainer_email": None}}
-    result, info = analyzer.analyze(pypi_package_json_asset_mock)
+def test_missing_info(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
+    """Test when JSON 'info' key is missing in the PyPI data (should error).
+
+    Parameters
+    ----------
+    pypi_package_json: MagicMock
+        The PyPIPackageJsonAsset MagicMock fixture.
+    analyzer: FakeEmailAnalyzer
+        An initialized FakeEmailAnalyzer instance.
+    """
+    pypi_package_json.package_json = {}  # No 'info' key
+    with pytest.raises(HeuristicAnalyzerValueError):
+        analyzer.analyze(pypi_package_json)
+
+
+def test_no_emails_present(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
+    """Test when no author_email or maintainer_email is present (should skip).
+
+    Parameters
+    ----------
+    pypi_package_json: MagicMock
+        The PyPIPackageJsonAsset MagicMock fixture.
+    analyzer: FakeEmailAnalyzer
+        An initialized FakeEmailAnalyzer instance.
+    """
+    pypi_package_json.package_json = {"info": {"author_email": None, "maintainer_email": None}}
+    result, _ = analyzer.analyze(pypi_package_json)
     assert result == HeuristicResult.SKIP
-    assert info["message"] == "No author or maintainer email available."
-
-
-def test_analyze_raises_error_for_missing_info_key(
-    analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
-) -> None:
-    """Test the analyzer raises an error if the 'info' key is missing in the PyPI data."""
-    pypi_package_json_asset_mock.package_json = {}  # No 'info' key
-    with pytest.raises(HeuristicAnalyzerValueError) as exc_info:
-        analyzer.analyze(pypi_package_json_asset_mock)
-    assert "No package info available." in str(exc_info.value)
-
-
-def test_analyze_fail_no_email_found_in_field(
-    analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
-) -> None:
-    """Test the analyzer fails if an email field does not contain a parsable email address."""
-    pypi_package_json_asset_mock.package_json = {"info": {"author_email": "not an email", "maintainer_email": None}}
-    result, info = analyzer.analyze(pypi_package_json_asset_mock)
-    assert result == HeuristicResult.FAIL
-    assert info == {"message": "no emails found in the email field"}
 
 
-def test_analyze_fail_invalid_email(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
-    """Test analyzer fails if the email field contains an invalid email format."""
-    invalid_email = "user@example"
-    pypi_package_json_asset_mock.package_json = {"info": {"author_email": invalid_email, "maintainer_email": None}}
+def test_non_email(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
+    """Test with a non-parsable email address (should fail).
 
-    result, info = analyzer.analyze(pypi_package_json_asset_mock)
+    Parameters
+    ----------
+    pypi_package_json: MagicMock
+        The PyPIPackageJsonAsset MagicMock fixture.
+    analyzer: FakeEmailAnalyzer
+        An initialized FakeEmailAnalyzer instance.
+    """
+    pypi_package_json.package_json = {"info": {"author_email": "not an email", "maintainer_email": "also not an email"}}
+    result, info = analyzer.analyze(pypi_package_json)
     assert result == HeuristicResult.FAIL
-    assert info == {"message": "no emails found in the email field"}
-
 
-def test_analyze_pass_only_maintainer_email_valid(
-    analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock
-) -> None:
-    """Test the analyzer passes if only a valid maintainer_email is present and deliverability is not checked."""
-    email = "maintainer@example.net"
-    pypi_package_json_asset_mock.package_json = {"info": {"author_email": None, "maintainer_email": email}}
-    result, info = analyzer.analyze(pypi_package_json_asset_mock)
+    # assert types (for mypy)
+    assert isinstance(info["non_emails"], list)
 
-    if analyzer.check_deliverability:
-        assert result == HeuristicResult.FAIL
-        assert info == {"invalid_email": email}
-        return
-
-    assert result == HeuristicResult.PASS
-    assert info["validated_emails"] == [
-        {"normalized": "maintainer@example.net", "local_part": "maintainer", "domain": "example.net"}
-    ]
+    assert "not an email" in info["non_emails"]
+    assert "also not an email" in info["non_emails"]
 
 
-def test_analyze_pass_both_emails_valid(analyzer: FakeEmailAnalyzer, pypi_package_json_asset_mock: MagicMock) -> None:
-    """Test the analyzer passes if both emails are valid and deliverability is not checked."""
-    author_email = "example@gmail.com"
-    author_local_part, author_domain = author_email.split("@")
-    maintainer_email = "maintainer@example.net"
-    maintainer_local_part, maintainer_domain = maintainer_email.split("@")
+def test_valid_email(pypi_package_json: MagicMock, analyzer: FakeEmailAnalyzer) -> None:
+    """Test with valid email address format (should pass).
 
-    pypi_package_json_asset_mock.package_json = {
-        "info": {"author_email": author_email, "maintainer_email": maintainer_email}
+    Parameters
+    ----------
+    pypi_package_json: MagicMock
+        The PyPIPackageJsonAsset MagicMock fixture.
+    analyzer: FakeEmailAnalyzer
+        An initialized FakeEmailAnalyzer instance.
+    """
+    pypi_package_json.package_json = {
+        "info": {
+            "author_email": "test.email.zwiusiubwq@gmail.com",
+            "maintainer_email": "test.email.fqnmwbsxca@gmail.com",
+        }
     }
-    result, info = analyzer.analyze(pypi_package_json_asset_mock)
-    if analyzer.check_deliverability:
-        assert result == HeuristicResult.FAIL
-        assert info == {"invalid_email": maintainer_email}
-        return
-
+    result, info = analyzer.analyze(pypi_package_json)
     assert result == HeuristicResult.PASS
 
-    validated_emails = info.get("validated_emails")
-    assert isinstance(validated_emails, list)
-    assert len(validated_emails) == 2
-    assert {"normalized": author_email, "local_part": author_local_part, "domain": author_domain} in validated_emails
-    assert {
-        "normalized": maintainer_email,
-        "local_part": maintainer_local_part,
-        "domain": maintainer_domain,
-    } in validated_emails
-
+    # assert types (for mypy)
+    assert isinstance(info["valid_emails"], list)
 
-def test_is_valid_email_failure(analyzer: FakeEmailAnalyzer) -> None:
-    """Test is_valid_email returns None on failure."""
-    result = analyzer.is_valid_email("invalid-email")
-    assert result is None
+    assert "test.email.zwiusiubwq@gmail.com" in info["valid_emails"]
+    assert "test.email.fqnmwbsxca@gmail.com" in info["valid_emails"]
 
 
 def test_get_emails(analyzer: FakeEmailAnalyzer) -> None:
-    """Test the get_emails method."""
-    email_field = "test@example.com, another test <another@example.org>"
-    expected = ["test@example.com", "another@example.org"]
+    """Test the get_emails method extracts emails from text correctly.
+
+    analyzer: FakeEmailAnalyzer
+        An initialized FakeEmailAnalyzer instance.
+    """
+    email_field = "test@example.com, Another User <anotheruser@example.org>, please also email me@example.net thanks!"
+    expected = ["test@example.com", "anotheruser@example.org", "me@example.net"]
     assert analyzer.get_emails(email_field) == expected
 
     email_field_no_email = "this is not an email"