pySCG: add test to validate inlined code matches actual files

tommcd · tommcd · commit 34447b3b26fe · 2025-11-18T10:41:22.000Z
Add critical test to ensure README inlined code blocks match actual
Python files (after stripping SPDX headers and test markers).

This test catches a common documentation error where:
- Code is updated but README isn't updated
- SPDX headers are incorrectly included in inlined code
- Test markers (EXPECTED_TIMEOUT/EXPECTED_FAILURE) appear in docs

Currently identifies 18 files with mismatches (pre-existing issues).
These should be fixed in follow-up PRs.

Signed-off-by: tommcd <tcmcdermott@gmail.com>
diff --git a/docs/Secure-Coding-Guide-for-Python/tests/test_markdown_validation.py b/docs/Secure-Coding-Guide-for-Python/tests/test_markdown_validation.py
@@ -213,3 +213,42 @@ def test_readme_follows_template_order(readme_file: Path):
         f"{readme_file}:\n  Section order issues:\n"
         + "\n".join(f"    - {issue}" for issue in order_issues)
     )
+
+
+@pytest.mark.markdown
+def test_readme_inlined_code_matches_files(readme_file: Path):
+    """
+    Validate that inlined code in README.md matches actual Python files.
+
+    README files contain inlined code blocks that reference Python files like:
+    *[noncompliant01.py](noncompliant01.py):*
+    ```python
+    ... code ...
+    ```
+
+    Fails with one message per mismatch reported by compare_inlined_code
+    (the file content is compared after stripping SPDX headers and markers).
+
+    Args:
+        readme_file: Path to README.md file to validate
+    """
+    from tests.utils.code_inline_validator import compare_inlined_code, format_diff
+
+    error_messages = []
+    for filename, issue_type, inlined, actual in compare_inlined_code(readme_file):
+        if issue_type == "missing_file":
+            # For missing files the third tuple slot carries the description.
+            error_messages.append(f"  - {filename}: {inlined}")
+        elif issue_type == "content_mismatch":
+            error_messages.append(
+                f"  - {filename}: Inlined code doesn't match file content"
+            )
+            # Expected = actual file content, Actual = what's inlined in README
+            error_messages.append(f"    {format_diff(actual, inlined)}")
+        else:
+            # Never drop an issue type this test does not know about.
+            error_messages.append(f"  - {filename}: {issue_type}")
+
+    if error_messages:  # explicit raise: `assert False` vanishes under -O
+        header = f"{readme_file}:\n  Inlined code mismatches:\n"
+        raise AssertionError(header + "\n".join(error_messages))
diff --git a/docs/Secure-Coding-Guide-for-Python/tests/utils/code_inline_validator.py b/docs/Secure-Coding-Guide-for-Python/tests/utils/code_inline_validator.py
@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: OpenSSF Best Practices WG
+
+"""
+Utility functions for validating inlined code in README files.
+
+This module provides functions to extract code blocks from README.md files
+and compare them with the actual Python files they reference.
+"""
+
+import re
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+
+def strip_spdx_headers(code: str) -> str:
+    """
+    Drop SPDX header lines and test-marker comments from Python source.
+
+    A line is discarded when, ignoring surrounding whitespace, it begins
+    with one of these prefixes:
+    - "# SPDX-"
+    - "# EXPECTED_TIMEOUT:"
+    - "# EXPECTED_FAILURE:"
+
+    Matching lines are removed wherever they occur, not only at the top
+    of the file; every other line is kept unchanged and in order.
+
+    Args:
+        code: Python code string
+
+    Returns:
+        The code with all matching lines removed, newline-joined
+    """
+    skipped_prefixes = (
+        "# SPDX-",
+        "# EXPECTED_TIMEOUT:",
+        "# EXPECTED_FAILURE:",
+    )
+    return "\n".join(
+        text
+        for text in code.split("\n")
+        if not text.strip().startswith(skipped_prefixes)
+    )
+
+
+def extract_inlined_code_blocks(readme_path: Path) -> Dict[str, str]:
+    """
+    Collect the Python code blocks a README.md inlines for linked files.
+
+    A block is recognised when a link line is followed by a python fence:
+    *[noncompliant01.py](noncompliant01.py):*
+    ```python
+    ... code ...
+    ```
+
+    The key is the link text (which must end in ".py"), not the link
+    target. If the same link text occurs more than once, the last block
+    found replaces the earlier ones.
+
+    Args:
+        readme_path: Path to README.md file (read as UTF-8)
+
+    Returns:
+        Dictionary mapping filename to inlined code content
+    """
+    # Group 1: link text ending in .py; group 2: the fenced body, matched
+    # lazily so it stops at the first line that starts a closing fence.
+    block_re = re.compile(
+        r"\*\[([^\]]+\.py)\]\([^\)]+\):\*\s*```python\s*\n(.*?)\n```",
+        re.DOTALL,  # let "." span newlines inside the code body
+    )
+    readme_text = readme_path.read_text(encoding="utf-8")
+    return {
+        found.group(1): found.group(2)
+        for found in block_re.finditer(readme_text)
+    }
+
+
+def normalize_code(code: str) -> str:
+    """
+    Put a code string into a canonical form for equality comparison.
+
+    - Removes leading/trailing whitespace (including blank lines)
+    - Removes trailing whitespace from each line
+    - As a result CRLF line endings compare equal to LF ones
+
+    Args:
+        code: Code string to normalize
+
+    Returns:
+        Normalized code string with lines joined by LF
+    """
+    trimmed_rows = code.strip().split("\n")
+    # rstrip() also drops a trailing "\r", so CRLF input ends up LF-only
+    return "\n".join(row.rstrip() for row in trimmed_rows)
+
+
+def compare_inlined_code(
+    readme_path: Path,
+) -> List[Tuple[str, str, str, str]]:
+    """
+    Check every code block inlined in a README against the file it names.
+
+    Referenced files are resolved relative to the README's directory. The
+    file content has SPDX/test-marker lines stripped, and both sides are
+    normalized, before they are compared.
+
+    Args:
+        readme_path: Path to README.md file
+
+    Returns:
+        List of 4-tuples, one per problem; empty if all code matches:
+        - (filename, "missing_file", description, "") when the
+          referenced file does not exist
+        - (filename, "content_mismatch", inlined_code, actual_code)
+          with both code strings in normalized form
+    """
+    base_dir = readme_path.parent
+    problems = []
+
+    for filename, readme_snippet in extract_inlined_code_blocks(readme_path).items():
+        source_file = base_dir / filename
+        if source_file.exists():
+            # Compare the normalized README snippet with the normalized,
+            # header-stripped content of the real file.
+            on_disk = strip_spdx_headers(source_file.read_text(encoding="utf-8"))
+            snippet_norm = normalize_code(readme_snippet)
+            on_disk_norm = normalize_code(on_disk)
+            if snippet_norm != on_disk_norm:
+                problems.append(
+                    (filename, "content_mismatch", snippet_norm, on_disk_norm)
+                )
+        else:
+            problems.append(
+                (
+                    filename,
+                    "missing_file",
+                    f"File referenced in README but not found: {filename}",
+                    "",
+                )
+            )
+
+    return problems
+
+
+def format_diff(expected: str, actual: str, context_lines: int = 3) -> str:
+    """
+    Format a simple line-by-line diff between expected and actual code.
+
+    Lines are compared by position; a line missing on one side is shown
+    as empty. At most 6 differing lines (18 output lines) are reported,
+    followed by a count of the differences that were left out.
+
+    Args:
+        expected: Expected code
+        actual: Actual code
+        context_lines: Currently unused; kept for backward compatibility
+
+    Returns:
+        Formatted diff string (empty if no line differs)
+    """
+    from itertools import zip_longest
+    max_shown = 6  # whole entries only, so no "Expected" loses its "Actual"
+    pairs = zip_longest(expected.split("\n"), actual.split("\n"), fillvalue="")
+    entries = [
+        f"Line {number}:\n  Expected: {exp_line}\n  Actual:   {act_line}"
+        for number, (exp_line, act_line) in enumerate(pairs, start=1)
+        if exp_line != act_line
+    ]
+    if len(entries) > max_shown:
+        omitted = len(entries) - max_shown
+        entries[max_shown:] = [f"... {omitted} more differing line(s) not shown"]
+    return "\n".join(entries)
+