|
| 1 | +# SPDX-License-Identifier: Apache-2.0 |
| 2 | +# SPDX-FileCopyrightText: OpenSSF Best Practices WG |
| 3 | + |
| 4 | +""" |
| 5 | +Utility functions for validating inlined code in README files. |
| 6 | +
|
| 7 | +This module provides functions to extract code blocks from README.md files |
| 8 | +and compare them with the actual Python files they reference. |
| 9 | +""" |
| 10 | + |
| 11 | +import re |
| 12 | +from pathlib import Path |
| 13 | +from typing import Dict, List, Tuple |
| 14 | + |
| 15 | + |
def strip_spdx_headers(code: str) -> str:
    """
    Remove SPDX copyright headers and test markers from code.

    A line is dropped when, after stripping surrounding whitespace, it
    starts with one of:
    - "# SPDX-"
    - "# EXPECTED_TIMEOUT:"
    - "# EXPECTED_FAILURE:"

    All other lines are kept unchanged and in their original order.

    Args:
        code: Python code string

    Returns:
        Code with SPDX headers and test markers removed
    """
    # str.startswith accepts a tuple, so one call covers every marker.
    skipped_prefixes = ("# SPDX-", "# EXPECTED_TIMEOUT:", "# EXPECTED_FAILURE:")

    return "\n".join(
        line
        for line in code.split("\n")
        if not line.strip().startswith(skipped_prefixes)
    )
| 45 | + |
| 46 | + |
def extract_inlined_code_blocks(readme_path: Path) -> Dict[str, str]:
    """
    Extract inlined code blocks from README.md that reference Python files.

    Looks for patterns like:
        *[noncompliant01.py](noncompliant01.py):*
        ```python
        ... code ...
        ```

    The filename is taken from the link text (the part in square brackets).
    If the same filename appears more than once, the last block wins.

    Args:
        readme_path: Path to README.md file

    Returns:
        Dictionary mapping filename to inlined code content (without the
        newline that precedes the closing fence). An empty fenced block maps
        to an empty string.
    """
    content = readme_path.read_text(encoding="utf-8")

    # *[filename.py](target):* followed by a ```python fenced block.
    # DOTALL lets the body span lines; MULTILINE anchors the closing fence at
    # the start of a line, so an empty block (opening fence immediately
    # followed by the closing fence) ends where it should instead of running
    # on to a later fence.
    pattern = re.compile(
        r"\*\[([^\]]+\.py)\]\([^\)]+\):\*\s*```python\s*\n(.*?)^```",
        re.DOTALL | re.MULTILINE,
    )

    inlined_code: Dict[str, str] = {}
    for match in pattern.finditer(content):
        filename = match.group(1)
        code = match.group(2)
        # The captured body is either empty or ends with the newline that
        # terminates its last line; that newline is not part of the code.
        if code.endswith("\n"):
            code = code[:-1]
        inlined_code[filename] = code

    return inlined_code
| 78 | + |
| 79 | + |
def normalize_code(code: str) -> str:
    """
    Normalize code for comparison.

    - Normalizes line endings (CRLF and lone CR become LF)
    - Strips leading/trailing whitespace from the whole text
    - Removes trailing whitespace from each line

    Args:
        code: Code string to normalize

    Returns:
        Normalized code string
    """
    # Convert CRLF first so it does not turn into two line breaks.
    unified = code.replace("\r\n", "\n").replace("\r", "\n")
    return "\n".join(line.rstrip() for line in unified.strip().split("\n"))
| 97 | + |
| 98 | + |
def compare_inlined_code(
    readme_path: Path,
) -> List[Tuple[str, str, str, str]]:
    """
    Compare inlined code in README with actual Python files.

    Each filename found by extract_inlined_code_blocks() is resolved relative
    to the README's directory. The file on disk has its SPDX headers and test
    markers removed, and both sides are normalized before comparison.

    Args:
        readme_path: Path to README.md file

    Returns:
        List of 4-tuples, one per problem. Empty list if all code matches.
        - (filename, "missing_file", message, ""): the referenced path is
          not an existing regular file.
        - (filename, "content_mismatch", inlined_code, actual_code): the
          normalized README code differs from the normalized file content.
    """
    readme_dir = readme_path.parent
    mismatches: List[Tuple[str, str, str, str]] = []

    for filename, inlined in extract_inlined_code_blocks(readme_path).items():
        py_file = readme_dir / filename

        # is_file() rather than exists(): a directory with that name cannot
        # be read as source, so it is reported instead of failing on read.
        if not py_file.is_file():
            mismatches.append(
                (
                    filename,
                    "missing_file",
                    f"File referenced in README but not found: {filename}",
                    "",
                )
            )
            continue

        # Read actual file and strip SPDX headers
        actual_code = py_file.read_text(encoding="utf-8")
        actual_normalized = normalize_code(strip_spdx_headers(actual_code))
        inlined_normalized = normalize_code(inlined)

        if inlined_normalized != actual_normalized:
            # Stored as (filename, issue_type, inlined_code, actual_code)
            mismatches.append(
                (filename, "content_mismatch", inlined_normalized, actual_normalized)
            )

    return mismatches
| 145 | + |
| 146 | + |
def format_diff(expected: str, actual: str, context_lines: int = 3) -> str:
    """
    Format a simple diff between expected and actual code.

    Lines are compared by position. Where one text is shorter, its missing
    lines are treated as empty strings. Each differing position produces a
    three-line entry (line number, expected text, actual text).

    At most six entries are shown so the report stays within 20 lines and no
    entry is cut in half. If more differences exist, a final line states how
    many were omitted.

    Args:
        expected: Expected code
        actual: Actual code
        context_lines: Accepted for interface compatibility; this
            implementation does not use it.

    Returns:
        Formatted diff string, or an empty string when there are no
        differences
    """
    # 6 entries * 3 lines = 18 lines, the largest whole number of entries
    # that fits in the 20-line budget.
    max_entries = 6

    expected_lines = expected.split("\n")
    actual_lines = actual.split("\n")
    max_lines = max(len(expected_lines), len(actual_lines))

    entries = []
    for i in range(max_lines):
        exp_line = expected_lines[i] if i < len(expected_lines) else ""
        act_line = actual_lines[i] if i < len(actual_lines) else ""
        if exp_line != act_line:
            entries.append((i + 1, exp_line, act_line))

    diff_lines = []
    for line_no, exp_line, act_line in entries[:max_entries]:
        diff_lines.append(f"Line {line_no}:")
        diff_lines.append(f" Expected: {exp_line}")
        diff_lines.append(f" Actual: {act_line}")

    hidden = len(entries) - max_entries
    if hidden > 0:
        # Make the truncation visible instead of silently dropping entries.
        diff_lines.append(f"... ({hidden} more differing line(s) not shown)")

    return "\n".join(diff_lines)
| 175 | + |
0 commit comments