Skip to content

Commit 34447b3

Browse files
committed
pySCG: add test to validate inlined code matches actual files
Add critical test to ensure README inlined code blocks match actual Python files (after stripping SPDX headers and test markers).

This test catches a common documentation error where:
- Code is updated but README isn't updated
- SPDX headers are incorrectly included in inlined code
- Test markers (EXPECTED_TIMEOUT/EXPECTED_FAILURE) appear in docs

Currently identifies 18 files with mismatches (pre-existing issues). These should be fixed in follow-up PRs.

Signed-off-by: tommcd <[email protected]>
1 parent a820d0d commit 34447b3

File tree

2 files changed

+214
-0
lines changed

2 files changed

+214
-0
lines changed

docs/Secure-Coding-Guide-for-Python/tests/test_markdown_validation.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,3 +213,42 @@ def test_readme_follows_template_order(readme_file: Path):
213213
f"{readme_file}:\n Section order issues:\n"
214214
+ "\n".join(f" - {issue}" for issue in order_issues)
215215
)
216+
217+
218+
@pytest.mark.markdown
def test_readme_inlined_code_matches_files(readme_file: Path):
    """
    Validate that inlined code in README.md matches actual Python files.

    README files contain inlined code blocks that reference Python files like:
        *[noncompliant01.py](noncompliant01.py):*
        ```python
        ... code ...
        ```

    This test ensures the inlined code matches the actual file content
    (after stripping SPDX headers and test markers).

    Args:
        readme_file: Path to README.md file to validate

    Raises:
        AssertionError: If a referenced file is missing or its content
            differs from the code inlined in the README.
    """
    from tests.utils.code_inline_validator import compare_inlined_code, format_diff

    mismatches = compare_inlined_code(readme_file)
    if not mismatches:
        return

    error_messages = []
    for filename, issue_type, inlined, actual in mismatches:
        if issue_type == "missing_file":
            # For a missing file the third tuple slot holds the explanation
            # text rather than inlined code.
            error_messages.append(f" - {filename}: {inlined}")
        elif issue_type == "content_mismatch":
            error_messages.append(
                f" - {filename}: Inlined code doesn't match file content"
            )
            # Expected = actual file content, Actual = what's inlined in README
            error_messages.append(f" {format_diff(actual, inlined)}")
        else:
            # Never fail with an empty report if a new issue type is introduced.
            error_messages.append(
                f" - {filename}: Unrecognized issue type {issue_type!r}"
            )

    # Raise explicitly instead of `assert False`: assert statements are removed
    # when Python runs with -O, which would make this test pass silently.
    raise AssertionError(
        f"{readme_file}:\n Inlined code mismatches:\n" + "\n".join(error_messages)
    )
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: OpenSSF Best Practices WG
3+
4+
"""
5+
Utility functions for validating inlined code in README files.
6+
7+
This module provides functions to extract code blocks from README.md files
8+
and compare them with the actual Python files they reference.
9+
"""
10+
11+
import re
12+
from pathlib import Path
13+
from typing import Dict, List, Tuple
14+
15+
16+
def strip_spdx_headers(code: str) -> str:
    """
    Remove SPDX copyright headers and test markers from code.

    Removes (ignoring surrounding whitespace on the line):
    - Lines starting with "# SPDX-"
    - Lines starting with "# EXPECTED_TIMEOUT:" or "# EXPECTED_FAILURE:"

    Every other line, including blank lines, is kept unchanged and in order.

    Args:
        code: Python code string

    Returns:
        Code with SPDX headers and test markers removed
    """
    # str.startswith accepts a tuple, so one call per line covers all prefixes.
    skipped_prefixes = ("# SPDX-", "# EXPECTED_TIMEOUT:", "# EXPECTED_FAILURE:")
    return "\n".join(
        line
        for line in code.split("\n")
        if not line.strip().startswith(skipped_prefixes)
    )
45+
46+
47+
def extract_inlined_code_blocks(readme_path: Path) -> Dict[str, str]:
    """
    Extract inlined code blocks from README.md that reference Python files.

    Looks for patterns like:
        *[noncompliant01.py](noncompliant01.py):*
        ```python
        ... code ...
        ```

    The filename is taken from the link text (the part in square brackets),
    not from the link target. If the same filename is referenced more than
    once, the last code block wins. Empty code blocks yield an empty string.

    Args:
        readme_path: Path to README.md file

    Returns:
        Dictionary mapping filename to inlined code content
    """
    content = readme_path.read_text(encoding="utf-8")

    # Pattern to match: *[filename.py](filename.py):* followed by a
    # ```python code block. The closing fence only has to sit at the start of
    # a line (lookbehind for "\n"), so an empty block closes on its own fence
    # instead of the lazy group running on to some later fence in the README.
    pattern = (
        r"\*\[([^\]]+\.py)\]\([^\)]+\):\*\s*"
        r"```python\s*\n"
        r"(.*?)(?<=\n)```"
    )

    inlined_code = {}
    # re.DOTALL lets the code group span multiple lines.
    for match in re.finditer(pattern, content, re.DOTALL):
        code = match.group(2)
        # The newline that precedes the closing fence is not part of the code.
        if code.endswith("\n"):
            code = code[:-1]
        inlined_code[match.group(1)] = code

    return inlined_code
78+
79+
80+
def normalize_code(code: str) -> str:
    """
    Normalize code for comparison.

    - Normalizes line endings (CRLF and bare CR become LF)
    - Strips leading/trailing whitespace of the whole text (this includes
      any indentation on the very first line)
    - Removes trailing whitespace from each line

    Args:
        code: Code string to normalize

    Returns:
        Normalized code string
    """
    # Convert line endings explicitly so that bare "\r" separators are handled
    # too, rather than relying on rstrip() to drop the "\r" of a CRLF pair.
    unified = code.replace("\r\n", "\n").replace("\r", "\n")
    return "\n".join(line.rstrip() for line in unified.strip().split("\n"))
97+
98+
99+
def compare_inlined_code(
    readme_path: Path,
) -> List[Tuple[str, str, str, str]]:
    """
    Compare inlined code in README with actual Python files.

    Referenced files are resolved relative to the README's directory. Both
    the inlined block and the file content (with SPDX headers and test
    markers stripped) are normalized before they are compared.

    Args:
        readme_path: Path to README.md file

    Returns:
        List of tuples (filename, issue_type, inlined, actual) for mismatches.
        Empty list if all code matches.

        - issue_type "missing_file": ``inlined`` holds an explanatory
          message and ``actual`` is an empty string.
        - issue_type "content_mismatch": ``inlined`` is the normalized code
          from the README and ``actual`` is the normalized file content.
    """
    readme_dir = readme_path.parent
    mismatches = []

    for filename, inlined in extract_inlined_code_blocks(readme_path).items():
        py_file = readme_dir / filename

        # is_file() rather than exists(): a directory with that name cannot
        # be read as text and should be reported, not crash the comparison.
        if not py_file.is_file():
            mismatches.append(
                (
                    filename,
                    "missing_file",
                    f"File referenced in README but not found: {filename}",
                    "",
                )
            )
            continue

        # Read actual file and strip SPDX headers
        actual_code_stripped = strip_spdx_headers(py_file.read_text(encoding="utf-8"))

        # Normalize both for comparison
        inlined_normalized = normalize_code(inlined)
        actual_normalized = normalize_code(actual_code_stripped)

        if inlined_normalized != actual_normalized:
            # Store as (filename, issue_type, inlined_code, actual_code)
            mismatches.append(
                (filename, "content_mismatch", inlined_normalized, actual_normalized)
            )

    return mismatches
145+
146+
147+
def format_diff(expected: str, actual: str, context_lines: int = 3) -> str:
    """
    Format a simple diff between expected and actual code.

    Lines are compared position by position; each differing position yields
    a three-line record ("Line N:", the expected line, the actual line). A
    line missing on one side is shown as an empty string. At most six
    records are shown, followed by a one-line note with the number of
    records that were left out.

    Args:
        expected: Expected code
        actual: Actual code
        context_lines: Number of context lines to show. Accepted for
            backward compatibility; currently unused because only the
            differing lines themselves are reported.

    Returns:
        Formatted diff string (empty when the inputs are identical)
    """
    from itertools import zip_longest

    # Six 3-line records plus the note stay within the former 20-line budget
    # while never cutting a record in half.
    max_reported = 6

    line_pairs = zip_longest(expected.split("\n"), actual.split("\n"), fillvalue="")
    records = [
        f"Line {line_no}:\n Expected: {exp_line}\n Actual: {act_line}"
        for line_no, (exp_line, act_line) in enumerate(line_pairs, start=1)
        if exp_line != act_line
    ]

    shown = records[:max_reported]
    hidden = len(records) - len(shown)
    if hidden:
        shown.append(f"... {hidden} more differing line(s) not shown")

    return "\n".join(shown)
175+

0 commit comments

Comments
 (0)