|
| 1 | +import re |
| 2 | +import sys |
| 3 | + |
def slugify(text):
    """Convert a Markdown heading title into a GitHub-style anchor slug.

    The title is lowercased, every character that is not a word character,
    whitespace or a hyphen is dropped, and each remaining whitespace
    character becomes one hyphen.

    Whitespace runs are deliberately NOT collapsed: GitHub-style anchors map
    every space to its own hyphen, so "Foo & Bar" yields "foo--bar" (the
    removed "&" leaves two adjacent spaces behind).

    Args:
        text: The heading text, without the leading ``#`` markers.

    Returns:
        The anchor string, without a leading ``#``.
    """
    text = text.lower()
    # Drop punctuation but keep word characters, whitespace and hyphens.
    text = re.sub(r'[^\w\s-]', '', text)
    # One hyphen per whitespace character (no run-collapsing).
    return re.sub(r'\s', '-', text)
| 9 | + |
def _unique_anchor(base, seen):
    """Return *base*, suffixed with ``-N`` if that anchor was already issued.

    ``seen`` maps every anchor issued so far to the number of suffixed
    variants generated from it; it is updated in place.
    """
    anchor = base
    while anchor in seen:
        seen[base] += 1
        anchor = f"{base}-{seen[base]}"
    seen[anchor] = 0
    return anchor


def check_toc(filepath):
    """Find internal anchor links in a Markdown file that match no header.

    The file is scanned line by line. ATX-style headers (``# Title``) are
    converted to anchors with ``slugify``; repeated headings receive the
    ``-1``, ``-2``, ... suffixes that GitHub appends. Every inline link of
    the form ``[text](#anchor)`` is collected and checked against the set of
    header anchors. Content inside fenced code blocks (``` or ~~~) is
    ignored, since it is rendered as neither headers nor links.

    Args:
        filepath: Path of the Markdown file to check (read as UTF-8).

    Returns:
        A list of ``{'text': ..., 'ref': ...}`` dicts, in document order, one
        for each link whose target anchor does not exist. ``ref`` has no
        leading ``#``.

    Raises:
        OSError: If the file cannot be opened.
    """
    header_re = re.compile(r'^(#+)\s+(.+)$')
    link_re = re.compile(r'\[([^\]]+)\]\((#[^)]+)\)')
    fence_re = re.compile(r'^(`{3,}|~{3,})')

    anchors = {}        # anchor -> count of suffixed duplicates issued
    toc_links = []
    open_fence = None   # marker of the code fence we are inside, if any

    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()

            fence_match = fence_re.match(line)
            if fence_match:
                marker = fence_match.group(1)
                if open_fence is None:
                    open_fence = marker
                    continue
                # A closing fence uses the same character, is at least as
                # long as the opener, and carries no info string.
                if (marker[0] == open_fence[0]
                        and len(marker) >= len(open_fence)
                        and line == marker):
                    open_fence = None
                    continue
            if open_fence is not None:
                # Inside a code block: '# comment' lines are not headers.
                continue

            header_match = header_re.match(line)
            if header_match:
                title = header_match.group(2).strip()
                _unique_anchor(slugify(title), anchors)

            # A line may hold several links, so collect all of them.
            for text, link in link_re.findall(line):
                # Store the reference without its leading '#'.
                toc_links.append({'text': text, 'ref': link[1:]})

    # Links are verified only after the whole file is read, because a TOC
    # normally precedes the headers it points to.
    return [link for link in toc_links if link['ref'] not in anchors]
| 51 | + |
if __name__ == "__main__":
    # Command-line entry point: the first argument is the Markdown file path.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python3 check_toc.py <path_to_markdown>")
        sys.exit(1)

    unresolved = check_toc(cli_args[0])

    if not unresolved:
        print("No broken links found.")
    else:
        print(f"Found {len(unresolved)} broken links:")
        # One report line per link whose target header does not exist.
        for entry in unresolved:
            text, ref = entry['text'], entry['ref']
            print(f"- [{text}](#{ref}) -> Header not found (expected #{ref})")
0 commit comments