
Commit c8baa31

pixelead0 committed
Refactor link checking script to enhance URL resolution and result handling
1 parent c920ced

File tree

1 file changed: +114, -118 lines


scripts/check_links.py

Lines changed: 114 additions & 118 deletions
@@ -12,92 +12,79 @@
 import requests


-def find_internal_links(content):
-    """Find all internal links in markdown and HTML content."""
-    links = []
+def is_external_link(url):
+    """Check if a URL is external (http, https, mailto, tel)."""
+    return url.startswith(("http://", "https://", "mailto:", "tel:"))

-    # Markdown link pattern: [text](url)
-    md_pattern = r"\[([^\]]+)\]\(([^)]+)\)"

-    # HTML link pattern: <a href="url">text</a> or <a href='url'>text</a>
-    html_pattern = r'<a\s+href=["\']([^"\']+)["\'][^>]*>([^<]+)</a>'
+def convert_md_to_html(url):
+    """Convert .md URLs to .html URLs."""
+    return url.replace(".md", ".html") if ".md" in url else url

-    # Find markdown links
-    for match in re.finditer(md_pattern, content):
-        text = match.group(1)
-        url = match.group(2)

-        # Skip external links
-        if url.startswith(("http://", "https://", "mailto:", "tel:")):
-            continue
+def find_internal_links(content):
+    """Find all internal links in markdown and HTML content."""
+    links = []

-        links.append((text, url, "markdown", match.start()))
+    # Common patterns for both markdown and HTML links
+    patterns = [
+        (r"\[([^\]]+)\]\(([^)]+)\)", "markdown"),  # [text](url)
+        (r'<a\s+href=["\']([^"\']+)["\'][^>]*>([^<]+)</a>', "html"),
+    ]

-    # Find HTML links
-    for match in re.finditer(html_pattern, content):
-        url = match.group(1)
-        text = match.group(2).strip()
+    for pattern, link_type in patterns:
+        for match in re.finditer(pattern, content):
+            if link_type == "markdown":
+                text, url = match.group(1), match.group(2)
+            else:  # html
+                url, text = match.group(1), match.group(2).strip()

-        # Skip external links
-        if url.startswith(("http://", "https://", "mailto:", "tel:")):
-            continue
+            # Skip external links
+            if is_external_link(url):
+                continue

-        links.append((text, url, "html", match.start()))
+            links.append((text, url, link_type, match.start()))

     return links


-def resolve_link_url(base_url, md_file, link_url):
-    """Resolve the real URL as a browser would from the markdown file."""
-    # If link is absolute (starts with /), join with base_url
+def resolve_relative_url(base_url, current_file, link_url):
+    """Resolve a relative URL from the current file's directory."""
     if link_url.startswith("/"):
         return urljoin(base_url, link_url)
-    # If link is relative, join with the file's directory path
+
+    # Get current file's directory
+    current_dir = str(Path(current_file).parent)
+    if current_dir != ".":
+        resolved_path = str(Path(current_dir) / link_url)
     else:
-        # Get the directory of the markdown file relative to docs/
-        md_dir = Path(md_file).parent
-        # Build the relative path as it would be in the site
-        rel_path = (md_dir / link_url).as_posix()
-        # Remove any leading './' for clean URLs
-        if rel_path.startswith("./"):
-            rel_path = rel_path[2:]
-        return urljoin(base_url + "/", rel_path)
+        resolved_path = link_url
+
+    # Ensure path starts with /
+    if not resolved_path.startswith("/"):
+        resolved_path = "/" + resolved_path
+
+    return urljoin(base_url, resolved_path)
+
+
+def build_full_url(base_url, link_url, current_file):
+    """Build the full URL for checking or display."""
+    if link_url.startswith("#"):
+        # Anchor link - resolve from current page
+        file_path = current_file.replace(".md", ".html")
+        if not file_path.startswith("/"):
+            file_path = "/" + file_path
+        return urljoin(base_url, file_path + link_url)
+    else:
+        # Regular link - convert .md to .html and resolve
+        converted_url = convert_md_to_html(link_url)
+        return resolve_relative_url(base_url, current_file, converted_url)


 def check_link(base_url, link_url, current_file):
     """Check if a link returns 200 or 404."""
     try:
-        # Handle anchor links - they should resolve from current page
-        if link_url.startswith("#"):
-            # Build URL from current file path, converting .md to .html
-            file_path = current_file.replace(".md", ".html")
-            if not file_path.startswith("/"):
-                file_path = "/" + file_path
-            full_url = urljoin(base_url, file_path + link_url)
-        else:
-            # Convert .md URLs to .html URLs for checking
-            check_url = link_url
-            if ".md" in check_url:
-                check_url = check_url.replace(".md", ".html")
-            # For relative links, resolve from current file's directory
-            if not check_url.startswith("/"):
-                # Get current file's directory
-                current_dir = str(Path(current_file).parent)
-                if current_dir != ".":
-                    # Resolve relative to current directory
-                    resolved_path = str(Path(current_dir) / check_url)
-                else:
-                    resolved_path = check_url
-
-                # Convert to URL format
-                if not resolved_path.startswith("/"):
-                    resolved_path = "/" + resolved_path
-                full_url = urljoin(base_url, resolved_path)
-            else:
-                # Absolute path from site root
-                full_url = urljoin(base_url, check_url)
-
-        # Make request
+        full_url = build_full_url(base_url, link_url, current_file)
         response = requests.get(full_url, timeout=5)

         if response.status_code == 200:
@@ -111,7 +98,62 @@ def check_link(base_url, link_url, current_file):
         return False, f"Error: {e}"


+def create_link_result(
+    md_file, docs_dir, text, url, link_type, line_start, content, status
+):
+    """Create a standardized link result dictionary."""
+    current_file = str(md_file.relative_to(docs_dir))
+    full_url = build_full_url("http://127.0.0.1:8000", url, current_file)
+
+    return {
+        "file": current_file,
+        "text": text,
+        "url": url,
+        "full_url": full_url,
+        "status": status,
+        "line": content[:line_start].count("\n") + 1,
+        "link_type": link_type,
+    }
+
+
+def print_broken_links(broken_links):
+    """Print broken links to console."""
+    if not broken_links:
+        return
+
+    print("\n🔴 BROKEN LINKS (showing first 10):")
+    print("-" * 50)
+    for link in broken_links[:10]:
+        print("📄 {}:{}".format(link["file"], link["line"]))
+        print(f" Text: {link['text']}")
+        print(f" URL: {link['url']}")
+        print(f" Full URL: {link['full_url']}")
+        print(f" Status: {link['status']}")
+        print()
+
+
+def save_results(broken_links, working_links, docs_dir, base_url):
+    """Save results to JSON file."""
+    results = {
+        "summary": {
+            "total_files_scanned": len(list(docs_dir.rglob("*.md"))),
+            "working_links": len(working_links),
+            "broken_links": len(broken_links),
+            "base_url": base_url,
+        },
+        "broken_links": broken_links,
+        "working_links": working_links,
+    }
+
+    output_file = "broken_links.json"
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(results, f, indent=2, ensure_ascii=False)
+
+    print(f"\n📄 Results saved to: {output_file}")
+
+
 def main():
+    """Main function to check all internal links."""
     base_url = "http://127.0.0.1:8000"
     docs_dir = Path("docs")

@@ -134,27 +176,9 @@ def main():
                 base_url, url, str(md_file.relative_to(docs_dir))
             )

-            # Calculate full URL for display
-            if not url.startswith("#"):
-                # Convert .md URLs to .html URLs for display
-                display_url = url
-                if ".md" in display_url:
-                    display_url = display_url.replace(".md", ".html")
-                full_url = urljoin(base_url, display_url)
-            else:
-                file_path = str(md_file.relative_to(docs_dir))
-                file_path = file_path.replace(".md", ".html")
-                full_url = urljoin(base_url, file_path + url)
-
-            result = {
-                "file": str(md_file.relative_to(docs_dir)),
-                "text": text,
-                "url": url,
-                "full_url": full_url,
-                "status": status,
-                "line": content[:line_start].count("\n") + 1,
-                "link_type": link_type,
-            }
+            result = create_link_result(
+                md_file, docs_dir, text, url, link_type, line_start, content, status
+            )

             if is_working:
                 working_links.append(result)
@@ -168,38 +192,10 @@ def main():
     print(f"✅ Working links: {len(working_links)}")
     print(f"❌ Broken links: {len(broken_links)}")

-    # Save results to JSON
-    results = {
-        "summary": {
-            "total_files_scanned": len(list(docs_dir.rglob("*.md"))),
-            "working_links": len(working_links),
-            "broken_links": len(broken_links),
-            "base_url": base_url,
-        },
-        "broken_links": broken_links,
-        "working_links": working_links,
-    }
-
-    # Save to JSON file
-    output_file = "broken_links.json"
-    with open(output_file, "w", encoding="utf-8") as f:
-        json.dump(results, f, indent=2, ensure_ascii=False)
-
-    print(f"\n📄 Results saved to: {output_file}")
+    # Save results and print broken links
+    save_results(broken_links, working_links, docs_dir, base_url)
+    print_broken_links(broken_links)

-    # Show some broken links in console
-    if broken_links:
-        print("\n🔴 BROKEN LINKS (showing first 10):")
-        print("-" * 50)
-        for link in broken_links[:10]:
-            print("📄 {}:{}".format(link["file"], link["line"]))
-            print(f" Text: {link['text']}")
-            print(f" URL: {link['url']}")
-            print(f" Full URL: {link['full_url']}")
-            print(f" Status: {link['status']}")
-            print()
-
-    # Return number of broken links
     return len(broken_links)
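
For reference, a minimal usage sketch of the refactored link-finding and URL-building helpers. It assumes the script can be imported as a module named `check_links` (e.g. when run from the scripts/ directory); the sample content, base URL, and file path are invented for illustration, not taken from the repository:

```python
# Hypothetical usage of the helpers from scripts/check_links.py.
# Assumes the script is importable as `check_links`; the markdown snippet
# and the current-file path below are invented for illustration.
from check_links import build_full_url, find_internal_links

content = (
    "See the [setup guide](../guide/setup.md), "
    '<a href="/api/index.html">API docs</a>, and the [intro](#intro).'
)

base_url = "http://127.0.0.1:8000"
current_file = "reference/links.md"  # path relative to docs/, as main() passes it

for text, url, link_type, start in find_internal_links(content):
    # .md targets are converted to .html and resolved against current_file's
    # directory; "#" anchors resolve against the current page's .html URL.
    full_url = build_full_url(base_url, url, current_file)
    print(f"{link_type:8} {text!r} -> {full_url}")
```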
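
Similarly, a sketch of the new result-handling path (same import assumption as above). The record below only mirrors the shape that `create_link_result` produces; its file, URL, and status values are invented:

```python
# Hypothetical exercise of the result-handling helpers; the broken-link
# record mirrors create_link_result's shape with invented values.
from pathlib import Path

from check_links import print_broken_links, save_results

broken_links = [
    {
        "file": "reference/links.md",  # invented source file
        "text": "setup guide",
        "url": "../guide/setup.md",
        "full_url": "http://127.0.0.1:8000/guide/setup.html",
        "status": "404",
        "line": 12,
        "link_type": "markdown",
    }
]
working_links = []

# save_results writes broken_links.json and prints its location;
# print_broken_links shows at most the first 10 broken entries.
# Path("docs") matches the docs directory that main() scans.
save_results(broken_links, working_links, Path("docs"), "http://127.0.0.1:8000")
print_broken_links(broken_links)
```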