1212import requests
1313
1414
def is_external_link(url):
    """Return True when *url* points outside the site (http, https, mailto, tel)."""
    external_prefixes = ("http://", "https://", "mailto:", "tel:")
    return any(url.startswith(prefix) for prefix in external_prefixes)
1818
def convert_md_to_html(url):
    """Convert a Markdown source URL to its rendered HTML URL.

    Only a genuine ``.md`` suffix on the path portion is rewritten, so URLs
    such as ``notes.mdx`` or paths that merely contain ``.md`` somewhere in
    a directory name are left untouched (a blanket ``.replace(".md",
    ".html")`` would corrupt them).  Any ``#fragment`` or ``?query`` suffix
    is preserved.

    Args:
        url: The raw link target as written in the source file.

    Returns:
        The URL with a trailing ``.md`` swapped for ``.html``, or the
        original URL unchanged.
    """
    # Split off a fragment (preferred) or query so we only test the path.
    path, sep, rest = url.partition("#")
    if not sep:
        path, sep, rest = url.partition("?")
    if path.endswith(".md"):
        path = path[: -len(".md")] + ".html"
    return path + sep + rest
2423
def find_internal_links(content):
    """Find all internal links in markdown and HTML content."""
    # (compiled pattern, link type) pairs: [text](url) and <a href="url">text</a>.
    link_specs = (
        (re.compile(r"\[([^\]]+)\]\(([^)]+)\)"), "markdown"),
        (re.compile(r'<a\s+href=["\']([^"\']+)["\'][^>]*>([^<]+)</a>'), "html"),
    )

    found = []
    for regex, kind in link_specs:
        for m in regex.finditer(content):
            if kind == "markdown":
                text, url = m.group(1), m.group(2)
            else:
                url, text = m.group(1), m.group(2).strip()

            # Keep internal links only; drop http(s)/mailto/tel targets.
            if url.startswith(("http://", "https://", "mailto:", "tel:")):
                continue

            found.append((text, url, kind, m.start()))

    return found
4849
4950
def resolve_relative_url(base_url, current_file, link_url):
    """Resolve a relative URL from the current file's directory.

    Args:
        base_url: Site root, e.g. "http://127.0.0.1:8000".
        current_file: Path of the source file relative to the docs root
            (assumed to use "/" separators — TODO confirm against callers).
        link_url: Link target; either site-absolute ("/...") or relative
            to current_file's directory.

    Returns:
        The absolute URL for the link.
    """
    # Site-absolute links resolve directly against the base URL.
    if link_url.startswith("/"):
        return urljoin(base_url, link_url)

    # Resolve relative to the directory containing the current file.
    # as_posix() keeps "/" separators in the joined path; plain str(Path(...))
    # would emit "\" on Windows and produce broken URLs.
    current_dir = Path(current_file).parent
    if str(current_dir) != ".":
        resolved_path = (current_dir / link_url).as_posix()
    else:
        resolved_path = link_url

    # URL paths are always rooted at the site base.
    if not resolved_path.startswith("/"):
        resolved_path = "/" + resolved_path

    return urljoin(base_url, resolved_path)
68+
69+
def build_full_url(base_url, link_url, current_file):
    """Build the full URL for checking or display."""
    if not link_url.startswith("#"):
        # Regular link: rewrite .md -> .html, then resolve against the file.
        return resolve_relative_url(
            base_url, current_file, convert_md_to_html(link_url)
        )

    # Pure anchor: it targets a section of the current page's rendered HTML.
    page_path = current_file.replace(".md", ".html")
    if not page_path.startswith("/"):
        page_path = "/" + page_path
    return urljoin(base_url, page_path + link_url)
6582
6683
6784def check_link (base_url , link_url , current_file ):
6885 """Check if a link returns 200 or 404."""
6986 try :
70- # Handle anchor links - they should resolve from current page
71- if link_url .startswith ("#" ):
72- # Build URL from current file path, converting .md to .html
73- file_path = current_file .replace (".md" , ".html" )
74- if not file_path .startswith ("/" ):
75- file_path = "/" + file_path
76- full_url = urljoin (base_url , file_path + link_url )
77- else :
78- # Convert .md URLs to .html URLs for checking
79- check_url = link_url
80- if ".md" in check_url :
81- check_url = check_url .replace (".md" , ".html" )
82- # For relative links, resolve from current file's directory
83- if not check_url .startswith ("/" ):
84- # Get current file's directory
85- current_dir = str (Path (current_file ).parent )
86- if current_dir != "." :
87- # Resolve relative to current directory
88- resolved_path = str (Path (current_dir ) / check_url )
89- else :
90- resolved_path = check_url
91-
92- # Convert to URL format
93- if not resolved_path .startswith ("/" ):
94- resolved_path = "/" + resolved_path
95- full_url = urljoin (base_url , resolved_path )
96- else :
97- # Absolute path from site root
98- full_url = urljoin (base_url , check_url )
99-
100- # Make request
87+ full_url = build_full_url (base_url , link_url , current_file )
10188 response = requests .get (full_url , timeout = 5 )
10289
10390 if response .status_code == 200 :
@@ -111,7 +98,62 @@ def check_link(base_url, link_url, current_file):
11198 return False , f"Error: { e } "
11299
113100
def create_link_result(
    md_file, docs_dir, text, url, link_type, line_start, content, status,
    base_url="http://127.0.0.1:8000",
):
    """Create a standardized link result dictionary.

    Args:
        md_file: Path to the markdown file containing the link.
        docs_dir: Root docs directory (md_file must live under it).
        text: The link's display text.
        url: The raw link target as written in the source.
        link_type: "markdown" or "html".
        line_start: Character offset of the link within ``content``.
        content: Full text of the markdown file (used to derive the line number).
        status: Status string from the link check.
        base_url: Site root used to build the displayed full URL.  Previously
            hard-coded; the default preserves the old behavior while letting
            callers pass the same ``base_url`` used elsewhere.

    Returns:
        Dict describing the link and its check result.
    """
    current_file = str(md_file.relative_to(docs_dir))

    return {
        "file": current_file,
        "text": text,
        "url": url,
        "full_url": build_full_url(base_url, url, current_file),
        "status": status,
        # 1-based line number: count newlines before the match start.
        "line": content[:line_start].count("\n") + 1,
        "link_type": link_type,
    }
117+
118+
def print_broken_links(broken_links):
    """Print up to ten broken links to the console; no-op when none exist."""
    if broken_links:
        print("\n🔴 BROKEN LINKS (showing first 10):")
        print("-" * 50)
        for entry in broken_links[:10]:
            print("📄 {}:{}".format(entry["file"], entry["line"]))
            print(f"   Text: {entry['text']}")
            print(f"   URL: {entry['url']}")
            print(f"   Full URL: {entry['full_url']}")
            print(f"   Status: {entry['status']}")
            print()
133+
134+
def save_results(broken_links, working_links, docs_dir, base_url):
    """Save the link-check results to broken_links.json in the working dir."""
    output_file = "broken_links.json"

    summary = {
        "total_files_scanned": len(list(docs_dir.rglob("*.md"))),
        "working_links": len(working_links),
        "broken_links": len(broken_links),
        "base_url": base_url,
    }
    payload = {
        "summary": summary,
        "broken_links": broken_links,
        "working_links": working_links,
    }

    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)

    print(f"\n📄 Results saved to: {output_file}")
153+
154+
114155def main ():
156+ """Main function to check all internal links."""
115157 base_url = "http://127.0.0.1:8000"
116158 docs_dir = Path ("docs" )
117159
@@ -134,27 +176,9 @@ def main():
134176 base_url , url , str (md_file .relative_to (docs_dir ))
135177 )
136178
137- # Calculate full URL for display
138- if not url .startswith ("#" ):
139- # Convert .md URLs to .html URLs for display
140- display_url = url
141- if ".md" in display_url :
142- display_url = display_url .replace (".md" , ".html" )
143- full_url = urljoin (base_url , display_url )
144- else :
145- file_path = str (md_file .relative_to (docs_dir ))
146- file_path = file_path .replace (".md" , ".html" )
147- full_url = urljoin (base_url , file_path + url )
148-
149- result = {
150- "file" : str (md_file .relative_to (docs_dir )),
151- "text" : text ,
152- "url" : url ,
153- "full_url" : full_url ,
154- "status" : status ,
155- "line" : content [:line_start ].count ("\n " ) + 1 ,
156- "link_type" : link_type ,
157- }
179+ result = create_link_result (
180+ md_file , docs_dir , text , url , link_type , line_start , content , status
181+ )
158182
159183 if is_working :
160184 working_links .append (result )
@@ -168,38 +192,10 @@ def main():
168192 print (f"✅ Working links: { len (working_links )} " )
169193 print (f"❌ Broken links: { len (broken_links )} " )
170194
171- # Save results to JSON
172- results = {
173- "summary" : {
174- "total_files_scanned" : len (list (docs_dir .rglob ("*.md" ))),
175- "working_links" : len (working_links ),
176- "broken_links" : len (broken_links ),
177- "base_url" : base_url ,
178- },
179- "broken_links" : broken_links ,
180- "working_links" : working_links ,
181- }
182-
183- # Save to JSON file
184- output_file = "broken_links.json"
185- with open (output_file , "w" , encoding = "utf-8" ) as f :
186- json .dump (results , f , indent = 2 , ensure_ascii = False )
187-
188- print (f"\n 📄 Results saved to: { output_file } " )
195+ # Save results and print broken links
196+ save_results (broken_links , working_links , docs_dir , base_url )
197+ print_broken_links (broken_links )
189198
190- # Show some broken links in console
191- if broken_links :
192- print ("\n 🔴 BROKEN LINKS (showing first 10):" )
193- print ("-" * 50 )
194- for link in broken_links [:10 ]:
195- print ("📄 {}:{}" .format (link ["file" ], link ["line" ]))
196- print (f" Text: { link ['text' ]} " )
197- print (f" URL: { link ['url' ]} " )
198- print (f" Full URL: { link ['full_url' ]} " )
199- print (f" Status: { link ['status' ]} " )
200- print ()
201-
202- # Return number of broken links
203199 return len (broken_links )
204200
205201
0 commit comments