1212import requests
1313
1414
def is_external_link(url):
    """Return True when *url* points outside the site (http, https, mailto, tel)."""
    external_prefixes = ("http://", "https://", "mailto:", "tel:")
    return any(url.startswith(prefix) for prefix in external_prefixes)
1818
def convert_md_to_html(url):
    """Convert a Markdown source URL to its rendered HTML URL.

    Only a genuine ``.md`` suffix on the path portion is rewritten, so URLs
    such as ``notes.mdx`` or paths that merely contain ``.md`` somewhere in
    a directory name are left untouched (a blanket ``.replace(".md",
    ".html")`` would corrupt them).  Any ``#fragment`` or ``?query`` suffix
    is preserved.

    Args:
        url: The raw link target as written in the source file.

    Returns:
        The URL with a trailing ``.md`` swapped for ``.html``, or the
        original URL unchanged.
    """
    # Split off a fragment (preferred) or query so we only test the path.
    path, sep, rest = url.partition("#")
    if not sep:
        path, sep, rest = url.partition("?")
    if path.endswith(".md"):
        path = path[: -len(".md")] + ".html"
    return path + sep + rest
2423
def find_internal_links(content):
    """Find all internal links in markdown and HTML content."""
    # (compiled pattern, link type) pairs: [text](url) and <a href="url">text</a>.
    link_specs = (
        (re.compile(r"\[([^\]]+)\]\(([^)]+)\)"), "markdown"),
        (re.compile(r'<a\s+href=["\']([^"\']+)["\'][^>]*>([^<]+)</a>'), "html"),
    )

    found = []
    for regex, kind in link_specs:
        for m in regex.finditer(content):
            if kind == "markdown":
                text, url = m.group(1), m.group(2)
            else:
                url, text = m.group(1), m.group(2).strip()

            # Keep internal links only; drop http(s)/mailto/tel targets.
            if url.startswith(("http://", "https://", "mailto:", "tel:")):
                continue

            found.append((text, url, kind, m.start()))

    return found
4849
4950
def resolve_relative_url(base_url, current_file, link_url):
    """Resolve a relative URL from the current file's directory.

    Args:
        base_url: Site root, e.g. "http://127.0.0.1:8000".
        current_file: Path of the source file relative to the docs root
            (assumed to use "/" separators — TODO confirm against callers).
        link_url: Link target; either site-absolute ("/...") or relative
            to current_file's directory.

    Returns:
        The absolute URL for the link.
    """
    # Site-absolute links resolve directly against the base URL.
    if link_url.startswith("/"):
        return urljoin(base_url, link_url)

    # Resolve relative to the directory containing the current file.
    # as_posix() keeps "/" separators in the joined path; plain str(Path(...))
    # would emit "\" on Windows and produce broken URLs.
    current_dir = Path(current_file).parent
    if str(current_dir) != ".":
        resolved_path = (current_dir / link_url).as_posix()
    else:
        resolved_path = link_url

    # URL paths are always rooted at the site base.
    if not resolved_path.startswith("/"):
        resolved_path = "/" + resolved_path

    return urljoin(base_url, resolved_path)
68+
69+
def build_full_url(base_url, link_url, current_file):
    """Build the full URL for checking or display."""
    if not link_url.startswith("#"):
        # Regular link: rewrite .md -> .html, then resolve against the file.
        return resolve_relative_url(
            base_url, current_file, convert_md_to_html(link_url)
        )

    # Pure anchor: it targets a section of the current page's rendered HTML.
    page_path = current_file.replace(".md", ".html")
    if not page_path.startswith("/"):
        page_path = "/" + page_path
    return urljoin(base_url, page_path + link_url)
6582
6683
6784def check_link (base_url , link_url , current_file ):
6885 """Check if a link returns 200 or 404."""
6986 try :
70- # Handle anchor links - they should resolve from current page
71- if link_url .startswith ("#" ):
72- # Build URL from current file path, converting .md to .html
73- file_path = current_file .replace (".md" , ".html" )
74- if not file_path .startswith ("/" ):
75- file_path = "/" + file_path
76- full_url = urljoin (base_url , file_path + link_url )
77- else :
78- # Convert .md URLs to .html URLs for checking
79- check_url = link_url
80- if ".md" in check_url :
81- check_url = check_url .replace (".md" , ".html" )
82- # For relative links, resolve from current file's directory
83- if not check_url .startswith ("/" ):
84- # Get current file's directory
85- current_dir = str (Path (current_file ).parent )
86- if current_dir != "." :
87- # Resolve relative to current directory
88- resolved_path = str (Path (current_dir ) / check_url )
89- else :
90- resolved_path = check_url
91-
92- # Convert to URL format
93- if not resolved_path .startswith ("/" ):
94- resolved_path = "/" + resolved_path
95- full_url = urljoin (base_url , resolved_path )
96- else :
97- # Absolute path from site root
98- full_url = urljoin (base_url , check_url )
99-
100- # Make request
87+ full_url = build_full_url (base_url , link_url , current_file )
10188 response = requests .get (full_url , timeout = 5 )
10289
10390 if response .status_code == 200 :
@@ -111,7 +98,62 @@ def check_link(base_url, link_url, current_file):
11198 return False , f"Error: { e } "
11299
113100
def create_link_result(
    md_file, docs_dir, text, url, link_type, line_start, content, status,
    base_url="http://127.0.0.1:8000",
):
    """Create a standardized link result dictionary.

    Args:
        md_file: Path to the markdown file containing the link.
        docs_dir: Root docs directory (md_file must live under it).
        text: The link's display text.
        url: The raw link target as written in the source.
        link_type: "markdown" or "html".
        line_start: Character offset of the link within ``content``.
        content: Full text of the markdown file (used to derive the line number).
        status: Status string from the link check.
        base_url: Site root used to build the displayed full URL.  Previously
            hard-coded; the default preserves the old behavior while letting
            callers pass the same ``base_url`` used elsewhere.

    Returns:
        Dict describing the link and its check result.
    """
    current_file = str(md_file.relative_to(docs_dir))

    return {
        "file": current_file,
        "text": text,
        "url": url,
        "full_url": build_full_url(base_url, url, current_file),
        "status": status,
        # 1-based line number: count newlines before the match start.
        "line": content[:line_start].count("\n") + 1,
        "link_type": link_type,
    }
117+
118+
def print_broken_links(broken_links):
    """Print up to ten broken links to the console; no-op when none exist."""
    if broken_links:
        print("\n🔴 BROKEN LINKS (showing first 10):")
        print("-" * 50)
        for entry in broken_links[:10]:
            print("📄 {}:{}".format(entry["file"], entry["line"]))
            print(f"   Text: {entry['text']}")
            print(f"   URL: {entry['url']}")
            print(f"   Full URL: {entry['full_url']}")
            print(f"   Status: {entry['status']}")
            print()
133+
134+
def save_results(broken_links, working_links, docs_dir, base_url):
    """Save the link-check results to broken_links.json in the working dir."""
    output_file = "broken_links.json"

    summary = {
        "total_files_scanned": len(list(docs_dir.rglob("*.md"))),
        "working_links": len(working_links),
        "broken_links": len(broken_links),
        "base_url": base_url,
    }
    payload = {
        "summary": summary,
        "broken_links": broken_links,
        "working_links": working_links,
    }

    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)

    print(f"\n📄 Results saved to: {output_file}")
153+
154+
114155def main ():
156+ """Main function to check all internal links."""
115157 base_url = "http://127.0.0.1:8000"
116158 docs_dir = Path ("docs" )
117159
@@ -134,27 +176,9 @@ def main():
134176 base_url , url , str (md_file .relative_to (docs_dir ))
135177 )
136178
137- # Calculate full URL for display
138- if not url .startswith ("#" ):
139- # Convert .md URLs to .html URLs for display
140- display_url = url
141- if ".md" in display_url :
142- display_url = display_url .replace (".md" , ".html" )
143- full_url = urljoin (base_url , display_url )
144- else :
145- file_path = str (md_file .relative_to (docs_dir ))
146- file_path = file_path .replace (".md" , ".html" )
147- full_url = urljoin (base_url , file_path + url )
148-
149- result = {
150- "file" : str (md_file .relative_to (docs_dir )),
151- "text" : text ,
152- "url" : url ,
153- "full_url" : full_url ,
154- "status" : status ,
155- "line" : content [:line_start ].count ("\n " ) + 1 ,
156- "link_type" : link_type ,
157- }
179+ result = create_link_result (
180+ md_file , docs_dir , text , url , link_type , line_start , content , status
181+ )
158182
159183 if is_working :
160184 working_links .append (result )
@@ -168,38 +192,10 @@ def main():
168192 print (f"✅ Working links: { len (working_links )} " )
169193 print (f"❌ Broken links: { len (broken_links )} " )
170194
171- # Save results to JSON
172- results = {
173- "summary" : {
174- "total_files_scanned" : len (list (docs_dir .rglob ("*.md" ))),
175- "working_links" : len (working_links ),
176- "broken_links" : len (broken_links ),
177- "base_url" : base_url ,
178- },
179- "broken_links" : broken_links ,
180- "working_links" : working_links ,
181- }
182-
183- # Save to JSON file
184- output_file = "broken_links.json"
185- with open (output_file , "w" , encoding = "utf-8" ) as f :
186- json .dump (results , f , indent = 2 , ensure_ascii = False )
187-
188- print (f"\n 📄 Results saved to: { output_file } " )
195+ # Save results and print broken links
196+ save_results (broken_links , working_links , docs_dir , base_url )
197+ print_broken_links (broken_links )
189198
190- # Show some broken links in console
191- if broken_links :
192- print ("\n 🔴 BROKEN LINKS (showing first 10):" )
193- print ("-" * 50 )
194- for link in broken_links [:10 ]:
195- print ("📄 {}:{}" .format (link ["file" ], link ["line" ]))
196- print (f" Text: { link ['text' ]} " )
197- print (f" URL: { link ['url' ]} " )
198- print (f" Full URL: { link ['full_url' ]} " )
199- print (f" Status: { link ['status' ]} " )
200- print ()
201-
202- # Return number of broken links
203199 return len (broken_links )
204200
205201
0 commit comments