@@ -25,7 +25,7 @@ def __init__(self):
2525 self .in_skip = False
2626 self .skip_tags = {'nav' , 'footer' , 'script' , 'style' , 'noscript' , 'header' }
2727
28- def handle_starttag (self , tag , _attrs ):
28+ def handle_starttag (self , tag , attrs ):
2929 if tag == 'title' :
3030 self .in_title = True
3131 elif tag in self .skip_tags :
@@ -69,7 +69,7 @@ def load_config(repo_root):
6969def extract_page_info (html_path ):
7070 """Extract title and text content from an HTML file."""
7171 try :
72- with open (html_path , 'r' , encoding = 'utf-8' , errors = 'ignore ' ) as f :
72+ with open (html_path , 'r' , encoding = 'utf-8' , errors = 'replace ' ) as f :
7373 content = f .read ()
7474
7575 parser = HTMLTextExtractor ()
@@ -153,9 +153,9 @@ def build_canonical_url(base_url, baseurl, rel_path):
153153 # Remove index.html from path
154154 path_str = str (rel_path )
155155 if path_str .endswith ('index.html' ):
156- path_str = path_str [:- 10 ] # Remove 'index.html'
156+ path_str = path_str [:- len ( 'index.html' ) ] # Remove 'index.html'
157157 elif path_str .endswith ('.html' ):
158- path_str = path_str [:- 5 ] # Remove '.html'
158+ path_str = path_str [:- len ( '.html' ) ] # Remove '.html'
159159
160160 # Ensure trailing slash for directories
161161 if not path_str or path_str .endswith ('/' ):
0 commit comments