Skip to content

Commit c9c571e

Browse files
committed
fix(script): Update HTML parser to handle attributes and improve file reading errors
1 parent 28f51e2 commit c9c571e

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

scripts/generate_llms_txt.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ def __init__(self):
25 | 25 |           self.in_skip = False
26 | 26 |           self.skip_tags = {'nav', 'footer', 'script', 'style', 'noscript', 'header'}
27 | 27 |
28 |    | -     def handle_starttag(self, tag, _attrs):
   | 28 | +     def handle_starttag(self, tag, attrs):
29 | 29 |           if tag == 'title':
30 | 30 |               self.in_title = True
31 | 31 |           elif tag in self.skip_tags:
@@ -69,7 +69,7 @@ def load_config(repo_root):
69 | 69 |   def extract_page_info(html_path):
70 | 70 |       """Extract title and text content from an HTML file."""
71 | 71 |       try:
72 |    | -         with open(html_path, 'r', encoding='utf-8', errors='ignore') as f:
   | 72 | +         with open(html_path, 'r', encoding='utf-8', errors='replace') as f:
73 | 73 |               content = f.read()
74 | 74 |
75 | 75 |           parser = HTMLTextExtractor()
@@ -153,9 +153,9 @@ def build_canonical_url(base_url, baseurl, rel_path):
153 | 153 |       # Remove index.html from path
154 | 154 |       path_str = str(rel_path)
155 | 155 |       if path_str.endswith('index.html'):
156 |     | -         path_str = path_str[:-10]  # Remove 'index.html'
    | 156 | +         path_str = path_str[:-len('index.html')]  # Remove 'index.html'
157 | 157 |       elif path_str.endswith('.html'):
158 |     | -         path_str = path_str[:-5]  # Remove '.html'
    | 158 | +         path_str = path_str[:-len('.html')]  # Remove '.html'
159 | 159 |
160 | 160 |       # Ensure trailing slash for directories
161 | 161 |       if not path_str or path_str.endswith('/'):

0 commit comments

Comments
 (0)