Commit e20d983

feat(llms.txt): Implement script to generate llms.txt from Jekyll site output
1 parent 04e14d8 commit e20d983

1 file changed: +267 -0 lines changed

scripts/generate_llms_txt.py

@@ -0,0 +1,267 @@
#!/usr/bin/env python3
"""
Generate llms.txt file from Jekyll site build output.

This script walks through the _site directory, extracts content from HTML pages,
and generates an llms.txt file with organized sections and summaries.
"""

import re
import sys
from collections import defaultdict
from html.parser import HTMLParser
from pathlib import Path

import yaml


class HTMLTextExtractor(HTMLParser):
    """Extract text content from HTML, ignoring nav, footer, and script elements."""

    def __init__(self):
        super().__init__()
        self.text_parts = []
        self.title = None
        self.in_title = False
        # Depth counter rather than a boolean, so a skip element nested inside
        # another (e.g. a <script> inside <header>) cannot prematurely
        # re-enable text collection.
        self.skip_depth = 0
        self.skip_tags = {'nav', 'footer', 'script', 'style', 'noscript', 'header'}

    def handle_starttag(self, tag, _attrs):
        if tag == 'title':
            self.in_title = True
        elif tag in self.skip_tags:
            self.skip_depth += 1

    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False
        elif tag in self.skip_tags and self.skip_depth > 0:
            self.skip_depth -= 1

    def handle_data(self, data):
        if self.in_title and not self.title:
            self.title = data.strip()
        elif self.skip_depth == 0 and not self.in_title:
            text = data.strip()
            if text:
                self.text_parts.append(text)

    def get_text(self):
        """Get extracted text content."""
        return ' '.join(self.text_parts)

    def get_title(self):
        """Get page title."""
        return self.title or "Untitled"
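
# Illustrative usage of the extractor (a sketch with a hypothetical snippet of
# HTML; the script itself drives the parser via extract_page_info below):
#
#     parser = HTMLTextExtractor()
#     parser.feed('<title>About</title><nav>Menu</nav><p>We write software.</p>')
#     parser.get_title()  # -> 'About'
#     parser.get_text()   # -> 'We write software.'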


def load_config(repo_root):
    """Load Jekyll configuration."""
    config_path = repo_root / '_config.yml'
    if not config_path.exists():
        raise FileNotFoundError(f"Could not find _config.yml at {config_path}")

    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    return config


def extract_page_info(html_path):
    """Extract title and text content from an HTML file."""
    try:
        with open(html_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()

        parser = HTMLTextExtractor()
        parser.feed(content)

        title = parser.get_title()
        text = parser.get_text()

        # Clean up text: remove excessive whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return title, text
    except Exception as e:
        print(f"Warning: Error extracting content from {html_path}: {e}", file=sys.stderr)
        return None, None


def truncate_text(text, max_chars=300):
    """Truncate text to a reasonable summary length."""
    if len(text) <= max_chars:
        return text

    # Try to truncate at a sentence boundary
    truncated = text[:max_chars]
    last_period = truncated.rfind('.')
    last_exclaim = truncated.rfind('!')
    last_question = truncated.rfind('?')

    last_sentence = max(last_period, last_exclaim, last_question)

    if last_sentence > max_chars * 0.6:  # At least 60% of target length
        return text[:last_sentence + 1]
    else:
        # Just cut at word boundary
        last_space = truncated.rfind(' ')
        if last_space > 0:
            return truncated[:last_space] + '...'
        return truncated + '...'
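
# Illustrative behavior, with hypothetical inputs:
#
#     truncate_text('First sentence. Second sentence here.', max_chars=20)
#     # -> 'First sentence.'  (sentence boundary past 60% of the target)
#     truncate_text('short text', max_chars=20)
#     # -> 'short text'       (already within the limit)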


def should_ignore_file(rel_path):
    """Determine if a file should be ignored."""
    ignore_patterns = [
        'feed.xml', 'atom.xml', 'sitemap.xml',
        'robots.txt', '404.html',
        '/assets/', '/css/', '/js/', '/img/', '/images/',
    ]

    path_str = str(rel_path)

    for pattern in ignore_patterns:
        if pattern in path_str:
            return True

    # Ignore 404 pages and other error/utility pages
    if '404' in path_str:
        return True

    return False
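
# Illustrative behavior, with hypothetical paths:
#
#     should_ignore_file(Path('404.html'))         # -> True  (matches '404.html')
#     should_ignore_file(Path('blog/index.html'))  # -> False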


def get_section_name(rel_path):
    """Determine section name from URL path."""
    parts = rel_path.parts

    # Root level files
    if len(parts) == 1 or (len(parts) == 2 and parts[1] == 'index.html'):
        return "Root"

    # Use first path segment as section name
    section = parts[0] if parts else "Root"

    # Capitalize and clean up section name
    section = section.replace('-', ' ').replace('_', ' ').title()

    return section
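
# Illustrative behavior, with hypothetical paths:
#
#     get_section_name(Path('index.html'))             # -> 'Root'
#     get_section_name(Path('about/index.html'))       # -> 'Root' (top-level page)
#     get_section_name(Path('events/2024/talk.html'))  # -> 'Events'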


def build_canonical_url(base_url, baseurl, rel_path):
    """Build the canonical URL for a page."""
    # Remove index.html from path
    path_str = str(rel_path)
    if path_str.endswith('index.html'):
        path_str = path_str[:-10]  # Remove 'index.html'
    elif path_str.endswith('.html'):
        path_str = path_str[:-5]   # Remove '.html'

    # Ensure trailing slash for directories
    if path_str and not path_str.endswith('/'):
        path_str += '/'

    # Combine base URL and path
    full_url = base_url.rstrip('/')
    if baseurl:
        full_url += '/' + baseurl.strip('/')

    if path_str:
        full_url += '/' + path_str.lstrip('/')
    else:
        full_url += '/'

    return full_url
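
# Illustrative results, using the script's default base URL and an empty
# baseurl:
#
#     build_canonical_url('https://us-rse.org', '', Path('blog/index.html'))
#     # -> 'https://us-rse.org/blog/'
#     build_canonical_url('https://us-rse.org', '', Path('about.html'))
#     # -> 'https://us-rse.org/about/'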


def generate_llms_txt(repo_root):
    """Generate llms.txt from the built Jekyll site."""
    # Check for _site directory
    site_dir = repo_root / '_site'
    if not site_dir.exists():
        print(f"Error: _site directory not found at {site_dir}", file=sys.stderr)
        print("Please run 'bundle exec jekyll build' first.", file=sys.stderr)
        sys.exit(1)

    # Load configuration
    config = load_config(repo_root)
    base_url = config.get('url', 'https://us-rse.org')
    baseurl = config.get('baseurl', '')

    # Collect pages by section
    sections = defaultdict(list)

    # Walk through _site directory
    for html_file in site_dir.rglob('*.html'):
        rel_path = html_file.relative_to(site_dir)

        # Skip ignored files
        if should_ignore_file(rel_path):
            continue

        # Extract page information
        title, text = extract_page_info(html_file)
        if not title or not text:
            continue

        # Build canonical URL
        url = build_canonical_url(base_url, baseurl, rel_path)

        # Get section
        section = get_section_name(rel_path)

        # Truncate description
        description = truncate_text(text, max_chars=250)

        # Store page info
        sections[section].append({
            'title': title,
            'url': url,
            'description': description,
            'path': rel_path  # For sorting
        })

    # Generate llms.txt content
    output_lines = []

    # Sort sections: Root first, then alphabetically
    sorted_sections = sorted(sections.keys(), key=lambda x: (x != "Root", x))

    for section in sorted_sections:
        pages = sections[section]

        # Sort pages within section by path
        pages.sort(key=lambda p: str(p['path']))

        # Add section header
        output_lines.append(f"## {section}")

        # Add pages
        for page in pages:
            output_lines.append(f"- [{page['title']}]({page['url']}): {page['description']}")

        # Add blank line between sections
        output_lines.append("")

    # Write to llms.txt
    output_path = repo_root / 'llms.txt'
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(output_lines))

    print(f"Successfully generated llms.txt with {sum(len(pages) for pages in sections.values())} pages across {len(sections)} sections.")
    print(f"Output written to: {output_path}")
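
# The generated llms.txt is a sequence of section blocks shaped like this
# (illustrative content, following the f-strings above):
#
#     ## Blog
#     - [Post title](https://us-rse.org/blog/post/): First ~250 characters of the page text...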


def main():
    """Main entry point."""
    # Determine repo root (script is in scripts/ directory)
    script_dir = Path(__file__).parent.resolve()
    repo_root = script_dir.parent

    generate_llms_txt(repo_root)


if __name__ == '__main__':
    main()
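
Usage sketch (per the script's own error message): build the site with 'bundle exec jekyll build', then run 'python3 scripts/generate_llms_txt.py'. Because the script resolves the repository root from its own location, it can be invoked from any working directory; llms.txt is written to the repository root.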
