Skip to content

Commit 5331572

Browse files
committed
add llms.txt
1 parent 1b7d82a commit 5331572

File tree

3 files changed

+267
-1
lines changed

3 files changed

+267
-1
lines changed

.github/workflows/deploy_aliyun.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ jobs:
1515
uses: actions/checkout@v2
1616
with:
1717
fetch-depth: 0
18+
19+
- name: Generate llms.txt
20+
run: ruby generate_llms_txt.rb
1821

1922
- name: Install SSH Key
2023
uses: shimataro/ssh-key-action@v2
@@ -42,8 +45,10 @@ jobs:
4245
sed -i 's/rubygems\.org/gems.ruby-china.com/g' Gemfile
4346
sed -i 's/rubygems\.org/gems.ruby-china.com/g' Gemfile.lock
4447
grep "gems.ruby-china.com" Gemfile
45-
git commit -am "update Gemfile"
4648
git checkout -b master
49+
git add Gemfile Gemfile.lock
50+
git add -f llms.txt
51+
git commit -m "build: prepare deploy artifacts"
4752
env:
4853
SSH_KEY: ${{secrets.ALIYUN_SERVER_ACCESS_TOKEN}}
4954
PEM_KEY: ${{secrets.ALIYUN_PEM}}

.github/workflows/deploy_pages.yml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Builds the Jekyll site on pushes to gh-pages (or on manual dispatch) and
# publishes it — together with a generated llms.txt — to GitHub Pages.
name: 'deploy-pages'

on:
  push:
    branches:
      - gh-pages
  workflow_dispatch:

# Minimum permissions needed by actions/deploy-pages (OIDC token + Pages write).
permissions:
  contents: read
  pages: write
  id-token: write

# Allow only one Pages deployment at a time; a newer run cancels an in-flight one.
concurrency:
  group: 'pages'
  cancel-in-progress: true

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Pages
        uses: actions/configure-pages@v5

      - name: Setup Ruby
        uses: ruby/setup-ruby@v1
        with:
          bundler-cache: true  # runs bundle install and caches gems keyed on Gemfile.lock

      - name: Build site
        run: bundle exec jekyll build --destination _site

      # Written directly into the built output so it ships inside the artifact.
      - name: Generate llms.txt into site output
        run: ruby generate_llms_txt.rb _site/llms.txt

      - name: Upload Pages artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: _site

  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    needs: build
    runs-on: ubuntu-latest
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4

generate_llms_txt.rb

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
#!/usr/bin/env ruby

# Generates an llms.txt Markdown index of this Jekyll site's root pages and
# posts so that LLM-oriented crawlers can discover the content.
# Usage: ruby generate_llms_txt.rb [output_path]
# (output_path defaults to llms.txt next to this script).

require 'yaml'
require 'date'
require 'fileutils'

# The script is expected to live at the repository root, next to _config.yml.
ROOT = __dir__
CONFIG_PATH = File.join(ROOT, '_config.yml')
DEFAULT_OUTPUT_PATH = File.join(ROOT, 'llms.txt')

# Maps an internal section key (front-matter category or folder name) to the
# heading emitted in llms.txt. Declaration order doubles as output order via
# SECTION_ORDER below.
SECTION_TITLES = {
  'core' => 'Core Pages',
  'new' => "What's New",
  'basic' => 'Quick Start',
  'purchase' => 'Purchase & Activation',
  'how-to' => 'How-To Guides',
  'reference' => 'Reference',
  'announcement' => 'Announcements',
  'ja' => 'Japanese Translation',
  'zh' => 'Chinese Translation',
  'other' => 'Other Pages'
}.freeze

# Sections appear in llms.txt in the declaration order of SECTION_TITLES.
SECTION_ORDER = SECTION_TITLES.keys.freeze

# One indexed document: its section key, display title, absolute URL, and an
# optional one-line summary (posts only; root pages carry summary: nil).
Doc = Struct.new(:section, :title, :url, :summary, keyword_init: true)
27+
28+
# Reads the file at +path+ and splits it into its YAML front matter and body.
#
# Returns [front_matter_hash, body_string]. Files without a leading "---"
# delimiter (or without a closing one) come back as [{}, full_content].
# Date/Time values and YAML aliases are permitted in the front matter.
def parse_front_matter(path)
  raw = File.read(path)
  return [{}, raw] unless raw.start_with?('---')

  all_lines = raw.lines
  # Offset of the closing "---" within the lines after the opening delimiter.
  delimiter_offset = all_lines[1..].find_index { |candidate| candidate.strip == '---' }
  return [{}, raw] if delimiter_offset.nil?

  yaml_source = all_lines[1..delimiter_offset].join
  data = YAML.safe_load(
    yaml_source,
    permitted_classes: [Date, Time],
    aliases: true
  ) || {}

  # Body starts on the line after the closing delimiter.
  remainder = all_lines[(delimiter_offset + 2)..] || []
  [data, remainder.join]
end
45+
46+
# Normalizes a site-relative path: nil/"" become "/", a leading slash is
# guaranteed, and runs of consecutive slashes collapse to a single one.
#
# Fix: the original called gsub! on a variable that aliased the caller's
# string whenever +path+ already began with "/", mutating the caller's
# permalink in place. Non-destructive gsub removes that side effect while
# returning the same value.
def normalize_path(path)
  return '/' if path.nil? || path.empty?

  prefixed = path.start_with?('/') ? path : "/#{path}"
  prefixed.gsub(%r{/+}, '/')
end
53+
54+
# Joins the site base URL and a site-relative path, trimming trailing
# slashes from the base so the seam never produces "//".
def absolute_url(site_url, path)
  trimmed_base = site_url.sub(%r{/+$}, '')
  "#{trimmed_base}#{normalize_path(path)}"
end
57+
58+
# Derives the published URL path for a post file. An explicit front-matter
# permalink wins; otherwise the Jekyll-style "YYYY-MM-DD-" date prefix is
# stripped from the filename and the slug is wrapped in slashes.
def post_url(path, front_matter)
  explicit = front_matter['permalink']
  return normalize_path(explicit) if explicit

  stem = File.basename(path, File.extname(path))
  "/#{stem.sub(/^\d{4}-\d{2}-\d{2}-/, '')}/"
end
65+
66+
# Derives the published URL path for a root-level page. An explicit
# front-matter permalink wins; index.html maps to "/"; any other file keeps
# its basename (extension included).
def root_page_url(path, front_matter)
  explicit = front_matter['permalink']
  return normalize_path(explicit) if explicit

  filename = File.basename(path)
  filename == 'index.html' ? '/' : "/#{filename}"
end
75+
76+
# Decides which llms.txt section a file belongs to, in priority order:
# 1. "/ja/" or "/zh/" in the permalink or file path -> translation sections;
# 2. a front-matter `category` that is a known section key;
# 3. the second path component under ROOT (e.g. _posts/<folder>/...);
# 4. 'core' for homepage-flagged files, 'other' for everything else.
def detect_section(path, front_matter)
  link = front_matter['permalink'].to_s
  return 'ja' if link.start_with?('/ja/') || path.include?('/ja/')
  return 'zh' if link.start_with?('/zh/') || path.include?('/zh/')

  declared = front_matter['category'].to_s
  return declared if SECTION_TITLES.key?(declared)

  relative = path.sub(%r{\A#{Regexp.escape(ROOT)}/?}, '')
  parent_folder = relative.split('/')[1]
  return parent_folder if SECTION_TITLES.key?(parent_folder)

  front_matter['homepage'] ? 'core' : 'other'
end
89+
90+
# Strips markup from a Markdown/Liquid paragraph, leaving plain prose:
# HTML tags, Liquid {% %} / {{ }} tags, Kramdown {:...} attributes, images,
# and inline code are dropped; links keep their label; Markdown punctuation
# is blanked; whitespace is collapsed. Patterns run in order, so later rules
# see the output of earlier ones.
def clean_text(text)
  passes = [
    [%r{<[^>]+>}, ' '],
    [%r{\{\%.*?\%\}}m, ' '],
    [%r{\{\{.*?\}\}}m, ' '],
    [%r/\{:[^}]+\}/, ' '],
    [%r{!\[[^\]]*\]\([^)]+\)}, ' '],
    [%r{\[([^\]]+)\]\([^)]+\)}, '\1'],
    [%r{`{1,3}[^`]+`{1,3}}, ' '],
    [%r{[#>*_~-]}, ' '],
    [/\s+/, ' ']
  ]
  passes
    .reduce(text.dup) { |scrubbed, (pattern, replacement)| scrubbed.gsub(pattern, replacement) }
    .strip
end
103+
104+
# Picks the first "real" paragraph of a post body and condenses it into a
# plain-text summary of at most 180 characters, truncated on a word
# boundary. Returns nil when no paragraph qualifies.
#
# Skipped: fenced code blocks (removed up front), blank paragraphs, Liquid
# tags, Markdown headings, {:toc} markers, inline scripts, text shorter
# than 35 chars after cleaning, and translation boilerplate headers.
def extract_summary(body)
  without_code = body.gsub(%r{```.*?```}m, ' ')

  without_code.split(/\n\s*\n+/).map(&:strip).each do |chunk|
    boilerplate = chunk.empty? ||
                  chunk.start_with?('{%') ||
                  chunk.lines.first.to_s.start_with?('#') ||
                  chunk.include?('{:toc}') ||
                  chunk.include?('document.querySelector') ||
                  chunk.include?('{% for')
    next if boilerplate

    candidate = clean_text(chunk)
    next if candidate.empty? || candidate.length < 35
    next if candidate =~ %r{\A(?:English Original Version|日本語|中文|Help Improve the Translation)}

    return candidate unless candidate.length > 180

    # Cut at 180 chars, then drop the (possibly partial) trailing word.
    return candidate[0, 180].sub(/\s+\S*\z/, '')
  end

  nil
end
127+
128+
# Scans top-level *.html / *.md files of the site and builds a Doc for each
# page that has YAML front matter with a non-empty title. Root pages are
# indexed without a summary (only posts get one — see collect_posts).
#
# site_url - base URL used to absolutize the page paths.
# Returns an Array<Doc>.
def collect_root_pages(site_url)
  Dir.glob(File.join(ROOT, '*.{html,md}')).filter_map do |path|
    # NOTE(review): the glob only matches .html/.md, so this guard cannot
    # currently trigger for llms.txt — kept as a safety net if the glob
    # pattern is ever widened.
    next if File.basename(path) == 'llms.txt'

    # Fix: the body was captured into an unused local; `_body` makes the
    # discard explicit and silences unused-variable warnings.
    front_matter, _body = parse_front_matter(path)
    next if front_matter.empty?

    title = front_matter['title'].to_s.strip
    next if title.empty?

    Doc.new(
      section: detect_section(path, front_matter),
      title: title,
      url: absolute_url(site_url, root_page_url(path, front_matter)),
      summary: nil # root pages carry no summary line in llms.txt
    )
  end
end
146+
147+
# Scans all Markdown posts under _posts/ (recursively) and builds a Doc for
# each non-draft post with a non-empty title, including a one-line summary
# extracted from the post body.
#
# site_url - base URL used to absolutize the post paths.
# Returns an Array<Doc>.
def collect_posts(site_url)
  pattern = File.join(ROOT, '_posts', '**', '*.md')
  Dir.glob(pattern).filter_map do |post_path|
    front_matter, markdown = parse_front_matter(post_path)
    next if front_matter['draft']

    heading = front_matter['title'].to_s.strip
    next if heading.empty?

    Doc.new(
      section: detect_section(post_path, front_matter),
      title: heading,
      url: absolute_url(site_url, post_url(post_path, front_matter)),
      summary: extract_summary(markdown)
    )
  end
end
163+
164+
# Renders the llms.txt document: a site header (name, optional blockquote
# description, base URL, format note) followed by one "## Section" block
# per non-empty section in SECTION_ORDER, with docs sorted by title and an
# optional " - summary" suffix per entry. Returns the full text with a
# single trailing newline.
def build_output(site_name, site_url, description, docs)
  out = ["# #{site_name}", ""]
  out << "> #{description}" unless description.to_s.strip.empty?
  out.push(
    "",
    "- Base URL: #{site_url}",
    "- Format: Markdown index for LLM-friendly site discovery",
    ""
  )

  by_section = docs.group_by(&:section)
  SECTION_ORDER.each do |key|
    entries = by_section[key]
    next if entries.nil? || entries.empty?

    out.push("## #{SECTION_TITLES.fetch(key)}", "")
    entries.sort_by(&:title).each do |entry|
      bullet = "- [#{entry.title}](#{entry.url})"
      bullet += " - #{entry.summary}" if entry.summary
      out << bullet
    end
    out << ""
  end

  out.join("\n").rstrip + "\n"
end
191+
192+
# --- Entry point -----------------------------------------------------------
# Reads _config.yml, collects pages and posts, and writes the rendered
# llms.txt to ARGV[0] (resolved relative to the repo root) or to the
# default path next to this script.

abort('Run this script from the repository root.') unless File.exist?(CONFIG_PATH)

config = YAML.safe_load(File.read(CONFIG_PATH), permitted_classes: [Date, Time], aliases: true) || {}
site_name = config.fetch('name', 'Website')
site_url = config.fetch('url') # mandatory: fail loudly if the config has no URL
description = config['description']

output_path = ARGV[0] ? File.expand_path(ARGV[0], ROOT) : DEFAULT_OUTPUT_PATH

docs = collect_root_pages(site_url) + collect_posts(site_url)
FileUtils.mkdir_p(File.dirname(output_path))
File.write(output_path, build_output(site_name, site_url, description, docs))

puts "Generated #{output_path}"

0 commit comments

Comments
 (0)