Skip to content

Commit 1aeebb3

Browse files
committed
Add script to extract glossary
1 parent c239214 commit 1aeebb3

File tree

1 file changed

+100
-2
lines changed

1 file changed

+100
-2
lines changed

script/update-docs.rb

Lines changed: 100 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
require "set"
99
require 'fileutils'
1010
require 'yaml'
11+
require 'json'
1112
require 'diffy'
1213
require_relative "version"
1314
require_relative 'asciidoctor-extensions'
@@ -95,6 +96,68 @@ def extract_headings(html)
9596
headings
9697
end
9798

99+
# Builds a term => definition map from rendered glossary HTML.
#
# Every <dt> containing an anchor whose id starts with "def_" is treated as a
# glossary term. Its key is the stripped text of the <dt>; its value is the
# stripped inner HTML of the element that immediately follows, which must be
# a <dd>.
#
# @param content [String] rendered page, optionally preceded by front matter
#   delimited by lines consisting solely of "---"
# @return [Hash{String => String}] term name mapped to definition HTML
# @raise [RuntimeError] if a term's <dt> is not directly followed by a <dd>
def extract_glossary_from_html(content)
  # Skip front matter. The limit of 3 keeps everything after the closing
  # delimiter in a single piece, so a later "---" line inside the body can no
  # longer cut the document short.
  content = content.split(/^---$/, 3)[2] || content

  doc = Nokogiri::HTML::DocumentFragment.parse(content)

  doc.css('dt').each_with_object({}) do |dt, glossary|
    # The attribute-prefix selector already guarantees the id begins with
    # "def_", so no further check on the id is needed.
    next unless dt.css('a[id^="def_"]').first

    term_name = dt.text.strip
    # Hack: this one entry lists two spellings in a single <dt>; register the
    # definition under both of them.
    term_names =
      if term_name == 'tree-ish (also treeish)'
        %w[tree-ish treeish]
      else
        [term_name]
      end

    dd = dt.next_element
    raise 'Expected dd' unless dd&.name == 'dd'

    definition = dd.inner_html.strip
    term_names.each { |term| glossary[term] = definition }
  end
end
133+
134+
# Writes each language's glossary as JSON to
# "#{SITE_ROOT}static/js/glossary/<lang>.json", creating the directory first.
# Does nothing when there is no glossary data at all.
#
# @param glossary_data_by_lang [Hash] language => glossary data for that language
def save_glossary_files(glossary_data_by_lang)
  unless glossary_data_by_lang.empty?
    target_dir = "#{SITE_ROOT}static/js/glossary"
    FileUtils.mkdir_p(target_dir)

    glossary_data_by_lang.each do |language, terms|
      json_path = "#{target_dir}/#{language}.json"
      puts " saving glossary data to #{json_path} (#{terms.size} terms)"
      File.write(json_path, JSON.generate(terms))
    end
  end
end
146+
147+
# Wraps escaped placeholders of the form "&lt;term&gt;" in a hover-term <span>
# whenever the term is a key in the glossary of the given language.
# Placeholders that are not glossary terms are left exactly as they were.
#
# @param html [String] HTML in which placeholder angle brackets are escaped
#   as "&lt;" / "&gt;"
# @param glossary_data_by_lang [Hash] language => (term => definition)
# @param lang [String] language whose glossary is consulted; a language with
#   no glossary results in no marking
# @return [String] a new string with matching placeholders wrapped
def mark_glossary_tooltips(html, glossary_data_by_lang, lang)
  current_glossary = glossary_data_by_lang[lang] || {}

  # Match the escaped form: raw "<...>" sequences are real HTML tags and must
  # not be rewritten into escaped text. Excluding "&" from the capture keeps
  # the match from running across another entity.
  html.gsub(/&lt;([^&]+)&gt;/) do |match|
    term = Regexp.last_match(1)
    # Only mark terms that exist in the glossary
    if current_glossary.key?(term)
      "<span class=\"hover-term\" data-term=\"#{term}\">&lt;#{term}&gt;</span>"
    else
      match
    end
  end
end
160+
98161
def index_l10n_doc(filter_tags, doc_list, get_content)
99162
rebuild = ENV.fetch("REBUILD_DOC", nil)
100163
rerun = ENV["RERUN"] || rebuild || false
@@ -139,8 +202,15 @@ def index_l10n_doc(filter_tags, doc_list, get_content)
139202
end
140203

141204
check_paths = Set.new([])
205+
glossary_data_by_lang = {}
142206

143-
doc_files.each do |entry|
207+
# Process glossary docs first so that we can use the parsed glossary to mark
208+
# tooltip items in the other documents
209+
glossary_docs = doc_files.select { |entry| File.basename(entry[0], ".#{ext}") == 'gitglossary' }
210+
other_docs = doc_files.reject { |entry| File.basename(entry[0], ".#{ext}") == 'gitglossary' }
211+
ordered_docs = glossary_docs + other_docs
212+
213+
ordered_docs.each do |entry|
144214
full_path, sha = entry
145215
ids = Set.new([])
146216
lang = File.dirname(full_path)
@@ -177,6 +247,12 @@ def index_l10n_doc(filter_tags, doc_list, get_content)
177247
next if !rerun && lang_data[lang] == asciidoc_sha
178248

179249
html = asciidoc.render
250+
251+
if path == 'gitglossary'
252+
glossary_data_by_lang[lang] = extract_glossary_from_html(html)
253+
puts " extracted #{glossary_data_by_lang[lang].size} glossary terms for #{lang}"
254+
end
255+
180256
html.gsub!(/linkgit:(\S+?)\[(\d+)\]/) do |line|
181257
x = /^linkgit:(\S+?)\[(\d+)\]/.match(line)
182258
relurl = "docs/#{x[1].gsub(/&#x2d;/, '-')}/#{lang}"
@@ -223,6 +299,8 @@ def index_l10n_doc(filter_tags, doc_list, get_content)
223299
"#{before}{{< relurl \"#{after}\" >}}"
224300
end
225301

302+
html = mark_glossary_tooltips(html, glossary_data_by_lang, lang)
303+
226304
# Write <docname>/<lang>.html
227305
front_matter = {
228306
"category" => "manual",
@@ -248,6 +326,8 @@ def index_l10n_doc(filter_tags, doc_list, get_content)
248326
lang_data[lang] = asciidoc_sha
249327
end
250328

329+
save_glossary_files(glossary_data_by_lang)
330+
251331
# In some cases, translations are not complete. As a consequence, some
252332
# translated manual pages may point to other translated manual pages that do
253333
# not exist. In these cases, redirect to the English version.
@@ -432,8 +512,15 @@ def index_doc(filter_tags, doc_list, get_content)
432512
end
433513

434514
check_paths = Set.new([])
515+
glossary_data_by_lang = {}
516+
517+
# Process glossary docs first so that we can use the parsed glossary to mark
518+
# tooltip items in the other documents
519+
glossary_docs = doc_files.select { |entry| File.basename(entry[0].sub(/\.adoc$/, '.txt'), '.txt') == 'gitglossary' }
520+
other_docs = doc_files.reject { |entry| File.basename(entry[0].sub(/\.adoc$/, '.txt'), '.txt') == 'gitglossary' }
521+
ordered_docs = glossary_docs + other_docs
435522

436-
doc_files.each do |entry|
523+
ordered_docs.each do |entry|
437524
path, sha = entry
438525
txt_path = path.sub(/\.adoc$/, '.txt')
439526
ids = Set.new([])
@@ -482,6 +569,12 @@ def index_doc(filter_tags, doc_list, get_content)
482569

483570
# Generate HTML
484571
html = asciidoc.render
572+
573+
if docname == 'gitglossary'
574+
glossary_data_by_lang['en'] = extract_glossary_from_html(html)
575+
puts " extracted #{glossary_data_by_lang['en'].size} glossary terms for 'en'"
576+
end
577+
485578
html.gsub!(/linkgit:+(\S+?)\[(\d+)\]/) do |line|
486579
x = /^linkgit:+(\S+?)\[(\d+)\]/.match(line)
487580
if x[1] == "curl"
@@ -522,6 +615,8 @@ def index_doc(filter_tags, doc_list, get_content)
522615
"#{before}{{< relurl \"#{after}\" >}}"
523616
end
524617

618+
html = mark_glossary_tooltips(html, glossary_data_by_lang, 'en')
619+
525620
doc_versions = version_map.keys.sort{|a, b| Version.version_to_num(a) <=> Version.version_to_num(b)}
526621
doc_version_index = doc_versions.index(version)
527622

@@ -640,6 +735,9 @@ def index_doc(filter_tags, doc_list, get_content)
640735
end
641736
end
642737
end
738+
739+
save_glossary_files(glossary_data_by_lang)
740+
643741
data["latest-version"] = version if !data["latest-version"] || Version.version_to_num(data["latest-version"]) < Version.version_to_num(version)
644742
end
645743

0 commit comments

Comments
 (0)