Skip to content

Commit d63674a

Browse files
maelle and hadley authored
Build documentation for LLMs (#2917)
Generates an `LLM.txt` that combines the README, reference index, and articles index, and generates a `.md` file for each `.html` file (using pandoc with a few xml2 tweaks for nicer rendering). --------- Co-authored-by: Hadley Wickham <h.wickham@gmail.com>
1 parent 6d27931 commit d63674a

19 files changed

+491
-5
lines changed

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ export(build_articles_index)
116116
export(build_favicons)
117117
export(build_home)
118118
export(build_home_index)
119+
export(build_llm_docs)
119120
export(build_news)
120121
export(build_redirects)
121122
export(build_reference)

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# pkgdown (development version)
22

3+
* New `build_llm_docs()` generates an `llms.txt` at the root directory of your site, and provides a `.md` version of every page. You can disable it by adding `llm-docs: false` to your `_pkgdown.yml` (#2914, @maelle).
34
* Links generated with `\code{\link{foo}()}` now have the `()` moved into the `<a>` in the generated output (@maelle).
45
* Plots in dark mode are now transformed with a CSS filter to improve their
56
visibility (thanks to @gadenbuie).

R/build-llm-dl.R

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Rewrite every <dl> found under `html` as a bulleted list, in place.
#
# `html` is an xml2 node; each matching <dl> is handed to `simplify_dl()`,
# which performs the actual replacement. Returns `NULL` invisibly.
simplify_dls <- function(html) {
  definition_lists <- xml2::xml_find_all(html, ".//dl")
  for (node in definition_lists) {
    simplify_dl(node)
  }
  invisible(NULL)
}
8+
9+
# Replace a single <dl> node with an equivalent <ul>.
#
# The list is only converted when its children strictly alternate
# <dt>, <dd>, <dt>, <dd>, ... (see `is_simple_dl()`); otherwise a warning is
# emitted and the node is left untouched.
simplify_dl <- function(dl) {
  kids <- xml2::xml_children(dl)

  if (!is_simple_dl(xml2::xml_name(kids))) {
    cli::cli_warn("Skipping this <dl>: not a simple term-definition list")
    return()
  }

  # 0, 0, 1, 1, 2, 2, ...: each <dt> shares an id with the <dd> that follows it
  pair_id <- (seq_along(kids) - 1) %/% 2
  pairs <- split(kids, pair_id)

  list_node <- xml2::read_xml("<ul></ul>")
  xml_insert(list_node, lapply(pairs, create_li_from_group))

  xml2::xml_replace(dl, list_node)
}
26+
27+
# Is this a "simple" definition list?
#
# `names` holds the tag names of a <dl>'s children, in document order. The
# list is simple when there is an even number of children and they alternate
# dt, dd, dt, dd, ... starting with a dt.
is_simple_dl <- function(names) {
  n <- length(names)
  if (n %% 2 == 1) {
    return(FALSE)
  }

  # Positions 1, 3, 5, ... must hold terms; the remaining ones definitions
  is_term_slot <- seq_len(n) %% 2 == 1

  all(names[is_term_slot] == "dt") && all(names[!is_term_slot] == "dd")
}
37+
38+
# Build one <li> from a (<dt>, <dd>) pair.
#
# `group` is a two-element node set: the term followed by its definition.
# The bullet reads "<term>: <definition>". When the definition has element
# children, the term and separator are wrapped in their own <p> so the
# definition's child elements follow as separate nodes; otherwise everything
# is placed directly inside the <li>.
create_li_from_group <- function(group) {
  term <- group[[1]]
  definition <- group[[2]]

  item <- xml2::read_xml("<li></li>")
  separator <- xml_text_node(": ")

  if (has_children(definition)) {
    # params case
    heading <- xml2::read_xml("<p></p>")
    xml_insert(heading, xml2::xml_contents(term))
    xml2::xml_add_child(heading, separator)
    # Attach the paragraph only once it is fully built
    xml2::xml_add_child(item, heading)
  } else {
    # reference index
    xml_insert(item, xml2::xml_contents(term))
    xml2::xml_add_child(item, separator)
  }

  xml_insert(item, xml2::xml_contents(definition))

  item
}
60+
61+
# TRUE when node `x` has at least one element child, as reported by
# `xml2::xml_children()`.
has_children <- function(x) {
  n_children <- length(xml2::xml_children(x))
  n_children > 0
}
62+
63+
# Create a bare text node holding `x`.
#
# The text is wrapped in a throwaway <span>, parsed, and the text node inside
# it is returned. `x` is spliced into the markup as-is (no escaping), so it
# must already be valid XML character data.
xml_text_node <- function(x) {
  markup <- paste0("<span>", x, "</span>")
  wrapper <- xml2::read_xml(markup)
  xml2::xml_find_first(wrapper, ".//text()")
}

R/build-llm.R

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
#' Build docs for LLMs
#'
#' @description
#' `build_llm_docs()` creates an `llms.txt` at the root of your site
#' that contains the contents of your `README.md`, your reference index,
#' and your articles index. It also creates a `.md` file for every existing
#' `.html` file in your site. Together, this gives an LLM an overview of your
#' package and the ability to find out more by following links.
#'
#' If you don't want these files generated for your site, you can opt-out by
#' adding the following to your `_pkgdown.yml`:
#'
#' ```yaml
#' llm-docs: false
#' ```
#'
#' @family site components
#' @inheritParams build_site
#' @return `NULL`, invisibly. Called for its side effects.
#' @export
build_llm_docs <- function(pkg = ".") {
  pkg <- as_pkgdown(pkg)
  if (isFALSE(pkg$meta$`llm-docs`)) {
    return(invisible())
  }

  cli::cli_rule("Building docs for llms")

  # Write a `.md` file next to every page of the built site
  paths <- get_site_paths(pkg)
  purrr::walk(paths, \(rel_path) {
    src_path <- path(pkg$dst_path, rel_path)
    dst_path <- path_ext_set(src_path, "md")
    convert_md(src_path, dst_path, full_url(pkg, rel_path))
  })

  # convert_md() writes nothing for a page without <main>, so any of these
  # files can be absent; skip the missing ones instead of aborting the build.
  index <- c(
    read_file_if_exists(path(pkg$dst_path, "index.md")),
    "",
    read_file_if_exists(path(pkg$dst_path, "reference", "index.md")),
    "",
    read_file_if_exists(path(pkg$dst_path, "articles", "index.md"))
  )
  write_lines(index, path(pkg$dst_path, "llms.txt"))

  invisible()
}
46+
47+
# Absolute URL of the directory that contains the page at `path`.
#
# Returns `NULL` when the site has no `url` configured. When the package is
# flagged as in development, `pkg$prefix` is appended to the site URL before
# the page's directory is resolved against it.
full_url <- function(pkg, path) {
  site_url <- pkg$meta$url
  if (is.null(site_url)) {
    return()
  }

  base <- if (pkg$development$in_dev) {
    paste0(site_url, "/", pkg$prefix)
  } else {
    paste0(site_url, "/")
  }

  page_dir <- paste0(path_dir(path), "/")
  xml2::url_absolute(page_dir, base)
}
59+
60+
# Convert one built HTML page to markdown.
#
# Reads `src_path`, keeps only its <main> element, simplifies the markup
# (header, anchors, code classes, footnotes, lifecycle badges, definition
# lists, links) and hands the result to pandoc, which writes `dst_path`.
# Pages without a <main> element are skipped and nothing is written.
# `url`, when supplied, is the base used to make internal links absolute.
convert_md <- function(src_path, dst_path, url = NULL) {
  page <- xml2::read_html(src_path)
  main_html <- xml2::xml_find_first(page, ".//main")
  if (length(main_html) == 0) {
    return()
  }

  # Each step edits `main_html` in place. Link rewriting runs last because it
  # strips the `class` attribute that earlier steps use to locate <a> nodes.
  simplify_page_header(main_html)
  simplify_anchors(main_html)
  simplify_code(main_html)
  simplify_popovers_to_footnotes(main_html)
  simplify_lifecycle_badges(main_html)
  simplify_dls(main_html)
  create_absolute_links(main_html, url)

  # pandoc reads from disk, so round-trip through a temporary file
  tmp_html <- file_temp()
  xml2::write_html(main_html, tmp_html, format = FALSE)
  on.exit(file_delete(tmp_html), add = TRUE)

  rmarkdown::pandoc_convert(
    input = tmp_html,
    output = dst_path,
    from = "html",
    to = "gfm+definition_lists-raw_html"
  )
}
86+
87+
# Helpers ---------------------------------------------------------------------
88+
89+
# Reduce the page header (which includes logo + source link) to its title.
#
# Removes the first `div.page-header` and re-adds the page's first <h1> as
# the first child of `html`. Pages without an <h1> (e.g. the website of a
# package without README/index.md) are left untouched.
simplify_page_header <- function(html) {
  heading <- xml2::xml_find_first(html, ".//h1")
  if (length(heading) == 0) {
    return(invisible())
  }

  header_div <- xml2::xml_find_first(html, ".//div[@class='page-header']")
  xml2::xml_remove(header_div)
  xml2::xml_add_child(html, heading, .where = 0)

  invisible()
}
99+
100+
# Drop internal anchors: every <a> whose class is exactly "anchor" is removed
# from `html`.
simplify_anchors <- function(html) {
  anchors <- xml2::xml_find_all(html, ".//a[@class='anchor']")
  xml2::xml_remove(anchors)
  invisible()
}
105+
106+
# Strip extraneous classes from code blocks.
#
# For every <pre> whose class mentions "sourceCode", the "sourceCode" and
# "downlit" markers are deleted from the class attribute and the remainder is
# trimmed, leaving whatever else was there (presumably the language name).
simplify_code <- function(html) {
  blocks <- xml2::xml_find_all(html, ".//pre[contains(@class, 'sourceCode')]")

  strip_markers <- function(node) {
    old_class <- xml2::xml_attr(node, "class")
    new_class <- trimws(gsub("sourceCode|downlit", "", old_class))
    xml2::xml_attr(node, "class") <- new_class
  }
  purrr::walk(blocks, strip_markers)

  invisible()
}
118+
119+
# Turn popover footnotes back into a conventional footnotes list.
#
# Every `a.footnote-ref` keeps its place in the text but is rewritten to point
# at `#fn<i>`; the HTML stored in its `data-bs-content` attribute is parsed
# and appended as the i-th <li> of a footnotes <ol>. The enclosing
# `section.footnotes` is reused when present and created at the end of
# `main_html` otherwise. Does nothing when there are no footnote references.
simplify_popovers_to_footnotes <- function(main_html) {
  refs <- xml2::xml_find_all(main_html, ".//a[@class='footnote-ref']")
  if (length(refs) == 0) {
    return()
  }

  # Locate the footnotes section, creating it (with <hr> and <ol>) if needed
  section <- xml2::xml_find_first(main_html, ".//section[@class='footnotes']")
  if (length(section) > 0) {
    footnotes_ol <- xml2::xml_find_first(section, ".//ol")
  } else {
    section <- xml2::xml_add_child(
      main_html,
      "section",
      id = "footnotes",
      class = "footnotes footnotes-end-of-document",
      role = "doc-endnotes"
    )
    xml2::xml_add_child(section, "hr")
    footnotes_ol <- xml2::xml_add_child(section, "ol")
  }

  purrr::iwalk(refs, \(ref, i) {
    # Read the popover content before the attributes are overwritten below
    popover_html <- xml2::xml_attr(ref, "data-bs-content")
    note_id <- paste0("fn", i)

    xml2::xml_attrs(ref) <- list(
      href = paste0("#", note_id),
      id = paste0("fnref", i),
      role = "doc-noteref",
      class = "footnote-ref"
    )

    note_li <- xml2::xml_add_child(footnotes_ol, "li", id = note_id)
    body <- xml2::xml_find_first(xml2::read_html(popover_html), ".//body")
    for (piece in xml2::xml_children(body)) {
      xml2::xml_add_child(note_li, piece)
    }
  })
}
162+
163+
# Replace lifecycle badges with plain bold text.
#
# Two kinds of badge are handled, both looked up below `html` only:
# * on the reference index, `<span class="... lifecycle ...">stage</span>`
#   becomes `<strong>[stage]</strong>`;
# * on individual pages, a link wrapping a `figures/lifecycle-*` image becomes
#   `<strong>` containing the image's lower-cased alt text.
simplify_lifecycle_badges <- function(html) {
  # on reference index
  # (`.//` keeps the search relative to `html`; a leading `//` would scan the
  # whole document that `html` belongs to)
  badges <- xml2::xml_find_all(html, ".//span[contains(@class, 'lifecycle')]")
  xml2::xml_replace(badges, "strong", paste0("[", xml2::xml_text(badges), "]"))

  # on individual pages
  badges <- xml2::xml_find_all(
    html,
    ".//a[.//img[starts-with(@src, 'figures/lifecycle-')]]"
  )
  imgs <- xml2::xml_find_first(badges, ".//img")
  xml2::xml_replace(badges, "strong", tolower(xml2::xml_attr(imgs, "alt")))

  invisible()
}
178+
179+
# Point internal links at the markdown version of their target.
#
# All <a> elements lose their `class`. Links that are internal to the site
# (no URL scheme, not protocol-relative, not a same-page `#fragment`) are
# made absolute against `url` when it is supplied, and a trailing `.html`
# (optionally followed by a `#fragment`) is rewritten to `.md`. External
# links and <a> elements without an `href` are left alone.
create_absolute_links <- function(main_html, url = NULL) {
  a <- xml2::xml_find_all(main_html, ".//a")
  xml2::xml_attr(a, "class") <- NULL

  href <- xml2::xml_attr(a, "href")

  # `href` is NA for <a> without the attribute; NA must not reach the logical
  # subscripts below, where it would make the assignment fail.
  has_scheme <- grepl("^[A-Za-z][A-Za-z0-9+.-]*:", href)
  is_internal <- !is.na(href) &
    !has_scheme &
    !startsWith(href, "//") &
    !startsWith(href, "#")

  if (!any(is_internal)) {
    return(invisible())
  }

  if (!is.null(url)) {
    href[is_internal] <- xml2::url_absolute(href[is_internal], url)
  }
  # Only a real `.html` extension is swapped; any fragment is preserved
  href[is_internal] <- sub("\\.html(#.*)?$", ".md\\1", href[is_internal])

  xml2::xml_attr(a[is_internal], "href") <- href[is_internal]

  invisible()
}
194+
195+
# Read the lines of `path`, or return `NULL` (invisibly) when the file does
# not exist.
read_file_if_exists <- function(path) {
  if (!file_exists(path)) {
    return(invisible(NULL))
  }
  read_lines(path)
}

R/build.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#' * [build_tutorials()]
1111
#' * [build_news()]
1212
#' * [build_redirects()]
13+
#' * [build_llm_docs()]
1314
#'
1415
#' See the documentation for each function to learn how to control
1516
#' that aspect of the site. This page documents options that affect the
@@ -467,6 +468,9 @@ build_site_local <- function(
467468
build_tutorials(pkg, override = override, preview = FALSE)
468469
build_news(pkg, override = override, preview = FALSE)
469470
build_sitemap(pkg)
471+
if (pkg$bs_version > 3) {
472+
build_llm_docs(pkg)
473+
}
470474
build_redirects(pkg, override = override)
471475
if (pkg$bs_version == 3) {
472476
build_docsearch_json(pkg)

R/tweak-reference.R

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,16 @@ tweak_highlight_other <- function(div) {
8484

8585
xml_replace_contents <- function(node, new) {
8686
xml2::xml_remove(xml2::xml_contents(node))
87-
8887
contents <- xml2::xml_contents(new)
89-
for (child in contents) {
88+
xml_insert(node, contents)
89+
}
90+
91+
xml_insert <- function(node, new) {
92+
for (child in new) {
9093
xml2::xml_add_child(node, child)
9194
}
9295
}
9396

94-
9597
tweak_extra_logo <- function(html) {
9698
img <- xml2::xml_find_all(
9799
html,

inst/BS5/templates/content-reference-index.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@ <h1>{{{pagetitle}}}</h1>
1010
{{#subtitle}}<h3>{{{.}}}</h3>{{/subtitle}}
1111
{{#desc}}<div class="section-desc">{{{desc}}}</div>{{/desc}}
1212

13-
{{#topics}}<dl>
13+
<dl>{{#topics}}
1414
<dt>
1515
{{#has_icons}}{{#icon}}<a class="icon" href="{{path}}"><img src="icons/{{{.}}}" alt=""/></a>{{/icon}}{{/has_icons}}
1616
{{#aliases}}<code><a href="{{path}}">{{{.}}}</a></code> {{/aliases}}
1717
{{#lifecycle}}<span class="badge lifecycle lifecycle-{{.}}">{{.}}</span>{{/lifecycle}}
1818
</dt>
1919
<dd>{{{title}}}</dd>
20-
</dl>{{/topics}}
20+
{{/topics}}</dl>
2121
</div>{{/rows}}
2222
</main>
2323

man/build_articles.Rd

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/build_home.Rd

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/build_llm_docs.Rd

Lines changed: 33 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)