Skip to content

Commit 4babdb8

Browse files
authored
Add option to generate a sitemap (#59)
1 parent df29008 commit 4babdb8

File tree

5 files changed

+399
-108
lines changed

5 files changed

+399
-108
lines changed

docs/make.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ MultiDocumenter.make(
7979
engine = MultiDocumenter.FlexSearch,
8080
),
8181
rootpath = "/MultiDocumenter.jl/",
82+
canonical_domain = "https://juliacomputing.github.io/",
83+
sitemap = true,
8284
)
8385

8486
if "deploy" in ARGS

src/MultiDocumenter.jl

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ include("renderers.jl")
8787
include("search/flexsearch.jl")
8888
include("search/stork.jl")
8989
include("canonical.jl")
90+
include("sitemap.jl")
9091

9192
const DEFAULT_ENGINE = SearchConfig(index_versions = ["stable", "dev"], engine = FlexSearch)
9293

@@ -117,9 +118,13 @@ Aggregates multiple Documenter.jl-based documentation pages `docs` into `outdir`
117118
- `prettyurls` removes all `index.html` suffixes from links in the global navigation.
118119
- `rootpath` is the path your site ends up being deployed at, e.g. `/foo/` if it's hosted at `https://bar.com/foo`
119120
- `hide_previews` removes preview builds from the aggregated documentation.
120-
- `canonical`: if set to the root URL of the MultiDocumenter site, will check and, if necessary, update the
121-
canonical URL tags for each package site to point to the directory. Similar to the `canonical` argument of
122-
`Documenter.HTML` constructor.
121+
- `canonical_domain`: determines the scheme and authority (domain, e.g. `https://example.org`) of the
122+
deployed site. If set, MultiDocumenter will check and, if necessary, update the canonical URL tags for each
123+
package site to point to the correct directory. Similar to the `canonical` argument of `Documenter.HTML`
124+
constructor, except that it should not contain the path component -- that is determined from `rootpath`.
125+
- `sitemap`, if enabled, will generate a `sitemap.xml` file at the root of the output directory. Requires
126+
`canonical_domain` to be set, since the sitemap is determined from canonical URLs.
127+
- `sitemap_filename` can be used to override the default sitemap filename (`sitemap.xml`)
123128
"""
124129
function make(
125130
outdir,
@@ -132,18 +137,41 @@ function make(
132137
prettyurls = true,
133138
rootpath = "/",
134139
hide_previews = true,
135-
canonical::Union{AbstractString,Nothing} = nothing,
140+
canonical_domain::Union{AbstractString,Nothing} = nothing,
141+
sitemap::Bool = false,
142+
sitemap_filename::AbstractString = "sitemap.xml",
136143
)
144+
if isnothing(canonical_domain)
145+
(sitemap === true) &&
146+
throw(ArgumentError("When sitemap=true, canonical_domain must also be set"))
147+
else
148+
!isnothing(canonical_domain)
149+
if !startswith(canonical_domain, r"^https?://")
150+
throw(ArgumentError("""
151+
Invalid value for canonical_domain: $(canonical_domain)
152+
Must start with http:// or https://"""))
153+
end
154+
# We'll strip any trailing /-s though, in case the user passed something like
155+
# https://example.org/, because we want to concatenate the file paths with `/`
156+
canonical_domain = rstrip(canonical_domain, '/')
157+
end
158+
# We'll normalize rootpath to have /-s at the beginning and at the end, so that we
159+
# can assume that when concatenating this to other paths
160+
if !startswith(rootpath, "/")
161+
rootpath = string('/', rootpath)
162+
end
163+
if !endswith(rootpath, "/")
164+
rootpath = string(rootpath, '/')
165+
end
166+
site_root_url = string(canonical_domain, rstrip(rootpath, '/'))
167+
137168
maybe_clone(flatten_multidocrefs(docs))
138169

139-
if !isnothing(canonical)
140-
canonical = rstrip(canonical, '/')
141-
end
142170
dir = make_output_structure(
143171
flatten_multidocrefs(docs),
144172
prettyurls,
145173
hide_previews;
146-
canonical,
174+
canonical = site_root_url,
147175
)
148176
out_assets = joinpath(dir, "assets")
149177
if assets_dir !== nothing && isdir(assets_dir)
@@ -179,6 +207,14 @@ function make(
179207
)
180208
end
181209

210+
if sitemap
211+
make_sitemap(;
212+
sitemap_root = site_root_url,
213+
sitemap_filename,
214+
docs_root_directory = dir,
215+
)
216+
end
217+
182218
cp(dir, outdir; force = true)
183219
rm(dir; force = true, recursive = true)
184220

src/documentertools/canonical_urls.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,10 @@ end
135135
Parses the HTML file at `indexhtml_path` and tries to extract the `url=...` value
136136
of the redirect `<meta http-equiv="refresh" ...>` tag.
137137
"""
138-
function get_meta_redirect_url(indexhtml_path::AbstractString)
139-
html = Gumbo.parsehtml(read(indexhtml_path, String))
138+
function get_meta_redirect_url(indexhtml_path::AbstractString)
    # Read and parse the file, then defer to the method that works on the parsed document.
    html = Gumbo.parsehtml(read(indexhtml_path, String))
    return get_meta_redirect_url(html)
end
140+
141+
function get_meta_redirect_url(html::Gumbo.HTMLDocument)
140142
for e in AbstractTrees.PreOrderDFS(html.root)
141143
e isa Gumbo.HTMLElement || continue
142144
Gumbo.tag(e) == :meta || continue

src/sitemap.jl

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# Note: Franklin.jl also implements sitemap generation:
2+
#
3+
# https://github.com/tlienart/Franklin.jl/blob/f1f7d044dc95ba0d9f368a3d1afc233eb58a59cf/src/manager/sitemap_generator.jl#L51
4+
#
5+
# At some point it might be worth factoring the code into a small shared package.
6+
# Franklin's implementation is more general than the one here, but it looks like it relies
7+
# on Franklin-specific globals, so it's not a trivial copy-paste into a separate package.
8+
9+
# The sitemap spec limits the size of the sitemap, both in terms of number of entries
10+
# and the total filesize (https://www.sitemaps.org/protocol.html#index).
11+
# Maximum number of URLs allowed in a single sitemap file.
const SITEMAP_URL_LIMIT = 50_000
# Maximum size of a single sitemap file, in bytes (50 * 1024 * 1024).
const SITEMAP_SIZE_LIMIT = 52_428_800
# Human-readable summary of both limits, appended to errors and warnings.
const SITEMAP_LIMIT_MSG = "Sitemaps are limited to $(SITEMAP_URL_LIMIT) URLs and a maximum filesize of $(SITEMAP_SIZE_LIMIT) bytes."
14+
15+
"""
    SitemapTooLargeError(msg, value, limit)

Exception describing a sitemap that exceeds a size limit.

# Fields
- `msg`: short description of which limit was exceeded.
- `value`: the measured quantity.
- `limit`: the maximum that `value` was compared against.
"""
struct SitemapTooLargeError <: Exception
    msg::String
    value::Int
    limit::Int
end
20+
# Pretty-printer for `SitemapTooLargeError`: a header line with the message, a line with the
# limit and the offending value, followed by the general sitemap limits reminder.
function Base.showerror(io::IO, e::SitemapTooLargeError)
    header = "SitemapTooLargeError: $(e.msg)"
    details = " limit is $(e.limit), but sitemap has $(e.value)"
    print(io, header, '\n', details, '\n', SITEMAP_LIMIT_MSG)
    return nothing
end
25+
"""
    check_sitemap_size_limit(msg, value, limit)

Compare `value` against `limit`. Throws a `SitemapTooLargeError` if `value` exceeds `limit`,
and only logs a warning if it exceeds the soft limit (80% of `limit`, computed with integer
division). Returns `nothing`.
"""
function check_sitemap_size_limit(msg::AbstractString, value::Integer, limit::Integer)
    # Hard limit: refuse to continue.
    value > limit && throw(SitemapTooLargeError(msg, value, limit))
    # Soft limits are 80% of the full limit
    soft_limit = div(limit, 10) * 8
    if value > soft_limit
        @warn "Sitemap too large: $(msg) (> 80% soft limit)\n$(SITEMAP_LIMIT_MSG)"
    end
    return nothing
end
34+
35+
"""
    make_sitemap(; sitemap_filename, sitemap_root, docs_root_directory)

Collect the canonical URLs of the HTML files under `docs_root_directory` and write them as a
sitemap to `sitemap_filename` inside that directory. If no URLs are found, an error is logged
and no file is written.
"""
function make_sitemap(;
    sitemap_filename::AbstractString,
    sitemap_root::AbstractString,
    docs_root_directory::AbstractString,
)
    # The sitemap entries are the canonical URLs found in the generated HTML files
    urls = find_sitemap_urls(; docs_root_directory, sitemap_root)
    if isempty(urls)
        @error "No sitemap URLs found"
        return
    end
    # Serialize first, so that nothing gets written if the sitemap turns out to be too large
    contents = make_sitemap_bytes(urls)
    output_path = joinpath(docs_root_directory, sitemap_filename)
    return write(output_path, contents)
end
50+
51+
# Writes `str` to `io`, replacing the characters that must be entity-escaped in sitemap
# XML (&, <, >, " and ') with the corresponding entities.
function _write_sitemap_escaped(io::IO, str::AbstractString)
    for c in str
        if c == '&'
            write(io, "&amp;")
        elseif c == '<'
            write(io, "&lt;")
        elseif c == '>'
            write(io, "&gt;")
        elseif c == '"'
            write(io, "&quot;")
        elseif c == '\''
            write(io, "&apos;")
        else
            write(io, c)
        end
    end
    return nothing
end

"""
    make_sitemap_bytes(sitemap_urls) -> Vector{UInt8}

Serialize `sitemap_urls` into the bytes of a sitemap XML document, with one `<url><loc>`
entry per URL, in sorted order. URLs are entity-escaped when written.

Both the number of URLs and the size of the generated document are passed through
`check_sitemap_size_limit`, which throws a `SitemapTooLargeError` when a limit is exceeded.
"""
function make_sitemap_bytes(sitemap_urls)::Vector{UInt8}
    # Sitemaps are limited to 50 000 URLs: https://www.sitemaps.org/protocol.html#index
    # TODO: we could automatically split the sitemap up if it's bigger than that and
    # generate a sitemap index.
    check_sitemap_size_limit("too many URLs", length(sitemap_urls), SITEMAP_URL_LIMIT)
    sitemap_buffer = IOBuffer()
    write(
        sitemap_buffer,
        """
        <?xml version="1.0" encoding="UTF-8"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        """,
    )
    for loc in sort(sitemap_urls)
        write(sitemap_buffer, "<url><loc>")
        # NOTE(review): assumes the URLs are not already entity-escaped (e.g. they were
        # read from parsed HTML attributes) -- confirm against the callers.
        _write_sitemap_escaped(sitemap_buffer, string(loc))
        write(sitemap_buffer, "</loc></url>\n")
    end
    write(sitemap_buffer, "</urlset>\n")
    sitemap_bytes = take!(sitemap_buffer)
    check_sitemap_size_limit(
        "sitemap too large (bytes)",
        length(sitemap_bytes),
        SITEMAP_SIZE_LIMIT,
    )
    return sitemap_bytes
end
76+
77+
"""
    find_sitemap_urls(; docs_root_directory, sitemap_root) -> Vector{String}

Walk the documentation HTML files under `docs_root_directory` and collect the normalized,
de-duplicated canonical URLs declared in them.

Files without a canonical URL are skipped; a warning is logged unless the file is a
`<meta>` redirect page. Canonical URLs that are not `sitemap_root` itself or located under
`sitemap_root/` are excluded with a warning.
"""
function find_sitemap_urls(;
    docs_root_directory::AbstractString,
    sitemap_root::AbstractString,
)
    # A plain `startswith(href, sitemap_root)` would also accept sibling paths and hosts
    # (e.g. `.../foobar/` for the root `.../foo`), so we match on a path boundary instead.
    root = rstrip(sitemap_root, '/')
    root_prefix = string(root, '/')
    canonical_urls = String[]
    # Membership is tracked in a Set, so de-duplication does not scan the whole vector
    # for every file.
    seen_urls = Set{String}()
    DocumenterTools.walkdocs(docs_root_directory, DocumenterTools.isdochtml) do fileinfo
        html = Gumbo.parsehtml(read(fileinfo.fullpath, String))
        canonical_href = find_canonical_url(html; filepath = fileinfo.fullpath)
        if isnothing(canonical_href)
            # A common case for why the canonical URL would be missing is when it's a redirect
            # HTML file, and that is fine. So we check for that and only warn if it is _not_
            # a redirect file.
            if DocumenterTools.get_meta_redirect_url(html) === nothing
                @warn "Canonical URL missing: $(fileinfo.relpath)"
            end
            return
        end
        # Check that the canonical URL is correct, i.e. that the root part is actually
        # what we expect it to be.
        if !(canonical_href == root || startswith(canonical_href, root_prefix))
            @warn "Invalid canonical URL, excluded from sitemap." canonical_href fileinfo.fullpath
            return
        end
        # Let's make sure we're not adding duplicates, but first we must normalize the URL
        canonical_href = normalize_canonical_url(canonical_href)
        if !(canonical_href in seen_urls)
            push!(seen_urls, canonical_href)
            push!(canonical_urls, canonical_href)
        end
    end
    return canonical_urls
end
110+
111+
"""
    normalize_canonical_url(url)

Normalize a canonical URL so that equivalent directory URLs compare equal: a trailing
`/index.html` is replaced by `/`, and a URL that already ends in `/` is left as is.
URLs that do not end in `/` or `/index.html` (e.g. `foo/bar` or `foo/bar.html`) are returned
unchanged.
"""
function normalize_canonical_url(url::AbstractString)
    return replace(url, r"/(index\.html)?$" => "/")
end
114+
115+
"""
    find_canonical_url(html::Gumbo.HTMLDocument; filepath)

Traverse the parsed DOM tree of `html` and return the `href` attribute of the
`<link rel="canonical">` element, or `nothing` if no such value is found. Once a value has
been found, any further canonical `<link>` tags are ignored with a warning. `filepath` is
only used in that warning.
"""
function find_canonical_url(html::Gumbo.HTMLDocument; filepath::AbstractString)
    canonical_href = nothing
    for e in AbstractTrees.PreOrderDFS(html.root)
        # Only <link rel="canonical" ...> elements are of interest
        is_canonical_link =
            e isa Gumbo.HTMLElement &&
            Gumbo.tag(e) == :link &&
            Gumbo.getattr(e, "rel", nothing) == "canonical"
        is_canonical_link || continue
        if canonical_href === nothing
            canonical_href = Gumbo.getattr(e, "href", nothing)
        else
            @warn "Duplicate <link rel=\"canonical\" ...> tag. Ignoring." filepath e canonical_href
        end
    end
    return canonical_href
end

0 commit comments

Comments
 (0)