Add option to generate a sitemap (#59)

mortenpi · web-flow · commit 4babdb8dc034 · 2023-08-29T22:28:05.000+12:00
diff --git a/docs/make.jl b/docs/make.jl
@@ -79,6 +79,8 @@ MultiDocumenter.make(
         engine = MultiDocumenter.FlexSearch,
     ),
     rootpath = "/MultiDocumenter.jl/",
+    canonical_domain = "https://juliacomputing.github.io/",
+    sitemap = true,
 )
 
 if "deploy" in ARGS
diff --git a/src/MultiDocumenter.jl b/src/MultiDocumenter.jl
@@ -87,6 +87,7 @@ include("renderers.jl")
 include("search/flexsearch.jl")
 include("search/stork.jl")
 include("canonical.jl")
+include("sitemap.jl")
 
 const DEFAULT_ENGINE = SearchConfig(index_versions = ["stable", "dev"], engine = FlexSearch)
 
@@ -117,9 +118,13 @@ Aggregates multiple Documenter.jl-based documentation pages `docs` into `outdir`
 - `prettyurls` removes all `index.html` suffixes from links in the global navigation.
 - `rootpath` is the path your site ends up being deployed at, e.g. `/foo/` if it's hosted at `https://bar.com/foo`
 - `hide_previews` removes preview builds from the aggregated documentation.
-- `canonical`: if set to the root URL of the MultiDocumenter site, will check and, if necessary, update the
-  canonical URL tags for each package site to point to the directory. Similar to the `canonical` argument of
-  `Documenter.HTML` constructor.
+- `canonical_domain`: determines the scheme and authority (domain) of the deployed site (e.g.
+  `https://example.org`). If set, MultiDocumenter will check and, if necessary, update the canonical URL
+  tags for each package site to point to the correct directory. Similar to the `canonical` argument of the
+  `Documenter.HTML` constructor, except that it should not contain the path component -- that is determined from `rootpath`.
+- `sitemap`, if enabled, will generate a `sitemap.xml` file at the root of the output directory. Requires
+  `canonical_domain` to be set, since the sitemap is determined from canonical URLs.
+- `sitemap_filename` can be used to override the default sitemap filename (`sitemap.xml`)
 """
 function make(
     outdir,
@@ -132,18 +137,41 @@ function make(
     prettyurls = true,
     rootpath = "/",
     hide_previews = true,
-    canonical::Union{AbstractString,Nothing} = nothing,
+    canonical_domain::Union{AbstractString,Nothing} = nothing,
+    sitemap::Bool = false,
+    sitemap_filename::AbstractString = "sitemap.xml",
 )
+    if isnothing(canonical_domain)
+        (sitemap === true) &&
+            throw(ArgumentError("When sitemap=true, canonical_domain must also be set"))
+    else
+        !isnothing(canonical_domain)
+        if !startswith(canonical_domain, r"^https?://")
+            throw(ArgumentError("""
+            Invalid value for canonical_domain: $(canonical_domain)
+            Must start with http:// or https://"""))
+        end
+        # We'll strip any trailing /-s though, in case the user passed something like
+        # https://example.org/, because we want to concatenate the file paths with `/`
+        canonical_domain = rstrip(canonical_domain, '/')
+    end
+    # We'll normalize rootpath to have /-s at the beginning and at the end, so that we
+    # can assume that when concatenating this to other paths
+    if !startswith(rootpath, "/")
+        rootpath = string('/', rootpath)
+    end
+    if !endswith(rootpath, "/")
+        rootpath = string(rootpath, '/')
+    end
+    site_root_url = string(canonical_domain, rstrip(rootpath, '/'))
+
     maybe_clone(flatten_multidocrefs(docs))
 
-    if !isnothing(canonical)
-        canonical = rstrip(canonical, '/')
-    end
     dir = make_output_structure(
         flatten_multidocrefs(docs),
         prettyurls,
         hide_previews;
-        canonical,
+        canonical = site_root_url,
     )
     out_assets = joinpath(dir, "assets")
     if assets_dir !== nothing && isdir(assets_dir)
@@ -179,6 +207,14 @@ function make(
         )
     end
 
+    if sitemap
+        make_sitemap(;
+            sitemap_root = site_root_url,
+            sitemap_filename,
+            docs_root_directory = dir,
+        )
+    end
+
     cp(dir, outdir; force = true)
     rm(dir; force = true, recursive = true)
 
diff --git a/src/documentertools/canonical_urls.jl b/src/documentertools/canonical_urls.jl
@@ -135,8 +135,10 @@ end
 Parses the HTML file at `indexhtml_path` and tries to extract the `url=...` value
 of the redirect `<meta http-equiv="refresh" ...>` tag.
 """
-function get_meta_redirect_url(indexhtml_path::AbstractString)
-    html = Gumbo.parsehtml(read(indexhtml_path, String))
+get_meta_redirect_url(indexhtml_path::AbstractString) =
+    get_meta_redirect_url(Gumbo.parsehtml(read(indexhtml_path, String)))
+
+function get_meta_redirect_url(html::Gumbo.HTMLDocument)
     for e in AbstractTrees.PreOrderDFS(html.root)
         e isa Gumbo.HTMLElement || continue
         Gumbo.tag(e) == :meta || continue
diff --git a/src/sitemap.jl b/src/sitemap.jl
@@ -0,0 +1,129 @@
+# Note: Franklin.jl also implements sitemap generation:
+#
+#   https://github.com/tlienart/Franklin.jl/blob/f1f7d044dc95ba0d9f368a3d1afc233eb58a59cf/src/manager/sitemap_generator.jl#L51
+#
+# At some point it might be worth factoring the code into a small shared package.
+# Franklin's implementation is more general than this one, but it looks like it relies
+# on Franklin-specific globals, so it's not a trivial copy-paste into a separate package.
+
+# The sitemap spec limits the size of a sitemap, both in terms of the number of entries
+# and the total file size (https://www.sitemaps.org/protocol.html#index).
+# Maximum number of URLs allowed in a single sitemap file.
+const SITEMAP_URL_LIMIT = 50_000
+# Maximum size of a single sitemap file, in bytes (50 MiB).
+const SITEMAP_SIZE_LIMIT = 52_428_800
+# Human-readable summary of both limits; appended to size-related warnings and errors.
+const SITEMAP_LIMIT_MSG = "Sitemaps are limited to $(SITEMAP_URL_LIMIT) URLs and a maximum filesize of $(SITEMAP_SIZE_LIMIT) bytes."
+
+"""
+    SitemapTooLargeError(msg, value, limit)
+
+Exception thrown when a generated sitemap exceeds one of the sitemap size limits.
+
+- `msg`: short description of which limit was exceeded.
+- `value`: the measured quantity of the generated sitemap.
+- `limit`: the maximum allowed value for that quantity.
+"""
+struct SitemapTooLargeError <: Exception
+    msg::String
+    value::Int
+    limit::Int
+end
+
+# Prints a three-line message: the error name with `msg`, the limit versus the actual
+# value, and the general reminder about sitemap limits.
+function Base.showerror(io::IO, e::SitemapTooLargeError)
+    print(io, "SitemapTooLargeError: ", e.msg, "\n")
+    print(io, " limit is ", e.limit, ", but sitemap has ", e.value, "\n")
+    print(io, SITEMAP_LIMIT_MSG)
+    return nothing
+end
+"""
+    check_sitemap_size_limit(msg, value, limit)
+
+Compare `value` against `limit`. Throws a [`SitemapTooLargeError`](@ref) carrying `msg`
+when `value` exceeds `limit`, and only logs a warning when `value` exceeds the soft
+limit (80% of `limit`, computed with integer division). Always returns `nothing`
+otherwise.
+"""
+function check_sitemap_size_limit(msg::AbstractString, value::Integer, limit::Integer)
+    # Hard limit: refuse to produce a sitemap that violates the spec.
+    value > limit && throw(SitemapTooLargeError(msg, value, limit))
+    # Soft limits are 80% of the full limit
+    soft_limit = div(limit, 10) * 8
+    if value > soft_limit
+        @warn "Sitemap too large: $(msg) (> 80% soft limit)\n$(SITEMAP_LIMIT_MSG)"
+    end
+    return nothing
+end
+
+"""
+    make_sitemap(; sitemap_filename, sitemap_root, docs_root_directory)
+
+Generate a sitemap for the HTML files found under `docs_root_directory` and write it to
+`sitemap_filename` inside that directory.
+
+The sitemap entries are the canonical URLs found in the HTML files; only URLs that start
+with `sitemap_root` are included. If no URLs are found, an error is logged and no file
+is written.
+"""
+function make_sitemap(;
+    sitemap_filename::AbstractString,
+    sitemap_root::AbstractString,
+    docs_root_directory::AbstractString,
+)
+    # The list of sitemap URLs is derived from the canonical URLs of the built pages.
+    urls = find_sitemap_urls(; docs_root_directory, sitemap_root)
+    if isempty(urls)
+        @error "No sitemap URLs found"
+        return
+    end
+    # The sitemap file lives in the root of the output directory.
+    sitemap_path = joinpath(docs_root_directory, sitemap_filename)
+    return write(sitemap_path, make_sitemap_bytes(urls))
+end
+
+# Escapes the characters that the sitemap protocol requires to be entity-escaped in
+# <loc> values. `&` must be handled first, so that the entities introduced by the later
+# replacements are not escaped a second time.
+function escape_sitemap_loc(url::AbstractString)
+    escaped = replace(url, "&" => "&amp;")
+    escaped = replace(escaped, "<" => "&lt;")
+    escaped = replace(escaped, ">" => "&gt;")
+    escaped = replace(escaped, "\"" => "&quot;")
+    escaped = replace(escaped, "'" => "&apos;")
+    return escaped
+end
+
+"""
+    make_sitemap_bytes(sitemap_urls) -> Vector{UInt8}
+
+Render `sitemap_urls` as a sitemap XML document and return its bytes. The URLs are
+written in sorted order, one `<url><loc>...</loc></url>` entry per line, and are
+entity-escaped so that URLs containing characters such as `&` still produce valid XML.
+
+Throws a [`SitemapTooLargeError`](@ref) (via `check_sitemap_size_limit`) if the number of
+URLs exceeds `SITEMAP_URL_LIMIT` or the generated document exceeds `SITEMAP_SIZE_LIMIT`
+bytes.
+"""
+function make_sitemap_bytes(sitemap_urls)::Vector{UInt8}
+    # Sitemaps are limited to 50 000 URLs: https://www.sitemaps.org/protocol.html#index
+    # TODO: we could automatically split the sitemap up if it's bigger than that and
+    # generate a sitemap index.
+    check_sitemap_size_limit("too many URLs", length(sitemap_urls), SITEMAP_URL_LIMIT)
+    sitemap_buffer = IOBuffer()
+    write(
+        sitemap_buffer,
+        """
+        <?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+        """,
+    )
+    for loc in sort(sitemap_urls)
+        # The URL is data, not markup, so it must be escaped before being embedded.
+        write(sitemap_buffer, "<url><loc>$(escape_sitemap_loc(loc))</loc></url>\n")
+    end
+    write(sitemap_buffer, "</urlset>\n")
+    sitemap_bytes = take!(sitemap_buffer)
+    # The byte limit can only be checked once the whole document has been rendered.
+    check_sitemap_size_limit(
+        "sitemap too large (bytes)",
+        length(sitemap_bytes),
+        SITEMAP_SIZE_LIMIT,
+    )
+    return sitemap_bytes
+end
+
+"""
+    find_sitemap_urls(; docs_root_directory, sitemap_root) -> Vector{String}
+
+Collect the canonical URLs of the HTML files under `docs_root_directory` that should be
+listed in the sitemap.
+
+For every file visited by `DocumenterTools.walkdocs`, the `<link rel="canonical">` URL is
+extracted. Files without a canonical URL are skipped (with a warning, unless the file is
+a redirect page), and so are files whose canonical URL does not start with
+`sitemap_root`. The remaining URLs are normalized with `normalize_canonical_url` and
+returned without duplicates, in the order in which they were first encountered.
+"""
+function find_sitemap_urls(;
+    docs_root_directory::AbstractString,
+    sitemap_root::AbstractString,
+)
+    # Note: the URLs are taken from the canonical tags of the pages, not derived from the
+    # file paths, so no path separator normalization is necessary here.
+    canonical_urls = String[]
+    # Mirrors the contents of `canonical_urls`, so that the duplicate check stays cheap
+    # even for sites with tens of thousands of pages.
+    seen_urls = Set{String}()
+    DocumenterTools.walkdocs(docs_root_directory, DocumenterTools.isdochtml) do fileinfo
+        html = Gumbo.parsehtml(read(fileinfo.fullpath, String))
+        canonical_href = find_canonical_url(html; filepath = fileinfo.fullpath)
+        if isnothing(canonical_href)
+            # A common case for why the canonical URL would be missing is when it's a redirect
+            # HTML file, and that is fine. So we check for that and only warn if it is _not_
+            # a redirect file.
+            if DocumenterTools.get_meta_redirect_url(html) === nothing
+                @warn "Canonical URL missing: $(fileinfo.relpath)"
+            end
+            return
+        end
+        # Check that the canonical URL is correct.
+        # First, we check that the root part is actually what we expect it to be.
+        if !startswith(canonical_href, sitemap_root)
+            @warn "Invalid canonical URL, excluded from sitemap." canonical_href fileinfo.fullpath
+            return
+        end
+        # Let's make sure we're not adding duplicates, but first we must normalize the URL
+        canonical_href = normalize_canonical_url(canonical_href)
+        if !(canonical_href in seen_urls)
+            push!(seen_urls, canonical_href)
+            push!(canonical_urls, canonical_href)
+        end
+    end
+    return canonical_urls
+end
+
+# foo/bar/ and foo/bar/index.html are basically equivalent, so we normalize both forms of
+# the canonical URL to foo/bar/. URLs that end in neither `/` nor `/index.html` (such as
+# foo/bar or foo/bar.html) are returned unchanged.
+function normalize_canonical_url(url::AbstractString)
+    return replace(url, r"/(index\.html)?$" => "/")
+end
+
+# Walks the Gumbo-parsed DOM tree of `html` and returns the `href` value of the first
+# <link rel="canonical" ...> tag that has one, or `nothing` if there is none. Any further
+# canonical tags are reported with a warning (mentioning `filepath`) and ignored.
+function find_canonical_url(html::Gumbo.HTMLDocument; filepath::AbstractString)
+    canonical_href = nothing
+    for e in AbstractTrees.PreOrderDFS(html.root)
+        # Only <link rel="canonical"> elements are of interest.
+        is_canonical_link =
+            e isa Gumbo.HTMLElement &&
+            Gumbo.tag(e) == :link &&
+            Gumbo.getattr(e, "rel", nothing) == "canonical"
+        is_canonical_link || continue
+        if !isnothing(canonical_href)
+            @warn "Duplicate <link rel=\"canonical\" ...> tag. Ignoring." filepath e canonical_href
+            continue
+        end
+        canonical_href = Gumbo.getattr(e, "href", nothing)
+    end
+    return canonical_href
+end
diff --git a/test/runtests.jl b/test/runtests.jl

Original file line number	Diff line number	Diff line change
`@@ -79,6 +79,8 @@ MultiDocumenter.make(`
`79`	`79`	`engine = MultiDocumenter.FlexSearch,`
`80`	`80`	`),`
`81`	`81`	`rootpath = "/MultiDocumenter.jl/",`
	`82`	`+ canonical_domain = "https://juliacomputing.github.io/",`
	`83`	`+ sitemap = true,`
`82`	`84`	`)`
`83`	`85`
`84`	`86`	`if "deploy" in ARGS`