|
| 1 | +# Note: Franklin.jl also implements sitemap generation: |
| 2 | +# |
| 3 | +# https://github.com/tlienart/Franklin.jl/blob/f1f7d044dc95ba0d9f368a3d1afc233eb58a59cf/src/manager/sitemap_generator.jl#L51 |
| 4 | +# |
| 5 | +# At some point it might be worth factoring the code into a small shared package. |
| 6 | +# Franklin's implementation is more general than this one, but it looks like it relies |
| 7 | +# on Franklin-specific globals, so it's not a trivial copy-paste into a separate package. |
| 8 | + |
# The sitemap spec limits the size of a single sitemap file, both in terms of the number
# of entries and the total file size (https://www.sitemaps.org/protocol.html#index).
const SITEMAP_URL_LIMIT = 50_000
# Maximum uncompressed sitemap size in bytes (50 * 1024 * 1024).
const SITEMAP_SIZE_LIMIT = 52_428_800
# Explanatory text appended to warnings and errors raised when a limit is (nearly) exceeded.
const SITEMAP_LIMIT_MSG = "Sitemaps are limited to $(SITEMAP_URL_LIMIT) URLs and a maximum filesize of $(SITEMAP_SIZE_LIMIT) bytes."
| 14 | + |
"""
    SitemapTooLargeError(msg, value, limit)

Exception describing a sitemap that exceeds one of the size limits.

# Fields
- `msg`: short description of which limit was exceeded.
- `value`: the measured quantity for the sitemap.
- `limit`: the maximum that `value` was compared against.
"""
struct SitemapTooLargeError <: Exception
    msg::String
    value::Int
    limit::Int
end

# Render the error as two descriptive lines followed by the general limits reminder.
function Base.showerror(io::IO, e::SitemapTooLargeError)
    header = "SitemapTooLargeError: $(e.msg)"
    detail = " limit is $(e.limit), but sitemap has $(e.value)"
    foreach(line -> println(io, line), (header, detail))
    # The last line is printed without a trailing newline.
    print(io, SITEMAP_LIMIT_MSG)
    return nothing
end
"""
    check_sitemap_size_limit(msg, value, limit)

Compare `value` against `limit` and react according to how close it is.

- `value > limit`: throw a `SitemapTooLargeError` carrying `msg`, `value` and `limit`.
- `value` above 80% of `limit` (the soft limit): emit a warning that includes `msg`.
- otherwise: do nothing.

Returns `nothing` whenever no error is thrown.
"""
function check_sitemap_size_limit(msg::AbstractString, value::Integer, limit::Integer)
    # Multiply before dividing so that the soft limit is the floor of 80% of `limit`
    # for every limit, not only for limits that are multiples of 10. Dividing first
    # would collapse the soft limit to zero for limits below 10.
    soft_limit = div(8 * limit, 10)
    if value > limit
        throw(SitemapTooLargeError(msg, value, limit))
    elseif value > soft_limit
        @warn "Sitemap too large: $(msg) (> 80% soft limit)\n$(SITEMAP_LIMIT_MSG)"
    end
    return nothing
end
| 34 | + |
"""
    make_sitemap(; sitemap_filename, sitemap_root, docs_root_directory)

Generate a sitemap for the HTML files under `docs_root_directory` and write it to
`joinpath(docs_root_directory, sitemap_filename)`.

The URLs are collected with `find_sitemap_urls` and serialized with
`make_sitemap_bytes`. If no URLs are found, an error is logged, nothing is written and
`nothing` is returned; otherwise the result of `write` is returned.
"""
function make_sitemap(;
    sitemap_filename::AbstractString,
    sitemap_root::AbstractString,
    docs_root_directory::AbstractString,
)
    # Collect every canonical URL that should appear in the sitemap.
    sitemap_urls = find_sitemap_urls(; docs_root_directory, sitemap_root)
    if isempty(sitemap_urls)
        @error "No sitemap URLs found"
        return
    end
    # Serialize first, then write the sitemap file into the output directory.
    sitemap_bytes = make_sitemap_bytes(sitemap_urls)
    output_path = joinpath(docs_root_directory, sitemap_filename)
    return write(output_path, sitemap_bytes)
end
| 50 | + |
# Entity-escape the five characters that the sitemap protocol requires to be escaped in
# data values. `&` is replaced first so that the `&` of the other entities is not escaped
# a second time.
function _sitemap_xml_escape(text::AbstractString)
    escaped = replace(text, "&" => "&amp;")
    escaped = replace(escaped, "<" => "&lt;")
    escaped = replace(escaped, ">" => "&gt;")
    escaped = replace(escaped, "\"" => "&quot;")
    escaped = replace(escaped, "'" => "&apos;")
    return escaped
end

"""
    make_sitemap_bytes(sitemap_urls) -> Vector{UInt8}

Serialize `sitemap_urls` into the bytes of a sitemap XML document.

The URLs are written in sorted order, one `<url><loc>...</loc></url>` element per line,
and each URL is XML entity-escaped. Both the number of URLs and the size of the generated
document are checked with `check_sitemap_size_limit`, which throws a
`SitemapTooLargeError` when a limit is exceeded and warns when it is nearly reached.
"""
function make_sitemap_bytes(sitemap_urls)::Vector{UInt8}
    # Sitemaps are limited to 50 000 URLs: https://www.sitemaps.org/protocol.html#index
    # TODO: we could automatically split the sitemap up if it's bigger than that and
    # generate a sitemap index.
    check_sitemap_size_limit("too many URLs", length(sitemap_urls), SITEMAP_URL_LIMIT)
    sitemap_buffer = IOBuffer()
    write(
        sitemap_buffer,
        """
        <?xml version="1.0" encoding="UTF-8"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        """,
    )
    for loc in sort(sitemap_urls)
        # A raw `&` (e.g. from a query string) or `<` would make the document malformed.
        write(sitemap_buffer, "<url><loc>$(_sitemap_xml_escape(string(loc)))</loc></url>\n")
    end
    write(sitemap_buffer, "</urlset>\n")
    sitemap_bytes = take!(sitemap_buffer)
    check_sitemap_size_limit(
        "sitemap too large (bytes)",
        length(sitemap_bytes),
        SITEMAP_SIZE_LIMIT,
    )
    return sitemap_bytes
end
| 76 | + |
"""
    find_sitemap_urls(; docs_root_directory, sitemap_root) -> Vector{String}

Collect the canonical URLs of the HTML files under `docs_root_directory`.

Each file visited by `DocumenterTools.walkdocs` is parsed and its canonical URL is looked
up with `find_canonical_url`. A file is skipped when:

- it has no canonical URL (a warning is logged unless
  `DocumenterTools.get_meta_redirect_url` returns something other than `nothing`);
- its canonical URL does not start with `sitemap_root` (a warning is logged).

The remaining URLs are normalized with `normalize_canonical_url` and returned without
duplicates, in the order in which they were first encountered.
"""
function find_sitemap_urls(;
    docs_root_directory::AbstractString,
    sitemap_root::AbstractString,
)
    canonical_urls = String[]
    # Mirrors `canonical_urls` so that the duplicate check does not have to scan the
    # whole vector for every file (sitemaps may contain tens of thousands of URLs).
    seen_urls = Set{String}()
    DocumenterTools.walkdocs(docs_root_directory, DocumenterTools.isdochtml) do fileinfo
        html = Gumbo.parsehtml(read(fileinfo.fullpath, String))
        canonical_href = find_canonical_url(html; filepath = fileinfo.fullpath)
        if isnothing(canonical_href)
            # A common case for why the canonical URL would be missing is when it's a
            # redirect HTML file, and that is fine. So we check for that and only warn
            # if it is _not_ a redirect file.
            if DocumenterTools.get_meta_redirect_url(html) === nothing
                @warn "Canonical URL missing: $(fileinfo.relpath)"
            end
            return
        end
        # Check that the canonical URL is correct.
        # First, we check that the root part is actually what we expect it to be.
        if !startswith(canonical_href, sitemap_root)
            @warn "Invalid canonical URL, excluded from sitemap." canonical_href fileinfo.fullpath
            return
        end
        # Let's make sure we're not adding duplicates, but first we must normalize the URL
        canonical_href = normalize_canonical_url(canonical_href)
        if !(canonical_href in seen_urls)
            push!(seen_urls, canonical_href)
            push!(canonical_urls, canonical_href)
        end
    end
    return canonical_urls
end
| 110 | + |
"""
    normalize_canonical_url(url)

Normalize the end of a canonical URL so that equivalent directory-style URLs compare equal.

`foo/bar/` and `foo/bar/index.html` refer to the same page, so a trailing `/index.html`
is replaced by a single `/`. A URL that ends in neither `/` nor `/index.html` (for example
`foo/bar` or `foo/bar/page.html`) is returned unchanged.
"""
function normalize_canonical_url(url::AbstractString)
    # Matches a final slash, optionally followed by `index.html`, at the end of the URL.
    trailing_index = r"/(index\.html)?$"
    return replace(url, trailing_index => "/")
end
| 114 | + |
"""
    find_canonical_url(html::Gumbo.HTMLDocument; filepath)

Walk the parsed DOM tree of `html` in pre-order and return the `href` attribute of its
`<link rel="canonical">` element, or `nothing` if no such value is found.

Once an `href` has been picked up, every further canonical `<link>` element is ignored
and reported with a warning. `filepath` is only used in that warning.
"""
function find_canonical_url(html::Gumbo.HTMLDocument; filepath::AbstractString)
    canonical_href = nothing
    for e in AbstractTrees.PreOrderDFS(html.root)
        # Only `<link rel="canonical" ...>` elements are of interest.
        is_canonical_link =
            e isa Gumbo.HTMLElement &&
            Gumbo.tag(e) == :link &&
            Gumbo.getattr(e, "rel", nothing) == "canonical"
        is_canonical_link || continue
        if canonical_href === nothing
            canonical_href = Gumbo.getattr(e, "href", nothing)
            continue
        end
        @warn "Duplicate <link rel=\"canonical\" ...> tag. Ignoring." filepath e canonical_href
    end
    return canonical_href
end
0 commit comments