|
1 | 1 | module HTMLSanitizer
|
2 | 2 |
|
3 |
| -greet() = print("Hello World!") |
| 3 | +using Gumbo |
| 4 | + |
| 5 | +export sanitize |
| 6 | + |
"""
    sanitize(input::AbstractString; isfragment = true, whitelist = WHITELIST, prettyprint = false)

Parse `input` as HTML, sanitize the resulting tree against `whitelist` and return the
serialized result as a `String`.

# Keywords
- `isfragment`: when `true`, a leading `<HTML>` and a trailing `</HTML>` are stripped from
  the serialized output.
- `whitelist`: the elements, attributes and protocols that are allowed to remain.
- `prettyprint`: forwarded to the printer as `pretty`; selects the multiline output format
  instead of the compact one.
"""
function sanitize(input::AbstractString; isfragment = true, whitelist = WHITELIST, prettyprint = false)
    document = parsehtml(input)

    # The tree is sanitized in place before it is serialized.
    sanitize_bfs(document.root, whitelist)

    html = sprint(io -> print(io, document.root, pretty = prettyprint))

    # Full documents are returned exactly as printed.
    isfragment || return html

    # Fragments lose the wrapping root tags that the parser added.
    html = replace(html, r"^<HTML>" => "")
    return replace(html, r"</HTML>$" => "")
end
| 32 | + |
# Attach `node` to `parent` by overwriting its `parent` field.
function reparent!(node, parent)
    node.parent = parent
    return parent
end

# HTMLText isn't mutable, so its parent cannot be updated and this method is a no-op.
# Text nodes may therefore keep a stale parent reference.
function reparent!(node::HTMLText, parent)
    return nothing
end
| 37 | + |
# Sanitize every direct child of `tree` in place, then recurse into the resulting children.
#
# `sanitize_element` returns either a single replacement node or a vector of nodes that
# takes the place of the child (possibly empty, which removes the child).
function sanitize_bfs(tree, whitelist)
    pos = 1
    while pos <= length(tree.children)
        cleaned = sanitize_element(tree.children[pos], whitelist)

        if !(cleaned isa Vector)
            # One-for-one replacement of the child.
            reparent!(cleaned, tree)
            tree.children[pos] = cleaned
            pos += 1
            continue
        end

        # The child is replaced by zero or more nodes; adopt them and step past them,
        # since `sanitize_element` has already processed each one.
        reparent!.(cleaned, Ref(tree))
        splice!(tree.children, pos, cleaned)
        pos += length(cleaned)
    end

    return sanitize_bfs.(tree.children, Ref(whitelist))
end

# Text nodes have no children, so there is nothing to descend into.
function sanitize_bfs(tree::HTMLText, whitelist)
    return nothing
end
| 60 | + |
"""
    sanitize_element(el, whitelist)

Sanitize a single node and return its replacement.

- If the element's tag is listed under `:elements` in `whitelist`, its attributes are
  sanitized and the element itself is returned.
- If the tag is not allowed and is listed under `:remove_contents`, an empty text node is
  returned, dropping the element together with its contents.
- Otherwise the element is unwrapped: a flat `Vector` of its sanitized children is
  returned. The vector may be empty.

The root `HTML` element and text nodes are returned unchanged.
"""
function sanitize_element(el::HTMLElement{TAG}, whitelist) where TAG
    tag = string(TAG)

    @debug("Sanitizing `$(tag)`.")

    if !(tag in get(whitelist, :elements, []))
        @debug("Element `$(tag)` not in whitelist.")
        if tag in get(whitelist, :remove_contents, [])
            @debug("Removing contents for `$(tag)`.")
            return Gumbo.HTMLText("")
        end
        @debug("Replacing `$(tag)` with its contents.")

        # A child that is itself not whitelisted also comes back as a vector. Flatten those
        # results so the caller always receives a flat list of nodes; a nested vector
        # cannot be reparented or spliced into a children list.
        replacement = Gumbo.HTMLNode[]
        for child in el.children
            cleaned = sanitize_element(child, whitelist)
            if cleaned isa AbstractVector
                append!(replacement, cleaned)
            else
                push!(replacement, cleaned)
            end
        end
        return replacement
    end

    return sanitize_attributes(el, whitelist)
end

# The document root is always kept as is.
sanitize_element(el::HTMLElement{:HTML}, whitelist) = el

# Text nodes carry no tag or attributes, so they pass through untouched.
sanitize_element(el::HTMLText, whitelist) = el
| 84 | + |
# Matches a leading URL scheme. Capture group 1 is everything before the scheme delimiter.
# The delimiter may be a literal `:` or its decimal (`&#58`) / hexadecimal (`&#x3a`)
# character-reference spelling, with any number of leading zeros. The match is
# case-insensitive.
const REGEX_PROTOCOL = r"\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)"i
| 86 | + |
"""
    sanitize_attributes(el::HTMLElement, whitelist)

Remove every attribute of `el` that `whitelist` does not allow, and return `el`.

An attribute is kept only if it is listed for the element's tag or under `:ALL` in
`whitelist[:attributes]`. If `whitelist[:protocols]` has an entry for this tag and
attribute, the value must additionally satisfy one of the following:

- it specifies a scheme (as recognised by `REGEX_PROTOCOL`) that is exactly one of the
  allowed protocols, compared case-insensitively on the value side; or
- it specifies no scheme, `:relative` is among the allowed protocols and
  `is_relative_url` accepts it.

The attribute dictionary of `el` is modified in place.
"""
function sanitize_attributes(el::HTMLElement{TAG}, whitelist) where TAG
    tag = string(TAG)
    attributes = attrs(el)
    protocols = get(get(whitelist, :protocols, Dict()), tag, Dict())

    allowed_attributes = get(whitelist, :attributes, Dict())
    attributes_for_tag = get(allowed_attributes, tag, [])
    attributes_for_all = get(allowed_attributes, :ALL, [])

    # Iterate over a snapshot: entries are deleted from `attributes` inside the loop, and
    # a dictionary must not be mutated while it is being iterated.
    for (attr, val) in collect(attributes)
        if !(attr in attributes_for_tag) && !(attr in attributes_for_all)
            # not in whitelist, so remove the attribute altogether
            @debug("Deleting attribute `$(attr)` in element `$(tag)` (not in whitelist).")
            delete!(attributes, attr)
        elseif haskey(protocols, attr)
            # allowed, but only specific values are ok
            allowed_protocols = protocols[attr]
            m = match(REGEX_PROTOCOL, String(val))

            is_acceptable = if m === nothing
                # no scheme present: only acceptable as a relative URL
                :relative in allowed_protocols && is_relative_url(val)
            else
                # Compare the whole scheme rather than a prefix, so that `httpx:` is not
                # mistaken for `http:`. The `:relative` marker is not a real scheme and
                # must never match a `relative:` URL.
                scheme = lowercase(m[1])
                any(p -> p !== :relative && string(p) == scheme, allowed_protocols)
            end

            if !is_acceptable
                @debug("Deleting attribute `$(attr)` in element `$(tag)` (does not conform to protocol).")
                delete!(attributes, attr)
            end
        end
    end

    return el
end
| 124 | + |
# A URL counts as relative only when it begins with the explicit `./` prefix.
is_relative_url(url) = startswith(url, "./")
| 128 | + |
"""
Default whitelist used by `sanitize`.

- `:elements`: tags that are kept.
- `:remove_contents`: disallowed tags that are dropped together with their contents
  (`script`).
- `:attributes`: allowed attributes per tag, plus those allowed on every tag under `:ALL`.
  `style` is not among them, so `style` attributes are removed.
- `:protocols`: per tag and attribute, the URL schemes a value may use; `:relative`
  additionally permits relative URLs.
"""
const WHITELIST = Dict(
    :elements => [
        "h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8",
        "br", "b", "i", "strong", "em", "a", "pre", "code", "img", "tt",
        "div", "ins", "del", "sup", "sub", "p", "ol", "ul",
        "table", "thead", "tbody", "tfoot", "blockquote",
        "dl", "dt", "dd", "kbd", "q", "samp", "var", "hr",
        "ruby", "rt", "rp", "li", "tr", "td", "th", "s", "strike",
        "summary", "details", "caption", "figure", "figcaption",
        "abbr", "bdo", "cite", "dfn", "mark",
        "small", "span", "time", "wbr"
    ],
    :remove_contents => ["script"],
    :attributes => Dict(
        "a" => ["href"],
        "img" => ["src", "longdesc"],
        "div" => ["itemscope", "itemtype"],
        "blockquote" => ["cite"],
        "del" => ["cite"],
        "ins" => ["cite"],
        "q" => ["cite"],
        :ALL => [
            "abbr", "accept", "accept-charset", "accesskey", "action", "align", "alt",
            "aria-describedby", "aria-hidden", "aria-label", "aria-labelledby",
            "axis", "border", "cellpadding", "cellspacing", "char", "charoff", "charset",
            "checked", "clear", "cols", "colspan", "color", "compact", "coords",
            "datetime", "dir", "disabled", "enctype", "for", "frame",
            "headers", "height", "hreflang", "hspace", "ismap", "label", "lang",
            "maxlength", "media", "method", "multiple", "name", "nohref", "noshade",
            "nowrap", "open", "prompt", "readonly", "rel", "rev",
            "rows", "rowspan", "rules", "scope", "selected", "shape", "size", "span",
            "start", "summary", "tabindex", "target",
            "title", "type", "usemap", "valign", "value",
            "vspace", "width", "itemprop"
        ]
    ),
    :protocols => Dict(
        "a" => Dict(
            "href" => ["http", "https", "mailto", :relative]
        ),
        "img" => Dict(
            "src" => ["http", "https", :relative],
            "longdesc" => ["http", "https", :relative]
        ),
        "blockquote" => Dict(
            "cite" => ["http", "https", :relative]
        ),
        "del" => Dict(
            "cite" => ["http", "https", :relative]
        ),
        "ins" => Dict(
            "cite" => ["http", "https", :relative]
        ),
        "q" => Dict(
            "cite" => ["http", "https", :relative]
        ),
    )
)
| 182 | + |
"""
Restricted variant of `WHITELIST`. All other settings are taken over from `WHITELIST`, but
only a small set of element types is allowed.
"""
const LIMITED = merge(
    WHITELIST,
    Dict(
        :elements => [
            "b", "i", "strong", "em", "a", "pre", "code", "img", "ins",
            "del", "sup", "sub", "mark", "abbr", "p", "ol", "ul", "li"
        ]
    )
)
4 | 189 |
|
5 | 190 | end # module
|
0 commit comments