Skip to content

Commit 5e63bdc

Browse files
committed
initial functionality
1 parent 3bf0733 commit 5e63bdc

File tree

6 files changed

+414
-2
lines changed

6 files changed

+414
-2
lines changed

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ os:
55
- osx
66
julia:
77
- 1.0
8+
- 1.3
89
- nightly
910
matrix:
1011
allow_failures:

Project.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,12 @@ uuid = "9a15a9f4-ddd5-46ee-89fc-c219f813dd6f"
33
authors = ["Sebastian Pfitzner"]
44
version = "0.1.0"
55

6+
[deps]
7+
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
8+
69
[compat]
710
julia = "1"
11+
Gumbo = "0.5"
812

913
[extras]
1014
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,38 @@
11
# HTMLSanitizer
22

33
[![Build Status](https://travis-ci.com/pfitzseb/HTMLSanitizer.jl.svg?branch=master)](https://travis-ci.com/pfitzseb/HTMLSanitizer.jl)
4+
5+
Whitelist-based HTML sanitizer inspired by [sanitize](https://github.com/rgrove/sanitize/) and [html-pipeline](https://github.com/jch/html-pipeline/blob/13057c4dcde5e769dd116682f1bed7e65e920b40/lib/html/pipeline/sanitization_filter.rb).
6+
7+
HTMLSanitizer.jl parses your source HTML with [Gumbo.jl](https://github.com/JuliaWeb/Gumbo.jl) and then filters tags and attributes according to a whitelist. The default whitelists are fairly close to GitHub's pipeline for rendering markdown to HTML.
8+
9+
## Usage
10+
11+
```
12+
julia> sanitize("<a onclick='javascript:alert(0)'>YO DAWG</a>")
13+
"<a>YO DAWG</a>"
14+
```
15+
```
16+
julia> sanitize("""<img src="./foo.jpg" longdesc="javascript:alert(1)"></img>""")
17+
"<img src=\"./foo.jpg\"></img>"
18+
```
19+
```
20+
julia> whitelist = deepcopy(HTMLSanitizer.WHITELIST)
21+
Dict{Symbol,Any} with 4 entries:
22+
:protocols => Dict("del"=>Dict("cite"=>["http", "https", :relative]),"ins"=>D…
23+
:attributes => Dict{Any,Array{String,1}}("del"=>["cite"],"ins"=>["cite"],:ALL=…
24+
:elements => ["h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "br", "b" … …
25+
:remove_contents => ["script"]
26+
27+
julia> append!(whitelist[:elements], ["body", "head"]); # body and head are not allowed by default
28+
29+
julia> HTMLSanitizer.sanitize("""
30+
<html><head></head><body onload!#\$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>
31+
""", isfragment = false, whitelist = whitelist)
32+
"<HTML><head></head><body></body></HTML>"
33+
```
34+
35+
## Whitelists
36+
37+
Two whitelists are provided: `HTMLSanitizer.WHITELIST` and `HTMLSanitizer.LIMITED`. Check out the
38+
implementation if you want to know what exactly is whitelisted.

src/HTMLSanitizer.jl

Lines changed: 186 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,190 @@
11
module HTMLSanitizer
22

3-
greet() = print("Hello World!")
3+
using Gumbo
4+
5+
export sanitize
6+
7+
"""
    sanitize(input::AbstractString; isfragment = true, whitelist = WHITELIST, prettyprint = false)

Parse `input` as HTML, filter the resulting tree against `whitelist` and return the
sanitized markup as a `String`.

# Keywords
- `isfragment`: when `true`, the enclosing `<HTML>` tags are stripped from the output.
- `whitelist`: the allowed elements, attributes and protocols.
- `prettyprint`: when `true`, return a prettier multi-line string instead of the compact form.
"""
function sanitize(input::AbstractString; isfragment = true, whitelist = WHITELIST, prettyprint = false)
    document = parsehtml(input)
    sanitize_bfs(document.root, whitelist)

    buffer = IOBuffer()
    print(buffer, document.root, pretty = prettyprint)
    html = String(take!(buffer))

    # Full documents are returned exactly as printed.
    isfragment || return html

    # Fragments lose the root wrapper that surrounds the printed tree.
    without_opening = replace(html, r"^<HTML>" => "")
    return replace(without_opening, r"</HTML>$" => "")
end
32+
33+
# Make `parent` the parent of `node` by overwriting the node's `parent` field.
function reparent!(node, parent)
    return (node.parent = parent)
end

# `HTMLText` isn't mutable, so its parent link cannot be rewritten and this method is a no-op.
# Text nodes may therefore point at a stale parent after sanitizing, but ¯\_(ツ)_/¯.
function reparent!(node::HTMLText, parent)
    return nothing
end
37+
38+
# Text nodes have no children to walk, so there is nothing to do for them.
sanitize_bfs(tree::HTMLText, whitelist) = nothing

# Sanitize the direct children of `tree` in place, then recurse into each surviving child.
#
# `sanitize_element` returns either a single replacement node or a vector of nodes (the
# contents of an element that was removed). A vector is spliced into `tree.children` in
# place of the original child, and the cursor skips past the inserted nodes because they
# have already been through `sanitize_element`.
function sanitize_bfs(tree, whitelist)
    pos = 1
    while pos <= length(tree.children)
        replacement = sanitize_element(tree.children[pos], whitelist)

        if !(replacement isa Vector)
            # One-for-one replacement: adopt the node and move on.
            reparent!(replacement, tree)
            tree.children[pos] = replacement
            pos += 1
            continue
        end

        # The child was unwrapped: adopt all of its former contents.
        foreach(node -> reparent!(node, tree), replacement)
        splice!(tree.children, pos, replacement)
        pos += length(replacement)
    end
    sanitize_bfs.(tree.children, Ref(whitelist))
end
60+
61+
"""
    sanitize_element(el, whitelist)

Sanitize a single node and return its replacement.

For an `HTMLElement` the result is one of:
- the element itself, with its attributes filtered by `sanitize_attributes`, when its tag
  is listed under `whitelist[:elements]`;
- an empty `HTMLText` when the tag is not whitelisted and is listed under
  `whitelist[:remove_contents]`;
- otherwise a flat vector holding the sanitized children of the element, so that the
  caller can splice them in where the element used to be.

The returned vector is always flat: when a non-whitelisted element directly contains
another non-whitelisted element, the inner element's replacement vector is appended to
the outer one instead of being nested inside it. Nested vectors cannot be spliced into
an element's `children`.

The root `HTML` element and `HTMLText` nodes are returned unchanged.
"""
function sanitize_element(el::HTMLElement{TAG}, whitelist) where TAG
    tag = string(TAG)

    @debug("Sanitizing `$(tag)`.")

    if tag in get(whitelist, :elements, [])
        return sanitize_attributes(el, whitelist)
    end

    @debug("Element `$(tag)` not in whitelist.")
    if tag in get(whitelist, :remove_contents, [])
        @debug("Removing contents for `$(tag)`.")
        return Gumbo.HTMLText("")
    end

    @debug("Replacing `$(tag)` with its contents.")
    # Same container type as `el.children`, so the result can be spliced into a parent.
    replacement = similar(el.children, 0)
    for child in el.children
        sanitized = sanitize_element(child, whitelist)
        if sanitized isa AbstractVector
            # The child was unwrapped as well: keep the result flat.
            append!(replacement, sanitized)
        else
            push!(replacement, sanitized)
        end
    end
    return replacement
end

sanitize_element(el::HTMLElement{:HTML}, whitelist) = el

sanitize_element(el::HTMLText, whitelist) = el
84+
85+
# Detects a protocol (URL scheme) at the start of an attribute value: optional leading
# whitespace, then a lazily captured run of characters other than `/` and `#`, terminated by
# a literal `:` or by one of the numeric character references `&#0*58` / `&#x0*3a`.
# The match is case-insensitive; capture group 1 holds the text before the terminator.
const REGEX_PROTOCOL = r"\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)"i
86+
87+
"""
    sanitize_attributes(el::HTMLElement, whitelist)

Remove every attribute of `el` that the whitelist does not allow, and return `el`.

An attribute is kept only if its name is listed for the element's tag or under `:ALL` in
`whitelist[:attributes]`. If `whitelist[:protocols]` has an entry for this tag and
attribute, the value must also pass `has_allowed_protocol`.

The attribute dictionary of `el` is modified in place.
"""
function sanitize_attributes(el::HTMLElement{TAG}, whitelist) where TAG
    tag = string(TAG)
    attributes = attrs(el)
    protocols = get(get(whitelist, :protocols, Dict()), tag, Dict())

    allowed_attributes = get(whitelist, :attributes, Dict())
    attributes_for_tag = get(allowed_attributes, tag, [])
    attributes_for_all = get(allowed_attributes, :ALL, [])

    # Walk a snapshot of the keys so that deleting entries cannot disturb the iteration.
    for attr in collect(keys(attributes))
        if !(attr in attributes_for_tag) && !(attr in attributes_for_all)
            # not in whitelist, so remove the attribute altogether
            @debug("Deleting attribute `$(attr)` in element `$(tag)` (not in whitelist).")
            delete!(attributes, attr)
        elseif haskey(protocols, attr) && !has_allowed_protocol(attributes[attr], protocols[attr])
            # allowed, but only specific values are ok
            @debug("Deleting attribute `$(attr)` in element `$(tag)` (does not conform to protocol).")
            delete!(attributes, attr)
        end
    end

    return el
end

# Decide whether the attribute value `val` is acceptable given the `allowed` protocol list
# (strings naming protocols, optionally the symbol `:relative`).
#
# If `val` looks like it specifies a protocol, the captured protocol must be exactly equal
# (ignoring case) to one of the string entries. A prefix match is not enough, otherwise
# `httpfoo:` would be accepted wherever `http` is allowed. Values without a protocol are
# accepted only when `:relative` is allowed and `is_relative_url` agrees.
function has_allowed_protocol(val, allowed)
    m = match(REGEX_PROTOCOL, val)
    if m === nothing
        return :relative in allowed && is_relative_url(val)
    end

    protocol = lowercase(m.captures[1])
    # Only string entries name protocols; `:relative` is a marker, not a scheme.
    return any(p -> p isa AbstractString && p == protocol, allowed)
end
124+
125+
# Report whether `url` counts as a relative URL, which here means it begins with `./`.
is_relative_url(url) = startswith(url, "./")
128+
129+
"""
The default whitelist.

It permits a broad set of elements and attributes. `script` is not an allowed element and
is listed under `:remove_contents`, and `style` is not among the allowed attributes.

Keys:
- `:elements`: tags that are kept.
- `:remove_contents`: tags listed here for content removal.
- `:attributes`: allowed attribute names per tag, plus `:ALL` for every tag.
- `:protocols`: per tag and attribute, the protocols allowed in the value (`:relative`
  marks relative URLs as acceptable).
"""
const WHITELIST = let
    # Builds a fresh vector on every call so that no two entries share the same array.
    web_or_relative() = ["http", "https", :relative]

    elements = [
        "h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8",
        "br", "b", "i", "strong", "em", "a", "pre", "code", "img", "tt",
        "div", "ins", "del", "sup", "sub", "p", "ol", "ul",
        "table", "thead", "tbody", "tfoot", "blockquote",
        "dl", "dt", "dd", "kbd", "q", "samp", "var", "hr",
        "ruby", "rt", "rp", "li", "tr", "td", "th", "s", "strike",
        "summary", "details", "caption", "figure", "figcaption",
        "abbr", "bdo", "cite", "dfn", "mark",
        "small", "span", "time", "wbr",
    ]

    attributes = Dict(
        "a" => ["href"],
        "img" => ["src", "longdesc"],
        "div" => ["itemscope", "itemtype"],
        "blockquote" => ["cite"],
        "del" => ["cite"],
        "ins" => ["cite"],
        "q" => ["cite"],
        :ALL => [
            "abbr", "accept", "accept-charset", "accesskey", "action", "align", "alt",
            "aria-describedby", "aria-hidden", "aria-label", "aria-labelledby",
            "axis", "border", "cellpadding", "cellspacing", "char", "charoff", "charset",
            "checked", "clear", "cols", "colspan", "color", "compact", "coords",
            "datetime", "dir", "disabled", "enctype", "for", "frame",
            "headers", "height", "hreflang", "hspace", "ismap", "label", "lang",
            "maxlength", "media", "method", "multiple", "name", "nohref", "noshade",
            "nowrap", "open", "prompt", "readonly", "rel", "rev",
            "rows", "rowspan", "rules", "scope", "selected", "shape", "size", "span",
            "start", "summary", "tabindex", "target",
            "title", "type", "usemap", "valign", "value",
            "vspace", "width", "itemprop",
        ],
    )

    protocols = Dict(
        "a" => Dict("href" => ["http", "https", "mailto", :relative]),
        "img" => Dict(
            "src" => web_or_relative(),
            "longdesc" => web_or_relative(),
        ),
        "blockquote" => Dict("cite" => web_or_relative()),
        "del" => Dict("cite" => web_or_relative()),
        "ins" => Dict("cite" => web_or_relative()),
        "q" => Dict("cite" => web_or_relative()),
    )

    Dict(
        :elements => elements,
        :remove_contents => ["script"],
        :attributes => attributes,
        :protocols => protocols,
    )
end
182+
183+
"""
A stricter variant of the default whitelist: every setting is taken from `WHITELIST` except
`:elements`, which is replaced by a much shorter list of allowed element types.
"""
const LIMITED = let limited = copy(WHITELIST)
    limited[:elements] = [
        "b", "i", "strong", "em", "a", "pre", "code", "img", "ins", "del",
        "sup", "sub", "mark", "abbr", "p", "ol", "ul", "li",
    ]
    limited
end
4189

5190
end # module

test/malicious_html.jl

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Regression tests for known HTML/JS injection vectors, ported from
# https://github.com/rgrove/sanitize/blob/master/test/test_malicious_html.rb
@testset "malicious html" begin
    @testset "comments" begin
        @testset "should not allow script injection via conditional comments" begin
            input = """<!--[if gte IE 4]>\n<script>alert('XSS');</script>\n<![endif]-->"""
            @test HTMLSanitizer.sanitize(input) == ""
        end
    end

    @testset "<body>" begin
        @testset "should not be possible to inject JS via a malformed event attribute" begin
            # `body` and `head` are not whitelisted by default, so allow them for this test.
            whitelist = deepcopy(HTMLSanitizer.WHITELIST)
            append!(whitelist[:elements], ["body", "head"])

            input = """<html><head></head><body onload!#\$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>"""
            output = HTMLSanitizer.sanitize(input, isfragment = false, whitelist = whitelist)
            @test output == """<HTML><head></head><body></body></HTML>"""
        end
    end

    @testset "<iframe>" begin
        @testset "should not be possible to inject an iframe using an improperly closed tag" begin
            input = """<iframe src=http://ha.ckers.org/scriptlet.html <"""
            @test HTMLSanitizer.sanitize(input) == ""
        end
    end

    @testset "<img>" begin
        @testset "should not be possible to inject JS via an unquoted <img> src attribute" begin
            input = """<img src=javascript:alert('XSS')>"""
            @test HTMLSanitizer.sanitize(input) == "<img></img>"
        end

        @testset "should not be possible to inject JS using grave accents as <img> src delimiters" begin
            input = """<img src=`javascript:alert('XSS')`>"""
            @test HTMLSanitizer.sanitize(input) == "<img></img>"
        end

        @testset "should not be possible to inject <script> via a malformed <img> tag" begin
            input = """<img \"\"\"><script>alert("XSS")</script>">"""
            @test HTMLSanitizer.sanitize(input) == """<img></img>">"""
        end

        @testset "should not be possible to inject protocol-based JS" begin
            payloads = [
                # Decimal character references, terminated with semicolons.
                """<img src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>""",
                # Zero-padded decimal character references without semicolons.
                """<img src=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>""",
                # Hexadecimal character references without semicolons.
                """<img src=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>""",
                # Encoded tab character.
                """<img src="jav&#x09;ascript:alert('XSS');">""",
                # Encoded newline.
                """<img src="jav&#x0A;ascript:alert('XSS');">""",
                # Encoded carriage return.
                """<img src="jav&#x0D;ascript:alert('XSS');">""",
                # Spaces plus meta char.
                """<img src=" &#14; javascript:alert('XSS');">""",
                # Mixed spaces and tabs.
                """<img src="j\na v\tascript://alert('XSS');">""",
            ]

            for payload in payloads
                @test HTMLSanitizer.sanitize(payload) == "<img></img>"
            end
        end

        @testset "should not be possible to inject protocol-based JS via whitespace" begin
            input = """<img src="jav\tascript:alert('XSS');">"""
            @test HTMLSanitizer.sanitize(input) == "<img></img>"
        end

        @testset "should not be possible to inject JS using a half-open <img> tag" begin
            input = """<img src="javascript:alert('XSS')" """
            @test HTMLSanitizer.sanitize(input) == ""
        end
    end
end

0 commit comments

Comments
 (0)