Skip to content

Commit e4890d8

Browse files
authored
Merge pull request #3 from JuliaComputing/sp/upgrade
upgrade gumbo
2 parents ae54c61 + c9b54bc commit e4890d8

File tree

5 files changed

+137
-20
lines changed

5 files changed

+137
-20
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
88

99
[compat]
1010
julia = "1"
11-
Gumbo = "0.5"
11+
Gumbo = "0.7"
1212

1313
[extras]
1414
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

src/HTMLSanitizer.jl

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,13 @@ Sanitizes the HTML input according to `whitelist`.
1414
- `prettyprint`: Returns a prettier multiline string instead of a somewhat minified version.
1515
"""
1616
function sanitize(input::AbstractString; isfragment = true, whitelist = WHITELIST, prettyprint = false)
17-
input_preserve_ws = replace(input, r"(\s+)"s => s" 🐑\1🐑 ")
18-
doc = parsehtml(input_preserve_ws)
19-
17+
doc = parsehtml(input, preserve_whitespace=true)
2018
sanitize_bfs(doc.root, whitelist)
2119

2220
out = IOBuffer()
2321
print(out, doc.root, pretty = prettyprint)
2422

2523
out = String(take!(out))
26-
out = replace(out, r"\s?🐑(\s+)🐑\s?"s => s"\1")
2724

2825
if isfragment
2926
out = replace(out, r"^<HTML>" => "")
@@ -33,12 +30,7 @@ function sanitize(input::AbstractString; isfragment = true, whitelist = WHITELIS
3330
end
3431
end
3532

36-
reparent!(_, _) = nothing
37-
38-
reparent!(node::HTMLElement, parent) = node.parent = parent
39-
40-
# HTMLText isn't mutable, so this does nothing. Will lead to inconsistencies, but ¯\_(ツ)_/¯.
41-
reparent!(node::HTMLText, parent) = nothing
33+
"""
    reparent!(node, parent)

Assign `parent` to the `parent` field of `node` and return `parent`.
"""
function reparent!(node, parent)
    return node.parent = parent
end
4234

4335
function sanitize_bfs(tree, whitelist)
4436
i = 1
@@ -50,7 +42,7 @@ function sanitize_bfs(tree, whitelist)
5042
# reparent all nodes
5143
reparent!.(sanitized, Ref(tree))
5244
splice!(tree.children, i, sanitized)
53-
i += length(sanitized)
45+
# don't increment i here so the newly inserted nodes are sanitized in the next iteration
5446
else
5547
# reparent node
5648
reparent!(sanitized, tree)
@@ -75,7 +67,7 @@ function sanitize_element(el::HTMLElement{TAG}, whitelist) where TAG
7567
return Gumbo.HTMLText("")
7668
end
7769
@debug("Replacing `$(tag)` with its contents.")
78-
out = sanitize_element.(el.children, Ref(whitelist))
70+
out = el.children
7971
return isempty(out) ? Gumbo.HTMLText("") : out
8072
end
8173

@@ -90,6 +82,8 @@ sanitize_element(el::HTMLText, whitelist) = el
9082

9183
# Detects a leading URL scheme. Anchored at the start of the string (`\A`), it skips optional
# whitespace, lazily captures a run of characters that excludes `/` and `#`, and then requires
# a scheme separator: a literal `:` or its numeric character references `&#58` (decimal) or
# `&#x3a` (hex), each with any number of leading zeros. The `i` flag makes it case-insensitive.
const REGEX_PROTOCOL = r"\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)"i
9284

85+
"""
    sanitize_attributes(el, whitelist)

Generic fallback method: return `el` unchanged. `whitelist` is ignored. The more specific
`HTMLElement` method takes precedence for element nodes.
"""
function sanitize_attributes(el, whitelist)
    return el
end
86+
9387
function sanitize_attributes(el::HTMLElement{TAG}, whitelist) where TAG
9488
tag = string(TAG)
9589
attributes = attrs(el)
@@ -128,9 +122,12 @@ function sanitize_attributes(el::HTMLElement{TAG}, whitelist) where TAG
128122
return el
129123
end
130124

131-
function is_relative_url(url)
132-
startswith(url, "./")
133-
end
125+
# A relative URL either
126+
# 1. starts with `/` (root-relative).
127+
# 2. starts with `//` (protocol-relative).
128+
# 3. starts with `../`/`./` (relative directory traversal)
129+
# 4. doesn't start with any of the above and doesn't start with a protocol (e.g. `foo/bar.html`)
130+
"""
    is_relative_url(url)

Return `true` if `url` is a relative reference, i.e. it either

1. starts with `/` (root-relative),
2. starts with `//` (protocol-relative),
3. starts with `./` or `../` (relative directory traversal), or
4. does not start with a `scheme://` prefix (e.g. `foo/bar.html`).

Both patterns are anchored at the start of the string. Without the anchor, a `/` appearing
anywhere in the URL (as in `asd://foo/bar.html`) would make an absolute URL look relative.
"""
is_relative_url(url) = occursin(r"^\.{0,2}/", url) || !occursin(r"^\w+://", url)
134131

135132
"""
136133
Default whitelist. Allows many elements and attributes, but crucially removes `<script>` elements
@@ -142,7 +139,7 @@ const WHITELIST = Dict(
142139
"div","ins","del","sup","sub","p","ol","ul","table","thead","tbody","tfoot","blockquote",
143140
"dl","dt","dd","kbd","q","samp","var","hr","ruby","rt","rp","li","tr","td","th","s","strike",
144141
"summary","details","caption","figure","figcaption","abbr","bdo","cite","dfn","mark",
145-
"small","span","time","wbr"
142+
"small","span","time","wbr","center"
146143
],
147144
:remove_contents => ["script"],
148145
:attributes => Dict(

test/runtests.jl

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ using Test
4747
end
4848

4949
@testset "test_whitelisted_longdesc_schemes_are_allowed" begin
50-
stuff = """<img longdesc="http://longdesc.com"src="./foo.jpg"></img>"""
50+
stuff = """<img longdesc="http://longdesc.com" src="./foo.jpg"></img>"""
5151
html = HTMLSanitizer.sanitize(stuff)
5252
@test stuff == html
5353
end
@@ -72,7 +72,12 @@ using Test
7272
@test stuff == html
7373
end
7474

75-
@testset "test_script_contents_are_removed" begin
75+
@testset "test_script_contents_are_removed1" begin
76+
orig = """<div><script>JavaScript!</script></div>"""
77+
@test "<div></div>" == HTMLSanitizer.sanitize(orig)
78+
end
79+
80+
@testset "test_script_contents_are_removed2" begin
7681
orig = """<script>JavaScript!</script>"""
7782
@test "" == HTMLSanitizer.sanitize(orig)
7883
end
@@ -142,8 +147,43 @@ end
142147
</body>
143148
</html>
144149
"""
145-
expected = "<HTML>\n\n \n \n \n \n <p>A simple test page.</p>\n <a></a>\n <a></a>\n <pre>\n <code>\nfoo\nbar\nbaz\n </code>\n </pre>\n \n\n</HTML>"
150+
expected = "<HTML>\n \n \n \n <p>A simple test page.</p>\n <a></a>\n <a></a>\n <pre> <code>\nfoo\nbar\nbaz\n </code>\n </pre>\n \n\n</HTML>"
146151
@test sanitize(orig, isfragment=false) == expected
147152
end
148153

154+
@testset "urls" begin
155+
@testset "relative" begin
156+
orig = """<img src="foo/bar.html"></img>"""
157+
@test sanitize(orig) == orig
158+
159+
orig = """<img src="/foo/bar.html"></img>"""
160+
@test sanitize(orig) == orig
161+
162+
orig = """<img src="//foo/bar.html"></img>"""
163+
@test sanitize(orig) == orig
164+
165+
orig = """<img src="./foo/bar.html"></img>"""
166+
@test sanitize(orig) == orig
167+
168+
orig = """<img src="/asd://foo/bar.html"></img>"""
169+
@test sanitize(orig) == orig
170+
end
171+
172+
@testset "protocols" begin
173+
orig = """<img src="asd://foo/bar.html"></img>"""
174+
@test sanitize(orig) == "<img></img>"
175+
176+
orig = """<img src="http://foo/bar.html"></img>"""
177+
@test sanitize(orig) == orig
178+
179+
orig = """<img src="https://foo/bar.html"></img>"""
180+
@test sanitize(orig) == orig
181+
end
182+
end
183+
184+
@testset "edge case" begin
185+
html = read(joinpath(@__DIR__, "testhtml.html"), String)
186+
# Compare against the stored fixture; without `@test` the comparison result is discarded
# and the testset would pass no matter what `sanitize` returns.
@test sanitize(html) == read(joinpath(@__DIR__, "testhtml_out.html"), String)
187+
end
188+
149189
include("malicious_html.jl")

test/testhtml.html

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<h1>RiemannHilbert.jl</h1>
2+
<p>A Julia package for solving Riemann–Hilbert problems</p>
3+
<p><a href="https://travis-ci.org/JuliaHolomorphic/RiemannHilbert.jl"><img src="https://travis-ci.org/JuliaHolomorphic/RiemannHilbert.jl.svg?branch=master" alt="Build Status" /></a>
4+
<a href="https://codecov.io/gh/JuliaHolomorphic/RiemannHilbert.jl"><img src="https://codecov.io/gh/JuliaHolomorphic/RiemannHilbert.jl/branch/master/graph/badge.svg" alt="codecov" /></a>
5+
<a href="https://gitter.im/JuliaApproximation/ApproxFun.jl?utm_source=badge&amp;utm_medium=badge&amp;utm_campaign=pr-badge&amp;utm_content=badge"><img src="https://badges.gitter.im/JuliaApproximation/ApproxFun.jl.svg" alt="Join the chat at https://gitter.im/JuliaApproximation/ApproxFun.jl" /></a></p>
6+
<center>
7+
<img src="images/sixrays.jpg" height="250" alt=".">
8+
</center>
9+
<p>A Riemann–Hilbert problem is a certain type of boundary value problem in the complex plane where an analytic function has prescribed jumps.
10+
They arise in integrable systems, random matrices, spectral analysis, orthogonal polynomials, and elsewhere. This package implements
11+
the numerical method of [Olver 2011, Olver 2012] (see also review in [Trogodon &amp; Olver 2015]) for solving Riemann–Hilbert problems, and is very much related to <a href="https://github.com/dlfivefifty/RHPackage">RHPackage</a>.</p>
12+
<p>For an example, the following calculates the Hastings–McLeod solution to Painlev'e II at the origin,
13+
which is posed on 4 rays:</p>
14+
<pre><code class="language-julia"># Define the contour
15+
Γ = Segment(0, 2.5exp(im*π/6)) ∪ Segment(0, 2.5exp(5im*π/6)) ∪
16+
Segment(0, 2.5exp(-5im*π/6)) ∪ Segment(0, 2.5exp(-im*π/6))
17+
18+
# Defe the jump function
19+
G = Fun( z -&gt; if angle(z) ≈ π/6
20+
[1 0; im*exp(8im/3*z^3) 1]
21+
elseif angle(z) ≈ 5π/6
22+
[1 0; -im*exp(8im/3*z^3) 1]
23+
elseif angle(z) ≈ -π/6
24+
[1 im*exp(-8im/3*z^3); 0 1]
25+
elseif angle(z) ≈ -5π/6
26+
[1 -im*exp(-8im/3*z^3); 0 1]
27+
end, Γ)
28+
29+
# Solve the Riemann–Hilbert problem. We transpose to recast a left
30+
# Riemann–Hilbert problem as a left one.
31+
Φ = transpose(rhsolve(transpose(G), 4*200)) # use 200 collocation points per ray
32+
z = Fun(ℂ) # The function z in the complex plane
33+
2(z*Φ[1,2])(Inf) # Evaluate 2lim_{z -&gt; ∞} zΦ(z)_{1,2}
34+
</code></pre>
35+
<h1>References</h1>
36+
<ol>
37+
<li>T. Trogdon &amp; S. Olver (2015), <a href="http://bookstore.siam.org/ot146/">Riemann–Hilbert Problems, Their Numerical Solution and the Computation of Nonlinear Special Functions</a>, SIAM.</li>
38+
<li>S. Olver (2012), <a href="https://link.springer.com/article/10.1007/s00211-012-0459-7">A general framework for solving Riemann–Hilbert problems numerically</a>, Numer. Math., 122: 305–340.</li>
39+
<li>S. Olver (2011), <a href="https://link.springer.com/article/10.1007/s10208-010-9079-8">Numerical solution of Riemann–Hilbert problems: Painlevé II</a>, Found. Comput. Maths, 11: 153–179.</li>
40+
</ol>

test/testhtml_out.html

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<h1>RiemannHilbert.jl</h1>
2+
<p>A Julia package for solving Riemann–Hilbert problems</p>
3+
<p><a href="https://travis-ci.org/JuliaHolomorphic/RiemannHilbert.jl"><img alt="Build Status" src="https://travis-ci.org/JuliaHolomorphic/RiemannHilbert.jl.svg?branch=master"></img></a>
4+
<a href="https://codecov.io/gh/JuliaHolomorphic/RiemannHilbert.jl"><img alt="codecov" src="https://codecov.io/gh/JuliaHolomorphic/RiemannHilbert.jl/branch/master/graph/badge.svg"></img></a>
5+
<a href="https://gitter.im/JuliaApproximation/ApproxFun.jl?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge"><img alt="Join the chat at https://gitter.im/JuliaApproximation/ApproxFun.jl" src="https://badges.gitter.im/JuliaApproximation/ApproxFun.jl.svg"></img></a></p>
6+
<center>
7+
<img alt="." height="250" src="images/sixrays.jpg"></img>
8+
</center>
9+
<p>A Riemann–Hilbert problem is a certain type of boundary value problem in the complex plane where an analytic function has prescribed jumps.
10+
They arise in integrable systems, random matrices, spectral analysis, orthogonal polynomials, and elsewhere. This package implements
11+
the numerical method of [Olver 2011, Olver 2012] (see also review in [Trogodon & Olver 2015]) for solving Riemann–Hilbert problems, and is very much related to <a href="https://github.com/dlfivefifty/RHPackage">RHPackage</a>.</p>
12+
<p>For an example, the following calculates the Hastings–McLeod solution to Painlev'e II at the origin,
13+
which is posed on 4 rays:</p>
14+
<pre><code># Define the contour
15+
Γ = Segment(0, 2.5exp(im*π/6)) ∪ Segment(0, 2.5exp(5im*π/6)) ∪
16+
Segment(0, 2.5exp(-5im*π/6)) ∪ Segment(0, 2.5exp(-im*π/6))
17+
18+
# Defe the jump function
19+
G = Fun( z -> if angle(z) ≈ π/6
20+
[1 0; im*exp(8im/3*z^3) 1]
21+
elseif angle(z) ≈ 5π/6
22+
[1 0; -im*exp(8im/3*z^3) 1]
23+
elseif angle(z) ≈ -π/6
24+
[1 im*exp(-8im/3*z^3); 0 1]
25+
elseif angle(z) ≈ -5π/6
26+
[1 -im*exp(-8im/3*z^3); 0 1]
27+
end, Γ)
28+
29+
# Solve the Riemann–Hilbert problem. We transpose to recast a left
30+
# Riemann–Hilbert problem as a left one.
31+
Φ = transpose(rhsolve(transpose(G), 4*200)) # use 200 collocation points per ray
32+
z = Fun(ℂ) # The function z in the complex plane
33+
2(z*Φ[1,2])(Inf) # Evaluate 2lim_{z -> ∞} zΦ(z)_{1,2}
34+
</code></pre>
35+
<h1>References</h1>
36+
<ol>
37+
<li>T. Trogdon & S. Olver (2015), <a href="http://bookstore.siam.org/ot146/">Riemann–Hilbert Problems, Their Numerical Solution and the Computation of Nonlinear Special Functions</a>, SIAM.</li>
38+
<li>S. Olver (2012), <a href="https://link.springer.com/article/10.1007/s00211-012-0459-7">A general framework for solving Riemann–Hilbert problems numerically</a>, Numer. Math., 122: 305–340.</li>
39+
<li>S. Olver (2011), <a href="https://link.springer.com/article/10.1007/s10208-010-9079-8">Numerical solution of Riemann–Hilbert problems: Painlevé II</a>, Found. Comput. Maths, 11: 153–179.</li>
40+
</ol>

0 commit comments

Comments
 (0)