Skip to content

Commit 9f686f0

Browse files
Get default file name from content-disposition or URL (#269)
The order here is:
- use a file name from the content-disposition header (`filename*` takes precedence over `filename`)
- otherwise try to get a file name from the redirected URL
- otherwise try to get a file name from the original URL
- otherwise use a default file name ("download.txt")
1 parent 9391b09 commit 9f686f0

File tree

4 files changed

+375
-2
lines changed

4 files changed

+375
-2
lines changed

src/Downloads.jl

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,11 @@ end
170170

171171
## download API ##
172172

173+
# Getting file names from URLs and Responses:
174+
include("filenames.jl")
175+
176+
const DEFAULT_FILENAME = "download.txt"
177+
173178
"""
174179
download(url, [ output = tempname() ];
175180
[ method = "GET", ]
@@ -255,7 +260,21 @@ function download(
255260
debug :: Union{Function, Nothing} = nothing,
256261
downloader :: Union{Downloader, Nothing} = nothing,
257262
) :: ArgWrite
258-
arg_write(output) do output
263+
# only rename when output is originally empty
264+
try_rename = false
265+
if output === nothing
266+
@show url
267+
try_rename = true
268+
# guess file name from URL (might not be final name)
269+
name = url_filename(url)
270+
if !is_safe_filename(name)
271+
name = DEFAULT_FILENAME
272+
end
273+
@show name
274+
output = joinpath(mktempdir(), name)
275+
end
276+
local response # capture outside closure
277+
path = arg_write(output) do output
259278
response = request(
260279
url,
261280
output = output,
@@ -270,6 +289,20 @@ function download(
270289
status_ok(response) && return output
271290
throw(RequestError(url, Curl.CURLE_OK, "", response))
272291
end
292+
# fix file suffix based on headers & redirected URL
293+
if try_rename && ispath(path)
294+
name = get_filename(response)
295+
@show name, is_safe_filename(name)
296+
if is_safe_filename(name)
297+
path′ = joinpath(dirname(path), name)
298+
@assert dirname(path) == dirname(path′)
299+
if path != path′
300+
mv(path, path′)
301+
path = path′
302+
end
303+
end
304+
end
305+
return path
273306
end
274307

275308
## request API ##

src/filenames.jl

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
## Getting file names from URLs and Responses
2+
3+
# Thrown by `hex_digit` when a hexadecimal digit is expected but the input has
# ended or the next character is not a hex digit.
struct BadEncoding <: Exception end
4+
5+
"""
    hex_digit(str::AbstractString, i::Int) -> Tuple{UInt8,Int}

Read the character of `str` at index `i` as a single hexadecimal digit.

Return the digit's numeric value (`0x00` through `0x0f`) together with the
index of the character that follows it. Throw `BadEncoding` when `i` is past
the end of `str` or the character is not one of `0-9`, `a-f`, `A-F`.
"""
function hex_digit(str::AbstractString, i::Int)::Tuple{UInt8,Int}
    if i <= ncodeunits(str)
        d, i = iterate(str, i)
        # `Char - Char` is an `Int`; the return annotation converts it to `UInt8`
        '0' <= d <= '9' && return d - '0', i
        'a' <= d <= 'f' && return d - 'a' + 10, i
        'A' <= d <= 'F' && return d - 'A' + 10, i
    end
    throw(BadEncoding())
end
14+
15+
"""
    url_unescape(str) -> Union{String,Nothing}

Decode the percent-escapes (`%XX`) in `str`.

Each escape is replaced by the single byte it encodes; every other character
is copied through unchanged. Because escapes produce raw bytes, the result is
not guaranteed to be valid UTF-8. Return `nothing` when an escape is truncated
or contains a non-hexadecimal digit (that is, when `hex_digit` throws
`BadEncoding`); any other exception is rethrown.
"""
function url_unescape(str::Union{String, SubString{String}})
    try
        return sprint(sizehint = ncodeunits(str)) do io
            i = 1
            while i <= ncodeunits(str)
                c, i = iterate(str, i)
                if c == '%'
                    hi, i = hex_digit(str, i)
                    lo, i = hex_digit(str, i)
                    x = hi*0x10 + lo
                    write(io, x) # one raw byte, not a character
                else
                    print(io, c)
                end
            end
        end
    catch err
        err isa BadEncoding && return nothing
        rethrow()
    end
end
35+
36+
"""
    url_filename(url::AbstractString) -> Union{String,Nothing}

Extract a candidate file name from the last path segment of `url`.

The URL must have the form `scheme://.../segment`, optionally followed by a
query (`?...`) or fragment (`#...`); the non-empty final `segment` is passed
through `url_unescape` and returned. Return `nothing` when `url` does not have
that shape (no scheme, no path, or a path ending in `/`) or when the segment
contains a malformed percent-escape.
"""
function url_filename(url::AbstractString)
    # The scheme may contain digits after its first letter (RFC 3986, section
    # 3.1), so that e.g. `s3://bucket/key.txt` is recognised as well.
    m = match(r"^[a-z][a-z0-9+._-]*://[^#?]*/([^/#?]+)(?:[#?]|$)"i, url)
    m === nothing && return nothing
    return url_unescape(m[1])
end
41+
42+
let # assemble the Content-Disposition regular expressions from small pieces
    ws = raw"\s*" # optional whitespace, used between every element
    # NOTE(review): `+-.` inside this class is a range (`+` through `.`), so a
    # `,` is accepted as a token character as well.
    tok = raw"[A-Za-z0-9!#$%&'*+-.\^_`|~]+"
    # unquoted value: runs up to the next `;`, without trailing whitespace
    bare = raw"[^\s'\";][^;]*(?<!\s)"
    # quoted values: a backslash escapes the character that follows it
    squoted = raw"'(?:[^'\\]|\\.)*'"
    dquoted = raw"\"(?:[^\"\\]|\\.)*\""
    val = string("(?:", bare, "|", squoted, "|", dquoted, ")")
    # `key = value`, capturing the key (group 1) and the raw value (group 2)
    kv = string("(", tok, ")", ws, "=", ws, "(", val, ")")
    # whole header: a disposition type followed by `;`-separated pairs
    whole = string("^", ws, tok, ws, "(?:;", ws, kv, ws, ")*;?", ws, "\$")
    # one pair at a time: anchored at the disposition type for the first
    # match and at the end of the previous match (`\G`) afterwards
    each = string("(?:^", tok, "|\\G)", ws, ";", ws, kv)
    global const content_disposition_re = Regex(whole)
    global const content_disposition_each_re = Regex(each)
end
55+
56+
"""
    get_filename(response::Response)

Choose a file name for the body of `response`; return a string or `nothing`.

Every `content-disposition` header whose value matches
`content_disposition_re` is scanned for `filename` and `filename*` parameters.
When a parameter occurs more than once, the last one that decodes successfully
wins. The result is, in order of preference:

1. the decoded `filename*` value (charsets `utf-8` and `iso-8859-1` only),
2. the `filename` value, with surrounding quotes and backslash escapes removed,
3. `url_filename(response.url)`, which may itself be `nothing`.

The name is returned as-is; no safety check is applied here.
"""
function get_filename(response::Response)
    # look for content disposition header
    filename = filename⁺ = nothing
    for (h_key, h_val) in response.headers
        # NOTE(review): exact comparison, so this assumes header names have
        # already been lower-cased; confirm against the code that fills
        # `response.headers`
        h_key == "content-disposition" || continue
        contains(h_val, content_disposition_re) || continue
        for pair in eachmatch(content_disposition_each_re, h_val)
            a_key = lowercase(pair.captures[1])
            a_val = pair.captures[2]
            a_val === nothing && continue
            if a_key == "filename"
                if a_val[1] in ('"', '\'') && a_val[1] == a_val[end]
                    # quoted value: strip the quotes, resolve `\c` escapes
                    filename = sprint(sizehint=ncodeunits(a_val)-2) do io
                        i = nextind(a_val, 1)
                        while i < ncodeunits(a_val) # stop before closing quote
                            c, i = iterate(a_val, i)
                            if c == '\\'
                                c, i = iterate(a_val, i)
                            end
                            write(io, c)
                        end
                    end
                else # unquoted value
                    filename = a_val
                end
            elseif a_key == "filename*"
                # extended value: charset'language'percent-encoded-text; the
                # language tag may be empty or hyphenated (e.g. `en-US`)
                ext = match(r"^([\w-]+)'[\w-]*'(.*)$", a_val)
                ext === nothing && continue
                encoding = lowercase(ext.captures[1])
                encoding in ("utf-8", "iso-8859-1") || continue
                encoded = ext.captures[2]
                try
                    filename⁺ = sprint() do io
                        i = 1
                        while i <= ncodeunits(encoded)
                            c, i = iterate(encoded, i)
                            if c == '%'
                                hi, i = hex_digit(encoded, i)
                                lo, i = hex_digit(encoded, i)
                                x = hi*0x10 + lo
                                if encoding == "utf-8"
                                    write(io, x) # raw UTF-8 byte
                                else
                                    write(io, Char(x)) # Latin-1 byte -> code point
                                end
                            else
                                write(io, c)
                            end
                        end
                    end
                catch err
                    # malformed escape: keep any previously decoded value
                    err isa BadEncoding || rethrow()
                end
            end
        end
    end
    filename⁺ !== nothing && return filename⁺
    filename !== nothing && return filename
    # no usable content disposition header
    # extract from URL after redirects
    return url_filename(response.url)
end
118+
119+
# Special names on Windows: CON PRN AUX NUL COM1-9 LPT1-9
# we spell out uppercase/lowercase because of locales
# these are dangerous with or without an extension
const WIN_SPECIAL_NAMES = r"^(
    [Cc][Oo][Nn] |
    [Pp][Rr][Nn] |
    [Aa][Uu][Xx] |
    [Nn][Uu][Ll] |
    [Cc][Oo][Mm][1-9] |
    [Ll][Pp][Tt][1-9]
)(\.|$)"x

"""
    is_safe_filename(name) -> Bool

Decide whether `name` may be used as the name of a downloaded file.

A name is rejected when it is `nothing`, is not validly encoded, contains `/`
or a control character, or is one of `""`, `"."`, `".."`. On Windows a name is
additionally rejected when it ends in `.` or a space, contains any of
`"*:<>?\\|`, or is a reserved device name (`WIN_SPECIAL_NAMES`), with or
without an extension.
"""
function is_safe_filename(name::AbstractString)
    isvalid(name) || return false
    '/' in name && return false
    name in ("", ".", "..") && return false
    any(iscntrl, name) && return false
    if Sys.iswindows()
        # `name` is known to be non-empty here, so `name[end]` is defined
        name[end] in ". " && return false
        any(in("\"*:<>?\\|"), name) && return false
        contains(name, WIN_SPECIAL_NAMES) && return false
    end
    return true
end

is_safe_filename(::Nothing) = false

test/runtests.jl

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,149 @@ include("setup.jl")
220220
end
221221
end
222222

223+
@testset "download file names" begin
224+
@testset "url_filename helper" begin
225+
for url in urls_with_filename()
226+
@test nothing === Downloads.url_filename(url)
227+
end
228+
for name in file_names,
229+
url in urls_with_filename(name)
230+
@test name === Downloads.url_filename(url)
231+
end
232+
# URLs that we shouldn't get a file name from
233+
for url in [
234+
"",
235+
"abc",
236+
"abc.txt",
237+
"abc:def",
238+
"file:/",
239+
"file://",
240+
"file:///",
241+
"$server/anything/%",
242+
"$server/anything/%.txt",
243+
"$server/anything/%0.txt",
244+
]
245+
@test nothing === Downloads.url_filename(url)
246+
end
247+
end
248+
# set a more unique default file name
249+
default = Downloads.DEFAULT_FILENAME
250+
@testset "from URL" begin
251+
for url in rand(urls_with_filename(), 10)
252+
@test default == splitdir(download(url))[2]
253+
url′ = "$server/redirect-to?url="*url_escape(url)
254+
# would be reasonable for this to be `default` too but
255+
# currently we use the name from the original URL
256+
@test Downloads.url_filename(url′) ==
257+
splitdir(download(url′))[2]
258+
end
259+
for name in file_names
260+
Sys.iswindows() && '"' in name && continue
261+
for url in rand(urls_with_filename(name), 3)
262+
@test name == splitdir(download(url))[2]
263+
url′ = "$server/redirect-to?url="*url_escape(url)
264+
@test name == splitdir(download(url′))[2]
265+
end
266+
end
267+
end
268+
@testset "unsafe names in URLs" begin
269+
bad_names = [
270+
url_escape(".")
271+
url_escape("..")
272+
url_escape("\a")
273+
"%ff"
274+
"%ff.txt"
275+
]
276+
if Sys.iswindows()
277+
push!(bad_names, url_escape("file."))
278+
push!(bad_names, url_escape("file "))
279+
push!(bad_names, url_escape("file:txt"))
280+
push!(bad_names, url_escape("CON"))
281+
push!(bad_names, url_escape("LPT1.txt"))
282+
end
283+
for name in bad_names
284+
url = "$server/anything/$name"
285+
@test default == splitdir(download(url))[2]
286+
end
287+
end
288+
@testset "from content disposition" begin
289+
for name in file_names
290+
Sys.iswindows() && '"' in name && continue
291+
url = content_disposition_url(:utf8 => name)
292+
@test name == splitdir(download(url))[2]
293+
isascii(name) || continue
294+
url = content_disposition_url(:ascii => name)
295+
@test name == splitdir(download(url))[2]
296+
url = content_disposition_url(:ascii_1q => name)
297+
@test name == splitdir(download(url))[2]
298+
url = content_disposition_url(:ascii_2q => name)
299+
@test name == splitdir(download(url))[2]
300+
end
301+
let name = "ÿ.txt"
302+
url = content_disposition_url(:latin1 => name)
303+
@test name == splitdir(download(url))[2]
304+
end
305+
let name = "y.txt", name⁺ = "ÿ.txt"
306+
url = content_disposition_url(:ascii => name, :utf8 => name⁺)
307+
@test name⁺ == splitdir(download(url))[2]
308+
url = content_disposition_url(:ascii => name, :latin1 => name⁺)
309+
@test name⁺ == splitdir(download(url))[2]
310+
url = content_disposition_url(:utf8 => name⁺, :ascii => name)
311+
@test name⁺ == splitdir(download(url))[2]
312+
url = content_disposition_url(:latin1 => name⁺, :ascii => name)
313+
@test name⁺ == splitdir(download(url))[2]
314+
url = content_disposition_url(:latin1 => name, :utf8 => name⁺)
315+
@test name⁺ == splitdir(download(url))[2]
316+
url = content_disposition_url(:utf8 => name, :latin1 => name⁺)
317+
@test name⁺ == splitdir(download(url))[2]
318+
end
319+
end
320+
@testset "invalid content disposition" begin
321+
# invalid content disposition header syntax
322+
values = [
323+
"\a\b"
324+
"inline"
325+
"attachment"
326+
"attachment; filename"
327+
"attachment; filename='"
328+
"attachment; filename=\""
329+
"attachment; filename='unclosed"
330+
"attachment; filename=\"unclosed"
331+
"attachment; filename*=name.txt"
332+
"attachment; filename*='name.txt'"
333+
"attachment; filename*=\"name.txt\""
334+
"attachment; filename*=utf-8''%"
335+
"attachment; filename*=utf-8''%.txt"
336+
]
337+
bad_names = [
338+
""
339+
"."
340+
".."
341+
"foo/bar"
342+
"foo\0"
343+
"\a"
344+
"ding!\v"
345+
]
346+
if Sys.iswindows()
347+
push!(bad_names, "file.")
348+
push!(bad_names, "file ")
349+
push!(bad_names, "file:txt")
350+
push!(bad_names, "CON")
351+
push!(bad_names, "LPT1.txt")
352+
end
353+
for name in bad_names
354+
push!(values, "attachment; filename*=utf-8''$(url_escape(name))")
355+
'\0' in name && continue
356+
push!(values, "attachment; filename=\"$name\"")
357+
end
358+
for value in values
359+
url = "$server/response-headers?content-disposition="*
360+
url_escape(value)
361+
@test "response-headers" === splitdir(download(url))[2]
362+
end
363+
end
364+
end
365+
223366
@testset "debug callback" begin
224367
url = "$server/get"
225368
events = Pair{String,String}[]

0 commit comments

Comments
 (0)