Skip to content

Commit 6aae107

Browse files
authored
ask the package server for zstd compressed data (#4472)
1 parent a76de9e commit 6aae107

File tree

5 files changed

+160
-19
lines changed

5 files changed

+160
-19
lines changed

Project.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
2323
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
2424
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
2525
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
26+
Zstd_jll = "3161d3a3-bdf6-5164-811a-617609db77b4"
2627
p7zip_jll = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
2728

2829
[weakdeps]
@@ -41,11 +42,12 @@ Libdl = "1.11"
4142
Logging = "1.11"
4243
Markdown = "1.11"
4344
Printf = "1.11"
44-
Random = "1.11"
4545
REPL = "1.11"
46+
Random = "1.11"
4647
SHA = "0.7, 1"
4748
TOML = "1"
4849
Tar = "1.10"
4950
UUIDs = "1.11"
51+
Zstd_jll = "1.5.7"
5052
julia = "1.12"
5153
p7zip_jll = "17.5"

docs/src/protocol.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,19 @@ The client can make GET or HEAD requests to the following resources:
136136

137137
Only the `/registries` resource changes over time; all other resources can be cached forever, and the server indicates this with the appropriate HTTP headers.
138138

139+
### Compression Negotiation
140+
141+
The Pkg protocol supports multiple compression formats.
142+
143+
- **Zstd compression** (current): Modern clients send `Accept-Encoding: zstd, gzip` to request Zstandard-compressed resources with gzip as a fallback.
144+
- **Gzip compression** (legacy): Older clients that only support gzip send `Accept-Encoding: gzip` or omit the header entirely.
145+
146+
Clients verify the actual compression format by reading file magic bytes after download:
147+
148+
- **Zstd format**: Magic bytes `0x28 0xB5 0x2F 0xFD` (4 bytes) - decompressed with `zstd` (significantly faster)
149+
- **Gzip format**: Magic bytes `0x1F 0x8B` (2 bytes) - decompressed with 7z
150+
151+
139152
### Reference Implementation
140153

141154
A reference implementation of the Pkg Server protocol is available at [PkgServer.jl](https://github.com/JuliaPackaging/PkgServer.jl).

src/PlatformEngines.jl

Lines changed: 111 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,17 @@
44

55
module PlatformEngines
66

7-
using SHA, Downloads, Tar
7+
using SHA, Downloads, Tar, Dates, Printf
88
import ...Pkg: Pkg, TOML, pkg_server, depots1, can_fancyprint, stderr_f, atomic_toml_write
99
using ..MiniProgressBars
10-
using Base.BinaryPlatforms, p7zip_jll
10+
using Base.BinaryPlatforms, p7zip_jll, Zstd_jll
1111

12-
export verify, unpack, package, download_verify_unpack
12+
export verify, unpack, package, download_verify_unpack, get_extract_cmd, detect_archive_format
1313

1414
const EXE7Z_LOCK = ReentrantLock()
1515
const EXE7Z = Ref{String}()
16+
const EXEZSTD_LOCK = ReentrantLock()
17+
const EXEZSTD = Ref{String}()
1618

1719
function exe7z()
1820
# If the JLL is available, use the wrapper function defined in there
@@ -28,6 +30,20 @@ function exe7z()
2830
end
2931
end
3032

33+
function exezstd()
    # Prefer the wrapper command shipped with Zstd_jll whenever the JLL is usable.
    Zstd_jll.is_available() && return Zstd_jll.zstd()

    # Otherwise look the binary up once via `findzstd()` and remember its path in
    # EXEZSTD; the lock guards the check-and-assign of the cached path.
    lock(EXEZSTD_LOCK)
    try
        isassigned(EXEZSTD) || (EXEZSTD[] = findzstd())
        return Cmd([EXEZSTD[]])
    finally
        unlock(EXEZSTD_LOCK)
    end
end
46+
3147
function find7z()
3248
name = "7z"
3349
Sys.iswindows() && (name = "$name.exe")
@@ -40,6 +56,18 @@ function find7z()
4056
error("7z binary not found")
4157
end
4258

59+
function findzstd()
    # Executable name, with the `.exe` suffix on Windows.
    exe = Sys.iswindows() ? "zstd.exe" : "zstd"

    # First look next to the running Julia: `<BINDIR>/../libexec`, then `<BINDIR>`.
    bindir = Sys.BINDIR::String
    candidates = [
        normpath(bindir, joinpath("..", "libexec"), exe),
        normpath(bindir, ".", exe),
    ]
    hit = findfirst(isfile, candidates)
    hit === nothing || return candidates[hit]

    # Fall back to whatever `Sys.which` resolves for the executable name.
    found = Sys.which(exe)
    found === nothing && error("zstd binary not found")
    return found
end
70+
4371
# Return `true` when `url` either uses the `https://` scheme, or uses any scheme
# but targets `127.0.0.1` / `localhost` (optionally with a port). The match is
# case-insensitive.
function is_secure_url(url::AbstractString)
    secure_pattern = r"^(https://|\w+://(127\.0\.0\.1|localhost)(:\d+)?($|/))"i
    return occursin(secure_pattern, url)
end
4573

@@ -232,6 +260,13 @@ function get_metadata_headers(url::AbstractString)
232260
end
233261
push!(headers, "Julia-CI-Variables" => join(ci_info, ';'))
234262
push!(headers, "Julia-Interactive" => string(isinteractive()))
263+
264+
# Add Accept-Encoding header only for compressed archive resources
265+
# (registries, packages, artifacts - not for metadata endpoints like /registries or /meta)
266+
if occursin(r"/(registry|package|artifact)/", url)
267+
push!(headers, "Accept-Encoding" => "zstd, gzip")
268+
end
269+
235270
for (key, val) in ENV
236271
m = match(r"^JULIA_PKG_SERVER_([A-Z0-9_]+)$"i, key)
237272
m === nothing && continue
@@ -403,22 +438,89 @@ function copy_symlinks()
403438
lowercase(var) in ("false", "f", "no", "n", "0") ? false : nothing
404439
end
405440

441+
"""
442+
detect_archive_format(tarball_path::AbstractString)
443+
444+
Detect compression format by reading file magic bytes.
445+
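Returns one of: "zstd", "gzip", "bzip2", "xz", "lz4", or "unknown" (anything that matches none of these signatures, e.g. a plain uncompressed tarball, is reported as "unknown").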
446+
447+
Note: This is used both for determining file extensions after download
448+
and for selecting the appropriate decompression tool.
449+
"""
450+
function detect_archive_format(tarball_path::AbstractString)
    # `filesize` reports 0 for a path that does not exist, which would otherwise be
    # misreported as an "empty" file below; check for existence explicitly.
    if !ispath(tarball_path)
        error("cannot detect compression format: $tarball_path does not exist")
    end

    # The longest signature checked (xz) is 6 bytes; `read` returns fewer bytes
    # when the file is shorter than that.
    magic = open(io -> read(io, 6), tarball_path, "r")

    if isempty(magic)
        error("cannot detect compression format: $tarball_path is empty")
    end

    # Leading magic bytes of each recognized format. The signatures differ in
    # their first byte, so at most one of them can match.
    signatures = (
        "zstd" => [0x28, 0xB5, 0x2F, 0xFD],
        "gzip" => [0x1F, 0x8B],
        "bzip2" => [0x42, 0x5A, 0x68],              # "BZh"
        "xz" => [0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00],
        "lz4" => [0x04, 0x22, 0x4D, 0x18],
    )
    for (format, signature) in signatures
        n = length(signature)
        if length(magic) >= n && view(magic, 1:n) == signature
            return format
        end
    end
    return "unknown"
end
484+
485+
"""
486+
get_extract_cmd(tarball_path::AbstractString)
487+
488+
Get the decompression command for a tarball by detecting format via magic bytes.
489+
"""
490+
function get_extract_cmd(tarball_path::AbstractString)
    # Zstd archives are streamed to stdout by the zstd binary (`-d -c`).
    if detect_archive_format(tarball_path) == "zstd"
        return `$(exezstd()) -d -c $tarball_path`
    end
    # Every other detected format is handed to 7z, extracting to stdout (`-so`).
    return `$(exe7z()) x $tarball_path -so`
end
498+
406499
function unpack(
407500
tarball_path::AbstractString,
408501
dest::AbstractString;
409502
verbose::Bool = false,
410503
)
411-
return Tar.extract(`$(exe7z()) x $tarball_path -so`, dest, copy_symlinks = copy_symlinks())
504+
return Tar.extract(get_extract_cmd(tarball_path), dest, copy_symlinks = copy_symlinks())
412505
end
413506

414507
"""
415508
package(src_dir::AbstractString, tarball_path::AbstractString)
416509
417510
Compress `src_dir` into a tarball located at `tarball_path`.
511+
Supports both gzip and zstd compression based on file extension.
418512
"""
419513
function package(src_dir::AbstractString, tarball_path::AbstractString; io = stderr_f())
420514
rm(tarball_path, force = true)
421-
cmd = `$(exe7z()) a -si -tgzip -mx9 $tarball_path`
515+
# Choose compression based on file extension (case-insensitive)
516+
tarball_lower = lowercase(tarball_path)
517+
if endswith(tarball_lower, ".zst") || endswith(tarball_lower, ".tar.zst")
518+
# Use zstd compression (level 19 for good compression)
519+
cmd = `$(exezstd()) -19 -c -T -o $tarball_path`
520+
else
521+
# Use gzip compression (default)
522+
cmd = `$(exe7z()) a -si -tgzip -mx9 $tarball_path`
523+
end
422524
return open(pipeline(cmd, stdout = devnull, stderr = io), write = true) do io
423525
Tar.create(src_dir, io)
424526
end
@@ -497,7 +599,7 @@ function download_verify_unpack(
497599

498600
# If extension of url contains a recognized extension, use it, otherwise use ".gz"
499601
ext = url_ext(url)
500-
if !(ext in ["tar", "gz", "tgz", "bz2", "xz"])
602+
if !(ext in ["tar", "gz", "tgz", "bz2", "xz", "zst"])
501603
ext = "gz"
502604
end
503605

@@ -538,7 +640,7 @@ function download_verify_unpack(
538640
@info("Unpacking $(tarball_path) into $(dest)...")
539641
end
540642
isnothing(progress) || progress(10000, 10000; status = "unpacking")
541-
open(`$(exe7z()) x $tarball_path -so`) do io
643+
open(get_extract_cmd(tarball_path)) do io
542644
Tar.extract(io, dest, copy_symlinks = copy_symlinks())
543645
end
544646
finally
@@ -685,12 +787,12 @@ function verify(
685787
end
686788

687789
# Verify the git-tree-sha1 hash of a compressed archive.
688-
function verify_archive_tree_hash(tar_gz::AbstractString, expected_hash::Base.SHA1)
790+
function verify_archive_tree_hash(compressed_tar::AbstractString, expected_hash::Base.SHA1)
689791
# This can fail because unlike sha256 verification of the downloaded
690792
# tarball, tree hash verification requires that the file can i) be
691793
# decompressed and ii) is a proper archive.
692794
calc_hash = try
693-
Base.SHA1(open(Tar.tree_hash, `$(exe7z()) x $tar_gz -so`))
795+
Base.SHA1(open(Tar.tree_hash, get_extract_cmd(compressed_tar)))
694796
catch err
695797
@warn "unable to decompress and read archive" exception = err
696798
return false

src/Registry/Registry.jl

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ module Registry
4141
import ..Pkg
4242
using ..Pkg: depots, depots1, printpkgstyle, stderr_f, isdir_nothrow, pathrepr, pkg_server,
4343
GitTools, atomic_toml_write, create_cachedir_tag
44-
using ..Pkg.PlatformEngines: download_verify_unpack, download, download_verify, exe7z, verify_archive_tree_hash
44+
using ..Pkg.PlatformEngines: download_verify_unpack, download, download_verify, verify_archive_tree_hash, get_extract_cmd, detect_archive_format
4545
using UUIDs, LibGit2, TOML, Dates
4646
import FileWatching
4747

@@ -240,6 +240,25 @@ function check_registry_state(reg)
240240
return nothing
241241
end
242242

243+
# Map the compression format detected from the magic bytes of `filepath` to the
# file extension used when storing the archive.
function archive_format_to_extension(filepath::AbstractString)::String
    detected = detect_archive_format(filepath)
    extensions = (
        "zstd" => ".tar.zst",
        "gzip" => ".tar.gz",
        "bzip2" => ".tar.bz2",
        "xz" => ".tar.xz",
        "lz4" => ".tar.lz4",
    )
    for (format, extension) in extensions
        format == detected && return extension
    end
    # Any format not listed above falls back to .tar.gz
    return ".tar.gz"
end
261+
243262
function download_registries(io::IO, regs::Vector{RegistrySpec}, depots::Union{String, Vector{String}} = depots())
244263
# Use the first depot as the target
245264
target_depot = depots1(depots)
@@ -282,8 +301,10 @@ function download_registries(io::IO, regs::Vector{RegistrySpec}, depots::Union{S
282301
reg_unc = uncompress_registry(tmp)
283302
reg.name = TOML.parse(reg_unc["Registry.toml"])["name"]::String
284303
end
285-
mv(tmp, joinpath(regdir, reg.name * ".tar.gz"); force = true)
286-
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(_hash), "path" => reg.name * ".tar.gz")
304+
# Detect what we actually got from the server (defensive against servers that don't support zstd yet)
305+
ext = archive_format_to_extension(tmp)
306+
mv(tmp, joinpath(regdir, reg.name * ext); force = true)
307+
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(_hash), "path" => reg.name * ext)
287308
atomic_toml_write(joinpath(regdir, reg.name * ".toml"), reg_info)
288309
registry_update_log[string(reg.uuid)] = now()
289310
printpkgstyle(io, :Added, "`$(reg.name)` registry to $(Base.contractuser(regdir))")
@@ -546,8 +567,11 @@ function update(regs::Vector{RegistrySpec}; io::IO = stderr_f(), force::Bool = t
546567
Base.rm(reg.path; recursive = true, force = true)
547568
end
548569
registry_path = dirname(reg.path)
549-
mv(tmp, joinpath(registry_path, reg.name * ".tar.gz"); force = true)
550-
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(hash), "path" => reg.name * ".tar.gz")
570+
# Detect what we actually got from the server (defensive against servers that don't support zstd yet)
571+
format = detect_archive_format(tmp)
572+
ext = format == "zstd" ? ".tar.zst" : ".tar.gz"
573+
mv(tmp, joinpath(registry_path, reg.name * ext); force = true)
574+
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(hash), "path" => reg.name * ext)
551575
atomic_toml_write(joinpath(registry_path, reg.name * ".toml"), reg_info)
552576
registry_update_log[string(reg.uuid)] = now()
553577
@label done_tarball_read

src/Registry/registry_instance.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -272,14 +272,14 @@ function init_package_info!(pkg::PkgEntry)
272272
end
273273

274274

275-
function uncompress_registry(tar_gz::AbstractString)
276-
if !isfile(tar_gz)
277-
error("$(repr(tar_gz)): No such file")
275+
function uncompress_registry(compressed_tar::AbstractString)
276+
if !isfile(compressed_tar)
277+
error("$(repr(compressed_tar)): No such file")
278278
end
279279
data = Dict{String, String}()
280280
buf = Vector{UInt8}(undef, Tar.DEFAULT_BUFFER_SIZE)
281281
io = IOBuffer()
282-
open(`$(exe7z()) x $tar_gz -so`) do tar
282+
open(get_extract_cmd(compressed_tar)) do tar
283283
Tar.read_tarball(x -> true, tar; buf = buf) do hdr, _
284284
if hdr.type == :file
285285
Tar.read_data(tar, io; size = hdr.size, buf = buf)

0 commit comments

Comments
 (0)