Skip to content

Commit e04d89e

Browse files
authored
[LibLz4] Add LZ4 HDF5 (#27)
* Add LZ4 HDF5 support * include hdf5 tests * enable imagecodecs compat tests * test h5py compat * Use latest versions of hdf5plugin and h5py from pip * more h5py tests
1 parent d9b3213 commit e04d89e

File tree

9 files changed

+411
-10
lines changed

9 files changed

+411
-10
lines changed

LibLz4/src/ChunkCodecLibLz4.jl

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ export LZ4FrameCodec,
3030
LZ4NumcodecsCodec,
3131
LZ4NumcodecsEncodeOptions,
3232
LZ4NumcodecsDecodeOptions,
33-
# LZ4HDF5Codec,
34-
# LZ4HDF5EncodeOptions,
35-
# LZ4HDF5DecodeOptions,
33+
LZ4HDF5Codec,
34+
LZ4HDF5EncodeOptions,
35+
LZ4HDF5DecodeOptions,
3636
LZ4DecodingError
3737

3838
# reexport ChunkCodecCore
@@ -108,6 +108,24 @@ struct LZ4NumcodecsCodec <: Codec
108108
end
109109
decode_options(::LZ4NumcodecsCodec) = LZ4NumcodecsDecodeOptions() # default decode options
110110

"""
    struct LZ4HDF5Codec <: Codec
    LZ4HDF5Codec()

LZ4 HDF5 format compression using liblz4: https://lz4.org/

This is the LZ4 HDF5 format used in HDF5 Filter ID: 32004.

This format is documented in https://github.com/HDFGroup/hdf5_plugins

This format is NOT compatible with the `lz4` CLI.

See also [`LZ4HDF5EncodeOptions`](@ref) and [`LZ4HDF5DecodeOptions`](@ref)
"""
struct LZ4HDF5Codec <: Codec
end

# Default decode options for the LZ4 HDF5 codec.
decode_options(::LZ4HDF5Codec) = LZ4HDF5DecodeOptions()
128+
111129
include("encode.jl")
112130
include("decode.jl")
113131

LibLz4/src/decode.jl

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,3 +390,117 @@ function try_decode!(d::LZ4NumcodecsDecodeOptions, dst::AbstractVector{UInt8}, s
390390
end
391391
end
392392
end
393+
394+
"""
    struct LZ4HDF5DecodeOptions <: DecodeOptions
    LZ4HDF5DecodeOptions(; kwargs...)

LZ4 HDF5 format decompression using liblz4: https://lz4.org/

This is the LZ4 HDF5 format used in HDF5 Filter ID: 32004.

This format is documented in https://github.com/HDFGroup/hdf5_plugins

This format is NOT compatible with the `lz4` CLI.

# Keyword Arguments

- `codec::LZ4HDF5Codec=LZ4HDF5Codec()`

Any other keyword arguments are accepted and ignored.
"""
struct LZ4HDF5DecodeOptions <: DecodeOptions
    codec::LZ4HDF5Codec
end
function LZ4HDF5DecodeOptions(;
        codec::LZ4HDF5Codec=LZ4HDF5Codec(),
        kwargs...
    )
    LZ4HDF5DecodeOptions(codec)
end
420+
# Report `LZ4HDF5DecodeOptions` as thread safe.
is_thread_safe(::LZ4HDF5DecodeOptions) = true
422+
# Read the decoded size stored at the start of an LZ4 HDF5 stream.
#
# The first 8 bytes of `src` are interpreted as a big endian signed 64 bit
# integer and returned.
#
# Throws `LZ4DecodingError` if `src` is shorter than 12 bytes or if the stored
# size is negative.
function try_find_decoded_size(::LZ4HDF5DecodeOptions, src::AbstractVector{UInt8})::Int64
    if length(src) < 12
        throw(LZ4DecodingError("unexpected end of input"))
    else
        decoded_size = Int64(0)
        # Assemble the value most significant byte first.
        for i in 0:7
            decoded_size |= Int64(src[begin+i])<<((7-i)*8)
        end
        if signbit(decoded_size)
            throw(LZ4DecodingError("decoded size is negative"))
        else
            decoded_size
        end
    end
end
438+
# Load a big endian signed 32 bit integer from the 4 bytes starting at `src_p`.
#
# The bytes are read one at a time, most significant byte first.
# The caller must ensure that `src_p` through `src_p + 3` are valid to read.
function unsafe_load_i32be(src_p::Ptr{UInt8})::Int32
    r = Int32(0)
    for i in 0:3
        r |= Int32(unsafe_load(src_p+i))<<((3-i)*8)
    end
    r
end
446+
# Decode an LZ4 HDF5 stream from `src` into `dst`.
#
# Layout read by this function:
# - 8 bytes: total decoded size, big endian signed 64 bit
# - 4 bytes: block size, big endian signed 32 bit
# - for each block:
#   - 4 bytes: stored size of the block, big endian signed 32 bit
#   - that many bytes of block data. If the stored size equals the decoded
#     size of the block, the data is copied as is, otherwise it is
#     decompressed with `unsafe_lz4_decompress`.
#
# Returns `nothing` if `dst` is too small to hold the decoded data, otherwise
# the decoded size. Throws `LZ4DecodingError` if `src` is truncated, malformed,
# or has extra bytes after the last block.
function try_decode!(d::LZ4HDF5DecodeOptions, dst::AbstractVector{UInt8}, src::AbstractVector{UInt8}; kwargs...)::Union{Nothing, Int64}
    check_contiguous(dst)
    check_contiguous(src)
    decoded_size = try_find_decoded_size(d, src)
    src_size::Int64 = length(src)
    dst_size::Int64 = length(dst)
    if decoded_size > dst_size
        return nothing
    end
    cconv_src = Base.cconvert(Ptr{UInt8}, src)
    cconv_dst = Base.cconvert(Ptr{UInt8}, dst)
    GC.@preserve cconv_src cconv_dst begin
        src_p = Base.unsafe_convert(Ptr{UInt8}, cconv_src)
        dst_p = Base.unsafe_convert(Ptr{UInt8}, cconv_dst)
        src_left = src_size
        dst_left = decoded_size
        @assert src_left >= 12 # this is checked by try_find_decoded_size
        # Skip the 8 byte decoded size that was already parsed.
        src_left -= 8
        src_p += 8
        block_size = unsafe_load_i32be(src_p)
        src_left -= 4
        src_p += 4
        if block_size <= 0
            throw(LZ4DecodingError("block size must be greater than zero"))
        end
        while dst_left > 0
            # The last block may be shorter than block_size.
            local b_size = min(Int64(block_size), dst_left)%Int32
            if src_left < 4
                throw(LZ4DecodingError("unexpected end of input"))
            end
            local c_size = unsafe_load_i32be(src_p)
            src_left -= 4
            src_p += 4
            if c_size <= 0
                throw(LZ4DecodingError("block compressed size must be greater than zero"))
            end
            if src_left < c_size
                throw(LZ4DecodingError("unexpected end of input"))
            end
            if c_size == b_size # There was no compression
                Libc.memcpy(dst_p, src_p, b_size)
            else # do the decompression
                local ret = unsafe_lz4_decompress(src_p, dst_p, c_size, b_size)
                if signbit(ret)
                    throw(LZ4DecodingError("src is malformed"))
                elseif ret != b_size
                    throw(LZ4DecodingError("saved decoded size is not correct"))
                end
            end
            src_left -= c_size
            src_p += c_size
            dst_left -= b_size
            dst_p += b_size
        end
        if !iszero(src_left)
            throw(LZ4DecodingError("unexpected $(src_left) bytes after stream"))
        end
        return decoded_size
    end
end

LibLz4/src/encode.jl

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,3 +257,118 @@ function try_encode!(e::LZ4NumcodecsEncodeOptions, dst::AbstractVector{UInt8}, s
257257
Int64(ret) + Int64(4)
258258
end
259259
end
260+
261+
"""
    struct LZ4HDF5EncodeOptions <: EncodeOptions
    LZ4HDF5EncodeOptions(; kwargs...)

LZ4 HDF5 format compression using liblz4: https://lz4.org/

This is the LZ4 HDF5 format used in HDF5 Filter ID: 32004.

This format is documented in https://github.com/HDFGroup/hdf5_plugins

This format is NOT compatible with the `lz4` CLI.

# Keyword Arguments

- `codec::LZ4HDF5Codec=LZ4HDF5Codec()`
- `compressionLevel::Integer=0`: Compression level, 0: default (fast mode); values > $(LZ4_MAX_CLEVEL) count as $(LZ4_MAX_CLEVEL); values < 0 trigger fast acceleration.
- `blockSize::Integer=2^30`: Decompressed bytes per block. Must be in `1:$(LZ4_MAX_INPUT_SIZE)`.
"""
struct LZ4HDF5EncodeOptions <: EncodeOptions
    codec::LZ4HDF5Codec
    compressionLevel::Int32
    blockSize::Int32
end
function LZ4HDF5EncodeOptions(;
        codec::LZ4HDF5Codec=LZ4HDF5Codec(),
        compressionLevel::Integer=0,
        blockSize::Integer=2^30,
        kwargs...
    )
    # blockSize is validated, compressionLevel is clamped instead of rejected.
    check_in_range(1:LZ4_MAX_INPUT_SIZE; blockSize)
    _clamped_compression_level = clamp(compressionLevel, LZ4_MIN_CLEVEL, LZ4_MAX_CLEVEL)
    LZ4HDF5EncodeOptions(codec, _clamped_compression_level, blockSize)
end
295+
# Report `LZ4HDF5EncodeOptions` as thread safe.
is_thread_safe(::LZ4HDF5EncodeOptions) = true
297+
# Prevent encode_bound reaching typemax(Int64) if blockSize is 1
# With a block size of 1 the bound is 5*src_size + 28, which for the
# upper limit below is 9223372036854775803, just under typemax(Int64).
decoded_size_range(e::LZ4HDF5EncodeOptions) = Int64(0):Int64(1):Int64(1844674407370955155)
300+
# Upper bound on the encoded size of `src_size` input bytes.
#
# Returns `typemax(Int64)` if `src_size` is above the end of
# `decoded_size_range(e)`. Otherwise the bound is the sum of:
# - `src_size` bytes of data
# - 4 bytes of size field per block
# - `block_size÷255 + 16` bytes of scratch space for one block
# - 12 bytes of stream header
function encode_bound(e::LZ4HDF5EncodeOptions, src_size::Int64)::Int64
    if src_size > last(decoded_size_range(e))
        typemax(Int64)
    else
        # Same block size that try_encode! computes for this input size.
        block_size = clamp(src_size, Int64(1), Int64(e.blockSize))
        nblocks = cld(src_size, block_size)
        lz4_scratch_space = block_size÷Int64(255) + Int64(16)
        src_size + Int64(4)*nblocks + lz4_scratch_space + Int64(12)
    end
end
311+
# Encode `src` into `dst` in the LZ4 HDF5 format.
#
# Layout written by this function:
# - 8 bytes: `length(src)`, big endian signed 64 bit
# - 4 bytes: block size, big endian signed 32 bit
# - for each block:
#   - 4 bytes: stored size of the block, big endian signed 32 bit
#   - the block data, either compressed or, if compression did not make the
#     block smaller, copied as is with a stored size equal to the block's
#     decoded size.
#
# Returns `nothing` if `dst` is too small, otherwise the number of bytes
# written to `dst`.
function try_encode!(e::LZ4HDF5EncodeOptions, dst::AbstractVector{UInt8}, src::AbstractVector{UInt8}; kwargs...)::Union{Nothing, Int64}
    check_contiguous(dst)
    check_contiguous(src)
    src_size::Int64 = length(src)
    dst_size::Int64 = length(dst)
    check_in_range(decoded_size_range(e); src_size)
    # Shrink the block size to the input size, but never below 1.
    block_size = clamp(src_size, Int64(1), Int64(e.blockSize))
    if dst_size < 12
        return nothing
    end
    cconv_src = Base.cconvert(Ptr{UInt8}, src)
    cconv_dst = Base.cconvert(Ptr{UInt8}, dst)
    GC.@preserve cconv_src cconv_dst begin
        src_p = Base.unsafe_convert(Ptr{UInt8}, cconv_src)
        dst_p = Base.unsafe_convert(Ptr{UInt8}, cconv_dst)
        src_left = src_size
        dst_left = dst_size
        # Store original size as big endian signed 64 bit
        for i in 0:7
            unsafe_store!(dst_p+i, (src_size>>>((7-i)*8))%UInt8)
        end
        dst_left -= 8
        dst_p += 8
        # Store block size as big endian signed 32 bit
        for i in 0:3
            unsafe_store!(dst_p+i, (block_size>>>((3-i)*8))%UInt8)
        end
        dst_left -= 4
        dst_p += 4
        while src_left > 0
            # Need room for the 4 byte size field plus at least one payload byte.
            if dst_left < 5
                return nothing
            end
            local b_size = min(src_left, block_size)%Int32
            @assert !iszero(b_size)
            # Remember where the size field goes, it is filled in after compression.
            local c_size_p = dst_p
            dst_left -= 4
            dst_p += 4
            local ret = unsafe_lz4_compress(src_p, dst_p, b_size, clamp(dst_left, Int32), e.compressionLevel)
            # Store the data directly if there was no compression
            # iszero(ret) indicates that dst_left was too small for compression.
            # but it might be large enough for a copy.
            local c_size = if ret >= b_size || iszero(ret)
                if dst_left < b_size
                    return nothing
                end
                Libc.memcpy(dst_p, src_p, b_size)
                b_size
            else
                ret
            end
            # Store block compressed size as big endian signed 32 bit
            for i in 0:3
                unsafe_store!(c_size_p+i, (c_size>>>((3-i)*8))%UInt8)
            end
            dst_left -= c_size
            dst_p += c_size
            src_left -= b_size
            src_p += b_size
        end
        return dst_size - dst_left
    end
end

0 commit comments

Comments
 (0)