diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 112f462..7dd8105 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -35,6 +35,10 @@ jobs: - ChunkCodecCore/** - ChunkCodecTests/** - LibBlosc/** + LibBlosc2: + - ChunkCodecCore/** + - ChunkCodecTests/** + - LibBlosc2/** LibBrotli: - ChunkCodecCore/** - ChunkCodecTests/** diff --git a/LibBlosc2/CHANGELOG.md b/LibBlosc2/CHANGELOG.md new file mode 100644 index 0000000..65eee55 --- /dev/null +++ b/LibBlosc2/CHANGELOG.md @@ -0,0 +1,11 @@ +# Release Notes + +All notable changes to this package will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). + +## Unreleased + +### Added + +- Initial release diff --git a/LibBlosc2/LICENSE b/LibBlosc2/LICENSE new file mode 100644 index 0000000..568769b --- /dev/null +++ b/LibBlosc2/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Erik Schnetter + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/LibBlosc2/Project.toml b/LibBlosc2/Project.toml new file mode 100644 index 0000000..9a85531 --- /dev/null +++ b/LibBlosc2/Project.toml @@ -0,0 +1,18 @@ +name = "ChunkCodecLibBlosc2" +uuid = "59b5581c-e2bc-42b3-a6f1-80e88eec7b70" +authors = ["Erik Schnetter "] +version = "0.1.0" + +[deps] +Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" +Blosc2_jll = "d43303dc-dd0e-56c6-b0a8-331f4c8c9bfb" +ChunkCodecCore = "0b6fb165-00bc-4d37-ab8b-79f91016dbe1" + +[compat] +Accessors = "0.1.42" +Blosc2_jll = "201.1700.100" +ChunkCodecCore = "0.5.0" +julia = "1.10" + +[workspace] +projects = ["test"] diff --git a/LibBlosc2/README.md b/LibBlosc2/README.md new file mode 100644 index 0000000..40a8a72 --- /dev/null +++ b/LibBlosc2/README.md @@ -0,0 +1,32 @@ +# ChunkCodecLibBlosc2 + +## Warning: ChunkCodecLibBlosc2 is currently a WIP and its API may drastically change at any time. + +This package implements the ChunkCodec interface for the following encoders and decoders +using the c-blosc2 library + +1. `Blosc2CFrame`, `Blosc2EncodeOptions`, `Blosc2DecodeOptions` + +Note: It appears that the [Blosc2 Contiguous Frame +Format](https://www.blosc.org/c-blosc2/format/cframe_format.html) is +not fully protected by checksums. The [`c-blosc2` +library](https://www.blosc.org/c-blosc2) may crash (segfault) for +invalid inputs. + +## Example + +```julia-repl +julia> using ChunkCodecLibBlosc2 + +julia> data = collect(0x00:0x07); + +julia> compressed_data = encode(Blosc2EncodeOptions(), data); + +julia> decompressed_data = decode(Blosc2CFrame(), compressed_data; max_size=length(data), size_hint=length(data)); + +julia> data == decompressed_data +true +``` + +The low level interface is defined in the `ChunkCodecCore` package. 
+
diff --git a/LibBlosc2/src/ChunkCodecLibBlosc2.jl b/LibBlosc2/src/ChunkCodecLibBlosc2.jl
new file mode 100644
index 0000000..ef90504
--- /dev/null
+++ b/LibBlosc2/src/ChunkCodecLibBlosc2.jl
@@ -0,0 +1,58 @@
+module ChunkCodecLibBlosc2
+
+using Base.Threads: Atomic
+
+using Accessors: @reset
+
+using Blosc2_jll: libblosc2
+
+using ChunkCodecCore:
+    Codec,
+    EncodeOptions,
+    DecodeOptions,
+    check_in_range,
+    check_contiguous,
+    DecodingError
+import ChunkCodecCore:
+    decode_options,
+    try_decode!,
+    try_encode!,
+    encode_bound,
+    try_find_decoded_size,
+    decoded_size_range
+
+export Blosc2CFrame,
+    Blosc2EncodeOptions,
+    Blosc2DecodeOptions,
+    Blosc2DecodingError
+
+# The `public` declaration is only evaluated on Julia versions that pass the check below.
+if VERSION >= v"1.11.0-DEV.469"
+    eval(Meta.parse("public is_compressor_valid, compcode, compname"))
+end
+
+# reexport ChunkCodecCore
+using ChunkCodecCore: ChunkCodecCore, encode, decode
+export ChunkCodecCore, encode, decode
+
+include("libblosc2.jl")
+
+"""
+    struct Blosc2CFrame <: Codec
+    Blosc2CFrame()
+
+Blosc2 compression using c-blosc2 library: https://github.com/Blosc/c-blosc2
+
+Decoding does not accept any extra data appended to the compressed block.
+Decoding also does not accept truncated data, or multiple compressed blocks concatenated together.
+
+[`Blosc2EncodeOptions`](@ref) and [`Blosc2DecodeOptions`](@ref)
+can be used to set encoding and decoding options.
+"""
+struct Blosc2CFrame <: Codec end
+decode_options(::Blosc2CFrame) = Blosc2DecodeOptions()
+
+include("encode.jl")
+include("decode.jl")
+
+end # module ChunkCodecLibBlosc2
diff --git a/LibBlosc2/src/decode.jl b/LibBlosc2/src/decode.jl
new file mode 100644
index 0000000..c8fc42d
--- /dev/null
+++ b/LibBlosc2/src/decode.jl
@@ -0,0 +1,133 @@
+"""
+    Blosc2DecodingError(code)
+
+Error for data that cannot be decoded.
+
+`code` is the status reported by c-blosc2 for the failing call, or `0` when no specific
+status is available.
+"""
+struct Blosc2DecodingError <: DecodingError
+    code::Cint
+end
+
+function Base.showerror(io::IO, err::Blosc2DecodingError)
+    print(io, "Blosc2DecodingError: blosc2 compressed buffer cannot be decoded, error code: $(err.code)")
+    return nothing
+end
+
+"""
+    struct Blosc2DecodeOptions <: DecodeOptions
+    Blosc2DecodeOptions(; kwargs...)
+
+Blosc2 decompression using c-blosc2 library: https://github.com/Blosc/c-blosc2
+
+# Keyword Arguments
+
+- `codec::Blosc2CFrame = Blosc2CFrame()`
+- `nthreads::Integer = 1`: The number of threads to use, in `1:typemax(Int32)`.
+  This value is validated and stored, but decoding does not currently forward it to c-blosc2.
+"""
+struct Blosc2DecodeOptions <: DecodeOptions
+    codec::Blosc2CFrame
+
+    nthreads::Int
+end
+function Blosc2DecodeOptions(; codec::Blosc2CFrame=Blosc2CFrame(),
+                             nthreads::Integer=1,
+                             kwargs...)
+    check_in_range(1:typemax(Int32); nthreads)
+
+    return Blosc2DecodeOptions(codec, nthreads)
+end
+
+# Open the contiguous frame stored in `src` as a super-chunk (without copying it), call
+# `f(schunk)`, and release the super-chunk again, also when `f` throws.
+# Throws `Blosc2DecodingError` if `src` is not a valid frame or if releasing fails.
+function with_schunk(f, src::AbstractVector{UInt8})
+    blosc2_init()
+
+    # The super-chunk refers to the memory of `src` (`copy_cframe = false`), so `src` must
+    # stay rooted for as long as the super-chunk exists, not just during the first call.
+    GC.@preserve src begin
+        copy_cframe = false
+        schunk = @ccall libblosc2.blosc2_schunk_from_buffer(src::Ptr{UInt8}, length(src)::Int64, copy_cframe::UInt8)::Ptr{Blosc2SChunk}
+        if schunk == C_NULL
+            # This is not valid blosc2-encoded data
+            throw(Blosc2DecodingError(0))
+        end
+        # `src` is owned by Julia; c-blosc2 must not free it together with the super-chunk
+        @ccall libblosc2.blosc2_schunk_avoid_cframe_free(schunk::Ptr{Blosc2SChunk}, true::UInt8)::Cvoid
+
+        result = try
+            f(schunk)
+        catch
+            # Do not leak the super-chunk; the original error is more useful than the status
+            @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint
+            rethrow()
+        end
+
+        status = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint
+        if status != 0
+            # Something went wrong
+            throw(Blosc2DecodingError(status))
+        end
+        return result
+    end
+end
+
+function try_find_decoded_size(::Blosc2DecodeOptions, src::AbstractVector{UInt8})::Int64
+    check_contiguous(src)
+
+    return with_schunk(src) do schunk
+        total_nbytes = unsafe_load(schunk).nbytes
+        # A negative size can only come from a corrupt frame
+        total_nbytes >= 0 || throw(Blosc2DecodingError(0))
+        return total_nbytes
+    end
+end
+
+# Note: We should implement `try_resize_decode!`
+
+function try_decode!(d::Blosc2DecodeOptions, dst::AbstractVector{UInt8}, src::AbstractVector{UInt8};
+                     kwargs...)::Union{Nothing,Int64}
+    check_contiguous(dst)
+    check_contiguous(src)
+
+    # I don't think there is a way to specify a decompression context.
+    # That means that our `Blosc2DecodeOptions` will be unused.
+    # We could try writing to the `dctx` field in the `schunk`.
+
+    return with_schunk(src) do schunk
+        # Load the header once; `Blosc2SChunk` is a large struct
+        header = unsafe_load(schunk)
+        total_nbytes = header.nbytes
+        # A negative size can only come from a corrupt frame
+        total_nbytes >= 0 || throw(Blosc2DecodingError(0))
+        if total_nbytes > length(dst)
+            # There is not enough space to decode the data
+            return nothing
+        end
+
+        dst_position = Int64(0)
+        # c-blosc2 writes through raw pointers into `dst`, so keep `dst` rooted meanwhile
+        GC.@preserve dst begin
+            for nchunk in 0:(header.nchunks - 1)
+                nbytes_left = clamp(total_nbytes - dst_position, Int32)
+                nbytes = @ccall libblosc2.blosc2_schunk_decompress_chunk(schunk::Ptr{Blosc2SChunk}, nchunk::Int64,
+                                                                         pointer(dst, dst_position+1)::Ptr{Cvoid}, nbytes_left::Int32)::Cint
+                if nbytes <= 0
+                    # There was an error decompressing the data
+                    throw(Blosc2DecodingError(nbytes))
+                end
+
+                dst_position += nbytes
+            end
+        end
+        if dst_position != total_nbytes
+            # The decompressed size is inconsistent
+            throw(Blosc2DecodingError(0))
+        end
+
+        return total_nbytes
+    end
+end
diff --git a/LibBlosc2/src/encode.jl b/LibBlosc2/src/encode.jl
new file mode 100644
index 0000000..2b58b69
--- /dev/null
+++ b/LibBlosc2/src/encode.jl
@@ -0,0 +1,241 @@
+"""
+    struct Blosc2EncodeOptions <: EncodeOptions
+    Blosc2EncodeOptions(; kwargs...)
+
+Blosc2 compression using c-blosc2 library: https://github.com/Blosc/c-blosc2
+
+# Keyword Arguments
+
+- `codec::Blosc2CFrame = Blosc2CFrame()`
+- `doshuffle::Union{Integer,Symbol,AbstractString} = 1`: Whether to use the shuffle filter.
+
+  Possible values are
+  - `:noshuffle`, `"noshuffle"`, 0: do not shuffle
+  - `:shuffle`, `"shuffle"`, 1: shuffle bytes
+  - `:bitshuffle`, `"bitshuffle"`, 2: shuffle bits (slower but compresses better)
+- `dodelta::Union{Integer,Symbol,AbstractString} = 0`: Whether to use the delta filter.
+
+  Possible values are
+  - `:nofilter`, `"nofilter"`, 0: no filter
+  - `:delta`, `"delta"`, 1: use delta filter
+- `typesize::Integer = 8`: The element size to use when shuffling.
+
+  `typesize` must be in the range `1:$(BLOSC_MAX_TYPESIZE)`.
+- `clevel::Integer = 5`: The compression level, between 0 (no compression) and 9 (maximum compression)
+- `compressor::Union{Symbol,AbstractString} = :blosclz`: The name of the compressor to use.
+
+  For example, `"blosclz"`, `"lz4"`, `"lz4hc"`, `"zlib"`, or `"zstd"`.
+  Use `is_compressor_valid` to check if a compressor is supported.
+- `blocksize::Integer = 0`: Length of block in bytes (0 for automatic choice)
+- `nthreads::Integer = 1`: The number of threads to use, in the range `1:typemax(Int16)`
+- `splitmode::Union{Integer,Symbol,AbstractString} = 4`: Whether blocks should be split or not
+
+  Possible values are
+  - `:always`, `"always"`, 1
+  - `:never`, `"never"`, 2
+  - `:auto`, `"auto"`, 3
+  - `:forward_compat`, `"forward_compat"`, 4: default setting
+- `chunksize::Integer = 1024^3`: Chunk size for very large inputs
+"""
+struct Blosc2EncodeOptions <: EncodeOptions
+    codec::Blosc2CFrame
+
+    doshuffle::Int # :noshuffle, :shuffle, :bitshuffle
+    dodelta::Int # :nofilter, :delta
+    typesize::Int
+    clevel::Int
+    compressor::String
+    blocksize::Int
+    nthreads::Int
+    splitmode::Int # :always, :never, :auto, :forward_compat
+
+    chunksize::Int64
+end
+function Blosc2EncodeOptions(;
+                             codec::Blosc2CFrame=Blosc2CFrame(),
+                             doshuffle::Union{Integer,Symbol,AbstractString}=1,
+                             dodelta::Union{Integer,Symbol,AbstractString}=0,
+                             typesize::Integer=8,
+                             clevel::Integer=5,
+                             compressor::Union{Symbol,AbstractString}=:blosclz,
+                             blocksize::Integer=0,
+                             nthreads::Integer=1,
+                             splitmode::Union{Integer,Symbol,AbstractString}=4,
+                             chunksize::Integer=Int64(1024)^3, # 1 GByte
+                             kwargs...)
+    _doshuffle = doshuffle
+    if _doshuffle isa AbstractString
+        _doshuffle = Symbol(lowercase(_doshuffle))
+    end
+    if _doshuffle isa Symbol
+        _doshuffle = get(Dict(:noshuffle => 0,
+                              :shuffle => 1,
+                              :bitshuffle => 2), _doshuffle, -1)
+        _doshuffle >= 0 ||
+            throw(ArgumentError("Unknown `doshuffle` value `$(repr(doshuffle))`"))
+    end
+    _doshuffle::Integer
+    check_in_range(0:2; doshuffle=_doshuffle)
+
+    _dodelta = dodelta
+    if _dodelta isa AbstractString
+        _dodelta = Symbol(lowercase(_dodelta))
+    end
+    if _dodelta isa Symbol
+        _dodelta = get(Dict(:nofilter => 0,
+                            :delta => 1), _dodelta, -1)
+        _dodelta >= 0 ||
+            throw(ArgumentError("Unknown `dodelta` value `$(repr(dodelta))`"))
+    end
+    _dodelta::Integer
+    check_in_range(0:1; dodelta=_dodelta)
+
+    _typesize = typesize
+    if _typesize ∉ 1:BLOSC_MAX_TYPESIZE
+        _typesize = 8 # use default
+    end
+
+    _clevel = clamp(clevel, 0:9)
+
+    _compressor = compressor
+    if _compressor isa Symbol
+        _compressor = string(_compressor)
+    end
+    is_compressor_valid(_compressor) ||
+        throw(ArgumentError("is_compressor_valid(compressor) must hold. Got\ncompressor => $(repr(compressor))"))
+
+    _blocksize = blocksize
+    check_in_range(0:typemax(Int32); blocksize=_blocksize)
+
+    _nthreads = nthreads
+    check_in_range(1:typemax(Int16); nthreads=_nthreads) # `Blosc2CParams.nthreads` is an `Int16`
+
+    _splitmode = splitmode
+    if _splitmode isa AbstractString
+        _splitmode = Symbol(lowercase(_splitmode))
+    end
+    if _splitmode isa Symbol
+        _splitmode = get(Dict(:always => 1,
+                              :never => 2,
+                              :auto => 3,
+                              :forward_compat => 4), _splitmode, -1)
+        _splitmode >= 0 ||
+            throw(ArgumentError("Unknown `splitmode` value `$(repr(splitmode))`"))
+    end
+    _splitmode::Integer
+    check_in_range(1:4; splitmode=_splitmode)
+
+    _chunksize = clamp(chunksize, 1024, Int64(1024)^3) # at least 1 kByte, at most 1 GByte
+
+    return Blosc2EncodeOptions(codec, _doshuffle, _dodelta, _typesize, _clevel, _compressor, _blocksize, _nthreads, _splitmode, _chunksize)
+end
+
+# The maximum overhead for the schunk, excluding the per-chunk costs
+const MAX_SCHUNK_OVERHEAD = 172 # apparently undocumented -- just a guess
+
+# We just punt with the upper bound. typemax(Int64) is a huge number anyway.
+decoded_size_range(e::Blosc2EncodeOptions) = Int64(0):Int64(e.typesize):(typemax(Int64) ÷ 2)
+
+function encode_bound(e::Blosc2EncodeOptions, src_size::Int64)::Int64
+    # Per chunk: its header plus, presumably, one `Int64` entry in the frame's chunk-offset index -- TODO confirm
+    per_chunk_overhead = BLOSC2_MAX_OVERHEAD + sizeof(Int64)
+    return clamp(widen(src_size) + cld(src_size, e.chunksize) * per_chunk_overhead + MAX_SCHUNK_OVERHEAD, Int64)
+end
+
+function try_encode!(e::Blosc2EncodeOptions, dst::AbstractVector{UInt8}, src::AbstractVector{UInt8};
+                     kwargs...)::Union{Nothing,Int64}
+    check_contiguous(dst)
+    check_contiguous(src)
+    src_size::Int64 = length(src)
+    check_in_range(decoded_size_range(e); src_size)
+
+    blosc2_init()
+
+    ccode = compcode(e.compressor)
+
+    # Create a super-chunk container
+    cparams = Blosc2CParams()
+    @reset cparams.typesize = e.typesize
+    @reset cparams.compcode = ccode
+    @reset cparams.clevel = e.clevel
+    @reset cparams.nthreads = e.nthreads
+    @reset cparams.blocksize = e.blocksize
+    @reset cparams.splitmode = e.splitmode
+    @reset cparams.filters[BLOSC2_MAX_FILTERS] = e.doshuffle
+    if e.dodelta > 0
+        # `e.dodelta` is only a 0/1 switch; the filter slot needs the filter code itself
+        @reset cparams.filters[BLOSC2_MAX_FILTERS-1] = BLOSC_DELTA
+    end
+    cparams_obj = [cparams]
+
+    io = Blosc2IO()
+    io_obj = [io]
+
+    storage = Blosc2Storage()
+    @reset storage.cparams = pointer(cparams_obj)
+    @reset storage.io = pointer(io_obj)
+    storage_obj = [storage]
+
+    GC.@preserve cparams_obj io_obj storage_obj dst begin
+        schunk = @ccall libblosc2.blosc2_schunk_new(storage_obj::Ptr{Blosc2Storage})::Ptr{Blosc2SChunk}
+        if schunk == C_NULL
+            # Allocation failure
+            return nothing
+        end
+
+        # Break input into chunks
+        for pos in 1:e.chunksize:src_size
+            endpos = min(src_size, pos + e.chunksize - 1)
+            srcview = @view src[pos:endpos]
+            nbytes = length(srcview)
+            nchunks = @ccall libblosc2.blosc2_schunk_append_buffer(schunk::Ptr{Blosc2SChunk}, srcview::Ptr{Cvoid},
+                                                                   nbytes::Int32)::Int64
+            if nchunks < 0
+                # Internal error in libblosc2, possibly due to invalid input
+                @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint
+                return nothing
+            end
+            if nchunks != (pos-1) ÷ e.chunksize + 1
+                # Our accounting went wrong, probably an internal error in libblosc2, possibly due to invalid input
+                @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint
+                return nothing
+            end
+        end
+
+        cframe = Ref{Ptr{UInt8}}()
+        needs_free = Ref{UInt8}() # bool
+        compressed_size = @ccall libblosc2.blosc2_schunk_to_buffer(schunk::Ptr{Blosc2SChunk}, cframe::Ref{Ptr{UInt8}},
+                                                                   needs_free::Ref{UInt8})::Int64
+        if compressed_size < 0
+            # Internal error in libblosc2, possibly due to invalid input
+            @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint
+            return nothing
+        end
+        cframe = cframe[]
+        needs_free = needs_free[] != 0
+
+        if compressed_size <= length(dst)
+            # We should try to encode directly into `dst`. (This may
+            # not be possible with the Blosc2 API.)
+            unsafe_copyto!(pointer(dst), cframe, compressed_size)
+        else
+            # Insufficient space to store compressed data.
+            # We should detect this earlier, already in the loop above.
+            needs_free && Libc.free(cframe)
+            @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint
+            return nothing
+        end
+
+        if needs_free
+            Libc.free(cframe)
+        end
+
+        success = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint
+        if success != 0
+            # Internal error in libblosc2, possibly due to invalid input
+            return nothing
+        end
+    end
+
+    return compressed_size::Int64
+end
diff --git a/LibBlosc2/src/libblosc2.jl b/LibBlosc2/src/libblosc2.jl
new file mode 100644
index 0000000..44ab872
--- /dev/null
+++ b/LibBlosc2/src/libblosc2.jl
@@ -0,0 +1,328 @@
+# Constants and C wrapper functions ported to Julia from blosc2.h https://github.com/Blosc/c-blosc2/blob/5fcd6fbf9ffcf613fabdb1eb3a90eeb12f7c04fe/include/blosc2.h
+
+################################################################################
+# Constants
+
+# [175]
+# Extended header length (Blosc2, see README_HEADER)
+const BLOSC_EXTENDED_HEADER_LENGTH = 32
+const BLOSC2_MAX_OVERHEAD = BLOSC_EXTENDED_HEADER_LENGTH
+const BLOSC_MAX_TYPESIZE = Int(typemax(UInt8))
+
+# [222]
+const BLOSC2_MAX_FILTERS = 6
+
+# [242] Codes for filters.
+# No shuffle (for compatibility with Blosc1).
+const BLOSC_NOSHUFFLE = 0
+# No filter.
+const BLOSC_NOFILTER = 0
+# Byte-wise shuffle. `filters_meta` does not have any effect here.
+const BLOSC_SHUFFLE = 1
+# Bit-wise shuffle. `filters_meta` does not have any effect here.
+const BLOSC_BITSHUFFLE = 2
+# Delta filter. `filters_meta` does not have any effect here.
+const BLOSC_DELTA = 3
+# Truncate mantissa precision.
+# Positive values in `filters_meta` will keep bits; negative values will zero bits.
+const BLOSC_TRUNC_PREC = 4
+const BLOSC_LAST_FILTER = 5
+
+# [314] Codes for the different compressors shipped with Blosc
+const BLOSC_BLOSCLZ = 0
+const BLOSC_LZ4 = 1
+const BLOSC_LZ4HC = 2
+const BLOSC_ZLIB = 4
+const BLOSC_ZSTD = 5
+const BLOSC_LAST_CODEC = 6
+
+# [396] Split mode for blocks.
+const BLOSC_ALWAYS_SPLIT = 1
+const BLOSC_NEVER_SPLIT = 2
+const BLOSC_AUTO_SPLIT = 3
+const BLOSC_FORWARD_COMPAT_SPLIT = 4
+
+# [1641]
+const BLOSC2_MAX_METALAYERS = 16
+const BLOSC2_MAX_VLMETALAYERS = 8 * 1024
+
+################################################################################
+# Types
+
+"""
+    struct Blosc2CParams
+
+The parameters for creating a context for compression purposes.
+"""
+struct Blosc2CParams
+    # The compressor codec.
+    compcode::UInt8
+    # The metadata for the compressor codec.
+    compcode_meta::UInt8
+    # The compression level (5).
+    clevel::UInt8
+    # Use dicts or not when compressing (only for ZSTD).
+    use_dict::Cint
+    # The type size (8).
+    typesize::Int32
+    # The number of threads to use internally (1).
+    nthreads::Int16
+    # The requested size of the compressed blocks (0 means automatic).
+    blocksize::Int32
+    # Whether the blocks should be split or not.
+    splitmode::Int32
+    # The associated schunk, if any (NULL).
+    schunk::Ptr{Cvoid}
+    # The (sequence of) filters.
+    filters::NTuple{BLOSC2_MAX_FILTERS,UInt8}
+    # The metadata for filters.
+    filters_meta::NTuple{BLOSC2_MAX_FILTERS,UInt8}
+    # The prefilter function.
+    prefilter::Ptr{Cvoid} # blosc2_prefilter_fn
+    # The prefilter parameters.
+    preparams::Ptr{Cvoid} # blosc2_prefilter_params*
+    # Tune configuration.
+    tuner_params::Ptr{Cvoid}
+    # The tuner id.
+    tuner_id::Cint
+    # Whether the codec is instrumented or not
+    instr_codec::UInt8 # bool
+    # User defined parameters for the codec
+    codec_params::Ptr{Cvoid}
+    # User defined parameters for the filters
+    filter_params::NTuple{BLOSC2_MAX_FILTERS,Ptr{Cvoid}}
+end
+Blosc2CParams() = @ccall libblosc2.blosc2_get_blosc2_cparams_defaults()::Blosc2CParams
+
+"""
+    struct Blosc2DParams
+
+The parameters for creating a context for decompression purposes.
+"""
+struct Blosc2DParams
+    # The number of threads to use internally (1).
+    nthreads::Int16
+    # The associated schunk, if any (NULL).
+    schunk::Ptr{Cvoid}
+    # The postfilter function.
+    postfilter::Ptr{Cvoid} # blosc2_postfilter_fn
+    # The postfilter parameters.
+    postparams::Ptr{Cvoid} # blosc2_postfilter_params*
+end
+Blosc2DParams() = @ccall libblosc2.blosc2_get_blosc2_dparams_defaults()::Blosc2DParams
+
+"""
+    struct Blosc2IO
+
+Input/Output parameters.
+"""
+struct Blosc2IO
+    id::UInt8
+    # The IO identifier.
+    name::Cstring
+    # The IO parameters.
+    params::Ptr{Cvoid}
+end
+Blosc2IO() = @ccall libblosc2.blosc2_get_blosc2_io_defaults()::Blosc2IO
+
+"""
+    struct Blosc2Storage
+
+This struct is meant for holding storage parameters for a
+blosc2 container, allowing to specify, for example, how to interpret
+the contents included in the schunk.
+"""
+struct Blosc2Storage
+    # Whether the chunks are contiguous or sparse.
+    contiguous::UInt8 # bool
+    # The path for persistent storage. If NULL, that means in-memory.
+    urlpath::Cstring
+    # The compression params when creating a schunk.
+    # If NULL, sensible defaults are used depending on the context.
+    cparams::Ptr{Blosc2CParams}
+    # The decompression params when creating a schunk.
+    # If NULL, sensible defaults are used depending on the context.
+    dparams::Ptr{Blosc2DParams}
+    # Input/output backend.
+    io::Ptr{Blosc2IO}
+end
+Blosc2Storage() = @ccall libblosc2.blosc2_get_blosc2_storage_defaults()::Blosc2Storage
+
+struct Blosc2Metalayer
+    # The metalayer identifier for Blosc client (e.g. Blosc2 NDim).
+    name::Cstring
+    # The serialized (msgpack preferably) content of the metalayer.
+    content::Ptr{UInt8}
+    # The length in bytes of the content.
+    content_len::Int32
+end
+
+"""
+    struct Blosc2SChunk
+
+This struct is the standard container for Blosc 2 compressed data.
+"""
+struct Blosc2SChunk
+    version::UInt8
+    # The default compressor. Each chunk can override this.
+    compcode::UInt8
+    # The default compressor metadata. Each chunk can override this.
+    compcode_meta::UInt8
+    # The compression level and other compress params.
+    clevel::UInt8
+    # The split mode.
+    splitmode::UInt8
+    # The type size.
+    typesize::Int32
+    # The requested size of the compressed blocks (0; meaning automatic).
+    blocksize::Int32
+    # Size of each chunk. 0 if not a fixed chunksize.
+    chunksize::Int32
+    # The (sequence of) filters. 8-bit per filter.
+    filters::NTuple{BLOSC2_MAX_FILTERS,UInt8}
+    # Metadata for filters. 8-bit per meta-slot.
+    filters_meta::NTuple{BLOSC2_MAX_FILTERS,UInt8}
+    # Number of chunks in super-chunk.
+    nchunks::Int64
+    # The current chunk that is being accessed
+    current_nchunk::Int64
+    # The data size (uncompressed).
+    nbytes::Int64
+    # The data size + chunks header size (compressed).
+    cbytes::Int64
+    # Pointer to chunk data pointers buffer.
+    data::Ptr{Ptr{UInt8}}
+    # Length of the chunk data pointers buffer.
+    data_len::Csize_t
+    # Pointer to storage info.
+    storage::Ptr{Blosc2Storage}
+    # Pointer to frame used as store for chunks.
+    frame::Ptr{Cvoid} # blosc2_frame*
+    # Context for the thread holder. NULL if not acquired.
+    # ctx::Ptr{UInt8}
+    # Context for compression
+    cctx::Ptr{Cvoid} # blosc2_context*
+    # Context for decompression.
+    dctx::Ptr{Cvoid} # blosc2_context*
+    # The array of metalayers.
+    metalayers::NTuple{BLOSC2_MAX_METALAYERS,Ptr{Blosc2Metalayer}}
+    # The number of metalayers in the super-chunk
+    nmetalayers::UInt16
+    # The array of variable-length metalayers.
+    vlmetalayers::NTuple{BLOSC2_MAX_VLMETALAYERS,Ptr{Blosc2Metalayer}}
+    # The number of variable-length metalayers.
+    nvlmetalayers::Int16
+    # Tune configuration.
+    tuner_params::Ptr{Cvoid}
+    # Id for tuner
+    tuner_id::Cint
+    # The ndim (mainly for ZFP usage)
+    ndim::Int8
+    # The blockshape (mainly for ZFP usage)
+    blockshape::Ptr{Int64}
+end
+
+################################################################################
+# Functions
+
+const blosc2_initialized = Atomic{Bool}(false)
+const blosc2_initialized_lock = ReentrantLock()
+# Initialize the Blosc2 library. This function is reentrant and
+# idempotent, i.e. it can be called multiple times without
+# harm.
+function blosc2_init()
+    blosc2_initialized[] && return
+    @lock blosc2_initialized_lock begin
+        blosc2_initialized[] && return
+        @ccall libblosc2.blosc2_init()::Cvoid
+        blosc2_initialized[] = true
+    end
+    return
+end
+
+"""
+    is_compressor_valid(s::AbstractString)::Bool
+
+Check if a compressor name is valid.
+"""
+function is_compressor_valid(s::AbstractString)
+    '\0' ∈ s && return false
+    code = @ccall libblosc2.blosc2_compname_to_compcode(s::Cstring)::Cint
+    return code >= 0
+end
+
+"""
+    compcode(s::AbstractString)::Int
+
+Return a nonnegative integer code used internally by Blosc to identify the compressor.
+Throws an `ArgumentError` if `s` is not the name of a supported algorithm.
+"""
+function compcode(s::AbstractString)
+    code = @ccall libblosc2.blosc2_compname_to_compcode(s::Cstring)::Cint
+    code < 0 && throw(ArgumentError("unrecognized compressor $(repr(s))"))
+    return Int(code)
+end
+
+"""
+    compname(compcode::Integer)::String
+
+Return the compressor name corresponding to the internal integer code used by Blosc.
+Throws an `ArgumentError` if `compcode` is not a valid code.
+"""
+function compname(compcode::Integer)
+    name = Ref{Ptr{UInt8}}(C_NULL)
+    code = @ccall libblosc2.blosc2_compcode_to_compname(compcode::Cint, name::Ref{Ptr{UInt8}})::Cint
+    # Reject any negative status, and never hand a NULL name to `unsafe_string`
+    (code < 0 || name[] == C_NULL) && throw(ArgumentError("unrecognized compcode $compcode"))
+    return unsafe_string(name[])
+end
+
+################################################################################
+
+# The following is the original license info from blosc2.h and LICENSE.txt
+
+#=
+/*********************************************************************
+  Blosc - Blocked Shuffling and Compression Library
+
+  Copyright (c) 2021 Blosc Development Team
+  https://blosc.org
+  License: BSD 3-Clause (see LICENSE.txt)
+
+  See LICENSE.txt for details about copyright and rights to use.
+**********************************************************************/ +=# + +#= contents of LICENSE.txt +BSD License + +For Blosc - A blocking, shuffling and lossless compression library + +Copyright (c) 2009-2018 Francesc Alted +Copyright (c) 2019-present Blosc Development Team + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name Francesc Alted nor the names of its contributors may be used + to endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+=#
diff --git a/LibBlosc2/test/Project.toml b/LibBlosc2/test/Project.toml
new file mode 100644
index 0000000..d98123d
--- /dev/null
+++ b/LibBlosc2/test/Project.toml
@@ -0,0 +1,7 @@
+[deps]
+Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+ChunkCodecCore = "0b6fb165-00bc-4d37-ab8b-79f91016dbe1"
+ChunkCodecLibBlosc2 = "59b5581c-e2bc-42b3-a6f1-80e88eec7b70"
+ChunkCodecTests = "06b1ce50-b741-4199-b118-ba5fe1a70fa7"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/LibBlosc2/test/runtests.jl b/LibBlosc2/test/runtests.jl
new file mode 100644
index 0000000..51d01fa
--- /dev/null
+++ b/LibBlosc2/test/runtests.jl
@@ -0,0 +1,124 @@
+using Random: Random
+using ChunkCodecLibBlosc2:
+    ChunkCodecLibBlosc2,
+    Blosc2CFrame,
+    Blosc2EncodeOptions,
+    Blosc2DecodeOptions,
+    Blosc2DecodingError
+using ChunkCodecCore: decode, encode
+using ChunkCodecTests: test_codec
+using Test: @testset, @test_throws, @test
+using Aqua: Aqua
+
+Aqua.test_all(ChunkCodecLibBlosc2; persistent_tasks=false)
+
+Random.seed!(1234)
+
+@testset "default" begin
+    test_codec(Blosc2CFrame(), Blosc2EncodeOptions(), Blosc2DecodeOptions(); trials=100)
+end
+@testset "typesize" begin
+    for i in 1:50
+        test_codec(Blosc2CFrame(), Blosc2EncodeOptions(; typesize=i), Blosc2DecodeOptions(); trials=10)
+    end
+end
+@testset "compressors" begin
+    for clevel in 0:9
+        for compressor in ["blosclz", "lz4", "lz4hc", "zlib", "zstd"]
+            test_codec(Blosc2CFrame(), Blosc2EncodeOptions(; compressor, clevel), Blosc2DecodeOptions(); trials=10)
+        end
+    end
+end
+@testset "large inputs" begin
+    # We cannot really test large inputs (multi-Gigabyte) in a regular test.
+    # We therefore simulate this with smaller inputs and a ridiculously small chunk size.
+    u = reinterpret(UInt8, collect(float(1:(10 ^ 6))))
+    e = Blosc2EncodeOptions(; clevel=9, doshuffle=2, typesize=sizeof(float(1)), chunksize=10^4, compressor="zstd")
+    c = encode(e, u)
+    u′ = decode(Blosc2DecodeOptions(), c)
+    @test u′ == u
+end
+@testset "invalid options" begin
+    @test Blosc2EncodeOptions(; clevel=-1).clevel == 0
+    @test Blosc2EncodeOptions(; clevel=100).clevel == 9
+    # typesize can be anything, but out of the range it gets set to 8 (the default)
+    e = Blosc2EncodeOptions(; typesize=typemax(UInt128))
+    @test e.typesize == 8
+    e = Blosc2EncodeOptions(; typesize=0)
+    @test e.typesize == 8
+    e = Blosc2EncodeOptions(; typesize=-1)
+    @test e.typesize == 8
+    e = Blosc2EncodeOptions(; typesize=ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE)
+    @test e.typesize == ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE
+    e = Blosc2EncodeOptions(; typesize=(ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE+1))
+    @test e.typesize == 8
+    @test_throws ArgumentError Blosc2EncodeOptions(; compressor="")
+    @test_throws ArgumentError Blosc2EncodeOptions(; compressor="asfdgfsdgrwwea")
+    @test_throws ArgumentError Blosc2EncodeOptions(; compressor="blosclz,")
+    @test_throws ArgumentError Blosc2EncodeOptions(; compressor="blosclz\0")
+end
+@testset "symbolic options" begin
+    @test Blosc2EncodeOptions(; doshuffle=:bitshuffle).doshuffle == 2
+    @test Blosc2EncodeOptions(; doshuffle="NoShuffle").doshuffle == 0
+    @test Blosc2EncodeOptions(; dodelta=:delta).dodelta == 1
+    @test Blosc2EncodeOptions(; splitmode="auto").splitmode == 3
+    @test Blosc2EncodeOptions(; compressor=:zstd).compressor == "zstd"
+    @test_throws ArgumentError Blosc2EncodeOptions(; doshuffle=:bogus)
+    @test_throws ArgumentError Blosc2EncodeOptions(; dodelta="bogus")
+    @test_throws ArgumentError Blosc2EncodeOptions(; splitmode=:bogus)
+end
+@testset "compcode and compname" begin
+    @test ChunkCodecLibBlosc2.compcode("blosclz") == 0
+    @test ChunkCodecLibBlosc2.is_compressor_valid("blosclz")
+    @test ChunkCodecLibBlosc2.compname(0) == "blosclz"
+
+    @test_throws ArgumentError ChunkCodecLibBlosc2.compcode("sdaffads")
+    @test !ChunkCodecLibBlosc2.is_compressor_valid("sdaffads")
+    @test_throws ArgumentError ChunkCodecLibBlosc2.compname(-1)
+    @test_throws ArgumentError ChunkCodecLibBlosc2.compname(100)
+
+    @test !ChunkCodecLibBlosc2.is_compressor_valid("\0")
+end
+@testset "errors" begin
+    # check Blosc2DecodingError prints the correct error message
+    @test sprint(Base.showerror, Blosc2DecodingError(0)) == "Blosc2DecodingError: blosc2 compressed buffer cannot be decoded, error code: 0"
+    # check that a truncated buffer throws a Blosc2DecodingError
+    u = zeros(UInt8, 8)
+    c = encode(Blosc2EncodeOptions(), u)
+    @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c[1:(end - 1)])
+    @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), UInt8[0x00])
+    # check that a buffer with extra data throws a Blosc2DecodingError
+    @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), [c; 0x00;])
+    # check that corrupting the compressed data throws a Blosc2DecodingError
+    u = zeros(UInt8, 1000)
+    c = encode(Blosc2EncodeOptions(), u)
+
+    c[end-5] = 0x40
+    # Blosc2 does not detect this corruption. (Apparently it stores
+    # unused and unchecked data in the trailer near the end of the
+    # compressed data.) We check whether at least the decompressed
+    # data are correct.
+    # BROKEN @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c)
+    @test decode(Blosc2DecodeOptions(), c) == u
+
+    # There's more unused/unchecked data
+    c[end-50] = 0x40
+    # BROKEN @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c)
+    @test decode(Blosc2DecodeOptions(), c) == u
+
+    # Finally, this corruption has an effect
+    c[end-100] = 0x40
+    # Windows segfaults in this call with exit code 3221226356,
+    # indicating a heap corruption. That's clearly a bug in c-blosc2.
+    # It seems c-blosc2 does not checksum its compressed data.
+    if !Sys.iswindows()
+        @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c)
+    end
+end
+@testset "public" begin
+    if VERSION >= v"1.11.0-DEV.469"
+        for sym in (:is_compressor_valid, :compcode, :compname)
+            @test Base.ispublic(ChunkCodecLibBlosc2, sym)
+        end
+    end
+end
diff --git a/Project.toml b/Project.toml
index cd45adb..c00a838 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,6 +3,7 @@ projects = [
     "ChunkCodecCore",
     "ChunkCodecTests",
     "LibBlosc",
+    "LibBlosc2",
     "LibBrotli",
     "LibBzip2",
     "LibLz4",