Skip to content

Commit b52be51

Browse files
authored
Add more filters (#154)
* Document the Filter interface * Move filters to a folder Same rationale as the other changes :D - just for cleanliness and clarity. * Factor out variable-length filters to a new file * Add docstrings to filter API functions * Add a Fletcher32 filter and test * re-add the dictionary entries for the vlen filters * Semi-working fixed scale offset filter * Add FixedScaleOffset tests * Add shuffle filter (buggy in the last few bytes, indexing issues) * WIP quantize filter * ShuffleFilter working and tested * Semi working quantize filter * Format tests better * Complete interface and test quantize * Uncomment the FixedScaleOffset tests * fix getfilter syntax * Add delta filter * Adapt for Kerchunk playing fast and loose with the spec - Kerchunk often encodes the compressor as the last filter, so we check that the compressor isn't hiding in the filters array if the compressor is null. - Similarly, the dtype is often unknown in this case, or the transform is not encoded correctly, so we ensure that the datatypes of `data` and `a2` remain the same by reinterpreting. * Fix the delta and quantize JSON.lower * Change the tests to be more sensible/Julian and avoid truncation errors * Fix the FixedScaleOffset filter materializer * Fix decoding for fill values to use `reinterpret` on unsigned -> integer * If `getfilter` fails, show the filter name and then throw an error * Apply reinterpret before multiplication in fixed-scale-offset filter * Only reinterpret negative integers when decoding fill values to unsigned * Revert "Only reinterpret negative integers when decoding fill values to unsigned" This reverts commit 24a68e6. 
* let Fletcher32 operate on n-dimensional arrays not just vectors, as it was previously constrained to * fix FixedScaleOffset in many ways - Never use reinterpret - use array comprehensions to support 0-dimensional arrays correctly, the performance impact is negligible based on testing - only round if the target type is an integer, otherwise let it be if it's a float. * add filter tests in Python * Fix filter astype, id to conform to Python names * remove encoding validity check for quantize - it's pointless
1 parent 2ae3c2a commit b52be51

File tree

12 files changed

+640
-42
lines changed

12 files changed

+640
-42
lines changed

src/Filters/Filters.jl

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import JSON
2+
3+
"""
4+
abstract type Filter{T,TENC}
5+
6+
The supertype for all Zarr filters.
7+
8+
## Interface
9+
10+
All subtypes MUST implement the following methods:
11+
12+
- [`zencode(ain, filter::Filter)`](@ref zencode): Encodes data `ain` using the filter, and returns a vector of bytes.
13+
- [`zdecode(ain, filter::Filter)`](@ref zdecode): Decodes data `ain`, a vector of bytes, using the filter, and returns the original data.
14+
- [`JSON.lower`](@ref): Returns a JSON-serializable dictionary representing the filter, according to the Zarr specification.
15+
- [`getfilter(::Type{<: Filter}, filterdict)`](@ref getfilter): Returns the filter type read from a given filter dictionary.
16+
17+
If the filter has type parameters, it MUST also implement:
18+
- [`sourcetype(::Filter)::T`](@ref sourcetype): equivalent to `dtype` in the Python Zarr implementation.
19+
- [`desttype(::Filter)::T`](@ref desttype): equivalent to `atype` in the Python Zarr implementation.
20+
21+
Finally, an entry MUST be added to the `filterdict` dictionary for each filter type.
22+
This must also follow the Zarr specification's name for that filter. The name of the filter
23+
is the key, and the value is the filter type (e.g. `VLenUInt8Filter` or `Fletcher32Filter`).
24+
25+
26+
Subtypes include: [`VLenArrayFilter`](@ref), [`VLenUTF8Filter`](@ref), [`Fletcher32Filter`](@ref).
27+
"""
28+
abstract type Filter{T,TENC} end
29+
30+
"""
31+
zencode(ain, filter::Filter)
32+
33+
Encodes data `ain` using the filter, and returns a vector of bytes.
34+
"""
35+
function zencode end
36+
37+
"""
38+
zdecode(ain, filter::Filter)
39+
40+
Decodes data `ain`, a vector of bytes, using the filter, and returns the original data.
41+
"""
42+
function zdecode end
43+
44+
"""
45+
getfilter(::Type{<: Filter}, filterdict)
46+
47+
Returns the filter type read from a given specification dictionary, which must follow the Zarr specification.
48+
"""
49+
function getfilter end
50+
51+
"""
52+
sourcetype(::Filter)::T
53+
54+
Returns the source type of the filter.
55+
"""
56+
function sourcetype end
57+
58+
"""
59+
desttype(::Filter)::T
60+
61+
Returns the destination type of the filter.
62+
"""
63+
function desttype end
64+
65+
# Global registry mapping Zarr spec filter names (e.g. "fletcher32") to filter
# types. `const` so the global binding is concretely typed (a non-const global
# is `Any`-typed and defeats inference); the Dict contents stay mutable so each
# filter file can register itself.
const filterdict = Dict{String,Type{<:Filter}}()
66+
67+
"""
    getfilters(d::Dict)

Materialize the filters listed under the `"filters"` key of the metadata
dictionary `d`, via [`getfilter`](@ref) and the `filterdict` registry.

Returns a tuple of filter instances, or `nothing` when no filters are present
(key absent, value `null`, or an empty list).
"""
function getfilters(d::Dict)
    specs = get(d, "filters", nothing)
    # Absent, null, and empty filter lists are all treated as "no filters".
    (specs === nothing || isempty(specs)) && return nothing
    filters = map(specs) do spec
        try
            getfilter(filterdict[spec["id"]], spec)
        catch
            # Log which filter spec failed (previously a bare `@show`),
            # then propagate; bare `rethrow()` preserves the backtrace.
            @error "Failed to construct filter from specification" spec
            rethrow()
        end
    end
    return (filters...,)
end
85+
# Accessors extracting a filter's type parameters (documented on the generic
# `sourcetype`/`desttype` functions).
function sourcetype(::Filter{T}) where {T}
    return T
end

function desttype(::Filter{<:Any,TENC}) where {TENC}
    return TENC
end
87+
88+
# A `nothing` filter means "no filter configured": encoding is the identity.
function zencode(ain, ::Nothing)
    return ain
end
89+
90+
include("vlenfilters.jl")
91+
include("fletcher32.jl")
92+
include("fixedscaleoffset.jl")
93+
include("shuffle.jl")
94+
include("quantize.jl")
95+
include("delta.jl")

src/Filters/delta.jl

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#=
2+
# Delta compression
3+
4+
5+
=#
6+
7+
"""
8+
DeltaFilter(; DecodingType, [EncodingType = DecodingType])
9+
10+
Delta-based compression for Zarr arrays. (Delta encoding is Julia `diff`, decoding is Julia `cumsum`).
11+
"""
12+
struct DeltaFilter{T, TENC} <: Filter{T, TENC}
13+
end
14+
15+
function DeltaFilter(; DecodingType = Float16, EncodingType = DecodingType)
16+
return DeltaFilter{DecodingType, EncodingType}()
17+
end
18+
19+
DeltaFilter{T}() where T = DeltaFilter{T, T}()
20+
21+
"""
    zencode(data::AbstractArray, filter::DeltaFilter)

Delta-encode `data`: the first element is stored verbatim, each following
element as its difference from its predecessor (cf. `diff`).
"""
function zencode(data::AbstractArray, filter::DeltaFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType}
    arr = reinterpret(DecodingType, vec(data))
    enc = similar(arr, EncodingType)
    # Bug fix: `enc[begin]` threw a BoundsError for empty input; an empty
    # array simply delta-encodes to an empty array.
    isempty(arr) && return enc
    enc[begin] = arr[begin]
    enc[begin+1:end] .= diff(arr)
    return enc
end
30+
31+
function zdecode(data::AbstractArray, filter::DeltaFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType}
    deltas = reinterpret(EncodingType, vec(data))
    # A running sum inverts the diff; convert back to the decoded element type.
    return DecodingType.(cumsum(deltas))
end
36+
37+
function JSON.lower(filter::DeltaFilter{T, Tenc}) where {T, Tenc}
    # Serialize following the numcodecs "delta" codec layout.
    return Dict(
        "id" => "delta",
        "dtype" => typestr(T),
        "astype" => typestr(Tenc),
    )
end
40+
41+
function getfilter(::Type{<: DeltaFilter}, d)
    # Bug fix: a misplaced parenthesis previously passed `astype` as a second
    # argument to `typestr` (`typestr(d["dtype"], ...)`) instead of supplying
    # it as the encoding type parameter, so the encoding type was lost.
    # This mirrors the (correct) FixedScaleOffset materializer.
    T = typestr(d["dtype"])
    Tenc = haskey(d, "astype") ? typestr(d["astype"]) : T
    return DeltaFilter{T, Tenc}()
end

filterdict["delta"] = DeltaFilter

src/Filters/fixedscaleoffset.jl

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
2+
"""
3+
FixedScaleOffsetFilter{T,TENC}(scale, offset)
4+
5+
A compressor that scales and offsets the data.
6+
7+
!!! note
8+
The geographic CF standards define scale/offset decoding as `x * scale + offset`,
9+
but this filter defines it as `x / scale + offset`. Constructing a `FixedScaleOffsetFilter`
10+
from CF data means `FixedScaleOffsetFilter(1/cf_scale_factor, cf_add_offset)`.
11+
"""
12+
struct FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc} <: Filter{T, Tenc}
13+
scale::ScaleOffsetType
14+
offset::ScaleOffsetType
15+
end
16+
17+
FixedScaleOffsetFilter{T}(scale::ScaleOffsetType, offset::ScaleOffsetType) where {T, ScaleOffsetType} = FixedScaleOffsetFilter{T, ScaleOffsetType}(scale, offset)
18+
FixedScaleOffsetFilter(scale::ScaleOffsetType, offset::ScaleOffsetType) where {ScaleOffsetType} = FixedScaleOffsetFilter{ScaleOffsetType, ScaleOffsetType}(scale, offset)
19+
20+
function FixedScaleOffsetFilter(; scale::ScaleOffsetType, offset::ScaleOffsetType, T, Tenc = T) where ScaleOffsetType
21+
return FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}(scale, offset)
22+
end
23+
24+
function zencode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType}
    # Shift then scale each element. Comprehensions (rather than broadcast)
    # keep 0-dimensional arrays working correctly.
    transform(x) = (x - c.offset) * c.scale
    if Tenc <: Integer
        # Integer targets are rounded to the nearest representable value.
        return [round(Tenc, transform(x)) for x in a]
    else
        return [convert(Tenc, transform(x)) for x in a]
    end
end
31+
32+
function zdecode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType}
    # Invert the encoding; `nonmissingtype` strips a potential `Missing` from T.
    return [convert(Base.nonmissingtype(T), (x / c.scale) + c.offset) for x in a]
end
35+
36+
37+
function getfilter(::Type{<: FixedScaleOffsetFilter}, d::Dict)
    scale = d["scale"]
    offset = d["offset"]
    # The dtype strings from the JSON metadata are mapped to Julia types;
    # "astype" is optional and defaults to the decoded dtype.
    decoded_t = typestr(d["dtype"])
    encoded_t = typestr(get(d, "astype", d["dtype"]))
    return FixedScaleOffsetFilter{encoded_t, decoded_t, encoded_t}(scale, offset)
end
47+
48+
function JSON.lower(c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {ScaleOffsetType, T, Tenc}
    # Serialize following the numcodecs "fixedscaleoffset" codec layout.
    return Dict(
        "id" => "fixedscaleoffset",
        "scale" => c.scale,
        "offset" => c.offset,
        "dtype" => typestr(T),
        "astype" => typestr(Tenc),
    )
end

filterdict["fixedscaleoffset"] = FixedScaleOffsetFilter

src/Filters/fletcher32.jl

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#=
2+
# Fletcher32 filter
3+
4+
This "filter" basically injects a 4-byte checksum at the end of the data, to ensure data integrity.
5+
6+
The implementation is based on the [numcodecs implementation here](https://github.com/zarr-developers/numcodecs/blob/79d1a8d4f9c89d3513836aba0758e0d2a2a1cfaf/numcodecs/fletcher32.pyx)
7+
and the [original C implementation for NetCDF](https://github.com/Unidata/netcdf-c/blob/main/plugins/H5checksum.c#L109) linked therein.
8+
9+
=#
10+
11+
"""
12+
Fletcher32Filter()
13+
14+
A compressor that uses the Fletcher32 checksum algorithm to compress and uncompress data.
15+
16+
Note that this goes from UInt8 to UInt8, and is effectively only checking
17+
the checksum and cropping the last 4 bytes of the data during decoding.
18+
"""
19+
struct Fletcher32Filter <: Filter{UInt8, UInt8}
20+
end
21+
22+
getfilter(::Type{<: Fletcher32Filter}, d::Dict) = Fletcher32Filter()
23+
JSON.lower(::Fletcher32Filter) = Dict("id" => "fletcher32")
24+
filterdict["fletcher32"] = Fletcher32Filter
25+
26+
"""
    _checksum_fletcher32(data::AbstractArray{UInt8})

Compute the Fletcher-32 checksum of `data`, following the HDF5
`H5_checksum_fletcher32` reference implementation: bytes are consumed in pairs
as big-endian 16-bit words, and for odd lengths the final remaining byte is
folded in as `byte << 8`.
"""
function _checksum_fletcher32(data::AbstractArray{UInt8})
    len = length(data) ÷ 2 # length in 16-bit words
    sum1::UInt32 = 0
    sum2::UInt32 = 0
    data_idx = 1

    #=
    Accumulate the checksum over pairs of bytes.
    The magic `360` is the largest number of word sums that can be accumulated
    without overflowing UInt32; after each chunk the carries are folded back
    into the low 16 bits of each partial sum.
    =#
    while len > 0
        tlen = min(len, 360)
        len -= tlen
        for _ in 1:tlen
            # Build a big-endian 16-bit word from two consecutive bytes.
            sum1 += (UInt16(data[data_idx]) << 8) | UInt16(data[data_idx + 1])
            sum2 += sum1
            data_idx += 2
        end
        sum1 = (sum1 & 0xffff) + (sum1 >> 16)
        sum2 = (sum2 & 0xffff) + (sum2 >> 16)
    end

    # Odd number of bytes: the reference C implementation folds in the one
    # remaining (i.e. *last*) byte, shifted into the high half of a word.
    # Bug fix: this previously read `data[1]` (the first byte), which produced
    # checksums incompatible with HDF5/numcodecs for odd lengths > 1.
    if length(data) % 2 == 1
        sum1 += UInt16(data[end]) << 8
        sum2 += sum1
        sum1 = (sum1 & 0xffff) + (sum1 >> 16)
        sum2 = (sum2 & 0xffff) + (sum2 >> 16)
    end
    return (sum2 << 16) | sum1
end
63+
64+
function zencode(data, ::Fletcher32Filter)
    # Append the 4-byte Fletcher-32 checksum of the raw bytes to the payload.
    payload = reinterpret(UInt8, vec(data))
    checksum = _checksum_fletcher32(payload)
    # TODO: write the checksum bytes without the extra Array allocation
    return vcat(Vector{UInt8}(payload), reinterpret(UInt8, UInt32[checksum]))
end
71+
72+
function zdecode(data, ::Fletcher32Filter)
    bytes = reinterpret(UInt8, data)
    # The final 4 bytes are the stored checksum; everything before is payload.
    payload = view(bytes, 1:length(bytes) - 4)
    checksum = _checksum_fletcher32(payload)
    stored_checksum = only(reinterpret(UInt32, view(bytes, (length(bytes) - 3):length(bytes))))
    if checksum != stored_checksum
        throw(ErrorException("""
        Checksum mismatch in Fletcher32 decoding.

        The computed value is $(checksum) and the stored value is $(stored_checksum).
        This might be a sign that the data is corrupted.
        """)) # TODO: make this a custom error type
    end
    return payload
end

src/Filters/quantize.jl

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#=
2+
# Quantize compression
3+
4+
5+
=#
6+
7+
"""
8+
QuantizeFilter(; digits, DecodingType, [EncodingType = DecodingType])
9+
10+
Quantization based compression for Zarr arrays.
11+
"""
12+
struct QuantizeFilter{T, TENC} <: Filter{T, TENC}
13+
digits::Int32
14+
end
15+
16+
function QuantizeFilter(; digits = 10, T = Float16, Tenc = T)
17+
return QuantizeFilter{T, Tenc}(digits)
18+
end
19+
20+
QuantizeFilter{T, Tenc}(; digits = 10) where {T, Tenc} = QuantizeFilter{T, Tenc}(digits)
21+
QuantizeFilter{T}(; digits = 10) where T = QuantizeFilter{T, T}(digits)
22+
23+
function zencode(data::AbstractArray, filter::QuantizeFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType}
    raw = reinterpret(DecodingType, vec(data))

    # Absolute precision implied by the requested number of decimal digits.
    precision = 10.0^(-filter.digits)

    # Base-10 exponent of that precision, rounded away from zero.
    _exponent = log(10, precision)
    exponent = _exponent < 0 ? floor(Int, _exponent) : ceil(Int, _exponent)

    # Number of binary digits covering the precision, and the corresponding
    # power-of-two scale to quantize against.
    bits = ceil(log(2, 10.0^(-exponent)))
    scale = 2.0^bits

    # Snap every element onto the quantization grid.
    return convert.(EncodingType, round.(scale .* raw) ./ scale)
end
38+
39+
# Decoding is a no-op: quantization is lossy, and the quantized values are
# already stored directly in the encoded type.
function zdecode(data::AbstractArray, filter::QuantizeFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType}
    return data
end
43+
44+
function JSON.lower(filter::QuantizeFilter{T, Tenc}) where {T, Tenc}
    # Serialize following the numcodecs "quantize" codec layout.
    return Dict(
        "id" => "quantize",
        "digits" => filter.digits,
        "dtype" => typestr(T),
        "astype" => typestr(Tenc),
    )
end
47+
48+
function getfilter(::Type{<: QuantizeFilter}, d)
    # Bug fix: a misplaced parenthesis previously nested `typestr(d["astype"])`
    # inside the `typestr` call for `dtype`, so the encoding type parameter was
    # never passed to `QuantizeFilter`. Also tolerate a missing "astype", as
    # the delta and fixedscaleoffset materializers do.
    T = typestr(d["dtype"])
    Tenc = haskey(d, "astype") ? typestr(d["astype"]) : T
    return QuantizeFilter{T, Tenc}(; digits = d["digits"])
end

filterdict["quantize"] = QuantizeFilter

0 commit comments

Comments
 (0)