Skip to content

Commit 90a8dc2

Browse files
authored
Merge pull request #46 from TGSAI/enh/optimize_zfp
Improved ZFP compression quality using `FIXED-ACCURACY` mode
2 parents 2258857 + 9b72a45 commit 90a8dc2

File tree

3 files changed

+38
-28
lines changed

3 files changed

+38
-28
lines changed

src/mdio/commands/segy.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,12 @@
123123
show_default=True,
124124
)
125125
@click.option(
126-
"-ratio",
127-
"--compression-ratio",
126+
"-tolerance",
127+
"--compression-tolerance",
128128
required=False,
129-
default=4,
130-
help="Lossy compression ratio.",
131-
type=click.INT,
129+
default=0.01,
130+
help="Lossy compression tolerance in ZFP.",
131+
type=click.FLOAT,
132132
show_default=True,
133133
)
134134
@click.option(
@@ -156,7 +156,7 @@ def segy_import(
156156
chunk_size,
157157
endian,
158158
lossless,
159-
compression_ratio,
159+
compression_tolerance,
160160
storage_options,
161161
overwrite,
162162
):
@@ -213,13 +213,18 @@ def segy_import(
213213
214214
By default, the data is ingested with LOSSLESS compression. This
215215
saves disk space in the range of 20% to 40%. MDIO also allows
216-
data to be compressed using the ZFP compressor's fixed rate lossy
217-
compression. If lossless parameter is set to False and MDIO was
218-
installed using the lossy extra; then the data will be compressed
216+
data to be compressed using the ZFP compressor's fixed accuracy
217+
lossy compression. If lossless parameter is set to False and MDIO
218+
was installed using the lossy extra, then the data will be compressed
219219
to approximately 30% of its original size and will be perceptually
220-
lossless. The compression ratio can be adjusted using the option
221-
compression_ratio (integer). Higher values will compress more, but
222-
will introduce artifacts.
220+
lossless. The compression amount can be adjusted using the option
221+
compression_tolerance (float). Values less than 1 give good results.
222+
Higher values compress more, but will introduce artifacts.
223+
The default value is 0.01 tolerance, however we get good results
224+
up to 0.5, where data is compressed to almost 10% of its original size.
225+
NOTE: This assumes data has amplitudes normalized to have approximately
226+
standard deviation of 1. If the dataset has values smaller than this
227+
tolerance, a lot of loss may occur.
223228
224229
Usage:
225230
@@ -265,7 +270,7 @@ def segy_import(
265270
chunksize=chunk_size,
266271
endian=endian,
267272
lossless=lossless,
268-
compression_ratio=compression_ratio,
273+
compression_tolerance=compression_tolerance,
269274
storage_options=storage_options,
270275
overwrite=overwrite,
271276
)

src/mdio/converters/segy.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def segy_to_mdio(
3636
chunksize: Sequence[int] | None = None,
3737
endian: str = "big",
3838
lossless: bool = True,
39-
compression_ratio: int | float = 4,
39+
compression_tolerance: float = 0.01,
4040
storage_options: dict[str, Any] | None = None,
4141
overwrite: bool = False,
4242
) -> None:
@@ -84,8 +84,10 @@ def segy_to_mdio(
8484
endian: Endianness of the input SEG-Y. Rev.2 allows little endian.
8585
Default is 'big'. Must be in `{"big", "little"}`
8686
lossless: Lossless Blosc with zstandard, or ZFP with fixed accuracy.
87-
compression_ratio: Approximate compression ratio for ZFP compression.
88-
Will be ignored if `lossless=True`
87+
compression_tolerance: Tolerance for ZFP compression, optional. The fixed
88+
accuracy mode in ZFP guarantees there won't be any errors larger
89+
than this value. The default is 0.01, which gives about 70%
90+
reduction in size. Will be ignored if `lossless=True`.
8991
storage_options: Storage options for the cloud storage backend.
9092
Default is `None` (will assume anonymous)
9193
overwrite: Toggle for overwriting existing store
@@ -253,7 +255,7 @@ def segy_to_mdio(
253255
dtype="float32",
254256
chunks=chunksize,
255257
lossless=lossless,
256-
compression_ratio=compression_ratio,
258+
compression_tolerance=compression_tolerance,
257259
)
258260

259261
for key, value in stats.items():

src/mdio/segy/blocked_io.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,12 @@
1919

2020

2121
try:
22-
from zarr import ZFPY
22+
import zfpy # Base library
23+
from zarr import ZFPY # Codec
24+
2325
except ImportError:
2426
ZFPY = None
27+
zfpy = None
2528

2629
# Globals
2730
NUM_CORES = cpu_count(logical=False)
@@ -36,7 +39,7 @@ def to_zarr(
3639
name: str,
3740
chunks: tuple[int, ...],
3841
lossless: bool,
39-
compression_ratio: int | float,
42+
compression_tolerance: float = 0.01,
4043
**kwargs,
4144
) -> dict:
4245
"""Blocked I/O from SEG-Y to chunked `zarr.core.Array`.
@@ -50,8 +53,10 @@ def to_zarr(
5053
name: Name of the zarr.Array
5154
chunks: Chunk sizes for trace data
5255
lossless: Lossless Blosc with zstandard, or ZFP with fixed accuracy.
53-
compression_ratio: Approximate compression ratio for ZFP compression.
54-
Will be ignored for `lossless=True`
56+
compression_tolerance: Tolerance for ZFP compression, optional. The fixed
57+
accuracy mode in ZFP guarantees there won't be any errors larger
58+
than this value. The default is 0.01, which gives about 70%
59+
reduction in size.
5560
**kwargs: Additional keyword arguments passed to zarr.core.Array # noqa: RST210
5661
5762
Returns:
@@ -63,13 +68,11 @@ def to_zarr(
6368
if lossless is True:
6469
trace_compressor = Blosc("zstd")
6570
header_compressor = trace_compressor
66-
elif ZFPY is not None:
67-
# Compression precision is 32 bit float divided by ratio.
68-
# We round it to the nearest integer since ZFP expects an
69-
# integer precision for `mode=3`. This will approximately
70-
# give the user the ratio they asked for.
71-
zfp_precision = np.round(32 / compression_ratio)
72-
trace_compressor = ZFPY(mode=3, precision=zfp_precision)
71+
elif ZFPY is not None or zfpy is not None:
72+
trace_compressor = ZFPY(
73+
mode=zfpy.mode_fixed_accuracy,
74+
tolerance=compression_tolerance,
75+
)
7376
header_compressor = Blosc("zstd")
7477
else:
7578
raise ImportError(

0 commit comments

Comments
 (0)