Skip to content

Commit f4f4f2b

Browse files
committed
remove all zarr v2 refs and fix fill_value attributes
1 parent faeb616 commit f4f4f2b

File tree

7 files changed: 17 additions and 59 deletions.

src/mdio/api/convenience.py

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -124,8 +124,6 @@ def create_rechunk_plan(
124124
Raises:
125125
NameError: if trying to write to original data.
126126
"""
127-
zarr.config.set({"write_empty_chunks": False})
128-
129127
data_group = source._data_group
130128
metadata_group = source._metadata_group
131129

@@ -154,8 +152,6 @@ def create_rechunk_plan(
154152
chunks=norm_chunks[:-1],
155153
compressor=header_compressor,
156154
overwrite=overwrite,
157-
zarr_format=2,
158-
dimension_separator="/",
159155
)
160156
)
161157

@@ -167,8 +163,6 @@ def create_rechunk_plan(
167163
chunks=norm_chunks,
168164
compressor=trace_compressor,
169165
overwrite=overwrite,
170-
zarr_format=2,
171-
dimension_separator="/",
172166
)
173167
)
174168

src/mdio/api/opener.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,4 @@ def open_dataset(storage_location: StorageLocation, chunks: T_Chunks = None) ->
2929
Returns:
3030
An Xarray dataset opened from the storage location.
3131
"""
32-
# NOTE: If mask_and_scale is not set,
33-
# Xarray will convert int to float and replace _FillValue with NaN
34-
# Fixed in Zarr v3, so we can fix this later.
35-
return xr.open_dataset(storage_location.uri, engine="zarr", chunks=chunks, mask_and_scale=False)
32+
return xr.open_dataset(storage_location.uri, engine="zarr", chunks=chunks)

src/mdio/converters/segy.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -377,11 +377,11 @@ def segy_to_mdio(
377377
# blocked_io.to_zarr() -> _workers.trace_worker()
378378

379379
# This will create the Zarr store with the correct structure but with empty arrays
380-
xr_dataset.to_zarr(store=output_location.uri, mode="w", write_empty_chunks=False, zarr_format=2, compute=False)
380+
xr_dataset.to_zarr(store=output_location.uri, mode="w", compute=False)
381381

382382
# This will write the non-dimension coordinates and trace mask
383383
meta_ds = xr_dataset[drop_vars_delayed + ["trace_mask"]]
384-
meta_ds.to_zarr(store=output_location.uri, mode="r+", write_empty_chunks=False, zarr_format=2, compute=True)
384+
meta_ds.to_zarr(store=output_location.uri, mode="r+", compute=True)
385385

386386
# Now we can drop them to simplify chunked write of the data variable
387387
xr_dataset = xr_dataset.drop_vars(drop_vars_delayed)

src/mdio/core/factory.py

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -119,8 +119,6 @@ def create_empty(
119119
Returns:
120120
Group: The root Zarr group representing the newly created MDIO dataset.
121121
"""
122-
zarr.config.set({"default_zarr_format": 2, "write_empty_chunks": False})
123-
124122
url = process_url(url=config.path, disk_cache=False)
125123
root_group = open_group(url, mode="w", storage_options=storage_options)
126124
root_group = create_zarr_hierarchy(root_group, overwrite)
@@ -146,7 +144,6 @@ def create_empty(
146144
shape=live_shape,
147145
chunks=live_chunks,
148146
dtype="bool",
149-
chunk_key_encoding={"name": "v2", "separator": "/"},
150147
)
151148

152149
for variable in config.variables:
@@ -156,7 +153,6 @@ def create_empty(
156153
dtype=variable.dtype,
157154
compressors=variable.compressors,
158155
chunks=variable.chunks,
159-
chunk_key_encoding={"name": "v2", "separator": "/"},
160156
)
161157

162158
header_dtype = variable.header_dtype or DEFAULT_TRACE_HEADER_DTYPE
@@ -166,7 +162,6 @@ def create_empty(
166162
chunks=variable.chunks[:-1], # Same spatial chunks as data
167163
compressors=Blosc("zstd"),
168164
dtype=header_dtype,
169-
chunk_key_encoding={"name": "v2", "separator": "/"},
170165
)
171166

172167
stats = {"mean": 0, "std": 0, "rms": 0, "min": 0, "max": 0}

src/mdio/schemas/v1/dataset_serializer.py

Lines changed: 1 addition & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
from numcodecs import Blosc as nc_Blosc
66
from xarray import DataArray as xr_DataArray
77
from xarray import Dataset as xr_Dataset
8-
from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding
98

109
from mdio.converters.type_converter import to_numpy_dtype
1110

@@ -213,20 +212,13 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_Dataset: # noqa: PLR0912
213212
if v.long_name:
214213
data_array.attrs["long_name"] = v.long_name
215214

216-
# Create a custom chunk key encoding with "/" as separator
217-
chunk_key_encoding = V2ChunkKeyEncoding(separator="/").to_dict()
218215
encoding = {
219216
"chunks": chunks,
220-
"chunk_key_encoding": chunk_key_encoding,
221217
"compressor": _convert_compressor(v.compressor),
222218
}
223219
# NumPy structured data types have fields attribute, while scalar types do not.
224220
if not hasattr(v.data_type, "fields"):
225-
# TODO(Dmitriy Repin): work around of the bug
226-
# https://github.com/TGSAI/mdio-python/issues/582
227-
# For structured data types we will not use the _FillValue
228-
# NOTE: See Zarr documentation on use of fill_value and _FillValue in Zarr v2 vs v3
229-
encoding["_FillValue"] = _get_fill_value(v.data_type)
221+
encoding["fill_value"] = _get_fill_value(v.data_type)
230222

231223
data_array.encoding = encoding
232224

src/mdio/segy/_workers.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -160,7 +160,7 @@ def trace_worker( # noqa: PLR0913
160160
encoding=ds_to_write[data_variable_name].encoding, # Not strictly necessary, but safer than not doing it.
161161
)
162162

163-
ds_to_write.to_zarr(output_location.uri, region=region, mode="r+", write_empty_chunks=False, zarr_format=2)
163+
ds_to_write.to_zarr(output_location.uri, region=region, mode="r+")
164164

165165
histogram = CenteredBinHistogram(bin_centers=[], counts=[])
166166
return SummaryStatistics(

tests/unit/v1/test_dataset_serializer.py

Lines changed: 12 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -325,7 +325,7 @@ def test_to_xarray_dataset(tmp_path: Path) -> None:
325325
xr_ds = to_xarray_dataset(dataset)
326326

327327
file_path = output_path(tmp_path, f"{xr_ds.attrs['name']}", debugging=False)
328-
xr_ds.to_zarr(store=file_path, mode="w", zarr_format=2, compute=False)
328+
xr_ds.to_zarr(store=file_path, mode="w", compute=False)
329329

330330

331331
def test_seismic_poststack_3d_acceptance_to_xarray_dataset(tmp_path: Path) -> None:
@@ -335,7 +335,7 @@ def test_seismic_poststack_3d_acceptance_to_xarray_dataset(tmp_path: Path) -> No
335335
xr_ds = to_xarray_dataset(dataset)
336336

337337
file_path = output_path(tmp_path, f"{xr_ds.attrs['name']}", debugging=False)
338-
xr_ds.to_zarr(store=file_path, mode="w", zarr_format=2, compute=False)
338+
xr_ds.to_zarr(store=file_path, mode="w", compute=False)
339339

340340

341341
@pytest.mark.skip(reason="Bug reproducer for the issue 582")
@@ -348,12 +348,7 @@ def test_buf_reproducer_dask_to_zarr(tmp_path: Path) -> None:
348348
dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")])
349349
dtype_fill_value = np_zeros((), dtype=dtype)
350350

351-
# Use '_FillValue' instead of 'fill_value'
352-
# 'fill_value' is not a valid encoding key in Zarr v2
353-
my_attr_encoding = {
354-
"_FillValue": dtype_fill_value,
355-
"chunk_key_encoding": {"name": "v2", "separator": "/"},
356-
}
351+
my_attr_encoding = {"fill_value": dtype_fill_value}
357352

358353
# Create a dask array using the data type
359354
# Do not specify encoding as the array attribute
@@ -363,7 +358,7 @@ def test_buf_reproducer_dask_to_zarr(tmp_path: Path) -> None:
363358
# Specify encoding per array
364359
encoding = {"myattr": my_attr_encoding}
365360
file_path = output_path(tmp_path, "to_zarr/zarr_dask", debugging=False)
366-
aa.to_zarr(file_path, mode="w", zarr_format=2, encoding=encoding, compute=False)
361+
aa.to_zarr(file_path, mode="w", encoding=encoding, compute=False)
367362

368363

369364
def test_to_zarr_from_zarr_zeros_1(tmp_path: Path) -> None:
@@ -375,21 +370,16 @@ def test_to_zarr_from_zarr_zeros_1(tmp_path: Path) -> None:
375370
dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")])
376371
dtype_fill_value = np_zeros((), dtype=dtype)
377372

378-
# Use '_FillValue' instead of 'fill_value'
379-
# 'fill_value' is not a valid encoding key in Zarr v2
380-
my_attr_encoding = {
381-
"_FillValue": dtype_fill_value,
382-
"chunk_key_encoding": {"name": "v2", "separator": "/"},
383-
}
373+
my_attr_encoding = {"fill_value": dtype_fill_value}
384374

385375
# Create a zarr array using the data type,
386376
# Specify encoding as the array attribute
387-
data = zarr_zeros((36, 36), dtype=dtype, zarr_format=2)
377+
data = zarr_zeros((36, 36), dtype=dtype)
388378
aa = xr_DataArray(name="myattr", data=data)
389379
aa.encoding = my_attr_encoding
390380

391381
file_path = output_path(tmp_path, "to_zarr/zarr_zarr_zerros_1", debugging=False)
392-
aa.to_zarr(file_path, mode="w", zarr_format=2, compute=False)
382+
aa.to_zarr(file_path, mode="w", compute=False)
393383

394384

395385
def test_to_zarr_from_zarr_zeros_2(tmp_path: Path) -> None:
@@ -401,22 +391,17 @@ def test_to_zarr_from_zarr_zeros_2(tmp_path: Path) -> None:
401391
dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")])
402392
dtype_fill_value = np_zeros((), dtype=dtype)
403393

404-
# Use '_FillValue' instead of 'fill_value'
405-
# 'fill_value' is not a valid encoding key in Zarr v2
406-
my_attr_encoding = {
407-
"_FillValue": dtype_fill_value,
408-
"chunk_key_encoding": {"name": "v2", "separator": "/"},
409-
}
394+
my_attr_encoding = {"fill_value": dtype_fill_value}
410395

411396
# Create a zarr array using the data type,
412397
# Do not specify encoding as the array attribute
413-
data = zarr_zeros((36, 36), dtype=dtype, zarr_format=2)
398+
data = zarr_zeros((36, 36), dtype=dtype)
414399
aa = xr_DataArray(name="myattr", data=data)
415400

416401
file_path = output_path(tmp_path, "to_zarr/zarr_zarr_zerros_2", debugging=False)
417402
# Specify encoding per array
418403
encoding = {"myattr": my_attr_encoding}
419-
aa.to_zarr(file_path, mode="w", zarr_format=2, encoding=encoding, compute=False)
404+
aa.to_zarr(file_path, mode="w", encoding=encoding, compute=False)
420405

421406

422407
def test_to_zarr_from_np(tmp_path: Path) -> None:
@@ -425,12 +410,7 @@ def test_to_zarr_from_np(tmp_path: Path) -> None:
425410
dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")])
426411
dtype_fill_value = np_zeros((), dtype=dtype)
427412

428-
# Use '_FillValue' instead of 'fill_value'
429-
# 'fill_value' is not a valid encoding key in Zarr v2
430-
my_attr_encoding = {
431-
"_FillValue": dtype_fill_value,
432-
"chunk_key_encoding": {"name": "v2", "separator": "/"},
433-
}
413+
my_attr_encoding = {"fill_value": dtype_fill_value}
434414

435415
# Create a zarr array using the data type
436416
# Do not specify encoding as the array attribute
@@ -440,4 +420,4 @@ def test_to_zarr_from_np(tmp_path: Path) -> None:
440420
file_path = output_path(tmp_path, "to_zarr/zarr_np", debugging=False)
441421
# Specify encoding per array
442422
encoding = {"myattr": my_attr_encoding}
443-
aa.to_zarr(file_path, mode="w", zarr_format=2, encoding=encoding, compute=False)
423+
aa.to_zarr(file_path, mode="w", encoding=encoding, compute=False)

0 commit comments

Comments
 (0)