Remove all Zarr v2 references and fix fill_value attributes

tasansal · tasansal · commit f4f4f2b27268 · 2025-09-04T09:22:40.000-05:00
diff --git a/src/mdio/api/convenience.py b/src/mdio/api/convenience.py
@@ -124,8 +124,6 @@ def create_rechunk_plan(
     Raises:
         NameError: if trying to write to original data.
     """
-    zarr.config.set({"write_empty_chunks": False})
-
     data_group = source._data_group
     metadata_group = source._metadata_group
 
@@ -154,8 +152,6 @@ def create_rechunk_plan(
                 chunks=norm_chunks[:-1],
                 compressor=header_compressor,
                 overwrite=overwrite,
-                zarr_format=2,
-                dimension_separator="/",
             )
         )
 
@@ -167,8 +163,6 @@ def create_rechunk_plan(
                 chunks=norm_chunks,
                 compressor=trace_compressor,
                 overwrite=overwrite,
-                zarr_format=2,
-                dimension_separator="/",
             )
         )
 
diff --git a/src/mdio/api/opener.py b/src/mdio/api/opener.py
@@ -29,7 +29,4 @@ def open_dataset(storage_location: StorageLocation, chunks: T_Chunks = None) ->
     Returns:
         An Xarray dataset opened from the storage location.
     """
-    # NOTE: If mask_and_scale is not set,
-    # Xarray will convert int to float and replace _FillValue with NaN
-    # Fixed in Zarr v3, so we can fix this later.
-    return xr.open_dataset(storage_location.uri, engine="zarr", chunks=chunks, mask_and_scale=False)
+    return xr.open_dataset(storage_location.uri, engine="zarr", chunks=chunks)
diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py
@@ -377,11 +377,11 @@ def segy_to_mdio(
     # blocked_io.to_zarr() -> _workers.trace_worker()
 
     # This will create the Zarr store with the correct structure but with empty arrays
-    xr_dataset.to_zarr(store=output_location.uri, mode="w", write_empty_chunks=False, zarr_format=2, compute=False)
+    xr_dataset.to_zarr(store=output_location.uri, mode="w", compute=False)
 
     # This will write the non-dimension coordinates and trace mask
     meta_ds = xr_dataset[drop_vars_delayed + ["trace_mask"]]
-    meta_ds.to_zarr(store=output_location.uri, mode="r+", write_empty_chunks=False, zarr_format=2, compute=True)
+    meta_ds.to_zarr(store=output_location.uri, mode="r+", compute=True)
 
     # Now we can drop them to simplify chunked write of the data variable
     xr_dataset = xr_dataset.drop_vars(drop_vars_delayed)
diff --git a/src/mdio/core/factory.py b/src/mdio/core/factory.py
@@ -119,8 +119,6 @@ def create_empty(
     Returns:
         Group: The root Zarr group representing the newly created MDIO dataset.
     """
-    zarr.config.set({"default_zarr_format": 2, "write_empty_chunks": False})
-
     url = process_url(url=config.path, disk_cache=False)
     root_group = open_group(url, mode="w", storage_options=storage_options)
     root_group = create_zarr_hierarchy(root_group, overwrite)
@@ -146,7 +144,6 @@ def create_empty(
         shape=live_shape,
         chunks=live_chunks,
         dtype="bool",
-        chunk_key_encoding={"name": "v2", "separator": "/"},
     )
 
     for variable in config.variables:
@@ -156,7 +153,6 @@ def create_empty(
             dtype=variable.dtype,
             compressors=variable.compressors,
             chunks=variable.chunks,
-            chunk_key_encoding={"name": "v2", "separator": "/"},
         )
 
         header_dtype = variable.header_dtype or DEFAULT_TRACE_HEADER_DTYPE
@@ -166,7 +162,6 @@ def create_empty(
             chunks=variable.chunks[:-1],  # Same spatial chunks as data
             compressors=Blosc("zstd"),
             dtype=header_dtype,
-            chunk_key_encoding={"name": "v2", "separator": "/"},
         )
 
     stats = {"mean": 0, "std": 0, "rms": 0, "min": 0, "max": 0}
diff --git a/src/mdio/schemas/v1/dataset_serializer.py b/src/mdio/schemas/v1/dataset_serializer.py
@@ -5,7 +5,6 @@
 from numcodecs import Blosc as nc_Blosc
 from xarray import DataArray as xr_DataArray
 from xarray import Dataset as xr_Dataset
-from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding
 
 from mdio.converters.type_converter import to_numpy_dtype
 
@@ -213,20 +212,13 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_Dataset:  # noqa: PLR0912
         if v.long_name:
             data_array.attrs["long_name"] = v.long_name
 
-        # Create a custom chunk key encoding with "/" as separator
-        chunk_key_encoding = V2ChunkKeyEncoding(separator="/").to_dict()
         encoding = {
             "chunks": chunks,
-            "chunk_key_encoding": chunk_key_encoding,
             "compressor": _convert_compressor(v.compressor),
         }
         # NumPy structured data types have fields attribute, while scalar types do not.
         if not hasattr(v.data_type, "fields"):
-            # TODO(Dmitriy Repin): work around of the bug
-            # https://github.com/TGSAI/mdio-python/issues/582
-            # For structured data types we will not use the _FillValue
-            # NOTE: See Zarr documentation on use of fill_value and _FillValue in Zarr v2 vs v3
-            encoding["_FillValue"] = _get_fill_value(v.data_type)
+            encoding["fill_value"] = _get_fill_value(v.data_type)
 
         data_array.encoding = encoding
 
diff --git a/src/mdio/segy/_workers.py b/src/mdio/segy/_workers.py
@@ -160,7 +160,7 @@ def trace_worker(  # noqa: PLR0913
         encoding=ds_to_write[data_variable_name].encoding,  # Not strictly necessary, but safer than not doing it.
     )
 
-    ds_to_write.to_zarr(output_location.uri, region=region, mode="r+", write_empty_chunks=False, zarr_format=2)
+    ds_to_write.to_zarr(output_location.uri, region=region, mode="r+")
 
     histogram = CenteredBinHistogram(bin_centers=[], counts=[])
     return SummaryStatistics(
diff --git a/tests/unit/v1/test_dataset_serializer.py b/tests/unit/v1/test_dataset_serializer.py
@@ -325,7 +325,7 @@ def test_to_xarray_dataset(tmp_path: Path) -> None:
     xr_ds = to_xarray_dataset(dataset)
 
     file_path = output_path(tmp_path, f"{xr_ds.attrs['name']}", debugging=False)
-    xr_ds.to_zarr(store=file_path, mode="w", zarr_format=2, compute=False)
+    xr_ds.to_zarr(store=file_path, mode="w", compute=False)
 
 
 def test_seismic_poststack_3d_acceptance_to_xarray_dataset(tmp_path: Path) -> None:
@@ -335,7 +335,7 @@ def test_seismic_poststack_3d_acceptance_to_xarray_dataset(tmp_path: Path) -> No
     xr_ds = to_xarray_dataset(dataset)
 
     file_path = output_path(tmp_path, f"{xr_ds.attrs['name']}", debugging=False)
-    xr_ds.to_zarr(store=file_path, mode="w", zarr_format=2, compute=False)
+    xr_ds.to_zarr(store=file_path, mode="w", compute=False)
 
 
 @pytest.mark.skip(reason="Bug reproducer for the issue 582")
@@ -348,12 +348,7 @@ def test_buf_reproducer_dask_to_zarr(tmp_path: Path) -> None:
     dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")])
     dtype_fill_value = np_zeros((), dtype=dtype)
 
-    # Use '_FillValue' instead of 'fill_value'
-    # 'fill_value' is not a valid encoding key in Zarr v2
-    my_attr_encoding = {
-        "_FillValue": dtype_fill_value,
-        "chunk_key_encoding": {"name": "v2", "separator": "/"},
-    }
+    my_attr_encoding = {"fill_value": dtype_fill_value}
 
     # Create a dask array using the data type
     # Do not specify encoding as the array attribute
@@ -363,7 +358,7 @@ def test_buf_reproducer_dask_to_zarr(tmp_path: Path) -> None:
     # Specify encoding per array
     encoding = {"myattr": my_attr_encoding}
     file_path = output_path(tmp_path, "to_zarr/zarr_dask", debugging=False)
-    aa.to_zarr(file_path, mode="w", zarr_format=2, encoding=encoding, compute=False)
+    aa.to_zarr(file_path, mode="w", encoding=encoding, compute=False)
 
 
 def test_to_zarr_from_zarr_zeros_1(tmp_path: Path) -> None:
@@ -375,21 +370,16 @@ def test_to_zarr_from_zarr_zeros_1(tmp_path: Path) -> None:
     dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")])
     dtype_fill_value = np_zeros((), dtype=dtype)
 
-    # Use '_FillValue' instead of 'fill_value'
-    # 'fill_value' is not a valid encoding key in Zarr v2
-    my_attr_encoding = {
-        "_FillValue": dtype_fill_value,
-        "chunk_key_encoding": {"name": "v2", "separator": "/"},
-    }
+    my_attr_encoding = {"fill_value": dtype_fill_value}
 
     # Create a zarr array using the data type,
     # Specify encoding as the array attribute
-    data = zarr_zeros((36, 36), dtype=dtype, zarr_format=2)
+    data = zarr_zeros((36, 36), dtype=dtype)
     aa = xr_DataArray(name="myattr", data=data)
     aa.encoding = my_attr_encoding
 
     file_path = output_path(tmp_path, "to_zarr/zarr_zarr_zerros_1", debugging=False)
-    aa.to_zarr(file_path, mode="w", zarr_format=2, compute=False)
+    aa.to_zarr(file_path, mode="w", compute=False)
 
 
 def test_to_zarr_from_zarr_zeros_2(tmp_path: Path) -> None:
@@ -401,22 +391,17 @@ def test_to_zarr_from_zarr_zeros_2(tmp_path: Path) -> None:
     dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")])
     dtype_fill_value = np_zeros((), dtype=dtype)
 
-    # Use '_FillValue' instead of 'fill_value'
-    # 'fill_value' is not a valid encoding key in Zarr v2
-    my_attr_encoding = {
-        "_FillValue": dtype_fill_value,
-        "chunk_key_encoding": {"name": "v2", "separator": "/"},
-    }
+    my_attr_encoding = {"fill_value": dtype_fill_value}
 
     # Create a zarr array using the data type,
     # Do not specify encoding as the array attribute
-    data = zarr_zeros((36, 36), dtype=dtype, zarr_format=2)
+    data = zarr_zeros((36, 36), dtype=dtype)
     aa = xr_DataArray(name="myattr", data=data)
 
     file_path = output_path(tmp_path, "to_zarr/zarr_zarr_zerros_2", debugging=False)
     # Specify encoding per array
     encoding = {"myattr": my_attr_encoding}
-    aa.to_zarr(file_path, mode="w", zarr_format=2, encoding=encoding, compute=False)
+    aa.to_zarr(file_path, mode="w", encoding=encoding, compute=False)
 
 
 def test_to_zarr_from_np(tmp_path: Path) -> None:
@@ -425,12 +410,7 @@ def test_to_zarr_from_np(tmp_path: Path) -> None:
     dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")])
     dtype_fill_value = np_zeros((), dtype=dtype)
 
-    # Use '_FillValue' instead of 'fill_value'
-    # 'fill_value' is not a valid encoding key in Zarr v2
-    my_attr_encoding = {
-        "_FillValue": dtype_fill_value,
-        "chunk_key_encoding": {"name": "v2", "separator": "/"},
-    }
+    my_attr_encoding = {"fill_value": dtype_fill_value}
 
     # Create a zarr array using the data type
     # Do not specify encoding as the array attribute
@@ -440,4 +420,4 @@ def test_to_zarr_from_np(tmp_path: Path) -> None:
     file_path = output_path(tmp_path, "to_zarr/zarr_np", debugging=False)
     # Specify encoding per array
     encoding = {"myattr": my_attr_encoding}
-    aa.to_zarr(file_path, mode="w", zarr_format=2, encoding=encoding, compute=False)
+    aa.to_zarr(file_path, mode="w", encoding=encoding, compute=False)

Original file line number	Diff line number	Diff line change
`@@ -124,8 +124,6 @@ def create_rechunk_plan(`
`124`	`124`	`Raises:`
`125`	`125`	`NameError: if trying to write to original data.`
`126`	`126`	`"""`
`127`		`- zarr.config.set({"write_empty_chunks": False})`
`128`		`-`
`129`	`127`	`data_group = source._data_group`
`130`	`128`	`metadata_group = source._metadata_group`
`131`	`129`
`@@ -154,8 +152,6 @@ def create_rechunk_plan(`
`154`	`152`	`chunks=norm_chunks[:-1],`
`155`	`153`	`compressor=header_compressor,`
`156`	`154`	`overwrite=overwrite,`
`157`		`- zarr_format=2,`
`158`		`- dimension_separator="/",`
`159`	`155`	`)`
`160`	`156`	`)`
`161`	`157`
`@@ -167,8 +163,6 @@ def create_rechunk_plan(`
`167`	`163`	`chunks=norm_chunks,`
`168`	`164`	`compressor=trace_compressor,`
`169`	`165`	`overwrite=overwrite,`
`170`		`- zarr_format=2,`
`171`		`- dimension_separator="/",`
`172`	`166`	`)`
`173`	`167`	`)`
`174`	`168`
Original file line number	Diff line number	Diff line change
`@@ -160,7 +160,7 @@ def trace_worker( # noqa: PLR0913`
`160`	`160`	`encoding=ds_to_write[data_variable_name].encoding, # Not strictly necessary, but safer than not doing it.`
`161`	`161`	`)`
`162`	`162`
`163`		`- ds_to_write.to_zarr(output_location.uri, region=region, mode="r+", write_empty_chunks=False, zarr_format=2)`
	`163`	`+ ds_to_write.to_zarr(output_location.uri, region=region, mode="r+")`
`164`	`164`
`165`	`165`	`histogram = CenteredBinHistogram(bin_centers=[], counts=[])`
`166`	`166`	`return SummaryStatistics(`