88
99from virtualizarr import open_virtual_dataset
1010from virtualizarr .manifests import ChunkManifest , ManifestArray
11- from virtualizarr .tests import parametrize_over_hdf_backends , requires_kerchunk
11+ from virtualizarr .tests import (
12+ has_fastparquet ,
13+ has_kerchunk ,
14+ parametrize_over_hdf_backends ,
15+ requires_kerchunk ,
16+ requires_zarr_python ,
17+ )
1218from virtualizarr .translators .kerchunk import (
1319 dataset_from_kerchunk_refs ,
1420)
@@ -34,16 +40,16 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
3440 ),
3541 chunkmanifest = manifest ,
3642 )
37- ds = xr .Dataset ({"a" : (["x" , "y" ], marr )})
43+ vds = xr .Dataset ({"a" : (["x" , "y" ], marr )})
3844
3945 # Use accessor to write it out to kerchunk reference dict
40- ds_refs = ds .virtualize .to_kerchunk (format = "dict" )
46+ ds_refs = vds .virtualize .to_kerchunk (format = "dict" )
4147
4248 # Use dataset_from_kerchunk_refs to reconstruct the dataset
4349 roundtrip = dataset_from_kerchunk_refs (ds_refs )
4450
4551 # Assert equal to original dataset
46- xrt .assert_equal (roundtrip , ds )
52+ xrt .assert_equal (roundtrip , vds )
4753
4854
4955@requires_kerchunk
@@ -84,11 +90,45 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
8490 assert refs ["refs" ]["time/0" ] == expected ["refs" ]["time/0" ]
8591
8692
87- @requires_kerchunk
88- @pytest .mark .parametrize ("format" , ["dict" , "json" , "parquet" ])
89- class TestKerchunkRoundtrip :
93+ def roundtrip_as_kerchunk_dict (vds : xr .Dataset , tmpdir , ** kwargs ):
94+ # write those references to an in-memory kerchunk-formatted references dictionary
95+ ds_refs = vds .virtualize .to_kerchunk (format = "dict" )
96+
97+ # use fsspec to read the dataset from the kerchunk references dict
98+ return xr .open_dataset (ds_refs , engine = "kerchunk" , ** kwargs )
99+
100+
101+ def roundtrip_as_kerchunk_json (vds : xr .Dataset , tmpdir , ** kwargs ):
102+ # write those references to disk as kerchunk references format
103+ vds .virtualize .to_kerchunk (f"{ tmpdir } /refs.json" , format = "json" )
104+
105+ # use fsspec to read the dataset from disk via the kerchunk references
106+ return xr .open_dataset (f"{ tmpdir } /refs.json" , engine = "kerchunk" , ** kwargs )
107+
108+
109+ def roundtrip_as_kerchunk_parquet (vds : xr .Dataset , tmpdir , ** kwargs ):
110+ # write those references to disk as kerchunk references format
111+ vds .virtualize .to_kerchunk (f"{ tmpdir } /refs.parquet" , format = "parquet" )
112+
113+ # use fsspec to read the dataset from disk via the kerchunk references
114+ return xr .open_dataset (f"{ tmpdir } /refs.parquet" , engine = "kerchunk" , ** kwargs )
115+
116+
117+ @requires_zarr_python
118+ @pytest .mark .parametrize (
119+ "roundtrip_func" ,
120+ [
121+ * (
122+ [roundtrip_as_kerchunk_dict , roundtrip_as_kerchunk_json ]
123+ if has_kerchunk
124+ else []
125+ ),
126+ * ([roundtrip_as_kerchunk_parquet ] if has_kerchunk and has_fastparquet else []),
127+ ],
128+ )
129+ class TestRoundtrip :
90130 @parametrize_over_hdf_backends
91- def test_kerchunk_roundtrip_no_concat (self , tmpdir , format , hdf_backend ):
131+ def test_roundtrip_no_concat (self , tmpdir , roundtrip_func , hdf_backend ):
92132 # set up example xarray dataset
93133 ds = xr .tutorial .open_dataset ("air_temperature" , decode_times = False )
94134
@@ -98,20 +138,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
98138 # use open_dataset_via_kerchunk to read it as references
99139 vds = open_virtual_dataset (f"{ tmpdir } /air.nc" , indexes = {}, backend = hdf_backend )
100140
101- if format == "dict" :
102- # write those references to an in-memory kerchunk-formatted references dictionary
103- ds_refs = vds .virtualize .to_kerchunk (format = format )
104-
105- # use fsspec to read the dataset from the kerchunk references dict
106- roundtrip = xr .open_dataset (ds_refs , engine = "kerchunk" , decode_times = False )
107- else :
108- # write those references to disk as kerchunk references format
109- vds .virtualize .to_kerchunk (f"{ tmpdir } /refs.{ format } " , format = format )
110-
111- # use fsspec to read the dataset from disk via the kerchunk references
112- roundtrip = xr .open_dataset (
113- f"{ tmpdir } /refs.{ format } " , engine = "kerchunk" , decode_times = False
114- )
141+ roundtrip = roundtrip_func (vds , tmpdir , decode_times = False )
115142
116143 # assert all_close to original dataset
117144 xrt .assert_allclose (roundtrip , ds )
@@ -123,7 +150,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
123150 @parametrize_over_hdf_backends
124151 @pytest .mark .parametrize ("decode_times,time_vars" , [(False , []), (True , ["time" ])])
125152 def test_kerchunk_roundtrip_concat (
126- self , tmpdir , format , hdf_backend , decode_times , time_vars
153+ self , tmpdir , roundtrip_func , hdf_backend , decode_times , time_vars
127154 ):
128155 # set up example xarray dataset
129156 ds = xr .tutorial .open_dataset ("air_temperature" , decode_times = decode_times )
@@ -159,22 +186,7 @@ def test_kerchunk_roundtrip_concat(
159186 # concatenate virtually along time
160187 vds = xr .concat ([vds1 , vds2 ], dim = "time" , coords = "minimal" , compat = "override" )
161188
162- if format == "dict" :
163- # write those references to an in-memory kerchunk-formatted references dictionary
164- ds_refs = vds .virtualize .to_kerchunk (format = format )
165-
166- # use fsspec to read the dataset from the kerchunk references dict
167- roundtrip = xr .open_dataset (
168- ds_refs , engine = "kerchunk" , decode_times = decode_times
169- )
170- else :
171- # write those references to disk as kerchunk references format
172- vds .virtualize .to_kerchunk (f"{ tmpdir } /refs.{ format } " , format = format )
173-
174- # use fsspec to read the dataset from disk via the kerchunk references
175- roundtrip = xr .open_dataset (
176- f"{ tmpdir } /refs.{ format } " , engine = "kerchunk" , decode_times = decode_times
177- )
189+ roundtrip = roundtrip_func (vds , tmpdir , decode_times = decode_times )
178190
179191 if decode_times is False :
180192 # assert all_close to original dataset
@@ -191,7 +203,7 @@ def test_kerchunk_roundtrip_concat(
191203 assert roundtrip .time .encoding ["calendar" ] == ds .time .encoding ["calendar" ]
192204
193205 @parametrize_over_hdf_backends
194- def test_non_dimension_coordinates (self , tmpdir , format , hdf_backend ):
206+ def test_non_dimension_coordinates (self , tmpdir , roundtrip_func , hdf_backend ):
195207 # regression test for GH issue #105
196208
197209 if hdf_backend :
@@ -209,20 +221,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
209221 assert "lat" in vds .coords
210222 assert "coordinates" not in vds .attrs
211223
212- if format == "dict" :
213- # write those references to an in-memory kerchunk-formatted references dictionary
214- ds_refs = vds .virtualize .to_kerchunk (format = format )
215-
216- # use fsspec to read the dataset from the kerchunk references dict
217- roundtrip = xr .open_dataset (ds_refs , engine = "kerchunk" , decode_times = False )
218- else :
219- # write those references to disk as kerchunk references format
220- vds .virtualize .to_kerchunk (f"{ tmpdir } /refs.{ format } " , format = format )
221-
222- # use fsspec to read the dataset from disk via the kerchunk references
223- roundtrip = xr .open_dataset (
224- f"{ tmpdir } /refs.{ format } " , engine = "kerchunk" , decode_times = False
225- )
224+ roundtrip = roundtrip_func (vds , tmpdir )
226225
227226 # assert equal to original dataset
228227 xrt .assert_allclose (roundtrip , ds )
@@ -231,7 +230,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
231230 for coord in ds .coords :
232231 assert ds .coords [coord ].attrs == roundtrip .coords [coord ].attrs
233232
234- def test_datetime64_dtype_fill_value (self , tmpdir , format ):
233+ def test_datetime64_dtype_fill_value (self , tmpdir , roundtrip_func ):
235234 chunks_dict = {
236235 "0.0.0" : {"path" : "/foo.nc" , "offset" : 100 , "length" : 100 },
237236 }
@@ -249,7 +248,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
249248 zarr_format = 2 ,
250249 )
251250 marr1 = ManifestArray (zarray = zarray , chunkmanifest = manifest )
252- ds = xr .Dataset (
251+ vds = xr .Dataset (
253252 {
254253 "a" : xr .DataArray (
255254 marr1 ,
@@ -260,20 +259,9 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
260259 }
261260 )
262261
263- if format == "dict" :
264- # write those references to an in-memory kerchunk-formatted references dictionary
265- ds_refs = ds .virtualize .to_kerchunk (format = format )
266-
267- # use fsspec to read the dataset from the kerchunk references dict
268- roundtrip = xr .open_dataset (ds_refs , engine = "kerchunk" )
269- else :
270- # write those references to disk as kerchunk references format
271- ds .virtualize .to_kerchunk (f"{ tmpdir } /refs.{ format } " , format = format )
272-
273- # use fsspec to read the dataset from disk via the kerchunk references
274- roundtrip = xr .open_dataset (f"{ tmpdir } /refs.{ format } " , engine = "kerchunk" )
262+ roundtrip = roundtrip_func (vds , tmpdir )
275263
276- assert roundtrip .a .attrs == ds .a .attrs
264+ assert roundtrip .a .attrs == vds .a .attrs
277265
278266
279267@parametrize_over_hdf_backends
0 commit comments