-
Notifications
You must be signed in to change notification settings - Fork 56
Description
I know the package is very much still under development and I understand that not all features are implemented yet, e.g. the ability to open reference files with inlined references.
What I did
I created a kerchunk reference dataset and inlined all references (except .za* as those need to be not inlined) and tried opening those references again with virtualizarr.
import xarray as xr
import tempfile
import json
from pathlib import Path
import virtualizarr as vz
import os
# Create a small in-memory xarray dataset to generate kerchunk references from.
ds1 = xr.Dataset(
    {
        "a": (("x", "y"), [[1, 2], [3, 4]]),
        "b": (("x", "y"), [[10, 20], [30, 40]]),
    },
    coords={"x": [10, 20], "y": [1, 2]},
)
ref1 = ds1.virtualize.to_kerchunk()
# Use mkdtemp() rather than TemporaryDirectory().name: a TemporaryDirectory
# object that is discarded immediately removes its directory when it is
# finalized, leaving only a dangling path. mkdtemp() returns a directory that
# stays on disk until it is deleted explicitly.
tempdir1 = Path(tempfile.mkdtemp())
def outline_references(ref: dict, folder: Path = None) -> dict:
    """
    Write inlined kerchunk chunk references to files and point the refs at them.

    Virtualizarr currently does not support inlined references, so every
    inlined chunk (a string value) is written to ``folder / <key>`` and the
    entry is replaced by ``[path, offset, length]``. Zarr metadata keys whose
    basename starts with a dot (.zarray, .zattrs, .zgroup) and entries that
    are already ``[path, offset, length]`` lists are left untouched.

    Parameters
    ----------
    ref
        Kerchunk reference dict with a ``"refs"`` mapping. It is modified in
        place.
    folder
        Directory under which the chunk files are written. Missing parent
        directories are created.

    Returns
    -------
    dict
        The same ``ref`` object, with inlined chunks replaced by file
        references.

    Raises
    ------
    ValueError
        If an inlined chunk has to be written but ``folder`` is None.
    """
    import base64

    base64_prefix = "base64:"
    refs = ref["refs"]
    # Only values of existing keys are reassigned, so the dict does not change
    # size while it is being iterated.
    for key, value in refs.items():
        if os.path.basename(key).startswith(".") or not isinstance(value, str):
            continue
        if folder is None:
            raise ValueError(
                "`folder` must be given to write inlined references to disk"
            )
        # Kerchunk marks base64-encoded binary chunks with a "base64:" prefix;
        # any other string is the chunk data itself.
        if value.startswith(base64_prefix):
            payload = base64.b64decode(value[len(base64_prefix):])
        else:
            payload = value.encode("utf-8")
        target = Path(folder) / key
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(payload)
        # The length must be the number of bytes in the file, not the
        # in-memory size of the Python string object.
        refs[key] = [str(target), 0, len(payload)]
    return ref
# Move every inlined chunk into its own file under tempdir1.
ref1 = outline_references(ref1, tempdir1)
# Persist the references as JSON, because open_virtual_dataset expects a
# path string rather than an in-memory dict.
Path("ref1.json").write_text(json.dumps(ref1))
vds1 = vz.open_virtual_dataset("ref1.json", filetype='kerchunk')
What happened
I get several errors when doing vds1 = vz.open_virtual_dataset("ref1.json", filetype='kerchunk'):
File ~/virtualizarr/lib/python3.10/site-packages/virtualizarr/manifests/manifest.py:100, in validate_and_normalize_path_to_uri(path, fs_root)
97 _path = PosixPath(path)
99 if not _path.suffix:
--> 100 raise ValueError(
101 f"entries in the manifest must be paths to files, but this path has no file suffix: {path}"
102 )
104 # only posix paths can possibly not be absolute
105 if not _path.is_absolute():
ValueError: entries in the manifest must be paths to files, but this path has no file suffix: /var/folders/fj/g0x4n_f15tb6zfwjhzc8gvzr0000gn/T/tmpcz41hy4y/x/0
Full traceback
In [10]: vds1 = vz.open_virtual_dataset("ref1.json", filetype='kerchunk')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[10], line 1
----> 1 vds1 = vz.open_virtual_dataset("ref1.json", filetype='kerchunk')
File ~/virtualizarr/lib/python3.10/site-packages/virtualizarr/backend.py:203, in open_virtual_dataset(filepath, filetype, group, drop_variables, loadable_variables, decode_times, cftime_variables, indexes, virtual_array_class, virtual_backend_kwargs, reader_options, backend)
200 if backend_cls is None:
201 raise NotImplementedError(f"Unsupported file type: {filetype.name}")
--> 203 vds = backend_cls.open_virtual_dataset(
204 filepath,
205 group=group,
206 drop_variables=drop_variables,
207 loadable_variables=loadable_variables,
208 decode_times=decode_times,
209 indexes=indexes,
210 virtual_backend_kwargs=virtual_backend_kwargs,
211 reader_options=reader_options,
212 )
214 return vds
File ~/virtualizarr/lib/python3.10/site-packages/virtualizarr/readers/kerchunk.py:75, in KerchunkVirtualBackend.open_virtual_dataset(filepath, group, drop_variables, loadable_variables, decode_times, indexes, virtual_backend_kwargs, reader_options)
72 with fs.open_file() as of:
73 refs = ujson.load(of)
---> 75 vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs), fs_root=fs_root)
77 else:
78 raise ValueError(
79 "The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. If your Kerchunk generated references are saved in parquet format, make sure the file extension is `.parquet`. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues"
80 )
File ~/virtualizarr/lib/python3.10/site-packages/virtualizarr/translators/kerchunk.py:136, in dataset_from_kerchunk_refs(refs, drop_variables, virtual_array_class, indexes, fs_root)
119 def dataset_from_kerchunk_refs(
120 refs: KerchunkStoreRefs,
121 drop_variables: list[str] = [],
(...)
124 fs_root: str | None = None,
125 ) -> Dataset:
126 """
127 Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays.
128
(...)
133 Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that.
134 """
--> 136 vars = virtual_vars_from_kerchunk_refs(
137 refs, drop_variables, virtual_array_class, fs_root=fs_root
138 )
139 ds_attrs = fully_decode_arr_refs(refs["refs"]).get(".zattrs", {})
140 coord_names = ds_attrs.pop("coordinates", [])
File ~/virtualizarr/lib/python3.10/site-packages/virtualizarr/translators/kerchunk.py:110, in virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class, fs_root)
105 drop_variables = []
106 var_names_to_keep = [
107 var_name for var_name in var_names if var_name not in drop_variables
108 ]
--> 110 vars = {
111 var_name: variable_from_kerchunk_refs(
112 refs, var_name, virtual_array_class, fs_root=fs_root
113 )
114 for var_name in var_names_to_keep
115 }
116 return vars
File ~/virtualizarr/lib/python3.10/site-packages/virtualizarr/translators/kerchunk.py:111, in <dictcomp>(.0)
105 drop_variables = []
106 var_names_to_keep = [
107 var_name for var_name in var_names if var_name not in drop_variables
108 ]
110 vars = {
--> 111 var_name: variable_from_kerchunk_refs(
112 refs, var_name, virtual_array_class, fs_root=fs_root
113 )
114 for var_name in var_names_to_keep
115 }
116 return vars
File ~/virtualizarr/lib/python3.10/site-packages/virtualizarr/translators/kerchunk.py:169, in variable_from_kerchunk_refs(refs, var_name, virtual_array_class, fs_root)
167 dims = zattrs.pop("_ARRAY_DIMENSIONS")
168 if chunk_dict:
--> 169 manifest = manifest_from_kerchunk_chunk_dict(chunk_dict, fs_root=fs_root)
170 varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest)
171 elif len(zarray.shape) != 0:
172 # empty variables don't have physical chunks, but zarray shows that the variable
173 # is at least 1D
File ~/virtualizarr/lib/python3.10/site-packages/virtualizarr/translators/kerchunk.py:200, in manifest_from_kerchunk_chunk_dict(kerchunk_chunk_dict, fs_root)
198 elif not isinstance(v, (tuple, list)):
199 raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}")
--> 200 chunk_entries[k] = chunkentry_from_kerchunk(v, fs_root=fs_root)
201 return ChunkManifest(entries=chunk_entries)
File ~/virtualizarr/lib/python3.10/site-packages/virtualizarr/translators/kerchunk.py:217, in chunkentry_from_kerchunk(path_and_byte_range_info, fs_root)
215 else:
216 path, offset, length = path_and_byte_range_info
--> 217 return ChunkEntry.with_validation( # type: ignore[attr-defined]
218 path=path, offset=offset, length=length, fs_root=fs_root
219 )
File ~/virtualizarr/lib/python3.10/site-packages/virtualizarr/manifests/manifest.py:52, in ChunkEntry.with_validation(cls, path, offset, length, fs_root)
40 """
41 Constructor which validates each part of the chunk entry.
42
(...)
47 Required if any (likely kerchunk-generated) paths are relative in order to turn them into absolute paths (which virtualizarr requires).
48 """
50 # note: we can't just use `__init__` or a dataclass' `__post_init__` because we need `fs_root` to be an optional kwarg
---> 52 path = validate_and_normalize_path_to_uri(path, fs_root=fs_root)
53 validate_byte_range(offset=offset, length=length)
54 return ChunkEntry(path=path, offset=offset, length=length)
File ~/virtualizarr/lib/python3.10/site-packages/virtualizarr/manifests/manifest.py:100, in validate_and_normalize_path_to_uri(path, fs_root)
97 _path = PosixPath(path)
99 if not _path.suffix:
--> 100 raise ValueError(
101 f"entries in the manifest must be paths to files, but this path has no file suffix: {path}"
102 )
104 # only posix paths can possibly not be absolute
105 if not _path.is_absolute():
ValueError: entries in the manifest must be paths to files, but this path has no file suffix: /var/folders/fj/g0x4n_f15tb6zfwjhzc8gvzr0000gn/T/tmpcz41hy4y/x/0
I had to modify manifest.py at both line 88 and line 100 and deactivate these suffix checks to be able to load the data:
elif any(path.startswith(prefix) for prefix in VALID_URI_PREFIXES):
#if not PosixPath(path).suffix:
# raise ValueError(
# f"entries in the manifest must be paths to files, but this path has no file suffix: {path}"
# )
return path # path is already in URI form
#if not _path.suffix:
#raise ValueError(
# f"entries in the manifest must be paths to files, but this path has no file suffix: {path}"
# )
This is obviously not a permanent fix and ignores other cases, but this is how it currently works for me.
(This issue was originally posted in a modified version at fsspec/kerchunk#536 (comment))