-
Notifications
You must be signed in to change notification settings - Fork 49
open_virtual_dataset with dmr++ #113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 2 commits
Commits
Show all changes
28 commits
Select commit
Hold shift + click to select a range
18b53bd
basic dmr parsing functionality
ayushnag 47d8901
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] f3bfa82
Merge branch 'TomNicholas:main' into dmr-adapter
ayushnag aaf6af2
Speedup DMR chunk key parsing
agoodm fc8b0d8
Merge pull request #1 from agoodm/dmr-adapter
ayushnag 7b81eeb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 8334d0a
added groups, docs, and bug fixes
ayushnag 64d59b1
Merge branch 'TomNicholas:main' into dmr-adapter
ayushnag 1a3b787
Merge branch 'zarr-developers:main' into dmr-adapter
ayushnag 7580fdc
rework hdf5 parser and group logic
ayushnag 52ceba0
Merge remote-tracking branch 'upstream/main' into dmr-adapter
ayushnag b1f9aee
update attrs cast to python dtype
ayushnag ae29176
parser passing tests
ayushnag 6e763f9
match main manifest dtypes
ayushnag 0824ed2
Merge branch 'zarr-developers:main' into dmr-adapter
ayushnag 659ab65
Merge branch 'zarr-developers:main' into dmr-adapter
ayushnag b8531c8
Merge branch 'zarr-developers:main' into dmr-adapter
ayushnag 0125d71
Merge branch 'zarr-developers:main' into dmr-adapter
ayushnag ef8aa9c
modularize dmrpp.py
ayushnag 7638092
add dmrpp api docs
ayushnag 83cb586
resolve conflict
ayushnag cb6feff
resolve releases conflict
ayushnag 888ce32
indexes and docs fix
ayushnag 3e15e8e
Merge branch 'main' into dmr-adapter
TomNicholas ee23ec0
Fix type hint for shape
TomNicholas d9337ff
change how FileType is used
TomNicholas 6bb9218
Change FileType check again
TomNicholas d1948d4
fix storage_options bug
TomNicholas File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
import ast | ||
from xml.etree import ElementTree as ET | ||
|
||
import numpy as np | ||
import xarray as xr | ||
|
||
from virtualizarr.manifests import ManifestArray | ||
from virtualizarr.zarr import ZArray | ||
|
||
|
||
class DMRParser:
    """Parse a DMR++ XML document into an xarray Dataset of ManifestArrays.

    The parser reads ``Dimension``, typed variable, ``Attribute`` and
    ``dmrpp:chunks`` elements that are direct children of the document root
    and turns every variable into an ``xr.Variable`` wrapping a
    ``ManifestArray`` (chunk byte ranges only; no array data is read).
    """

    # XML namespaces in ElementTree's ``{uri}`` form; prepended to tag names.
    dap_namespace = "{http://xml.opendap.org/ns/DAP/4.0#}"
    dmr_namespace = "{http://xml.opendap.org/dap/dmrpp/1.0.0#}"
    # Variable tag name (without namespace) -> numpy dtype string.
    dap_npdtype = {
        "Byte": "uint8",
        "UByte": "uint8",
        "Int8": "int8",
        "UInt8": "uint8",
        "Int16": "int16",
        "UInt16": "uint16",
        "Int32": "int32",
        "UInt32": "uint32",
        "Int64": "int64",
        "UInt64": "uint64",
        "Url": "str",
        "Float32": "float32",
        "Float64": "float64",
        "String": "str",
    }

    def __init__(self, dmr: str):
        """Parse the XML text and remember the data file path.

        Parameters
        ----------
        dmr
            The DMR++ document as an XML string. The root element's ``name``
            attribute is stored as the path written into every chunk entry.

        Raises
        ------
        xml.etree.ElementTree.ParseError
            If ``dmr`` is not well-formed XML.
        KeyError
            If the root element has no ``name`` attribute.
        """
        self.root = ET.fromstring(dmr)
        self.data_filepath = self.root.attrib["name"]
        # Dimension name -> size; filled in by ``parse_dataset``.
        self.global_dims: dict[str, int] = {}

    def parse_dataset(self) -> "xr.Dataset":
        """Build a Dataset from the root element's dimensions, variables and attributes.

        Variables referenced by any ``Map`` tag become coordinates; all other
        variables become data variables. Root-level attributes whose ``type``
        is ``"Container"`` are skipped.

        Returns
        -------
        xr.Dataset
            Dataset whose variables wrap ``ManifestArray`` objects. Coordinates
            are created with an empty ``indexes`` mapping, so no indexes are
            built from them.
        """
        dap = self.dap_namespace

        # Dimension sizes must be known before any variable shape is computed.
        for dim_tag in self.root.iterfind(dap + "Dimension"):
            self.global_dims[dim_tag.attrib["name"]] = int(dim_tag.attrib["size"])

        # Variables are the root's children whose tag is one of the DAP types.
        vars_tags = [
            var_tag
            for dap_dtype in self.dap_npdtype
            for var_tag in self.root.findall(dap + dap_dtype)
        ]

        # A variable is a coordinate if some variable names it in a Map tag.
        coord_names = {
            map_tag.attrib["name"].removeprefix("/")
            for var_tag in vars_tags
            for map_tag in var_tag.iterfind(dap + "Map")
        }

        coords = {}
        data_vars = {}
        for var_tag in vars_tags:
            name = var_tag.attrib["name"]
            target = coords if name in coord_names else data_vars
            target[name] = self.parse_variable(var_tag)

        # Dataset-level attributes (Container attributes are not flattened).
        attrs = {}
        for attr_tag in self.root.iterfind(dap + "Attribute"):
            if attr_tag.attrib["type"] != "Container":
                attrs.update(self.parse_attribute(attr_tag))

        return xr.Dataset(
            data_vars=data_vars,
            coords=xr.Coordinates(coords=coords, indexes={}),
            attrs=attrs,
        )

    def parse_variable(self, root: ET.Element) -> "xr.Variable":
        """Convert one variable element into an ``xr.Variable`` backed by a ManifestArray.

        Parameters
        ----------
        root
            A variable element (its tag, minus the DAP namespace, must be a
            key of ``dap_npdtype``). Its ``Dim`` children must name dimensions
            already present in ``self.global_dims``.

        Returns
        -------
        xr.Variable
            Variable whose data is a ``ManifestArray``. The keys
            ``_FillValue``, ``missing_value``, ``scale_factor`` and
            ``add_offset`` are moved from ``attrs`` into ``encoding``.

        Raises
        ------
        KeyError
            If a ``Dim`` names a dimension missing from ``self.global_dims``,
            or the element's tag is not a known DAP type.
        ValueError
            If the element has no ``dmrpp:chunks`` child, so no chunk
            manifest can be built.
        """
        # Dimensions and the full array shape.
        dims = [
            dim_tag.attrib["name"].removeprefix("/")
            for dim_tag in root.iterfind(self.dap_namespace + "Dim")
        ]
        shape = tuple(self.global_dims[dim] for dim in dims)

        chunks_tag = root.find(self.dmr_namespace + "chunks")
        if chunks_tag is None:
            # Fail with a message naming the variable instead of an opaque
            # AttributeError on NoneType.
            raise ValueError(
                f"variable {root.attrib.get('name')!r} has no dmrpp:chunks element"
            )

        # Without an explicit chunkDimensionSizes the whole array is one chunk.
        chunks = shape
        sizes_tag = chunks_tag.find(self.dmr_namespace + "chunkDimensionSizes")
        if sizes_tag is not None:
            chunks = tuple(map(int, sizes_tag.text.split()))
        chunkmanifest = self.parse_chunks(chunks_tag, chunks)

        # Variable-level attributes.
        attrs = {}
        for attr_tag in root.iterfind(self.dap_namespace + "Attribute"):
            attrs.update(self.parse_attribute(attr_tag))

        # A "*" fill value is treated the same as no fill value.
        dtype = np.dtype(self.dap_npdtype[root.tag.removeprefix(self.dap_namespace)])
        fill_value = attrs.get("_FillValue")
        if fill_value == "*":
            fill_value = None

        zarray = ZArray(
            chunks=chunks,
            dtype=dtype,
            fill_value=fill_value,
            order="C",
            shape=shape,
            zarr_format=3,
        )
        marr = ManifestArray(zarray=zarray, chunkmanifest=chunkmanifest)

        # Split attributes into encoding keys and everything else.
        encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
        encoding = {key: value for key, value in attrs.items() if key in encoding_keys}
        attrs = {key: value for key, value in attrs.items() if key not in encoding_keys}
        return xr.Variable(dims=dims, data=marr, attrs=attrs, encoding=encoding)

    def parse_attribute(self, root: ET.Element) -> dict:
        """Return ``{name: value}`` for one ``Attribute`` element.

        The value is the text of the single child element when there is
        exactly one child; otherwise it is ``str()`` of the list of child
        texts, e.g. ``"['1', '2']"`` (or ``"[]"`` when there are no children).
        Values are returned as text; no dtype conversion is applied.
        """
        values = [value_tag.text for value_tag in root]
        value = values[0] if len(values) == 1 else str(values)
        return {root.attrib["name"]: value}

    def parse_chunks(self, root: ET.Element, chunks: tuple) -> dict:
        """Build the chunk manifest dict from a ``dmrpp:chunks`` element.

        Parameters
        ----------
        root
            The ``dmrpp:chunks`` element whose ``dmrpp:chunk`` children carry
            ``offset``, ``nBytes`` and optionally ``chunkPositionInArray``.
        chunks
            Chunk shape, used to convert element positions to chunk indices.

        Returns
        -------
        dict
            Maps a dot-joined chunk index (e.g. ``"0.1.5"``) to a dict with
            ``path`` (``self.data_filepath``), ``offset`` and ``length``.
        """
        chunkmanifest = {}
        for chunk_tag in root.iterfind(self.dmr_namespace + "chunk"):
            if "chunkPositionInArray" in chunk_tag.attrib:
                # Attribute text looks like "[0,1023,10235]".
                chunk_pos = np.asarray(
                    ast.literal_eval(chunk_tag.attrib["chunkPositionInArray"])
                )
            else:
                # No position given: treat the chunk as starting at the origin.
                chunk_pos = np.zeros(len(chunks), dtype=int)
            # [0,1023,10235] // [1, 1023, 2047] -> [0,1,5]
            chunk_num = chunk_pos // chunks
            # [0,1,5] -> "0.1.5"
            chunk_key = ".".join(map(str, chunk_num))
            chunkmanifest[chunk_key] = {
                "path": self.data_filepath,
                "offset": int(chunk_tag.attrib["offset"]),
                "length": int(chunk_tag.attrib["nBytes"]),
            }
        return chunkmanifest
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.