MNT: Update enum parsing (#68)

bzah · web-flow · commit 29a0a69ef1a9 · 2024-02-07T17:07:37.000+01:00
* MNT: Update enum parsing

This aligns xncml enum parsing behavior with xarray's
netCDF4 backend behavior.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,7 @@
 **Breaking changes**
 - Nested group handling:
   Before this version, all groups were read, but conflicting variable names in-between groups would shadow data.  Now, similarly to xarray ``open_dataset``, ``open_ncml`` accepts an optional ``group`` argument to specify which group should be read. When ``group`` is not specified, it defaults to the root group. Additionally ``group`` can be set to ``'*'`` so that every group is read and the hierarchy is flattened.   In the event of conflicting variable/dimension names across groups, the conflicting name will be modified by appending ``'__n'`` where n is incremented.
-
+- Enums are no longer transformed into CF flag_values and flag_meanings attributes; instead, they are stored in the ``encoding["dtype"].metadata`` of their respective variable. This aligns with the behavior of xarray v2024.01.0.
 
 0.4.0 (2024-01-08)
 ==================
diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -333,8 +333,8 @@ def test_multiple_values_for_scalar():
 def test_read_enum():
     """An enum should be stored in the variable's dtype metadata (``enum`` and ``enum_name``), not as CF flag attributes."""
     ds = xncml.open_ncml(data / 'testEnums.xml')
-    assert ds['be_or_not_to_be'].attrs['flag_values'] == [0, 1]
-    assert ds['be_or_not_to_be'].attrs['flag_meanings'] == ['false', 'true']
+    assert ds.be_or_not_to_be.dtype.metadata['enum'] == {'false': 0, 'true': 1}
+    assert ds.be_or_not_to_be.dtype.metadata['enum_name'] == 'boolean'
 
 
 def test_empty_attr():
diff --git a/xncml/parser.py b/xncml/parser.py
@@ -459,20 +459,17 @@ def read_enum(obj: EnumTypedef) -> dict[str, list]:
     Returns
     -------
     dict:
-        A dictionary with CF flag_values and flag_meanings that describe the Enum.
+        A dictionary mapping each Enum member's name to its key (value).
     """
-    return {
-        'flag_values': list(map(lambda e: e.key, obj.content)),
-        'flag_meanings': list(map(lambda e: e.content[0], obj.content)),
-    }
+    return {e.content[0]: e.key for e in obj.content}
 
 
 def read_variable(
     target: xr.Dataset,
     ref: xr.Dataset,
     obj: Variable,
     dimensions: dict,
-    enums: dict,
+    enums: dict[str, dict[str, int]],
     group_path: str,
 ) -> xr.Dataset:
     """
@@ -576,10 +573,10 @@ def read_variable(
         raise NotImplementedError
 
     if obj.typedef in enums.keys():
-        # TODO (@bzah): Update this once Enums are merged in xarray
-        #      https://github.com/pydata/xarray/pull/8147
-        out.attrs['flag_values'] = enums[obj.typedef]['flag_values']
-        out.attrs['flag_meanings'] = enums[obj.typedef]['flag_meanings']
+        dtype = out.dtype
+        new_dtype = np.dtype(dtype, metadata={'enum': enums[obj.typedef], 'enum_name': obj.typedef})
+        out.encoding['dtype'] = new_dtype
+        out = out.astype(new_dtype)
     elif obj.typedef is not None:
         raise NotImplementedError
     import re