merging main

orianac · orianac · commit 67582119eff2 · 2021-11-23T21:32:28.000Z
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -29,7 +29,7 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
-      - uses: actions/cache@v2.1.6
+      - uses: actions/cache@v2.1.7
         with:
           path: ~/.cache/pip
           key: ${{ runner.os }}-pip-${{ hashFiles('**/dev-requirements.txt') }}
@@ -62,7 +62,7 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
-      - uses: actions/cache@v2.1.6
+      - uses: actions/cache@v2.1.7
         with:
           path: ~/.cache/pip
           key: ${{ runner.os }}-pip-${{ hashFiles('**/dev-requirements.txt') }}
diff --git a/setup.cfg b/setup.cfg
@@ -8,7 +8,7 @@ select = B,C,E,F,W,T4,B9
 
 [isort]
 known_first_party=xarray_schema
-known_third_party=numpy,pkg_resources,pytest,setuptools,xarray
+known_third_party=dask,numpy,pkg_resources,pytest,setuptools,xarray
 multi_line_output=3
 include_trailing_comma=True
 force_grid_wrap=0
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -102,6 +102,40 @@ def test_dataarray_validate_chunks():
     with pytest.raises(SchemaError, match=r'.*(2, 1).*'):
         schema.validate(da)
 
+    # check that when expected chunk == -1 it fails
+    schema = DataArraySchema(chunks={'x': -1})
+    with pytest.raises(SchemaError, match=r'.*(4).*'):
+        schema.validate(da)
+
+    # check that a chunk spanning the whole dimension validates whether the
+    # schema and the data spell it as -1 or as the explicit length 4
+    schema = DataArraySchema(chunks={'x': 4})
+    da = xr.DataArray(np.ones(4), dims=['x']).chunk({'x': -1})
+    schema.validate(da)
+
+    schema = DataArraySchema(chunks={'x': -1})
+    da = xr.DataArray(np.ones(4), dims=['x']).chunk({'x': 4})
+    schema.validate(da)
+
+    schema = DataArraySchema(chunks={'x': -1})
+    da = xr.DataArray(np.ones(4), dims=['x']).chunk({'x': -1})
+    schema.validate(da)
+
+    # chunks=True only requires the DataArray to be chunked, whatever the chunk sizes
+    schema = DataArraySchema(chunks=True)
+    da = xr.DataArray(np.ones(4), dims=['x'])
+    with pytest.raises(SchemaError, match='Schema expected DataArray to be chunked but it is not'):
+        schema.validate(da)
+
+    # now try passing an irregularly chunked data array
+    da = xr.DataArray(np.ones(4), dims=['x']).chunk({'x': (1, 2, 1)})
+    schema.validate(da)
+
+    # test the check for regular chunk sizes
+    schema = DataArraySchema(chunks={'x': -1})
+    with pytest.raises(AssertionError, match=r'.*(gracious).*'):
+        schema.validate(da)
+
 
 def test_dataset_empty_constructor():
     ds_schema = DatasetSchema()
@@ -125,3 +159,13 @@ def test_dataset_example():
         }
     )
     ds_schema.validate(ds)
+
+
+def test_validate():
+    da = xr.DataArray(np.ones(4), dims=['x']).chunk({'x': (1, 2, 1)})
+    schema = DataArraySchema(chunks=False)
+    # chunks=False must reject a chunked DataArray and accept an unchunked one
+    with pytest.raises(SchemaError, match='Schema expected unchunked DataArray but it is chunked!'):
+        schema.validate(da)
+    da = xr.DataArray(np.ones(4), dims=['x'])
+    schema.validate(da)
diff --git a/xarray_schema/core.py b/xarray_schema/core.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import xarray as xr
+from dask.array.core import _check_regular_chunks
 
 # TODOs:
 # - api grouping, should the constructors look similar to the DataArray/Dataset constructors
@@ -24,6 +25,9 @@ class DataArraySchema:
         Shape of the DataArray. `None` may be used as a wildcard value. By default None
     dims : Tuple[Union[Hashable, None]], optional
         Dimensions of the DataArray.  `None` may be used as a wildcard value. By default None
+    chunks : Union[bool, Dict[Hashable, Union[int, None]]], optional
+        If bool, specifies whether the DataArray must be chunked (True) or unchunked (False), agnostic to chunk sizes.
+        If dict, maps dimension names to expected chunk sizes (-1 means the full dimension). By default None
     name : str, optional
         Name of the DataArray, by default None
     array_type : Any, optional
@@ -37,7 +41,7 @@ def __init__(
         shape: Tuple[Union[int, None]] = None,
         dims: Tuple[Union[Hashable, None]] = None,
         coords: Dict[Hashable, Any] = None,
-        chunks: Dict[Hashable, Union[int, None]] = None,
+        chunks: Union[bool, Dict[Hashable, Union[int, None]]] = None,
         name: str = None,
         array_type: Any = None,
         attrs: Dict[Hashable, Any] = None,
@@ -72,6 +76,12 @@ def validate(self, da: xr.DataArray) -> xr.DataArray:
         ------
         SchemaError
         '''
+        if not isinstance(da, xr.DataArray):
+            raise ValueError('Input must be a xarray.DataArray')
+
+        if self.chunks is not None:
+            if self.chunks is False and da.chunks:
+                raise SchemaError('Schema expected unchunked DataArray but it is chunked!')
 
         if self.dtype is not None and not np.issubdtype(da.dtype, self.dtype):
             raise SchemaError(f'dtype {da.dtype} != {self.dtype}')
@@ -103,17 +113,30 @@ def validate(self, da: xr.DataArray) -> xr.DataArray:
             raise NotImplementedError('coords schema not implemented yet')
 
         if self.chunks:
-            dim_chunks = dict(zip(da.dims, da.chunks))
-            for key, ec in self.chunks.items():
-                if isinstance(ec, int):
-                    for ac in dim_chunks[key][:-1]:
+            if self.chunks is True:
+                if not da.chunks:
+                    raise SchemaError('Schema expected DataArray to be chunked but it is not')
+
+            else:
+                assert type(self.chunks) == dict, 'Must pass chunks information as dictionary'
+                dim_chunks = dict(zip(da.dims, da.chunks))
+                dim_sizes = dict(zip(da.dims, da.shape))
+                # check whether chunk sizes are regular because we assume the first chunk to be representative below
+                assert _check_regular_chunks(da.chunks), 'Good gracious no! Chunks are not regular!'
+                for key, ec in self.chunks.items():
+                    if isinstance(ec, int):
+                        # a negative expected chunk size (e.g. -1) is shorthand for the full length of the dimension
+                        if ec < 0:
+                            ec = dim_sizes[key]
+                        # grab the first entry in da's tuple of chunks to be representative (since we've checked above that they're regular)
+                        ac = dim_chunks[key][0]
                         if ac != ec:
                             raise SchemaError(f'{key} chunks did not match: {ac} != {ec}')
 
-                else:  # assumes ec is an iterable
-                    ac = dim_chunks[key]
-                    if tuple(ac) != tuple(ec):
-                        raise SchemaError(f'{key} chunks did not match: {ac} != {ec}')
+                    else:  # assumes ec is an iterable
+                        ac = dim_chunks[key]
+                        if tuple(ac) != tuple(ec):
+                            raise SchemaError(f'{key} chunks did not match: {ac} != {ec}')
 
         if self.attrs:
             raise NotImplementedError('attrs schema not implemented yet')