Skip to content

Commit 5060c0e

Browse files
committed
more stuff + travis
1 parent 12d876f commit 5060c0e

File tree

4 files changed

+85
-1
lines changed

4 files changed

+85
-1
lines changed

.travis.yml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
language: python
2+
3+
# sudo false implies containerized builds, so we can use cacheing
4+
sudo: false
5+
6+
notifications:
7+
email: false
8+
9+
python:
10+
- 3.6
11+
12+
env:
13+
- CONDA_DEPS="pip flake8 pytest coverage pandas xarray dask" PIP_DEPS="codecov pytest-cov"
14+
15+
before_install:
16+
- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
17+
wget http://repo.continuum.io/miniconda/Miniconda-3.16.0-Linux-x86_64.sh -O miniconda.sh;
18+
else
19+
wget http://repo.continuum.io/miniconda/Miniconda3-3.16.0-Linux-x86_64.sh -O miniconda.sh;
20+
fi
21+
- bash miniconda.sh -b -f -p $HOME/miniconda
22+
- export PATH="$HOME/miniconda/bin:$PATH"
23+
- hash -r
24+
- conda config --set always_yes yes --set changeps1 no
25+
- conda update -q conda
26+
- conda info -a
27+
- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION $CONDA_DEPS
28+
- source activate test-environment
29+
- travis_retry pip install $PIP_DEPS
30+
31+
install:
32+
- python setup.py install --record installed_files.txt
33+
34+
script:
35+
- py.test xbatcher --cov=xbatcher --cov-config .coveragerc --cov-report term-missing -v
36+
37+
after_success:
38+
- codecov

xbatcher/features.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""Functions for transforming xarray datasets into features that can
2+
be input to machine learning libraries."""
3+
4+
def dataset_to_feature_dataframe(ds, coords_as_features=False):
    """Convert an xarray dataset into a feature dataframe.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset whose data variables become the feature columns.
    coords_as_features : bool, optional
        If True, promote the coordinate index levels produced by
        ``to_dataframe()`` into ordinary columns so they can be used as
        features. Default False (coords stay in the index), which matches
        the previous behavior of this function.

    Returns
    -------
    pandas.DataFrame
    """
    df = ds.to_dataframe()
    if coords_as_features:
        # to_dataframe() places the coords in the (Multi)Index; reset_index
        # moves them into regular columns when the caller asks for it.
        # (Previously this flag was accepted but silently ignored.)
        df = df.reset_index()
    return df

xbatcher/generators.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Classes for iterating through xarray datarrays / datasets in batches."""
2+
13
import xarray as xr
24
from collections import OrderedDict
35
import itertools
@@ -26,6 +28,7 @@ def __iter__(self):
2628
for slices in itertools.product(*[self._iterate_dim(dim)
2729
for dim in self.batch_dims]):
2830
selector = {key: slice for key, slice in zip(self.batch_dims, slices)}
31+
#print(selector)
2932
yield self.ds.isel(**selector)
3033

3134

xbatcher/tests/test_generators.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,13 @@
66

77
@pytest.fixture(scope='module')
def sample_ds_1d():
    """Module-scoped fixture: a 1D dataset with two variables on dim 'x'."""
    n = 100
    data_vars = {'foo': (['x'], np.random.rand(n)),
                 'bar': (['x'], np.random.randint(0, 10, n))}
    coords = {'x': (['x'], np.arange(n))}
    return xr.Dataset(data_vars, coords)
1414

15+
1516
# TODO: decide how to handle bsizes like 15 that don't evenly divide the dimension
1617
# Should we enforce that each batch size always has to be the same
1718
@pytest.mark.parametrize("bsize", [5, 10])
@@ -25,6 +26,7 @@ def test_batch_1d(sample_ds_1d, bsize):
2526
ds_batch_expected = sample_ds_1d.isel(x=expected_slice)
2627
assert ds_batch.equals(ds_batch_expected)
2728

29+
2830
@pytest.mark.parametrize("olap", [1, 4])
2931
def test_batch_1d_overlap(sample_ds_1d, olap):
3032
bsize = 10
@@ -37,3 +39,38 @@ def test_batch_1d_overlap(sample_ds_1d, olap):
3739
expected_slice = slice(stride*n, stride*n + bsize)
3840
ds_batch_expected = sample_ds_1d.isel(x=expected_slice)
3941
assert ds_batch.equals(ds_batch_expected)
42+
43+
44+
@pytest.fixture(scope='module')
def sample_ds_2d():
    """Module-scoped fixture: a 2D dataset with two variables on ('y', 'x')."""
    ny, nx = 50, 100
    data_vars = {'foo': (['y', 'x'], np.random.rand(ny, nx)),
                 'bar': (['y', 'x'], np.random.randint(0, 10, (ny, nx)))}
    coords = {'x': (['x'], np.arange(nx)),
              'y': (['y'], np.arange(ny))}
    return xr.Dataset(data_vars, coords)
52+
53+
54+
@pytest.mark.parametrize("bsize", [5, 10])
def test_batch_2d(sample_ds_2d, bsize):
    """Batch a 2D dataset first along one dim, then along both dims."""

    # Iterate over 'x' only: 'y' must stay at full size and each batch must
    # equal the corresponding contiguous isel slice.
    gen = BatchGenerator(sample_ds_2d, batch_sizes={'x': bsize})
    for n, batch in enumerate(gen):
        assert isinstance(batch, xr.Dataset)
        assert batch.dims['x'] == bsize
        assert batch.dims['y'] == sample_ds_2d.dims['y']
        expected = sample_ds_2d.isel(x=slice(bsize * n, bsize * (n + 1)))
        assert batch.equals(expected)

    # Iterate over both dims: check per-batch shapes and the total count.
    xbsize = 20
    gen = BatchGenerator(sample_ds_2d, batch_sizes={'y': bsize, 'x': xbsize})
    for n, batch in enumerate(gen):
        assert isinstance(batch, xr.Dataset)
        assert batch.dims['x'] == xbsize
        assert batch.dims['y'] == bsize
    # TODO? Is it worth it to try to reproduce the internal logic of the
    # generator and verify that the slices are correct?
    expected_count = ((sample_ds_2d.dims['x'] // xbsize)
                      * (sample_ds_2d.dims['y'] // bsize))
    assert (n + 1) == expected_count

0 commit comments

Comments
 (0)