Skip to content

Commit 2f4bfb3

Browse files
authored
Merge pull request #93 from scottyhq/http-netcdf
Add logic for remote or local files in NetCDFSource
2 parents ddff413 + dc2ec1f commit 2f4bfb3

File tree

10 files changed

+326
-16
lines changed

10 files changed

+326
-16
lines changed

.github/workflows/main.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
strategy:
1414
fail-fast: false
1515
matrix:
16-
CONDA_ENV: [36, 37, 38, 37-upstream]
16+
CONDA_ENV: [py36, py37, py38, py39, upstream]
1717
steps:
1818
- name: Checkout
1919
uses: actions/checkout@v2
@@ -24,7 +24,7 @@ jobs:
2424
auto-update-conda: true
2525
auto-activate-base: false
2626
activate-environment: test_env
27-
environment-file: ci/environment-py${{ matrix.CONDA_ENV }}.yml
27+
environment-file: ci/environment-${{ matrix.CONDA_ENV }}.yml
2828

2929
- name: Development Install Intake-Xarray
3030
shell: bash -l {0}
@@ -35,4 +35,4 @@ jobs:
3535
- name: Run Tests
3636
shell: bash -l {0}
3737
run: |
38-
pytest --verbose
38+
pytest --verbose --ignore=intake_xarray/tests/test_network.py

ci/environment-py36.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,19 @@ channels:
44
dependencies:
55
- python=3.6
66
- aiohttp
7+
- boto3
8+
- flask
9+
- h5netcdf
710
- intake
811
- netcdf4
912
- pip
1013
- pydap
1114
- pytest
1215
- rasterio
16+
- s3fs
1317
- scikit-image
1418
- xarray
1519
- zarr
1620
- pip:
1721
- rangehttpserver
22+
- moto[s3]

ci/environment-py37.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,19 @@ channels:
44
dependencies:
55
- python=3.7
66
- aiohttp
7+
- boto3
8+
- flask
9+
- h5netcdf
710
- intake
811
- netcdf4
912
- pip
1013
- pydap
1114
- pytest
1215
- rasterio
16+
- s3fs
1317
- scikit-image
1418
- xarray
1519
- zarr
1620
- pip:
1721
- rangehttpserver
22+
- moto[s3]

ci/environment-py38.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,19 @@ channels:
44
dependencies:
55
- python=3.8
66
- aiohttp
7+
- boto3
8+
- flask
9+
- h5netcdf
710
- intake
811
- netcdf4
912
- pip
1013
- pydap
1114
- pytest
1215
- rasterio
16+
- s3fs
1317
- scikit-image
1418
- xarray
1519
- zarr
1620
- pip:
1721
- rangehttpserver
22+
- moto[s3]

ci/environment-py39.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
name: test_env
2+
channels:
3+
- conda-forge
4+
dependencies:
5+
- python=3.9
6+
- aiohttp
7+
- boto3
8+
- flask
9+
- h5netcdf
10+
- intake
11+
- netcdf4
12+
- pip
13+
- pydap
14+
- pytest
15+
- rasterio
16+
- s3fs
17+
- scikit-image
18+
- xarray
19+
- zarr
20+
- pip:
21+
- rangehttpserver
22+
- moto[s3]

ci/environment-upstream.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,22 @@ name: test_env
22
channels:
33
- conda-forge
44
dependencies:
5-
- python=3.7
5+
- python
66
- aiohttp
7+
- boto3
8+
- flask
9+
- h5netcdf
710
- netcdf4
811
- pip
912
- pydap
1013
- pytest
1114
- rasterio
15+
- s3fs
1216
- scikit-image
13-
- xarray
1417
- zarr
1518
- pip:
1619
- git+https://github.com/intake/filesystem_spec.git
1720
- git+https://github.com/intake/intake.git
21+
- git+https://github.com/pydata/xarray.git
1822
- rangehttpserver
23+
- moto[s3]

intake_xarray/netcdf.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ class NetCDFSource(DataSourceMixin, PatternMixin):
3737
Whether to treat the path as a pattern (ie. ``data_{field}.nc``)
3838
and create new coordinates in the output corresponding to pattern
3939
fields. If str, is treated as pattern to match on. Default is True.
40+
xarray_kwargs: dict
41+
Additional xarray kwargs for xr.open_dataset() or xr.open_mfdataset().
4042
storage_options: dict
4143
If using a remote fs (whether caching locally or not), these are
4244
the kwargs to pass to that FS.
@@ -54,6 +56,10 @@ def __init__(self, urlpath, chunks=None, combine=None, concat_dim=None,
5456
self.storage_options = storage_options or {}
5557
self.xarray_kwargs = xarray_kwargs or {}
5658
self._ds = None
59+
if isinstance(self.urlpath, list):
60+
self._can_be_local = fsspec.utils.can_be_local(self.urlpath[0])
61+
else:
62+
self._can_be_local = fsspec.utils.can_be_local(self.urlpath)
5763
super(NetCDFSource, self).__init__(metadata=metadata, **kwargs)
5864

5965
def _open_dataset(self):
@@ -76,7 +82,12 @@ def _open_dataset(self):
7682
kwargs.update(concat_dim=self.concat_dim)
7783
else:
7884
_open_dataset = xr.open_dataset
79-
url = fsspec.open_local(url, **self.storage_options)
85+
86+
if self._can_be_local:
87+
url = fsspec.open_local(self.urlpath, **self.storage_options)
88+
else:
89+
# https://github.com/intake/filesystem_spec/issues/476#issuecomment-732372918
90+
url = fsspec.open(self.urlpath, **self.storage_options).open()
8091

8192
self._ds = _open_dataset(url, chunks=self.chunks, **kwargs)
8293

intake_xarray/raster.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,18 @@ class RasterIOSource(DataSourceMixin, PatternMixin):
3030
- ``s3://data/landsat8_band{band}.tif``
3131
- ``s3://data/{location}/landsat8_band{band}.tif``
3232
- ``{{ CATALOG_DIR }}data/landsat8_{start_date:%Y%m%d}_band{band}.tif``
33-
chunks: int or dict
33+
chunks: None or int or dict, optional
3434
Chunks is used to load the new dataset into dask
3535
arrays. ``chunks={}`` loads the dataset with dask using a single
36-
chunk for all arrays.
36+
chunk for all arrays. default `None` loads numpy arrays.
3737
path_as_pattern: bool or str, optional
3838
Whether to treat the path as a pattern (ie. ``data_{field}.tif``)
3939
and create new coordinates in the output corresponding to pattern
4040
fields. If str, is treated as pattern to match on. Default is True.
4141
"""
4242
name = 'rasterio'
4343

44-
def __init__(self, urlpath, chunks, concat_dim='concat_dim',
44+
def __init__(self, urlpath, chunks=None, concat_dim='concat_dim',
4545
xarray_kwargs=None, metadata=None, path_as_pattern=True,
4646
storage_options=None, **kwargs):
4747
self.path_as_pattern = path_as_pattern
@@ -81,7 +81,9 @@ def _open_dataset(self):
8181
if self._can_be_local:
8282
files = fsspec.open_local(self.urlpath, **self.storage_options)
8383
else:
84+
# pass URLs to delegate remote opening to rasterio library
8485
files = self.urlpath
86+
#files = fsspec.open(self.urlpath, **self.storage_options).open()
8587
if isinstance(files, list):
8688
self._ds = self._open_files(files)
8789
else:
@@ -115,11 +117,17 @@ def _get_schema(self):
115117
metadata[k] = v
116118
except TypeError:
117119
pass
120+
121+
if hasattr(self._ds.data, 'npartitions'):
122+
npart = self._ds.data.npartitions
123+
else:
124+
npart = None
125+
118126
self._schema = Schema(
119127
datashape=None,
120128
dtype=str(self._ds.dtype),
121129
shape=self._ds.shape,
122-
npartitions=self._ds.data.npartitions,
130+
npartitions=npart,
123131
extra_metadata=metadata)
124132

125133
return self._schema

intake_xarray/tests/test_network.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Tests that read public data over the internet
2+
import intake
3+
import pytest
4+
import xarray as xr
5+
import s3fs
6+
import gcsfs
7+
8+
# RasterIOSource
9+
def test_open_rasterio_http():
10+
prefix = 'https://landsat-pds.s3.us-west-2.amazonaws.com/L8/139/045'
11+
image = 'LC81390452014295LGN00/LC81390452014295LGN00_B1.TIF'
12+
url = f'{prefix}/{image}'
13+
source = intake.open_rasterio(url,
14+
chunks=dict(band=1))
15+
ds = source.to_dask()
16+
assert isinstance(ds, xr.core.dataarray.DataArray)
17+
18+
19+
def test_open_rasterio_s3():
20+
bucket = 's3://landsat-pds'
21+
key = 'L8/139/045/LC81390452014295LGN00/LC81390452014295LGN00_B1.TIF'
22+
url = f'{bucket}/{key}'
23+
source = intake.open_rasterio(url,
24+
chunks=dict(band=1),
25+
storage_options = dict(anon=True))
26+
ds = source.to_dask()
27+
assert isinstance(ds, xr.core.dataarray.DataArray)
28+
29+
30+
# NETCDFSource
31+
def test_open_netcdf_gs():
32+
bucket = 'gs://ldeo-glaciology'
33+
key = 'bedmachine/BedMachineAntarctica_2019-11-05_v01.nc'
34+
url = f'{bucket}/{key}'
35+
source = intake.open_netcdf(url,
36+
chunks=3000,
37+
xarray_kwargs=dict(engine='h5netcdf'),
38+
)
39+
ds = source.to_dask()
40+
assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore)
41+
assert isinstance(ds, xr.core.dataarray.Dataset)
42+
43+
def test_open_netcdf_s3():
44+
bucket = 's3://its-live-data.jpl.nasa.gov'
45+
key = 'icesat2/alt06/rel003/ATL06_20181230162257_00340206_003_01.h5'
46+
url = f'{bucket}/{key}'
47+
source = intake.open_netcdf(url,
48+
xarray_kwargs=dict(group='gt1l/land_ice_segments', engine='h5netcdf'),
49+
storage_options=dict(anon=True),
50+
)
51+
ds = source.to_dask()
52+
assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore)
53+
assert isinstance(ds, xr.core.dataarray.Dataset)
54+
55+
56+
def test_open_netcdf_s3_simplecache():
57+
bucket = 's3://its-live-data.jpl.nasa.gov'
58+
key = 'icesat2/alt06/rel003/ATL06_20181230162257_00340206_003_01.h5'
59+
url = f'simplecache::{bucket}/{key}'
60+
source = intake.open_netcdf(url,
61+
xarray_kwargs=dict(group='gt1l/land_ice_segments', engine='h5netcdf'),
62+
storage_options=dict(s3={'anon': True}),
63+
)
64+
ds = source.to_dask()
65+
assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore)
66+
assert isinstance(ds, xr.core.dataarray.Dataset)

0 commit comments

Comments
 (0)