 from __future__ import annotations
 
+import datetime as dt
+import shutil
 from pathlib import Path
 
+import dask.dataframe as dd
 import pandas as pd
+from loguru import logger
 
 from nsidc.iceflow.data.fetch import search_and_download
 from nsidc.iceflow.data.models import (
+    BoundingBox,
+    Dataset,
     DatasetSearchParameters,
     IceflowDataFrame,
 )
 from nsidc.iceflow.data.read import read_data
 from nsidc.iceflow.itrf.converter import transform_itrf
 
 
-def fetch_iceflow_df(
+def _df_for_one_dataset(
     *,
-    dataset_search_params: DatasetSearchParameters,
+    dataset: Dataset,
+    bounding_box: BoundingBox,
+    temporal: tuple[dt.datetime | dt.date, dt.datetime | dt.date],
     output_dir: Path,
-    output_itrf: str | None = None,
+    # TODO: also add option for target epoch!!
+    output_itrf: str | None,
 ) -> IceflowDataFrame:
-    """Search for data matching parameters and return an IceflowDataframe.
-
-    Optionally transform data to the given ITRF for consistency.
-    """
-
     results = search_and_download(
-        short_name=dataset_search_params.dataset.short_name,
-        version=dataset_search_params.dataset.version,
-        bounding_box=dataset_search_params.bounding_box,
-        temporal=dataset_search_params.temporal,
+        short_name=dataset.short_name,
+        version=dataset.version,
+        bounding_box=bounding_box,
+        temporal=temporal,
         output_dir=output_dir,
     )
 
     all_dfs = []
     for result in results:
-        data_df = read_data(dataset_search_params.dataset, result)
+        data_df = read_data(dataset, result)
         all_dfs.append(data_df)
 
     complete_df = IceflowDataFrame(pd.concat(all_dfs))
@@ -46,3 +50,110 @@ def fetch_iceflow_df(
     )
 
     return complete_df
+
+
+def fetch_iceflow_df(
+    *,
+    dataset_search_params: DatasetSearchParameters,
+    output_dir: Path,
+    # TODO: also add option for target epoch!!
+    output_itrf: str | None = None,
+) -> IceflowDataFrame:
62+ """Search for data matching parameters and return an IceflowDataframe.
63+
64+ Optionally transform data to the given ITRF for consistency.
65+
66+ Note: a potentially large amount of data may be returned, especially if the
67+ user requests a large spatial/temporal area across multiple datasets. The
68+ result may not even fit in memory!
69+
70+ Consider using `create_iceflow_parquet` to fetch and store data in parquet
71+ format.
72+ """
+
+    dfs = []
+    for dataset in dataset_search_params.datasets:
+        result = _df_for_one_dataset(
+            dataset=dataset,
+            temporal=dataset_search_params.temporal,
+            bounding_box=dataset_search_params.bounding_box,
+            output_dir=output_dir,
+            output_itrf=output_itrf,
+        )
+        dfs.append(result)
+
+    complete_df = IceflowDataFrame(pd.concat(dfs))
+
+    return complete_df
+
+
+def create_iceflow_parquet(
+    *,
+    dataset_search_params: DatasetSearchParameters,
+    output_dir: Path,
+    target_itrf: str,
+    overwrite: bool = False,
+    target_epoch: str | None = None,
+) -> Path:
98+ """Create a parquet file containing the lat/lon/elev data matching the dataset search params.
99+
100+ This function creates a parquet file that can be easily used alongside dask,
101+ containing lat/lon/elev data compatible with a comparison to icesat 2 data.
102+
103+ Note: this function writes a single `iceflow.parquet` to the output
104+ dir. This code does not currently support updates to the parquet after being
105+ written. This is intended to help facilitate analysis of a specific area
106+ over time. If an existing `iceflow.parquet` exists and the user wants to
107+ create a new `iceflow.parquet` for a different area or timespan, they will
108+ need to move/remove the existing `iceflow.parquet` first (e.g., with the
109+ `overwrite=True` kwarg).
110+ """
+    output_subdir = output_dir / "iceflow.parquet"
+    if output_subdir.exists():
+        if overwrite:
+            logger.info("Removing existing iceflow.parquet")
+            shutil.rmtree(output_subdir)
+        else:
+            raise RuntimeError(
+                "An iceflow parquet file already exists. Use `overwrite=True` to overwrite."
+            )
+
+    for dataset in dataset_search_params.datasets:
+        results = search_and_download(
+            short_name=dataset.short_name,
+            version=dataset.version,
+            temporal=dataset_search_params.temporal,
+            bounding_box=dataset_search_params.bounding_box,
+            output_dir=output_dir,
+        )
+
+        for result in results:
+            data_df = read_data(dataset, result)
+            df = IceflowDataFrame(data_df)
+
+            df = transform_itrf(
+                data=df,
+                target_itrf=target_itrf,
+                target_epoch=target_epoch,
+            )
+
+            # Add a string col w/ dataset name and version.
+            df["dataset"] = [f"{dataset.short_name}v{dataset.version}"] * len(
+                df.latitude
+            )
+            common_columns = ["latitude", "longitude", "elevation", "dataset"]
+            common_dask_df = dd.from_pandas(df[common_columns])  # type: ignore[attr-defined]
+            if output_subdir.exists():
+                dd.to_parquet(  # type: ignore[attr-defined]
+                    df=common_dask_df,
+                    path=output_subdir,
+                    append=True,
+                    ignore_divisions=True,
+                )
+            else:
+                dd.to_parquet(  # type: ignore[attr-defined]
+                    df=common_dask_df,
+                    path=output_subdir,
+                )
+
+    return output_subdir
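
For illustration, here is a minimal usage sketch of the new `fetch_iceflow_df` entry point. Only the attribute names (`datasets`, `temporal`, `bounding_box`, `short_name`, `version`) are taken from this diff; the constructor keyword arguments for `BoundingBox` and `Dataset`, and all concrete values, are hypothetical placeholders:

    import datetime as dt
    from pathlib import Path

    from nsidc.iceflow.data.models import (
        BoundingBox,
        Dataset,
        DatasetSearchParameters,
    )

    # Hypothetical constructor kwargs; the diff only shows attribute access.
    search_params = DatasetSearchParameters(
        datasets=[Dataset(short_name="ILATM1B", version="1")],  # placeholder dataset
        bounding_box=BoundingBox(
            lower_left_lon=-103.13,
            lower_left_lat=-75.19,
            upper_right_lon=-102.20,
            upper_right_lat=-74.89,
        ),
        temporal=(dt.date(2009, 11, 1), dt.date(2009, 12, 1)),
    )

    # Keep the area/timespan small for this path: every granule is
    # concatenated into a single in-memory pandas-backed frame.
    iceflow_df = fetch_iceflow_df(
        dataset_search_params=search_params,
        output_dir=Path("./downloads"),
        output_itrf="ITRF2014",  # illustrative target frame; None skips the transform
    )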
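
For larger requests, a sketch of the parquet path, reusing the same hypothetical `search_params` from the sketch above; `dd.read_parquet` is standard dask, and the column names come from `common_columns` in the diff:

    import dask.dataframe as dd

    parquet_path = create_iceflow_parquet(
        dataset_search_params=search_params,
        output_dir=Path("./downloads"),
        target_itrf="ITRF2014",  # illustrative target frame
        overwrite=True,
    )

    # Read lazily; only latitude/longitude/elevation/dataset are stored, so
    # aggregations can run out-of-core.
    ddf = dd.read_parquet(parquet_path)
    print(ddf.groupby("dataset")["elevation"].mean().compute())

Writing each granule with `append=True` and `ignore_divisions=True` lets the parquet grow incrementally without requiring consistent index divisions across granules.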