Skip to content

Commit fe281f1

Browse files
committed
👌 Subset dataframe properly to within lake polygon
Ice volume displacements (a proxy for subglacial lake volume discharge) did not seem right, and it's because the geographical subsetting was done using a bounding box rather than an actual polygon. Using geopandas (CPU-based) for this since there are only a few hundred points involved. Also decided I might as well randomly commit an enhancement to the point_in_polygon_gpu code, with a new poly_limit parameter for conserving GPU memory.
1 parent cbb88f8 commit fe281f1

File tree

5 files changed

+31
-11
lines changed

5 files changed

+31
-11
lines changed

atlxi_xover.ipynb

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@
116116
"outputs": [],
117117
"source": [
118118
"# Choose one Antarctic active subglacial lake polygon with EPSG:3031 coordinates\n",
119-
"lake_name: str = \"Subglacial Lake Conway\"\n",
119+
"lake_name: str = \"Whillans 12\"\n",
120120
"lake_catalog = deepicedrain.catalog.subglacial_lakes()\n",
121121
"lake_ids: list = (\n",
122122
" pd.json_normalize(lake_catalog.metadata[\"lakedict\"])\n",
@@ -147,7 +147,11 @@
147147
"source": [
148148
"# Subset data to lake of interest\n",
149149
"placename: str = region.name.lower().replace(\" \", \"_\")\n",
150-
"df_lake: pd.DataFrame = region.subset(data=df_dhdt)"
150+
"df_lake: cudf.DataFrame = region.subset(data=df_dhdt) # bbox subset\n",
151+
"gdf_lake = gpd.GeoDataFrame(\n",
152+
" df_lake, geometry=gpd.points_from_xy(x=df_lake.x, y=df_lake.y, crs=3031)\n",
153+
")\n",
154+
"df_lake: pd.DataFrame = df_lake.loc[gdf_lake.within(lake.geometry)] # polygon subset"
151155
]
152156
},
153157
{

atlxi_xover.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@
8484

8585
# %%
8686
# Choose one Antarctic active subglacial lake polygon with EPSG:3031 coordinates
87-
lake_name: str = "Subglacial Lake Conway"
87+
lake_name: str = "Whillans 12"
8888
lake_catalog = deepicedrain.catalog.subglacial_lakes()
8989
lake_ids: list = (
9090
pd.json_normalize(lake_catalog.metadata["lakedict"])
@@ -107,8 +107,11 @@
107107
# %%
108108
# Subset data to lake of interest
109109
placename: str = region.name.lower().replace(" ", "_")
110-
df_lake: pd.DataFrame = region.subset(data=df_dhdt)
111-
110+
df_lake: cudf.DataFrame = region.subset(data=df_dhdt) # bbox subset
111+
gdf_lake = gpd.GeoDataFrame(
112+
df_lake, geometry=gpd.points_from_xy(x=df_lake.x, y=df_lake.y, crs=3031)
113+
)
114+
df_lake: pd.DataFrame = df_lake.loc[gdf_lake.within(lake.geometry)] # polygon subset
112115

113116
# %%
114117
# Run crossover analysis on all tracks

deepicedrain/features/subglacial_lakes.feature

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,6 @@ Feature: Mapping Antarctic subglacial lakes
5959
Then we see a trend of active subglacial lake surfaces changing over time
6060

6161
Examples:
62-
| lake_name | location |
63-
| Whillans 7 | whillans_upstream |
64-
| Whillans 12 | whillans_downstream |
62+
| lake_name | location |
63+
| Whillans 7 | whillans_upstream |
64+
| Whillans 12 | whillans_downstream |

deepicedrain/spatiotemporal.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ def point_in_polygon_gpu(
222222
points_x_col: str = "x",
223223
points_y_col: str = "y",
224224
poly_label_col: str = None,
225+
poly_limit: int = 32,
225226
):
226227
"""
227228
Find polygon labels for each of the input points.
@@ -244,6 +245,11 @@ def point_in_polygon_gpu(
244245
e.g. "placename". Default is to automatically use the first column
245246
unless otherwise specified.
246247
248+
poly_limit : int
249+
Number of polygons to check in each loop of the point in polygon
250+
algorithm, workaround for a limitation in cuspatial. Default is 32
251+
(maximum), adjust to lower value (e.g. 16) if hitting MemoryError.
252+
247253
Returns
248254
-------
249255
point_labels : cudf.Series
@@ -278,11 +284,11 @@ def point_in_polygon_gpu(
278284
)
279285

280286
# Run the actual point in polygon algorithm!
281-
# Note that cuspatial's point_in_polygon function has a 31 polygon limit,
287+
# Note that cuspatial's point_in_polygon function has a 32 polygon limit,
282288
# hence the for-loop code below. See also
283289
# https://github.com/rapidsai/cuspatial/blob/branch-0.15/notebooks/nyc_taxi_years_correlation.ipynb
284290
num_poly: int = len(poly_df_)
285-
point_in_poly_iter: list = list(np.arange(0, num_poly, 31)) + [num_poly]
291+
point_in_poly_iter: list = list(np.arange(0, num_poly, poly_limit - 1)) + [num_poly]
286292
for i in range(len(point_in_poly_iter) - 1):
287293
start, end = point_in_poly_iter[i], point_in_poly_iter[i + 1]
288294
poly_labels: cudf.DataFrame = cuspatial.point_in_polygon(

deepicedrain/tests/conftest.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55

66
import fsspec
7+
import geopandas as gpd
78
import numpy as np
89
import pandas as pd
910
import pytest
@@ -77,7 +78,13 @@ def lake_altimetry_data(lake_name: str, location: str, context) -> pd.DataFrame:
7778

7879
# Subset data to lake of interest
7980
context.placename: str = context.lake_name.lower().replace(" ", "_")
80-
df_lake: pd.DataFrame = context.region.subset(data=dataframe)
81+
df_lake: cudf.DataFrame = context.region.subset(data=dataframe) # bbox subset
82+
gdf_lake = gpd.GeoDataFrame(
83+
df_lake, geometry=gpd.points_from_xy(x=df_lake.x, y=df_lake.y, crs=3031)
84+
)
85+
df_lake: pd.DataFrame = df_lake.loc[
86+
gdf_lake.within(context.lake.geometry)
87+
] # polygon subset
8188

8289
# Save lake outline to OGR GMT file format
8390
os.makedirs(name=f"figures/{context.placename}", exist_ok=True)

0 commit comments

Comments
 (0)