Add GeoDataFrame support to Pipeline (#173)

jf-geo · hobu · web-flow · commit f8583508cb91 · 2024-10-04T15:04:23.000-05:00
* Added GeoDataFrame support to pipeline.py

Added basic GeoPandas GeoDataFrame support. If GeoPandas is installed users can read an array from an executed pipeline and return a GeoDataFrame, with optional arguments for XY vs XYZ point and CRS. DataFrames passed to the Pipeline constructor will drop the "geometry" column if present.

* Update test_pipeline.py

Added test for GeoDataFrames

* add geopandas to environment reqs

---------

Co-authored-by: Howard Butler &lt;hobu.inc@gmail.com&gt;
diff --git a/.github/environment.yml b/.github/environment.yml
@@ -9,4 +9,4 @@ dependencies:
   - pdal
   - pytest
   - meshio
-  - pandas
+  - geopandas
diff --git a/src/pdal/pipeline.py b/src/pdal/pipeline.py
@@ -17,6 +17,11 @@
 except ModuleNotFoundError:  # pragma: no cover
     DataFrame = None
 
+try:
+    from geopandas import GeoDataFrame, points_from_xy
+except ModuleNotFoundError:  # pragma: no cover
+    GeoDataFrame = points_from_xy = None
+
 from . import drivers, libpdalpython
 
 LogLevelToPDAL = {
@@ -45,7 +50,7 @@ def __init__(
 
         # Convert our data frames to Numpy Structured Arrays
         if dataframes:
-            arrays = [df.to_records() for df in dataframes]
+            arrays = [df.to_records() if not "geometry" in df.columns else df.drop(columns=["geometry"]).to_records() for df in dataframes]
 
         super().__init__()
         self._stages: List[Stage] = []
@@ -124,13 +129,26 @@ def get_meshio(self, idx: int) -> Optional[Mesh]:
             [("triangle", np.stack((mesh["A"], mesh["B"], mesh["C"]), 1))],
         )
 
-
     def get_dataframe(self, idx: int) -> Optional[DataFrame]:
         if DataFrame is None:
             raise RuntimeError("Pandas support requires Pandas to be installed")
 
         return DataFrame(self.arrays[idx])
 
+    def get_geodataframe(self, idx: int, xyz: bool=False, crs: Any=None) -> Optional[GeoDataFrame]:
+        if GeoDataFrame is None:
+            raise RuntimeError("GeoPandas support requires GeoPandas to be installed")
+        df = DataFrame(self.arrays[idx])
+        coords = [df["X"], df["Y"], df["Z"]] if xyz else [df["X"], df["Y"]]
+        geometry = points_from_xy(*coords)
+        gdf = GeoDataFrame(
+            df,
+            geometry=geometry,
+            crs=crs,
+        )
+        df = coords = geometry = None
+        return gdf
+
     def _get_json(self) -> str:
         return self.toJSON()
 
diff --git a/test/test_pipeline.py b/test/test_pipeline.py
@@ -541,6 +541,65 @@ def test_load(self):
             assert data["Intensity"].sum() == 57684
 
 
+class TestGeoDataFrame:
+
+    @pytest.mark.skipif(
+        not pdal.pipeline.GeoDataFrame,
+        reason="geopandas is not available",
+    )
+    def test_fetch(self):
+        r = pdal.Reader(os.path.join(DATADIRECTORY,"autzen-utm.las"))
+        p = r.pipeline()
+        p.execute()
+        record_count = p.arrays[0].shape[0]
+        dimension_count = len(p.arrays[0].dtype)
+        gdf = p.get_geodataframe(0)
+        gdf_xyz = p.get_geodataframe(0, xyz=True)
+        gdf_crs = p.get_geodataframe(0, crs="EPSG:4326")
+        assert len(gdf) == record_count
+        assert len(gdf.columns) == dimension_count + 1
+        assert isinstance(gdf, pdal.pipeline.GeoDataFrame)
+        assert gdf.geometry.is_valid.all()
+        assert not gdf.geometry.is_empty.any()
+        assert gdf.crs is None
+        assert gdf.geometry.z.isna().all()
+        assert not gdf_xyz.geometry.z.isna().any()
+        assert gdf_crs.crs.srs == "EPSG:4326"
+
+    @pytest.mark.skipif(
+        not pdal.pipeline.GeoDataFrame,
+        reason="geopandas is not available",
+    )
+    def test_load(self):
+        r = pdal.Reader(os.path.join(DATADIRECTORY,"autzen-utm.las"))
+        p = r.pipeline()
+        p.execute()
+        data = p.arrays[0]
+        gdf = pdal.pipeline.GeoDataFrame(
+            data,
+            geometry=pdal.pipeline.points_from_xy(data["X"], data["Y"], data["Z"])
+        )
+        dataframes = [gdf, gdf, gdf]
+        filter_intensity = """{
+          "pipeline":[
+            {
+              "type":"filters.range",
+              "limits":"Intensity[100:300)"
+            }
+          ]
+        }"""
+        p = pdal.Pipeline(filter_intensity, dataframes = dataframes)
+        p.execute()
+        arrays = p.arrays
+        assert len(arrays) == 3
+
+        # We copied the array three times. Sum the Intensity values
+        # post filtering to see if we had our intended effect
+        for data in arrays:
+            assert len(data) == 387
+            assert data["Intensity"].sum() == 57684
+
+
 class TestPipelineIterator:
     @pytest.mark.parametrize("filename", ["sort.json", "sort.py"])
     def test_non_streamable(self, filename):

-Original file line number
+Diff line change
   - pdal
   - pytest
   - meshio
 -  - pandas
 +  - geopandas