Skip to content

Commit e342580

Browse files
committed
PR feedback
Signed-off-by: Jason T. Brown <[email protected]>
1 parent 31fced9 commit e342580

File tree

3 files changed

+45
-46
lines changed

3 files changed

+45
-46
lines changed

pyrasterframes/src/main/python/pyrasterframes/__init__.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -117,19 +117,20 @@ def _raster_reader(
117117
lazy_tiles=True,
118118
**options):
119119
"""
120-
Returns a Spark DataFrame from raster data files specified by URI pointers
121-
The returned DataFrame will have a column of (CRS, Extent, Tile) for each URI read
122-
Multiple bands from the same raster file are spread across rows of the DataFrame. See band_indexes param.
123-
If bands from a scene are stored in separate files, provide a DataFrame to the `source` parameter. Each row in the returned DataFrame will contain one (CRS, Extent, Tile) for each item in `catalog_col_names`
120+
Returns a Spark DataFrame from raster data files specified by URIs.
121+
Each row in the returned DataFrame will contain a column with struct of (CRS, Extent, Tile) for each item in
122+
`catalog_col_names`.
123+
Multiple bands from the same raster file are spread across rows of the DataFrame. See `band_indexes` param.
124+
If bands from a scene are stored in separate files, provide a DataFrame to the `source` parameter.
124125
125126
For more details and example usage, consult https://rasterframes.io/raster-read.html
126127
127-
:param source: a string, list of strings, list of lists of strings, a pandas DataFrame or a Spark DataFrame giving URIs to the raster data to read
128-
:param catalog_col_names: required if source is a DataFrame or CSV string. It is a list of strings giving the names of columns containing URIs to read
129-
:param band_indexes: list of integers indicating which bands, zero-based, to read from the raster files specified; default is to read only the first band
130-
:param tile_dimensions: tuple or list of two indicating the default tile dimension as (columns, rows)
131-
:param lazy_tiles: If true (default) only generate minimal references to tile contents; if false, fetch tile cell values
132-
:param options: Additional keyword arguments to pass to the spark DataSource
128+
:param source: a string, list of strings, list of lists of strings, a Pandas DataFrame or a Spark DataFrame giving URIs to the raster data to read.
129+
:param catalog_col_names: required if `source` is a DataFrame or CSV string. It is a list of strings giving the names of columns containing URIs to read.
130+
:param band_indexes: list of integers indicating which bands, zero-based, to read from the raster files specified; default is to read only the first band.
131+
    :param tile_dimensions: tuple or list of two integers indicating the default tile dimension as (columns, rows).
132+
:param lazy_tiles: If true (default) only generate minimal references to tile contents; if false, fetch tile cell values.
133+
:param options: Additional keyword arguments to pass to the Spark DataSource.
133134
"""
134135

135136
from pandas import DataFrame as PdDataFrame

pyrasterframes/src/main/python/tests/RasterFunctionsTests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ def test_render_composite(self):
286286
cat = self.spark.createDataFrame([
287287
Row(red=self.l8band_uri(4), green=self.l8band_uri(3), blue=self.l8band_uri(2))
288288
])
289-
rf = self.spark.read.raster(catalog=cat, catalog_col_names=cat.columns)
289+
rf = self.spark.read.raster(cat, catalog_col_names=cat.columns)
290290

291291
# Test composite construction
292292
rgb = rf.select(rf_tile(rf_rgb_composite('red', 'green', 'blue')).alias('rgb')).first()['rgb']

pyrasterframes/src/main/python/tests/RasterSourceTest.py

Lines changed: 33 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
from pyrasterframes.rasterfunctions import *
2222
from pyrasterframes.rf_types import *
2323
from pyspark.sql.functions import *
24+
import pandas as pd
25+
from shapely.geometry import Point
2426
import os.path
2527
from unittest import skip
2628
from . import TestEnvironment
@@ -41,6 +43,14 @@ def path(scene, band):
4143
p = scene_dict[scene]
4244
return p.format(band)
4345

46+
def path_pandas_df(self):
47+
return pd.DataFrame([
48+
{'b1': self.path(1, 1), 'b2': self.path(1, 2), 'b3': self.path(1, 3), 'geo': Point(1, 1)},
49+
{'b1': self.path(2, 1), 'b2': self.path(2, 2), 'b3': self.path(2, 3), 'geo': Point(2, 2)},
50+
{'b1': self.path(3, 1), 'b2': self.path(3, 2), 'b3': self.path(3, 3), 'geo': Point(3, 3)},
51+
])
52+
53+
4454
def test_handle_lazy_eval(self):
4555
df = self.spark.read.raster(self.path(1, 1))
4656
ltdf = df.select('proj_raster')
@@ -129,59 +139,41 @@ def test_schemeless_string(self):
129139
self.assertTrue(df.count() > 0)
130140

131141
def test_spark_df_source(self):
132-
import pandas as pd
142+
catalog_columns = ['b1', 'b2', 'b3']
143+
catalog = self.spark.createDataFrame(self.path_pandas_df())
133144

134-
# Create a pandas dataframe (makes it easy to create spark df)
135-
path_pandas = pd.DataFrame([
136-
{'b1': self.path(1, 1), 'b2': self.path(1, 2), 'b3': self.path(1, 3)},
137-
{'b1': self.path(2, 1), 'b2': self.path(2, 2), 'b3': self.path(2, 3)},
138-
{'b1': self.path(3, 1), 'b2': self.path(3, 2), 'b3': self.path(3, 3)},
139-
])
140-
# comma separated list of column names containing URI's to read.
141-
catalog_columns = path_pandas.columns.tolist()
142-
path_table = self.spark.createDataFrame(path_pandas)
143-
144-
path_df = self.spark.read.raster(
145-
path_table,
145+
df = self.spark.read.raster(
146+
catalog,
146147
tile_dimensions=(512, 512),
147148
catalog_col_names=catalog_columns,
148149
lazy_tiles=True # We'll get an OOM error if we try to read 9 scenes all at once!
149150
)
150151

151-
self.assertTrue(len(path_df.columns) == 6) # three bands times {path, tile}
152-
self.assertTrue(path_df.select('b1_path').distinct().count() == 3) # as per scene_dict
153-
b1_paths_maybe = path_df.select('b1_path').distinct().collect()
152+
self.assertTrue(len(df.columns) == 7) # three bands times {path, tile} plus geo
153+
self.assertTrue(df.select('b1_path').distinct().count() == 3) # as per scene_dict
154+
b1_paths_maybe = df.select('b1_path').distinct().collect()
154155
b1_paths = [self.path(s, 1) for s in [1, 2, 3]]
155156
self.assertTrue(all([row.b1_path in b1_paths for row in b1_paths_maybe]))
156157

157158
def test_pandas_source(self):
158-
import pandas as pd
159-
import geopandas
160-
from shapely.geometry import Point
161159

162-
# Create a pandas dataframe (makes it easy to create spark df)
163-
path_pandas = pd.DataFrame([
164-
{'b1': self.path(1, 1), 'b2': self.path(1, 2), 'b3': self.path(1, 3), 'geo': Point(1, 1)},
165-
{'b1': self.path(2, 1), 'b2': self.path(2, 2), 'b3': self.path(2, 3), 'geo': Point(2, 2)},
166-
{'b1': self.path(3, 1), 'b2': self.path(3, 2), 'b3': self.path(3, 3), 'geo': Point(3, 3)},
167-
])
168-
169-
# here a subtle difference with the test_raster_source_catalog_reader test, feed the DataFrame
170-
# not a CSV and not an already created spark DF.
171160
df = self.spark.read.raster(
172-
path_pandas,
161+
self.path_pandas_df(),
173162
catalog_col_names=['b1', 'b2', 'b3']
174163
)
175164
self.assertEqual(len(df.columns), 7) # three path cols, three tile cols, and geo
176165
self.assertTrue('geo' in df.columns)
177166
self.assertTrue(df.select('b1_path').distinct().count() == 3)
178167

179-
# Same test with geopandas
180-
geo_df = geopandas.GeoDataFrame(path_pandas, crs={'init': 'EPSG:4326'}, geometry='geo')
181-
df2 = self.spark.read.raster(geo_df, ['b1', 'b2', 'b3'])
182-
self.assertEqual(len(df2.columns), 7) # three path cols, three tile cols, and geo
183-
self.assertTrue('geo' in df2.columns)
184-
self.assertTrue(df2.select('b1_path').distinct().count() == 3)
168+
def test_geopandas_source(self):
169+
from geopandas import GeoDataFrame
170+
# Same test as test_pandas_source with geopandas
171+
geo_df = GeoDataFrame(self.path_pandas_df(), crs={'init': 'EPSG:4326'}, geometry='geo')
172+
df = self.spark.read.raster(geo_df, ['b1', 'b2', 'b3'])
173+
174+
self.assertEqual(len(df.columns), 7) # three path cols, three tile cols, and geo
175+
self.assertTrue('geo' in df.columns)
176+
self.assertTrue(df.select('b1_path').distinct().count() == 3)
185177

186178
def test_csv_string(self):
187179

@@ -198,3 +190,9 @@ def test_csv_string(self):
198190
df = self.spark.read.raster(s, ['b1', 'b2'])
199191
self.assertEqual(len(df.columns), 3 + 2) # number of columns in original DF plus cardinality of catalog_col_names
200192
self.assertTrue(len(df.take(1))) # non-empty check
193+
194+
def test_catalog_named_arg(self):
195+
    # Through version 0.8.1, reading a catalog was possible via named argument only.
196+
df = self.spark.read.raster(catalog=self.path_pandas_df(), catalog_col_names=['b1', 'b2', 'b3'])
197+
self.assertEqual(len(df.columns), 7) # three path cols, three tile cols, and geo
198+
self.assertTrue(df.select('b1_path').distinct().count() == 3)

0 commit comments

Comments
 (0)