
Commit 61d6b6e

Merge pull request #329 from s22s/feature/python-raster-reader-arg-refactor-and-docs
Python raster reader argument refactor
2 parents 60c4917 + 74f09df commit 61d6b6e
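In short: the Python `raster` reader now takes its input as a single positional `source` argument (a URI, list of URIs, CSV string, or catalog DataFrame) in place of the separate `path`/`catalog` keywords, and the underlying DataSource option keys move from camelCase to snake_case. A before/after sketch of a typical call site (`catalog_df` stands in for any Pandas or Spark DataFrame of raster URIs):

```python
# Before (0.8.0): catalog passed by keyword
df = spark.read.raster(catalog=catalog_df, catalog_col_names=['red', 'nir'])

# After this change: the catalog is the first positional `source` argument.
# The old `catalog=` keyword is still honored for 0.8.0 back-compatibility.
df = spark.read.raster(catalog_df, catalog_col_names=['red', 'nir'])
```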

15 files changed: +289 -217 lines changed

datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala

Lines changed: 6 additions & 6 deletions
@@ -44,12 +44,12 @@ object RasterSourceDataSource {
   final val SHORT_NAME = "raster"
   final val PATH_PARAM = "path"
   final val PATHS_PARAM = "paths"
-  final val BAND_INDEXES_PARAM = "bandIndexes"
-  final val TILE_DIMS_PARAM = "tileDimensions"
-  final val CATALOG_TABLE_PARAM = "catalogTable"
-  final val CATALOG_TABLE_COLS_PARAM = "catalogColumns"
-  final val CATALOG_CSV_PARAM = "catalogCSV"
-  final val LAZY_TILES_PARAM = "lazyTiles"
+  final val BAND_INDEXES_PARAM = "band_indexes"
+  final val TILE_DIMS_PARAM = "tile_dimensions"
+  final val CATALOG_TABLE_PARAM = "catalog_table"
+  final val CATALOG_TABLE_COLS_PARAM = "catalog_col_names"
+  final val CATALOG_CSV_PARAM = "catalog_csv"
+  final val LAZY_TILES_PARAM = "lazy_tiles"

   final val DEFAULT_COLUMN_NAME = PROJECTED_RASTER_COLUMN.columnName
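These constants are the option keys the `raster` DataSource accepts, so the rename is visible anywhere options are set by hand rather than through `spark.read.raster`. A minimal sketch of direct use from PySpark, assuming the standard Spark `format(...).option(...).load()` plumbing and a placeholder URI:

```python
# Hedged sketch: the renamed snake_case option keys set directly on the DataSource.
df = (spark.read.format('raster')                    # SHORT_NAME = "raster"
      .option('path', 'file:///data/scene_B01.TIF')  # placeholder URI
      .option('band_indexes', '0')                   # was 'bandIndexes'
      .option('tile_dimensions', '256,256')          # was 'tileDimensions'
      .option('lazy_tiles', 'true')                  # was 'lazyTiles'
      .load())
```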

pyrasterframes/src/main/python/docs/languages.pymd

Lines changed: 4 additions & 4 deletions
@@ -42,7 +42,7 @@ red_nir_monthly_2017.printSchema()

 ```python, step_3_python
 red_nir_tiles_monthly_2017 = spark.read.raster(
-    catalog=red_nir_monthly_2017,
+    red_nir_monthly_2017,
     catalog_col_names=['red', 'nir'],
     tile_dimensions=(256, 256)
 )
@@ -97,9 +97,9 @@ sql("""
 CREATE OR REPLACE TEMPORARY VIEW red_nir_tiles_monthly_2017
 USING raster
 OPTIONS (
-    catalogTable='red_nir_monthly_2017',
-    catalogColumns='red,nir',
-    tileDimensions='256,256'
+    catalog_table='red_nir_monthly_2017',
+    catalog_col_names='red,nir',
+    tile_dimensions='256,256'
 )
 """)
 ```

pyrasterframes/src/main/python/docs/local-algebra.pymd

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ catalog_df = spark.createDataFrame([
     Row(red=uri_pattern.format(4), nir=uri_pattern.format(8))
 ])
 df = spark.read.raster(
-    catalog=catalog_df,
+    catalog_df,
     catalog_col_names=['red', 'nir']
 )
 df.printSchema()

pyrasterframes/src/main/python/docs/nodata-handling.pymd

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ from pyspark.sql import Row
 blue_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B02.tif'
 scl_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/SCL.tif'
 cat = spark.createDataFrame([Row(blue=blue_uri, scl=scl_uri),])
-unmasked = spark.read.raster(catalog=cat, catalog_col_names=['blue', 'scl'])
+unmasked = spark.read.raster(cat, catalog_col_names=['blue', 'scl'])
 unmasked.printSchema()
 ```

pyrasterframes/src/main/python/docs/numpy-pandas.pymd

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ cat = spark.read.format('aws-pds-modis-catalog').load() \
         (col('acquisition_date') < lit('2018-02-22'))
     )

-spark_df = spark.read.raster(catalog=cat, catalog_col_names=['B01']) \
+spark_df = spark.read.raster(cat, catalog_col_names=['B01']) \
     .select(
         'acquisition_date',
         'granule_id',

pyrasterframes/src/main/python/docs/raster-read.pymd

Lines changed: 3 additions & 8 deletions
@@ -101,8 +101,6 @@ modis_catalog = spark.read \
     .withColumn('red' , F.concat('base_url', F.lit("_B01.TIF"))) \
     .withColumn('nir' , F.concat('base_url', F.lit("_B02.TIF")))

-modis_catalog.printSchema()
-
 print("Available scenes: ", modis_catalog.count())
 ```
@@ -124,10 +122,7 @@ equator.select('date', 'gid')
 Now that we have prepared our catalog, we simply pass the DataFrame or CSV string to the `raster` DataSource to load the imagery. The `catalog_col_names` parameter gives the columns that contain the URI's to be read.

 ```python, read_catalog
-rf = spark.read.raster(
-    catalog=equator,
-    catalog_col_names=['red', 'nir']
-)
+rf = spark.read.raster(equator, catalog_col_names=['red', 'nir'])
 rf.printSchema()
 ```
@@ -179,7 +174,7 @@ mb.printSchema()

 If a band is passed into `band_indexes` that exceeds the number of bands in the raster, a projected raster column will still be generated in the schema but the column will be full of `null` values.

-You can also pass a `catalog` and `band_indexes` together into the `raster` reader. This will create a projected raster column for the combination of all items passed into `catalog_col_names` and `band_indexes`. Again if a band in `band_indexes` exceeds the number of bands in a raster, it will have a `null` value for the corresponding column.
+You can also pass a _catalog_ and `band_indexes` together into the `raster` reader. This will create a projected raster column for the combination of all items in `catalog_col_names` and `band_indexes`. Again if a band in `band_indexes` exceeds the number of bands in a raster, it will have a `null` value for the corresponding column.

 Here is a trivial example with a _catalog_ over multiband rasters. We specify two columns containing URIs and two bands, resulting in four projected raster columns.
@@ -191,7 +186,7 @@ mb_cat = pd.DataFrame([
     },
 ])
 mb2 = spark.read.raster(
-    catalog=spark.createDataFrame(mb_cat),
+    spark.createDataFrame(mb_cat),
     catalog_col_names=['foo', 'bar'],
     band_indexes=[0, 1],
     tile_dimensions=(64,64)

pyrasterframes/src/main/python/docs/supervised-learning.pymd

Lines changed: 2 additions & 4 deletions
@@ -33,10 +33,8 @@ catalog_df = pd.DataFrame([
     {b: uri_base.format(b) for b in cols}
 ])

-df = spark.read.raster(catalog=catalog_df,
-                       catalog_col_names=cols,
-                       tile_dimensions=(128, 128)
-                      ).repartition(100)
+df = spark.read.raster(catalog_df, catalog_col_names=cols, tile_dimensions=(128, 128)) \
+    .repartition(100)

 df = df.select(
     rf_crs(df.B01).alias('crs'),

pyrasterframes/src/main/python/docs/time-series.pymd

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ We then [reproject](https://gis.stackexchange.com/questions/247770/understanding
 ```python read_catalog
 raster_cols = ['B01', 'B02',] # red and near-infrared respectively
 park_rf = spark.read.raster(
-    catalog=park_cat.select(['acquisition_date', 'granule_id', 'geo_simp'] + raster_cols),
+    park_cat.select(['acquisition_date', 'granule_id', 'geo_simp'] + raster_cols),
     catalog_col_names=raster_cols) \
     .withColumn('park_native', st_reproject('geo_simp', lit('EPSG:4326'), rf_crs('B01'))) \
     .filter(st_intersects('park_native', rf_geometry('B01')))

pyrasterframes/src/main/python/docs/unsupervised-learning.pymd

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ filenamePattern = "L8-B{}-Elkton-VA.tiff"
 catalog_df = pd.DataFrame([
     {'b' + str(b): os.path.join(resource_dir_uri(), filenamePattern.format(b)) for b in range(1, 8)}
 ])
-df = spark.read.raster(catalog=catalog_df, catalog_col_names=catalog_df.columns)
+df = spark.read.raster(catalog_df, catalog_col_names=catalog_df.columns)
 df = df.select(
     rf_crs(df.b1).alias('crs'),
     rf_extent(df.b1).alias('extent'),

pyrasterframes/src/main/python/pyrasterframes/__init__.py

Lines changed: 67 additions & 11 deletions
@@ -110,16 +110,34 @@ def _aliased_writer(df_writer, format_key, path, **options):

 def _raster_reader(
         df_reader,
-        path=None,
-        catalog=None,
+        source=None,
         catalog_col_names=None,
         band_indexes=None,
         tile_dimensions=(256, 256),
         lazy_tiles=True,
         **options):
+    """
+    Returns a Spark DataFrame from raster data files specified by URIs.
+    Each row in the returned DataFrame will contain a column with struct of (CRS, Extent, Tile) for each item in
+    `catalog_col_names`.
+    Multiple bands from the same raster file are spread across rows of the DataFrame. See `band_indexes` param.
+    If bands from a scene are stored in separate files, provide a DataFrame to the `source` parameter.
+
+    For more details and example usage, consult https://rasterframes.io/raster-read.html
+
+    :param source: a string, list of strings, list of lists of strings, a Pandas DataFrame or a Spark DataFrame giving URIs to the raster data to read.
+    :param catalog_col_names: required if `source` is a DataFrame or CSV string. It is a list of strings giving the names of columns containing URIs to read.
+    :param band_indexes: list of integers indicating which bands, zero-based, to read from the raster files specified; default is to read only the first band.
+    :param tile_dimensions: tuple or list of two indicating the default tile dimension as (columns, rows).
+    :param lazy_tiles: If true (default) only generate minimal references to tile contents; if false, fetch tile cell values.
+    :param options: Additional keyword arguments to pass to the Spark DataSource.
+    """

     from pandas import DataFrame as PdDataFrame

+    if 'catalog' in options:
+        source = options['catalog']  # maintain back compatibility with 0.8.0
+
     def to_csv(comp):
         if isinstance(comp, str):
             return comp
@@ -135,37 +153,75 @@ def temp_name():
         band_indexes = [0]

     options.update({
-        "bandIndexes": to_csv(band_indexes),
-        "tileDimensions": to_csv(tile_dimensions),
-        "lazyTiles": lazy_tiles
+        "band_indexes": to_csv(band_indexes),
+        "tile_dimensions": to_csv(tile_dimensions),
+        "lazy_tiles": lazy_tiles
     })

+    # Parse the `source` argument
+    path = None  # to pass into `path` param
+    if isinstance(source, list):
+        if all([isinstance(i, str) for i in source]):
+            path = None
+            catalog = None
+            options.update(dict(paths='\n'.join([str(i) for i in source])))  # pass in "uri1\nuri2\nuri3\n..."
+        if all([isinstance(i, list) for i in source]):
+            # list of lists; we will rely on pandas to:
+            # - coerce all data to str (possibly using objects' __str__ or __repr__)
+            # - ensure data is not "ragged": all sublists are same len
+            path = None
+            catalog_col_names = ['proj_raster_{}'.format(i) for i in range(len(source[0]))]  # assign these names
+            catalog = PdDataFrame(source,
+                                  columns=catalog_col_names,
+                                  dtype=str,
+                                  )
+    elif isinstance(source, str):
+        if '\n' in source or '\r' in source:
+            # then the `source` string is a catalog as a CSV (header is required)
+            path = None
+            catalog = source
+        else:
+            # interpret source as a single URI string
+            path = source
+            catalog = None
+    else:
+        # user has passed in some other type, we will try to interpret as a catalog
+        catalog = source
+
     if catalog is not None:
         if catalog_col_names is None:
             raise Exception("'catalog_col_names' required when DataFrame 'catalog' specified")
+
         if isinstance(catalog, str):
             options.update({
-                "catalogCSV": catalog,
-                "catalogColumns": to_csv(catalog_col_names)
+                "catalog_csv": catalog,
+                "catalog_col_names": to_csv(catalog_col_names)
             })
         elif isinstance(catalog, DataFrame):
+            # check catalog_col_names
+            assert all([c in catalog.columns for c in catalog_col_names]), \
+                "All items in catalog_col_names must be the name of a column in the catalog DataFrame."
             # Create a random view name
             tmp_name = temp_name()
             catalog.createOrReplaceTempView(tmp_name)
             options.update({
-                "catalogTable": tmp_name,
-                "catalogColumns": to_csv(catalog_col_names)
+                "catalog_table": tmp_name,
+                "catalog_col_names": to_csv(catalog_col_names)
             })
         elif isinstance(catalog, PdDataFrame):
+            # check catalog_col_names
+            assert all([c in catalog.columns for c in catalog_col_names]), \
+                "All items in catalog_col_names must be the name of a column in the catalog DataFrame."
+
             # Handle to active spark session
             session = SparkContext._active_spark_context._rf_context._spark_session
             # Create a random view name
             tmp_name = temp_name()
             spark_catalog = session.createDataFrame(catalog)
             spark_catalog.createOrReplaceTempView(tmp_name)
             options.update({
-                "catalogTable": tmp_name,
-                "catalogColumns": to_csv(catalog_col_names)
+                "catalog_table": tmp_name,
+                "catalog_col_names": to_csv(catalog_col_names)
             })

     return df_reader \
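Taken together, the `source` parsing above accepts several input shapes. A usage sketch covering each branch (the URIs and `catalog_df` are placeholders):

```python
# Single URI string: becomes the `path` parameter.
rf1 = spark.read.raster('file:///data/scene_B01.TIF')

# List of URI strings: newline-joined and passed as the 'paths' option.
rf2 = spark.read.raster(['file:///data/a.tif', 'file:///data/b.tif'])

# List of lists of URIs: coerced to a Pandas catalog with generated
# column names 'proj_raster_0', 'proj_raster_1', ...
rf3 = spark.read.raster([['file:///a1.tif', 'file:///a2.tif']])

# Multi-line CSV string (header required): passed as the 'catalog_csv' option.
csv = 'red,nir\nfile:///r.tif,file:///n.tif'
rf4 = spark.read.raster(csv, catalog_col_names=['red', 'nir'])

# Pandas or Spark DataFrame: registered as a temp view via 'catalog_table'.
rf5 = spark.read.raster(catalog_df, catalog_col_names=['red', 'nir'])
```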
