
Commit 73e3651

Initial refactor of raster reader args in Python API

Signed-off-by: Jason T. Brown <[email protected]>
1 parent 1ea29f2 commit 73e3651

File tree

4 files changed: +225 -156 lines changed

pyrasterframes/src/main/python/pyrasterframes/__init__.py

Lines changed: 43 additions & 2 deletions
@@ -110,13 +110,28 @@ def _aliased_writer(df_writer, format_key, path, **options):
 
 def _raster_reader(
         df_reader,
-        path=None,
-        catalog=None,
+        source=None,
         catalog_col_names=None,
         band_indexes=None,
         tile_dimensions=(256, 256),
         lazy_tiles=True,
         **options):
+    """
+    Returns a Spark DataFrame from a raster data files specified by URI pointers
+    The returned DataFrame will have a column of (CRS, Extent, Tile) for each URI read
+    Multiple bands from the same raster file are spread across rows of the DataFrame. See band_indexes param.
+    If bands from a scene are stored in separate files, provide a DataFrame to the `source` parameter. Each row in the returned DataFrame will contain one (CRS, Extent, Tile) for each item in `catalog_col_names`
+
+    For more details and example usage, consult https://rasterframes.io/raster-read.html
+
+    :param source: a string, list of strings, a pandas DataFrame or a Spark DataFrame giving URIs to the raster data to read
+    :param catalog_col_names: required if source is a DataFrame or CSV string. It is a list of strings giving the names of columns containing URIs to read
+    :param band_indexes: list of integers indicating which bands, zero-based, to read from the raster files specified; default is to read only the first band
+    :param tile_dimensions: tuple or list of two indicating the default tile dimension as (columns, rows)
+    :param lazy_tiles: If true (default) only generate minimal references to tile contents; if false, fetch tile cell values
+    :param options: Additional keyword arguments to pass to the spark DataSource
+    :return:
+    """
 
     from pandas import DataFrame as PdDataFrame
 
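Going by the new docstring, the simplest call forms would look roughly like the sketch below. The URIs are placeholders and `spark` is assumed to be a RasterFrames-enabled SparkSession; this is an illustration of the documented parameter, not code from the commit.

```python
# Single URI string: read one raster into a (CRS, Extent, Tile) column
df = spark.read.raster('https://example.com/scene_B1.TIF')

# Multi-line CSV string (header required): treated as an inline catalog,
# so catalog_col_names must name the columns holding URIs
csv_catalog = """b1,b2
https://example.com/scene_B1.TIF,https://example.com/scene_B2.TIF"""
df = spark.read.raster(csv_catalog, catalog_col_names=['b1', 'b2'])
```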
@@ -140,6 +155,25 @@ def temp_name():
140155
"lazyTiles": lazy_tiles
141156
})
142157

158+
# Parse the `source` argument
159+
path = None # to pass into `path` param
160+
if isinstance(source, list):
161+
path = None
162+
catalog = None
163+
options.update(dict(paths='\n'.join(str(source))))
164+
elif isinstance(source, str):
165+
if '\n' in source or '\r' in source:
166+
# then the `source` string is a catalog as a CSV (header is required)
167+
path = None
168+
catalog = source
169+
else:
170+
# interpret source as a single URI string
171+
path = source
172+
catalog = None
173+
else:
174+
# user has passed in some other type, we will interpret as a catalog
175+
catalog = source
176+
143177
if catalog is not None:
144178
if catalog_col_names is None:
145179
raise Exception("'catalog_col_names' required when DataFrame 'catalog' specified")
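The branching above is easier to follow outside the diff. The standalone sketch below mirrors it in plain Python so the mapping from `source` type to reader input is visible; `describe_source` is a hypothetical helper, and its list branch joins the individual list elements, which appears to be the intent of the newline-delimited `paths` option.

```python
def describe_source(source):
    """Illustrative mirror of the dispatch above: returns (path, catalog, paths_option)."""
    path, catalog, paths_option = None, None, None
    if isinstance(source, list):
        # list of URIs: becomes the newline-delimited `paths` option
        paths_option = '\n'.join(str(s) for s in source)
    elif isinstance(source, str):
        if '\n' in source or '\r' in source:
            catalog = source    # multi-line string is treated as an inline CSV catalog
        else:
            path = source       # plain string is treated as a single URI
    else:
        catalog = source        # anything else (e.g. a DataFrame) is treated as a catalog
    return path, catalog, paths_option


print(describe_source('file:///data/scene.tif'))
print(describe_source(['file:///a.tif', 'file:///b.tif']))
print(describe_source('b1,b2\nfile:///a1.tif,file:///a2.tif'))
```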
@@ -149,6 +183,9 @@ def temp_name():
                 "catalogColumns": to_csv(catalog_col_names)
             })
         elif isinstance(catalog, DataFrame):
+            # check catalog_col_names
+            assert all([c in catalog.columns for c in catalog_col_names]), \
+                "All items in catalog_col_names must be the name of a column in the catalog DataFrame."
             # Create a random view name
             tmp_name = temp_name()
             catalog.createOrReplaceTempView(tmp_name)
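For the Spark DataFrame branch, a minimal usage sketch might look as follows; the column names and URIs are placeholders and `spark` is assumed to be a RasterFrames-enabled session. The context lines above show that the reader registers such a catalog as a temporary view internally.

```python
import pandas as pd

# Small catalog: one column of URIs per band
catalog_pd = pd.DataFrame([
    {'b1': 'https://example.com/scene_B1.TIF', 'b2': 'https://example.com/scene_B2.TIF'},
])
catalog_spark = spark.createDataFrame(catalog_pd)

# One (CRS, Extent, Tile) column is produced per name in catalog_col_names
df = spark.read.raster(catalog_spark, catalog_col_names=['b1', 'b2'])
```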
@@ -157,6 +194,10 @@ def temp_name():
                 "catalogColumns": to_csv(catalog_col_names)
             })
         elif isinstance(catalog, PdDataFrame):
+            # check catalog_col_names
+            assert all([c in catalog.columns for c in catalog_col_names]), \
+                "All items in catalog_col_names must be the name of a column in the catalog DataFrame."
+
            # Handle to active spark session
            session = SparkContext._active_spark_context._rf_context._spark_session
            # Create a random view name
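The new assertions fail fast when a requested column is missing from the catalog. A hedged illustration with a pandas catalog (placeholder URIs, `spark` assumed):

```python
import pandas as pd

catalog_pd = pd.DataFrame([
    {'b1': 'https://example.com/scene_B1.TIF', 'b2': 'https://example.com/scene_B2.TIF'},
])

# OK: both requested names are columns of the catalog
ok_df = spark.read.raster(catalog_pd, catalog_col_names=['b1', 'b2'])

# AssertionError: 'b3' is not a column of the catalog DataFrame
bad_df = spark.read.raster(catalog_pd, catalog_col_names=['b1', 'b3'])
```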

pyrasterframes/src/main/python/tests/PyRasterFramesTests.py

Lines changed: 0 additions & 152 deletions
@@ -410,158 +410,6 @@ def test_raster_join(self):
         self.rf.raster_join(rf_prime, join_exprs=self.rf.extent)
 
 
-class RasterSource(TestEnvironment):
-
-    def test_handle_lazy_eval(self):
-        df = self.spark.read.raster(self.img_uri)
-        ltdf = df.select('proj_raster')
-        self.assertGreater(ltdf.count(), 0)
-        self.assertIsNotNone(ltdf.first())
-
-        tdf = df.select(rf_tile('proj_raster'))
-        self.assertGreater(tdf.count(), 0)
-        self.assertIsNotNone(tdf.first())
-
-    def test_strict_eval(self):
-        df_lazy = self.spark.read.raster(self.img_uri, lazy_tiles=True)
-        # when doing Show on a lazy tile we will see something like RasterRefTile(RasterRef(JVMGeoTiffRasterSource(...
-        # use this trick to get the `show` string
-        show_str_lazy = df_lazy.select('proj_raster')._jdf.showString(1, -1, False)
-        self.assertTrue('RasterRef' in show_str_lazy)
-
-        # again for strict
-        df_strict = self.spark.read.raster(self.img_uri, lazy_tiles=False)
-        show_str_strict = df_strict.select('proj_raster')._jdf.showString(1, -1, False)
-        self.assertTrue('RasterRef' not in show_str_strict)
-
-
-    def test_prt_functions(self):
-        df = self.spark.read.raster(self.img_uri) \
-            .withColumn('crs', rf_crs('proj_raster')) \
-            .withColumn('ext', rf_extent('proj_raster')) \
-            .withColumn('geom', rf_geometry('proj_raster'))
-        df.select('crs', 'ext', 'geom').first()
-
-    def test_raster_source_reader(self):
-        # much the same as RasterSourceDataSourceSpec here; but using https PDS. Takes about 30s to run
-
-        def l8path(b):
-            assert b in range(1, 12)
-            base = "https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/199/026/LC08_L1TP_199026_20180919_20180928_01_T1/LC08_L1TP_199026_20180919_20180928_01_T1_B{}.TIF"
-            return base.format(b)
-
-        path_param = '\n'.join([l8path(b) for b in [1, 2, 3]]) # "http://foo.com/file1.tif,http://foo.com/file2.tif"
-        tile_size = 512
-
-        df = self.spark.read.raster(
-            tile_dimensions=(tile_size, tile_size),
-            paths=path_param,
-            lazy_tiles=True,
-        ).cache()
-
-        # schema is tile_path and tile
-        # df.printSchema()
-        self.assertTrue(len(df.columns) == 2 and 'proj_raster_path' in df.columns and 'proj_raster' in df.columns)
-
-        # the most common tile dimensions should be as passed to `options`, showing that options are correctly applied
-        tile_size_df = df.select(rf_dimensions(df.proj_raster).rows.alias('r'), rf_dimensions(df.proj_raster).cols.alias('c')) \
-            .groupby(['r', 'c']).count().toPandas()
-        most_common_size = tile_size_df.loc[tile_size_df['count'].idxmax()]
-        self.assertTrue(most_common_size.r == tile_size and most_common_size.c == tile_size)
-
-        # all rows are from a single source URI
-        path_count = df.groupby(df.proj_raster_path).count()
-        print(path_count.toPandas())
-        self.assertTrue(path_count.count() == 3)
-
-    def test_raster_source_reader_schemeless(self):
-        import os.path
-        path = os.path.join(self.resource_dir, "L8-B8-Robinson-IL.tiff")
-        self.assertTrue(not path.startswith('file://'))
-        df = self.spark.read.raster(path)
-        self.assertTrue(df.count() > 0)
-
-    def test_raster_source_catalog_reader(self):
-        import pandas as pd
-
-        scene_dict = {
-            1: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/041/LC08_L1TP_015041_20190305_20190309_01_T1/LC08_L1TP_015041_20190305_20190309_01_T1_B{}.TIF',
-            2: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/042/LC08_L1TP_015042_20190305_20190309_01_T1/LC08_L1TP_015042_20190305_20190309_01_T1_B{}.TIF',
-            3: 'http://landsat-pds.s3.amazonaws.com/c1/L8/016/041/LC08_L1TP_016041_20190224_20190309_01_T1/LC08_L1TP_016041_20190224_20190309_01_T1_B{}.TIF',
-        }
-
-        def path(scene, band):
-            assert band in range(1, 12)
-            p = scene_dict[scene]
-            return p.format(band)
-
-        # Create a pandas dataframe (makes it easy to create spark df)
-        path_pandas = pd.DataFrame([
-            {'b1': path(1, 1), 'b2': path(1, 2), 'b3': path(1, 3)},
-            {'b1': path(2, 1), 'b2': path(2, 2), 'b3': path(2, 3)},
-            {'b1': path(3, 1), 'b2': path(3, 2), 'b3': path(3, 3)},
-        ])
-        # comma separated list of column names containing URI's to read.
-        catalog_columns = ','.join(path_pandas.columns.tolist()) # 'b1,b2,b3'
-        path_table = self.spark.createDataFrame(path_pandas)
-
-        path_df = self.spark.read.raster(
-            tile_dimensions=(512, 512),
-            catalog=path_table,
-            catalog_col_names=catalog_columns,
-            lazy_tiles=True # We'll get an OOM error if we try to read 9 scenes all at once!
-        )
-
-        self.assertTrue(len(path_df.columns) == 6) # three bands times {path, tile}
-        self.assertTrue(path_df.select('b1_path').distinct().count() == 3) # as per scene_dict
-        b1_paths_maybe = path_df.select('b1_path').distinct().collect()
-        b1_paths = [s.format('1') for s in scene_dict.values()]
-        self.assertTrue(all([row.b1_path in b1_paths for row in b1_paths_maybe]))
-
-    def test_raster_source_catalog_reader_with_pandas(self):
-        import pandas as pd
-        import geopandas
-        from shapely.geometry import Point
-
-        scene_dict = {
-            1: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/041/LC08_L1TP_015041_20190305_20190309_01_T1/LC08_L1TP_015041_20190305_20190309_01_T1_B{}.TIF',
-            2: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/042/LC08_L1TP_015042_20190305_20190309_01_T1/LC08_L1TP_015042_20190305_20190309_01_T1_B{}.TIF',
-            3: 'http://landsat-pds.s3.amazonaws.com/c1/L8/016/041/LC08_L1TP_016041_20190224_20190309_01_T1/LC08_L1TP_016041_20190224_20190309_01_T1_B{}.TIF',
-        }
-
-        def path(scene, band):
-            assert band in range(1, 12)
-            p = scene_dict[scene]
-            return p.format(band)
-
-        # Create a pandas dataframe (makes it easy to create spark df)
-        path_pandas = pd.DataFrame([
-            {'b1': path(1, 1), 'b2': path(1, 2), 'b3': path(1, 3), 'geo': Point(1, 1)},
-            {'b1': path(2, 1), 'b2': path(2, 2), 'b3': path(2, 3), 'geo': Point(2, 2)},
-            {'b1': path(3, 1), 'b2': path(3, 2), 'b3': path(3, 3), 'geo': Point(3, 3)},
-        ])
-
-        # here a subtle difference with the test_raster_source_catalog_reader test, feed the DataFrame not a CSV and not an already created spark DF.
-        df = self.spark.read.raster(
-            catalog=path_pandas,
-            catalog_col_names=['b1', 'b2', 'b3']
-        )
-        self.assertEqual(len(df.columns), 7) # three path cols, three tile cols, and geo
-        self.assertTrue('geo' in df.columns)
-        self.assertTrue(df.select('b1_path').distinct().count() == 3)
-
-
-        # Same test with geopandas
-        geo_df = geopandas.GeoDataFrame(path_pandas, crs={'init': 'EPSG:4326'}, geometry='geo')
-        df2 = self.spark.read.raster(
-            catalog=geo_df,
-            catalog_col_names=['b1', 'b2', 'b3']
-        )
-        self.assertEqual(len(df2.columns), 7) # three path cols, three tile cols, and geo
-        self.assertTrue('geo' in df2.columns)
-        self.assertTrue(df2.select('b1_path').distinct().count() == 3)
-
-
 def suite():
     function_tests = unittest.TestSuite()
     return function_tests
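The removed tests exercised the old `paths=` and `catalog=` keywords; under the refactored signature those inputs would presumably be handed to the single `source` argument instead. A hypothetical migration sketch, not part of this commit (placeholder URIs, `spark` assumed to be a RasterFrames-enabled session):

```python
# Old style (removed tests): newline-delimited string via the `paths` option
#   spark.read.raster(paths='\n'.join(uris), tile_dimensions=(512, 512), lazy_tiles=True)

# New style: hand the list of URIs directly to `source`
uris = ['https://example.com/scene_B1.TIF', 'https://example.com/scene_B2.TIF']  # placeholders
df = spark.read.raster(uris, tile_dimensions=(512, 512), lazy_tiles=True)
```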
