Skip to content

Commit 8da23d1

Browse files
committed
Merge remote-tracking branch 'lt/develop' into docs/fix-250-89
Signed-off-by: Jason T. Brown <[email protected]>
2 parents 48e6366 + 3878cf6 commit 8da23d1

File tree

10 files changed

+106
-47
lines changed

10 files changed

+106
-47
lines changed

.circleci/config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,4 +204,4 @@ workflows:
204204
- it
205205
- itWithoutGdal
206206
- docs
207-
- staticAnalysis
207+
# - staticAnalysis

core/src/main/scala/org/locationtech/rasterframes/ref/RasterSource.scala

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,13 +129,18 @@ object RasterSource extends LazyLogging {
129129
} else false
130130

131131
/** Extractor for determining if a scheme indicates GDAL preference. */
132-
def unapply(source: URI): Boolean =
133-
gdalOnly(source) || ((preferGdal || source.getScheme.startsWith("gdal")) && GDALRasterSource.hasGDAL)
132+
def unapply(source: URI): Boolean = {
133+
lazy val schemeIsGdal = Option(source.getScheme())
134+
.exists(_.startsWith("gdal"))
135+
136+
gdalOnly(source) || ((preferGdal || schemeIsGdal) && GDALRasterSource.hasGDAL)
137+
}
134138
}
135139

136140
object IsDefaultGeoTiff {
137141
def unapply(source: URI): Boolean = source.getScheme match {
138-
case "file" | "http" | "https" | "s3" | "" => true
142+
case "file" | "http" | "https" | "s3" => true
143+
case null | "" => true
139144
case _ => false
140145
}
141146
}

core/src/test/scala/org/locationtech/rasterframes/ref/RasterSourceSpec.scala

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,13 @@ class RasterSourceSpec extends TestEnvironment with TestData {
106106
val src = RasterSource(localSrc)
107107
assert(!src.extent.isEmpty)
108108
}
109+
it("should interpret no scheme as file://"){
110+
val localSrc = geotiffDir.resolve("LC08_B7_Memphis_COG.tiff").toString()
111+
val schemelessUri = new URI(localSrc)
112+
schemelessUri.getScheme should be (null)
113+
val src = RasterSource(schemelessUri)
114+
assert(!src.extent.isEmpty)
115+
}
109116
}
110117

111118
if(GDALRasterSource.hasGDAL) {
@@ -132,6 +139,15 @@ class RasterSourceSpec extends TestEnvironment with TestData {
132139

133140
gdal.bandCount should be (3)
134141
}
142+
143+
it("should interpret no scheme as file://") {
144+
val localSrc = geotiffDir.resolve("LC08_B7_Memphis_COG.tiff").toString()
145+
val schemelessUri = new URI(localSrc)
146+
val gdal = GDALRasterSource(schemelessUri)
147+
val jvm = JVMGeoTiffRasterSource(schemelessUri)
148+
gdal.extent should be (jvm.extent)
149+
gdal.cellSize should be(jvm.cellSize)
150+
}
135151
}
136152
}
137153

pyrasterframes/src/main/python/docs/concepts.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ A "NoData" (or N/A) value is a specifically identified value for a cell type use
4545

4646
A scene (or granule) is a discrete instance of EO @ref:[raster data](concepts.md#raster) with a specific extent (region), date-time, and map projection (or CRS).
4747

48+
## Band
49+
50+
A @ref:[scene](concepts.md#scene) frequently defines many different measurements captured at the same date-time, over the same extent, and meant to be processed together. These different measurements are referred to as bands. The name comes from the varying bandwidths of light and electromagnetic radiation measured in many EO datasets.
51+
4852
## Coordinate Reference System (CRS)
4953

5054
A [coordinate reference system (or spatial reference system)][CRS] is a set of mathematical constructs used to translate locations on the three-dimensional surface of the earth to the two dimensional raster grid. A CRS typically accompanies any EO data so it can be precisely located.
@@ -57,10 +61,10 @@ An extent (or bounding box) is a rectangular region specifying the geospatial co
5761

5862
A tile (sometimes called a "chip") is a rectangular subset of a @ref:[scene](concepts.md#scene). As a scene is a raster, a tile is also a raster. A tile can conceptually be thought of as a two-dimensional array.
5963

60-
Some EO data has many bands or channels. Tiles in this context are conceptually a three-dimensional array, with the extra dimension representing the bands.
64+
Some EO data has many @ref:[bands](concepts.md#band) or channels. Within RasterFrames, this third dimension is handled across columns of the DataFrame, such that the tiles within DataFrames are all two-dimensional arrays.
6165

6266
Tiles are often square and the dimensions are some power of two, for example 256 by 256.
6367

64-
The tile is the primary discretization unit used in RasterFrames. Each band of a scene is in a separate column. The scene's overall @ref:[extent](concepts.md#extent) is carved up into smaller extents for each tile. Each row of the DataFrame contains a two-dimensional tile per band column.
68+
The tile is the primary discretization unit used in RasterFrames. The scene's overall @ref:[extent](concepts.md#extent) is carved up into smaller extents and spread across rows.
6569

66-
[CRS]: https://en.wikipedia.org/wiki/Spatial_reference_system
70+
[CRS]: https://en.wikipedia.org/wiki/Spatial_reference_system

pyrasterframes/src/main/python/docs/raster-catalogs.pymd

Lines changed: 37 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# Raster Catalogs
22

3-
While much interesting processing can be done on a @ref:[single raster file](raster-read.md#single-raster), RasterFrames shines when _Catalogs_ of raster data are to be processed. In its simplest form, a _Catalog_ is a listing of @ref:[URLs referencing raster files](raster-read.md#uri-formats). This listing can be manifested as a DataFrame, CSV file or CSV string. The _Catalog_ is input into the `raster` DataSource, described in the next page.
3+
While much interesting processing can be done on a @ref:[single raster file](raster-read.md#single-raster), RasterFrames shines when _catalogs_ of raster data are to be processed. In its simplest form, a _catalog_ is a list of @ref:[URLs referencing raster files](raster-read.md#uri-formats). This list can be a Spark DataFrame, Pandas DataFrame, CSV file or CSV string. The _catalog_ is input into the `raster` DataSource, described in the @ref:[next page](raster-read.md), which creates tiles from the rasters at the referenced URLs.
44

5-
A _Catalog_ can have one or two dimensions:
5+
A _catalog_ can have one or two dimensions:
66

7-
* One-D: A single column containing one or many URLs across the rows. All referenced rasters represent the same content type. For example, a column of URLs to Landsat 8 NIR rasters covering Europe. Each row represents different places and times.
8-
* Two-D: Many columns containing raster URLs. Each column references the same content type, and each row represents the same place and time. For example, red-, green-, and blue-band columns for scenes covering Europe. Each row represents a single spatiotemporal location (or scene) with the same dimensions, extent, [_CRS_][CRS], etc across the row.
7+
* One-D: A single column contains raster URLs across the rows. All referenced rasters represent the same @ref:[band](concepts.md#band). For example, a column of URLs to Landsat 8 near-infrared rasters covering Europe. Each row represents different places and times.
8+
* Two-D: Many columns containing raster URLs. Each column references the same band, and each row represents the same place and time. For example, red-, green-, and blue-band columns for scenes covering Europe. Each row represents a single @ref:[scene](concepts.md#scene) with the same resolution, extent, [_CRS_][CRS], etc across the row.
99

1010
## Creating a Catalog
1111

12-
This section will provide some examples of creating your own _Catalogs_, as well as introduce some experimental _Catalogs_ built into RasterFrames. Reading raster data represented by a _Catalog_ is covered in more detail in the @ref:[next page](raster-read.md).
12+
This section will provide some examples of creating your own _catalogs_, as well as introduce some experimental _catalogs_ built into RasterFrames. Reading raster data represented by a _catalog_ is covered in more detail in the @ref:[next page](raster-read.md).
1313

1414
```python, echo=False
1515
from IPython.display import display
@@ -25,30 +25,37 @@ A single URL is the simplest form of a catalog.
2525

2626
```python
2727
from pyspark.sql import Row
28-
my_cat = "https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018185/MCD43A4.A2018185.h04v09.006.2018194032851_B01.TIF"
29-
# or
30-
my_cat_df = spark.createDataFrame([Row(B01=my_cat)])
28+
29+
file_uri = "/data/raster/myfile.tif"
30+
# Pandas DF
31+
my_cat = pd.DataFrame({'B01': [file_uri]})
32+
33+
# equivalent Spark DF
34+
my_cat = spark.createDataFrame([Row(B01=file_uri)])
35+
36+
# equivalent CSV string
37+
my_cat = "B01\n{}".format(file_uri)
3138
```
3239

33-
A single column represents the same content type with different observations along the rows. In this example it is band 1 of MODIS, which is visible red. In the example the location of the images is the same (h04v09) but the dates differ.
40+
A single column represents the same content type with different observations along the rows. In this example it is band 1 of MODIS surface reflectance, which is visible red. In the example the location of the images is the same, indicated by the granule identifier `h04v09`, but the dates differ: 2018185 (July 4, 2018) and 2018188 (July 7, 2018).
3441

3542
```python
3643
scene1_B01 = "https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018185/MCD43A4.A2018185.h04v09.006.2018194032851_B01.TIF"
3744
scene2_B01 = "https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018188/MCD43A4.A2018188.h04v09.006.2018198232008_B01.TIF"
3845

39-
# As CSV string
40-
my_cat = '\n'.join(['B01', scene1_B01, scene2_B01])
41-
# or
42-
my_cat_df = spark.createDataFrame([Row(B01=scene1_B01), Row(B01=scene2_B01)])
43-
my_cat_df.printSchema()
44-
# or
45-
pandas_cat = pd.DataFrame([{'B01': scene1_B01}, {'B01': scene2_B01}])
46-
my_cat = spark.createDataFrame(pandas_cat)
46+
# a pandas DF
47+
one_d_cat = pd.DataFrame({'B01': [scene1_B01, scene2_B01]})
48+
49+
# equivalent spark DF
50+
one_d_cat = spark.createDataFrame([Row(B01=scene1_B01), Row(B01=scene2_B01)])
51+
52+
# equivalent CSV string
53+
one_d_cat = '\n'.join(['B01', scene1_B01, scene2_B01])
4754
```
4855

4956
### Two-D
5057

51-
Example of a multiple columns representing multiple content types (bands) across multiple scenes. In each row, the scene is the same. The first column is band 1 and the second is band 2, near infrared.
58+
Example of multiple columns representing multiple content types (bands) across multiple scenes. In each row, the scene is the same: granule id `h04v09` on July 4 or July 7, 2018. The first column is band 1, red, and the second is band 2, near infrared.
5259

5360
```python
5461
scene1_B01 = "https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018185/MCD43A4.A2018185.h04v09.006.2018194032851_B01.TIF"
@@ -68,36 +75,37 @@ my_cat_df.printSchema()
6875

6976
## Using External Catalogs
7077

71-
The simplest example of an external _Catalog_ is a DataFrame (or a transformation of) in one of the formats above. Here's an extended example of reading an external CSV file of MODIS scenes and transforming it into a _Catalog_. The metadata describing the content of each URL is an important aspect of processing raster data. This example includes some minimal metadata.
78+
The concept of a _catalog_ is much more powerful when we consider examples beyond constructing the DataFrame, and instead read the data from an external source. Here's an extended example of reading a cloud-hosted CSV file containing MODIS scene metadata and transforming it into a _catalog_. The metadata describing the content of each URL is an important aspect of processing raster data.
7279

7380
```python
7481
from pyspark import SparkFiles
7582
from pyspark.sql import functions as F
7683

7784
spark.sparkContext.addFile("https://modis-pds.s3.amazonaws.com/MCD43A4.006/2018-07-04_scenes.txt")
7885

79-
# The scenes list file has index URIs in the download_url column, for example:
80-
# https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018185/index.html
81-
# Image URIs take the form:
82-
# https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018185/MCD43A4.A2018185.h04v09.006.2018194032851_B01.TIF
83-
84-
modis_catalog = spark.read \
86+
scene_list = spark.read \
8587
.format("csv") \
8688
.option("header", "true") \
87-
.load(SparkFiles.get("2018-07-04_scenes.txt")) \
89+
.load(SparkFiles.get("2018-07-04_scenes.txt"))
90+
scene_list.head(3)
91+
```
92+
93+
Observe the scenes list file has URIs to `index.html` files in the download_url column. The image URIs are in the same directory. The filenames are of the form `${gid}_B${band}.TIF`. The next code chunk builds these URIs, which completes our catalog.
94+
95+
```python
96+
modis_catalog = scene_list \
8897
.withColumn('base_url',
8998
F.concat(F.regexp_replace('download_url', 'index.html$', ''), 'gid',)
9099
) \
91100
.withColumn('B01' , F.concat('base_url', F.lit("_B01.TIF"))) \
92101
.withColumn('B02' , F.concat('base_url', F.lit("_B02.TIF"))) \
93102
.withColumn('B03' , F.concat('base_url', F.lit("_B03.TIF")))
94-
# ... and so on.
95-
modis_catalog.printSchema()
103+
modis_catalog.head(3)
96104
```
97105

98106
## Using Built-in Experimental Catalogs
99107

100-
RasterFrames comes with two experimental catalogs over the AWS PDS [Landsat 8][Landsat] and [MODIS][MODIS] repositories. They are created by downloading the latest scene lists and transforming as in the prior example.
108+
RasterFrames comes with two experimental catalogs over the AWS PDS [Landsat 8][Landsat] and [MODIS][MODIS] repositories. They are created by downloading the latest scene lists and building up the appropriate band URI columns as in the prior example.
101109

102110
> Note: The first time you run these may take some time, as the catalogs are large. However, they are cached and subsequent invocations should be faster.
103111

pyrasterframes/src/main/python/docs/raster-read.pymd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,11 +148,11 @@ In the initial examples on this page, you may have noticed that the realized (no
148148

149149
## Multiband Rasters
150150

151-
A multiband raster represents a three dimensional numeric array. The first two dimensions are spatial, and the third dimension is typically designated as different bands. The bands could represent intensity of different wavelengths of light (or other electromagnetic radiation), or they could represent other phenomena such as measurement time, quality indications, or additional measurements.
151+
A multiband raster represents a three dimensional numeric array. The first two dimensions are spatial, and the third dimension is typically designated as different @ref:[bands](concepts.md#band). The bands could represent intensity of different wavelengths of light (or other electromagnetic radiation), or they could represent other phenomena such as measurement time, quality indications, or additional measurements.
152152

153153
Multiband raster files have a strictly ordered set of bands, which are typically indexed from 1. Some files have metadata tags associated with each band. Some files have a color interpretation metadata tag indicating how to interpret the bands.
154154

155-
When reading a multiband raster or a _Catalog_ describing multiband rasters, you will need to know ahead of time which bands you want to read. You will specify the bands to read, **indexed from zero**, passing a list of integers into the `band_indexes` parameter of the `raster` reader.
155+
When reading a multiband raster or a _catalog_ describing multiband rasters, you will need to know ahead of time which bands you want to read. You will specify the bands to read, **indexed from zero**, as a list of integers into the `band_indexes` parameter of the `raster` reader.
156156

157157
For example, we can read a four-band (red, green, blue, and near-infrared) image as follows. The individual rows of the resulting DataFrame still represent distinct spatial extents, with a projected raster column for each band specified by `band_indexes`.
158158

pyrasterframes/src/main/python/docs/raster-write.pymd

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ Fortunately, we can use the cluster computing capability to downsample the data
6666

6767
```python write_geotiff
6868
outfile = os.path.join('/tmp', 'geotiff-overview.tif')
69-
spark_df.write.geotiff('file://' + outfile, crs='EPSG:4326', raster_dimensions=(256, 256))
69+
spark_df.write.geotiff(outfile, crs='EPSG:4326', raster_dimensions=(256, 256))
7070
```
7171

7272
View it with `rasterio` to check the results:
@@ -83,6 +83,10 @@ with rasterio.open(outfile) as src:
8383

8484
If there are many tile or projected raster columns in the DataFrame, the GeoTIFF writer will write each one as a separate band in the file. Each band in the output will be tagged with the input column names for reference.
8585

86+
```python, echo=False
87+
os.remove(outfile)
88+
```
89+
8690
## GeoTrellis Layers
8791

8892
[GeoTrellis][GeoTrellis] is one of the key libraries that RasterFrames builds upon. It provides a Scala language API to working with large raster data with Apache Spark. Ingesting raster data into a Layer is one of the key concepts for creating a dataset for processing on Spark. RasterFrames writes data from an appropriate DataFrame into a [GeoTrellis Layer](https://geotrellis.readthedocs.io/en/latest/guide/tile-backends.html). RasterFrames provides a `geotrellis` DataSource that supports both @ref:[reading](raster-read.md#geotrellis) and writing of GeoTrellis layers.

pyrasterframes/src/main/python/tests/GeoTiffWriterTests.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,14 @@ def _tmpfile():
3333

3434
def test_identity_write(self):
3535
rf = self.spark.read.geotiff(self.img_uri)
36+
rf_count = rf.count()
37+
self.assertTrue(rf_count > 0)
3638

3739
dest = self._tmpfile()
3840
rf.write.geotiff(dest)
3941

40-
rf2 = self.spark.read.geotiff('file://' + dest)
42+
rf2 = self.spark.read.geotiff(dest)
43+
4144
self.assertEqual(rf2.count(), rf.count())
4245

4346
os.remove(dest)
@@ -47,7 +50,7 @@ def test_unstructured_write(self):
4750
dest_file = self._tmpfile()
4851
rf.write.geotiff(dest_file, crs='EPSG:32616')
4952

50-
rf2 = self.spark.read.raster('file://' + dest_file)
53+
rf2 = self.spark.read.raster(dest_file)
5154
self.assertEqual(rf2.count(), rf.count())
5255

5356
with rasterio.open(self.img_uri) as source:
@@ -58,6 +61,22 @@ def test_unstructured_write(self):
5861

5962
os.remove(dest_file)
6063

64+
def test_unstructured_write_schemeless(self):
65+
# should be able to write a projected raster tile column to path like '/data/foo/file.tif'
66+
from pyrasterframes.rasterfunctions import rf_agg_stats, rf_crs
67+
rf = self.spark.read.raster(self.img_uri)
68+
max = rf.agg(rf_agg_stats('proj_raster').max.alias('max')).first()['max']
69+
crs = rf.select(rf_crs('proj_raster').crsProj4.alias('c')).first()['c']
70+
71+
dest_file = self._tmpfile()
72+
self.assertTrue(not dest_file.startswith('file://'))
73+
rf.write.geotiff(dest_file, crs=crs)
74+
75+
with rasterio.open(dest_file) as src:
76+
self.assertEqual(src.read().max(), max)
77+
78+
os.remove(dest_file)
79+
6180
def test_downsampled_write(self):
6281
rf = self.spark.read.raster(self.img_uri)
6382
dest = self._tmpfile()

pyrasterframes/src/main/python/tests/PyRasterFramesTests.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,13 @@ def l8path(b):
470470
print(path_count.toPandas())
471471
self.assertTrue(path_count.count() == 3)
472472

473+
def test_raster_source_reader_schemeless(self):
474+
import os.path
475+
path = os.path.join(self.resource_dir, "L8-B8-Robinson-IL.tiff")
476+
self.assertTrue(not path.startswith('file://'))
477+
df = self.spark.read.raster(path)
478+
self.assertTrue(df.count() > 0)
479+
473480
def test_raster_source_catalog_reader(self):
474481
import pandas as pd
475482

rf-notebook/src/main/docker/Dockerfile

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ FROM s22s/pyspark-notebook:spark-2.3.3-hadoop-2.7
22

33
MAINTAINER Astraea, Inc.
44

5-
ENV RF_LIB_LOC /usr/local/rasterframes
5+
ENV RF_LIB_LOC=/usr/local/rasterframes \
6+
LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
67

78
USER root
89

@@ -14,11 +15,6 @@ EXPOSE 4040 4041 4042 4043 4044
1415
RUN conda install --quiet --yes \
1516
anaconda sphinx nbsphinx shapely numpy folium geopandas geojsonio
1617

17-
RUN apt-get update && \
18-
apt-get install -y gdal-bin && \
19-
apt autoremove && \
20-
apt-get clean all
21-
2218
# Cleanup pip residuals
2319
RUN rm -rf /home/$NB_USER/.local && \
2420
fix-permissions /home/$NB_USER

0 commit comments

Comments
 (0)