
Commit 9ac2819

Merge branch 'develop' into docs/sql-support

* develop:
  Regression fix.
  GeoTIFF writing post-walkthrough comments.
  Update release notes to reflect change at e8d117c.
  Bumped JVM to Spark 2.3.3.
  Simplify python test_sql that has been failing on travis.
  Tighten up python Tile equality test for nodata/masked cells; fix test_matmul.
  Fix python raster reader strict eval test condition.
  pip '--user' not available in virtualenv.
  Fixes to TravisCI build to handle pinning of Python and Java version to support PySpark.

# Conflicts:
#   core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala
#   datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala
2 parents 66e53f3 + 86b64ed commit 9ac2819

File tree: 10 files changed (+83, -72 lines)

.travis.yml

Lines changed: 11 additions & 14 deletions
@@ -1,6 +1,9 @@
 sudo: false
 dist: xenial
-language: scala
+language: python
+
+python:
+  - "3.7"
 
 cache:
   directories:
@@ -11,30 +14,24 @@ cache:
 scala:
   - 2.11.11
 
-jdk:
-  - openjdk8
-
-python:
-  - "3.7"
+env:
+  - COURSIER_VERBOSITY=-1 JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
 
 addons:
   apt:
     packages:
+      - openjdk-8-jdk
       - pandoc
-      - python-pip
 
 install:
-  - pip install setuptools
-
-sbt_args: -no-colors
+  - pip install rasterio shapely pandas numpy
+  - wget -O - https://piccolo.link/sbt-1.2.8.tgz | tar xzf -
 
 script:
-  - sbt test
-  - sbt it:test
+  - sbt/bin/sbt -java-home $JAVA_HOME -batch test
+  - sbt/bin/sbt -java-home $JAVA_HOME -batch it:test
   # - sbt -Dfile.encoding=UTF8 clean coverage test coverageReport
   # Tricks to avoid unnecessary cache updates
   - find $HOME/.sbt -name "*.lock" | xargs rm
   - find $HOME/.ivy2 -name "ivydata-*.properties" | xargs rm
 
-#after_success:
-#  - bash <(curl -s https://codecov.io/bash)

core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/ProjectedLayerMetadataAggregate.scala

Lines changed: 1 addition & 2 deletions
@@ -94,9 +94,8 @@ object ProjectedLayerMetadataAggregate {
     // Ordering must match InputRecord schema
     new ProjectedLayerMetadataAggregate(destCRS, destDims)(extent, crs, cellType, tileSize).as[TileLayerMetadata[SpatialKey]]
 
-
   private[expressions]
-  case class InputRecord(extent: Extent, crs: CRS, cellType: CellType, tileSize: TileDimensions) { self
+  case class InputRecord(extent: Extent, crs: CRS, cellType: CellType, tileSize: TileDimensions) {
     def toBufferRecord(destCRS: CRS): BufferRecord = {
       val transform = Transform(crs, destCRS)
 
core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala

Lines changed: 4 additions & 3 deletions
@@ -58,7 +58,7 @@ class TileRasterizerAggregate(prd: ProjectedRasterDefinition) extends UserDefine
   override def dataType: DataType = schemaOf[Raster[Tile]]
 
   override def initialize(buffer: MutableAggregationBuffer): Unit = {
-    buffer(0) = ArrayTile.empty(prd.cellType, prd.cols, prd.rows)
+    buffer(0) = ArrayTile.empty(prd.cellType, prd.totalCols, prd.totalRows)
   }
 
   override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
@@ -90,13 +90,14 @@ class TileRasterizerAggregate(prd: ProjectedRasterDefinition) extends UserDefine
 object TileRasterizerAggregate {
   val nodeName = "rf_tile_rasterizer_aggregate"
   /** Convenience grouping of parameters needed for running aggregate. */
-  case class ProjectedRasterDefinition(cols: Int, rows: Int, cellType: CellType, crs: CRS, extent: Extent, sampler: ResampleMethod = ResampleMethod.DEFAULT)
+  case class ProjectedRasterDefinition(totalCols: Int, totalRows: Int, cellType: CellType, crs: CRS, extent: Extent, sampler: ResampleMethod = ResampleMethod.DEFAULT)
 
   object ProjectedRasterDefinition {
     def apply(tlm: TileLayerMetadata[_]): ProjectedRasterDefinition = apply(tlm, ResampleMethod.DEFAULT)
 
     def apply(tlm: TileLayerMetadata[_], sampler: ResampleMethod): ProjectedRasterDefinition = {
-      val actualSize = tlm.layout.toRasterExtent().gridBoundsFor(tlm.extent)
+      // Try to determine the actual dimensions of our data coverage
+      val actualSize = tlm.layout.toRasterExtent().gridBoundsFor(tlm.extent) // <--- Do we have the math right here?
       val cols = actualSize.width
       val rows = actualSize.height
       new ProjectedRasterDefinition(cols, rows, tlm.cellType, tlm.crs, tlm.extent, sampler)

core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@ import geotrellis.proj4.LatLng
 import geotrellis.raster.{ByteCellType, GridBounds, TileLayout}
 import geotrellis.spark.tiling.{CRSWorldExtent, LayoutDefinition}
 import geotrellis.spark.{KeyBounds, SpatialKey, TileLayerMetadata}
-import org.apache.spark.sql.{Encoder, Encoders}
+import org.apache.spark.sql.Encoders
 import org.locationtech.rasterframes.util.SubdivideSupport
 
 /**
@@ -62,6 +62,7 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu
       df.extentColumns.size should be(2)
     }
     it("should find multiple crs columns") {
+      // Not sure why implicit resolution isn't handling this properly.
      implicit val enc = Encoders.tuple(crsEncoder, Encoders.STRING, crsEncoder, Encoders.scalaDouble)
      val df = Seq((pe.crs, "fred", pe.crs, 34.0)).toDF("c1", "s", "c2", "n")
      df.crsColumns.size should be (2)
@@ -72,7 +73,6 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu
       assert(tl1.subdivide(1) === tl1)
       assert(tl1.subdivide(2) === TileLayout(4, 6, 5, 5))
       assertThrows[IllegalArgumentException](tl1.subdivide(-1))
-
     }
     it("should split KeyBounds[SpatialKey]") {
       val grid = GridBounds(0, 0, 9, 9)

datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala

Lines changed: 15 additions & 6 deletions
@@ -28,6 +28,7 @@ import _root_.geotrellis.raster._
 import _root_.geotrellis.raster.io.geotiff.compression._
 import _root_.geotrellis.raster.io.geotiff.tags.codes.ColorSpace
 import _root_.geotrellis.raster.io.geotiff.{GeoTiffOptions, MultibandGeoTiff, Tags, Tiled}
+import _root_.geotrellis.spark._
 import com.typesafe.scalalogging.LazyLogging
 import org.apache.spark.sql._
 import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}
@@ -94,32 +98,37 @@ class GeoTiffDataSource extends DataSourceRegister
     require(parameters.crs.nonEmpty, "A destination CRS must be provided")
     require(tileCols.nonEmpty, "need at least one tile column")
 
+    // Grab CRS to project into
     val destCRS = parameters.crs.get
 
-    val (extCol, crsCol) = {
+    // Select the anchoring Tile, Extent and CRS columns
+    val (extCol, crsCol, tileCol) = {
+      // Favor "ProjectedRaster" columns
       val prCols = df.projRasterColumns
       if(prCols.nonEmpty) {
-        (rf_extent(prCols.head), rf_crs(prCols.head))
+        (rf_extent(prCols.head), rf_crs(prCols.head), rf_tile(prCols.head))
       }
       else {
+        // If no "ProjectedRaster" column, look for single Extent and CRS columns.
        val crsCols = df.crsColumns
        require(crsCols.size == 1, "Exactly one CRS column must be in DataFrame")
        val extentCols = df.extentColumns
        require(extentCols.size == 1, "Exactly one Extent column must be in DataFrame")
-        (extentCols.head, crsCols.head)
+        (extentCols.head, crsCols.head, tileCols.head)
       }
     }
 
-    val tlm = df
+    // Scan table and construct what the TileLayerMetadata would be in the specified destination CRS.
+    val tlm: TileLayerMetadata[SpatialKey] = df
      .select(ProjectedLayerMetadataAggregate(
-        destCRS, extCol, crsCol, rf_cell_type(tileCols.head), rf_dimensions(tileCols.head)
+        destCRS, extCol, crsCol, rf_cell_type(tileCol), rf_dimensions(tileCol)
      ))
      .first()
 
     val c = ProjectedRasterDefinition(tlm)
 
     val config = parameters.rasterDimensions.map { dims =>
-      c.copy(cols = dims.cols, rows = dims.rows)
+      c.copy(totalCols = dims.cols, totalRows = dims.rows)
     }.getOrElse(c)
 
     val aggs = tileCols

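For orientation, here is a minimal PySpark sketch of the write path this diff implements. It is not part of the commit; it assumes the 0.8.x Python APIs visible elsewhere in this changeset (`spark.read.raster` in the tests, the `geotiff` writer alias and session patches in `pyrasterframes/__init__.py` below), and the scene URI is hypothetical.

```python
import pyrasterframes  # importing patches SparkSession, per __init__.py below
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .withKryoSerialization() \
    .getOrCreate() \
    .withRasterFrames()

# Reading a raster yields a ProjectedRaster column -- the "favored" case in
# the column-selection logic above.
df = spark.read.raster('https://example.com/scene.tif')

# A destination CRS is required; raster_dimensions overrides the computed
# ProjectedRasterDefinition size (totalCols/totalRows).
df.write.geotiff('scene-4326.tif', crs='EPSG:4326', raster_dimensions=(512, 512))
```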
docs/src/main/paradox/release-notes.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 
 ### 0.8.0
 
-* Upgraded to the following core dependencies: Spark 2.3.2, GeoTrellis 2.3.0, GeoMesa 2.2.1, JTS 1.16.0.
+* Upgraded to the following core dependencies: Spark 2.3.3, GeoTrellis 2.3.0, GeoMesa 2.2.1, JTS 1.16.0.
 * Build `pyrasterframes` binary distribution for pip installation.
 * Added support for rendering RasterFrame types in IPython/Jupyter.
 * Added new tile functions `rf_round`, `rf_abs`, `rf_log`, `rf_log10`, `rf_log2`, `rf_log1p`, `rf_exp`, `rf_exp10`, `rf_exp2`, `rf_expm1`, `rf_resample`.

pyrasterframes/src/main/python/docs/raster-write.pymd

Lines changed: 3 additions & 4 deletions
@@ -63,7 +63,7 @@ GeoTIFF is one of the most common file formats for spatial data, providing flexi
 
 One downside to GeoTIFF is that it is not a big data native format. To create a GeoTIFF, all the data to be encoded has to be in the memory of one compute node (in Spark parlance, this is a "collect"), substantially limiting its maximum size compared to that of a full cluster environment. When rendering GeoTIFFs in RasterFrames, you either need to specify the dimensions of the output raster, or be aware of how big the collected data will end up being.
 
-Fortunately, we can use the cluster computing capability to downsample the data into a more manageable size. For the sake of example, let's render a simple RGB overview image of our scene as a small raster:
+Fortunately, we can use the cluster computing capability to downsample the data (using nearest-neighbor) into a more manageable size. For the sake of example, let's render a simple RGB overview image of our scene as a small raster, reprojecting it to latitude and longitude coordinates on the [WGS84](https://en.wikipedia.org/wiki/World_Geodetic_System) reference ellipsoid (aka [EPSG:4326](https://spatialreference.org/ref/epsg/4326/)):
 
 ```python write_geotiff
 import os.path
@@ -94,10 +94,9 @@ with rasterio.open(outfile) as src:
 
 ## GeoTrellis Layers
 
-[GeoTrellis][GeoTrellis] is one of the key libraries that RasterFrames builds upon. It provides a Scala language API for working with large raster data with Apache Spark. Ingesting raster data into a Layer is one of the key concepts in creating a dataset for processing on Spark. RasterFrames writes data from an appropriate DataFrame into a GeoTrellis Layer.
-
-\[ More details see https://s22s.myjetbrains.com/youtrack/issue/RF-72 \]
+[GeoTrellis][GeoTrellis] is one of the key libraries that RasterFrames builds upon. It provides a Scala language API for working with large raster data with Apache Spark. Ingesting raster data into a Layer is one of the key concepts in creating a dataset for processing on Spark. RasterFrames writes data from an appropriate DataFrame into a [GeoTrellis Layer](https://geotrellis.readthedocs.io/en/latest/guide/tile-backends.html). RasterFrames provides a `geotrellis` DataSource that supports both reading and writing of GeoTrellis layers.
 
+> An example is forthcoming.
 
 ## Parquet

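Connecting the prose above to code: a short sketch, assuming the `df` scene DataFrame and imports from the surrounding walkthrough, that writes a fixed-size overview and checks it with rasterio as the hunk context does.

```python
import os.path
import rasterio

# `df` is assumed to be the scene DataFrame from the walkthrough above.
outfile = os.path.join('/tmp', 'overview.tif')
df.write.geotiff(outfile, crs='EPSG:4326', raster_dimensions=(256, 256))

with rasterio.open(outfile) as src:
    print(src.crs)    # expect EPSG:4326
    print(src.shape)  # expect (256, 256), capped by raster_dimensions
```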
pyrasterframes/src/main/python/pyrasterframes/__init__.py

Lines changed: 21 additions & 11 deletions
@@ -180,18 +180,25 @@ def _geotiff_writer(
         raster_dimensions=None,
         **options):
 
+    def set_dims(parts):
+        try:
+            parts = [int(p) for p in parts]
+        except ValueError:
+            raise ValueError("Expected dimension components to be integers: {}".format(parts))
+        assert len(parts) == 2, "Expected dimensions specification to have exactly two components"
+        assert all(p > 0 for p in parts), "Expected all components in dimensions to be positive integers"
+        options.update({
+            "imageWidth": parts[0],
+            "imageHeight": parts[1]
+        })
+
     if raster_dimensions is not None:
-        if isinstance(raster_dimensions, tuple):
-            options.update({
-                "imageWidth": raster_dimensions[0],
-                "imageHeight": raster_dimensions[1]
-            })
+        if isinstance(raster_dimensions, (list, tuple)):
+            set_dims(raster_dimensions)
         elif isinstance(raster_dimensions, str):
-            parts = raster_dimensions.split(',')
-            options.update({
-                "imageWidth": parts[0],
-                "imageHeight": parts[1]
-            })
+            set_dims(raster_dimensions.split(','))
 
     if crs is not None:
         options.update({
@@ -200,8 +207,11 @@ def _geotiff_writer(
 
     return _aliased_writer(df_writer, "geotiff", path, **options)
 
-# Patch new method on SparkSession to mirror Scala approach
+
+# Patch RasterFrames initialization method on SparkSession to mirror Scala approach
 SparkSession.withRasterFrames = _rf_init
+
+# Patch Kryo serialization initialization method on SparkSession.Builder to mirror Scala approach
 SparkSession.Builder.withKryoSerialization = _kryo_init
 
 # Add the 'asLayer' method to pyspark DataFrame

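Given the `set_dims` helper above, these `raster_dimensions` forms should be equivalent. A sketch only; `df` and `path` are placeholders:

```python
# List, tuple, and "width,height" string are all accepted after this change:
df.write.geotiff(path, crs='EPSG:4326', raster_dimensions=(1024, 512))
df.write.geotiff(path, crs='EPSG:4326', raster_dimensions=[1024, 512])
df.write.geotiff(path, crs='EPSG:4326', raster_dimensions='1024,512')

# Malformed specifications now fail fast inside set_dims:
#   raster_dimensions='1024'      -> AssertionError (two components required)
#   raster_dimensions=(0, 512)    -> AssertionError (components must be positive)
#   raster_dimensions=('a', 'b')  -> ValueError (components must be integers)
```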
pyrasterframes/src/main/python/pyrasterframes/rf_types.py

Lines changed: 2 additions & 1 deletion
@@ -298,7 +298,8 @@ def __init__(self, cells, cell_type=None):
 
     def __eq__(self, other):
         if type(other) is type(self):
-            return self.cell_type == other.cell_type and np.ma.allequal(self.cells, other.cells)
+            return self.cell_type == other.cell_type and \
+                   np.ma.allequal(self.cells, other.cells, fill_value=True)
         else:
             return False
 

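The `fill_value=True` argument is what makes nodata (masked) cells compare as equal. A standalone illustration of the `np.ma.allequal` semantics relied on here:

```python
import numpy as np

# Two arrays that agree on every data cell but differ under the mask (nodata).
a = np.ma.masked_equal(np.array([[1, 2], [3, 999]]), 999)
b = np.ma.masked_equal(np.array([[1, 2], [3, 998]]), 998)

print(np.ma.allequal(a, b, fill_value=True))   # True: masked cells treated as equal
print(np.ma.allequal(a, b, fill_value=False))  # False: masked cells treated as unequal
```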
pyrasterframes/src/main/python/tests/PyRasterFramesTests.py

Lines changed: 23 additions & 28 deletions
@@ -112,28 +112,23 @@ def test_aggregations(self):
         self.assertEqual(row['rf_agg_stats(tile)'].data_cells, row['rf_agg_data_cells(tile)'])
 
     def test_sql(self):
-        self.rf.createOrReplaceTempView("rf")
-
-        dims = self.rf.withColumn('dims', rf_dimensions('tile')).first().dims
-        dims_str = """{}, {}""".format(dims.cols, dims.rows)
-
-        self.spark.sql("""SELECT tile, rf_make_constant_tile(1, {}, 'uint16') AS One,
-                            rf_make_constant_tile(2, {}, 'uint16') AS Two FROM rf""".format(dims_str, dims_str)) \
-            .createOrReplaceTempView("r3")
-
-        ops = self.spark.sql("""SELECT tile, rf_local_add(tile, One) AS AndOne,
-                            rf_local_subtract(tile, One) AS LessOne,
-                            rf_local_multiply(tile, Two) AS TimesTwo,
-                            rf_local_divide(tile, Two) AS OverTwo
-                            FROM r3""")
-
-        # ops.printSchema
-        statsRow = ops.select(rf_tile_mean('tile').alias('base'),
-                              rf_tile_mean("AndOne").alias('plus_one'),
-                              rf_tile_mean("LessOne").alias('minus_one'),
-                              rf_tile_mean("TimesTwo").alias('double'),
-                              rf_tile_mean("OverTwo").alias('half')) \
-            .first()
+        self.rf.createOrReplaceTempView("rf_test_sql")
+
+        self.spark.sql("""SELECT tile,
+                            rf_local_add(tile, 1) AS and_one,
+                            rf_local_subtract(tile, 1) AS less_one,
+                            rf_local_multiply(tile, 2) AS times_two,
+                            rf_local_divide(tile, 2) AS over_two
+                            FROM rf_test_sql""").createOrReplaceTempView('rf_test_sql_1')
+
+        statsRow = self.spark.sql("""
+            SELECT rf_tile_mean(tile) AS base,
+                   rf_tile_mean(and_one) AS plus_one,
+                   rf_tile_mean(less_one) AS minus_one,
+                   rf_tile_mean(times_two) AS double,
+                   rf_tile_mean(over_two) AS half
+            FROM rf_test_sql_1
+            """).first()
 
         self.assertTrue(self.rounded_compare(statsRow.base, statsRow.plus_one - 1))
         self.assertTrue(self.rounded_compare(statsRow.base, statsRow.minus_one + 1))
@@ -416,8 +411,6 @@ def less_pi(t):
 
 class TileOps(TestEnvironment):
 
-    from pyrasterframes.rf_types import Tile
-
     def setUp(self):
         # convenience so we can assert around Tile() == Tile()
         self.t1 = Tile(np.array([[1, 2],
@@ -473,9 +466,11 @@ def test_matmul(self):
         # r1 = self.t1 @ self.t2
         r1 = self.t1.__matmul__(self.t2)
 
-        nd = r1.cell_type.no_data_value()
-        e1 = Tile(np.ma.masked_equal(np.array([[nd, 10],
-                                               [nd, nd]], dtype=r1.cell_type.to_numpy_dtype()), nd))
+        # The behavior of np.matmul with masked arrays is not well documented;
+        # it seems to treat the 2nd arg as if not a MaskedArray
+        e1 = Tile(np.matmul(self.t1.cells, self.t2.cells), r1.cell_type)
+
+        self.assertTrue(r1 == e1, "{} was not equal to {}".format(r1, e1))
         self.assertEqual(r1, e1)
 
 
@@ -598,7 +593,7 @@ def test_strict_eval(self):
         # again for strict
         df_strict = self.spark.read.raster(self.img_uri, lazy_tiles=False)
         show_str_strict = df_strict.select('proj_raster')._jdf.showString(1, -1, False)
-        self.assertTrue('RasterRef' not in show_str_lazy)
+        self.assertTrue('RasterRef' not in show_str_strict)
 
 
     def test_prt_functions(self):

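On the `test_matmul` change: a standalone sketch (toy values, not the test fixtures) of why the expected tile is now built from the raw cells, since `np.matmul` has no documented masked-array semantics:

```python
import numpy as np

t1 = np.ma.masked_equal(np.array([[1, 2], [3, 4]]), 4)
t2 = np.ma.masked_equal(np.array([[5, 6], [7, 8]]), 8)

# np.matmul does not consult the masks, so the dependable expectation is the
# plain product of the underlying data. (np.ma.dot is the mask-aware product.)
expected = np.matmul(t1.data, t2.data)
print(expected)  # [[19 22]
                 #  [43 50]]
```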