Skip to content

Commit eb9da2c

Browse files
committed
Fixed problems with PNG data normalization with NaN values.
1 parent 131bba6 commit eb9da2c

File tree

3 files changed

+32
-23
lines changed

3 files changed

+32
-23
lines changed

pyrasterframes/src/main/python/docs/nodata-handling.pymd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
## What is NoData?
44

5-
In raster operations, the preservation and correct processing of missing observations is very important. In [most DataFrames and scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored for space efficiency, which typically leads to use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and is usually called the "NoData" value.
5+
In raster operations, the preservation and correct processing of missing observations is very important. In [most DataFrames and in scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored for space efficiency, which typically leads to use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and is usually called the "NoData" value.
66

77
RasterFrames provides a variety of functions to inspect and manage NoData within _tiles_.
88

pyrasterframes/src/main/python/docs/numpy-pandas.pymd

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,11 @@ import pyrasterframes.rf_ipython
4545
from pyspark.sql.functions import lit, col
4646

4747
cat = spark.read.format('aws-pds-modis-catalog').load() \
48-
.filter(
49-
(col('granule_id') == 'h11v04') &
50-
(col('acquisition_date') > lit('2018-02-19')) &
51-
(col('acquisition_date') < lit('2018-02-22'))
52-
)
48+
.filter(
49+
(col('granule_id') == 'h11v04') &
50+
(col('acquisition_date') > lit('2018-02-19')) &
51+
(col('acquisition_date') < lit('2018-02-22'))
52+
)
5353

5454
spark_df = spark.read.raster(catalog=cat, catalog_col_names=['B01']) \
5555
.select(
@@ -92,7 +92,7 @@ np.abs(diff.cells).max()
9292
We can also inspect an image of the difference between the two _tiles_, which is just random noise. Both _tiles_ have the same structure of NoData, as exhibited by the white areas.
9393

9494
```python udf_diff_noise_tile
95-
display(diff)
95+
diff.show(0, 100)
9696
```
9797

9898
## Creating a Spark DataFrame
@@ -105,12 +105,11 @@ The example below will create a Pandas DataFrame with ten rows of noise _tiles_
105105
import pandas as pd
106106
from shapely.geometry import Point
107107

108-
pandas_df = pd.DataFrame([
109-
{
110-
'tile': Tile(np.random.randn(100, 100)),
111-
'geom': Point(-90 + 90 * np.random.random((2, 1)))
112-
} for _ in range(10)
113-
])
108+
pandas_df = pd.DataFrame([{
109+
'tile': Tile(np.random.randn(100, 100)),
110+
'geom': Point(-90 + 90 * np.random.random((2, 1)))
111+
} for _ in range(10)
112+
])
114113

115114
spark_df = spark.createDataFrame(pandas_df)
116115

pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,15 @@
2121
import pyrasterframes.rf_types
2222
import numpy as np
2323

24-
def plot_tile(tile, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args):
24+
25+
def plot_tile(tile, normalize, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args):
2526
"""
2627
Display an image of the tile
2728
2829
Parameters
2930
----------
31+
normalize: if True, will normalize the data to the range [0, 1], using
32+
lower_percentile and upper_percentile as bounds
3033
lower_percentile: between 0 and 100 inclusive.
3134
Specifies to clip values below this percentile
3235
upper_percentile: between 0 and 100 inclusive.
@@ -45,18 +48,24 @@ def plot_tile(tile, lower_percentile=1, upper_percentile=99, axis=None, **imshow
4548

4649
arr = tile.cells
4750

48-
def normalize_cells(cells, lower_percentile=lower_percentile, upper_percentile=upper_percentile):
49-
assert upper_percentile > lower_percentile, 'invalid upper and lower percentiles'
50-
lower = np.percentile(cells, lower_percentile)
51-
upper = np.percentile(cells, upper_percentile)
51+
def normalize_cells(cells):
52+
assert upper_percentile > lower_percentile, 'invalid upper and lower percentiles {}, {}'.format(lower_percentile, upper_percentile)
53+
sans_mask = np.array(cells)
54+
lower = np.nanpercentile(sans_mask, lower_percentile)
55+
upper = np.nanpercentile(sans_mask, upper_percentile)
5256
cells_clipped = np.clip(cells, lower, upper)
5357
return (cells_clipped - lower) / (upper - lower)
5458

5559
axis.set_aspect('equal')
5660
axis.xaxis.set_ticks([])
5761
axis.yaxis.set_ticks([])
5862

59-
axis.imshow(normalize_cells(arr), **imshow_args)
63+
if normalize:
64+
cells = normalize_cells(arr)
65+
else:
66+
cells = arr
67+
68+
axis.imshow(cells, **imshow_args)
6069

6170
return axis
6271

@@ -71,15 +80,15 @@ def tile_to_png(tile, lower_percentile=1, upper_percentile=99, title=None, fig_s
7180
from matplotlib.figure import Figure
7281

7382
# Set up matplotlib objects
74-
nominal_size = 4 # approx full size for a 256x256 tile
83+
nominal_size = 3 # approx full size for a 256x256 tile
7584
if fig_size is None:
7685
fig_size = (nominal_size, nominal_size)
7786

7887
fig = Figure(figsize=fig_size)
7988
canvas = FigureCanvas(fig)
8089
axis = fig.add_subplot(1, 1, 1)
8190

82-
plot_tile(tile, lower_percentile, upper_percentile, axis=axis)
91+
plot_tile(tile, True, lower_percentile, upper_percentile, axis=axis)
8392
axis.set_aspect('equal')
8493
axis.xaxis.set_ticks([])
8594
axis.yaxis.set_ticks([])
@@ -164,8 +173,9 @@ def spark_df_to_markdown(df, num_rows=5, truncate=True, vertical=False):
164173
markdown_formatter = ip.display_formatter.formatters['text/markdown']
165174
html_formatter.for_type(pyspark.sql.DataFrame, spark_df_to_markdown)
166175

167-
Tile.show = lambda tile, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args: \
168-
plot_tile(tile, lower_percentile, upper_percentile, axis, **imshow_args)
176+
Tile.show = lambda tile, normalize=False, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args: \
177+
plot_tile(tile, normalize, lower_percentile, upper_percentile, axis, **imshow_args)
178+
Tile.show.__doc__ = plot_tile.__doc__
169179

170180
# See if we're in documentation mode and register a custom show implementation.
171181
if 'InProcessInteractiveShell' in ip.__class__.__name__:

0 commit comments

Comments
 (0)