Skip to content

Commit 3541cd6

Browse files
authored
Merge pull request #286 from s22s/feature/tile-show
Better tile rendering
2 parents 6ac3d47 + eb9da2c commit 3541cd6

File tree

3 files changed

+74
-24
lines changed

3 files changed

+74
-24
lines changed

pyrasterframes/src/main/python/docs/nodata-handling.pymd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
## What is NoData?
44

5-
In raster operations, the preservation and correct processing of missing observations is very important. In [most DataFrames and scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored for space efficiency, which typically leads to use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and is usually called the "NoData" value.
5+
In raster operations, the preservation and correct processing of missing observations is very important. In [most DataFrames and in scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored for space efficiency, which typically leads to use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and is usually called the "NoData" value.
66

77
RasterFrames provides a variety of functions to inspect and manage NoData within _tiles_.
88

pyrasterframes/src/main/python/docs/numpy-pandas.pymd

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,11 @@ import pyrasterframes.rf_ipython
4545
from pyspark.sql.functions import lit, col
4646

4747
cat = spark.read.format('aws-pds-modis-catalog').load() \
48-
.filter(
49-
(col('granule_id') == 'h11v04') &
50-
(col('acquisition_date') > lit('2018-02-19')) &
51-
(col('acquisition_date') < lit('2018-02-22'))
52-
)
48+
.filter(
49+
(col('granule_id') == 'h11v04') &
50+
(col('acquisition_date') > lit('2018-02-19')) &
51+
(col('acquisition_date') < lit('2018-02-22'))
52+
)
5353

5454
spark_df = spark.read.raster(catalog=cat, catalog_col_names=['B01']) \
5555
.select(
@@ -92,7 +92,7 @@ np.abs(diff.cells).max()
9292
We can also inspect an image of the difference between the two _tiles_, which is just random noise. Both _tiles_ have the same structure of NoData, as exhibited by the white areas.
9393

9494
```python udf_diff_noise_tile
95-
display(diff)
95+
diff.show(0, 100)
9696
```
9797

9898
## Creating a Spark DataFrame
@@ -105,12 +105,11 @@ The example below will create a Pandas DataFrame with ten rows of noise _tiles_
105105
import pandas as pd
106106
from shapely.geometry import Point
107107

108-
pandas_df = pd.DataFrame([
109-
{
110-
'tile': Tile(np.random.randn(100, 100)),
111-
'geom': Point(-90 + 90 * np.random.random((2, 1)))
112-
} for _ in range(10)
113-
])
108+
pandas_df = pd.DataFrame([{
109+
'tile': Tile(np.random.randn(100, 100)),
110+
'geom': Point(-90 + 90 * np.random.random((2, 1)))
111+
} for _ in range(10)
112+
])
114113

115114
spark_df = spark.createDataFrame(pandas_df)
116115

pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py

Lines changed: 62 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,58 @@
1919
#
2020

2121
import pyrasterframes.rf_types
22+
import numpy as np
23+
24+
25+
def plot_tile(tile, normalize, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args):
    """
    Display an image of the tile on a matplotlib axis.

    Parameters
    ----------
    tile: object exposing a ``cells`` array attribute holding the raster data
    normalize: if True, clip the data to the lower_percentile and
        upper_percentile bounds and rescale it to [0, 1] before display
    lower_percentile: between 0 and 100 inclusive.
        Specifies to clip values below this percentile
    upper_percentile: between 0 and 100 inclusive.
        Specifies to clip values above this percentile
    axis : matplotlib axis object to plot onto. Creates new axis if None
    imshow_args : parameters to pass into matplotlib.pyplot.imshow
        see https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.imshow.html

    Returns
    -------
    created or modified axis object
    """

    if axis is None:
        # Deferred import: matplotlib is only required when no axis is supplied.
        import matplotlib.pyplot as plt
        axis = plt.gca()

    arr = tile.cells

    def normalize_cells(cells):
        """Clip cells to the percentile bounds and rescale into [0, 1]."""
        assert upper_percentile > lower_percentile, 'invalid upper and lower percentiles {}, {}'.format(lower_percentile, upper_percentile)
        # NOTE(review): np.array strips any masked-array wrapper before the
        # percentile computation — confirm masked (NoData) cells hold NaN here.
        sans_mask = np.array(cells)
        lower = np.nanpercentile(sans_mask, lower_percentile)
        upper = np.nanpercentile(sans_mask, upper_percentile)
        cells_clipped = np.clip(cells, lower, upper)
        span = upper - lower
        if span == 0:
            # Constant-valued tile: the original formula divided by zero and
            # produced NaN/Inf; render a uniform zero image instead.
            return np.zeros_like(cells_clipped, dtype=float)
        return (cells_clipped - lower) / span

    # Raster cells are square; hide tick marks for a compact thumbnail.
    axis.set_aspect('equal')
    axis.xaxis.set_ticks([])
    axis.yaxis.set_ticks([])

    if normalize:
        cells = normalize_cells(arr)
    else:
        cells = arr

    axis.imshow(cells, **imshow_args)

    return axis
71+
72+
73+
def tile_to_png(tile, lower_percentile=1, upper_percentile=99, title=None, fig_size=None):
2574
""" Provide image of Tile."""
2675
if tile.cells is None:
2776
return None
@@ -31,23 +80,24 @@ def tile_to_png(tile, fig_size=None):
3180
from matplotlib.figure import Figure
3281

3382
# Set up matplotlib objects
34-
nominal_size = 2 # approx full size for a 256x256 tile
83+
nominal_size = 3 # approx full size for a 256x256 tile
3584
if fig_size is None:
3685
fig_size = (nominal_size, nominal_size)
3786

3887
fig = Figure(figsize=fig_size)
3988
canvas = FigureCanvas(fig)
4089
axis = fig.add_subplot(1, 1, 1)
4190

42-
data = tile.cells
43-
44-
axis.imshow(data)
91+
plot_tile(tile, True, lower_percentile, upper_percentile, axis=axis)
4592
axis.set_aspect('equal')
4693
axis.xaxis.set_ticks([])
4794
axis.yaxis.set_ticks([])
4895

49-
axis.set_title('{}, {}'.format(tile.dimensions(), tile.cell_type.__repr__()),
50-
fontsize=fig_size[0]*4) # compact metadata as title
96+
if title is None:
97+
axis.set_title('{}, {}'.format(tile.dimensions(), tile.cell_type.__repr__()),
98+
fontsize=fig_size[0]*4) # compact metadata as title
99+
else:
100+
axis.set_title(title, fontsize=fig_size[0]*4) # compact metadata as title
51101

52102
with io.BytesIO() as output:
53103
canvas.print_png(output)
def tile_to_html(tile, fig_size=None):
    """ Provide HTML string representation of Tile image."""
    import base64
    png_bits = tile_to_png(tile, fig_size=fig_size)
    # Base64-encode the PNG payload; strip newlines so it inlines cleanly.
    encoded = base64.b64encode(png_bits).decode('utf-8').replace('\n', '')
    b64_img_html = '<img src="data:image/png;base64,{}" />'
    return b64_img_html.format(encoded)
64114

@@ -102,6 +152,7 @@ def _safe_tile_to_html(t):
102152
pd.set_option('display.max_colwidth', default_max_colwidth)
103153
return return_html
104154

155+
105156
def spark_df_to_markdown(df, num_rows=5, truncate=True, vertical=False):
106157
from pyrasterframes import RFContext
107158
return RFContext.active().call("_dfToMarkdown", df._jdf, num_rows, truncate)
@@ -122,14 +173,14 @@ def spark_df_to_markdown(df, num_rows=5, truncate=True, vertical=False):
122173
markdown_formatter = ip.display_formatter.formatters['text/markdown']
123174
html_formatter.for_type(pyspark.sql.DataFrame, spark_df_to_markdown)
124175

125-
Tile.show = lambda t: display_png(t._repr_png_(), raw=True)
176+
Tile.show = lambda tile, normalize=False, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args: \
177+
plot_tile(tile, normalize, lower_percentile, upper_percentile, axis, **imshow_args)
178+
Tile.show.__doc__ = plot_tile.__doc__
126179

127180
# See if we're in documentation mode and register a custom show implementation.
128181
if 'InProcessInteractiveShell' in ip.__class__.__name__:
129182
pyspark.sql.DataFrame._repr_markdown_ = spark_df_to_markdown
130183
pyspark.sql.DataFrame.show = lambda df, num_rows=5, truncate=True: display_markdown(spark_df_to_markdown(df, num_rows, truncate), raw=True)
131184

132185
except ImportError as e:
133-
print(e)
134-
raise e
135186
pass

0 commit comments

Comments
 (0)