Skip to content

Commit eb9da2c

Browse files
committed
Fixed problems with PNG data normalization with NaN values.
1 parent 131bba6 commit eb9da2c

File tree

3 files changed

+32
-23
lines changed

3 files changed

+32
-23
lines changed

pyrasterframes/src/main/python/docs/nodata-handling.pymd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
## What is NoData?
44

5-
In raster operations, the preservation and correct processing of missing observations is very important. In [most DataFrames and scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored for space efficiency, which typically leads to use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and is usually called the "NoData" value.
5+
In raster operations, the preservation and correct processing of missing observations is very important. In [most DataFrames and in scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored for space efficiency, which typically leads to use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and is usually called the "NoData" value.
66

77
RasterFrames provides a variety of functions to inspect and manage NoData within _tiles_.
88

pyrasterframes/src/main/python/docs/numpy-pandas.pymd

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,11 @@ import pyrasterframes.rf_ipython
4545
from pyspark.sql.functions import lit, col
4646

4747
cat = spark.read.format('aws-pds-modis-catalog').load() \
48-
.filter(
49-
(col('granule_id') == 'h11v04') &
50-
(col('acquisition_date') > lit('2018-02-19')) &
51-
(col('acquisition_date') < lit('2018-02-22'))
52-
)
48+
.filter(
49+
(col('granule_id') == 'h11v04') &
50+
(col('acquisition_date') > lit('2018-02-19')) &
51+
(col('acquisition_date') < lit('2018-02-22'))
52+
)
5353

5454
spark_df = spark.read.raster(catalog=cat, catalog_col_names=['B01']) \
5555
.select(
@@ -92,7 +92,7 @@ np.abs(diff.cells).max()
9292
We can also inspect an image of the difference between the two _tiles_, which is just random noise. Both _tiles_ have the same structure of NoData, as exhibited by the white areas.
9393

9494
```python udf_diff_noise_tile
95-
display(diff)
95+
diff.show(0, 100)
9696
```
9797

9898
## Creating a Spark DataFrame
@@ -105,12 +105,11 @@ The example below will create a Pandas DataFrame with ten rows of noise _tiles_
105105
import pandas as pd
106106
from shapely.geometry import Point
107107

108-
pandas_df = pd.DataFrame([
109-
{
110-
'tile': Tile(np.random.randn(100, 100)),
111-
'geom': Point(-90 + 90 * np.random.random((2, 1)))
112-
} for _ in range(10)
113-
])
108+
pandas_df = pd.DataFrame([{
109+
'tile': Tile(np.random.randn(100, 100)),
110+
'geom': Point(-90 + 90 * np.random.random((2, 1)))
111+
} for _ in range(10)
112+
])
114113

115114
spark_df = spark.createDataFrame(pandas_df)
116115

pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,15 @@
2121
import pyrasterframes.rf_types
2222
import numpy as np
2323

24-
def plot_tile(tile, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args):
24+
25+
def plot_tile(tile, normalize, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args):
2526
"""
2627
Display an image of the tile
2728
2829
Parameters
2930
----------
31+
normalize: if True, will normalize the data to the range [0, 1], using
32+
lower_percentile and upper_percentile as bounds
3033
lower_percentile: between 0 and 100 inclusive.
3134
Specifies to clip values below this percentile
3235
upper_percentile: between 0 and 100 inclusive.
@@ -45,18 +48,24 @@ def plot_tile(tile, lower_percentile=1, upper_percentile=99, axis=None, **imshow
4548

4649
arr = tile.cells
4750

48-
def normalize_cells(cells, lower_percentile=lower_percentile, upper_percentile=upper_percentile):
49-
assert upper_percentile > lower_percentile, 'invalid upper and lower percentiles'
50-
lower = np.percentile(cells, lower_percentile)
51-
upper = np.percentile(cells, upper_percentile)
51+
def normalize_cells(cells):
52+
assert upper_percentile > lower_percentile, 'invalid upper and lower percentiles {}, {}'.format(lower_percentile, upper_percentile)
53+
sans_mask = np.array(cells)
54+
lower = np.nanpercentile(sans_mask, lower_percentile)
55+
upper = np.nanpercentile(sans_mask, upper_percentile)
5256
cells_clipped = np.clip(cells, lower, upper)
5357
return (cells_clipped - lower) / (upper - lower)
5458

5559
axis.set_aspect('equal')
5660
axis.xaxis.set_ticks([])
5761
axis.yaxis.set_ticks([])
5862

59-
axis.imshow(normalize_cells(arr), **imshow_args)
63+
if normalize:
64+
cells = normalize_cells(arr)
65+
else:
66+
cells = arr
67+
68+
axis.imshow(cells, **imshow_args)
6069

6170
return axis
6271

@@ -71,15 +80,15 @@ def tile_to_png(tile, lower_percentile=1, upper_percentile=99, title=None, fig_s
7180
from matplotlib.figure import Figure
7281

7382
# Set up matplotlib objects
74-
nominal_size = 4 # approx full size for a 256x256 tile
83+
nominal_size = 3 # approx full size for a 256x256 tile
7584
if fig_size is None:
7685
fig_size = (nominal_size, nominal_size)
7786

7887
fig = Figure(figsize=fig_size)
7988
canvas = FigureCanvas(fig)
8089
axis = fig.add_subplot(1, 1, 1)
8190

82-
plot_tile(tile, lower_percentile, upper_percentile, axis=axis)
91+
plot_tile(tile, True, lower_percentile, upper_percentile, axis=axis)
8392
axis.set_aspect('equal')
8493
axis.xaxis.set_ticks([])
8594
axis.yaxis.set_ticks([])
@@ -164,8 +173,9 @@ def spark_df_to_markdown(df, num_rows=5, truncate=True, vertical=False):
164173
markdown_formatter = ip.display_formatter.formatters['text/markdown']
165174
html_formatter.for_type(pyspark.sql.DataFrame, spark_df_to_markdown)
166175

167-
Tile.show = lambda tile, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args: \
168-
plot_tile(tile, lower_percentile, upper_percentile, axis, **imshow_args)
176+
Tile.show = lambda tile, normalize=False, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args: \
177+
plot_tile(tile, normalize, lower_percentile, upper_percentile, axis, **imshow_args)
178+
Tile.show.__doc__ = plot_tile.__doc__
169179

170180
# See if we're in documentation mode and register a custom show implementation.
171181
if 'InProcessInteractiveShell' in ip.__class__.__name__:

0 commit comments

Comments
 (0)