Update doc to use rf_local_is_in when masking; fix #351

vpipkt · vpipkt · commit 52983e33990b · 2019-10-23T14:21:44.000-04:00
Signed-off-by: Jason T. Brown <jason@astraea.earth>
diff --git a/pyrasterframes/src/main/python/docs/nodata-handling.pymd b/pyrasterframes/src/main/python/docs/nodata-handling.pymd
@@ -105,32 +105,23 @@ Drawing on @ref:[local map algebra](local-algebra.md) techniques, we will create
 ```python, def_mask
 from pyspark.sql.functions import lit
 
-mask_part = unmasked.withColumn('nodata', rf_local_equal('scl', lit(0))) \
-                    .withColumn('defect', rf_local_equal('scl', lit(1))) \
-                    .withColumn('cloud8', rf_local_equal('scl', lit(8))) \
-                    .withColumn('cloud9', rf_local_equal('scl', lit(9))) \
-                    .withColumn('cirrus', rf_local_equal('scl', lit(10)))
-
-one_mask = mask_part.withColumn('mask', rf_local_add('nodata', 'defect')) \
-                    .withColumn('mask', rf_local_add('mask', 'cloud8')) \
-                    .withColumn('mask', rf_local_add('mask', 'cloud9')) \
-                    .withColumn('mask', rf_local_add('mask', 'cirrus'))
-
-cell_types = one_mask.select(rf_cell_type('mask')).distinct()
+mask = unmasked.withColumn('mask', rf_local_is_in('scl', [0, 1, 8, 9, 10]))
+
+cell_types = mask.select(rf_cell_type('mask')).distinct()
 cell_types
 ```
 
 Because there is not a NoData already defined, we will choose one. In this particular example, the minimum value is greater than zero, so we can use 0 as the NoData value.
 
 ```python, pick_nd
-blue_min = one_mask.agg(rf_agg_stats('blue').min.alias('blue_min'))
+blue_min = mask.agg(rf_agg_stats('blue').min.alias('blue_min'))
 blue_min
 ```
 
 We can now construct the cell type string for our blue band's cell type, designating 0 as NoData.
 
 ```python, get_ct_string
-blue_ct = one_mask.select(rf_cell_type('blue')).distinct().first()[0][0]
+blue_ct = mask.select(rf_cell_type('blue')).distinct().first()[0][0]
 masked_blue_ct = CellType(blue_ct).with_no_data_value(0)
 masked_blue_ct.cell_type_name
 ```
@@ -139,9 +130,8 @@ Now we will use the @ref:[`rf_mask_by_value`](reference.md#rf-mask-by-value) to
 
 ```python, mask_blu
 with_nd = rf_convert_cell_type('blue', masked_blue_ct)
-masked = one_mask.withColumn('blue_masked',
-    rf_mask_by_value(with_nd, 'mask', lit(1))) \
-    .drop('nodata', 'defect', 'cloud8', 'cloud9', 'cirrus', 'blue')
+masked = mask.withColumn('blue_masked',
+    rf_mask_by_value(with_nd, 'mask', lit(1)))
 ```
 
 We can verify that the number of NoData cells in the resulting `blue_masked` column matches the total of the boolean `mask` _tile_ to ensure our logic is correct.
diff --git a/pyrasterframes/src/main/python/docs/supervised-learning.pymd b/pyrasterframes/src/main/python/docs/supervised-learning.pymd
@@ -32,7 +32,7 @@ catalog_df = pd.DataFrame([
     {b: uri_base.format(b) for b in cols}
 ])
 
-df = spark.read.raster(catalog_df, catalog_col_names=cols, tile_dimensions=(128, 128)) \
+df = spark.read.raster(catalog_df, catalog_col_names=cols, tile_dimensions=(256, 256)) \
 					  .repartition(100)
 
 df = df.select(
@@ -91,23 +91,12 @@ To filter only for good quality pixels, we follow roughly the same procedure as
 ```python, make_mask
 from pyspark.sql.functions import lit
 
-mask_part = df_labeled \
-    .withColumn('nodata', rf_local_equal('scl', lit(0))) \
-    .withColumn('defect', rf_local_equal('scl', lit(1))) \
-    .withColumn('cloud8', rf_local_equal('scl', lit(8))) \
-    .withColumn('cloud9', rf_local_equal('scl', lit(9))) \
-    .withColumn('cirrus', rf_local_equal('scl', lit(10)))
-
-df_mask_inv = mask_part \
-    .withColumn('mask', rf_local_add('nodata', 'defect')) \
-    .withColumn('mask', rf_local_add('mask', 'cloud8')) \
-    .withColumn('mask', rf_local_add('mask', 'cloud9')) \
-    .withColumn('mask', rf_local_add('mask', 'cirrus')) \
-    .drop('nodata', 'defect', 'cloud8', 'cloud9', 'cirrus')
-    
+df_labeled = df_labeled \
+    .withColumn('mask', rf_local_is_in('scl', [0, 1, 8, 9, 10]))
+
 # at this point the mask contains 0 for good cells and 1 for defect, etc
 # convert cell type and set value 1 to NoData
-df_mask = df_mask_inv.withColumn('mask',
+df_mask = df_labeled.withColumn('mask',
   rf_with_no_data(rf_convert_cell_type('mask', 'uint8'), 1.0)
 )
 
@@ -213,20 +202,26 @@ retiled.printSchema()
 ```
 
 Take a look at a sample of the resulting output and the corresponding area's red-green-blue composite image.
+Recall the label coding: 1 is forest (purple), 2 is cropland (green), and 3 is developed areas (yellow).
 
 ```python, display_rgb
 sample = retiled \
-    .select('prediction', rf_rgb_composite('red', 'grn', 'blu').alias('rgb')) \
+    .select('prediction', 'red', 'grn', 'blu') \
     .sort(-rf_tile_sum(rf_local_equal('prediction', lit(3.0)))) \
     .first()
 
-sample_rgb = sample['rgb']
-mins = np.nanmin(sample_rgb.cells, axis=(0,1))
-plt.imshow((sample_rgb.cells -  mins) / (np.nanmax(sample_rgb.cells, axis=(0,1)) - mins))
-```
+sample_rgb = np.concatenate([sample['red'].cells[:, :, None],
+                             sample['grn'].cells[ :, :, None],
+                             sample['blu'].cells[ :, :, None]], axis=2)
+# plot  scaled RGB
+scaling_quantiles = np.nanpercentile(sample_rgb, [3.00, 97.00], axis=(0,1))
+scaled = np.clip(sample_rgb, scaling_quantiles[0, :], scaling_quantiles[1, :])
+scaled -= scaling_quantiles[0, :]
+scaled /= (scaling_quantiles[1, : ] - scaling_quantiles[0, :])
 
-Recall the label coding: 1 is forest (purple), 2 is cropland (green) and 3 is developed areas(yellow).
+fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
+ax1.imshow(scaled)
 
-```python, display_prediction
-display(sample['prediction'])
+# display prediction
+ax2.imshow(sample['prediction'].cells)
 ```