Skip to content

Commit 47b42a9

Browse files
authored
Merge pull request #426 from s22s/fix/425-ml-pipeline
Fix #425 ML Pipeline save/load
2 parents b2aa335 + 6645246 commit 47b42a9

File tree

3 files changed

+68
-4
lines changed

3 files changed

+68
-4
lines changed

pyrasterframes/src/main/python/pyrasterframes/rf_types.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class here provides the PyRasterFrames entry point.
3131

3232
from pyspark.ml.param.shared import HasInputCols
3333
from pyspark.ml.wrapper import JavaTransformer
34-
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
34+
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
3535

3636
from pyrasterframes.rf_context import RFContext
3737

@@ -462,7 +462,7 @@ def deserialize(self, datum):
462462
Tile.__UDT__ = TileUDT()
463463

464464

465-
class TileExploder(JavaTransformer, JavaMLReadable, JavaMLWritable):
465+
class TileExploder(JavaTransformer, DefaultParamsReadable, DefaultParamsWritable):
466466
"""
467467
Python wrapper for TileExploder.scala
468468
"""
@@ -472,7 +472,7 @@ def __init__(self):
472472
self._java_obj = self._new_java_obj("org.locationtech.rasterframes.ml.TileExploder", self.uid)
473473

474474

475-
class NoDataFilter(JavaTransformer, HasInputCols, JavaMLReadable, JavaMLWritable):
475+
class NoDataFilter(JavaTransformer, HasInputCols, DefaultParamsReadable, DefaultParamsWritable):
476476
"""
477477
Python wrapper for NoDataFilter.scala
478478
"""

pyrasterframes/src/main/python/tests/ExploderTests.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from pyrasterframes import TileExploder
2626

2727
from pyspark.ml.feature import VectorAssembler
28-
from pyspark.ml import Pipeline
28+
from pyspark.ml import Pipeline, PipelineModel
2929
from pyspark.sql.functions import *
3030

3131
import unittest
@@ -56,3 +56,16 @@ def test_tile_exploder_pipeline_for_tile(self):
5656
pipe_model = pipe.fit(df)
5757
tranformed_df = pipe_model.transform(df)
5858
self.assertTrue(tranformed_df.count() > df.count())
59+
60+
def test_tile_exploder_read_write(self):
61+
path = 'test_tile_exploder_read_write.pipe'
62+
df = self.spark.read.raster(self.img_uri)
63+
64+
assembler = VectorAssembler().setInputCols(['proj_raster'])
65+
pipe = Pipeline().setStages([TileExploder(), assembler])
66+
67+
pipe.fit(df).write().overwrite().save(path)
68+
69+
read_pipe = PipelineModel.load(path)
70+
self.assertEqual(len(read_pipe.stages), 2)
71+
self.assertTrue(isinstance(read_pipe.stages[0], TileExploder))
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#
2+
# This software is licensed under the Apache 2 license, quoted below.
3+
#
4+
# Copyright 2019 Astraea, Inc.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
7+
# use this file except in compliance with the License. You may obtain a copy of
8+
# the License at
9+
#
10+
# [http://www.apache.org/licenses/LICENSE-2.0]
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15+
# License for the specific language governing permissions and limitations under
16+
# the License.
17+
#
18+
# SPDX-License-Identifier: Apache-2.0
19+
#
20+
21+
from . import TestEnvironment
22+
23+
from pyrasterframes.rasterfunctions import *
24+
from pyrasterframes.rf_types import *
25+
26+
from pyspark.ml.feature import VectorAssembler
27+
from pyspark.ml import Pipeline, PipelineModel
28+
from pyspark.sql.functions import *
29+
30+
import unittest
31+
32+
33+
class ExploderTests(TestEnvironment):
34+
35+
def test_no_data_filter_read_write(self):
36+
path = 'test_no_data_filter_read_write.pipe'
37+
df = self.spark.read.raster(self.img_uri) \
38+
.select(rf_tile_mean('proj_raster').alias('mean'))
39+
40+
input_cols = ['mean']
41+
ndf = NoDataFilter().setInputCols(input_cols)
42+
assembler = VectorAssembler().setInputCols(input_cols)
43+
44+
pipe = Pipeline().setStages([ndf, assembler])
45+
46+
pipe.fit(df).write().overwrite().save(path)
47+
48+
read_pipe = PipelineModel.load(path)
49+
self.assertEqual(len(read_pipe.stages), 2)
50+
actual_stages_ndf = read_pipe.stages[0].getInputCols()
51+
self.assertEqual(actual_stages_ndf, input_cols)

0 commit comments

Comments (0)