This repository was archived by the owner on Dec 4, 2019. It is now read-only.

Commit d6f6f56

[#54] Update to spark 2.1
Updated spark-sklearn to be compatible with Spark versions >= 2.1.1. This change is not backwards compatible with Spark 2.0.
2 parents 101a956 + 7583ee1 commit d6f6f56

File tree

6 files changed: +6 −10 lines


.travis.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -8,7 +8,7 @@ cache:
     - $HOME/.cache/spark-versions
 env:
   matrix:
-    - SPARK_VERSION="2.0.0" SPARK_BUILD="spark-$SPARK_VERSION-bin-hadoop2.7" SPARK_BUILD_URL="http://d3kbcqa49mib13.cloudfront.net/$SPARK_BUILD.tgz"
+    - SPARK_VERSION="2.1.1" SPARK_BUILD="spark-$SPARK_VERSION-bin-hadoop2.7" SPARK_BUILD_URL="http://d3kbcqa49mib13.cloudfront.net/$SPARK_BUILD.tgz"

 before_install:
     - ./bin/download_travis_dependencies.sh
```
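The matrix entry above builds the Spark download URL purely by variable expansion; the actual fetch happens in `./bin/download_travis_dependencies.sh`, which is not part of this diff. A minimal sketch of that expansion (no network access):

```shell
# Mirror the Travis env matrix entry's variable expansion.
SPARK_VERSION="2.1.1"
SPARK_BUILD="spark-$SPARK_VERSION-bin-hadoop2.7"
SPARK_BUILD_URL="http://d3kbcqa49mib13.cloudfront.net/$SPARK_BUILD.tgz"

# The download script would then fetch this URL into $HOME/.cache/spark-versions.
echo "Would download: $SPARK_BUILD_URL"
```

Because `SPARK_BUILD` and `SPARK_BUILD_URL` are derived from `SPARK_VERSION`, bumping a Spark version in CI is a one-token change per matrix entry.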

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -27,7 +27,7 @@ This project is also available as as [Spark package](http://spark-packages.org/p

 The developer version has the following requirements:
 - a recent release of scikit-learn. Release 0.17 has been tested, older versions may work too.
-- Spark >= 2.0. Spark may be downloaded from the [Spark official website](http://spark.apache.org/). In order to use this package, you need to use the pyspark interpreter or another Spark-compliant python interpreter. See the [Spark guide](https://spark.apache.org/docs/latest/programming-guide.html#overview) for more details.
+- Spark >= 2.1.1. Spark may be downloaded from the [Spark official website](http://spark.apache.org/). In order to use this package, you need to use the pyspark interpreter or another Spark-compliant python interpreter. See the [Spark guide](https://spark.apache.org/docs/latest/programming-guide.html#overview) for more details.
 - [nose](https://nose.readthedocs.org) (testing dependency only)
 - Pandas, if using the Pandas integration or testing. Pandas==0.18 has been tested.
```

build.sbt

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,7 +3,7 @@

 scalaVersion := "2.10.4"

-sparkVersion := "2.0.0"
+sparkVersion := "2.1.1"

 spName := "databricks/spark-sklearn"
```

python/README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -21,7 +21,7 @@ This package is released under the Apache 2.0 license. See the LICENSE file.

 This package has the following requirements:
 - a recent version of scikit-learn. Version 0.17 has been tested, older versions may work too.
-- Spark >= 2.0. Spark may be downloaded from the
+- Spark >= 2.1.1 Spark may be downloaded from the
 [Spark official website](http://spark.apache.org/). In order to use spark-sklearn, you need to use the pyspark interpreter or another Spark-compliant python interpreter. See the [Spark guide](https://spark.apache.org/docs/latest/programming-guide.html#overview) for more details.
 - [nose](https://nose.readthedocs.org) (testing dependency only)
```

python/spark_sklearn/keyed_models.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -320,7 +320,7 @@ def __init__(self, sklearnEstimator=None, keyCols=["key"], xCol="features",
         self._setDefault(**{paramName: paramSpec["default"]
                             for paramName, paramSpec in KeyedEstimator._paramSpecs.items()
                             if "default" in paramSpec})
-        kwargs = KeyedEstimator._inferredParams(sklearnEstimator, self.__init__._input_kwargs)
+        kwargs = KeyedEstimator._inferredParams(sklearnEstimator, self._input_kwargs)
         self._set(**kwargs)

         self._verifyEstimatorType()
@@ -489,7 +489,7 @@ def implies(a, b):
         if yCol and type(outputType) not in KeyedModel._sql_types:
             raise TypeError("Output type {} is not an AtomicType (expected for {} estimator)"
                             .format(outputType, estimatorType))
-        self._set(**self.__init__._input_kwargs)
+        self._set(**self._input_kwargs)

     def _verifyEstimatorType(self):
         estimatorType = self.getOrDefault("estimatorType")
```
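The two call-site changes above track a change in pyspark's `@keyword_only` decorator: starting with Spark 2.1, the captured keyword arguments are stored on the instance as `self._input_kwargs` rather than on the wrapped method as `self.__init__._input_kwargs`. An illustrative sketch of the newer pattern (simplified; not pyspark's actual source):

```python
import functools

def keyword_only(func):
    """Simplified stand-in for pyspark's @keyword_only: forbid positional
    arguments and record the passed kwargs on the instance (Spark >= 2.1
    behavior), so methods can read self._input_kwargs."""
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        if args:
            raise TypeError("Method %s only takes keyword arguments." % func.__name__)
        self._input_kwargs = kwargs  # attached to the instance, not the method
        return func(self, **kwargs)
    return wrapper

class Example:
    @keyword_only
    def __init__(self, keyCols=None, xCol="features"):
        # Matches the updated call sites in the diff above.
        self.params = self._input_kwargs

e = Example(keyCols=["key"])
print(e.params)  # {'keyCols': ['key']}
```

Pre-2.1 pyspark stored the kwargs on the decorated function object itself, which is why the old `self.__init__._input_kwargs` spelling stops working once the decorator changes.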

python/spark_sklearn/tests/test_gapply.py

Lines changed: 0 additions & 4 deletions
```diff
@@ -85,10 +85,6 @@ def pandasAggFunction(series):
         dataGen = lambda: (random.randrange(GapplyTests.NVALS), random.randrange(GapplyTests.NVALS))
         self.checkGapplyEquivalentToPandas(pandasAggFunction, dataType, dataGen)

-    @unittest.skip("""
-    python only UDTs can't be nested in arraytypes for now, see SPARK-15989
-    this is only available starting in Spark 2.0.1
-    """)
     def test_gapply_python_only_udt_val(self):
         def pandasAggFunction(series):
             x = float(series.apply(lambda pt: int(pt.x) + int(pt.y)).sum())
```

0 commit comments
