Commit 5b6a08a

Merge pull request spark-jobserver#639 from instructure/2.0
Get latest master working with Spark 2.0
2 parents: f232aaf + 43f7346

21 files changed: +139 -86 lines changed

.dockerignore

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+/target
+*/target
+**/target
+.git
+.ensime
+.ensime_cache

.travis.yml

Lines changed: 30 additions & 18 deletions
@@ -1,23 +1,35 @@
-language: scala
 sudo: required
-dist: trusty
+
 env:
   global:
-  - _JAVA_OPTIONS="-Xmx1500m -XX:MaxPermSize=512m -Dakka.test.timefactor=3"
-  - SPARK_HOME=/tmp/spark-1.6.2-bin-hadoop2.6
-scala:
-  - 2.10.6
-  - 2.11.8
-jdk:
-  - oraclejdk8
-  - oraclejdk7
+  - DOCKER_IMAGE="spark-jobserver:ci"
+
+language: go
+go:
+  - 1.7
+
+services:
+  - docker
+
+cache:
+  directories:
+  - docker
+
 before_install:
-  - sudo apt-get -qq update
-  - sudo apt-get -y install python3 python3-pip
-  - ci/install-python-dependencies.sh
-  - ci/install-spark.sh
+  - go get github.com/tonistiigi/buildcache/cmd/buildcache
+
+install:
+  - if [ -e docker/latest_image.tar.gz ]; then
+      docker load -i docker/latest_image.tar.gz;
+    fi
+  - if [ -e docker/latest_cache.tar.gz ]; then
+      docker load -i docker/latest_cache.tar.gz;
+    fi
+  - echo "docker/" >> .dockerignore
+  - docker build -f Dockerfile.test -t $DOCKER_IMAGE .
+  - docker save $DOCKER_IMAGE | gzip > docker/latest_image.tar.gz
+  - sudo `which buildcache` save -o docker/latest_cache.tar.gz $DOCKER_IMAGE
+  - sudo chown $USER docker/latest_cache.tar.gz
+
 script:
-  - sbt clean coverage testPython test coverageReport
-  - find job-server-python/src/python -name *.py -exec pep8 {} +
-after_success:
-  - bash <(curl -s https://codecov.io/bash)
+  - docker run --rm -t -i $DOCKER_IMAGE

Dockerfile.test

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+FROM instructure/java:8
+
+USER root
+# install and cache sbt, python
+
+RUN echo 'deb http://dl.bintray.com/sbt/debian /' > /etc/apt/sources.list.d/sbt.list && \
+    apt-get -qq update && \
+    apt-get install -y --force-yes python3 python3-pip python-pip sbt=0.13.8 && \
+    sbt
+# running sbt downloads some of its internals, speed up subsequent sbt runs
+
+WORKDIR /usr/src/app/
+
+# install other ci deps
+COPY ci ci
+RUN ci/install-python-dependencies.sh && \
+    ci/install-spark.sh
+
+# add sbt and cache deps
+COPY project project
+COPY build.sbt .
+RUN sbt update
+
+# add the rest of the code
+COPY . .
+
+ENV SPARK_HOME /tmp/spark-2.0.1-bin-hadoop2.7
+ENV JAVA_OPTIONS "-Xmx1500m -XX:MaxPermSize=512m -Dakka.test.timefactor=3"
+
+CMD ["/usr/src/app/run_tests.sh"]

build.sbt

Lines changed: 10 additions & 10 deletions
@@ -116,13 +116,8 @@ lazy val dockerSettings = Seq(

   val sparkBuild = s"spark-$sparkVersion"
   val sparkBuildCmd = scalaBinaryVersion.value match {
-    case "2.10" =>
-      "./make-distribution.sh -Phadoop-2.4 -Phive"
     case "2.11" =>
-      """
-        |./dev/change-scala-version.sh 2.11 && \
-        |./make-distribution.sh -Dscala-2.11 -Phadoop-2.4 -Phive
-      """.stripMargin.trim
+      "./make-distribution.sh -Dscala-2.11 -Phadoop-2.7 -Phive"
     case other => throw new RuntimeException(s"Scala version $other is not supported!")
   }

@@ -170,7 +165,13 @@ lazy val dockerSettings = Seq(
     imageNames in docker := Seq(
       sbtdocker.ImageName(namespace = Some("velvia"),
         repository = "spark-jobserver",
-        tag = Some(s"${version.value}.mesos-${mesosVersion.split('-')(0)}.spark-$sparkVersion.scala-${scalaBinaryVersion.value}"))
+        tag = Some(
+          s"${version.value}" +
+            s".mesos-${mesosVersion.split('-')(0)}" +
+            s".spark-$sparkVersion" +
+            s".scala-${scalaBinaryVersion.value}" +
+            s".jdk-$javaVersion")
+      )
     )
   )

@@ -208,8 +209,7 @@ lazy val runScalaStyle = taskKey[Unit]("testScalaStyle")
 lazy val commonSettings = Defaults.coreDefaultSettings ++ dirSettings ++ implicitlySettings ++ Seq(
   organization := "spark.jobserver",
   crossPaths := true,
-  crossScalaVersions := Seq("2.10.6","2.11.8"),
-  scalaVersion := sys.env.getOrElse("SCALA_VERSION", "2.10.6"),
+  scalaVersion := sys.env.getOrElse("SCALA_VERSION", "2.11.8"),
   dependencyOverrides += "org.scala-lang" % "scala-compiler" % scalaVersion.value,
   publishTo := Some(Resolver.file("Unused repo", file("target/unusedrepo"))),
   // scalastyleFailOnError := true,

@@ -250,4 +250,4 @@ lazy val publishSettings = Seq(

 // This is here so we can easily switch back to Logback when Spark fixes its log4j dependency.
 lazy val jobServerLogbackLogging = "-Dlogback.configurationFile=config/logback-local.xml"
-lazy val jobServerLogging = "-Dlog4j.configuration=file:config/log4j-local.properties"
+lazy val jobServerLogging = "-Dlog4j.configuration=file:config/log4j-local.properties"
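
A note on the retagging above: the Docker image tag is now assembled from several build settings and gains a .jdk- component. A minimal sketch of the resulting format, using hypothetical stand-in values for the sbt settings (version, mesosVersion, and javaVersion below are made up for illustration, not taken from this repository):

object DockerTagSketch extends App {
  // Stand-in values for the sbt settings referenced in build.sbt (hypothetical).
  val version = "0.7.0"
  val mesosVersion = "0.25.0-0.2.70.debian81"   // only the part before '-' is kept
  val sparkVersion = "2.0.1"
  val scalaBinaryVersion = "2.11"
  val javaVersion = "8"

  val tag =
    s"$version" +
      s".mesos-${mesosVersion.split('-')(0)}" +
      s".spark-$sparkVersion" +
      s".scala-$scalaBinaryVersion" +
      s".jdk-$javaVersion"

  println(tag)   // 0.7.0.mesos-0.25.0.spark-2.0.1.scala-2.11.jdk-8
}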

ci/install-python-dependencies.sh

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 #!/usr/bin/env bash
+set -e
 pip install --upgrade pip
 pip install --user pyhocon
 pip3 install --user pyhocon

ci/install-spark.sh

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 #!/usr/bin/env bash
-curl -o /tmp/spark.tgz http://apache.mirror.anlx.net/spark/spark-1.6.2/spark-1.6.2-bin-hadoop2.6.tgz
+set -e
+curl -L -o /tmp/spark.tgz http://d3kbcqa49mib13.cloudfront.net/spark-2.0.1-bin-hadoop2.7.tgz
 tar -xvzf /tmp/spark.tgz -C /tmp

job-server-extras/src/main/scala/spark/jobserver/HiveTestJob.scala

Lines changed: 2 additions & 2 deletions
@@ -25,8 +25,8 @@ object HiveLoaderJob extends SparkHiveJob {
   val tableCreate = "CREATE TABLE `default`.`test_addresses`"
   val tableArgs = "(`firstName` String, `lastName` String, `address` String, `city` String)"
   val tableRowFormat = "ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'"
-  val tableColFormat = "COLLECTION ITEMS TERMINATED BY '\002'"
-  val tableMapFormat = "MAP KEYS TERMINATED BY '\003' STORED"
+  val tableColFormat = "COLLECTION ITEMS TERMINATED BY '\u0002'"
+  val tableMapFormat = "MAP KEYS TERMINATED BY '\u0003' STORED"
   val tableAs = "AS TextFile"

   val loadPath = s"'src/main/resources/hive_test_job_addresses.txt'"
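
A note on the escape change above: '\u0002' and '\u0003' are the same STX/ETX delimiter characters the old octal escapes '\002' and '\003' produced, rewritten as Unicode escapes because octal escape literals are deprecated in newer Scala versions (and the build now defaults to Scala 2.11). A minimal sketch, not project code, illustrating the equivalence:

object DelimiterEscapeSketch extends App {
  // The Unicode escapes denote the same control characters as the old octal escapes.
  val colSep = '\u0002'   // STX, formerly written '\002'
  val mapSep = '\u0003'   // ETX, formerly written '\003'
  assert(colSep.toInt == 2 && mapSep.toInt == 3)
  println(f"colSep=0x${colSep.toInt}%02X mapSep=0x${mapSep.toInt}%02X")   // colSep=0x02 mapSep=0x03
}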

job-server-extras/src/main/scala/spark/jobserver/context/HiveContextFactory.scala

Lines changed: 1 addition & 2 deletions
@@ -3,8 +3,7 @@ package spark.jobserver.context
 import com.typesafe.config.Config
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.sql.hive.HiveContext
-import spark.jobserver.{api, ContextLike, SparkHiveJob}
-import spark.jobserver.util.SparkJobUtils
+import spark.jobserver.{ContextLike, SparkHiveJob, api}

 class HiveContextFactory extends ScalaContextFactory {
   type C = HiveContext with ContextLike

job-server-extras/src/main/scala/spark/jobserver/python/JobEndpoint.scala

Lines changed: 9 additions & 9 deletions
@@ -2,7 +2,6 @@ package spark.jobserver.python

 import com.typesafe.config.{ConfigRenderOptions, Config}
 import org.apache.spark.SparkConf
-import spark.jobserver.api.JobEnvironment
 import scala.collection.JavaConverters._

 /**
@@ -15,14 +14,15 @@ import scala.collection.JavaConverters._
  * The Spark Job Server python subprocess assumes the endpoint to be an implementation of this Trait,
  * and attempts to access fields and methods accordingly.
  */
-case class JobEndpoint[C <: PythonContextLike](context: C,
-                                               sparkConf: SparkConf,
-                                               contextConfig: Config,
-                                               jobId: String,
-                                               jobConfig: Config,
-                                               jobClass: String,
-                                               py4JImports: Seq[String]
-                                              ){
+case class JobEndpoint[C <: PythonContextLike](
+    context: C,
+    sparkConf: SparkConf,
+    contextConfig: Config,
+    jobId: String,
+    jobConfig: Config,
+    jobClass: String,
+    py4JImports: Seq[String]
+  ){

   /**
     * @return The contextConfig, which is a Typesafe Config object, serialized to HOCON,

job-server-extras/src/main/scala/spark/jobserver/python/PythonJob.scala

Lines changed: 0 additions & 3 deletions
@@ -8,7 +8,6 @@ import spark.jobserver.api.{SparkJobBase, ValidationProblem, JobEnvironment}

 import scala.sys.process.{ProcessLogger, Process}
 import scala.util.{Failure, Success, Try}
-import scala.collection.JavaConverters._

 case class PythonJob[X <: PythonContextLike](eggPath: String,
                                              modulePath:String,
@@ -24,8 +23,6 @@ case class PythonJob[X <: PythonContextLike](eggPath: String,
     JobEndpoint(context, sparkConf, contextConfig, jobId, jobConfig, modulePath, py4JImports)
   }

-  def gateway(endpoint: JobEndpoint[C]): GatewayServer = new GatewayServer(endpoint, 0)
-
   /**
    *
    * To support a useful validate method here for Python jobs we would have call two python processes,