
Commit ad8154f

Merge branch 'master' into pr/165
Conflicts: job-server/src/main/resources/application.conf
2 parents: 4b72274 + 59b8307

25 files changed (+249 / -61 lines)

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -11,4 +11,6 @@ test-reports/
 # ignore deployment configs
 config/*.conf
 config/*.sh
+job-server/config/*.conf
+job-server/config/*.sh
 metastore_db/

README.md

Lines changed: 10 additions & 1 deletion
@@ -17,11 +17,14 @@ See [Troubleshooting Tips](doc/troubleshooting.md) as well as [Yarn tips](doc/ya
 - Avenida.com
 - GumGum
 - Fuse Elements
+- Frontline Solvers
+- Aruba Networks
+- [Zed Worldwide](www.zed.com)

 ## Features

 - *"Spark as a Service"*: Simple REST interface for all aspects of job, context management
-- Support for Spark SQL and Hive Contexts/jobs and custom job contexts! See [Contexts](doc/contexts.md).
+- Support for Spark SQL, Hive, Streaming Contexts/jobs and custom job contexts! See [Contexts](doc/contexts.md).
 - Supports sub-second low-latency jobs via long-running job contexts
 - Start and stop job contexts for RDD sharing and low-latency jobs; change resources on restart
 - Kill running jobs via stop context
@@ -46,6 +49,8 @@ For release notes, look in the `notes/` directory. They should also be up on [l

 ## Quick start / development mode

+NOTE: This quick start guide uses SBT to run the job server and the included test jar, but the normal development process is to create a separate project for Job Server jobs and to deploy the job server to a Spark cluster. Please see the deployment section below for more details.
+
 You need to have [SBT](http://www.scala-sbt.org/release/docs/Getting-Started/Setup.html) installed.

 To set the current version, do something like this:
@@ -231,6 +236,8 @@ def validate(sc: SparkContext, config: Config): SparkJobValidation = {
    it to the remotes you have configured in `<environment>.sh`
 3. On the remote server, start it in the deployed directory with `server_start.sh` and stop it with `server_stop.sh`

+The `server_start.sh` script uses `spark-submit` under the hood and may be passed any of the standard extra arguments from `spark-submit`.
+
 NOTE: by default the assembly jar from `job-server-extras`, which includes support for SQLContext and HiveContext, is used. If you face issues with all the extra dependencies, consider modifying the install scripts to invoke `sbt job-server/assembly` instead, which doesn't include the extra dependencies.

 Note: to test out the deploy to a local staging dir, or package the job server for Mesos,
@@ -275,6 +282,8 @@ the REST API.
     DELETE /jobs/<jobId>       - Kills the specified job
     GET /jobs/<jobId>/config   - Gets the job configuration

+For details on the Typesafe config format used for input (JSON also works), see the [Typesafe Config docs](https://github.com/typesafehub/config).
+
 ### Context configuration

 A number of context-specific settings can be controlled when creating a context (POST /contexts) or running an
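For a concrete illustration of the Typesafe config input format referenced above: job input can be sent either as JSON or as HOCON text. A minimal hypothetical sketch, assuming a job that reads an `input.string` key (the key name is illustrative only, not something the job server itself defines):

```
# Hypothetical job input in Typesafe (HOCON) syntax; the equivalent
# JSON document is also accepted, per the note above.
input {
  string = "a b c a b"
}
```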

akka-app/src/ooyala.common.akka/AkkaTestUtils.scala

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ object AkkaTestUtils {
   // This is a var for now because we need to let people change it, and we can't pass this in as a param
   // because then we would change the API. If we have it as a default param, we can't have multiple methods
   // with the same name.
-  var timeout = 10 seconds
+  var timeout = 15 seconds

   def shutdownAndWait(actor: ActorRef) {
     if (actor != null) {

bin/server_start.sh

Lines changed: 12 additions & 14 deletions
@@ -1,5 +1,7 @@
 #!/bin/bash
 # Script to start the job server
+# Extra arguments will be spark-submit options, for example
+# ./server_start.sh --jars cassandra-spark-connector.jar
 set -e

 get_abs_script_path() {
@@ -16,7 +18,7 @@ GC_OPTS="-XX:+UseConcMarkSweepGC
          -XX:MaxPermSize=512m
          -XX:+CMSClassUnloadingEnabled "

-JAVA_OPTS="-Xmx5g -XX:MaxDirectMemorySize=512M
+JAVA_OPTS="-XX:MaxDirectMemorySize=512M
            -XX:+HeapDumpOnOutOfMemoryError -Djava.net.preferIPv4Stack=true
            -Dcom.sun.management.jmxremote.port=9999
            -Dcom.sun.management.jmxremote.authenticate=false
@@ -42,13 +44,6 @@ if [ -z "$SPARK_HOME" ]; then
   exit 1
 fi

-if [ -z "$SPARK_CONF_DIR" ]; then
-  SPARK_CONF_DIR=$SPARK_HOME/conf
-fi
-
-# Pull in other env vars in spark config, such as MESOS_NATIVE_LIBRARY
-. $SPARK_CONF_DIR/spark-env.sh
-
 pidFilePath=$appdir/$PIDFILE

 if [ -f "$pidFilePath" ] && kill -0 $(cat "$pidFilePath"); then
@@ -62,7 +57,7 @@ if [ -z "$LOG_DIR" ]; then
 fi
 mkdir -p $LOG_DIR

-LOGGING_OPTS="-Dlog4j.configuration=log4j-server.properties
+LOGGING_OPTS="-Dlog4j.configuration=file:$appdir/log4j-server.properties
               -DLOG_DIR=$LOG_DIR"

 # For Mesos
@@ -75,12 +70,15 @@ if [ "$PORT" != "" ]; then
   CONFIG_OVERRIDES+="-Dspark.jobserver.port=$PORT "
 fi

+if [ -z "$DRIVER_MEMORY" ]; then
+  DRIVER_MEMORY=1G
+fi
+
 # This needs to be exported for standalone mode so drivers can connect to the Spark cluster
 export SPARK_HOME

-# job server jar needs to appear first so its deps take higher priority
-# need to explicitly include app dir in classpath so logging configs can be found
-CLASSPATH="$appdir:$appdir/spark-job-server.jar:$($SPARK_HOME/bin/compute-classpath.sh)"
-
-exec java -cp $CLASSPATH $GC_OPTS $JAVA_OPTS $LOGGING_OPTS $CONFIG_OVERRIDES $MAIN $conffile 2>&1 &
+$SPARK_HOME/bin/spark-submit --class $MAIN --driver-memory $DRIVER_MEMORY \
+  --conf "spark.executor.extraJavaOptions=$LOGGING_OPTS" \
+  --driver-java-options "$GC_OPTS $JAVA_OPTS $LOGGING_OPTS $CONFIG_OVERRIDES" \
+  $@ $appdir/spark-job-server.jar $conffile 2>&1 &
 echo $! > $pidFilePath

doc/contexts.md

Lines changed: 10 additions & 1 deletion
@@ -38,4 +38,13 @@ This can be done easily by extending the `SparkContextFactory` trait, like `SQLC

 ## Jars

-If you wish to use the `SQLContext` or `HiveContext`, be sure to pull down the job-server-extras package.
+If you wish to use the `SQLContext` or `HiveContext`, be sure to pull down the job-server-extras package.
+
+## StreamingContext
+
+`job-server-extras` provides a context to run Spark Streaming jobs. There are a few settings you can change in the job server's .conf file:
+
+* `streaming.batch_interval`: the streaming batch interval in millis
+* `streaming.stopGracefully`: if true, stops gracefully by waiting for the processing of all received data to be completed
+* `streaming.stopSparkContext`: if true, stops the SparkContext together with the StreamingContext; the SparkContext will be stopped regardless of whether the StreamingContext has been started
+
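For illustration, the streaming settings listed above might look like this in the job server's .conf file; the values are placeholders and the enclosing section should be verified against the shipped application.conf rather than taken from this sketch:

```
# Illustrative values only; key names come from doc/contexts.md above.
streaming {
  batch_interval = 1000     # streaming batch interval, in milliseconds
  stopGracefully = true     # wait for in-flight data to finish processing before stopping
  stopSparkContext = true   # also stop the underlying SparkContext
}
```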

doc/troubleshooting.md

Lines changed: 17 additions & 0 deletions
@@ -16,6 +16,19 @@ send timeout param along with your request (in secs). eg below.
 http://devsparkcluster.cloudapp.net/jobs?appName=job-server-tests&classPath=spark.jobserver.WordCountExample&sync=true&timeout=20
 ```

+You may need to adjust Spray's default request timeout and idle timeout, which default to 40 secs and 60 secs respectively. To do this, modify the configuration file in your deployed job server, adding a section like the following:
+
+```
+spray.can.server {
+  idle-timeout = 210 s
+  request-timeout = 200 s
+}
+```
+
+Then simply restart the job server.
+
+Note that the idle-timeout must be higher than the request-timeout, or Spray and the job server won't start.
+
 ## Job server won't start / cannot bind to 0.0.0.0:8090

 Check that another process isn't already using that port. If it is, you may want to start it on another port:
@@ -33,6 +46,10 @@ after this fixed, I can run jobs submitted from a remote job server successfully

 (Thanks to @pcliu)

+## Exception in thread "main" java.lang.NoSuchMethodError: akka.actor.ActorRefFactory.dispatcher()Lscala/concurrent/ExecutionContextExecutor;
+
+If you are running CDH 5.3 or older, you may have an incompatible version of Akka bundled with it. :( Try modifying the version of Akka included with spark-jobserver to match the one in CDH (2.2.4, I think), or upgrade to CDH 5.4. If you are on CDH 5.4, check that `sparkVersion` in `Dependencies.scala` matches CDH. Or see [issue #154](https://github.com/spark-jobserver/spark-jobserver/issues/154).
+
 ## I want to run job-server on Windows

 1. Create directory `C:\Hadoop\bin`
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+package spark.jobserver
+
+import org.apache.spark.streaming.StreamingContext
+
+/**
+ * Defines a Job that runs on a [[StreamingContext]]. Note that
+ * these jobs are usually long-running and there is (yet) no way in Spark
+ * Job Server to query the status of these jobs.
+ */
+trait SparkStramingJob extends SparkJobBase {
+  type C = StreamingContext
+}

job-server-extras/src/spark.jobserver/SqlTestJob.scala

Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,6 @@
 package spark.jobserver

-import com.typesafe.config.{Config, ConfigFactory}
-import org.apache.spark._
+import com.typesafe.config.Config
 import org.apache.spark.sql.SQLContext

 /**
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+package spark.jobserver
+
+import com.google.common.annotations.VisibleForTesting
+import com.typesafe.config.Config
+import org.apache.spark.rdd.RDD
+import org.apache.spark.streaming.StreamingContext
+
+import scala.collection.mutable
+
+@VisibleForTesting
+object StreamingTestJob extends SparkStramingJob {
+  def validate(ssc: StreamingContext, config: Config): SparkJobValidation = SparkJobValid
+
+
+  def runJob(ssc: StreamingContext, config: Config): Any = {
+    val queue = mutable.Queue[RDD[String]]()
+    queue += ssc.sparkContext.makeRDD(Seq("123", "test", "test2"))
+    val lines = ssc.queueStream(queue)
+    val words = lines.flatMap(_.split(" "))
+    val pairs = words.map(word => (word, 1))
+    val wordCounts = pairs.reduceByKey(_ + _)
+    // do something with the stream; here we just print each micro-batch's count
+    wordCounts.foreachRDD(rdd => println(rdd.count()))
+    ssc.start()
+    ssc.awaitTermination()
+  }
+}

job-server-extras/src/spark.jobserver/context/HiveContextFactory.scala

Lines changed: 2 additions & 5 deletions
@@ -7,13 +7,10 @@ import spark.jobserver.{ContextLike, SparkHiveJob, SparkJobBase}
 import spark.jobserver.util.SparkJobUtils

 class HiveContextFactory extends SparkContextFactory {
-  import SparkJobUtils._
-
   type C = HiveContext with ContextLike

-  def makeContext(config: Config, contextConfig: Config, contextName: String): C = {
-    val conf = configToSparkConf(config, contextConfig, contextName)
-    contextFactory(conf)
+  def makeContext(sparkConf: SparkConf, config: Config, contextName: String): C = {
+    contextFactory(sparkConf)
   }

   protected def contextFactory(conf: SparkConf): C = {
