
Commit 130ddbf

Merge tag 'v0.6.2' into DSP-5934-0.6.2

Brings DSE branch up to 0.6.2. Marking as 0.6.2.100.

Conflicts:
    bin/server_package.sh
    bin/server_start.sh
    version.sbt

2 parents 1c0a4eb + dadd89a · commit 130ddbf

108 files changed (+5660, -1000 lines)


.gitignore

Lines changed: 10 additions & 1 deletion
@@ -13,4 +13,13 @@ config/*.conf
 config/*.sh
 job-server/config/*.conf
 job-server/config/*.sh
-metastore_db/
+metastore_db/
+
+#ignore generated config
+bin/ec2_example.sh
+
+# ignore spark-ec2 script
+ec2Cluster/
+
+# don't ignore the ec2 config and sh files
+!job-server/config/ec2.sh

.travis.yml

Lines changed: 5 additions & 2 deletions
@@ -3,5 +3,8 @@ env:
   global:
     _JAVA_OPTIONS="-Xmx1500m -XX:MaxPermSize=512m -Dakka.test.timefactor=3"
 scala:
-  - 2.10.4
-  - 2.11.6
+  - 2.10.6
+  - 2.11.8
+jdk:
+  - oraclejdk8
+  - oraclejdk7
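
The matrix now tests Scala 2.10.6 and 2.11.8 on both JDK 8 and JDK 7. To reproduce one cell of the matrix locally, the Scala version can be pinned on the sbt command line; a sketch, assuming sbt and a matching JDK are installed:

    _JAVA_OPTIONS="-Xmx1500m -XX:MaxPermSize=512m -Dakka.test.timefactor=3" sbt ++2.11.8 test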

README.md

Lines changed: 243 additions & 22 deletions
Large diffs are not rendered by default.

bin/ec2_deploy.sh

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+#!/bin/bash
+bin=`dirname "${BASH_SOURCE-$0}"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/../config/user-ec2-settings.sh
+
+#get spark deployment scripts if they haven't been downloaded and extracted yet
+SPARK_DIR=ec2Cluster
+if [ ! -d "$bin"/../$SPARK_DIR ]; then
+  mkdir "$bin"/../$SPARK_DIR
+  mkdir -p "$bin"/../$SPARK_DIR/deploy.generic/root/spark-ec2
+  wget -P "$bin"/../$SPARK_DIR/deploy.generic/root/spark-ec2 https://raw.githubusercontent.com/apache/spark/master/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh
+  wget -P "$bin"/../$SPARK_DIR https://raw.githubusercontent.com/apache/spark/master/ec2/spark_ec2.py
+  wget -P "$bin"/../$SPARK_DIR https://raw.githubusercontent.com/apache/spark/master/ec2/spark-ec2
+  chmod u+x "$bin"/../$SPARK_DIR/*
+fi
+
+#run spark-ec2 to start the ec2 cluster
+EC2DEPLOY="$bin"/../$SPARK_DIR/spark-ec2
+"$EC2DEPLOY" --copy-aws-credentials --key-pair=$KEY_PAIR --hadoop-major-version=yarn --identity-file=$SSH_KEY --region=us-east-1 --zone=us-east-1a --spark-version=$SPARK_VERSION --instance-type=$INSTANCE_TYPE --slaves $NUM_SLAVES launch $CLUSTER_NAME
+#There is only 1 deploy host. However, the variable is plural as that is how Spark Job Server named it.
+#To minimize changes, I left the variable name alone.
+export DEPLOY_HOSTS=$("$EC2DEPLOY" get-master $CLUSTER_NAME | tail -n1)
+
+#This line is a hack to edit the ec2.conf file so that the master option is correct. Since we are allowing Amazon to
+#dynamically allocate a url for the master node, we must update the configuration file in between cluster startup
+#and Job Server deployment
+cp "$bin"/../config/ec2.conf.template "$bin"/../config/ec2.conf
+sed -i -E "s/master = .*/master = \"spark:\/\/$DEPLOY_HOSTS:7077\"/g" "$bin"/../config/ec2.conf
+
+#also get ec2_example.sh right
+cp "$bin"/ec2_example.sh.template "$bin"/ec2_example.sh
+sed -i -E "s/DEPLOY_HOSTS=.*/DEPLOY_HOSTS=\"$DEPLOY_HOSTS:8090\"/g" "$bin"/ec2_example.sh
+
+#open all ports on the master so Spark Job Server works and you can see the results of your jobs
+aws ec2 authorize-security-group-ingress --group-name $CLUSTER_NAME-master --protocol tcp --port 0-65535 --cidr 0.0.0.0/0
+
+cd "$bin"/..
+bin/server_deploy.sh ec2
+ssh -o StrictHostKeyChecking=no -i "$SSH_KEY" root@$DEPLOY_HOSTS "(echo 'export AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID' >> spark/conf/spark-env.sh)"
+ssh -o StrictHostKeyChecking=no -i "$SSH_KEY" root@$DEPLOY_HOSTS "(echo 'export AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY' >> spark/conf/spark-env.sh)"
+ssh -o StrictHostKeyChecking=no -i "$SSH_KEY" root@$DEPLOY_HOSTS "(cd job-server; nohup ./server_start.sh < /dev/null &> /dev/null &)"
+echo "The Job Server is listening at $DEPLOY_HOSTS:8090"

bin/ec2_destroy.sh

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+#!/bin/bash
+bin=`dirname "${BASH_SOURCE-$0}"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/../config/user-ec2-settings.sh
+
+"$bin"/../ec2Cluster/spark-ec2 destroy $CLUSTER_NAME

bin/ec2_example.sh.template

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+DEPLOY_HOSTS=ENTER_DEPLOY_HOST_HERE
+bin=`dirname "${BASH_SOURCE-$0}"`
+bin=`cd "$bin"; pwd`
+
+. "$bin"/../config/ec2.sh
+
+ssh_key_to_use=""
+if [ -n "$SSH_KEY" ] ; then
+  ssh_key_to_use="-i $SSH_KEY"
+fi
+
+VERSION=$(sed -E 's/version in ThisBuild := "(.*)"/\1/' version.sbt)
+wget -O- --post-file "$bin"/../job-server-extras/target/scala-2.10/job-server-extras_2.10-$VERSION.jar "$DEPLOY_HOSTS/jars/km"
+scp -rp -o StrictHostKeyChecking=no $ssh_key_to_use "$bin"/../job-server-extras/src/main/KMeansExample/* ${APP_USER}@"${DEPLOY_HOSTS%:*}:/var/www/html/"
+
+echo "The example is running at ${DEPLOY_HOSTS%:*}:5080"

bin/manager_start.sh

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Script to start the job manager
+# args: <work dir for context> <cluster address> [proxy_user]
+set -e
+
+get_abs_script_path() {
+  pushd . >/dev/null
+  cd $(dirname $0)
+  appdir=$(pwd)
+  popd >/dev/null
+}
+get_abs_script_path
+
+. $appdir/setenv.sh
+
+# Override logging options to provide per-context logging
+LOGGING_OPTS="-Dlog4j.configuration=file:$appdir/log4j-server.properties
+  -DLOG_DIR=$1"
+
+GC_OPTS="-XX:+UseConcMarkSweepGC
+  -verbose:gc -XX:+PrintGCTimeStamps -Xloggc:$appdir/gc.out
+  -XX:MaxPermSize=512m
+  -XX:+CMSClassUnloadingEnabled "
+
+JAVA_OPTS="-XX:MaxDirectMemorySize=$MAX_DIRECT_MEMORY
+  -XX:+HeapDumpOnOutOfMemoryError -Djava.net.preferIPv4Stack=true"
+
+MAIN="spark.jobserver.JobManager"
+
+if [ ! -z $3 ]; then
+  cmd='$SPARK_HOME/bin/spark-submit --class $MAIN --driver-memory $JOBSERVER_MEMORY
+    --conf "spark.executor.extraJavaOptions=$LOGGING_OPTS"
+    --proxy-user $3
+    --driver-java-options "$GC_OPTS $JAVA_OPTS $LOGGING_OPTS $CONFIG_OVERRIDES"
+    $appdir/spark-job-server.jar $1 $2 $conffile'
+else
+  cmd='$SPARK_HOME/bin/spark-submit --class $MAIN --driver-memory $JOBSERVER_MEMORY
+    --conf "spark.executor.extraJavaOptions=$LOGGING_OPTS"
+    --driver-java-options "$GC_OPTS $JAVA_OPTS $LOGGING_OPTS $CONFIG_OVERRIDES"
+    $appdir/spark-job-server.jar $1 $2 $conffile'
+fi
+
+eval $cmd > /dev/null 2>&1 &
+# exec java -cp $CLASSPATH $GC_OPTS $JAVA_OPTS $LOGGING_OPTS $CONFIG_OVERRIDES $MAIN $1 $2 $conffile 2>&1 &
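
manager_start.sh is not run by hand in normal operation; the job server launches it to start one JVM per context. Per the header comment it takes a context work directory, a cluster address, and an optional proxy user; an illustrative invocation with made-up arguments:

    # hypothetical arguments, for illustration only
    ./manager_start.sh /tmp/jobserver/ctx-1 akka.tcp://JobServer@10.0.0.5:2552 alice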

bin/server_deploy.sh

Lines changed: 9 additions & 5 deletions
@@ -18,7 +18,7 @@ if [ ! -f "$configFile" ]; then
   echo "Could not find $configFile"
   exit 1
 fi
-. $configFile
+. "$configFile"
 
 majorRegex='([0-9]+\.[0-9]+)\.[0-9]+'
 if [[ $SCALA_VERSION =~ $majorRegex ]]
@@ -42,8 +42,10 @@ FILES="job-server-extras/target/scala-$majorVersion/spark-job-server.jar
   bin/server_start.sh
   bin/server_stop.sh
   bin/kill-process-tree.sh
+  bin/manager_start.sh
+  bin/setenv.sh
   $CONFIG_DIR/$ENV.conf
-  config/shiro.ini
+  config/shiro.ini
   config/log4j-server.properties"
 
 ssh_key_to_use=""
@@ -53,7 +55,9 @@ fi
 
 for host in $DEPLOY_HOSTS; do
   # We assume that the deploy user is APP_USER and has permissions
-  ssh $ssh_key_to_use ${APP_USER}@$host mkdir -p $INSTALL_DIR
-  scp $ssh_key_to_use $FILES ${APP_USER}@$host:$INSTALL_DIR/
-  scp $ssh_key_to_use $configFile ${APP_USER}@$host:$INSTALL_DIR/settings.sh
+  ssh -o StrictHostKeyChecking=no $ssh_key_to_use ${APP_USER}@$host mkdir -p $INSTALL_DIR
+  scp -o StrictHostKeyChecking=no $ssh_key_to_use $FILES ${APP_USER}@$host:$INSTALL_DIR/
+  scp -o StrictHostKeyChecking=no $ssh_key_to_use "$CONFIG_DIR/$ENV.conf" ${APP_USER}@$host:$INSTALL_DIR/
+  scp -o StrictHostKeyChecking=no $ssh_key_to_use "$configFile" ${APP_USER}@$host:$INSTALL_DIR/settings.sh
 done
+
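
server_deploy.sh reads its per-environment settings from $configFile, i.e. config/<env>.sh (config/ec2.sh when invoked by ec2_deploy.sh as server_deploy.sh ec2). That file is not rendered here; a plausible sketch based on the variables the script references, with illustrative values:

    # config/ec2.sh -- hypothetical sketch, illustrative values
    APP_USER=root                  # deploy user on the target hosts
    DEPLOY_HOSTS="ec2-xx-xx-xx-xx.compute-1.amazonaws.com"
    INSTALL_DIR=/root/job-server   # destination for the scp'd FILES list
    SCALA_VERSION=2.10.6           # parsed for the target/scala-2.10 build path
    SSH_KEY=$HOME/.ssh/jobserver-keypair.pem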

bin/server_package.sh

Lines changed: 4 additions & 1 deletion
@@ -45,8 +45,11 @@ FILES="job-server-extras/target/scala-$majorVersion/spark-job-server.jar
   bin/server_start.sh
   bin/server_stop.sh
   bin/kill-process-tree.sh
+  bin/manager_start.sh
+  bin/setenv.sh
   $CONFIG_DIR/$ENV.conf
-  config/logback-server.xml"
+  config/logback-server.xml
+  config/shiro.ini"
 
 rm -rf $WORK_DIR
 mkdir -p $WORK_DIR

bin/server_start.sh

Lines changed: 8 additions & 75 deletions
@@ -18,6 +18,8 @@ get_abs_script_path() {
 
 get_abs_script_path
 
+. $appdir/setenv.sh
+
 GC_OPTS="-XX:+UseConcMarkSweepGC
   -verbose:gc -XX:+PrintGCTimeStamps -Xloggc:$appdir/gc.out
   -XX:MaxPermSize=512m
@@ -26,7 +28,7 @@ GC_OPTS="-XX:+UseConcMarkSweepGC
 # To truly enable JMX in AWS and other containerized environments, also need to set
 # -Djava.rmi.server.hostname equal to the hostname in that environment. This is specific
 # depending on AWS vs GCE etc.
-JAVA_OPTS="-XX:MaxDirectMemorySize=512M \
+JAVA_OPTS="-XX:MaxDirectMemorySize=$MAX_DIRECT_MEMORY \
   -XX:+HeapDumpOnOutOfMemoryError -Djava.net.preferIPv4Stack=true \
   -Dcom.sun.management.jmxremote.port=9999 \
   -Dcom.sun.management.jmxremote.rmi.port=9999 \
@@ -35,89 +37,20 @@ JAVA_OPTS="-XX:MaxDirectMemorySize=512M \
 
 MAIN="spark.jobserver.JobServer"
 
-if [ -f "$JOBSERVER_CONFIG" ]; then
-  conffile="$JOBSERVER_CONFIG"
-else
-  conffile=$(ls -1 $appdir/*.conf | head -1)
-  if [ -z "$conffile" ]; then
-    echo "No configuration file found"
-    exit 1
-  fi
-fi
-
-if [ -f "$appdir/settings.sh" ]; then
-  . "$appdir/settings.sh"
-else
-  echo "Missing $appdir/settings.sh, exiting"
-  exit 1
-fi
-
-if [ -z "$SPARK_HOME" ]; then
-  echo "Please set SPARK_HOME or put it in $appdir/settings.sh first"
-  exit 1
-fi
-
-pidFilePath=$appdir/$PIDFILE
-
-if [ -f "$pidFilePath" ] && kill -0 "$(cat "$pidFilePath")"; then
+PIDFILE=$appdir/spark-jobserver.pid
+if [ -f "$PIDFILE" ] && kill -0 $(cat "$PIDFILE"); then
   echo 'Job server is already running'
   exit 1
 fi
 
-if [ -z "$LOG_DIR" ]; then
-  LOG_DIR=/tmp/job-server
-  echo "LOG_DIR empty; logging will go to $LOG_DIR"
-fi
-mkdir -p $LOG_DIR
-
-LOGGING_OPTS="-DLOG_DIR=$LOG_DIR"
-
-export SPARK_SUBMIT_LOGBACK_CONF_FILE="$appdir/logback-server.xml"
-
-# For Mesos
-CONFIG_OVERRIDES=""
-if [ -n "$SPARK_EXECUTOR_URI" ]; then
-  CONFIG_OVERRIDES="-Dspark.executor.uri=$SPARK_EXECUTOR_URI "
-fi
-# For Mesos/Marathon, use the passed-in port
-if [ "$PORT" != "" ]; then
-  CONFIG_OVERRIDES+="-Dspark.jobserver.port=$PORT "
-fi
-
-if [ -z "$JOBSERVER_MEMORY" ]; then
-  JOBSERVER_MEMORY=1G
-fi
-
-# This needs to be exported for standalone mode so drivers can connect to the Spark cluster
-export SPARK_HOME
-export YARN_CONF_DIR
-export HADOOP_CONF_DIR
-
-# Identify location of dse command
-DSE="/usr/bin/dse"
-if [ -z "$DSE_HOME" ]; then
-  if [ -e "$DSE" ]; then
-    export DSE_HOME=/usr/share/dse
-  fi
-fi
-if [ ! -e "$DSE" ]; then
-  if [ -e "$DSE_HOME"/bin/dse ]; then
-    DSE="$DSE_HOME"/bin/dse
-  else
-    echo "Cannot determine DSE_HOME, please set it manually to your DSE install directory"
-    exit 1
-  fi
-fi
-
-# Submit the job server
-cmd='$DSE spark-submit --class $MAIN --driver-memory $JOBSERVER_MEMORY
+cmd='$SPARK_HOME/bin/spark-submit --class $MAIN --driver-memory $JOBSERVER_MEMORY
   --conf "spark.executor.extraJavaOptions=$LOGGING_OPTS"
   --driver-java-options "$GC_OPTS $JAVA_OPTS $LOGGING_OPTS $CONFIG_OVERRIDES"
   $@ $appdir/spark-job-server.jar $conffile'
 
 if [ -z "$JOBSERVER_FG" ]; then
-  eval $cmd 2>&1 &
-  echo $! > $pidFilePath
+  eval $cmd > /dev/null 2>&1 < /dev/null &
+  echo $! > $PIDFILE
 else
   eval $cmd
 fi
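
Most of the logic removed here (config-file discovery, settings.sh sourcing, LOG_DIR and JOBSERVER_MEMORY defaults, Mesos CONFIG_OVERRIDES) moves into the new shared bin/setenv.sh, which both server_start.sh and manager_start.sh now source but which is not rendered on this page. A rough sketch of what it plausibly provides, inferred from the variables the two scripts consume:

    # bin/setenv.sh -- hypothetical sketch; the real file ships in this commit
    # pick a config file: $JOBSERVER_CONFIG if set, else the first *.conf in $appdir
    if [ -f "$JOBSERVER_CONFIG" ]; then
      conffile="$JOBSERVER_CONFIG"
    else
      conffile=$(ls -1 $appdir/*.conf | head -1)
    fi

    # deployment settings placed next to the jar as settings.sh by server_deploy.sh
    . "$appdir/settings.sh"

    # defaults consumed by server_start.sh and manager_start.sh
    JOBSERVER_MEMORY=${JOBSERVER_MEMORY:-1G}
    MAX_DIRECT_MEMORY=${MAX_DIRECT_MEMORY:-512M}
    LOG_DIR=${LOG_DIR:-/tmp/job-server}
    mkdir -p "$LOG_DIR"
    LOGGING_OPTS="-DLOG_DIR=$LOG_DIR"
    CONFIG_OVERRIDES=""
    export SPARK_HOME YARN_CONF_DIR HADOOP_CONF_DIR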
