Commit e7bf58a

More updates for getting cluster mode working with 1.5.0

1 parent a8c7b39 commit e7bf58a

File tree

9 files changed: +53 −100 lines

cluster_setup.md

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+Our cluster configuration uses Docker host networking. A series of scripts brings up the containers that make up the cluster; you will likely need to tailor these scripts to your configuration.
+
+We provide several scripts:
+spark/docker/start_master_host.sh brings up the Spark master container using host networking.
+spark/docker/start_worker_host.sh brings up the Spark worker container using host networking.
+spark/docker/start_launcher_host.sh brings up the Spark launcher container using host networking. This is the container from which run_tpch.sh launches the benchmark.
+dikeHDFS/start_server_host.sh brings up the container with HDFS and NDP.
+
+The config file spark/spark.config holds the addresses and hostnames needed by the scripts above. You need to modify it for your configuration; there is an example in our repo.
+
+You also need to configure dikeHDFS/start_server_host.sh with your IP address: change the line containing --add-host=dikehdfs to include your storage server's IP address.
+
+As an example, in our configuration we typically follow this sequence:
+1) From the master node, run start_master_host.sh and start_launcher_host.sh.
+2) On each worker node, run start_worker_host.sh 1 8.
+3) Note that the "1 8" above is the number of workers followed by the number of cores to use.
+4) Launch the NDP server via dikeHDFS/start_server_host.sh.
+
+
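The scripts below read spark.config as simple KEY=value lines (they grep for DOCKER_HOSTS, LAUNCHER_IP, and WORKER_IP). A minimal sketch of such a file; the hostnames and addresses here are placeholders, not values from the repo's example:

```shell
# spark/spark.config -- hypothetical values; substitute your own hosts.
# DOCKER_HOSTS is a comma-separated list of host:ip pairs that the
# scripts expand into docker --add-host flags.
DOCKER_HOSTS=sparkmaster:192.168.1.10,sparkworker:192.168.1.11,dikehdfs:192.168.1.12
LAUNCHER_IP=192.168.1.10
WORKER_IP=192.168.1.11
```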

demo.sh

Lines changed: 10 additions & 10 deletions
@@ -4,22 +4,22 @@
 printf "\nNext Test: Spark TPC-H query with HDFS storage and with no pushdown\n"
 read -n 1 -s -r -p "Press any key to continue with test."
 cd benchmark/tpch
-./run_tpch.sh -t 6 -ds ndp --protocol ndphdfs
+./run_tpch.sh --local -t 6 -ds ndp --protocol ndphdfs
 printf "\nTest Complete: Spark TPC-H query with HDFS storage and with no pushdown\n"
 
 printf "\nNext Test: Spark TPC-H query with HDFS storage and with pushdown enabled.\n"
 read -n 1 -s -r -p "Press any key to continue with test."
-./run_tpch.sh -t 6 -ds ndp --protocol ndphdfs --pushdown
+./run_tpch.sh --local -t 6 -ds ndp --protocol ndphdfs --pushdown
 printf "\nTest Complete: Spark TPC-H query with HDFS storage and with pushdown enabled.\n"
 
 
 
-printf "\nNext Test: Spark TPC-H query with S3 storage and with no pushdown\n"
-read -n 1 -s -r -p "Press any key to continue with test."
-./run_tpch.sh -t 6 -ds ndp --protocol s3
-printf "Test Complete: Spark TPC-H query with S3 storage and with no pushdown\n"
+#printf "\nNext Test: Spark TPC-H query with S3 storage and with no pushdown\n"
+#read -n 1 -s -r -p "Press any key to continue with test."
+#./run_tpch.sh --local -t 6 -ds ndp --protocol s3
+#printf "Test Complete: Spark TPC-H query with S3 storage and with no pushdown\n"
 
-printf "\nNext Test: Spark TPC-H query with S3 and with pushdown enabled.\n"
-read -n 1 -s -r -p "Press any key to continue with test."
-./run_tpch.sh -t 6 -ds ndp --protocol s3 --pushdown
-printf "\nTest Complete: Spark TPC-H query with S3 and with pushdown enabled.\n"
+#printf "\nNext Test: Spark TPC-H query with S3 and with pushdown enabled.\n"
+#read -n 1 -s -r -p "Press any key to continue with test."
+#./run_tpch.sh --local -t 6 -ds ndp --protocol s3 --pushdown
+#printf "\nTest Complete: Spark TPC-H query with S3 and with pushdown enabled.\n"

dikeHDFS

spark/docker/start-launcher.sh

Lines changed: 5 additions & 59 deletions
@@ -11,78 +11,25 @@ rm -f "${ROOT_DIR}/volume/status/MASTER*"
 
 CMD="sleep 365d"
 RUNNING_MODE="daemon"
-START_LOCAL="NO"
-if [ ! -d spark.config ]; then
-  START_LOCAL="YES"
-else
-  DOCKER_HOSTS="$(cat spark.config | grep DOCKER_HOSTS)"
-  IFS='=' read -a IP_ARRAY <<< "$DOCKER_HOSTS"
-  DOCKER_HOSTS=${IP_ARRAY[1]}
-  HOSTS=""
-  IFS=',' read -a IP_ARRAY <<< "$DOCKER_HOSTS"
-  for i in "${IP_ARRAY[@]}"
-  do
-    HOSTS="$HOSTS --add-host=$i"
-  done
-  DOCKER_HOSTS=$HOSTS
-  echo "Docker Hosts: $DOCKER_HOSTS"
 
-  LAUNCHER_IP="$(cat spark.config | grep LAUNCHER_IP)"
-  IFS='=' read -a IP_ARRAY <<< "$LAUNCHER_IP"
-  LAUNCHER_IP=${IP_ARRAY[1]}
-  echo "LAUNCHER_IP: $LAUNCHER_IP"
-fi
-DOCKER_ID=""
 if [ $RUNNING_MODE = "interactive" ]; then
   DOCKER_IT="-i -t"
 fi
 # --cpuset-cpus="9-12" \
-if [ ${START_LOCAL} == "YES" ]; then
-DOCKER_RUN="docker run ${DOCKER_IT} --rm \
+DOCKER_RUN="docker run ${DOCKER_IT} --rm \
 -p 5006:5006 \
 --name sparklauncher \
 --network dike-net \
 -e MASTER=spark://sparkmaster:7077 \
 -e SPARK_CONF_DIR=/conf \
 -e SPARK_PUBLIC_DNS=localhost \
---mount type=bind,source=$(pwd)/spark,target=/spark \
---mount type=bind,source=$(pwd)/build,target=/build \
---mount type=bind,source=$(pwd)/examples,target=/examples \
---mount type=bind,source=$(pwd)/../data,target=/tpch-data \
---mount type=bind,source=$(pwd)/../dikeHDFS,target=/dikeHDFS \
---mount type=bind,source=$(pwd)/../benchmark/tpch,target=/tpch \
---mount type=bind,source=$(pwd)/../pyNdp,target=/pyNdp \
---mount type=bind,source=$(pwd)/../pushdown-datasource/pushdown-datasource,target=/pushdown-datasource \
--v $(pwd)/conf/master:/conf \
--v ${ROOT_DIR}/build/.m2:${DOCKER_HOME_DIR}/.m2 \
--v ${ROOT_DIR}/build/.gnupg:${DOCKER_HOME_DIR}/.gnupg \
--v ${ROOT_DIR}/build/.sbt:${DOCKER_HOME_DIR}/.sbt \
--v ${ROOT_DIR}/build/.cache:${DOCKER_HOME_DIR}/.cache \
--v ${ROOT_DIR}/build/.ivy2:${DOCKER_HOME_DIR}/.ivy2 \
--v ${ROOT_DIR}/volume/status:/opt/volume/status \
--v ${ROOT_DIR}/volume/logs:/opt/volume/logs \
--v ${ROOT_DIR}/bin/:${DOCKER_HOME_DIR}/bin \
--e "AWS_ACCESS_KEY_ID=${USER_NAME}" \
--e "AWS_SECRET_ACCESS_KEY=admin123" \
--e "AWS_EC2_METADATA_DISABLED=true" \
--e RUNNING_MODE=${RUNNING_MODE} \
--u ${USER_ID} \
-spark-run-${USER_NAME} ${CMD}"
-else
-DOCKER_RUN="docker run ${DOCKER_IT} --rm \
--p 5006:5006 \
---name sparklauncher \
---network dike-net --ip ${LAUNCHER_IP} ${DOCKER_HOSTS} \
--e MASTER=spark://sparkmaster:7077 \
--e SPARK_CONF_DIR=/conf \
--e SPARK_PUBLIC_DNS=localhost \
 -e SPARK_MASTER="spark://sparkmaster:7077" \
--e SPARK_DRIVER_HOST=${LAUNCHER_IP} \
 --mount type=bind,source=$(pwd)/spark,target=/spark \
 --mount type=bind,source=$(pwd)/build,target=/build \
 --mount type=bind,source=$(pwd)/examples,target=/examples \
 --mount type=bind,source=$(pwd)/../dikeHDFS,target=/dikeHDFS \
 --mount type=bind,source=$(pwd)/../benchmark/tpch,target=/tpch \
+--mount type=bind,source=$(pwd)/../data,target=/tpch-data \
 --mount type=bind,source=$(pwd)/../pushdown-datasource/pushdown-datasource,target=/pushdown-datasource \
 -v $(pwd)/conf/master:/conf \
 -v ${ROOT_DIR}/build/.m2:${DOCKER_HOME_DIR}/.m2 \
@@ -98,11 +45,10 @@ else
 -e "AWS_EC2_METADATA_DISABLED=true" \
 -e RUNNING_MODE=${RUNNING_MODE} \
 -u ${USER_ID} \
-spark-run-${USER_NAME} ${CMD}"
-fi
-echo "mode: $RUNNING_MODE"
+v${DIKE_VERSION}-spark-run-${USER_NAME} ${CMD}"
+
 if [ $RUNNING_MODE = "interactive" ]; then
     eval "${DOCKER_RUN}"
 else
     eval "${DOCKER_RUN}" &
-fi
+fi
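The launcher (and, below, the worker) scripts previously expanded the comma-separated DOCKER_HOSTS value from spark.config into one --add-host flag per entry; this commit removes that parsing. A standalone sketch of what that expansion did, using a made-up host list:

```shell
#!/bin/bash
# Expand a comma-separated host:ip list into docker --add-host flags,
# as the removed spark.config parsing did. Host values here are made up.
CONFIG_LINE="DOCKER_HOSTS=master:10.0.0.1,worker1:10.0.0.2"

IFS='=' read -r -a KV <<< "$CONFIG_LINE"          # split KEY=value
DOCKER_HOSTS="${KV[1]}"

HOSTS=""
IFS=',' read -r -a IP_ARRAY <<< "$DOCKER_HOSTS"   # split the comma list
for i in "${IP_ARRAY[@]}"; do
    HOSTS="$HOSTS --add-host=$i"                  # one flag per host entry
done

echo "$HOSTS"
```

With host networking the containers resolve each other through the host's network directly, so these per-container --add-host entries (and static --ip assignments) are no longer needed.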

spark/docker/start-master.sh

Lines changed: 5 additions & 4 deletions
@@ -1,6 +1,7 @@
 #!/bin/bash
 
 # Include the setup for our cached local directories. (.m2, .ivy2, etc)
+source docker/spark_version
 source docker/setup.sh
 
 mkdir -p "${ROOT_DIR}/volume/logs"
@@ -37,8 +38,8 @@ else
     fi
 fi
 echo "removing work and logs"
-rm -rf build/spark-3.1.2/work/
-rm -rf build/spark-3.1.2/logs/
+rm -rf build/spark-$SPARK_VERSION/work/
+rm -rf build/spark-$SPARK_VERSION/logs/
 
 # --cpuset-cpus="9-12" \
 if [ ${START_LOCAL} == "YES" ]; then
@@ -67,7 +68,7 @@ if [ ${START_LOCAL} == "YES" ]; then
 -v ${ROOT_DIR}/bin/:${DOCKER_HOME_DIR}/bin \
 -e RUNNING_MODE=${RUNNING_MODE} \
 -u ${USER_ID} \
-spark-run-${USER_NAME} ${CMD}"
+v${DIKE_VERSION}-spark-run-${USER_NAME} ${CMD}"
 else
 DOCKER_RUN="docker run ${DOCKER_IT} --rm \
 -p 4040:4040 -p 6066:6066 -p 7077:7077 -p 8080:8080 -p 5005:5005 -p 18080:18080 \
@@ -98,7 +99,7 @@ else
 -e "AWS_EC2_METADATA_DISABLED=true" \
 -e RUNNING_MODE=${RUNNING_MODE} \
 -u ${USER_ID} \
-spark-run-${USER_NAME} ${CMD}"
+v${DIKE_VERSION}-spark-run-${USER_NAME} ${CMD}"
 fi
 if [ $RUNNING_MODE = "interactive" ]; then
     eval "${DOCKER_RUN}"
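The docker/spark_version file sourced above is not shown in this diff. Based on how the scripts use it, it presumably defines SPARK_VERSION (used in the build/spark-$SPARK_VERSION paths, replacing the hard-coded 3.1.2) and DIKE_VERSION (used in the v${DIKE_VERSION}-spark-run-${USER_NAME} image tag; 1.5.0 per the commit message). A hypothetical sketch:

```shell
# docker/spark_version -- hypothetical contents, inferred from usage.
# The scripts reference build/spark-$SPARK_VERSION and image tags of the
# form v${DIKE_VERSION}-spark-run-${USER_NAME} after sourcing this file.
SPARK_VERSION=3.1.2
DIKE_VERSION=1.5.0
```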

spark/docker/start-worker-host.sh

Lines changed: 3 additions & 3 deletions
@@ -1,5 +1,5 @@
 #!/bin/bash
-
+source docker/spark_version
 source docker/setup.sh
 
 mkdir -p "${ROOT_DIR}/volume/logs"
@@ -20,8 +20,8 @@ if [ "$#" -ge 2 ] ; then
     CORES=$2
 fi
 echo "removing work and logs"
-rm -rf build/spark-3.1.2/work/
-rm -rf build/spark-3.1.2/logs/
+rm -rf build/spark-$SPARK_VERSION/work/
+rm -rf build/spark-$SPARK_VERSION/logs/
 
 echo "Workers: $WORKERS"
 echo "Cores: $CORES"

spark/docker/start-worker.sh

Lines changed: 5 additions & 21 deletions
@@ -1,5 +1,5 @@
 #!/bin/bash
-
+source docker/spark_version
 source docker/setup.sh
 
 mkdir -p "${ROOT_DIR}/volume/logs"
@@ -20,27 +20,11 @@ if [ "$#" -ge 2 ] ; then
     CORES=$2
 fi
 echo "removing work and logs"
-rm -rf build/spark-3.1.2/work/
-rm -rf build/spark-3.1.2/logs/
+rm -rf build/spark-$SPARK_VERSION/work/
+rm -rf build/spark-$SPARK_VERSION/logs/
 
 echo "Workers: $WORKERS"
 echo "Cores: $CORES"
-DOCKER_HOSTS="$(cat spark.config | grep DOCKER_HOSTS)"
-IFS='=' read -a IP_ARRAY <<< "$DOCKER_HOSTS"
-DOCKER_HOSTS=${IP_ARRAY[1]}
-HOSTS=""
-IFS=',' read -a IP_ARRAY <<< "$DOCKER_HOSTS"
-for i in "${IP_ARRAY[@]}"
-do
-  HOSTS="$HOSTS --add-host=$i"
-done
-DOCKER_HOSTS=$HOSTS
-echo "Docker Hosts: $DOCKER_HOSTS"
-
-WORKER_IP="$(cat spark.config | grep WORKER_IP)"
-IFS='=' read -a IP_ARRAY <<< "$WORKER_IP"
-WORKER_IP=${IP_ARRAY[1]}
-echo "WORKER_IP: $WORKER_IP"
 
 if [ $RUNNING_MODE = "interactive" ]; then
     DOCKER_IT="-i -t"
@@ -50,7 +34,7 @@ fi
 DOCKER_RUN="docker run ${DOCKER_IT} --rm -p 8081:8081 \
 --expose 7012 --expose 7013 --expose 7014 --expose 7015 --expose 8881 \
 --name sparkworker \
---network dike-net --ip ${WORKER_IP} ${DOCKER_HOSTS} \
+--network dike-net \
 -e SPARK_CONF_DIR=/conf \
 -e SPARK_WORKER_INSTANCES=$WORKERS \
 -e SPARK_WORKER_CORES=$CORES \
@@ -72,7 +56,7 @@ DOCKER_RUN="docker run ${DOCKER_IT} --rm -p 8081:8081 \
 -v ${ROOT_DIR}/bin/:${DOCKER_HOME_DIR}/bin \
 -e RUNNING_MODE=${RUNNING_MODE} \
 -u ${USER_ID} \
-spark-run-${USER_NAME} ${CMD}"
+v${DIKE_VERSION}-spark-run-${USER_NAME} ${CMD}"
 
 
 if [ $RUNNING_MODE = "interactive" ]; then

spark/start.sh

Lines changed: 4 additions & 1 deletion
@@ -2,4 +2,7 @@
 
 ./docker/start-master.sh && sleep 5 && ./docker/start-worker.sh
 
-sleep 5
+sleep 5
+./docker/start-launcher.sh
+
+sleep 5

start_hdfs.sh

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ echo $CMDSTATUS
 if [ $CMDSTATUS -ne 0 ]; then
     pushd benchmark/tpch
     echo "Initialize tpch CSV database in hdfs"
-    ./run_tpch.sh --mode initCsv --protocol hdfs || (echo "*** failed tpch init of CSV for hdfs $?" ; exit 1)
+    ./run_tpch.sh --local --mode initCsv --protocol hdfs || (echo "*** failed tpch init of CSV for hdfs $?" ; exit 1)
 popd
 fi
 
