#!/bin/bash

# GLOBAL settings
SPARK_HOME=${SPARK_HOME:-"/fsx/bigcode/spark/spark-3.5.0-bin-hadoop3"}
SCRATCH=${SCRATCH:-"/scratch/$USER"}
SPARK_MASTER_PORT=${SPARK_MASTER_PORT:-7077}
SPARK_MASTER_WEBUI_PORT=${SPARK_MASTER_WEBUI_PORT:-8080}

# Resources for Spark's daemon processes (master and worker) on each node
SPARK_DAEMON_CORES=1
SPARK_DAEMON_MEMORY=1024    # MB
# Resources for the application's driver process (PySpark's main()) on the master node
SPARK_DRIVER_CORES=2
SPARK_DRIVER_MEMORY=16192   # MB

PYSPARK_PYTHON=${PYSPARK_PYTHON:-$(which python)}
PYSPARK_DRIVER_PYTHON=${PYSPARK_DRIVER_PYTHON:-$(which python)}
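
# All of the defaults above use the ${VAR:-default} pattern, so they can be
# overridden from the calling environment before this file is sourced,
# e.g. (illustrative values):
#
#   export SPARK_HOME=/opt/spark/spark-3.5.0-bin-hadoop3
#   export SPARK_MASTER_PORT=7177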


function spark-start() {
    # Verify that we are inside a SLURM allocation
    if [[ -z "${SLURM_JOB_NAME}" || -z "${SLURM_CPUS_PER_TASK}" || -z "${SLURM_MEM_PER_CPU}" || -z "${SLURM_JOB_NUM_NODES}" ]]; then
        echo "Error: some required SLURM environment variables are missing."
        echo "This script should only be run within a SLURM job."
        echo "SLURM_JOB_NAME: ${SLURM_JOB_NAME}"
        echo "SLURM_JOB_NUM_NODES: ${SLURM_JOB_NUM_NODES}"
        echo "SLURM_CPUS_PER_TASK: ${SLURM_CPUS_PER_TASK}"
        echo "SLURM_MEM_PER_CPU: ${SLURM_MEM_PER_CPU}"
        exit 1
    fi

    # Access to spark-submit
    export PATH="$SPARK_HOME/bin:$PATH"

    # Initialize Spark WORKER, CONF, LOG and TMP dirs
    SPARK_WORK_DIR=${SCRATCH}/spark/${SLURM_JOB_NAME##*/}_${SLURM_JOB_ID}
    SPARK_WORKER_DIR=${SPARK_WORK_DIR}
    SPARK_CONF_DIR=${SPARK_WORK_DIR}/conf
    SPARK_LOG_DIR=${SPARK_WORK_DIR}/log
    SPARK_LOCAL_DIRS=${SPARK_WORK_DIR}/tmp

    # Create the directories on every node of the allocation
    srun -l mkdir -p "${SPARK_WORK_DIR}" "${SPARK_CONF_DIR}" "${SPARK_LOG_DIR}" "${SPARK_LOCAL_DIRS}" \
        && srun -l chmod -R 766 "${SPARK_WORK_DIR}"

    # The first node of the allocation hosts the Spark master
    SPARK_MASTER_HOST=$(scontrol show hostnames "${SLURM_NODELIST}" | head -n 1)
    export SPARK_URL="spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT}"

    # The driver runs only on the master node, so reserve its cores and memory
    # there; the extra 2048 MB is left as headroom on every node
    MASTER_COMPUTE_CORES=$((SLURM_CPUS_PER_TASK - SPARK_DAEMON_CORES - SPARK_DRIVER_CORES))
    MASTER_COMPUTE_MEMORY=$((SLURM_MEM_PER_CPU * SLURM_CPUS_PER_TASK - SPARK_DAEMON_MEMORY - SPARK_DRIVER_MEMORY - 2048))
    # The resources available on the rest of the nodes
    WORKER_COMPUTE_CORES=$((SLURM_CPUS_PER_TASK - SPARK_DAEMON_CORES))
    WORKER_COMPUTE_MEMORY=$((SLURM_MEM_PER_CPU * SLURM_CPUS_PER_TASK - SPARK_DAEMON_MEMORY - 2048))
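
    # Worked example of the arithmetic above (illustrative numbers, not a
    # recommendation): with SLURM_CPUS_PER_TASK=48 and SLURM_MEM_PER_CPU=4000 (MB),
    # the master node's worker gets 48 - 1 - 2 = 45 cores and
    # 48 * 4000 - 1024 - 16192 - 2048 = 172736 MB, while every other node's
    # worker gets 48 - 1 = 47 cores and 48 * 4000 - 1024 - 2048 = 188928 MB.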

    export SPARK_DEFAULTS="${SPARK_CONF_DIR}/spark-defaults.conf"
    cat << EOF > "${SPARK_DEFAULTS}.tmp"
spark.master ${SPARK_URL}
spark.submit.deployMode client
spark.ui.showConsoleProgress false
spark.ui.enabled true
spark.jars.packages org.apache.hadoop:hadoop-aws:3.3.4,org.apache.spark:spark-hadoop-cloud_2.12:3.5.0

# S3A "magic" committer settings for writing directly to S3
spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
spark.hadoop.mapreduce.outputcommitter.factory.scheme.s3a org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory
spark.hadoop.fs.s3a.committer.name magic
spark.hadoop.fs.s3a.committer.magic.enabled true
spark.hadoop.fs.s3a.committer.threads ${SLURM_CPUS_PER_TASK}
spark.hadoop.fs.s3a.buffer.dir ${SPARK_LOCAL_DIRS}/s3a

spark.local.dir ${SPARK_LOCAL_DIRS}
spark.sql.warehouse.dir ${SPARK_LOCAL_DIRS}/warehouse
spark.sql.autoBroadcastJoinThreshold -1

spark.driver.maxResultSize 8192m
spark.driver.memory ${SPARK_DRIVER_MEMORY}m
spark.executor.memory ${MASTER_COMPUTE_MEMORY}m
spark.network.timeout 1200s
spark.port.maxRetries 100
spark.task.maxFailures 100
EOF
    # Broadcast the config file to the same path on every node
    sbcast "${SPARK_DEFAULTS}.tmp" "${SPARK_DEFAULTS}"

    # Generate the per-node launcher; runtime expansions (\$!, \${SLURM_PROCID})
    # are escaped so they are evaluated on each node, not while writing this file
    export SPARK_LAUNCHER=${SPARK_WORK_DIR}/spark-launcher.sh
    cat << EOF > "${SPARK_LAUNCHER}.tmp"
#!/bin/bash
export SPARK_HOME=${SPARK_HOME}
export SPARK_WORKER_DIR=${SPARK_WORKER_DIR}
export SPARK_LOG_DIR=${SPARK_LOG_DIR}
export SPARK_LOCAL_DIRS=${SPARK_LOCAL_DIRS}
export SPARK_CONF_DIR=${SPARK_CONF_DIR}

export SPARK_MASTER_HOST=${SPARK_MASTER_HOST}

export SPARK_DAEMON_CORES=${SPARK_DAEMON_CORES}
export SPARK_DAEMON_MEMORY=${SPARK_DAEMON_MEMORY}m
export SPARK_DRIVER_CORES=${SPARK_DRIVER_CORES}
export SPARK_DRIVER_MEMORY=${SPARK_DRIVER_MEMORY}m

export PYSPARK_PYTHON=${PYSPARK_PYTHON}
export PYSPARK_DRIVER_PYTHON=${PYSPARK_DRIVER_PYTHON}

source "${SPARK_HOME}/sbin/spark-config.sh"
source "${SPARK_HOME}/bin/load-spark-env.sh"

# Task rank 0 hosts the master; every rank (including 0) runs a worker
if [[ \${SLURM_PROCID} -eq 0 ]]; then
    # Start a master and a worker on the same node
    export SPARK_WORKER_CORES=${MASTER_COMPUTE_CORES}
    export SPARK_WORKER_MEMORY=${MASTER_COMPUTE_MEMORY}m

    "${SPARK_HOME}/bin/spark-class" org.apache.spark.deploy.master.Master &> "${SPARK_LOG_DIR}/spark-master.log" &
    MASTER_PID=\$!
    "${SPARK_HOME}/bin/spark-class" org.apache.spark.deploy.worker.Worker ${SPARK_URL} &> "${SPARK_LOG_DIR}/spark-worker.log" &
    WORKER_PID=\$!
    wait \$MASTER_PID \$WORKER_PID
else
    # Start a worker only
    export SPARK_WORKER_CORES=${WORKER_COMPUTE_CORES}
    export SPARK_WORKER_MEMORY=${WORKER_COMPUTE_MEMORY}m

    "${SPARK_HOME}/bin/spark-class" org.apache.spark.deploy.worker.Worker ${SPARK_URL} &> "${SPARK_LOG_DIR}/spark-worker.log" &
    WORKER_PID=\$!
    wait \$WORKER_PID
fi
EOF
    chmod +x "${SPARK_LAUNCHER}.tmp"
    sbcast "${SPARK_LAUNCHER}.tmp" "${SPARK_LAUNCHER}"

    # Launch the Spark daemons on every node of the allocation (asynchronously)
    srun --label --export=ALL --wait=0 "${SPARK_LAUNCHER}" &

    # Block until the master reports that it is up (grep stays quiet until the log exists)
    max_attempts=20
    attempt=0
    while ! grep -q "started at http://" "${SPARK_LOG_DIR}/spark-master.log" 2> /dev/null; do
        if (( attempt++ == max_attempts )); then
            echo "Error: connection to the Spark master not established after $(( max_attempts * 5 )) seconds."
            exit 1
        fi
        sleep 5
    done
}


function spark-stop() {
    # TODO: check for (and shut down) running Spark processes before cleaning up
    srun sh -c "rm -rf ${SPARK_WORK_DIR}"
}
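
# Example usage from an sbatch script (a minimal sketch: the resource numbers
# are illustrative, "my_job.py" is a placeholder, and "spark_on_slurm.sh" is a
# hypothetical name for this file):
#
#   #!/bin/bash
#   #SBATCH --job-name=my-spark-job
#   #SBATCH --nodes=4
#   #SBATCH --ntasks-per-node=1
#   #SBATCH --cpus-per-task=48
#   #SBATCH --mem-per-cpu=4000
#
#   source spark_on_slurm.sh
#   spark-start
#   spark-submit --properties-file "${SPARK_DEFAULTS}" my_job.py
#   spark-stop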