
Commit 5b96384

Merge pull request #3 from bigcode-project/stack-spark
2 parents 8ad095c + 9ef59d4 commit 5b96384

19 files changed: +1492 -0 lines changed

.gitignore

Lines changed: 160 additions & 0 deletions
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

spark/README.md

Lines changed: 11 additions & 0 deletions
## Spark-on-SLURM prerequisites

### 1. Download Spark binaries

```bash
mkdir -p /fsx/bigcode/spark/
chmod 777 /fsx/bigcode/spark/
cd /fsx/bigcode/spark/
wget https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
tar -xvzf spark-3.5.0-bin-hadoop3.tgz
```
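
As a quick sanity check after unpacking, `spark-submit --version` can be run straight from the extracted directory; a minimal sketch, where the path simply assumes the layout created by the commands above:

```bash
# Print the Spark version banner from the freshly unpacked distribution.
# Adjust the path if you extracted the tarball somewhere else.
/fsx/bigcode/spark/spark-3.5.0-bin-hadoop3/bin/spark-submit --version
```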

spark/example.slurm

Lines changed: 22 additions & 0 deletions
#!/bin/bash
#SBATCH --partition=production-cluster
#SBATCH --job-name=spark_example
#SBATCH --nodes 2
#SBATCH --ntasks-per-node 1
#SBATCH --cpus-per-task=32
#SBATCH --mem-per-cpu=11G
#SBATCH --output=logs/%x_%j.out
#SBATCH --error=logs/%x_%j.err
#SBATCH --time=4-00:00:00

set -x -e
source ~/.bashrc
source "$CONDA_PREFIX/etc/profile.d/conda.sh"
source activate stack_v2
source spark_env.sh

spark-start

spark-submit --master $SPARK_URL app.py

spark-stop
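
Note that `app.py` is the user's PySpark driver script and is not part of this diff, and the `#SBATCH --output/--error` paths expect a `logs/` directory next to the batch script. A minimal submission sketch, assuming the job is launched from the `spark/` directory so that `source spark_env.sh` resolves:

```bash
# Submit the example job from the spark/ directory; logs/ must exist,
# otherwise SLURM cannot create the output/error files.
cd spark
mkdir -p logs
sbatch example.slurm

# Check the job and follow the driver output (file names come from the
# #SBATCH --job-name/--output patterns above).
squeue -u "$USER" -n spark_example
tail -f logs/spark_example_*.out
```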

spark/spark_env.sh

Lines changed: 146 additions & 0 deletions
#!/bin/bash

# GLOBAL settings
SPARK_HOME=${SPARK_HOME:-"/fsx/bigcode/spark/spark-3.5.0-bin-hadoop3"}
SCRATCH=${SCRATCH:-"/scratch/$USER"}
SPARK_MASTER_PORT=${SPARK_MASTER_PORT:-7077}
SPARK_MASTER_WEBUI_PORT=${SPARK_MASTER_WEBUI_PORT:-8080}

# Resources reserved for Spark's own master/worker daemon processes on each node
SPARK_DAEMON_CORES=1
SPARK_DAEMON_MEMORY=1024
# Resources for the application's driver process (PySpark's main()) on the master node
SPARK_DRIVER_CORES=2
SPARK_DRIVER_MEMORY=16192

PYSPARK_PYTHON=${PYSPARK_PYTHON:-$(which python)}
PYSPARK_DRIVER_PYTHON=${PYSPARK_DRIVER_PYTHON:-$(which python)}


function spark-start() {
    # Verify that we are running inside a SLURM allocation
    if [[ -z "${SLURM_JOB_NAME}" || -z "${SLURM_CPUS_PER_TASK}" || -z "${SLURM_MEM_PER_CPU}" || -z "${SLURM_JOB_NUM_NODES}" ]]; then
        echo "Error: Some required SLURM environment variables are missing."
        echo "This script should only be run within a SLURM job."
        echo "SLURM_JOB_NAME: ${SLURM_JOB_NAME}"
        echo "SLURM_JOB_NUM_NODES: ${SLURM_JOB_NUM_NODES}"
        echo "SLURM_CPUS_PER_TASK: ${SLURM_CPUS_PER_TASK}"
        echo "SLURM_MEM_PER_CPU: ${SLURM_MEM_PER_CPU}"
        exit 1
    fi

    # Access to spark-submit
    export PATH="$SPARK_HOME/bin:$PATH"

    # Initialize spark WORKER, CONF, LOG and TMP dirs
    SPARK_WORK_DIR=${SCRATCH}/spark/${SLURM_JOB_NAME##*/}_${SLURM_JOB_ID}
    SPARK_WORKER_DIR=${SPARK_WORK_DIR}
    SPARK_CONF_DIR=${SPARK_WORK_DIR}/conf
    SPARK_LOG_DIR=${SPARK_WORK_DIR}/log
    SPARK_LOCAL_DIRS=${SPARK_WORK_DIR}/tmp

    srun -l mkdir -p "${SPARK_WORK_DIR}" "${SPARK_CONF_DIR}" "${SPARK_LOG_DIR}" "${SPARK_LOCAL_DIRS}" \
        && srun -l chmod -R 766 "${SPARK_WORK_DIR}"

    SPARK_MASTER_HOST=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1)
    export SPARK_URL="spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT}"

    # The driver runs only on the master node
    MASTER_COMPUTE_CORES=$((SLURM_CPUS_PER_TASK - SPARK_DAEMON_CORES - SPARK_DRIVER_CORES))
    MASTER_COMPUTE_MEMORY=$((SLURM_MEM_PER_CPU * SLURM_CPUS_PER_TASK - SPARK_DAEMON_MEMORY - SPARK_DRIVER_MEMORY - 2048))
    # The resources available on the rest of the nodes
    WORKER_COMPUTE_CORES=$((SLURM_CPUS_PER_TASK - SPARK_DAEMON_CORES))
    WORKER_COMPUTE_MEMORY=$((SLURM_MEM_PER_CPU * SLURM_CPUS_PER_TASK - SPARK_DAEMON_MEMORY - 2048))

    export SPARK_DEFAULTS="${SPARK_CONF_DIR}/spark-defaults.conf"
    cat << EOF > "${SPARK_DEFAULTS}.tmp"
spark.master ${SPARK_URL}
spark.submit.deployMode client
spark.ui.showConsoleProgress false
spark.ui.enabled true
spark.jars.packages org.apache.hadoop:hadoop-aws:3.3.4,org.apache.spark:spark-hadoop-cloud_2.12:3.3.4

spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
spark.hadoop.mapreduce.outputcommitter.factory.scheme.s3a org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory
spark.hadoop.fs.s3a.committer.name magic
spark.hadoop.fs.s3a.committer.magic.enabled true
spark.hadoop.fs.s3a.committer.threads ${SLURM_CPUS_PER_TASK}
spark.hadoop.fs.s3a.buffer.dir ${SPARK_LOCAL_DIRS}/s3a

spark.local.dir ${SPARK_LOCAL_DIRS}
spark.sql.warehouse.dir ${SPARK_LOCAL_DIRS}/warehouse
spark.sql.autoBroadcastJoinThreshold -1

spark.driver.maxResultSize 8192m
spark.driver.memory ${SPARK_DRIVER_MEMORY}m
spark.executor.memory ${MASTER_COMPUTE_MEMORY}m
spark.network.timeout 1200
spark.port.maxRetries 100
spark.task.maxFailures 100
EOF
    sbcast "${SPARK_DEFAULTS}.tmp" "${SPARK_DEFAULTS}"

    export SPARK_LAUNCHER=${SPARK_WORK_DIR}/spark-launcher.sh
    # Unescaped variables in the heredoc are expanded now; \$-escaped ones are
    # evaluated only when the launcher runs on each node.
    cat << EOF > "${SPARK_LAUNCHER}.tmp"
#!/bin/bash
export SPARK_HOME=${SPARK_HOME}
export SPARK_WORKER_DIR=${SPARK_WORKER_DIR}
export SPARK_LOG_DIR=${SPARK_LOG_DIR}
export SPARK_LOCAL_DIRS=${SPARK_LOCAL_DIRS}
export SPARK_CONF_DIR=${SPARK_CONF_DIR}

export SPARK_MASTER_HOST=${SPARK_MASTER_HOST}

export SPARK_DAEMON_CORES=${SPARK_DAEMON_CORES}
export SPARK_DAEMON_MEMORY=${SPARK_DAEMON_MEMORY}m
export SPARK_DRIVER_CORES=${SPARK_DRIVER_CORES}
export SPARK_DRIVER_MEMORY=${SPARK_DRIVER_MEMORY}m

export PYSPARK_PYTHON=${PYSPARK_PYTHON}
export PYSPARK_DRIVER_PYTHON=${PYSPARK_DRIVER_PYTHON}

source "$SPARK_HOME/sbin/spark-config.sh"
source "$SPARK_HOME/bin/load-spark-env.sh"

if [[ \${SLURM_PROCID} -eq 0 ]]; then
    # Start a master + worker on the same node
    export SPARK_WORKER_CORES=${MASTER_COMPUTE_CORES}
    export SPARK_WORKER_MEMORY=${MASTER_COMPUTE_MEMORY}m

    "${SPARK_HOME}/bin/spark-class" org.apache.spark.deploy.master.Master &> "${SPARK_LOG_DIR}/spark-master.log" &
    MASTER_PID=\$!
    "${SPARK_HOME}/bin/spark-class" org.apache.spark.deploy.worker.Worker ${SPARK_URL} &> "${SPARK_LOG_DIR}/spark-worker.log" &
    WORKER_PID=\$!
    wait \$MASTER_PID \$WORKER_PID
else
    # Start a worker
    export SPARK_WORKER_CORES=${WORKER_COMPUTE_CORES}
    export SPARK_WORKER_MEMORY=${WORKER_COMPUTE_MEMORY}m

    "${SPARK_HOME}/bin/spark-class" org.apache.spark.deploy.worker.Worker ${SPARK_URL} &> "${SPARK_LOG_DIR}/spark-worker.log" &
    WORKER_PID=\$!
    wait \$WORKER_PID
fi
EOF
    chmod +x "${SPARK_LAUNCHER}.tmp"
    sbcast "${SPARK_LAUNCHER}.tmp" "${SPARK_LAUNCHER}"

    srun --label --export=ALL --wait=0 "${SPARK_LAUNCHER}" &

    # Wait (up to max_attempts * 5 seconds) for the master to report it has started
    max_attempts=20
    attempt=0
    while ! grep -q "started at http://" "${SPARK_LOG_DIR}/spark-master.log"; do
        if (( attempt++ == max_attempts )); then
            echo "Error: Connection to Spark master not established after $(( max_attempts * 5 )) seconds."
            exit 1
        fi
        sleep 5
    done
}


function spark-stop() {
    # todo: check for running spark processes
    srun sh -c "rm -rf ${SPARK_WORK_DIR}"
}
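
For a concrete sense of the split that `spark-start` computes, take the allocation in `example.slurm` (32 CPUs per task, `--mem-per-cpu=11G`) and assume SLURM exports `SLURM_MEM_PER_CPU` in MB (11G → 11264). The following is a back-of-the-envelope sketch of the resulting worker sizes, not output produced by the script:

```bash
# Rough per-node resource split under the example.slurm allocation
# (32 CPUs/node, 11264 MB per CPU; daemon = 1 core / 1024 MB,
#  driver = 2 cores / 16192 MB, plus the script's fixed 2048 MB of headroom).
CPUS=32; MEM_PER_CPU=11264
NODE_MEM=$((CPUS * MEM_PER_CPU))   # 360448 MB per node
echo "master-node worker: $((CPUS - 1 - 2)) cores, $((NODE_MEM - 1024 - 16192 - 2048)) MB"  # 29 cores, 341184 MB
echo "other-node workers: $((CPUS - 1)) cores, $((NODE_MEM - 1024 - 2048)) MB"              # 31 cores, 357376 MB
```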
