diff --git a/_binder_notebooks/01_local_cluster_monte_carlo_estimate_of_pi.ipynb b/_binder_notebooks/01_local_cluster_monte_carlo_estimate_of_pi.ipynb new file mode 100644 index 0000000..e9bc9fc --- /dev/null +++ b/_binder_notebooks/01_local_cluster_monte_carlo_estimate_of_pi.ipynb @@ -0,0 +1,677 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Monte-Carlo Estimate of $\\pi$\n", + "\n", + "We want to estimate the number $\\pi$ using a [Monte-Carlo method](https://en.wikipedia.org/wiki/Pi#Monte_Carlo_methods) exploiting that the area of a quarter circle of unit radius is $\\pi/4$ and that hence the probability of any randomly chosen point in a unit square to lie in a unit circle centerd at a corner of the unit square is $\\pi/4$ as well. So for N randomly chosen pairs $(x, y)$ with $x\\in[0, 1)$ and $y\\in[0, 1)$, we count the number $N_{circ}$ of pairs that also satisfy $(x^2 + y^2) < 1$ and estimage $\\pi \\approx 4 \\cdot N_{circ} / N$.\n", + "\n", + "[\"PI](https://en.wikipedia.org/wiki/Pi#Monte_Carlo_methods)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Core Lessons\n", + "\n", + "- short Dask recap (assuming that `LocalCluster`, `Client`, and `dask.array` are familiar)\n", + "- Scaling (local) clusters\n", + "- Adaptive (local) clusters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up a local cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from dask.distributed import LocalCluster, Client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 1
  • \n", + "
  • Cores: 1
  • \n", + "
  • Memory: 1000.00 MB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster = LocalCluster(n_workers=1, threads_per_worker=1, memory_limit=1e9)\n", + "client = Client(cluster)\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The Monte Carlo Method" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.array as da\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_pi_mc(size_in_bytes):\n", + " \"\"\"Calculate PI using a Monte Carlo estimate.\"\"\"\n", + " xy = da.random.uniform(0, 1,\n", + " size=(int(size_in_bytes / 8 / 2), 2),\n", + " chunks=(100e6 / 8, 2))\n", + " \n", + " in_circle = ((xy ** 2).sum(axis=-1) < 1)\n", + " pi = 4 * in_circle.mean()\n", + "\n", + " return pi.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def print_pi_stats(size, pi, time_delta, num_workers):\n", + " \"\"\"Print pi, calculate offset from true value, and print some stats.\"\"\"\n", + " print(f\"{size / 1e9} GB\\n\"\n", + " f\"\\tMC pi: {pi : 13.11f}\"\n", + " f\"\\tErr: {abs(pi - np.pi) : 10.3e}\\n\"\n", + " f\"\\tWorkers: {num_workers}\"\n", + " f\"\\t\\tTime: {time_delta : 7.3f}s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The actual calculations\n", + "\n", + "We loop over different volumes of double-precision random numbers and estimate $\\pi$ as described above." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from time import time" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0 GB\n", + "\tMC pi: 3.14127948800\tErr: 3.132e-04\n", + "\tWorkers: 1\t\tTime: 5.114s\n", + "2.0 GB\n", + "\tMC pi: 3.14166908800\tErr: 7.643e-05\n", + "\tWorkers: 1\t\tTime: 10.141s\n", + "3.0 GB\n", + "\tMC pi: 3.14187464533\tErr: 2.820e-04\n", + "\tWorkers: 1\t\tTime: 15.123s\n" + ] + } + ], + "source": [ + "for size in (1e9 * n for n in (1, 2, 3)):\n", + " \n", + " start = time()\n", + " pi = calc_pi_mc(size)\n", + " elaps = time() - start\n", + "\n", + " print_pi_stats(size, pi,\n", + " time_delta=elaps,\n", + " num_workers=len(cluster.workers))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scaling the Cluster\n", + "\n", + "We increase the number of workers by 2 and the re-run the experiments." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from time import sleep" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Scaling from 1 to 2 workers.\n" + ] + } + ], + "source": [ + "new_num_workers = 2 * len(cluster.workers)\n", + "\n", + "print(f\"Scaling from {len(cluster.workers)} to {new_num_workers} workers.\")\n", + "\n", + "cluster.scale(new_num_workers)\n", + "\n", + "sleep(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 2
  • \n", + "
  • Cores: 2
  • \n", + "
  • Memory: 2.00 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0 GB\n", + "\tMC pi: 3.14160531200\tErr: 1.266e-05\n", + "\tWorkers: 2\t\tTime: 3.171s\n", + "2.0 GB\n", + "\tMC pi: 3.14142736000\tErr: 1.653e-04\n", + "\tWorkers: 2\t\tTime: 5.252s\n", + "3.0 GB\n", + "\tMC pi: 3.14184736000\tErr: 2.547e-04\n", + "\tWorkers: 2\t\tTime: 8.289s\n" + ] + } + ], + "source": [ + "for size in (1e9 * n for n in (1, 2, 3)):\n", + " \n", + " start = time()\n", + " pi = calc_pi_mc(size)\n", + " elaps = time() - start\n", + " print_pi_stats(size, pi, time_delta=elaps,\n", + " num_workers=len(cluster.workers))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Automatically Scaling the Cluster\n", + "\n", + "We want each calculation to take approximately the same time irrespective of the actual work load.\n", + "\n", + "_**Watch** how the cluster will scale down to the minimum a few (three!) seconds after being made adaptive._" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Check docstring of distributed.Adaptive for keywords\n", + "ca = cluster.adapt(\n", + " minimum=1, maximum=4,\n", + " target_duration=\"10s\",\n", + " scale_factor=1);\n", + "\n", + "sleep(4) # Allow for scale-down" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 1
  • \n", + "
  • Cores: 1
  • \n", + "
  • Memory: 1000.00 MB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Repeat the calculation from above with larger work loads. (And watch the dash board!)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.0 GB\n", + "\tMC pi: 3.14154998400\tErr: 4.267e-05\n", + "\tWorkers: 2\t\tTime: 6.208s\n", + "4.0 GB\n", + "\tMC pi: 3.14154673600\tErr: 4.592e-05\n", + "\tWorkers: 3\t\tTime: 8.414s\n", + "8.0 GB\n", + "\tMC pi: 3.14142765600\tErr: 1.650e-04\n", + "\tWorkers: 4\t\tTime: 12.693s\n" + ] + } + ], + "source": [ + "for size in (n * 1e9 for n in (2, 4, 8)):\n", + " \n", + " start = time()\n", + " pi = calc_pi_mc(size)\n", + " elaps = time() - start\n", + " \n", + " print_pi_stats(size, pi, time_delta=elaps,\n", + " num_workers=len(cluster.workers))\n", + " \n", + " sleep(4) # allow for scale-down time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complete listing of software used here" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Package Version \n", + "------------------ --------\n", + "asn1crypto 0.24.0 \n", + "attrs 19.1.0 \n", + "backcall 0.1.0 \n", + "bleach 3.1.0 \n", + "bokeh 1.2.0 \n", + "certifi 2019.3.9\n", + "cffi 1.12.3 \n", + "chardet 3.0.4 \n", + "Click 7.0 \n", + "cloudpickle 1.2.1 \n", + "conda 4.6.14 \n", + "cryptography 2.7 \n", + "cytoolz 0.9.0.1 \n", + "dask 1.2.2 \n", + "dask-jobqueue 0.5.0 \n", + "decorator 4.4.0 \n", + "defusedxml 0.5.0 \n", + "distributed 1.28.1 \n", + "docrep 0.2.7 \n", + "entrypoints 0.3 \n", + "heapdict 1.0.0 \n", + "idna 2.8 \n", + "ipykernel 5.1.1 \n", + "ipython 7.5.0 \n", + "ipython-genutils 0.2.0 \n", + 
"ipywidgets 7.4.2 \n", + "jedi 0.13.3 \n", + "Jinja2 2.10.1 \n", + "jsonschema 3.0.1 \n", + "jupyter-client 5.2.4 \n", + "jupyter-core 4.4.0 \n", + "jupyterlab 0.35.6 \n", + "jupyterlab-server 0.2.0 \n", + "locket 0.2.0 \n", + "MarkupSafe 1.1.1 \n", + "mistune 0.8.4 \n", + "msgpack 0.6.1 \n", + "nbconvert 5.5.0 \n", + "nbformat 4.4.0 \n", + "notebook 5.7.8 \n", + "numpy 1.16.4 \n", + "olefile 0.46 \n", + "packaging 19.0 \n", + "pandas 0.24.2 \n", + "pandocfilters 1.4.2 \n", + "parso 0.4.0 \n", + "partd 0.3.10 \n", + "pexpect 4.7.0 \n", + "pickleshare 0.7.5 \n", + "Pillow 6.0.0 \n", + "pip 19.1.1 \n", + "prometheus-client 0.7.0 \n", + "prompt-toolkit 2.0.9 \n", + "psutil 5.6.3 \n", + "ptyprocess 0.6.0 \n", + "pycosat 0.6.3 \n", + "pycparser 2.19 \n", + "Pygments 2.4.2 \n", + "pyOpenSSL 19.0.0 \n", + "pyparsing 2.4.0 \n", + "pyrsistent 0.15.2 \n", + "PySocks 1.7.0 \n", + "python-dateutil 2.8.0 \n", + "pytz 2019.1 \n", + "PyYAML 5.1.1 \n", + "pyzmq 18.0.1 \n", + "requests 2.22.0 \n", + "ruamel-yaml 0.15.71 \n", + "Send2Trash 1.5.0 \n", + "setuptools 41.0.1 \n", + "six 1.12.0 \n", + "sortedcontainers 2.1.0 \n", + "tblib 1.4.0 \n", + "terminado 0.8.2 \n", + "testpath 0.4.2 \n", + "toolz 0.9.0 \n", + "tornado 6.0.2 \n", + "traitlets 4.3.2 \n", + "urllib3 1.24.3 \n", + "wcwidth 0.1.7 \n", + "webencodings 0.5.1 \n", + "wheel 0.33.4 \n", + "widgetsnbextension 3.4.2 \n", + "zict 0.1.4 \n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip list" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# This file may be used to create an environment using:\n", + "# $ conda create --name --file \n", + "# platform: linux-64\n", + "@EXPLICIT\n", + "https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2019.6.16-hecc5488_0.tar.bz2\n", + 
"https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-8.2.0-hdf63c60_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libgfortran-3.0.0-1.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-8.2.0-hdf63c60_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pandoc-2.7.3-0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/jpeg-9c-h14c3975_1001.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.2.1-hd88cf55_4.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libsodium-1.0.16-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.8.3-he1b5a44_1001.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.1-he6710b0_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/openblas-0.3.5-ha44fe06_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/openssl-1.1.1b-h14c3975_1.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/xz-5.2.4-h14c3975_4.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/yaml-0.1.7-had09818_2.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.11-h7b6447c_3.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libblas-3.8.0-7_openblas.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/libedit-3.1.20181209-hc058e9b_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.37-hed695b0_0.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/readline-7.0-h7b6447c_5.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.9-hed695b0_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/zeromq-4.3.1-hf484d3e_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/zstd-1.4.0-h3b9ef0a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/freetype-2.10.0-he983fc9_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.8.0-7_openblas.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.8.0-7_openblas.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.0.10-h57b8799_1003.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.27.2-h7b6447c_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/python-3.6.7-h381d211_1004.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/asn1crypto-0.24.0-py36_1003.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/attrs-19.1.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/backcall-0.1.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/certifi-2019.3.9-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/chardet-3.0.4-py36_1003.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/click-7.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/cloudpickle-1.2.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/dask-core-1.2.2-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/decorator-4.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/defusedxml-0.5.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/entrypoints-0.3-py36_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/heapdict-1.0.0-py36_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/idna-2.8-py36_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/locket-0.2.0-py_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/markupsafe-1.1.1-py36h14c3975_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/mistune-0.8.4-py36h14c3975_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-0.6.1-py36h6bb024c_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/numpy-1.16.4-py36h95a1406_0.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pandocfilters-1.4.2-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/parso-0.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pickleshare-0.7.5-py36_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/prometheus_client-0.7.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/psutil-5.6.3-py36h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.6.0-py_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pycosat-0.6.3-py36h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pycparser-2.19-py36_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pyparsing-2.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pysocks-1.7.0-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pytz-2019.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyyaml-5.1.1-py36h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyzmq-18.0.1-py36hc4ba49a_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ruamel_yaml-0.15.71-py36h14c3975_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/send2trash-1.5.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/six-1.12.0-py36_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/sortedcontainers-2.1.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/tblib-1.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/testpath-0.4.2-py_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/toolz-0.9.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/tornado-6.0.2-py36h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.1.7-py_1.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/cffi-1.12.3-py36h8022711_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/cytoolz-0.9.0.1-py36h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/docrep-0.2.7-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/jedi-0.13.3-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/packaging-19.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/partd-0.3.10-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pexpect-4.7.0-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pillow-6.0.0-py36he7afcd5_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyrsistent-0.15.2-py36h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/setuptools-41.0.1-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/terminado-0.8.2-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/traitlets-4.3.2-py36_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/zict-0.1.4-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/bleach-3.1.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/cryptography-2.7-py36h72c5cf5_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/distributed-1.28.1-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jinja2-2.10.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/jsonschema-3.0.1-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jupyter_core-4.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pandas-0.24.2-py36hb3f55d8_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pygments-2.4.2-py_0.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/wheel-0.33.4-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/bokeh-1.2.0-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jupyter_client-5.2.4-py_3.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/nbformat-4.4.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pip-19.1.1-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/prompt_toolkit-2.0.9-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyopenssl-19.0.0-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/dask-1.2.2-py_3.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ipython-7.5.0-py36h24bf2e0_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/nbconvert-5.5.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/urllib3-1.24.3-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/dask-jobqueue-0.5.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ipykernel-5.1.1-py36h24bf2e0_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/requests-2.22.0-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/conda-4.6.14-py36_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/notebook-5.7.8-py36_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jupyterlab_server-0.2.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/widgetsnbextension-3.4.2-py36_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/ipywidgets-7.4.2-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/jupyterlab-0.35.6-py36_0.tar.bz2\n", + "\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%conda list --explicit" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, 
+ "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_binder_notebooks/02_slurm_cluster_monte_carlo_estimate_of_pi.ipynb b/_binder_notebooks/02_slurm_cluster_monte_carlo_estimate_of_pi.ipynb new file mode 100644 index 0000000..b0ef414 --- /dev/null +++ b/_binder_notebooks/02_slurm_cluster_monte_carlo_estimate_of_pi.ipynb @@ -0,0 +1,822 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Monte-Carlo Estimate of $\\pi$\n", + "\n", + "We want to estimate the number $\\pi$ using a [Monte-Carlo method](https://en.wikipedia.org/wiki/Pi#Monte_Carlo_methods) exploiting that the area of a quarter circle of unit radius is $\\pi/4$ and that hence the probability of any randomly chosen point in a unit square to lie in a unit circle centerd at a corner of the unit square is $\\pi/4$ as well. 
So for N randomly chosen pairs $(x, y)$ with $x\\in[0, 1)$ and $y\\in[0, 1)$, we count the number $N_{circ}$ of pairs that also satisfy $(x^2 + y^2) < 1$ and estimage $\\pi \\approx 4 \\cdot N_{circ} / N$.\n", + "\n", + "[\"PI](https://en.wikipedia.org/wiki/Pi#Monte_Carlo_methods)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Core Lessons\n", + "\n", + "- setting up SLURM (and other jobqueue) clusters\n", + "- Scaling clusters\n", + "- Adaptive clusters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up a Slurm cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/p/project/cecam/rath1/miniconda3_20190521/envs/dask_jobqueue_workshop/lib/python3.7/site-packages/docrep/__init__.py:341: MatplotlibDeprecationWarning: \n", + "The dedent function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use inspect.cleandoc instead.\n", + " s = dedents('\\n' + '\\n'.join(lines[first:]))\n" + ] + } + ], + "source": [ + "from dask.distributed import Client\n", + "from dask_jobqueue import SLURMCluster" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "cluster = SLURMCluster(\n", + " cores=24,\n", + " processes=2,\n", + " memory=\"100GB\",\n", + " shebang='#!/usr/bin/env bash',\n", + " queue=\"batch\",\n", + " walltime=\"00:30:00\",\n", + " local_directory='/tmp',\n", + " death_timeout=\"15s\",\n", + " interface=\"ib0\",\n", + " log_directory=\"$SCRATCH_cecam/$USER/dask_jobqueue_logs/\",\n", + " project=\"ecam\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 0
  • \n", + "
  • Cores: 0
  • \n", + "
  • Memory: 0 B
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client = Client(cluster)\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The job scripts" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#!/usr/bin/env bash\n", + "\n", + "#SBATCH -J dask-worker\n", + "#SBATCH -e $SCRATCH_cecam/$USER/dask_jobqueue_logs//dask-worker-%J.err\n", + "#SBATCH -o $SCRATCH_cecam/$USER/dask_jobqueue_logs//dask-worker-%J.out\n", + "#SBATCH -p batch\n", + "#SBATCH -A ecam\n", + "#SBATCH -n 1\n", + "#SBATCH --cpus-per-task=24\n", + "#SBATCH --mem=94G\n", + "#SBATCH -t 00:30:00\n", + "JOB_ID=${SLURM_JOB_ID%;*}\n", + "\n", + "\n", + "\n", + "/p/project/cecam/rath1/miniconda3_20190521/envs/dask_jobqueue_workshop/bin/python -m distributed.cli.dask_worker tcp://10.80.32.36:46416 --nthreads 12 --nprocs 2 --memory-limit 50.00GB --name dask-worker--${JOB_ID}-- --death-timeout 15s --local-directory /tmp --interface ib0\n", + "\n" + ] + } + ], + "source": [ + "print(cluster.job_script())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scale the cluster to two nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.scale(4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The Monte Carlo Method" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.array as da\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_pi_mc(size_in_bytes, chunksize_in_bytes=200e6):\n", + " \"\"\"Calculate PI using a Monte Carlo estimate.\"\"\"\n", + " \n", + " size = int(size_in_bytes / 8)\n", 
+ " chunksize = int(chunksize_in_bytes / 8)\n", + " \n", + " xy = da.random.uniform(0, 1,\n", + " size=(size / 2, 2),\n", + " chunks=(chunksize / 2, 2))\n", + " \n", + " in_circle = ((xy ** 2).sum(axis=-1) < 1)\n", + " pi = 4 * in_circle.mean()\n", + "\n", + " return pi" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def print_pi_stats(size, pi, time_delta, num_workers):\n", + " \"\"\"Print pi, calculate offset from true value, and print some stats.\"\"\"\n", + " print(f\"{size / 1e9} GB\\n\"\n", + " f\"\\tMC pi: {pi : 13.11f}\"\n", + " f\"\\tErr: {abs(pi - np.pi) : 10.3e}\\n\"\n", + " f\"\\tWorkers: {num_workers}\"\n", + " f\"\\t\\tTime: {time_delta : 7.3f}s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The actual calculations\n", + "\n", + "We loop over different volumes of double-precision random numbers and estimate $\\pi$ as described above." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from time import time" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0 GB\n", + "\tMC pi: 3.14171686400\tErr: 1.242e-04\n", + "\tWorkers: 4\t\tTime: 10.157s\n", + "10.0 GB\n", + "\tMC pi: 3.14160633600\tErr: 1.368e-05\n", + "\tWorkers: 4\t\tTime: 1.617s\n", + "100.0 GB\n", + "\tMC pi: 3.14161142848\tErr: 1.877e-05\n", + "\tWorkers: 4\t\tTime: 9.231s\n" + ] + } + ], + "source": [ + "for size in (1e9 * n for n in (1, 10, 100)):\n", + " \n", + " start = time()\n", + " pi = calc_pi_mc(size).compute()\n", + " elaps = time() - start\n", + "\n", + " print_pi_stats(size, pi, time_delta=elaps,\n", + " num_workers=len(cluster.scheduler.workers))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scaling the Cluster to twice its size\n", + "\n", + "We increase the number of workers 
by 2 and the re-run the experiments." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from time import sleep" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Scaling from 4 to 8 workers.\n" + ] + } + ], + "source": [ + "new_num_workers = 2 * len(cluster.scheduler.workers)\n", + "\n", + "print(f\"Scaling from {len(cluster.scheduler.workers)} to {new_num_workers} workers.\")\n", + "\n", + "cluster.scale(new_num_workers)\n", + "\n", + "sleep(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 4
  • \n", + "
  • Cores: 48
  • \n", + "
  • Memory: 200.00 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Re-run same experiments with doubled cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0 GB\n", + "\tMC pi: 3.14218988800\tErr: 5.972e-04\n", + "\tWorkers: 4\t\tTime: 0.724s\n", + "10.0 GB\n", + "\tMC pi: 3.14164865280\tErr: 5.600e-05\n", + "\tWorkers: 4\t\tTime: 1.627s\n", + "100.0 GB\n", + "\tMC pi: 3.14161009536\tErr: 1.744e-05\n", + "\tWorkers: 8\t\tTime: 9.199s\n" + ] + } + ], + "source": [ + "for size in (1e9 * n for n in (1, 10, 100)):\n", + " \n", + " \n", + " start = time()\n", + " pi = calc_pi_mc(size).compute()\n", + " elaps = time() - start\n", + "\n", + " print_pi_stats(size, pi,\n", + " time_delta=elaps,\n", + " num_workers=len(cluster.scheduler.workers))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Automatically Scaling the Cluster\n", + "\n", + "We want each calculation to take only a few seconds. Dask will try to add more workers to the cluster when workloads are high and remove workers when idling.\n", + "\n", + "_**Watch** how the cluster will scale down to the minimum a few seconds after being made adaptive._" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "ca = cluster.adapt(\n", + " minimum=4, maximum=100);\n", + "\n", + "sleep(4) # Allow for scale-down" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 4
  • \n", + "
  • Cores: 48
  • \n", + "
  • Memory: 200.00 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Repeat the calculation from above with larger work loads\n", + "\n", + "(And watch the dash board!)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0 GB\n", + "\tMC pi: 3.14177523200\tErr: 1.826e-04\n", + "\tWorkers: 4\t\tTime: 4.355s\n", + "10.0 GB\n", + "\tMC pi: 3.14153713280\tErr: 5.552e-05\n", + "\tWorkers: 4\t\tTime: 2.116s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "JobQueueCluster.scale_up was called with a number of workers lower that what is already running or pending\n", + "JobQueueCluster.scale_up was called with a number of workers lower that what is already running or pending\n", + "JobQueueCluster.scale_up was called with a number of workers lower that what is already running or pending\n", + "JobQueueCluster.scale_up was called with a number of workers lower that what is already running or pending\n", + "JobQueueCluster.scale_up was called with a number of workers lower that what is already running or pending\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100.0 GB\n", + "\tMC pi: 3.14160726144\tErr: 1.461e-05\n", + "\tWorkers: 4\t\tTime: 9.491s\n", + "1000.0 GB\n", + "\tMC pi: 3.14158722586\tErr: 5.428e-06\n", + "\tWorkers: 100\t\tTime: 20.810s\n" + ] + } + ], + "source": [ + "for size in (n * 1e9 for n in (1, 10, 100, 1000)):\n", + " \n", + " \n", + " start = time()\n", + " pi = calc_pi_mc(size, min(size / 1000, 500e6)).compute()\n", + " elaps = time() - start\n", + "\n", + " print_pi_stats(size, pi, time_delta=elaps,\n", + " num_workers=len(cluster.scheduler.workers))\n", + " \n", + " sleep(20) # allow for scale-down time" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "- setup of a `SLURMCluster`\n", + "- manual scaling of a cluster\n", + "- basic adaptivity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complete listing of software used here" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/bin/sh: module: line 1: syntax error: unexpected end of file\n", + "/usr/bin/sh: error importing function definition for `BASH_FUNC_module'\n", + "/usr/bin/sh: jutil: line 1: syntax error: unexpected end of file\n", + "/usr/bin/sh: error importing function definition for `BASH_FUNC_jutil'\n", + "/usr/bin/sh: ml: line 1: syntax error: unexpected end of file\n", + "/usr/bin/sh: error importing function definition for `BASH_FUNC_ml'\n", + "Package Version \n", + "------------------ -----------------\n", + "asciitree 0.3.3 \n", + "aspy.yaml 1.2.0 \n", + "backcall 0.1.0 \n", + "bokeh 1.1.0 \n", + "certifi 2019.3.9 \n", + "cfgv 1.6.0 \n", + "cftime 1.0.3.4 \n", + "Click 7.0 \n", + "cloudpickle 1.0.0 \n", + "cycler 0.10.0 \n", + "cytoolz 0.9.0.1 \n", + "dask 1.2.0 \n", + "dask-jobqueue 0.4.1+32.g9c3371d\n", + "decorator 4.4.0 \n", + "distributed 1.27.1 \n", + "docrep 0.2.5 \n", + "fasteners 0.14.1 \n", + "heapdict 1.0.0 \n", + "identify 1.4.3 \n", + "importlib-metadata 0.13 \n", + "ipykernel 5.1.1 \n", + "ipython 7.5.0 \n", + "ipython-genutils 0.2.0 \n", + "jedi 0.13.3 \n", + "Jinja2 2.10.1 \n", + "jupyter-client 5.2.4 \n", + "jupyter-core 4.4.0 \n", + "kiwisolver 1.1.0 \n", + "llvmlite 0.28.0 \n", + "locket 0.2.0 \n", + "MarkupSafe 1.1.1 \n", + "matplotlib 3.1.0 \n", + "monotonic 1.5 \n", + "msgpack 0.6.1 \n", + "netCDF4 1.5.1.2 \n", + "nodeenv 1.3.3 \n", + "numba 0.43.1 \n", + "numcodecs 0.6.3 \n", + "numpy 1.16.3 \n", + "olefile 0.46 \n", + "packaging 19.0 \n", + "pandas 0.24.2 \n", + "parso 0.4.0 \n", + "partd 0.3.9 
\n", + "patsy 0.5.1 \n", + "pexpect 4.7.0 \n", + "pickleshare 0.7.5 \n", + "Pillow 6.0.0 \n", + "pip 19.1 \n", + "pre-commit 1.16.1 \n", + "prompt-toolkit 2.0.9 \n", + "psutil 5.6.2 \n", + "ptyprocess 0.6.0 \n", + "Pygments 2.4.0 \n", + "pyparsing 2.4.0 \n", + "python-dateutil 2.8.0 \n", + "pytz 2019.1 \n", + "PyYAML 5.1 \n", + "pyzmq 18.0.1 \n", + "scipy 1.3.0 \n", + "seaborn 0.9.0 \n", + "setuptools 41.0.1 \n", + "six 1.12.0 \n", + "sortedcontainers 2.1.0 \n", + "statsmodels 0.9.0 \n", + "tblib 1.4.0 \n", + "toml 0.10.0 \n", + "toolz 0.9.0 \n", + "tornado 6.0.2 \n", + "traitlets 4.3.2 \n", + "virtualenv 16.6.0 \n", + "wcwidth 0.1.7 \n", + "wheel 0.33.4 \n", + "xarray 0.12.1 \n", + "zarr 2.3.1 \n", + "zict 0.1.4 \n", + "zipp 0.5.1 \n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip list" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/bin/sh: module: line 1: syntax error: unexpected end of file\n", + "/usr/bin/sh: error importing function definition for `BASH_FUNC_module'\n", + "/usr/bin/sh: jutil: line 1: syntax error: unexpected end of file\n", + "/usr/bin/sh: error importing function definition for `BASH_FUNC_jutil'\n", + "/usr/bin/sh: ml: line 1: syntax error: unexpected end of file\n", + "/usr/bin/sh: error importing function definition for `BASH_FUNC_ml'\n", + "# This file may be used to create an environment using:\n", + "# $ conda create --name --file \n", + "# platform: linux-64\n", + "@EXPLICIT\n", + "https://conda.anaconda.org/conda-forge/linux-64/git-lfs-2.7.2-0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2019.3.9-hecc5488_0.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-8.2.0-hdf63c60_1.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.3.0-hdf63c60_0.tar.bz2\n", + 
"https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-8.2.0-hdf63c60_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.6-h14c3975_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/expat-2.2.5-hf484d3e_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/icu-58.2-hf484d3e_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/jpeg-9c-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libffi-3.2.1-he1b5a44_1006.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.15-h516909a_1005.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libsodium-1.0.16-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.32.1-h14c3975_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.1-hf484d3e_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/openblas-0.3.6-h6e990d7_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/openssl-1.1.1b-h14c3975_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pcre-8.41-hf484d3e_1003.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/perl-5.26.2-h516909a_1006.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.9-h14c3975_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.3-h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.4-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/yaml-0.1.7-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.11-h14c3975_1004.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/gettext-0.19.8.1-hc5be6a0_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/hdf4-4.2.13-h9a582f1_1002.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/hdf5-1.10.4-nompi_h3c11f04_1106.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libblas-3.8.0-10_openblas.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20170329-hf8c457e_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.37-hed695b0_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.8.2-h22169c7_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.0.10-h648cc4a_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.13-h14c3975_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.9.9-h13577e0_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/readline-7.0-hf8c457e_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.9-h84994c4_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/zeromq-4.3.1-hf484d3e_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/freetype-2.10.0-he983fc9_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/glib-2.58.3-hf63aee3_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/krb5-1.16.3-h05b26f9_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.8.0-10_openblas.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.8.0-10_openblas.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.26.0-h67949de_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/dbus-1.13.6-he372182_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.13.1-he4413a7_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/gstreamer-1.14.4-h66beb1c_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libcurl-7.64.1-hda55be3_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/python-3.7.3-h5b0a415_0.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/noarch/asciitree-0.3.3-py_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/backcall-0.1.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/certifi-2019.3.9-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/click-7.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/cloudpickle-1.0.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/curl-7.64.1-hf8cf82a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/dask-core-1.2.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/decorator-4.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.14.4-hdf3bae2_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/heapdict-1.0.0-py37_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.1.0-py37hc9558a2_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/llvmlite-0.28.0-py37hdbcaa40_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/locket-0.2.0-py_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/markupsafe-1.1.1-py37h14c3975_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/monotonic-1.5-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-0.6.1-py37h6bb024c_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/numpy-1.16.3-py37he5ce36f_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/parso-0.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pickleshare-0.7.5-py37_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/psutil-5.6.2-py37h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.6.0-py_1001.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/noarch/pyparsing-2.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pytz-2019.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyyaml-5.1-py37h14c3975_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyzmq-18.0.1-py37hc4ba49a_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/sip-4.19.8-py37hf484d3e_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/six-1.12.0-py37_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/sortedcontainers-2.1.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/tblib-1.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/toolz-0.9.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/tornado-6.0.2-py37h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.1.7-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/cftime-1.0.3.4-py37hd352d35_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/cycler-0.10.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/cytoolz-0.9.0.1-py37h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/fasteners-0.14.1-py_3.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/git-2.21.0-pl526h2882143_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/jedi-0.13.3-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libnetcdf-4.6.2-hbdf4f91_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/numba-0.43.1-py37hf2d7682_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/numcodecs-0.6.3-py37hf484d3e_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/packaging-19.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/partd-0.3.9-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pexpect-4.7.0-py37_0.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/pillow-6.0.0-py37he7afcd5_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/qt-5.9.7-h52cfd70_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/scipy-1.3.0-py37hab63836_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/setuptools-41.0.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/traitlets-4.3.2-py37_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/zict-0.1.4-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/distributed-1.27.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jinja2-2.10.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jupyter_core-4.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.1.0-py37h5f35d83_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/netcdf4-1.5.1.2-py37had58050_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pandas-0.24.2-py37hf484d3e_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/patsy-0.5.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pygments-2.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyqt-5.9.2-py37hcca6a23_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/wheel-0.33.4-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/zarr-2.3.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/bokeh-1.1.0-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jupyter_client-5.2.4-py_3.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.1.0-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pip-19.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/prompt_toolkit-2.0.9-py_0.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/statsmodels-0.9.0-py37h3010b51_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/xarray-0.12.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/dask-1.2.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ipython-7.5.0-py37h24bf2e0_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/seaborn-0.9.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ipykernel-5.1.1-py37h24bf2e0_0.tar.bz2\n", + "\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%conda list --explicit" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [conda env:dask_jobqueue_workshop]", + "language": "python", + "name": "conda-env-dask_jobqueue_workshop-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_binder_notebooks/03_tuning_adaptive_clusters.ipynb b/_binder_notebooks/03_tuning_adaptive_clusters.ipynb new file mode 100644 index 0000000..d62d38c --- /dev/null +++ b/_binder_notebooks/03_tuning_adaptive_clusters.ipynb @@ -0,0 +1,391 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Monte-Carlo Estimate of $\\pi$\n", + "\n", + "We want to estimate the number $\\pi$ using a [Monte-Carlo method](https://en.wikipedia.org/wiki/Pi#Monte_Carlo_methods) exploiting that the area of a quarter circle of unit radius is $\\pi/4$ and that hence the probability of any randomly chosen point in a unit square to lie in a unit circle centerd at a corner of the unit square is $\\pi/4$ as well. 
So for N randomly chosen pairs $(x, y)$ with $x\\in[0, 1)$ and $y\\in[0, 1)$, we count the number $N_{circ}$ of pairs that also satisfy $(x^2 + y^2) < 1$ and estimate $\\pi \\approx 4 \\cdot N_{circ} / N$.\n", + "\n", + "[PI](https://en.wikipedia.org/wiki/Pi#Monte_Carlo_methods)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Core Lessons\n", + "\n", + "- Adaptive clusters\n", + "- Tuning the adaptivity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up a Slurm cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dask.distributed import Client\n", + "from dask_jobqueue import SLURMCluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster = SLURMCluster(\n", + " cores=24,\n", + " processes=2,\n", + " memory=\"100GB\",\n", + " shebang='#!/usr/bin/env bash',\n", + " queue=\"batch\",\n", + " walltime=\"00:30:00\",\n", + " local_directory='/tmp',\n", + " death_timeout=\"15s\",\n", + " interface=\"ib0\",\n", + " log_directory=f'{os.environ[\"SCRATCH_cecam\"]}/{os.environ[\"USER\"]}/dask_jobqueue_logs/',\n", + " project=\"ecam\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = Client(cluster)\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The job scripts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(cluster.job_script())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scale the cluster to two nodes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ 
+ "cluster.scale(4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The Monte Carlo Method" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.array as da\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_pi_mc(size_in_bytes, chunksize_in_bytes=200e6):\n", + " \"\"\"Calculate PI using a Monte Carlo estimate.\"\"\"\n", + " \n", + " size = int(size_in_bytes / 8)\n", + " chunksize = int(chunksize_in_bytes / 8)\n", + " \n", + " xy = da.random.uniform(0, 1,\n", + " size=(size / 2, 2),\n", + " chunks=(chunksize / 2, 2))\n", + " \n", + " in_circle = ((xy ** 2).sum(axis=-1) < 1)\n", + " pi = 4 * in_circle.mean()\n", + "\n", + " return pi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_pi_stats(size, pi, time_delta, num_workers):\n", + " \"\"\"Print pi, calculate offset from true value, and print some stats.\"\"\"\n", + " print(f\"{size / 1e9} GB\\n\"\n", + " f\"\\tMC pi: {pi : 13.11f}\"\n", + " f\"\\tErr: {abs(pi - np.pi) : 10.3e}\\n\"\n", + " f\"\\tWorkers: {num_workers}\"\n", + " f\"\\t\\tTime: {time_delta : 7.3f}s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The actual calculations\n", + "\n", + "We loop over different volumes of double-precision random numbers and estimate $\\pi$ as described above." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from time import time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for size in (1e9 * n for n in (1, 10, 100)):\n", + " \n", + " start = time()\n", + " pi = calc_pi_mc(size).compute()\n", + " elaps = time() - start\n", + "\n", + " print_pi_stats(size, pi, time_delta=elaps,\n", + " num_workers=len(cluster.scheduler.workers))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scaling the Cluster to twice its size\n", + "\n", + "We double the number of workers and then re-run the experiments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from time import sleep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_num_workers = 2 * len(cluster.scheduler.workers)\n", + "\n", + "print(f\"Scaling from {len(cluster.scheduler.workers)} to {new_num_workers} workers.\")\n", + "\n", + "cluster.scale(new_num_workers)\n", + "\n", + "sleep(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Re-run same experiments with doubled cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for size in (1e9 * n for n in (1, 10, 100)):\n", + " \n", + " \n", + " start = time()\n", + " pi = calc_pi_mc(size).compute()\n", + " elaps = time() - start\n", + "\n", + " print_pi_stats(size, pi,\n", + " time_delta=elaps,\n", + " num_workers=len(cluster.scheduler.workers))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Automatically scale the cluster towards a target duration\n", + "\n", + "We'll target
a wall time of 30 seconds.\n", + "\n", + "_**Watch** how the cluster will scale down to the minimum a few seconds after being made adaptive._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ca = cluster.adapt(\n", + " minimum=2, maximum=30,\n", + " target_duration=\"360s\", # measured in CPU time per worker\n", + " # -> 30 seconds at 12 cores / worker\n", + " scale_factor=1.0 # prevent from scaling up because of CPU or MEM need\n", + ");\n", + "\n", + "sleep(4) # Allow for scale-down" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Repeat the calculation from above with larger work loads\n", + "\n", + "(And watch the dash board!)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for size in (n * 1e9 for n in (200, 400, 800)):\n", + " \n", + " \n", + " start = time()\n", + " pi = calc_pi_mc(size, min(size / 1000, 500e6)).compute()\n", + " elaps = time() - start\n", + "\n", + " print_pi_stats(size, pi, time_delta=elaps,\n", + " num_workers=len(cluster.scheduler.workers))\n", + " \n", + " sleep(20) # allow for scale-down time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "- adaptivity with a target duration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complete listing of software used here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%conda list --explicit" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [conda env:dask_jobqueue_workshop]", + "language": 
"python", + "name": "conda-env-dask_jobqueue_workshop-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_binder_notebooks/04_resilient_clusters.ipynb b/_binder_notebooks/04_resilient_clusters.ipynb new file mode 100644 index 0000000..3196d8b --- /dev/null +++ b/_binder_notebooks/04_resilient_clusters.ipynb @@ -0,0 +1,893 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Monte-Carlo Estimate of $\\pi$\n", + "\n", + "We want to estimate the number $\\pi$ using a [Monte-Carlo method](https://en.wikipedia.org/wiki/Pi#Monte_Carlo_methods) exploiting that the area of a quarter circle of unit radius is $\\pi/4$ and that hence the probability of any randomly chosen point in a unit square to lie in a unit circle centered at a corner of the unit square is $\\pi/4$ as well. 
So for N randomly chosen pairs $(x, y)$ with $x\\in[0, 1)$ and $y\\in[0, 1)$, we count the number $N_{circ}$ of pairs that also satisfy $(x^2 + y^2) < 1$ and estimage $\\pi \\approx 4 \\cdot N_{circ} / N$.\n", + "\n", + "[\"PI](https://en.wikipedia.org/wiki/Pi#Monte_Carlo_methods)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Core Lessons\n", + "\n", + "- Resilience against dying workers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up a Slurm cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/p/project/cecam/rath1/miniconda3_20190521/envs/dask_jobqueue_workshop/lib/python3.7/site-packages/docrep/__init__.py:341: MatplotlibDeprecationWarning: \n", + "The dedent function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use inspect.cleandoc instead.\n", + " s = dedents('\\n' + '\\n'.join(lines[first:]))\n" + ] + } + ], + "source": [ + "from dask.distributed import Client\n", + "from dask_jobqueue import SLURMCluster" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "cluster = SLURMCluster(\n", + " cores=24,\n", + " processes=2,\n", + " memory=\"100GB\",\n", + " shebang='#!/usr/bin/env bash',\n", + " queue=\"batch\",\n", + " walltime=\"00:30:00\",\n", + " local_directory='/tmp',\n", + " death_timeout=\"15s\",\n", + " interface=\"ib0\",\n", + " log_directory=f'{os.environ[\"SCRATCH_cecam\"]}/{os.environ[\"USER\"]}/dask_jobqueue_logs/',\n", + " project=\"ecam\",\n", + " name=\"resilient_clusters\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 0
  • \n", + "
  • Cores: 0
  • \n", + "
  • Memory: 0 B
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client = Client(cluster)\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The job scripts" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#!/usr/bin/env bash\n", + "\n", + "#SBATCH -J resilient_clusters\n", + "#SBATCH -e /p/scratch/cecam/rath1/dask_jobqueue_logs//resilient_clusters-%J.err\n", + "#SBATCH -o /p/scratch/cecam/rath1/dask_jobqueue_logs//resilient_clusters-%J.out\n", + "#SBATCH -p batch\n", + "#SBATCH -A ecam\n", + "#SBATCH -n 1\n", + "#SBATCH --cpus-per-task=24\n", + "#SBATCH --mem=94G\n", + "#SBATCH -t 00:30:00\n", + "JOB_ID=${SLURM_JOB_ID%;*}\n", + "\n", + "\n", + "\n", + "/p/project/cecam/rath1/miniconda3_20190521/envs/dask_jobqueue_workshop/bin/python -m distributed.cli.dask_worker tcp://10.80.32.41:33451 --nthreads 12 --nprocs 2 --memory-limit 50.00GB --name resilient_clusters--${JOB_ID}-- --death-timeout 15s --local-directory /tmp --interface ib0\n", + "\n" + ] + } + ], + "source": [ + "print(cluster.job_script())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scale the cluster to two nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster.adapt(minimum=12, maximum=12) # will lead to six jobs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The Monte Carlo Method" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.array as da\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": 
{}, + "outputs": [], + "source": [ + "def calc_pi_mc(size_in_bytes, chunksize_in_bytes=200e6):\n", + " \"\"\"Calculate PI using a Monte Carlo estimate.\"\"\"\n", + " \n", + " size = int(size_in_bytes / 8)\n", + " chunksize = int(chunksize_in_bytes / 8)\n", + " \n", + " xy = da.random.uniform(0, 1,\n", + " size=(size / 2, 2),\n", + " chunks=(chunksize / 2, 2))\n", + " \n", + " in_circle = ((xy ** 2).sum(axis=-1) < 1)\n", + " pi = 4 * in_circle.mean()\n", + "\n", + " return pi" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def print_pi_stats(size, pi, time_delta, num_workers):\n", + " \"\"\"Print pi, calculate offset from true value, and print some stats.\"\"\"\n", + " print(f\"{size / 1e9} GB\\n\"\n", + " f\"\\tMC pi: {pi : 13.11f}\"\n", + " f\"\\tErr: {abs(pi - np.pi) : 10.3e}\\n\"\n", + " f\"\\tWorkers: {num_workers}\"\n", + " f\"\\t\\tTime: {time_delta : 7.3f}s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The actual calculations\n", + "\n", + "We loop over different volumes of double-precision random numbers and estimate $\\pi$ as described above." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from time import time" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0 GB\n", + "\tMC pi: 3.14145440000\tErr: 1.383e-04\n", + "\tWorkers: 6\t\tTime: 14.832s\n", + "10.0 GB\n", + "\tMC pi: 3.14162355200\tErr: 3.090e-05\n", + "\tWorkers: 6\t\tTime: 1.158s\n", + "100.0 GB\n", + "\tMC pi: 3.14158572480\tErr: 6.929e-06\n", + "\tWorkers: 12\t\tTime: 4.409s\n" + ] + } + ], + "source": [ + "for size in (1e9 * n for n in (1, 10, 100)):\n", + " \n", + " start = time()\n", + " pi = calc_pi_mc(size).compute()\n", + " elaps = time() - start\n", + "\n", + " print_pi_stats(size, pi, time_delta=elaps,\n", + " num_workers=len(cluster.scheduler.workers))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What happens if a worker dies?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll find out all \"our\" job ids, mark a few of them non-preemptible, filter for the preemptible jobs, and define a function to kill one randomly selected preemptible job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def get_current_jobs():\n", + " current_jobs = !squeue | grep R | grep $USER | grep resil | awk '{print $1}'\n", + " return current_jobs" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['7191395', '7191396']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "non_preemptible_jobs = get_current_jobs()[:2]\n", + "non_preemptible_jobs" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def get_preemptible_jobs():\n", + " return list(filter(lambda j: j not in non_preemptible_jobs, get_current_jobs()))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "def kill_random_preemptible_job():\n", + " preemptible_jobs = get_preemptible_jobs()\n", + " if preemptible_jobs:\n", + " worker_to_kill = random.choice(preemptible_jobs)\n", + " print(f\"will cancel job {worker_to_kill}\")\n", + " !scancel {worker_to_kill}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Let's start a computation with disappearing workers" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "pi = calc_pi_mc(1e12, 500e6)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Future: finalize status: pending, key: finalize-2a7825eef9b2726c91452b1374ee6eca" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pi = client.compute(pi)\n", + "pi" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + 
from time import sleep

# Give the computation a head start, then repeatedly remove a preemptible
# worker until the future has finished (or errored).
sleep(5)

while True:
    if pi.done():
        break
    kill_random_preemptible_job()
    sleep(10)

pi
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msync\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_result\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraiseit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 226\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"error\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 227\u001b[0;31m \u001b[0msix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 228\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"cancelled\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 229\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/p/project/cecam/rath1/miniconda3_20190521/envs/dask_jobqueue_workshop/lib/python3.7/site-packages/six.py\u001b[0m in \u001b[0;36mreraise\u001b[0;34m(tp, value, tb)\u001b[0m\n\u001b[1;32m 691\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__traceback__\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 692\u001b[0m \u001b[0;32mraise\u001b[0m 
\u001b[0mvalue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 693\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 694\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 695\u001b[0m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKilledWorker\u001b[0m: (\"('sum-sum-aggregate-uniform-mean_chunk-d15a005003e49b0f8054e3bbec8f3ce1', 1688)\", 'tcp://10.80.35.39:40738')" + ] + } + ], + "source": [ + "print(pi.result())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What happened?\n", + "\n", + "The Dask scheduler keeps a suspiciousness counter for each task it manages. Whenever a worker dies, all tasks that belong to the worker at the time of its death will have their suspiciousness increased by one. In doing so, the scheduler has no way of telling which exact task was responsible for the death of the worker and just flag all of them as bad.\n", + "\n", + "All tasks with suspiciousness `>= 3` (default) are considered bad and won't be rescheduled." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make dask more resilient\n", + "\n", + "We can increase the number of allowed failures. Let's practically disable the threshold and re-do the calculation." 
# Effectively disable the "suspicious task" threshold (default: 3) so tasks
# keep being rescheduled no matter how many workers die underneath them.
cluster.scheduler.allowed_failures = 1000

# NOTE(review): two-argument call -- see the note where calc_pi_mc was first
# used this way; this file's own calc_pi_mc takes a single size argument.
pi = calc_pi_mc(1e12, 500e6)

pi = client.compute(pi)
pi

# Repeat the experiment: keep killing preemptible workers while the
# computation runs.
sleep(5)

while True:
    if pi.done():
        break
    kill_random_preemptible_job()
    sleep(10)

pi

print(pi.result())
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complete listing of software used here" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Package Version \n", + "------------------ -----------------\n", + "asciitree 0.3.3 \n", + "aspy.yaml 1.2.0 \n", + "backcall 0.1.0 \n", + "bokeh 1.1.0 \n", + "certifi 2019.3.9 \n", + "cfgv 1.6.0 \n", + "cftime 1.0.3.4 \n", + "Click 7.0 \n", + "cloudpickle 1.0.0 \n", + "cycler 0.10.0 \n", + "cytoolz 0.9.0.1 \n", + "dask 1.2.0 \n", + "dask-jobqueue 0.4.1+32.g9c3371d\n", + "decorator 4.4.0 \n", + "distributed 1.27.1 \n", + "docrep 0.2.5 \n", + "fasteners 0.14.1 \n", + "heapdict 1.0.0 \n", + "identify 1.4.3 \n", + "importlib-metadata 0.13 \n", + "ipykernel 5.1.1 \n", + "ipython 7.5.0 \n", + "ipython-genutils 0.2.0 \n", + "jedi 0.13.3 \n", + "Jinja2 2.10.1 \n", + "jupyter-client 5.2.4 \n", + "jupyter-core 4.4.0 \n", + "kiwisolver 1.1.0 \n", + "llvmlite 0.28.0 \n", + "locket 0.2.0 \n", + "MarkupSafe 1.1.1 \n", + "matplotlib 3.1.0 \n", + "monotonic 1.5 \n", + "msgpack 0.6.1 \n", + "netCDF4 1.5.1.2 \n", + "nodeenv 1.3.3 \n", + "numba 0.43.1 \n", + "numcodecs 0.6.3 \n", + "numpy 1.16.3 \n", + "olefile 0.46 \n", + "packaging 19.0 \n", + "pandas 0.24.2 \n", + "parso 0.4.0 \n", + "partd 0.3.9 \n", + "patsy 0.5.1 \n", + "pexpect 4.7.0 \n", + "pickleshare 0.7.5 \n", + "Pillow 6.0.0 \n", + "pip 19.1 \n", + "pre-commit 1.16.1 \n", + "prompt-toolkit 2.0.9 \n", + "psutil 5.6.2 \n", + "ptyprocess 0.6.0 \n", + "Pygments 2.4.0 \n", + "pyparsing 2.4.0 \n", + "python-dateutil 2.8.0 \n", + "pytz 2019.1 \n", + "PyYAML 5.1 \n", + "pyzmq 18.0.1 \n", + "scipy 1.3.0 \n", + "seaborn 0.9.0 \n", + "setuptools 41.0.1 \n", + "six 1.12.0 \n", + "sortedcontainers 2.1.0 \n", + "statsmodels 0.9.0 \n", + "tblib 1.4.0 \n", + "toml 0.10.0 \n", + "toolz 0.9.0 \n", + "tornado 6.0.2 \n", + "traitlets 4.3.2 \n", + "virtualenv 16.6.0 
\n", + "wcwidth 0.1.7 \n", + "wheel 0.33.4 \n", + "xarray 0.12.1 \n", + "zarr 2.3.1 \n", + "zict 0.1.4 \n", + "zipp 0.5.1 \n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip list" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# This file may be used to create an environment using:\n", + "# $ conda create --name --file \n", + "# platform: linux-64\n", + "@EXPLICIT\n", + "https://conda.anaconda.org/conda-forge/linux-64/git-lfs-2.7.2-0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2019.3.9-hecc5488_0.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-8.2.0-hdf63c60_1.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.3.0-hdf63c60_0.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-8.2.0-hdf63c60_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.6-h14c3975_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/expat-2.2.5-hf484d3e_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/icu-58.2-hf484d3e_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/jpeg-9c-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libffi-3.2.1-he1b5a44_1006.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.15-h516909a_1005.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libsodium-1.0.16-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.32.1-h14c3975_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.1-hf484d3e_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/openblas-0.3.6-h6e990d7_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/openssl-1.1.1b-h14c3975_1.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/pcre-8.41-hf484d3e_1003.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/perl-5.26.2-h516909a_1006.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.9-h14c3975_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.3-h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.4-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/yaml-0.1.7-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.11-h14c3975_1004.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/gettext-0.19.8.1-hc5be6a0_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/hdf4-4.2.13-h9a582f1_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/hdf5-1.10.4-nompi_h3c11f04_1106.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libblas-3.8.0-10_openblas.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20170329-hf8c457e_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.37-hed695b0_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.8.2-h22169c7_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.0.10-h648cc4a_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.13-h14c3975_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.9.9-h13577e0_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/readline-7.0-hf8c457e_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.9-h84994c4_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/zeromq-4.3.1-hf484d3e_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/freetype-2.10.0-he983fc9_0.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/glib-2.58.3-hf63aee3_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/krb5-1.16.3-h05b26f9_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.8.0-10_openblas.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.8.0-10_openblas.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.26.0-h67949de_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/dbus-1.13.6-he372182_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.13.1-he4413a7_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/gstreamer-1.14.4-h66beb1c_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libcurl-7.64.1-hda55be3_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/python-3.7.3-h5b0a415_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/asciitree-0.3.3-py_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/backcall-0.1.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/certifi-2019.3.9-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/click-7.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/cloudpickle-1.0.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/curl-7.64.1-hf8cf82a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/dask-core-1.2.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/decorator-4.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.14.4-hdf3bae2_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/heapdict-1.0.0-py37_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.1.0-py37hc9558a2_0.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/llvmlite-0.28.0-py37hdbcaa40_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/locket-0.2.0-py_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/markupsafe-1.1.1-py37h14c3975_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/monotonic-1.5-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-0.6.1-py37h6bb024c_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/numpy-1.16.3-py37he5ce36f_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/parso-0.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pickleshare-0.7.5-py37_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/psutil-5.6.2-py37h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.6.0-py_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pyparsing-2.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pytz-2019.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyyaml-5.1-py37h14c3975_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyzmq-18.0.1-py37hc4ba49a_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/sip-4.19.8-py37hf484d3e_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/six-1.12.0-py37_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/sortedcontainers-2.1.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/tblib-1.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/toolz-0.9.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/tornado-6.0.2-py37h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.1.7-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/cftime-1.0.3.4-py37hd352d35_1001.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/noarch/cycler-0.10.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/cytoolz-0.9.0.1-py37h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/fasteners-0.14.1-py_3.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/git-2.21.0-pl526h2882143_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/jedi-0.13.3-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libnetcdf-4.6.2-hbdf4f91_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/numba-0.43.1-py37hf2d7682_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/numcodecs-0.6.3-py37hf484d3e_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/packaging-19.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/partd-0.3.9-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pexpect-4.7.0-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pillow-6.0.0-py37he7afcd5_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/qt-5.9.7-h52cfd70_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/scipy-1.3.0-py37hab63836_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/setuptools-41.0.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/traitlets-4.3.2-py37_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/zict-0.1.4-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/distributed-1.27.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jinja2-2.10.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jupyter_core-4.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.1.0-py37h5f35d83_0.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/netcdf4-1.5.1.2-py37had58050_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pandas-0.24.2-py37hf484d3e_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/patsy-0.5.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pygments-2.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyqt-5.9.2-py37hcca6a23_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/wheel-0.33.4-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/zarr-2.3.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/bokeh-1.1.0-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jupyter_client-5.2.4-py_3.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.1.0-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pip-19.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/prompt_toolkit-2.0.9-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/statsmodels-0.9.0-py37h3010b51_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/xarray-0.12.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/dask-1.2.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ipython-7.5.0-py37h24bf2e0_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/seaborn-0.9.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ipykernel-5.1.1-py37h24bf2e0_0.tar.bz2\n", + "\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%conda list --explicit" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [conda env:dask_jobqueue_workshop]", + "language": "python", + "name": "conda-env-dask_jobqueue_workshop-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_binder_notebooks/05_cluster_size_and_distributed_IO.ipynb b/_binder_notebooks/05_cluster_size_and_distributed_IO.ipynb new file mode 100644 index 0000000..224ab86 --- /dev/null +++ b/_binder_notebooks/05_cluster_size_and_distributed_IO.ipynb @@ -0,0 +1,793 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed IO and cluster sizes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Core Lesson\n", + "\n", + "In a distributed file system, spreading IO to as many nodes as possible might speed up your computations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up a Slurm cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/p/project/cecam/rath1/miniconda3_20190521/envs/dask_jobqueue_workshop/lib/python3.7/site-packages/docrep/__init__.py:341: MatplotlibDeprecationWarning: \n", + "The dedent function was deprecated in Matplotlib 3.1 and will be removed in 3.3. 
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

import os

# Where Slurm should write the workers' stdout/stderr files.
log_directory = f'{os.environ["SCRATCH_cecam"]}/{os.environ["USER"]}/dask_jobqueue_logs/'

# One Slurm job = one 24-core batch node running two worker processes
# (12 threads each) with 50 GB of memory per process.
cluster = SLURMCluster(
    cores=24,
    processes=2,
    memory="100GB",
    shebang='#!/usr/bin/env bash',
    queue="batch",
    walltime="00:30:00",
    local_directory='/tmp',
    death_timeout="15s",
    interface="ib0",
    log_directory=log_directory,
    project="ecam",
    name="resilient_clusters")
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 0
  • \n", + "
  • Cores: 0
  • \n", + "
  • Memory: 0 B
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client = Client(cluster)\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The job scripts" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#!/usr/bin/env bash\n", + "\n", + "#SBATCH -J resilient_clusters\n", + "#SBATCH -e /p/scratch/cecam/rath1/dask_jobqueue_logs//resilient_clusters-%J.err\n", + "#SBATCH -o /p/scratch/cecam/rath1/dask_jobqueue_logs//resilient_clusters-%J.out\n", + "#SBATCH -p batch\n", + "#SBATCH -A ecam\n", + "#SBATCH -n 1\n", + "#SBATCH --cpus-per-task=24\n", + "#SBATCH --mem=94G\n", + "#SBATCH -t 00:30:00\n", + "JOB_ID=${SLURM_JOB_ID%;*}\n", + "\n", + "\n", + "\n", + "/p/project/cecam/rath1/miniconda3_20190521/envs/dask_jobqueue_workshop/bin/python -m distributed.cli.dask_worker tcp://10.80.32.41:39894 --nthreads 12 --nprocs 2 --memory-limit 50.00GB --name resilient_clusters--${JOB_ID}-- --death-timeout 15s --local-directory /tmp --interface ib0\n", + "\n" + ] + } + ], + "source": [ + "print(cluster.job_script())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scale the cluster to a single node" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.scale(2) # will lead to one node" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create some data and dump it to disk" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from dask import array as da" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dask.array" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + 
} + ], + "source": [ + "random_numbers = da.random.uniform(0, 1, size=(1/8 * 500e9, 1), chunks=(1/8 * 200e6, 1))\n", + "random_numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf temporary_random_numbers.zarr/" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 31.7 s, sys: 2.19 s, total: 33.9 s\n", + "Wall time: 3min 26s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "random_numbers.to_zarr(\"temporary_random_numbers.zarr\");" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "500.0 GB\n" + ] + } + ], + "source": [ + "print(random_numbers.nbytes / 1e9, \"GB\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "406G\ttemporary_random_numbers.zarr\n" + ] + } + ], + "source": [ + "!du -sh temporary_random_numbers.zarr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Re-read and average" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dask.array" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reread_random_numbers = da.from_zarr(\"temporary_random_numbers.zarr/\")\n", + "reread_random_numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "500.0 GB\n" + ] + } + ], + "source": [ + "print(reread_random_numbers.nbytes / 1e9, \"GB\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "0.49999963422700916\n", + "CPU times: user 31.5 s, sys: 2.34 s, total: 33.9 s\n", + "Wall time: 1min 46s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "print(reread_random_numbers.mean().compute())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scale the cluster to four nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.scale(8) # will lead to four nodes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Re-read and average again" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dask.array" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reread_random_numbers = da.from_zarr(\"temporary_random_numbers.zarr/\")\n", + "reread_random_numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "500.0 GB\n" + ] + } + ], + "source": [ + "print(reread_random_numbers.nbytes / 1e9, \"GB\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.49999963422700916\n", + "CPU times: user 16.6 s, sys: 1.25 s, total: 17.8 s\n", + "Wall time: 23.5 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "print(reread_random_numbers.mean().compute())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How far can we take this?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "cluster.scale(40) # will lead to 20 nodes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Re-read and average again" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dask.array" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reread_random_numbers = da.from_zarr(\"temporary_random_numbers.zarr/\")\n", + "reread_random_numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "500.0 GB\n" + ] + } + ], + "source": [ + "print(reread_random_numbers.nbytes / 1e9, \"GB\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.49999963422700916\n", + "CPU times: user 8.24 s, sys: 579 ms, total: 8.82 s\n", + "Wall time: 9.11 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "print(reread_random_numbers.mean().compute())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "We have seen that IO bandwidth increases with the number of nodes we use for the computation. This is obvious on one hand, but it tells us that from an analytics perspective where a lot of data lying around on disk is analysed, spreading out resources to as many nodes as possible may be the way to go." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf temporary_random_numbers.zarr/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complete listing of software used here" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Package Version \n", + "------------------ -----------------\n", + "asciitree 0.3.3 \n", + "aspy.yaml 1.2.0 \n", + "backcall 0.1.0 \n", + "bokeh 1.1.0 \n", + "certifi 2019.3.9 \n", + "cfgv 1.6.0 \n", + "cftime 1.0.3.4 \n", + "Click 7.0 \n", + "cloudpickle 1.0.0 \n", + "cycler 0.10.0 \n", + "cytoolz 0.9.0.1 \n", + "dask 1.2.0 \n", + "dask-jobqueue 0.4.1+32.g9c3371d\n", + "decorator 4.4.0 \n", + "distributed 1.27.1 \n", + "docrep 0.2.5 \n", + "fasteners 0.14.1 \n", + "heapdict 1.0.0 \n", + "identify 1.4.3 \n", + "importlib-metadata 0.13 \n", + "ipykernel 5.1.1 \n", + "ipython 7.5.0 \n", + "ipython-genutils 0.2.0 \n", + "jedi 0.13.3 \n", + "Jinja2 2.10.1 \n", + "jupyter-client 5.2.4 \n", + "jupyter-core 4.4.0 \n", + "kiwisolver 1.1.0 \n", + "llvmlite 0.28.0 \n", + "locket 0.2.0 \n", + "MarkupSafe 1.1.1 \n", + "matplotlib 3.1.0 \n", + "monotonic 1.5 \n", + "msgpack 0.6.1 \n", + "netCDF4 1.5.1.2 \n", + "nodeenv 1.3.3 \n", + "numba 0.43.1 \n", + "numcodecs 0.6.3 \n", + "numpy 1.16.3 \n", + "olefile 0.46 \n", + "packaging 19.0 \n", + "pandas 0.24.2 \n", + "parso 0.4.0 \n", + "partd 0.3.9 \n", + "patsy 0.5.1 \n", + "pexpect 4.7.0 \n", + "pickleshare 0.7.5 \n", + "Pillow 6.0.0 \n", + "pip 19.1 \n", + "pre-commit 1.16.1 \n", + "prompt-toolkit 2.0.9 \n", + "psutil 5.6.2 \n", + "ptyprocess 0.6.0 \n", + "Pygments 2.4.0 \n", + "pyparsing 2.4.0 \n", + "python-dateutil 2.8.0 \n", + "pytz 2019.1 \n", + "PyYAML 5.1 \n", + "pyzmq 18.0.1 \n", + "scipy 1.3.0 \n", + "seaborn 0.9.0 \n", + "setuptools 41.0.1 \n", + "six 1.12.0 \n", + "sortedcontainers 2.1.0 \n", + 
"statsmodels 0.9.0 \n", + "tblib 1.4.0 \n", + "toml 0.10.0 \n", + "toolz 0.9.0 \n", + "tornado 6.0.2 \n", + "traitlets 4.3.2 \n", + "virtualenv 16.6.0 \n", + "wcwidth 0.1.7 \n", + "wheel 0.33.4 \n", + "xarray 0.12.1 \n", + "zarr 2.3.1 \n", + "zict 0.1.4 \n", + "zipp 0.5.1 \n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip list" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# This file may be used to create an environment using:\n", + "# $ conda create --name --file \n", + "# platform: linux-64\n", + "@EXPLICIT\n", + "https://conda.anaconda.org/conda-forge/linux-64/git-lfs-2.7.2-0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2019.3.9-hecc5488_0.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-8.2.0-hdf63c60_1.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.3.0-hdf63c60_0.tar.bz2\n", + "https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-8.2.0-hdf63c60_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.6-h14c3975_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/expat-2.2.5-hf484d3e_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/icu-58.2-hf484d3e_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/jpeg-9c-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libffi-3.2.1-he1b5a44_1006.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.15-h516909a_1005.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libsodium-1.0.16-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.32.1-h14c3975_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.1-hf484d3e_1002.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/openblas-0.3.6-h6e990d7_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/openssl-1.1.1b-h14c3975_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pcre-8.41-hf484d3e_1003.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/perl-5.26.2-h516909a_1006.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.9-h14c3975_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.3-h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.4-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/yaml-0.1.7-h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.11-h14c3975_1004.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/gettext-0.19.8.1-hc5be6a0_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/hdf4-4.2.13-h9a582f1_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/hdf5-1.10.4-nompi_h3c11f04_1106.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libblas-3.8.0-10_openblas.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20170329-hf8c457e_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.37-hed695b0_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.8.2-h22169c7_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.0.10-h648cc4a_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.13-h14c3975_1002.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.9.9-h13577e0_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/readline-7.0-hf8c457e_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.9-h84994c4_1001.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/zeromq-4.3.1-hf484d3e_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/freetype-2.10.0-he983fc9_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/glib-2.58.3-hf63aee3_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/krb5-1.16.3-h05b26f9_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.8.0-10_openblas.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.8.0-10_openblas.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.26.0-h67949de_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/dbus-1.13.6-he372182_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.13.1-he4413a7_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/gstreamer-1.14.4-h66beb1c_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libcurl-7.64.1-hda55be3_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/python-3.7.3-h5b0a415_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/asciitree-0.3.3-py_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/backcall-0.1.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/certifi-2019.3.9-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/click-7.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/cloudpickle-1.0.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/curl-7.64.1-hf8cf82a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/dask-core-1.2.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/decorator-4.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.14.4-hdf3bae2_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/heapdict-1.0.0-py37_1000.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.1.0-py37hc9558a2_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/llvmlite-0.28.0-py37hdbcaa40_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/locket-0.2.0-py_2.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/markupsafe-1.1.1-py37h14c3975_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/monotonic-1.5-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-0.6.1-py37h6bb024c_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/numpy-1.16.3-py37he5ce36f_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/parso-0.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pickleshare-0.7.5-py37_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/psutil-5.6.2-py37h516909a_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.6.0-py_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pyparsing-2.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pytz-2019.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyyaml-5.1-py37h14c3975_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyzmq-18.0.1-py37hc4ba49a_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/sip-4.19.8-py37hf484d3e_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/six-1.12.0-py37_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/sortedcontainers-2.1.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/tblib-1.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/toolz-0.9.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/tornado-6.0.2-py37h516909a_0.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.1.7-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/cftime-1.0.3.4-py37hd352d35_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/cycler-0.10.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/cytoolz-0.9.0.1-py37h14c3975_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/fasteners-0.14.1-py_3.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/git-2.21.0-pl526h2882143_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/jedi-0.13.3-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/libnetcdf-4.6.2-hbdf4f91_1001.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/numba-0.43.1-py37hf2d7682_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/numcodecs-0.6.3-py37hf484d3e_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/packaging-19.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/partd-0.3.9-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pexpect-4.7.0-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pillow-6.0.0-py37he7afcd5_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/qt-5.9.7-h52cfd70_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/scipy-1.3.0-py37hab63836_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/setuptools-41.0.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/traitlets-4.3.2-py37_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/zict-0.1.4-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/distributed-1.27.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jinja2-2.10.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jupyter_core-4.4.0-py_0.tar.bz2\n", + 
"https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.1.0-py37h5f35d83_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/netcdf4-1.5.1.2-py37had58050_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pandas-0.24.2-py37hf484d3e_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/patsy-0.5.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/pygments-2.4.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pyqt-5.9.2-py37hcca6a23_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/wheel-0.33.4-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/zarr-2.3.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/bokeh-1.1.0-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/jupyter_client-5.2.4-py_3.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.1.0-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/pip-19.1-py37_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/prompt_toolkit-2.0.9-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/statsmodels-0.9.0-py37h3010b51_1000.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/xarray-0.12.1-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/dask-1.2.0-py_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ipython-7.5.0-py37h24bf2e0_0.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/noarch/seaborn-0.9.0-py_1.tar.bz2\n", + "https://conda.anaconda.org/conda-forge/linux-64/ipykernel-5.1.1-py37h24bf2e0_0.tar.bz2\n", + "\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%conda list --explicit" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [conda env:dask_jobqueue_workshop]", + "language": "python", + "name": "conda-env-dask_jobqueue_workshop-py" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/binder/Dockerfile b/binder/Dockerfile new file mode 100644 index 0000000..6984f01 --- /dev/null +++ b/binder/Dockerfile @@ -0,0 +1,34 @@ +FROM giovtorres/docker-centos7-slurm + +RUN yum install -y sudo + +RUN curl -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash miniconda.sh -f -b -p /opt/anaconda && \ + /opt/anaconda/bin/conda clean -tipy && \ + rm -f miniconda.sh +ENV PATH /opt/anaconda/bin:$PATH + +RUN conda install --yes -c conda-forge python=3.6 dask distributed dask-jobqueue docrep jupyterlab ipywidgets + +COPY binder/slurm.conf /etc/slurm/slurm.conf + +COPY binder/my-docker-entrypoint.sh /usr/local/bin/my-docker-entrypoint.sh +ENTRYPOINT ["/usr/local/bin/my-docker-entrypoint.sh"] + +# create user with a home directory +ARG NB_USER +ARG NB_UID +ENV USER ${NB_USER} +ENV HOME /home/${NB_USER} + +RUN useradd -m --home-dir ${HOME} \ + --uid ${NB_UID} \ + ${NB_USER} + +RUN usermod -aG wheel ${NB_USER} +RUN echo '%wheel ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +WORKDIR ${HOME} +USER ${NB_USER} + +COPY _binder_notebooks/*.ipynb ${HOME}/ diff --git a/binder/my-docker-entrypoint.sh b/binder/my-docker-entrypoint.sh new file mode 100755 index 0000000..f70450b --- /dev/null +++ b/binder/my-docker-entrypoint.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +sudo /usr/local/bin/docker-entrypoint.sh +sudo chown ${USER}:${USER} ${HOME}/*ipynb + +exec "$@" diff --git a/binder/slurm.conf b/binder/slurm.conf new file mode 100644 index 0000000..4a13bbb --- /dev/null +++ b/binder/slurm.conf @@ -0,0 +1,94 @@ +# slurm.conf +# +# See the slurm.conf man page for more information. 
+# +ClusterName=linux +ControlMachine=localhost +#ControlAddr= +#BackupController= +#BackupAddr= +# +SlurmUser=slurm +#SlurmdUser=root +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +#JobCredentialPrivateKey= +#JobCredentialPublicCertificate= +StateSaveLocation=/var/lib/slurmd +SlurmdSpoolDir=/var/spool/slurmd +SwitchType=switch/none +MpiDefault=none +SlurmctldPidFile=/var/run/slurmd/slurmctld.pid +SlurmdPidFile=/var/run/slurmd/slurmd.pid +ProctrackType=proctrack/pgid +#PluginDir= +CacheGroups=0 +#FirstJobId= +ReturnToService=0 +#MaxJobCount= +#PlugStackConfig= +#PropagatePrioProcess= +#PropagateResourceLimits= +#PropagateResourceLimitsExcept= +#Prolog= +#Epilog= +#SrunProlog= +#SrunEpilog= +#TaskProlog= +#TaskEpilog= +#TaskPlugin= +#TrackWCKey=no +#TreeWidth=50 +#TmpFS= +#UsePAM= +# +# TIMERS +SlurmctldTimeout=300 +SlurmdTimeout=300 +InactiveLimit=0 +MinJobAge=300 +KillWait=30 +Waittime=0 +# +# SCHEDULING +SchedulerType=sched/backfill +#SchedulerAuth= +#SchedulerPort= +#SchedulerRootFilter= +SelectType=select/cons_res +SelectTypeParameters=CR_CPU_Memory +FastSchedule=1 +#PriorityType=priority/multifactor +#PriorityDecayHalfLife=14-0 +#PriorityUsageResetPeriod=14-0 +#PriorityWeightFairshare=100000 +#PriorityWeightAge=1000 +#PriorityWeightPartition=10000 +#PriorityWeightJobSize=1000 +#PriorityMaxAge=1-0 +# +# LOGGING +SlurmctldDebug=3 +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdDebug=3 +SlurmdLogFile=/var/log/slurm/slurmd.log +JobCompType=jobcomp/none +#JobCompLoc= +# +# ACCOUNTING +#JobAcctGatherType=jobacct_gather/linux +#JobAcctGatherFrequency=30 +# +AccountingStorageType=accounting_storage/slurmdbd +#AccountingStorageHost=localhost +#AccountingStorageLoc= +#AccountingStoragePass= +#AccountingStorageUser= +# +# COMPUTE NODES +GresTypes=gpu +NodeName=c[1-10] NodeHostName=localhost NodeAddr=127.0.0.1 RealMemory=2000 +# +# PARTITIONS +PartitionName=normal Default=yes Nodes=c[1-5] Priority=50 DefMemPerCPU=200 Shared=YES MaxNodes=2 
MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP diff --git a/environment.yml b/environment.yml deleted file mode 100644 index b1d8c0e..0000000 --- a/environment.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: dask_jobqueue_workshop -channels: - - conda-forge -dependencies: - - python=3 - - bokeh - - dask - - distributed - - git - - git-lfs - - hdf5 - - ipykernel - - matplotlib - - netCDF4 - - numba - - numpy - - pandas - - pip - - seaborn - - tornado - - xarray - - zarr - - pip: - - git+https://github.com/dask/dask-jobqueue@9c3371d17d943dd44103225c814b8ee042513a93