Commit 1742d60

Add jupyter (#87)
Closes G-Research/spark#128

Signed-off-by: Sudipto Baral <sudiptobaral.me@gmail.com>

1 parent ba16e94 commit 1742d60

File tree

7 files changed: 402 additions & 0 deletions


.gitignore

Lines changed: 3 additions & 0 deletions
```diff
@@ -80,3 +80,6 @@ scripts/armadactl
 e2e-test.log
 extraJars/*.jar
 scripts/.tmp/
+
+# Jupyter
+example/jupyter/workspace/
```

README.md

Lines changed: 18 additions & 0 deletions
```diff
@@ -151,3 +151,21 @@ The project includes a ready-to-use Spark job to test your setup:
 This job leverages the same configuration parameters (`ARMADA_MASTER`, `ARMADA_QUEUE`, `ARMADA_LOOKOUT_URL`) as the `scripts/config.sh` script.
 
 Use the -h option to see what other options are available.
+
+### Jupyter Notebook
+
+The Docker image includes Jupyter support. Run Jupyter with the example notebooks:
+
+```bash
+./scripts/runJupyter.sh
+```
+
+**Note:** The Docker image must be built with `INCLUDE_PYTHON=true` for Jupyter to work.
+
+This will start a Jupyter notebook server at `http://localhost:8888` (or the port specified by `JUPYTER_PORT` in `scripts/config.sh`).
+The example notebooks from `example/jupyter/notebooks` are mounted in the container at `/home/spark/workspace/notebooks`.
+
+**Configuration:**
+- **Required:** `SPARK_DRIVER_HOST`
+- Override the Jupyter port if required by setting `JUPYTER_PORT` in `scripts/config.sh`
+- The script uses the same configuration (`ARMADA_MASTER`, `ARMADA_QUEUE`, `SPARK_DRIVER_HOST`, etc.) as other scripts
```
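
For reference, the Jupyter workflow above reads its settings from `scripts/config.sh`. The contents of that script are not part of this diff, so the following is only an illustrative sketch of the variables the README refers to:

```bash
# Illustrative scripts/config.sh entries (not from this commit) -- adjust to your cluster.
export ARMADA_MASTER="local://armada://host.docker.internal:30002"  # Armada master URL
export ARMADA_QUEUE="default"                                        # Armada queue to submit to
export SPARK_DRIVER_HOST="host.docker.internal"                      # required: address executors use to reach the driver
export JUPYTER_PORT=8888                                             # optional: port published for the notebook server
export INCLUDE_PYTHON=true                                           # build-time flag: include Python/Jupyter in the image
```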

docker/Dockerfile

Lines changed: 39 additions & 0 deletions
```diff
@@ -20,10 +20,13 @@ ARG spark_base_image_tag=3.3.3-scala2.12-java11-ubuntu
 FROM ${spark_base_image_prefix}:${spark_base_image_tag}
 
 ARG scala_binary_version=2.13
+ARG spark_version=3.3.3
+ARG include_python=false
 
 COPY target/armada-cluster-manager_${scala_binary_version}-*-all.jar /opt/spark/jars/
 COPY extraFiles /opt/spark/extraFiles
 COPY extraJars/* /opt/spark/jars
+COPY docker/jupyter-entrypoint.sh /opt/spark/bin/jupyter-entrypoint.sh
 
 
 USER 0
@@ -34,5 +37,41 @@ RUN mkdir -p /opt/spark/coreJars && \
 
 ENV SPARK_DIST_CLASSPATH=/opt/spark/coreJars/*
 
+# Install Jupyter, PySpark, and Python dependencies (only if include_python is true)
+RUN if [ "$include_python" = "true" ]; then \
+        apt-get update && \
+        apt-get install -y python3-pip && \
+        pip3 install --no-cache-dir \
+            jupyter \
+            notebook \
+            ipykernel \
+            pyspark==${spark_version} && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*; \
+    fi
+
+
+RUN if [ "$include_python" = "true" ]; then \
+        mkdir -p /home/spark/workspace && \
+        mkdir -p /home/spark/.local/share/jupyter && \
+        mkdir -p /home/spark/.jupyter && \
+        chown -R 185:185 /home/spark/workspace && \
+        chown -R 185:185 /home/spark/.local && \
+        chown -R 185:185 /home/spark/.jupyter; \
+    fi && \
+    chmod +x /opt/spark/bin/jupyter-entrypoint.sh
+
 ARG spark_uid=185
 USER ${spark_uid}
+
+# Install ipykernel (only if include_python is true)
+RUN if [ "$include_python" = "true" ]; then \
+        HOME=/home/spark python3 -m ipykernel install --user --name python3 --display-name "Python 3"; \
+    fi
+
+ENV HOME=/home/spark
+ENV SPARK_HOME=/opt/spark
+ENV PYSPARK_PYTHON=python3
+ENV PYSPARK_DRIVER_PYTHON=python3
+ENV PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-*src.zip
+ENV JUPYTER_RUNTIME_DIR=/home/spark/.local/share/jupyter/runtime
```
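
The image is normally built through `scripts/createImage.sh` (see the diff further below), which forwards these build arguments. As a rough, hand-rolled equivalent run from the repository root, assuming the cluster-manager JAR has already been built into `target/` (the versions and tag below are placeholders):

```bash
# Hypothetical manual build; scripts/createImage.sh is the supported path.
docker build \
  --build-arg scala_binary_version=2.13 \
  --build-arg spark_version=3.3.3 \
  --build-arg include_python=true \
  -f docker/Dockerfile \
  -t spark:armada \
  .
```

With `include_python=false` (the default) the pip, workspace, and ipykernel layers are skipped entirely, so the image stays close to its previous size.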

docker/jupyter-entrypoint.sh

Lines changed: 11 additions & 0 deletions
```diff
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+cd /home/spark/workspace
+
+exec jupyter notebook \
+    --ip=0.0.0.0 \
+    --port=8888 \
+    --no-browser \
+    --NotebookApp.token='' \
+    --NotebookApp.password='' \
+    --NotebookApp.notebook_dir=/home/spark/workspace
```
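
The entrypoint binds on all interfaces with the token and password disabled, so the notebook server is only suitable for local, trusted use. `scripts/runJupyter.sh` (not shown in this diff) is the intended way to start it; a rough manual equivalent, where every flag is an assumption based on the README and Dockerfile rather than the actual script, might look like:

```bash
# Assumed manual invocation -- the real runJupyter.sh may differ.
docker run --rm \
  -p "${JUPYTER_PORT:-8888}:8888" \
  -v "$PWD/example/jupyter/notebooks:/home/spark/workspace/notebooks" \
  -e SPARK_DRIVER_HOST="$SPARK_DRIVER_HOST" \
  --entrypoint /opt/spark/bin/jupyter-entrypoint.sh \
  spark:armada
```
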
Lines changed: 236 additions & 0 deletions
```diff
@@ -0,0 +1,236 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "introduction",
+   "metadata": {},
+   "source": [
+    "# Armada Spark Example\n",
+    "\n",
+    "This notebook demonstrates how to run Spark jobs on Armada using PySpark in client mode."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "imports",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import glob\n",
+    "import subprocess\n",
+    "import random\n",
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark import SparkConf"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "setup-section",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "Clean up any existing Spark context and configure the environment."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "stop-existing-context",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    from pyspark import SparkContext\n",
+    "    if SparkContext._active_spark_context:\n",
+    "        SparkContext._active_spark_context.stop()\n",
+    "except:\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "config-section",
+   "metadata": {},
+   "source": [
+    "## Configuration\n",
+    "\n",
+    "Set up connection parameters and locate the Armada Spark JAR file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "configuration",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Configuration\n",
+    "auth_token = os.environ.get('ARMADA_AUTH_TOKEN')\n",
+    "auth_script_path = os.environ.get('ARMADA_AUTH_SCRIPT_PATH')\n",
+    "driver_host = os.environ.get('SPARK_DRIVER_HOST')\n",
+    "driver_port = os.environ.get('SPARK_DRIVER_PORT', '7078')\n",
+    "block_manager_port = os.environ.get('SPARK_BLOCK_MANAGER_PORT', '10061')\n",
+    "armada_master = os.environ.get('ARMADA_MASTER', 'local://armada://host.docker.internal:30002')\n",
+    "armada_queue = os.environ.get('ARMADA_QUEUE', 'default')\n",
+    "armada_namespace = os.environ.get('ARMADA_NAMESPACE', 'default')\n",
+    "image_name = os.environ.get('IMAGE_NAME', 'spark:armada')\n",
+    "event_watcher_use_tls = os.environ.get('ARMADA_EVENT_WATCHER_USE_TLS', 'false')\n",
+    "\n",
+    "# Find JAR - try common Scala versions (2.12, 2.13)\n",
+    "jar_paths = glob.glob('/opt/spark/jars/armada-cluster-manager_2.1*-*-all.jar')\n",
+    "if not jar_paths:\n",
+    "    raise FileNotFoundError(\"Armada Spark JAR not found!\")\n",
+    "armada_jar = jar_paths[0]\n",
+    "\n",
+    "# Generate app ID, required for client mode\n",
+    "app_id = f\"jupyter-spark-{subprocess.check_output(['openssl', 'rand', '-hex', '3']).decode().strip()}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "spark-config-section",
+   "metadata": {},
+   "source": [
+    "## Spark Configuration\n",
+    "\n",
+    "Configure Spark to use Armada as the cluster manager in client mode."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "spark-config",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Spark Configuration\n",
+    "conf = SparkConf()\n",
+    "if auth_token:\n",
+    "    conf.set(\"spark.armada.auth.token\", auth_token)\n",
+    "if auth_script_path:\n",
+    "    conf.set(\"spark.armada.auth.script.path\", auth_script_path)\n",
+    "if not driver_host:\n",
+    "    raise ValueError(\n",
+    "        \"SPARK_DRIVER_HOST environment variable is required. \"\n",
+    "    )\n",
+    "conf.set(\"spark.master\", armada_master)\n",
+    "conf.set(\"spark.submit.deployMode\", \"client\")\n",
+    "conf.set(\"spark.app.id\", app_id)\n",
+    "conf.set(\"spark.app.name\", \"jupyter-spark-pi\")\n",
+    "conf.set(\"spark.driver.bindAddress\", \"0.0.0.0\")\n",
+    "conf.set(\"spark.driver.host\", driver_host)\n",
+    "conf.set(\"spark.driver.port\", driver_port)\n",
+    "conf.set(\"spark.driver.blockManager.port\", block_manager_port)\n",
+    "conf.set(\"spark.home\", \"/opt/spark\")\n",
+    "conf.set(\"spark.armada.container.image\", image_name)\n",
+    "conf.set(\"spark.armada.queue\", armada_queue)\n",
+    "conf.set(\"spark.armada.scheduling.namespace\", armada_namespace)\n",
+    "conf.set(\"spark.armada.eventWatcher.useTls\", event_watcher_use_tls)\n",
+    "conf.set(\"spark.kubernetes.file.upload.path\", \"/tmp\")\n",
+    "conf.set(\"spark.kubernetes.executor.disableConfigMap\", \"true\")\n",
+    "conf.set(\"spark.local.dir\", \"/tmp\")\n",
+    "conf.set(\"spark.jars\", armada_jar)\n",
+    "\n",
+    "# Network timeouts\n",
+    "conf.set(\"spark.network.timeout\", \"800s\")\n",
+    "conf.set(\"spark.executor.heartbeatInterval\", \"60s\")\n",
+    "\n",
+    "# Static mode - tune these values for your environment\n",
+    "conf.set(\"spark.executor.instances\", \"2\")\n",
+    "conf.set(\"spark.armada.driver.limit.memory\", \"1Gi\")\n",
+    "conf.set(\"spark.armada.driver.request.memory\", \"1Gi\")\n",
+    "conf.set(\"spark.armada.executor.limit.memory\", \"1Gi\")\n",
+    "conf.set(\"spark.armada.executor.request.memory\", \"1Gi\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "create-spark-session",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create SparkSession\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+    "print(f\"SparkSession created\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "examples-section",
+   "metadata": {},
+   "source": [
+    "## Examples\n",
+    "\n",
+    "Run Spark computations on the Armada cluster."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "spark-pi-calculation",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Spark Pi calculation\n",
+    "print(f\"Running Spark Pi calculation...\")\n",
+    "n = 10000\n",
+    "\n",
+    "def inside(p):\n",
+    "    x, y = random.random(), random.random()\n",
+    "    return x*x + y*y < 1\n",
+    "\n",
+    "count = spark.sparkContext.parallelize(range(0, n)).filter(inside).count()\n",
+    "pi = 4.0 * count / n\n",
+    "print(f\" Pi is approximately: {pi}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cleanup-section",
+   "metadata": {},
+   "source": [
+    "## Cleanup\n",
+    "\n",
+    "Stop the Spark context to release resources. This will stop the executors in Armada."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "stop-spark-context",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Stop Spark context\n",
+    "print(\"Stopping Spark context...\")\n",
+    "spark.stop()\n",
+    "print(\"Spark context stopped successfully\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
```

scripts/createImage.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -59,6 +59,7 @@ docker build \
   --build-arg spark_base_image_prefix=$image_prefix \
   --build-arg spark_base_image_tag=$image_tag \
   --build-arg scala_binary_version=$SCALA_BIN_VERSION \
+  --build-arg spark_version=$SPARK_VERSION \
   --build-arg include_python=$INCLUDE_PYTHON \
   -f "$root/docker/Dockerfile" \
   "$root"
```
