{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "introduction",
   "metadata": {},
   "source": [
    "# Armada Spark Example\n",
    "\n",
    "This notebook demonstrates how to run Spark jobs on Armada using PySpark in client mode."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "imports",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "import random\n",
    "import secrets\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark import SparkConf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "setup-section",
   "metadata": {},
   "source": [
    "## Setup\n",
    "\n",
    "Clean up any existing Spark context and configure the environment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "stop-existing-context",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop any SparkContext left over from a previous run\n",
    "try:\n",
    "    from pyspark import SparkContext\n",
    "    if SparkContext._active_spark_context:\n",
    "        SparkContext._active_spark_context.stop()\n",
    "except Exception:\n",
    "    pass"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "config-section",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "Set up connection parameters and locate the Armada Spark JAR file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "configuration",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration\n",
    "auth_token = os.environ.get('ARMADA_AUTH_TOKEN')\n",
    "auth_script_path = os.environ.get('ARMADA_AUTH_SCRIPT_PATH')\n",
    "driver_host = os.environ.get('SPARK_DRIVER_HOST')\n",
    "driver_port = os.environ.get('SPARK_DRIVER_PORT', '7078')\n",
    "block_manager_port = os.environ.get('SPARK_BLOCK_MANAGER_PORT', '10061')\n",
    "armada_master = os.environ.get('ARMADA_MASTER', 'local://armada://host.docker.internal:30002')\n",
    "armada_queue = os.environ.get('ARMADA_QUEUE', 'default')\n",
    "armada_namespace = os.environ.get('ARMADA_NAMESPACE', 'default')\n",
    "image_name = os.environ.get('IMAGE_NAME', 'spark:armada')\n",
    "event_watcher_use_tls = os.environ.get('ARMADA_EVENT_WATCHER_USE_TLS', 'false')\n",
    "\n",
    "# Find the JAR - try common Scala versions (2.12, 2.13)\n",
    "jar_paths = glob.glob('/opt/spark/jars/armada-cluster-manager_2.1*-*-all.jar')\n",
    "if not jar_paths:\n",
    "    raise FileNotFoundError(\"Armada Spark JAR not found!\")\n",
    "armada_jar = jar_paths[0]\n",
    "\n",
    "# Generate a unique app ID, required in client mode\n",
    "app_id = f\"jupyter-spark-{secrets.token_hex(3)}\""
   ]
  },
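  {
   "cell_type": "markdown",
   "id": "config-check-section",
   "metadata": {},
   "source": [
    "Before building the Spark configuration, it can help to echo the resolved settings. This is an optional sanity check added for illustration; it only prints the values derived above and makes no calls to Armada."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Echo the resolved settings (sanity check only; nothing is submitted yet)\n",
    "print(f\"Armada master:   {armada_master}\")\n",
    "print(f\"Queue/namespace: {armada_queue}/{armada_namespace}\")\n",
    "print(f\"Container image: {image_name}\")\n",
    "print(f\"Driver endpoint: {driver_host}:{driver_port}\")\n",
    "print(f\"Armada JAR:      {armada_jar}\")\n",
    "print(f\"Application ID:  {app_id}\")"
   ]
  },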
  {
   "cell_type": "markdown",
   "id": "spark-config-section",
   "metadata": {},
   "source": [
    "## Spark Configuration\n",
    "\n",
    "Configure Spark to use Armada as the cluster manager in client mode."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "spark-config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spark configuration\n",
    "conf = SparkConf()\n",
    "if auth_token:\n",
    "    conf.set(\"spark.armada.auth.token\", auth_token)\n",
    "if auth_script_path:\n",
    "    conf.set(\"spark.armada.auth.script.path\", auth_script_path)\n",
    "if not driver_host:\n",
    "    raise ValueError(\"SPARK_DRIVER_HOST environment variable is required.\")\n",
    "conf.set(\"spark.master\", armada_master)\n",
    "conf.set(\"spark.submit.deployMode\", \"client\")\n",
    "conf.set(\"spark.app.id\", app_id)\n",
    "conf.set(\"spark.app.name\", \"jupyter-spark-pi\")\n",
    "conf.set(\"spark.driver.bindAddress\", \"0.0.0.0\")\n",
    "conf.set(\"spark.driver.host\", driver_host)\n",
    "conf.set(\"spark.driver.port\", driver_port)\n",
    "conf.set(\"spark.driver.blockManager.port\", block_manager_port)\n",
    "conf.set(\"spark.home\", \"/opt/spark\")\n",
    "conf.set(\"spark.armada.container.image\", image_name)\n",
    "conf.set(\"spark.armada.queue\", armada_queue)\n",
    "conf.set(\"spark.armada.scheduling.namespace\", armada_namespace)\n",
    "conf.set(\"spark.armada.eventWatcher.useTls\", event_watcher_use_tls)\n",
    "conf.set(\"spark.kubernetes.file.upload.path\", \"/tmp\")\n",
    "conf.set(\"spark.kubernetes.executor.disableConfigMap\", \"true\")\n",
    "conf.set(\"spark.local.dir\", \"/tmp\")\n",
    "conf.set(\"spark.jars\", armada_jar)\n",
    "\n",
    "# Network timeouts\n",
    "conf.set(\"spark.network.timeout\", \"800s\")\n",
    "conf.set(\"spark.executor.heartbeatInterval\", \"60s\")\n",
    "\n",
    "# Static allocation - tune these values for your environment\n",
    "conf.set(\"spark.executor.instances\", \"2\")\n",
    "conf.set(\"spark.armada.driver.limit.memory\", \"1Gi\")\n",
    "conf.set(\"spark.armada.driver.request.memory\", \"1Gi\")\n",
    "conf.set(\"spark.armada.executor.limit.memory\", \"1Gi\")\n",
    "conf.set(\"spark.armada.executor.request.memory\", \"1Gi\")"
   ]
  },
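  {
   "cell_type": "markdown",
   "id": "inspect-conf-section",
   "metadata": {},
   "source": [
    "Optionally, inspect the assembled configuration before creating the session. This illustrative cell uses `SparkConf.getAll()`, which returns the key/value pairs set above, so typos in config keys surface before any job is submitted."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "inspect-conf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Review the assembled configuration (keys set above, sorted for readability)\n",
    "for key, value in sorted(conf.getAll()):\n",
    "    print(f\"{key} = {value}\")"
   ]
  },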
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "create-spark-session",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the SparkSession\n",
    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
    "print(\"SparkSession created\")"
   ]
  },
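  {
   "cell_type": "markdown",
   "id": "session-info-section",
   "metadata": {},
   "source": [
    "A quick look at the live session can confirm the connection. This illustrative cell reads standard PySpark properties (`spark.version`, the application ID, and the master URL) and submits no work to the cluster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "session-info",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Basic session details, useful when debugging connectivity\n",
    "print(f\"Spark version:  {spark.version}\")\n",
    "print(f\"Application ID: {spark.sparkContext.applicationId}\")\n",
    "print(f\"Master:         {spark.sparkContext.master}\")"
   ]
  },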
  {
   "cell_type": "markdown",
   "id": "examples-section",
   "metadata": {},
   "source": [
    "## Examples\n",
    "\n",
    "Run Spark computations on the Armada cluster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "spark-pi-calculation",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spark Pi: sample random points in the unit square and count hits inside the quarter circle\n",
    "print(\"Running Spark Pi calculation...\")\n",
    "n = 10000\n",
    "\n",
    "def inside(p):\n",
    "    x, y = random.random(), random.random()\n",
    "    return x * x + y * y < 1\n",
    "\n",
    "count = spark.sparkContext.parallelize(range(0, n)).filter(inside).count()\n",
    "pi = 4.0 * count / n\n",
    "print(f\"Pi is approximately: {pi}\")"
   ]
  },
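  {
   "cell_type": "markdown",
   "id": "dataframe-example-section",
   "metadata": {},
   "source": [
    "The same executors can also serve DataFrame workloads. The cell below is a minimal illustrative sketch, not part of the original example: it builds a small DataFrame with `spark.range` and runs a simple aggregation on the Armada-backed executors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dataframe-example",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Small DataFrame aggregation; runs on the same Armada-backed executors\n",
    "df = spark.range(0, 1000)\n",
    "df.selectExpr(\"count(*) AS n\", \"sum(id) AS total\", \"avg(id) AS mean\").show()"
   ]
  },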
  {
   "cell_type": "markdown",
   "id": "cleanup-section",
   "metadata": {},
   "source": [
    "## Cleanup\n",
    "\n",
    "Stop the Spark context to release resources. This will stop the executors in Armada."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "stop-spark-context",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop the Spark context\n",
    "print(\"Stopping Spark context...\")\n",
    "spark.stop()\n",
    "print(\"Spark context stopped successfully\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}