27 changes: 27 additions & 0 deletions nessie-stack/Dockerfile
@@ -0,0 +1,27 @@
FROM jupyter/pyspark-notebook:latest

USER root

# Install AWS Hadoop integration for S3
RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -P /usr/local/spark/jars/ && \
wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar -P /usr/local/spark/jars/

# Install Iceberg (1.8.1) and Nessie Spark extensions (0.102.5) for Spark 3.5 / Scala 2.12
RUN wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.8.1/iceberg-spark-runtime-3.5_2.12-1.8.1.jar -P /usr/local/spark/jars/ && \
wget https://repo1.maven.org/maven2/org/projectnessie/nessie-integrations/nessie-spark-extensions-3.5_2.12/0.102.5/nessie-spark-extensions-3.5_2.12-0.102.5.jar -P /usr/local/spark/jars/

# Install Python client libraries (pyiceberg is versioned independently of the Iceberg JARs)
RUN pip install pyiceberg==0.9.0 pynessie==0.67.0

# Create a directory for init scripts
RUN mkdir -p /usr/local/bin/start-notebook.d

# Create init script to configure the Spark driver for Jupyter.
# printf interprets \n portably; the original echo relied on shell-specific
# escape handling and had broken single-quote escaping.
RUN printf '#!/bin/bash\nexport PYSPARK_DRIVER_PYTHON=jupyter\nexport PYSPARK_DRIVER_PYTHON_OPTS="lab --NotebookApp.token= --NotebookApp.password="\n' > /usr/local/bin/start-notebook.d/spark-config.sh && \
    chmod +x /usr/local/bin/start-notebook.d/spark-config.sh

USER $NB_UID

# Create a config dir for sparkmagic and install a default spark-defaults.conf
RUN mkdir -p $HOME/.sparkmagic
COPY --chown=$NB_UID:$NB_GID spark-defaults.conf /usr/local/spark/conf/spark-defaults.conf
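
As a quick smoke test, something like the following can be run from a notebook in this image once the stack is up. It is a minimal sketch: the catalog name "iceberg" comes from the spark-defaults.conf installed above, while the "demo" namespace and "events" table are purely illustrative.

from pyspark.sql import SparkSession

# spark-defaults.conf already wires the "iceberg" catalog to Nessie/MinIO,
# so a plain session is enough here.
spark = SparkSession.builder.appName("nessie-smoke-test").getOrCreate()

spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg.demo")
spark.sql(
    "CREATE TABLE IF NOT EXISTS iceberg.demo.events "
    "(id BIGINT, name STRING) USING iceberg"
)
spark.sql("INSERT INTO iceberg.demo.events VALUES (1, 'hello'), (2, 'world')")
spark.sql("SELECT * FROM iceberg.demo.events").show()

If the table lists and the rows come back, the JARs, the Nessie catalog, and the MinIO warehouse are all wired up correctly.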
Empty file added nessie-stack/config_notebook.sh
168 changes: 168 additions & 0 deletions nessie-stack/docker-compose.yml
@@ -0,0 +1,168 @@
version: '3.8'

services:
  minio:
    image: minio/minio:latest
    container_name: minio
    ports:
      - "9000:9000"
      - "9001:9001"
    environment:
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin
    volumes:
      - minio_data:/data
    command: server /data --console-address ":9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3
    networks:
      - app-network

  createbuckets:
    image: minio/mc:latest
    depends_on:
      - minio
    entrypoint: >
      /bin/sh -c "
      sleep 5;
      /usr/bin/mc alias set myminio http://minio:9000 minioadmin minioadmin;
      /usr/bin/mc mb myminio/nessie;
      /usr/bin/mc mb myminio/spark;
      exit 0;
      "
    networks:
      - app-network

  postgres:
    image: postgres:14
    container_name: postgres
    ports:
      - "5432:5432"
    environment:
      POSTGRES_USER: nessie
      POSTGRES_PASSWORD: nessie
      POSTGRES_DB: nessie
    volumes:
      - postgres_data:/var/lib/postgresql/data
    networks:
      - app-network

  nessie:
    image: projectnessie/nessie:latest
    container_name: nessie
    depends_on:
      - postgres
    ports:
      - "19120:19120"
    environment:
      QUARKUS_PROFILE: postgresql
      NESSIE_VERSION_STORE_TYPE: jdbc
      QUARKUS_DATASOURCE_USERNAME: nessie
      QUARKUS_DATASOURCE_PASSWORD: nessie
      QUARKUS_DATASOURCE_JDBC_URL: jdbc:postgresql://postgres:5432/nessie
    networks:
      - app-network

  spark-master:
    image: bitnami/spark:latest
    container_name: spark-master
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - "8080:8080"
      - "7077:7077"
    networks:
      - app-network

  spark-worker:
    image: bitnami/spark:latest
    container_name: spark-worker
    depends_on:
      - spark-master
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=2G
      - SPARK_WORKER_CORES=2
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
    ports:
      - "8081:8081"
    networks:
      - app-network

  spark-thrift-server:
    image: bitnami/spark:latest
    container_name: spark-thrift-server
    depends_on:
      - spark-master
      - nessie
      - minio
    ports:
      - "10000:10000" # Thrift JDBC/ODBC server
      - "4040:4040"   # Spark UI
    environment:
      - SPARK_MODE=master
      - SPARK_MASTER_URL=spark://spark-master:7077
    # The folded scalar (>) joins these lines with spaces, so trailing
    # backslashes must not be used here. --packages pulls the Iceberg and
    # Nessie jars (same versions as the Jupyter image), which the bitnami
    # image does not ship.
    command: >
      bash -c "/opt/bitnami/spark/sbin/start-thriftserver.sh
      --master spark://spark-master:7077
      --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.102.5
      --hiveconf hive.server2.thrift.port=10000
      --hiveconf hive.server2.thrift.bind.host=0.0.0.0
      --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions
      --conf spark.sql.catalog.iceberg=org.apache.iceberg.spark.SparkCatalog
      --conf spark.sql.catalog.iceberg.type=nessie
      --conf spark.sql.catalog.iceberg.uri=http://nessie:19120/api/v1
      --conf spark.sql.catalog.iceberg.ref=main
      --conf spark.sql.catalog.iceberg.io-impl=org.apache.iceberg.aws.s3.S3FileIO
      --conf spark.sql.catalog.iceberg.s3.endpoint=http://minio:9000
      --conf spark.sql.catalog.iceberg.s3.path-style-access=true
      --conf spark.sql.catalog.iceberg.s3.access-key-id=minioadmin
      --conf spark.sql.catalog.iceberg.s3.secret-access-key=minioadmin
      --conf spark.sql.catalog.iceberg.warehouse=s3a://nessie
      --conf spark.hadoop.fs.s3a.access.key=minioadmin
      --conf spark.hadoop.fs.s3a.secret.key=minioadmin
      --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000
      --conf spark.hadoop.fs.s3a.path.style.access=true
      --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
      && tail -f /opt/bitnami/spark/logs/*thriftserver*.out"
    volumes:
      - spark_logs:/opt/bitnami/spark/logs
    networks:
      - app-network

  jupyter:
    build:
      context: .
    container_name: jupyter
    depends_on:
      - spark-master
      - nessie
      - minio
    ports:
      - "8888:8888"
    environment:
      JUPYTER_ENABLE_LAB: "yes"
    volumes:
      - jupyter_notebooks:/home/jovyan/work
    networks:
      - app-network

networks:
  app-network:
    driver: bridge

volumes:
  minio_data:
  postgres_data:
  spark_logs:
  jupyter_notebooks:
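
Once the stack is up (docker compose up -d --build), the Nessie extensions expose Git-style branching directly in Spark SQL. A minimal sketch of that workflow, run from the jupyter service; the "etl" branch name and the demo table are illustrative, and assume the table from the earlier smoke test exists:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("nessie-branching").getOrCreate()

# Branch DDL is provided by the Nessie Spark SQL extensions configured in
# spark-defaults.conf; "etl" is an arbitrary example branch.
spark.sql("CREATE BRANCH IF NOT EXISTS etl IN iceberg FROM main")
spark.sql("USE REFERENCE etl IN iceberg")
spark.sql("INSERT INTO iceberg.demo.events VALUES (3, 'staged')")

# The insert stays invisible on main until the branch is merged back.
spark.sql("MERGE BRANCH etl INTO main IN iceberg")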
18 changes: 18 additions & 0 deletions nessie-stack/spark-defaults.conf
@@ -0,0 +1,18 @@
# Default Spark configuration
spark.master spark://spark-master:7077
spark.driver.memory 1g
spark.executor.memory 1g
spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions
spark.sql.catalog.iceberg org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.iceberg.type nessie
spark.sql.catalog.iceberg.uri http://nessie:19120/api/v1
spark.sql.catalog.iceberg.ref main
spark.sql.catalog.iceberg.io-impl org.apache.iceberg.aws.s3.S3FileIO
spark.sql.catalog.iceberg.s3.endpoint http://minio:9000
spark.sql.catalog.iceberg.s3.path-style-access true
spark.sql.catalog.iceberg.s3.access-key-id minioadmin
spark.sql.catalog.iceberg.s3.secret-access-key minioadmin
spark.sql.catalog.iceberg.warehouse s3a://nessie
spark.hadoop.fs.s3a.access.key minioadmin
spark.hadoop.fs.s3a.secret.key minioadmin
spark.hadoop.fs.s3a.endpoint http://minio:9000
spark.hadoop.fs.s3a.path.style.access true
spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
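
For contexts where this file is not on the classpath (e.g. a spark-submit launched outside the image), the same catalog can be configured on the session builder. A sketch mirroring the values above; "iceberg" is the catalog name this file defines:

from pyspark.sql import SparkSession

# Session-level equivalent of spark-defaults.conf; values mirror the file.
spark = (
    SparkSession.builder
    .appName("nessie-session")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "nessie")
    .config("spark.sql.catalog.iceberg.uri", "http://nessie:19120/api/v1")
    .config("spark.sql.catalog.iceberg.ref", "main")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.iceberg.s3.path-style-access", "true")
    .config("spark.sql.catalog.iceberg.s3.access-key-id", "minioadmin")
    .config("spark.sql.catalog.iceberg.s3.secret-access-key", "minioadmin")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://nessie")
    .getOrCreate()
)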