diff --git a/nessie-stack/Dockerfile b/nessie-stack/Dockerfile
new file mode 100644
index 0000000000000..c81bb8d2b45ad
--- /dev/null
+++ b/nessie-stack/Dockerfile
@@ -0,0 +1,34 @@
+FROM jupyter/pyspark-notebook:latest
+
+USER root
+
+# Install AWS Hadoop integration for S3 (s3a:// filesystem support)
+RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -P /usr/local/spark/jars/ && \
+    wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar -P /usr/local/spark/jars/
+
+# Install Iceberg and Nessie dependencies for Spark 3.5 (Iceberg 1.8.1);
+# iceberg-aws-bundle supplies the AWS SDK v2 classes that S3FileIO requires
+RUN wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.8.1/iceberg-spark-runtime-3.5_2.12-1.8.1.jar -P /usr/local/spark/jars/ && \
+    wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/1.8.1/iceberg-aws-bundle-1.8.1.jar -P /usr/local/spark/jars/ && \
+    wget https://repo1.maven.org/maven2/org/projectnessie/nessie-integrations/nessie-spark-extensions-3.5_2.12/0.102.5/nessie-spark-extensions-3.5_2.12-0.102.5.jar -P /usr/local/spark/jars/
+
+# Install Python packages (keep pyiceberg in step with the Iceberg JAR version)
+RUN pip install pyiceberg==0.9.0 pynessie==0.67.0
+
+# Create a directory for init scripts
+RUN mkdir -p /usr/local/bin/start-notebook.d
+
+# Create init script to configure the Spark driver environment.
+# printf (not echo) writes real newlines, and the double-quoted argument
+# lets the embedded single quotes through without broken escaping.
+RUN printf '%s\n' '#!/bin/bash' \
+      'export PYSPARK_DRIVER_PYTHON=jupyter' \
+      "export PYSPARK_DRIVER_PYTHON_OPTS=\"lab --NotebookApp.token='' --NotebookApp.password=''\"" \
+      > /usr/local/bin/start-notebook.d/spark-config.sh && \
+    chmod +x /usr/local/bin/start-notebook.d/spark-config.sh
+
+USER $NB_UID
+
+# Working directory for sparkmagic, plus the default Spark configuration
+RUN mkdir -p $HOME/.sparkmagic
+COPY --chown=$NB_UID:$NB_GID spark-defaults.conf /usr/local/spark/conf/spark-defaults.conf
\ No newline at end of file
diff --git a/nessie-stack/config_notebook.sh b/nessie-stack/config_notebook.sh
new file mode 100755
index 0000000000000..e69de29bb2d1d
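Before wiring the image into the stack below, it is worth confirming that the JARs downloaded by the RUN steps actually landed where Spark loads them. A minimal sanity-check sketch, run inside the built image (the `/usr/local/spark/jars` path is the one the Dockerfile downloads into; the image name is whatever you tagged the build with):

```python
# Sanity check: run inside the built image, e.g.
#   docker run --rm -it <your-image-tag> python3
# Confirms the JARs from the Dockerfile RUN steps are on Spark's classpath.
from pathlib import Path

jars = [p.name for p in Path("/usr/local/spark/jars").glob("*.jar")]
for needle in ("iceberg-spark-runtime", "iceberg-aws-bundle",
               "nessie-spark-extensions", "hadoop-aws"):
    hits = [j for j in jars if needle in j]
    print(f"{needle}: {hits if hits else 'MISSING'}")
```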
diff --git a/nessie-stack/docker-compose.yml b/nessie-stack/docker-compose.yml
new file mode 100644
index 0000000000000..d80db3800a27a
--- /dev/null
+++ b/nessie-stack/docker-compose.yml
@@ -0,0 +1,170 @@
+version: '3.8'
+
+services:
+  minio:
+    image: minio/minio:latest
+    container_name: minio
+    ports:
+      - "9000:9000"
+      - "9001:9001"
+    environment:
+      MINIO_ROOT_USER: minioadmin
+      MINIO_ROOT_PASSWORD: minioadmin
+    volumes:
+      - minio_data:/data
+    command: server /data --console-address ":9001"
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+      interval: 30s
+      timeout: 20s
+      retries: 3
+    networks:
+      - app-network
+
+  createbuckets:
+    image: minio/mc:latest
+    depends_on:
+      - minio
+    entrypoint: >
+      /bin/sh -c "
+      until /usr/bin/mc alias set myminio http://minio:9000 minioadmin minioadmin; do sleep 1; done;
+      /usr/bin/mc mb myminio/nessie;
+      /usr/bin/mc mb myminio/spark;
+      exit 0;
+      "
+    networks:
+      - app-network
+
+  postgres:
+    image: postgres:14
+    container_name: postgres
+    ports:
+      - "5432:5432"
+    environment:
+      POSTGRES_USER: nessie
+      POSTGRES_PASSWORD: nessie
+      POSTGRES_DB: nessie
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+    networks:
+      - app-network
+
+  nessie:
+    image: projectnessie/nessie:latest
+    container_name: nessie
+    depends_on:
+      - postgres
+    ports:
+      - "19120:19120"
+    environment:
+      QUARKUS_PROFILE: postgresql
+      NESSIE_VERSION_STORE_TYPE: JDBC
+      QUARKUS_DATASOURCE_USERNAME: nessie
+      QUARKUS_DATASOURCE_PASSWORD: nessie
+      QUARKUS_DATASOURCE_JDBC_URL: jdbc:postgresql://postgres:5432/nessie
+    networks:
+      - app-network
+
+  spark-master:
+    image: bitnami/spark:latest
+    container_name: spark-master
+    environment:
+      - SPARK_MODE=master
+      - SPARK_RPC_AUTHENTICATION_ENABLED=no
+      - SPARK_RPC_ENCRYPTION_ENABLED=no
+      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
+      - SPARK_SSL_ENABLED=no
+    ports:
+      - "8080:8080"
+      - "7077:7077"
+    networks:
+      - app-network
+
+  spark-worker:
+    image: bitnami/spark:latest
+    container_name: spark-worker
+    depends_on:
+      - spark-master
+    environment:
+      - SPARK_MODE=worker
+      - SPARK_MASTER_URL=spark://spark-master:7077
+      - SPARK_WORKER_MEMORY=2G
+      - SPARK_WORKER_CORES=2
+      - SPARK_RPC_AUTHENTICATION_ENABLED=no
+      - SPARK_RPC_ENCRYPTION_ENABLED=no
+      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
+      - SPARK_SSL_ENABLED=no
+    ports:
+      - "8081:8081"
+    networks:
+      - app-network
+
+  spark-thrift-server:
+    image: bitnami/spark:latest
+    container_name: spark-thrift-server
+    depends_on:
+      - spark-master
+      - nessie
+      - minio
+    ports:
+      - "10000:10000" # Thrift JDBC/ODBC server
+      - "4040:4040"   # Spark UI
+    environment:
+      - SPARK_MODE=master
+      - SPARK_MASTER_URL=spark://spark-master:7077
+    command: >
+      bash -c "
+      /opt/bitnami/spark/sbin/start-thriftserver.sh
+      --master spark://spark-master:7077
+      --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-aws-bundle:1.8.1,org.apache.hadoop:hadoop-aws:3.3.4
+      --hiveconf hive.server2.thrift.port=10000
+      --hiveconf hive.server2.thrift.bind.host=0.0.0.0
+      --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+      --conf spark.sql.catalog.iceberg=org.apache.iceberg.spark.SparkCatalog
+      --conf spark.sql.catalog.iceberg.type=nessie
+      --conf spark.sql.catalog.iceberg.uri=http://nessie:19120/api/v1
+      --conf spark.sql.catalog.iceberg.ref=main
+      --conf spark.sql.catalog.iceberg.io-impl=org.apache.iceberg.aws.s3.S3FileIO
+      --conf spark.sql.catalog.iceberg.s3.endpoint=http://minio:9000
+      --conf spark.sql.catalog.iceberg.s3.path-style-access=true
+      --conf spark.sql.catalog.iceberg.s3.access-key-id=minioadmin
+      --conf spark.sql.catalog.iceberg.s3.secret-access-key=minioadmin
+      --conf spark.sql.catalog.iceberg.warehouse=s3a://nessie
+      --conf spark.hadoop.fs.s3a.access.key=minioadmin
+      --conf spark.hadoop.fs.s3a.secret.key=minioadmin
+      --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000
+      --conf spark.hadoop.fs.s3a.path.style.access=true
+      --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
+      && tail -f /opt/bitnami/spark/logs/*thriftserver*.out
+      "
+    volumes:
+      - spark_logs:/opt/bitnami/spark/logs
+    networks:
+      - app-network
+
+  jupyter:
+    build:
+      context: .
+    container_name: jupyter
+    depends_on:
+      - spark-master
+      - nessie
+      - minio
+    ports:
+      - "8888:8888"
+    environment:
+      JUPYTER_ENABLE_LAB: "yes"
+    volumes:
+      - jupyter_notebooks:/home/jovyan/work
+    networks:
+      - app-network
+
+networks:
+  app-network:
+    driver: bridge
+
+volumes:
+  minio_data:
+  postgres_data:
+  spark_logs:
+  jupyter_notebooks:
\ No newline at end of file
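A quick way to verify the stack after `docker compose up -d` is to probe the published endpoints from the host. A minimal smoke-test sketch, assuming the `requests` package is available locally; the MinIO URL is the same liveness endpoint the compose healthcheck curls, and `/api/v1/trees` is Nessie's list-references call (a fresh server should report just `main`):

```python
# Smoke test for the compose stack; run from the host after `docker compose up -d`.
import requests

# MinIO liveness (same endpoint as the compose healthcheck)
minio = requests.get("http://localhost:9000/minio/health/live", timeout=5)
print("minio:", minio.status_code)  # expect 200

# Nessie v1 API: list references; a fresh install has only the main branch
trees = requests.get("http://localhost:19120/api/v1/trees", timeout=5)
trees.raise_for_status()
print("nessie refs:", [ref["name"] for ref in trees.json()["references"]])
```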
diff --git a/nessie-stack/spark-defaults.conf b/nessie-stack/spark-defaults.conf
new file mode 100644
index 0000000000000..ba87d3d6a7c35
--- /dev/null
+++ b/nessie-stack/spark-defaults.conf
@@ -0,0 +1,20 @@
+# Default Spark configuration
+spark.master spark://spark-master:7077
+spark.driver.memory 1g
+spark.executor.memory 1g
+spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions
+spark.sql.catalog.iceberg org.apache.iceberg.spark.SparkCatalog
+spark.sql.catalog.iceberg.type nessie
+spark.sql.catalog.iceberg.uri http://nessie:19120/api/v1
+spark.sql.catalog.iceberg.ref main
+spark.sql.catalog.iceberg.io-impl org.apache.iceberg.aws.s3.S3FileIO
+spark.sql.catalog.iceberg.s3.endpoint http://minio:9000
+spark.sql.catalog.iceberg.s3.path-style-access true
+spark.sql.catalog.iceberg.s3.access-key-id minioadmin
+spark.sql.catalog.iceberg.s3.secret-access-key minioadmin
+spark.sql.catalog.iceberg.warehouse s3a://nessie
+spark.hadoop.fs.s3a.access.key minioadmin
+spark.hadoop.fs.s3a.secret.key minioadmin
+spark.hadoop.fs.s3a.endpoint http://minio:9000
+spark.hadoop.fs.s3a.path.style.access true
+spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
\ No newline at end of file
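With spark-defaults.conf baked into the Jupyter image, a notebook session picks up the `iceberg` catalog and the Nessie SQL extensions automatically. A short end-to-end sketch (the namespace, table, and `dev` branch names are illustrative; the `CREATE BRANCH` / `USE REFERENCE` statements come from the Nessie Spark extensions configured above):

```python
# Run in a notebook inside the jupyter service; spark-defaults.conf supplies
# the catalog, extensions, and MinIO credentials configured above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("nessie-demo").getOrCreate()

spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg.demo")
spark.sql("""CREATE TABLE IF NOT EXISTS iceberg.demo.events
             (id BIGINT, msg STRING) USING iceberg""")
spark.sql("INSERT INTO iceberg.demo.events VALUES (1, 'on main')")

# Nessie SQL extensions: branch the whole catalog, then write only on the branch
spark.sql("CREATE BRANCH IF NOT EXISTS dev IN iceberg FROM main")
spark.sql("USE REFERENCE dev IN iceberg")
spark.sql("INSERT INTO iceberg.demo.events VALUES (2, 'only on dev')")
spark.sql("SELECT * FROM iceberg.demo.events ORDER BY id").show()
```

The same catalog is visible through the Thrift server on port 10000, so any JDBC client pointed at it should see tables committed on `main`.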