diff --git a/models/kanban.go b/models/kanban.go new file mode 100644 index 0000000000000..727262fd9cca8 --- /dev/null +++ b/models/kanban.go @@ -0,0 +1,30 @@ +// models/kanban.go +package models + +type KanbanBoard struct { + ID int64 `xorm:"pk autoincr"` + RepoID int64 `xorm:"INDEX NOT NULL"` + Title string `xorm:"NOT NULL"` + CreatedAt int64 `xorm:"created"` + UpdatedAt int64 `xorm:"updated"` +} + +type KanbanColumn struct { + ID int64 `xorm:"pk autoincr"` + BoardID int64 `xorm:"INDEX NOT NULL"` + Title string `xorm:"NOT NULL"` + Order int `xorm:"NOT NULL DEFAULT 0"` + CreatedAt int64 `xorm:"created"` + UpdatedAt int64 `xorm:"updated"` +} + +type KanbanCard struct { + ID int64 `xorm:"pk autoincr"` + ColumnID int64 `xorm:"INDEX NOT NULL"` + IssueID int64 `xorm:"INDEX"` + Title string `xorm:"NOT NULL"` + Description string `xorm:"TEXT"` + Order int `xorm:"NOT NULL DEFAULT 0"` + CreatedAt int64 `xorm:"created"` + UpdatedAt int64 `xorm:"updated"` +} diff --git a/nessie-stack/docker-compose.yml b/nessie-stack/docker-compose.yml new file mode 100644 index 0000000000000..50ecba16a14da --- /dev/null +++ b/nessie-stack/docker-compose.yml @@ -0,0 +1,69 @@ +version: '3.8' + +services: + minio: + image: quay.io/minio/minio:latest + container_name: minio + command: server /data --console-address ":9001" + environment: + MINIO_ROOT_USER: minioadmin + MINIO_ROOT_PASSWORD: minioadmin + ports: + - "9000:9000" + - "9001:9001" + volumes: + - minio_data:/data + networks: + - common_network + + minio-client: + image: minio/mc + depends_on: + - minio + entrypoint: > + /bin/sh -c " + sleep 5; + mc alias set local http://minio:9000 minioadmin minioadmin; + mc mb local/warehouse; + exit 0; + " + networks: + - common_network + + nessie: + image: projectnessie/nessie:latest + container_name: nessie + ports: + - "19120:19120" + environment: + QUARKUS_HTTP_PORT: 19120 + networks: + - common_network + + spark-iceberg: + container_name: spark-iceberg + build: spark/ + depends_on: + - nessie + - minio + volumes: + - ./warehouse:/home/iceberg/warehouse + - ./notebooks:/home/iceberg/notebooks/notebooks + environment: + - AWS_ACCESS_KEY_ID=minioadmin + - AWS_SECRET_ACCESS_KEY=minioadmin + - AWS_REGION=us-east-1 + ports: + - 8888:8888 + - 8080:8080 + - 10000:10000 + - 10001:10001 + networks: + - common_network + +volumes: + minio_data: + +networks: + common_network: + driver: bridge \ No newline at end of file diff --git a/nessie-stack/notebooks/.gitignore b/nessie-stack/notebooks/.gitignore new file mode 100644 index 0000000000000..d5b7a52ff3941 --- /dev/null +++ b/nessie-stack/notebooks/.gitignore @@ -0,0 +1 @@ +metastore* diff --git a/nessie-stack/notebooks/.ipynb_checkpoints/test-checkpoint.ipynb b/nessie-stack/notebooks/.ipynb_checkpoints/test-checkpoint.ipynb new file mode 100644 index 0000000000000..c4fb21049bd69 --- /dev/null +++ b/nessie-stack/notebooks/.ipynb_checkpoints/test-checkpoint.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "\n", + "ICEBERG_VERSION = \"1.8.1\"\n", + "NESSIE_VERSION = \"0.103.0\"\n", + "SPARK_VERSION = \"3.5\"\n", + "\n", + "spark = (\n", + " SparkSession.builder.appName(\"IcebergNessieExample\")\n", + " # Use JARs from local Spark installation\n", + "# .config(\"spark.driver.extraClassPath\", \"/opt/spark/jars/*\")\n", + "# .config(\"spark.executor.extraClassPath\", \"/opt/spark/jars/*\")\n", + " # Use correct Iceberg & Nessie JARs for Spark 3.5\n", 
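+ " # NOTE: most settings below stay commented out; the image's spark-defaults.conf appears to configure the Iceberg/Nessie catalog already.\n",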
+ " # .config(\"spark.jars.packages\", f\"org.apache.iceberg:iceberg-spark-runtime-{SPARK_VERSION}_2.12:{ICEBERG_VERSION},\"\n", + " # f\"org.projectnessie.nessie-integrations:nessie-spark-extensions-{SPARK_VERSION}_2.12:{NESSIE_VERSION}\")\n", + " # .config(\"spark.sql.extensions\", \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\")\n", + " # .config(\"spark.sql.catalog.nessie\", \"org.apache.iceberg.spark.SparkCatalog\")\n", + " # .config(\"spark.sql.catalog.nessie.catalog-impl\", \"org.apache.iceberg.nessie.NessieCatalog\")\n", + " # .config(\"spark.sql.catalog.nessie.uri\", \"http://nessie:19120/api/v1\")\n", + " # .config(\"spark.sql.catalog.nessie.ref\", \"main\")\n", + " .config(\"spark.sql.catalog.nessie.warehouse\", \"s3a://warehouse/\")\n", + " ### .config(\"spark.hadoop.fs.s3a.endpoint\", \"http://minio:9000\")\n", + "# .config(\"spark.hadoop.fs.s3a.access.key\", \"minioadmin\")\n", + "## .config(\"spark.hadoop.fs.s3a.secret.key\", \"minioadmin\")\n", + " # .config(\"spark.hadoop.fs.s3a.path.style.access\", \"true\")\n", + " # .config(\"spark.hadoop.fs.s3a.impl\", \"org.apache.hadoop.fs.s3a.S3AFileSystem\")\n", + " .getOrCreate()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('spark.app.submitTime', '1742208028134'),\n", + " ('spark.hadoop.fs.s3a.path.style.access', 'true'),\n", + " ('spark.driver.port', '44685'),\n", + " ('spark.sql.warehouse.dir',\n", + " 'file:/home/iceberg/notebooks/notebooks/spark-warehouse'),\n", + " ('spark.sql.catalog.iceberg.s3.path-style-access', 'true'),\n", + " ('spark.sql.catalog.iceberg.s3.endpoint', 'http://minio:9000'),\n", + " ('spark.sql.catalog.iceberg.type', 'nessie'),\n", + " ('spark.hadoop.fs.s3a.access.key', 'minioadmin'),\n", + " ('spark.serializer.objectStreamReset', '100'),\n", + " ('spark.master', 'local[*]'),\n", + " ('spark.submit.deployMode', 'client'),\n", + " ('spark.sql.extensions',\n", + " 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions'),\n", + " ('spark.sql.catalog.iceberg.warehouse', 's3a://nessie'),\n", + " ('spark.hadoop.fs.s3a.secret.key', 'minioadmin'),\n", + " ('spark.driver.extraJavaOptions',\n", + " '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),\n", + " ('spark.sql.catalog.iceberg.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO'),\n", + " ('spark.executor.id', 'driver'),\n", + " ('spark.sql.catalog.iceberg.ref', 'main'),\n", + " ('spark.driver.host', '75ff1b79603d'),\n", + " ('spark.app.name', 'PySparkShell'),\n", + " ('spark.app.id', 'local-1742208028733'),\n", + " ('spark.hadoop.fs.s3a.impl', 
'org.apache.hadoop.fs.s3a.S3AFileSystem'),\n", + " ('spark.sql.catalog.iceberg', 'org.apache.iceberg.spark.SparkCatalog'),\n", + " ('spark.sql.catalogImplementation', 'hive'),\n", + " ('spark.rdd.compress', 'True'),\n", + " ('spark.executor.extraJavaOptions',\n", + " '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),\n", + " ('spark.submit.pyFiles', ''),\n", + " ('spark.hadoop.fs.s3a.endpoint', 'http://minio:9000'),\n", + " ('spark.sql.catalog.iceberg.uri', 'http://nessie:19120/api/v1'),\n", + " ('spark.app.startTime', '1742208028430'),\n", + " ('spark.ui.showConsoleProgress', 'true')]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.sparkContext.getConf().getAll()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/03/17 10:40:38 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist\n", + "25/03/17 10:40:38 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist\n", + "25/03/17 10:40:42 WARN ObjectStore: Version information not found in metastore. 
hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0\n", + "25/03/17 10:40:42 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@10.89.3.5\n", + "25/03/17 10:40:42 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException\n", + "25/03/17 10:40:42 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException\n" + ] + }, + { + "data": { + "text/plain": [ + "DataFrame[]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.sql('use iceberg')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.sql('create namespace default')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "Py4JJavaError", + "evalue": "An error occurred while calling o41.sql.\n: software.amazon.awssdk.services.s3.model.NoSuchBucketException: The specified bucket does not exist (Service: S3, Status Code: 404, Request ID: 182D91089BA15409, Extended Request ID: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8)\n\tat software.amazon.awssdk.protocols.xml.internal.unmarshall.AwsXmlPredicatedResponseHandler.handleErrorResponse(AwsXmlPredicatedResponseHandler.java:155)\n\tat software.amazon.awssdk.protocols.xml.internal.unmarshall.AwsXmlPredicatedResponseHandler.handleResponse(AwsXmlPredicatedResponseHandler.java:107)\n\tat software.amazon.awssdk.protocols.xml.internal.unmarshall.AwsXmlPredicatedResponseHandler.handle(AwsXmlPredicatedResponseHandler.java:84)\n\tat software.amazon.awssdk.protocols.xml.internal.unmarshall.AwsXmlPredicatedResponseHandler.handle(AwsXmlPredicatedResponseHandler.java:42)\n\tat software.amazon.awssdk.awscore.client.handler.AwsSyncClientHandler$Crc32ValidationResponseHandler.handle(AwsSyncClientHandler.java:93)\n\tat software.amazon.awssdk.core.internal.handler.BaseClientHandler.lambda$successTransformationResponseHandler$7(BaseClientHandler.java:279)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.HandleResponseStage.execute(HandleResponseStage.java:50)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.HandleResponseStage.execute(HandleResponseStage.java:38)\n\tat software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:74)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:43)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:79)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:41)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:55)\n\tat 
software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:39)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage2.executeRequest(RetryableStage2.java:93)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage2.execute(RetryableStage2.java:56)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage2.execute(RetryableStage2.java:36)\n\tat software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:53)\n\tat software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:35)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.executeWithTimer(ApiCallTimeoutTrackingStage.java:82)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:62)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:43)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:50)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:32)\n\tat software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:37)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:26)\n\tat software.amazon.awssdk.core.internal.http.AmazonSyncHttpClient$RequestExecutionBuilderImpl.execute(AmazonSyncHttpClient.java:210)\n\tat software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.invoke(BaseSyncClientHandler.java:103)\n\tat software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.doExecute(BaseSyncClientHandler.java:173)\n\tat software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.lambda$execute$1(BaseSyncClientHandler.java:80)\n\tat software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.measureApiCallSuccess(BaseSyncClientHandler.java:182)\n\tat software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.execute(BaseSyncClientHandler.java:74)\n\tat software.amazon.awssdk.core.client.handler.SdkSyncClientHandler.execute(SdkSyncClientHandler.java:45)\n\tat software.amazon.awssdk.awscore.client.handler.AwsSyncClientHandler.execute(AwsSyncClientHandler.java:53)\n\tat software.amazon.awssdk.services.s3.DefaultS3Client.putObject(DefaultS3Client.java:11159)\n\tat org.apache.iceberg.aws.s3.S3OutputStream.completeUploads(S3OutputStream.java:444)\n\tat org.apache.iceberg.aws.s3.S3OutputStream.close(S3OutputStream.java:270)\n\tat org.apache.iceberg.aws.s3.S3OutputStream.close(S3OutputStream.java:256)\n\tat java.base/sun.nio.cs.StreamEncoder.implClose(StreamEncoder.java:347)\n\tat 
java.base/sun.nio.cs.StreamEncoder.close(StreamEncoder.java:169)\n\tat java.base/java.io.OutputStreamWriter.close(OutputStreamWriter.java:252)\n\tat org.apache.iceberg.TableMetadataParser.internalWrite(TableMetadataParser.java:134)\n\tat org.apache.iceberg.TableMetadataParser.overwrite(TableMetadataParser.java:117)\n\tat org.apache.iceberg.BaseMetastoreTableOperations.writeNewMetadata(BaseMetastoreTableOperations.java:160)\n\tat org.apache.iceberg.BaseMetastoreTableOperations.writeNewMetadataIfRequired(BaseMetastoreTableOperations.java:150)\n\tat org.apache.iceberg.nessie.NessieTableOperations.doCommit(NessieTableOperations.java:115)\n\tat org.apache.iceberg.BaseMetastoreTableOperations.commit(BaseMetastoreTableOperations.java:125)\n\tat org.apache.iceberg.BaseMetastoreCatalog$BaseMetastoreCatalogTableBuilder.create(BaseMetastoreCatalog.java:201)\n\tat org.apache.iceberg.CachingCatalog$CachingTableBuilder.lambda$create$0(CachingCatalog.java:264)\n\tat org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.lambda$doComputeIfAbsent$14(BoundedLocalCache.java:2406)\n\tat java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1916)\n\tat org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.doComputeIfAbsent(BoundedLocalCache.java:2404)\n\tat org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.computeIfAbsent(BoundedLocalCache.java:2387)\n\tat org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalCache.computeIfAbsent(LocalCache.java:108)\n\tat org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalManualCache.get(LocalManualCache.java:62)\n\tat org.apache.iceberg.CachingCatalog$CachingTableBuilder.create(CachingCatalog.java:260)\n\tat org.apache.iceberg.spark.SparkCatalog.createTable(SparkCatalog.java:246)\n\tat org.apache.spark.sql.connector.catalog.TableCatalog.createTable(TableCatalog.java:223)\n\tat org.apache.spark.sql.execution.datasources.v2.CreateTableExec.run(CreateTableExec.scala:44)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)\n\tat org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)\n\tat 
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)\n\tat org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)\n\tat org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)\n\tat org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)\n\tat org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)\n\tat org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)\n\tat org.apache.spark.sql.Dataset.(Dataset.scala:220)\n\tat org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:100)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)\n\tat org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)\n\tat org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:638)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)\n\tat org.apache.spark.sql.SparkSession.sql(SparkSession.scala:629)\n\tat org.apache.spark.sql.SparkSession.sql(SparkSession.scala:659)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.lang.reflect.Method.invoke(Method.java:569)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n\tat java.base/java.lang.Thread.run(Thread.java:840)\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mspark\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mCREATE TABLE IF NOT EXISTS default.employees (id INT, name STRING, salary DOUBLE) USING iceberg\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m spark\u001b[38;5;241m.\u001b[39msql(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mINSERT INTO default.employees VALUES (1, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAlice\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, 75000), (2, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mBob\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, 80000)\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 5\u001b[0m df \u001b[38;5;241m=\u001b[39m 
spark\u001b[38;5;241m.\u001b[39msql(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSELECT * FROM default.employees\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/opt/spark/python/pyspark/sql/session.py:1631\u001b[0m, in \u001b[0;36mSparkSession.sql\u001b[0;34m(self, sqlQuery, args, **kwargs)\u001b[0m\n\u001b[1;32m 1627\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jvm \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1628\u001b[0m litArgs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jvm\u001b[38;5;241m.\u001b[39mPythonUtils\u001b[38;5;241m.\u001b[39mtoArray(\n\u001b[1;32m 1629\u001b[0m [_to_java_column(lit(v)) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m (args \u001b[38;5;129;01mor\u001b[39;00m [])]\n\u001b[1;32m 1630\u001b[0m )\n\u001b[0;32m-> 1631\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DataFrame(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jsparkSession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[43msqlQuery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlitArgs\u001b[49m\u001b[43m)\u001b[49m, \u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 1632\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 1633\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(kwargs) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", + "File \u001b[0;32m/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1322\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1316\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1317\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1318\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1319\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m 1321\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1322\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1323\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1325\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m 1326\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(temp_arg, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_detach\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n", + "File \u001b[0;32m/opt/spark/python/pyspark/errors/exceptions/captured.py:179\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 177\u001b[0m 
\u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdeco\u001b[39m(\u001b[38;5;241m*\u001b[39ma: Any, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[1;32m 178\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 179\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m Py4JJavaError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 181\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n", + "File \u001b[0;32m/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/protocol.py:326\u001b[0m, in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 324\u001b[0m value \u001b[38;5;241m=\u001b[39m OUTPUT_CONVERTER[\u001b[38;5;28mtype\u001b[39m](answer[\u001b[38;5;241m2\u001b[39m:], gateway_client)\n\u001b[1;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m answer[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m REFERENCE_TYPE:\n\u001b[0;32m--> 326\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m Py4JJavaError(\n\u001b[1;32m 327\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn error occurred while calling \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39m\n\u001b[1;32m 328\u001b[0m \u001b[38;5;28mformat\u001b[39m(target_id, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, name), value)\n\u001b[1;32m 329\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m Py4JError(\n\u001b[1;32m 331\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn error occurred while calling \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m. 
Trace:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{3}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28mformat\u001b[39m(target_id, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, name, value))\n", + "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o41.sql.\n: software.amazon.awssdk.services.s3.model.NoSuchBucketException: The specified bucket does not exist (Service: S3, Status Code: 404, Request ID: 182D91089BA15409, Extended Request ID: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8)\n\tat software.amazon.awssdk.protocols.xml.internal.unmarshall.AwsXmlPredicatedResponseHandler.handleErrorResponse(AwsXmlPredicatedResponseHandler.java:155)\n\tat software.amazon.awssdk.protocols.xml.internal.unmarshall.AwsXmlPredicatedResponseHandler.handleResponse(AwsXmlPredicatedResponseHandler.java:107)\n\tat software.amazon.awssdk.protocols.xml.internal.unmarshall.AwsXmlPredicatedResponseHandler.handle(AwsXmlPredicatedResponseHandler.java:84)\n\tat software.amazon.awssdk.protocols.xml.internal.unmarshall.AwsXmlPredicatedResponseHandler.handle(AwsXmlPredicatedResponseHandler.java:42)\n\tat software.amazon.awssdk.awscore.client.handler.AwsSyncClientHandler$Crc32ValidationResponseHandler.handle(AwsSyncClientHandler.java:93)\n\tat software.amazon.awssdk.core.internal.handler.BaseClientHandler.lambda$successTransformationResponseHandler$7(BaseClientHandler.java:279)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.HandleResponseStage.execute(HandleResponseStage.java:50)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.HandleResponseStage.execute(HandleResponseStage.java:38)\n\tat software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:74)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:43)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:79)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:41)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:55)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:39)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage2.executeRequest(RetryableStage2.java:93)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage2.execute(RetryableStage2.java:56)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage2.execute(RetryableStage2.java:36)\n\tat software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:53)\n\tat 
software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:35)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.executeWithTimer(ApiCallTimeoutTrackingStage.java:82)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:62)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:43)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:50)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:32)\n\tat software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:37)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:26)\n\tat software.amazon.awssdk.core.internal.http.AmazonSyncHttpClient$RequestExecutionBuilderImpl.execute(AmazonSyncHttpClient.java:210)\n\tat software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.invoke(BaseSyncClientHandler.java:103)\n\tat software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.doExecute(BaseSyncClientHandler.java:173)\n\tat software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.lambda$execute$1(BaseSyncClientHandler.java:80)\n\tat software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.measureApiCallSuccess(BaseSyncClientHandler.java:182)\n\tat software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.execute(BaseSyncClientHandler.java:74)\n\tat software.amazon.awssdk.core.client.handler.SdkSyncClientHandler.execute(SdkSyncClientHandler.java:45)\n\tat software.amazon.awssdk.awscore.client.handler.AwsSyncClientHandler.execute(AwsSyncClientHandler.java:53)\n\tat software.amazon.awssdk.services.s3.DefaultS3Client.putObject(DefaultS3Client.java:11159)\n\tat org.apache.iceberg.aws.s3.S3OutputStream.completeUploads(S3OutputStream.java:444)\n\tat org.apache.iceberg.aws.s3.S3OutputStream.close(S3OutputStream.java:270)\n\tat org.apache.iceberg.aws.s3.S3OutputStream.close(S3OutputStream.java:256)\n\tat java.base/sun.nio.cs.StreamEncoder.implClose(StreamEncoder.java:347)\n\tat java.base/sun.nio.cs.StreamEncoder.close(StreamEncoder.java:169)\n\tat java.base/java.io.OutputStreamWriter.close(OutputStreamWriter.java:252)\n\tat org.apache.iceberg.TableMetadataParser.internalWrite(TableMetadataParser.java:134)\n\tat org.apache.iceberg.TableMetadataParser.overwrite(TableMetadataParser.java:117)\n\tat org.apache.iceberg.BaseMetastoreTableOperations.writeNewMetadata(BaseMetastoreTableOperations.java:160)\n\tat org.apache.iceberg.BaseMetastoreTableOperations.writeNewMetadataIfRequired(BaseMetastoreTableOperations.java:150)\n\tat org.apache.iceberg.nessie.NessieTableOperations.doCommit(NessieTableOperations.java:115)\n\tat org.apache.iceberg.BaseMetastoreTableOperations.commit(BaseMetastoreTableOperations.java:125)\n\tat 
org.apache.iceberg.BaseMetastoreCatalog$BaseMetastoreCatalogTableBuilder.create(BaseMetastoreCatalog.java:201)\n\tat org.apache.iceberg.CachingCatalog$CachingTableBuilder.lambda$create$0(CachingCatalog.java:264)\n\tat org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.lambda$doComputeIfAbsent$14(BoundedLocalCache.java:2406)\n\tat java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1916)\n\tat org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.doComputeIfAbsent(BoundedLocalCache.java:2404)\n\tat org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.computeIfAbsent(BoundedLocalCache.java:2387)\n\tat org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalCache.computeIfAbsent(LocalCache.java:108)\n\tat org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalManualCache.get(LocalManualCache.java:62)\n\tat org.apache.iceberg.CachingCatalog$CachingTableBuilder.create(CachingCatalog.java:260)\n\tat org.apache.iceberg.spark.SparkCatalog.createTable(SparkCatalog.java:246)\n\tat org.apache.spark.sql.connector.catalog.TableCatalog.createTable(TableCatalog.java:223)\n\tat org.apache.spark.sql.execution.datasources.v2.CreateTableExec.run(CreateTableExec.scala:44)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)\n\tat org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)\n\tat org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)\n\tat org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)\n\tat 
org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)\n\tat org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)\n\tat org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)\n\tat org.apache.spark.sql.Dataset.(Dataset.scala:220)\n\tat org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:100)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)\n\tat org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)\n\tat org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:638)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)\n\tat org.apache.spark.sql.SparkSession.sql(SparkSession.scala:629)\n\tat org.apache.spark.sql.SparkSession.sql(SparkSession.scala:659)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.lang.reflect.Method.invoke(Method.java:569)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n\tat java.base/java.lang.Thread.run(Thread.java:840)\n" + ] + } + ], + "source": [ + "spark.sql(\"CREATE TABLE IF NOT EXISTS default.employees (id INT, name STRING, salary DOUBLE) USING iceberg\")\n", + "\n", + "spark.sql(\"INSERT INTO default.employees VALUES (1, 'Alice', 75000), (2, 'Bob', 80000)\")\n", + "\n", + "df = spark.sql(\"SELECT * FROM default.employees\")\n", + "df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nessie-stack/notebooks/test.ipynb b/nessie-stack/notebooks/test.ipynb new file mode 100644 index 0000000000000..addd878fd0917 --- /dev/null +++ b/nessie-stack/notebooks/test.ipynb @@ -0,0 +1,200 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "\n", + "ICEBERG_VERSION = \"1.8.1\"\n", + "NESSIE_VERSION = \"0.103.0\"\n", + "SPARK_VERSION = \"3.5\"\n", + "\n", + "spark = (\n", + " SparkSession.builder.appName(\"IcebergNessieExample\")\n", + " # Use JARs from local Spark installation\n", + "# .config(\"spark.driver.extraClassPath\", \"/opt/spark/jars/*\")\n", + "# .config(\"spark.executor.extraClassPath\", \"/opt/spark/jars/*\")\n", + " # Use correct Iceberg & Nessie JARs for Spark 3.5\n", 
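+ " # NOTE: the commented-out settings are already supplied by /opt/spark/conf/spark-defaults.conf baked into the image\n", + " # (compare spark.sparkContext.getConf().getAll() in the next cell), which defines the Nessie-backed catalog named 'iceberg'.\n",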
+ " # .config(\"spark.jars.packages\", f\"org.apache.iceberg:iceberg-spark-runtime-{SPARK_VERSION}_2.12:{ICEBERG_VERSION},\"\n", + " # f\"org.projectnessie.nessie-integrations:nessie-spark-extensions-{SPARK_VERSION}_2.12:{NESSIE_VERSION}\")\n", + " # .config(\"spark.sql.extensions\", \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\")\n", + " # .config(\"spark.sql.catalog.nessie\", \"org.apache.iceberg.spark.SparkCatalog\")\n", + " # .config(\"spark.sql.catalog.nessie.catalog-impl\", \"org.apache.iceberg.nessie.NessieCatalog\")\n", + " # .config(\"spark.sql.catalog.nessie.uri\", \"http://nessie:19120/api/v1\")\n", + " # .config(\"spark.sql.catalog.nessie.ref\", \"main\")\n", + " .config(\"spark.sql.catalog.nessie.warehouse\", \"s3a://warehouse/\")\n", + " ### .config(\"spark.hadoop.fs.s3a.endpoint\", \"http://minio:9000\")\n", + "# .config(\"spark.hadoop.fs.s3a.access.key\", \"minioadmin\")\n", + "## .config(\"spark.hadoop.fs.s3a.secret.key\", \"minioadmin\")\n", + " # .config(\"spark.hadoop.fs.s3a.path.style.access\", \"true\")\n", + " # .config(\"spark.hadoop.fs.s3a.impl\", \"org.apache.hadoop.fs.s3a.S3AFileSystem\")\n", + " .getOrCreate()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('spark.hadoop.fs.s3a.path.style.access', 'true'),\n", + " ('spark.driver.port', '35325'),\n", + " ('spark.sql.warehouse.dir',\n", + " 'file:/home/iceberg/notebooks/notebooks/spark-warehouse'),\n", + " ('spark.app.submitTime', '1742217572814'),\n", + " ('spark.sql.catalog.iceberg.s3.path-style-access', 'true'),\n", + " ('spark.sql.catalog.iceberg.s3.endpoint', 'http://minio:9000'),\n", + " ('spark.sql.catalog.iceberg.type', 'nessie'),\n", + " ('spark.app.id', 'local-1742217573458'),\n", + " ('spark.hadoop.fs.s3a.access.key', 'minioadmin'),\n", + " ('spark.serializer.objectStreamReset', '100'),\n", + " ('spark.master', 'local[*]'),\n", + " ('spark.driver.host', 'ff5452dce47b'),\n", + " ('spark.submit.deployMode', 'client'),\n", + " ('spark.sql.extensions',\n", + " 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions'),\n", + " ('spark.hadoop.fs.s3a.secret.key', 'minioadmin'),\n", + " ('spark.driver.extraJavaOptions',\n", + " '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),\n", + " ('spark.sql.catalog.iceberg.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO'),\n", + " ('spark.executor.id', 'driver'),\n", + " ('spark.sql.catalog.iceberg.ref', 'main'),\n", + " ('spark.app.name', 'PySparkShell'),\n", + " ('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'),\n", + " ('spark.sql.catalog.iceberg', 
'org.apache.iceberg.spark.SparkCatalog'),\n", + " ('spark.sql.catalogImplementation', 'hive'),\n", + " ('spark.rdd.compress', 'True'),\n", + " ('spark.executor.extraJavaOptions',\n", + " '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),\n", + " ('spark.app.startTime', '1742217573143'),\n", + " ('spark.submit.pyFiles', ''),\n", + " ('spark.hadoop.fs.s3a.endpoint', 'http://minio:9000'),\n", + " ('spark.sql.catalog.iceberg.warehouse', 's3a://warehouse'),\n", + " ('spark.sql.catalog.iceberg.uri', 'http://nessie:19120/api/v1'),\n", + " ('spark.ui.showConsoleProgress', 'true')]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.sparkContext.getConf().getAll()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/03/17 13:19:42 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist\n", + "25/03/17 13:19:42 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist\n", + "25/03/17 13:19:43 WARN ObjectStore: Version information not found in metastore. 
hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0\n", + "25/03/17 13:19:43 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@10.89.4.5\n", + "25/03/17 13:19:43 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException\n" + ] + }, + { + "data": { + "text/plain": [ + "DataFrame[]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.sql('use iceberg')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.sql('create namespace default')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-----+-------+\n", + "| id| name| salary|\n", + "+---+-----+-------+\n", + "| 1|Alice|75000.0|\n", + "| 2| Bob|80000.0|\n", + "+---+-----+-------+\n", + "\n" + ] + } + ], + "source": [ + "spark.sql(\"CREATE TABLE IF NOT EXISTS default.employees (id INT, name STRING, salary DOUBLE) USING iceberg\")\n", + "\n", + "spark.sql(\"INSERT INTO default.employees VALUES (1, 'Alice', 75000), (2, 'Bob', 80000)\")\n", + "\n", + "df = spark.sql(\"SELECT * FROM default.employees\")\n", + "df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nessie-stack/spark/.pyiceberg.yaml b/nessie-stack/spark/.pyiceberg.yaml new file mode 100644 index 0000000000000..7444ab09992f0 --- /dev/null +++ b/nessie-stack/spark/.pyiceberg.yaml @@ -0,0 +1,24 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +catalog: + default: + uri: http://rest:8181 + s3.endpoint: http://minio:9000 + s3.access-key-id: admin + s3.secret-access-key: password diff --git a/nessie-stack/spark/Dockerfile b/nessie-stack/spark/Dockerfile new file mode 100644 index 0000000000000..d8cb9fb777b38 --- /dev/null +++ b/nessie-stack/spark/Dockerfile @@ -0,0 +1,131 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# syntax=docker/dockerfile:1 +FROM python:3.10-bullseye + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + sudo \ + curl \ + vim \ + unzip \ + openjdk-17-jdk \ + build-essential \ + software-properties-common \ + ssh && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Install Jupyter and other python deps +COPY requirements.txt . +RUN pip3 install -r requirements.txt + +# Add scala kernel via spylon-kernel +RUN python3 -m spylon_kernel install + +# Download and install IJava jupyter kernel +RUN curl https://github.com/SpencerPark/IJava/releases/download/v1.3.0/ijava-1.3.0.zip -Lo ijava-1.3.0.zip \ + && unzip ijava-1.3.0.zip \ + && python3 install.py --sys-prefix \ + && rm ijava-1.3.0.zip + +# Optional env variables +ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"} +ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH + +WORKDIR ${SPARK_HOME} + +ENV SPARK_VERSION=3.5.5 +ENV SPARK_MAJOR_VERSION=3.5 +ENV ICEBERG_VERSION=1.8.1 + +# Download spark +RUN mkdir -p ${SPARK_HOME} \ + && curl https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \ + && tar xvzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \ + && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz + +# Download iceberg spark runtime +RUN curl https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR_VERSION}_2.12/${ICEBERG_VERSION}/iceberg-spark-runtime-${SPARK_MAJOR_VERSION}_2.12-${ICEBERG_VERSION}.jar -Lo /opt/spark/jars/iceberg-spark-runtime-${SPARK_MAJOR_VERSION}_2.12-${ICEBERG_VERSION}.jar + +# Download AWS bundle +RUN curl -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar + +# Download GCP bundle +RUN curl -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-gcp-bundle/${ICEBERG_VERSION}/iceberg-gcp-bundle-${ICEBERG_VERSION}.jar -Lo /opt/spark/jars/iceberg-gcp-bundle-${ICEBERG_VERSION}.jar + +# Download Azure bundle +RUN curl -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-azure-bundle/${ICEBERG_VERSION}/iceberg-azure-bundle-${ICEBERG_VERSION}.jar -Lo /opt/spark/jars/iceberg-azure-bundle-${ICEBERG_VERSION}.jar + +# Install AWS CLI +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \ + && unzip awscliv2.zip \ + && sudo ./aws/install \ + && rm awscliv2.zip \ + && rm -rf aws/ + +# Add iceberg spark runtime jar to IJava classpath +ENV IJAVA_CLASSPATH=/opt/spark/jars/* + +RUN mkdir -p /home/iceberg/data \ + && curl https://data.cityofnewyork.us/resource/tg4x-b46p.json > /home/iceberg/data/nyc_film_permits.json \ + && curl 
https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-04.parquet -o /home/iceberg/data/yellow_tripdata_2022-04.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-03.parquet -o /home/iceberg/data/yellow_tripdata_2022-03.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2022-02.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -o /home/iceberg/data/yellow_tripdata_2022-01.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-12.parquet -o /home/iceberg/data/yellow_tripdata_2021-12.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-11.parquet -o /home/iceberg/data/yellow_tripdata_2021-11.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-10.parquet -o /home/iceberg/data/yellow_tripdata_2021-10.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-09.parquet -o /home/iceberg/data/yellow_tripdata_2021-09.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-08.parquet -o /home/iceberg/data/yellow_tripdata_2021-08.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-07.parquet -o /home/iceberg/data/yellow_tripdata_2021-07.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-06.parquet -o /home/iceberg/data/yellow_tripdata_2021-06.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-05.parquet -o /home/iceberg/data/yellow_tripdata_2021-05.parquet \ + && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-04.parquet -o /home/iceberg/data/yellow_tripdata_2021-04.parquet + +RUN mkdir -p /home/iceberg/localwarehouse /home/iceberg/notebooks /home/iceberg/warehouse /home/iceberg/spark-events /home/iceberg +COPY notebooks/ /home/iceberg/notebooks + +# Add a notebook command +RUN echo '#! /bin/sh' >> /bin/notebook \ + && echo 'export PYSPARK_DRIVER_PYTHON=jupyter-notebook' >> /bin/notebook \ + && echo "export PYSPARK_DRIVER_PYTHON_OPTS=\"--notebook-dir=/home/iceberg/notebooks --ip='*' --NotebookApp.token='' --NotebookApp.password='' --port=8888 --no-browser --allow-root\"" >> /bin/notebook \ + && echo "pyspark" >> /bin/notebook \ + && chmod u+x /bin/notebook + +# Add a pyspark-notebook command (alias for notebook command for backwards-compatibility) +RUN echo '#! /bin/sh' >> /bin/pyspark-notebook \ + && echo 'export PYSPARK_DRIVER_PYTHON=jupyter-notebook' >> /bin/pyspark-notebook \ + && echo "export PYSPARK_DRIVER_PYTHON_OPTS=\"--notebook-dir=/home/iceberg/notebooks --ip='*' --NotebookApp.token='' --NotebookApp.password='' --port=8888 --no-browser --allow-root\"" >> /bin/pyspark-notebook \ + && echo "pyspark" >> /bin/pyspark-notebook \ + && chmod u+x /bin/pyspark-notebook + +RUN mkdir -p /root/.ipython/profile_default/startup +COPY ipython/startup/00-prettytables.py /root/.ipython/profile_default/startup +COPY ipython/startup/README /root/.ipython/profile_default/startup + +COPY spark-defaults.conf /opt/spark/conf +ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}" + +RUN chmod u+x /opt/spark/sbin/* && \ + chmod u+x /opt/spark/bin/* + +COPY .pyiceberg.yaml /root/.pyiceberg.yaml + +COPY entrypoint.sh . 
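+# entrypoint.sh (added below) starts a local Spark master, worker, history server and
+# Thrift server, then hands off to the CMD (the Jupyter "notebook" wrapper defined above).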
+ +ENTRYPOINT ["./entrypoint.sh"] +CMD ["notebook"] diff --git a/nessie-stack/spark/entrypoint.sh b/nessie-stack/spark/entrypoint.sh new file mode 100755 index 0000000000000..2738b59baca59 --- /dev/null +++ b/nessie-stack/spark/entrypoint.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +start-master.sh -p 7077 +start-worker.sh spark://spark-iceberg:7077 +start-history-server.sh +start-thriftserver.sh --driver-java-options "-Dderby.system.home=/tmp/derby" + +# Entrypoint, for example notebook, pyspark or spark-sql +if [[ $# -gt 0 ]] ; then + eval "$1" +fi diff --git a/nessie-stack/spark/ipython/startup/00-prettytables.py b/nessie-stack/spark/ipython/startup/00-prettytables.py new file mode 100644 index 0000000000000..868f7e5ae0c58 --- /dev/null +++ b/nessie-stack/spark/ipython/startup/00-prettytables.py @@ -0,0 +1,81 @@ + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from prettytable import PrettyTable +from IPython.core.magic import register_line_cell_magic + +class DFTable(PrettyTable): + def __repr__(self): + return self.get_string() + + def _repr_html_(self): + return self.get_html_string() + +def _row_as_table(df): + cols = df.columns + + t = DFTable() + t.field_names = ["Column", "Value"] + t.align = "r" + row = df.limit(1).collect()[0].asDict() + for col in cols: + t.add_row([ col, row[col] ]) + + return t + +def _to_table(df, num_rows=100): + cols = df.columns + + t = DFTable() + t.field_names = cols + t.align = "r" + for row in df.limit(num_rows).collect(): + d = row.asDict() + t.add_row([ d[col] for col in cols ]) + + return t + +import re +import sys +from argparse import ArgumentParser +parser = ArgumentParser() +parser.add_argument("--limit", help="Number of lines to return", type=int, default=100) +parser.add_argument("--var", help="Variable name to hold the dataframe", type=str) + +@register_line_cell_magic +def sql(line, cell=None): + """Spark SQL magic + """ + from pyspark.sql import SparkSession + spark = SparkSession.builder.appName("Jupyter").getOrCreate() + if cell is None: + return _to_table(spark.sql(line)) + elif line: + df = spark.sql(cell) + + (args, others) = parser.parse_known_args([ arg for arg in re.split("\s+", line) if arg ]) + + if args.var: + setattr(sys.modules[__name__], args.var, df) + + if args.limit == 1: + return _row_as_table(df) + else: + return _to_table(df, num_rows=args.limit) + else: + return _to_table(spark.sql(cell)) diff --git a/nessie-stack/spark/ipython/startup/README b/nessie-stack/spark/ipython/startup/README new file mode 100644 index 0000000000000..61d470004218a --- /dev/null +++ b/nessie-stack/spark/ipython/startup/README @@ -0,0 +1,11 @@ +This is the IPython startup directory + +.py and .ipy files in this directory will be run *prior* to any code or files specified +via the exec_lines or exec_files configurables whenever you load this profile. 
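The `%%sql` magic defined in 00-prettytables.py above is what the SQL cells in the bundled notebooks rely on. A minimal usage sketch follows; the `--limit` and `--var` flags are the ones parsed by the ArgumentParser above, the table names are only illustrative, and in a real notebook each `%sql`/`%%sql` invocation must start its own cell:

# Line magic: run a query and render up to 100 rows as a PrettyTable
%sql SELECT borough, count(*) AS permit_cnt FROM nyc.permits GROUP BY borough

# Cell magic: keep at most 10 rows and bind the underlying DataFrame to `df`
# (with --limit 1 the single row is rendered transposed as Column/Value pairs)
%%sql --limit 10 --var df
SELECT * FROM nyc.taxis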
+ +Files will be run in lexicographical order, so you can control the execution order of files +with a prefix, e.g.:: + + 00-first.py + 50-middle.py + 99-last.ipy diff --git a/nessie-stack/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb b/nessie-stack/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb new file mode 100644 index 0000000000000..20af42b17b1a6 --- /dev/null +++ b/nessie-stack/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb @@ -0,0 +1,469 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "16f6bb49", + "metadata": {}, + "source": [ + "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" + ] + }, + { + "cell_type": "markdown", + "id": "c82657e9", + "metadata": {}, + "source": [ + "# An Introduction to the Iceberg Java API" + ] + }, + { + "cell_type": "markdown", + "id": "3ee90ad2", + "metadata": {}, + "source": [ + "## [Part 1 - Loading a Catalog and Creating a Table](https://tabular.io/blog/java-api-part-1/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72e68c62", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.catalog.Catalog;\n", + "import org.apache.hadoop.conf.Configuration;\n", + "import org.apache.iceberg.CatalogProperties;\n", + "import org.apache.iceberg.rest.RESTCatalog;\n", + "import org.apache.iceberg.aws.s3.S3FileIOProperties;\n", + "\n", + "Map properties = new HashMap<>();\n", + "\n", + "properties.put(CatalogProperties.CATALOG_IMPL, \"org.apache.iceberg.rest.RESTCatalog\");\n", + "properties.put(CatalogProperties.URI, \"http://rest:8181\");\n", + "properties.put(CatalogProperties.WAREHOUSE_LOCATION, \"s3a://warehouse/wh\");\n", + "properties.put(CatalogProperties.FILE_IO_IMPL, \"org.apache.iceberg.aws.s3.S3FileIO\");\n", + "properties.put(S3FileIOProperties.ENDPOINT, \"http://minio:9000\");\n", + "\n", + "RESTCatalog catalog = new RESTCatalog();\n", + "Configuration conf = new Configuration();\n", + "catalog.setConf(conf);\n", + "catalog.initialize(\"demo\", properties);\n", + "catalog.name();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4be615e7", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.Schema;\n", + "import org.apache.iceberg.types.Types;\n", + "\n", + "Schema schema = new Schema(\n", + " Types.NestedField.required(1, \"level\", Types.StringType.get()),\n", + " Types.NestedField.required(2, \"event_time\", Types.TimestampType.withZone()),\n", + " Types.NestedField.required(3, \"message\", Types.StringType.get()),\n", + " Types.NestedField.optional(4, \"call_stack\", Types.ListType.ofRequired(5, Types.StringType.get()))\n", + " );\n", + "schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7299d16", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.PartitionSpec;\n", + "\n", + "PartitionSpec spec = PartitionSpec.builderFor(schema)\n", + " .hour(\"event_time\")\n", + " .identity(\"level\")\n", + " .build();\n", + "spec" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d900c97", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.catalog.TableIdentifier;\n", + "import org.apache.iceberg.catalog.Namespace;\n", + "\n", + "Namespace nyc = Namespace.of(\"nyc\");\n", + "TableIdentifier name = TableIdentifier.of(nyc, \"logs\");\n", + "name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a4d8a6e", + "metadata": {}, + "outputs": 
[], + "source": [ + "catalog.createTable(name, schema, spec)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d8c46df", + "metadata": {}, + "outputs": [], + "source": [ + "catalog.dropTable(name)" + ] + }, + { + "cell_type": "markdown", + "id": "fe62e0a9", + "metadata": {}, + "source": [ + "## [Part 2 - Table Scans](https://tabular.io/blog/java-api-part-2/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1e7aa7a", + "metadata": {}, + "outputs": [], + "source": [ + "catalog.createTable(name, schema, spec)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78c95e06", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.spark.sql.SparkSession;\n", + "\n", + "SparkSession spark = SparkSession\n", + " .builder()\n", + " .master(\"local[*]\")\n", + " .appName(\"Java API Demo\")\n", + " .config(\"spark.sql.extensions\", \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\")\n", + " .config(\"spark.sql.catalog.demo\", \"org.apache.iceberg.spark.SparkCatalog\")\n", + " .config(\"spark.sql.catalog.demo.catalog-impl\", \"org.apache.iceberg.rest.RESTCatalog\")\n", + " .config(\"spark.sql.catalog.demo.uri\", \"http://rest:8181\")\n", + " .config(\"spark.sql.catalog.demo.io-impl\", \"org.apache.iceberg.aws.s3.S3FileIO\")\n", + " .config(\"spark.sql.catalog.demo.s3.endpoint\", \"http://minio:9000\")\n", + " .config(\"spark.sql.defaultCatalog\", \"demo\")\n", + " .config(\"spark.eventLog.enabled\", \"true\")\n", + " .config(\"spark.eventLog.dir\", \"/home/iceberg/spark-events\")\n", + " .config(\"spark.history.fs.logDirectory\", \"/home/iceberg/spark-events\")\n", + " .getOrCreate();\n", + "\n", + "spark.sparkContext().setLogLevel(\"ERROR\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b17f820", + "metadata": {}, + "outputs": [], + "source": [ + "String query = \"INSERT INTO demo.nyc.logs \"\n", + " + \"VALUES \"\n", + " + \"('info', timestamp 'today', 'Just letting you know!', array('stack trace line 1', 'stack trace line 2', 'stack trace line 3')), \"\n", + " + \"('warning', timestamp 'today', 'You probably should not do this!', array('stack trace line 1', 'stack trace line 2', 'stack trace line 3')), \"\n", + " + \"('error', timestamp 'today', 'This was a fatal application error!', array('stack trace line 1', 'stack trace line 2', 'stack trace line 3'))\";\n", + "\n", + "spark.sql(query).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15ca1822", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.catalog.Catalog;\n", + "import org.apache.hadoop.conf.Configuration;\n", + "import org.apache.iceberg.CatalogProperties;\n", + "import org.apache.iceberg.rest.RESTCatalog;\n", + "\n", + "Map properties = new HashMap<>();\n", + "\n", + "properties.put(CatalogProperties.CATALOG_IMPL, \"org.apache.iceberg.rest.RESTCatalog\");\n", + "properties.put(CatalogProperties.URI, \"http://rest:8181\");\n", + "properties.put(CatalogProperties.WAREHOUSE_LOCATION, \"s3a://warehouse/wh/\");\n", + "properties.put(CatalogProperties.FILE_IO_IMPL, \"org.apache.iceberg.aws.s3.S3FileIO\");\n", + "properties.put(S3FileIOProperties.ENDPOINT, \"http://minio:9000\");\n", + "\n", + "RESTCatalog catalog = new RESTCatalog();\n", + "Configuration conf = new Configuration();\n", + "catalog.setConf(conf);\n", + "catalog.initialize(\"demo\", properties);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a5cf423", + "metadata": 
{}, + "outputs": [], + "source": [ + "import org.apache.iceberg.Table;\n", + "import org.apache.iceberg.TableScan;\n", + "import org.apache.iceberg.catalog.Namespace;\n", + "import org.apache.iceberg.catalog.TableIdentifier;\n", + "\n", + "Namespace nyc = Namespace.of(\"nyc\");\n", + "TableIdentifier name = TableIdentifier.of(nyc, \"logs\");\n", + "Table table = catalog.loadTable(name);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e472d6a1", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.io.CloseableIterable;\n", + "import org.apache.iceberg.data.Record;\n", + "import org.apache.iceberg.data.IcebergGenerics;\n", + "\n", + "CloseableIterable result = IcebergGenerics.read(table).build();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d32f41c", + "metadata": {}, + "outputs": [], + "source": [ + "for (Record r: result) {\n", + " System.out.println(r);\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7dffc238", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.expressions.Expressions;\n", + "\n", + "CloseableIterable result = IcebergGenerics.read(table)\n", + " .where(Expressions.equal(\"level\", \"error\"))\n", + " .build();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec2b0431", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.CombinedScanTask;\n", + "import org.apache.iceberg.TableScan;\n", + "\n", + "TableScan scan = table.newScan();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09d13c6b", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.expressions.Expressions;\n", + "\n", + "TableScan filteredScan = scan.filter(Expressions.equal(\"level\", \"info\")).select(\"message\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1857c10f", + "metadata": {}, + "outputs": [], + "source": [ + "Iterable result = filteredScan.planTasks();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea206ec7", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.DataFile;\n", + "\n", + "CombinedScanTask task = result.iterator().next();\n", + "DataFile dataFile = task.files().iterator().next().file();\n", + "System.out.println(dataFile);" + ] + }, + { + "cell_type": "markdown", + "id": "41e9e10f", + "metadata": {}, + "source": [ + "## [Part 3 - Table Scans](https://tabular.io/blog/java-api-part-3/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81033412", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.Schema;\n", + "import org.apache.iceberg.types.Types;\n", + "import org.apache.iceberg.catalog.Namespace;\n", + "import org.apache.iceberg.catalog.TableIdentifier;\n", + "import org.apache.iceberg.PartitionSpec;\n", + "\n", + "Schema schema = new Schema(\n", + " Types.NestedField.optional(1, \"event_id\", Types.StringType.get()),\n", + " Types.NestedField.optional(2, \"username\", Types.StringType.get()),\n", + " Types.NestedField.optional(3, \"userid\", Types.IntegerType.get()),\n", + " Types.NestedField.optional(4, \"api_version\", Types.StringType.get()),\n", + " Types.NestedField.optional(5, \"command\", Types.StringType.get())\n", + " );\n", + "\n", + "Namespace webapp = Namespace.of(\"webapp\");\n", + "TableIdentifier name = TableIdentifier.of(webapp, \"user_events\");\n", + "catalog.createTable(name, schema, 
PartitionSpec.unpartitioned());" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12c45c6b", + "metadata": {}, + "outputs": [], + "source": [ + "import java.util.UUID;\n", + "import com.google.common.collect.ImmutableList;\n", + "import com.google.common.collect.ImmutableMap;\n", + "import org.apache.iceberg.data.GenericRecord;\n", + "\n", + "GenericRecord record = GenericRecord.create(schema);\n", + "ImmutableList.Builder builder = ImmutableList.builder();\n", + "builder.add(record.copy(ImmutableMap.of(\"event_id\", UUID.randomUUID().toString(), \"username\", \"Bruce\", \"userid\", 1, \"api_version\", \"1.0\", \"command\", \"grapple\")));\n", + "builder.add(record.copy(ImmutableMap.of(\"event_id\", UUID.randomUUID().toString(), \"username\", \"Wayne\", \"userid\", 1, \"api_version\", \"1.0\", \"command\", \"glide\")));\n", + "builder.add(record.copy(ImmutableMap.of(\"event_id\", UUID.randomUUID().toString(), \"username\", \"Clark\", \"userid\", 1, \"api_version\", \"2.0\", \"command\", \"fly\")));\n", + "builder.add(record.copy(ImmutableMap.of(\"event_id\", UUID.randomUUID().toString(), \"username\", \"Kent\", \"userid\", 1, \"api_version\", \"1.0\", \"command\", \"land\")));\n", + "ImmutableList records = builder.build();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83bc5319", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.Files;\n", + "import org.apache.iceberg.io.DataWriter;\n", + "import org.apache.iceberg.io.OutputFile;\n", + "import org.apache.iceberg.parquet.Parquet;\n", + "import org.apache.iceberg.data.parquet.GenericParquetWriter;\n", + "\n", + "String filepath = table.location() + \"/\" + UUID.randomUUID().toString();\n", + "OutputFile file = table.io().newOutputFile(filepath);\n", + "DataWriter dataWriter =\n", + " Parquet.writeData(file)\n", + " .schema(schema)\n", + " .createWriterFunc(GenericParquetWriter::buildWriter)\n", + " .overwrite()\n", + " .withSpec(PartitionSpec.unpartitioned())\n", + " .build();\n", + "try {\n", + " for (GenericRecord record : builder.build()) {\n", + " dataWriter.write(record);\n", + " }\n", + "} finally {\n", + " dataWriter.close();\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "469e6af4", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.DataFile;\n", + "\n", + "DataFile dataFile = dataWriter.toDataFile();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "142b6ed1", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.catalog.Namespace;\n", + "import org.apache.iceberg.catalog.TableIdentifier;\n", + "import org.apache.iceberg.Table;\n", + "\n", + "Namespace webapp = Namespace.of(\"webapp\");\n", + "TableIdentifier name = TableIdentifier.of(webapp, \"user_events\");\n", + "Table tbl = catalog.loadTable(name);\n", + "tbl.newAppend().appendFile(dataFile).commit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c61e9e79", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.io.CloseableIterable;\n", + "import org.apache.iceberg.data.Record;\n", + "import org.apache.iceberg.data.IcebergGenerics;\n", + "\n", + "CloseableIterable result = IcebergGenerics.read(tbl).build();\n", + "for (Record r: result) {\n", + " System.out.println(r);\n", + "}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Java", + "language": "java", + "name": "java" + }, + "language_info": { + "codemirror_mode": "java", + 
"file_extension": ".jshell", + "mimetype": "text/x-java-source", + "name": "Java", + "pygments_lexer": "java", + "version": "11.0.15+10-post-Debian-1deb11u1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nessie-stack/spark/notebooks/Iceberg - Berlin Buzzwords 2023.ipynb b/nessie-stack/spark/notebooks/Iceberg - Berlin Buzzwords 2023.ipynb new file mode 100644 index 0000000000000..66eaad26b11a5 --- /dev/null +++ b/nessie-stack/spark/notebooks/Iceberg - Berlin Buzzwords 2023.ipynb @@ -0,0 +1,361 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1041ae6f", + "metadata": {}, + "source": [ + "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a5c8206", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", + "\n", + "spark" + ] + }, + { + "cell_type": "markdown", + "id": "6f9a9f41", + "metadata": {}, + "source": [ + "## Load Two Months of NYC Taxi/Limousine Trip Data\n", + "\n", + "For this notebook, we will use the New York City Taxi and Limousine Commision Trip Record Data that's available on the AWS Open Data Registry. This contains data of trips taken by taxis and for-hire vehicles in New York City. We'll save this into an iceberg table called `taxis`." + ] + }, + { + "cell_type": "markdown", + "id": "747bee98", + "metadata": {}, + "source": [ + "To be able to rerun the notebook several times, let's drop the table if it exists to start fresh." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "930682ce", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE DATABASE IF NOT EXISTS nyc.taxis;" + ] + }, + { + "cell_type": "markdown", + "id": "5816de2e", + "metadata": {}, + "source": [ + "## First create the table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22ac5552", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DROP TABLE IF EXISTS nyc.taxis;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f918310a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE TABLE nyc.taxis (\n", + " VendorID bigint,\n", + " tpep_pickup_datetime timestamp,\n", + " tpep_dropoff_datetime timestamp,\n", + " passenger_count double,\n", + " trip_distance double,\n", + " RatecodeID double,\n", + " store_and_fwd_flag string,\n", + " PULocationID bigint,\n", + " DOLocationID bigint,\n", + " payment_type bigint,\n", + " fare_amount double,\n", + " extra double,\n", + " mta_tax double,\n", + " tip_amount double,\n", + " tolls_amount double,\n", + " improvement_surcharge double,\n", + " total_amount double,\n", + " congestion_surcharge double,\n", + " airport_fee double\n", + ")\n", + "USING iceberg\n", + "PARTITIONED BY (days(tpep_pickup_datetime))" + ] + }, + { + "cell_type": "markdown", + "id": "fcba103e", + "metadata": {}, + "source": [ + "# Write a month of data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c37ca92", + "metadata": {}, + "outputs": [], + "source": [ + "df = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-01.parquet\")\n", + "df.writeTo(\"nyc.taxis\").append()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a69152aa", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM nyc.taxis" + ] + }, + { + "cell_type": "markdown", + 
"id": "6fce6bb4", + "metadata": {}, + "source": [ + "## Metadata Tables\n", + "\n", + "Iceberg tables contain very rich metadata that can be easily queried. For example, you can retrieve the manifest list for any snapshot, simply by querying the table's `snapshots` table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fade1a3", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM nyc.taxis.snapshots" + ] + }, + { + "cell_type": "markdown", + "id": "4aa4a9cd", + "metadata": {}, + "source": [ + "# Write a month of data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed7a7b8f", + "metadata": {}, + "outputs": [], + "source": [ + "df = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-02.parquet\")\n", + "df.writeTo(\"nyc.taxis\").append()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbfb160c", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM nyc.taxis.snapshots\n", + "ORDER BY committed_at DESC" + ] + }, + { + "cell_type": "markdown", + "id": "65deb074", + "metadata": {}, + "source": [ + "## Manifest lists\n", + "\n", + "Now we'll list all the manifests. This is the abovemention `manifest_list` of the current snapshot." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bab64f90", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM nyc.taxis.manifests" + ] + }, + { + "cell_type": "markdown", + "id": "b11e64c9", + "metadata": {}, + "source": [ + "# Manifests\n", + "\n", + "The next layer is the manifests that has references to the Parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c4a942c", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM nyc.taxis.files" + ] + }, + { + "cell_type": "markdown", + "id": "31567e4e", + "metadata": {}, + "source": [ + "# Flexibility of partitioning\n", + "\n", + "We can easily change the partitioning of the table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "156885c7", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT * FROM nyc.taxis.partitions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "184604d9", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.taxis DROP PARTITION FIELD days(tpep_pickup_datetime)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c26dddb5", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.taxis ADD PARTITION FIELD hours(tpep_pickup_datetime)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42ec7b70", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT * FROM nyc.taxis.partitions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d5dea98", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CALL system.rewrite_data_files('nyc.taxis')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fdf3a22", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM nyc.taxis.files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40447a02", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM nyc.taxis.snapshots\n", + "ORDER BY committed_at 
DESC" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nessie-stack/spark/notebooks/Iceberg - Getting Started.ipynb b/nessie-stack/spark/notebooks/Iceberg - Getting Started.ipynb new file mode 100644 index 0000000000000..fc817202e6420 --- /dev/null +++ b/nessie-stack/spark/notebooks/Iceberg - Getting Started.ipynb @@ -0,0 +1,509 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1041ae6f", + "metadata": {}, + "source": [ + "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" + ] + }, + { + "cell_type": "markdown", + "id": "247fb2ab", + "metadata": {}, + "source": [ + "### [Docker, Spark, and Iceberg: The Fastest Way to Try Iceberg!](https://tabular.io/blog/docker-spark-and-iceberg/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a5c8206", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", + "\n", + "spark" + ] + }, + { + "cell_type": "markdown", + "id": "6f9a9f41", + "metadata": {}, + "source": [ + "## Load One Month of NYC Taxi/Limousine Trip Data\n", + "\n", + "For this notebook, we will use the New York City Taxi and Limousine Commision Trip Record Data that's available on the AWS Open Data Registry. This contains data of trips taken by taxis and for-hire vehicles in New York City. We'll save this into an iceberg table called `taxis`." + ] + }, + { + "cell_type": "markdown", + "id": "747bee98", + "metadata": {}, + "source": [ + "To be able to rerun the notebook several times, let's drop the table if it exists to start fresh." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "930682ce", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE DATABASE IF NOT EXISTS nyc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f918310a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DROP TABLE IF EXISTS nyc.taxis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c37ca92", + "metadata": {}, + "outputs": [], + "source": [ + "df = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2021-04.parquet\")\n", + "df.write.saveAsTable(\"nyc.taxis\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fddb808", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DESCRIBE EXTENDED nyc.taxis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcf99fb3", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT COUNT(*) as cnt\n", + "FROM nyc.taxis" + ] + }, + { + "cell_type": "markdown", + "id": "cffd2c03", + "metadata": {}, + "source": [ + "## Schema Evolution\n", + "\n", + "Adding, dropping, renaming, or altering columns is easy and safe in Iceberg. In this example, we'll rename `fare_amount` to `fare` and `trip_distance` to `distance`. We'll also add a float column `fare_per_distance_unit` immediately after `distance`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efee8252", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.taxis RENAME COLUMN fare_amount TO fare" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "794de3a0", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.taxis RENAME COLUMN trip_distance TO distance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adac7564", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.taxis ALTER COLUMN distance COMMENT 'The elapsed trip distance in miles reported by the taximeter.'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32d7e6ef", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.taxis ALTER COLUMN distance TYPE double;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fb4b02a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.taxis ALTER COLUMN distance AFTER fare;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81f7cc19", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.taxis\n", + "ADD COLUMN fare_per_distance_unit float AFTER distance" + ] + }, + { + "cell_type": "markdown", + "id": "9416b498", + "metadata": {}, + "source": [ + "Let's update the new `fare_per_distance_unit` to equal `fare` divided by `distance`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18771ccb", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "UPDATE nyc.taxis\n", + "SET fare_per_distance_unit = fare/distance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09c72ca5", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT\n", + "VendorID\n", + ",tpep_pickup_datetime\n", + ",tpep_dropoff_datetime\n", + ",fare\n", + ",distance\n", + ",fare_per_distance_unit\n", + "FROM nyc.taxis" + ] + }, + { + "cell_type": "markdown", + "id": "37582e02", + "metadata": {}, + "source": [ + "## Expressive SQL for Row Level Changes\n", + "With Iceberg tables, `DELETE` queries can be used to perform row-level deletes. This is as simple as providing the table name and a `WHERE` predicate. If the filter matches an entire partition of the table, Iceberg will intelligently perform a metadata-only operation where it simply deletes the metadata for that partition.\n", + "\n", + "Let's perform a row-level delete for all rows that have a `fare_per_distance_unit` greater than 4 or a `distance` greater than 2. This should leave us with relatively short trips that have a relatively high fare per distance traveled." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ded820f1", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DELETE FROM nyc.taxis\n", + "WHERE fare_per_distance_unit > 4.0 OR distance > 2.0" + ] + }, + { + "cell_type": "markdown", + "id": "faef3712", + "metadata": {}, + "source": [ + "There are some fares that have a `null` for `fare_per_distance_unit` due to the distance being `0`. Let's remove those as well." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18b69265", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DELETE FROM nyc.taxis\n", + "WHERE fare_per_distance_unit is null" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b92d7db", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT\n", + "VendorID\n", + ",tpep_pickup_datetime\n", + ",tpep_dropoff_datetime\n", + ",fare\n", + ",distance\n", + ",fare_per_distance_unit\n", + "FROM nyc.taxis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d5472b7", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT COUNT(*) as cnt\n", + "FROM nyc.taxis" + ] + }, + { + "cell_type": "markdown", + "id": "c4b157e5", + "metadata": {}, + "source": [ + "## Partitioning\n", + "\n", + "A table’s partitioning can be updated in place and applied only to newly written data. Query plans are then split, using the old partition scheme for data written before the partition scheme was changed, and using the new partition scheme for data written after. People querying the table don’t even have to be aware of this split. Simple predicates in WHERE clauses are automatically converted to partition filters that prune out files with no matches. This is what’s referred to in Iceberg as *Hidden Partitioning*." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30e3e3b7", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.taxis\n", + "ADD PARTITION FIELD VendorID" + ] + }, + { + "cell_type": "markdown", + "id": "6fce6bb4", + "metadata": {}, + "source": [ + "## Metadata Tables\n", + "\n", + "Iceberg tables contain very rich metadata that can be easily queried. For example, you can retrieve the manifest list for any snapshot, simply by querying the table's `snapshots` table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fade1a3", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT snapshot_id, manifest_list\n", + "FROM nyc.taxis.snapshots" + ] + }, + { + "cell_type": "markdown", + "id": "64887133", + "metadata": {}, + "source": [ + "The `files` table contains loads of information on data files, including column level statistics such as null counts, lower bounds, and upper bounds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cb712f7", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT file_path, file_format, record_count, null_value_counts, lower_bounds, upper_bounds\n", + "FROM nyc.taxis.files" + ] + }, + { + "cell_type": "markdown", + "id": "65deb074", + "metadata": {}, + "source": [ + "## Time Travel\n", + "\n", + "The history table lists all snapshots and which parent snapshot they derive from. The `is_current_ancestor` flag let's you know if a snapshot is part of the linear history of the current snapshot of the table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bab64f90", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM nyc.taxis.history" + ] + }, + { + "cell_type": "markdown", + "id": "47129d69", + "metadata": {}, + "source": [ + "You can time-travel by altering the `current-snapshot-id` property of the table to reference any snapshot in the table's history. Let's revert the table to it's original state by traveling to the very first snapshot ID." 
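The rollback performed in the next cells rewrites the table's current state. As a read-only alternative, a historical snapshot can also be queried in place without changing the table; this is a sketch using the `VERSION AS OF` / `TIMESTAMP AS OF` time-travel syntax available with the Spark 3.5 and Iceberg 1.8 runtime this image installs, and the snapshot ID shown is only a placeholder:

%%sql

-- 1234567890123456789 is a placeholder; substitute a snapshot_id from nyc.taxis.history
SELECT COUNT(*) AS cnt
FROM nyc.taxis VERSION AS OF 1234567890123456789

-- or, by wall-clock time:
-- SELECT COUNT(*) AS cnt FROM nyc.taxis TIMESTAMP AS OF '2022-01-01 00:00:00'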
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c360238", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql --var df\n", + "\n", + "SELECT *\n", + "FROM nyc.taxis.history" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8df43d00", + "metadata": {}, + "outputs": [], + "source": [ + "original_snapshot = df.head().snapshot_id\n", + "spark.sql(f\"CALL system.rollback_to_snapshot('nyc.taxis', {original_snapshot})\")\n", + "original_snapshot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "955a4c52", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT\n", + "VendorID\n", + ",tpep_pickup_datetime\n", + ",tpep_dropoff_datetime\n", + ",fare\n", + ",distance\n", + ",fare_per_distance_unit\n", + "FROM nyc.taxis" + ] + }, + { + "cell_type": "markdown", + "id": "67b71c76", + "metadata": {}, + "source": [ + "Another look at the history table shows that the original state of the table has been added as a new entry\n", + "with the original snapshot ID." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91b801d3", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM nyc.taxis.history" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85667efc", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT COUNT(*) as cnt\n", + "FROM nyc.taxis" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nessie-stack/spark/notebooks/Iceberg - Integrated Audits Demo.ipynb b/nessie-stack/spark/notebooks/Iceberg - Integrated Audits Demo.ipynb new file mode 100644 index 0000000000000..4b60b35e45cf4 --- /dev/null +++ b/nessie-stack/spark/notebooks/Iceberg - Integrated Audits Demo.ipynb @@ -0,0 +1,626 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1041ae6f", + "metadata": {}, + "source": [ + "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" + ] + }, + { + "cell_type": "markdown", + "id": "247fb2ab", + "metadata": {}, + "source": [ + "### [Integrated Audits: Streamlined Data Observability with Apache Iceberg](https://tabular.io/blog/integrated-audits/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd61c16f", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", + "\n", + "spark" + ] + }, + { + "cell_type": "markdown", + "id": "747bee98", + "metadata": {}, + "source": [ + "To be able to rerun the notebook several times, let's drop the `permits` table if it exists to start fresh." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26245f7e", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE DATABASE IF NOT EXISTS nyc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08a13fcc", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DROP TABLE IF EXISTS nyc.permits" + ] + }, + { + "cell_type": "markdown", + "id": "eead44c0", + "metadata": {}, + "source": [ + "# Load NYC Film Permits Data" + ] + }, + { + "cell_type": "markdown", + "id": "6f9a9f41", + "metadata": {}, + "source": [ + "For this demo, we will use the [New York City Film Permits dataset](https://data.cityofnewyork.us/City-Government/Film-Permits/tg4x-b46p) available as part of the NYC Open Data initiative. We're using a locally saved copy of a 1000 record sample, but feel free to download the entire dataset to use in this notebook!\n", + "\n", + "We'll save the sample dataset into an iceberg table called `permits`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3cc669a", + "metadata": {}, + "outputs": [], + "source": [ + "df = spark.read.option(\"inferSchema\",\"true\").option(\"multiline\",\"true\").json(\"/home/iceberg/data/nyc_film_permits.json\")\n", + "df.write.saveAsTable(\"nyc.permits\")" + ] + }, + { + "cell_type": "markdown", + "id": "378cf187", + "metadata": {}, + "source": [ + "Taking a quick peek at the data, you can see that there are a number of permits for different boroughs in New York." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3170161", + "metadata": {}, + "outputs": [], + "source": [ + "spark.read \\\n", + " .format(\"iceberg\") \\\n", + " .load(\"nyc.permits\") \\\n", + " .groupBy(\"borough\") \\\n", + " .count() \\\n", + " .show()" + ] + }, + { + "cell_type": "markdown", + "id": "c85a71a2", + "metadata": {}, + "source": [ + "# Generate an ID for an Integrated Audit Session" + ] + }, + { + "cell_type": "markdown", + "id": "182510da", + "metadata": {}, + "source": [ + "An integrated audit session is a single cadence of:\n", + "1. Staging changes to a table\n", + "2. Auditing the staged changes\n", + "3. Committing the changes (optional)\n", + "\n", + "Each of these sessions must be represented with an ID. You can use any convention that makes sense in your environment but in this demo we'll simply use a UUID." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e39d3d1", + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "ia_session_id = uuid.uuid4().hex\n", + "ia_session_id" + ] + }, + { + "cell_type": "markdown", + "id": "fa31a9ea", + "metadata": {}, + "source": [ + "# The Setup" + ] + }, + { + "cell_type": "markdown", + "id": "d845953b", + "metadata": {}, + "source": [ + "Tables by default are not configured to allow integrated audits, therefore the first step is enabling this by setting the `write.wap.enabled` table metadata property to `true`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf29df0b", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.permits\n", + "SET TBLPROPERTIES (\n", + " 'write.wap.enabled'='true'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1dc5ad69", + "metadata": {}, + "source": [ + "Next, the `spark.wap.id` property of your Spark session configuration must be set to the integrated audit session ID." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65bc4280", + "metadata": {}, + "outputs": [], + "source": [ + "spark.conf.set('spark.wap.id', ia_session_id)" + ] + }, + { + "cell_type": "markdown", + "id": "3a34995b", + "metadata": {}, + "source": [ + "With a `spark.wap.id` value set, you can now safely write **directly to the permits table**--don't worry, these changes will only be staged, not committed!" + ] + }, + { + "cell_type": "markdown", + "id": "437088f6", + "metadata": {}, + "source": [ + "# Staging The Changes" + ] + }, + { + "cell_type": "markdown", + "id": "1c9fa6e9", + "metadata": {}, + "source": [ + "To stage the changes, you simply write directly to the `permits` table. This is awesome in situations where you're working with a large and complex data ingestion pipeline.\n", + "Instead of including hard-coded logic in your pipeline to switch between a sort of \"audit-mode\" as opposed to \"production-mode\", with integrated audits you simple run your\n", + "production code!\n", + "\n", + "For this demo, let's use a simple query that deletes all records for film permits in the manhattan borough." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14843243", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DELETE FROM nyc.permits\n", + "WHERE borough='Manhattan'" + ] + }, + { + "cell_type": "markdown", + "id": "56cc8478", + "metadata": {}, + "source": [ + "As described, even though the query was executed against the production table, these changes are only staged and not committed since we are within an integrated audit session. Let's confirm this by verifying that a count by borough still includes the Manhattan records." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95df15e9", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits\n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "3fe2c863", + "metadata": {}, + "source": [ + "# The Audit" + ] + }, + { + "cell_type": "markdown", + "id": "a7935b0d", + "metadata": {}, + "source": [ + "Once the changes for this session are staged, you can perform all of your audits to validate the data. The first step is to retrieve the snapshot ID generated by the changes and tagged with this integrated audit session ID." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95d71430", + "metadata": {}, + "outputs": [], + "source": [ + "query = f\"\"\"\n", + "SELECT snapshot_id\n", + "FROM nyc.permits.snapshots\n", + "WHERE summary['wap.id'] = '{ia_session_id}'\n", + "\"\"\"\n", + "\n", + "ia_session_snapshot = spark.sql(query).head().snapshot_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1035b246", + "metadata": {}, + "outputs": [], + "source": [ + "ia_session_snapshot" + ] + }, + { + "cell_type": "markdown", + "id": "4c602800", + "metadata": {}, + "source": [ + "This snapshot includes the staged (but not commited) changes to your production table. Once you have this snapshot ID, you can use Iceberg's Time Travel feature to query it!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c95130e9", + "metadata": {}, + "outputs": [], + "source": [ + "spark.read \\\n", + " .option(\"snapshot-id\", ia_session_snapshot) \\\n", + " .format(\"iceberg\") \\\n", + " .load(\"nyc.permits\") \\\n", + " .groupBy(\"borough\") \\\n", + " .count() \\\n", + " .show()" + ] + }, + { + "cell_type": "markdown", + "id": "0cab3813", + "metadata": {}, + "source": [ + "At this point, you can use any auditing tool or technique to validate your changes. For this demo, we'll do a simple audit that confirms that the only remaining boroughs are Queens, Brooklyn, Bronx, and Staten Island. If either borough is missing or any additional boroughs are found, we'll raise an exception." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82af5de3", + "metadata": {}, + "outputs": [], + "source": [ + "expected_boroughs = {\"Queens\", \"Brooklyn\", \"Bronx\", \"Staten Island\"}\n", + "distinct_boroughs = spark.read \\\n", + " .option(\"snapshot-id\", ia_session_snapshot) \\\n", + " .format(\"iceberg\") \\\n", + " .load(\"nyc.permits\") \\\n", + " .select(\"borough\") \\\n", + " .distinct() \\\n", + " .toLocalIterator()\n", + "boroughs = {row[0] for row in distinct_boroughs}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4fad7c2", + "metadata": {}, + "outputs": [], + "source": [ + "# Since `boroughs` and `required_boroughs` are both sets (array of distinct items),\n", + "# we can confirm that they match by checking that the lengths of the sets are equal\n", + "# to eachother as well as to the union of both sets.\n", + "if len(boroughs) != len(expected_boroughs) != len(set.union(boroughs, expected_boroughs)):\n", + " raise ValueError(f\"Audit failed, borough set does not match expected boroughs: {boroughs} != {expected_boroughs}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b1032255", + "metadata": {}, + "source": [ + "If the above check does not fail, we can go ahead and commit our staged data to publish our changes!" + ] + }, + { + "cell_type": "markdown", + "id": "2079435b", + "metadata": {}, + "source": [ + "# The Publish" + ] + }, + { + "cell_type": "markdown", + "id": "88d59f50", + "metadata": {}, + "source": [ + "After the audits are completed, publishing the data is as simple as running a `cherrypick_snapshot` stored procedure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "056236ba", + "metadata": {}, + "outputs": [], + "source": [ + "publish_query = f\"CALL system.cherrypick_snapshot('nyc.permits', {ia_session_snapshot})\"\n", + "%sql $publish_query" + ] + }, + { + "cell_type": "markdown", + "id": "17b868e8", + "metadata": {}, + "source": [ + "That's it! Publishing the changes from this integrated audit session is a simple metadata-only operation that instantly makes the changes live for all downstream consumers querying the `permits` table! Query results will now include the commit that removed all Manhattan records." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "930682ce", + "metadata": {}, + "outputs": [], + "source": [ + "spark.read \\\n", + " .format(\"iceberg\") \\\n", + " .load(\"nyc.permits\") \\\n", + " .groupBy(\"borough\") \\\n", + " .count() \\\n", + " .show()" + ] + }, + { + "cell_type": "markdown", + "id": "d941b990", + "metadata": {}, + "source": [ + "# What Happens When The Audits Fail?" 
+ ] + }, + { + "cell_type": "markdown", + "id": "f6b4084e", + "metadata": {}, + "source": [ + "What about when your audits fail? What happens to the snapshots generated? How about the data and metadata files?\n", + "\n", + "One of the best parts of Iceberg's integrated audits is that the cleanup of \"*staged-yet-not-committed-data*\" is part of the normal snapshot cleanup process of a typical Iceberg warehouse. To be more specific, let's say a daily snapshot expiration is performed on the data warehouse (using the [expire_snapshots](https://iceberg.apache.org/docs/latest/spark-procedures/#expire_snapshots) procedure) and all snapshots older than 7 days are expired. That means once your staged snapshot reaches 7 days in age, it will be expired.\n", + "\n", + "Additionally, since the changes were never committed, the underlying data files for the snapshot will be removed since they're not referenced by any other snapshots in the linear history of the table.\n", + "\n", + "Let's see this in action. First, start a new integrated audit session and stage a commit by inserting a single record." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25eff1ef", + "metadata": {}, + "outputs": [], + "source": [ + "ia_session_id = uuid.uuid4().hex\n", + "ia_session_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4726b169", + "metadata": {}, + "outputs": [], + "source": [ + "spark.conf.set('spark.wap.id', ia_session_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31bf19f1", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "INSERT INTO nyc.permits\n", + "VALUES (\n", + " 'Hoboken',\n", + " 'Television',\n", + " '1',\n", + " 'United States of America',\n", + " '2021-11-24T23:00:00.000',\n", + " '2021-11-23T09:38:17.000',\n", + " 'Mayor\\'s Office of Film, Theatre & Broadcasting',\n", + " '613322',\n", + " 'Shooting Permit',\n", + " 'WASHINGTON STREET',\n", + " '100',\n", + " '2021-11-24T07:00:00.000',\n", + " 'Episodic series',\n", + " '07030'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "aa29184b", + "metadata": {}, + "source": [ + "Next, let's identify the snapshot that was tagged with the integrated audit session ID." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "682a5f52", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT snapshot_id\n", + "FROM nyc.permits.snapshots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef4dd148", + "metadata": {}, + "outputs": [], + "source": [ + "query = f\"\"\"\n", + "SELECT snapshot_id\n", + "FROM nyc.permits.snapshots\n", + "WHERE summary['wap.id'] = '{ia_session_id}'\n", + "\"\"\"\n", + "\n", + "ia_session_snapshot = spark.sql(query).head().snapshot_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62f52c08", + "metadata": {}, + "outputs": [], + "source": [ + "ia_session_snapshot" + ] + }, + { + "cell_type": "markdown", + "id": "60561dff", + "metadata": {}, + "source": [ + "A quick check of the history table shows that this snapshot is not included as part of the current history of the table since it has not been published yet." 
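As an aside before the checks below: the daily 7-day expiration policy described above could be expressed roughly as the following sketch. `spark` is the session already created in this notebook, the timestamp format mirrors the one used in the table-maintenance notebook in this stack, and `retain_last => 5` is an arbitrary example value rather than anything the demo requires:

from datetime import datetime, timedelta

# Expire everything older than 7 days, but always keep at least 5 snapshots.
cutoff = (datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
spark.sql(
    "CALL system.expire_snapshots("
    f"table => 'nyc.permits', older_than => TIMESTAMP '{cutoff}', retain_last => 5)"
).show()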
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec96a9c0", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM nyc.permits.history" + ] + }, + { + "cell_type": "markdown", + "id": "6ec54f3a", + "metadata": {}, + "source": [ + "In a scenario where the audits fail and this change is not published, the `expire_snapshots` procedure will clean up the snapshot **and** the data files. Let's demonstrate this by calling the `expire_snapshots` procedure for all snapshots older than the current timestamp." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4727c61e", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "%sql CALL system.expire_snapshots('nyc.permits', {round(time.time() * 1000)}, 100)" + ] + }, + { + "cell_type": "markdown", + "id": "c8e47351", + "metadata": {}, + "source": [ + "The output from the `expire_snapshots` procedure shows that a data file, a manifest file, and a manifest list file were deleted. Furthermore, the snapshot no longer appears in the permit table's snapshots table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53f53072", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT *\n", + "FROM nyc.permits.snapshots" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nessie-stack/spark/notebooks/Iceberg - Table Maintenance Spark Procedures.ipynb b/nessie-stack/spark/notebooks/Iceberg - Table Maintenance Spark Procedures.ipynb new file mode 100644 index 0000000000000..8f3192b49542e --- /dev/null +++ b/nessie-stack/spark/notebooks/Iceberg - Table Maintenance Spark Procedures.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1041ae6f", + "metadata": {}, + "source": [ + "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" + ] + }, + { + "cell_type": "markdown", + "id": "247fb2ab", + "metadata": {}, + "source": [ + "### [Table Maintenance: The Key To Keeping Your Iceberg Tables Healthy and Performant](https://tabular.io/blog/table-maintenance/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a5c8206", + "metadata": {}, + "outputs": [], + "source": [ + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dab5ef0", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DROP TABLE IF EXISTS demo.nyc.taxis_sample\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49a45d0b", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"\"\"\n", + "CREATE TABLE demo.nyc.taxis_sample (\n", + " `VendorID` BIGINT,\n", + " `tpep_pickup_datetime` TIMESTAMP,\n", + " `tpep_dropoff_datetime` TIMESTAMP,\n", + " `passenger_count` DOUBLE,\n", + " `trip_distance` DOUBLE,\n", + " `RatecodeID` DOUBLE,\n", + " `store_and_fwd_flag` STRING,\n", + " `PULocationID` BIGINT,\n", + " `DOLocationID` BIGINT,\n", + " `payment_type` BIGINT,\n", + " `fare_amount` DOUBLE,\n", + " `extra` DOUBLE,\n", + " `mta_tax` DOUBLE,\n", + " `tip_amount` DOUBLE,\n", + " `tolls_amount` DOUBLE,\n", + " `improvement_surcharge` DOUBLE,\n", + " 
`total_amount` DOUBLE,\n", + " `congestion_surcharge` DOUBLE,\n", + " `airport_fee` DOUBLE)\n", + "USING iceberg\n", + "TBLPROPERTIES(\n", + " 'write.target-file-size-bytes'='5242880'\n", + ")\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "997bb9df", + "metadata": {}, + "outputs": [], + "source": [ + "val df_202201 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-01.parquet\")\n", + "val df_202202 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-02.parquet\")\n", + "val df_202203 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-03.parquet\")\n", + "val df_q1 = df_202201.union(df_202202).union(df_202203)\n", + "df_q1.write.insertInto(\"nyc.taxis_sample\")" + ] + }, + { + "cell_type": "markdown", + "id": "78cab088", + "metadata": {}, + "source": [ + "## Rewriting Data Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ad64e6b", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT file_path, file_size_in_bytes FROM nyc.taxis_sample.files\").show(100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5d10355", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"ALTER TABLE nyc.taxis_sample UNSET TBLPROPERTIES ('write.target-file-size-bytes')\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f26228a5", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"CALL demo.system.rewrite_data_files(table => 'nyc.taxis_sample', options => map('target-file-size-bytes','52428800'))\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43a9ed67", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT file_path, file_size_in_bytes FROM nyc.taxis_sample.files\").show(100)" + ] + }, + { + "cell_type": "markdown", + "id": "523eb893", + "metadata": {}, + "source": [ + "## Expiring Snapshots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98e8c5db", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT committed_at, snapshot_id, operation FROM nyc.taxis_sample.snapshots\").show(truncate=false)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b264c989", + "metadata": {}, + "outputs": [], + "source": [ + "val now = java.util.Calendar.getInstance().getTime()\n", + "val format = new java.text.SimpleDateFormat(\"yyyy-MM-dd HH:mm:ss.SSS\")\n", + "val now_str = format.format(now)\n", + "\n", + "spark.sql(s\"CALL demo.system.expire_snapshots(table => 'nyc.taxis_sample', older_than => TIMESTAMP '$now_str', retain_last => 1)\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "131e1f09", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT committed_at, snapshot_id, operation FROM nyc.taxis_sample.snapshots\").show(truncate=false)" + ] + }, + { + "cell_type": "markdown", + "id": "181212b6", + "metadata": {}, + "source": [ + "## Rewriting Manifest Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49290e56", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"CALL demo.system.rewrite_manifests('nyc.taxis_sample')\").show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "spylon-kernel", + "language": "scala", + "name": "spylon-kernel" + }, + "language_info": { + "codemirror_mode": "text/x-scala", + "file_extension": ".scala", + "help_links": [ + { + "text": "MetaKernel Magics", + "url": 
"https://metakernel.readthedocs.io/en/latest/source/README.html" + } + ], + "mimetype": "text/x-scala", + "name": "scala", + "pygments_lexer": "scala", + "version": "0.4.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nessie-stack/spark/notebooks/Iceberg - View Support.ipynb b/nessie-stack/spark/notebooks/Iceberg - View Support.ipynb new file mode 100644 index 0000000000000..fa8dd7f6bbc3e --- /dev/null +++ b/nessie-stack/spark/notebooks/Iceberg - View Support.ipynb @@ -0,0 +1,525 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1041ae6f", + "metadata": {}, + "source": [ + "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a5c8206", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", + "\n", + "spark" + ] + }, + { + "cell_type": "markdown", + "id": "6f9a9f41", + "metadata": {}, + "source": [ + "## Load Two Months of NYC Taxi/Limousine Trip Data\n", + "\n", + "This notebook uses the New York City Taxi and Limousine Commission Trip Record Data available on the AWS Open Data Registry. This contains data of trips taken by taxis and for-hire vehicles in New York City. This data is stored in an iceberg table called `taxis`." + ] + }, + { + "cell_type": "markdown", + "id": "747bee98", + "metadata": {}, + "source": [ + "To be able to rerun the notebook several times, let's drop the table and the views if they exist to start fresh." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "930682ce", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE DATABASE IF NOT EXISTS nyc.taxis;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22ac5552", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DROP TABLE IF EXISTS nyc.taxis\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cf5b4c0-89ac-4f79-8beb-fc55554bab22", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DROP VIEW IF EXISTS nyc.long_distances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a443155-85fe-4e7a-8216-9669e0765c93", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DROP VIEW IF EXISTS nyc.negative_amounts" + ] + }, + { + "cell_type": "markdown", + "id": "5816de2e", + "metadata": {}, + "source": [ + "## Create the table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f918310a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE TABLE nyc.taxis (\n", + " VendorID bigint,\n", + " tpep_pickup_datetime timestamp,\n", + " tpep_dropoff_datetime timestamp,\n", + " passenger_count double,\n", + " trip_distance double,\n", + " RatecodeID double,\n", + " store_and_fwd_flag string,\n", + " PULocationID bigint,\n", + " DOLocationID bigint,\n", + " payment_type bigint,\n", + " fare_amount double,\n", + " extra double,\n", + " mta_tax double,\n", + " tip_amount double,\n", + " tolls_amount double,\n", + " improvement_surcharge double,\n", + " total_amount double,\n", + " congestion_surcharge double,\n", + " airport_fee double\n", + ")\n", + "USING iceberg\n", + "PARTITIONED BY (days(tpep_pickup_datetime))" + ] + }, + { + "cell_type": "markdown", + "id": "fcba103e", + "metadata": {}, + "source": [ + "# Write a month of data" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "1c37ca92", + "metadata": {}, + "outputs": [], + "source": [ + "df = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-01.parquet\")\n", + "df.writeTo(\"nyc.taxis\").append()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a69152aa", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT * FROM nyc.taxis" + ] + }, + { + "cell_type": "markdown", + "id": "fd854d56-33d5-46a5-b552-869479b8e188", + "metadata": {}, + "source": [ + "# Create a view\n", + "\n", + "Let's create an Iceberg view to look at the longest distances travelled and the total amount of the trips." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fade1a3", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE VIEW nyc.long_distances (\n", + " vendor_id COMMENT 'Vendor ID',\n", + " pickup_date,\n", + " dropoff_date,\n", + " distance COMMENT 'Trip Distance',\n", + " total COMMENT 'Total amount')\n", + " AS SELECT VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, trip_distance, total_amount FROM nyc.taxis ORDER BY trip_distance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfee5d8f-f862-4aa3-a096-8ff9ea66ba26", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT * FROM nyc.long_distances" + ] + }, + { + "cell_type": "markdown", + "id": "6fce6bb4", + "metadata": {}, + "source": [ + "## Update View to order results differently\n", + "\n", + "The output isn't as helpful as imagined, so let's update the view and change the order of columns and the ordering of the results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74c10267-d65b-4650-ab92-02a978f5872a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE OR REPLACE VIEW nyc.long_distances (\n", + " distance COMMENT 'Trip Distance',\n", + " total COMMENT 'Total amount',\n", + " vendor_id COMMENT 'Vendor ID',\n", + " pickup_date,\n", + " dropoff_date)\n", + " AS SELECT trip_distance, total_amount, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime\n", + " FROM nyc.taxis\n", + " WHERE trip_distance > 35 ORDER BY total_amount, trip_distance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e764a28-297f-4c8d-87dc-45ae63380d6e", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT * FROM nyc.long_distances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "369b6e37-c7b3-4402-9087-2d9074b53dd7", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT count(*) FROM nyc.long_distances" + ] + }, + { + "cell_type": "markdown", + "id": "4aa4a9cd", + "metadata": {}, + "source": [ + "# Write a month of data\n", + "\n", + "Let's write another month of data and see how the results of the view change" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed7a7b8f", + "metadata": {}, + "outputs": [], + "source": [ + "df = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-02.parquet\")\n", + "df.writeTo(\"nyc.taxis\").append()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbfb160c", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT * FROM nyc.long_distances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8085d47e-d629-408c-9753-95f58fac23c5", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT count(*) FROM nyc.long_distances" + 
] + }, + { + "cell_type": "markdown", + "id": "35bf8f88-a493-42c0-b7a9-3941f8ebf4c8", + "metadata": {}, + "source": [ + "# Create another view\n", + "It appears that there are trips with negative total amounts. Let's display these results in a separate view" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa47bf43-2460-4990-88df-6040897c3386", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE OR REPLACE VIEW nyc.negative_amounts (\n", + " total COMMENT 'Total amount',\n", + " distance COMMENT 'Trip Distance',\n", + " vendor_id COMMENT 'Vendor ID',\n", + " pickup_date,\n", + " dropoff_date)\n", + " AS SELECT total_amount, trip_distance, VendorID, tpep_pickup_datetime, tpep_dropoff_datetime\n", + " FROM nyc.taxis\n", + " WHERE total_amount < 0 ORDER BY total_amount" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6c6a306-1752-4a6d-9213-d5b615110b1d", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT * FROM nyc.negative_amounts" + ] + }, + { + "cell_type": "markdown", + "id": "65deb074", + "metadata": {}, + "source": [ + "# Listing and describing views" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bab64f90", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SHOW VIEWS in nyc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be3b1930-e140-4795-81d9-9e5abe626fb7", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SHOW VIEWS in nyc LIKE '*neg*'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfc2dcb3-4717-4730-94b6-18b9a239cf74", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DESCRIBE nyc.long_distances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7653ef78-f419-462b-915b-0cbd9f62d473", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DESCRIBE EXTENDED nyc.long_distances" + ] + }, + { + "cell_type": "markdown", + "id": "b11e64c9", + "metadata": {}, + "source": [ + "# Displaying the CREATE statement of a view" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c4a942c", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SHOW CREATE TABLE nyc.long_distances" + ] + }, + { + "cell_type": "markdown", + "id": "42f6e042-00ce-4277-bf9a-16931f898d7b", + "metadata": {}, + "source": [ + "# Altering and displaying properties of a view\n", + "\n", + "This will add a new property and also update the comment of the view. \n", + "The comment will be shown when describing the view.\n", + "The end of this section will also remove a property from the view." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa823a4c-ede3-40d7-906e-27818070fa9b", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SHOW TBLPROPERTIES nyc.long_distances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c3b2fb4-4db9-408f-a36d-84970108dd5b", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER VIEW nyc.long_distances SET TBLPROPERTIES ('key1' = 'val1', 'key2' = 'val2', 'comment' = 'This is a view comment')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a8dd804-d222-44e2-92b9-2069868e206a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SHOW TBLPROPERTIES nyc.long_distances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1950bd5d-a5fc-4ee1-a4a9-1242261232f0", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DESCRIBE EXTENDED nyc.long_distances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88781989-0967-4349-b435-ad193c9697e7", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER VIEW nyc.long_distances UNSET TBLPROPERTIES ('key1')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6864599-b4fa-4525-8304-f1cb3ee7144a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SHOW TBLPROPERTIES nyc.long_distances" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nessie-stack/spark/notebooks/Iceberg - Write-Audit-Publish (WAP) with Branches.ipynb b/nessie-stack/spark/notebooks/Iceberg - Write-Audit-Publish (WAP) with Branches.ipynb new file mode 100644 index 0000000000000..a1db83ae6c8c3 --- /dev/null +++ b/nessie-stack/spark/notebooks/Iceberg - Write-Audit-Publish (WAP) with Branches.ipynb @@ -0,0 +1,788 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1041ae6f", + "metadata": {}, + "source": [ + "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" + ] + }, + { + "cell_type": "markdown", + "id": "7b6633c1", + "metadata": {}, + "source": [ + "## Write-Audit-Publish with Branches in Apache Iceberg" + ] + }, + { + "cell_type": "markdown", + "id": "08d9d173", + "metadata": {}, + "source": [ + "This notebook runs using the Docker Compose at https://github.com/tabular-io/docker-spark-iceberg. \n", + "It's based on the [Iceberg - Integrated Audits Demo.ipynb](https://github.com/tabular-io/docker-spark-iceberg/blob/main/spark/notebooks/Iceberg%20-%20Integrated%20Audits%20Demo.ipynb) notebook. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd61c16f", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n", + "\n", + "spark" + ] + }, + { + "cell_type": "markdown", + "id": "747bee98", + "metadata": {}, + "source": [ + "To be able to rerun the notebook several times, let's drop the `permits` table if it exists to start fresh." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26245f7e", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE DATABASE IF NOT EXISTS nyc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f66e5810", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DROP TABLE IF EXISTS nyc.permits" + ] + }, + { + "cell_type": "markdown", + "id": "eead44c0", + "metadata": {}, + "source": [ + "# Load NYC Film Permits Data" + ] + }, + { + "cell_type": "markdown", + "id": "6f9a9f41", + "metadata": {}, + "source": [ + "For this demo, we will use the [New York City Film Permits dataset](https://data.cityofnewyork.us/City-Government/Film-Permits/tg4x-b46p) available as part of the NYC Open Data initiative. We're using a locally saved copy of a 1000 record sample, but feel free to download the entire dataset to use in this notebook!\n", + "\n", + "We'll save the sample dataset into an iceberg table called `permits`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3cc669a", + "metadata": {}, + "outputs": [], + "source": [ + "df = spark.read.option(\"inferSchema\",\"true\").option(\"multiline\",\"true\").json(\"/home/iceberg/data/nyc_film_permits.json\")\n", + "df.write.saveAsTable(\"nyc.permits\")" + ] + }, + { + "cell_type": "markdown", + "id": "378cf187", + "metadata": {}, + "source": [ + "Taking a quick peek at the data, you can see that there are a number of permits for different boroughs in New York." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3170161", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits\n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "fa31a9ea", + "metadata": {}, + "source": [ + "# The Setup" + ] + }, + { + "cell_type": "markdown", + "id": "d845953b", + "metadata": {}, + "source": [ + "Tables by default are not configured to allow integrated audits, therefore the first step is enabling this by setting the `write.wap.enabled` table metadata property to `true`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf29df0b", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.permits\n", + "SET TBLPROPERTIES (\n", + " 'write.wap.enabled'='true'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6405f8a7", + "metadata": {}, + "source": [ + "We create a branch for the work we want to do. This is a copy-on-write branch, so \"free\" until we start making changes (and \"cheap\" thereafter) since only data that's changed needs to be written. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14035a18", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.permits\n", + "CREATE BRANCH etl_job_42" + ] + }, + { + "cell_type": "markdown", + "id": "437088f6", + "metadata": {}, + "source": [ + "# Write" + ] + }, + { + "cell_type": "markdown", + "id": "a24b066e", + "metadata": {}, + "source": [ + "Before writing to the table we set `spark.wap.branch` so that writes (and reads) are against the specified branch of the table. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "842e361c", + "metadata": {}, + "outputs": [], + "source": [ + "spark.conf.set('spark.wap.branch', 'etl_job_42')" + ] + }, + { + "cell_type": "markdown", + "id": "6b749826", + "metadata": {}, + "source": [ + "Now make the change to the table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14843243", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DELETE FROM nyc.permits\n", + "WHERE borough='Manhattan'" + ] + }, + { + "cell_type": "markdown", + "id": "8eb688c8", + "metadata": {}, + "source": [ + "## Inspecting the staged/unpublished data" + ] + }, + { + "cell_type": "markdown", + "id": "ce1b1256", + "metadata": {}, + "source": [ + "### Staged/unpublished data" + ] + }, + { + "cell_type": "markdown", + "id": "1c4c04cd", + "metadata": {}, + "source": [ + "The changes are reflected in the table:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfbd0d4b", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits\n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "bf332168", + "metadata": {}, + "source": [ + "Note that because `spark.wap.branch` is set the above query is effectively the same as this one with `VERSION AS OF` for the branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cd4b72b", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits VERSION AS OF 'etl_job_42'\n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "0ac96fd0", + "metadata": {}, + "source": [ + "Another syntax (albiet less clear IMHO) for `VERSION AS OF` is a `branch_` suffix to the table: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "169de151", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits.branch_etl_job_42\n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "85332bf5", + "metadata": {}, + "source": [ + "### Published data" + ] + }, + { + "cell_type": "markdown", + "id": "5ad40cf1", + "metadata": {}, + "source": [ + "We can also inspect the unmodified `main` version of the table with `VERSION AS OF`: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95df15e9", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits VERSION AS OF 'main'\n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "bf419df3", + "metadata": {}, + "source": [ + "The same `branch_` suffix words here too: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "270c09c6", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits.branch_main\n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "c17c824f", + "metadata": {}, + "source": [ + "Any other user of the table will see the full set of data. 
We can reassure ourselves of this by unsetting `spark.wap.branch` for the session and querying the table without any `VERSION AS OF` modifier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e9f7ea3", + "metadata": {}, + "outputs": [], + "source": [ + "spark.conf.unset('spark.wap.branch')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "935d46c8", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits\n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "a8ad4c4e", + "metadata": {}, + "source": [ + "# Audit" + ] + }, + { + "cell_type": "markdown", + "id": "ba43c910", + "metadata": {}, + "source": [ + "How you audit the data is up to you. The nice thing about the data being staged is that you can do it within the same ETL job, or have another tool do it. " + ] + }, + { + "cell_type": "markdown", + "id": "90485620", + "metadata": {}, + "source": [ + "Here's a very simple example of doing this in Python. We're going to programmatically check that only the four expected boroughs remain in the data. \n", + "\n", + "First, we define those that are expected: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d68a3b1", + "metadata": {}, + "outputs": [], + "source": [ + "expected_boroughs = {\"Queens\", \"Brooklyn\", \"Bronx\", \"Staten Island\"}" + ] + }, + { + "cell_type": "markdown", + "id": "b17dfd72", + "metadata": {}, + "source": [ + "Then we get a set of the actual boroughs in the staged data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ce3d70a", + "metadata": {}, + "outputs": [], + "source": [ + "distinct_boroughs = spark.read \\\n", + " .option(\"branch\", \"etl_job_42\") \\\n", + " .format(\"iceberg\") \\\n", + " .load(\"nyc.permits\") \\\n", + " .select(\"borough\") \\\n", + " .distinct() \\\n", + " .toLocalIterator()\n", + "boroughs = {row[0] for row in distinct_boroughs}" + ] + }, + { + "cell_type": "markdown", + "id": "4a30827d", + "metadata": {}, + "source": [ + "Now we do two checks: \n", + "\n", + "1. Compare the length of the expected vs actual set\n", + "2. Check that the two sets when unioned are still the same length.
This is necessary, since the first test isn't sufficient alone" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aeb78c2b", + "metadata": {}, + "outputs": [], + "source": [ + "if ( (len(boroughs) != len(expected_boroughs)) \\\n", + " or (len(boroughs) != len(set.union(boroughs, expected_boroughs))) \\\n", + " or (len(expected_boroughs) != len(set.union(boroughs, expected_boroughs)))):\n", + " raise ValueError(f\"Audit failed, borough set does not match expected boroughs: {boroughs} != {expected_boroughs}\")\n", + "else:\n", + " print(f\"Audit has passed 🙌🏻\")" + ] + }, + { + "cell_type": "markdown", + "id": "28a18925", + "metadata": {}, + "source": [ + "# Publish" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7bc16e05", + "metadata": {}, + "source": [ + "Iceberg supports fast-forward merging of branches back to `main`, using the [`manageSnapshots().fastForwardBranch`](https://iceberg.apache.org/javadoc/latest/org/apache/iceberg/ManageSnapshots.html#fastForwardBranch-java.lang.String-java.lang.String-) API.\n", + "\n", + "This isn't yet exposed in Spark, so the existing [`cherrypick`](https://iceberg.apache.org/javadoc/latest/org/apache/iceberg/ManageSnapshots.html#cherrypick-long-) can be used as a slightly less elegant option.\n", + "\n", + "ℹ️ Note that `cherrypick` only works for one commit. " + ] + }, + { + "cell_type": "markdown", + "id": "4619fe57", + "metadata": {}, + "source": [ + "First, we need the snapshot ID of our branch, which we can get from the `.refs` table:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cd5d318", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT * FROM nyc.permits.refs " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c5f09f1", + "metadata": {}, + "outputs": [], + "source": [ + "query = f\"\"\"\n", + "SELECT snapshot_id\n", + "FROM nyc.permits.refs\n", + "WHERE name = 'etl_job_42'\n", + "\"\"\"\n", + "\n", + "wap_snapshot_id = spark.sql(query).head().snapshot_id" + ] + }, + { + "cell_type": "markdown", + "id": "58520bc1", + "metadata": {}, + "source": [ + "Now we do the publish, using `cherrypick_snapshot` and the snapshot id:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b93d6f91", + "metadata": {}, + "outputs": [], + "source": [ + "publish_query = f\"CALL system.cherrypick_snapshot('nyc.permits', {wap_snapshot_id})\"\n", + "\n", + "%sql $publish_query" + ] + }, + { + "cell_type": "markdown", + "id": "d7546923", + "metadata": {}, + "source": [ + "Finally, we look at the table and revel in the glory that is our published changes 🎉" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d62ebc6", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits.branch_etl_job_42\n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "0f60be58", + "metadata": {}, + "source": [ + "We can also inspect the unmodified `main` version of the table with `VERSION AS OF`: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97c7a98e", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits VERSION AS OF 'main'\n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "b0072a24", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# What if You Don't Want to Publish Changes?"
+ ] + }, + { + "cell_type": "markdown", + "id": "441d1e92", + "metadata": {}, + "source": [ + "If you don't want to merge the branch you can simply `DROP` it. " + ] + }, + { + "cell_type": "markdown", + "id": "49dd455a", + "metadata": {}, + "source": [ + "## Create a new branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dd25215", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.permits\n", + "CREATE BRANCH new_etl_job" + ] + }, + { + "cell_type": "markdown", + "id": "bc9bac1e", + "metadata": {}, + "source": [ + "## Set `spark.wap.branch`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2e2284", + "metadata": {}, + "outputs": [], + "source": [ + "spark.conf.set('spark.wap.branch', 'new_etl_job')" + ] + }, + { + "cell_type": "markdown", + "id": "a5f65ce0", + "metadata": {}, + "source": [ + "## Write" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4dbd6ed3", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "DELETE FROM nyc.permits WHERE borough LIKE '%'" + ] + }, + { + "cell_type": "markdown", + "id": "0344e3d0", + "metadata": {}, + "source": [ + "## Audit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea44436a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits \n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "b5974d6e", + "metadata": {}, + "source": [ + "### Whoops 🤭 \n", + "We deleted all the data" + ] + }, + { + "cell_type": "markdown", + "id": "74ebb57d", + "metadata": {}, + "source": [ + "### Reassure ourselves that the data is still there in `main`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87d8ee7e", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT borough, count(*) permit_cnt\n", + "FROM nyc.permits VERSION AS OF 'main'\n", + "GROUP BY borough" + ] + }, + { + "cell_type": "markdown", + "id": "22d5d357", + "metadata": {}, + "source": [ + "## Abandon changes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0533a1f6", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER TABLE nyc.permits\n", + "DROP BRANCH new_etl_job" + ] + }, + { + "cell_type": "markdown", + "id": "328ed36c", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Where Next?" + ] + }, + { + "cell_type": "markdown", + "id": "cca251c2", + "metadata": {}, + "source": [ + "For more information about write-audit-publish see [this talk from Michelle Winters](https://www.youtube.com/watch?v=fXHdeBnpXrg&t=1001s) and [this talk from Sam Redai](https://www.dremio.com/wp-content/uploads/2022/05/Sam-Redai-The-Write-Audit-Publish-Pattern-via-Apache-Iceberg.pdf)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nessie-stack/spark/notebooks/PyIceberg - Getting Started.ipynb b/nessie-stack/spark/notebooks/PyIceberg - Getting Started.ipynb new file mode 100644 index 0000000000000..e58f19c888164 --- /dev/null +++ b/nessie-stack/spark/notebooks/PyIceberg - Getting Started.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1041ae6f", + "metadata": {}, + "source": [ + "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a5c8206", + "metadata": {}, + "outputs": [], + "source": [ + "from pyiceberg import __version__\n", + "\n", + "__version__" + ] + }, + { + "cell_type": "markdown", + "id": "6f9a9f41", + "metadata": {}, + "source": [ + "## Load NYC Taxi/Limousine Trip Data\n", + "\n", + "For this notebook, we will use the New York City Taxi and Limousine Commission Trip Record Data that's available on the AWS Open Data Registry. This contains data of trips taken by taxis and for-hire vehicles in New York City. We'll save this into an iceberg table called `taxis`.\n", + "\n", + "First, load the Parquet file using PyArrow:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a890a18-6078-4574-8ade-7598ba91bf6b", + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow.parquet as pq\n", + "\n", + "tbl_taxis = pq.read_table('/home/iceberg/data/yellow_tripdata_2021-04.parquet')\n", + "tbl_taxis" + ] + }, + { + "cell_type": "markdown", + "id": "63d2ac2a-7e21-4f5b-b357-a020797099fd", + "metadata": {}, + "source": [ + "## Creating the table\n", + "\n", + "Next, create the namespace, and the `taxis` table from the schema that's derived from the Arrow schema:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fddb808", + "metadata": {}, + "outputs": [], + "source": [ + "from pyiceberg.catalog import load_catalog\n", + "from pyiceberg.exceptions import NamespaceAlreadyExistsError\n", + "\n", + "cat = load_catalog('default')\n", + "\n", + "try:\n", + " cat.create_namespace('default')\n", + "except NamespaceAlreadyExistsError:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "430bd828-f856-4230-aff7-94274fbce96d", + "metadata": {}, + "outputs": [], + "source": [ + "from pyiceberg.exceptions import NoSuchTableError\n", + "\n", + "try:\n", + " cat.drop_table('default.taxis')\n", + "except NoSuchTableError:\n", + " pass\n", + "\n", + "tbl = cat.create_table(\n", + " 'default.taxis',\n", + " schema=tbl_taxis.schema\n", + ")\n", + "\n", + "tbl" + ] + }, + { + "cell_type": "markdown", + "id": "56818a92-12c6-4806-a700-3071b9b3753c", + "metadata": {}, + "source": [ + "## Write the actual data into the table\n", + "\n", + "This will create a new snapshot on the table:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "672a87b1-7132-489f-934c-8243016b20b4", + "metadata": {}, + "outputs": [], + "source": [ + "tbl.overwrite(tbl_taxis)\n", + "\n", + "tbl" + ] + }, + { + "cell_type": "markdown", + "id": "d87c4f8e-3d04-493b-9faf-292b39656a48", + "metadata": {},
+ "source": [ + "## Append more data\n", + "\n", + "Let's append another month of data to the table:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a9b5f47-d696-4742-9b72-b4ea203bd8de", + "metadata": {}, + "outputs": [], + "source": [ + "tbl.append(pq.read_table('/home/iceberg/data/yellow_tripdata_2021-05.parquet'))\n", + "\n", + "tbl" + ] + }, + { + "cell_type": "markdown", + "id": "efa23071-8207-4c3d-86bc-db5bf4d768c0", + "metadata": {}, + "source": [ + "## Load data into a PyArrow Dataframe\n", + "\n", + "We'll fetch the table using the REST catalog that comes with the setup." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "794de3a0", + "metadata": {}, + "outputs": [], + "source": [ + "tbl = cat.load_table('default.taxis')\n", + "\n", + "sc = tbl.scan(row_filter=\"tpep_pickup_datetime >= '2021-05-01T00:00:00.000000'\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3ac7021", + "metadata": {}, + "outputs": [], + "source": [ + "df = sc.to_arrow().to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e818e4a", + "metadata": {}, + "outputs": [], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7034fa26", + "metadata": {}, + "outputs": [], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32d7e6ef", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fb4b02a", + "metadata": {}, + "outputs": [], + "source": [ + "df.hist(column='fare_amount')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81f7cc19", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from scipy import stats\n", + "\n", + "stats.zscore(df['fare_amount'])\n", + "\n", + "# Remove everything larger than 3 stddev\n", + "df = df[(np.abs(stats.zscore(df['fare_amount'])) < 3)]\n", + "# Remove everything below zero\n", + "df = df[df['fare_amount'] > 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18771ccb", + "metadata": {}, + "outputs": [], + "source": [ + "df.hist(column='fare_amount')" + ] + }, + { + "cell_type": "markdown", + "id": "886c8408", + "metadata": {}, + "source": [ + "# DuckDB\n", + "\n", + "Use DuckDB to Query the PyArrow Dataframe directly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d5d6fb8", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext sql\n", + "%config SqlMagic.autopandas = True\n", + "%config SqlMagic.feedback = False\n", + "%config SqlMagic.displaycon = False\n", + "%sql duckdb:///:memory:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b6f9522", + "metadata": {}, + "outputs": [], + "source": [ + "%sql SELECT * FROM df LIMIT 20" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5314f2b", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql --save tip_amount --no-execute\n", + "\n", + "SELECT tip_amount\n", + "FROM df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3dec260", + "metadata": {}, + "outputs": [], + "source": [ + "%sqlplot histogram --table df --column tip_amount --bins 22 --with tip_amount\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "989827d9", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql --save tip_amount_filtered --no-execute\n", + "\n", + "WITH tip_amount_stddev AS (\n", + " SELECT STDDEV_POP(tip_amount) AS tip_amount_stddev\n", + " FROM df\n", + ")\n", + "\n", + "SELECT tip_amount\n", + "FROM df, tip_amount_stddev\n", + "WHERE tip_amount > 0\n", + " AND tip_amount < tip_amount_stddev * 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d1df179", + "metadata": {}, + "outputs": [], + "source": [ + "%sqlplot histogram --table tip_amount_filtered --column tip_amount --bins 50 --with tip_amount_filtered\n" + ] + }, + { + "cell_type": "markdown", + "id": "08d2c62d", + "metadata": {}, + "source": [ + "# Iceberg ❤️ PyArrow and DuckDB\n", + "\n", + "This notebook shows how you can load data into a PyArrow dataframe and query it using DuckDB easily. Iceberg allows you to take a slice out of the data that you need for your analysis, while reducing the time that you have to wait for the data and without polluting the memory with data that you're not going to use."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72a9c64d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nessie-stack/spark/notebooks/PyIceberg - Write support.ipynb b/nessie-stack/spark/notebooks/PyIceberg - Write support.ipynb new file mode 100644 index 0000000000000..e8b654fb40e68 --- /dev/null +++ b/nessie-stack/spark/notebooks/PyIceberg - Write support.ipynb @@ -0,0 +1,251 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1041ae6f", + "metadata": {}, + "source": [ + "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" + ] + }, + { + "cell_type": "markdown", + "id": "247fb2ab", + "metadata": {}, + "source": [ + "### [Docker, Spark, and Iceberg: The Fastest Way to Try Iceberg!](https://tabular.io/blog/docker-spark-and-iceberg/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a5c8206", + "metadata": {}, + "outputs": [], + "source": [ + "from pyiceberg import __version__\n", + "\n", + "__version__" + ] + }, + { + "cell_type": "markdown", + "id": "6f9a9f41", + "metadata": {}, + "source": [ + "# Write support\n", + "\n", + "This notebook demonstrates writing to Iceberg tables using PyIceberg. First, connect to the [catalog](https://iceberg.apache.org/concepts/catalog/#iceberg-catalogs), the place where tables are being tracked." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47645b52", + "metadata": {}, + "outputs": [], + "source": [ + "from pyiceberg.catalog import load_catalog\n", + "\n", + "catalog = load_catalog('default')" + ] + }, + { + "cell_type": "markdown", + "id": "c531bd4b-9943-4516-9a6a-99fab016ed2b", + "metadata": {}, + "source": [ + "# Loading data using Arrow\n", + "\n", + "PyArrow is used to load a Parquet file into memory, and using PyIceberg this data can be written to an Iceberg table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fddb808", + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow.parquet as pq\n", + "\n", + "df = pq.read_table(\"/home/iceberg/data/yellow_tripdata_2022-01.parquet\")\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "bf1d58ad-5cc1-4e8c-9d7b-a54e67def783", + "metadata": {}, + "source": [ + "# Create an Iceberg table\n", + "\n", + "Next create the Iceberg table directly from the `pyarrow.Table`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47e5a21d-de87-4aaf-aa06-dc5048acba58", + "metadata": {}, + "outputs": [], + "source": [ + "table_name = \"default.taxi_dataset\"\n", + "\n", + "try:\n", + " # In case the table already exists\n", + " catalog.drop_table(table_name)\n", + "except:\n", + " pass\n", + "\n", + "table = catalog.create_table(table_name, schema=df.schema)\n", + "\n", + "table" + ] + }, + { + "cell_type": "markdown", + "id": "d612c035-4cf6-47a0-844b-165dfb463bbc", + "metadata": {}, + "source": [ + "# Write the data\n", + "\n", + "Let's append the data to the table. Appending or overwriting is equivalent since the table is empty. Next we can query the table and see that the data is there." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efee8252", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "table.append(df) # or table.overwrite(df)\n", + "\n", + "assert len(table.scan().to_arrow()) == len(df)\n", + "\n", + "table.scan().to_arrow()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ce1cecc-8cb0-4622-b0eb-55880d091556", + "metadata": {}, + "outputs": [], + "source": [ + "str(table.current_snapshot())" + ] + }, + { + "cell_type": "markdown", + "id": "c029ea44-8ba6-4c08-a60d-5fffac6c3666", + "metadata": {}, + "source": [ + "# Append data\n", + "\n", + "Let's append another month of data to the table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "794de3a0", + "metadata": {}, + "outputs": [], + "source": [ + "df = pq.read_table(\"/home/iceberg/data/yellow_tripdata_2022-02.parquet\")\n", + "table.append(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3ac7021", + "metadata": {}, + "outputs": [], + "source": [ + "str(table.current_snapshot())" + ] + }, + { + "cell_type": "markdown", + "id": "85862bdc-7476-43f4-a604-5e4dfff065c9", + "metadata": {}, + "source": [ + "# Feature generation\n", + "\n", + "Consider that we want to train a model to determine which features contribute to the tip amount. `tip_per_mile` is a good target to train the model on. When we try to append the data, we need to evolve the schema first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72a9c64d", + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow.compute as pc\n", + "\n", + "df = table.scan().to_arrow()\n", + "df = df.append_column(\"tip_per_mile\", pc.divide(df[\"tip_amount\"], df[\"trip_distance\"]))\n", + "\n", + "try:\n", + " table.overwrite(df)\n", + "except ValueError as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9aafd972-30d2-41ec-90e1-d5e17baeaf0b", + "metadata": {}, + "outputs": [], + "source": [ + "with table.update_schema() as upd:\n", + " upd.union_by_name(df.schema)\n", + "\n", + "print(str(table.schema()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea4ee286-1943-4a88-8d96-1e2a9e11faa1", + "metadata": {}, + "outputs": [], + "source": [ + "table.overwrite(df)\n", + "\n", + "table" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nessie-stack/spark/requirements.txt b/nessie-stack/spark/requirements.txt new file mode 100644 index 0000000000000..53cac4991e475 --- /dev/null +++ b/nessie-stack/spark/requirements.txt @@ -0,0 +1,7 @@ +jupyter==1.1.1 +spylon-kernel==0.4.1 +pyiceberg[pyarrow,duckdb,pandas]==0.7.1 +jupysql==0.10.5 +matplotlib==3.9.2 +scipy==1.14.1 +duckdb-engine==0.13.1 diff --git a/nessie-stack/spark/spark-defaults.conf b/nessie-stack/spark/spark-defaults.conf new file mode 100644 index 0000000000000..bbb90e62732e9 --- /dev/null +++ b/nessie-stack/spark/spark-defaults.conf @@ -0,0 +1,44 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +# Iceberg extensions +spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions + +# Iceberg catalog configuration +spark.sql.catalog.iceberg org.apache.iceberg.spark.SparkCatalog +spark.sql.catalog.iceberg.type nessie +spark.sql.catalog.iceberg.uri http://nessie:19120/api/v1 +spark.sql.catalog.iceberg.ref main +spark.sql.catalog.iceberg.io-impl org.apache.iceberg.aws.s3.S3FileIO +spark.sql.catalog.iceberg.warehouse s3a://warehouse +spark.sql.catalog.iceberg.s3.endpoint http://minio:9000 +spark.sql.catalog.iceberg.s3.path-style-access true + +# S3/MinIO configurations +spark.hadoop.fs.s3a.access.key minioadmin +spark.hadoop.fs.s3a.secret.key minioadmin +spark.hadoop.fs.s3a.endpoint http://minio:9000 +spark.hadoop.fs.s3a.path.style.access true +spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem + +# Thrift server configurations (if needed) +#hive.server2.thrift.port 10000 +#hive.server2.thrift.bind.host 0.0.0.0 \ No newline at end of file diff --git a/plugins/kanban/go.mod b/plugins/kanban/go.mod new file mode 100644 index 0000000000000..2eda67dc2f398 --- /dev/null +++ b/plugins/kanban/go.mod @@ -0,0 +1,3 @@ +module kanban + +go 1.24rc3 diff --git a/plugins/kanban/main.go b/plugins/kanban/main.go new file mode 100644 index 0000000000000..1d7e5f69583cb --- /dev/null +++ b/plugins/kanban/main.go @@ -0,0 +1,48 @@ +package kanban + +import ( + "code.gitea.io/gitea/modules/context" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/routers" + "code.gitea.io/sdk/gitea" +) + +// Plugin represents a Gitea plugin +type Plugin struct { + Name string + Description string + Version string +} + +// Init initializes the plugin +func (p *Plugin) Init() error { + log.Info("Kanban plugin initialized") + + // Register routes (placeholder: Gitea core does not expose a plugin routing API) + routers.Register("GET", "/api/v1/repos/:username/:reponame/kanban", GetKanbanHandler) + routers.Register("POST", "/api/v1/repos/:username/:reponame/kanban", CreateKanbanHandler) + + // Register templates + // Add UI components + + return nil +} + +func main() { + plugin := &Plugin{ + Name: "Kanban Board", + Description: "A Kanban board for Gitea projects", + Version: "1.0.0", + } + + gitea.RegisterPlugin(plugin) // placeholder: the Gitea SDK has no plugin registration API yet +} + +// Handler functions +func GetKanbanHandler(ctx *context.Context) { + // Implementation +} + +func CreateKanbanHandler(ctx *context.Context) { + // Implementation +}
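Note on the kanban handler stubs: GetKanbanHandler and CreateKanbanHandler are left empty above, and the routers.Register / gitea.RegisterPlugin calls are aspirational, since Gitea does not currently ship a stable plugin or route-registration API. The sketch below shows one way the GET handler body could look, assuming the plugin is compiled into Gitea so that code.gitea.io/gitea/modules/context is importable; the Board struct and boardsForRepo helper are hypothetical stand-ins for the plugin's own storage layer, and the exact context-helper signatures vary between Gitea versions.

// Sketch only, not part of the diff above.
package kanban

import (
	"net/http"

	"code.gitea.io/gitea/modules/context"
)

// Board is a minimal response shape used only by this sketch.
type Board struct {
	ID    int64  `json:"id"`
	Title string `json:"title"`
}

// boardsForRepo is a hypothetical storage helper; the real plugin would back
// it with whatever persistence layer it ends up using.
func boardsForRepo(repoID int64) ([]*Board, error) {
	return []*Board{}, nil
}

// getKanbanSketch resolves the repository from the request context, loads its
// boards, and writes them back as JSON.
func getKanbanSketch(ctx *context.Context) {
	boards, err := boardsForRepo(ctx.Repo.Repository.ID)
	if err != nil {
		ctx.ServerError("boardsForRepo", err)
		return
	}
	ctx.JSON(http.StatusOK, boards)
}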