Commit 18f3f48

add S3 Stack for Apache Iceberg table
1 parent 4da3eeb commit 18f3f48

File tree (5 files changed: +54, -14 lines)

* README.md
* app.py
* cdk_stacks/__init__.py
* cdk_stacks/glue_streaming_job.py
* cdk_stacks/s3.py

README.md

Lines changed: 16 additions & 11 deletions
@@ -59,7 +59,6 @@ For example:
     "--table_name": "iceberg_demo_table",
     "--primary_key": "name",
     "--kinesis_table_name": "iceberg_demo_kinesis_stream_table",
-    "--kinesis_stream_arn": "arn:aws:kinesis:us-east-1:123456789012:stream/iceberg-demo-stream",
     "--starting_position_of_kinesis_iterator": "LATEST",
     "--iceberg_s3_path": "s3://glue-iceberg-demo-atq4q5u/iceberg_demo_db",
     "--lock_table_name": "iceberg_lock",
@@ -93,6 +92,7 @@ For example:
 :information_source: The `--primary_key` option should be set to the Iceberg table's primary key column name.

 :warning: **You should create an S3 bucket for the Glue job script and upload the Glue job script file into that S3 bucket.**
+
 At this point you can now synthesize the CloudFormation template for this code.

 <pre>
@@ -108,11 +108,15 @@ command.
 ## Run Test

 1. Set up **Apache Iceberg connector for AWS Glue** to use Apache Iceberg with AWS Glue jobs.
-2. Create a Kinesis data stream
+2. Create an S3 bucket for the Apache Iceberg table
+   <pre>
+   (.venv) $ cdk deploy IcebergS3Path
+   </pre>
+3. Create a Kinesis data stream
    <pre>
    (.venv) $ cdk deploy KinesisStreamAsGlueStreamingJobDataSource
    </pre>
-3. Define a schema for the streaming data
+4. Define a schema for the streaming data
    <pre>
    (.venv) $ cdk deploy GlueSchemaOnKinesisStream
    </pre>
@@ -137,7 +141,7 @@ command.

    (11) Choose **Finish**

-4. Upload the **AWS SDK for Java 2.x** jar file into S3
+5. Upload the **AWS SDK for Java 2.x** jar file into S3
    <pre>
    (.venv) $ wget https://repo1.maven.org/maven2/software/amazon/awssdk/aws-sdk-java/2.17.224/aws-sdk-java-2.17.224.jar
    (.venv) $ aws s3 cp aws-sdk-java-2.17.224.jar s3://aws-glue-assets-123456789012-atq4q5u/extra-jars/aws-sdk-java-2.17.224.jar
@@ -153,7 +157,7 @@ command.
    --user-jars-first true
    </pre>
    In order to do this, we might need to upload the **AWS SDK for Java 2.x** jar file into S3.
-5. Create the Glue Streaming Job
+6. Create the Glue Streaming Job

    * (step 1) Select one of the Glue job scripts and upload it into S3

@@ -180,7 +184,7 @@ command.
    GrantLFPermissionsOnGlueJobRole \
    GlueStreamingSinkToIceberg
    </pre>
-6. Make sure the Glue job can access the Kinesis Data Streams table in the Glue Catalog database; otherwise, grant the Glue job the necessary permissions
+7. Make sure the Glue job can access the Kinesis Data Streams table in the Glue Catalog database; otherwise, grant the Glue job the necessary permissions

    We can grant the permissions by running the following command:
    <pre>
@@ -193,7 +197,7 @@ command.
    --permissions SELECT DESCRIBE ALTER INSERT DELETE \
    --resource '{ "Table": {"DatabaseName": "<i>iceberg_demo_db</i>", "TableWildcard": {}} }'
    </pre>
-7. Create a table with partitioned data in Amazon Athena
+8. Create a table with partitioned data in Amazon Athena

    Go to [Athena](https://console.aws.amazon.com/athena/home) on the AWS Management console.<br/>
    * (step 1) Create a database
@@ -236,11 +240,11 @@ command.
    --resource '{ "Table": {"DatabaseName": "<i>iceberg_demo_db</i>", "TableWildcard": {}} }'
    </pre>

-8. Run the Glue job to load data from Kinesis Data Streams into S3
+9. Run the Glue job to load data from Kinesis Data Streams into S3
    <pre>
    (.venv) $ aws glue start-job-run --job-name <i>streaming_data_from_kds_into_iceberg_table</i>
    </pre>
-9. Generate streaming data
+10. Generate streaming data

    We can synthetically generate data in JSON format using a simple Python application.
    <pre>
@@ -287,7 +291,7 @@ command.
    {"name": "Micheal", "age": 44, "m_time": "2023-12-14 09:02:57"}
    {"name": "Takisha", "age": 24, "m_time": "2023-12-30 12:38:23"}
    </pre>
-10. Check streaming data in S3
+11. Check streaming data in S3

    After 3~5 minutes, you can see that the streaming data has been delivered from **Kinesis Data Streams** to **S3** and stored in a folder structure partitioned by year, month, day, and hour.

@@ -296,7 +300,7 @@ command.
    ![iceberg-table](./assets/iceberg-data-level-02.png)
    ![iceberg-table](./assets/iceberg-data-level-03.png)

-11. Run test query
+12. Run test query

    Enter the following SQL statement and execute the query.
    <pre>
@@ -354,6 +358,7 @@ command.
    </pre>
 * (10) [Apache Iceberg - Maintenance for streaming tables (v0.14.0)](https://iceberg.apache.org/docs/0.14.0/spark-structured-streaming/#maintenance-for-streaming-tables)
 * (11) [awsglue python package](https://github.com/awslabs/aws-glue-libs): The awsglue Python package contains the Python portion of the AWS Glue library. This library extends PySpark to support serverless ETL on AWS.
+* (12) [AWS Glue Notebook Samples](https://github.com/aws-samples/aws-glue-samples/tree/master/examples/notebooks) - sample iPython notebook files that show how to use open data lake formats (Apache Hudi, Delta Lake, and Apache Iceberg) on AWS Glue Interactive Sessions and AWS Glue Studio Notebook.

 ## Troubleshooting

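The renumbered walkthrough now deploys the `IcebergS3Path` stack first; everything after the new step 2 is unchanged apart from the numbering. Step 10 describes the generator only as "a simple Python application"; as a rough sketch (not the repo's actual script; the stream name and the use of boto3 are assumptions), it could look like this:

```python
# Hypothetical generator for records shaped like the README's sample output,
# e.g. {"name": "Takisha", "age": 24, "m_time": "2023-12-30 12:38:23"}.
# STREAM_NAME and the boto3 approach are assumptions, not taken from this repo.
import datetime
import json
import random

import boto3

STREAM_NAME = "iceberg-demo-stream"  # assumed; use your Kinesis stack's stream name

def gen_record():
    return {
        "name": random.choice(["Micheal", "Takisha", "Arleen"]),
        "age": random.randint(20, 60),
        "m_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

kinesis = boto3.client("kinesis")
for _ in range(10):
    record = gen_record()
    # The partition key determines shard placement; the table's primary key
    # column ("name") is a natural choice here.
    kinesis.put_record(StreamName=STREAM_NAME,
                       Data=json.dumps(record).encode("utf-8"),
                       PartitionKey=record["name"])
```
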
app.py

Lines changed: 7 additions & 2 deletions
@@ -8,15 +8,19 @@
   GlueJobRoleStack,
   GlueStreamDataSchemaStack,
   GlueStreamingJobStack,
-  DataLakePermissionsStack
+  DataLakePermissionsStack,
+  S3BucketStack
 )

 APP_ENV = cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'),
   region=os.getenv('CDK_DEFAULT_REGION'))

 app = cdk.App()

+s3_bucket = S3BucketStack(app, 'IcebergS3Path')
+
 kds_stack = KdsStack(app, 'KinesisStreamAsGlueStreamingJobDataSource')
+kds_stack.add_dependency(s3_bucket)

 glue_job_role = GlueJobRoleStack(app, 'GlueStreamingSinkToIcebergJobRole')
 glue_job_role.add_dependency(kds_stack)
@@ -33,7 +37,8 @@
 grant_lake_formation_permissions.add_dependency(glue_stream_schema)

 glue_streaming_job = GlueStreamingJobStack(app, 'GlueStreamingSinkToIceberg',
-  glue_job_role.glue_job_role
+  glue_job_role.glue_job_role,
+  kds_stack.kinesis_stream
 )
 glue_streaming_job.add_dependency(grant_lake_formation_permissions)

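Two details are worth noting in this wiring. First, `kds_stack.add_dependency(s3_bucket)` forces the `IcebergS3Path` stack to deploy before the Kinesis stack. Second, passing `kds_stack.kinesis_stream` into `GlueStreamingJobStack` creates a cross-stack reference, so the stream ARN no longer needs to be hard-coded in `glue_job_input_arguments`. A minimal sketch of that pattern, with illustrative stack names that are not part of this repo:

```python
# Minimal sketch of the cross-stack reference pattern used in app.py above.
import aws_cdk as cdk
from aws_cdk import Stack, aws_kinesis as kinesis
from constructs import Construct

class ProducerStack(Stack):
    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)
        # Exposed as an attribute so other stacks can consume it.
        self.kinesis_stream = kinesis.Stream(self, "SourceStream")

class ConsumerStack(Stack):
    def __init__(self, scope: Construct, construct_id: str, stream, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)
        # Reading stream.stream_arn here makes CDK synthesize a CloudFormation
        # Export in ProducerStack and an Fn::ImportValue in this stack.
        cdk.CfnOutput(self, "StreamArn", value=stream.stream_arn)

app = cdk.App()
producer = ProducerStack(app, "Producer")
consumer = ConsumerStack(app, "Consumer", producer.kinesis_stream)
consumer.add_dependency(producer)  # explicit deploy ordering, as in app.py
app.synth()
```
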
cdk_stacks/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -3,3 +3,4 @@
 from .glue_stream_data_schema import GlueStreamDataSchemaStack
 from .glue_streaming_job import GlueStreamingJobStack
 from .lakeformation_permissions import DataLakePermissionsStack
+from .s3 import S3BucketStack

cdk_stacks/glue_streaming_job.py

Lines changed: 2 additions & 1 deletion
@@ -9,14 +9,15 @@

 class GlueStreamingJobStack(Stack):

-  def __init__(self, scope: Construct, construct_id: str, glue_job_role, **kwargs) -> None:
+  def __init__(self, scope: Construct, construct_id: str, glue_job_role, kinesis_stream, **kwargs) -> None:
     super().__init__(scope, construct_id, **kwargs)

     glue_assets_s3_bucket_name = self.node.try_get_context('glue_assets_s3_bucket_name')
     glue_job_script_file_name = self.node.try_get_context('glue_job_script_file_name')
     glue_job_input_arguments = self.node.try_get_context('glue_job_input_arguments')

     glue_job_default_arguments = {
+      "--kinesis_stream_arn": kinesis_stream.stream_arn,
       "--enable-metrics": "true",
       "--enable-spark-ui": "true",
       "--spark-event-logs-path": f"s3://{glue_assets_s3_bucket_name}/sparkHistoryLogs/",

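On the consuming side, Glue exposes default arguments to the job script at run time. A minimal sketch of how a job script could read the injected ARN, assuming it uses the standard `awsglue` helper (this is not the repo's actual job script):

```python
# Sketch of reading the injected default argument inside a Glue job script.
# getResolvedOptions strips the leading "--" from argument names.
import sys
from awsglue.utils import getResolvedOptions

args = getResolvedOptions(sys.argv, ['JOB_NAME', 'kinesis_stream_arn'])
kinesis_stream_arn = args['kinesis_stream_arn']
print(f"Consuming from: {kinesis_stream_arn}")
```
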
cdk_stacks/s3.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+from urllib.parse import urlparse
+
+import aws_cdk as cdk
+
+from aws_cdk import (
+  Stack,
+  aws_s3 as s3
+)
+
+from constructs import Construct
+
+
+class S3BucketStack(Stack):
+
+  def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
+    super().__init__(scope, construct_id, **kwargs)
+
+    glue_job_input_arguments = self.node.try_get_context('glue_job_input_arguments')
+    s3_path = glue_job_input_arguments["--iceberg_s3_path"]
+    s3_bucket_name = urlparse(s3_path).netloc
+
+    s3_bucket = s3.Bucket(self, "s3bucket",
+      removal_policy=cdk.RemovalPolicy.DESTROY, #XXX: Default: cdk.RemovalPolicy.RETAIN - The bucket will be orphaned
+      bucket_name=s3_bucket_name)
+
+    self.s3_bucket_name = s3_bucket.bucket_name
+
+    cdk.CfnOutput(self, f'{self.stack_name}_S3Bucket', value=self.s3_bucket_name)
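The stack derives the bucket name from the existing `--iceberg_s3_path` context value instead of introducing a new setting, which keeps the Glue job arguments and the bucket in sync. A quick check of the parsing, using the sample path from the README:

```python
# How S3BucketStack derives the bucket name; the sample path is the one
# shown in the README's example arguments.
from urllib.parse import urlparse

parsed = urlparse("s3://glue-iceberg-demo-atq4q5u/iceberg_demo_db")
assert parsed.netloc == "glue-iceberg-demo-atq4q5u"  # bucket name
assert parsed.path == "/iceberg_demo_db"             # key prefix inside the bucket
```

Note that `RemovalPolicy.DESTROY` only removes the bucket on `cdk destroy` if it is empty; deleting a non-empty bucket additionally requires `auto_delete_objects=True` on the `s3.Bucket` construct.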
