add stack that grants LakeFormation Permissions on Glue Job

ksmin23 · ksmin23 · commit 12bf0ceb7162 · 2023-01-28T17:01:58.000+09:00
diff --git a/README.md b/README.md
@@ -114,7 +114,7 @@ command.
    </pre>
 3. Define a schema for the streaming data
    <pre>
-   (.venv) $ cdk deploy GlueStreamingSinkToIcebergJobRole GlueSchemaOnKinesisStream
+   (.venv) $ cdk deploy GlueSchemaOnKinesisStream
    </pre>
 
    Running `cdk deploy GlueSchemaOnKinesisStream` command is like that we create a schema manually using the AWS Glue Data Catalog as the following steps:
@@ -176,15 +176,17 @@ command.
    * (step 2) Provision the Glue Streaming Job
 
      <pre>
-     (.venv) $ cdk deploy GlueStreamingSinkToIceberg
+     (.venv) $ cdk deploy GlueStreamingSinkToIcebergJobRole \
+                          GrantLFPermissionsOnGlueJobRole \
+                          GlueStreamingSinkToIceberg
      </pre>
 6. Make sure the glue job to access the Kinesis Data Streams table in the Glue Catalog database, otherwise grant the glue job to permissions
 
    Wec can get permissions by running the following command:
    <pre>
    (.venv) $ aws lakeformation list-permissions | jq -r '.PrincipalResourcePermissions[] | select(.Principal.DataLakePrincipalIdentifier | endswith(":role/GlueStreamingJobRole-Iceberg"))'
    </pre>
-   Also, we can grant the glue job to required permissions by running the following command:
+   If not found, we need to manually grant the glue job the required permissions by running the following command:
    <pre>
    (.venv) $ aws lakeformation grant-permissions \
                --principal DataLakePrincipalIdentifier=arn:aws:iam::<i>{account-id}</i>:role/<i>GlueStreamingJobRole-Iceberg</i> \
diff --git a/app.py b/app.py
@@ -7,7 +7,8 @@
   KdsStack,
   GlueJobRoleStack,
   GlueStreamDataSchemaStack,
-  GlueStreamingJobStack
+  GlueStreamingJobStack,
+  DataLakePermissionsStack
 )
 
 APP_ENV = cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'),
@@ -21,14 +22,19 @@
 glue_job_role.add_dependency(kds_stack)
 
 glue_stream_schema = GlueStreamDataSchemaStack(app, 'GlueSchemaOnKinesisStream',
-  kds_stack.kinesis_stream,
+  kds_stack.kinesis_stream
+)
+glue_stream_schema.add_dependency(kds_stack)
+
+grant_lake_formation_permissions = DataLakePermissionsStack(app, 'GrantLFPermissionsOnGlueJobRole',
   glue_job_role.glue_job_role
 )
-glue_stream_schema.add_dependency(glue_job_role)
+grant_lake_formation_permissions.add_dependency(glue_job_role)
+grant_lake_formation_permissions.add_dependency(glue_stream_schema)
 
 glue_streaming_job = GlueStreamingJobStack(app, 'GlueStreamingSinkToIceberg',
   glue_job_role.glue_job_role
 )
-glue_streaming_job.add_dependency(glue_stream_schema)
+glue_streaming_job.add_dependency(grant_lake_formation_permissions)
 
 app.synth()
diff --git a/cdk_stacks/__init__.py b/cdk_stacks/__init__.py
@@ -2,3 +2,4 @@
 from .glue_job_role import GlueJobRoleStack
 from .glue_stream_data_schema import GlueStreamDataSchemaStack
 from .glue_streaming_job import GlueStreamingJobStack
+from .lakeformation_permissions import DataLakePermissionsStack
diff --git a/cdk_stacks/glue_stream_data_schema.py b/cdk_stacks/glue_stream_data_schema.py
@@ -9,7 +9,7 @@
 
 class GlueStreamDataSchemaStack(Stack):
 
-  def __init__(self, scope: Construct, construct_id: str, kinesis_stream, glue_job_role, **kwargs) -> None:
+  def __init__(self, scope: Construct, construct_id: str, kinesis_stream, **kwargs) -> None:
     super().__init__(scope, construct_id, **kwargs)
 
     glue_kinesis_table = self.node.try_get_context('glue_kinesis_table')
diff --git a/cdk_stacks/lakeformation_permissions.py b/cdk_stacks/lakeformation_permissions.py
@@ -0,0 +1,51 @@
+import aws_cdk as cdk
+
+from aws_cdk import (
+  Stack,
+  aws_lakeformation
+)
+from constructs import Construct
+
+class DataLakePermissionsStack(Stack):
+
+  def __init__(self, scope: Construct, construct_id: str, glue_job_role, **kwargs) -> None:
+    super().__init__(scope, construct_id, **kwargs)
+
+    glue_job_input_arguments = self.node.try_get_context('glue_kinesis_table')
+    database_name = glue_job_input_arguments["database_name"]
+
+    #XXX: The role assumed by cdk is not a data lake administrator,
+    # so deploying PrincipalPermissions fails with an error such as:
+    # "Resource does not exist or requester is not authorized to access requested permissions."
+    # To resolve the error, the cdk execution role must be promoted to a data lake administrator.
+    # For example, https://github.com/aws-samples/data-lake-as-code/blob/mainline/lib/stacks/datalake-stack.ts#L68
+    cfn_data_lake_settings = aws_lakeformation.CfnDataLakeSettings(self, "CfnDataLakeSettings",
+      admins=[aws_lakeformation.CfnDataLakeSettings.DataLakePrincipalProperty(
+        data_lake_principal_identifier=cdk.Fn.sub(self.synthesizer.cloud_formation_execution_role_arn)
+      )]
+    )
+
+    cfn_principal_permissions = aws_lakeformation.CfnPrincipalPermissions(self, "CfnPrincipalPermissions",
+      permissions=["SELECT", "INSERT", "DELETE", "DESCRIBE", "ALTER"],
+      permissions_with_grant_option=[],
+      principal=aws_lakeformation.CfnPrincipalPermissions.DataLakePrincipalProperty(
+        data_lake_principal_identifier=glue_job_role.role_arn
+      ),
+      resource=aws_lakeformation.CfnPrincipalPermissions.ResourceProperty(
+        #XXX: Can't specify a TableWithColumns resource and a Table resource
+        table=aws_lakeformation.CfnPrincipalPermissions.TableResourceProperty(
+          catalog_id=cdk.Aws.ACCOUNT_ID,
+          database_name=database_name,
+          # name="ALL_TABLES",
+          table_wildcard={}
+        )
+      )
+    )
+    cfn_principal_permissions.apply_removal_policy(cdk.RemovalPolicy.DESTROY)
+
+    #XXX: To preserve the resource destruction order,
+    # make CfnPrincipalPermissions depend on CfnDataLakeSettings
+    cfn_principal_permissions.add_dependency(cfn_data_lake_settings)
+
+    cdk.CfnOutput(self, f'{self.stack_name}_Principal',
+      value=cfn_principal_permissions.attr_principal_identifier)