Skip to content
This repository was archived by the owner on Jul 16, 2024. It is now read-only.

Commit 2be9ff1

Browse files
ijemmy and vgkowski authored
feature/use data lake core construct (#166)
* feat: Include core constructs from pypi and update cdk libraries to 1.121.0 to use it
* refactor: Switch to use DataLakeCatalog construct
* feat: Use DataLakeStorage instead of creating s3 AutoEmptyBucket

Co-authored-by: Vincent Gromakowski <vgkowski@users.noreply.github.com>
1 parent 4a35051 commit 2be9ff1

File tree

5 files changed

+64
-83
lines changed

5 files changed

+64
-83
lines changed

.gitignore

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,11 @@ package-lock.json
55
.env
66
venv/
77
*.egg-info
8+
89
# CDK asset staging directory
910
.cdk.staging
1011
cdk.out
1112
.DS_Store
12-
.idea
13-
*.iml
1413
*.pyc
1514
refarch/aws-native/common/data-generator/project/project/
1615
refarch/aws-native/common/data-generator/project/target/
@@ -24,4 +23,9 @@ core/node_modules/
2423
*.log
2524
doc/site
2625
__init__.py
27-
core/.github/*
26+
core/.github/*
27+
28+
# IDE settings
29+
.idea
30+
.vscode
31+
*.iml

refarch/aws-native/common/common_cdk/auto_empty_bucket_audited.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

refarch/aws-native/common/common_cdk/data_lake.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,25 +46,25 @@ def __init__(self, scope: Construct, id: str, **kwargs) -> None:
4646
# BATCH module
4747
if is_module_enabled(batch_module_param):
4848
BatchModule(self, "Batch",
49-
raw_bucket=data_lake.raw_s3_bucket.bucket,
50-
clean_bucket=data_lake.clean_s3_bucket.bucket,
49+
raw_bucket=data_lake.raw_s3_bucket,
50+
clean_bucket=data_lake.clean_s3_bucket,
5151
raw_db=data_lake.raw_glue_db,
5252
clean_db=data_lake.clean_glue_db)
5353

5454
BatchDataGenerator(
5555
self, "BatchDatagen",
5656
config_table=datagen_config_table,
5757
tshirt_size='SMALL',
58-
log_bucket=data_lake.logs_s3_bucket.bucket,
59-
sink_bucket=data_lake.raw_s3_bucket.bucket,
58+
log_bucket=data_lake.logs_s3_bucket,
59+
sink_bucket=data_lake.raw_s3_bucket,
6060
vpc=data_lake.vpc
6161
)
6262

6363
# DATA WAREHOUSE module
6464
if is_module_enabled(dwh_module_param):
6565
dwh_stack = DwhModule(self, "dwh",
6666
vpc=data_lake.vpc,
67-
clean_bucket=data_lake.clean_s3_bucket.bucket,
67+
clean_bucket=data_lake.clean_s3_bucket,
6868
clean_glue_db=data_lake.clean_glue_db)
6969

7070
CfnOutput(self, 'Redshift-QuickSight-Secret-Arn',
@@ -113,23 +113,23 @@ def __init__(self, scope: Construct, id: str, **kwargs) -> None:
113113
streaming_stack = StreamingModule(self,
114114
id="Streaming",
115115
prefix=id,
116-
source_bucket=data_lake.raw_s3_bucket.bucket,
117-
dest_bucket=data_lake.curated_s3_bucket.bucket)
116+
source_bucket=data_lake.raw_s3_bucket,
117+
dest_bucket=data_lake.curated_s3_bucket)
118118

119119
StreamDataGenerator(
120120
self, "StreamDatagen",
121121
config_table=datagen_config_table,
122122
tshirt_size='SMALL',
123-
log_bucket=data_lake.logs_s3_bucket.bucket,
124-
sink_bucket=data_lake.raw_s3_bucket.bucket,
123+
log_bucket=data_lake.logs_s3_bucket,
124+
sink_bucket=data_lake.raw_s3_bucket,
125125
web_sale_stream=streaming_stack.sale_stream.stream_name,
126126
web_customer_stream=streaming_stack.customer_stream.stream_name,
127127
web_customer_address_stream=streaming_stack.address_stream.stream_name,
128128
kinesis_key=streaming_stack.kinesis_kms_key,
129129
vpc=data_lake.vpc
130130
)
131131

132-
CfnOutput(self, 'Clean-S3-Bucket', value=data_lake.clean_s3_bucket.bucket.bucket_name)
132+
CfnOutput(self, 'Clean-S3-Bucket', value=data_lake.clean_s3_bucket.bucket_name)
133133
CfnOutput(self, "VPC-ID", value=data_lake.vpc.vpc_id, export_name='ara-Vpc-Id')
134134

135135
for idx, val in enumerate(data_lake.private_subnets_selection.subnets):

refarch/aws-native/common/common_cdk/foundations.py

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@
55
from aws_cdk.aws_ec2 import GatewayVpcEndpointAwsService, SubnetSelection, SubnetType, Vpc, InterfaceVpcEndpointAwsService
66
from aws_cdk.aws_glue import Database
77
from aws_cdk.aws_iam import Group
8+
from aws_analytics_reference_architecture import DataLakeCatalog, DataLakeStorage
9+
from common_cdk.audit_trail_glue import AuditTrailGlue
810

911
from common.common_cdk.auto_empty_bucket import AutoEmptyBucket
10-
from common.common_cdk.auto_empty_bucket_audited import AutoEmptyAuditedBucket
1112
from common.common_cdk.config import AutoEmptyConfig
1213

1314

@@ -73,34 +74,30 @@ def __init__(self, scope: Construct, id: str, **kwargs) -> None:
7374
super().__init__(scope, id, **kwargs)
7475

7576
# implement the glue data catalog databases used in the data lake
76-
self.__raw_glue_db = Database(self, 'RawGlueDB', database_name='ara_raw_data_' + self.account)
77-
self.__clean_glue_db = Database(self, 'CleanGlueDB', database_name='ara_clean_data_' + self.account)
78-
self.__curated_glue_db = Database(self, 'CuratedGlueDB', database_name='ara_curated_data_' + self.account)
77+
catalog = DataLakeCatalog(self, 'DataLakeCatalog')
78+
self.__raw_glue_db = catalog.raw_database
79+
self.__clean_glue_db = catalog.clean_database
80+
self.__curated_glue_db = catalog.transform_database
7981
self.__audit_glue_db = Database(self, 'AuditGlueDB', database_name='ara_audit_data_' + self.account)
8082

83+
8184
# implement the S3 buckets for the data lake
85+
storage = DataLakeStorage(self, 'DataLakeStorage')
8286
self.__logs_s3_bucket = AutoEmptyBucket(
8387
self, 'Logs',
8488
bucket_name='ara-logs-' + self.account,
8589
uuid=AutoEmptyConfig.FOUNDATIONS_UUID
86-
)
90+
).bucket
8791

88-
self.__raw_s3_bucket = AutoEmptyBucket(
89-
self, 'RawData',
90-
bucket_name='ara-raw-data-' + self.account,
91-
uuid=AutoEmptyConfig.FOUNDATIONS_UUID
92-
)
93-
self.__clean_s3_bucket = AutoEmptyBucket(
94-
self, 'CleanData',
95-
bucket_name='ara-clean-data-' + self.account,
96-
uuid=AutoEmptyConfig.FOUNDATIONS_UUID
97-
)
98-
self.__curated_s3_bucket = AutoEmptyAuditedBucket(
99-
self, 'CuratedData',
100-
bucket_name='ara-curated-data-' + self.account,
101-
uuid=AutoEmptyConfig.FOUNDATIONS_UUID,
102-
log_bucket=self.__logs_s3_bucket.bucket,
103-
audit_db=self.__audit_glue_db
92+
self.__raw_s3_bucket = storage.raw_bucket
93+
self.__clean_s3_bucket = storage.clean_bucket
94+
self.__curated_s3_bucket = storage.transform_bucket
95+
96+
AuditTrailGlue(self, 'GlueAudit',
97+
log_bucket=self.__logs_s3_bucket,
98+
audit_bucket=self.__curated_s3_bucket,
99+
audit_db=self.__audit_glue_db,
100+
audit_table=self.__curated_s3_bucket.bucket_name
104101
)
105102

106103
# the vpc used for the overall data lake (same vpc, different subnet for modules)

refarch/aws-native/setup.py

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,34 +19,35 @@
1919
packages=setuptools.find_packages(where="common"),
2020

2121
install_requires=[
22-
"aws-cdk.core==1.103.0",
23-
"aws-cdk.aws-s3==1.103.0",
24-
"aws-cdk.aws-glue==1.103.0",
25-
"aws-cdk.aws_ec2==1.103.0",
26-
"aws-cdk.aws_iam==1.103.0",
27-
"aws-cdk.aws_s3_deployment==1.103.0",
28-
"aws-cdk.aws_lambda==1.103.0",
29-
"aws-cdk.aws_cloudformation==1.103.0",
30-
"aws_cdk.aws_redshift==1.103.0",
31-
"aws-cdk.aws_ec2==1.103.0",
32-
"aws-cdk.aws_secretsmanager==1.103.0",
33-
"aws-cdk.aws-kinesisanalytics==1.103.0",
34-
"aws-cdk.aws-elasticsearch==1.103.0",
35-
"aws-cdk.custom-resources==1.103.0",
36-
"aws-cdk.aws-cognito==1.103.0",
37-
"aws-cdk.aws-kinesis==1.103.0",
38-
"aws-cdk.aws_cloudtrail==1.103.0",
39-
"aws_cdk.aws_stepfunctions==1.103.0",
40-
"aws_cdk.aws_stepfunctions_tasks==1.103.0",
41-
"aws_cdk.aws_s3_notifications==1.103.0",
42-
"aws_cdk.aws_dynamodb==1.103.0",
43-
"aws_cdk.aws_events==1.103.0",
44-
"aws_cdk.aws_events_targets==1.103.0",
45-
"aws_cdk.aws_emr==1.103.0",
46-
"aws_cdk.aws_batch==1.103.0",
47-
"aws_cdk.aws_autoscaling==1.103.0",
48-
"aws_cdk.aws_elasticloadbalancingv2==1.103.0",
49-
"cdk_ec2_key_pair"
22+
"aws-analytics-reference-architecture==1.5.1",
23+
"aws-cdk.core==1.121.0",
24+
"aws-cdk.aws-s3==1.121.0",
25+
"aws-cdk.aws-glue==1.121.0",
26+
"aws-cdk.aws_ec2==1.121.0",
27+
"aws-cdk.aws_iam==1.121.0",
28+
"aws-cdk.aws_s3_deployment==1.121.0",
29+
"aws-cdk.aws_lambda==1.121.0",
30+
"aws-cdk.aws_cloudformation==1.121.0",
31+
"aws_cdk.aws_redshift==1.121.0",
32+
"aws-cdk.aws_ec2==1.121.0",
33+
"aws-cdk.aws_secretsmanager==1.121.0",
34+
"aws-cdk.aws-kinesisanalytics==1.121.0",
35+
"aws-cdk.aws-elasticsearch==1.121.0",
36+
"aws-cdk.custom-resources==1.121.0",
37+
"aws-cdk.aws-cognito==1.121.0",
38+
"aws-cdk.aws-kinesis==1.121.0",
39+
"aws-cdk.aws_cloudtrail==1.121.0",
40+
"aws_cdk.aws_stepfunctions==1.121.0",
41+
"aws_cdk.aws_stepfunctions_tasks==1.121.0",
42+
"aws_cdk.aws_s3_notifications==1.121.0",
43+
"aws_cdk.aws_dynamodb==1.121.0",
44+
"aws_cdk.aws_events==1.121.0",
45+
"aws_cdk.aws_events_targets==1.121.0",
46+
"aws_cdk.aws_emr==1.121.0",
47+
"aws_cdk.aws_batch==1.121.0",
48+
"aws_cdk.aws_autoscaling==1.121.0",
49+
"aws_cdk.aws_elasticloadbalancingv2==1.121.0",
50+
"cdk_ec2_key_pair==2.2.1"
5051
],
5152

5253
python_requires=">=3.8",

0 commit comments

Comments
 (0)