Skip to content

Commit 4dcaf26

Browse files
author
EC2 Default User
committed
Adding additional enrollment abstraction (RDS & S3)
1 parent 46bc2df commit 4dcaf26

File tree

6 files changed

+336
-249
lines changed

6 files changed

+336
-249
lines changed

bin/aws.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import * as cdk from '@aws-cdk/core';
44
import { BaselineStack } from '../lib/baseline-stack';
55
import { DatalakeStack } from '../lib/datalake-stack';
66
import { OpenTargetsStack } from '../lib/opentargets-stack';
7-
import { Chembl25Stack } from '../lib/chembl-25-stack';
7+
import { ChemblStack } from '../lib/chembl-25-stack';
88
import s3 = require('@aws-cdk/aws-s3');
99

1010

@@ -18,7 +18,7 @@ const coreDataLake = new DatalakeStack(app, 'CoreDataLake', {
1818

1919

2020

21-
const chemblStack = new Chembl25Stack(app, 'ChemblStack', {
21+
const chemblStack = new ChemblStack(app, 'ChemblStack', {
2222
database: baseline.ChemblDb,
2323
accessSecurityGroup: baseline.chemblDBChemblDbAccessSg,
2424
databaseSecret: baseline.chemblDBSecret,

lib/chembl-25-stack.ts

Lines changed: 27 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -5,147 +5,45 @@ import rds = require('@aws-cdk/aws-rds');
55
import glue = require('@aws-cdk/aws-glue');
66
import s3 = require('@aws-cdk/aws-s3');
77
import s3assets = require('@aws-cdk/aws-s3-assets');
8-
import { DataSetEnrollmentProps, DataLakeEnrollmentWorkflow } from './data-set-enrollment';
8+
import { RDSdataSetSetEnrollmentProps, RDSPostgresDataSetEnrollment } from './rds-data-set-enrollment';
99

1010

11-
export interface RDSdataSetSetEnrollmentProps extends DataSetEnrollmentProps {
11+
export interface ChemblStackEnrollmentProps extends cdk.StackProps {
1212
databaseSecret: rds.DatabaseSecret;
1313
database: rds.DatabaseInstance;
1414
accessSecurityGroup: ec2.SecurityGroup;
15+
dataLakeBucket: s3.Bucket;
1516
}
1617

17-
export class Chembl25Stack extends cdk.Stack {
18-
19-
constructor(scope: cdk.Construct, id: string, props: RDSdataSetSetEnrollmentProps) {
18+
export class ChemblStack extends cdk.Stack{
19+
constructor(scope: cdk.Construct, id: string, props: ChemblStackEnrollmentProps) {
2020
super(scope, id, props);
21-
22-
23-
const chembl_25_src = new glue.Database(this, 'chembl-25-src', {
24-
databaseName: 'chembl_25_src',
25-
locationUri: `s3://${props.dataLakeBucket.bucketName}/chembl/src/chembl25`
26-
});
27-
const chembl_25_dl = new glue.Database(this, 'chembl-25-dl', {
28-
databaseName: 'chembl_25_dl',
29-
locationUri: `s3://${props.dataLakeBucket.bucketName}/chembl/dl/chembl25`
30-
});
31-
32-
var chemblConnectionInput = {
33-
connectionProperties: {
34-
USERNAME: props.databaseSecret.secretValueFromJson('username'),
35-
JDBC_ENFORCE_SSL: "false",
36-
PASSWORD: props.databaseSecret.secretValueFromJson('password'),
37-
JDBC_CONNECTION_URL: `jdbc:postgresql://${props.database.dbInstanceEndpointAddress}:5432/chembl_25`
38-
}
39-
,connectionType: "JDBC"
40-
,description: "chembl-25-src connection"
41-
,name: "chembl-25-src"
42-
,physicalConnectionRequirements: {
43-
availabilityZone: props.database.vpc.privateSubnets[0].availabilityZone,
44-
subnetId: props.database.vpc.privateSubnets[0].subnetId,
45-
securityGroupIdList: [props.accessSecurityGroup.securityGroupId],
46-
47-
}
48-
};
49-
50-
const chemblConnection = new glue.CfnConnection(this, 'chembl-25-src-connection', {
51-
catalogId: chembl_25_src.catalogId,
52-
connectionInput: chemblConnectionInput
53-
});
54-
55-
const chemblGlueRole = new iam.Role(this, 'chembleGlueROle', {
56-
assumedBy: new iam.ServicePrincipal('glue.amazonaws.com')
57-
});
58-
59-
chemblGlueRole.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole'));
60-
chemblGlueRole.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('CloudWatchAgentServerPolicy'));
61-
props.dataLakeBucket.grantReadWrite(chemblGlueRole);
62-
63-
64-
const chembl25Crawler = new glue.CfnCrawler(this, 'chembl25-crawler',{
65-
name: "chembl_25_src_crawler",
66-
targets: {
67-
jdbcTargets: [
68-
{
69-
path: "chembl_25/%",
70-
exclusions: [],
71-
connectionName: chemblConnectionInput.name
72-
}
73-
],
74-
catalogTargets: [],
75-
s3Targets: [],
76-
},
77-
role: chemblGlueRole.roleName,
78-
databaseName: chembl_25_src.databaseName,
79-
schemaChangePolicy: {
80-
deleteBehavior: "DEPRECATE_IN_DATABASE",
81-
updateBehavior: "UPDATE_IN_DATABASE",
82-
},
83-
tablePrefix: "",
84-
classifiers: []
85-
});
86-
87-
88-
const chemblCopyTablesSparkScript = new s3assets.Asset(this, 'chemblCopyTablesSparkScript', {
89-
path: 'scripts/glue.s3importchembl25.py'
90-
});
91-
chemblCopyTablesSparkScript.grantRead(chemblGlueRole);
92-
93-
94-
const chembl_etl_job = new glue.CfnJob(this, 'chembl_etl_job', {
95-
executionProperty: {
96-
maxConcurrentRuns: 1
97-
},
98-
name: "chembl_src_to_dl_etl",
99-
timeout: 2880,
100-
glueVersion: "1.0",
101-
maxCapacity: 11.0,
102-
connections: {
103-
connections: [
104-
chemblConnectionInput.name
105-
]
106-
},
107-
command: {
108-
scriptLocation: `s3://${chemblCopyTablesSparkScript.s3BucketName}/${chemblCopyTablesSparkScript.s3ObjectKey}`,
109-
name: "glueetl",
110-
pythonVersion: "3"
111-
},
112-
role: chemblGlueRole.roleArn,
113-
maxRetries: 0,
114-
defaultArguments: {
21+
22+
23+
const dataSetName = "chembl_25";
24+
25+
new RDSPostgresDataSetEnrollment(this, 'chembl-25-enrollment', {
26+
databaseSecret: props.databaseSecret,
27+
database: props.database,
28+
accessSecurityGroup: props.accessSecurityGroup,
29+
dataLakeBucket: props.dataLakeBucket,
30+
DataSetName: dataSetName,
31+
JdbcTargetIncludePaths: ["chembl_25/%"],
32+
GlueScriptPath: "scripts/glue.s3importchembl25.py",
33+
GlueScriptArguments: {
11534
"--job-language": "python",
11635
"--job-bookmark-option": "job-bookmark-disable",
11736
"--enable-metrics": "",
11837
"--DL_BUCKET": props.dataLakeBucket.bucketName,
11938
"--DL_PREFIX": "/chembl/25/",
12039
"--DL_REGION": cdk.Stack.of(this).region,
121-
"--GLUE_SRC_DATABASE": chembl_25_src.databaseName
122-
}
123-
});
124-
125-
const chembl_datalake_crawler = new glue.CfnCrawler(this, 'chembl_datalake_crawler',{
126-
name: "chembl_25_dl_crawler",
127-
targets: {
128-
s3Targets: [
129-
{
130-
path: `s3://${props.dataLakeBucket.bucketName}/chembl/25/`
131-
}
132-
]
133-
},
134-
role: chemblGlueRole.roleArn,
135-
databaseName: chembl_25_dl.databaseName,
136-
schemaChangePolicy: {
137-
deleteBehavior: "DEPRECATE_IN_DATABASE",
138-
updateBehavior: "UPDATE_IN_DATABASE"
139-
},
140-
tablePrefix: ""
141-
});
142-
143-
const datalakeEnrollmentWorkflow = new DataLakeEnrollmentWorkflow(this,'chemblDataLakeWorkflow',{
144-
workfowName: "chemblDataLakeEnrollmentWorkflow",
145-
srcCrawler: chembl25Crawler,
146-
etlJob: chembl_etl_job,
147-
datalakeCrawler: chembl_datalake_crawler
148-
149-
})
40+
"--GLUE_SRC_DATABASE": "chembl_25_src"
41+
}
42+
43+
});
15044
}
151-
}
45+
}
46+
47+
48+
49+

lib/data-set-enrollment.ts

Lines changed: 141 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,159 @@ import lambda = require('@aws-cdk/aws-lambda');
55
import iam = require('@aws-cdk/aws-iam');
66
import cfn = require("@aws-cdk/aws-cloudformation");
77
import fs = require('fs');
8+
import s3assets = require('@aws-cdk/aws-s3-assets');
9+
import { URL } from "url";
10+
811

912
export interface DataSetEnrollmentProps extends cdk.StackProps {
1013
dataLakeBucket: s3.Bucket;
14+
dataSetName: string;
15+
SourceConnectionInput?: glue.CfnConnection.ConnectionInputProperty;
16+
SourceTargets: glue.CfnCrawler.TargetsProperty;
17+
GlueScriptPath: string;
18+
GlueScriptArguments: any;
19+
SourceAccessPolicy?: iam.Policy;
1120
}
1221

22+
23+
24+
25+
26+
export class DataSetEnrollment extends cdk.Construct {
27+
28+
public readonly Workflow: DataLakeEnrollmentWorkflow;
29+
public readonly SrcCrawlerCompleteTrigger: glue.CfnTrigger;
30+
public readonly ETLCompleteTrigger: glue.CfnTrigger;
31+
public readonly SourceConnection?: glue.CfnConnection;
32+
public readonly DataLakeConnection: glue.CfnConnection;
33+
public readonly DataSetName: string;
34+
public readonly DataSetGlueRole: iam.Role;
35+
public readonly Dataset_Source: glue.Database;
36+
public readonly Dataset_Datalake: glue.Database;
37+
38+
private setupCrawler(targetGlueDatabase: glue.Database, targets: glue.CfnCrawler.TargetsProperty, isSourceCrawler: boolean){
39+
40+
var sourceCrawler = isSourceCrawler ? "src" : "dl";
41+
42+
return new glue.CfnCrawler(this, `${this.DataSetName}-${sourceCrawler}-crawler`,{
43+
name: `${this.DataSetName}_${sourceCrawler}_crawler`,
44+
targets: targets,
45+
role: this.DataSetGlueRole.roleName,
46+
databaseName: targetGlueDatabase.databaseName,
47+
schemaChangePolicy: {
48+
deleteBehavior: "DEPRECATE_IN_DATABASE",
49+
updateBehavior: "UPDATE_IN_DATABASE",
50+
},
51+
tablePrefix: "",
52+
classifiers: []
53+
});
54+
55+
}
56+
57+
constructor(scope: cdk.Construct, id: string, props: DataSetEnrollmentProps) {
58+
super(scope, id);
59+
60+
this.DataSetName = props.dataSetName;
61+
62+
this.Dataset_Source = new glue.Database(this, `${props.dataSetName}_src`, {
63+
databaseName: `${props.dataSetName}_src`,
64+
locationUri: `s3://${props.dataLakeBucket.bucketName}/${props.dataSetName}/src/`
65+
});
66+
this.Dataset_Datalake = new glue.Database(this, `${props.dataSetName}_dl`, {
67+
databaseName: `${props.dataSetName}_dl`,
68+
locationUri: `s3://${props.dataLakeBucket.bucketName}/${props.dataSetName}/dl/`
69+
});
70+
71+
72+
let connectionArray = [];
73+
if(props.SourceConnectionInput){
74+
this.SourceConnection = new glue.CfnConnection(this, `${props.dataSetName}-src-connection`, {
75+
catalogId: this.Dataset_Source.catalogId,
76+
connectionInput: props.SourceConnectionInput
77+
});
78+
if(props.SourceConnectionInput.name){
79+
connectionArray.push(props.SourceConnectionInput.name);
80+
}
81+
}
82+
83+
84+
this.DataSetGlueRole = new iam.Role(this, `${props.dataSetName}-GlueRole`, {
85+
assumedBy: new iam.ServicePrincipal('glue.amazonaws.com')
86+
});
87+
88+
this.DataSetGlueRole.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole'));
89+
this.DataSetGlueRole.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('CloudWatchAgentServerPolicy'));
90+
props.dataLakeBucket.grantReadWrite(this.DataSetGlueRole);
91+
92+
if(props.SourceAccessPolicy){
93+
props.SourceAccessPolicy.attachToRole(this.DataSetGlueRole);
94+
}
95+
96+
97+
98+
const sourceCrawler = this.setupCrawler(this.Dataset_Source, props.SourceTargets, true);
99+
100+
101+
102+
const glueScript = new s3assets.Asset(this, `${props.dataSetName}-GlueScript`, {
103+
path: props.GlueScriptPath
104+
});
105+
glueScript.grantRead(this.DataSetGlueRole);
106+
107+
108+
const etl_job = new glue.CfnJob(this, `${props.dataSetName}-EtlJob`, {
109+
executionProperty: {
110+
maxConcurrentRuns: 1
111+
},
112+
name: `${props.dataSetName}_src_to_dl_etl`,
113+
timeout: 2880,
114+
connections: {
115+
connections: connectionArray
116+
},
117+
glueVersion: "1.0",
118+
maxCapacity: 10.0,
119+
command: {
120+
scriptLocation: `s3://${glueScript.s3BucketName}/${glueScript.s3ObjectKey}`,
121+
name: "glueetl",
122+
pythonVersion: "3"
123+
},
124+
role: this.DataSetGlueRole.roleArn,
125+
maxRetries: 0,
126+
defaultArguments: props.GlueScriptArguments
127+
});
128+
129+
130+
131+
const datalake_crawler = this.setupCrawler(this.Dataset_Datalake, {
132+
s3Targets: [
133+
{
134+
path: `s3://${props.dataLakeBucket.bucketName}/${props.dataSetName}/`
135+
}
136+
]
137+
}, false);
138+
139+
140+
const datalakeEnrollmentWorkflow = new DataLakeEnrollmentWorkflow(this,`${props.dataSetName}DataLakeWorkflow`,{
141+
workfowName: `${props.dataSetName}_DataLakeEnrollmentWorkflow`,
142+
srcCrawler: sourceCrawler,
143+
etlJob: etl_job,
144+
datalakeCrawler: datalake_crawler
145+
146+
})
147+
148+
}
149+
150+
151+
}
152+
153+
13154
export interface DataLakeEnrollmentWorkflowProps {
14155
workfowName: string;
15156
srcCrawler: glue.CfnCrawler,
16157
etlJob: glue.CfnJob,
17158
datalakeCrawler: glue.CfnCrawler
18159
}
19160

20-
21161
export class DataLakeEnrollmentWorkflow extends cdk.Construct {
22162

23163
public readonly StartTrigger: glue.CfnTrigger;

0 commit comments

Comments
 (0)