Skip to content

Commit b2c0306

Browse files
authored
Merge pull request #1 from aws-samples/development
Initial pull request after repo creation.
2 parents 25b8fb5 + a4d89ea commit b2c0306

31 files changed

+2878
-7
lines changed

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
*.js
2+
!jest.config.js
3+
*.d.ts
4+
node_modules
5+
6+
# CDK asset staging directory
7+
.cdk.staging
8+
cdk.out

.npmignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
*.ts
2+
!*.d.ts
3+
4+
# CDK asset staging directory
5+
.cdk.staging
6+
cdk.out

ApplyLakeFormationPermissions.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
python scripts/local.datalake.RemoveIamAllowedPrincipals.py

Config

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package.DataLakeAsCode = {
2+
interfaces = (1.0);
3+
4+
# Use NoOpBuild. See https://w.amazon.com/index.php/BrazilBuildSystem/NoOpBuild
5+
build-system = no-op;
6+
build-tools = {
7+
1.0 = {
8+
NoOpBuild = 1.0;
9+
};
10+
};
11+
12+
# Use runtime-dependencies for when you want to bring in additional
13+
# packages when deploying.
14+
# Use dependencies instead if you intend for these dependencies to
15+
# be exported to other packages that build against you.
16+
dependencies = {
17+
1.0 = {
18+
};
19+
};
20+
21+
runtime-dependencies = {
22+
1.0 = {
23+
};
24+
};
25+
26+
};

DeployChemblOpenTargetsEnv.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/bin/sh
2+
npm run build
3+
cdk bootstrap
4+
currentPrincipalArn=$(aws sts get-caller-identity --query Arn --output text)
5+
jq --arg currentPrincipalArn "$currentPrincipalArn" '.context.starterLakeFormationAdmin = $currentPrincipalArn' cdk.json > tmp.$$.json && mv tmp.$$.json cdk.json
6+
cdk deploy BaselineStack --require-approval never
7+
cdk deploy CoreDataLake --require-approval never
8+
cdk deploy ChemblStack --require-approval never
9+
cdk deploy OpenTargetsStack --require-approval never
10+
cdk deploy AnalyticsStack --require-approval never

InstallCdkDependencies.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
npm install -g aws-cdk
2+
npm install "@types/node" --save-dev
3+
npm update
4+
sudo yum install jq -y
5+
npm install

README.md

Lines changed: 144 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,150 @@
1-
## My Project
1+
# Data Lake as Code; Featuring ChEMBL and Open Targets
22

3-
TODO: Fill this README out!
3+
Companion code for upcoming AWS blogpost on enrolling chembl and opentargets into a data lake on AWS
44

5-
Be sure to:
5+
![](https://quip-amazon.com/blob/HPG9AAwumxR/D5akZWKUWmfWEhA8u4loEA?a=U93UPcmkUsuoToxZr2QpWU5nosB1RwimIsIW5TtaJvEa)
66

7-
* Change the title in this README
8-
* Edit your repository description on GitHub
7+
## To install this in your own AWS account:
98

10-
## License
9+
Your local machine needs to have the AWS CLI installed, along with IAM permissions set up (through an IAM role or .aws/credentials file). I like to use Cloud9 as my IDE as it comes with both of those already set up for me.
1110

12-
This library is licensed under the MIT-0 License. See the LICENSE file.
11+
Run the following commands
1312

13+
```shell
14+
git clone https://github.com/paulu-aws/chembl-opentargets-data-lake-example.git
15+
cd chembl-opentargets-data-lake-example
16+
./InstallCdkDependencies.sh
17+
./DeployChemblOpenTargetsEnv.sh
18+
```
19+
20+
Wait for Chembl and OpenTargets to be ‘staged’ into the baseline stack.
21+
22+
The ‘baseline stack’ in the CDK application spins up a VPC with an S3 bucket (for OpenTargets) and an RDS Postgres instance (for ChEMBL). It also spins up a little helper EC2 instance that stages those assets in their ‘raw’ form after downloading them from [OpenTargets.org](http://OpenTargets.org) and EMBL-EBI.
23+
24+
Go to Systems Manager in the AWS console, and then the ‘Run Command’ section. You will see the currently running command documents.
25+
26+
![](https://quip-amazon.com/blob/HPG9AAwumxR/x4lfduQeC3Ww-DyK8loIAg?a=6aMBuWAgnWaZ5pQaJndaM06ob734VpmiCI5xfguyPaca)
27+
28+
It takes about an hour for Chembl to build. If you get impatient and want to see the progress in real time, go to ‘Session Manager’ in the Systems Manager console, click the ‘Start session’ button, choose the ‘ChembDbImportInstance’ radio button, and click the ‘Start Session’ button.
29+
30+
![](https://quip-amazon.com/blob/HPG9AAwumxR/Fj7sA3VuIuvdPOHl017Xcg?a=EYFlHaKY8weEGFezDR4ld3sEhBMWl88afFdDjJQ15H8a)
31+
32+
That will open an SSM session window. Run the following command to tail the log output.
33+
34+
```tail -f /home/ssm-user/progressLog```
35+
36+
![](https://quip-amazon.com/blob/HPG9AAwumxR/rMcRhjzUcIGQVYeBFxup4Q?a=2NRscRrktD9kLK7rDqqD9bO3aXtTYttCeaEWLwDXVgIa)
37+
38+
## Enroll Chembl and OpenTargets into the data lake
39+
40+
Once the database has finished importing, go to Glue in the AWS console, and then the “Workflows” section
41+
42+
![](https://quip-amazon.com/blob/HPG9AAwumxR/K0liqaLzOGNHdODU_fN_MA?a=GQQahtSxVQNvaU6AkEjATwCE0WJglr630LH3bZcngB0a)
43+
44+
Select the openTargetsDataLakeEnrollment workflow, and click ‘Actions’, then 'Run'
45+
46+
![](https://quip-amazon.com/blob/HPG9AAwumxR/UV0-ZlwmK_KF9L9MfaUgfA?a=97k7vof4qlurzy3zSsmPVhomgCpRUJfREq8UCNZSzt4a)
47+
48+
Do the same for the chemblDataLakeEnrollmentWorkflow. Wait for the workflows to finish.
49+
50+
Both workflows will run in parallel, but it will take the openTargetsDataLakeEnrollmentWorkflow ~170 minutes to complete while the ChEMBL enrollment will finish in about 30 minutes.
51+
52+
## Query and Conquer!
53+
54+
Go to Athena in the AWS Console.
55+
56+
If you haven't used Athena in your account before, you will need to define a storage location for your query results. Click on the ‘Settings’ tab in the top right and specify a bucket name where you would like Athena results stored and click save.
57+
58+
![](https://quip-amazon.com/blob/HPG9AAwumxR/d9imQFzWnNdhWYDAo9Bt1A?a=8Q4UOXPqvG1fk3knDX9x2wr9Jeu9g8V2tPRYsnE3Vlga)
59+
60+
Now, click the ‘Databases’ dropdown:
61+
62+
You will see 4 databases listed, you only want to use 2 of them:
63+
64+
_**Use:**_
65+
66+
**chembl-25-dl** - This is the ‘dl’ or ‘data lake’ Chembl database. Always use tables in this database when running Chembl queries. Part of the chemblDataLakeEnrollment workflow converts the ‘source’ Chembl Postgres formats into a ‘data lake’ friendly parquet format optimized for Athena.
67+
68+
**opentargets-1911-dl** - This is the ‘dl’ or ‘data lake’ OpenTargets database. Always use tables in this database when running OpenTargets queries. Part of the openTargetsDataLakeEnrollment workflow converts the ‘source’ OpenTargets json and csv formats into a ‘data lake’ parquet format optimized for Athena.
69+
70+
_**Don't use:**_
71+
72+
**chembl-25-src** - This represents the ‘src’ or ‘source’ Chembl postgres database. By design, the source database is not directly queryable from Athena, so you will not use this database.
73+
74+
**opentargets-1911-src** - This is the ‘src’ or ‘source’ table. When you query this table, you are directly querying the original json and csv files from OpenTargets. The performance may be slow as those formats are not optimized for querying with Athena.
75+
76+
77+
## Permissions & Lake Formation
78+
79+
There are [two methods of security](https://docs.aws.amazon.com/lake-formation/latest/dg/access-control-overview.html) you can apply to your data lake. The default account configuration, which is likely what you are using at the moment, is essentially “open” Lake Formation permissions and “fine-grained” IAM policies. The DataSetStack construct implements a number of CDK-style grant*() methods. The grantIamRead() method of the code grants a “fine-grained” IAM policy that gives users read access to just the tables in the data set you perform the grant on.
80+
81+
82+
83+
For example, in the bin/aws.ts file you can see an example of granting that “fine-grained” IAM read permission. Pretty easy! Here we are passing the role from the notebook, but you can import an existing IAM user, role, or group using the CDK.
84+
```typescript
85+
chemblStack.grantIamRead(analyticsStack.NotebookRole);
86+
openTargetsStack.grantIamRead(analyticsStack.NotebookRole);
87+
```
88+
The other method of security gives you more control. Specifically, the ability to control permissions at the database, table, and column level. This requires “fine-grained” Lake Formation permissions and “coarse” IAM permissions. The `grantDatabasePermissions()`, `grantTablePermissions()`, and `grantTableWithColumnPermissions()` setup both the fine-grained LakeFormation and coarse IAM permissions for you.
89+
90+
91+
92+
Again, another example in the `bin/aws.ts` file:
93+
94+
```typescript
95+
const exampleUser = iam.User.fromUserName(coreDataLake, 'exampleGrantee', 'paulUnderwood' );
96+
97+
var exampleTableWithColumnsGrant: DataLakeEnrollment.TableWithColumnPermissionGrant = {
98+
table: "chembl_25_public_compound_structures",
99+
// Note that we are NOT including 'canonical_smiles'. That effectively prevents this user from querying that column.
100+
columns: ['molregno', 'molfile', 'standard_inchi', 'standard_inchi_key'],
101+
DatabasePermissions: [],
102+
GrantableDatabasePermissions: [],
103+
TableColumnPermissions: [DataLakeEnrollment.TablePermission.Select],
104+
GrantableTableColumnPermissions: []
105+
};
106+
107+
chemblStack.grantTableWithColumnPermissions(exampleUser, exampleTableWithColumnsGrant);
108+
````
109+
110+
111+
The `GrantableDatabasePermissions`, `GrantableTablePermissions`, and `GrantableTableColumnPermissions` give the supplied IAM principal permissions to grant permissions to others. If you have a data-set steward, or someone who should have the authority to grant permissions to others, you can "grant the permission to grant" using those properties.
112+
113+
114+
115+
To illustrate the relationship between the fine-grained and coarse permissions, think of it as two doors. An IAM principal needs to have permission to walk through both doors to query the data lake. The DataLakeEnrollment construct handles granting both the fine and coarse permissions for you.
116+
117+
![image.png](https://api.quip-amazon.com/2/blob/HPG9AAwumxR/ACYxNvcfFhaRL15neEGWHA)
118+
119+
120+
121+
If you decide that you want the additional flexibility of Lake Formation permissions, you need to perform two manual actions before Lake Formation permissions will begin protecting your resources. Until you perform these two steps, you are only protecting your resources with the coarse IAM permission and the Lake Formation permissions won't apply.
122+
123+
124+
125+
1) Change the default permissions for newly created databases and tables
126+
127+
128+
129+
Visit the Lake Formation service page in the AWS console, and go to the **Settings** section on the left.
130+
131+
132+
You need to **UNCHECK** the two boxes and hit **Save**.
133+
134+
![image.png](https://api.quip-amazon.com/2/blob/HPG9AAwumxR/luIf4C1WcTNeDeixOEbqsg)
135+
136+
2) You need to revoke all of the Lake Formation permissions that have been granted to `IAM_ALLOWED_PRINCIPALS`. If you have used Glue in the past or the ChEMBL or OpenTarget workflows have already completed you can see a bunch of them in the **Data Permissions** section in the Lake Formation console. By unchecking the boxes before, we are now stopping the default behavior where Lake Formation adds an `IAM_ALLOWED_PRINCIPALS` grant to any Glue Tables/Resources created.
137+
138+
139+
140+
Now that we have stopped that default-add `IAM_ALLOWED_PRINCIPALS` behavior, we need to back out any existing grants to `IAM_ALLOWED_PRINCIPALS`. As long as they remain, any IAM principal with coarse IAM permissions to the resource will still be able to query columns or tables they shouldn't have access to.
141+
142+
143+
144+
The `local.datalake.RemoveIamAllowedPrincipals.py` python script will save you the effort of manually revoking those permissions from IAM_ALLOWED_PRINCIPALS. Running the following command will issue the revokes for all IAM_ALLOWED_PRINCIPALS granted permissions.
145+
146+
```
147+
python ./scripts/local.datalake.RemoveIamAllowedPrincipals.py
148+
```
149+
150+
DON'T RUN THIS COMMAND IF YOU HAVE PEOPLE ALREADY RELYING ON THE AWS GLUE CATALOG (via Athena for example). This will effectively remove their access until you grant them user/role/group specific Lake Formation permissions.

bin/aws.ts

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#!/usr/bin/env node
2+
import 'source-map-support/register';
3+
import * as cdk from '@aws-cdk/core';
4+
import { BaselineStack } from '../lib/baseline-stack';
5+
import { DataLakeStack } from '../lib/stacks/datalake-stack';
6+
import { OpenTargetsStack } from '../lib/opentargets-stack';
7+
import { ChemblStack } from '../lib/chembl-25-stack';
8+
import { AnalyticsStack } from '../lib/analytics-stack';
9+
import iam = require('@aws-cdk/aws-iam');
10+
import s3 = require('@aws-cdk/aws-s3');
11+
import { DataLakeEnrollment } from '../lib/constructs/data-lake-enrollment';
12+
13+
const app = new cdk.App();
14+
const baseline = new BaselineStack(app, 'BaselineStack');
15+
16+
17+
const coreDataLake = new DataLakeStack(app, 'CoreDataLake', {
18+
starterLakeFormationAdminPrincipalArn: app.node.tryGetContext("starterLakeFormationAdmin")
19+
});
20+
21+
const chemblStack = new ChemblStack(app, 'ChemblStack', {
22+
database: baseline.ChemblDb,
23+
accessSecurityGroup: baseline.chemblDBChemblDbAccessSg,
24+
databaseSecret: baseline.chemblDBSecret,
25+
DataLake: coreDataLake
26+
});
27+
28+
const openTargetsStack = new OpenTargetsStack(app, 'OpenTargetsStack', {
29+
sourceBucket: baseline.OpenTargetsSourceBucket,
30+
sourceBucketDataPrefix: '/opentargets/sourceExports/19.11/output/',
31+
DataLake: coreDataLake
32+
});
33+
34+
const analyticsStack = new AnalyticsStack(app, 'AnalyticsStack', {
35+
targetVpc: baseline.Vpc,
36+
});
37+
38+
39+
40+
41+
42+
chemblStack.grantIamRead(analyticsStack.NotebookRole);
43+
openTargetsStack.grantIamRead(analyticsStack.NotebookRole);
44+
45+
46+
47+
48+
49+
50+
51+
52+
// const exampleUser = iam.User.fromUserName(coreDataLake, 'exampleGrantee', 'paul1' );
53+
54+
// var exampleGrant: DataLakeEnrollment.TablePermissionGrant = {
55+
// tables: ["association_data", "evidence_data","target_list","disease_list"],
56+
// DatabasePermissions: [DataLakeEnrollment.DatabasePermission.Alter, DataLakeEnrollment.DatabasePermission.CreateTable, DataLakeEnrollment.DatabasePermission.Drop],
57+
// GrantableDatabasePermissions: [DataLakeEnrollment.DatabasePermission.Alter, DataLakeEnrollment.DatabasePermission.CreateTable, DataLakeEnrollment.DatabasePermission.Drop],
58+
// TablePermissions: [DataLakeEnrollment.TablePermission.Select, DataLakeEnrollment.TablePermission.Insert, DataLakeEnrollment.TablePermission.Delete],
59+
// GrantableTablePermissions: [DataLakeEnrollment.TablePermission.Select]
60+
// };
61+
62+
// openTargetsStack.grantTablePermissions(exampleUser, exampleGrant);
63+
64+
65+
66+
67+
// // In the example below, we are using the compound_structures table from ChEMBL. It has the following table definition:
68+
// // ['molregno', 'molfile', 'standard_inchi', 'standard_inchi_key', 'canonical_smiles']
69+
// // Let's say we want to give a principal ONLY select permissions to everything in the compound_structures table BUT the 'canonical_smiles' column.
70+
71+
// var exampleTableWithColumnsGrant: DataLakeEnrollment.TableWithColumnPermissionGrant = {
72+
// table: "chembl_25_public_compound_structures",
73+
// // Note that we are NOT including 'canonical_smiles'. That effectively prevents this user from querying that column.
74+
// columns: ['molregno', 'molfile', 'standard_inchi', 'standard_inchi_key'],
75+
// DatabasePermissions: [],
76+
// GrantableDatabasePermissions: [],
77+
// TableColumnPermissions: [DataLakeEnrollment.TablePermission.Select],
78+
// GrantableTableColumnPermissions: []
79+
// };
80+
81+
// var exampleTableWithColumnsGrant_WithWildCard: DataLakeEnrollment.TableWithColumnPermissionGrant = {
82+
// table: "chembl_25_public_compound_structures",
83+
// wildCardFilter: DataLakeEnrollment.TableWithColumnFilter.Exclude,
84+
// columns: ['canonical_smiles'],
85+
// DatabasePermissions: [],
86+
// GrantableDatabasePermissions: [],
87+
// TableColumnPermissions: [DataLakeEnrollment.TablePermission.Select],
88+
// GrantableTableColumnPermissions: []
89+
// };
90+
91+
// // Note that exampleTableWithColumnsGrant and exampleTableWithColumnsGrant_WithWildCard grant the same effective permissions. One just uses the wildcard.
92+
// chemblStack.grantTableWithColumnPermissions(exampleUser, exampleTableWithColumnsGrant);

cdk.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"app": "npx ts-node bin/aws.ts",
3+
"context": {
4+
"starterLakeFormationAdmin": "XXXX-this value gets populated by DeployChemblOpenTargetsEnv.sh script-XXXXX"
5+
}
6+
}

jest.config.js

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
module.exports = {
2+
"roots": [
3+
"<rootDir>/test"
4+
],
5+
testMatch: [ '**/*.test.ts'],
6+
"transform": {
7+
"^.+\\.tsx?$": "ts-jest"
8+
},
9+
}

0 commit comments

Comments
 (0)